/* | |
* Licensed to the Apache Software Foundation (ASF) under one | |
* or more contributor license agreements. See the NOTICE file | |
* distributed with this work for additional information | |
* regarding copyright ownership. The ASF licenses this file | |
* to you under the Apache License, Version 2.0 (the | |
* "License"); you may not use this file except in compliance | |
* with the License. You may obtain a copy of the License at | |
* | |
* http://www.apache.org/licenses/LICENSE-2.0 | |
* | |
* Unless required by applicable law or agreed to in writing, | |
* software distributed under the License is distributed on an | |
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
* KIND, either express or implied. See the License for the | |
* specific language governing permissions and limitations | |
* under the License. | |
*/ | |
package org.apache.uima.ruta.engine; | |
import java.util.HashSet; | |
import java.util.Iterator; | |
import java.util.Set; | |
import java.util.SortedSet; | |
import java.util.TreeSet; | |
import java.util.regex.Matcher; | |
import java.util.regex.Pattern; | |
import org.apache.commons.lang3.StringEscapeUtils; | |
import org.apache.commons.lang3.StringUtils; | |
import org.apache.uima.UimaContext; | |
import org.apache.uima.analysis_engine.AnalysisEngineProcessException; | |
import org.apache.uima.cas.CAS; | |
import org.apache.uima.cas.CASException; | |
import org.apache.uima.cas.FSIterator; | |
import org.apache.uima.cas.Feature; | |
import org.apache.uima.cas.Type; | |
import org.apache.uima.cas.TypeSystem; | |
import org.apache.uima.cas.text.AnnotationIndex; | |
import org.apache.uima.fit.component.JCasAnnotator_ImplBase; | |
import org.apache.uima.fit.descriptor.ConfigurationParameter; | |
import org.apache.uima.jcas.JCas; | |
import org.apache.uima.jcas.tcas.Annotation; | |
import org.apache.uima.resource.ResourceInitializationException; | |
import org.apache.uima.ruta.UIMAConstants; | |
import org.apache.uima.util.CasCopier; | |
import org.apache.uima.util.Level; | |
import org.htmlparser.Parser; | |
import org.htmlparser.util.NodeList; | |
import org.htmlparser.util.ParserException; | |
/**
 * This Analysis Engine is able to convert html content from a source view into a plain string
 * representation stored in an output view. In particular, the Analysis Engine transfers
 * annotations, taking the changed document text and the resulting annotation offsets in the new
 * view into account. The copy process also sets features; however, features of type annotation
 * are currently not supported. Note that if an annotation would have the same start and end
 * positions in the new view, i.e., if it would be mapped to an annotation of length 0, it is not
 * moved to the new view.
 *
 * The HTML Converter also supports heuristic and explicit conversion patterns which default to
 * html4 decoding, e.g., "{@literal &nbsp;}", "{@literal &lt;}", etc. Concepts like tables or
 * lists are not supported.
 *
 * Note that in general it is suggested to run an html cleaner before any further processing to
 * avoid problems with malformed html.
 *
 * A descriptor file for this Analysis Engine is located in the folder <code>descriptor/utils</code>
 * of a UIMA Ruta project.
 *
 */
public class HtmlConverter extends JCasAnnotator_ImplBase { | |
public static final String NAMESPACE = "org.apache.uima.ruta.type.html."; | |
public static final String DEFAULT_MODIFIED_VIEW = "plaintext"; | |
public static final String LINEBREAK = "\n"; | |
/** | |
* This string parameter specifies the name of the new view. The default value is | |
* <code>plaintext</code>. | |
*/ | |
public static final String PARAM_OUTPUT_VIEW = "outputView"; | |
@ConfigurationParameter(name = PARAM_OUTPUT_VIEW, mandatory = false, defaultValue = DEFAULT_MODIFIED_VIEW) | |
private String modifiedViewName; | |
/** | |
* This string parameter can optionally be set to specify the name of the input view. | |
*/ | |
public static final String PARAM_INPUT_VIEW = "inputView"; | |
@ConfigurationParameter(name = PARAM_INPUT_VIEW, mandatory = false) | |
private String inputViewName; | |
/** | |
* This boolean parameter determines if linebreaks inside the text nodes are kept or removed. The | |
* default behavior is <code>true</code>. | |
*/ | |
public static final String PARAM_REPLACE_LINEBREAKS = "replaceLinebreaks"; | |
@ConfigurationParameter(name = PARAM_REPLACE_LINEBREAKS, mandatory = false, defaultValue = "true") | |
private Boolean replaceLinebreaks; | |
/** | |
* This boolean parameter determines if the converter should skip whitespaces. Html documents | |
* often contains whitespaces for indentation and formatting, which should not be reproduced in | |
* the converted plain text document. If the parameter is set to false, then the whitespces are | |
* not removed. This behavior is useful, if not Html documents are converted, but XMl files. The | |
* default value is true. | |
*/ | |
public static final String PARAM_SKIP_WHITESPACES = "skipWhitespaces"; | |
@ConfigurationParameter(name = PARAM_SKIP_WHITESPACES, mandatory = false, defaultValue = "true") | |
private Boolean skipWhitespaces; | |
/** | |
* If this boolean parameter is set to true, then the tags of the complete document is processed | |
* and not only those within the body tag. | |
*/ | |
public static final String PARAM_PROCESS_ALL = "processAll"; | |
@ConfigurationParameter(name = PARAM_PROCESS_ALL, mandatory = false, defaultValue = "false") | |
private Boolean processAll; | |
/** | |
* If this boolean parameter is set to true, then zero-length annotation will not be dropped, but | |
* they will be assigned to the offset of the "nearest" annotation. In that case, a boolean | |
* feature names offsetsExpanded will be set to true if available. | |
*/ | |
public static final String PARAM_EXPAND_OFFSETS = "expandOffsets"; | |
@ConfigurationParameter(name = PARAM_EXPAND_OFFSETS, mandatory = false, defaultValue = "false") | |
private Boolean expandOffsets; | |
/** | |
* This string parameter determines the character sequence that replaces a linebreak. The default | |
* behavior is the empty string. | |
*/ | |
public static final String PARAM_LINEBREAK_REPLACEMENT = "linebreakReplacement"; | |
@ConfigurationParameter(name = PARAM_LINEBREAK_REPLACEMENT, mandatory = false, defaultValue = "") | |
private String linebreakReplacement; | |
/** | |
* This string array parameter sets the names of the html tags that create linebreaks in the | |
* output view. The default is <code>br, p, div, ul, ol, dl, li, h1, ..., h6, blockquote</code>. | |
*/ | |
public static final String PARAM_NEWLINE_INDUCING_TAGS = "newlineInducingTags"; | |
@ConfigurationParameter(name = PARAM_NEWLINE_INDUCING_TAGS, mandatory = false, defaultValue = { | |
"br", "p", "div", "ul", "ol", "dl", "li", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote" }) | |
private String[] newlineInducingTags; | |
/** | |
* This string parameter contains a regular expression for HTML/XML elements. If the pattern | |
* matches, then the element will introduce a new line break similar to the element of the | |
* parameter <code>newlineInducingTags</code>. | |
*/ | |
public static final String PARAM_NEWLINE_INDUCING_TAG_REGEXP = "newlineInducingTagRegExp"; | |
@ConfigurationParameter(name = PARAM_NEWLINE_INDUCING_TAG_REGEXP, mandatory = false) | |
private String newlineInducingTagRegExp; | |
/** | |
* This string array parameter sets the names of the html tags that create additional text in the | |
* output view. The acutal string of the gap is defined by the parameter <code>gapText</code>. | |
*/ | |
public static final String PARAM_GAP_INDUCING_TAGS = "gapInducingTags"; | |
@ConfigurationParameter(name = PARAM_GAP_INDUCING_TAGS, mandatory = false) | |
private String[] gapInducingTags; | |
/** | |
* This string parameter determines the character sequence that is introduced by the html tags | |
* specified in the <code>gapInducingTags</code>. | |
*/ | |
public static final String PARAM_GAP_TEXT = "gapText"; | |
@ConfigurationParameter(name = PARAM_GAP_TEXT, mandatory = false, defaultValue = "") | |
private String gapText; | |
/** | |
* This boolean parameter sets the value of the parameter <code>gapText</code> to a single space. | |
*/ | |
public static final String PARAM_USE_SPACE_GAP = "useSpaceGap"; | |
@ConfigurationParameter(name = PARAM_USE_SPACE_GAP, mandatory = false, defaultValue = "") | |
private Boolean useSpaceGap; | |
/** | |
* This string array parameter can be used to apply custom conversions. It defaults to a list of | |
* commonly used codes, e.g., {@literal }, which are converted using html 4 entity | |
* unescaping. However, explicit conversion strings can also be passed via the parameter | |
* <code>conversionReplacements</code>. Remember to enable explicit conversion via | |
* <code>conversionPolicy</code> first. | |
*/ | |
public static final String PARAM_CONVERSION_PATTERNS = "conversionPatterns"; | |
@ConfigurationParameter(name = PARAM_CONVERSION_PATTERNS, mandatory = false, defaultValue = { | |
" ", "«", "»", """, "&", "<", ">", "'", "§", | |
"¨", "©", "™", "®", "ö", "ä", "ü", " " }) | |
private String[] conversionPatterns; | |
/** | |
* This string parameter determines the conversion policy used, either "heuristic", "explicit", or | |
* "none". When the value is "explicit", the parameters <code>conversionPatterns</code> and | |
* optionally <code>conversionReplacements</code> are considered. The "heuristic" conversion | |
* policy uses simple regular expressions to decode html4 entities such as "{@literal }". | |
* The default behavior is "heuristic". | |
*/ | |
public static final String PARAM_CONVERSION_POLICY = "conversionPolicy"; | |
@ConfigurationParameter(name = PARAM_CONVERSION_POLICY, mandatory = false, defaultValue = "heuristic") | |
private String conversionPolicy; | |
/** | |
* This string array parameter corresponds to <code>conversionPatterns</code> such that | |
* <code>conversionPatterns[i]</code> will be replaced by <code>conversionReplacements[i]</code>; | |
* replacements should be shorter than the source pattern. Per default, the replacement strings | |
* are computed using Html4 decoding. Remember to enable explicit conversion via | |
* <code>conversionPolicy</code> first. | |
*/ | |
public static final String PARAM_CONVERSION_REPLACEMENTS = "conversionReplacements"; | |
@ConfigurationParameter(name = PARAM_CONVERSION_REPLACEMENTS, mandatory = false) | |
private String[] conversionReplacements; | |
private int[] map; | |
@Override | |
public void initialize(UimaContext aContext) throws ResourceInitializationException { | |
super.initialize(aContext); | |
inputViewName = (String) aContext.getConfigParameterValue(PARAM_INPUT_VIEW); | |
inputViewName = StringUtils.isBlank(inputViewName) ? null : inputViewName; | |
modifiedViewName = (String) aContext.getConfigParameterValue(PARAM_OUTPUT_VIEW); | |
modifiedViewName = StringUtils.isBlank(modifiedViewName) ? DEFAULT_MODIFIED_VIEW | |
: modifiedViewName; | |
replaceLinebreaks = (Boolean) aContext.getConfigParameterValue(PARAM_REPLACE_LINEBREAKS); | |
replaceLinebreaks = replaceLinebreaks == null ? true : replaceLinebreaks; | |
skipWhitespaces = (Boolean) aContext.getConfigParameterValue(PARAM_SKIP_WHITESPACES); | |
skipWhitespaces = skipWhitespaces == null ? true : skipWhitespaces; | |
processAll = (Boolean) aContext.getConfigParameterValue(PARAM_PROCESS_ALL); | |
processAll = processAll == null ? true : processAll; | |
linebreakReplacement = (String) aContext.getConfigParameterValue(PARAM_LINEBREAK_REPLACEMENT); | |
linebreakReplacement = linebreakReplacement == null ? "" : linebreakReplacement; | |
String conversionPolicy = (String) aContext.getConfigParameterValue(PARAM_CONVERSION_POLICY); | |
if (StringUtils.isBlank(conversionPolicy) || conversionPolicy.equals("heuristic")) { | |
conversionPolicy = "heuristic"; | |
} else if (conversionPolicy.equals("explicit")) { | |
} else if (conversionPolicy.equals("none")) { | |
} else { | |
throw new ResourceInitializationException("illegal conversionPolicy parameter value", | |
new Object[0]); | |
} | |
String[] nlTags = (String[]) aContext.getConfigParameterValue(PARAM_NEWLINE_INDUCING_TAGS); | |
if (nlTags == null) { | |
newlineInducingTags = new String[] { "br", "p", "div", "ul", "ol", "dl", "li", "h1", "h2", | |
"h3", "h4", "h5", "h6", "blockquote" }; | |
} | |
// check assertions | |
if (modifiedViewName.equals(inputViewName)) { | |
throw new ResourceInitializationException("input and output view names must differ!", | |
new Object[0]); | |
} | |
conversionPatterns = (String[]) aContext.getConfigParameterValue(PARAM_CONVERSION_PATTERNS); | |
if (conversionPatterns == null) { | |
conversionPatterns = new String[] { " ", "«", "»", """, "&", | |
"<", ">", "'", "§", "¨", "©", "™", "®", "ö", | |
"ä", "ü", " " }; | |
} | |
conversionReplacements = (String[]) aContext | |
.getConfigParameterValue(PARAM_CONVERSION_REPLACEMENTS); | |
if (conversionReplacements == null) { | |
conversionReplacements = new String[conversionPatterns.length]; | |
for (int i = 0; i < conversionPatterns.length; i++) { | |
String c = conversionPatterns[i]; | |
String rep = StringEscapeUtils.unescapeHtml4(c); | |
conversionReplacements[i] = rep; | |
} | |
} | |
gapText = (String) aContext.getConfigParameterValue(PARAM_GAP_TEXT); | |
gapText = gapText == null ? "" : gapText; | |
useSpaceGap = (Boolean) aContext.getConfigParameterValue(PARAM_USE_SPACE_GAP); | |
useSpaceGap = useSpaceGap == null ? false : useSpaceGap; | |
if(useSpaceGap) { | |
gapText = " "; | |
} | |
gapInducingTags = (String[]) aContext.getConfigParameterValue(PARAM_GAP_INDUCING_TAGS); | |
gapInducingTags = gapInducingTags == null ? new String[0] : gapInducingTags; | |
expandOffsets = (Boolean) aContext.getConfigParameterValue(PARAM_EXPAND_OFFSETS); | |
expandOffsets = expandOffsets == null ? false : expandOffsets; | |
newlineInducingTagRegExp = (String) aContext.getConfigParameterValue(PARAM_NEWLINE_INDUCING_TAG_REGEXP); | |
} | |
@Override | |
public void process(JCas jcaz) throws AnalysisEngineProcessException { | |
JCas jcas; | |
try { | |
if (inputViewName != null) { | |
jcas = jcaz.getView(inputViewName); | |
} else { | |
jcas = jcaz; | |
} | |
} catch (CASException e1) { | |
throw new AnalysisEngineProcessException(e1.getCause()); | |
} | |
// init: | |
String documentText = jcas.getDocumentText(); | |
String splitSeq = documentText.contains("\r\n") ? "\r\n" : "\n"; | |
map = new int[documentText.length() + 1]; | |
JCas modview = null; | |
try { | |
// check if view already exists: | |
Iterator<JCas> viewIterator = jcas.getViewIterator(); | |
while (viewIterator.hasNext()) { | |
JCas jCas2 = (JCas) viewIterator.next(); | |
if (jCas2.getViewName().equals(modifiedViewName)) { | |
modview = jCas2; | |
getContext().getLogger().log(Level.WARNING, | |
"view with name \"" + modifiedViewName + "\" already exists."); | |
} | |
} | |
if (modview == null) { | |
modview = jcas.createView(modifiedViewName); | |
} | |
} catch (CASException e) { | |
e.printStackTrace(); | |
return; | |
} | |
SortedSet<HtmlConverterPSpan> visibleSpansSoFar = new TreeSet<HtmlConverterPSpan>(); | |
SortedSet<HtmlConverterPSpan> linebreaksFromHtmlTags = new TreeSet<HtmlConverterPSpan>(); | |
SortedSet<HtmlConverterPSpan> gapsFromHtmlTags = new TreeSet<HtmlConverterPSpan>(); | |
// process | |
try { | |
Parser parser = new Parser(documentText); | |
NodeList list = parser.parse(null); | |
HtmlConverterVisitor visitor = new HtmlConverterVisitor(newlineInducingTags, newlineInducingTagRegExp, gapInducingTags, | |
gapText, skipWhitespaces, processAll); | |
list.visitAllNodesWith(visitor); | |
visibleSpansSoFar = visitor.getTextSpans(); | |
linebreaksFromHtmlTags = visitor.getLinebreaksFromHtmlTags(); | |
gapsFromHtmlTags = visitor.getGapsFromHtmlTags(); | |
} catch (ParserException e) { | |
throw new AnalysisEngineProcessException(e); | |
} | |
if (replaceLinebreaks) { | |
visibleSpansSoFar = this.handleLinebreaksInDocumentText(visibleSpansSoFar, splitSeq); | |
} | |
if (conversionPolicy.equals("heuristic")) { | |
visibleSpansSoFar = this.htmlDecoding(visibleSpansSoFar); | |
} else if (conversionPolicy.equals("explicit")) { | |
for (int i = 0; i < conversionPatterns.length; i++) { | |
String pat = conversionPatterns[i]; | |
String rep = conversionReplacements[i]; | |
visibleSpansSoFar = this.handleConversion(visibleSpansSoFar, pat, rep); | |
} | |
} | |
visibleSpansSoFar.addAll(linebreaksFromHtmlTags); | |
visibleSpansSoFar.addAll(gapsFromHtmlTags); | |
// create new doc-text and the map from deletions and visible-text-spans: | |
StringBuffer sbu = new StringBuffer(documentText.length()); | |
int originalOffsetI = 0; | |
int outOffset = 0; | |
for (HtmlConverterPSpan vis : visibleSpansSoFar) { | |
final int begin = vis.getBegin(); | |
final int end = vis.getEnd(); | |
// map text before annotation: | |
while (originalOffsetI < begin) { | |
map[originalOffsetI++] = outOffset; | |
} | |
// get and map text/replacement: | |
String s = ""; | |
if (vis instanceof HtmlConverterPSpanReplacement) { | |
// conversion/replacement: | |
s = vis.getTxt(); | |
// asserts that s is shorter than the original source | |
while (originalOffsetI < begin + s.length()) { | |
map[originalOffsetI++] = outOffset++; | |
} | |
while (originalOffsetI < end) { | |
map[originalOffsetI++] = outOffset; | |
} | |
} else { | |
// simple annotation: | |
s = documentText.substring(begin, end); | |
while (originalOffsetI < end) { | |
map[originalOffsetI++] = outOffset++; | |
} | |
} | |
sbu.append(s); | |
} | |
while (originalOffsetI < documentText.length()) { | |
map[originalOffsetI++] = outOffset; | |
} | |
map[documentText.length()] = outOffset + 1; // handle doc end separately | |
String modTxt = sbu.toString(); | |
modview.setDocumentText(modTxt); | |
// copy annotations using the 'map': | |
try { | |
mapAnnotations(jcas, map, modifiedViewName); | |
} catch (CASException e) { | |
e.printStackTrace(); | |
} | |
} | |
private void mapAnnotations(JCas fromJcas, int[] map, String toView) throws CASException { | |
JCas modview = fromJcas.getView(toView); | |
Set<Annotation> indexedFs = new HashSet<Annotation>(); | |
Set<Annotation> toExpand = new HashSet<Annotation>(); | |
AnnotationIndex<Annotation> annotationIndex = fromJcas.getAnnotationIndex(); | |
TypeSystem typeSystem = fromJcas.getTypeSystem(); | |
Type docType = typeSystem.getType(UIMAConstants.TYPE_DOCUMENT); | |
CasCopier casCopier = new CasCopier(fromJcas.getCas(), modview.getCas()); | |
for (Annotation annotation : annotationIndex) { | |
// TODO be careful here, because some people inherit from DocumentAnnotation | |
if (typeSystem.subsumes(docType, annotation.getType())) { | |
continue; | |
} | |
Annotation clone = (Annotation) casCopier.copyFs(annotation); | |
// change the view/sofa of the new annotation... | |
// see: http://osdir.com/ml/apache.uima.general/2007-09/msg00107.html | |
clone.setFeatureValue(modview.getTypeSystem() | |
.getFeatureByFullName(CAS.FEATURE_FULL_NAME_SOFA), modview.getSofa()); | |
final int mappedBegin = map[clone.getBegin()]; | |
final int mappedEnd = map[clone.getEnd()]; | |
if (mappedBegin < mappedEnd) { | |
if (mappedEnd > fromJcas.getCas().getDocumentAnnotation().getEnd()) { | |
getContext().getLogger().log(Level.WARNING, "illegal annotation offset mapping"); | |
} else { | |
int max = modview.getCas().getDocumentAnnotation().getEnd(); | |
if (mappedBegin < max && mappedEnd <= max && mappedBegin >= 0 && mappedEnd > 0) { | |
clone.setBegin(mappedBegin); | |
clone.setEnd(mappedEnd); | |
// TODO handle nested annotation features | |
modview.addFsToIndexes(clone); | |
indexedFs.add(clone); | |
} else { | |
getContext().getLogger().log(Level.WARNING, "illegal annotation offset mapping"); | |
} | |
} | |
} else if (expandOffsets) { | |
clone.setBegin(mappedBegin); | |
clone.setEnd(mappedEnd); | |
toExpand.add(clone); | |
} | |
} | |
for (Annotation each : toExpand) { | |
Annotation nextBestAnnotation = getNextBestAnnotation(each, modview); | |
if (nextBestAnnotation != null) { | |
each.setBegin(nextBestAnnotation.getBegin()); | |
each.setEnd(nextBestAnnotation.getEnd()); | |
Feature expandedOffsetsFeature = each.getType().getFeatureByBaseName("expandedOffsets"); | |
if (expandedOffsetsFeature != null) { | |
each.setBooleanValue(expandedOffsetsFeature, true); | |
} | |
modview.addFsToIndexes(each); | |
} | |
} | |
} | |
private Annotation getNextBestAnnotation(Annotation source, JCas jcas) { | |
FSIterator<Annotation> iterator = jcas.getAnnotationIndex().iterator(source); | |
Annotation best = null; | |
if (iterator.isValid()) { | |
Annotation annotation = iterator.get(); | |
best = annotation; | |
} else { | |
Annotation dummy = new Annotation(jcas, source.getBegin(), source.getBegin() + 1); | |
iterator = jcas.getAnnotationIndex().iterator(dummy); | |
if (!iterator.isValid()) { | |
if ((jcas.getDocumentText().length() / 2) > source.getBegin()) { | |
iterator.moveToFirst(); | |
if (iterator.isValid()) { | |
Annotation annotation = iterator.get(); | |
best = annotation; | |
} | |
} else { | |
iterator.moveToLast(); | |
if (iterator.isValid()) { | |
Annotation annotation = iterator.get(); | |
best = annotation; | |
} | |
} | |
} | |
} | |
return best; | |
} | |
private SortedSet<HtmlConverterPSpan> handleLinebreaksInDocumentText( | |
SortedSet<HtmlConverterPSpan> visibleSpansSoFar, String splitSeq) { | |
return this.handleConversion(visibleSpansSoFar, splitSeq, linebreakReplacement); | |
} | |
private SortedSet<HtmlConverterPSpan> htmlDecoding(SortedSet<HtmlConverterPSpan> visibleSpansSoFar) { | |
TreeSet<HtmlConverterPSpan> copy = new TreeSet<HtmlConverterPSpan>(visibleSpansSoFar); | |
Pattern patt = Pattern.compile("(&[a-zA-Z0-9]{2,6};)|(&#\\d{2,5};)"); | |
for (HtmlConverterPSpan pSpan : visibleSpansSoFar) { | |
String spanTxt = pSpan.getTxt(); | |
Matcher matcher = patt.matcher(spanTxt); | |
if (matcher.find()) { | |
copy.remove(pSpan); | |
int pSpanBegin = pSpan.getBegin(); | |
int ioff = pSpan.getBegin(); | |
do { | |
String sourceString = matcher.group(); | |
String replacement = StringEscapeUtils.unescapeHtml4(sourceString); | |
HtmlConverterPSpanReplacement replacementSpan = new HtmlConverterPSpanReplacement( | |
pSpanBegin + matcher.start(), pSpanBegin + matcher.end(), replacement); | |
copy.add(replacementSpan); | |
int replacementLength = sourceString.length(); | |
if (pSpanBegin + matcher.end() > ioff + replacementLength) { | |
int ib = ioff; | |
int ie = pSpanBegin + matcher.start(); | |
String newTxt = spanTxt.substring(ib - pSpanBegin, ie - pSpanBegin); | |
copy.add(new HtmlConverterPSpan(ib, ie, newTxt)); | |
ioff = ie; | |
} | |
ioff += replacementLength; // | |
} while (matcher.find()); | |
if (ioff < pSpan.getEnd()) { | |
int ib = ioff; | |
int ie = pSpan.getEnd(); | |
String newTxt = spanTxt.substring(ib - pSpanBegin, ie - pSpanBegin); | |
copy.add(new HtmlConverterPSpan(ioff, pSpan.getEnd(), newTxt)); | |
} | |
} | |
} | |
return copy; | |
} | |
private SortedSet<HtmlConverterPSpan> handleConversion( | |
SortedSet<HtmlConverterPSpan> visibleSpansSoFar, String patternString, String replacement) { | |
TreeSet<HtmlConverterPSpan> copy = new TreeSet<HtmlConverterPSpan>(visibleSpansSoFar); | |
Pattern patt = Pattern.compile(patternString); | |
int replacementLength = patternString.length(); | |
for (HtmlConverterPSpan pSpan : visibleSpansSoFar) { | |
String spanTxt = pSpan.getTxt(); | |
Matcher matcher = patt.matcher(spanTxt); | |
if (matcher.find()) { | |
copy.remove(pSpan); | |
int pSpanBegin = pSpan.getBegin(); | |
int ioff = pSpan.getBegin(); | |
do { | |
if (!StringUtils.isEmpty(replacement)) { | |
HtmlConverterPSpanReplacement replacementSpan = new HtmlConverterPSpanReplacement( | |
pSpanBegin + matcher.start(), pSpanBegin + matcher.end(), replacement); | |
copy.add(replacementSpan); | |
} | |
if (pSpanBegin + matcher.end() > ioff + replacementLength) { | |
int ib = ioff; | |
int ie = pSpanBegin + matcher.start(); | |
String newTxt = spanTxt.substring(ib - pSpanBegin, ie - pSpanBegin); | |
copy.add(new HtmlConverterPSpan(ib, ie, newTxt)); | |
ioff = ie; | |
} | |
ioff += replacementLength; // | |
} while (matcher.find()); | |
if (ioff < pSpan.getEnd()) { | |
int ib = ioff; | |
int ie = pSpan.getEnd(); | |
String newTxt = spanTxt.substring(ib - pSpanBegin, ie - pSpanBegin); | |
copy.add(new HtmlConverterPSpan(ioff, pSpan.getEnd(), newTxt)); | |
} | |
} | |
} | |
return copy; | |
} | |
} |