| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| package org.apache.uima.examples.cas; |
| |
| import java.io.BufferedReader; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.InputStreamReader; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.List; |
| import java.util.regex.Matcher; |
| import java.util.regex.Pattern; |
| import java.util.regex.PatternSyntaxException; |
| |
| import org.apache.uima.UimaContext; |
| import org.apache.uima.analysis_component.CasAnnotator_ImplBase; |
| import org.apache.uima.analysis_engine.AnalysisEngineProcessException; |
| import org.apache.uima.analysis_engine.annotator.AnnotatorConfigurationException; |
| import org.apache.uima.analysis_engine.annotator.AnnotatorContext; |
| import org.apache.uima.analysis_engine.annotator.AnnotatorInitializationException; |
| import org.apache.uima.analysis_engine.annotator.AnnotatorProcessException; |
| import org.apache.uima.analysis_engine.annotator.TextAnnotator; |
| import org.apache.uima.cas.CAS; |
| import org.apache.uima.cas.FSIterator; |
| import org.apache.uima.cas.FSTypeConstraint; |
| import org.apache.uima.cas.FeatureStructure; |
| import org.apache.uima.cas.Type; |
| import org.apache.uima.cas.TypeSystem; |
| import org.apache.uima.cas.text.AnnotationFS; |
| import org.apache.uima.resource.ResourceAccessException; |
| import org.apache.uima.resource.ResourceInitializationException; |
| import org.apache.uima.util.Level; |
| |
| /** |
| * Annotator that find substrings of the input document that match regular expressions. |
| * <p> |
| * There are two ways to specify the regular expressions - via configuration parameters or via an |
| * external resource file. |
| * <p> |
| * This annotator takes the following optional configuration parameters: |
| * <ul> |
| * <li><code>Patterns</code> - array of Strings indicating regular expressions to match. The |
| * pattern language is described at <a |
| * href="http://java.sun.com/j2se/1.4.2/docs/api/java/util/regex/Pattern.html"> |
| * http://java.sun.com/j2se/1.4.2/docs/api/java/util/regex/Pattern.html</a>) </li> |
| * <li><code>TypeNames</code> - array of Strings indicating names of Types to be created from the |
| * patterns. </li> |
| * <li><code>ContainingAnnotationTypes</code> - an array of input annotation types. This |
| * annotator will only produce new annotations that are contained within existing annotaions of |
| * these types. (This is optional.) </li> |
| * <li><code>AnnotateEntireContainedAnnotation</code> - When the ContainingAnnoationTypes |
| * parameter is specified, a value of true for this parameter will cause the entire containing |
| * annotation to be used as the span of the new annotation, rather than just the span of the regular |
| * expression match. This can be used to "classify" previously created annotations according to |
| * whether or not they contain text matching a regular expression. </li> |
| * </ul> |
| * <p> |
| * The indices of the <code>Patterns</code> and <code>TypeNames</code> arrays correspond, so |
| * that a substring that matches <code>Patterns[i]</code> will result in an annotation of type |
| * <code>TypeNames[i]</code>. |
| * <p> |
| * It is also possible to provide an external resource file that declares the annotation type names |
| * and the regular expressions to match. The annotator will look for this file under the resource |
| * key "PatternFile". The file format is as follows: |
| * <ul> |
| * <li>Lines starting with # or whitepsace are ignored</li> |
| * <li>Lines starting with % indicate an annotation type</li> |
| * <li>All other lines are regular expressions, using the same syntax described for the |
| * <code>Patterns</code> configuration parameter.</li> |
| * </ul> |
| * If a regular expression is matched, it will be annotated with the last annotation type declared |
| * (the nearest preceding line starting with %). |
| * |
| * |
| */ |
| public class RegExAnnotator extends CasAnnotator_ImplBase { |
| public static final String MESSAGE_DIGEST = "org.apache.uima.examples.cas.RegExAnnotator_Messages"; |
| |
| /** |
| * Performs any startup tasks required by this annotator. This implementation reads the |
| * configuration parmaeters and compiles the regular expressions. |
| * |
| * @see TextAnnotator#initialize(AnnotatorContext) |
| */ |
| public void initialize(UimaContext aContext) throws ResourceInitializationException { |
| super.initialize(aContext); |
| try { |
| // Retrieve configuration parameters |
| String[] patternStrings = (String[]) getContext().getConfigParameterValue("Patterns"); |
| String[] typeNames = (String[]) getContext().getConfigParameterValue("TypeNames"); |
| mContainingAnnotationTypeNames = (String[]) getContext().getConfigParameterValue( |
| "ContainingAnnotationTypes"); |
| if (mContainingAnnotationTypeNames != null && mContainingAnnotationTypeNames.length > 0) { |
| mAnnotateEntireContainingAnnotation = (Boolean) getContext().getConfigParameterValue( |
| "AnnotateEntireContainingAnnotation"); |
| } else { |
| mAnnotateEntireContainingAnnotation = Boolean.FALSE; |
| } |
| |
| // create an ArrayList of type names and an ArrayList of pattern arrays, |
| // where the indexes of the two lists corespond so that the patterns |
| // at patternArray[i] correspond to the annotation type at |
| // mTypeNames[i]. |
| mTypeNames = new ArrayList(); |
| ArrayList patternArray = new ArrayList(); |
| if (patternStrings != null) { |
| if (typeNames == null || typeNames.length != patternStrings.length) { |
| // throw exception - error message in external message digest |
| throw new ResourceInitializationException(MESSAGE_DIGEST, |
| "type_pattern_array_length_mismatch", new Object[0]); |
| } |
| mTypeNames.addAll(Arrays.asList(typeNames)); |
| |
| for (int i = 0; i < patternStrings.length; i++) { |
| patternArray.add(new String[] { patternStrings[i] }); |
| } |
| } |
| |
| // if PatternFile resource exists, parse it and add to patternArray |
| InputStream in = getContext().getResourceAsStream("PatternFile"); |
| if (in != null) { |
| try { |
| ArrayList patternsForCurrentType = new ArrayList(); |
| boolean foundFirstType = false; |
| // get buffered reader |
| BufferedReader reader = new BufferedReader(new InputStreamReader(in)); |
| |
| // read lines from file |
| String line = reader.readLine(); |
| while (line != null) { |
| if (!line.startsWith("#") && line.length() > 0 |
| && !Character.isWhitespace(line.charAt(0))) { |
| // line is not a comment |
| if (line.startsWith("%")) // annotation type name |
| { |
| // add pattern array for previous type (if any) to list |
| if (foundFirstType) { |
| String[] pats = new String[patternsForCurrentType.size()]; |
| patternsForCurrentType.toArray(pats); |
| patternArray.add(pats); |
| patternsForCurrentType.clear(); |
| } |
| // add new type name to mTypeNames list |
| mTypeNames.add(line.substring(1)); |
| foundFirstType = true; |
| } else // treat as regular expression |
| { |
| patternsForCurrentType.add(line); |
| } |
| } |
| line = reader.readLine(); |
| } |
| // add last group of pattersn to patternArray |
| String[] pats = new String[patternsForCurrentType.size()]; |
| patternsForCurrentType.toArray(pats); |
| patternArray.add(pats); |
| } finally { |
| if (in != null) { |
| in.close(); |
| } |
| } |
| } |
| |
| // make sure there is at least one pattern |
| if (patternArray.isEmpty()) { |
| throw new ResourceInitializationException( |
| AnnotatorConfigurationException.ONE_PARAM_REQUIRED, |
| new Object[] { "Patterns, Pattern File" }); |
| } |
| |
| // compile regular expression patterns |
| mPatterns = new Pattern[patternArray.size()][]; |
| for (int i = 0; i < patternArray.size(); i++) { |
| String[] pats = (String[]) patternArray.get(i); |
| mPatterns[i] = new Pattern[pats.length]; |
| for (int j = 0; j < mPatterns[i].length; j++) { |
| try { |
| mPatterns[i][j] = Pattern.compile(pats[j]); |
| // make sure no pattern matches the empty string - as this |
| // would lead to infinite loops during processing |
| if (mPatterns[i][j].matcher("").matches()) { |
| throw new ResourceInitializationException(MESSAGE_DIGEST, |
| "regex_matches_empty_string", new Object[] { pats[j] }); |
| } |
| } catch (PatternSyntaxException e) { |
| throw new ResourceInitializationException(MESSAGE_DIGEST, "regex_syntax_error", |
| new Object[] { pats[j] }, e); |
| } |
| } |
| } |
| } catch (ResourceAccessException e) { |
| throw new ResourceInitializationException(e); |
| } catch (IOException e) { |
| throw new ResourceInitializationException(e); |
| } |
| } |
| |
| /** |
| * Acquires references to CAS Type and Feature objects that are later used during the |
| * {@link #process(CAS)} method. |
| * |
| * @see TextAnnotator#typeSystemInit(TypeSystem) |
| */ |
| public void typeSystemInit(TypeSystem aTypeSystem) throws AnalysisEngineProcessException { |
| // get references to annotation types we will create |
| mCASTypes = new Type[mTypeNames.size()]; |
| for (int i = 0; i < mTypeNames.size(); i++) { |
| String curTypeName = (String) mTypeNames.get(i); |
| mCASTypes[i] = aTypeSystem.getType(curTypeName); |
| if (mCASTypes[i] == null) { |
| throw new AnalysisEngineProcessException(AnnotatorInitializationException.TYPE_NOT_FOUND, |
| new Object[] { this.getClass().getName(), curTypeName }); |
| } |
| } |
| |
| // get references to Containing Annotation Types |
| if (mContainingAnnotationTypeNames == null) { |
| mContainingAnnotationTypes = null; |
| } else { |
| mContainingAnnotationTypes = new Type[mContainingAnnotationTypeNames.length]; |
| for (int i = 0; i < mContainingAnnotationTypes.length; i++) { |
| mContainingAnnotationTypes[i] = aTypeSystem.getType(mContainingAnnotationTypeNames[i]); |
| if (mContainingAnnotationTypes[i] == null) { |
| throw new AnalysisEngineProcessException(AnnotatorInitializationException.TYPE_NOT_FOUND, |
| new Object[] { getClass().getName(), mContainingAnnotationTypeNames[i] }); |
| } |
| } |
| } |
| } |
| |
| /** |
| * Invokes this annotator's analysis logic. This annotator uses the java regular expression |
| * package to find annotations using the regular expressions defined by its configuration |
| * parameters. |
| * |
| * @param aCAS |
| * the CAS to process |
| * @param aResultSpec |
| * A list of outputs that this annotator should produce. |
| * |
| * @throws AnnotatorProcessException |
| * if a failure occurs during processing. |
| * |
| * @see CasAnnotator_ImplBase#process(CAS) |
| */ |
| public void process(CAS aCAS) throws AnalysisEngineProcessException { |
| try { |
| String docText = aCAS.getDocumentText(); |
| // Determine which regions of the document we are going to annotate |
| int[] rangesToAnnotate = getRangesToAnnotate(aCAS); |
| |
| // We treat the rangesToAnnotate array as a list of (start,end) offset |
| // pairs. Iterate through all of these pairs. |
| for (int i = 0; i < rangesToAnnotate.length; i += 2) { |
| int startPos = rangesToAnnotate[i]; |
| int endPos = rangesToAnnotate[i + 1]; |
| // get the substring of text to be annotated |
| String subText = docText.substring(startPos, endPos); |
| |
| // iterate over all annotation types for which we have patterns |
| for (int j = 0; j < mCASTypes.length; j++) { |
| // see if the ResultSpec contains this type |
| if (getResultSpecification().containsType(mCASTypes[j].getName())) { |
| // try to match each pattern that we have for this annotation type |
| for (int k = 0; k < mPatterns[j].length; k++) { |
| int pos = 0; |
| Matcher matcher = mPatterns[j][k].matcher(subText); |
| while (pos < subText.length() && matcher.find(pos)) { |
| getContext().getLogger().log(Level.FINER, |
| "RegEx match found: [" + matcher.group() + "]"); |
| // match found; extract locations of start and end of match |
| // (or of entire containing annotation, if that option is on) |
| int annotStart, annotEnd; |
| if (mAnnotateEntireContainingAnnotation.booleanValue()) { |
| annotStart = startPos; |
| annotEnd = endPos; |
| } else { |
| annotStart = startPos + matcher.start(); |
| annotEnd = startPos + matcher.end(); |
| } |
| // create Annotation in CAS |
| FeatureStructure fs = aCAS.createAnnotation(mCASTypes[j], annotStart, annotEnd); |
| aCAS.getIndexRepository().addFS(fs); |
| pos = annotEnd - startPos; |
| } |
| } |
| } |
| } |
| } |
| } catch (Exception e) { |
| throw new AnalysisEngineProcessException(e); |
| } |
| } |
| |
| /** |
| * Utility method that determines which subranges of the document text should be annotated by this |
| * annotator. This is done as follows: |
| * <ul> |
| * <li>If <code>mContainingAnnotationTypes</code> is <code>null</code>, the entire document |
| * is eligible for annotation.</li> |
| * <li>If <code>mContainingAnnotationTypes</code> is not <code>null</code>, then each of its |
| * elements is expected to be an Annotation Type name. The CAS is queried for existing annotations |
| * of any of these Types, and the only subranges of the document eligible for annotation are those |
| * subranges contained within such annotations.</li> |
| * </ul> |
| * |
| * @param aCAS |
| * CAS currently being processed |
| * |
| * @return an array of integers indicating the document subranges eligible for annotation. Begin |
| * and end positions of the subranges are stored in successive elements of the array. For |
| * example, elements 0 and 1 are the start and end of the first subrange; elements 2 and 3 |
| * are the start and end of the second subrange, and so on. |
| */ |
| protected int[] getRangesToAnnotate(CAS aCAS) { |
| if (mContainingAnnotationTypes == null || mContainingAnnotationTypes.length == 0) { |
| // ContainingAnnotationTypes is not set - the whole document is eligible |
| return new int[] { 0, aCAS.getDocumentText().length() }; |
| } else { |
| // get iterator over all annotations in the CAS |
| FSIterator iterator = aCAS.getAnnotationIndex().iterator(); |
| |
| // filter the iterator so that only instances of Types in the |
| // mContainingAnnotationTypes array are returned |
| FSTypeConstraint constraint = aCAS.getConstraintFactory().createTypeConstraint(); |
| for (int i = 0; i < mContainingAnnotationTypes.length; i++) { |
| constraint.add(mContainingAnnotationTypes[i]); |
| } |
| iterator = aCAS.createFilteredIterator(iterator, constraint); |
| |
| // iterate over annotations and add them to an ArrayList |
| List annotationList = new ArrayList(); |
| while (iterator.isValid()) { |
| annotationList.add(iterator.get()); |
| iterator.moveToNext(); |
| } |
| |
| // For each Annotation in the list, add its start and end |
| // positions to the result array. |
| int numRanges = annotationList.size(); |
| int[] result = new int[numRanges * 2]; |
| for (int j = 0; j < numRanges; j++) { |
| AnnotationFS curFS = (AnnotationFS) annotationList.get(j); |
| result[j * 2] = curFS.getBegin(); |
| result[j * 2 + 1] = curFS.getEnd(); |
| } |
| return result; |
| } |
| } |
| |
| /** |
| * The regular expression Patterns to be matched. |
| */ |
| private Pattern[][] mPatterns; |
| |
| /** |
| * The names of the CAS types that this annotator produces from the patterns in {@link #mPatterns}. |
| */ |
| private ArrayList mTypeNames; |
| |
| /** |
| * The names of the CAS types within which this annotator will search for new annotations. This |
| * may be null, indicating that the entire document will be searched. |
| */ |
| private String[] mContainingAnnotationTypeNames; |
| |
| /** |
| * The CAS types corresponding to {@link #mTypeNames}. |
| */ |
| private Type[] mCASTypes; |
| |
| /** |
| * The CAS types corresponding to {@link #mContainingAnnotationTypeNames}. |
| */ |
| private Type[] mContainingAnnotationTypes; |
| |
| /** |
| * Whether to annotate the entire span of the containing annotation when a match is found. |
| */ |
| private Boolean mAnnotateEntireContainingAnnotation; |
| |
| } |