| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| package org.apache.uima.conceptMapper; |
| |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Collection; |
| import java.util.HashMap; |
| import java.util.Iterator; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.TreeMap; |
| |
| import org.apache.uima.UimaContext; |
| import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; |
| import org.apache.uima.analysis_engine.AnalysisEngineProcessException; |
| import org.apache.uima.analysis_engine.ResultSpecification; |
| import org.apache.uima.analysis_engine.annotator.AnnotatorConfigurationException; |
| import org.apache.uima.analysis_engine.annotator.AnnotatorInitializationException; |
| import org.apache.uima.cas.CAS; |
| import org.apache.uima.cas.FSIndex; |
| import org.apache.uima.cas.FSIterator; |
| import org.apache.uima.cas.Feature; |
| import org.apache.uima.cas.FeatureStructure; |
| import org.apache.uima.cas.Type; |
| import org.apache.uima.cas.TypeSystem; |
| import org.apache.uima.cas.text.AnnotationFS; |
| import org.apache.uima.cas.text.AnnotationIndex; |
| import org.apache.uima.conceptMapper.support.dictionaryResource.DictionaryResource; |
| import org.apache.uima.conceptMapper.support.dictionaryResource.DictionaryResource.DictEntry; |
| import org.apache.uima.conceptMapper.support.dictionaryResource.EntryProperties; |
| import org.apache.uima.conceptMapper.support.tokens.TokenFilter; |
| import org.apache.uima.conceptMapper.support.tokens.TokenNormalizer; |
| import org.apache.uima.conceptMapper.support.tokens.UnknownTypeException; |
| import org.apache.uima.jcas.JCas; |
| import org.apache.uima.jcas.cas.FSArray; |
| import org.apache.uima.jcas.tcas.Annotation; |
| import org.apache.uima.resource.ResourceInitializationException; |
| |
| public class ConceptMapper extends JCasAnnotator_ImplBase { |
| |
| /** Configuration parameter key/label for the dictionary file to load */ |
| public static final String PARAM_DICT_FILE = "DictionaryFile"; |
| |
| /** |
| * Configuration parameter for name of token class feature of token annotations, to distinguish |
| * classes of tokens to skip during lookups. Token class features are Strings. |
| */ |
| public static final String PARAM_TOKENCLASSFEATURENAME = "TokenClassFeatureName"; |
| |
| private String tokenClassFeatureName; |
| |
| /** |
| * Configuration parameter for name of token type feature of token annotations, to distinguish |
| * types of tokens to skip during lookups. Token type features are Integers |
| */ |
| public static final String PARAM_TOKENTYPEFEATURENAME = "TokenTypeFeatureName"; |
| |
| private String tokenTypeFeatureName; |
| |
| /** Configuration parameter key/label for the annotation name */ |
| public static final String PARAM_ANNOTATION_NAME = "ResultingAnnotationName"; |
| |
| /** |
| * Configuration parameter key/label for the name of the feature that contains the resulting |
| * term's span, i.e. sentence |
| */ |
| public static final String PARAM_ENCLOSINGSPAN = "ResultingEnclosingSpanName"; |
| |
| private String resultEnclosingSpanName; |
| |
| private Feature resultEnclosingSpan; |
| |
| /** |
| * Configuration parameter feature in resulting annotation to store text matched in successful |
| * dict lookup |
| */ |
| public static final String PARAM_MATCHEDFEATURE = "ResultingAnnotationMatchedTextFeature"; |
| |
| private String resultMatchedTextFeatureName; |
| |
| private Feature resultMatchedTextFeature; |
| |
| /** Configuration parameter key/label for the attribute list */ |
| public static final String PARAM_ATTRIBUTE_LIST = "AttributeList"; |
| |
| /** Configuration parameter key/label for the feature list */ |
| public static final String PARAM_FEATURE_LIST = "FeatureList"; |
| |
| /** Configuration parameter giving type of tokens */ |
| public static final String PARAM_TOKENANNOTATION = "TokenAnnotation"; |
| |
| private String tokenAnnotationName; |
| |
| /** |
| * Configuration parameter specifying name of token's feature containing text. If not specified, |
| * the token annotation's covered text is used |
| */ |
| public static final String PARAM_TOKENTEXTFEATURENAME = "TokenTextFeatureName"; |
| |
| private String tokenTextFeatureName; |
| |
| private Feature tokenTextFeature; |
| |
| /** |
| * array of features of the token annotation which should be written back to the token from the |
| * resulting entry. For example, if a Part of Speech is specified as part of a dictionary entry, |
| * it could be written back to the token so that a subsequent POS tagger would be able to use it |
| * as a preannotated item. |
| */ |
| public static final String PARAM_TOKENCLASSWRITEBACKFEATURENAMES = "TokenClassWriteBackFeatureNames"; |
| |
| private String[] tokenClassWriteBackFeatureNames; |
| |
| private Feature[] tokenClassWriteBackFeatures; |
| |
| /** |
| * Configuration parameter for name of feature in result annotations to contain list of matched |
| * tokens |
| */ |
| public static final String PARAM_MATCHEDTOKENSFEATURENAME = "MatchedTokensFeatureName"; |
| |
| private String matchedTokensFeatureName; |
| |
| private Feature matchedTokensFeature; |
| |
| /** |
| * Configuration parameter key/label to indicate if order-independent lookup is to be performed. |
| * If true, words in a phrase are sorted alphabetically before lookup. This implies that a phrase |
| * "C D A" would be considered equivalent to "A C D" and "D A C", etc. |
| */ |
| public static final String PARAM_ORDERINDEPENDENTLOOKUP = "OrderIndependentLookup"; |
| |
| private boolean sortElements; |
| |
| private final static int ContiguousMatch = 1; |
| |
| public static final String PARAMVALUE_CONTIGUOUSMATCH = "ContiguousMatch"; |
| |
| private final static int SkipAnyMatch = 2; |
| |
| public static final String PARAMVALUE_SKIPANYMATCH = "SkipAnyMatch"; |
| |
| private static final int SkipAnyMatchAllowOverlap = 3; |
| |
| public static final String PARAMVALUE_SKIPANYMATCHALLOWOVERLAP = "SkipAnyMatchAllowOverlap"; |
| |
| private final static int DefaultSearchStrategy = ContiguousMatch; |
| |
| // private final static int DefaultSearchStrategy = SkipAnyMatch; |
| |
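| /** |
| * Configuration parameter to indicate search strategy, one of: ContiguousMatch: longest match of |
| * contiguous tokens within enclosing span (taking into account included/excluded items); this is |
| * the DEFAULT strategy. SkipAnyMatch: longest match of noncontiguous tokens within enclosing span |
| * (taking into account included/excluded items); IMPLIES order-independent lookup. |
| * SkipAnyMatchAllowOverlap: like SkipAnyMatch, but after a match the search resumes at the very |
| * next token instead of skipping past the match, so matches may overlap. |
| */ |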
| public static final String PARAM_SEARCHSTRATEGY = "SearchStrategy"; |
| |
| private int searchStrategy = DefaultSearchStrategy; |
| |
| public static final String PARAM_FINDALLMATCHES = "FindAllMatches"; |
| |
| private boolean findAllMatches; |
| |
| /** object used to stem/case normalize text */ |
| private TokenNormalizer tokenNormalizer; |
| |
| private TokenFilter tokenFilter; |
| |
| /** The name of the annotation type posted to the CAS by this TAE */ |
| protected String resultAnnotationName; |
| |
| /** The type of annotation posted to the CAS by this TAE */ |
| protected Type resultAnnotationType; |
| |
| /** The type of token annotations to consider */ |
| protected Type tokenType; |
| |
| /** |
| * Array of Feature objects associated with {@link #resultAnnotationType resultAnnotationType} |
| */ |
| protected Feature features[]; |
| |
| /** Array of feature names, obtained as a configuration parameter. */ |
| protected String featureNames[]; |
| |
| /** |
| * Array of attribute names for the XML dictionary token element, obtained as a configuration |
| * parameter. |
| */ |
| protected String attributeNames[]; |
| |
| /** The dictionary */ |
| private DictionaryResource dict; |
| |
| /** |
| * type of annotation that defines a block for processing, e.g. a sentence |
| */ |
| private static final String PARAM_DATA_BLOCK_FS = "SpanFeatureStructure"; |
| |
| private String spanFeatureStructureName; |
| |
| private Type spanFeatureStructureType; |
| |
| public Logger logger; |
| |
| private JCas jcas; |
| |
| private TypeSystem mLastTypeSystem; |
| |
| private static final String PARAM_TOKENIZERDESCRIPTOR = "TokenizerDescriptorPath"; |
| |
| private static final String UNKNOWN_VALUE = "unknown"; |
| |
| // private FileWriter tokenDebugFile; |
| // private FileWriter potentialMatchDebugFile; |
| // private FileWriter findMatchDebugFile; |
| |
| // private void debugWrite (FileWriter file, String text) |
| // { |
| // try |
| // { |
| // file.write(text + "\n"); |
| // } |
| // catch (Exception e) |
| // { |
| // } |
| // } |
| |
| /** |
| * Initialize the annotator, which includes compilation of regular expressions, fetching |
| * configuration parameters from XML descriptor file, and loading of the dictionary file. |
| */ |
| public void initialize(UimaContext uimaContext) throws ResourceInitializationException { |
| super.initialize(uimaContext); |
| |
| // Process configration parameters |
| try { |
| // logger = new Logger (annotatorContext.getLogger ()); |
| logger = new Logger("ConceptMapper", uimaContext.getLogger()); |
| // tokenDebugFile = new FileWriter("/tmp/cm/tokens."+ |
| // Calendar.getInstance ().getTimeInMillis () + ".txt"); |
| // potentialMatchDebugFile = new FileWriter("/tmp/cm/pm."+ |
| // Calendar.getInstance ().getTimeInMillis () + ".txt"); |
| // findMatchDebugFile = new FileWriter("/tmp/cm/fm."+ |
| // Calendar.getInstance ().getTimeInMillis () + ".txt"); |
| // FileWriter dictDebugFile = new FileWriter("/tmp/cm/dict."+ |
| // Calendar.getInstance ().getTimeInMillis () + ".txt"); |
| |
| tokenAnnotationName = (String) uimaContext |
| .getConfigParameterValue(PARAM_TOKENANNOTATION); |
| String tokenizerDescriptor = (String) uimaContext |
| .getConfigParameterValue(PARAM_TOKENIZERDESCRIPTOR); |
| |
| tokenClassFeatureName = (String) uimaContext |
| .getConfigParameterValue(PARAM_TOKENCLASSFEATURENAME); |
| |
| tokenTypeFeatureName = (String) uimaContext |
| .getConfigParameterValue(PARAM_TOKENTYPEFEATURENAME); |
| |
| resultAnnotationName = (String) uimaContext |
| .getConfigParameterValue(PARAM_ANNOTATION_NAME); |
| resultEnclosingSpanName = (String) uimaContext |
| .getConfigParameterValue(PARAM_ENCLOSINGSPAN); |
| |
| resultMatchedTextFeatureName = (String) uimaContext |
| .getConfigParameterValue(PARAM_MATCHEDFEATURE); |
| |
| featureNames = (String[]) uimaContext.getConfigParameterValue(PARAM_FEATURE_LIST); |
| attributeNames = (String[]) uimaContext.getConfigParameterValue(PARAM_ATTRIBUTE_LIST); |
| |
| spanFeatureStructureName = (String) uimaContext |
| .getConfigParameterValue(PARAM_DATA_BLOCK_FS); |
| |
| tokenTextFeatureName = (String) uimaContext |
| .getConfigParameterValue(PARAM_TOKENTEXTFEATURENAME); |
| tokenClassWriteBackFeatureNames = (String[]) uimaContext |
| .getConfigParameterValue(PARAM_TOKENCLASSWRITEBACKFEATURENAMES); |
| |
| tokenAnnotationName = (String) uimaContext |
| .getConfigParameterValue(PARAM_TOKENANNOTATION); |
| |
| matchedTokensFeatureName = (String) uimaContext |
| .getConfigParameterValue(PARAM_MATCHEDTOKENSFEATURENAME); |
| |
| Boolean sortElementsParam = (Boolean) uimaContext |
| .getConfigParameterValue(PARAM_ORDERINDEPENDENTLOOKUP); |
| sortElements = (sortElementsParam == null) ? false : sortElementsParam.booleanValue(); |
| |
| searchStrategy = detectSearchStrategy((String) uimaContext |
| .getConfigParameterValue(PARAM_SEARCHSTRATEGY)); |
| // System.err.println("SEARCH STRATEGY = " + searchStrategy); |
| |
| Boolean findAllMatchesParam = (Boolean) uimaContext |
| .getConfigParameterValue(PARAM_FINDALLMATCHES); |
| findAllMatches = (findAllMatchesParam == null) ? false : findAllMatchesParam.booleanValue(); |
| |
| // always do order-independent lookup if performing "SkipAnyMatch" |
| // lookups |
| if (searchStrategy == SkipAnyMatch) { |
| sortElements = true; |
| } |
| |
| if (featureNames.length != attributeNames.length) { |
| throw new Exception("AttributeList and FeatureList are inconsistent"); |
| } |
| // for (int i = 0; i < featureNames.length; i++ ) |
| // { |
| // logger.logInfo ("Attribute \"" + attributeNames [i] + "\" mapped |
| // to feature \"" + featureNames [i] + "\""); |
| // } |
| |
| tokenNormalizer = new TokenNormalizer(uimaContext, logger); |
| tokenFilter = new TokenFilter(tokenAnnotationName, tokenTypeFeatureName, |
| tokenClassFeatureName, logger); |
| tokenFilter.initConfig(uimaContext); |
| |
| dict = (DictionaryResource) uimaContext.getResourceObject(PARAM_DICT_FILE); |
| if (!dict.isLoaded()) { |
| // logger.logInfo("dictionary not yet loaded"); |
| dict.loadDictionaryContents(uimaContext, logger, tokenAnnotationName, |
| tokenTypeFeatureName, tokenClassFeatureName, tokenizerDescriptor); |
| // logger.logInfo( "now is loaded: "+dict.toString() ); |
| // System.err.println ("NEW DICTIONARY:\n" + dict.toString()); |
| // debugWrite (dictDebugFile, dict.toString()); |
| } |
| |
| } catch (Exception e) { |
| throw new ResourceInitializationException(e); |
| } |
| } |
| |
| private int detectSearchStrategy(String strategyString) throws AnnotatorConfigurationException { |
| if ((strategyString == null) || (strategyString.equals(""))) { |
| return DefaultSearchStrategy; |
| } else if (strategyString.equals(PARAMVALUE_CONTIGUOUSMATCH)) { |
| return ContiguousMatch; |
| } else if (strategyString.equals(PARAMVALUE_SKIPANYMATCH)) { |
| return SkipAnyMatch; |
| } else if (strategyString.equals(PARAMVALUE_SKIPANYMATCHALLOWOVERLAP)) { |
| return SkipAnyMatchAllowOverlap; |
| } else { |
| throw new AnnotatorConfigurationException(); |
| } |
| } |
| |
| /** |
| * Perform local type system initialization. |
| * |
| * @param typeSystem |
| * the current type system. |
| * @throws AnnotatorConfigurationException - |
| * @throws AnnotatorInitializationException - |
| * @see org.apache.uima.analysis_engine.annotator.TextAnnotator#typeSystemInit(TypeSystem) |
| */ |
| public void typeSystemInit(TypeSystem typeSystem) throws AnnotatorConfigurationException, |
| AnnotatorInitializationException { |
| |
| tokenType = typeSystem.getType(tokenAnnotationName); |
| if (tokenType == null) { |
| logger.logError(PARAM_TOKENANNOTATION + " '" + tokenAnnotationName |
| + "' specified, but does not exist"); |
| throw new AnnotatorInitializationException(); |
| } |
| |
| if ((tokenTextFeatureName == null) || (tokenTextFeatureName.equals(""))) { |
| tokenTextFeature = null; |
| } else { |
| tokenTextFeature = tokenType.getFeatureByBaseName(tokenTextFeatureName); |
| if (tokenTextFeature == null) { |
| logger.logError(PARAM_TOKENTEXTFEATURENAME + " '" + tokenTextFeatureName |
| + "' specified, but does not exist for type: " + tokenType.getName()); |
| throw new AnnotatorInitializationException(); |
| } |
| } |
| |
| if ((tokenClassWriteBackFeatureNames != null) && (tokenClassWriteBackFeatureNames.length > 0)) { |
| tokenClassWriteBackFeatures = new Feature[tokenClassWriteBackFeatureNames.length]; |
| for (int i = 0; i < tokenClassWriteBackFeatureNames.length; i++) { |
| tokenClassWriteBackFeatures[i] = tokenType |
| .getFeatureByBaseName(tokenClassWriteBackFeatureNames[i]); |
| if (tokenClassWriteBackFeatures[i] == null) { |
| logger.logError(PARAM_TOKENCLASSWRITEBACKFEATURENAMES + "[" + i + "] '" |
| + tokenClassWriteBackFeatureNames[i] |
| + "' specified, but does not exist for type: " + tokenType.getName()); |
| throw new AnnotatorInitializationException(); |
| } |
| } |
| } else { |
| tokenClassWriteBackFeatures = null; |
| } |
| |
| spanFeatureStructureType = typeSystem.getType(spanFeatureStructureName); |
| if (spanFeatureStructureType == null) { |
| logger.logError(PARAM_DATA_BLOCK_FS + " '" + spanFeatureStructureName |
| + "' specified, but does not exist for type: " + tokenType.getName()); |
| throw new AnnotatorInitializationException(); |
| } |
| |
| resultAnnotationType = typeSystem.getType(resultAnnotationName); |
| if (resultAnnotationType == null) { |
| logger.logError(PARAM_ANNOTATION_NAME + " '" + resultAnnotationName |
| + "' specified, but does not exist"); |
| throw new AnnotatorInitializationException(); |
| } |
| |
| if ((resultEnclosingSpanName == null) || (resultEnclosingSpanName.equals(""))) { |
| resultEnclosingSpan = null; |
| } else { |
| resultEnclosingSpan = resultAnnotationType.getFeatureByBaseName(resultEnclosingSpanName); |
| if (resultEnclosingSpan == null) { |
| logger.logError(PARAM_ENCLOSINGSPAN + " '" + resultEnclosingSpanName |
| + "' specified, but does not exist for type: " + resultAnnotationType.getName()); |
| throw new AnnotatorInitializationException(); |
| } |
| } |
| |
| if ((resultMatchedTextFeatureName == null) || (resultMatchedTextFeatureName.equals(""))) { |
| resultMatchedTextFeature = null; |
| } else { |
| resultMatchedTextFeature = resultAnnotationType |
| .getFeatureByBaseName(resultMatchedTextFeatureName); |
| if (resultMatchedTextFeature == null) { |
| logger.logError(PARAM_MATCHEDFEATURE + " '" + resultMatchedTextFeatureName |
| + "' specified, but does not exist for type: " + resultAnnotationType.getName()); |
| throw new AnnotatorInitializationException(); |
| } |
| } |
| |
| if ((matchedTokensFeatureName == null) || (matchedTokensFeatureName.equals(""))) { |
| matchedTokensFeature = null; |
| } else { |
| matchedTokensFeature = resultAnnotationType.getFeatureByBaseName(matchedTokensFeatureName); |
| if (matchedTokensFeature == null) { |
| logger.logError(PARAM_MATCHEDTOKENSFEATURENAME + " '" + matchedTokensFeatureName |
| + "' specified, but does not exist for type: " + resultAnnotationType.getName()); |
| throw new AnnotatorInitializationException(); |
| } |
| } |
| |
| int numFeatures = featureNames.length; |
| features = new Feature[numFeatures]; |
| |
| for (int i = 0; i < numFeatures; i++) { |
| features[i] = resultAnnotationType.getFeatureByBaseName(featureNames[i]); |
| if (features[i] == null) { |
| logger.logError(PARAM_FEATURE_LIST + "[" + i + "] '" + featureNames[i] |
| + "' specified, but does not exist for type: " + resultAnnotationType.getName()); |
| // System.err.println (PARAM_FEATURE_LIST + "[" + i + "] '" + |
| // featureNames[i] + "' specified, but does not exist for type: |
| // " + resultAnnotationType.getName()); |
| throw new AnnotatorInitializationException(); |
| } |
| |
| } |
| |
| try { |
| tokenFilter.initTypes(typeSystem); |
| } catch (UnknownTypeException e) { |
| throw new AnnotatorInitializationException(e); |
| } |
| } |
| |
| /** |
| * Perform the actual analysis. Iterate over the document content looking for any matching words |
| * or phrases in the loaded dictionary and post an annotation for each match found. |
| * |
| * @param jCas |
| * the current CAS to process. |
| * @throws AnalysisEngineProcessException - |
| * |
| * @see org.apache.uima.analysis_engine.annotator.TextAnnotator#process(CAS,ResultSpecification) |
| */ |
| public void process(JCas jCas) throws AnalysisEngineProcessException { |
| // System.err.println ("ConceptMapper.process() begin"); |
| |
| CAS tcas = jCas.getCas(); |
| |
| AnnotationFS token; |
| |
| try { |
| //explicitly initialize the type system |
| if(mLastTypeSystem == null){ |
| mLastTypeSystem = jCas.getTypeSystem(); |
| typeSystemInit(mLastTypeSystem); |
| } else { |
| checkTypeSystemChange(tcas); |
| } |
| |
| setJCas(jCas); // this is needed to get around an issue |
| // where UIMA crashes if no JCas is |
| // referenced |
| // logger.setupDocument (getJCas ()); |
| |
| FSIndex dbIndex = tcas.getAnnotationIndex(spanFeatureStructureType); |
| FSIterator spanIterator = dbIndex.iterator(); |
| |
| AnnotationIndex tokenIndex = (AnnotationIndex) tcas.getAnnotationIndex(tokenType); |
| |
| while (spanIterator.hasNext()) { |
| ArrayList<AnnotationFS> tokens = new ArrayList<AnnotationFS>(2048); |
| |
| Annotation spanAnnotation = (Annotation) spanIterator.next(); |
| |
| FSIterator tokenIter = tokenIndex.subiterator(spanAnnotation); |
| |
| // System.err.println ("Tokens:"); |
| |
| // get all tokens for the specified block |
| while (tokenIter.hasNext()) { |
| token = (AnnotationFS) tokenIter.next(); |
| // System.err.print ("--> token: '" + token.getCoveredText() |
| // + "' "); |
| if (tokenFilter.isOK_Token(token, tokenNormalizer)) { |
| // System.err.println("--> ADDING token: " + |
| // token.getCoveredText()); |
| // debugWrite(tokenDebugFile, "--> ADDING token: " + |
| // token.getCoveredText() + ", type: " + |
| // token.getIntValue (tokenTypeFeature) + ", checkType: |
| // " + checkTokenType (token)); |
| |
| tokens.add(token); |
| } |
| // else |
| // { |
| // System.err.println("-->NOT! ADDING token: " + |
| // token.getCoveredText()); |
| // debugWrite(tokenDebugFile, "-->NOT! ADDING token: " + |
| // token.getCoveredText() + ", type: " + token.getIntValue |
| // (tokenTypeFeature) + ", checkType: " + checkTokenType |
| // (token)); |
| // } |
| } |
| // System.err.println (); |
| // logger.logInfo("Number of tokens: " + tokens.size()); |
| |
| switch (searchStrategy) { |
| case SkipAnyMatch: |
| case SkipAnyMatchAllowOverlap: |
| processTokenListSkipAny(searchStrategy, findAllMatches, tcas, tokens, spanAnnotation); |
| break; |
| case ContiguousMatch: |
| processTokenList(searchStrategy, findAllMatches, tcas, tokens, spanAnnotation); |
| break; |
| default: |
| processTokenList(searchStrategy, findAllMatches, tcas, tokens, spanAnnotation); |
| break; |
| } |
| |
| } |
| // logger.logFinest("Number of annotations in CAS: " + |
| // (tcas.getAnnotationIndex().size() - 1)); |
| // System.out.println("Number of annotations in CAS: " + |
| // (tcas.getAnnotationIndex().size() - 1)); |
| } catch (Exception e) { |
| throw new AnalysisEngineProcessException(e); |
| } |
| // System.err.println ("ConceptMapper.process() end"); |
| } |
| |
| |
| /** |
| * Checks if the type system of the given CAS is different from the |
| * last type system this component was operating on. If it is different, |
| * calls the typeSystemInit method on the component. |
| * @param |
| * CAS |
| * @throws AnnotatorInitializationException |
| * @throws AnnotatorConfigurationException |
| */ |
| private void checkTypeSystemChange(CAS aCAS) throws |
| AnalysisEngineProcessException, AnnotatorConfigurationException, AnnotatorInitializationException { |
| TypeSystem typeSystem = aCAS.getTypeSystem(); |
| if (typeSystem != mLastTypeSystem) { |
| typeSystemInit(typeSystem); |
| mLastTypeSystem = typeSystem; |
| } |
| } |
| |
| private void setJCas(JCas jcas) { |
| this.jcas = jcas; |
| } |
| |
| private JCas getJCas() { |
| return this.jcas; |
| } |
| |
| private void processTokenListSkipAny(int searchStrategy, boolean findAllMatches, CAS tcas, |
| ArrayList<AnnotationFS> tokens, Annotation spanAnnotation) { |
| AnnotationFS token; |
| // iterate over vector of tokens |
| |
| ArrayList<String> normalizedTokens = new ArrayList<String>(); |
| |
| // iterate through all tokens within span and collect dict entries for each unique one |
| for (int whichToken = 0; whichToken < tokens.size(); whichToken++) { |
| token = tokens.get(whichToken); |
| String tokenText = getTokenText(token); |
| |
| String word = tokenNormalizer.normalize(tokenText); |
| normalizedTokens.add(word); |
| |
| // logger.logInfo("ENTRY SEARCH/ORIGINAL: " + word + " / " + |
| // tokenText); |
| // System.err.println("ENTRY SEARCH/ORIGINAL: " + word + " / " + |
| // tokenText); |
| } |
| |
| // System.err.println ("processTokenListSkipAny finding matches for " + |
| // normalizedTokens.toString ()); |
| |
| findMatchesSkipAnyToken(searchStrategy, findAllMatches, tcas, tokens, normalizedTokens, |
| findPotentialEntries(normalizedTokens, dict), spanAnnotation); |
| } |
| |
| private Map<String, Collection<DictEntry>> findPotentialEntries( |
| ArrayList<String> normalizedTokens, DictionaryResource dict) { |
| HashMap<String, Collection<DictEntry>> potentialEntries = new HashMap<String, Collection<DictEntry>>(); |
| |
| Iterator<String> tokenIter = normalizedTokens.iterator(); |
| while (tokenIter.hasNext()) { |
| String word = tokenIter.next(); |
| Collection<DictEntry> entries = potentialEntries.get(word); |
| |
| if (entries == null) { |
| entries = new ArrayList<DictEntry>(); |
| } |
| DictionaryResource.DictEntriesByLength entriesByLength = dict.getEntries(word); |
| if (entriesByLength != null) { |
| int shortest = entriesByLength.getShortest().intValue(); |
| int longest = entriesByLength.getLongest().intValue(); |
| for (int currentLength = longest; currentLength >= shortest; currentLength--) { |
| DictionaryResource.DictEntries dictEntries = entriesByLength.getEntries(currentLength); |
| if (dictEntries != null) { |
| ArrayList<DictEntry> entryItems = dictEntries.getEntries(); |
| Iterator<DictEntry> entryIter = entryItems.iterator(); |
| while (entryIter.hasNext()) { |
| DictionaryResource.DictEntry entry = (DictionaryResource.DictEntry) entryIter.next(); |
| // System.err.println("entryIter = " + entryIter + |
| // ", Entry: " + entry.getText ()); |
| // debugWrite (potentialMatchDebugFile, "Entry: " + |
| // entry.getText ()); |
| if ((containsAll (normalizedTokens, entry.getElements())) && (!entries.contains(entry))) { |
| entries.add(entry); |
| // System.err.println ("Added potential match: " |
| // + entry); |
| // debugWrite (potentialMatchDebugFile, "Added |
| // potential match: " + entry); |
| } |
| } |
| } |
| } |
| } |
| |
| potentialEntries.put(word, entries); |
| |
| } |
| return potentialEntries; |
| } |
| |
| private boolean containsAll (List<String> container, String[] contained) |
| { |
| for (String item : contained) |
| { |
| if (! container.contains (item)) |
| { |
| return false; |
| } |
| } |
| return true; |
| } |
| /** |
| * @param searchStrategy |
| * @param tcas |
| * @param tokens |
| * list of token annotations |
| * @param normalizedTokens |
| * list of token annotations as strings |
| * @param potentialEntries |
| * list of possible matches from dictionary |
| * @param spanAnnotation |
| */ |
| private void findMatchesSkipAnyToken(int searchStrategy, boolean findAllMatches, CAS tcas, |
| ArrayList<AnnotationFS> tokens, ArrayList<String> normalizedTokens, |
| Map<String, Collection<DictEntry>> potentialEntries, Annotation spanAnnotation) { |
| int whichToken = 0; // use index instead of iterator to simplify walking |
| // through parallel arrays (tokens/normalizedTokens) |
| |
| while (whichToken < normalizedTokens.size()) { |
| // System.err.println ("findMatchesSkipAnyToken(), whichToken = " + |
| // whichToken + ", token: " + (String) normalizedTokens.get |
| // (whichToken)); |
| Collection<DictEntry> entries = potentialEntries.get(normalizedTokens.get(whichToken)); |
| if (entries == null) { |
| whichToken += 1; |
| } else { |
| Iterator<DictEntry> entryIter = entries.iterator(); |
| boolean foundMatch = false; |
| while ((entryIter.hasNext() && (!foundMatch))) { |
| DictionaryResource.DictEntry entry = entryIter.next(); |
| |
| // System.err.println("entryIter = " + entryIter + ", Entry: |
| // " + entry.getText ()); |
| // debugWrite (findMatchDebugFile, "Entry: " + entry.getText |
| // ()); |
| // System.err.println("remainingTokens = " + |
| // normalizedTokens.subList (whichToken, |
| // normalizedTokens.size ()).toString ()); |
| // debugWrite (findMatchDebugFile, "remainingTokens = " + |
| // normalizedTokens.subList (whichToken, |
| // normalizedTokens.size ()).toString ()); |
| |
| if (containsAll (normalizedTokens.subList(whichToken, normalizedTokens.size()), |
| entry.getElements())) { |
| int lengthOfMatch = processMatch(tcas, tokens, normalizedTokens, spanAnnotation, |
| whichToken, entry); |
| if (!findAllMatches) { |
| foundMatch = true; |
| if (searchStrategy == SkipAnyMatchAllowOverlap) { |
| whichToken += 1; |
| } else { |
| whichToken += lengthOfMatch; |
| } |
| // System.err.println ("Processed match, whichToken |
| // = " + whichToken); |
| // debugWrite (findMatchDebugFile, "Processed match, |
| // whichToken = " + whichToken); |
| } |
| } |
| } |
| if (!foundMatch) { |
| whichToken += 1; |
| } |
| } |
| } |
| } |
| |
| /** |
| * @param tcas |
| * @param tokens |
| * list of token annotations |
| * @param normalizedTokens |
| * list of token annotations as strings |
| * @param spanAnnotation |
| * @param whichToken |
| * current token index (for tokens/normalizedTokens) |
| * @param entry |
| * matching dict entry |
| * @return length of match (in tokens) |
| */ |
| private int processMatch(CAS tcas, ArrayList<AnnotationFS> tokens, |
| ArrayList<String> normalizedTokens, Annotation spanAnnotation, int whichToken, |
| DictionaryResource.DictEntry entry) { |
| int startingPoint = whichToken; |
| TreeMap<String, Integer> entryOccurences = findEntryOccurences(entry.getElements(), whichToken); |
| int begin = -1; |
| int end = 0; |
| StringBuilder matchedText = new StringBuilder(); |
| |
| // while there are still items to match against |
| ArrayList<AnnotationFS> matched = new ArrayList<AnnotationFS>(); |
| while ((!entryOccurences.isEmpty()) && (whichToken < normalizedTokens.size())) { |
| String currentTokenText = normalizedTokens.get(whichToken); |
| // System.err.println ("matchedText: '" + matchedText + "', |
| // whichToken = " + whichToken + ", currentTokenText: " + |
| // currentTokenText); |
| |
| // if the dict entry contains at least one more of the current |
| // token, process it |
| Integer count = entryOccurences.get(currentTokenText); |
| if (count != null) { |
| if (matchedText.length() != 0) { |
| matchedText.append(' '); |
| } |
| matchedText.append(currentTokenText); |
| // System.err.println ("matchedText: '" + matchedText + "'"); |
| |
| AnnotationFS realToken = tokens.get(whichToken); |
| // System.err.println ("realToken: '" + realToken.getCoveredText |
| // () + ", count.intValue () = " + count.intValue ()); |
| |
| begin = (begin == -1) ? realToken.getBegin() : Math.min(begin, realToken.getBegin()); |
| end = Math.max(end, realToken.getEnd()); |
| matched.add(realToken); |
| // decrement count, or remove entry if none left |
| if (count.intValue() == 1) { |
| entryOccurences.remove(currentTokenText); |
| } else { |
| entryOccurences.put(currentTokenText, Integer.valueOf (count.intValue() - 1)); |
| } |
| } |
| |
| whichToken += 1; |
| } |
| if (entryOccurences.isEmpty()) { |
| // System.err.println ("makeAnnotation, text: " + |
| // matchedText.toString ()); |
| makeAnnotation(tcas, begin, end, entry.getProperties(), spanAnnotation, matchedText |
| .toString(), matched, logger); |
| } |
| // else |
| // { |
| // System.err.println ("whichToken = " + whichToken + ", |
| // normalizedTokens.size = " + normalizedTokens.size ()); |
| // } |
| |
| return whichToken - startingPoint; |
| } |
| |
| // generate a map from tokens to number of occurences of that token |
| private TreeMap<String, Integer> findEntryOccurences(String[] normalizedTokens, |
| int whichToken) { |
| TreeMap<String, Integer> result = new TreeMap<String, Integer>(); |
| |
| for (String token : normalizedTokens) { |
| Integer count = result.get(token); |
| if (count == null) { |
| count = Integer.valueOf (1); |
| } else { |
| count = Integer.valueOf (count.intValue() + 1); |
| } |
| result.put(token, count); |
| |
| } |
| return result; |
| } |
| |
| /** |
| * @param searchStrategy - |
| * @param findAllMatches true to find all matches |
| * @param tcas the Cas |
| * @param tokens - |
| * @param spanAnnotation - |
| */ |
| protected void processTokenList(int searchStrategy, boolean findAllMatches, CAS tcas, |
| ArrayList<AnnotationFS> tokens, Annotation spanAnnotation) { |
| AnnotationFS token; |
| // iterate over vector of tokens |
| |
| int whichToken = 0; |
| int entryLength = 0; |
| |
| while (whichToken < tokens.size()) { |
| token = tokens.get(whichToken); |
| String tokenText = getTokenText(token); |
| entryLength = 0; |
| |
| String word = tokenNormalizer.normalize(tokenText); |
| |
| // logger.logInfo("ENTRY SEARCH/ORIGINAL: " + word + " / " + |
| // tokenText); |
| // System.err.println("ENTRY SEARCH/ORIGINAL: " + word + ", Token[" |
| // + whichToken + "]: " + tokenText); |
| |
| DictionaryResource.DictEntriesByLength entriesByLength = dict.getEntries(word); |
| if (entriesByLength != null) { |
| entryLength = Math.min(entriesByLength.getLongest().intValue(), |
| (tokens.size() - whichToken)); |
| // logger.logInfo("ENTRY FOUND for: " + word + ", longest: " + |
| // entryLength + ", shortest: " + minLength); |
| // System.err.println("ENTRY FOUND for: " + word + ", longest: " |
| // + entryLength + ", shortest: " + minLength); |
| // System.err.println("ENTRY FOUND for: " + word + ", longest: " |
| // + entryLength); |
| |
| entryLength = defaultMatcher(findAllMatches, tcas, tokens, spanAnnotation, whichToken, |
| entryLength, token.getBegin(), entriesByLength, entriesByLength.getShortest() |
| .intValue()); |
| |
| } |
| whichToken += entryLength + 1; |
| } |
| } |
| |
| private int defaultMatcher(boolean findAllMatches, CAS tcas, ArrayList<AnnotationFS> tokens, |
| Annotation spanAnnotation, int whichToken, int entryLength, int start, |
| DictionaryResource.DictEntriesByLength lengthEntries, int minLength) { |
| boolean entryFound = false; |
| // search through all entry lengths, as necessary |
| while ((!entryFound) && (entryLength >= minLength)) { |
| String [] tokensToMatch = buildTokensToMatchArray(tokens, whichToken, entryLength, sortElements); |
| //System.err.print(">>> tokensToMatch: '"); |
| //for (String token : tokensToMatch) { |
| // System.err.print(token + " "); |
| //} |
| //System.err.println("'"); |
| DictionaryResource.DictEntries entriesByLength = lengthEntries.getEntries(entryLength); |
| // System.err.println(">>> entriesByLength = " + entriesByLength); |
| if (entriesByLength != null) { |
| ArrayList<DictionaryResource.DictEntry> entries = entriesByLength.getEntries(); |
| Collection <DictionaryResource.DictEntry> resultEntries = findMatchingEntry(entries, tokensToMatch); |
| Iterator<DictionaryResource.DictEntry> resultEntriesIterator = resultEntries.iterator(); |
| AnnotationFS endToken = tokens.get(whichToken + entryLength - 1); |
| |
| while (resultEntriesIterator.hasNext()) { |
| DictionaryResource.DictEntry dictEntry = resultEntriesIterator.next (); |
| // System.err.println("===> MATCH: '" + tokensToMatch + "'"); |
| |
| // System.err.println(">>>"+dictEntry.getUnsorted() ); |
| makeAnnotation(tcas, start, endToken.getEnd(), dictEntry.getProperties(), spanAnnotation, |
| dictEntry.getUnsorted(), tokens.subList(whichToken, whichToken + entryLength), |
| logger); |
| |
| updateTokenAnnotations(tokens, whichToken, entryLength, dictEntry); |
| if (!findAllMatches) { |
| entryFound = true; |
| } |
| } |
| } |
| entryLength--; |
| } |
| if (!entryFound) { |
| entryLength = 0; |
| } |
| return entryLength; |
| } |
| |
| /** |
| * update token annotations with value stored in dictionary for feature provided by |
| * tokenClassFeatureName |
| * |
| * @param tokens |
| * @param whichToken |
| * @param entryLength |
| * @param dictEntry |
| */ |
| private void updateTokenAnnotations(ArrayList<AnnotationFS> tokens, int whichToken, |
| int entryLength, DictEntry dictEntry) { |
| if (tokenClassWriteBackFeatures != null) { |
| for (int feature = 0; feature < tokenClassWriteBackFeatures.length; feature++) { |
| if (tokenClassWriteBackFeatures[feature] != null) { |
| String propVal = dictEntry.getProperties().getProperty( |
| tokenClassWriteBackFeatureNames[feature], UNKNOWN_VALUE); |
| // System.err.println ("propVal: " + ": " + propVal); |
| for (int i = whichToken; i < whichToken + entryLength; i++) { |
| AnnotationFS tokenToUpdate = tokens.get(i); |
| // System.err.println ("Token: " + tokenToUpdate.getText |
| // ()); |
| tokenToUpdate.setStringValue(tokenClassWriteBackFeatures[feature], propVal); |
| } |
| } |
| } |
| } |
| } |
| |
| /** |
| * @param tcas - |
| * @param start - |
| * @param end - |
| * @param properties - |
| * @param spanAnnotation - |
| * @param matchedText - |
| * @param matched - |
| * @param log - |
| |
| */ |
| protected void makeAnnotation(CAS tcas, int start, int end, EntryProperties properties, |
| Annotation spanAnnotation, String matchedText, Collection<AnnotationFS> matched, |
| Logger log) { |
| AnnotationFS annotation = tcas.createAnnotation(resultAnnotationType, start, end); |
| if (resultEnclosingSpan != null) { |
| annotation.setFeatureValue(resultEnclosingSpan, spanAnnotation); |
| } |
| |
| if (resultMatchedTextFeature != null) { |
| annotation.setStringValue(resultMatchedTextFeature, matchedText); |
| } |
| |
| if (matchedTokensFeature != null) { |
| FSArray matchedTokens = new FSArray(getJCas(), matched.size()); |
| FeatureStructure[] featureStructArray = new FeatureStructure[matched.size()]; |
| matched.toArray(featureStructArray); |
| matchedTokens.copyFromArray(featureStructArray, 0, 0, featureStructArray.length); |
| annotation.setFeatureValue(matchedTokensFeature, matchedTokens); |
| /* |
| * FSArray tmp = (FSArray) annotation.getFeatureValue (matchedTokensFeature); FeatureStructure [] |
| * tmpfs = tmp.toArray (); System.err.println ("FSArray: begin"); for (int i = 0; i < |
| * tmpfs.length; i++) { System.err.println (((Annotation) tmpfs[i]).getCoveredText ()); } |
| * System.err.println ("FSArray: done"); |
| */ |
| } |
| |
| for (int featIndex = 0; featIndex < features.length; featIndex++) { |
| if (features[featIndex] != null) { |
| annotation.setStringValue(features[featIndex], properties.getProperty( |
| attributeNames[featIndex], UNKNOWN_VALUE)); |
| } else { |
| |
| // String message = "Feature '" + features[featIndex].getName() + "' not found in type '" + |
| // resultAnnotationName + "'"; |
| |
| String message = "Feature '" + featIndex + "' not found in type '" + resultAnnotationName |
| + "'"; |
| // System.err.println(message); |
| |
| log.logWarning(message); |
| } |
| } |
| |
| tcas.getIndexRepository().addFS(annotation); |
| } |
| |
| /** |
| * @param entries |
| * @param tokensToMatch |
| * @return |
| */ |
| private Collection<DictEntry> findMatchingEntry(ArrayList<DictionaryResource.DictEntry> entries, |
| String [] tokensToMatch) { |
| //System.err.print("Searching for: '"); |
| //for (String token : tokensToMatch) { |
| // System.err.print(token + " "); |
| //} |
| //System.err.println("'"); |
| |
| |
| Collection<DictEntry> result = new ArrayList<DictEntry> (); |
| |
| for (int i = 0; i < entries.size(); i++) { |
| DictionaryResource.DictEntry dictEntry = entries.get(i); |
| String[] entryText = dictEntry.getElements(); |
| |
| // System.err.println("--> trying: '" + entryText.toString() + "'"); |
| |
| if (entryText.length == tokensToMatch.length) |
| { |
| boolean match = true; |
| int item = 0; |
| for (String entryTextItem : entryText) |
| { |
| if (! entryTextItem.equals(tokensToMatch[item])) |
| { |
| match = false; |
| break; |
| } |
| item += 1; |
| } |
| if (match) { |
| result.add (dictEntry); |
| } |
| } |
| } |
| return result; |
| } |
| |
| /** |
| * @param tokens |
| * @param length |
| * @return |
| */ |
| private String[] buildTokensToMatchArray(ArrayList<AnnotationFS> tokens, int startIndex, int length, |
| boolean sortElements) { |
| String[] elements = new String[length]; |
| for (int i = startIndex; i < length + startIndex; i++) { |
| AnnotationFS token = tokens.get(i); |
| elements[i - startIndex] = tokenNormalizer.normalize(getTokenText(token)); |
| } |
| |
| if (sortElements) { |
| Arrays.sort(elements); |
| } |
| |
| return elements; |
| } |
| |
| private String getTokenText(AnnotationFS token) { |
| if (tokenTextFeature == null) { |
| return token.getCoveredText(); |
| } else { |
| return token.getStringValue(tokenTextFeature); |
| } |
| } |
| } |