| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| package org.apache.uima.lucas.indexer.analysis; |
| |
| import java.io.IOException; |
| import java.text.Format; |
| import java.util.Collection; |
| import java.util.Collections; |
| import java.util.Iterator; |
| import java.util.LinkedList; |
| import java.util.List; |
| import java.util.Map; |
| |
| import org.apache.log4j.Logger; |
| import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.uima.cas.CAS; |
| import org.apache.uima.cas.CASException; |
| import org.apache.uima.cas.Feature; |
| import org.apache.uima.cas.FeatureStructure; |
| import org.apache.uima.cas.Type; |
| import org.apache.uima.jcas.JCas; |
| import org.apache.uima.jcas.cas.FSArray; |
| import org.apache.uima.jcas.cas.StringArray; |
| import org.apache.uima.jcas.tcas.Annotation; |
| |
| import com.google.common.base.Predicate; |
| import com.google.common.collect.ImmutableBiMap; |
| import com.google.common.collect.Iterators; |
| import com.google.common.collect.Lists; |
| |
| /** |
| * |
| * AnnotationTokenStream represents a TokenStream which extracts tokens from |
| * feature values of annotations of a given type from a JCas object. Each token |
| * has the start and end offset from the annotation object. This class supports |
| * only the following UIMA JCas types of features: |
| * <ol> |
| * <li>String</li> |
| * <li>StringArray</li> |
| * <li>FSArray</li> |
| * <li>Number types</li> |
| * </ol> |
| */ |
| public class AnnotationTokenStream extends TokenStream { |
| |
| private JCas jCas; |
| |
| private String featurePath; |
| |
| private List<String> featureNames; |
| |
| private String delimiter; |
| |
| private Iterator<Annotation> annotationIterator; // iterates over |
| // annotations |
| |
| private Iterator<FeatureStructure> featureStructureIterator; // iterates |
| // over |
| // feature |
| // structures |
| // stored in |
| // feature |
| // arrays of |
| // an |
| // annotation |
| |
| private Iterator<String> featureValueIterator; // iterates over the features |
| // of a feature |
| // structure |
| |
| private Annotation currentAnnotation; |
| |
| private Type annotationType; |
| |
| private Map<String, Format> featureFormats; // a optional map of format |
| // object for each feature |
| |
| private static Logger logger = Logger |
| .getLogger(AnnotationTokenStream.class); |
| |
| private class NotNullPredicate<T> implements Predicate<T> { |
| |
| public boolean apply(T object) { |
| return object != null; |
| } |
| } |
| |
/**
 * Builds a token stream over all annotations of one type in a view of the
 * given JCas. No feature names are configured, so each token carries the
 * covered text of its annotation as term text and the begin and end of the
 * annotation as offsets.
 *
 * @param jCas
 *            the JCas to read the annotations from
 * @param sofaName
 *            the name of the subject of analysis (sofa) whose view is used
 * @param typeName
 *            the name of the annotation type
 * @throws InvalidTokenSourceException
 *             if the view cannot be obtained or the type is not found
 */
public AnnotationTokenStream(JCas jCas, String sofaName, String typeName)
        throws InvalidTokenSourceException {
    this(jCas, sofaName, typeName,
            null,
            Collections.<String> emptyList(),
            null,
            Collections.<String, Format> emptyMap());
}
| |
/**
 * Builds a token stream that reads a single feature from all annotations of
 * one type in a view of the given JCas. Each token uses the feature value as
 * term text and the begin and end of the annotation as offsets.
 *
 * @param jCas
 *            the JCas to read the annotations from
 * @param sofaName
 *            the name of the subject of analysis (sofa) whose view is used
 * @param typeName
 *            the name of the annotation type
 * @param featureName
 *            the base name of the feature that supplies the term text
 * @param featureFormat
 *            optional format used to convert the feature value to a string;
 *            when {@code null} no format is registered
 * @throws InvalidTokenSourceException
 *             if the view cannot be obtained, the type is not found or the
 *             type does not declare the feature
 */
public AnnotationTokenStream(JCas jCas, String sofaName, String typeName,
        String featureName, Format featureFormat)
        throws InvalidTokenSourceException {
    this(jCas, sofaName, typeName,
            null,
            Lists.newArrayList(featureName),
            null,
            featureFormat == null
                    ? Collections.<String, Format> emptyMap()
                    : ImmutableBiMap.of(featureName, featureFormat));
}
| |
/**
 * Builds a token stream that reads several features from all annotations of
 * one type in a view of the given JCas. Each token uses the begin and end of
 * the annotation as offsets. No feature path is used, so the features are
 * read from the annotation itself.
 *
 * @param jCas
 *            the JCas to read the annotations from
 * @param sofaName
 *            the name of the subject of analysis (sofa) whose view is used
 * @param typeName
 *            the name of the annotation type
 * @param featureNames
 *            the base names of the features that supply the term text
 * @param delimiter
 *            when not {@code null}, the values read for one annotation are
 *            concatenated into a single token, separated by this string;
 *            when {@code null}, the values of the individual features are
 *            not joined and are emitted as separate tokens
 * @param featureFormats
 *            optional map of format objects used to convert feature values
 *            to strings; the key must be the feature name
 * @throws InvalidTokenSourceException
 *             if the view cannot be obtained, the type is not found or the
 *             type does not declare one of the features
 */
public AnnotationTokenStream(JCas jCas, String sofaName, String typeName,
        List<String> featureNames, String delimiter,
        Map<String, Format> featureFormats)
        throws InvalidTokenSourceException {
    this(jCas, sofaName, typeName,
            null,
            featureNames,
            delimiter,
            featureFormats);
}
| |
/**
 * Builds a token stream that reads several features from all annotations of
 * one type in a view of the given JCas. Each token uses the begin and end of
 * the annotation as offsets. Neither a feature path nor a delimiter is used,
 * so the features are read from the annotation itself and their values are
 * not joined.
 *
 * @param jCas
 *            the JCas to read the annotations from
 * @param sofaName
 *            the name of the subject of analysis (sofa) whose view is used
 * @param typeName
 *            the name of the annotation type
 * @param featureNames
 *            the base names of the features that supply the term text
 * @param featureFormats
 *            optional map of format objects used to convert feature values
 *            to strings; the key must be the feature name
 * @throws InvalidTokenSourceException
 *             if the view cannot be obtained, the type is not found or the
 *             type does not declare one of the features
 */
public AnnotationTokenStream(JCas jCas, String sofaName, String typeName,
        List<String> featureNames, Map<String, Format> featureFormats)
        throws InvalidTokenSourceException {
    this(jCas, sofaName, typeName,
            null,
            featureNames,
            null,
            featureFormats);
}
| |
/**
 * Builds a token stream that reads features from feature structures which
 * are reachable from the annotations of one type through a feature path.
 * For example, if an annotation of type person has a feature "address"
 * whose values are address feature structures with features for street,
 * postal code and city, the featurePath "address" together with the
 * featureNames "postalCode" and "city" yields the postal code and city of
 * each person's address. Each token uses the begin and end of the annotation
 * as offsets. No delimiter is used, so the values are not joined.
 *
 * @param jCas
 *            the JCas to read the annotations from
 * @param sofaName
 *            the name of the subject of analysis (sofa) whose view is used
 * @param typeName
 *            the name of the annotation type
 * @param featurePath
 *            the path to the feature structures whose features are used for
 *            the tokens; path entries are separated by ".", for example
 *            "affiliation.address.country"
 * @param featureNames
 *            the base names of the features that supply the term text
 * @param featureFormats
 *            optional map of format objects used to convert feature values
 *            to strings; the key must be the feature name
 * @throws InvalidTokenSourceException
 *             if the view cannot be obtained, the type is not found, the
 *             path cannot be resolved or a feature is not declared
 */
public AnnotationTokenStream(JCas jCas, String sofaName, String typeName,
        String featurePath, List<String> featureNames,
        Map<String, Format> featureFormats)
        throws InvalidTokenSourceException {
    this(jCas, sofaName, typeName,
            featurePath,
            featureNames,
            null,
            featureFormats);
}
| |
/**
 * Builds a token stream that reads features from feature structures which
 * are reachable from the annotations of one type through a feature path.
 * For example, if an annotation of type person has a feature "address"
 * whose values are address feature structures with features for street,
 * postal code and city, the featurePath "address" together with the
 * featureNames "postalCode" and "city" yields the postal code and city of
 * each person's address. Each token uses the begin and end of the annotation
 * as offsets.
 *
 * All other constructors delegate to this one. It stores the configuration,
 * resolves the view and the annotation type, validates the feature path and
 * the feature names against the type system and positions the iterators on
 * the first annotation.
 *
 * @param jCas
 *            the JCas to read the annotations from
 * @param sofaName
 *            the name of the subject of analysis (sofa) whose view is used
 * @param typeName
 *            the name of the annotation type
 * @param featurePath
 *            the path to the feature structures whose features are used for
 *            the tokens; path entries are separated by ".", for example
 *            "affiliation.address.country"; {@code null} means the features
 *            are read from the annotation itself
 * @param featureNames
 *            the base names of the features that supply the term text;
 *            {@code null} is treated like an empty list, in which case the
 *            covered text of the annotation is used
 * @param delimiter
 *            when not {@code null}, the values read for one feature
 *            structure are concatenated into a single token, separated by
 *            this string
 * @param featureFormats
 *            optional map of format objects used to convert feature values
 *            to strings; the key must be the feature name; {@code null} is
 *            treated like an empty map
 * @throws InvalidTokenSourceException
 *             if the view cannot be obtained, the type is not found, the
 *             path cannot be resolved or a feature is not declared
 */
public AnnotationTokenStream(JCas jCas, String sofaName, String typeName,
        String featurePath, List<String> featureNames, String delimiter,
        Map<String, Format> featureFormats)
        throws InvalidTokenSourceException {
    super();

    this.featurePath = featurePath;
    // Normalise the optional collections so that later code can iterate and
    // look up values without null checks.
    this.featureNames = featureNames != null ? featureNames
            : Collections.<String> emptyList();
    this.delimiter = delimiter;
    this.featureFormats = featureFormats != null ? featureFormats
            : Collections.<String, Format> emptyMap();

    getSofaCas(jCas, sofaName);
    getTypeForName(typeName);
    validate(annotationType, this.featureNames, featurePath);

    initializeIterators();
}
| |
| private void getTypeForName(String typeName) |
| throws InvalidTokenSourceException { |
| annotationType = jCas.getTypeSystem().getType(typeName); |
| if (annotationType == null) |
| throw new InvalidTokenSourceException("Type " + typeName |
| + " not found!"); |
| } |
| |
| private void getSofaCas(JCas cas, String sofaName) |
| throws InvalidTokenSourceException { |
| try { |
| jCas = cas.getView(sofaName); |
| } catch (CASException e) { |
| throw new InvalidTokenSourceException(e); |
| } |
| } |
| |
| void validate(Type type, Collection<String> featureNames, String featurePath) |
| throws InvalidTokenSourceException { |
| Type typeToValidate = findTypeWithPath(type, featurePath); |
| |
| for (String featureName : featureNames) { |
| Feature feature = typeToValidate.getFeatureByBaseName(featureName); |
| if (feature == null) |
| throw new InvalidTokenSourceException("Type " |
| + typeToValidate.getName() + " has no feature " |
| + featureName + ". featurePath: " + featurePath); |
| } |
| } |
| |
| private Type findTypeWithPath(Type type, String featurePath) |
| throws InvalidTokenSourceException { |
| if (featurePath == null) |
| return type; |
| |
| String[] featurePathElements = featurePath.split("\\."); |
| Type currentType = type; |
| |
| for (String featurePathElement : featurePathElements) { |
| Feature feature = currentType |
| .getFeatureByBaseName(featurePathElement); |
| if (feature == null) |
| throw new InvalidTokenSourceException("Type " |
| + currentType.getName() + " has no feature " |
| + featurePathElement); |
| |
| currentType = feature.getRange(); |
| if (currentType.isArray()) |
| currentType = currentType.getComponentType(); |
| } |
| |
| return currentType; |
| } |
| |
| @Override |
| public Token next(Token token) throws IOException { |
| while (!featureValueIterator.hasNext()) { |
| while (!featureStructureIterator.hasNext()) { |
| if (!annotationIterator.hasNext()) |
| return null; |
| currentAnnotation = (Annotation) annotationIterator.next(); |
| featureStructureIterator = createFeatureStructureIterator( |
| currentAnnotation, featurePath); |
| } |
| |
| featureValueIterator = createFeatureValueIterator( |
| featureStructureIterator.next(), featureNames); |
| } |
| |
| // If we don't do that we will get problems e.g. with the |
| // HypernymFilter: The tokens are re-used by Lucene 2.9.3 and when the |
| // positionIncrement has once been set to 0, it will stay this way until |
| // it is explicitly set to another value. |
| token.reinit(new Token()); |
| |
| token.setStartOffset(currentAnnotation.getBegin()); |
| token.setEndOffset(currentAnnotation.getEnd()); |
| |
| char[] value = featureValueIterator.next().toCharArray(); |
| token.setTermBuffer(value, 0, value.length); |
| return token; |
| } |
| |
| /* |
| * (non-Javadoc) |
| * |
| * @see org.apache.lucene.analysis.TokenStream#next() |
| */ |
| @Override |
| public Token next() throws IOException { |
| return next(new Token()); |
| } |
| |
| protected void initializeIterators() { |
| annotationIterator = Iterators.filter( |
| jCas.getAnnotationIndex(annotationType).iterator(), |
| new NotNullPredicate<Annotation>()); |
| |
| if (!annotationIterator.hasNext()) { |
| featureStructureIterator = Iterators.emptyIterator(); |
| featureValueIterator = Iterators.emptyIterator(); |
| return; |
| } |
| |
| currentAnnotation = (Annotation) annotationIterator.next(); |
| featureStructureIterator = createFeatureStructureIterator( |
| currentAnnotation, featurePath); |
| if (!featureStructureIterator.hasNext()) { |
| featureValueIterator = Iterators.emptyIterator(); |
| return; |
| } |
| |
| FeatureStructure featureStructure = featureStructureIterator.next(); |
| featureValueIterator = createFeatureValueIterator(featureStructure, |
| featureNames); |
| } |
| |
| protected Iterator<FeatureStructure> createFeatureStructureIterator( |
| Annotation annotation, String featurePath) { |
| Collection<FeatureStructure> featureStructures = new LinkedList<FeatureStructure>(); |
| Collection<FeatureStructure> childs = new LinkedList<FeatureStructure>(); |
| |
| if (featurePath == null) { |
| featureStructures.add(annotation); |
| return featureStructures.iterator(); |
| } |
| |
| Type currentType = annotation.getType(); |
| if (currentType.isArray()) |
| currentType = currentType.getComponentType(); |
| |
| String[] pathEntries = featurePath.split("\\."); |
| featureStructures.add(annotation); |
| |
| for (String pathEntry : pathEntries) { |
| Feature feature = currentType.getFeatureByBaseName(pathEntry); |
| childs.clear(); |
| |
| if (feature.getRange().isArray()) { |
| for (FeatureStructure featureStructureItem : featureStructures) { |
| FSArray fsArray = (FSArray) featureStructureItem |
| .getFeatureValue(feature); |
| if (fsArray == null) |
| continue; |
| |
| for (int i = 0; i < fsArray.size(); i++) |
| childs.add(fsArray.get(i)); |
| } |
| } else |
| for (FeatureStructure featureStructureItem : featureStructures) |
| childs.add(featureStructureItem.getFeatureValue(feature)); |
| |
| currentType = feature.getRange(); |
| if (currentType.isArray()) |
| currentType = currentType.getComponentType(); |
| |
| featureStructures.clear(); |
| featureStructures.addAll(childs); |
| } |
| |
| return Iterators.filter(featureStructures.iterator(), |
| new NotNullPredicate<FeatureStructure>()); |
| } |
| |
| protected Iterator<String> createFeatureValueIterator( |
| FeatureStructure srcFeatureStructure, |
| Collection<String> featureNames) { |
| List<String> values = new LinkedList<String>(); |
| Type featureType = srcFeatureStructure.getType(); |
| |
| if (featureNames.size() == 0) |
| values.add(currentAnnotation.getCoveredText()); |
| |
| for (String featureName : featureNames) { |
| Feature feature = featureType.getFeatureByBaseName(featureName); |
| if (feature.getRange().isArray()) { |
| StringArray fsArray = (StringArray) srcFeatureStructure |
| .getFeatureValue(feature); |
| if (featureNames.size() == 1) { |
| for (int i = 0; i < fsArray.size(); i++) |
| values.add(fsArray.get(i).toString()); |
| } else { |
| String value = ""; |
| for (int i = 0; i < fsArray.size(); i++) { |
| value = value.concat(fsArray.get(i).toString()); |
| if (i < fsArray.size() - 1) |
| value = value.concat(delimiter); |
| } |
| values.add(value); |
| } |
| } else |
| values.add(getValueForFeature(srcFeatureStructure, feature, |
| featureFormats.get(feature.getShortName()))); |
| } |
| String value = ""; |
| if (delimiter != null) { |
| for (int i = 0; i < values.size(); i++) { |
| if (values.get(i) == null) |
| continue; |
| |
| value = value.concat(values.get(i)); |
| if (i < values.size() - 1) |
| value = value.concat(delimiter); |
| } |
| values.clear(); |
| values.add(value); |
| } |
| |
| return Iterators.filter(values.iterator(), |
| new NotNullPredicate<String>()); |
| } |
| |
| public String getValueForFeature(FeatureStructure featureStructure, |
| Feature feature, Format format) { |
| if (format == null) |
| return featureStructure.getFeatureValueAsString(feature); |
| else { |
| Object value = null; |
| if (feature.getRange().getName().equals(CAS.TYPE_NAME_DOUBLE)) |
| value = featureStructure.getDoubleValue(feature); |
| else if (feature.getRange().getName().equals(CAS.TYPE_NAME_FLOAT)) |
| value = featureStructure.getFloatValue(feature); |
| else if (feature.getRange().getName().equals(CAS.TYPE_NAME_LONG)) |
| value = featureStructure.getLongValue(feature); |
| else if (feature.getRange().getName().equals(CAS.TYPE_NAME_INTEGER)) |
| value = featureStructure.getIntValue(feature); |
| else if (feature.getRange().getName().equals(CAS.TYPE_NAME_SHORT)) |
| value = featureStructure.getShortValue(feature); |
| |
| return format.format(value); |
| } |
| } |
| |
| public void reset() { |
| featureStructureIterator = null; |
| currentAnnotation = null; |
| featureFormats = Collections.emptyMap(); |
| initializeIterators(); |
| } |
| |
| public Map<String, Format> getFeatureFormats() { |
| return featureFormats; |
| } |
| |
| public JCas getJCas() { |
| return jCas; |
| } |
| |
| public String getFeaturePath() { |
| return featurePath; |
| } |
| |
| public List<String> getFeatureNames() { |
| return featureNames; |
| } |
| |
| public String getDelimiter() { |
| return delimiter; |
| } |
| |
| public Type getAnnotationType() { |
| return annotationType; |
| } |
| |
| } |