blob: 41223641fe6dce00698b584dd5027a49163a7f9c [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.examples.cas;
import java.util.Arrays;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.CasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.analysis_engine.annotator.AnnotatorContext;
import org.apache.uima.analysis_engine.annotator.AnnotatorInitializationException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Level;
import org.apache.uima.util.Logger;
/**
* An example annotator that discovers Person Titles in text and classifies them into three
* categories - Civilian (e.g. Mr.,Ms.), Military (e.g. Lt. Col.) , and Government (e.g. Gov.,
* Sen.). The titles are detected using simple string matching. The strings that are matched are
* determined by the <code>CivilianTitles</code>, <code>MilitaryTitles</code>, and
* <code>GovernmentTitles</code> configuration parameters.
* <p>
* If the <code>ContainingAnnotationType</code> parameter is specified, this annotator will only
* look for titles within existing annotations of that type. This feature can be used, for example,
* to only match person titles within existing Person Name annotations, discovered by some annotator
* that has run previously.
*
*
*/
public class PersonTitleAnnotator extends CasAnnotator_ImplBase {
  /**
   * Fully qualified name of the annotation type created by this annotator.
   */
  private static final String PERSON_TITLE_TYPE_NAME = "example.PersonTitle";

  /**
   * Base name of the feature that records the kind of title.
   */
  private static final String KIND_FEATURE_BASE_NAME = "Kind";

  /**
   * Fully qualified name of the kind feature, as used in the ResultSpecification check and in
   * error messages.
   */
  private static final String KIND_FEATURE_FULL_NAME = PERSON_TITLE_TYPE_NAME + ":"
          + KIND_FEATURE_BASE_NAME;

  /**
   * Substituted for a title-list configuration parameter that has no value, so the rest of the
   * class never has to deal with a <code>null</code> array.
   */
  private static final String[] NO_TITLES = new String[0];

  /**
   * The Type of Annotation that we will be creating when we find a match.
   */
  private Type mPersonTitleType;

  /**
   * The Annotation Type within which we will search for Person Titles (optional).
   */
  private Type mContainingType;

  /**
   * The Feature representing the kind of PersonTitle - civilian, military, or government.
   */
  private Feature mPersonTitleKindFeature;

  /**
   * The list of civilian titles, read from the CivilianTitles configuration parameter.
   */
  private String[] mCivilianTitles;

  /**
   * The list of military titles, read from the MilitaryTitles configuration parameter.
   */
  private String[] mMilitaryTitles;

  /**
   * The list of government titles, read from the GovernmentTitles configuration parameter.
   */
  private String[] mGovernmentTitles;

  /**
   * Performs initialization logic. This implementation reads the three title-list configuration
   * parameters and logs their values at CONFIG level. A title-list parameter that has no value is
   * treated as an empty list rather than causing a <code>NullPointerException</code>.
   *
   * @param aContext
   *          the context handed to the superclass; configuration parameters and the logger are
   *          obtained through it
   * @throws ResourceInitializationException
   *           if the superclass initialization fails
   *
   * @see org.apache.uima.analysis_engine.annotator.BaseAnnotator#initialize(AnnotatorContext)
   */
  public void initialize(UimaContext aContext) throws ResourceInitializationException {
    super.initialize(aContext);

    // read configuration parameter values
    mCivilianTitles = readTitles("CivilianTitles");
    mMilitaryTitles = readTitles("MilitaryTitles");
    mGovernmentTitles = readTitles("GovernmentTitles");

    // write log messages
    Logger logger = getContext().getLogger();
    logger.log(Level.CONFIG, "PersonTitleAnnotator initialized");
    logger.log(Level.CONFIG, "CivilianTitles = " + Arrays.asList(mCivilianTitles));
    logger.log(Level.CONFIG, "MilitaryTitles = " + Arrays.asList(mMilitaryTitles));
    logger.log(Level.CONFIG, "GovernmentTitles = " + Arrays.asList(mGovernmentTitles));
  }

  /**
   * Reads a String-array configuration parameter, substituting an empty array when the parameter
   * has no value.
   *
   * @param aParameterName
   *          name of the configuration parameter to read
   * @return the configured titles, never <code>null</code>
   */
  private String[] readTitles(String aParameterName) {
    String[] titles = (String[]) getContext().getConfigParameterValue(aParameterName);
    return titles != null ? titles : NO_TITLES;
  }

  /**
   * Called whenever the CAS type system changes. Acquires references to the
   * <code>example.PersonTitle</code> type, its <code>Kind</code> feature and, if the
   * <code>ContainingAnnotationType</code> parameter is set, the containing type.
   *
   * @param aTypeSystem
   *          the type system in which the types and features are looked up
   * @throws AnalysisEngineProcessException
   *           if a required type or feature is not present in the type system
   *
   * @see org.apache.uima.analysis_engine.annotator.BaseAnnotator#typeSystemInit(TypeSystem)
   */
  public void typeSystemInit(TypeSystem aTypeSystem) throws AnalysisEngineProcessException {
    // Get a reference to the "PersonTitle" Type
    mPersonTitleType = aTypeSystem.getType(PERSON_TITLE_TYPE_NAME);
    if (mPersonTitleType == null) {
      throw new AnalysisEngineProcessException(AnnotatorInitializationException.TYPE_NOT_FOUND,
              new Object[] { getClass().getName(), PERSON_TITLE_TYPE_NAME });
    }

    // Get a reference to the "Kind" Feature
    mPersonTitleKindFeature = mPersonTitleType.getFeatureByBaseName(KIND_FEATURE_BASE_NAME);
    if (mPersonTitleKindFeature == null) {
      throw new AnalysisEngineProcessException(AnnotatorInitializationException.FEATURE_NOT_FOUND,
              new Object[] { getClass().getName(), KIND_FEATURE_FULL_NAME });
    }

    // Forget any containing type resolved against a previous type system, so a stale Type is
    // never used if the parameter is no longer set.
    mContainingType = null;

    // Get the value for the "ContainingType" parameter if there is one
    String containingTypeName = (String) getContext().getConfigParameterValue(
            "ContainingAnnotationType");
    if (containingTypeName != null) {
      mContainingType = aTypeSystem.getType(containingTypeName);
      if (mContainingType == null) {
        throw new AnalysisEngineProcessException(AnnotatorInitializationException.TYPE_NOT_FOUND,
                new Object[] { getClass().getName(), containingTypeName });
      }
    }
  }

  /**
   * Annotates a document. This annotator searches for person titles using simple string matching,
   * either over the whole document text or, when a containing type is configured, only inside
   * annotations of that type. Nothing is done if the ResultSpecification does not include the
   * <code>example.PersonTitle</code> type.
   *
   * @param aCAS
   *          CAS containing document text and previously discovered annotations, and to which new
   *          annotations are to be written.
   * @throws AnalysisEngineProcessException
   *           wrapping any exception raised while searching or creating annotations
   *
   * @see CasAnnotator_ImplBase#process(CAS)
   */
  public void process(CAS aCAS) throws AnalysisEngineProcessException {
    try {
      // If the ResultSpec doesn't include the PersonTitle type, we have
      // nothing to do.
      if (!getResultSpecification().containsType(PERSON_TITLE_TYPE_NAME)) {
        return;
      }

      if (mContainingType == null) {
        // Search the whole document for PersonTitle annotations
        annotateRange(aCAS, aCAS.getDocumentText(), 0);
        return;
      }

      // Search only within annotations of type mContainingType.
      // The containing annotations are copied into an array before any PersonTitle is created:
      // createAnnotation() adds to the index repository, and if the containing type is
      // PersonTitle or one of its supertypes that would modify the very index being iterated.
      int capacity = aCAS.getAnnotationIndex(mContainingType).size();
      AnnotationFS[] containers = new AnnotationFS[capacity];
      int count = 0;
      FSIterator it = aCAS.getAnnotationIndex(mContainingType).iterator();
      while (count < capacity && it.isValid()) {
        containers[count++] = (AnnotationFS) it.get();
        it.moveToNext();
      }

      for (int i = 0; i < count; i++) {
        AnnotationFS container = containers[i];
        // search for matches within the text covered by this annotation; offsets found there
        // are relative to the annotation's begin position
        annotateRange(aCAS, container.getCoveredText(), container.getBegin());
      }
    } catch (Exception e) {
      throw new AnalysisEngineProcessException(e);
    }
  }

  /**
   * A utility method that searches a part of the document for Person Titles of all three kinds
   * (Civilian, Military and Government, in that order).
   *
   * @param aCAS
   *          the CAS in which to create new annotations
   * @param aText
   *          the substring of the document text within which to search
   * @param aBeginPos
   *          the position of this substring relative to the start of the document
   */
  protected void annotateRange(CAS aCAS, String aText, int aBeginPos) {
    // Search for each of the three types of titles
    annotateRange(aCAS, aText, aBeginPos, "Civilian", mCivilianTitles);
    annotateRange(aCAS, aText, aBeginPos, "Military", mMilitaryTitles);
    annotateRange(aCAS, aText, aBeginPos, "Government", mGovernmentTitles);
  }

  /**
   * A utility method that searches a part of the document for a specific kind of Person Title.
   * Every non-overlapping occurrence of every title is annotated. A <code>null</code> or empty
   * text, a <code>null</code> title array, and <code>null</code> or empty individual titles are
   * ignored.
   *
   * @param aCAS
   *          the CAS in which to create new annotations
   * @param aText
   *          the substring of the document text within which to search
   * @param aBeginPos
   *          the position of this substring relative to the start of the document
   * @param aTitleType
   *          the type of title to look for. This becomes the value of the <code>Kind</code>
   *          feature.
   * @param aTitles
   *          the exact strings to look for in the document
   *
   */
  protected void annotateRange(CAS aCAS, String aText, int aBeginPos, String aTitleType,
          String[] aTitles) {
    if (aText == null || aText.length() == 0 || aTitles == null) {
      return; // nothing to search, or nothing to search for
    }

    // Look the logger up once and only build the per-match message when it would be emitted.
    Logger logger = getContext().getLogger();
    boolean logMatches = logger.isLoggable(Level.FINER);

    // Loop over the titles to match.
    for (int i = 0; i < aTitles.length; i++) {
      String title = aTitles[i];
      // An empty title matches at every position without ever advancing the search index,
      // which would loop forever creating zero-length annotations - skip it.
      if (title == null || title.length() == 0) {
        continue;
      }

      // Find a first match, if it exists.
      int start = aText.indexOf(title);
      // Keep going while there are matches in the text.
      while (start >= 0) {
        // Set the end position (start + length of string).
        int end = start + title.length();
        // Compute absolute position of annotation in document
        int absStart = aBeginPos + start;
        int absEnd = aBeginPos + end;

        if (logMatches) {
          logger.log(Level.FINER,
                  "Found \"" + title + "\" at (" + absStart + "," + absEnd + ")");
        }

        // Create a new annotation for the most recently discovered match.
        createAnnotation(aCAS, absStart, absEnd, aTitleType);

        // Look for the next match, starting after the previous match.
        start = aText.indexOf(title, end);
      }
    }
  }

  /**
   * Creates a PersonTitle annotation in the CAS and adds it to the index repository. The
   * <code>Kind</code> feature is only set when the ResultSpecification contains it.
   *
   * @param aCAS
   *          the CAS in which to create the annotation
   * @param aBeginPos
   *          the begin position of the annotation relative to the start of the document
   * @param aEndPos
   *          the end position of the annotation relative to the start of the document. (Note that,
   *          as in the Java string functions, the end position is one past the last character in
   *          the annotation, so that (end - begin) = length.)
   * @param aTitleType
   *          the type of person title. This becomes the value of the <code>Kind</code> feature.
   */
  protected void createAnnotation(CAS aCAS, int aBeginPos, int aEndPos, String aTitleType) {
    AnnotationFS title = aCAS.createAnnotation(mPersonTitleType, aBeginPos, aEndPos);
    // Set the "kind" feature if it's part of the ResultSpec
    if (getResultSpecification().containsFeature(KIND_FEATURE_FULL_NAME)) {
      title.setStringValue(mPersonTitleKindFeature, aTitleType);
    }
    // Add the annotation to the index.
    aCAS.getIndexRepository().addFS(title);
  }
}