uimaj-2.2.0-incubating/uimaj-examples/src/main/opennlp_wrappers/src/org/apache/uima/examples/opennlp/annotator/NEDetector.java - uima-uimaj - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 package org.apache.uima.examples.opennlp.annotator;

 import java.io.File;
 import java.io.FilenameFilter;
 import java.lang.reflect.Constructor;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Hashtable;
 import java.util.List;

 import opennlp.maxent.io.SuffixSensitiveGISModelReader;
 import opennlp.tools.lang.english.NameFinder;
 import opennlp.tools.namefind.NameFinderME;

 import org.apache.uima.UimaContext;
 import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.analysis_engine.annotator.AnnotatorConfigurationException;
 import org.apache.uima.analysis_engine.annotator.AnnotatorProcessException;
 import org.apache.uima.cas.FSIterator;
 import org.apache.uima.cas.text.AnnotationIndex;
 import org.apache.uima.examples.opennlp.EntityAnnotation;
 import org.apache.uima.examples.opennlp.Sentence;
 import org.apache.uima.examples.opennlp.Token;
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.resource.ResourceInitializationException;

 /**
  * UIMA wrapper for the OpenNLP named entity recognizer. The entity models and corresponding
  * annotation type classes are specified as parameters. The document in the CAS is analyzed and a
  * named entity annotation is created for each entity found. We assume that any entity annotation
  * type class inherits from org.apache.uima.examples.opennlp.annotator.EntityAnnotation.
  */
 public class NEDetector extends JCasAnnotator_ImplBase {

   /* Model directory parameter name. */
   private static final String MODEL_DIR_PARAM = "ModelDirectory";

   /* Mappings array parameter name. */
   private static final String MAPPINGS_PARAM = "EntityTypeMappings";

   /* Name to use for this Analysis Engine component. */
   private static final String COMPONENT_NAME = "OpenNLP NE Detector";

   /* Array of named entity finders that will be run. */
   private NameFinder[] nameFinders;

   /*
    * Array of labels for the named entity finders, derived from the corresponding model file names
    * (i.e., the filename minus ".bin.gz").
    */
   private String[] nameFinderLabels;

   /*
    * Hash that maps named entity finder labels to the constructor for the corresponding annotation
    * type class.
    */
   private Hashtable labelMap = new Hashtable();

   /* Number of named entity finders. */
   private int numNefs = 0;

   /* Array of constructors for the annotation type classes. */
   private Constructor[] neAnnotationMakers;

   /**
    * A simple filename filter class to list all of the named entity model files in the model
    * directory.
    */
   private class modelFileFilter implements FilenameFilter {

     /*
      * (non-Javadoc)
      *
      * @see java.io.FilenameFilter#accept(java.io.File, java.lang.String)
      */
     public boolean accept(File dir, String name) {
       return name.endsWith(".bin.gz");
     }
   }

   /**
    * Initialize the Annotator.
    *
    * @see JCasAnnotator_ImplBase#initialize(UimaContext)
    */
   public void initialize(UimaContext aContext) throws ResourceInitializationException {
     super.initialize(aContext);

     try {
       String[] mappingStrings = null;

       mappingStrings = (String[]) aContext.getConfigParameterValue(MAPPINGS_PARAM);
       if (mappingStrings == null) {
         throw new AnnotatorConfigurationException();
       }
       loadMappings(mappingStrings);

       String modelDirName = (String) aContext.getConfigParameterValue(MODEL_DIR_PARAM);

       File modelDir = new File(modelDirName);
       if (!modelDir.isDirectory()) {
         throw new AnnotatorConfigurationException();
       }
       File[] modelFiles = modelDir.listFiles(new modelFileFilter());
       numNefs = modelFiles.length;
       nameFinders = new NameFinder[numNefs];
       nameFinderLabels = new String[numNefs];
       neAnnotationMakers = new Constructor[numNefs];
       for (int i = 0; i < numNefs; i++) {
         String modelName = modelFiles[i].getName();
         System.out.print("Loading model: " + modelName + "...");
         nameFinders[i] = new NameFinder(new SuffixSensitiveGISModelReader(modelFiles[i]).getModel());
         int nameStart = modelName.lastIndexOf(System.getProperty("file.separator")) + 1;
         int nameEnd = modelName.indexOf('.', nameStart);
         if (nameEnd == -1) {
           nameEnd = modelName.length();
         }
         nameFinderLabels[i] = modelName.substring(nameStart, nameEnd);
         Constructor annotationMaker;
         if ((annotationMaker = (Constructor) labelMap.get(nameFinderLabels[i])) == null) {
           throw new AnnotatorConfigurationException();
         }
         neAnnotationMakers[i] = annotationMaker;
         System.out.println("done");
       }

     } catch (Exception e) {
       throw new ResourceInitializationException(e);
     }
   }

   /**
    * Processes the entity type mappaings parameter. The constructor for each class identified in the
    * array is loaded and stored in the mapping hashtable, using the label provided in the parameter
    * as the key.
    *
    * @param mappingStrings
    *          Array of mapping strings of the form "labe,class"
    * @throws AnnotatorConfigurationException
    */
   private void loadMappings(String[] mappingStrings) throws AnnotatorConfigurationException {
     // populate the mappings hash table (key: entity label,CAS Annotation Type
     // Constructor)
     for (int i = 0; i < mappingStrings.length; i++) {
       String[] mappingPair = mappingStrings[i].split(",");
       if (mappingPair.length < 2)
         throw new AnnotatorConfigurationException();

       String modelName = mappingPair[0];
       String className = mappingPair[1];

       Constructor annotationConstructor;
       // get the name of the JCAS type with this name
       Class annotationClass;
       try {
         annotationClass = Class.forName(className);
         // get the constructor for that JCAS type
         annotationConstructor = annotationClass.getConstructor(new Class[] { JCas.class });
       } catch (Exception e) {
         throw new AnnotatorConfigurationException(e);
       }
       labelMap.put(modelName, annotationConstructor);
     }
   }

   /**
    * Process a CAS.
    *
    * @see JCasAnnotator_ImplBase#process(JCas)
    */
   public void process(JCas aJCas) throws AnalysisEngineProcessException {

     ArrayList tokenList = new ArrayList();
     ArrayList wordList = new ArrayList();
     List finderTags;

     AnnotationIndex sentenceIndex = aJCas.getAnnotationIndex(Sentence.type);
     AnnotationIndex tokenIndex = aJCas.getAnnotationIndex(Token.type);

     // iterate over Sentences
     FSIterator sentenceIterator = sentenceIndex.iterator();
     while (sentenceIterator.hasNext()) {
       Sentence sentence = (Sentence) sentenceIterator.next();

       tokenList.clear();
       wordList.clear();

       // iterate over Tokens
       FSIterator tokenIterator = tokenIndex.subiterator(sentence);
       while (tokenIterator.hasNext()) {
         Token token = (Token) tokenIterator.next();

         tokenList.add(token);
         wordList.add(token.getCoveredText());
       }

       for (int i = 0; i < numNefs; i++) {
         Constructor annotationMaker = neAnnotationMakers[i];
         finderTags = nameFinders[i].find(wordList, Collections.EMPTY_MAP);

         boolean inTag = false;
         int tagStart = 0;
         int tagEnd = 0;
         for (int j = 0; j < finderTags.size(); j++) {
           String tag = (String) finderTags.get(j);

           if (inTag) {
             // check for end tags
             if (tag.equals(NameFinderME.START) || tag.equals(NameFinderME.OTHER)) {
               // make annotation
               tagEnd = j - 1;
               Token startToken = (Token) tokenList.get(tagStart);
               Token endToken = (Token) tokenList.get(tagEnd);
               makeEntityAnnotation(annotationMaker, aJCas, startToken.getBegin(), endToken.getEnd());
               inTag = false;
             }
           }
           if (!inTag) {
             // check for start tags
             if (tag.equals(NameFinderME.START)) {
               tagStart = j;
               inTag = true;
             }
           }
         }
       }
     }
   }

   /**
    * Create a new EntityAnnotation using the supplied Constructor. The start, end, and componentId
    * features are set.
    *
    * @param annotationMaker
    *          Constructor to create the EntityAnnotation object
    * @param jCas
    *          The JCas in which to create the new annotation
    * @param start
    *          Start of annotation span
    * @param end
    *          End of annotation span
    * @throws AnnotatorProcessException
    */
   private void makeEntityAnnotation(Constructor annotationMaker, JCas jCas, int start, int end)
           throws AnalysisEngineProcessException {
     try {
       EntityAnnotation entityAnnot = (EntityAnnotation) annotationMaker
               .newInstance(new Object[] { jCas });
       entityAnnot.setBegin(start);
       entityAnnot.setEnd(end);
       entityAnnot.setComponentId(COMPONENT_NAME);
       entityAnnot.addToIndexes();
     } catch (Exception e) {
       throw new AnalysisEngineProcessException(e);
     }
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	package org.apache.uima.examples.opennlp.annotator;

	import java.io.File;
	import java.io.FilenameFilter;
	import java.lang.reflect.Constructor;
	import java.util.ArrayList;
	import java.util.Collections;
	import java.util.Hashtable;
	import java.util.List;

	import opennlp.maxent.io.SuffixSensitiveGISModelReader;
	import opennlp.tools.lang.english.NameFinder;
	import opennlp.tools.namefind.NameFinderME;

	import org.apache.uima.UimaContext;
	import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
	import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
	import org.apache.uima.analysis_engine.annotator.AnnotatorConfigurationException;
	import org.apache.uima.analysis_engine.annotator.AnnotatorProcessException;
	import org.apache.uima.cas.FSIterator;
	import org.apache.uima.cas.text.AnnotationIndex;
	import org.apache.uima.examples.opennlp.EntityAnnotation;
	import org.apache.uima.examples.opennlp.Sentence;
	import org.apache.uima.examples.opennlp.Token;
	import org.apache.uima.jcas.JCas;
	import org.apache.uima.resource.ResourceInitializationException;

	/**
	* UIMA wrapper for the OpenNLP named entity recognizer. The entity models and corresponding
	* annotation type classes are specified as parameters. The document in the CAS is analyzed and a
	* named entity annotation is created for each entity found. We assume that any entity annotation
	* type class inherits from org.apache.uima.examples.opennlp.annotator.EntityAnnotation.
	*/
	public class NEDetector extends JCasAnnotator_ImplBase {

	/* Model directory parameter name. */
	private static final String MODEL_DIR_PARAM = "ModelDirectory";

	/* Mappings array parameter name. */
	private static final String MAPPINGS_PARAM = "EntityTypeMappings";

	/* Name to use for this Analysis Engine component. */
	private static final String COMPONENT_NAME = "OpenNLP NE Detector";

	/* Array of named entity finders that will be run. */
	private NameFinder[] nameFinders;

	/*
	* Array of labels for the named entity finders, derived from the corresponding model file names
	* (i.e., the filename minus ".bin.gz").
	*/
	private String[] nameFinderLabels;

	/*
	* Hash that maps named entity finder labels to the constructor for the corresponding annotation
	* type class.
	*/
	private Hashtable labelMap = new Hashtable();

	/* Number of named entity finders. */
	private int numNefs = 0;

	/* Array of constructors for the annotation type classes. */
	private Constructor[] neAnnotationMakers;

	/**
	* A simple filename filter class to list all of the named entity model files in the model
	* directory.
	*/
	private class modelFileFilter implements FilenameFilter {

	/*
	* (non-Javadoc)
	*
	* @see java.io.FilenameFilter#accept(java.io.File, java.lang.String)
	*/
	public boolean accept(File dir, String name) {
	return name.endsWith(".bin.gz");
	}
	}

	/**
	* Initialize the Annotator.
	*
	* @see JCasAnnotator_ImplBase#initialize(UimaContext)
	*/
	public void initialize(UimaContext aContext) throws ResourceInitializationException {
	super.initialize(aContext);

	try {
	String[] mappingStrings = null;

	mappingStrings = (String[]) aContext.getConfigParameterValue(MAPPINGS_PARAM);
	if (mappingStrings == null) {
	throw new AnnotatorConfigurationException();
	}
	loadMappings(mappingStrings);

	String modelDirName = (String) aContext.getConfigParameterValue(MODEL_DIR_PARAM);

	File modelDir = new File(modelDirName);
	if (!modelDir.isDirectory()) {
	throw new AnnotatorConfigurationException();
	}
	File[] modelFiles = modelDir.listFiles(new modelFileFilter());
	numNefs = modelFiles.length;
	nameFinders = new NameFinder[numNefs];
	nameFinderLabels = new String[numNefs];
	neAnnotationMakers = new Constructor[numNefs];
	for (int i = 0; i < numNefs; i++) {
	String modelName = modelFiles[i].getName();
	System.out.print("Loading model: " + modelName + "...");
	nameFinders[i] = new NameFinder(new SuffixSensitiveGISModelReader(modelFiles[i]).getModel());
	int nameStart = modelName.lastIndexOf(System.getProperty("file.separator")) + 1;
	int nameEnd = modelName.indexOf('.', nameStart);
	if (nameEnd == -1) {
	nameEnd = modelName.length();
	}
	nameFinderLabels[i] = modelName.substring(nameStart, nameEnd);
	Constructor annotationMaker;
	if ((annotationMaker = (Constructor) labelMap.get(nameFinderLabels[i])) == null) {
	throw new AnnotatorConfigurationException();
	}
	neAnnotationMakers[i] = annotationMaker;
	System.out.println("done");
	}

	} catch (Exception e) {
	throw new ResourceInitializationException(e);
	}
	}

	/**
	* Processes the entity type mappaings parameter. The constructor for each class identified in the
	* array is loaded and stored in the mapping hashtable, using the label provided in the parameter
	* as the key.
	*
	* @param mappingStrings
	* Array of mapping strings of the form "labe,class"
	* @throws AnnotatorConfigurationException
	*/
	private void loadMappings(String[] mappingStrings) throws AnnotatorConfigurationException {
	// populate the mappings hash table (key: entity label,CAS Annotation Type
	// Constructor)
	for (int i = 0; i < mappingStrings.length; i++) {
	String[] mappingPair = mappingStrings[i].split(",");
	if (mappingPair.length < 2)
	throw new AnnotatorConfigurationException();

	String modelName = mappingPair[0];
	String className = mappingPair[1];

	Constructor annotationConstructor;
	// get the name of the JCAS type with this name
	Class annotationClass;
	try {
	annotationClass = Class.forName(className);
	// get the constructor for that JCAS type
	annotationConstructor = annotationClass.getConstructor(new Class[] { JCas.class });
	} catch (Exception e) {
	throw new AnnotatorConfigurationException(e);
	}
	labelMap.put(modelName, annotationConstructor);
	}
	}

	/**
	* Process a CAS.
	*
	* @see JCasAnnotator_ImplBase#process(JCas)
	*/
	public void process(JCas aJCas) throws AnalysisEngineProcessException {

	ArrayList tokenList = new ArrayList();
	ArrayList wordList = new ArrayList();
	List finderTags;

	AnnotationIndex sentenceIndex = aJCas.getAnnotationIndex(Sentence.type);
	AnnotationIndex tokenIndex = aJCas.getAnnotationIndex(Token.type);

	// iterate over Sentences
	FSIterator sentenceIterator = sentenceIndex.iterator();
	while (sentenceIterator.hasNext()) {
	Sentence sentence = (Sentence) sentenceIterator.next();

	tokenList.clear();
	wordList.clear();

	// iterate over Tokens
	FSIterator tokenIterator = tokenIndex.subiterator(sentence);
	while (tokenIterator.hasNext()) {
	Token token = (Token) tokenIterator.next();

	tokenList.add(token);
	wordList.add(token.getCoveredText());
	}

	for (int i = 0; i < numNefs; i++) {
	Constructor annotationMaker = neAnnotationMakers[i];
	finderTags = nameFinders[i].find(wordList, Collections.EMPTY_MAP);

	boolean inTag = false;
	int tagStart = 0;
	int tagEnd = 0;
	for (int j = 0; j < finderTags.size(); j++) {
	String tag = (String) finderTags.get(j);

	if (inTag) {
	// check for end tags
	if (tag.equals(NameFinderME.START) \|\| tag.equals(NameFinderME.OTHER)) {
	// make annotation
	tagEnd = j - 1;
	Token startToken = (Token) tokenList.get(tagStart);
	Token endToken = (Token) tokenList.get(tagEnd);
	makeEntityAnnotation(annotationMaker, aJCas, startToken.getBegin(), endToken.getEnd());
	inTag = false;
	}
	}
	if (!inTag) {
	// check for start tags
	if (tag.equals(NameFinderME.START)) {
	tagStart = j;
	inTag = true;
	}
	}
	}
	}
	}
	}

	/**
	* Create a new EntityAnnotation using the supplied Constructor. The start, end, and componentId
	* features are set.
	*
	* @param annotationMaker
	* Constructor to create the EntityAnnotation object
	* @param jCas
	* The JCas in which to create the new annotation
	* @param start
	* Start of annotation span
	* @param end
	* End of annotation span
	* @throws AnnotatorProcessException
	*/
	private void makeEntityAnnotation(Constructor annotationMaker, JCas jCas, int start, int end)
	throws AnalysisEngineProcessException {
	try {
	EntityAnnotation entityAnnot = (EntityAnnotation) annotationMaker
	.newInstance(new Object[] { jCas });
	entityAnnot.setBegin(start);
	entityAnnot.setEnd(end);
	entityAnnot.setComponentId(COMPONENT_NAME);
	entityAnnot.addToIndexes();
	} catch (Exception e) {
	throw new AnalysisEngineProcessException(e);
	}
	}
	}