modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/DefaultModelBuilderUtil.java - opennlp-sandbox - Git at Google

 /*
  * Copyright 2013 The Apache Software Foundation.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package opennlp.addons.modelbuilder;

 import java.io.File;

 import opennlp.addons.modelbuilder.impls.BaseModelBuilderParams;
 import opennlp.addons.modelbuilder.impls.FileKnownEntityProvider;
 import opennlp.addons.modelbuilder.impls.FileModelValidatorImpl;
 import opennlp.addons.modelbuilder.impls.FileSentenceProvider;
 import opennlp.addons.modelbuilder.impls.GenericModelGenerator;
 import opennlp.addons.modelbuilder.impls.GenericModelableImpl;

 /**
  * Utilizes the file-based implementations to produce an NER model from
  * user-defined data. The basic processing is as follows:
  * <ol>
  * <li>read in the list of known entities</li>
  * <li>annotate the sentences based on the list of known entities</li>
  * <li>create a model from the annotations</li>
  * <li>perform NER with the model on the sentences</li>
  * <li>add the NER results to the annotations</li>
  * <li>rebuild the model and repeat the loop</li>
  * </ol>
  */
 public class DefaultModelBuilderUtil {

   /**
    * Runs the file-based, semi-supervised NER model build.
    *
    * <p>The file locations and the entity type are gathered into one
    * {@link BaseModelBuilderParams}. That parameter object is handed to a
    * file-backed sentence provider, known-entity provider, validator and
    * modelable, and those four collaborators are then passed, together with
    * the iteration count, to a {@link GenericModelGenerator}.
    *
    * @param sentences                file holding one sentence per line; it
    *                                 should contain at least 15K sentences
    *                                 that are a representative sample of the
    *                                 user data
    * @param knownEntities            file holding a plain list of unambiguous
    *                                 entities, one per line. When building a
    *                                 person model, for example, this would be
    *                                 a list of unambiguous person names that
    *                                 are known to occur in the sentences file
    * @param knownEntitiesBlacklist   file holding known bad hits that the NER
    *                                 phase might pick up early on, before the
    *                                 model has iterated to maturity
    * @param modelOutFile             location the model is written to
    * @param annotatedSentenceOutFile location the annotated sentences produced
    *                                 by this process are written to
    * @param namedEntityType          the entity type, e.g. person, location,
    *                                 organization
    * @param iterations               number of times the annotate / generate
    *                                 model / NER loop is repeated
    */
   public static void generateModel(File sentences, File knownEntities, File knownEntitiesBlacklist,
           File modelOutFile, File annotatedSentenceOutFile, String namedEntityType, int iterations) {
     SemiSupervisedModelGenerator generator = new GenericModelGenerator();
     BaseModelBuilderParams config = toParams(sentences, knownEntities, knownEntitiesBlacklist,
             modelOutFile, annotatedSentenceOutFile, namedEntityType);

     // Sentence providers supply the sentences derived from user data; this
     // implementation simply reads a file line by line.
     SentenceProvider sentenceSource = new FileSentenceProvider();
     sentenceSource.setParameters(config);

     // Known-entity providers supply the seed list of entities (for instance
     // Barack Obama for person, or Germany for location). Good seeds are
     // prolific, non-ambiguous names.
     KnownEntityProvider seedEntities = new FileKnownEntityProvider();
     seedEntities.setParameters(config);

     // Validators weed out bad hits produced by the name finder between
     // iterations. Because every pass feeds on the previous one, letting bad
     // entities in makes the name finder progressively greedier; the validator
     // is the hook for discarding obviously wrong hits. An implementation
     // could, for example, require a location to sit inside a noun phrase;
     // users can make this as specific as their data and use case demand.
     ModelGenerationValidator hitValidator = new FileModelValidatorImpl();
     hitValidator.setParameters(config);

     // Modelables read and write the annotated sentences, and create and write
     // the NER models.
     Modelable modelStore = new GenericModelableImpl();
     modelStore.setParameters(config);

     // The generator drives the whole process for a fixed number of
     // iterations. Stopping once the difference between runs drops below a
     // threshold could be better, but may be too costly for extremely large
     // sentence sets.
     generator.build(sentenceSource, seedEntities, hitValidator, modelStore, iterations);
   }

   /**
    * Collects the caller-supplied files and entity type into the parameter
    * object shared by all collaborators.
    */
   private static BaseModelBuilderParams toParams(File sentences, File knownEntities,
           File knownEntitiesBlacklist, File modelOutFile, File annotatedSentenceOutFile,
           String namedEntityType) {
     BaseModelBuilderParams config = new BaseModelBuilderParams();
     config.setAnnotatedTrainingDataFile(annotatedSentenceOutFile);
     config.setSentenceFile(sentences);
     config.setEntityType(namedEntityType);
     config.setKnownEntitiesFile(knownEntities);
     config.setModelFile(modelOutFile);
     config.setKnownEntityBlacklist(knownEntitiesBlacklist);
     return config;
   }
 }
	/*
	* Copyright 2013 The Apache Software Foundation.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package opennlp.addons.modelbuilder;

	import java.io.File;

	import opennlp.addons.modelbuilder.impls.BaseModelBuilderParams;
	import opennlp.addons.modelbuilder.impls.FileKnownEntityProvider;
	import opennlp.addons.modelbuilder.impls.FileModelValidatorImpl;
	import opennlp.addons.modelbuilder.impls.FileSentenceProvider;
	import opennlp.addons.modelbuilder.impls.GenericModelGenerator;
	import opennlp.addons.modelbuilder.impls.GenericModelableImpl;

	/**
	* Utilizes the file-based implementations to produce an NER model from
	* user-defined data. The basic processing is as follows:
	* <ol>
	* <li>read in the list of known entities</li>
	* <li>annotate the sentences based on the list of known entities</li>
	* <li>create a model from the annotations</li>
	* <li>perform NER with the model on the sentences</li>
	* <li>add the NER results to the annotations</li>
	* <li>rebuild the model and repeat the loop</li>
	* </ol>
	*/
	public class DefaultModelBuilderUtil {

		/**
		 * Runs the file-based, semi-supervised NER model build.
		 *
		 * <p>The file locations and the entity type are gathered into one
		 * {@link BaseModelBuilderParams}. That parameter object is handed to a
		 * file-backed sentence provider, known-entity provider, validator and
		 * modelable, and those four collaborators are then passed, together with
		 * the iteration count, to a {@link GenericModelGenerator}.
		 *
		 * @param sentences file holding one sentence per line; it should contain
		 * at least 15K sentences that are a representative sample of the user data
		 * @param knownEntities file holding a plain list of unambiguous entities,
		 * one per line. When building a person model, for example, this would be a
		 * list of unambiguous person names that are known to occur in the
		 * sentences file
		 * @param knownEntitiesBlacklist file holding known bad hits that the NER
		 * phase might pick up early on, before the model has iterated to maturity
		 * @param modelOutFile location the model is written to
		 * @param annotatedSentenceOutFile location the annotated sentences
		 * produced by this process are written to
		 * @param namedEntityType the entity type, e.g. person, location,
		 * organization
		 * @param iterations number of times the annotate / generate model / NER
		 * loop is repeated
		 */
		public static void generateModel(File sentences, File knownEntities, File knownEntitiesBlacklist,
				File modelOutFile, File annotatedSentenceOutFile, String namedEntityType, int iterations) {
			SemiSupervisedModelGenerator generator = new GenericModelGenerator();
			BaseModelBuilderParams config = toParams(sentences, knownEntities, knownEntitiesBlacklist,
					modelOutFile, annotatedSentenceOutFile, namedEntityType);

			// Sentence providers supply the sentences derived from user data; this
			// implementation simply reads a file line by line.
			SentenceProvider sentenceSource = new FileSentenceProvider();
			sentenceSource.setParameters(config);

			// Known-entity providers supply the seed list of entities (for instance
			// Barack Obama for person, or Germany for location). Good seeds are
			// prolific, non-ambiguous names.
			KnownEntityProvider seedEntities = new FileKnownEntityProvider();
			seedEntities.setParameters(config);

			// Validators weed out bad hits produced by the name finder between
			// iterations. Because every pass feeds on the previous one, letting bad
			// entities in makes the name finder progressively greedier; the validator
			// is the hook for discarding obviously wrong hits. An implementation
			// could, for example, require a location to sit inside a noun phrase;
			// users can make this as specific as their data and use case demand.
			ModelGenerationValidator hitValidator = new FileModelValidatorImpl();
			hitValidator.setParameters(config);

			// Modelables read and write the annotated sentences, and create and write
			// the NER models.
			Modelable modelStore = new GenericModelableImpl();
			modelStore.setParameters(config);

			// The generator drives the whole process for a fixed number of
			// iterations. Stopping once the difference between runs drops below a
			// threshold could be better, but may be too costly for extremely large
			// sentence sets.
			generator.build(sentenceSource, seedEntities, hitValidator, modelStore, iterations);
		}

		/**
		 * Collects the caller-supplied files and entity type into the parameter
		 * object shared by all collaborators.
		 */
		private static BaseModelBuilderParams toParams(File sentences, File knownEntities,
				File knownEntitiesBlacklist, File modelOutFile, File annotatedSentenceOutFile,
				String namedEntityType) {
			BaseModelBuilderParams config = new BaseModelBuilderParams();
			config.setAnnotatedTrainingDataFile(annotatedSentenceOutFile);
			config.setSentenceFile(sentences);
			config.setEntityType(namedEntityType);
			config.setKnownEntitiesFile(knownEntities);
			config.setModelFile(modelOutFile);
			config.setKnownEntityBlacklist(knownEntitiesBlacklist);
			return config;
		}
	}