blob: 81ff9fd358eb4d658c1aa9b606dea3df5d956761 [file] [log] [blame]
/*
* Copyright 2013 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.addons.modelbuilder;
import java.io.File;
import opennlp.addons.modelbuilder.impls.BaseModelBuilderParams;
import opennlp.addons.modelbuilder.impls.FileKnownEntityProvider;
import opennlp.addons.modelbuilder.impls.FileModelValidatorImpl;
import opennlp.addons.modelbuilder.impls.FileSentenceProvider;
import opennlp.addons.modelbuilder.impls.GenericModelGenerator;
import opennlp.addons.modelbuilder.impls.GenericModelableImpl;
/**
*
* Utilizes the filebased implementations to produce an NER model from user
* The basic processing is such
* read in the list of known entities
* annotate the sentences based on the list of known entities
* create a model from the annotations
* perform NER with the model on the sentences
* add the NER results to the annotations
* rebuild the model
* loop
* defined data
*/
public class DefaultModelBuilderUtil {
/**
*
* @param sentences a file that contains one sentence per line.
* There should be at least 15K sentences
* consisting of a representative sample from
* user data
* @param knownEntities a file consisting of a simple list of
* unambiguous entities, one entry per line.
* For instance, if one was trying to build a
* person NER model then this file would be a
* list of person names that are unambiguous
* and are known to exist in the sentences
* file
* @param knownEntitiesBlacklist This file contains a list of known bad hits
* that the NER phase of this processing might
* catch early one before the model iterates
* to maturity
* @param modelOutFile the location where the model will be
* written to
* @param annotatedSentenceOutFile where the annotated sentences produced by
* this process will be written to
* @param namedEntityType the type of entity... for example, person,
* location, organization...
* @param iterations how many times to repeat the iterative loop
* of annotation, model generation, and NER
*/
public static void generateModel(File sentences, File knownEntities, File knownEntitiesBlacklist,
File modelOutFile, File annotatedSentenceOutFile, String namedEntityType, int iterations) {
SemiSupervisedModelGenerator modelGenerator = new GenericModelGenerator();
BaseModelBuilderParams params = new BaseModelBuilderParams();
params.setAnnotatedTrainingDataFile(annotatedSentenceOutFile);
params.setSentenceFile(sentences);
params.setEntityType(namedEntityType);
params.setKnownEntitiesFile(knownEntities);
params.setModelFile(modelOutFile);
params.setKnownEntityBlacklist(knownEntitiesBlacklist);
/**
* sentence providers feed this process with user data derived sentences
* this impl just reads line by line through a file
*/
SentenceProvider sentenceProvider = new FileSentenceProvider();
sentenceProvider.setParameters(params);
/**
* KnownEntityProviders provide a seed list of known entities... such as
* Barack Obama for person, or Germany for location obviously these would
* want to be prolific, non ambiguous names
*/
KnownEntityProvider knownEntityProvider = new FileKnownEntityProvider();
knownEntityProvider.setParameters(params);
/**
* ModelGenerationValidators try to weed out bad hits by the iterations of
* the name finder. Since this is a recursive process, with each iteration
* the namefinder will get more and more greedy if bad entities are allowed
* in this provides a mechanism for throwing out obviously bad hits. A good
* impl may be to make sure a location is actually within a noun phrase
* etc...users can make this as specific as they need for their dat and
* their use case
*/
ModelGenerationValidator validator = new FileModelValidatorImpl();
validator.setParameters(params);
/**
* Modelable's write and read the annotated sentences, as well as create and
* write the NER models
*/
Modelable modelable = new GenericModelableImpl();
modelable.setParameters(params);
/**
* the modelGenerator actually runs the process with a set number of
* iterations... could be better by actually calculating the diff between
* runs and stopping based on a thresh, but for extrememly large sentence
* sets this may be too much.
*/
modelGenerator.build(sentenceProvider, knownEntityProvider, validator, modelable, iterations);
}
}