blob: 2d87d77e32f09f92aab2632c12ddce4768577bcf [file] [log] [blame]
/*
* Copyright 2013 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.modelbuilder.v2;
import java.util.HashMap;
import java.util.Map;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.util.Span;
/**
 * Generic {@link SemiSupervisedModelGenerator} implementation.
 *
 * <p>Each round of {@link #build} runs two phases: sentences that contain an
 * already-known entity are annotated and a model is built from them, then that
 * model is run over the sentences again and every validated hit is recorded as
 * a new known entity and annotated as well.
 */
public class GenericModelGenerator implements SemiSupervisedModelGenerator {

  /** Generator settings; replaced wholesale by {@link #setParameters(Map)} and not read elsewhere in this class. */
  private Map<String, String> params = new HashMap<String, String>();

  /**
   * Stores the supplied parameter map as-is (no copy is made).
   *
   * @param params the settings to keep
   */
  @Override
  public void setParameters(Map<String, String> params) {
    this.params = params;
  }

  /**
   * Runs the annotate / train / recognise cycle {@code iterations} times and
   * finishes by writing the annotated sentences and building the model once more.
   *
   * @param sentenceProvider    source of the sentences scanned in both phases
   * @param knownEntityProvider source of the seed entities and of the entity type;
   *                            also receives every newly accepted entity
   * @param validator           decides which sentences and which found entities are used
   * @param modelable           collects annotated sentences, tokenizes, and builds/holds the model
   * @param iterations          number of rounds; the loop body is skipped when this is not positive
   */
  @Override
  public void build(SentenceProvider sentenceProvider, KnownEntityProvider knownEntityProvider,
      ModelGenerationValidator validator, Modelable modelable, int iterations) {
    int round = 0;
    while (round < iterations) {
      System.out.println("ITERATION: " + round);

      annotateWithKnownEntities(sentenceProvider, knownEntityProvider, modelable);

      System.out.println("\t\twriting annotated sentences....: ");
      modelable.writeAnnotatedSentences();
      modelable.buildModel(knownEntityProvider.getKnownEntitiesType());
      // A fresh finder per round, so it always wraps the model that was just built.
      NameFinderME finder = new NameFinderME(modelable.getModel());
      System.out.println("\t\tannotated sentences: " + modelable.getAnnotatedSentences().size());

      recogniseNewEntities(finder, sentenceProvider, knownEntityProvider, validator, modelable);

      System.out.println("\t\tannotated sentences: " + modelable.getAnnotatedSentences().size());
      System.out.println("\t\tknowns: " + knownEntityProvider.getKnownEntities().size());
      round++;
    }
    modelable.writeAnnotatedSentences();
    modelable.buildModel(knownEntityProvider.getKnownEntitiesType());
  }

  /** Phase one: annotates every sentence that contains a known entity as a substring. */
  private static void annotateWithKnownEntities(SentenceProvider sentenceProvider,
      KnownEntityProvider knownEntityProvider, Modelable modelable) {
    System.out.println("\tPerfoming Known Entity Annotation");
    System.out.println("\t\tknowns: " + knownEntityProvider.getKnownEntities().size());
    System.out.println("\t\treading data....: ");
    for (String text : sentenceProvider.getSentences()) {
      for (String entity : knownEntityProvider.getKnownEntities()) {
        if (text.contains(entity)) {
          // Open question kept from the original author: should several hits in
          // one sentence be annotated separately? Currently each hit adds its own entry.
          modelable.addAnnotatedSentence(
              modelable.annotate(text, entity, knownEntityProvider.getKnownEntitiesType()));
        }
      }
    }
  }

  /** Phase two: runs the finder over each valid sentence and keeps every validated entity. */
  private static void recogniseNewEntities(NameFinderME finder, SentenceProvider sentenceProvider,
      KnownEntityProvider knownEntityProvider, ModelGenerationValidator validator,
      Modelable modelable) {
    System.out.println("\tPerforming NER");
    for (String text : sentenceProvider.getSentences()) {
      if (validator.validSentence(text)) {
        String[] words = modelable.tokenizeSentenceToWords(text);
        Span[] hits = finder.find(words);
        // Adaptive data is cleared after every sentence, before the hits are processed.
        finder.clearAdaptiveData();
        String[] candidates = Span.spansToStrings(hits, words);
        for (String candidate : candidates) {
          if (validator.validNamedEntity(candidate)) {
            knownEntityProvider.addKnownEntity(candidate);
            modelable.addAnnotatedSentence(
                modelable.annotate(text, candidate, knownEntityProvider.getKnownEntitiesType()));
          }
        }
      }
    }
  }
}