moved from sandbox
diff --git a/modelbuilder-addon/pom.xml b/modelbuilder-addon/pom.xml
new file mode 100644
index 0000000..4a9c886
--- /dev/null
+++ b/modelbuilder-addon/pom.xml
@@ -0,0 +1,35 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+ <parent>
+ <groupId>org.apache.opennlp</groupId>
+ <artifactId>opennlp</artifactId>
+ <version>1.6.0-SNAPSHOT</version>
+ <relativePath>../opennlp/pom.xml</relativePath>
+ </parent>
+
+ <artifactId>modelbuilder-addon</artifactId>
+ <version>1.0-SNAPSHOT</version> <!-- the add-on declares its own version, separate from the parent's -->
+ <packaging>jar</packaging>
+
+ <name>modelbuilder-addon</name>
+ <url>http://maven.apache.org</url>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+ <dependencies>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>3.8.1</version> <!-- JUnit 3 API: the module's AppTest extends junit.framework.TestCase -->
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.opennlp</groupId>
+ <artifactId>opennlp-tools</artifactId>
+ <version>1.6.0-SNAPSHOT</version> <!-- same version as the parent POM referenced above -->
+ </dependency>
+ </dependencies>
+</project>
diff --git a/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/DefaultModelBuilderUtil.java b/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/DefaultModelBuilderUtil.java
new file mode 100644
index 0000000..81ff9fd
--- /dev/null
+++ b/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/DefaultModelBuilderUtil.java
@@ -0,0 +1,108 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.modelbuilder;
+
+import java.io.File;
+import opennlp.addons.modelbuilder.impls.BaseModelBuilderParams;
+import opennlp.addons.modelbuilder.impls.FileKnownEntityProvider;
+import opennlp.addons.modelbuilder.impls.FileModelValidatorImpl;
+import opennlp.addons.modelbuilder.impls.FileSentenceProvider;
+import opennlp.addons.modelbuilder.impls.GenericModelGenerator;
+import opennlp.addons.modelbuilder.impls.GenericModelableImpl;
+
+/**
+ * Wires the file-based implementations together to produce an NER model from
+ * user-defined data.
+ * <p>
+ * The basic processing is:
+ * <ol>
+ * <li>read in the list of known entities</li>
+ * <li>annotate the sentences based on the list of known entities</li>
+ * <li>create a model from the annotations</li>
+ * <li>perform NER with the model on the sentences</li>
+ * <li>add the NER results to the annotations</li>
+ * <li>rebuild the model</li>
+ * <li>loop</li>
+ * </ol>
+ */
+public class DefaultModelBuilderUtil {
+
+  /**
+   * Generates an NER model from the given files by running the iterative
+   * loop of annotation, model generation, and NER.
+   *
+   * @param sentences a file that contains one sentence per line. There should
+   *     be at least 15K sentences consisting of a representative sample from
+   *     user data
+   * @param knownEntities a file consisting of a simple list of unambiguous
+   *     entities, one entry per line. For instance, if one was trying to
+   *     build a person NER model then this file would be a list of person
+   *     names that are unambiguous and are known to exist in the sentences
+   *     file
+   * @param knownEntitiesBlacklist a file containing a list of known bad hits
+   *     that the NER phase of this processing might catch early on, before
+   *     the model iterates to maturity
+   * @param modelOutFile the location where the model will be written to
+   * @param annotatedSentenceOutFile where the annotated sentences produced by
+   *     this process will be written to
+   * @param namedEntityType the type of entity... for example, person,
+   *     location, organization...
+   * @param iterations how many times to repeat the iterative loop of
+   *     annotation, model generation, and NER
+   */
+  public static void generateModel(File sentences, File knownEntities, File knownEntitiesBlacklist,
+      File modelOutFile, File annotatedSentenceOutFile, String namedEntityType, int iterations) {
+    // One parameter object carries every input and output location to the
+    // components created below.
+    BaseModelBuilderParams builderParams = new BaseModelBuilderParams();
+    builderParams.setSentenceFile(sentences);
+    builderParams.setKnownEntitiesFile(knownEntities);
+    builderParams.setKnownEntityBlacklist(knownEntitiesBlacklist);
+    builderParams.setEntityType(namedEntityType);
+    builderParams.setAnnotatedTrainingDataFile(annotatedSentenceOutFile);
+    builderParams.setModelFile(modelOutFile);
+
+    // Sentence providers feed this process with sentences derived from user
+    // data; this implementation just reads line by line through a file.
+    SentenceProvider sentenceSource = new FileSentenceProvider();
+    sentenceSource.setParameters(builderParams);
+
+    // Known entity providers supply a seed list of known entities, such as
+    // Barack Obama for person or Germany for location. These should be
+    // prolific, non-ambiguous names.
+    KnownEntityProvider seedEntities = new FileKnownEntityProvider();
+    seedEntities.setParameters(builderParams);
+
+    // Validators try to weed out bad hits produced by the iterations of the
+    // name finder. Because the process is iterative, the name finder gets more
+    // and more greedy with each pass if bad entities are allowed in; this
+    // provides a mechanism for throwing out obviously bad hits. Users can make
+    // this as specific as they need for their data and their use case.
+    ModelGenerationValidator hitValidator = new FileModelValidatorImpl();
+    hitValidator.setParameters(builderParams);
+
+    // The Modelable writes and reads the annotated sentences, and creates and
+    // writes the NER models.
+    Modelable annotationStore = new GenericModelableImpl();
+    annotationStore.setParameters(builderParams);
+
+    // The model generator runs the process for a set number of iterations.
+    // Stopping on a threshold of the difference between runs could be better,
+    // but may be too much for extremely large sentence sets.
+    SemiSupervisedModelGenerator generator = new GenericModelGenerator();
+    generator.build(sentenceSource, seedEntities, hitValidator, annotationStore, iterations);
+  }
+}
diff --git a/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/KnownEntityProvider.java b/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/KnownEntityProvider.java
new file mode 100644
index 0000000..694250e
--- /dev/null
+++ b/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/KnownEntityProvider.java
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.modelbuilder;
+
+import java.util.Set;
+import opennlp.addons.modelbuilder.impls.BaseModelBuilderParams;
+
+/**
+ * Supplies a set of known entities (for example a list of names or
+ * locations) of a single entity type.
+ */
+public interface KnownEntityProvider extends ModelParameter<BaseModelBuilderParams> {
+
+  /**
+   * Returns the known, non-ambiguous entities.
+   *
+   * @return a set of entities
+   */
+  Set<String> getKnownEntities();
+
+  /**
+   * Adds to the set of known entities. Implementing classes should hold the
+   * entities in a class-level set.
+   *
+   * @param unambiguousEntity the entity to add
+   */
+  void addKnownEntity(String unambiguousEntity);
+
+  /**
+   * Defines the type of entity that the set contains, i.e. person, location,
+   * organization.
+   *
+   * @return the entity type
+   */
+  String getKnownEntitiesType();
+}
diff --git a/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/ModelGenerationValidator.java b/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/ModelGenerationValidator.java
new file mode 100644
index 0000000..4bd5fe2
--- /dev/null
+++ b/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/ModelGenerationValidator.java
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.modelbuilder;
+
+import java.util.Collection;
+import opennlp.addons.modelbuilder.impls.BaseModelBuilderParams;
+
+/**
+ * Validates results from the iterative name finding.
+ */
+public interface ModelGenerationValidator extends ModelParameter<BaseModelBuilderParams> {
+
+  /**
+   * Tells whether a sentence is acceptable for processing.
+   *
+   * @param sentence the sentence to check
+   * @return true if the sentence is valid
+   */
+  Boolean validSentence(String sentence);
+
+  /**
+   * Tells whether a named entity found by the name finder is acceptable.
+   *
+   * @param namedEntity the entity text to check
+   * @return true if the entity is valid
+   */
+  Boolean validNamedEntity(String namedEntity);
+
+  /**
+   * Returns the entities that this validator rejects.
+   *
+   * @return the blacklist entries
+   */
+  Collection<String> getBlackList();
+}
diff --git a/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/ModelParameter.java b/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/ModelParameter.java
new file mode 100644
index 0000000..136e775
--- /dev/null
+++ b/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/ModelParameter.java
@@ -0,0 +1,28 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.modelbuilder;
+
+import opennlp.addons.modelbuilder.impls.BaseModelBuilderParams;
+
+/**
+ * Implemented by components that are configured through a parameter object of type {@code T}.
+ */
+public interface ModelParameter<T extends BaseModelBuilderParams>{
+
+ void setParameters(T params); // supplies the parameter object the component should use
+
+
+}
diff --git a/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/Modelable.java b/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/Modelable.java
new file mode 100644
index 0000000..80b0170
--- /dev/null
+++ b/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/Modelable.java
@@ -0,0 +1,75 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.modelbuilder;
+
+import java.util.Set;
+import opennlp.addons.modelbuilder.impls.BaseModelBuilderParams;
+import opennlp.tools.namefind.TokenNameFinderModel;
+
+/**
+ * Creates and stores annotated sentences and builds the name finder model
+ * from them.
+ */
+public interface Modelable extends ModelParameter<BaseModelBuilderParams> {
+
+  /**
+   * Marks up a named entity inside a sentence.
+   *
+   * @param sentence the sentence that contains the entity
+   * @param namedEntity the entity text to mark up
+   * @param entityType the type of the entity, e.g. person
+   * @return the annotated sentence
+   */
+  String annotate(String sentence, String namedEntity, String entityType);
+
+  /** Writes the annotated sentences collected so far. */
+  void writeAnnotatedSentences();
+
+  /**
+   * @return the annotated sentences collected so far
+   */
+  Set<String> getAnnotatedSentences();
+
+  /**
+   * @param annotatedSentences the annotated sentences to use from now on
+   */
+  void setAnnotatedSentences(Set<String> annotatedSentences);
+
+  /**
+   * @param annotatedSentence an annotated sentence to add to the collection
+   */
+  void addAnnotatedSentence(String annotatedSentence);
+
+  /**
+   * Builds the name finder model.
+   *
+   * @param entityType the type of entity the model is built for
+   */
+  void buildModel(String entityType);
+
+  /**
+   * @return the name finder model
+   */
+  TokenNameFinderModel getModel();
+
+  /**
+   * Splits a sentence into the tokens that are passed to the name finder.
+   *
+   * @param sentence the sentence to tokenize
+   * @return the tokens of the sentence
+   */
+  String[] tokenizeSentenceToWords(String sentence);
+}
diff --git a/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/SemiSupervisedModelGenerator.java b/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/SemiSupervisedModelGenerator.java
new file mode 100644
index 0000000..c97a4c1
--- /dev/null
+++ b/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/SemiSupervisedModelGenerator.java
@@ -0,0 +1,28 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.modelbuilder;
+
+import opennlp.addons.modelbuilder.impls.BaseModelBuilderParams;
+
+/**
+ * Runs the model-building process from the supplied components;
+ * {@code iterations} is how many times the build loop is repeated.
+ */
+public interface SemiSupervisedModelGenerator extends ModelParameter<BaseModelBuilderParams> {
+
+ void build(SentenceProvider sentenceProvider, KnownEntityProvider knownEntityProvider,
+ ModelGenerationValidator validator, Modelable modelable, int iterations);
+}
diff --git a/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/SentenceProvider.java b/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/SentenceProvider.java
new file mode 100644
index 0000000..5610224
--- /dev/null
+++ b/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/SentenceProvider.java
@@ -0,0 +1,27 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.modelbuilder;
+
+import java.util.Set;
+import opennlp.addons.modelbuilder.impls.BaseModelBuilderParams;
+
+/**
+ * Supplies the sentences that the model-building process annotates and runs NER over.
+ */
+public interface SentenceProvider extends ModelParameter<BaseModelBuilderParams> {
+
+ Set<String> getSentences(); // the sentences to process
+}
diff --git a/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/impls/BaseModelBuilderParams.java b/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/impls/BaseModelBuilderParams.java
new file mode 100644
index 0000000..fcb2384
--- /dev/null
+++ b/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/impls/BaseModelBuilderParams.java
@@ -0,0 +1,90 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.modelbuilder.impls;
+
+import java.io.File;
+import java.util.Map;
+
+/**
+ *
+ * Parameter object that carries the file locations and settings through the processing
+ */
+public class BaseModelBuilderParams {
+
+ private File modelFile; // where the trained model is written and later read back from
+ private File sentenceFile; // input sentences, read line by line
+ private File knownEntitiesFile; // seed list of known entities, read line by line
+ private File knownEntityBlacklist; // entities to reject; the file validator treats null as "no blacklist"
+ private File annotatedTrainingDataFile; // annotated sentences are written here and read back as training data
+ private String entityType; // entity type label, for example person, location, organization
+ private Map<String, String> additionalParams; // extra settings; stays null unless a caller sets it
+
+ public File getModelFile() {
+ return modelFile;
+ }
+
+ public void setModelFile(File modelFile) {
+ this.modelFile = modelFile;
+ }
+
+ public File getSentenceFile() {
+ return sentenceFile;
+ }
+
+ public void setSentenceFile(File sentenceFile) {
+ this.sentenceFile = sentenceFile;
+ }
+
+ public File getKnownEntitiesFile() {
+ return knownEntitiesFile;
+ }
+
+ public void setKnownEntitiesFile(File knownEntitiesFile) {
+ this.knownEntitiesFile = knownEntitiesFile;
+ }
+
+ public File getKnownEntityBlacklist() {
+ return knownEntityBlacklist;
+ }
+
+ public void setKnownEntityBlacklist(File knownEntityBlacklist) {
+ this.knownEntityBlacklist = knownEntityBlacklist;
+ }
+
+ public Map<String, String> getAdditionalParams() {
+ return additionalParams;
+ }
+
+ public void setAdditionalParams(Map<String, String> additionalParams) {
+ this.additionalParams = additionalParams;
+ }
+
+ public String getEntityType() {
+ return entityType;
+ }
+
+ public void setEntityType(String entityType) {
+ this.entityType = entityType;
+ }
+
+ public File getAnnotatedTrainingDataFile() {
+ return annotatedTrainingDataFile;
+ }
+
+ public void setAnnotatedTrainingDataFile(File annotatedTrainingDataFile) {
+ this.annotatedTrainingDataFile = annotatedTrainingDataFile;
+ }
+}
\ No newline at end of file
diff --git a/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/impls/FileKnownEntityProvider.java b/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/impls/FileKnownEntityProvider.java
new file mode 100644
index 0000000..0de043c
--- /dev/null
+++ b/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/impls/FileKnownEntityProvider.java
@@ -0,0 +1,120 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.modelbuilder.impls;
+
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.Charset;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import opennlp.addons.modelbuilder.KnownEntityProvider;
+
+/**
+ * Provides known entities read from a UTF-8 text file, one entity per line.
+ * The file is the one returned by
+ * {@link BaseModelBuilderParams#getKnownEntitiesFile()}.
+ */
+public class FileKnownEntityProvider implements KnownEntityProvider {
+
+  Set<String> knownEntities = new HashSet<String>();
+  BaseModelBuilderParams params;
+  // true once a read of the entities file has been attempted, so that a
+  // missing or empty file is not re-read (and re-logged) on every call
+  private boolean loadAttempted;
+
+  /**
+   * Returns the known entities. While the set is still empty, the first call
+   * reads the entities file; every line is trimmed and blank lines are
+   * skipped. A failure to read the file is logged and the set is returned
+   * with whatever it contains.
+   *
+   * @return the live set of known entities
+   */
+  @Override
+  public Set<String> getKnownEntities() {
+    if (knownEntities.isEmpty() && !loadAttempted) {
+      loadAttempted = true;
+      readEntitiesFile();
+    }
+    return knownEntities;
+  }
+
+  /**
+   * Adds an entity to the set returned by {@link #getKnownEntities()}.
+   *
+   * @param unambiguousEntity the entity to add
+   */
+  @Override
+  public void addKnownEntity(String unambiguousEntity) {
+    knownEntities.add(unambiguousEntity);
+  }
+
+  /**
+   * @return the entity type configured in the parameters
+   */
+  @Override
+  public String getKnownEntitiesType() {
+    return params.getEntityType();
+  }
+
+  /**
+   * Stores the parameters and re-arms the one-time read of the entities file.
+   *
+   * @param params the parameters that name the entities file and entity type
+   */
+  @Override
+  public void setParameters(BaseModelBuilderParams params) {
+    this.params = params;
+    this.loadAttempted = false;
+  }
+
+  // Reads the entities file into knownEntities, closing the reader even when
+  // reading fails part way through.
+  private void readEntitiesFile() {
+    Logger log = Logger.getLogger(FileKnownEntityProvider.class.getName());
+    BufferedReader reader = null;
+    try {
+      InputStream in = new FileInputStream(params.getKnownEntitiesFile());
+      reader = new BufferedReader(new InputStreamReader(in, Charset.forName("UTF-8")));
+      String line;
+      while ((line = reader.readLine()) != null) {
+        String entity = line.trim();
+        // an empty entity would be "contained" in every sentence
+        if (entity.length() > 0) {
+          knownEntities.add(entity);
+        }
+      }
+    } catch (FileNotFoundException ex) {
+      log.log(Level.SEVERE, "Known entities file not found", ex);
+    } catch (IOException ex) {
+      log.log(Level.SEVERE, "Could not read known entities file", ex);
+    } finally {
+      if (reader != null) {
+        try {
+          reader.close();
+        } catch (IOException ex) {
+          log.log(Level.WARNING, "Could not close known entities file", ex);
+        }
+      }
+    }
+  }
+}
diff --git a/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/impls/FileModelValidatorImpl.java b/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/impls/FileModelValidatorImpl.java
new file mode 100644
index 0000000..ea4bb05
--- /dev/null
+++ b/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/impls/FileModelValidatorImpl.java
@@ -0,0 +1,135 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.modelbuilder.impls;
+
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.Charset;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.Set;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import opennlp.addons.modelbuilder.ModelGenerationValidator;
+
+/**
+ * Validates NER results against a blacklist file before they are included in
+ * the model. The blacklist is a UTF-8 text file with one entity per line and
+ * is matched case-insensitively.
+ */
+public class FileModelValidatorImpl implements ModelGenerationValidator {
+
+  private Set<String> badentities = new HashSet<String>();
+  BaseModelBuilderParams params;
+  // true once a read of the blacklist file has been attempted, so that a
+  // missing or empty file is not re-read on every validation
+  private boolean loadAttempted;
+
+  /**
+   * Stores the parameters and re-arms the one-time read of the blacklist.
+   *
+   * @param params the parameters that name the blacklist file
+   */
+  @Override
+  public void setParameters(BaseModelBuilderParams params) {
+    this.params = params;
+    this.loadAttempted = false;
+  }
+
+  /**
+   * Accepts every sentence, because the sentence provider is expected to
+   * return only valid sentences in this case.
+   *
+   * @param sentence the sentence to check (not inspected)
+   * @return always true
+   */
+  @Override
+  public Boolean validSentence(String sentence) {
+    return true;
+  }
+
+  /**
+   * Checks a named entity against the blacklist, ignoring case and
+   * surrounding whitespace.
+   *
+   * @param namedEntity the entity text to check
+   * @return false if the entity is blacklisted, true otherwise
+   */
+  @Override
+  public Boolean validNamedEntity(String namedEntity) {
+    return !getBlackList().contains(normalize(namedEntity));
+  }
+
+  /**
+   * Returns the blacklist, reading the blacklist file on first use. Entries
+   * are stored trimmed and in lower case; blank lines are skipped. When no
+   * blacklist file is configured the collection stays empty.
+   *
+   * @return the live collection of blacklisted entities
+   */
+  @Override
+  public Collection<String> getBlackList() {
+    if (params.getKnownEntityBlacklist() == null) {
+      return badentities;
+    }
+    // Read the file only while nothing has been loaded yet.
+    if (badentities.isEmpty() && !loadAttempted) {
+      loadAttempted = true;
+      readBlacklistFile();
+    }
+    return badentities;
+  }
+
+  // Lower-cases with a fixed locale so stored entries and lookups agree.
+  private static String normalize(String text) {
+    return text.trim().toLowerCase(Locale.ROOT);
+  }
+
+  // Reads the blacklist file into badentities, closing the reader even when
+  // reading fails part way through.
+  private void readBlacklistFile() {
+    Logger log = Logger.getLogger(FileModelValidatorImpl.class.getName());
+    BufferedReader reader = null;
+    try {
+      InputStream in = new FileInputStream(params.getKnownEntityBlacklist());
+      reader = new BufferedReader(new InputStreamReader(in, Charset.forName("UTF-8")));
+      String line;
+      while ((line = reader.readLine()) != null) {
+        String entry = normalize(line);
+        if (entry.length() > 0) {
+          badentities.add(entry);
+        }
+      }
+    } catch (FileNotFoundException ex) {
+      log.log(Level.SEVERE, "Blacklist file not found", ex);
+    } catch (IOException ex) {
+      log.log(Level.SEVERE, "Could not read blacklist file", ex);
+    } finally {
+      if (reader != null) {
+        try {
+          reader.close();
+        } catch (IOException ex) {
+          log.log(Level.WARNING, "Could not close blacklist file", ex);
+        }
+      }
+    }
+  }
+}
diff --git a/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/impls/FileSentenceProvider.java b/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/impls/FileSentenceProvider.java
new file mode 100644
index 0000000..bea55f5
--- /dev/null
+++ b/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/impls/FileSentenceProvider.java
@@ -0,0 +1,95 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.modelbuilder.impls;
+
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.Charset;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import opennlp.addons.modelbuilder.SentenceProvider;
+
+/**
+ * Provides user sentences via a simple UTF-8 text file, one sentence per line.
+ */
+public class FileSentenceProvider implements SentenceProvider {
+
+  BaseModelBuilderParams params;
+  Set<String> sentences = new HashSet<String>();
+  // true once a read of the sentence file has been attempted, so that a
+  // missing or empty file is not re-read (and re-logged) on every call
+  private boolean loadAttempted;
+
+  /**
+   * Returns the sentences. While the set is still empty, the first call reads
+   * the sentence file and adds every line unchanged. A failure to read the
+   * file is logged and the set is returned with whatever it contains.
+   *
+   * @return the live set of sentences
+   */
+  @Override
+  public Set<String> getSentences() {
+    if (sentences.isEmpty() && !loadAttempted) {
+      loadAttempted = true;
+      readSentenceFile();
+    }
+    return sentences;
+  }
+
+  /**
+   * Stores the parameters and re-arms the one-time read of the sentence file.
+   *
+   * @param params the parameters that name the sentence file
+   */
+  @Override
+  public void setParameters(BaseModelBuilderParams params) {
+    this.params = params;
+    this.loadAttempted = false;
+  }
+
+  // Reads the sentence file into sentences, closing the reader even when
+  // reading fails part way through.
+  private void readSentenceFile() {
+    Logger log = Logger.getLogger(FileSentenceProvider.class.getName());
+    BufferedReader reader = null;
+    try {
+      InputStream in = new FileInputStream(params.getSentenceFile());
+      reader = new BufferedReader(new InputStreamReader(in, Charset.forName("UTF-8")));
+      String line;
+      while ((line = reader.readLine()) != null) {
+        sentences.add(line);
+      }
+    } catch (FileNotFoundException ex) {
+      log.log(Level.SEVERE, "Sentence file not found", ex);
+    } catch (IOException ex) {
+      log.log(Level.SEVERE, "Could not read sentence file", ex);
+    } finally {
+      if (reader != null) {
+        try {
+          reader.close();
+        } catch (IOException ex) {
+          log.log(Level.WARNING, "Could not close sentence file", ex);
+        }
+      }
+    }
+  }
+}
diff --git a/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/impls/GenericModelGenerator.java b/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/impls/GenericModelGenerator.java
new file mode 100644
index 0000000..bbd23e1
--- /dev/null
+++ b/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/impls/GenericModelGenerator.java
@@ -0,0 +1,148 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.modelbuilder.impls;
+
+import java.util.HashMap;
+import java.util.Map;
+import opennlp.addons.modelbuilder.KnownEntityProvider;
+import opennlp.addons.modelbuilder.ModelGenerationValidator;
+import opennlp.addons.modelbuilder.Modelable;
+import opennlp.addons.modelbuilder.SemiSupervisedModelGenerator;
+import opennlp.addons.modelbuilder.SentenceProvider;
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.util.Span;
+
+/**
+ * Generic implementation of the iterative model-building loop. It works
+ * purely through the component interfaces passed to
+ * {@link #build(SentenceProvider, KnownEntityProvider, ModelGenerationValidator, Modelable, int)}
+ * and reports progress on standard output.
+ */
+public class GenericModelGenerator implements SemiSupervisedModelGenerator {
+
+  private Map<String, String> params = new HashMap<String, String>();
+
+  /**
+   * Takes over the additional parameters. A null map (the state of
+   * {@link BaseModelBuilderParams} until a caller sets one) is replaced by an
+   * empty map so the field is never null.
+   *
+   * @param params the parameters whose additional params are stored
+   */
+  @Override
+  public void setParameters(BaseModelBuilderParams params) {
+    Map<String, String> additional = params.getAdditionalParams();
+    this.params = additional != null ? additional : new HashMap<String, String>();
+  }
+
+  /**
+   * Runs the loop {@code iterations} times: annotate the sentences with the
+   * known entities, write the annotations and build a model, run that model
+   * over the sentences, and feed every valid hit back in as a known entity
+   * and a new annotation. After the last pass the annotations are written and
+   * the model is built once more. Returns early, with a message on standard
+   * output, when there are no sentences or no known entities.
+   *
+   * @param sentenceProvider supplies the sentences
+   * @param knownEntityProvider supplies, and receives, the known entities
+   * @param validator filters sentences and NER hits
+   * @param modelable stores the annotations and builds and loads the model
+   * @param iterations number of passes to run
+   * @throws IllegalStateException if the model built during a pass cannot be
+   *     loaded
+   */
+  @Override
+  public void build(SentenceProvider sentenceProvider, KnownEntityProvider knownEntityProvider,
+      ModelGenerationValidator validator, Modelable modelable, int iterations) {
+    for (int iteration = 0; iteration < iterations; iteration++) {
+      System.out.println("ITERATION: " + iteration);
+      System.out.println("\tPerfoming Known Entity Annotation");
+      System.out.println("\t\tknowns: " + knownEntityProvider.getKnownEntities().size());
+      System.out.println("\t\treading data....: ");
+      // Check the inputs before the annotation pass rather than after it, so
+      // no work is done when either input is empty.
+      if (sentenceProvider.getSentences().isEmpty()) {
+        System.out.println("No sentences in file");
+        return;
+      }
+      if (knownEntityProvider.getKnownEntities().isEmpty()) {
+        System.out.println("No known entities in file");
+        return;
+      }
+      annotateKnownEntities(sentenceProvider, knownEntityProvider, modelable);
+
+      System.out.println("\t\twriting annotated sentences....: ");
+      modelable.writeAnnotatedSentences();
+      System.out.println("\t\tbuilding model.... ");
+      modelable.buildModel(knownEntityProvider.getKnownEntitiesType());
+      System.out.println("\t\tmodel building complete.... ");
+      TokenNameFinderModel model = modelable.getModel();
+      if (model == null) {
+        // fail with a clear message instead of passing null on to NameFinderME
+        throw new IllegalStateException(
+            "The name finder model could not be loaded in iteration " + iteration);
+      }
+      NameFinderME nameFinder = new NameFinderME(model);
+      System.out.println("\t\tannotated sentences: " + modelable.getAnnotatedSentences().size());
+      System.out.println("\tPerforming NER with new model");
+      System.out.println("\t\tPrinting NER Results. Add undesired results to the blacklist file and start over");
+      harvestNamedEntities(nameFinder, sentenceProvider, knownEntityProvider, validator, modelable);
+      System.out.println("\t\tannotated sentences: " + modelable.getAnnotatedSentences().size());
+      System.out.println("\t\tknowns: " + knownEntityProvider.getKnownEntities().size());
+    }
+    modelable.writeAnnotatedSentences();
+    modelable.buildModel(knownEntityProvider.getKnownEntitiesType());
+  }
+
+  // Annotates every sentence that contains a known entity; each matching
+  // entity is annotated and added separately.
+  private void annotateKnownEntities(SentenceProvider sentenceProvider,
+      KnownEntityProvider knownEntityProvider, Modelable modelable) {
+    for (String sentence : sentenceProvider.getSentences()) {
+      for (String knownEntity : knownEntityProvider.getKnownEntities()) {
+        if (sentence.contains(knownEntity)) {
+          modelable.addAnnotatedSentence(
+              modelable.annotate(sentence, knownEntity, knownEntityProvider.getKnownEntitiesType()));
+        }
+      }
+    }
+  }
+
+  // Runs the name finder over every valid sentence; each hit that passes
+  // validation becomes a known entity and a new annotated sentence.
+  private void harvestNamedEntities(NameFinderME nameFinder, SentenceProvider sentenceProvider,
+      KnownEntityProvider knownEntityProvider, ModelGenerationValidator validator, Modelable modelable) {
+    for (String sentence : sentenceProvider.getSentences()) {
+      if (!validator.validSentence(sentence)) {
+        continue;
+      }
+      String[] tokens = modelable.tokenizeSentenceToWords(sentence);
+      Span[] spans = nameFinder.find(tokens);
+      nameFinder.clearAdaptiveData();
+      for (String namedEntity : Span.spansToStrings(spans, tokens)) {
+        System.out.println("\t\t" + namedEntity);
+        if (validator.validNamedEntity(namedEntity)) {
+          knownEntityProvider.addKnownEntity(namedEntity);
+          modelable.addAnnotatedSentence(
+              modelable.annotate(sentence, namedEntity, knownEntityProvider.getKnownEntitiesType()));
+        } else {
+          System.out.println("\t\t" + namedEntity + "...already blacklisted");
+        }
+      }
+    }
+  }
+}
diff --git a/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/impls/GenericModelableImpl.java b/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/impls/GenericModelableImpl.java
new file mode 100644
index 0000000..572e84b
--- /dev/null
+++ b/modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/impls/GenericModelableImpl.java
@@ -0,0 +1,195 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.modelbuilder.impls;
+
+import java.io.BufferedOutputStream;
+import java.io.Closeable;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.nio.charset.Charset;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import opennlp.addons.modelbuilder.Modelable;
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.namefind.NameSampleDataStream;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+
+/**
+ * Creates annotations, writes annotations to file, and creates a model and
+ * writes it to a file. The annotation file is written and read as UTF-8.
+ */
+public class GenericModelableImpl implements Modelable {
+
+  private static final Logger LOG = Logger.getLogger(GenericModelableImpl.class.getName());
+  private static final Charset UTF8 = Charset.forName("UTF-8");
+
+  private Set<String> annotatedSentences = new HashSet<String>();
+  BaseModelBuilderParams params;
+
+  /** Stores the parameters that name the annotation file and the model file. */
+  @Override
+  public void setParameters(BaseModelBuilderParams params) {
+    this.params = params;
+  }
+
+  /**
+   * Wraps every occurrence of the entity in the sentence in
+   * {@code <START:type> ... <END>} tags.
+   *
+   * @param sentence the sentence to annotate
+   * @param namedEntity the entity text to mark up
+   * @param entityType the type written into the start tag
+   * @return the annotated sentence, or the sentence unchanged when the entity
+   *     is null or empty
+   */
+  @Override
+  public String annotate(String sentence, String namedEntity, String entityType) {
+    if (namedEntity == null || namedEntity.length() == 0) {
+      // replacing an empty target would insert tags between every character
+      return sentence;
+    }
+    return sentence.replace(namedEntity, " <START:" + entityType + "> " + namedEntity + " <END> ");
+  }
+
+  /**
+   * Overwrites the annotated training data file with the current annotated
+   * sentences, one per line, encoded as UTF-8 to match the charset
+   * {@link #buildModel(String)} reads the file with. Line breaks inside a
+   * sentence are replaced by spaces. An I/O failure is logged.
+   */
+  @Override
+  public void writeAnnotatedSentences() {
+    Writer writer = null;
+    try {
+      writer = new OutputStreamWriter(
+          new FileOutputStream(params.getAnnotatedTrainingDataFile(), false), UTF8);
+      for (String s : annotatedSentences) {
+        writer.write(s.replace("\n", " ").trim() + "\n");
+      }
+      writer.flush();
+    } catch (IOException ex) {
+      LOG.log(Level.SEVERE, "Could not write annotated sentences", ex);
+    } finally {
+      closeQuietly(writer);
+    }
+  }
+
+  /** Returns the live set of annotated sentences. */
+  @Override
+  public Set<String> getAnnotatedSentences() {
+    return annotatedSentences;
+  }
+
+  /** Replaces the set of annotated sentences. */
+  @Override
+  public void setAnnotatedSentences(Set<String> annotatedSentences) {
+    this.annotatedSentences = annotatedSentences;
+  }
+
+  /** Adds one annotated sentence to the set. */
+  @Override
+  public void addAnnotatedSentence(String annotatedSentence) {
+    annotatedSentences.add(annotatedSentence);
+  }
+
+  /**
+   * Trains a name finder model from the annotated training data file and
+   * writes it to the model file. Any failure is logged rather than silently
+   * dropped, and the streams are closed in every case.
+   *
+   * @param entityType the entity type passed to the trainer
+   */
+  @Override
+  public void buildModel(String entityType) {
+    ObjectStream<NameSample> sampleStream = null;
+    OutputStream modelOut = null;
+    try {
+      System.out.println("\tBuilding Model using " + annotatedSentences.size() + " annotations");
+      System.out.println("\t\treading training data...");
+      ObjectStream<String> lineStream =
+          new PlainTextByLineStream(new FileInputStream(params.getAnnotatedTrainingDataFile()), UTF8);
+      sampleStream = new NameSampleDataStream(lineStream);
+      TokenNameFinderModel model = NameFinderME.train("en", entityType, sampleStream, null);
+      modelOut = new BufferedOutputStream(new FileOutputStream(params.getModelFile()));
+      model.serialize(modelOut);
+      modelOut.flush();
+      System.out.println("\tmodel generated");
+    } catch (Exception e) {
+      LOG.log(Level.SEVERE, "Could not build the model for entity type " + entityType, e);
+    } finally {
+      if (sampleStream != null) {
+        try {
+          sampleStream.close();
+        } catch (IOException ex) {
+          LOG.log(Level.WARNING, "Could not close the training data stream", ex);
+        }
+      }
+      closeQuietly(modelOut);
+    }
+  }
+
+  /**
+   * Loads the model from the model file and closes the file afterwards.
+   *
+   * @return the model, or null when it cannot be read (the failure is logged)
+   */
+  @Override
+  public TokenNameFinderModel getModel() {
+    InputStream modelIn = null;
+    try {
+      modelIn = new FileInputStream(params.getModelFile());
+      return new TokenNameFinderModel(modelIn);
+    } catch (IOException ex) {
+      LOG.log(Level.SEVERE, null, ex);
+      return null;
+    } finally {
+      closeQuietly(modelIn);
+    }
+  }
+
+  /**
+   * Splits the sentence on single space characters.
+   *
+   * @param sentence the sentence to split
+   * @return the resulting tokens
+   */
+  @Override
+  public String[] tokenizeSentenceToWords(String sentence) {
+    return sentence.split(" ");
+  }
+
+  // Closes a stream that is no longer needed; a failure to close is only logged.
+  private static void closeQuietly(Closeable resource) {
+    if (resource != null) {
+      try {
+        resource.close();
+      } catch (IOException ex) {
+        LOG.log(Level.WARNING, "Could not close stream", ex);
+      }
+    }
+  }
+}
diff --git a/modelbuilder-addon/src/test/java/modelbuilder/AppTest.java b/modelbuilder-addon/src/test/java/modelbuilder/AppTest.java
new file mode 100644
index 0000000..2b04731
--- /dev/null
+++ b/modelbuilder-addon/src/test/java/modelbuilder/AppTest.java
@@ -0,0 +1,38 @@
+package modelbuilder;
+
+import junit.framework.Test;
+import junit.framework.TestCase;
+import junit.framework.TestSuite;
+
+/**
+ * Placeholder JUnit 3 test case; it does not exercise any add-on code yet.
+ */
+public class AppTest
+ extends TestCase
+{
+ /**
+ * Create the test case
+ *
+ * @param testName name of the test case
+ */
+ public AppTest( String testName )
+ {
+ super( testName );
+ }
+
+ /**
+ * @return the suite of tests being tested
+ */
+ public static Test suite()
+ {
+ return new TestSuite( AppTest.class );
+ }
+
+ /**
+ * Placeholder test that always passes.
+ */
+ public void testApp()
+ {
+ assertTrue( true );
+ }
+}