OPENNLP-607
Renamed packages for consistency in addons; also made the framework generic, with file-based implementations.
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/DefaultModelBuilderUtil.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/DefaultModelBuilderUtil.java
deleted file mode 100644
index 7ed4fbd..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/DefaultModelBuilderUtil.java
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.modelbuilder;
-
-import java.io.File;
-import opennlp.modelbuilder.impls.BaseModelBuilderParams;
-import opennlp.modelbuilder.impls.FileKnownEntityProvider;
-import opennlp.modelbuilder.impls.FileModelValidatorImpl;
-import opennlp.modelbuilder.impls.FileSentenceProvider;
-import opennlp.modelbuilder.impls.GenericModelGenerator;
-import opennlp.modelbuilder.impls.GenericModelableImpl;
-
-/**
- *
- * Utilizes the filebased implementations to produce an NER model from user
- * The basic processing is such
- * read in the list of known entities
- * annotate the sentences based on the list of known entities
- * create a model from the annotations
- * perform NER with the model on the sentences
- * add the NER results to the annotations
- * rebuild the model
- * loop
- * defined data
- */
-public class DefaultModelBuilderUtil {
-
- /**
- *
- * @param sentences a file that contains one sentence per line.
- * There should be at least 15K sentences
- * consisting of a representative sample from
- * user data
- * @param knownEntities a file consisting of a simple list of
- * unambiguous entities, one entry per line.
- * For instance, if one was trying to build a
- * person NER model then this file would be a
- * list of person names that are unambiguous
- * and are known to exist in the sentences
- * file
- * @param knownEntitiesBlacklist This file contains a list of known bad hits
- * that the NER phase of this processing might
- * catch early one before the model iterates
- * to maturity
- * @param modelOutFile the location where the model will be
- * written to
- * @param annotatedSentenceOutFile where the annotated sentences produced by
- * this process will be written to
- * @param namedEntityType the type of entity... for example, person,
- * location, organization...
- * @param iterations how many times to repeat the iterative loop
- * of annotation, model generation, and NER
- */
- public static void generateModel(File sentences, File knownEntities, File knownEntitiesBlacklist,
- File modelOutFile, File annotatedSentenceOutFile, String namedEntityType, int iterations) {
- SemiSupervisedModelGenerator modelGenerator = new GenericModelGenerator();
- BaseModelBuilderParams params = new BaseModelBuilderParams();
- params.setAnnotatedTrainingDataFile(annotatedSentenceOutFile);
- params.setSentenceFile(sentences);
- params.setEntityType(namedEntityType);
- params.setKnownEntitiesFile(knownEntities);
- params.setModelFile(modelOutFile);
- params.setKnownEntityBlacklist(knownEntitiesBlacklist);
- /**
- * sentence providers feed this process with user data derived sentences
- * this impl just reads line by line through a file
- */
- SentenceProvider sentenceProvider = new FileSentenceProvider();
- sentenceProvider.setParameters(params);
- /**
- * KnownEntityProviders provide a seed list of known entities... such as
- * Barack Obama for person, or Germany for location obviously these would
- * want to be prolific, non ambiguous names
- */
- KnownEntityProvider knownEntityProvider = new FileKnownEntityProvider();
- knownEntityProvider.setParameters(params);
- /**
- * ModelGenerationValidators try to weed out bad hits by the iterations of
- * the name finder. Since this is a recursive process, with each iteration
- * the namefinder will get more and more greedy if bad entities are allowed
- * in this provides a mechanism for throwing out obviously bad hits. A good
- * impl may be to make sure a location is actually within a noun phrase
- * etc...users can make this as specific as they need for their dat and
- * their use case
- */
- ModelGenerationValidator validator = new FileModelValidatorImpl();
- validator.setParameters(params);
- /**
- * Modelable's write and read the annotated sentences, as well as create and
- * write the NER models
- */
- Modelable modelable = new GenericModelableImpl();
- modelable.setParameters(params);
-
- /**
- * the modelGenerator actually runs the process with a set number of
- * iterations... could be better by actually calculating the diff between
- * runs and stopping based on a thresh, but for extrememly large sentence
- * sets this may be too much.
- */
- modelGenerator.build(sentenceProvider, knownEntityProvider, validator, modelable, iterations);
-
- }
-}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/KnownEntityProvider.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/KnownEntityProvider.java
deleted file mode 100644
index 5f54ffe..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/KnownEntityProvider.java
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.modelbuilder;
-
-import java.util.Set;
-
-
-
-/**
- *
-Supplies a list of known entities (a list of names or locations)
- */
-public interface KnownEntityProvider extends ModelParameter{
- /**
- * returns a list of known non ambiguous entities.
- * @return a set of entities
- */
- Set<String> getKnownEntities();
-/**
- * adds to the set of known entities. Overriding classes should hold this list in a class level set.
- * @param unambiguousEntity
- */
- void addKnownEntity(String unambiguousEntity);
-/**
- * defines the type of entity that the set contains, ie person, location, organization.
- * @return
- */
- String getKnownEntitiesType();
-
-
-
-}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/ModelGenerationValidator.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/ModelGenerationValidator.java
deleted file mode 100644
index 73bb515..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/ModelGenerationValidator.java
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.modelbuilder;
-
-import java.util.Collection;
-
-/**
- *
-Validates results from the iterative namefinding
- */
-public interface ModelGenerationValidator extends ModelParameter {
-
- Boolean validSentence(String sentence);
-
- Boolean validNamedEntity(String namedEntity);
-
-
-
- Collection<String> getBlackList();
-}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/ModelParameter.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/ModelParameter.java
deleted file mode 100644
index 3c991ab..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/ModelParameter.java
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.modelbuilder;
-
-import opennlp.modelbuilder.impls.BaseModelBuilderParams;
-
-/**
- *
- */
-public interface ModelParameter<T extends BaseModelBuilderParams>{
-
- void setParameters(T params);
-
-
-}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/Modelable.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/Modelable.java
deleted file mode 100644
index 4e3ed0c..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/Modelable.java
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.modelbuilder;
-
-import java.util.Set;
-import opennlp.tools.namefind.TokenNameFinderModel;
-
-/**
- *
- */
-public interface Modelable extends ModelParameter{
-
-
-
- String annotate(String sentence, String namedEntity, String entityType);
-
- void writeAnnotatedSentences();
-
- Set<String> getAnnotatedSentences();
-
- void setAnnotatedSentences(Set<String> annotatedSentences);
-
- void addAnnotatedSentence(String annotatedSentence);
-
- void buildModel( String entityType);
-
- TokenNameFinderModel getModel();
-
- String[] tokenizeSentenceToWords(String sentence);
-
-
-}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/SemiSupervisedModelGenerator.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/SemiSupervisedModelGenerator.java
deleted file mode 100644
index cc2f043..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/SemiSupervisedModelGenerator.java
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.modelbuilder;
-
-import opennlp.modelbuilder.impls.BaseModelBuilderParams;
-
-/**
- *
-
- */
-public interface SemiSupervisedModelGenerator extends ModelParameter<BaseModelBuilderParams> {
-
- void build(SentenceProvider sentenceProvider, KnownEntityProvider knownEntityProvider,
- ModelGenerationValidator validator, Modelable modelable, int iterations);
-}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/SentenceProvider.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/SentenceProvider.java
deleted file mode 100644
index acf3b09..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/SentenceProvider.java
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.modelbuilder;
-
-import java.util.Set;
-import opennlp.modelbuilder.impls.BaseModelBuilderParams;
-
-/**
- *
- */
-public interface SentenceProvider extends ModelParameter<BaseModelBuilderParams> {
-
- Set<String> getSentences();
-}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/BaseModelBuilderParams.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/BaseModelBuilderParams.java
deleted file mode 100644
index e5c1267..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/BaseModelBuilderParams.java
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.modelbuilder.impls;
-
-import java.io.File;
-import java.util.Map;
-
-/**
- *
- * Used to pass params through the processing
- */
-public class BaseModelBuilderParams {
-
- private File modelFile;
- private File sentenceFile;
- private File knownEntitiesFile;
- private File knownEntityBlacklist;
- private File annotatedTrainingDataFile;
- private String entityType;
- private Map<String, String> additionalParams;
-
- public File getModelFile() {
- return modelFile;
- }
-
- public void setModelFile(File modelFile) {
- this.modelFile = modelFile;
- }
-
- public File getSentenceFile() {
- return sentenceFile;
- }
-
- public void setSentenceFile(File sentenceFile) {
- this.sentenceFile = sentenceFile;
- }
-
- public File getKnownEntitiesFile() {
- return knownEntitiesFile;
- }
-
- public void setKnownEntitiesFile(File knownEntitiesFile) {
- this.knownEntitiesFile = knownEntitiesFile;
- }
-
- public File getKnownEntityBlacklist() {
- return knownEntityBlacklist;
- }
-
- public void setKnownEntityBlacklist(File knownEntityBlacklist) {
- this.knownEntityBlacklist = knownEntityBlacklist;
- }
-
- public Map<String, String> getAdditionalParams() {
- return additionalParams;
- }
-
- public void setAdditionalParams(Map<String, String> additionalParams) {
- this.additionalParams = additionalParams;
- }
-
- public String getEntityType() {
- return entityType;
- }
-
- public void setEntityType(String entityType) {
- this.entityType = entityType;
- }
-
- public File getAnnotatedTrainingDataFile() {
- return annotatedTrainingDataFile;
- }
-
- public void setAnnotatedTrainingDataFile(File annotatedTrainingDataFile) {
- this.annotatedTrainingDataFile = annotatedTrainingDataFile;
- }
-}
\ No newline at end of file
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileKnownEntityProvider.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileKnownEntityProvider.java
deleted file mode 100644
index 0ebf565..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileKnownEntityProvider.java
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.modelbuilder.impls;
-
-import java.io.BufferedReader;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.nio.charset.Charset;
-import java.util.HashSet;
-import java.util.Set;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-import opennlp.modelbuilder.KnownEntityProvider;
-
-/**
- *
- */
-public class FileKnownEntityProvider implements KnownEntityProvider {
-
- Set<String> knownEntities = new HashSet<String>();
- BaseModelBuilderParams params;
- @Override
- public Set<String> getKnownEntities() {
- if (knownEntities.isEmpty()) {
- try {
- InputStream fis;
- BufferedReader br;
- String line;
-
- fis = new FileInputStream(params.getKnownEntitiesFile());
- br = new BufferedReader(new InputStreamReader(fis, Charset.forName("UTF-8")));
- while ((line = br.readLine()) != null) {
- knownEntities.add(line);
- }
-
- // Done with the file
- br.close();
- br = null;
- fis = null;
- } catch (FileNotFoundException ex) {
- Logger.getLogger(FileKnownEntityProvider.class.getName()).log(Level.SEVERE, null, ex);
- } catch (IOException ex) {
- Logger.getLogger(FileKnownEntityProvider.class.getName()).log(Level.SEVERE, null, ex);
- }
- }
- return knownEntities;
- }
-
- @Override
- public void addKnownEntity(String unambiguousEntity) {
- knownEntities.add(unambiguousEntity);
- }
-
- @Override
- public String getKnownEntitiesType() {
-
- return params.getEntityType();
- }
-
-
-
- @Override
- public void setParameters(BaseModelBuilderParams params) {
- this.params = params;
- }
-}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileModelValidatorImpl.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileModelValidatorImpl.java
deleted file mode 100644
index e531900..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileModelValidatorImpl.java
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.modelbuilder.impls;
-
-import java.io.BufferedReader;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.nio.charset.Charset;
-import java.util.Collection;
-import java.util.HashSet;
-import java.util.Set;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-import opennlp.modelbuilder.ModelGenerationValidator;
-
-/**
- *Validates NER results input before inclusion into the model
- */
-public class FileModelValidatorImpl implements ModelGenerationValidator {
-
- private Set<String> badentities = new HashSet<String>();
- BaseModelBuilderParams params;
-
- @Override
- public void setParameters(BaseModelBuilderParams params) {
- this.params = params;
- }
-
- @Override
- public Boolean validSentence(String sentence) {
- //returning true by default, because the sentence provider will return only "valid" sentences in this case
- return true;
- }
-
- @Override
- public Boolean validNamedEntity(String namedEntity) {
-
- if (badentities.isEmpty()) {
- getBlackList();
- }
-//
-// Pattern p = Pattern.compile("[0-9]", Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);
-// if (p.matcher(namedEntity).find()) {
-// return false;
-// }
- Boolean b = true;
- if (badentities.contains(namedEntity.toLowerCase())) {
- b = false;
- }
- return b;
- }
-
- @Override
- public Collection<String> getBlackList() {
- if (params.getKnownEntityBlacklist() == null) {
- return badentities;
- }
- if (!badentities.isEmpty()) {
- try {
- InputStream fis;
- BufferedReader br;
- String line;
-
- fis = new FileInputStream(params.getKnownEntityBlacklist());
- br = new BufferedReader(new InputStreamReader(fis, Charset.forName("UTF-8")));
- while ((line = br.readLine()) != null) {
- badentities.add(line);
- }
- br.close();
- br = null;
- fis = null;
- } catch (FileNotFoundException ex) {
- Logger.getLogger(FileKnownEntityProvider.class.getName()).log(Level.SEVERE, null, ex);
- } catch (IOException ex) {
- Logger.getLogger(FileKnownEntityProvider.class.getName()).log(Level.SEVERE, null, ex);
- }
- }
- return badentities;
- }
-}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileSentenceProvider.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileSentenceProvider.java
deleted file mode 100644
index 3479e0c..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileSentenceProvider.java
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.modelbuilder.impls;
-
-import java.io.BufferedReader;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.nio.charset.Charset;
-import java.util.HashSet;
-import java.util.Set;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-import opennlp.modelbuilder.SentenceProvider;
-
-/**
- * Provides user sentences via a simple text file
- */
-public class FileSentenceProvider implements SentenceProvider {
-
- BaseModelBuilderParams params ;
- Set<String> sentences = new HashSet<String>();
-
- public Set<String> getSentences() {
- if (sentences.isEmpty()) {
- try {
- InputStream fis;
- BufferedReader br;
- String line;
-
- fis = new FileInputStream(params.getSentenceFile());
- br = new BufferedReader(new InputStreamReader(fis, Charset.forName("UTF-8")));
- int i=0;
- while ((line = br.readLine()) != null) {
-
- sentences.add(line);
- }
-
- // Done with the file
- br.close();
- br = null;
- fis = null;
- } catch (FileNotFoundException ex) {
- Logger.getLogger(FileKnownEntityProvider.class.getName()).log(Level.SEVERE, null, ex);
- } catch (IOException ex) {
- Logger.getLogger(FileKnownEntityProvider.class.getName()).log(Level.SEVERE, null, ex);
- }
- }
- return sentences;
- }
-
- public void setParameters(BaseModelBuilderParams params) {
- this.params = params;
- }
-}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/GenericModelGenerator.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/GenericModelGenerator.java
deleted file mode 100644
index 468e130..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/GenericModelGenerator.java
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.modelbuilder.impls;
-
-import java.util.HashMap;
-import java.util.Map;
-import opennlp.modelbuilder.KnownEntityProvider;
-import opennlp.modelbuilder.ModelGenerationValidator;
-import opennlp.modelbuilder.Modelable;
-import opennlp.modelbuilder.SemiSupervisedModelGenerator;
-import opennlp.modelbuilder.SentenceProvider;
-import opennlp.tools.namefind.NameFinderME;
-import opennlp.tools.util.Span;
-
-/**
- *
- * Generic impl that handles all processing using the default file implementations
- */
-public class GenericModelGenerator implements SemiSupervisedModelGenerator {
-
- private Map<String, String> params = new HashMap<String, String>();
-
- @Override
- public void setParameters(BaseModelBuilderParams params) {
- this.params = params.getAdditionalParams();
- }
-
- @Override
- public void build(SentenceProvider sentenceProvider, KnownEntityProvider knownEntityProvider,
- ModelGenerationValidator validator, Modelable modelable, int iterations) {
- for (int iteration = 0; iteration < iterations; iteration++) {
- System.out.println("ITERATION: " + iteration);
- System.out.println("\tPerfoming Known Entity Annotation");
- System.out.println("\t\tknowns: " + knownEntityProvider.getKnownEntities().size());
- System.out.println("\t\treading data....: ");
- for (String sentence : sentenceProvider.getSentences()) {
- for (String knownEntity : knownEntityProvider.getKnownEntities()) {
- if (sentence.contains(knownEntity)) {
- //if the same sentence has multiple hits should they be annotated separately?
- modelable.addAnnotatedSentence(modelable.annotate(sentence, knownEntity, knownEntityProvider.getKnownEntitiesType()));
- }
- }
- }
- if (sentenceProvider.getSentences().isEmpty()) {
- System.out.println("No sentences in file");
- return;
- }
- if (knownEntityProvider.getKnownEntities().isEmpty()) {
- System.out.println("No known entities in file");
- return;
- }
- System.out.println("\t\twriting annotated sentences....: ");
- modelable.writeAnnotatedSentences();
- System.out.println("\t\tbuilding model.... ");
- modelable.buildModel(knownEntityProvider.getKnownEntitiesType());
- System.out.println("\t\tmodel building complete.... ");
- NameFinderME nf = new NameFinderME(modelable.getModel());
- System.out.println("\t\tannotated sentences: " + modelable.getAnnotatedSentences().size());
- System.out.println("\tPerforming NER with new model");
- System.out.println("\t\tPrinting NER Results. Add undesired results to the blacklist file and start over");
- for (String sentence : sentenceProvider.getSentences()) {
- if (!validator.validSentence(sentence)) {
- continue;
- }
- String[] tokens = modelable.tokenizeSentenceToWords(sentence);
-
- Span[] find = nf.find(tokens);
- nf.clearAdaptiveData();
-
- String[] namedEntities = Span.spansToStrings(find, tokens);
-
- for (String namedEntity : namedEntities) {
- System.out.println("\t\t" + namedEntity);
- if (validator.validNamedEntity(namedEntity)) {
-
- knownEntityProvider.addKnownEntity(namedEntity);
- modelable.addAnnotatedSentence(modelable.annotate(sentence, namedEntity, knownEntityProvider.getKnownEntitiesType()));
-
- } else {
- System.out.println("\t\t" + namedEntity + "...already blacklisted");
- }
- }
- }
- System.out.println("\t\tannotated sentences: " + modelable.getAnnotatedSentences().size());
- System.out.println("\t\tknowns: " + knownEntityProvider.getKnownEntities().size());
- }
- modelable.writeAnnotatedSentences();
- modelable.buildModel(knownEntityProvider.getKnownEntitiesType());
- }
-}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/GenericModelableImpl.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/GenericModelableImpl.java
deleted file mode 100644
index cfe4124..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/GenericModelableImpl.java
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.modelbuilder.impls;
-
-import java.io.BufferedOutputStream;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.io.OutputStream;
-import java.nio.charset.Charset;
-import java.util.HashSet;
-import java.util.Set;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-import opennlp.modelbuilder.Modelable;
-import opennlp.tools.namefind.NameFinderME;
-import opennlp.tools.namefind.NameSample;
-import opennlp.tools.namefind.NameSampleDataStream;
-import opennlp.tools.namefind.TokenNameFinderModel;
-import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.PlainTextByLineStream;
-
-/**
- * Creates annotations, writes annotations to file, and creates a model and writes to a file
- */
-public class GenericModelableImpl implements Modelable {
-
- private Set<String> annotatedSentences = new HashSet<String>();
- BaseModelBuilderParams params;
-
- @Override
- public void setParameters(BaseModelBuilderParams params) {
- this.params = params;
- }
-
- @Override
- public String annotate(String sentence, String namedEntity, String entityType) {
- String annotation = sentence.replace(namedEntity, " <START:" + entityType + "> " + namedEntity + " <END> ");
- return annotation;
- }
-
- @Override
- public void writeAnnotatedSentences() {
- try {
-
- FileWriter writer = new FileWriter(params.getAnnotatedTrainingDataFile(), false);
-
- for (String s : annotatedSentences) {
- writer.write(s.replace("\n", " ").trim() + "\n");
- }
- writer.close();
- } catch (IOException ex) {
- ex.printStackTrace();
- }
- }
-
- @Override
- public Set<String> getAnnotatedSentences() {
- return annotatedSentences;
- }
-
- @Override
- public void setAnnotatedSentences(Set<String> annotatedSentences) {
- this.annotatedSentences = annotatedSentences;
- }
-
- @Override
- public void addAnnotatedSentence(String annotatedSentence) {
- annotatedSentences.add(annotatedSentence);
- }
-
- @Override
- public void buildModel(String entityType) {
- try {
- System.out.println("\tBuilding Model using " + annotatedSentences.size() + " annotations");
- System.out.println("\t\treading training data...");
- Charset charset = Charset.forName("UTF-8");
- ObjectStream<String> lineStream =
- new PlainTextByLineStream(new FileInputStream(params.getAnnotatedTrainingDataFile()), charset);
- ObjectStream<NameSample> sampleStream = new NameSampleDataStream(lineStream);
-
- TokenNameFinderModel model;
- model = NameFinderME.train("en", entityType, sampleStream, null);
- sampleStream.close();
- OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(params.getModelFile()));
- model.serialize(modelOut);
- if (modelOut != null) {
- modelOut.close();
- }
- System.out.println("\tmodel generated");
- } catch (Exception e) {
- }
- }
-
- @Override
- public TokenNameFinderModel getModel() {
-
-
- TokenNameFinderModel nerModel = null;
- try {
- nerModel = new TokenNameFinderModel(new FileInputStream(params.getModelFile()));
- } catch (IOException ex) {
- Logger.getLogger(GenericModelableImpl.class.getName()).log(Level.SEVERE, null, ex);
- }
- return nerModel;
- }
-
- @Override
- public String[] tokenizeSentenceToWords(String sentence) {
- return sentence.split(" ");
-
- }
-}