OPENNLP-607
Renamed packages for consistency in addons; also made the framework generic, with file-based implementations.
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/DefaultModelBuilderUtil.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/DefaultModelBuilderUtil.java
deleted file mode 100644
index 7ed4fbd..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/DefaultModelBuilderUtil.java
+++ /dev/null
@@ -1,117 +0,0 @@
-/*

- * Copyright 2013 The Apache Software Foundation.

- *

- * Licensed under the Apache License, Version 2.0 (the "License");

- * you may not use this file except in compliance with the License.

- * You may obtain a copy of the License at

- *

- *      http://www.apache.org/licenses/LICENSE-2.0

- *

- * Unless required by applicable law or agreed to in writing, software

- * distributed under the License is distributed on an "AS IS" BASIS,

- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

- * See the License for the specific language governing permissions and

- * limitations under the License.

- */

-package opennlp.modelbuilder;

-

-import java.io.File;

-import opennlp.modelbuilder.impls.BaseModelBuilderParams;

-import opennlp.modelbuilder.impls.FileKnownEntityProvider;

-import opennlp.modelbuilder.impls.FileModelValidatorImpl;

-import opennlp.modelbuilder.impls.FileSentenceProvider;

-import opennlp.modelbuilder.impls.GenericModelGenerator;

-import opennlp.modelbuilder.impls.GenericModelableImpl;

-

-/**

- *

- * Utilizes the filebased implementations to produce an NER model from user

- * The basic processing is such

- * read in the list of known entities

- * annotate the sentences based on the list of known entities

- * create a model from the annotations

- * perform NER with the model on the sentences

- * add the NER results to the annotations

- * rebuild the model

- * loop

- * defined data

- */

-public class DefaultModelBuilderUtil {

-

-  /**

-   *

-   * @param sentences                a file that contains one sentence per line.

-   *                                 There should be at least 15K sentences

-   *                                 consisting of a representative sample from

-   *                                 user data

-   * @param knownEntities            a file consisting of a simple list of

-   *                                 unambiguous entities, one entry per line.

-   *                                 For instance, if one was trying to build a

-   *                                 person NER model then this file would be a

-   *                                 list of person names that are unambiguous

-   *                                 and are known to exist in the sentences

-   *                                 file

-   * @param knownEntitiesBlacklist   This file contains a list of known bad hits

-   *                                 that the NER phase of this processing might

-   *                                 catch early one before the model iterates

-   *                                 to maturity

-   * @param modelOutFile             the location where the model will be

-   *                                 written to

-   * @param annotatedSentenceOutFile where the annotated sentences produced by

-   *                                 this process will be written to

-   * @param namedEntityType          the type of entity... for example, person,

-   *                                 location, organization...

-   * @param iterations               how many times to repeat the iterative loop

-   *                                 of annotation, model generation, and NER

-   */

-  public static void generateModel(File sentences, File knownEntities, File knownEntitiesBlacklist,

-          File modelOutFile, File annotatedSentenceOutFile, String namedEntityType, int iterations) {

-    SemiSupervisedModelGenerator modelGenerator = new GenericModelGenerator();

-    BaseModelBuilderParams params = new BaseModelBuilderParams();

-    params.setAnnotatedTrainingDataFile(annotatedSentenceOutFile);

-    params.setSentenceFile(sentences);

-    params.setEntityType(namedEntityType);

-    params.setKnownEntitiesFile(knownEntities);

-    params.setModelFile(modelOutFile);

-    params.setKnownEntityBlacklist(knownEntitiesBlacklist);

-    /**

-     * sentence providers feed this process with user data derived sentences

-     * this impl just reads line by line through a file

-     */

-    SentenceProvider sentenceProvider = new FileSentenceProvider();

-    sentenceProvider.setParameters(params);

-    /**

-     * KnownEntityProviders provide a seed list of known entities... such as

-     * Barack Obama for person, or Germany for location obviously these would

-     * want to be prolific, non ambiguous names

-     */

-    KnownEntityProvider knownEntityProvider = new FileKnownEntityProvider();

-    knownEntityProvider.setParameters(params);

-    /**

-     * ModelGenerationValidators try to weed out bad hits by the iterations of

-     * the name finder. Since this is a recursive process, with each iteration

-     * the namefinder will get more and more greedy if bad entities are allowed

-     * in this provides a mechanism for throwing out obviously bad hits. A good

-     * impl may be to make sure a location is actually within a noun phrase

-     * etc...users can make this as specific as they need for their dat and

-     * their use case

-     */

-    ModelGenerationValidator validator = new FileModelValidatorImpl();

-    validator.setParameters(params);

-    /**

-     * Modelable's write and read the annotated sentences, as well as create and

-     * write the NER models

-     */

-    Modelable modelable = new GenericModelableImpl();

-    modelable.setParameters(params);

-

-    /**

-     * the modelGenerator actually runs the process with a set number of

-     * iterations... could be better by actually calculating the diff between

-     * runs and stopping based on a thresh, but for extrememly large sentence

-     * sets this may be too much.

-     */

-    modelGenerator.build(sentenceProvider, knownEntityProvider, validator, modelable, iterations);

-

-  }

-}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/KnownEntityProvider.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/KnownEntityProvider.java
deleted file mode 100644
index 5f54ffe..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/KnownEntityProvider.java
+++ /dev/null
@@ -1,45 +0,0 @@
-/*

- * Copyright 2013 The Apache Software Foundation.

- *

- * Licensed under the Apache License, Version 2.0 (the "License");

- * you may not use this file except in compliance with the License.

- * You may obtain a copy of the License at

- *

- *      http://www.apache.org/licenses/LICENSE-2.0

- *

- * Unless required by applicable law or agreed to in writing, software

- * distributed under the License is distributed on an "AS IS" BASIS,

- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

- * See the License for the specific language governing permissions and

- * limitations under the License.

- */

-package opennlp.modelbuilder;

-

-import java.util.Set;

-

-

-

-/**

- *

-Supplies a list of known entities (a list of names or locations)

- */

-public interface KnownEntityProvider extends ModelParameter{

-  /**

- * returns a list of known non ambiguous entities.

- * @return a set of entities

- */

-  Set<String> getKnownEntities();

-/**

- * adds to the set of known entities. Overriding classes should hold this list in a class level set.

- * @param unambiguousEntity 

- */

-  void addKnownEntity(String unambiguousEntity);

-/**

- * defines the type of entity that the set contains, ie person, location, organization.

- * @return 

- */

-  String getKnownEntitiesType();

-  

-  

-  

-}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/ModelGenerationValidator.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/ModelGenerationValidator.java
deleted file mode 100644
index 73bb515..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/ModelGenerationValidator.java
+++ /dev/null
@@ -1,33 +0,0 @@
-/*

- * Copyright 2013 The Apache Software Foundation.

- *

- * Licensed under the Apache License, Version 2.0 (the "License");

- * you may not use this file except in compliance with the License.

- * You may obtain a copy of the License at

- *

- *      http://www.apache.org/licenses/LICENSE-2.0

- *

- * Unless required by applicable law or agreed to in writing, software

- * distributed under the License is distributed on an "AS IS" BASIS,

- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

- * See the License for the specific language governing permissions and

- * limitations under the License.

- */

-package opennlp.modelbuilder;

-

-import java.util.Collection;

-

-/**

- *

-Validates results from the iterative namefinding

- */

-public interface ModelGenerationValidator extends ModelParameter {

-

-  Boolean validSentence(String sentence);

-

-  Boolean validNamedEntity(String namedEntity);

-  

-

-

-  Collection<String> getBlackList();

-}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/ModelParameter.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/ModelParameter.java
deleted file mode 100644
index 3c991ab..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/ModelParameter.java
+++ /dev/null
@@ -1,28 +0,0 @@
-/*

- * Copyright 2013 The Apache Software Foundation.

- *

- * Licensed under the Apache License, Version 2.0 (the "License");

- * you may not use this file except in compliance with the License.

- * You may obtain a copy of the License at

- *

- *      http://www.apache.org/licenses/LICENSE-2.0

- *

- * Unless required by applicable law or agreed to in writing, software

- * distributed under the License is distributed on an "AS IS" BASIS,

- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

- * See the License for the specific language governing permissions and

- * limitations under the License.

- */

-package opennlp.modelbuilder;

-

-import opennlp.modelbuilder.impls.BaseModelBuilderParams;

-

-/**

- *

- */

-public interface ModelParameter<T extends  BaseModelBuilderParams>{

-   

-  void setParameters(T params);

-  

-

-}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/Modelable.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/Modelable.java
deleted file mode 100644
index 4e3ed0c..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/Modelable.java
+++ /dev/null
@@ -1,45 +0,0 @@
-/*

- * Copyright 2013 The Apache Software Foundation.

- *

- * Licensed under the Apache License, Version 2.0 (the "License");

- * you may not use this file except in compliance with the License.

- * You may obtain a copy of the License at

- *

- *      http://www.apache.org/licenses/LICENSE-2.0

- *

- * Unless required by applicable law or agreed to in writing, software

- * distributed under the License is distributed on an "AS IS" BASIS,

- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

- * See the License for the specific language governing permissions and

- * limitations under the License.

- */

-package opennlp.modelbuilder;

-

-import java.util.Set;

-import opennlp.tools.namefind.TokenNameFinderModel;

-

-/**

- *

- */

-public interface Modelable extends ModelParameter{

-

-

-

-  String annotate(String sentence, String namedEntity, String entityType);

-

-  void writeAnnotatedSentences();

-

-  Set<String> getAnnotatedSentences();

-

-  void setAnnotatedSentences(Set<String> annotatedSentences);

-

-  void addAnnotatedSentence(String annotatedSentence);

-

-  void buildModel( String entityType);

-

-  TokenNameFinderModel getModel();

-

-  String[] tokenizeSentenceToWords(String sentence);

-  

-

-}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/SemiSupervisedModelGenerator.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/SemiSupervisedModelGenerator.java
deleted file mode 100644
index cc2f043..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/SemiSupervisedModelGenerator.java
+++ /dev/null
@@ -1,28 +0,0 @@
-/*

- * Copyright 2013 The Apache Software Foundation.

- *

- * Licensed under the Apache License, Version 2.0 (the "License");

- * you may not use this file except in compliance with the License.

- * You may obtain a copy of the License at

- *

- *      http://www.apache.org/licenses/LICENSE-2.0

- *

- * Unless required by applicable law or agreed to in writing, software

- * distributed under the License is distributed on an "AS IS" BASIS,

- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

- * See the License for the specific language governing permissions and

- * limitations under the License.

- */

-package opennlp.modelbuilder;

-

-import opennlp.modelbuilder.impls.BaseModelBuilderParams;

-

-/**

- *

-

- */

-public interface SemiSupervisedModelGenerator extends ModelParameter<BaseModelBuilderParams> {

-

-  void build(SentenceProvider sentenceProvider, KnownEntityProvider knownEntityProvider, 

-          ModelGenerationValidator validator, Modelable modelable, int iterations);

-}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/SentenceProvider.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/SentenceProvider.java
deleted file mode 100644
index acf3b09..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/SentenceProvider.java
+++ /dev/null
@@ -1,27 +0,0 @@
-/*

- * Copyright 2013 The Apache Software Foundation.

- *

- * Licensed under the Apache License, Version 2.0 (the "License");

- * you may not use this file except in compliance with the License.

- * You may obtain a copy of the License at

- *

- *      http://www.apache.org/licenses/LICENSE-2.0

- *

- * Unless required by applicable law or agreed to in writing, software

- * distributed under the License is distributed on an "AS IS" BASIS,

- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

- * See the License for the specific language governing permissions and

- * limitations under the License.

- */

-package opennlp.modelbuilder;

-

-import java.util.Set;

-import opennlp.modelbuilder.impls.BaseModelBuilderParams;

-

-/**

- *

- */

-public interface SentenceProvider extends ModelParameter<BaseModelBuilderParams> {

-

-  Set<String> getSentences();

-}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/BaseModelBuilderParams.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/BaseModelBuilderParams.java
deleted file mode 100644
index e5c1267..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/BaseModelBuilderParams.java
+++ /dev/null
@@ -1,90 +0,0 @@
-/*

- * Copyright 2013 The Apache Software Foundation.

- *

- * Licensed under the Apache License, Version 2.0 (the "License");

- * you may not use this file except in compliance with the License.

- * You may obtain a copy of the License at

- *

- *      http://www.apache.org/licenses/LICENSE-2.0

- *

- * Unless required by applicable law or agreed to in writing, software

- * distributed under the License is distributed on an "AS IS" BASIS,

- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

- * See the License for the specific language governing permissions and

- * limitations under the License.

- */

-package opennlp.modelbuilder.impls;

-

-import java.io.File;

-import java.util.Map;

-

-/**

- *

- * Used to pass params through the processing

- */

-public class BaseModelBuilderParams {

-

-  private File modelFile;

-  private File sentenceFile;

-  private File knownEntitiesFile;

-  private File knownEntityBlacklist;

-  private File annotatedTrainingDataFile;

-  private String entityType;

-  private Map<String, String> additionalParams;

-

-  public File getModelFile() {

-    return modelFile;

-  }

-

-  public void setModelFile(File modelFile) {

-    this.modelFile = modelFile;

-  }

-

-  public File getSentenceFile() {

-    return sentenceFile;

-  }

-

-  public void setSentenceFile(File sentenceFile) {

-    this.sentenceFile = sentenceFile;

-  }

-

-  public File getKnownEntitiesFile() {

-    return knownEntitiesFile;

-  }

-

-  public void setKnownEntitiesFile(File knownEntitiesFile) {

-    this.knownEntitiesFile = knownEntitiesFile;

-  }

-

-  public File getKnownEntityBlacklist() {

-    return knownEntityBlacklist;

-  }

-

-  public void setKnownEntityBlacklist(File knownEntityBlacklist) {

-    this.knownEntityBlacklist = knownEntityBlacklist;

-  }

-

-  public Map<String, String> getAdditionalParams() {

-    return additionalParams;

-  }

-

-  public void setAdditionalParams(Map<String, String> additionalParams) {

-    this.additionalParams = additionalParams;

-  }

-

-  public String getEntityType() {

-    return entityType;

-  }

-

-  public void setEntityType(String entityType) {

-    this.entityType = entityType;

-  }

-

-  public File getAnnotatedTrainingDataFile() {

-    return annotatedTrainingDataFile;

-  }

-

-  public void setAnnotatedTrainingDataFile(File annotatedTrainingDataFile) {

-    this.annotatedTrainingDataFile = annotatedTrainingDataFile;

-  }

-}
\ No newline at end of file
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileKnownEntityProvider.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileKnownEntityProvider.java
deleted file mode 100644
index 0ebf565..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileKnownEntityProvider.java
+++ /dev/null
@@ -1,82 +0,0 @@
-/*

- * Copyright 2013 The Apache Software Foundation.

- *

- * Licensed under the Apache License, Version 2.0 (the "License");

- * you may not use this file except in compliance with the License.

- * You may obtain a copy of the License at

- *

- *      http://www.apache.org/licenses/LICENSE-2.0

- *

- * Unless required by applicable law or agreed to in writing, software

- * distributed under the License is distributed on an "AS IS" BASIS,

- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

- * See the License for the specific language governing permissions and

- * limitations under the License.

- */

-package opennlp.modelbuilder.impls;

-

-import java.io.BufferedReader;

-import java.io.FileInputStream;

-import java.io.FileNotFoundException;

-import java.io.IOException;

-import java.io.InputStream;

-import java.io.InputStreamReader;

-import java.nio.charset.Charset;

-import java.util.HashSet;

-import java.util.Set;

-import java.util.logging.Level;

-import java.util.logging.Logger;

-import opennlp.modelbuilder.KnownEntityProvider;

-

-/**

- *

- */

-public class FileKnownEntityProvider implements KnownEntityProvider {

- 

-  Set<String> knownEntities = new HashSet<String>();

-  BaseModelBuilderParams params;

-  @Override

-  public Set<String> getKnownEntities() {

-    if (knownEntities.isEmpty()) {

-      try {

-        InputStream fis;

-        BufferedReader br;

-        String line;

-

-        fis = new FileInputStream(params.getKnownEntitiesFile());

-        br = new BufferedReader(new InputStreamReader(fis, Charset.forName("UTF-8")));

-        while ((line = br.readLine()) != null) {

-          knownEntities.add(line);

-        }

-

-        // Done with the file

-        br.close();

-        br = null;

-        fis = null;

-      } catch (FileNotFoundException ex) {

-        Logger.getLogger(FileKnownEntityProvider.class.getName()).log(Level.SEVERE, null, ex);

-      } catch (IOException ex) {

-        Logger.getLogger(FileKnownEntityProvider.class.getName()).log(Level.SEVERE, null, ex);

-      }

-    }

-    return knownEntities;

-  }

-

-  @Override

-  public void addKnownEntity(String unambiguousEntity) {

-    knownEntities.add(unambiguousEntity);

-  }

-

-  @Override

-  public String getKnownEntitiesType() {

- 

-    return params.getEntityType();

-  }

-

-

-

-  @Override

- public void setParameters(BaseModelBuilderParams params) {

-    this.params = params;

-  }

-}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileModelValidatorImpl.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileModelValidatorImpl.java
deleted file mode 100644
index e531900..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileModelValidatorImpl.java
+++ /dev/null
@@ -1,96 +0,0 @@
-/*

- * Copyright 2013 The Apache Software Foundation.

- *

- * Licensed under the Apache License, Version 2.0 (the "License");

- * you may not use this file except in compliance with the License.

- * You may obtain a copy of the License at

- *

- *      http://www.apache.org/licenses/LICENSE-2.0

- *

- * Unless required by applicable law or agreed to in writing, software

- * distributed under the License is distributed on an "AS IS" BASIS,

- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

- * See the License for the specific language governing permissions and

- * limitations under the License.

- */

-package opennlp.modelbuilder.impls;

-

-import java.io.BufferedReader;

-import java.io.FileInputStream;

-import java.io.FileNotFoundException;

-import java.io.IOException;

-import java.io.InputStream;

-import java.io.InputStreamReader;

-import java.nio.charset.Charset;

-import java.util.Collection;

-import java.util.HashSet;

-import java.util.Set;

-import java.util.logging.Level;

-import java.util.logging.Logger;

-import opennlp.modelbuilder.ModelGenerationValidator;

-

-/**

- *Validates NER results input before inclusion into the model

- */

-public class FileModelValidatorImpl implements ModelGenerationValidator {

-

-  private Set<String> badentities = new HashSet<String>();

-  BaseModelBuilderParams params;

-

-  @Override

-  public void setParameters(BaseModelBuilderParams params) {

-    this.params = params;

-  }

-

-  @Override

-  public Boolean validSentence(String sentence) {

-    //returning true by default, because the sentence provider will  return only "valid" sentences in this case

-    return true;

-  }

-

-  @Override

-  public Boolean validNamedEntity(String namedEntity) {

-

-    if (badentities.isEmpty()) {

-      getBlackList();

-    }

-//

-//    Pattern p = Pattern.compile("[0-9]", Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);

-//    if (p.matcher(namedEntity).find()) {

-//      return false;

-//    }

-    Boolean b = true;

-    if (badentities.contains(namedEntity.toLowerCase())) {

-      b = false;

-    }

-    return b;

-  }

-

-  @Override

-  public Collection<String> getBlackList() {

-    if (params.getKnownEntityBlacklist() == null) {

-      return badentities;

-    }

-    if (!badentities.isEmpty()) {

-      try {

-        InputStream fis;

-        BufferedReader br;

-        String line;

-

-        fis = new FileInputStream(params.getKnownEntityBlacklist());

-        br = new BufferedReader(new InputStreamReader(fis, Charset.forName("UTF-8")));

-        while ((line = br.readLine()) != null) {

-          badentities.add(line);

-        }

-        br.close();

-        br = null;

-        fis = null;

-      } catch (FileNotFoundException ex) {

-        Logger.getLogger(FileKnownEntityProvider.class.getName()).log(Level.SEVERE, null, ex);

-      } catch (IOException ex) {

-        Logger.getLogger(FileKnownEntityProvider.class.getName()).log(Level.SEVERE, null, ex);

-      }

-    }

-    return badentities;

-  }

-}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileSentenceProvider.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileSentenceProvider.java
deleted file mode 100644
index 3479e0c..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileSentenceProvider.java
+++ /dev/null
@@ -1,70 +0,0 @@
-/*

- * Copyright 2013 The Apache Software Foundation.

- *

- * Licensed under the Apache License, Version 2.0 (the "License");

- * you may not use this file except in compliance with the License.

- * You may obtain a copy of the License at

- *

- *      http://www.apache.org/licenses/LICENSE-2.0

- *

- * Unless required by applicable law or agreed to in writing, software

- * distributed under the License is distributed on an "AS IS" BASIS,

- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

- * See the License for the specific language governing permissions and

- * limitations under the License.

- */

-package opennlp.modelbuilder.impls;

-

-import java.io.BufferedReader;

-import java.io.FileInputStream;

-import java.io.FileNotFoundException;

-import java.io.IOException;

-import java.io.InputStream;

-import java.io.InputStreamReader;

-import java.nio.charset.Charset;

-import java.util.HashSet;

-import java.util.Set;

-import java.util.logging.Level;

-import java.util.logging.Logger;

-import opennlp.modelbuilder.SentenceProvider;

-

-/**

- * Provides user sentences via a simple text file

- */

-public class FileSentenceProvider implements SentenceProvider {

-

-  BaseModelBuilderParams params ;

-  Set<String> sentences = new HashSet<String>();

-

-  public Set<String> getSentences() {

-     if (sentences.isEmpty()) {

-      try {

-        InputStream fis;

-        BufferedReader br;

-        String line;

-

-        fis = new FileInputStream(params.getSentenceFile());

-        br = new BufferedReader(new InputStreamReader(fis, Charset.forName("UTF-8")));

-        int i=0;

-        while ((line = br.readLine()) != null) {

-         

-          sentences.add(line);

-        }

-

-        // Done with the file

-        br.close();

-        br = null;

-        fis = null;

-      } catch (FileNotFoundException ex) {

-        Logger.getLogger(FileKnownEntityProvider.class.getName()).log(Level.SEVERE, null, ex);

-      } catch (IOException ex) {

-        Logger.getLogger(FileKnownEntityProvider.class.getName()).log(Level.SEVERE, null, ex);

-      }

-    }

-    return sentences;

-  }

-

- public void setParameters(BaseModelBuilderParams params) {

-    this.params = params;

-  }

-}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/GenericModelGenerator.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/GenericModelGenerator.java
deleted file mode 100644
index 468e130..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/GenericModelGenerator.java
+++ /dev/null
@@ -1,103 +0,0 @@
-/*

- * Copyright 2013 The Apache Software Foundation.

- *

- * Licensed under the Apache License, Version 2.0 (the "License");

- * you may not use this file except in compliance with the License.

- * You may obtain a copy of the License at

- *

- *      http://www.apache.org/licenses/LICENSE-2.0

- *

- * Unless required by applicable law or agreed to in writing, software

- * distributed under the License is distributed on an "AS IS" BASIS,

- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

- * See the License for the specific language governing permissions and

- * limitations under the License.

- */

-package opennlp.modelbuilder.impls;

-

-import java.util.HashMap;

-import java.util.Map;

-import opennlp.modelbuilder.KnownEntityProvider;

-import opennlp.modelbuilder.ModelGenerationValidator;

-import opennlp.modelbuilder.Modelable;

-import opennlp.modelbuilder.SemiSupervisedModelGenerator;

-import opennlp.modelbuilder.SentenceProvider;

-import opennlp.tools.namefind.NameFinderME;

-import opennlp.tools.util.Span;

-

-/**

- *

- * Generic impl that handles all processing using the default file implementations

- */

-public class GenericModelGenerator implements SemiSupervisedModelGenerator {

-

-  private Map<String, String> params = new HashMap<String, String>();

-

-  @Override

-  public void setParameters(BaseModelBuilderParams params) {

-    this.params = params.getAdditionalParams();

-  }

-

-  @Override

-  public void build(SentenceProvider sentenceProvider, KnownEntityProvider knownEntityProvider,

-          ModelGenerationValidator validator, Modelable modelable, int iterations) {

-    for (int iteration = 0; iteration < iterations; iteration++) {

-      System.out.println("ITERATION: " + iteration);

-      System.out.println("\tPerfoming Known Entity Annotation");

-      System.out.println("\t\tknowns: " + knownEntityProvider.getKnownEntities().size());

-      System.out.println("\t\treading data....: ");

-      for (String sentence : sentenceProvider.getSentences()) {

-        for (String knownEntity : knownEntityProvider.getKnownEntities()) {

-          if (sentence.contains(knownEntity)) {

-            //if the same sentence has multiple hits should they be annotated separately?

-            modelable.addAnnotatedSentence(modelable.annotate(sentence, knownEntity, knownEntityProvider.getKnownEntitiesType()));

-          }

-        }

-      }

-      if (sentenceProvider.getSentences().isEmpty()) {

-        System.out.println("No sentences in file");

-        return;

-      }

-      if (knownEntityProvider.getKnownEntities().isEmpty()) {

-        System.out.println("No known entities in file");

-        return;

-      }

-      System.out.println("\t\twriting annotated sentences....: ");

-      modelable.writeAnnotatedSentences();

-          System.out.println("\t\tbuilding model.... ");

-      modelable.buildModel(knownEntityProvider.getKnownEntitiesType());

-      System.out.println("\t\tmodel building complete.... ");

-      NameFinderME nf = new NameFinderME(modelable.getModel());

-      System.out.println("\t\tannotated sentences: " + modelable.getAnnotatedSentences().size());

-      System.out.println("\tPerforming NER with new model");

-      System.out.println("\t\tPrinting NER Results. Add undesired results to the blacklist file and start over");

-      for (String sentence : sentenceProvider.getSentences()) {

-        if (!validator.validSentence(sentence)) {

-          continue;

-        }

-        String[] tokens = modelable.tokenizeSentenceToWords(sentence);

-

-        Span[] find = nf.find(tokens);

-        nf.clearAdaptiveData();

-

-        String[] namedEntities = Span.spansToStrings(find, tokens);

-

-        for (String namedEntity : namedEntities) {

-          System.out.println("\t\t" + namedEntity);

-          if (validator.validNamedEntity(namedEntity)) {

-

-            knownEntityProvider.addKnownEntity(namedEntity);

-            modelable.addAnnotatedSentence(modelable.annotate(sentence, namedEntity, knownEntityProvider.getKnownEntitiesType()));

-

-          } else {

-            System.out.println("\t\t" + namedEntity + "...already blacklisted");

-          }

-        }

-      }

-      System.out.println("\t\tannotated sentences: " + modelable.getAnnotatedSentences().size());

-      System.out.println("\t\tknowns: " + knownEntityProvider.getKnownEntities().size());

-    }

-    modelable.writeAnnotatedSentences();

-    modelable.buildModel(knownEntityProvider.getKnownEntitiesType());

-  }

-}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/GenericModelableImpl.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/GenericModelableImpl.java
deleted file mode 100644
index cfe4124..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/GenericModelableImpl.java
+++ /dev/null
@@ -1,127 +0,0 @@
-/*

- * Copyright 2013 The Apache Software Foundation.

- *

- * Licensed under the Apache License, Version 2.0 (the "License");

- * you may not use this file except in compliance with the License.

- * You may obtain a copy of the License at

- *

- *      http://www.apache.org/licenses/LICENSE-2.0

- *

- * Unless required by applicable law or agreed to in writing, software

- * distributed under the License is distributed on an "AS IS" BASIS,

- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

- * See the License for the specific language governing permissions and

- * limitations under the License.

- */

-package opennlp.modelbuilder.impls;

-

-import java.io.BufferedOutputStream;

-import java.io.FileInputStream;

-import java.io.FileOutputStream;

-import java.io.FileWriter;

-import java.io.IOException;

-import java.io.OutputStream;

-import java.nio.charset.Charset;

-import java.util.HashSet;

-import java.util.Set;

-import java.util.logging.Level;

-import java.util.logging.Logger;

-import opennlp.modelbuilder.Modelable;

-import opennlp.tools.namefind.NameFinderME;

-import opennlp.tools.namefind.NameSample;

-import opennlp.tools.namefind.NameSampleDataStream;

-import opennlp.tools.namefind.TokenNameFinderModel;

-import opennlp.tools.util.ObjectStream;

-import opennlp.tools.util.PlainTextByLineStream;

-

-/**

- * Creates annotations, writes annotations to file, and creates a model and writes to a file

- */

-public class GenericModelableImpl implements Modelable {

-

-  private Set<String> annotatedSentences = new HashSet<String>();

-  BaseModelBuilderParams params;

-

-  @Override

-  public void setParameters(BaseModelBuilderParams params) {

-    this.params = params;

-  }

-

-  @Override

-  public String annotate(String sentence, String namedEntity, String entityType) {

-    String annotation = sentence.replace(namedEntity, " <START:" + entityType + "> " + namedEntity + " <END> ");

-    return annotation;

-  }

-

-  @Override

-  public void writeAnnotatedSentences() {

-    try {

-

-      FileWriter writer = new FileWriter(params.getAnnotatedTrainingDataFile(), false);

-

-      for (String s : annotatedSentences) {

-        writer.write(s.replace("\n", " ").trim() + "\n");

-      }

-      writer.close();

-    } catch (IOException ex) {

-      ex.printStackTrace();

-    }

-  }

-

-  @Override

-  public Set<String> getAnnotatedSentences() {

-    return annotatedSentences;

-  }

-

-  @Override

-  public void setAnnotatedSentences(Set<String> annotatedSentences) {

-    this.annotatedSentences = annotatedSentences;

-  }

-

-  @Override

-  public void addAnnotatedSentence(String annotatedSentence) {

-    annotatedSentences.add(annotatedSentence);

-  }

-

-  @Override

-  public void buildModel(String entityType) {

-    try {

-      System.out.println("\tBuilding Model using " + annotatedSentences.size() + " annotations");

-      System.out.println("\t\treading training data...");

-      Charset charset = Charset.forName("UTF-8");

-      ObjectStream<String> lineStream =

-              new PlainTextByLineStream(new FileInputStream(params.getAnnotatedTrainingDataFile()), charset);

-      ObjectStream<NameSample> sampleStream = new NameSampleDataStream(lineStream);

-

-      TokenNameFinderModel model;

-      model = NameFinderME.train("en", entityType, sampleStream, null);

-      sampleStream.close();

-      OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(params.getModelFile()));

-      model.serialize(modelOut);

-      if (modelOut != null) {

-        modelOut.close();

-      }

-      System.out.println("\tmodel generated");

-    } catch (Exception e) {

-    }

-  }

-

-  @Override

-  public TokenNameFinderModel getModel() {

-

-

-    TokenNameFinderModel nerModel = null;

-    try {

-      nerModel = new TokenNameFinderModel(new FileInputStream(params.getModelFile()));

-    } catch (IOException ex) {

-      Logger.getLogger(GenericModelableImpl.class.getName()).log(Level.SEVERE, null, ex);

-    }

-    return nerModel;

-  }

-

-  @Override

-  public String[] tokenizeSentenceToWords(String sentence) {

-    return sentence.split(" ");

-

-  }

-}