OPENNLP-607
Fixed many issues. Added default file-based impls for all interfaces, and created a util class wrapper to allow for easy use of the default implementations.
diff --git a/modelbuilder-prototype/src/main/java/modelbuilder/App.java b/modelbuilder-prototype/src/main/java/modelbuilder/App.java
deleted file mode 100644
index a84e689..0000000
--- a/modelbuilder-prototype/src/main/java/modelbuilder/App.java
+++ /dev/null
@@ -1,25 +0,0 @@
-/*

- * Copyright 2013 The Apache Software Foundation.

- *

- * Licensed under the Apache License, Version 2.0 (the "License");

- * you may not use this file except in compliance with the License.

- * You may obtain a copy of the License at

- *

- *      http://www.apache.org/licenses/LICENSE-2.0

- *

- * Unless required by applicable law or agreed to in writing, software

- * distributed under the License is distributed on an "AS IS" BASIS,

- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

- * See the License for the specific language governing permissions and

- * limitations under the License.

- */

-package modelbuilder;

-

-

-public class App 

-{

-    public static void main( String[] args )

-    {

-       

-    }

-}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/Example.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/Example.java
deleted file mode 100644
index bf4d59f..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/Example.java
+++ /dev/null
@@ -1,78 +0,0 @@
-/*

- * Copyright 2013 The Apache Software Foundation.

- *

- * Licensed under the Apache License, Version 2.0 (the "License");

- * you may not use this file except in compliance with the License.

- * You may obtain a copy of the License at

- *

- *      http://www.apache.org/licenses/LICENSE-2.0

- *

- * Unless required by applicable law or agreed to in writing, software

- * distributed under the License is distributed on an "AS IS" BASIS,

- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

- * See the License for the specific language governing permissions and

- * limitations under the License.

- */

-package opennlp.modelbuilder.v2;

-

-import java.util.HashMap;

-import java.util.Map;

-import opennlp.modelbuilder.v2.impls.FileKnownEntityProvider;

-import opennlp.modelbuilder.v2.impls.FileModelValidatorImpl;

-import opennlp.modelbuilder.v2.impls.FileSentenceProvider;

-import opennlp.modelbuilder.v2.impls.ModelableImpl;

-

-/**

- *

- * @author Owner

- */

-public class Example {

-

-  public static void main(String[] args) {

-

-    SemiSupervisedModelGenerator modelGenerator = new GenericModelGenerator();

-    //every component has a map as a place to recieve params

-    //these are required for the current file-based impls

-    Map<String, String> params = new HashMap<String, String>();

-    params.put("knownentityfile", "C:\\apache\\entitylinker\\opennlp.geoentitylinker.countrycontext.txt");

-    params.put("sentencesfile", "C:\\apache\\modelbuilder\\sentences.txt");

-    params.put("knownentitytype", "location");

-    params.put("blacklistfile", "C:\\apache\\modelbuilder\\blacklist.txt");

-    params.put("modelablepath", "C:\\apache\\modelbuilder");

-

-    /**

-     * sentence providers feed this process with user data derived sentences

-     * this impl just reads line by line through a file

-     */

-    SentenceProvider sentenceProvider = new FileSentenceProvider();

-    sentenceProvider.setParameters(params);

-    /**

-     *KnownEntityProviders provide a seed list of known entities... such as Barack Obama for person, or Germany for location

-     * obviously these would want to be prolific, non ambiguous names

-     */

-    KnownEntityProvider knownEntityProvider = new FileKnownEntityProvider();

-    knownEntityProvider.setParameters(params);

-    /**

-     * ModelGenerationValidators try to weed out bad hits by the iterations of the name finder.

-     * Since this is a recursive process, with each iteration the namefinder will get more and more greedy if bad entities are allowed in

-     * this provides a mechanism for throwing out obviously bad hits.

-     * A good impl may be to make sure a location is actually within a noun phrase etc...users can make this as specific as they need for their dat

-     * and their use case

-     */

-    ModelGenerationValidator validator = new FileModelValidatorImpl();

-    validator.setParameters(params);

-    /**

-     * Modelable's write and read the annotated sentences, as well as create and write the NER models

-     */

-

-    Modelable modelable = new ModelableImpl();

-    modelable.setParameters(params);

-

-    /**

-     * the modelGenerator actually runs the process with a set number of iterations... could be better by actually calculating the

-     * diff between runs and stopping based on a thresh, but for extrememly large sentence sets this may be too much.

-     */

-    modelGenerator.build(sentenceProvider, knownEntityProvider, validator, modelable, 2);

-

-  }

-}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/GenericModelGenerator.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/GenericModelGenerator.java
deleted file mode 100644
index 2d87d77..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/GenericModelGenerator.java
+++ /dev/null
@@ -1,81 +0,0 @@
-/*

- * Copyright 2013 The Apache Software Foundation.

- *

- * Licensed under the Apache License, Version 2.0 (the "License");

- * you may not use this file except in compliance with the License.

- * You may obtain a copy of the License at

- *

- *      http://www.apache.org/licenses/LICENSE-2.0

- *

- * Unless required by applicable law or agreed to in writing, software

- * distributed under the License is distributed on an "AS IS" BASIS,

- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

- * See the License for the specific language governing permissions and

- * limitations under the License.

- */

-package opennlp.modelbuilder.v2;

-

-import java.util.HashMap;

-import java.util.Map;

-import opennlp.tools.namefind.NameFinderME;

-import opennlp.tools.util.Span;

-

-/**

- *

- *Generic impl

- */

-public class GenericModelGenerator implements SemiSupervisedModelGenerator{

- private Map<String, String> params = new HashMap<String, String>();

-

-  @Override

-  public void setParameters(Map<String, String> params) {

-    this.params = params;

-  }

-  @Override

-  public void build(SentenceProvider sentenceProvider, KnownEntityProvider knownEntityProvider,

-          ModelGenerationValidator validator, Modelable modelable, int iterations) {

-    for (int iteration = 0; iteration < iterations; iteration++) {

-      System.out.println("ITERATION: " + iteration);

-      System.out.println("\tPerfoming Known Entity Annotation");

-      System.out.println("\t\tknowns: " + knownEntityProvider.getKnownEntities().size());

-      System.out.println("\t\treading data....: ");

-      for (String sentence : sentenceProvider.getSentences()) {

-        for (String knownEntity : knownEntityProvider.getKnownEntities()) {

-          if (sentence.contains(knownEntity)) {

-            //if the same sentence has multiple hits should they be annotated separately?

-            modelable.addAnnotatedSentence(modelable.annotate(sentence, knownEntity, knownEntityProvider.getKnownEntitiesType()));

-          }

-        }

-      }

-      System.out.println("\t\twriting annotated sentences....: ");

-      modelable.writeAnnotatedSentences();

-      modelable.buildModel(knownEntityProvider.getKnownEntitiesType());

-      NameFinderME nf = new NameFinderME(modelable.getModel());

-      System.out.println("\t\tannotated sentences: " + modelable.getAnnotatedSentences().size());

-      System.out.println("\tPerforming NER");

-      for (String sentence : sentenceProvider.getSentences()) {

-        if (!validator.validSentence(sentence)) {

-          continue;

-        }

-        String[] tokens = modelable.tokenizeSentenceToWords(sentence);

-

-        Span[] find = nf.find(tokens);

-        nf.clearAdaptiveData();

-

-        String[] namedEntities = Span.spansToStrings(find, tokens);

-

-        for (String namedEntity : namedEntities) {

-          if (validator.validNamedEntity(namedEntity)) {

-            knownEntityProvider.addKnownEntity(namedEntity);

-            modelable.addAnnotatedSentence(modelable.annotate(sentence, namedEntity, knownEntityProvider.getKnownEntitiesType()));

-

-          }

-        }

-      }

-      System.out.println("\t\tannotated sentences: " + modelable.getAnnotatedSentences().size());

-      System.out.println("\t\tknowns: " + knownEntityProvider.getKnownEntities().size());

-    }

-    modelable.writeAnnotatedSentences();

-    modelable.buildModel(knownEntityProvider.getKnownEntitiesType());

-  }

-}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/KnownEntityProvider.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/KnownEntityProvider.java
deleted file mode 100644
index d948b66..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/KnownEntityProvider.java
+++ /dev/null
@@ -1,45 +0,0 @@
-/*

- * Copyright 2013 The Apache Software Foundation.

- *

- * Licensed under the Apache License, Version 2.0 (the "License");

- * you may not use this file except in compliance with the License.

- * You may obtain a copy of the License at

- *

- *      http://www.apache.org/licenses/LICENSE-2.0

- *

- * Unless required by applicable law or agreed to in writing, software

- * distributed under the License is distributed on an "AS IS" BASIS,

- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

- * See the License for the specific language governing permissions and

- * limitations under the License.

- */

-package opennlp.modelbuilder.v2;

-

-import java.util.Set;

-

-

-

-/**

- *

-Supplies a list of known entities (a list of names or locations)

- */

-public interface KnownEntityProvider extends ModelParameter{

-  /**

- * returns a list of known non ambiguous entities.

- * @return a set of entities

- */

-  Set<String> getKnownEntities();

-/**

- * adds to the set of known entities. Overriding classes should hold this list in a class level set.

- * @param unambiguousEntity 

- */

-  void addKnownEntity(String unambiguousEntity);

-/**

- * defines the type of entity that the set contains, ie person, location, organization.

- * @return 

- */

-  String getKnownEntitiesType();

-  

-  

-  

-}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelGenerationValidator.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelGenerationValidator.java
deleted file mode 100644
index 51b4fc4..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelGenerationValidator.java
+++ /dev/null
@@ -1,33 +0,0 @@
-/*

- * Copyright 2013 The Apache Software Foundation.

- *

- * Licensed under the Apache License, Version 2.0 (the "License");

- * you may not use this file except in compliance with the License.

- * You may obtain a copy of the License at

- *

- *      http://www.apache.org/licenses/LICENSE-2.0

- *

- * Unless required by applicable law or agreed to in writing, software

- * distributed under the License is distributed on an "AS IS" BASIS,

- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

- * See the License for the specific language governing permissions and

- * limitations under the License.

- */

-package opennlp.modelbuilder.v2;

-

-import java.util.Collection;

-

-/**

- *

-Validates results from the iterative namefinding

- */

-public interface ModelGenerationValidator extends ModelParameter {

-

-  Boolean validSentence(String sentence);

-

-  Boolean validNamedEntity(String namedEntity);

-  

-

-

-  Collection<String> getBlackList();

-}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelParameter.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelParameter.java
deleted file mode 100644
index 7e9d8e9..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelParameter.java
+++ /dev/null
@@ -1,26 +0,0 @@
-/*

- * Copyright 2013 The Apache Software Foundation.

- *

- * Licensed under the Apache License, Version 2.0 (the "License");

- * you may not use this file except in compliance with the License.

- * You may obtain a copy of the License at

- *

- *      http://www.apache.org/licenses/LICENSE-2.0

- *

- * Unless required by applicable law or agreed to in writing, software

- * distributed under the License is distributed on an "AS IS" BASIS,

- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

- * See the License for the specific language governing permissions and

- * limitations under the License.

- */

-package opennlp.modelbuilder.v2;

-

-import java.util.Map;

-

-/**

- *

- */

-public interface ModelParameter {

-   

-  void setParameters(Map<String, String> params);

-}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/Modelable.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/Modelable.java
deleted file mode 100644
index 412b1f3..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/Modelable.java
+++ /dev/null
@@ -1,45 +0,0 @@
-/*

- * Copyright 2013 The Apache Software Foundation.

- *

- * Licensed under the Apache License, Version 2.0 (the "License");

- * you may not use this file except in compliance with the License.

- * You may obtain a copy of the License at

- *

- *      http://www.apache.org/licenses/LICENSE-2.0

- *

- * Unless required by applicable law or agreed to in writing, software

- * distributed under the License is distributed on an "AS IS" BASIS,

- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

- * See the License for the specific language governing permissions and

- * limitations under the License.

- */

-package opennlp.modelbuilder.v2;

-

-import java.util.Set;

-import opennlp.tools.namefind.TokenNameFinderModel;

-

-/**

- *

- */

-public interface Modelable extends ModelParameter{

-

-

-

-  String annotate(String sentence, String namedEntity, String entityType);

-

-  void writeAnnotatedSentences();

-

-  Set<String> getAnnotatedSentences();

-

-  void setAnnotatedSentences(Set<String> annotatedSentences);

-

-  void addAnnotatedSentence(String annotatedSentence);

-

-  void buildModel( String entityType);

-

-  TokenNameFinderModel getModel();

-

-  String[] tokenizeSentenceToWords(String sentence);

-  

-

-}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SemiSupervisedModelGenerator.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SemiSupervisedModelGenerator.java
deleted file mode 100644
index 6b7edfe..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SemiSupervisedModelGenerator.java
+++ /dev/null
@@ -1,26 +0,0 @@
-/*

- * Copyright 2013 The Apache Software Foundation.

- *

- * Licensed under the Apache License, Version 2.0 (the "License");

- * you may not use this file except in compliance with the License.

- * You may obtain a copy of the License at

- *

- *      http://www.apache.org/licenses/LICENSE-2.0

- *

- * Unless required by applicable law or agreed to in writing, software

- * distributed under the License is distributed on an "AS IS" BASIS,

- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

- * See the License for the specific language governing permissions and

- * limitations under the License.

- */

-package opennlp.modelbuilder.v2;

-

-/**

- *

-

- */

-public interface SemiSupervisedModelGenerator extends ModelParameter {

-

-  void build(SentenceProvider sentenceProvider, KnownEntityProvider knownEntityProvider, 

-          ModelGenerationValidator validator, Modelable modelable, int iterations);

-}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SentenceProvider.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SentenceProvider.java
deleted file mode 100644
index 4b26f78..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SentenceProvider.java
+++ /dev/null
@@ -1,26 +0,0 @@
-/*

- * Copyright 2013 The Apache Software Foundation.

- *

- * Licensed under the Apache License, Version 2.0 (the "License");

- * you may not use this file except in compliance with the License.

- * You may obtain a copy of the License at

- *

- *      http://www.apache.org/licenses/LICENSE-2.0

- *

- * Unless required by applicable law or agreed to in writing, software

- * distributed under the License is distributed on an "AS IS" BASIS,

- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

- * See the License for the specific language governing permissions and

- * limitations under the License.

- */

-package opennlp.modelbuilder.v2;

-

-import java.util.Set;

-

-/**

- *

- */

-public interface SentenceProvider extends ModelParameter {

-

-  Set<String> getSentences();

-}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileKnownEntityProvider.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileKnownEntityProvider.java
deleted file mode 100644
index dfbbbce..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileKnownEntityProvider.java
+++ /dev/null
@@ -1,84 +0,0 @@
-/*

- * Copyright 2013 The Apache Software Foundation.

- *

- * Licensed under the Apache License, Version 2.0 (the "License");

- * you may not use this file except in compliance with the License.

- * You may obtain a copy of the License at

- *

- *      http://www.apache.org/licenses/LICENSE-2.0

- *

- * Unless required by applicable law or agreed to in writing, software

- * distributed under the License is distributed on an "AS IS" BASIS,

- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

- * See the License for the specific language governing permissions and

- * limitations under the License.

- */

-package opennlp.modelbuilder.v2.impls;

-

-import java.io.BufferedReader;

-import java.io.FileInputStream;

-import java.io.FileNotFoundException;

-import java.io.IOException;

-import java.io.InputStream;

-import java.io.InputStreamReader;

-import java.nio.charset.Charset;

-import java.util.HashMap;

-import java.util.HashSet;

-import java.util.Map;

-import java.util.Set;

-import java.util.logging.Level;

-import java.util.logging.Logger;

-import opennlp.modelbuilder.v2.KnownEntityProvider;

-

-/**

- *

- */

-public class FileKnownEntityProvider implements KnownEntityProvider {

-  private Map<String, String> params = new HashMap<String, String>();

-  Set<String> knownEntities = new HashSet<String>();

-

-  @Override

-  public Set<String> getKnownEntities() {

-    if (knownEntities.isEmpty()) {

-      try {

-        InputStream fis;

-        BufferedReader br;

-        String line;

-

-        fis = new FileInputStream(params.get("knownentityfile"));

-        br = new BufferedReader(new InputStreamReader(fis, Charset.forName("UTF-8")));

-        while ((line = br.readLine()) != null) {

-          knownEntities.add(line.split("\t")[2]);

-        }

-

-        // Done with the file

-        br.close();

-        br = null;

-        fis = null;

-      } catch (FileNotFoundException ex) {

-        Logger.getLogger(FileKnownEntityProvider.class.getName()).log(Level.SEVERE, null, ex);

-      } catch (IOException ex) {

-        Logger.getLogger(FileKnownEntityProvider.class.getName()).log(Level.SEVERE, null, ex);

-      }

-    }

-    return knownEntities;

-  }

-

-  @Override

-  public void addKnownEntity(String unambiguousEntity) {

-    knownEntities.add(unambiguousEntity);

-  }

-

-  @Override

-  public String getKnownEntitiesType() {

- 

-    return params.get("knownentitytype");

-  }

-

-

-

-  @Override

-  public void setParameters(Map<String, String> params) {

-    this.params = params;

-  }

-}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileModelValidatorImpl.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileModelValidatorImpl.java
deleted file mode 100644
index 6522a1a..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileModelValidatorImpl.java
+++ /dev/null
@@ -1,98 +0,0 @@
-/*

- * Copyright 2013 The Apache Software Foundation.

- *

- * Licensed under the Apache License, Version 2.0 (the "License");

- * you may not use this file except in compliance with the License.

- * You may obtain a copy of the License at

- *

- *      http://www.apache.org/licenses/LICENSE-2.0

- *

- * Unless required by applicable law or agreed to in writing, software

- * distributed under the License is distributed on an "AS IS" BASIS,

- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

- * See the License for the specific language governing permissions and

- * limitations under the License.

- */

-package opennlp.modelbuilder.v2.impls;

-

-import java.io.BufferedReader;

-import java.io.FileInputStream;

-import java.io.FileNotFoundException;

-import java.io.IOException;

-import java.io.InputStream;

-import java.io.InputStreamReader;

-import java.nio.charset.Charset;

-import java.util.Collection;

-import java.util.HashMap;

-import java.util.HashSet;

-import java.util.Map;

-import java.util.Set;

-import java.util.logging.Level;

-import java.util.logging.Logger;

-import java.util.regex.Pattern;

-import opennlp.modelbuilder.v2.ModelGenerationValidator;

-

-/**

- *

- */

-public class FileModelValidatorImpl implements ModelGenerationValidator {

-

-  private Set<String> badentities = new HashSet<String>();

-  private final double MIN_SCORE_FOR_TRAINING = 0.95d;

-  private Object validationData;

-  private Map<String, String> params = new HashMap<String, String>();

-

-  @Override

-  public void setParameters(Map<String, String> params) {

-    this.params = params;

-  }

-

-  @Override

-  public Boolean validSentence(String sentence) {

-    //returning true by default, because the sentence provider will  return only "valid" sentences in this case

-    return true;

-  }

-

-  @Override

-  public Boolean validNamedEntity(String namedEntity) {

-

-    if (badentities.isEmpty()) {

-      getBlackList();

-    }

-

-    Pattern p = Pattern.compile("[0-9]", Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);

-    if (p.matcher(namedEntity).find()) {

-      return false;

-    }

-    Boolean b = true;

-    if (badentities.contains(namedEntity.toLowerCase())) {

-      b = false;

-    }

-    return b;

-  }

-

-  @Override

-  public Collection<String> getBlackList() {

-    if (!badentities.isEmpty()) {

-      try {

-        InputStream fis;

-        BufferedReader br;

-        String line;

-

-        fis = new FileInputStream(params.get("blacklistfile"));

-        br = new BufferedReader(new InputStreamReader(fis, Charset.forName("UTF-8")));

-        while ((line = br.readLine()) != null) {

-          badentities.add(line);

-        }        

-        br.close();

-        br = null;

-        fis = null;

-      } catch (FileNotFoundException ex) {

-        Logger.getLogger(FileKnownEntityProvider.class.getName()).log(Level.SEVERE, null, ex);

-      } catch (IOException ex) {

-        Logger.getLogger(FileKnownEntityProvider.class.getName()).log(Level.SEVERE, null, ex);

-      }

-    }

-    return badentities;

-  }

-}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileSentenceProvider.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileSentenceProvider.java
deleted file mode 100644
index 029b6fc..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileSentenceProvider.java
+++ /dev/null
@@ -1,72 +0,0 @@
-/*

- * Copyright 2013 The Apache Software Foundation.

- *

- * Licensed under the Apache License, Version 2.0 (the "License");

- * you may not use this file except in compliance with the License.

- * You may obtain a copy of the License at

- *

- *      http://www.apache.org/licenses/LICENSE-2.0

- *

- * Unless required by applicable law or agreed to in writing, software

- * distributed under the License is distributed on an "AS IS" BASIS,

- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

- * See the License for the specific language governing permissions and

- * limitations under the License.

- */

-package opennlp.modelbuilder.v2.impls;

-

-import java.io.BufferedReader;

-import java.io.FileInputStream;

-import java.io.FileNotFoundException;

-import java.io.IOException;

-import java.io.InputStream;

-import java.io.InputStreamReader;

-import java.nio.charset.Charset;

-import java.util.HashMap;

-import java.util.HashSet;

-import java.util.Map;

-import java.util.Set;

-import java.util.logging.Level;

-import java.util.logging.Logger;

-import opennlp.modelbuilder.v2.SentenceProvider;

-

-/**

- *

- */

-public class FileSentenceProvider implements SentenceProvider {

-

-  private Map<String, String> params = new HashMap<String, String>();

-  Set<String> sentences = new HashSet<String>();

-

-  public Set<String> getSentences() {

-     if (sentences.isEmpty()) {

-      try {

-        InputStream fis;

-        BufferedReader br;

-        String line;

-

-        fis = new FileInputStream(params.get("sentencesfile"));

-        br = new BufferedReader(new InputStreamReader(fis, Charset.forName("UTF-8")));

-        int i=0;

-        while ((line = br.readLine()) != null) {

-         

-          sentences.add(line);

-        }

-

-        // Done with the file

-        br.close();

-        br = null;

-        fis = null;

-      } catch (FileNotFoundException ex) {

-        Logger.getLogger(FileKnownEntityProvider.class.getName()).log(Level.SEVERE, null, ex);

-      } catch (IOException ex) {

-        Logger.getLogger(FileKnownEntityProvider.class.getName()).log(Level.SEVERE, null, ex);

-      }

-    }

-    return sentences;

-  }

-

-  public void setParameters(Map<String, String> params) {

-    this.params = params;

-  }

-}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/LocationKnownEntityProviderImpl.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/LocationKnownEntityProviderImpl.java
deleted file mode 100644
index 14ddfc0..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/LocationKnownEntityProviderImpl.java
+++ /dev/null
@@ -1,104 +0,0 @@
-/*

- * Copyright 2013 The Apache Software Foundation.

- *

- * Licensed under the Apache License, Version 2.0 (the "License");

- * you may not use this file except in compliance with the License.

- * You may obtain a copy of the License at

- *

- *      http://www.apache.org/licenses/LICENSE-2.0

- *

- * Unless required by applicable law or agreed to in writing, software

- * distributed under the License is distributed on an "AS IS" BASIS,

- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

- * See the License for the specific language governing permissions and

- * limitations under the License.

- */

-package opennlp.modelbuilder.v2.impls;

-

-import java.sql.CallableStatement;

-import java.sql.Connection;

-import java.sql.DriverManager;

-import java.sql.ResultSet;

-import java.sql.SQLException;

-import java.util.HashMap;

-import java.util.HashSet;

-import java.util.Map;

-import java.util.Set;

-import java.util.logging.Level;

-import java.util.logging.Logger;

-import opennlp.modelbuilder.v2.KnownEntityProvider;

-

-/**

- *

-

- */

-public class LocationKnownEntityProviderImpl implements KnownEntityProvider {

- 

-  Set<String> ret = new HashSet<String>();

-

-  @Override

-  public Set<String> getKnownEntities() {

-    if (ret.isEmpty()) {

-      try {

-        getData();

-      } catch (Exception ex) {

-        Logger.getLogger(LocationKnownEntityProviderImpl.class.getName()).log(Level.SEVERE, null, ex);

-      }

-

-    }

-    return ret;

-  }

-   private Set<String> getData() throws Exception {

-

-    Connection con = getMySqlConnection();

-    if (con.isClosed()) {

-      con = getMySqlConnection();

-    }

-    CallableStatement cs;

-    cs = con.prepareCall("CALL getcountrylist()");

-

-    ResultSet rs;

-    try {

-      rs = cs.executeQuery();

-      while (rs.next()) {

-        ret.add(rs.getString("full_name_nd_ro"));

-      }

-

-    } catch (SQLException ex) {

-      throw ex;

-    } catch (Exception e) {

-      System.err.println(e);

-    } finally {

-      con.close();

-    }

-

-    return ret;

-  }

-  private static Connection getMySqlConnection() throws Exception {

-    // EntityLinkerProperties property = new EntityLinkerProperties(new File("c:\\temp\\opennlpmodels\\entitylinker.properties"));

-    String driver = "org.gjt.mm.mysql.Driver";

-    String url = "jdbc:mysql://127.0.0.1:3306/world";

-    String username = "root";

-    String password = "559447";

-

-    Class.forName(driver);

-    Connection conn = DriverManager.getConnection(url, username, password);

-    return conn;

-  }

-  @Override

-  public String getKnownEntitiesType() {

-    return "location";

-  }

-

-  @Override

-  public void addKnownEntity(String unambiguousEntity) {

-    ret.add(unambiguousEntity);

-  }

-

- private Map<String, String> params = new HashMap<String, String>();

-

-  @Override

-  public void setParameters(Map<String, String> params) {

-    this.params = params;

-  }

-}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/ModelValidatorImpl.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/ModelValidatorImpl.java
deleted file mode 100644
index 45e73c1..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/ModelValidatorImpl.java
+++ /dev/null
@@ -1,141 +0,0 @@
-/*

- * Copyright 2013 The Apache Software Foundation.

- *

- * Licensed under the Apache License, Version 2.0 (the "License");

- * you may not use this file except in compliance with the License.

- * You may obtain a copy of the License at

- *

- *      http://www.apache.org/licenses/LICENSE-2.0

- *

- * Unless required by applicable law or agreed to in writing, software

- * distributed under the License is distributed on an "AS IS" BASIS,

- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

- * See the License for the specific language governing permissions and

- * limitations under the License.

- */

-package opennlp.modelbuilder.v2.impls;

-

-import java.util.HashMap;

-import java.util.HashSet;

-import java.util.Map;

-import java.util.Set;

-import java.util.regex.Pattern;

-import opennlp.modelbuilder.v2.ModelGenerationValidator;

-

-/**

- *

- */

-public class ModelValidatorImpl implements ModelGenerationValidator {

-

-  private Set<String> badentities = new HashSet<String>();

-  private final double MIN_SCORE_FOR_TRAINING = 0.95d;

-  private Object validationData;

- private Map<String, String> params = new HashMap<String, String>();

-

-  @Override

-  public void setParameters(Map<String, String> params) {

-    this.params = params;

-  }

-  @Override

-  public Boolean validSentence(String sentence) {

-    //returning true by default, because the sentence provider will  return only "valid" sentences in this case

-    return true;

-  }

-

-  @Override

-  public Boolean validNamedEntity(String namedEntity) {

-

-    if (badentities.isEmpty()) {

-      getBlackList();

-    }

-

-    Pattern p = Pattern.compile("[0-9]", Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);

-    if (p.matcher(namedEntity).find()) {

-      return false;

-    }

-    Boolean b = true;

-    if (badentities.contains(namedEntity.toLowerCase())) {

-      b = false;

-    }

-    return b;

-  }

-

- 

-  @Override

-  public Set<String> getBlackList() {

-    badentities.add(".");

-    badentities.add("-");

-    badentities.add(",");

-    badentities.add(";");

-    badentities.add("the");

-    badentities.add("that");

-    badentities.add("several");

-    badentities.add("model");

-    badentities.add("our");

-    badentities.add("are");

-    badentities.add("in");

-    badentities.add("are");

-    badentities.add("at");

-    badentities.add("is");

-    badentities.add("for");

-    badentities.add("the");

-    badentities.add("during");

-    badentities.add("south");

-    badentities.add("from");

-    badentities.add("recounts");

-    badentities.add("wissenschaftliches");

-    badentities.add("if");

-    badentities.add("security");

-    badentities.add("denouncing");

-    badentities.add("writes");

-    badentities.add("but");

-    badentities.add("operation");

-    badentities.add("adds");

-    badentities.add("Above");

-    badentities.add("but");

-    badentities.add("RIP");

-    badentities.add("on");

-    badentities.add("no");

-    badentities.add("agrees");

-    badentities.add("year");

-    badentities.add("for");

-    badentities.add("you");

-    badentities.add("red");

-    badentities.add("added");

-    badentities.add("hello");

-    badentities.add("around");

-    badentities.add("has");

-    badentities.add("turn");

-    badentities.add("surrounding");

-    badentities.add("\" No");

-    badentities.add("aug.");

-    badentities.add("or");

-    badentities.add("quips");

-    badentities.add("september");

-    badentities.add("[mr");

-    badentities.add("diseases");

-    badentities.add("when");

-    badentities.add("bbc");

-    badentities.add(":\"");

-    badentities.add("dr");

-    badentities.add("baby");

-    badentities.add("on");

-    badentities.add("route");

-    badentities.add("'");

-    badentities.add("\"");

-    badentities.add("a");

-    badentities.add("her");

-    badentities.add("'");

-    badentities.add("\"");

-    badentities.add("two");

-    badentities.add("that");

-    badentities.add(":");

-    badentities.add("one");

-    badentities.add("Party");

-    badentities.add("Championship");

-

-    badentities.add("Ltd");

-

-    return badentities;

-  }

-}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/ModelableImpl.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/ModelableImpl.java
deleted file mode 100644
index dac5969..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/ModelableImpl.java
+++ /dev/null
@@ -1,141 +0,0 @@
-/*

- * Copyright 2013 The Apache Software Foundation.

- *

- * Licensed under the Apache License, Version 2.0 (the "License");

- * you may not use this file except in compliance with the License.

- * You may obtain a copy of the License at

- *

- *      http://www.apache.org/licenses/LICENSE-2.0

- *

- * Unless required by applicable law or agreed to in writing, software

- * distributed under the License is distributed on an "AS IS" BASIS,

- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

- * See the License for the specific language governing permissions and

- * limitations under the License.

- */

-package opennlp.modelbuilder.v2.impls;

-

-import java.io.BufferedOutputStream;

-import java.io.File;

-import java.io.FileInputStream;

-import java.io.FileOutputStream;

-import java.io.FileWriter;

-import java.io.IOException;

-import java.io.OutputStream;

-import java.nio.charset.Charset;

-import java.util.HashMap;

-import java.util.HashSet;

-import java.util.Map;

-import java.util.Set;

-import java.util.logging.Level;

-import java.util.logging.Logger;

-import opennlp.modelbuilder.v2.Modelable;

-import opennlp.tools.namefind.NameFinderME;

-import opennlp.tools.namefind.NameSample;

-import opennlp.tools.namefind.NameSampleDataStream;

-import opennlp.tools.namefind.TokenNameFinderModel;

-import opennlp.tools.tokenize.TokenizerME;

-import opennlp.tools.tokenize.TokenizerModel;

-import opennlp.tools.util.ObjectStream;

-import opennlp.tools.util.PlainTextByLineStream;

-

-/**

- *

- */

-public class ModelableImpl implements Modelable {

-

-  private TokenizerModel tm;

-  private TokenizerME wordBreaker;

-  private String path = "c:\\temp\\opennlpmodels\\";

-  private String trainingDataPath = "";

-  private String modelOutPath = "";

-  private Set<String> annotatedSentences = new HashSet<String>();

-  private Map<String, String> params = new HashMap<String, String>();

-

-  @Override

-  public void setParameters(Map<String, String> params) {

-    this.params = params;

-    path = params.get("modelablepath");

-    trainingDataPath = path + "\\" + params.get("knownentitytype") + ".train";

-    modelOutPath = path + "\\" + params.get("knownentitytype")+".model";

-  }

-

-  @Override

-  public String annotate(String sentence, String namedEntity, String entityType) {

-    String annotation = sentence.replace(namedEntity, " <START:" + entityType + "> " + namedEntity + " <END> ");

-

-    return annotation;

-  }

-

-  @Override

-  public void writeAnnotatedSentences() {

-    try {

-

-      FileWriter writer = new FileWriter(trainingDataPath, false);

-

-      for (String s : annotatedSentences) {

-        writer.write(s.replace("\n", " ").trim() + "\n");

-      }

-      writer.close();

-    } catch (IOException ex) {

-      ex.printStackTrace();

-    }

-  }

-

-  @Override

-  public Set<String> getAnnotatedSentences() {

-    return annotatedSentences;

-  }

-

-  @Override

-  public void setAnnotatedSentences(Set<String> annotatedSentences) {

-    this.annotatedSentences = annotatedSentences;

-  }

-

-  @Override

-  public void addAnnotatedSentence(String annotatedSentence) {

-    annotatedSentences.add(annotatedSentence);

-  }

-

-  @Override

-  public void buildModel(String entityType) {

-    try {

-      System.out.println("\tBuilding Model using " + annotatedSentences.size() + " annotations");

-      System.out.println("\t\treading training data...");

-      Charset charset = Charset.forName("UTF-8");

-      ObjectStream<String> lineStream =

-              new PlainTextByLineStream(new FileInputStream(trainingDataPath), charset);

-      ObjectStream<NameSample> sampleStream = new NameSampleDataStream(lineStream);

-

-      TokenNameFinderModel model;

-      model = NameFinderME.train("en", entityType, sampleStream, null);

-      sampleStream.close();

-      OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(new File(modelOutPath)));

-      model.serialize(modelOut);

-      if (modelOut != null) {

-        modelOut.close();

-      }

-      System.out.println("\tmodel generated");

-    } catch (Exception e) {

-    }

-  }

-

-  @Override

-  public TokenNameFinderModel getModel() {

-

-

-    TokenNameFinderModel nerModel = null;

-    try {

-      nerModel = new TokenNameFinderModel(new FileInputStream(new File(modelOutPath)));

-    } catch (IOException ex) {

-      Logger.getLogger(ModelableImpl.class.getName()).log(Level.SEVERE, null, ex);

-    }

-    return nerModel;

-  }

-

-  @Override

-  public String[] tokenizeSentenceToWords(String sentence) {

-    return sentence.split(" ");

-

-  }

-}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/MySQLSentenceProviderImpl.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/MySQLSentenceProviderImpl.java
deleted file mode 100644
index c90e791..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/MySQLSentenceProviderImpl.java
+++ /dev/null
@@ -1,88 +0,0 @@
-/*

- * Copyright 2013 The Apache Software Foundation.

- *

- * Licensed under the Apache License, Version 2.0 (the "License");

- * you may not use this file except in compliance with the License.

- * You may obtain a copy of the License at

- *

- *      http://www.apache.org/licenses/LICENSE-2.0

- *

- * Unless required by applicable law or agreed to in writing, software

- * distributed under the License is distributed on an "AS IS" BASIS,

- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

- * See the License for the specific language governing permissions and

- * limitations under the License.

- */

-package opennlp.modelbuilder.v2.impls;

-

-import java.sql.*;

-import java.util.HashMap;

-import java.util.HashSet;

-import java.util.Map;

-import java.util.Set;

-import opennlp.modelbuilder.v2.SentenceProvider;

-

-/**

- *

- */

-public class MySQLSentenceProviderImpl implements SentenceProvider {

-

-  Set<String> sentences = new HashSet<String>();

-

-  @Override

-  public Set<String> getSentences() {

-    try {

-      if (sentences.isEmpty()) {

-        return getData();

-      }

-    } catch (Exception e) {

-    }

-    return sentences;

-  }

-

-  private Set<String> getData() throws Exception {

-

-    Connection con = getMySqlConnection();

-    if (con.isClosed()) {

-      con = getMySqlConnection();

-    }

-    CallableStatement cs;

-    cs = con.prepareCall("CALL getTrainingSentences()");

-

-    ResultSet rs;

-    try {

-      rs = cs.executeQuery();

-      while (rs.next()) {

-        sentences.add(rs.getString(1));

-      }

-

-    } catch (SQLException ex) {

-      throw ex;

-    } catch (Exception e) {

-      System.err.println(e);

-    } finally {

-      con.close();

-    }

-

-    return sentences;

-  }

-

-  private static Connection getMySqlConnection() throws Exception {

-    // EntityLinkerProperties property = new EntityLinkerProperties(new File("c:\\temp\\opennlpmodels\\entitylinker.properties"));

-    String driver = "org.gjt.mm.mysql.Driver";

-    String url = "jdbc:mysql://localhost:3306/db";

-    String username = "root";

-    String password = "??";

-

-    Class.forName(driver);

-    Connection conn = DriverManager.getConnection(url, username, password);

-    return conn;

-  }

-

- private Map<String, String> params = new HashMap<String,String>();

-

-  @Override

-  public void setParameters(Map<String, String> params) {

-    this.params = params;

-  }

-}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/PersonKnownEntityProviderImpl.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/PersonKnownEntityProviderImpl.java
deleted file mode 100644
index 80c643a..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/PersonKnownEntityProviderImpl.java
+++ /dev/null
@@ -1,108 +0,0 @@
-/*

- * Copyright 2013 The Apache Software Foundation.

- *

- * Licensed under the Apache License, Version 2.0 (the "License");

- * you may not use this file except in compliance with the License.

- * You may obtain a copy of the License at

- *

- *      http://www.apache.org/licenses/LICENSE-2.0

- *

- * Unless required by applicable law or agreed to in writing, software

- * distributed under the License is distributed on an "AS IS" BASIS,

- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

- * See the License for the specific language governing permissions and

- * limitations under the License.

- */

-package opennlp.modelbuilder.v2.impls;

-

-import java.util.HashMap;

-import java.util.HashSet;

-import java.util.Map;

-import java.util.Set;

-import opennlp.modelbuilder.v2.KnownEntityProvider;

-

-/**

- *

- */

-public class PersonKnownEntityProviderImpl implements KnownEntityProvider {

-

-  Set<String> ret = new HashSet<String>();

-

-  @Override

-  public Set<String> getKnownEntities() {

-    if (ret.isEmpty()) {

-      ret.add("Barack Obama");

-      ret.add("Mitt Romney");

-      ret.add("John Doe");

-      ret.add("Bill Gates");

-      ret.add("Nguyen Tan Dung");

-      ret.add("Hassanal Bolkiah");

-      ret.add("Bashar al-Assad");

-      ret.add("Faysal Khabbaz Hamou");

-      ret.add("Dr Talwar");

-      ret.add("Mr. Bolkiah");

-      ret.add("Bashar");

-      ret.add("Romney");

-      ret.add("Obama");

-      ret.add("the President");

-      ret.add("Mr. Gates");

-      ret.add("Romney");

-

-

-

-      ret.add("Xi Jinping");

-      ret.add("Hassanal Bolkiah");

-      ret.add("Leon Panetta");

-      ret.add("Paul Beales");

-      ret.add("Mr Rajapaksa");

-      ret.add("Mohammed ");

-      ret.add("Ieng Thirith");

-      ret.add("Mr Xi");

-      ret.add("John Sudworth");

-      ret.add("Ieng Thirith");

-      ret.add("Aung San Suu Kyi");

-

-      ret.add("Khorshid");

-      ret.add("Karrie Webb");

-      ret.add("Doyle McManus");

-      ret.add("Pope John Paul");

-      ret.add("Roland Buerk");

-      ret.add("Paul Ryan");

-      ret.add("Tammy Baldwin");

-      ret.add("Ben Unger");

-      ret.add("Chris Christie");

-      ret.add("Mary Magdalene");

-      ret.add("George Walker Bush");

-      ret.add("Melendez-Martinez");

-      ret.add("Osiel Cardenas Guillen");

-      ret.add("President Molina");

-      ret.add("Lubaina Himid");

-      ret.add("Elizabeth Frink");

-      ret.add("Graham Sutherland");

-      ret.add("Gorman Adams");

-      ret.add("Peter Sheasby");

-      ret.add("Andrew Walker");

-      ret.add("Elias Garcia Martinez");

-      ret.add("Elias Martinez");

-

-    }

-    return ret;

-  }

-

-  @Override

-  public String getKnownEntitiesType() {

-    return "person";

-  }

-

-  @Override

-  public void addKnownEntity(String unambiguousEntity) {

-    ret.add(unambiguousEntity);

-  }

-

-  private Map<String, String> params = new HashMap<String,String>();

-

-  @Override

-  public void setParameters(Map<String, String> params) {

-    this.params = params;

-  }

-}