Prototype of a tool to allow users to create models from a set of known entities, based on their own data in the form of sentences.
See the Example class in the .v2 package.

diff --git a/modelbuilder-prototype/pom.xml b/modelbuilder-prototype/pom.xml
new file mode 100644
index 0000000..f011971
--- /dev/null
+++ b/modelbuilder-prototype/pom.xml
@@ -0,0 +1,30 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"

+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">

+  <modelVersion>4.0.0</modelVersion>

+

+  <groupId>modelbuilder</groupId>

+  <artifactId>modelbuilder-prototype</artifactId>

+  <version>1.0-SNAPSHOT</version>

+  <packaging>jar</packaging>

+

+  <name>modelbuilder-prototype</name>

+  <url>http://maven.apache.org</url>

+

+  <properties>

+    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>

+  </properties>

+

+  <dependencies>

+    <dependency>

+      <groupId>junit</groupId>

+      <artifactId>junit</artifactId>

+      <version>3.8.1</version>

+      <scope>test</scope>

+    </dependency>

+      <dependency>

+      <groupId>org.apache.opennlp</groupId>

+      <artifactId>opennlp-tools</artifactId>

+      <version>1.6.0-SNAPSHOT</version>

+    </dependency>

+  </dependencies>

+</project>

diff --git a/modelbuilder-prototype/src/main/java/modelbuilder/App.java b/modelbuilder-prototype/src/main/java/modelbuilder/App.java
new file mode 100644
index 0000000..b788f6c
--- /dev/null
+++ b/modelbuilder-prototype/src/main/java/modelbuilder/App.java
@@ -0,0 +1,13 @@
+package modelbuilder;

+

+/**

+ * Placeholder entry point; {@code main} has an empty body.

+ * The example for this prototype is opennlp.modelbuilder.v2.Example.

+ */

+public class App 

+{

+    public static void main( String[] args )

+    {

+       

+    }

+}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/Example.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/Example.java
new file mode 100644
index 0000000..eed4c1c
--- /dev/null
+++ b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/Example.java
@@ -0,0 +1,67 @@
+/*

+ * To change this template, choose Tools | Templates

+ * and open the template in the editor.

+ */

+package opennlp.modelbuilder.v2;

+

+import java.util.HashMap;

+import java.util.Map;

+import opennlp.modelbuilder.v2.impls.FileKnownEntityProvider;

+import opennlp.modelbuilder.v2.impls.FileModelValidatorImpl;

+import opennlp.modelbuilder.v2.impls.FileSentenceProvider;

+import opennlp.modelbuilder.v2.impls.ModelableImpl;

+

+/**

+ * Shows how the file-based components are configured and handed to GenericModelGenerator.

+ * @author Owner

+ */

+public class Example {

+

+  public static void main(String[] args) {

+

+    GenericModelGenerator modelGenerator = new GenericModelGenerator();

+    //every component has a map as a place to receive params

+    //these are required for the current file-based impls

+    Map<String, String> params = new HashMap<String, String>();

+    params.put("sentencesfile", "/the/file");

+    params.put("knownentityfile", "/the/file");

+    params.put("knownentitytype", "person");

+    params.put("blacklistfile", "/the/file");

+    params.put("modelablepath", "/the/file");

+

+    /**

+     * Sentence providers feed this process with sentences derived from the user's data.

+     * This impl just reads a file line by line.

+     */

+    SentenceProvider sentenceProvider = new FileSentenceProvider();

+    sentenceProvider.setParameters(params);

+    /**

+     * KnownEntityProviders provide a seed list of known entities, such as Barack Obama for person or Germany for location.

+     * These should be prolific, unambiguous names.

+     */

+    KnownEntityProvider knownEntityProvider = new FileKnownEntityProvider();

+    knownEntityProvider.setParameters(params);

+    /**

+     * ModelGenerationValidators try to weed out bad hits produced by the iterations of the name finder.

+     * Since this is an iterative process, with each iteration the name finder gets more and more greedy if bad entities are allowed in;

+     * the validator provides a mechanism for throwing out obviously bad hits.

+     * A good impl may be to make sure a location is actually within a noun phrase etc. Users can make this as specific as they need for their data

+     * and their use case.

+     */

+    ModelGenerationValidator validator = new FileModelValidatorImpl();

+    validator.setParameters(params);

+    /**

+     * Modelables write and read the annotated sentences, and create and write the NER models.

+     */

+

+    Modelable modelable = new ModelableImpl();

+    modelable.setParameters(params);

+

+    /**

+     * The modelGenerator runs the process for a set number of iterations. It could be improved by calculating the

+     * diff between runs and stopping at a threshold, but for extremely large sentence sets this may be too much.

+     */

+    modelGenerator.build(sentenceProvider, knownEntityProvider, validator, modelable, 3);

+

+  }

+}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/GenericModelGenerator.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/GenericModelGenerator.java
new file mode 100644
index 0000000..4fe9d89
--- /dev/null
+++ b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/GenericModelGenerator.java
@@ -0,0 +1,70 @@
+/*

+ * To change this template, choose Tools | Templates

+ * and open the template in the editor.

+ */

+package opennlp.modelbuilder.v2;

+

+import java.util.HashMap;

+import java.util.Map;

+import opennlp.tools.namefind.NameFinderME;

+import opennlp.tools.util.Span;

+

+/**

+ * Runs the iterative loop: annotate sentences with the known entities, write them and build a model,

+ * then run that model over the sentences and feed validated hits back as new known entities.

+ */

+public class GenericModelGenerator implements SemiSupervisedModelGenerator{

+ private Map<String, String> params = new HashMap<String, String>();

+  // Stores the parameter map; build() does not read it.

+  @Override

+  public void setParameters(Map<String, String> params) {

+    this.params = params;

+  }

+  @Override

+  public void build(SentenceProvider sentenceProvider, KnownEntityProvider knownEntityProvider,

+          ModelGenerationValidator validator, Modelable modelable, int iterations) {

+    for (int iteration = 0; iteration < iterations; iteration++) {

+      System.out.println("ITERATION: " + iteration);

+      System.out.println("\tPerfoming Known Entity Annotation");

+      System.out.println("\t\tknowns: " + knownEntityProvider.getKnownEntities().size());

+      System.out.println("\t\treading data....: ");

+      for (String sentence : sentenceProvider.getSentences()) {

+        for (String knownEntity : knownEntityProvider.getKnownEntities()) {

+          if (sentence.contains(knownEntity)) {

+            //open question: if the same sentence has multiple hits, should they be annotated separately?

+            modelable.addAnnotatedSentence(modelable.annotate(sentence, knownEntity, knownEntityProvider.getKnownEntitiesType()));

+          }

+        }

+      }

+      System.out.println("\t\twriting annotated sentences....: ");

+      modelable.writeAnnotatedSentences();

+      modelable.buildModel(knownEntityProvider.getKnownEntitiesType());

+      NameFinderME nf = new NameFinderME(modelable.getModel());

+      System.out.println("\t\tannotated sentences: " + modelable.getAnnotatedSentences().size());

+      System.out.println("\tPerforming NER");

+      for (String sentence : sentenceProvider.getSentences()) {

+        if (!validator.validSentence(sentence)) {

+          continue;

+        }

+        String[] tokens = modelable.tokenizeSentenceToWords(sentence);

+        // run the name finder built in this iteration over the tokens

+        Span[] find = nf.find(tokens);

+        nf.clearAdaptiveData();

+

+        String[] namedEntities = Span.spansToStrings(find, tokens);

+        // every validated hit becomes a new known entity and a new annotated sentence

+        for (String namedEntity : namedEntities) {

+          if (validator.validNamedEntity(namedEntity)) {

+            knownEntityProvider.addKnownEntity(namedEntity);

+            modelable.addAnnotatedSentence(modelable.annotate(sentence, namedEntity, knownEntityProvider.getKnownEntitiesType()));

+

+          }

+        }

+      }

+      System.out.println("\t\tannotated sentences: " + modelable.getAnnotatedSentences().size());

+      System.out.println("\t\tknowns: " + knownEntityProvider.getKnownEntities().size());

+    }

+    modelable.writeAnnotatedSentences();

+    modelable.buildModel(knownEntityProvider.getKnownEntitiesType());

+  }

+}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/KnownEntityProvider.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/KnownEntityProvider.java
new file mode 100644
index 0000000..fd04cd4
--- /dev/null
+++ b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/KnownEntityProvider.java
@@ -0,0 +1,35 @@
+/*

+ * To change this template, choose Tools | Templates

+ * and open the template in the editor.

+ */

+package opennlp.modelbuilder.v2;

+

+import java.util.List;

+import java.util.Set;

+

+

+

+/**

+ * Supplies, and accumulates, the known entities of a single type.

+ * @author Owner

+ */

+public interface KnownEntityProvider extends ModelParameter{

+  /**

+ * returns the set of known, unambiguous entities.

+ * @return a set of entities

+ */

+  Set<String> getKnownEntities();

+/**

+ * adds to the set of known entities. Implementing classes should hold the entities in a class level set.

+ * @param unambiguousEntity the entity to add

+ */

+  void addKnownEntity(String unambiguousEntity);

+/**

+ * defines the type of entity that the set contains, e.g. person, location, organization.

+ * @return the entity type

+ */

+  String getKnownEntitiesType();

+  

+  

+  

+}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelGenerationValidator.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelGenerationValidator.java
new file mode 100644
index 0000000..1ecc13f
--- /dev/null
+++ b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelGenerationValidator.java
@@ -0,0 +1,23 @@
+/*

+ * To change this template, choose Tools | Templates

+ * and open the template in the editor.

+ */

+package opennlp.modelbuilder.v2;

+

+import java.util.Collection;

+import java.util.Set;

+

+/**

+ * Decides which sentences and which named-entity hits are acceptable during model generation.

+ * @author Owner

+ */

+public interface ModelGenerationValidator extends ModelParameter {

+  // GenericModelGenerator skips a sentence when this returns false.

+  Boolean validSentence(String sentence);

+  // GenericModelGenerator only feeds a hit back as a known entity when this returns true.

+  Boolean validNamedEntity(String namedEntity);

+  

+

+  // The collection of entries this validator treats as bad entities.

+  Collection<String> getBlackList();

+}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelParameter.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelParameter.java
new file mode 100644
index 0000000..59a60f0
--- /dev/null
+++ b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelParameter.java
@@ -0,0 +1,17 @@
+/*

+ * To change this template, choose Tools | Templates

+ * and open the template in the editor.

+ */

+package opennlp.modelbuilder.v2;

+

+import java.util.HashMap;

+import java.util.Map;

+

+/**

+ * Implemented by components that receive their configuration as a map of string keys to string values.

+ * @author Owner

+ */

+public interface ModelParameter {

+   

+  void setParameters(Map<String, String> params);

+}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/Modelable.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/Modelable.java
new file mode 100644
index 0000000..18dd632
--- /dev/null
+++ b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/Modelable.java
@@ -0,0 +1,37 @@
+/*

+ * To change this template, choose Tools | Templates

+ * and open the template in the editor.

+ */

+package opennlp.modelbuilder.v2;

+

+import java.util.List;

+import java.util.Map;

+import java.util.Set;

+import opennlp.tools.namefind.TokenNameFinderModel;

+

+/**

+ * Annotates sentences, holds and writes the annotated sentences, and builds and returns the name finder model.

+ * @author Owner

+ */

+public interface Modelable extends ModelParameter{

+

+

+  // Returns the sentence annotated for the given entity and entity type.

+  String annotate(String sentence, String namedEntity, String entityType);

+

+  void writeAnnotatedSentences();

+

+  Set<String> getAnnotatedSentences();

+

+  void setAnnotatedSentences(Set<String> annotatedSentences);

+

+  void addAnnotatedSentence(String annotatedSentence);

+

+  void buildModel( String entityType);

+  // GenericModelGenerator calls this right after buildModel to create its NameFinderME.

+  TokenNameFinderModel getModel();

+

+  String[] tokenizeSentenceToWords(String sentence);

+  

+

+}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SemiSupervisedModelGenerator.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SemiSupervisedModelGenerator.java
new file mode 100644
index 0000000..6f2020e
--- /dev/null
+++ b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SemiSupervisedModelGenerator.java
@@ -0,0 +1,15 @@
+/*

+ * To change this template, choose Tools | Templates

+ * and open the template in the editor.

+ */

+package opennlp.modelbuilder.v2;

+

+/**

+ * Builds a model from sentences and seed entities over a given number of iterations.

+ * @author Owner

+ */

+public interface SemiSupervisedModelGenerator extends ModelParameter {

+

+  void build(SentenceProvider sentenceProvider, KnownEntityProvider knownEntityProvider, 

+          ModelGenerationValidator validator, Modelable modelable, int iterations);

+}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SentenceProvider.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SentenceProvider.java
new file mode 100644
index 0000000..8549966
--- /dev/null
+++ b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SentenceProvider.java
@@ -0,0 +1,16 @@
+/*

+ * To change this template, choose Tools | Templates

+ * and open the template in the editor.

+ */

+package opennlp.modelbuilder.v2;

+

+import java.util.Set;

+

+/**

+ * Supplies the set of sentences that the model generation process works on.

+ * @author Owner

+ */

+public interface SentenceProvider extends ModelParameter {

+

+  Set<String> getSentences();

+}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileKnownEntityProvider.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileKnownEntityProvider.java
new file mode 100644
index 0000000..aadb482
--- /dev/null
+++ b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileKnownEntityProvider.java
@@ -0,0 +1,86 @@
+/*

+ * To change this template, choose Tools | Templates

+ * and open the template in the editor.

+ */

+package opennlp.modelbuilder.v2.impls;

+

+import java.io.BufferedReader;

+import java.io.FileInputStream;

+import java.io.FileNotFoundException;

+import java.io.IOException;

+import java.io.InputStream;

+import java.io.InputStreamReader;

+import java.nio.charset.Charset;

+import java.util.HashMap;

+import java.util.HashSet;

+import java.util.Map;

+import java.util.Set;

+import java.util.logging.Level;

+import java.util.logging.Logger;

+import opennlp.modelbuilder.v2.KnownEntityProvider;

+

+/**

+ * KnownEntityProvider backed by a UTF-8 text file with one entity per line.

+ * The file is named by the "knownentityfile" parameter and the entity type by

+ * the "knownentitytype" parameter.

+ *

+ * @author Owner

+ */

+public class FileKnownEntityProvider implements KnownEntityProvider {

+

+  private static final Logger LOG = Logger.getLogger(FileKnownEntityProvider.class.getName());

+  private Map<String, String> params = new HashMap<String, String>();

+  Set<String> knownEntities = new HashSet<String>();

+

+  /**

+   * Returns the known entities, reading the file while the set is still empty.

+   * Read failures are logged and leave the set with whatever was read so far.

+   *

+   * @return the live, mutable set of known entities

+   */

+  @Override

+  public Set<String> getKnownEntities() {

+    if (knownEntities.isEmpty()) {

+      BufferedReader br = null;

+      try {

+        InputStream fis = new FileInputStream(params.get("knownentityfile"));

+        br = new BufferedReader(new InputStreamReader(fis, Charset.forName("UTF-8")));

+        String line;

+        while ((line = br.readLine()) != null) {

+          knownEntities.add(line);

+        }

+      } catch (IOException ex) {

+        // FileNotFoundException is an IOException, so a missing file is logged here too

+        LOG.log(Level.SEVERE, null, ex);

+      } finally {

+        // close even when reading fails so the file handle is not leaked

+        if (br != null) {

+          try {

+            br.close();

+          } catch (IOException ex) {

+            LOG.log(Level.WARNING, null, ex);

+          }

+        }

+      }

+    }

+    return knownEntities;

+  }

+

+  /** Adds an entity to the in-memory set; nothing is written back to the file. */

+  @Override

+  public void addKnownEntity(String unambiguousEntity) {

+    knownEntities.add(unambiguousEntity);

+  }

+

+  /** Returns the value of the "knownentitytype" parameter. */

+  @Override

+  public String getKnownEntitiesType() {

+    return params.get("knownentitytype");

+  }

+

+  /** Stores the parameter map that the other methods read from. */

+  @Override

+  public void setParameters(Map<String, String> params) {

+    this.params = params;

+  }

+}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileModelValidatorImpl.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileModelValidatorImpl.java
new file mode 100644
index 0000000..4e92cac
--- /dev/null
+++ b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileModelValidatorImpl.java
@@ -0,0 +1,107 @@
+/*

+ * To change this template, choose Tools | Templates

+ * and open the template in the editor.

+ */

+package opennlp.modelbuilder.v2.impls;

+

+import java.io.BufferedReader;

+import java.io.FileInputStream;

+import java.io.FileNotFoundException;

+import java.io.IOException;

+import java.io.InputStream;

+import java.io.InputStreamReader;

+import java.nio.charset.Charset;

+import java.util.Collection;

+import java.util.HashMap;

+import java.util.HashSet;

+import java.util.Map;

+import java.util.Set;

+import java.util.logging.Level;

+import java.util.logging.Logger;

+import java.util.regex.Pattern;

+import opennlp.modelbuilder.v2.ModelGenerationValidator;

+

+/**

+ * ModelGenerationValidator whose blacklist is read from the UTF-8 text file

+ * named by the "blacklistfile" parameter, one entry per line.

+ *

+ * @author Owner

+ */

+public class FileModelValidatorImpl implements ModelGenerationValidator {

+

+  private static final Logger LOG = Logger.getLogger(FileModelValidatorImpl.class.getName());

+  /** Matches any ASCII digit; compiled once rather than on every call. */

+  private static final Pattern DIGIT = Pattern.compile("[0-9]");

+  private Set<String> badentities = new HashSet<String>();

+  private final double MIN_SCORE_FOR_TRAINING = 0.95d;

+  private Object validationData;

+  private Map<String, String> params = new HashMap<String, String>();

+

+  /** Stores the parameter map; "blacklistfile" is read from it when the blacklist is loaded. */

+  @Override

+  public void setParameters(Map<String, String> params) {

+    this.params = params;

+  }

+

+  /**

+   * Accepts every sentence.

+   *

+   * @return always true

+   */

+  @Override

+  public Boolean validSentence(String sentence) {

+    //returning true by default, because the sentence provider will  return only "valid" sentences in this case

+    return true;

+  }

+

+  /**

+   * Rejects an entity that contains a digit or whose lower-cased form is on the blacklist.

+   *

+   * @param namedEntity the candidate entity text

+   * @return false when the entity is rejected, true otherwise

+   */

+  @Override

+  public Boolean validNamedEntity(String namedEntity) {

+    if (badentities.isEmpty()) {

+      getBlackList();

+    }

+    if (DIGIT.matcher(namedEntity).find()) {

+      return false;

+    }

+    return !badentities.contains(namedEntity.toLowerCase());

+  }

+

+  /**

+   * Returns the blacklist, loading it from the file while the set is still empty.

+   * Entries are stored lower-cased because validNamedEntity looks candidates up

+   * in lower case. Read failures are logged.

+   *

+   * @return the live, mutable blacklist

+   */

+  @Override

+  public Collection<String> getBlackList() {

+    if (badentities.isEmpty()) {

+      BufferedReader br = null;

+      try {

+        InputStream fis = new FileInputStream(params.get("blacklistfile"));

+        br = new BufferedReader(new InputStreamReader(fis, Charset.forName("UTF-8")));

+        String line;

+        while ((line = br.readLine()) != null) {

+          badentities.add(line.toLowerCase());

+        }

+      } catch (IOException ex) {

+        LOG.log(Level.SEVERE, null, ex);

+      } finally {

+        // close even when reading fails so the file handle is not leaked

+        if (br != null) {

+          try {

+            br.close();

+          } catch (IOException ex) {

+            LOG.log(Level.WARNING, null, ex);

+          }

+        }

+      }

+    }

+    return badentities;

+  }

+}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileSentenceProvider.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileSentenceProvider.java
new file mode 100644
index 0000000..fa24f04
--- /dev/null
+++ b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileSentenceProvider.java
@@ -0,0 +1,73 @@
+/*

+ * To change this template, choose Tools | Templates

+ * and open the template in the editor.

+ */

+package opennlp.modelbuilder.v2.impls;

+

+import java.io.BufferedReader;

+import java.io.FileInputStream;

+import java.io.FileNotFoundException;

+import java.io.IOException;

+import java.io.InputStream;

+import java.io.InputStreamReader;

+import java.nio.charset.Charset;

+import java.util.HashMap;

+import java.util.HashSet;

+import java.util.Map;

+import java.util.Set;

+import java.util.logging.Level;

+import java.util.logging.Logger;

+import opennlp.modelbuilder.v2.SentenceProvider;

+

+/**

+ * SentenceProvider backed by a UTF-8 text file, named by the "sentencesfile"

+ * parameter, in which every line is treated as one sentence.

+ *

+ * @author Owner

+ */

+public class FileSentenceProvider implements SentenceProvider {

+

+  private static final Logger LOG = Logger.getLogger(FileSentenceProvider.class.getName());

+  private Map<String, String> params = new HashMap<String, String>();

+  Set<String> sentences = new HashSet<String>();

+

+  /**

+   * Returns the sentences, reading the file while the set is still empty.

+   * Read failures are logged and leave the set with whatever was read so far.

+   *

+   * @return the live, mutable set of sentences

+   */

+  @Override

+  public Set<String> getSentences() {

+    if (sentences.isEmpty()) {

+      BufferedReader br = null;

+      try {

+        InputStream fis = new FileInputStream(params.get("sentencesfile"));

+        br = new BufferedReader(new InputStreamReader(fis, Charset.forName("UTF-8")));

+        String line;

+        while ((line = br.readLine()) != null) {

+          sentences.add(line);

+        }

+      } catch (IOException ex) {

+        // FileNotFoundException is an IOException, so a missing file is logged here too

+        LOG.log(Level.SEVERE, null, ex);

+      } finally {

+        // close even when reading fails so the file handle is not leaked

+        if (br != null) {

+          try {

+            br.close();

+          } catch (IOException ex) {

+            LOG.log(Level.WARNING, null, ex);

+          }

+        }

+      }

+    }

+    return sentences;

+  }

+

+  /** Stores the parameter map; "sentencesfile" is read from it when the file is loaded. */

+  @Override

+  public void setParameters(Map<String, String> params) {

+    this.params = params;

+  }

+}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/LocationKnownEntityProviderImpl.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/LocationKnownEntityProviderImpl.java
new file mode 100644
index 0000000..0e50594
--- /dev/null
+++ b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/LocationKnownEntityProviderImpl.java
@@ -0,0 +1,111 @@
+/*

+ * To change this template, choose Tools | Templates

+ * and open the template in the editor.

+ */

+package opennlp.modelbuilder.v2.impls;

+

+import java.sql.CallableStatement;

+import java.sql.Connection;

+import java.sql.DriverManager;

+import java.sql.ResultSet;

+import java.sql.SQLException;

+import java.util.HashMap;

+import java.util.HashSet;

+import java.util.Map;

+import java.util.Set;

+import java.util.logging.Level;

+import java.util.logging.Logger;

+import opennlp.modelbuilder.v2.KnownEntityProvider;

+

+/**

+ * KnownEntityProvider of type "location" whose entities come from the MySQL

+ * stored procedure getcountrylist().

+ *

+ * @author Owner

+ */

+public class LocationKnownEntityProviderImpl implements KnownEntityProvider {

+

+  private static final Logger LOG = Logger.getLogger(LocationKnownEntityProviderImpl.class.getName());

+  // NOTE(review): these connection defaults, including a password, are committed in

+  // source. They can be overridden through setParameters ("dbdriver", "dburl",

+  // "dbusername", "dbpassword"); consider removing the defaults altogether.

+  private static final String DEFAULT_DRIVER = "org.gjt.mm.mysql.Driver";

+  private static final String DEFAULT_URL = "jdbc:mysql://127.0.0.1:3306/world";

+  private static final String DEFAULT_USERNAME = "root";

+  private static final String DEFAULT_PASSWORD = "559447";

+  Set<String> ret = new HashSet<String>();

+  private Map<String, String> params = new HashMap<String, String>();

+

+  /**

+   * Returns the known locations, querying the database while the set is still empty.

+   * Failures are logged and leave the set unchanged or partially filled.

+   *

+   * @return the live, mutable set of known entities

+   */

+  @Override

+  public Set<String> getKnownEntities() {

+    if (ret.isEmpty()) {

+      try {

+        getData();

+      } catch (Exception ex) {

+        LOG.log(Level.SEVERE, null, ex);

+      }

+    }

+    return ret;

+  }

+

+  /** Adds the "full_name_nd_ro" column of every row returned by getcountrylist() to the set. */

+  private Set<String> getData() throws Exception {

+    Connection con = getMySqlConnection();

+    try {

+      CallableStatement cs = con.prepareCall("CALL getcountrylist()");

+      try {

+        ResultSet rs = cs.executeQuery();

+        try {

+          while (rs.next()) {

+            ret.add(rs.getString("full_name_nd_ro"));

+          }

+        } finally {

+          rs.close();

+        }

+      } finally {

+        cs.close();

+      }

+    } finally {

+      // always release the connection, also when the call or the read fails

+      con.close();

+    }

+    return ret;

+  }

+

+  /** Opens a connection using the configured values, falling back to the defaults. */

+  private Connection getMySqlConnection() throws ClassNotFoundException, SQLException {

+    Class.forName(param("dbdriver", DEFAULT_DRIVER));

+    return DriverManager.getConnection(param("dburl", DEFAULT_URL),

+            param("dbusername", DEFAULT_USERNAME), param("dbpassword", DEFAULT_PASSWORD));

+  }

+

+  /** Returns the parameter value for the key, or the default when it is absent. */

+  private String param(String key, String defaultValue) {

+    String value = params == null ? null : params.get(key);

+    return value == null ? defaultValue : value;

+  }

+

+  /** Always returns "location". */

+  @Override

+  public String getKnownEntitiesType() {

+    return "location";

+  }

+

+  /** Adds an entity to the in-memory set; the database is not modified. */

+  @Override

+  public void addKnownEntity(String unambiguousEntity) {

+    ret.add(unambiguousEntity);

+  }

+

+  /** Stores the parameter map used to override the connection defaults. */

+  @Override

+  public void setParameters(Map<String, String> params) {

+    this.params = params;

+  }

+}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/ModelValidatorImpl.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/ModelValidatorImpl.java
new file mode 100644
index 0000000..882ce2a
--- /dev/null
+++ b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/ModelValidatorImpl.java
@@ -0,0 +1,87 @@
+/*

+ * To change this template, choose Tools | Templates

+ * and open the template in the editor.

+ */

+package opennlp.modelbuilder.v2.impls;

+

+import java.util.HashMap;

+import java.util.HashSet;

+import java.util.Map;

+import java.util.Set;

+import java.util.regex.Pattern;

+import opennlp.modelbuilder.v2.ModelGenerationValidator;

+

+/**

+ * ModelGenerationValidator with a fixed, built-in blacklist of tokens that

+ * are not accepted as named entities.

+ *

+ * @author Owner

+ */

+public class ModelValidatorImpl implements ModelGenerationValidator {

+

+  /** Matches any ASCII digit; compiled once rather than on every call. */

+  private static final Pattern DIGIT = Pattern.compile("[0-9]");

+  /** Built-in blacklist entries; they are lower-cased when added to the set. */

+  private static final String[] BLACKLIST = {

+    ".", "-", ",", ";", "the", "that", "several", "model", "our", "are", "in", "at",

+    "is", "for", "during", "south", "from", "recounts", "wissenschaftliches", "if",

+    "security", "denouncing", "writes", "but", "operation", "adds", "Above", "RIP",

+    "on", "no", "agrees", "year", "you", "red", "added", "hello", "around", "has",

+    "turn", "surrounding", "\" No", "aug.", "or", "quips", "september", "[mr",

+    "diseases", "when", "bbc", ":\"", "dr", "baby", "route", "'", "\"", "a", "her",

+    "two", ":", "one", "Party", "Championship", "Ltd"

+  };

+  private Set<String> badentities = new HashSet<String>();

+  private final double MIN_SCORE_FOR_TRAINING = 0.95d;

+  private Object validationData;

+  private Map<String, String> params = new HashMap<String, String>();

+

+  /** Stores the parameter map; this implementation does not read it. */

+  @Override

+  public void setParameters(Map<String, String> params) {

+    this.params = params;

+  }

+

+  /**

+   * Accepts every sentence.

+   *

+   * @return always true

+   */

+  @Override

+  public Boolean validSentence(String sentence) {

+    //returning true by default, because the sentence provider will  return only "valid" sentences in this case

+    return true;

+  }

+

+  /**

+   * Rejects an entity that contains a digit or whose lower-cased form is on the blacklist.

+   *

+   * @param namedEntity the candidate entity text

+   * @return false when the entity is rejected, true otherwise

+   */

+  @Override

+  public Boolean validNamedEntity(String namedEntity) {

+    if (badentities.isEmpty()) {

+      getBlackList();

+    }

+    if (DIGIT.matcher(namedEntity).find()) {

+      return false;

+    }

+    return !badentities.contains(namedEntity.toLowerCase());

+  }

+

+  /**

+   * Returns the blacklist, adding the built-in entries to the set on every call.

+   * Entries are stored lower-cased because validNamedEntity looks candidates up

+   * in lower case, so mixed-case entries such as "Ltd" can match.

+   *

+   * @return the live, mutable blacklist

+   */

+  @Override

+  public Set<String> getBlackList() {

+    for (String entry : BLACKLIST) {

+      badentities.add(entry.toLowerCase());

+    }

+    return badentities;

+  }

+}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/ModelableImpl.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/ModelableImpl.java
new file mode 100644
index 0000000..76608b7
--- /dev/null
+++ b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/ModelableImpl.java
@@ -0,0 +1,187 @@
+/*

+ * To change this template, choose Tools | Templates

+ * and open the template in the editor.

+ */

+package opennlp.modelbuilder.v2.impls;

+

+import java.io.BufferedOutputStream;

+import java.io.Closeable;

+import java.io.File;

+import java.io.FileInputStream;

+import java.io.FileOutputStream;

+import java.io.FileWriter;

+import java.io.IOException;

+import java.io.OutputStream;

+import java.io.OutputStreamWriter;

+import java.io.Writer;

+import java.nio.charset.Charset;

+import java.util.HashMap;

+import java.util.HashSet;

+import java.util.Map;

+import java.util.Set;

+import java.util.logging.Level;

+import java.util.logging.Logger;

+import opennlp.modelbuilder.v2.Modelable;

+import opennlp.tools.namefind.NameFinderME;

+import opennlp.tools.namefind.NameSample;

+import opennlp.tools.namefind.NameSampleDataStream;

+import opennlp.tools.namefind.TokenNameFinderModel;

+import opennlp.tools.tokenize.TokenizerME;

+import opennlp.tools.tokenize.TokenizerModel;

+import opennlp.tools.util.ObjectStream;

+import opennlp.tools.util.PlainTextByLineStream;

+

+/**

+ * Modelable that keeps the annotated sentences in memory, writes them to a

+ * training file, and trains, saves and loads the name finder model.

+ *

+ * @author Owner

+ */

+public class ModelableImpl implements Modelable {

+

+  private static final Logger LOG = Logger.getLogger(ModelableImpl.class.getName());

+  private static final Charset UTF8 = Charset.forName("UTF-8");

+  // File names are appended directly to path; no separator is inserted.

+  private static final String TRAINING_FILE = "en-ner-person.train";

+  private static final String MODEL_FILE = "en-ner-person.train.model";

+  // Not used while tokenizeSentenceToWords splits on spaces.

+  private TokenizerModel tm;

+  private TokenizerME wordBreaker;

+  private String path = "c:\\temp\\opennlpmodels\\";

+  private Set<String> annotatedSentences = new HashSet<String>();

+  private Map<String, String> params = new HashMap<String, String>();

+

+  /**

+   * Stores the parameters and, when "modelablepath" is present, uses it as the

+   * prefix of the training and model files; otherwise the current path is kept.

+   */

+  @Override

+  public void setParameters(Map<String, String> params) {

+    this.params = params;

+    String configuredPath = params.get("modelablepath");

+    if (configuredPath != null) {

+      path = configuredPath;

+    }

+  }

+

+  /**

+   * Surrounds every occurrence of the entity text in the sentence with

+   * {@code <START:entityType>} and {@code <END>} markers.

+   *

+   * @return the annotated sentence

+   */

+  @Override

+  public String annotate(String sentence, String namedEntity, String entityType) {

+    return sentence.replace(namedEntity, " <START:" + entityType + "> " + namedEntity + " <END> ");

+  }

+

+  /**

+   * Overwrites the training file with the annotated sentences, one per line,

+   * encoded as UTF-8 to match the charset buildModel reads it with.

+   * I/O failures are logged.

+   */

+  @Override

+  public void writeAnnotatedSentences() {

+    Writer writer = null;

+    try {

+      writer = new OutputStreamWriter(new FileOutputStream(path + TRAINING_FILE, false), UTF8);

+      for (String s : annotatedSentences) {

+        writer.write(s.replace("\n", "").trim() + "\n");

+      }

+    } catch (IOException ex) {

+      LOG.log(Level.SEVERE, null, ex);

+    } finally {

+      closeQuietly(writer);

+    }

+  }

+

+  @Override

+  public Set<String> getAnnotatedSentences() {

+    return annotatedSentences;

+  }

+

+  @Override

+  public void setAnnotatedSentences(Set<String> annotatedSentences) {

+    this.annotatedSentences = annotatedSentences;

+  }

+

+  /** Adds the sentence to the set; a null sentence is ignored. */

+  @Override

+  public void addAnnotatedSentence(String annotatedSentence) {

+    if (annotatedSentence != null) {

+      annotatedSentences.add(annotatedSentence);

+    }

+  }

+

+  /**

+   * Trains a name finder model from the training file and saves it to the model file.

+   * Failures are logged; no exception reaches the caller.

+   *

+   * @param entityType passed to NameFinderME.train as its type argument

+   */

+  @Override

+  public void buildModel(String entityType) {

+    ObjectStream<NameSample> sampleStream = null;

+    OutputStream modelOut = null;

+    try {

+      System.out.println("\tBuilding Model using " + annotatedSentences.size() + " annotations");

+      System.out.println("\t\treading training data...");

+      ObjectStream<String> lineStream =

+              new PlainTextByLineStream(new FileInputStream(path + TRAINING_FILE), UTF8);

+      sampleStream = new NameSampleDataStream(lineStream);

+      TokenNameFinderModel model = NameFinderME.train("en", entityType, sampleStream, null);

+      modelOut = new BufferedOutputStream(new FileOutputStream(new File(path + MODEL_FILE)));

+      model.serialize(modelOut);

+      modelOut.close();

+      System.out.println("\tmodel generated");

+    } catch (Exception e) {

+      LOG.log(Level.SEVERE, null, e);

+    } finally {

+      if (sampleStream != null) {

+        try {

+          sampleStream.close();

+        } catch (Exception ex) {

+          LOG.log(Level.WARNING, null, ex);

+        }

+      }

+      closeQuietly(modelOut);

+    }

+  }

+

+  /**

+   * Loads the model from the model file.

+   *

+   * @return the model, or null when the file cannot be read

+   */

+  @Override

+  public TokenNameFinderModel getModel() {

+    TokenNameFinderModel nerModel = null;

+    FileInputStream in = null;

+    try {

+      in = new FileInputStream(new File(path + MODEL_FILE));

+      nerModel = new TokenNameFinderModel(in);

+    } catch (IOException ex) {

+      LOG.log(Level.SEVERE, null, ex);

+    } finally {

+      closeQuietly(in);

+    }

+    return nerModel;

+  }

+

+  /** Splits the sentence on single space characters. */

+  @Override

+  public String[] tokenizeSentenceToWords(String sentence) {

+    return sentence.split(" ");

+  }

+

+  /** Closes the resource if it was opened; a failure to close is logged, not thrown. */

+  private static void closeQuietly(Closeable resource) {

+    if (resource != null) {

+      try {

+        resource.close();

+      } catch (IOException ex) {

+        LOG.log(Level.WARNING, null, ex);

+      }

+    }

+  }

+}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/MySQLSentenceProviderImpl.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/MySQLSentenceProviderImpl.java
new file mode 100644
index 0000000..cac060e
--- /dev/null
+++ b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/MySQLSentenceProviderImpl.java
@@ -0,0 +1,78 @@
+/*

+ * To change this template, choose Tools | Templates

+ * and open the template in the editor.

+ */

+package opennlp.modelbuilder.v2.impls;

+

+import java.sql.*;

+import java.util.HashMap;

+import java.util.HashSet;

+import java.util.Map;

+import java.util.Set;

+import opennlp.modelbuilder.v2.SentenceProvider;

+

+/**
+ * {@link SentenceProvider} that reads its sentences from a MySQL database.
+ * <p>
+ * The sentences are fetched by calling the {@code getTrainingSentences()}
+ * stored procedure and are cached in {@link #sentences}; the database is only
+ * queried while that cache is empty.
+ */
+public class MySQLSentenceProviderImpl implements SentenceProvider {
+
+  // NOTE(review): the connection settings, including the credentials, are
+  // hard-coded. They should come from external configuration before this
+  // leaves prototype status.
+  private static final String DRIVER = "org.gjt.mm.mysql.Driver";
+  private static final String URL = "jdbc:mysql://127.0.0.1:3306/wink";
+  private static final String USERNAME = "root";
+  private static final String PASSWORD = "559447";
+
+  /** Cache of the sentences read so far; filled by {@link #getData()}. */
+  Set<String> sentences = new HashSet<String>();
+
+  /** Parameters handed in through {@link #setParameters(Map)}; not read by this class. */
+  private Map<String, String> params = new HashMap<String, String>();
+
+  /**
+   * Returns the cached sentences, querying the database first when the cache
+   * is empty.
+   * <p>
+   * A failure while loading is reported on {@code System.err} and whatever has
+   * been cached so far (possibly nothing) is returned.
+   *
+   * @return the live set of sentences, never {@code null}
+   */
+  @Override
+  public Set<String> getSentences() {
+    if (sentences.isEmpty()) {
+      try {
+        return getData();
+      } catch (Exception e) {
+        // Best effort: keep returning the cache, but do not hide the failure.
+        System.err.println("Unable to load training sentences: " + e);
+      }
+    }
+    return sentences;
+  }
+
+  /**
+   * Calls the {@code getTrainingSentences()} stored procedure and adds the
+   * first column of every returned row to {@link #sentences}.
+   * <p>
+   * The result set, statement and connection are always released, including
+   * when the query fails.
+   *
+   * @return the {@link #sentences} set after loading
+   * @throws SQLException if connecting, preparing or executing the call fails
+   * @throws ClassNotFoundException if the JDBC driver class is not available
+   */
+  private Set<String> getData() throws SQLException, ClassNotFoundException {
+    Connection con = getMySqlConnection();
+    CallableStatement cs = null;
+    ResultSet rs = null;
+    try {
+      cs = con.prepareCall("CALL getTrainingSentences()");
+      rs = cs.executeQuery();
+      while (rs.next()) {
+        sentences.add(rs.getString(1));
+      }
+    } finally {
+      // Release in reverse order of acquisition. A failure to close must not
+      // mask an exception thrown by the query itself, so it is only reported.
+      if (rs != null) {
+        try {
+          rs.close();
+        } catch (SQLException ex) {
+          System.err.println(ex);
+        }
+      }
+      if (cs != null) {
+        try {
+          cs.close();
+        } catch (SQLException ex) {
+          System.err.println(ex);
+        }
+      }
+      try {
+        con.close();
+      } catch (SQLException ex) {
+        System.err.println(ex);
+      }
+    }
+    return sentences;
+  }
+
+  /**
+   * Loads the JDBC driver and opens a new connection with the hard-coded
+   * settings declared at the top of this class.
+   *
+   * @return a new connection; the caller is responsible for closing it
+   * @throws ClassNotFoundException if the driver class cannot be loaded
+   * @throws SQLException if the connection cannot be established
+   */
+  private static Connection getMySqlConnection() throws ClassNotFoundException, SQLException {
+    Class.forName(DRIVER);
+    return DriverManager.getConnection(URL, USERNAME, PASSWORD);
+  }
+
+  /**
+   * Stores the given parameters. They are currently not used by this class.
+   *
+   * @param params the parameters to keep
+   */
+  @Override
+  public void setParameters(Map<String, String> params) {
+    this.params = params;
+  }
+}

diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/PersonKnownEntityProviderImpl.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/PersonKnownEntityProviderImpl.java
new file mode 100644
index 0000000..64fae27
--- /dev/null
+++ b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/PersonKnownEntityProviderImpl.java
@@ -0,0 +1,98 @@
+/*

+ * To change this template, choose Tools | Templates

+ * and open the template in the editor.

+ */

+package opennlp.modelbuilder.v2.impls;

+

+import java.util.HashMap;

+import java.util.HashSet;

+import java.util.Map;

+import java.util.Set;

+import opennlp.modelbuilder.v2.KnownEntityProvider;

+

+/**
+ * {@link KnownEntityProvider} for the {@code "person"} entity type, seeded
+ * with a fixed list of person names.
+ * <p>
+ * The seed names are added the first time {@link #getKnownEntities()} is
+ * called, regardless of whether {@link #addKnownEntity(String)} was called
+ * before that.
+ */
+public class PersonKnownEntityProviderImpl implements KnownEntityProvider {
+
+  /**
+   * Built-in names. The strings are used exactly as written, including the
+   * trailing space in {@code "Mohammed "}.
+   */
+  private static final String[] SEED_ENTITIES = {
+    "Barack Obama",
+    "Mitt Romney",
+    "John Doe",
+    "Bill Gates",
+    "Nguyen Tan Dung",
+    "Hassanal Bolkiah",
+    "Bashar al-Assad",
+    "Faysal Khabbaz Hamou",
+    "Dr Talwar",
+    "Mr. Bolkiah",
+    "Bashar",
+    "Romney",
+    "Obama",
+    "the President",
+    "Mr. Gates",
+    "Xi Jinping",
+    "Leon Panetta",
+    "Paul Beales",
+    "Mr Rajapaksa",
+    "Mohammed ",
+    "Ieng Thirith",
+    "Mr Xi",
+    "John Sudworth",
+    "Aung San Suu Kyi",
+    "Khorshid",
+    "Karrie Webb",
+    "Doyle McManus",
+    "Pope John Paul",
+    "Roland Buerk",
+    "Paul Ryan",
+    "Tammy Baldwin",
+    "Ben Unger",
+    "Chris Christie",
+    "Mary Magdalene",
+    "George Walker Bush",
+    "Melendez-Martinez",
+    "Osiel Cardenas Guillen",
+    "President Molina",
+    "Lubaina Himid",
+    "Elizabeth Frink",
+    "Graham Sutherland",
+    "Gorman Adams",
+    "Peter Sheasby",
+    "Andrew Walker",
+    "Elias Garcia Martinez",
+    "Elias Martinez"
+  };
+
+  /** Known entities: the seed names plus anything added later. */
+  Set<String> ret = new HashSet<String>();
+
+  /**
+   * Whether the seed names have been added to {@link #ret}. Tracked explicitly
+   * instead of testing for an empty set, so that an earlier
+   * {@link #addKnownEntity(String)} call cannot suppress the seeding.
+   */
+  private boolean seeded = false;
+
+  /** Parameters handed in through {@link #setParameters(Map)}; not read by this class. */
+  private Map<String, String> params = new HashMap<String, String>();
+
+  /**
+   * Returns the known person names, adding the built-in seed names on the
+   * first call.
+   *
+   * @return the live set of known entities, never {@code null}
+   */
+  @Override
+  public Set<String> getKnownEntities() {
+    if (!seeded) {
+      for (int i = 0; i < SEED_ENTITIES.length; i++) {
+        ret.add(SEED_ENTITIES[i]);
+      }
+      seeded = true;
+    }
+    return ret;
+  }
+
+  /**
+   * @return the entity type handled by this provider, always {@code "person"}
+   */
+  @Override
+  public String getKnownEntitiesType() {
+    return "person";
+  }
+
+  /**
+   * Adds a name to the set of known entities.
+   *
+   * @param unambiguousEntity the name to add
+   */
+  @Override
+  public void addKnownEntity(String unambiguousEntity) {
+    ret.add(unambiguousEntity);
+  }
+
+  /**
+   * Stores the given parameters. They are currently not used by this class.
+   *
+   * @param params the parameters to keep
+   */
+  @Override
+  public void setParameters(Map<String, String> params) {
+    this.params = params;
+  }
+}

diff --git a/modelbuilder-prototype/src/test/java/modelbuilder/AppTest.java b/modelbuilder-prototype/src/test/java/modelbuilder/AppTest.java
new file mode 100644
index 0000000..2b04731
--- /dev/null
+++ b/modelbuilder-prototype/src/test/java/modelbuilder/AppTest.java
@@ -0,0 +1,38 @@
+package modelbuilder;

+

+import junit.framework.Test;

+import junit.framework.TestCase;

+import junit.framework.TestSuite;

+

+/**
+ * Placeholder JUnit 3 test case for the App class.
+ */
+public class AppTest extends TestCase {
+
+  /**
+   * Creates a test case with the given name.
+   *
+   * @param testName name of the test case
+   */
+  public AppTest(String testName) {
+    super(testName);
+  }
+
+  /**
+   * Builds the suite made of the tests declared in this class.
+   *
+   * @return the suite of tests being tested
+   */
+  public static Test suite() {
+    final TestSuite allTests = new TestSuite(AppTest.class);
+    return allTests;
+  }
+
+  /**
+   * Trivial check that always passes.
+   */
+  public void testApp() {
+    assertTrue(true);
+  }
+}