Prototype of a tool that lets users build name finder models from a set of known entities and their own data, supplied as sentences.
See the Example class in the opennlp.modelbuilder.v2 package.
diff --git a/modelbuilder-prototype/pom.xml b/modelbuilder-prototype/pom.xml
new file mode 100644
index 0000000..f011971
--- /dev/null
+++ b/modelbuilder-prototype/pom.xml
@@ -0,0 +1,30 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+ <!-- Coordinates of the model builder prototype. -->
+ <groupId>modelbuilder</groupId>
+ <artifactId>modelbuilder-prototype</artifactId>
+ <version>1.0-SNAPSHOT</version>
+ <packaging>jar</packaging>
+
+ <name>modelbuilder-prototype</name>
+ <url>http://maven.apache.org</url>
+ <!-- Sources are treated as UTF-8. -->
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+ <!-- JUnit is test-scoped; opennlp-tools provides the name finder classes imported by the v2 package. -->
+ <dependencies>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>3.8.1</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.opennlp</groupId>
+ <artifactId>opennlp-tools</artifactId>
+ <version>1.6.0-SNAPSHOT</version>
+ </dependency>
+ </dependencies>
+</project>
diff --git a/modelbuilder-prototype/src/main/java/modelbuilder/App.java b/modelbuilder-prototype/src/main/java/modelbuilder/App.java
new file mode 100644
index 0000000..b788f6c
--- /dev/null
+++ b/modelbuilder-prototype/src/main/java/modelbuilder/App.java
@@ -0,0 +1,13 @@
+package modelbuilder;
+
+/**
+ * Placeholder entry point; it appears to be left over from project scaffolding.
+ * It performs no work. The runnable example is
+ * {@code opennlp.modelbuilder.v2.Example}.
+ */
+public class App {
+
+    /** Does nothing; {@code args} is ignored. */
+    public static void main(String[] args) {
+    }
+}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/Example.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/Example.java
new file mode 100644
index 0000000..eed4c1c
--- /dev/null
+++ b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/Example.java
@@ -0,0 +1,67 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package opennlp.modelbuilder.v2;
+
+import java.util.HashMap;
+import java.util.Map;
+import opennlp.modelbuilder.v2.impls.FileKnownEntityProvider;
+import opennlp.modelbuilder.v2.impls.FileModelValidatorImpl;
+import opennlp.modelbuilder.v2.impls.FileSentenceProvider;
+import opennlp.modelbuilder.v2.impls.ModelableImpl;
+
+/**
+ * Shows how to wire the file-based implementations into a {@link GenericModelGenerator} run.
+ * The file paths used here are placeholders.
+ */
+public class Example {
+    /** Number of iterations requested from the generator. */
+    private static final int ITERATIONS = 3;
+
+    public static void main(String[] args) {
+        // Every component receives its settings through a map; these keys are
+        // the ones required by the current file-based implementations.
+        Map<String, String> settings = fileBasedSettings();
+
+        // Sentence providers feed the process with sentences derived from
+        // user data; this implementation reads a file line by line.
+        SentenceProvider sentences = new FileSentenceProvider();
+        sentences.setParameters(settings);
+
+        // Known entity providers supply a seed list of known entities, such
+        // as Barack Obama for person or Germany for location. The seeds
+        // should be prolific, unambiguous names.
+        KnownEntityProvider seeds = new FileKnownEntityProvider();
+        seeds.setParameters(settings);
+
+        // Validators weed out bad hits between name finder iterations.
+        // Because the process feeds on its own results, the name finder gets
+        // greedier with every iteration if bad entities are let in. An
+        // implementation can be as specific as the data and use case need,
+        // e.g. checking that a location actually sits inside a noun phrase.
+        ModelGenerationValidator hitFilter = new FileModelValidatorImpl();
+        hitFilter.setParameters(settings);
+
+        // Modelables write and read the annotated sentences and create and
+        // write the NER models.
+        Modelable trainer = new ModelableImpl();
+        trainer.setParameters(settings);
+
+        // The generator runs a fixed number of iterations. Stopping on the
+        // difference between runs could be better, but may be too costly for
+        // extremely large sentence sets.
+        new GenericModelGenerator().build(sentences, seeds, hitFilter, trainer, ITERATIONS);
+    }
+
+    /** Builds the parameter map read by the file-based implementations. */
+    private static Map<String, String> fileBasedSettings() {
+        Map<String, String> settings = new HashMap<String, String>();
+        settings.put("sentencesfile", "/the/file");
+        settings.put("knownentityfile", "/the/file");
+        settings.put("knownentitytype", "person");
+        settings.put("blacklistfile", "/the/file");
+        settings.put("modelablepath", "/the/file");
+        return settings;
+    }
+}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/GenericModelGenerator.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/GenericModelGenerator.java
new file mode 100644
index 0000000..4fe9d89
--- /dev/null
+++ b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/GenericModelGenerator.java
@@ -0,0 +1,70 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package opennlp.modelbuilder.v2;
+
+import java.util.HashMap;
+import java.util.Map;
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.util.Span;
+
+/**
+ * Bootstraps a name finder model: sentences containing known entities are annotated, a model is
+ * trained on them, and the entities it then finds become known entities for the next iteration.
+ */
+public class GenericModelGenerator implements SemiSupervisedModelGenerator {
+    private Map<String, String> params = new HashMap<String, String>();
+    /** Stores the parameters; nothing in this class reads them. */
+    @Override
+    public void setParameters(Map<String, String> params) {
+        this.params = params;
+    }
+    /** Runs the bootstrap loop; throws IllegalStateException when modelable returns no model. */
+    @Override
+    public void build(SentenceProvider sentenceProvider, KnownEntityProvider knownEntityProvider,
+            ModelGenerationValidator validator, Modelable modelable, int iterations) {
+        for (int iteration = 0; iteration < iterations; iteration++) {
+            System.out.println("ITERATION: " + iteration);
+            System.out.println("\tPerfoming Known Entity Annotation");
+            System.out.println("\t\tknowns: " + knownEntityProvider.getKnownEntities().size());
+            System.out.println("\t\treading data....: ");
+            for (String sentence : sentenceProvider.getSentences()) {
+                for (String knownEntity : knownEntityProvider.getKnownEntities()) {
+                    if (sentence.contains(knownEntity)) {
+                        // A sentence with several hits yields one annotated copy per hit.
+                        modelable.addAnnotatedSentence(modelable.annotate(sentence, knownEntity, knownEntityProvider.getKnownEntitiesType()));
+                    }
+                }
+            }
+            System.out.println("\t\twriting annotated sentences....: ");
+            modelable.writeAnnotatedSentences();
+            modelable.buildModel(knownEntityProvider.getKnownEntitiesType());
+            opennlp.tools.namefind.TokenNameFinderModel model = modelable.getModel();
+            if (model == null) {
+                throw new IllegalStateException("no model available in iteration " + iteration);
+            }
+            NameFinderME nf = new NameFinderME(model);
+            System.out.println("\t\tannotated sentences: " + modelable.getAnnotatedSentences().size());
+            System.out.println("\tPerforming NER");
+            for (String sentence : sentenceProvider.getSentences()) {
+                if (!validator.validSentence(sentence)) {
+                    continue;
+                }
+                String[] tokens = modelable.tokenizeSentenceToWords(sentence);
+                Span[] find = nf.find(tokens);
+                nf.clearAdaptiveData();
+                for (String namedEntity : Span.spansToStrings(find, tokens)) {
+                    if (validator.validNamedEntity(namedEntity)) {
+                        knownEntityProvider.addKnownEntity(namedEntity);
+                        modelable.addAnnotatedSentence(modelable.annotate(sentence, namedEntity, knownEntityProvider.getKnownEntitiesType()));
+                    }
+                }
+            }
+            System.out.println("\t\tannotated sentences: " + modelable.getAnnotatedSentences().size());
+            System.out.println("\t\tknowns: " + knownEntityProvider.getKnownEntities().size());
+        }
+        modelable.writeAnnotatedSentences();
+        modelable.buildModel(knownEntityProvider.getKnownEntitiesType());
+    }
+}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/KnownEntityProvider.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/KnownEntityProvider.java
new file mode 100644
index 0000000..fd04cd4
--- /dev/null
+++ b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/KnownEntityProvider.java
@@ -0,0 +1,35 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package opennlp.modelbuilder.v2;
+
+import java.util.List;
+import java.util.Set;
+
+
+
+/**
+ * Supplies the seed entities of a single type and accepts entities found during model generation.
+ * @author Owner
+ */
+public interface KnownEntityProvider extends ModelParameter{
+ /**
+ * Returns the known, unambiguous entities.
+ * @return the set of known entities
+ */
+ Set<String> getKnownEntities();
+/**
+ * Adds to the set of known entities. Implementing classes should hold the entities in a class-level set.
+ * @param unambiguousEntity the entity to add
+ */
+ void addKnownEntity(String unambiguousEntity);
+/**
+ * Defines the type of entity that the set contains, e.g. person, location, organization.
+ * @return the entity type name
+ */
+ String getKnownEntitiesType();
+
+
+
+}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelGenerationValidator.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelGenerationValidator.java
new file mode 100644
index 0000000..1ecc13f
--- /dev/null
+++ b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelGenerationValidator.java
@@ -0,0 +1,23 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package opennlp.modelbuilder.v2;
+
+import java.util.Collection;
+import java.util.Set;
+
+/**
+ * Filters the sentences and named entities that are allowed into the next model generation iteration.
+ * @author Owner
+ */
+public interface ModelGenerationValidator extends ModelParameter {
+ /** Returns whether the sentence should be run through the name finder. */
+ Boolean validSentence(String sentence);
+ /** Returns whether a found named entity may be added to the known entities. */
+ Boolean validNamedEntity(String namedEntity);
+
+
+ /** Returns the entries that the implementation rejects as named entities. */
+ Collection<String> getBlackList();
+}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelParameter.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelParameter.java
new file mode 100644
index 0000000..59a60f0
--- /dev/null
+++ b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelParameter.java
@@ -0,0 +1,17 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package opennlp.modelbuilder.v2;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Implemented by every component that is configured through a map of string parameters.
+ * @author Owner
+ */
+public interface ModelParameter {
+ /** Hands the component its configuration; each implementation reads its own keys. */
+ void setParameters(Map<String, String> params);
+}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/Modelable.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/Modelable.java
new file mode 100644
index 0000000..18dd632
--- /dev/null
+++ b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/Modelable.java
@@ -0,0 +1,37 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package opennlp.modelbuilder.v2;
+
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import opennlp.tools.namefind.TokenNameFinderModel;
+
+/**
+ * Holds the annotated sentences and builds, stores and loads the name finder model trained on them.
+ * @author Owner
+ */
+public interface Modelable extends ModelParameter{
+
+
+ /** Returns the sentence with the named entity marked up as an entity of the given type. */
+ String annotate(String sentence, String namedEntity, String entityType);
+ /** Writes out the annotated sentences collected so far. */
+ void writeAnnotatedSentences();
+ /** Returns the annotated sentences collected so far. */
+ Set<String> getAnnotatedSentences();
+ /** Replaces the collected annotated sentences. */
+ void setAnnotatedSentences(Set<String> annotatedSentences);
+ /** Adds one annotated sentence to the collection. */
+ void addAnnotatedSentence(String annotatedSentence);
+ /** Builds the model for the given entity type. */
+ void buildModel( String entityType);
+ /** Returns the built model; ModelableImpl returns null when the model cannot be loaded. */
+ TokenNameFinderModel getModel();
+ /** Splits a sentence into the tokens handed to the name finder. */
+ String[] tokenizeSentenceToWords(String sentence);
+
+
+}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SemiSupervisedModelGenerator.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SemiSupervisedModelGenerator.java
new file mode 100644
index 0000000..6f2020e
--- /dev/null
+++ b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SemiSupervisedModelGenerator.java
@@ -0,0 +1,15 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package opennlp.modelbuilder.v2;
+
+/**
+ * Builds a model from sentences and known entities over a number of iterations.
+ * @author Owner
+ */
+public interface SemiSupervisedModelGenerator extends ModelParameter {
+ /** Runs the generation process with the given components for {@code iterations} iterations. */
+ void build(SentenceProvider sentenceProvider, KnownEntityProvider knownEntityProvider,
+ ModelGenerationValidator validator, Modelable modelable, int iterations);
+}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SentenceProvider.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SentenceProvider.java
new file mode 100644
index 0000000..8549966
--- /dev/null
+++ b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SentenceProvider.java
@@ -0,0 +1,16 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package opennlp.modelbuilder.v2;
+
+import java.util.Set;
+
+/**
+ * Supplies the sentences that model generation annotates and runs the name finder over.
+ * @author Owner
+ */
+public interface SentenceProvider extends ModelParameter {
+ /** Returns the sentences to process. */
+ Set<String> getSentences();
+}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileKnownEntityProvider.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileKnownEntityProvider.java
new file mode 100644
index 0000000..aadb482
--- /dev/null
+++ b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileKnownEntityProvider.java
@@ -0,0 +1,74 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package opennlp.modelbuilder.v2.impls;
+
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.Charset;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import opennlp.modelbuilder.v2.KnownEntityProvider;
+
+/**
+ * Reads the known entities, one per line, from the UTF-8 file named by the "knownentityfile"
+ * parameter; the entity type is taken from the "knownentitytype" parameter.
+ */
+public class FileKnownEntityProvider implements KnownEntityProvider {
+    private static final Logger LOG = Logger.getLogger(FileKnownEntityProvider.class.getName());
+    private Map<String, String> params = new HashMap<String, String>();
+    Set<String> knownEntities = new HashSet<String>();
+
+    /** Returns the known entities, reading the file while the set is still empty; I/O errors are logged. */
+    @Override
+    public Set<String> getKnownEntities() {
+        if (!knownEntities.isEmpty()) {
+            return knownEntities;
+        }
+        BufferedReader br = null;
+        try {
+            br = new BufferedReader(new InputStreamReader(new FileInputStream(params.get("knownentityfile")), Charset.forName("UTF-8")));
+            for (String line = br.readLine(); line != null; line = br.readLine()) {
+                // An empty entity is contained in every sentence, so blank lines are skipped.
+                if (line.trim().length() > 0) {
+                    knownEntities.add(line);
+                }
+            }
+        } catch (IOException ex) {
+            LOG.log(Level.SEVERE, null, ex);
+        } finally {
+            if (br != null) {
+                try {
+                    br.close();
+                } catch (IOException ex) {
+                    LOG.log(Level.SEVERE, null, ex);
+                }
+            }
+        }
+        return knownEntities;
+    }
+
+    @Override
+    public void addKnownEntity(String unambiguousEntity) {
+        knownEntities.add(unambiguousEntity);
+    }
+
+    @Override
+    public String getKnownEntitiesType() {
+        return params.get("knownentitytype");
+    }
+
+    @Override
+    public void setParameters(Map<String, String> params) {
+        this.params = params;
+    }
+}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileModelValidatorImpl.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileModelValidatorImpl.java
new file mode 100644
index 0000000..4e92cac
--- /dev/null
+++ b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileModelValidatorImpl.java
@@ -0,0 +1,88 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package opennlp.modelbuilder.v2.impls;
+
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.Charset;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import java.util.regex.Pattern;
+import opennlp.modelbuilder.v2.ModelGenerationValidator;
+
+/**
+ * Rejects named entities that contain a digit or that appear, ignoring case, in the UTF-8
+ * blacklist file named by the "blacklistfile" parameter (one entry per line).
+ */
+public class FileModelValidatorImpl implements ModelGenerationValidator {
+    private static final Logger LOG = Logger.getLogger(FileModelValidatorImpl.class.getName());
+    /** Compiled once; a single ASCII digit disqualifies a named entity. */
+    private static final Pattern DIGIT = Pattern.compile("[0-9]");
+    private final Set<String> badentities = new HashSet<String>();
+    /** Set once a file load has been attempted, so an empty file is not re-read for every entity. */
+    private boolean blackListLoaded;
+    private Map<String, String> params = new HashMap<String, String>();
+
+    @Override
+    public void setParameters(Map<String, String> params) {
+        this.params = params;
+    }
+
+    /** Accepts every sentence; the sentence provider is expected to supply only valid ones. */
+    @Override
+    public Boolean validSentence(String sentence) {
+        return true;
+    }
+
+    /** Returns false when the entity contains a digit or its lower-cased form is blacklisted. */
+    @Override
+    public Boolean validNamedEntity(String namedEntity) {
+        if (DIGIT.matcher(namedEntity).find()) {
+            return false;
+        }
+        return !getBlackList().contains(namedEntity.toLowerCase());
+    }
+
+    /**
+     * Returns the blacklist, reading the file on the first call. Entries are trimmed and
+     * lower-cased so they match the lower-cased lookup in {@link #validNamedEntity}. Without a
+     * "blacklistfile" parameter the list stays empty; I/O errors are logged.
+     */
+    @Override
+    public Collection<String> getBlackList() {
+        String file = params == null ? null : params.get("blacklistfile");
+        if (blackListLoaded || file == null) {
+            return badentities;
+        }
+        blackListLoaded = true;
+        BufferedReader br = null;
+        try {
+            br = new BufferedReader(new InputStreamReader(new FileInputStream(file), Charset.forName("UTF-8")));
+            for (String line = br.readLine(); line != null; line = br.readLine()) {
+                badentities.add(line.trim().toLowerCase());
+            }
+        } catch (IOException ex) {
+            LOG.log(Level.SEVERE, null, ex);
+        } finally {
+            if (br != null) {
+                try {
+                    br.close();
+                } catch (IOException ex) {
+                    LOG.log(Level.SEVERE, null, ex);
+                }
+            }
+        }
+        return badentities;
+    }
+}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileSentenceProvider.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileSentenceProvider.java
new file mode 100644
index 0000000..fa24f04
--- /dev/null
+++ b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileSentenceProvider.java
@@ -0,0 +1,60 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package opennlp.modelbuilder.v2.impls;
+
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.Charset;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import opennlp.modelbuilder.v2.SentenceProvider;
+
+/**
+ * Reads sentences, one per line, from the UTF-8 file named by the "sentencesfile" parameter.
+ */
+public class FileSentenceProvider implements SentenceProvider {
+    private static final Logger LOG = Logger.getLogger(FileSentenceProvider.class.getName());
+    private Map<String, String> params = new HashMap<String, String>();
+    Set<String> sentences = new HashSet<String>();
+
+    /** Returns the sentences, reading the file while the set is still empty; I/O errors are logged. */
+    @Override
+    public Set<String> getSentences() {
+        if (!sentences.isEmpty()) {
+            return sentences;
+        }
+        BufferedReader br = null;
+        try {
+            br = new BufferedReader(new InputStreamReader(new FileInputStream(params.get("sentencesfile")), Charset.forName("UTF-8")));
+            for (String line = br.readLine(); line != null; line = br.readLine()) {
+                sentences.add(line);
+            }
+        } catch (IOException ex) {
+            LOG.log(Level.SEVERE, null, ex);
+        } finally {
+            if (br != null) {
+                try {
+                    br.close();
+                } catch (IOException ex) {
+                    LOG.log(Level.SEVERE, null, ex);
+                }
+            }
+        }
+        return sentences;
+    }
+
+    @Override
+    public void setParameters(Map<String, String> params) {
+        this.params = params;
+    }
+}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/LocationKnownEntityProviderImpl.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/LocationKnownEntityProviderImpl.java
new file mode 100644
index 0000000..0e50594
--- /dev/null
+++ b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/LocationKnownEntityProviderImpl.java
@@ -0,0 +1,93 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package opennlp.modelbuilder.v2.impls;
+
+import java.sql.CallableStatement;
+import java.sql.Connection;
+import java.sql.DriverManager;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import opennlp.modelbuilder.v2.KnownEntityProvider;
+
+/**
+ * Loads country names from a MySQL database through the stored procedure getcountrylist().
+ * The connection settings can be overridden with the parameters "jdbcdriver", "jdbcurl",
+ * "jdbcuser" and "jdbcpassword"; without them the built-in values are used.
+ */
+public class LocationKnownEntityProviderImpl implements KnownEntityProvider {
+    Set<String> ret = new HashSet<String>();
+    private Map<String, String> params = new HashMap<String, String>();
+    /** Returns the known locations, querying the database while the set is still empty. */
+    @Override
+    public Set<String> getKnownEntities() {
+        if (ret.isEmpty()) {
+            try {
+                getData();
+            } catch (Exception ex) {
+                Logger.getLogger(LocationKnownEntityProviderImpl.class.getName()).log(Level.SEVERE, null, ex);
+            }
+        }
+        return ret;
+    }
+
+    /** Adds every name returned by the stored procedure, then releases the JDBC resources. */
+    private Set<String> getData() throws Exception {
+        Connection con = getMySqlConnection();
+        CallableStatement cs = null;
+        try {
+            cs = con.prepareCall("CALL getcountrylist()");
+            ResultSet rs = cs.executeQuery();
+            while (rs.next()) {
+                ret.add(rs.getString("full_name_nd_ro"));
+            }
+        } finally {
+            try {
+                if (cs != null) {
+                    cs.close();
+                }
+            } finally {
+                con.close();
+            }
+        }
+        return ret;
+    }
+
+    /**
+     * Opens the database connection. NOTE(review): the fallback values embed root credentials
+     * in source code; they are kept for backward compatibility only.
+     */
+    private Connection getMySqlConnection() throws Exception {
+        Class.forName(setting("jdbcdriver", "org.gjt.mm.mysql.Driver"));
+        return DriverManager.getConnection(setting("jdbcurl", "jdbc:mysql://127.0.0.1:3306/world"),
+                setting("jdbcuser", "root"), setting("jdbcpassword", "559447"));
+    }
+
+    /** Returns the parameter stored under {@code key}, or {@code fallback} when it is absent. */
+    private String setting(String key, String fallback) {
+        String value = params == null ? null : params.get(key);
+        return value == null ? fallback : value;
+    }
+
+    @Override
+    public String getKnownEntitiesType() {
+        return "location";
+    }
+
+    @Override
+    public void addKnownEntity(String unambiguousEntity) {
+        ret.add(unambiguousEntity);
+    }
+
+    @Override
+    public void setParameters(Map<String, String> params) {
+        this.params = params;
+    }
+}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/ModelValidatorImpl.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/ModelValidatorImpl.java
new file mode 100644
index 0000000..882ce2a
--- /dev/null
+++ b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/ModelValidatorImpl.java
@@ -0,0 +1,131 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package opennlp.modelbuilder.v2.impls;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Pattern;
+import opennlp.modelbuilder.v2.ModelGenerationValidator;
+
+/**
+ * Validator with a built-in blacklist. A named entity is rejected when it contains a digit or
+ * when its lower-cased form is blacklisted; every sentence is accepted.
+ */
+public class ModelValidatorImpl implements ModelGenerationValidator {
+
+    /** Compiled once; a single ASCII digit disqualifies a named entity. */
+    private static final Pattern DIGIT = Pattern.compile("[0-9]");
+
+    /**
+     * Built-in blacklist entries. They are lower-cased when added to the set because lookups
+     * use the lower-cased entity; entries such as "Ltd" could otherwise never match.
+     */
+    private static final String[] DEFAULT_BLACK_LIST = {
+        ".",
+        "-",
+        ",",
+        ";",
+        "the",
+        "that",
+        "several",
+        "model",
+        "our",
+        "are",
+        "in",
+        "at",
+        "is",
+        "for",
+        "during",
+        "south",
+        "from",
+        "recounts",
+        "wissenschaftliches",
+        "if",
+        "security",
+        "denouncing",
+        "writes",
+        "but",
+        "operation",
+        "adds",
+        "Above",
+        "RIP",
+        "on",
+        "no",
+        "agrees",
+        "year",
+        "you",
+        "red",
+        "added",
+        "hello",
+        "around",
+        "has",
+        "turn",
+        "surrounding",
+        "\" No",
+        "aug.",
+        "or",
+        "quips",
+        "september",
+        "[mr",
+        "diseases",
+        "when",
+        "bbc",
+        ":\"",
+        "dr",
+        "baby",
+        "route",
+        "'",
+        "\"",
+        "a",
+        "her",
+        "two",
+        ":",
+        "one",
+        "Party",
+        "Championship",
+        "Ltd"
+    };
+
+    private Set<String> badentities = new HashSet<String>();
+    private Map<String, String> params = new HashMap<String, String>();
+
+    /** Stores the parameters; this implementation does not read any of them. */
+    @Override
+    public void setParameters(Map<String, String> params) {
+        this.params = params;
+    }
+
+    /** Accepts every sentence; the sentence provider is expected to supply only valid ones. */
+    @Override
+    public Boolean validSentence(String sentence) {
+        return true;
+    }
+
+    /** Returns false when the entity contains a digit or its lower-cased form is blacklisted. */
+    @Override
+    public Boolean validNamedEntity(String namedEntity) {
+        if (badentities.isEmpty()) {
+            getBlackList();
+        }
+        if (DIGIT.matcher(namedEntity).find()) {
+            return false;
+        }
+        return !badentities.contains(namedEntity.toLowerCase());
+    }
+
+    /**
+     * Adds the built-in entries, lower-cased, to the blacklist and returns it.
+     * @return the live blacklist set
+     */
+    @Override
+    public Set<String> getBlackList() {
+        for (String entry : DEFAULT_BLACK_LIST) {
+            badentities.add(entry.toLowerCase());
+        }
+        return badentities;
+    }
+}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/ModelableImpl.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/ModelableImpl.java
new file mode 100644
index 0000000..76608b7
--- /dev/null
+++ b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/ModelableImpl.java
@@ -0,0 +1,137 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package opennlp.modelbuilder.v2.impls;
+
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.nio.charset.Charset;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import opennlp.modelbuilder.v2.Modelable;
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.namefind.NameSampleDataStream;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.tokenize.TokenizerME;
+import opennlp.tools.tokenize.TokenizerModel;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+
+/**
+ * File-based {@link Modelable}. Annotated sentences are written to "en-ner-person.train" and the
+ * trained model to "en-ner-person.train.model"; both names are appended to the path prefix as-is.
+ */
+public class ModelableImpl implements Modelable {
+    private static final Logger LOG = Logger.getLogger(ModelableImpl.class.getName());
+    private String path = "c:\\temp\\opennlpmodels\\";
+    private Set<String> annotatedSentences = new HashSet<String>();
+    private Map<String, String> params = new HashMap<String, String>();
+
+    /** Stores the parameters; "modelablepath" replaces the path prefix, and the default stays when it is absent. */
+    @Override
+    public void setParameters(Map<String, String> params) {
+        this.params = params;
+        String configured = params.get("modelablepath");
+        path = configured != null ? configured : path;
+    }
+    /** Wraps every occurrence of the entity in START/END tags carrying the entity type. */
+    @Override
+    public String annotate(String sentence, String namedEntity, String entityType) {
+        return sentence.replace(namedEntity, " <START:" + entityType + "> " + namedEntity + " <END> ");
+    }
+    /** Overwrites the training file, one sentence per line, in UTF-8 because buildModel reads it as UTF-8. */
+    @Override
+    public void writeAnnotatedSentences() {
+        java.io.Writer writer = null;
+        try {
+            writer = new java.io.OutputStreamWriter(
+                    new FileOutputStream(path + "en-ner-person.train", false), Charset.forName("UTF-8"));
+            for (String s : annotatedSentences) {
+                writer.write(s.replace("\n", "").trim() + "\n");
+            }
+        } catch (IOException ex) {
+            LOG.log(Level.SEVERE, null, ex);
+        } finally {
+            close(writer);
+        }
+    }
+
+    @Override
+    public Set<String> getAnnotatedSentences() {
+        return annotatedSentences;
+    }
+    @Override
+    public void setAnnotatedSentences(Set<String> annotatedSentences) {
+        this.annotatedSentences = annotatedSentences;
+    }
+    /** Adds the sentence to the set; null is ignored. */
+    @Override
+    public void addAnnotatedSentence(String annotatedSentence) {
+        if (annotatedSentence != null) {
+            annotatedSentences.add(annotatedSentence);
+        }
+    }
+    /** Trains a model for {@code entityType} from the training file and writes the model file; failures are logged. */
+    @Override
+    public void buildModel(String entityType) {
+        FileInputStream trainIn = null;
+        OutputStream modelOut = null;
+        try {
+            System.out.println("\tBuilding Model using " + annotatedSentences.size() + " annotations");
+            System.out.println("\t\treading training data...");
+            trainIn = new FileInputStream(path + "en-ner-person.train");
+            ObjectStream<String> lineStream = new PlainTextByLineStream(trainIn, Charset.forName("UTF-8"));
+            ObjectStream<NameSample> sampleStream = new NameSampleDataStream(lineStream);
+            // The requested type is passed on; a fixed "person" would apply to every entity type.
+            TokenNameFinderModel model = NameFinderME.train("en", entityType, sampleStream, null);
+            modelOut = new BufferedOutputStream(new FileOutputStream(new File(path + "en-ner-person.train.model")));
+            model.serialize(modelOut);
+            System.out.println("\tmodel generated");
+        } catch (Exception e) {
+            LOG.log(Level.SEVERE, null, e);
+        } finally {
+            close(modelOut);
+            close(trainIn);
+        }
+    }
+    /** Loads the model file written by buildModel; logs and returns null when it cannot be read. */
+    @Override
+    public TokenNameFinderModel getModel() {
+        FileInputStream modelIn = null;
+        try {
+            modelIn = new FileInputStream(new File(path + "en-ner-person.train.model"));
+            return new TokenNameFinderModel(modelIn);
+        } catch (IOException ex) {
+            LOG.log(Level.SEVERE, null, ex);
+            return null;
+        } finally {
+            close(modelIn);
+        }
+    }
+    /** Splits the sentence on single spaces. */
+    @Override
+    public String[] tokenizeSentenceToWords(String sentence) {
+        return sentence.split(" ");
+    }
+    /** Closes the resource if it was opened; a failure is logged, not propagated. */
+    private static void close(java.io.Closeable resource) {
+        if (resource != null) {
+            try {
+                resource.close();
+            } catch (IOException ex) {
+                LOG.log(Level.SEVERE, null, ex);
+            }
+        }
+    }
+}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/MySQLSentenceProviderImpl.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/MySQLSentenceProviderImpl.java
new file mode 100644
index 0000000..cac060e
--- /dev/null
+++ b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/MySQLSentenceProviderImpl.java
@@ -0,0 +1,78 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package opennlp.modelbuilder.v2.impls;
+
+import java.sql.*;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import opennlp.modelbuilder.v2.SentenceProvider;
+
+/**
+ * Loads training sentences from a MySQL database through the stored procedure
+ * getTrainingSentences(). The connection settings can be overridden with the parameters
+ * "jdbcdriver", "jdbcurl", "jdbcuser" and "jdbcpassword".
+ */
+public class MySQLSentenceProviderImpl implements SentenceProvider {
+    Set<String> sentences = new HashSet<String>();
+    private Map<String, String> params = new HashMap<String, String>();
+
+    /** Returns the sentences, querying the database while the set is still empty; failures are logged. */
+    @Override
+    public Set<String> getSentences() {
+        try {
+            if (sentences.isEmpty()) {
+                return getData();
+            }
+        } catch (Exception e) {
+            java.util.logging.Logger.getLogger(MySQLSentenceProviderImpl.class.getName()).log(java.util.logging.Level.SEVERE, null, e);
+        }
+        return sentences;
+    }
+
+    /** Adds the first column of every returned row, then releases the JDBC resources. */
+    private Set<String> getData() throws Exception {
+        Connection con = getMySqlConnection();
+        CallableStatement cs = null;
+        try {
+            cs = con.prepareCall("CALL getTrainingSentences()");
+            ResultSet rs = cs.executeQuery();
+            while (rs.next()) {
+                sentences.add(rs.getString(1));
+            }
+        } finally {
+            try {
+                if (cs != null) {
+                    cs.close();
+                }
+            } finally {
+                con.close();
+            }
+        }
+        return sentences;
+    }
+
+    /**
+     * Opens the database connection. NOTE(review): the fallback values embed root credentials
+     * in source code; they are kept for backward compatibility only.
+     */
+    private Connection getMySqlConnection() throws Exception {
+        Class.forName(setting("jdbcdriver", "org.gjt.mm.mysql.Driver"));
+        return DriverManager.getConnection(setting("jdbcurl", "jdbc:mysql://127.0.0.1:3306/wink"),
+                setting("jdbcuser", "root"), setting("jdbcpassword", "559447"));
+    }
+
+    /** Returns the parameter stored under {@code key}, or {@code fallback} when it is absent. */
+    private String setting(String key, String fallback) {
+        String value = params == null ? null : params.get(key);
+        return value == null ? fallback : value;
+    }
+
+    @Override
+    public void setParameters(Map<String, String> params) {
+        this.params = params;
+    }
+}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/PersonKnownEntityProviderImpl.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/PersonKnownEntityProviderImpl.java
new file mode 100644
index 0000000..64fae27
--- /dev/null
+++ b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/PersonKnownEntityProviderImpl.java
@@ -0,0 +1,98 @@
+/*
+ * Person-name implementation of KnownEntityProvider for the model builder
+ * prototype (v2 package).
+ */
+package opennlp.modelbuilder.v2.impls;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import opennlp.modelbuilder.v2.KnownEntityProvider;
+
+/**
+ * {@link KnownEntityProvider} that supplies a fixed seed list of person names and accepts
+ * further names at runtime through {@link #addKnownEntity(String)}.
+ *
+ * @author Owner
+ */
+public class PersonKnownEntityProviderImpl implements KnownEntityProvider {
+  /** Seed names; repeated entries of the original list appear once since the target is a set. */
+  private static final String[] SEED_NAMES = {
+    "Barack Obama", "Mitt Romney",
+    "John Doe", "Bill Gates",
+    "Nguyen Tan Dung", "Hassanal Bolkiah",
+    "Bashar al-Assad", "Faysal Khabbaz Hamou",
+    "Dr Talwar", "Mr. Bolkiah",
+    "Bashar", "Romney",
+    "Obama", "the President",
+    "Mr. Gates", "Xi Jinping",
+    "Leon Panetta", "Paul Beales",
+    "Mr Rajapaksa", "Mohammed ", // trailing space kept verbatim -- TODO confirm it is intended
+    "Ieng Thirith", "Mr Xi",
+    "John Sudworth", "Aung San Suu Kyi",
+    "Khorshid", "Karrie Webb",
+    "Doyle McManus", "Pope John Paul",
+    "Roland Buerk", "Paul Ryan",
+    "Tammy Baldwin", "Ben Unger",
+    "Chris Christie", "Mary Magdalene",
+    "George Walker Bush", "Melendez-Martinez",
+    "Osiel Cardenas Guillen", "President Molina",
+    "Lubaina Himid", "Elizabeth Frink",
+    "Graham Sutherland", "Gorman Adams",
+    "Peter Sheasby", "Andrew Walker",
+    "Elias Garcia Martinez", "Elias Martinez"
+  };
+
+  Set<String> ret = new HashSet<String>();
+  private Map<String, String> params = new HashMap<String,String>();
+
+  /** True once the seed names have been copied into {@link #ret}. */
+  private boolean seeded;
+
+  /**
+   * Returns the known person names, adding the seed names on first use.
+   *
+   * @return the live, mutable set that also receives {@link #addKnownEntity(String)} additions
+   */
+  @Override
+  public Set<String> getKnownEntities() {
+    // Seeding is tracked explicitly: testing isEmpty() alone skipped the seed names whenever
+    // addKnownEntity() ran first. The isEmpty() test is kept so an emptied set is re-seeded.
+    if (!seeded || ret.isEmpty()) {
+      for (String name : SEED_NAMES) {
+        ret.add(name);
+      }
+      seeded = true;
+    }
+    return ret;
+  }
+
+  /**
+   * @return the entity type label, always {@code "person"}
+   */
+  @Override
+  public String getKnownEntitiesType() {
+    return "person";
+  }
+
+  /**
+   * Adds a name to the live set returned by {@link #getKnownEntities()}.
+   *
+   * @param unambiguousEntity the name to add
+   */
+  @Override
+  public void addKnownEntity(String unambiguousEntity) {
+    ret.add(unambiguousEntity);
+  }
+
+  /**
+   * Stores the supplied parameters; nothing in this class reads them.
+   *
+   * @param params provider parameters
+   */
+  @Override
+  public void setParameters(Map<String, String> params) {
+    this.params = params;
+  }
+}
diff --git a/modelbuilder-prototype/src/test/java/modelbuilder/AppTest.java b/modelbuilder-prototype/src/test/java/modelbuilder/AppTest.java
new file mode 100644
index 0000000..2b04731
--- /dev/null
+++ b/modelbuilder-prototype/src/test/java/modelbuilder/AppTest.java
@@ -0,0 +1,38 @@
+package modelbuilder;
+
+import junit.framework.Test;
+import junit.framework.TestCase;
+import junit.framework.TestSuite;
+
+/**
+ * Placeholder JUnit 3 test case for {@link App}.
+ */
+public class AppTest extends TestCase {
+
+  /**
+   * Creates a test case with the given name.
+   *
+   * @param testName name of the test case
+   */
+  public AppTest(String testName) {
+    super(testName);
+  }
+
+  /**
+   * Builds a suite from this class.
+   *
+   * @return a {@link TestSuite} constructed from {@code AppTest.class}
+   */
+  public static Test suite() {
+    final TestSuite allTests = new TestSuite(AppTest.class);
+    return allTests;
+  }
+
+  /**
+   * Placeholder check that always passes.
+   */
+  public void testApp() {
+    final boolean alwaysTrue = true;
+    assertTrue(alwaysTrue);
+  }
+}