OPENNLP-607
Fixed many issues. Added default file-based impls for all interfaces, and created a util class wrapper to allow for easy use of the default implementations.
diff --git a/modelbuilder-prototype/src/main/java/modelbuilder/App.java b/modelbuilder-prototype/src/main/java/modelbuilder/App.java
deleted file mode 100644
index a84e689..0000000
--- a/modelbuilder-prototype/src/main/java/modelbuilder/App.java
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package modelbuilder;
-
-
-public class App
-{
- public static void main( String[] args )
- {
-
- }
-}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/Example.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/Example.java
deleted file mode 100644
index bf4d59f..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/Example.java
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.modelbuilder.v2;
-
-import java.util.HashMap;
-import java.util.Map;
-import opennlp.modelbuilder.v2.impls.FileKnownEntityProvider;
-import opennlp.modelbuilder.v2.impls.FileModelValidatorImpl;
-import opennlp.modelbuilder.v2.impls.FileSentenceProvider;
-import opennlp.modelbuilder.v2.impls.ModelableImpl;
-
-/**
- *
- * @author Owner
- */
-public class Example {
-
- public static void main(String[] args) {
-
- SemiSupervisedModelGenerator modelGenerator = new GenericModelGenerator();
- //every component has a map as a place to recieve params
- //these are required for the current file-based impls
- Map<String, String> params = new HashMap<String, String>();
- params.put("knownentityfile", "C:\\apache\\entitylinker\\opennlp.geoentitylinker.countrycontext.txt");
- params.put("sentencesfile", "C:\\apache\\modelbuilder\\sentences.txt");
- params.put("knownentitytype", "location");
- params.put("blacklistfile", "C:\\apache\\modelbuilder\\blacklist.txt");
- params.put("modelablepath", "C:\\apache\\modelbuilder");
-
- /**
- * sentence providers feed this process with user data derived sentences
- * this impl just reads line by line through a file
- */
- SentenceProvider sentenceProvider = new FileSentenceProvider();
- sentenceProvider.setParameters(params);
- /**
- *KnownEntityProviders provide a seed list of known entities... such as Barack Obama for person, or Germany for location
- * obviously these would want to be prolific, non ambiguous names
- */
- KnownEntityProvider knownEntityProvider = new FileKnownEntityProvider();
- knownEntityProvider.setParameters(params);
- /**
- * ModelGenerationValidators try to weed out bad hits by the iterations of the name finder.
- * Since this is a recursive process, with each iteration the namefinder will get more and more greedy if bad entities are allowed in
- * this provides a mechanism for throwing out obviously bad hits.
- * A good impl may be to make sure a location is actually within a noun phrase etc...users can make this as specific as they need for their dat
- * and their use case
- */
- ModelGenerationValidator validator = new FileModelValidatorImpl();
- validator.setParameters(params);
- /**
- * Modelable's write and read the annotated sentences, as well as create and write the NER models
- */
-
- Modelable modelable = new ModelableImpl();
- modelable.setParameters(params);
-
- /**
- * the modelGenerator actually runs the process with a set number of iterations... could be better by actually calculating the
- * diff between runs and stopping based on a thresh, but for extrememly large sentence sets this may be too much.
- */
- modelGenerator.build(sentenceProvider, knownEntityProvider, validator, modelable, 2);
-
- }
-}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/GenericModelGenerator.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/GenericModelGenerator.java
deleted file mode 100644
index 2d87d77..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/GenericModelGenerator.java
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.modelbuilder.v2;
-
-import java.util.HashMap;
-import java.util.Map;
-import opennlp.tools.namefind.NameFinderME;
-import opennlp.tools.util.Span;
-
-/**
- *
- *Generic impl
- */
-public class GenericModelGenerator implements SemiSupervisedModelGenerator{
- private Map<String, String> params = new HashMap<String, String>();
-
- @Override
- public void setParameters(Map<String, String> params) {
- this.params = params;
- }
- @Override
- public void build(SentenceProvider sentenceProvider, KnownEntityProvider knownEntityProvider,
- ModelGenerationValidator validator, Modelable modelable, int iterations) {
- for (int iteration = 0; iteration < iterations; iteration++) {
- System.out.println("ITERATION: " + iteration);
- System.out.println("\tPerfoming Known Entity Annotation");
- System.out.println("\t\tknowns: " + knownEntityProvider.getKnownEntities().size());
- System.out.println("\t\treading data....: ");
- for (String sentence : sentenceProvider.getSentences()) {
- for (String knownEntity : knownEntityProvider.getKnownEntities()) {
- if (sentence.contains(knownEntity)) {
- //if the same sentence has multiple hits should they be annotated separately?
- modelable.addAnnotatedSentence(modelable.annotate(sentence, knownEntity, knownEntityProvider.getKnownEntitiesType()));
- }
- }
- }
- System.out.println("\t\twriting annotated sentences....: ");
- modelable.writeAnnotatedSentences();
- modelable.buildModel(knownEntityProvider.getKnownEntitiesType());
- NameFinderME nf = new NameFinderME(modelable.getModel());
- System.out.println("\t\tannotated sentences: " + modelable.getAnnotatedSentences().size());
- System.out.println("\tPerforming NER");
- for (String sentence : sentenceProvider.getSentences()) {
- if (!validator.validSentence(sentence)) {
- continue;
- }
- String[] tokens = modelable.tokenizeSentenceToWords(sentence);
-
- Span[] find = nf.find(tokens);
- nf.clearAdaptiveData();
-
- String[] namedEntities = Span.spansToStrings(find, tokens);
-
- for (String namedEntity : namedEntities) {
- if (validator.validNamedEntity(namedEntity)) {
- knownEntityProvider.addKnownEntity(namedEntity);
- modelable.addAnnotatedSentence(modelable.annotate(sentence, namedEntity, knownEntityProvider.getKnownEntitiesType()));
-
- }
- }
- }
- System.out.println("\t\tannotated sentences: " + modelable.getAnnotatedSentences().size());
- System.out.println("\t\tknowns: " + knownEntityProvider.getKnownEntities().size());
- }
- modelable.writeAnnotatedSentences();
- modelable.buildModel(knownEntityProvider.getKnownEntitiesType());
- }
-}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/KnownEntityProvider.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/KnownEntityProvider.java
deleted file mode 100644
index d948b66..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/KnownEntityProvider.java
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.modelbuilder.v2;
-
-import java.util.Set;
-
-
-
-/**
- *
-Supplies a list of known entities (a list of names or locations)
- */
-public interface KnownEntityProvider extends ModelParameter{
- /**
- * returns a list of known non ambiguous entities.
- * @return a set of entities
- */
- Set<String> getKnownEntities();
-/**
- * adds to the set of known entities. Overriding classes should hold this list in a class level set.
- * @param unambiguousEntity
- */
- void addKnownEntity(String unambiguousEntity);
-/**
- * defines the type of entity that the set contains, ie person, location, organization.
- * @return
- */
- String getKnownEntitiesType();
-
-
-
-}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelGenerationValidator.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelGenerationValidator.java
deleted file mode 100644
index 51b4fc4..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelGenerationValidator.java
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.modelbuilder.v2;
-
-import java.util.Collection;
-
-/**
- *
-Validates results from the iterative namefinding
- */
-public interface ModelGenerationValidator extends ModelParameter {
-
- Boolean validSentence(String sentence);
-
- Boolean validNamedEntity(String namedEntity);
-
-
-
- Collection<String> getBlackList();
-}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelParameter.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelParameter.java
deleted file mode 100644
index 7e9d8e9..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelParameter.java
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.modelbuilder.v2;
-
-import java.util.Map;
-
-/**
- *
- */
-public interface ModelParameter {
-
- void setParameters(Map<String, String> params);
-}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/Modelable.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/Modelable.java
deleted file mode 100644
index 412b1f3..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/Modelable.java
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.modelbuilder.v2;
-
-import java.util.Set;
-import opennlp.tools.namefind.TokenNameFinderModel;
-
-/**
- *
- */
-public interface Modelable extends ModelParameter{
-
-
-
- String annotate(String sentence, String namedEntity, String entityType);
-
- void writeAnnotatedSentences();
-
- Set<String> getAnnotatedSentences();
-
- void setAnnotatedSentences(Set<String> annotatedSentences);
-
- void addAnnotatedSentence(String annotatedSentence);
-
- void buildModel( String entityType);
-
- TokenNameFinderModel getModel();
-
- String[] tokenizeSentenceToWords(String sentence);
-
-
-}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SemiSupervisedModelGenerator.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SemiSupervisedModelGenerator.java
deleted file mode 100644
index 6b7edfe..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SemiSupervisedModelGenerator.java
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.modelbuilder.v2;
-
-/**
- *
-
- */
-public interface SemiSupervisedModelGenerator extends ModelParameter {
-
- void build(SentenceProvider sentenceProvider, KnownEntityProvider knownEntityProvider,
- ModelGenerationValidator validator, Modelable modelable, int iterations);
-}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SentenceProvider.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SentenceProvider.java
deleted file mode 100644
index 4b26f78..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SentenceProvider.java
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.modelbuilder.v2;
-
-import java.util.Set;
-
-/**
- *
- */
-public interface SentenceProvider extends ModelParameter {
-
- Set<String> getSentences();
-}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileKnownEntityProvider.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileKnownEntityProvider.java
deleted file mode 100644
index dfbbbce..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileKnownEntityProvider.java
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.modelbuilder.v2.impls;
-
-import java.io.BufferedReader;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.nio.charset.Charset;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-import opennlp.modelbuilder.v2.KnownEntityProvider;
-
-/**
- *
- */
-public class FileKnownEntityProvider implements KnownEntityProvider {
- private Map<String, String> params = new HashMap<String, String>();
- Set<String> knownEntities = new HashSet<String>();
-
- @Override
- public Set<String> getKnownEntities() {
- if (knownEntities.isEmpty()) {
- try {
- InputStream fis;
- BufferedReader br;
- String line;
-
- fis = new FileInputStream(params.get("knownentityfile"));
- br = new BufferedReader(new InputStreamReader(fis, Charset.forName("UTF-8")));
- while ((line = br.readLine()) != null) {
- knownEntities.add(line.split("\t")[2]);
- }
-
- // Done with the file
- br.close();
- br = null;
- fis = null;
- } catch (FileNotFoundException ex) {
- Logger.getLogger(FileKnownEntityProvider.class.getName()).log(Level.SEVERE, null, ex);
- } catch (IOException ex) {
- Logger.getLogger(FileKnownEntityProvider.class.getName()).log(Level.SEVERE, null, ex);
- }
- }
- return knownEntities;
- }
-
- @Override
- public void addKnownEntity(String unambiguousEntity) {
- knownEntities.add(unambiguousEntity);
- }
-
- @Override
- public String getKnownEntitiesType() {
-
- return params.get("knownentitytype");
- }
-
-
-
- @Override
- public void setParameters(Map<String, String> params) {
- this.params = params;
- }
-}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileModelValidatorImpl.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileModelValidatorImpl.java
deleted file mode 100644
index 6522a1a..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileModelValidatorImpl.java
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.modelbuilder.v2.impls;
-
-import java.io.BufferedReader;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.nio.charset.Charset;
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-import java.util.regex.Pattern;
-import opennlp.modelbuilder.v2.ModelGenerationValidator;
-
-/**
- *
- */
-public class FileModelValidatorImpl implements ModelGenerationValidator {
-
- private Set<String> badentities = new HashSet<String>();
- private final double MIN_SCORE_FOR_TRAINING = 0.95d;
- private Object validationData;
- private Map<String, String> params = new HashMap<String, String>();
-
- @Override
- public void setParameters(Map<String, String> params) {
- this.params = params;
- }
-
- @Override
- public Boolean validSentence(String sentence) {
- //returning true by default, because the sentence provider will return only "valid" sentences in this case
- return true;
- }
-
- @Override
- public Boolean validNamedEntity(String namedEntity) {
-
- if (badentities.isEmpty()) {
- getBlackList();
- }
-
- Pattern p = Pattern.compile("[0-9]", Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);
- if (p.matcher(namedEntity).find()) {
- return false;
- }
- Boolean b = true;
- if (badentities.contains(namedEntity.toLowerCase())) {
- b = false;
- }
- return b;
- }
-
- @Override
- public Collection<String> getBlackList() {
- if (!badentities.isEmpty()) {
- try {
- InputStream fis;
- BufferedReader br;
- String line;
-
- fis = new FileInputStream(params.get("blacklistfile"));
- br = new BufferedReader(new InputStreamReader(fis, Charset.forName("UTF-8")));
- while ((line = br.readLine()) != null) {
- badentities.add(line);
- }
- br.close();
- br = null;
- fis = null;
- } catch (FileNotFoundException ex) {
- Logger.getLogger(FileKnownEntityProvider.class.getName()).log(Level.SEVERE, null, ex);
- } catch (IOException ex) {
- Logger.getLogger(FileKnownEntityProvider.class.getName()).log(Level.SEVERE, null, ex);
- }
- }
- return badentities;
- }
-}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileSentenceProvider.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileSentenceProvider.java
deleted file mode 100644
index 029b6fc..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileSentenceProvider.java
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.modelbuilder.v2.impls;
-
-import java.io.BufferedReader;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.nio.charset.Charset;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-import opennlp.modelbuilder.v2.SentenceProvider;
-
-/**
- *
- */
-public class FileSentenceProvider implements SentenceProvider {
-
- private Map<String, String> params = new HashMap<String, String>();
- Set<String> sentences = new HashSet<String>();
-
- public Set<String> getSentences() {
- if (sentences.isEmpty()) {
- try {
- InputStream fis;
- BufferedReader br;
- String line;
-
- fis = new FileInputStream(params.get("sentencesfile"));
- br = new BufferedReader(new InputStreamReader(fis, Charset.forName("UTF-8")));
- int i=0;
- while ((line = br.readLine()) != null) {
-
- sentences.add(line);
- }
-
- // Done with the file
- br.close();
- br = null;
- fis = null;
- } catch (FileNotFoundException ex) {
- Logger.getLogger(FileKnownEntityProvider.class.getName()).log(Level.SEVERE, null, ex);
- } catch (IOException ex) {
- Logger.getLogger(FileKnownEntityProvider.class.getName()).log(Level.SEVERE, null, ex);
- }
- }
- return sentences;
- }
-
- public void setParameters(Map<String, String> params) {
- this.params = params;
- }
-}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/LocationKnownEntityProviderImpl.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/LocationKnownEntityProviderImpl.java
deleted file mode 100644
index 14ddfc0..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/LocationKnownEntityProviderImpl.java
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.modelbuilder.v2.impls;
-
-import java.sql.CallableStatement;
-import java.sql.Connection;
-import java.sql.DriverManager;
-import java.sql.ResultSet;
-import java.sql.SQLException;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-import opennlp.modelbuilder.v2.KnownEntityProvider;
-
-/**
- *
-
- */
-public class LocationKnownEntityProviderImpl implements KnownEntityProvider {
-
- Set<String> ret = new HashSet<String>();
-
- @Override
- public Set<String> getKnownEntities() {
- if (ret.isEmpty()) {
- try {
- getData();
- } catch (Exception ex) {
- Logger.getLogger(LocationKnownEntityProviderImpl.class.getName()).log(Level.SEVERE, null, ex);
- }
-
- }
- return ret;
- }
- private Set<String> getData() throws Exception {
-
- Connection con = getMySqlConnection();
- if (con.isClosed()) {
- con = getMySqlConnection();
- }
- CallableStatement cs;
- cs = con.prepareCall("CALL getcountrylist()");
-
- ResultSet rs;
- try {
- rs = cs.executeQuery();
- while (rs.next()) {
- ret.add(rs.getString("full_name_nd_ro"));
- }
-
- } catch (SQLException ex) {
- throw ex;
- } catch (Exception e) {
- System.err.println(e);
- } finally {
- con.close();
- }
-
- return ret;
- }
- private static Connection getMySqlConnection() throws Exception {
- // EntityLinkerProperties property = new EntityLinkerProperties(new File("c:\\temp\\opennlpmodels\\entitylinker.properties"));
- String driver = "org.gjt.mm.mysql.Driver";
- String url = "jdbc:mysql://127.0.0.1:3306/world";
- String username = "root";
- String password = "559447";
-
- Class.forName(driver);
- Connection conn = DriverManager.getConnection(url, username, password);
- return conn;
- }
- @Override
- public String getKnownEntitiesType() {
- return "location";
- }
-
- @Override
- public void addKnownEntity(String unambiguousEntity) {
- ret.add(unambiguousEntity);
- }
-
- private Map<String, String> params = new HashMap<String, String>();
-
- @Override
- public void setParameters(Map<String, String> params) {
- this.params = params;
- }
-}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/ModelValidatorImpl.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/ModelValidatorImpl.java
deleted file mode 100644
index 45e73c1..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/ModelValidatorImpl.java
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.modelbuilder.v2.impls;
-
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
-import java.util.regex.Pattern;
-import opennlp.modelbuilder.v2.ModelGenerationValidator;
-
-/**
- *
- */
-public class ModelValidatorImpl implements ModelGenerationValidator {
-
- private Set<String> badentities = new HashSet<String>();
- private final double MIN_SCORE_FOR_TRAINING = 0.95d;
- private Object validationData;
- private Map<String, String> params = new HashMap<String, String>();
-
- @Override
- public void setParameters(Map<String, String> params) {
- this.params = params;
- }
- @Override
- public Boolean validSentence(String sentence) {
- //returning true by default, because the sentence provider will return only "valid" sentences in this case
- return true;
- }
-
- @Override
- public Boolean validNamedEntity(String namedEntity) {
-
- if (badentities.isEmpty()) {
- getBlackList();
- }
-
- Pattern p = Pattern.compile("[0-9]", Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);
- if (p.matcher(namedEntity).find()) {
- return false;
- }
- Boolean b = true;
- if (badentities.contains(namedEntity.toLowerCase())) {
- b = false;
- }
- return b;
- }
-
-
- @Override
- public Set<String> getBlackList() {
- badentities.add(".");
- badentities.add("-");
- badentities.add(",");
- badentities.add(";");
- badentities.add("the");
- badentities.add("that");
- badentities.add("several");
- badentities.add("model");
- badentities.add("our");
- badentities.add("are");
- badentities.add("in");
- badentities.add("are");
- badentities.add("at");
- badentities.add("is");
- badentities.add("for");
- badentities.add("the");
- badentities.add("during");
- badentities.add("south");
- badentities.add("from");
- badentities.add("recounts");
- badentities.add("wissenschaftliches");
- badentities.add("if");
- badentities.add("security");
- badentities.add("denouncing");
- badentities.add("writes");
- badentities.add("but");
- badentities.add("operation");
- badentities.add("adds");
- badentities.add("Above");
- badentities.add("but");
- badentities.add("RIP");
- badentities.add("on");
- badentities.add("no");
- badentities.add("agrees");
- badentities.add("year");
- badentities.add("for");
- badentities.add("you");
- badentities.add("red");
- badentities.add("added");
- badentities.add("hello");
- badentities.add("around");
- badentities.add("has");
- badentities.add("turn");
- badentities.add("surrounding");
- badentities.add("\" No");
- badentities.add("aug.");
- badentities.add("or");
- badentities.add("quips");
- badentities.add("september");
- badentities.add("[mr");
- badentities.add("diseases");
- badentities.add("when");
- badentities.add("bbc");
- badentities.add(":\"");
- badentities.add("dr");
- badentities.add("baby");
- badentities.add("on");
- badentities.add("route");
- badentities.add("'");
- badentities.add("\"");
- badentities.add("a");
- badentities.add("her");
- badentities.add("'");
- badentities.add("\"");
- badentities.add("two");
- badentities.add("that");
- badentities.add(":");
- badentities.add("one");
- badentities.add("Party");
- badentities.add("Championship");
-
- badentities.add("Ltd");
-
- return badentities;
- }
-}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/ModelableImpl.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/ModelableImpl.java
deleted file mode 100644
index dac5969..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/ModelableImpl.java
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.modelbuilder.v2.impls;
-
-import java.io.BufferedOutputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.io.OutputStream;
-import java.nio.charset.Charset;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-import opennlp.modelbuilder.v2.Modelable;
-import opennlp.tools.namefind.NameFinderME;
-import opennlp.tools.namefind.NameSample;
-import opennlp.tools.namefind.NameSampleDataStream;
-import opennlp.tools.namefind.TokenNameFinderModel;
-import opennlp.tools.tokenize.TokenizerME;
-import opennlp.tools.tokenize.TokenizerModel;
-import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.PlainTextByLineStream;
-
-/**
- *
- */
-public class ModelableImpl implements Modelable {
-
- private TokenizerModel tm;
- private TokenizerME wordBreaker;
- private String path = "c:\\temp\\opennlpmodels\\";
- private String trainingDataPath = "";
- private String modelOutPath = "";
- private Set<String> annotatedSentences = new HashSet<String>();
- private Map<String, String> params = new HashMap<String, String>();
-
- @Override
- public void setParameters(Map<String, String> params) {
- this.params = params;
- path = params.get("modelablepath");
- trainingDataPath = path + "\\" + params.get("knownentitytype") + ".train";
- modelOutPath = path + "\\" + params.get("knownentitytype")+".model";
- }
-
- @Override
- public String annotate(String sentence, String namedEntity, String entityType) {
- String annotation = sentence.replace(namedEntity, " <START:" + entityType + "> " + namedEntity + " <END> ");
-
- return annotation;
- }
-
- @Override
- public void writeAnnotatedSentences() {
- try {
-
- FileWriter writer = new FileWriter(trainingDataPath, false);
-
- for (String s : annotatedSentences) {
- writer.write(s.replace("\n", " ").trim() + "\n");
- }
- writer.close();
- } catch (IOException ex) {
- ex.printStackTrace();
- }
- }
-
- @Override
- public Set<String> getAnnotatedSentences() {
- return annotatedSentences;
- }
-
- @Override
- public void setAnnotatedSentences(Set<String> annotatedSentences) {
- this.annotatedSentences = annotatedSentences;
- }
-
- @Override
- public void addAnnotatedSentence(String annotatedSentence) {
- annotatedSentences.add(annotatedSentence);
- }
-
- @Override
- public void buildModel(String entityType) {
- try {
- System.out.println("\tBuilding Model using " + annotatedSentences.size() + " annotations");
- System.out.println("\t\treading training data...");
- Charset charset = Charset.forName("UTF-8");
- ObjectStream<String> lineStream =
- new PlainTextByLineStream(new FileInputStream(trainingDataPath), charset);
- ObjectStream<NameSample> sampleStream = new NameSampleDataStream(lineStream);
-
- TokenNameFinderModel model;
- model = NameFinderME.train("en", entityType, sampleStream, null);
- sampleStream.close();
- OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(new File(modelOutPath)));
- model.serialize(modelOut);
- if (modelOut != null) {
- modelOut.close();
- }
- System.out.println("\tmodel generated");
- } catch (Exception e) {
- }
- }
-
- @Override
- public TokenNameFinderModel getModel() {
-
-
- TokenNameFinderModel nerModel = null;
- try {
- nerModel = new TokenNameFinderModel(new FileInputStream(new File(modelOutPath)));
- } catch (IOException ex) {
- Logger.getLogger(ModelableImpl.class.getName()).log(Level.SEVERE, null, ex);
- }
- return nerModel;
- }
-
- @Override
- public String[] tokenizeSentenceToWords(String sentence) {
- return sentence.split(" ");
-
- }
-}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/MySQLSentenceProviderImpl.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/MySQLSentenceProviderImpl.java
deleted file mode 100644
index c90e791..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/MySQLSentenceProviderImpl.java
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.modelbuilder.v2.impls;
-
-import java.sql.*;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
-import opennlp.modelbuilder.v2.SentenceProvider;
-
-/**
- *
- */
-public class MySQLSentenceProviderImpl implements SentenceProvider {
-
- Set<String> sentences = new HashSet<String>();
-
- @Override
- public Set<String> getSentences() {
- try {
- if (sentences.isEmpty()) {
- return getData();
- }
- } catch (Exception e) {
- }
- return sentences;
- }
-
- private Set<String> getData() throws Exception {
-
- Connection con = getMySqlConnection();
- if (con.isClosed()) {
- con = getMySqlConnection();
- }
- CallableStatement cs;
- cs = con.prepareCall("CALL getTrainingSentences()");
-
- ResultSet rs;
- try {
- rs = cs.executeQuery();
- while (rs.next()) {
- sentences.add(rs.getString(1));
- }
-
- } catch (SQLException ex) {
- throw ex;
- } catch (Exception e) {
- System.err.println(e);
- } finally {
- con.close();
- }
-
- return sentences;
- }
-
- private static Connection getMySqlConnection() throws Exception {
- // EntityLinkerProperties property = new EntityLinkerProperties(new File("c:\\temp\\opennlpmodels\\entitylinker.properties"));
- String driver = "org.gjt.mm.mysql.Driver";
- String url = "jdbc:mysql://localhost:3306/db";
- String username = "root";
- String password = "??";
-
- Class.forName(driver);
- Connection conn = DriverManager.getConnection(url, username, password);
- return conn;
- }
-
- private Map<String, String> params = new HashMap<String,String>();
-
- @Override
- public void setParameters(Map<String, String> params) {
- this.params = params;
- }
-}
diff --git a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/PersonKnownEntityProviderImpl.java b/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/PersonKnownEntityProviderImpl.java
deleted file mode 100644
index 80c643a..0000000
--- a/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/PersonKnownEntityProviderImpl.java
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.modelbuilder.v2.impls;
-
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
-import opennlp.modelbuilder.v2.KnownEntityProvider;
-
-/**
- *
- */
-public class PersonKnownEntityProviderImpl implements KnownEntityProvider {
-
- Set<String> ret = new HashSet<String>();
-
- @Override
- public Set<String> getKnownEntities() {
- if (ret.isEmpty()) {
- ret.add("Barack Obama");
- ret.add("Mitt Romney");
- ret.add("John Doe");
- ret.add("Bill Gates");
- ret.add("Nguyen Tan Dung");
- ret.add("Hassanal Bolkiah");
- ret.add("Bashar al-Assad");
- ret.add("Faysal Khabbaz Hamou");
- ret.add("Dr Talwar");
- ret.add("Mr. Bolkiah");
- ret.add("Bashar");
- ret.add("Romney");
- ret.add("Obama");
- ret.add("the President");
- ret.add("Mr. Gates");
- ret.add("Romney");
-
-
-
- ret.add("Xi Jinping");
- ret.add("Hassanal Bolkiah");
- ret.add("Leon Panetta");
- ret.add("Paul Beales");
- ret.add("Mr Rajapaksa");
- ret.add("Mohammed ");
- ret.add("Ieng Thirith");
- ret.add("Mr Xi");
- ret.add("John Sudworth");
- ret.add("Ieng Thirith");
- ret.add("Aung San Suu Kyi");
-
- ret.add("Khorshid");
- ret.add("Karrie Webb");
- ret.add("Doyle McManus");
- ret.add("Pope John Paul");
- ret.add("Roland Buerk");
- ret.add("Paul Ryan");
- ret.add("Tammy Baldwin");
- ret.add("Ben Unger");
- ret.add("Chris Christie");
- ret.add("Mary Magdalene");
- ret.add("George Walker Bush");
- ret.add("Melendez-Martinez");
- ret.add("Osiel Cardenas Guillen");
- ret.add("President Molina");
- ret.add("Lubaina Himid");
- ret.add("Elizabeth Frink");
- ret.add("Graham Sutherland");
- ret.add("Gorman Adams");
- ret.add("Peter Sheasby");
- ret.add("Andrew Walker");
- ret.add("Elias Garcia Martinez");
- ret.add("Elias Martinez");
-
- }
- return ret;
- }
-
- @Override
- public String getKnownEntitiesType() {
- return "person";
- }
-
- @Override
- public void addKnownEntity(String unambiguousEntity) {
- ret.add(unambiguousEntity);
- }
-
- private Map<String, String> params = new HashMap<String,String>();
-
- @Override
- public void setParameters(Map<String, String> params) {
- this.params = params;
- }
-}