OPENNLP-579
Added a GeoEntityLinkerSetupUtils class so users can easily build the Lucene indexes and the country doccat model. Also includes several other small efficiency improvements.
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java
index 8216e93..af1aa1c 100644
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java
+++ b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java
@@ -41,7 +41,7 @@
}
}
-
+
}
/**
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerIndexer.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerIndexer.java
index cbe8a0d..5ea08ad 100644
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerIndexer.java
+++ b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerIndexer.java
@@ -1,6 +1,17 @@
/*
- * To change this template, choose Tools | Templates
- * and open the template in the editor.
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
*/
package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
@@ -23,7 +34,7 @@
/**
*
- * @author Owner
+ * Creates two Lucene indexes, GeoNames and USGS, for use in GeoEntityLinker.
*/
public class GazateerIndexer {
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java
index b9401a3..1404ce9 100644
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java
+++ b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java
@@ -33,13 +33,11 @@
*/
public class GeoEntityLinker implements EntityLinker<LinkedSpan> {
- // CountryProximityScorer scorer = new CountryProximityScorer();
-// private MySQLGeoNamesGazLinkable geoNamesGaz;// = new MySQLGeoNamesGazLinkable();
-// private MySQLUSGSGazLinkable usgsGaz;//= new MySQLUSGSGazLinkable();
private CountryContext countryContext;
private Map<String, Set<Integer>> countryMentions;
private EntityLinkerProperties linkerProperties;
private GazateerSearcher gazateerSearcher = new GazateerSearcher();
+ private List<LinkedEntityScorer> scorers = new ArrayList<>();
/**
* Flag for deciding whether to search gaz only for toponyms within countries
* that are mentioned in the document
@@ -97,11 +95,12 @@
}
}
- List<LinkedEntityScorer<CountryContext>> scorers = new ArrayList<>();
- scorers.add(new FuzzyStringMatchScorer());
- scorers.add(new GeoHashBinningScorer());
- scorers.add(new CountryProximityScorer());
- scorers.add(new ModelBasedScorer());
+ if (scorers.isEmpty()) {
+ scorers.add(new FuzzyStringMatchScorer());
+ scorers.add(new GeoHashBinningScorer());
+ scorers.add(new CountryProximityScorer());
+ scorers.add(new ModelBasedScorer());
+ }
for (LinkedEntityScorer scorer : scorers) {
scorer.score(spans, doctext, sentences, linkerProperties, countryContext);
}
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinkerSetupUtils.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinkerSetupUtils.java
new file mode 100644
index 0000000..05fe374
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinkerSetupUtils.java
@@ -0,0 +1,162 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
+
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+import opennlp.tools.doccat.DoccatModel;
+import opennlp.tools.doccat.DocumentCategorizerME;
+import opennlp.tools.doccat.DocumentSample;
+import opennlp.tools.doccat.DocumentSampleStream;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+import static org.apache.opennlp.addons.tools.entitylinker.geoentitylinker.ModelBasedScorer.RADIUS;
+
+/**
+ * Tools for setting up the GeoEntityLinker gazetteer indexes and the doccat
+ * scoring model.
+ */
+public class GeoEntityLinkerSetupUtils {
+
+  /** Scorer instance whose getTextChunk method is used by modelCountryContext. */
+  public static ModelBasedScorer scorer;
+
+  static {
+    scorer = new ModelBasedScorer();
+  }
+
+  /**
+   * Builds a Lucene index from a gazetteer data file by delegating to
+   * {@code GazateerIndexer.index}. Failures are reported on standard error
+   * and are not propagated to the caller.
+   *
+   * @param outputIndexDir    the index directory handed to the indexer
+   * @param gazateerInputData the gazetteer data file handed to the indexer
+   * @param type              the gazetteer type of the input data
+   */
+  public static void createLuceneIndex(File outputIndexDir, File gazateerInputData, GazateerIndexer.GazType type) {
+    GazateerIndexer indexer = new GazateerIndexer();
+    try {
+      indexer.index(outputIndexDir, gazateerInputData, type);
+    } catch (Exception ex) {
+      // Kept best-effort: report which inputs failed, then return normally.
+      System.err.println("Failed to index " + gazateerInputData + " into " + outputIndexDir);
+      ex.printStackTrace();
+    }
+  }
+
+  /**
+   * Builds a doccat model from the text surrounding country mentions. For every
+   * document, the text chunk around each country mention is appended to
+   * {@code annotationOutFile} as one "code chunk" line; a model trained on that
+   * file is then serialized to {@code modelOutFile}.
+   *
+   * @param documents         A list of document texts, for best results try to
+   *                          ensure each country you care about will be
+   *                          represented in the collection
+   * @param annotationOutFile the location where the annotated doccat text file
+   *                          will be stored; lines are appended to any
+   *                          existing content
+   * @param modelOutFile      the location where the doccat model will be stored
+   * @param properties        the properties where the country context object
+   *                          will find its country data from this property:
+   *                          opennlp.geoentitylinker.countrycontext.filepath
+   * @throws IOException if the training data cannot be written or read back,
+   *                     or the model cannot be trained or written
+   */
+  public static void buildCountryContextModel(Collection<String> documents, File annotationOutFile, File modelOutFile, EntityLinkerProperties properties) throws IOException {
+    CountryContext context = new CountryContext();
+    System.out.println("processing " + documents.size() + " documents");
+    // Written as UTF-8 because the file is read back as UTF-8 for training below.
+    try (Writer writer = new OutputStreamWriter(
+        new FileOutputStream(annotationOutFile, true), StandardCharsets.UTF_8)) {
+      for (String docText : documents) {
+        System.out.append(".");
+        // Result unused; the call is kept because modelCountryContext reads the
+        // mentions back from the context (presumably recorded here; confirm).
+        context.regexfind(docText, properties);
+        Map<String, ArrayList<String>> wordbagsByCode = modelCountryContext(docText, context, RADIUS);
+        for (Map.Entry<String, ArrayList<String>> entry : wordbagsByCode.entrySet()) {
+          for (String wordbag : entry.getValue()) {
+            writer.write(entry.getKey() + " " + wordbag + "\n");
+          }
+        }
+      }
+      System.out.println("Document processing complete. Writing training data to file");
+    }
+    System.out.println("Building Doccat model...");
+    try (InputStream dataIn = new FileInputStream(annotationOutFile)) {
+      ObjectStream<String> lineStream = new PlainTextByLineStream(dataIn, "UTF-8");
+      ObjectStream<DocumentSample> sampleStream = new DocumentSampleStream(lineStream);
+      DoccatModel model = DocumentCategorizerME.train("en", sampleStream);
+      // Close the model stream so buffered bytes reach the file and the handle is released.
+      try (OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(modelOutFile))) {
+        model.serialize(modelOut);
+      }
+      System.out.println("Model complete!");
+    }
+  }
+
+  /**
+   * Generates proximal wordbags within the radius of a country mention within
+   * the doctext based on the country context object.
+   *
+   * @param docText           the document text the mention indexes refer to
+   * @param additionalContext the country context supplying the country
+   *                          mentions (country code to mention indexes)
+   * @param radius            the radius handed to
+   *                          {@code ModelBasedScorer.getTextChunk}
+   * @return a map from country code to the text chunks around its mentions;
+   *         empty when the context holds no mentions
+   */
+  public static Map<String, ArrayList<String>> modelCountryContext(String docText, CountryContext additionalContext, int radius) {
+    Map<String, ArrayList<String>> featureBags = new HashMap<>();
+    Map<String, Set<Integer>> countryMentions = additionalContext.getCountryMentions();
+    if (countryMentions == null) {
+      return featureBags;
+    }
+    for (Map.Entry<String, Set<Integer>> mentions : countryMentions.entrySet()) {
+      String code = mentions.getKey();
+      // Collect the text around each mention and group the chunks by country code.
+      for (int mentionIdx : mentions.getValue()) {
+        String chunk = scorer.getTextChunk(mentionIdx, docText, radius);
+        ArrayList<String> chunks = featureBags.get(code);
+        if (chunks == null) {
+          chunks = new ArrayList<>();
+          featureBags.put(code, chunks);
+        }
+        chunks.add(chunk);
+      }
+    }
+    return featureBags;
+  }
+
+}
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/ModelBasedScorer.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/ModelBasedScorer.java
index 5e3abdd..e25ba07 100644
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/ModelBasedScorer.java
+++ b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/ModelBasedScorer.java
@@ -15,46 +15,26 @@
*/
package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
-import java.io.BufferedOutputStream;
import java.io.File;
-import java.io.FileInputStream;
import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
-import java.io.FileWriter;
import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.ArrayList;
-import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
-import java.util.Set;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-import opennlp.tools.doccat.BagOfWordsFeatureGenerator;
import opennlp.tools.doccat.DoccatModel;
import opennlp.tools.doccat.DocumentCategorizerME;
-import opennlp.tools.doccat.DocumentSample;
-import opennlp.tools.doccat.DocumentSampleStream;
import opennlp.tools.entitylinker.EntityLinkerProperties;
import opennlp.tools.entitylinker.domain.BaseLink;
import opennlp.tools.entitylinker.domain.LinkedSpan;
-import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.Span;
/**
*
- *Utilizes a doccat model to score toponyms based on surrounding context
+ * Utilizes a doccat model to score toponyms based on surrounding context
*/
public class ModelBasedScorer implements LinkedEntityScorer<CountryContext> {
- public static ModelBasedScorer scorer;
- static {
- scorer = new ModelBasedScorer();
- }
DocumentCategorizerME documentCategorizerME;
DoccatModel doccatModel;
public static final int RADIUS = 100;
@@ -68,7 +48,6 @@
return;
}
doccatModel = new DoccatModel(new File(path));
-
documentCategorizerME = new DocumentCategorizerME(doccatModel);
}
Map<Integer, String> proximalFeatures = generateProximalFeatures(linkedSpans, sentenceSpans, docText, RADIUS);
@@ -88,7 +67,7 @@
} catch (IOException ex) {
System.err.println(ex);
} catch (Exception ex) {
- Logger.getLogger(ModelBasedScorer.class.getName()).log(Level.SEVERE, null, ex);
+ System.err.println(ex);
}
}
@@ -136,7 +115,7 @@
return featureBags;
}
- private String getTextChunk(int mentionIdx, String docText, int radius) {
+ public String getTextChunk(int mentionIdx, String docText, int radius) {
int docSize = docText.length();
int left = 0, right = 0;
left = (mentionIdx - radius < 0) ? 0 : mentionIdx - radius;
@@ -167,9 +146,6 @@
private Map<String, Double> getScore(String text) throws Exception {
Map<String, Double> scoreMap = new HashMap<>();
- if (documentCategorizerME == null) {
- documentCategorizerME = new DocumentCategorizerME(new DoccatModel(new File("")));
- }
double[] categorize = documentCategorizerME.categorize(text);
int catSize = documentCategorizerME.getNumberOfCategories();
for (int i = 0; i < catSize; i++) {
@@ -179,88 +155,5 @@
return scoreMap;
}
- /**
- *
- * @param documents A list of document texts, for best results try to
- * ensure each country you care about will be
- * represented by the collection
- * @param annotationOutFile the location where the annotated doccat text file
- * will be stored
- * @param modelOutFile the location where the doccat model will be stored
- * @param properties the properties where the country context object
- * will find it's country data from this property:
- * opennlp.geoentitylinker.countrycontext.filepath
- * @throws IOException
- */
- public static void buildCountryContextModel(Collection<String> documents, File annotationOutFile, File modelOutFile, EntityLinkerProperties properties) throws IOException {
- CountryContext context = new CountryContext();
- FileWriter writer = new FileWriter(annotationOutFile, true);
- for (String docText : documents) {
-
- Map<String, Set<Integer>> regexfind = context.regexfind(docText, properties);
- Map<String, ArrayList<String>> modelCountryContext = modelCountryContext(docText, context, RADIUS);
- for (String key : modelCountryContext.keySet()) {
- for (String wordbag : modelCountryContext.get(key)) {
- writer.write(key + " " + wordbag + "\n");
- }
- }
- }
-
- writer.close();
-
- DoccatModel model = null;
-
- InputStream dataIn = new FileInputStream(annotationOutFile);
- try {
-
- ObjectStream<String> lineStream =
- new PlainTextByLineStream(dataIn, "UTF-8");
- ObjectStream<DocumentSample> sampleStream = new DocumentSampleStream(lineStream);
-
- model = DocumentCategorizerME.train("en", sampleStream);
- OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(modelOutFile));
- model.serialize(modelOut);
- } catch (IOException e) {
- // Failed to read or parse training data, training failed
- e.printStackTrace();
- }
-
- }
-
- /**
- * generates proximal wordbags within the radius of a country mention within
- * the doctext based on the country context object
- *
- *
- * @param docText
- * @param additionalContext
- * @param radius
- * @return
- */
- public static Map<String, ArrayList<String>> modelCountryContext(String docText, CountryContext additionalContext, int radius) {
- Map<String, ArrayList< String>> featureBags = new HashMap<>();
- Map<String, Set<Integer>> countryMentions = additionalContext.getCountryMentions();
- /**
- * iterator over the map that contains a mapping of every country code to
- * all of its mentions in the document
- */
- for (String code : countryMentions.keySet()) {
- /**
- * for each mention, collect features from around each mention, then
- * consolidate the features into another map
- */
- for (int mentionIdx : countryMentions.get(code)) {
- String chunk = scorer.getTextChunk(mentionIdx, docText, radius);
- // Collection<String> extractFeatures = super.extractFeatures(chunk.split(" "));
- if (featureBags.containsKey(code)) {
- featureBags.get(code).add(chunk);
- } else {
- ArrayList<String> newlist = new ArrayList<>();
- newlist.add(chunk);
- featureBags.put(code, newlist);
- }
- }
- }
- return featureBags;
- }
+
}