OPENNLP-615
Added a scoring impl that utilizes a doccat model to help with toponym resolution. The ModelBasedScorer also contains two static methods for training the model based on the CountryContext information used by the GeoEntityLinker.
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContext.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContext.java
index 1702f85..541d042 100644
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContext.java
+++ b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContext.java
@@ -82,7 +82,7 @@
// countrydata = getCountryData(properties);
}
for (CountryContextEntry entry : countrydata) {
- Pattern regex = Pattern.compile(entry.getFull_name_nd_ro(), Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
+ Pattern regex = Pattern.compile(entry.getFull_name_nd_ro().trim(), Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
Matcher rs = regex.matcher(docText);
String code = entry.getCc1().toLowerCase();
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryProximityScorer.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryProximityScorer.java
index 4b24b11..3198650 100644
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryProximityScorer.java
+++ b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryProximityScorer.java
@@ -22,6 +22,7 @@
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
import opennlp.tools.entitylinker.domain.BaseLink;
import opennlp.tools.entitylinker.domain.LinkedSpan;
import opennlp.tools.util.Span;
@@ -35,7 +36,7 @@
String dominantCode = "";
@Override
- public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, CountryContext additionalContext) {
+ public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans,EntityLinkerProperties properties, CountryContext additionalContext) {
score(linkedSpans, additionalContext.getCountryMentions(), additionalContext.getNameCodesMap(), docText, sentenceSpans, 1000);
@@ -191,8 +192,12 @@
all.addAll(distanceMap.get(key));
}
//get min max for normalization, this could be more efficient
+
Integer min = all.first();
Integer max = all.last();
+ if (min.intValue() == max.intValue()) {
+ min = 0;
+ }
for (String key : distanceMap.keySet()) {
TreeSet<Double> normalizedDistances = new TreeSet<Double>();
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java
index c21f5e2..8216e93 100644
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java
+++ b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java
@@ -18,6 +18,7 @@
import java.util.HashSet;
import java.util.List;
import java.util.Set;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
import opennlp.tools.entitylinker.domain.BaseLink;
import opennlp.tools.entitylinker.domain.LinkedSpan;
import opennlp.tools.ngram.NGramGenerator;
@@ -30,7 +31,7 @@
public class FuzzyStringMatchScorer implements LinkedEntityScorer<CountryContext> {
@Override
- public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, CountryContext additionalContext) {
+ public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) {
for (LinkedSpan<BaseLink> linkedSpan : linkedSpans) {
for (BaseLink link : linkedSpan.getLinkedEntries()) {
Double dice = getDiceCoefficient(linkedSpan.getSearchTerm().toLowerCase().replace(" ", ""), link.getItemName().toLowerCase().replace(" ", ""), 2);
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearcher.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearcher.java
index e89e8a5..13cce97 100644
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearcher.java
+++ b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearcher.java
@@ -42,7 +42,7 @@
*/
public class GazateerSearcher {
- private FuzzyStringMatchScorer diceScorer = new FuzzyStringMatchScorer();
+ //private FuzzyStringMatchScorer diceScorer = new FuzzyStringMatchScorer();
private double scoreCutoff = .75;
private Directory geonamesIndex;//= new MMapDirectory(new File(indexloc));
private IndexReader geonamesReader;// = DirectoryReader.open(geonamesIndex);
@@ -72,7 +72,7 @@
geonamesAnalyzer = new StandardAnalyzer(Version.LUCENE_45);
}
- String luceneQueryString = "FULL_NAME_ND_RO:" + searchString + " & CC1:" + code.toUpperCase();// + "~1.0";
+ String luceneQueryString = "FULL_NAME_ND_RO:" + searchString + " AND CC1:" + code.toLowerCase() + "^100";
QueryParser parser = new QueryParser(Version.LUCENE_45, luceneQueryString, geonamesAnalyzer);
Query q = parser.parse(luceneQueryString);
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java
index fe58e0d..b9401a3 100644
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java
+++ b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java
@@ -24,6 +24,7 @@
import opennlp.tools.util.Span;
import opennlp.tools.entitylinker.EntityLinkerProperties;
import opennlp.tools.entitylinker.EntityLinker;
+
/**
* Links location entities to gazatteers. Currently supports gazateers in a
* MySql database (NGA and USGS)
@@ -71,7 +72,7 @@
// geoNamesEntries = geoNamesGaz.find(matches[i], names[i], countryMentions, linkerProperties);
for (String code : countryMentions.keySet()) {
if (!code.equals("us")) {
- geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 5, code, linkerProperties));
+ geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 10, code, linkerProperties));
}
}
@@ -100,9 +101,9 @@
scorers.add(new FuzzyStringMatchScorer());
scorers.add(new GeoHashBinningScorer());
scorers.add(new CountryProximityScorer());
-
+ scorers.add(new ModelBasedScorer());
for (LinkedEntityScorer scorer : scorers) {
- scorer.score(spans, doctext, sentences, countryContext);
+ scorer.score(spans, doctext, sentences, linkerProperties, countryContext);
}
return spans;
}
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoHashBinningScorer.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoHashBinningScorer.java
index 7a87ee6..26b69c1 100644
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoHashBinningScorer.java
+++ b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoHashBinningScorer.java
@@ -22,6 +22,7 @@
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
import opennlp.tools.entitylinker.domain.BaseLink;
import opennlp.tools.entitylinker.domain.LinkedSpan;
import opennlp.tools.util.Span;
@@ -34,7 +35,7 @@
public class GeoHashBinningScorer implements LinkedEntityScorer<CountryContext> {
@Override
- public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, CountryContext additionalContext) {
+ public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans,EntityLinkerProperties properties, CountryContext additionalContext) {
score( linkedSpans);
}
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/LinkedEntityScorer.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/LinkedEntityScorer.java
index a70a628..3f7d5fa 100644
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/LinkedEntityScorer.java
+++ b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/LinkedEntityScorer.java
@@ -16,6 +16,7 @@
package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
import java.util.List;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
import opennlp.tools.entitylinker.domain.LinkedSpan;
import opennlp.tools.util.Span;
@@ -33,5 +34,5 @@
* @param additionalContext any additional data required to perform the scoring operation
* @return void
*/
- void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, T additionalContext);
+ void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, T additionalContext);
}
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/ModelBasedScorer.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/ModelBasedScorer.java
new file mode 100644
index 0000000..be07240
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/ModelBasedScorer.java
@@ -0,0 +1,255 @@
+/*
+ * Scores toponym candidates with a doccat model trained on country-context
+ * data; see OPENNLP-615.
+ */
+package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
+
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import opennlp.tools.doccat.BagOfWordsFeatureGenerator;
+import opennlp.tools.doccat.DoccatModel;
+import opennlp.tools.doccat.DocumentCategorizerME;
+import opennlp.tools.doccat.DocumentSample;
+import opennlp.tools.doccat.DocumentSampleStream;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+import opennlp.tools.entitylinker.domain.BaseLink;
+import opennlp.tools.entitylinker.domain.LinkedSpan;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+import opennlp.tools.util.Span;
+
+/**
+ * Scores toponym candidates with a doccat model trained on country-context
+ * data; also provides static helpers for training that model.
+ */
+public class ModelBasedScorer implements LinkedEntityScorer<CountryContext> {
+
+ public static ModelBasedScorer scorer;
+
+ static {
+ scorer = new ModelBasedScorer();
+ }
+ DocumentCategorizerME documentCategorizerME;
+ DoccatModel doccatModel;
+ public static final int RADIUS = 100;
+
+ @Override
+ public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) {
+ try {
+ if (doccatModel == null) {
+ String path = properties.getProperty("opennlp.geoentitylinker.modelbasedscorer.modelpath", "");
+ if (path.equals("")) {
+ return;
+ }
+ doccatModel = new DoccatModel(new File(path));
+
+ documentCategorizerME = new DocumentCategorizerME(doccatModel);
+ }
+ Map<Integer, String> proximalFeatures = generateProximalFeatures(linkedSpans, sentenceSpans, docText, RADIUS);
+ for (Map.Entry<Integer, String> entry : proximalFeatures.entrySet()) {
+ Map<String, Double> scores = this.getScore(entry.getValue());
+ for (BaseLink link : (List<BaseLink>) linkedSpans.get(entry.getKey()).getLinkedEntries()) {
+ double score = 0d;
+ if (scores.containsKey(link.getItemParentID())) {
+ score = scores.get(link.getItemParentID());
+ }
+ link.getScoreMap().put("countrymodel", score);
+ }
+ }
+
+ } catch (FileNotFoundException ex) {
+ System.err.println("could not find modelpath using EntityLinkerProperties. Property should be \"opennlp.geoentitylinker.modelbasedscorer.modelpath\"");
+ } catch (IOException ex) {
+ System.err.println(ex);
+ } catch (Exception ex) {
+ Logger.getLogger(ModelBasedScorer.class.getName()).log(Level.SEVERE, null, ex);
+ }
+ }
+
+ /**
+ * generates features using a BagOfWordsfeatureGenerator that are within the
+ * radius of a mention within the doctext
+ *
+ * @param linkedSpans the spans whose surrounding text is collected
+ * @param sentenceSpans the sentence boundaries of the document
+ * @param docText the full document text
+ * @param radius number of characters around each mention to include
+ * @return a map of the index of the linked span to the string of surrounding
+ * text: {@code Map<spanIndex, surroundingText>}
+ */
+ public Map<Integer, String> generateProximalFeatures(List<LinkedSpan> linkedSpans, Span[] sentenceSpans, String docText, int radius) {
+ Map<Integer, String> featureBags = new HashMap<>();
+ Map<Integer, Integer> nameMentionMap = new HashMap<>();
+ /**
+ * iterator over the map that contains a mapping of every country code to
+ * all of its mentions in the document
+ */
+ for (int i = 0; i < linkedSpans.size(); i++) {
+ LinkedSpan span = linkedSpans.get(i);
+ if (span.getLinkedEntries().isEmpty()) {
+ //don't care about spans that did not get linked to anything at all; nothing to work with
+ continue;
+ }
+ /**
+ * get the sentence the name span was found in, the beginning of the
+ * sentence will suffice as a centroid for feature generation around the
+ * named entity
+ */
+ Integer mentionIdx = sentenceSpans[span.getSentenceid()].getStart();
+ nameMentionMap.put(i, mentionIdx);
+ }
+ /**
+ * now associate each span to a string that will be used for categorization
+ * against the model.
+ */
+ for (Map.Entry<Integer, Integer> entry : nameMentionMap.entrySet()) {
+ featureBags.put(entry.getKey(), getTextChunk(entry.getValue(), docText, radius));
+ }
+
+
+ return featureBags;
+ }
+
+ private String getTextChunk(int mentionIdx, String docText, int radius) {
+ int docSize = docText.length();
+ int left = 0, right = 0;
+ left = (mentionIdx - radius < 0) ? 0 : mentionIdx - radius;
+ right = (mentionIdx + radius > docSize) ? docSize : mentionIdx + radius;
+ String chunk = "";
+ if (right <= left) {
+ chunk = "";
+ } else {
+ /**
+ * don't want to chop any words in half, so take from the first space to
+ * the last space in the chunk string
+ */
+ chunk = docText.substring(left, right);
+ if (left != 0) {
+ left = chunk.indexOf(" ");
+ }
+ right = chunk.lastIndexOf(" ");
+ /**
+ * now get the substring again with only whole words
+ */
+ if (left < right) {
+ chunk = chunk.substring(left, right);
+ }
+ }
+
+ return chunk;
+ }
+
+ private Map<String, Double> getScore(String text) throws Exception {
+ Map<String, Double> scoreMap = new HashMap<>();
+ if (documentCategorizerME == null) {
+ documentCategorizerME = new DocumentCategorizerME(new DoccatModel(new File("")));
+ }
+ double[] categorize = documentCategorizerME.categorize(text);
+ int catSize = documentCategorizerME.getNumberOfCategories();
+ for (int i = 0; i < catSize; i++) {
+ String category = documentCategorizerME.getCategory(i);
+ scoreMap.put(category, categorize[documentCategorizerME.getIndex(category)]);
+ }
+ return scoreMap;
+ }
+
+ /**
+ *
+ * @param documents A list of document texts, for best results try to
+ * ensure each country you care about will be
+ * represented by the collection
+ * @param annotationOutFile the location where the annotated doccat text file
+ * will be stored
+ * @param modelOutFile the location where the doccat model will be stored
+ * @param properties the properties where the country context object
+ * will find its country data from this property:
+ * opennlp.geoentitylinker.countrycontext.filepath
+ * @throws IOException
+ */
+ public static void buildCountryContextModel(Collection<String> documents, File annotationOutFile, File modelOutFile, EntityLinkerProperties properties) throws IOException {
+ CountryContext context = new CountryContext();
+ FileWriter writer = new FileWriter(annotationOutFile, true);
+ for (String docText : documents) {
+
+ Map<String, Set<Integer>> regexfind = context.regexfind(docText, properties);
+ Map<String, ArrayList<String>> modelCountryContext = modelCountryContext(docText, context, RADIUS);
+ for (String key : modelCountryContext.keySet()) {
+ for (String wordbag : modelCountryContext.get(key)) {
+ writer.write(key + " " + wordbag + "\n");
+ }
+ }
+ }
+
+ writer.close();
+
+ DoccatModel model = null;
+
+ InputStream dataIn = new FileInputStream(annotationOutFile);
+ try {
+
+ ObjectStream<String> lineStream =
+ new PlainTextByLineStream(dataIn, "UTF-8");
+ ObjectStream<DocumentSample> sampleStream = new DocumentSampleStream(lineStream);
+
+ model = DocumentCategorizerME.train("en", sampleStream);
+ OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(modelOutFile));
+ model.serialize(modelOut);
+ } catch (IOException e) {
+ // Failed to read or parse training data, training failed
+ e.printStackTrace();
+ }
+
+ }
+
+ /**
+ * generates proximal wordbags within the radius of a country mention within
+ * the doctext based on the country context object
+ *
+ *
+ * @param docText
+ * @param additionalContext
+ * @param radius
+ * @return
+ */
+ public static Map<String, ArrayList<String>> modelCountryContext(String docText, CountryContext additionalContext, int radius) {
+ Map<String, ArrayList< String>> featureBags = new HashMap<>();
+ Map<String, Set<Integer>> countryMentions = additionalContext.getCountryMentions();
+ /**
+ * iterator over the map that contains a mapping of every country code to
+ * all of its mentions in the document
+ */
+ for (String code : countryMentions.keySet()) {
+ /**
+ * for each mention, collect features from around each mention, then
+ * consolidate the features into another map
+ */
+ for (int mentionIdx : countryMentions.get(code)) {
+ String chunk = scorer.getTextChunk(mentionIdx, docText, radius);
+ // Collection<String> extractFeatures = super.extractFeatures(chunk.split(" "));
+ if (featureBags.containsKey(code)) {
+ featureBags.get(code).add(chunk);
+ } else {
+ ArrayList<String> newlist = new ArrayList<>();
+ newlist.add(chunk);
+ featureBags.put(code, newlist);
+ }
+ }
+ }
+ return featureBags;
+ }
+}