OPENNLP-615
Greatly simplified fuzzy string match scoring by normalizing the Levenshtein score output by Lucene, and fixed a bug in the filtering of hits below the threshold. Refined the deduplication logic a bit, and made the default bag-of-words radius for doccat larger, which improved scores in testing.
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearcher.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearcher.java
index ca63a3a..f6fee16 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearcher.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearcher.java
@@ -18,7 +18,6 @@
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
-import java.util.Iterator;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
@@ -39,11 +38,14 @@
/**
*
- * Searches Gazateers stored in a MMapDirectory Lucene index
+ * Searches Gazateers stored in an MMapDirectory Lucene index. The structure of
+ * these indices is based on loading the indexes using
+ * GeoEntityLinkerSetupUtils.
+ *
*/
public class GazateerSearcher {
- private double scoreCutoff = .75;
+ private double scoreCutoff = .90;
private Directory geonamesIndex;//= new MMapDirectory(new File(indexloc));
private IndexReader geonamesReader;// = DirectoryReader.open(geonamesIndex);
private IndexSearcher geonamesSearcher;// = new IndexSearcher(geonamesReader);
@@ -94,7 +96,6 @@
Query q = parser.parse(luceneQueryString);
TopDocs search = geonamesSearcher.search(q, rowsReturned);
- double maxScore = (double) search.getMaxScore();
for (int i = 0; i < search.scoreDocs.length; ++i) {
GazateerEntry entry = new GazateerEntry();
@@ -103,7 +104,7 @@
entry.getScoreMap().put("lucene", sc);
- entry.getScoreMap().put("rawlucene", sc);
+
entry.setIndexID(docId + "");
entry.setSource("geonames");
@@ -140,24 +141,34 @@
}
entry.getIndexData().put(fields.get(idx).name(), value);
}
- //only keep it if the country code is a match. even when the code is passed in as a weighted condition, there is no == equiv in lucene
- if (entry.getItemParentID().toLowerCase().equals(code.toLowerCase())) {
- if (!linkedData.contains(entry)) {
- linkedData.add(entry);
+ /**
+ * normalize the Levenshtein distance
+ */
+ Double normLev = Double.valueOf(searchString.length()) / Double.valueOf(entry.getItemName().length());
+ /**
+ * only keep hits at or above the Levenshtein threshold
+ */
+ if (normLev.compareTo(scoreCutoff) >= 0) {
+ //only keep it if the country code is a match. even when the code is passed in as a weighted condition, there is no == equiv in lucene
+
+ if (entry.getItemParentID().toLowerCase().equals(code.toLowerCase())) {
+ entry.getScoreMap().put("normlucene", normLev);
+ //make sure we don't produce a duplicate
+ if (!linkedData.contains(entry)) {
+ linkedData.add(entry);
+ /**
+ * add the records to the cache for this query
+ */
+ GazateerSearchCache.put(luceneQueryString, linkedData);
+ }
}
}
}
- if (!linkedData.isEmpty()) {
- normalize(linkedData, 0d, maxScore);
- prune(linkedData);
- }
+
} catch (IOException | ParseException ex) {
System.err.println(ex);
}
- /**
- * add the records to the cache for this query
- */
- GazateerSearchCache.put(luceneQueryString, linkedData);
+
return linkedData;
}
@@ -188,8 +199,6 @@
Query q = parser.parse(luceneQueryString);
TopDocs search = usgsSearcher.search(q, rowsReturned);
- double maxScore = (double) search.getMaxScore();
-
for (int i = 0; i < search.scoreDocs.length; i++) {
GazateerEntry entry = new GazateerEntry();
int docId = search.scoreDocs[i].doc;
@@ -197,7 +206,6 @@
//keep track of the min score for normalization
entry.getScoreMap().put("lucene", sc);
- entry.getScoreMap().put("rawlucene", sc);
entry.setIndexID(docId + "");
entry.setSource("usgs");
entry.setItemParentID("us");
@@ -225,66 +233,36 @@
}
entry.getIndexData().put(fields.get(idx).name(), value);
}
- if (!linkedData.contains(entry)) {
- linkedData.add(entry);
+ /**
+ * normalize the Levenshtein distance
+ */
+ Double normLev = Double.valueOf(searchString.length()) / Double.valueOf(entry.getItemName().length());
+ /**
+ * only keep hits at or above the Levenshtein threshold
+ */
+ if (normLev.compareTo(scoreCutoff) >= 0) {
+ //no country code match is checked here (unlike the geonames search); USGS entries are always given the parent ID "us"
+
+ entry.getScoreMap().put("normlucene", normLev);
+ //make sure we don't produce a duplicate
+ if (!linkedData.contains(entry)) {
+ linkedData.add(entry);
+ /**
+ * add the records to the cache for this query
+ */
+ GazateerSearchCache.put(luceneQueryString, linkedData);
+ }
}
+
}
- if (!linkedData.isEmpty()) {
- normalize(linkedData, 0d, maxScore);
- prune(linkedData);
- }
+
} catch (IOException | ParseException ex) {
System.err.println(ex);
}
- /**
- * add the records to the cache for this query
- */
- GazateerSearchCache.put(luceneQueryString, linkedData);
+
return linkedData;
}
- private void normalize(ArrayList<GazateerEntry> linkedData, Double minScore, Double maxScore) {
- for (GazateerEntry gazateerEntry : linkedData) {
-
- double luceneScore = gazateerEntry.getScoreMap().get("lucene");
- luceneScore = normalize(luceneScore, minScore, maxScore);
- luceneScore = luceneScore > 1.0 ? 1.0 : luceneScore;
- luceneScore = (luceneScore == Double.NaN) ? 0.001 : luceneScore;
- gazateerEntry.getScoreMap().put("lucene", luceneScore);
- }
- }
-
- /**
- * gets rid of entries that are below the score thresh
- *
- * @param linkedData
- */
- private void prune(ArrayList<GazateerEntry> linkedData) {
- for (Iterator<GazateerEntry> itr = linkedData.iterator(); itr.hasNext();) {
- GazateerEntry ge = itr.next();
- /**
- * throw away anything under the configured score thresh
- */
- if (ge.getScoreMap().get("lucene") < scoreCutoff) {
- itr.remove();
- }
- }
- }
-
- /**
- * normalizes the different levenstein scores returned from the query into a
- *
- * @param valueToNormalize the raw score
- * @param minimum the min of the range of scores
- * @param maximum the max of the range
- * @return the normed score
- */
- private Double normalize(Double valueToNormalize, Double minimum, Double maximum) {
- Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - minimum) + 0;
- d = d == null ? 0d : d;
- return d;
- }
-
private void init() throws Exception {
if (usgsIndex == null) {
String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.usgs", "");
@@ -292,7 +270,7 @@
System.out.println("USGS Gaz location not found");
}
- String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", ".75");
+ String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));
scoreCutoff = Double.valueOf(cutoff);
usgsIndex = new MMapDirectory(new File(indexloc));
usgsReader = DirectoryReader.open(usgsIndex);
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
index 9900a2f..510d46e 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
@@ -75,11 +75,11 @@
if (!countryMentions.keySet().isEmpty()) {
for (String code : countryMentions.keySet()) {
if (!code.equals("us")) {
- geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 10, code));
+ geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 3, code));
}
}
} else {
- geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 10, ""));
+ geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 3, ""));
}
@@ -115,7 +115,7 @@
private void loadScorers() {
if (scorers.isEmpty()) {
- scorers.add(new FuzzyStringMatchScorer());
+ // scorers.add(new FuzzyStringMatchScorer());
scorers.add(new GeoHashBinningScorer());
scorers.add(new CountryProximityScorer());
scorers.add(new ModelBasedScorer());
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java
index de8af7b..afd6c50 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java
@@ -17,11 +17,9 @@
import java.io.BufferedOutputStream;
import java.io.File;
-import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
-import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collection;
@@ -35,10 +33,8 @@
import opennlp.tools.entitylinker.EntityLinkerProperties;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
-import static opennlp.addons.geoentitylinker.ModelBasedScorer.RADIUS;
+
import opennlp.tools.cmdline.MarkableFileInputStreamFactory;
-import opennlp.tools.ml.model.DataIndexer;
-import opennlp.tools.util.InputStreamFactory;
/**
@@ -46,7 +42,7 @@
* Tools for setting up GeoEntityLinker gazateers and doccat scoring model
*/
public class GeoEntityLinkerSetupUtils {
-
+ private static final int RADIUS = 200;
public static ModelBasedScorer scorer;
static {
@@ -108,7 +104,7 @@
System.out.println("Building Doccat model...");
DoccatModel model = null;
- InputStream dataIn = new FileInputStream(annotationOutFile);
+ // InputStream dataIn = new FileInputStream(annotationOutFile);
try {
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java
index beca793..d290d8f 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java
@@ -40,7 +40,7 @@
List<GazateerEntry> allGazEntries = new ArrayList<>();
/**
- * collect all the lat longs
+ * collect all the gaz entry references
*/
for (LinkedSpan<BaseLink> ls : linkedSpans) {
for (BaseLink bl : ls.getLinkedEntries()) {
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java
index 1093165..1c7b422 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java
@@ -36,7 +36,7 @@
DocumentCategorizerME documentCategorizerME;
DoccatModel doccatModel;
- public static final int RADIUS = 100;
+ public static final int RADIUS = 200;
boolean modelexists = false;
@Override