OPENNLP-706
Fixed caching; ensured that indexing and searching use the same per-field analyzer wrapper; added the ProvinceProximityScorer to the scorer list.
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
index f62951d..abe2550 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
@@ -40,6 +40,7 @@
import opennlp.tools.entitylinker.EntityLinkerProperties;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
+import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.util.CharArraySet;
/**
@@ -56,12 +57,7 @@
private boolean doubleQuoteAllSearchTerms = false;
private boolean useHierarchyField = false;
- private IndexSearcher geonamesSearcher;// = new IndexSearcher(geonamesReader);
- private Analyzer geonamesAnalyzer;
- //usgs US gazateer
- private IndexSearcher usgsSearcher;// = new IndexSearcher(geonamesReader);
- private Analyzer usgsAnalyzer;
private EntityLinkerProperties properties;
private Directory opennlpIndex;//= new MMapDirectory(new File(indexloc));
@@ -167,7 +163,7 @@
*/
int maxLen = searchString.length() > entry.getItemName().length() ? searchString.length() : entry.getItemName().length();
- Double normLev = Math.abs(1 - (sc / (double) maxLen));//searchString.length() / (double) entry.getItemName().length();
+ Double normLev = Math.abs(1 - (sc / (double) maxLen));
/**
* only want hits above the levenstein thresh. This should be a low
* thresh due to the use of the hierarchy field in the index
@@ -226,7 +222,6 @@
opennlpIndex = new MMapDirectory(new File(indexloc));
opennlpReader = DirectoryReader.open(opennlpIndex);
opennlpSearcher = new IndexSearcher(opennlpReader);
- //TODO: a language code switch statement should be employed here at some point
opennlpAnalyzer
= //new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));
new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));
@@ -237,6 +232,11 @@
analyMap.put("loctype", new KeywordAnalyzer());
analyMap.put("countycode", new KeywordAnalyzer());
analyMap.put("gazsource", new KeywordAnalyzer());
+
+
+ opennlpAnalyzer
+ = new PerFieldAnalyzerWrapper(opennlpAnalyzer, analyMap);
+
String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));
String usehierarchy = properties.getProperty("opennlp.geoentitylinker.gaz.hierarchyfield", String.valueOf("0"));
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
index 2e2cb0c..0b2fff6 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
@@ -27,6 +27,7 @@
import java.util.HashMap;
import java.util.List;
import opennlp.addons.geoentitylinker.scoring.PlacetypeScorer;
+import opennlp.addons.geoentitylinker.scoring.ProvinceProximityScorer;
import opennlp.tools.entitylinker.BaseLink;
import opennlp.tools.entitylinker.LinkedSpan;
import opennlp.tools.util.Span;
@@ -89,7 +90,8 @@
}
}
/**
- * sort the data with the best score on top based on the sum of the scores below from the score map for each baselink object
+ * sort the data with the best score on top based on the sum of the scores
+ * below from the score map for each baselink object
*/
for (LinkedSpan<BaseLink> s : spans) {
ArrayList<BaseLink> linkedData = s.getLinkedEntries();
@@ -124,6 +126,7 @@
private void loadScorers() {
if (scorers.isEmpty()) {
+ scorers.add(new ProvinceProximityScorer());
scorers.add(new GeoHashBinningScorer());
scorers.add(new CountryProximityScorer());
scorers.add(new ModelBasedScorer());
@@ -132,7 +135,6 @@
}
}
-
@Override
public void init(EntityLinkerProperties properties) throws IOException {
try {
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PlacetypeScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PlacetypeScorer.java
index 6a30f18..f6f9fa7 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PlacetypeScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PlacetypeScorer.java
@@ -40,12 +40,12 @@
@Override
public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) {
- for(LinkedSpan<GazetteerEntry> geospan : linkedSpans){
+ for (LinkedSpan<GazetteerEntry> geospan : linkedSpans) {
ArrayList<GazetteerEntry> linkedEntries = geospan.getLinkedEntries();
for (GazetteerEntry gazetteerEntry : linkedEntries) {
String type = gazetteerEntry.getItemType().toLowerCase();
Double score = getScore(type);
- if(score==null){
+ if (score == null) {
score = 0d;
}
gazetteerEntry.getScoreMap().put("typescore", score);
@@ -63,12 +63,14 @@
for (String type : boosts) {
if (type.equals("PCLI")) {
boosetedTypes.put(type.toLowerCase(), 1d);
- } else if (type.startsWith("PCL") && !type.equals("PCLI")) {
+ } else if (type.startsWith("P") && !type.equals("PCLI")) {
boosetedTypes.put(type.toLowerCase(), .5d);
} else if (type.startsWith("ADM")) {
boosetedTypes.put(type.toLowerCase(), .75d);
}
+
}
+ boosetedTypes.put("pplc", .9);
}
}
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java
index f3199a1..afdb4b1 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java
@@ -34,7 +34,7 @@
* heuristic that toponymn mentions are more likely close to their parent
* province mentions. For instance, if the toponym Berlin is mentioned near an
* indicator of Connecticut, it is more likely to be Berlin Connecticut than
- * Berlin Germany (if Germany did not exist in, or is mentioned further down in,
+ * Berlin Germany (if Germany is not mentioned in, or is mentioned further away in,
* the article).
*
*
@@ -51,7 +51,7 @@
} else {
for (LinkedSpan<BaseLink> span : linkedSpans) {
for (BaseLink link : span.getLinkedEntries()) {
- link.getScoreMap().put("provincecontext", Double.NaN);
+ link.getScoreMap().put("provincecontext", 0d);
}
}
}
@@ -163,7 +163,7 @@
if (scoreMap.containsKey(spanCountryCode)) {
score = scoreMap.get(spanCountryCode);
- ///does the name extracted match a country name?
+ ///does the name extracted match a province name?
if (nameCodesMap.containsKey(entry.getItemName().toLowerCase())) {
//if so, is it the correct country code for that name?
if (nameCodesMap.get(entry.getItemName().toLowerCase()).contains(entry.getProvinceCode())) {
@@ -183,8 +183,8 @@
}
/**
- * takes a map of distances from the toponym to each country mention and
- * generates a map of scores for each country code. The map is then correlated
+ * takes a map of distances from the toponym to each province mention and
+ * generates a map of scores for each province code. The map is then correlated
* to the code of the BaseLink parentid for retrieval. Then the score is added
* to the overall list.
*
@@ -232,7 +232,7 @@
* together to smooth out the average, so one distant outlier does not kill
* the score for an obviously good hit. More elegant solution is possible
* using Math.pow, and making the score decay with distance by using an
- * increasing negative exponent (I think)
+ * increasing negative exponent
*
* @param normDis the normalized and sorted set of distances as a list
* @return