OPENNLP-706
Significant fix to the USGS indexing so that state names are properly discovered and weighted. Also added a placename Dice coefficient (over bigrams) to the descending sort.
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
index a416136..f62951d 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
@@ -50,7 +50,6 @@
*/
public class GazetteerSearcher {
- //private static final String boostedTerms = " AND loctype(ADM1^1 ADM1H^1 ADM2^1 ADM2H^1 ADM3^1 ADM3H^1 ADM4^1 ADM4H^1 ADM5^1 ADMD^1 ADMDH^1 PCLD^1 PCLH^1 PCLI^1 PCLIX^1 TERR^1 PCLIX^1 PPL^1 PPLA^1 PPLA2^1 PPLA3^1 PPLA4^1 PPLC^1 PPLCH^1 PPLF^1 PPLG^1 PPLH^1 PPLL^1 PPLQ^1 PPLR^1 PPLS^1 PPLX^1 STLMT^1) ";
private final String REGEX_CLEAN = "[^\\p{L}\\p{Nd}]";
private static final Logger LOGGER = Logger.getLogger(GazetteerSearcher.class);
private double scoreCutoff = .70;
@@ -74,7 +73,7 @@
try {
boolean b = Boolean.valueOf("true");
- new GazetteerSearcher(new EntityLinkerProperties(new File("c:\\temp\\entitylinker.properties"))).find("italy", 5, " countrycode:it AND gazsource:geonames");
+ new GazetteerSearcher(new EntityLinkerProperties(new File("c:\\temp\\entitylinker.properties"))).find("alabama", 5, " countrycode:us AND gazsource:usgs");
} catch (IOException ex) {
java.util.logging.Logger.getLogger(GazetteerSearcher.class.getName()).log(Level.SEVERE, null, ex);
} catch (Exception ex) {
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
index e6ffea8..2e2cb0c 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
@@ -106,7 +106,7 @@
for (String object : o1scoreMap.keySet()) {
if (object.equals("typescore")
|| object.equals("countrycontext")
- || object.equals("normlucene")
+ || object.equals("placenamedicecoef")
|| object.equals("geohashbin")) {
sumo1 += o1scoreMap.get(object);
sumo2 += o2scoreMap.get(object);
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
index 274e2e2..980eabe 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
@@ -57,6 +57,7 @@
public static void readFile(File gazateerInputData, IndexWriter w, GazetteerIndexer.GazType type, Map<String, AdminBoundary> lookupMap) throws Exception {
+ Map<String, StateCentroid> states = new HashMap<>();
BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));
List<String> fields = new ArrayList<>();
int counter = 0;
@@ -93,7 +94,21 @@
countyCode = get.getCountyCode();
}
String hierarchy = get.getCountryName() + ", " + get.getProvinceName() + ", " + countyname + ", " + placeName;
- // doc.add(new TextField("countryname", "united states", Field.Store.YES));
+
+ // Fold this place into its state's running totals; both coordinates are parsed first so a malformed value changes nothing.
+ double parsedLat = Double.valueOf(lat);
+ double parsedLon = Double.valueOf(lon);
+ StateCentroid stateTally = states.get(get.getProvinceName());
+ if (stateTally == null) {
+   // first place seen for this state: its province code becomes the state's code
+   stateTally = new StateCentroid();
+   stateTally.statecode = get.getProvCode();
+   states.put(get.getProvinceName(), stateTally);
+ }
+ stateTally.count++;
+ stateTally.latSum += parsedLat;
+ stateTally.longSum += parsedLon;
+
doc.add(new TextField("hierarchy", hierarchy, Field.Store.YES));
doc.add(new TextField("placename", placeName, Field.Store.YES));
doc.add(new TextField("latitude", lat, Field.Store.YES));
@@ -114,10 +129,56 @@
}
}
+
+
+ // Add one "adm1" document per state collected in the states map.
+ for (Map.Entry<String, StateCentroid> stateEntry : states.entrySet()) {
+   String state = stateEntry.getKey();
+   StateCentroid totals = stateEntry.getValue();
+   Document doc = new Document();
+   doc.add(new TextField("hierarchy", "united states, " + state, Field.Store.YES));
+   doc.add(new TextField("placename", state, Field.Store.YES));
+   // the state's position is the mean of every coordinate accumulated for it
+   doc.add(new TextField("latitude", String.valueOf(totals.latSum / totals.count), Field.Store.YES));
+   doc.add(new TextField("longitude", String.valueOf(totals.longSum / totals.count), Field.Store.YES));
+   doc.add(new StringField("loctype", "adm1", Field.Store.YES));
+   doc.add(new StringField("admincode", totals.statecode, Field.Store.YES));
+   doc.add(new StringField("countrycode", "us", Field.Store.YES));
+   doc.add(new StringField("countycode", "", Field.Store.YES));
+
+   doc.add(new StringField("locid", "us_state:" + state, Field.Store.YES));
+   doc.add(new StringField("gazsource", "usgs", Field.Store.YES));
+   w.addDocument(doc);
+ }
+ Document doc = new Document(); // single country-level ("pcli") entry for the united states
+ doc.add(new TextField("hierarchy", "united states", Field.Store.YES));
+ doc.add(new TextField("placename", "united states", Field.Store.YES));
+ //fixed, hard-coded coordinates are used here; unlike the state documents, nothing is computed from the indexed points
+ doc.add(new TextField("latitude", 39.0 + "", Field.Store.YES));
+ doc.add(new TextField("longitude", -103.0 + "", Field.Store.YES));
+ doc.add(new StringField("loctype", "pcli", Field.Store.YES));
+ doc.add(new StringField("admincode", "", Field.Store.YES));
+ doc.add(new StringField("countrycode", "us", Field.Store.YES));
+ doc.add(new StringField("countycode", "", Field.Store.YES));
+
+ doc.add(new StringField("locid", "us_centroid" + "unitedstates", Field.Store.YES));
+ doc.add(new StringField("gazsource", "usgs", Field.Store.YES));
+ //System.out.println("uscentroid," + (sumofLatSums / sumOfCounts) + "," + (sumofLonSums / sumOfCounts));
+
+ w.addDocument(doc);
w.commit();
+
System.out.println("Completed indexing USGS gaz!");
}
+ private static class StateCentroid {
+ // Running totals for one state; readFile divides the sums by count to place the state's document.
+ double latSum; // sum of the latitudes of the places added so far
+ double longSum; // sum of the longitudes of the places added so far
+ String statecode; // province code, set from the first place seen for the state
+ int count; // number of places folded into the two sums
+ }
+
private static Map<String, AdminBoundary> getProvData(File govUnitsFile, GazetteerIndexer.GazType type) {
System.out.println("Attempting to read USGS province (State) data from: " + govUnitsFile.getPath());
Map<String, AdminBoundary> outmap = new HashMap<>();
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java
index ce1bf45..e9634d9 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java
@@ -42,9 +42,15 @@
if (hierarchy != null) {
Double dice = getDiceCoefficient(linkedSpan.getSearchTerm().toLowerCase(), hierarchy.toLowerCase(), 2);
link.getScoreMap().put("hierarchydicecoef", dice);
- Double ld = (double) getLevenshteinDistance(linkedSpan.getSearchTerm().toLowerCase(), hierarchy.toLowerCase().toLowerCase());
+ Double ld = (double) getLevenshteinDistance(linkedSpan.getSearchTerm().toLowerCase(), hierarchy.toLowerCase());
link.getScoreMap().put("hierarchylevenshtein", ld);
}
+ // Null-check the item name before lower-casing it, so an entry without a name is skipped rather than throwing.
+ String placename = entry.getItemName();
+ if (placename != null) {
+   Double dice = getDiceCoefficient(linkedSpan.getSearchTerm().toLowerCase(), placename.toLowerCase(), 2);
+   link.getScoreMap().put("placenamedicecoef", dice);
+ }
}
}
}