OPENNLP-706
Significant fix to the USGS indexing so that state names are properly discovered and weighted. Also added a placename Dice coefficient (computed over bigrams) to the descending sort, in place of the normalized Lucene score.
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
index a416136..f62951d 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
@@ -50,7 +50,6 @@
  */

 public class GazetteerSearcher {

 

-  //private static final String boostedTerms = " AND loctype(ADM1^1 ADM1H^1 ADM2^1 ADM2H^1 ADM3^1 ADM3H^1 ADM4^1 ADM4H^1 ADM5^1 ADMD^1 ADMDH^1 PCLD^1 PCLH^1 PCLI^1 PCLIX^1 TERR^1 PCLIX^1 PPL^1 PPLA^1 PPLA2^1 PPLA3^1 PPLA4^1 PPLC^1 PPLCH^1 PPLF^1 PPLG^1 PPLH^1 PPLL^1 PPLQ^1 PPLR^1 PPLS^1 PPLX^1 STLMT^1) ";

   private final String REGEX_CLEAN = "[^\\p{L}\\p{Nd}]";

   private static final Logger LOGGER = Logger.getLogger(GazetteerSearcher.class);

   private double scoreCutoff = .70;

@@ -74,7 +73,7 @@
     try {

       boolean b = Boolean.valueOf("true");

 

-      new GazetteerSearcher(new EntityLinkerProperties(new File("c:\\temp\\entitylinker.properties"))).find("italy", 5, " countrycode:it AND gazsource:geonames");

+      new GazetteerSearcher(new EntityLinkerProperties(new File("c:\\temp\\entitylinker.properties"))).find("alabama", 5, " countrycode:us AND gazsource:usgs");

     } catch (IOException ex) {

       java.util.logging.Logger.getLogger(GazetteerSearcher.class.getName()).log(Level.SEVERE, null, ex);

     } catch (Exception ex) {

diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
index e6ffea8..2e2cb0c 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
@@ -106,7 +106,7 @@
           for (String object : o1scoreMap.keySet()) {

             if (object.equals("typescore")

                     || object.equals("countrycontext")

-                    || object.equals("normlucene")

+                    || object.equals("placenamedicecoef")

                     || object.equals("geohashbin")) {

               sumo1 += o1scoreMap.get(object);

               sumo2 += o2scoreMap.get(object);

diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
index 274e2e2..980eabe 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
@@ -57,6 +57,7 @@
 

   public static void readFile(File gazateerInputData, IndexWriter w, GazetteerIndexer.GazType type, Map<String, AdminBoundary> lookupMap) throws Exception {

 

+    Map<String, StateCentroid> states = new HashMap<>();

     BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));

     List<String> fields = new ArrayList<>();

     int counter = 0;

@@ -93,7 +94,21 @@
           countyCode = get.getCountyCode();

         }

         String hierarchy = get.getCountryName() + ", " + get.getProvinceName() + ", " + countyname + ", " + placeName;

-       // doc.add(new TextField("countryname", "united states", Field.Store.YES));

+

+        if (states.containsKey(get.getProvinceName())) {

+          StateCentroid entry = states.get(get.getProvinceName());

+          entry.count++;

+          entry.latSum += Double.valueOf(lat);

+          entry.longSum += Double.valueOf(lon);

+        } else {

+          StateCentroid centroid = new StateCentroid();

+          centroid.statecode = get.getProvCode();

+          centroid.count = 1;

+          centroid.latSum = Double.valueOf(lat);

+          centroid.longSum = Double.valueOf(lon);

+          states.put(get.getProvinceName(), centroid);

+        }

+

         doc.add(new TextField("hierarchy", hierarchy, Field.Store.YES));

         doc.add(new TextField("placename", placeName, Field.Store.YES));

         doc.add(new TextField("latitude", lat, Field.Store.YES));

@@ -114,10 +129,56 @@
       }

 

     }

+   

+  

+    for (String state : states.keySet()) {

+      StateCentroid get = states.get(state);

+      Document doc = new Document();

+      doc.add(new TextField("hierarchy", "united states, " + state, Field.Store.YES));

+      doc.add(new TextField("placename", state, Field.Store.YES));

+      //calculate a centroid for all the points that were in the state

+      doc.add(new TextField("latitude", (get.latSum / get.count) + "", Field.Store.YES));

+      doc.add(new TextField("longitude", (get.longSum / get.count) + "", Field.Store.YES));

+      doc.add(new StringField("loctype", "adm1", Field.Store.YES));

+      doc.add(new StringField("admincode", get.statecode, Field.Store.YES));

+      doc.add(new StringField("countrycode", "us", Field.Store.YES));

+      doc.add(new StringField("countycode", "", Field.Store.YES));

+

+      doc.add(new StringField("locid", "us_state:" + state, Field.Store.YES));

+      doc.add(new StringField("gazsource", "usgs", Field.Store.YES));

+      w.addDocument(doc);

+      

+     // System.out.println(get.statecode + "," + (get.latSum / get.count) + "," + (get.longSum / get.count));

+    }

+    Document doc = new Document();

+    doc.add(new TextField("hierarchy", "united states", Field.Store.YES));

+    doc.add(new TextField("placename", "united states", Field.Store.YES));

+    //hard-coded approximate centroid for the united states as a whole (not calculated from the indexed points)

+    doc.add(new TextField("latitude", 39.0 + "", Field.Store.YES));

+    doc.add(new TextField("longitude", -103.0 + "", Field.Store.YES));

+    doc.add(new StringField("loctype", "pcli", Field.Store.YES));

+    doc.add(new StringField("admincode", "", Field.Store.YES));

+    doc.add(new StringField("countrycode", "us", Field.Store.YES));

+    doc.add(new StringField("countycode", "", Field.Store.YES));

+

+    doc.add(new StringField("locid", "us_centroid" + "unitedstates", Field.Store.YES));

+    doc.add(new StringField("gazsource", "usgs", Field.Store.YES));

+    //System.out.println("uscentroid," + (sumofLatSums / sumOfCounts) + "," + (sumofLonSums / sumOfCounts));

+

+    w.addDocument(doc);

     w.commit();

+

     System.out.println("Completed indexing USGS gaz!");

   }

 

+  private static class StateCentroid { // per-state accumulator that readFile uses to index one centroid document per state

+    // plain mutable holder: readFile reads and writes these fields directly while iterating over the gazetteer records

+    double latSum; // running sum of the latitudes of every record seen for the state

+    double longSum; // running sum of the longitudes of every record seen for the state

+    String statecode; // province code taken from the first record seen for the state; indexed as "admincode"

+    int count; // number of records accumulated; used as the divisor when averaging latSum and longSum

+  }

+

   private static Map<String, AdminBoundary> getProvData(File govUnitsFile, GazetteerIndexer.GazType type) {

     System.out.println("Attempting to read USGS province (State) data from: " + govUnitsFile.getPath());

     Map<String, AdminBoundary> outmap = new HashMap<>();

diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java
index ce1bf45..e9634d9 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java
@@ -42,9 +42,15 @@
           if (hierarchy != null) {

             Double dice = getDiceCoefficient(linkedSpan.getSearchTerm().toLowerCase(), hierarchy.toLowerCase(), 2);

             link.getScoreMap().put("hierarchydicecoef", dice);

-            Double ld = (double) getLevenshteinDistance(linkedSpan.getSearchTerm().toLowerCase(), hierarchy.toLowerCase().toLowerCase());

+            Double ld = (double) getLevenshteinDistance(linkedSpan.getSearchTerm().toLowerCase(), hierarchy.toLowerCase());

             link.getScoreMap().put("hierarchylevenshtein", ld);

           }

+          String placename = entry.getItemName().toLowerCase();

+           if (placename != null) {

+            Double dice = getDiceCoefficient(linkedSpan.getSearchTerm().toLowerCase(), placename, 2);

+            link.getScoreMap().put("placenamedicecoef", dice);

+            

+          }

         }

       }

     }