OPENNLP-615
Greatly simplified fuzzy string match scoring by simply normalizing the Levenshtein output from Lucene, and fixed a bug in the filtering of hits below the threshold. Refined the deduplication logic a bit, and made the default bag-of-words radius for doccat larger, which improved scores in testing.
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearcher.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearcher.java
index ca63a3a..f6fee16 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearcher.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearcher.java
@@ -18,7 +18,6 @@
 import java.io.File;

 import java.io.IOException;

 import java.util.ArrayList;

-import java.util.Iterator;

 import java.util.List;

 import org.apache.lucene.analysis.Analyzer;

 import org.apache.lucene.analysis.standard.StandardAnalyzer;

@@ -39,11 +38,14 @@
 

 /**

  *

- * Searches Gazateers stored in a MMapDirectory Lucene index

+ * Searches Gazateers stored in a MMapDirectory Lucene index. The structure of

+ * these indices are based on loading the indexes using the

+ * GeoEntityLinkerSetupUtils

+ *

  */

 public class GazateerSearcher {

 

-  private double scoreCutoff = .75;

+  private double scoreCutoff = .90;

   private Directory geonamesIndex;//= new MMapDirectory(new File(indexloc));

   private IndexReader geonamesReader;// = DirectoryReader.open(geonamesIndex);

   private IndexSearcher geonamesSearcher;// = new IndexSearcher(geonamesReader);

@@ -94,7 +96,6 @@
       Query q = parser.parse(luceneQueryString);

 

       TopDocs search = geonamesSearcher.search(q, rowsReturned);

-      double maxScore = (double) search.getMaxScore();

 

       for (int i = 0; i < search.scoreDocs.length; ++i) {

         GazateerEntry entry = new GazateerEntry();

@@ -103,7 +104,7 @@
 

         entry.getScoreMap().put("lucene", sc);

 

-        entry.getScoreMap().put("rawlucene", sc);

+

         entry.setIndexID(docId + "");

         entry.setSource("geonames");

 

@@ -140,24 +141,34 @@
           }

           entry.getIndexData().put(fields.get(idx).name(), value);

         }

-        //only keep it if the country code is a match. even when the code is passed in as a weighted condition, there is no == equiv in lucene

-        if (entry.getItemParentID().toLowerCase().equals(code.toLowerCase())) {

-          if (!linkedData.contains(entry)) {

-            linkedData.add(entry);

+        /**

+         * norm the levenstein distance

+         */

+        Double normLev = Double.valueOf(searchString.length()) / Double.valueOf(entry.getItemName().length());

+        /**

+         * only want hits above the levenstein thresh

+         */

+        if (normLev.compareTo(scoreCutoff) >= 0) {

+          //only keep it if the country code is a match. even when the code is passed in as a weighted condition, there is no == equiv in lucene

+

+          if (entry.getItemParentID().toLowerCase().equals(code.toLowerCase())) {

+            entry.getScoreMap().put("normlucene", normLev);

+            //make sure we don't produce a duplicate

+            if (!linkedData.contains(entry)) {

+              linkedData.add(entry);

+              /**

+               * add the records to the cache for this query

+               */

+              GazateerSearchCache.put(luceneQueryString, linkedData);

+            }

           }

         }

       }

-      if (!linkedData.isEmpty()) {

-        normalize(linkedData, 0d, maxScore);

-        prune(linkedData);

-      }

+

     } catch (IOException | ParseException ex) {

       System.err.println(ex);

     }

-    /**

-     * add the records to the cache for this query

-     */

-    GazateerSearchCache.put(luceneQueryString, linkedData);

+

     return linkedData;

   }

 

@@ -188,8 +199,6 @@
       Query q = parser.parse(luceneQueryString);

 

       TopDocs search = usgsSearcher.search(q, rowsReturned);

-      double maxScore = (double) search.getMaxScore();

-

       for (int i = 0; i < search.scoreDocs.length; i++) {

         GazateerEntry entry = new GazateerEntry();

         int docId = search.scoreDocs[i].doc;

@@ -197,7 +206,6 @@
         //keep track of the min score for normalization

 

         entry.getScoreMap().put("lucene", sc);

-        entry.getScoreMap().put("rawlucene", sc);

         entry.setIndexID(docId + "");

         entry.setSource("usgs");

         entry.setItemParentID("us");

@@ -225,66 +233,36 @@
           }

           entry.getIndexData().put(fields.get(idx).name(), value);

         }

-        if (!linkedData.contains(entry)) {

-          linkedData.add(entry);

+        /**

+         * norm the levenstein distance

+         */

+        Double normLev = Double.valueOf(searchString.length()) / Double.valueOf(entry.getItemName().length());

+        /**

+         * only want hits above the levenstein thresh

+         */

+        if (normLev.compareTo(scoreCutoff) >= 0) {

+          //only keep it if the country code is a match. even when the code is passed in as a weighted condition, there is no == equiv in lucene

+

+          entry.getScoreMap().put("normlucene", normLev);

+          //make sure we don't produce a duplicate

+          if (!linkedData.contains(entry)) {

+            linkedData.add(entry);

+            /**

+             * add the records to the cache for this query

+             */

+            GazateerSearchCache.put(luceneQueryString, linkedData);

+          }

         }

+

       }

-      if (!linkedData.isEmpty()) {

-        normalize(linkedData, 0d, maxScore);

-        prune(linkedData);

-      }

+

     } catch (IOException | ParseException ex) {

       System.err.println(ex);

     }

-    /**

-     * add the records to the cache for this query

-     */

-    GazateerSearchCache.put(luceneQueryString, linkedData);

+

     return linkedData;

   }

 

-  private void normalize(ArrayList<GazateerEntry> linkedData, Double minScore, Double maxScore) {

-    for (GazateerEntry gazateerEntry : linkedData) {

-

-      double luceneScore = gazateerEntry.getScoreMap().get("lucene");

-      luceneScore = normalize(luceneScore, minScore, maxScore);

-      luceneScore = luceneScore > 1.0 ? 1.0 : luceneScore;

-      luceneScore = (luceneScore == Double.NaN) ? 0.001 : luceneScore;

-      gazateerEntry.getScoreMap().put("lucene", luceneScore);

-    }

-  }

-

-  /**

-   * gets rid of entries that are below the score thresh

-   *

-   * @param linkedData

-   */

-  private void prune(ArrayList<GazateerEntry> linkedData) {

-    for (Iterator<GazateerEntry> itr = linkedData.iterator(); itr.hasNext();) {

-      GazateerEntry ge = itr.next();

-      /**

-       * throw away anything under the configured score thresh

-       */

-      if (ge.getScoreMap().get("lucene") < scoreCutoff) {

-        itr.remove();

-      }

-    }

-  }

-

-  /**

-   * normalizes the different levenstein scores returned from the query into a

-   *

-   * @param valueToNormalize the raw score

-   * @param minimum          the min of the range of scores

-   * @param maximum          the max of the range

-   * @return the normed score

-   */

-  private Double normalize(Double valueToNormalize, Double minimum, Double maximum) {

-    Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - minimum) + 0;

-    d = d == null ? 0d : d;

-    return d;

-  }

-

   private void init() throws Exception {

     if (usgsIndex == null) {

       String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.usgs", "");

@@ -292,7 +270,7 @@
         System.out.println("USGS Gaz location not found");

 

       }

-      String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", ".75");

+      String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));

       scoreCutoff = Double.valueOf(cutoff);

       usgsIndex = new MMapDirectory(new File(indexloc));

       usgsReader = DirectoryReader.open(usgsIndex);

diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
index 9900a2f..510d46e 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
@@ -75,11 +75,11 @@
           if (!countryMentions.keySet().isEmpty()) {

             for (String code : countryMentions.keySet()) {

               if (!code.equals("us")) {

-                geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 10, code));

+                geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 3, code));

               }

             }

           } else {

-            geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 10, ""));

+            geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 3, ""));

 

           }

 

@@ -115,7 +115,7 @@
 

   private void loadScorers() {

     if (scorers.isEmpty()) {

-      scorers.add(new FuzzyStringMatchScorer());

+    //  scorers.add(new FuzzyStringMatchScorer());

       scorers.add(new GeoHashBinningScorer());

       scorers.add(new CountryProximityScorer());

       scorers.add(new ModelBasedScorer());

diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java
index de8af7b..afd6c50 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java
@@ -17,11 +17,9 @@
 

 import java.io.BufferedOutputStream;

 import java.io.File;

-import java.io.FileInputStream;

 import java.io.FileOutputStream;

 import java.io.FileWriter;

 import java.io.IOException;

-import java.io.InputStream;

 import java.io.OutputStream;

 import java.util.ArrayList;

 import java.util.Collection;

@@ -35,10 +33,8 @@
 import opennlp.tools.entitylinker.EntityLinkerProperties;

 import opennlp.tools.util.ObjectStream;

 import opennlp.tools.util.PlainTextByLineStream;

-import static opennlp.addons.geoentitylinker.ModelBasedScorer.RADIUS;

+

 import opennlp.tools.cmdline.MarkableFileInputStreamFactory;

-import opennlp.tools.ml.model.DataIndexer;

-import opennlp.tools.util.InputStreamFactory;

 

 

 /**

@@ -46,7 +42,7 @@
  * Tools for setting up GeoEntityLinker gazateers and doccat scoring model

  */

 public class GeoEntityLinkerSetupUtils {

-

+  private static final int RADIUS = 200;

   public static ModelBasedScorer scorer;

 

   static {

@@ -108,7 +104,7 @@
     System.out.println("Building Doccat model...");

     DoccatModel model = null;

 

-    InputStream dataIn = new FileInputStream(annotationOutFile);

+   // InputStream dataIn = new FileInputStream(annotationOutFile);

     try {

 

     

diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java
index beca793..d290d8f 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java
@@ -40,7 +40,7 @@
     List<GazateerEntry> allGazEntries = new ArrayList<>();

 

     /**

-     * collect all the lat longs

+     * collect all the gaz entry references

      */

     for (LinkedSpan<BaseLink> ls : linkedSpans) {

       for (BaseLink bl : ls.getLinkedEntries()) {

diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java
index 1093165..1c7b422 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java
@@ -36,7 +36,7 @@
 

   DocumentCategorizerME documentCategorizerME;

   DoccatModel doccatModel;

-  public static final int RADIUS = 100;

+  public static final int RADIUS = 200;

   boolean modelexists = false;

 

   @Override