OPENNLP-637
Greatly simplified point clustering, and ensured no duplication is created.
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java
index 338ce64..beca793 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java
@@ -16,12 +16,8 @@
 package opennlp.addons.geoentitylinker;

 

 import java.util.ArrayList;

-import java.util.HashMap;

 import java.util.List;

 import java.util.Map;

-import java.util.Set;

-import java.util.TreeMap;

-import java.util.TreeSet;

 import opennlp.tools.entitylinker.EntityLinkerProperties;

 import opennlp.tools.entitylinker.domain.BaseLink;

 import opennlp.tools.entitylinker.domain.LinkedSpan;

@@ -35,242 +31,30 @@
  */

 public class GeoHashBinningScorer implements LinkedEntityScorer<CountryContext> {

 

+  private final PointClustering CLUSTERER = new PointClustering();

+  private int PRECISION = 4;

+

   @Override

   public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) {

-    score(linkedSpans);

-  }

-

-  private void score(List<LinkedSpan> geospans) {

-    Map<Double, Double> latLongs = new HashMap<Double, Double>();

+     //Map<Double, Double> latLongs = new HashMap<Double, Double>();

+    List<GazateerEntry> allGazEntries = new ArrayList<>();

 

     /**

      * collect all the lat longs

      */

-    for (LinkedSpan<BaseLink> ls : geospans) {

+    for (LinkedSpan<BaseLink> ls : linkedSpans) {

       for (BaseLink bl : ls.getLinkedEntries()) {

         if (bl instanceof GazateerEntry) {

-          GazateerEntry entry = (GazateerEntry) bl;

-          latLongs.put(entry.getLatitude(), entry.getLongitude());

-

+          allGazEntries.add((GazateerEntry) bl);

         }

       }

     }

-

     /**

-     * convert to geohash and add to sortedset

+     * use the point clustering to score each hit

      */

-    TreeSet<Long> geoHashes = new TreeSet<Long>();

-    for (Map.Entry<Double, Double> entry : latLongs.entrySet()) {

-      geoHashes.add(geoHash(entry.getKey(), entry.getValue()));

-    }

-    /**

-     * bin the points and generate a scoremap

-     */

-    Map<Long, Set<Long>> bins = bin(geoHashes);

-    Map<Long, Double> scores = getScore((TreeMap<Long, Set<Long>>) bins);

-    /**

-     * iterate over the data again and assign the score based on the bins

-     */

-    for (LinkedSpan<BaseLink> ls : geospans) {

-      for (BaseLink bl : ls.getLinkedEntries()) {

-        Long geohash = -1L;

-        Double score = 0d;

-        if (bl instanceof GazateerEntry) {

-          GazateerEntry entry = (GazateerEntry) bl;

-          geohash = geoHash(entry.getLatitude(), entry.getLongitude());

-

-        }

-        if (scores.containsKey(geohash)) {

-          score = scores.get(geohash);

-

-        } else {

-          for (Long bin : bins.keySet()) {

-            if (bin == geohash || bins.get(bin).contains(geohash)) {

-              score = scores.get(bin);

-              break;

-            }

-          }

-        }

-        bl.getScoreMap().put("geohashbin", score);

-      }

-    }

-

+    Map<String, List<GazateerEntry>> cluster = CLUSTERER.cluster(allGazEntries, PRECISION);

+    CLUSTERER.scoreClusters(cluster);

 

   }

 

-  private Long normalize(Double coordpart, Boolean isLat) {

-    Integer add = isLat ? 90 : 180;

-    coordpart = Math.abs(coordpart + add);

-    coordpart = coordpart * 1000000;

-

-    Long l = Math.round(coordpart);

-    String coord = String.valueOf(l);

-    if (coord.length() < 8) {

-      while (coord.length() < 8) {

-        coord += "0";

-      }

-    }

-    coord = coord.substring(0, 8);

-    l = Long.valueOf(coord);

-    return l;

-  }

-

-  /**

-   * interleaves a lat and a long to place the coordinate in linear sortable

-   * space for binning simplicity

-   *

-   * @param lat

-   * @param lon

-   * @return

-   */

-  private Long geoHash(double lat, double lon) {

-    Long normLat = normalize(lat, Boolean.TRUE);

-    Long normLon = normalize(lon, Boolean.FALSE);

-    String sLat = String.valueOf(normLat);

-    String sLon = String.valueOf(normLon);

-    char[] latInts = sLat.toCharArray();

-    char[] lonInts = sLon.toCharArray();

-    String geoHash = "";

-    int len = latInts.length > lonInts.length ? lonInts.length : latInts.length;

-    for (int i = 0; i < len - 1; i++) {

-      String a = String.valueOf(latInts[i]);

-      String b = String.valueOf(lonInts[i]);

-      geoHash += a + b;

-    }

-

-    return Long.valueOf(geoHash);

-  }

-

-  private Map<Long, Set<Long>> bin(TreeSet<Long> sets) {

-    ArrayList<Long> list = new ArrayList<Long>(sets);

-    ArrayList<Long> diffs = new ArrayList<Long>();

-    /**

-     * create a set of differences between the points

-     */

-    for (int i = 0; i < list.size() - 1; i++) {

-      Long n = list.get(i + 1);

-      Long v = list.get(i);

-      diffs.add(Math.abs(n - v));

-    }

-    /**

-     * generate an average "distance" between the normed points

-     */

-    Long sum = 0L;

-    for (Long l : diffs) {

-      sum += l;

-    }

-    Long avg = sum;

-    if (!diffs.isEmpty()) {

-      avg = sum / diffs.size();

-    }

-

-

-    /**

-     * generate break values where the disparity is greater than the average

-     */

-    TreeSet<Long> breaks = new TreeSet<Long>();

-    for (int i = 0; i < list.size() - 1; i++) {

-      Long n = list.get(i + 1);

-      Long v = list.get(i);

-      //Long percent = 100 - (v / n * 100);

-      Long diff = n - v;

-      if (diff > avg) {

-        breaks.add(v);

-      }

-    }

-    /**

-     * based on the break values, place subsets of close points into bins

-     */

-    TreeMap<Long, Set<Long>> binToAmount = new TreeMap<Long, Set<Long>>();

-    Long lastBreak = -1L;

-    for (Long br : breaks) {

-      if (lastBreak == -1L) {

-        binToAmount.put(br, sets.subSet(0L, true, br, true));

-      } else {

-        binToAmount.put(br, sets.subSet(lastBreak, false, br, true));

-      }

-      lastBreak = br;

-    }

-    lastBreak = sets.higher(lastBreak);

-    if (lastBreak != null) {

-      binToAmount.put(lastBreak, sets.subSet(lastBreak, true, sets.last(), true));

-      if (binToAmount.get(lastBreak).isEmpty()) {

-        binToAmount.get(lastBreak).add(lastBreak);

-      }

-    }

-    /**

-     * "binToAmount" is a map of the break value to all the points behind it

-     * (it's sorted), so the key is the max value of its set of values

-     */

-    return binToAmount;

-  }

-

-  /**

-   * returns a map of geohashes and their score

-   *

-   * @param binToAmount

-   * @return Map< Geohash, score>

-   */

-  private Map<Long, Double> getScore(TreeMap<Long, Set<Long>> binToAmount) {

-    TreeMap<Long, Double> ranks = new TreeMap<Long, Double>();

-    TreeMap<Long, Double> normRanks = new TreeMap<Long, Double>();

-    /**

-     * if there is only one bin return 1 as the rank for each item in the value

-     */

-    if (binToAmount.keySet().size() == 1 || binToAmount.keySet().isEmpty()) {

-      for (Long bin : binToAmount.keySet()) {

-        for (Long hash : binToAmount.get(bin)) {

-          ranks.put(bin, 1d);

-        }

-      }

-      return ranks;

-    }

-    int total = 0;

-    /**

-     * generate a total number of points

-     */

-    for (Set<Long> geohashes : binToAmount.values()) {

-      total += geohashes.size();

-    }

-    /**

-     * divide total by bin size, largest bin size gets best score, everything in

-     * that bin gets that score because it is part of that primary cluster

-     * TODO... do an extra iteration of clustering within the predominant

-     * cluster to refine the scoring or make the basis of the binning more

-     * granular than > avg

-     */

-    TreeSet<Double> rankSet = new TreeSet<Double>();

-    for (Long key : binToAmount.keySet()) {

-      int size = binToAmount.get(key).size();

-      Double rank = (double) total / size;

-      rankSet.add(rank);

-      ranks.put(key, rank);

-    }

-    /**

-     * load the final score map with normalized values

-     */

-    for (Map.Entry<Long, Double> rank : ranks.entrySet()) {

-      double norm = normalize(rank.getValue(), rankSet.first() + .1, rankSet.last() + .1);

-      double reverse = Math.abs(norm - 1);

-      double score = reverse > 1d ? 1.0 : reverse;

-      normRanks.put(rank.getKey(), score);

-    }

-

-    return normRanks;

-  }

-

-  /**

-   * transposes a number in a range to a double between 0 and 1

-   *

-   * @param valueToNormalize the value to be normalized (placed within a new

-   *                         range of 0-1)

-   * @param minimum          the min of the current range

-   * @param maximum          the max of the current range

-   * @return

-   */

-  private Double normalize(Double valueToNormalize, Double minimum, Double maximum) {

-    Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - minimum) + 0;

-    d = d == null ? 0d : d;

-    return d;

-  }

 }

diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/PointClustering.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/PointClustering.java
new file mode 100644
index 0000000..648f3d1
--- /dev/null
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/PointClustering.java
@@ -0,0 +1,113 @@
+/*

+ * Copyright 2014 The Apache Software Foundation.

+ *

+ * Licensed under the Apache License, Version 2.0 (the "License");

+ * you may not use this file except in compliance with the License.

+ * You may obtain a copy of the License at

+ *

+ *      http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package opennlp.addons.geoentitylinker;

+

+import java.util.ArrayList;

+import java.util.HashMap;

+import java.util.List;

+import java.util.Map;

+

+/**

+ *

+ * Clusters a list of lat long points using a simple geohashing approach

+ */

+public class PointClustering {

+

+  /**

+   * Clusters a set of points from the gazateers. The idea is that locations

+   * that matched a name that are closer to each other, the more likely the

+   * toponym is to be accurate

+   *

+   * @param entries

+   * @param precision

+   * @return

+   */

+  public Map<String, List<GazateerEntry>> cluster(List<GazateerEntry> entries, int precision) {

+    Map<String, List<GazateerEntry>> map = new HashMap<>();

+    for (int i = 0; i < entries.size(); i++) {

+      GazateerEntry entry = entries.get(i);

+      Double latw = entry.getLatitude();

+      Double lonw = entry.getLongitude();

+

+

+      String key = simpleGeohash(latw, lonw).substring(0, precision);

+      if (map.containsKey(key)) {

+        map.get(key).add(entry);

+      } else {

+        List<GazateerEntry> newlist = new ArrayList<>();

+        newlist.add(entry);

+        map.put(key, newlist);

+      }

+    }

+    return map;

+  }

+

+  public void scoreClusters(Map<String, List<GazateerEntry>> clusters) {

+    Double min = 0d;

+    Double max = -1d;

+    for (String key : clusters.keySet()) {

+      int size = clusters.get(key).size();

+      if (size > max) {

+        max = Double.valueOf(size);

+      }

+    }

+    for (String key : clusters.keySet()) {

+      int size = clusters.get(key).size();

+      Double score = normalize(Double.valueOf(size), min, max);

+      for (GazateerEntry entry : clusters.get(key)) {

+        entry.getScoreMap().put("geohashbin", score);

+      }

+    }

+

+

+  }

+

+  /**

+   * Hashes a lat long based on adding 90 or 180 and then interlarding lat lon

+   * chars. reduces a set of points to a sortable set

+   *

+   * @param lat

+   * @param lon

+   * @return

+   */

+  public String simpleGeohash(Double lat, Double lon) {

+    String geoHash = "";

+    lat = lat + 90;

+    lon = lon + 180;

+    String latString = String.valueOf(lat);

+    String lonString = String.valueOf(lon);

+    int length = latString.length() > lonString.length() ? lonString.length() : latString.length();

+    while (length < 12) {

+      latString += "0";

+      lonString += "0";

+      length++;

+    }

+    latString = latString.substring(0, 10);

+    lonString = lonString.substring(0, 10);

+    char[] latChars = latString.toCharArray();

+    char[] lonChars = lonString.toCharArray();

+

+    for (int i = 0; i < latChars.length; i++) {

+      geoHash += String.valueOf(latChars[i]) + String.valueOf(lonChars[i]);

+    }

+    return geoHash;

+  }

+

+  private Double normalize(Double valueToNormalize, Double minimum, Double maximum) {

+    Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - minimum) + 0;

+    return d;

+  }

+}