blob: a8677e4c24e1b71762c2ad589efd68f5edee90d7 [file] [log] [blame]
/*
* Copyright 2014 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.addons.geoentitylinker.scoring;
import com.spatial4j.core.context.SpatialContext;
import com.spatial4j.core.io.GeohashUtils;
import com.spatial4j.core.shape.Point;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import opennlp.addons.geoentitylinker.GazetteerEntry;
/**
 * Clusters a list of lat/long points using a simple geohashing/boxing
 * approach: every point is encoded as a geohash, and points whose hashes
 * share the same leading characters are placed in the same cluster.
 */
public class PointClustering {

    /** Key under which the cluster score is stored in each entry's score map. */
    private static final String SCORE_KEY = "geohashbin";

    /**
     * Clusters a set of points from the gazetteers. The idea is that the
     * closer together the locations that matched a name are, the more likely
     * the toponym is to be accurate.
     *
     * @param entries the gazetteer entries to cluster; entries whose latitude
     * or longitude is {@code null} cannot be hashed and are left out of the
     * result
     * @param precision the number of leading geohash characters used as the
     * cluster key; a smaller value yields larger boxes. Values outside the
     * range {@code [0, hash length]} are clamped to that range instead of
     * raising a {@link StringIndexOutOfBoundsException}
     * @return a map from geohash prefix to the entries that fall in that box
     */
    public Map<String, List<GazetteerEntry>> cluster(List<GazetteerEntry> entries, int precision) {
        Map<String, List<GazetteerEntry>> map = new HashMap<>();
        for (GazetteerEntry entry : entries) {
            Double lat = entry.getLatitude();
            Double lon = entry.getLongitude();
            if (lat == null || lon == null) {
                // No coordinates means no geohash; skip rather than fail the whole run.
                continue;
            }
            String hash = geoHash(lat, lon);
            // The encoded hash has a fixed length (Spatial4j's two-argument
            // encodeLatLon is expected to produce 12 characters), so a larger
            // requested precision must be capped before taking the prefix.
            int prefixLength = Math.max(0, Math.min(precision, hash.length()));
            String key = hash.substring(0, prefixLength);
            List<GazetteerEntry> bucket = map.get(key);
            if (bucket == null) {
                bucket = new ArrayList<>();
                map.put(key, bucket);
            }
            bucket.add(entry);
        }
        return map;
    }

    /**
     * Scores every entry by the relative size of the cluster it belongs to.
     * The score is the cluster's size divided by the size of the largest
     * cluster, so entries in the largest cluster receive 1.0. The score is
     * written into each entry's score map under the key {@code "geohashbin"}.
     *
     * @param clusters the clusters as produced by {@link #cluster(List, int)};
     * {@code null} or empty cluster lists are ignored
     */
    public void scoreClusters(Map<String, List<GazetteerEntry>> clusters) {
        int largest = 0;
        for (List<GazetteerEntry> members : clusters.values()) {
            if (members != null && members.size() > largest) {
                largest = members.size();
            }
        }
        if (largest == 0) {
            // Nothing to score, and normalizing against a zero range is undefined.
            return;
        }
        for (List<GazetteerEntry> members : clusters.values()) {
            if (members == null || members.isEmpty()) {
                continue;
            }
            Double score = normalize((double) members.size(), 0d, largest);
            for (GazetteerEntry entry : members) {
                entry.getScoreMap().put(SCORE_KEY, score);
            }
        }
    }

    /**
     * Returns the geohash of a point, encoded with Spatial4j's
     * {@code GeohashUtils}.
     *
     * @param lat the input latitude (Y); must not be {@code null}
     * @param lon the input longitude (X); must not be {@code null}
     * @return the geohash string for the point
     */
    public String geoHash(Double lat, Double lon) {
        return GeohashUtils.encodeLatLon(lat, lon);
    }

    /**
     * Decodes a geohash into its X and Y coordinates.
     *
     * @param geohash the geohash to decode
     * @return a two-element array: element 0 is the X (longitude), element 1
     * is the Y (latitude)
     */
    public double[] geoHashToPoint(String geohash) {
        Point decoded = GeohashUtils.decode(geohash, SpatialContext.GEO);
        return new double[]{decoded.getX(), decoded.getY()};
    }

    /**
     * Decodes a geohash into a comma-separated coordinate string.
     *
     * @param geohash the geohash to decode
     * @return the decoded point formatted as {@code "X,Y"}, i.e. longitude
     * first, then latitude
     */
    public String geoHashToPointStr(String geohash) {
        Point decoded = GeohashUtils.decode(geohash, SpatialContext.GEO);
        return decoded.getX() + "," + decoded.getY();
    }

    /**
     * Linearly rescales a value from the range {@code [minimum, maximum]}
     * into {@code [0, 1]}.
     *
     * @param valueToNormalize the value to rescale
     * @param minimum the lower bound of the source range
     * @param maximum the upper bound of the source range
     * @return the rescaled value, or 0 when the source range is empty
     */
    private Double normalize(Double valueToNormalize, double minimum, double maximum) {
        double range = maximum - minimum;
        if (range == 0d) {
            // Avoid NaN/Infinity from dividing by an empty range.
            return 0d;
        }
        return (valueToNormalize - minimum) / range;
    }
}