OPENNLP-630
Fixed println output to be friendlier to the CLI tool (and other callers). Also did some general cleanup, such as fixing spelling errors and making indexing changes.
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java
index fd7d1c2..3dbf5d1 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java
@@ -23,8 +23,8 @@
import java.util.Set;
import java.util.TreeSet;
import opennlp.tools.entitylinker.EntityLinkerProperties;
-import opennlp.tools.entitylinker.domain.BaseLink;
-import opennlp.tools.entitylinker.domain.LinkedSpan;
+import opennlp.tools.entitylinker.BaseLink;
+import opennlp.tools.entitylinker.LinkedSpan;
import opennlp.tools.util.Span;
/**
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/FuzzyStringMatchScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/FuzzyStringMatchScorer.java
index ee7eee9..36e2751 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/FuzzyStringMatchScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/FuzzyStringMatchScorer.java
@@ -19,8 +19,8 @@
import java.util.List;
import java.util.Set;
import opennlp.tools.entitylinker.EntityLinkerProperties;
-import opennlp.tools.entitylinker.domain.BaseLink;
-import opennlp.tools.entitylinker.domain.LinkedSpan;
+import opennlp.tools.entitylinker.BaseLink;
+import opennlp.tools.entitylinker.LinkedSpan;
import opennlp.tools.ngram.NGramGenerator;
import opennlp.tools.util.Span;
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerIndexer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerIndexer.java
deleted file mode 100644
index f9ca4cd..0000000
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerIndexer.java
+++ /dev/null
@@ -1,173 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.addons.geoentitylinker;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileReader;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.ar.ArabicAnalyzer;
-import org.apache.lucene.analysis.fa.PersianAnalyzer;
-import org.apache.lucene.analysis.ru.RussianAnalyzer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.analysis.th.ThaiAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.TextField;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.MMapDirectory;
-import org.apache.lucene.util.Version;
-
-/**
- *
- * Creates two lucene indexes, geonames and usgs for use in GeoEntityLinker
- */
-public class GazateerIndexer {
-
- public GazateerIndexer() {
- loadAnalyzerMap();
- }
- Map<String, Analyzer> languageAnalyzerMap = new HashMap<>();
-
- public static interface Separable {
-
- String getSeparator();
- }
-
- public enum GazType implements Separable {
-
- GEONAMES {
- @Override
- public String toString() {
- return "/opennlp_geoentitylinker_geonames_idx";
- }
-
- @Override
- public String getSeparator() {
- return "\t";
- }
- },
- USGS {
- @Override
- public String toString() {
- return "/opennlp_geoentitylinker_usgsgaz_idx";
- }
-
- @Override
- public String getSeparator() {
- return "\\|";
- }
- }
- }
-/**
- * indexes the USGS or Geonames gazateers.
- * @param outputIndexDir a DIRECTORY path where you would like to store the output lucene indexes
- * @param gazateerInputData the file, "as is" that was downloaded from the USGS and GEONAMES website
- * @param type indicates whether the data is USGS or GEONAMES format
- * @throws Exception
- */
- public void index(File outputIndexDir, File gazateerInputData, GazType type) throws Exception {
- if (!outputIndexDir.isDirectory()) {
- throw new IllegalArgumentException("outputIndexDir must be a directory.");
- }
-
- String indexloc = outputIndexDir + type.toString();
- Directory index = new MMapDirectory(new File(indexloc));
-
- Analyzer a = new StandardAnalyzer(Version.LUCENE_45);
- IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_45, a);
-
- IndexWriter w = new IndexWriter(index, config);
-
- readFile(gazateerInputData, w, type);
- w.commit();
- w.close();
-
- }
-
- public void readFile(File gazateerInputData, IndexWriter w, GazType type) throws Exception {
- BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));
- List<String> fields = new ArrayList<String>();
- int counter = 0;
- int langCodeIndex = 0;
- System.out.println("reading gazateer data from file...........");
- while (reader.read() != -1) {
- String line = reader.readLine();
- String[] values = line.split(type.getSeparator());
- if (counter == 0) {
- // build fields
- for (int i = 0; i < values.length; i++) {
- String columnName = values[i];
- fields.add(columnName.replace("»¿", "").trim());
- if (columnName.toLowerCase().equals("lc")) {
- langCodeIndex = i;
- }
- }
-
-
- } else {
- Document doc = new Document();
- for (int i = 0; i < fields.size() - 1; i++) {
- doc.add(new TextField(fields.get(i), values[i], Field.Store.YES));
- }
- if (type == GazType.GEONAMES) {
- /**
- * see if the map contains a language specific analyzer
- */
- if (languageAnalyzerMap.containsKey(values[langCodeIndex])) {
- /*
- * if so retrieve it from the map
- */
- Analyzer analyzer = languageAnalyzerMap.get(values[langCodeIndex]);
- /**
- * index the doc using the specified analyzer
- */
- w.addDocument(doc, analyzer);
- } else {
- w.addDocument(doc);
- }
- } else {
- w.addDocument(doc);
- }
- }
- counter++;
- if (counter % 10000 == 0) {
- w.commit();
- System.out.println(counter + " .........committed to index..............");
- }
-
- }
- w.commit();
- System.out.println("Completed indexing gaz! index name is: " + type.toString());
- }
-/**
- * TODO: make these analyzers configurable
- */
- private void loadAnalyzerMap() {
- languageAnalyzerMap.put("ara", new ArabicAnalyzer(Version.LUCENE_45));
- languageAnalyzerMap.put("tha", new ThaiAnalyzer(Version.LUCENE_45));
- languageAnalyzerMap.put("rus", new RussianAnalyzer(Version.LUCENE_45));
- languageAnalyzerMap.put("fas", new PersianAnalyzer(Version.LUCENE_45));
-
- }
-}
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerEntry.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java
similarity index 85%
rename from geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerEntry.java
rename to geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java
index 2b99ced..0770474 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerEntry.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java
@@ -18,13 +18,13 @@
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
-import opennlp.tools.entitylinker.domain.BaseLink;
+import opennlp.tools.entitylinker.BaseLink;
/**
*
* Stores a minimal amount of information from a geographic placenames gazateer
*/
-public class GazateerEntry extends BaseLink {
+public class GazetteerEntry extends BaseLink {
private Double latitude;
private Double longitude;
@@ -111,14 +111,21 @@
}
/**
- * sets the other fields in the gazateer in the form of a map
+ * sets the other fields in the gazetteer in the form of a map
*
+ * @param indexData stores all fields in the index as fieldname:value
*/
public void setIndexData(Map<String, String> indexData) {
this.indexData = indexData;
}
@Override
+ public String toString() {
+
+ return super.toString() + "\n GazateerEntry{\n" + "\tlatitude=" + latitude + ", \n\tlongitude=" + longitude + ", \n\tsource=" + source + ", \n\tindexID=" + indexID + ", \n\tindexData=" + indexData + "\n}";
+ }
+
+ @Override
public int hashCode() {
int hash = 7;
hash = 29 * hash + Objects.hashCode(this.latitude);
@@ -136,7 +143,7 @@
if (getClass() != obj.getClass()) {
return false;
}
- final GazateerEntry other = (GazateerEntry) obj;
+ final GazetteerEntry other = (GazetteerEntry) obj;
if (!Objects.equals(this.latitude, other.latitude)) {
return false;
}
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java
new file mode 100644
index 0000000..34724e1
--- /dev/null
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java
@@ -0,0 +1,161 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.MMapDirectory;
+import org.apache.lucene.util.Version;
+
+/**
+ *
+ * Creates two lucene indexes, geonames and usgs for use in GeoEntityLinker
+ */
+public class GazetteerIndexer {
+
+  /**
+   * Creates an indexer. Language-specific analyzers are not used for now:
+   * they caused problems at query time and are planned for a future release.
+   */
+  public GazetteerIndexer() {
+  }
+
+  /** Implemented by gazetteer types to expose their column separator regex. */
+  public static interface Separable {
+
+    String getSeparator();
+  }
+
+  /**
+   * Supported gazetteer formats. {@code toString()} returns the index
+   * directory suffix and {@code getSeparator()} the regex passed to
+   * {@code String.split} for each input line.
+   */
+  public enum GazType implements Separable {
+
+    GEONAMES {
+      @Override
+      public String toString() {
+        return "/opennlp_geoentitylinker_geonames_idx";
+      }
+
+      @Override
+      public String getSeparator() {
+        return "\t";
+      }
+    },
+    USGS {
+      @Override
+      public String toString() {
+        return "/opennlp_geoentitylinker_usgsgaz_idx";
+      }
+
+      @Override
+      public String getSeparator() {
+        return "\\|";
+      }
+    }
+  }
+
+  /**
+   * Indexes the USGS or Geonames gazetteers.
+   * The index is written to {@code outputIndexDir} plus the suffix returned by
+   * {@code type.toString()}.
+   *
+   * @param outputIndexDir a DIRECTORY path where you would like to store the
+   * output lucene indexes
+   * @param gazateerInputData the file, "as is" that was downloaded from the
+   * USGS and GEONAMES website
+   * @param type indicates whether the data is USGS or GEONAMES format
+   * @throws Exception if the input cannot be read or the index not written
+   */
+  public void index(File outputIndexDir, File gazateerInputData, GazType type) throws Exception {
+    if (!outputIndexDir.isDirectory()) {
+      throw new IllegalArgumentException("outputIndexDir must be a directory.");
+    }
+
+    String indexloc = outputIndexDir + type.toString();
+    Analyzer a = new StandardAnalyzer(Version.LUCENE_45);
+    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_45, a);
+
+    // try-with-resources releases the write lock and directory even on failure
+    try (Directory index = new MMapDirectory(new File(indexloc));
+        IndexWriter w = new IndexWriter(index, config)) {
+      readFile(gazateerInputData, w, type);
+      w.commit();
+    }
+  }
+
+  /**
+   * Reads the gazetteer file line by line and adds one document per data row.
+   * The first line is the header and supplies the field names.
+   * The last header column is not indexed (loop bound {@code fields.size() - 1}),
+   * and a commit is issued every 10000 lines.
+   *
+   * @param gazateerInputData the delimited gazetteer file
+   * @param w the writer the documents are added to; it is not closed here
+   * @param type supplies the column separator
+   * @throws Exception if reading or indexing fails
+   */
+  public void readFile(File gazateerInputData, IndexWriter w, GazType type) throws Exception {
+    List<String> fields = new ArrayList<String>();
+    int counter = 0;
+    System.out.println("reading gazateer data from file...........");
+    try (BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData))) {
+      String line;
+      // readLine() alone drives the loop; the former reader.read() check ate
+      // the first character of every line
+      while ((line = reader.readLine()) != null) {
+        String[] values = line.split(type.getSeparator());
+        if (counter == 0) {
+          // build fields, dropping a byte order mark (decoded or raw bytes)
+          for (String columnName : values) {
+            fields.add(columnName.replace("\uFEFF", "")
+                .replace("\u00EF\u00BB\u00BF", "").trim());
+          }
+        } else {
+          Document doc = new Document();
+          // split() drops trailing empty columns, so never index past values
+          int columns = Math.min(fields.size() - 1, values.length);
+          for (int i = 0; i < columns; i++) {
+            doc.add(new TextField(fields.get(i), values[i], Field.Store.YES));
+          }
+          w.addDocument(doc);
+        }
+        counter++;
+        if (counter % 10000 == 0) {
+          w.commit();
+          System.out.println(counter + " .........committed to index..............");
+        }
+      }
+    }
+    w.commit();
+    System.out.println("Completed indexing gaz! index name is: " + type.toString());
+  }
+}
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearchCache.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearchCache.java
similarity index 84%
rename from geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearchCache.java
rename to geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearchCache.java
index ccfe839..3049169 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearchCache.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearchCache.java
@@ -23,20 +23,20 @@
*
* Caches gazateer query results statically. Clears itself if more than 10000 results are cached.
*/
-public class GazateerSearchCache {
+public class GazetteerSearchCache {
- private static Map<String, ArrayList<GazateerEntry>> gazCache = new HashMap<>();
+ private static Map<String, ArrayList<GazetteerEntry>> gazCache = new HashMap<>();
/**
* returns the cached entries. Returns null if the query does not exists in the cache
* @param searchString
* @return
*/
- public static synchronized ArrayList<GazateerEntry> get(String searchString) {
+ public static synchronized ArrayList<GazetteerEntry> get(String searchString) {
return gazCache.get(searchString);
}
- public static synchronized void put(String searchString, ArrayList<GazateerEntry> hits) {
+ public static synchronized void put(String searchString, ArrayList<GazetteerEntry> hits) {
if (gazCache.size() > 10000) {
gazCache.clear();
}
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearcher.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
similarity index 89%
rename from geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearcher.java
rename to geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
index f6fee16..c25695b 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearcher.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
@@ -43,7 +43,7 @@
* GeoEntityLinkerSetupUtils
*
*/
-public class GazateerSearcher {
+public class GazetteerSearcher {
private double scoreCutoff = .90;
private Directory geonamesIndex;//= new MMapDirectory(new File(indexloc));
@@ -57,7 +57,7 @@
private Analyzer usgsAnalyzer;
private EntityLinkerProperties properties;
- public GazateerSearcher(EntityLinkerProperties properties) throws Exception {
+ public GazetteerSearcher(EntityLinkerProperties properties) throws Exception {
this.properties = properties;
init();
}
@@ -67,12 +67,14 @@
* @param searchString the named entity to look up in the lucene index
* @param rowsReturned how many rows to allow lucene to return
* @param code the country code
- * @param properties the entitylinker.properties file that states where the
- * lucene indexes are
+
* @return
*/
- public ArrayList<GazateerEntry> geonamesFind(String searchString, int rowsReturned, String code) {
- ArrayList<GazateerEntry> linkedData = new ArrayList<>();
+ public ArrayList<GazetteerEntry> geonamesFind(String searchString, int rowsReturned, String code) {
+ ArrayList<GazetteerEntry> linkedData = new ArrayList<>();
+ if(code.toLowerCase().equals("in") && searchString.toLowerCase().equals("india")){
+ System.out.println("india");
+ }
String luceneQueryString = "";
try {
/**
@@ -80,12 +82,12 @@
* case the code variable will be an empty string
*/
luceneQueryString = !code.equals("")
- ? "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim() + " AND CC1:" + code.toLowerCase() + "^1000"
+ ? "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim() + " AND CC1:\""+code.toLowerCase()+"\"" //[\"" + code.toLowerCase()+"\" TO \"" + code.toLowerCase() + "\"]"
: "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim();
/**
* check the cache and go no further if the records already exist
*/
- ArrayList<GazateerEntry> get = GazateerSearchCache.get(luceneQueryString);
+ ArrayList<GazetteerEntry> get = GazetteerSearchCache.get(luceneQueryString);
if (get != null) {
return get;
@@ -98,7 +100,7 @@
TopDocs search = geonamesSearcher.search(q, rowsReturned);
for (int i = 0; i < search.scoreDocs.length; ++i) {
- GazateerEntry entry = new GazateerEntry();
+ GazetteerEntry entry = new GazetteerEntry();
int docId = search.scoreDocs[i].doc;
double sc = search.scoreDocs[i].score;
@@ -134,6 +136,9 @@
break;
case 12:
entry.setItemParentID(value);
+ if(entry.getItemParentID().equals("in")){
+ System.out.println("");
+ }
break;
case 23:
entry.setItemName(value);
@@ -159,7 +164,7 @@
/**
* add the records to the cache for this query
*/
- GazateerSearchCache.put(luceneQueryString, linkedData);
+ GazetteerSearchCache.put(luceneQueryString, linkedData);
}
}
}
@@ -181,8 +186,8 @@
* @param properties properties file that states where the lucene indexes
* @return
*/
- public ArrayList<GazateerEntry> usgsFind(String searchString, int rowsReturned) {
- ArrayList<GazateerEntry> linkedData = new ArrayList<>();
+ public ArrayList<GazetteerEntry> usgsFind(String searchString, int rowsReturned) {
+ ArrayList<GazetteerEntry> linkedData = new ArrayList<>();
String luceneQueryString = "FEATURE_NAME:" + searchString.toLowerCase().trim() + " OR MAP_NAME: " + searchString.toLowerCase().trim();
try {
@@ -190,7 +195,7 @@
/**
* hit the cache
*/
- ArrayList<GazateerEntry> get = GazateerSearchCache.get(luceneQueryString);
+ ArrayList<GazetteerEntry> get = GazetteerSearchCache.get(luceneQueryString);
if (get != null) {
//if the name is already there, return the list of cavhed results
return get;
@@ -200,7 +205,7 @@
TopDocs search = usgsSearcher.search(q, rowsReturned);
for (int i = 0; i < search.scoreDocs.length; i++) {
- GazateerEntry entry = new GazateerEntry();
+ GazetteerEntry entry = new GazetteerEntry();
int docId = search.scoreDocs[i].doc;
double sc = search.scoreDocs[i].score;
//keep track of the min score for normalization
@@ -250,7 +255,7 @@
/**
* add the records to the cache for this query
*/
- GazateerSearchCache.put(luceneQueryString, linkedData);
+ GazetteerSearchCache.put(luceneQueryString, linkedData);
}
}
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
index 510d46e..872e2e5 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
@@ -19,8 +19,8 @@
import java.util.List;
import java.util.Map;
import java.util.Set;
-import opennlp.tools.entitylinker.domain.BaseLink;
-import opennlp.tools.entitylinker.domain.LinkedSpan;
+import opennlp.tools.entitylinker.BaseLink;
+import opennlp.tools.entitylinker.LinkedSpan;
import opennlp.tools.util.Span;
import opennlp.tools.entitylinker.EntityLinkerProperties;
import opennlp.tools.entitylinker.EntityLinker;
@@ -31,12 +31,12 @@
* indexes. The indexes can be built using the GeoEntityLinkerSetupUtils class
* in this same package.
*/
-public class GeoEntityLinker implements EntityLinker<LinkedSpan, EntityLinkerProperties> {
+public class GeoEntityLinker implements EntityLinker<LinkedSpan> {
private CountryContext countryContext;
private Map<String, Set<Integer>> countryMentions;
private EntityLinkerProperties linkerProperties;
- private GazateerSearcher gazateerSearcher;
+ private GazetteerSearcher gazateerSearcher;
private List<LinkedEntityScorer> scorers = new ArrayList<>();
/**
* Flag for deciding whether to search gaz only for toponyms within countries
@@ -127,7 +127,7 @@
try {
this.linkerProperties = properties;
countryContext = new CountryContext(this.linkerProperties);
- gazateerSearcher = new GazateerSearcher(this.linkerProperties);
+ gazateerSearcher = new GazetteerSearcher(this.linkerProperties);
loadScorers();
} catch (Exception ex) {
throw new RuntimeException(ex);
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java
index aa128e4..991081a 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java
@@ -58,8 +58,8 @@
* format, or USGS format
* @param type the type, USGS, or GEONAMES
*/
- public static void createLuceneIndex(File outputIndexDir, File gazateerInputData, GazateerIndexer.GazType type) {
- GazateerIndexer indexer = new GazateerIndexer();
+ public static void createLuceneIndex(File outputIndexDir, File gazateerInputData, GazetteerIndexer.GazType type) {
+ GazetteerIndexer indexer = new GazetteerIndexer();
try {
indexer.index(outputIndexDir, gazateerInputData, type);
} catch (Exception ex) {
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java
index d290d8f..97a5d07 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java
@@ -19,8 +19,8 @@
import java.util.List;
import java.util.Map;
import opennlp.tools.entitylinker.EntityLinkerProperties;
-import opennlp.tools.entitylinker.domain.BaseLink;
-import opennlp.tools.entitylinker.domain.LinkedSpan;
+import opennlp.tools.entitylinker.BaseLink;
+import opennlp.tools.entitylinker.LinkedSpan;
import opennlp.tools.util.Span;
/**
@@ -32,27 +32,27 @@
public class GeoHashBinningScorer implements LinkedEntityScorer<CountryContext> {
private final PointClustering CLUSTERER = new PointClustering();
- private int PRECISION = 4;
+ private int PRECISION = 3;
@Override
public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) {
//Map<Double, Double> latLongs = new HashMap<Double, Double>();
- List<GazateerEntry> allGazEntries = new ArrayList<>();
+ List<GazetteerEntry> allGazEntries = new ArrayList<>();
/**
* collect all the gaz entry references
*/
for (LinkedSpan<BaseLink> ls : linkedSpans) {
for (BaseLink bl : ls.getLinkedEntries()) {
- if (bl instanceof GazateerEntry) {
- allGazEntries.add((GazateerEntry) bl);
+ if (bl instanceof GazetteerEntry) {
+ allGazEntries.add((GazetteerEntry) bl);
}
}
}
/**
* use the point clustering to score each hit
*/
- Map<String, List<GazateerEntry>> cluster = CLUSTERER.cluster(allGazEntries, PRECISION);
+ Map<String, List<GazetteerEntry>> cluster = CLUSTERER.cluster(allGazEntries, PRECISION);
CLUSTERER.scoreClusters(cluster);
}
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/LinkedEntityScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/LinkedEntityScorer.java
index 2a5eec7..5567fa2 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/LinkedEntityScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/LinkedEntityScorer.java
@@ -17,7 +17,7 @@
import java.util.List;
import opennlp.tools.entitylinker.EntityLinkerProperties;
-import opennlp.tools.entitylinker.domain.LinkedSpan;
+import opennlp.tools.entitylinker.LinkedSpan;
import opennlp.tools.util.Span;
/**
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java
index 1c7b422..8a1564f 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java
@@ -24,8 +24,8 @@
import opennlp.tools.doccat.DoccatModel;
import opennlp.tools.doccat.DocumentCategorizerME;
import opennlp.tools.entitylinker.EntityLinkerProperties;
-import opennlp.tools.entitylinker.domain.BaseLink;
-import opennlp.tools.entitylinker.domain.LinkedSpan;
+import opennlp.tools.entitylinker.BaseLink;
+import opennlp.tools.entitylinker.LinkedSpan;
import opennlp.tools.util.Span;
/**
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/PointClustering.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/PointClustering.java
index 648f3d1..81ca1eb 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/PointClustering.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/PointClustering.java
@@ -22,7 +22,7 @@
/**
*
- * Clusters a list of lat long points using a simple geohashing approach
+ * Clusters a list of lat long points using a simple geohashing/boxing approach
*/
public class PointClustering {
@@ -35,10 +35,10 @@
* @param precision
* @return
*/
- public Map<String, List<GazateerEntry>> cluster(List<GazateerEntry> entries, int precision) {
- Map<String, List<GazateerEntry>> map = new HashMap<>();
+ public Map<String, List<GazetteerEntry>> cluster(List<GazetteerEntry> entries, int precision) {
+ Map<String, List<GazetteerEntry>> map = new HashMap<>();
for (int i = 0; i < entries.size(); i++) {
- GazateerEntry entry = entries.get(i);
+ GazetteerEntry entry = entries.get(i);
Double latw = entry.getLatitude();
Double lonw = entry.getLongitude();
@@ -47,7 +47,7 @@
if (map.containsKey(key)) {
map.get(key).add(entry);
} else {
- List<GazateerEntry> newlist = new ArrayList<>();
+ List<GazetteerEntry> newlist = new ArrayList<>();
newlist.add(entry);
map.put(key, newlist);
}
@@ -55,7 +55,7 @@
return map;
}
- public void scoreClusters(Map<String, List<GazateerEntry>> clusters) {
+ public void scoreClusters(Map<String, List<GazetteerEntry>> clusters) {
Double min = 0d;
Double max = -1d;
for (String key : clusters.keySet()) {
@@ -67,7 +67,7 @@
for (String key : clusters.keySet()) {
int size = clusters.get(key).size();
Double score = normalize(Double.valueOf(size), min, max);
- for (GazateerEntry entry : clusters.get(key)) {
+ for (GazetteerEntry entry : clusters.get(key)) {
entry.getScoreMap().put("geohashbin", score);
}
}
@@ -87,8 +87,8 @@
String geoHash = "";
lat = lat + 90;
lon = lon + 180;
- String latString = String.valueOf(lat);
- String lonString = String.valueOf(lon);
+ String latString = String.valueOf(lat).replace(".", "");
+ String lonString = String.valueOf(lon).replace(".", "");
int length = latString.length() > lonString.length() ? lonString.length() : latString.length();
while (length < 12) {
latString += "0";