OPENNLP-698
Fixed the cleanInput() method so it handles multi-token names: noise characters are now replaced with a space instead of being removed, and the cleaned name is returned wrapped in double quotes.
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
index 9409f70..1d49277 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
@@ -61,6 +61,8 @@
private Analyzer usgsAnalyzer;
private EntityLinkerProperties properties;
+
+
public GazetteerSearcher(EntityLinkerProperties properties) throws Exception {
this.properties = properties;
init();
@@ -155,7 +157,7 @@
* only want hits above the levenstein thresh
*/
if (normLev.compareTo(scoreCutoff) >= 0) {
- if (entry.getItemParentID().toLowerCase().equals(code.toLowerCase())) {
+ if (entry.getItemParentID().toLowerCase().equals(code.toLowerCase()) || code.toLowerCase().equals("")) {
entry.getScoreMap().put("normlucene", normLev);
//make sure we don't produce a duplicate
if (!linkedData.contains(entry)) {
@@ -186,7 +188,7 @@
*/
public ArrayList<GazetteerEntry> usgsFind(String searchString, int rowsReturned) {
ArrayList<GazetteerEntry> linkedData = new ArrayList<>();
- searchString = cleanInput(searchString);
+ searchString = cleanInput(searchString);
if (searchString.isEmpty()) {
return linkedData;
}
@@ -269,8 +271,15 @@
return linkedData;
}
+ /**
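+ * Replaces any noise chars (matches of REGEX_CLEAN) with a space, trims the result, and wraps it in double quotes.
+ * @param input the raw search string to clean
+ * @return the cleaned string, enclosed in double quotes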
+ */
private String cleanInput(String input) {
- return input.replaceAll(REGEX_CLEAN, "").trim();
+ String output = input.replaceAll(REGEX_CLEAN, " ").trim();
+ System.out.println(output);
+ return "\"" + output + "\"";
}
private void init() throws Exception {