OPENNLP-698
Fixed the cleanInput() method so that it handles multi-token names. A new optional property, opennlp.geoentitylinker.gaz.doublequote, can be added to the entitylinker.properties file to control whether search terms are wrapped in double quotes (the default is false).
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
index 1d49277..1f976d6 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
@@ -19,6 +19,7 @@
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
+import java.util.logging.Level;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
@@ -49,7 +50,8 @@
private final String REGEX_CLEAN = "[^\\p{L}\\p{Nd}]";
private static final Logger LOGGER = Logger.getLogger(GazetteerSearcher.class);
- private double scoreCutoff = .90;
+ private double scoreCutoff = .70;
+ private boolean doubleQuoteAllSearchTerms = false;
private Directory geonamesIndex;//= new MMapDirectory(new File(indexloc));
private IndexReader geonamesReader;// = DirectoryReader.open(geonamesIndex);
private IndexSearcher geonamesSearcher;// = new IndexSearcher(geonamesReader);
@@ -61,7 +63,17 @@
private Analyzer usgsAnalyzer;
private EntityLinkerProperties properties;
+ public static void main(String[] args) {
+ try {
+ boolean b = Boolean.valueOf("true");
+ new GazetteerSearcher(new EntityLinkerProperties(new File("c:\\temp\\entitylinker.properties"))).geonamesFind("townsville, queensland", 5, "");
+ } catch (IOException ex) {
+ java.util.logging.Logger.getLogger(GazetteerSearcher.class.getName()).log(Level.SEVERE, null, ex);
+ } catch (Exception ex) {
+ java.util.logging.Logger.getLogger(GazetteerSearcher.class.getName()).log(Level.SEVERE, null, ex);
+ }
+ }
public GazetteerSearcher(EntityLinkerProperties properties) throws Exception {
this.properties = properties;
@@ -272,14 +284,19 @@
}
/**
- * Replaces any noise chars with
+ * Replaces any noise chars with a space, and depending on configuration adds double quotes to the string
+ *
* @param input
- * @return
+ * @return
*/
private String cleanInput(String input) {
String output = input.replaceAll(REGEX_CLEAN, " ").trim();
- System.out.println(output);
- return "\"" + output + "\"";
+ if (doubleQuoteAllSearchTerms) {
+ return "\"" + output + "\"";
+ } else {
+ return output;
+ }
+
}
private void init() throws Exception {
@@ -290,7 +307,10 @@
LOGGER.error(new Exception("USGS Gaz location not found"));
}
String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));
+
scoreCutoff = Double.valueOf(cutoff);
+ String doubleQuote = properties.getProperty("opennlp.geoentitylinker.gaz.doublequote", String.valueOf(doubleQuoteAllSearchTerms));
+ doubleQuoteAllSearchTerms = Boolean.valueOf(doubleQuote);
usgsIndex = new MMapDirectory(new File(indexloc));
usgsReader = DirectoryReader.open(usgsIndex);
usgsSearcher = new IndexSearcher(usgsReader);