OPENNLP-664
Fixed: country codes are no longer ignored. The indexing and search analyzers are now constructed with an empty stop-word set.
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java
index 3b01c90..81e31ad 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java
@@ -23,9 +23,9 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
-import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
@@ -94,7 +94,7 @@
String indexloc = outputIndexDir + type.toString();
Directory index = new MMapDirectory(new File(indexloc));
- Analyzer a = new StandardAnalyzer(Version.LUCENE_45);
+ Analyzer a = new StandardAnalyzer(Version.LUCENE_45, new CharArraySet(Version.LUCENE_45, new ArrayList(), true));
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_45, a);
IndexWriter w = new IndexWriter(index, config);
@@ -107,9 +107,8 @@
public void readFile(File gazateerInputData, IndexWriter w, GazType type) throws Exception {
BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));
- List<String> fields = new ArrayList<String>();
+ List<String> fields = new ArrayList<>();
int counter = 0;
- // int langCodeIndex = 0;
System.out.println("reading gazetteer data from file...........");
while (reader.read() != -1) {
String line = reader.readLine();
@@ -137,14 +136,4 @@
System.out.println("Completed indexing gaz! index name is: " + type.toString());
}
- /**
- * TODO: make these analyzers configurable
- */
-// private void loadAnalyzerMap() {
-//// languageAnalyzerMap.put("ara", new ArabicAnalyzer(Version.LUCENE_45));
-//// languageAnalyzerMap.put("tha", new ThaiAnalyzer(Version.LUCENE_45));
-//// languageAnalyzerMap.put("rus", new RussianAnalyzer(Version.LUCENE_45));
-//// languageAnalyzerMap.put("fas", new PersianAnalyzer(Version.LUCENE_45));
-//
-// }
}
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
index b5f9817..44d7e7d 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
@@ -35,6 +35,7 @@
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.util.Version;
import opennlp.tools.entitylinker.EntityLinkerProperties;
+import org.apache.lucene.analysis.util.CharArraySet;
/**
*
@@ -72,18 +73,14 @@
*/
public ArrayList<GazetteerEntry> geonamesFind(String searchString, int rowsReturned, String code) {
ArrayList<GazetteerEntry> linkedData = new ArrayList<>();
- if (code.toLowerCase().equals("in") && searchString.toLowerCase().equals("india")) {
- rowsReturned=100;
- System.out.println("india");
- }
- String luceneQueryString = "";
+
try {
/**
* build the search string Sometimes no country context is found. In this
* case the code variable will be an empty string
*/
- luceneQueryString = !code.equals("")
- ? "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim() + " AND CC1:" + code.toLowerCase()+"^90000" //[\"" + code.toLowerCase()+"\" TO \"" + code.toLowerCase() + "\"]"
+ String luceneQueryString = !code.equals("")
+ ? "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim() + " AND CC1:" + code.toLowerCase()//+"^90000" //[\"" + code.toLowerCase()+"\" TO \"" + code.toLowerCase() + "\"]"
: "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim();
/**
* check the cache and go no further if the records already exist
@@ -97,7 +94,6 @@
QueryParser parser = new QueryParser(Version.LUCENE_45, luceneQueryString, geonamesAnalyzer);
Query q = parser.parse(luceneQueryString);
-
TopDocs search = geonamesSearcher.search(q, rowsReturned);
for (int i = 0; i < search.scoreDocs.length; ++i) {
@@ -135,7 +131,7 @@
break;
case 12:
entry.setItemParentID(value);
- if(!value.toLowerCase().equals(code.toLowerCase())){
+ if (!value.toLowerCase().equals(code.toLowerCase())) {
continue;
}
break;
@@ -153,8 +149,6 @@
* only want hits above the levenstein thresh
*/
if (normLev.compareTo(scoreCutoff) >= 0) {
- //only keep it if the country code is a match. even when the code is passed in as a weighted condition, there is no == equiv in lucene
-
if (entry.getItemParentID().toLowerCase().equals(code.toLowerCase())) {
entry.getScoreMap().put("normlucene", normLev);
//make sure we don't produce a duplicate
@@ -182,7 +176,6 @@
* @param searchString the nameed entity to look up in the lucene index
* @param rowsReturned how many rows to allow lucene to return
*
- * @param properties properties file that states where the lucene indexes
* @return
*/
public ArrayList<GazetteerEntry> usgsFind(String searchString, int rowsReturned) {
@@ -278,7 +271,7 @@
usgsIndex = new MMapDirectory(new File(indexloc));
usgsReader = DirectoryReader.open(usgsIndex);
usgsSearcher = new IndexSearcher(usgsReader);
- usgsAnalyzer = new StandardAnalyzer(Version.LUCENE_45);
+ usgsAnalyzer = new StandardAnalyzer(Version.LUCENE_45, new CharArraySet(Version.LUCENE_45, new ArrayList(), true));
}
if (geonamesIndex == null) {
String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.geonames", "");
@@ -292,7 +285,7 @@
geonamesReader = DirectoryReader.open(geonamesIndex);
geonamesSearcher = new IndexSearcher(geonamesReader);
//TODO: a language code switch statement should be employed here at some point
- geonamesAnalyzer = new StandardAnalyzer(Version.LUCENE_45);
+ geonamesAnalyzer = new StandardAnalyzer(Version.LUCENE_45, new CharArraySet(Version.LUCENE_45, new ArrayList(), true));
}
}
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
index 854ca73..1804020 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
@@ -38,14 +38,7 @@
private EntityLinkerProperties linkerProperties;
private GazetteerSearcher gazateerSearcher;
private List<LinkedEntityScorer> scorers = new ArrayList<>();
- /**
- * Flag for deciding whether to search gaz only for toponyms within countries
- * that are mentioned in the document
- */
- // private Boolean filterCountryContext = true;
- public GeoEntityLinker() throws Exception {
- }
@Override
public List<LinkedSpan> find(String doctext, Span[] sentences, String[][] tokensBySentence, Span[][] namesBySentence) {
@@ -68,18 +61,18 @@
* US is the only country mentioned in the doc
*
*/
- ArrayList<BaseLink> geoNamesEntries = new ArrayList<BaseLink>();
+ ArrayList<BaseLink> geoNamesEntries = new ArrayList<>();
if (!(countryMentions.keySet().contains("us") && countryMentions.keySet().size() == 1)
|| countryMentions.keySet().size() > 1 || countryMentions.keySet().isEmpty()) {
if (!countryMentions.keySet().isEmpty()) {
for (String code : countryMentions.keySet()) {
if (!code.equals("us")) {
- geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 3, code));
+ geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 5, code));
}
}
} else {
- geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 3, ""));
+ geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 5, ""));
}
@@ -115,7 +108,6 @@
private void loadScorers() {
if (scorers.isEmpty()) {
- // scorers.add(new FuzzyStringMatchScorer());
scorers.add(new GeoHashBinningScorer());
scorers.add(new CountryProximityScorer());
scorers.add(new ModelBasedScorer());