blob: 1f976d644660b16460bb070826694db28cc07ede [file] [log] [blame]
/*
* Copyright 2013 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.addons.geoentitylinker;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.logging.Level;
import java.util.regex.Pattern;

import opennlp.tools.entitylinker.EntityLinkerProperties;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.util.Version;
/**
 * Searches gazetteers stored in {@link MMapDirectory} Lucene indexes. The
 * structure of these indexes is based on loading them with the
 * GeoEntityLinkerSetupUtils.
 *
 * <p>Two indexes are opened at construction time: a geonames index and a USGS
 * index. Their locations and the search settings are read from the supplied
 * {@link EntityLinkerProperties}. Call {@link #close()} when the searcher is no
 * longer needed so that the index readers and directories are released.
 */
public class GazetteerSearcher implements Closeable {

  /** Matches every character that is neither a Unicode letter nor a decimal digit. */
  private static final String REGEX_CLEAN = "[^\\p{L}\\p{Nd}]";
  /** Compiled once; {@link #cleanInput(String)} runs on every lookup. */
  private static final Pattern NOISE_CHARS = Pattern.compile(REGEX_CLEAN);
  private static final Logger LOGGER = Logger.getLogger(GazetteerSearcher.class);

  /** Field that unqualified terms of a geonames query are searched in. */
  private static final String GEONAMES_NAME_FIELD = "FULL_NAME_ND_RO";
  /** Field that unqualified terms of a USGS query are searched in. */
  private static final String USGS_NAME_FIELD = "FEATURE_NAME";

  /** Minimum (search term length / item name length) ratio a hit needs to be kept. */
  private double scoreCutoff = .70;
  private boolean doubleQuoteAllSearchTerms = false;

  private Directory geonamesIndex;
  private IndexReader geonamesReader;
  private IndexSearcher geonamesSearcher;
  private Analyzer geonamesAnalyzer;

  // USGS US gazetteer
  private Directory usgsIndex;
  private IndexReader usgsReader;
  private IndexSearcher usgsSearcher;
  private Analyzer usgsAnalyzer;

  private final EntityLinkerProperties properties;

  /**
   * Manual smoke test: opens the gazetteers and runs one geonames lookup.
   *
   * @param args optional; {@code args[0]} is the path of the entity linker
   * properties file. When absent the historical default
   * {@code c:\temp\entitylinker.properties} is used.
   */
  public static void main(String[] args) {
    String propertiesPath = (args != null && args.length > 0)
        ? args[0]
        : "c:\\temp\\entitylinker.properties";
    try (GazetteerSearcher searcher
        = new GazetteerSearcher(new EntityLinkerProperties(new File(propertiesPath)))) {
      searcher.geonamesFind("townsville, queensland", 5, "");
    } catch (Exception ex) {
      java.util.logging.Logger.getLogger(GazetteerSearcher.class.getName()).log(Level.SEVERE, null, ex);
    }
  }

  /**
   * Opens both gazetteer indexes described by the given properties.
   *
   * @param properties supplies the index locations and search settings
   * @throws Exception if a setting cannot be read or parsed, an index location
   * is missing, or an index cannot be opened
   */
  public GazetteerSearcher(EntityLinkerProperties properties) throws Exception {
    this.properties = properties;
    try {
      init();
    } catch (Exception ex) {
      // Do not leak whatever was opened before the failure.
      try {
        releaseResources();
      } catch (IOException closeEx) {
        ex.addSuppressed(closeEx);
      }
      throw ex;
    }
  }

  /**
   * Looks up a name in the geonames gazetteer, after checking the cache.
   *
   * <p>NOTE(review): the cache key is the query string only, so a result
   * cached for a small {@code rowsReturned} is also returned for a later call
   * asking for more rows -- confirm whether that is intended.
   *
   * @param searchString the named entity to look up in the lucene index
   * @param rowsReturned how many rows to allow lucene to return
   * @param code the country code; an empty string (or {@code null}) means no
   * country context is available and hits from any country are accepted
   *
   * @return the accepted entries; empty when the cleaned search string is
   * empty, nothing qualifies, or the lookup fails. On a cache hit the list
   * instance held by the cache is returned.
   */
  public ArrayList<GazetteerEntry> geonamesFind(String searchString, int rowsReturned, String code) {
    ArrayList<GazetteerEntry> linkedData = new ArrayList<>();
    String term = cleanInput(searchString);
    if (term.isEmpty()) {
      return linkedData;
    }
    String countryCode = code == null ? "" : code.toLowerCase(Locale.ROOT);
    String queryTerm = toQueryTerm(term);
    // Sometimes no country context is found; then the query is on the name only.
    String luceneQueryString = countryCode.isEmpty()
        ? "FULL_NAME_ND_RO:" + queryTerm
        : "FULL_NAME_ND_RO:" + queryTerm + " AND CC1:" + countryCode;
    try {
      // check the cache and go no further if the records already exist
      ArrayList<GazetteerEntry> cached = GazetteerSearchCache.get(luceneQueryString);
      if (cached != null) {
        return cached;
      }
      // The second argument is the DEFAULT FIELD for terms without a field
      // prefix (e.g. the second word of a multi-word name), not the query.
      QueryParser parser = new QueryParser(Version.LUCENE_48, GEONAMES_NAME_FIELD, geonamesAnalyzer);
      Query query = parser.parse(luceneQueryString);
      TopDocs hits = geonamesSearcher.search(query, rowsReturned);
      for (int i = 0; i < hits.scoreDocs.length; i++) {
        int docId = hits.scoreDocs[i].doc;
        GazetteerEntry entry = newEntry(docId, hits.scoreDocs[i].score, "geonames");
        try {
          populateGeonamesEntry(entry, geonamesSearcher.doc(docId));
        } catch (NumberFormatException ex) {
          LOGGER.error("Skipping geonames document " + docId + ": malformed coordinate", ex);
          continue;
        }
        // Only keep it if the country code is a match (or no code was given).
        String parentId = entry.getItemParentID();
        boolean countryMatches = countryCode.isEmpty()
            || (parentId != null && parentId.toLowerCase(Locale.ROOT).equals(countryCode));
        if (countryMatches) {
          addIfAboveCutoff(term, entry, linkedData);
        }
      }
      // Cache once, and only after every hit was processed, so that a failure
      // part-way through never leaves a partial result in the cache.
      if (!linkedData.isEmpty()) {
        GazetteerSearchCache.put(luceneQueryString, linkedData);
      }
    } catch (IOException | ParseException ex) {
      LOGGER.error("Geonames lookup failed for query: " + luceneQueryString, ex);
    }
    return linkedData;
  }

  /**
   * Looks up the name in the USGS gazetteer, after checking the cache. Every
   * entry produced here gets the parent id {@code "us"}.
   *
   * @param searchString the named entity to look up in the lucene index
   * @param rowsReturned how many rows to allow lucene to return
   *
   * @return the accepted entries; empty when the cleaned search string is
   * empty, nothing qualifies, or the lookup fails. On a cache hit the list
   * instance held by the cache is returned.
   */
  public ArrayList<GazetteerEntry> usgsFind(String searchString, int rowsReturned) {
    ArrayList<GazetteerEntry> linkedData = new ArrayList<>();
    String term = cleanInput(searchString);
    if (term.isEmpty()) {
      return linkedData;
    }
    String queryTerm = toQueryTerm(term);
    String luceneQueryString = "FEATURE_NAME:" + queryTerm + " OR MAP_NAME: " + queryTerm;
    try {
      // hit the cache; if the query is already there, return the cached results
      ArrayList<GazetteerEntry> cached = GazetteerSearchCache.get(luceneQueryString);
      if (cached != null) {
        return cached;
      }
      // The second argument is the DEFAULT FIELD for unqualified terms, not the query.
      QueryParser parser = new QueryParser(Version.LUCENE_48, USGS_NAME_FIELD, usgsAnalyzer);
      Query query = parser.parse(luceneQueryString);
      TopDocs hits = usgsSearcher.search(query, rowsReturned);
      for (int i = 0; i < hits.scoreDocs.length; i++) {
        int docId = hits.scoreDocs[i].doc;
        GazetteerEntry entry = newEntry(docId, hits.scoreDocs[i].score, "usgs");
        entry.setItemParentID("us");
        try {
          populateUsgsEntry(entry, usgsSearcher.doc(docId));
        } catch (NumberFormatException ex) {
          LOGGER.error("Skipping usgs document " + docId + ": malformed coordinate", ex);
          continue;
        }
        addIfAboveCutoff(term, entry, linkedData);
      }
      // Cache once, after the loop, so a mid-loop failure cannot cache a partial result.
      if (!linkedData.isEmpty()) {
        GazetteerSearchCache.put(luceneQueryString, linkedData);
      }
    } catch (IOException | ParseException ex) {
      LOGGER.error("USGS lookup failed for query: " + luceneQueryString, ex);
    }
    return linkedData;
  }

  /**
   * Closes the index readers, directories and analyzers opened by this
   * searcher. Every resource is attempted even if an earlier one fails.
   *
   * @throws IOException the first failure encountered; later failures are
   * attached to it as suppressed exceptions
   */
  @Override
  public void close() throws IOException {
    releaseResources();
  }

  /** Creates an entry carrying the raw lucene score, the index document id and the source name. */
  private static GazetteerEntry newEntry(int docId, double luceneScore, String source) {
    GazetteerEntry entry = new GazetteerEntry();
    entry.getScoreMap().put("lucene", luceneScore);
    entry.setIndexID(String.valueOf(docId));
    entry.setSource(source);
    return entry;
  }

  /**
   * Copies the stored fields of a geonames document into the entry. Every
   * lower-cased value is recorded in the entry's index data; selected field
   * positions additionally fill the typed properties.
   *
   * @throws NumberFormatException if a coordinate field is not a number
   */
  private static void populateGeonamesEntry(GazetteerEntry entry, Document doc) {
    List<IndexableField> fields = doc.getFields();
    for (int idx = 0; idx < fields.size(); idx++) {
      String fieldName = fields.get(idx).name();
      String value = doc.get(fieldName);
      if (value == null) {
        // Document.get yields null when the field has no string value.
        continue;
      }
      value = value.toLowerCase(Locale.ROOT);
      /*
       * These positions map to the required fields in the gaz. TODO: allow a
       * configurable list of columns that map to the GazetteerEntry fields,
       * then users would be able to plug in any gazetteer they have (if they
       * build a lucene index out of it).
       */
      switch (idx) {
        case 1:
          entry.setItemID(value);
          break;
        case 3:
          entry.setLatitude(Double.valueOf(value));
          break;
        case 4:
          entry.setLongitude(Double.valueOf(value));
          break;
        case 10:
          entry.setItemType(value);
          break;
        case 12:
          // Country filtering happens in geonamesFind; the value is always recorded.
          entry.setItemParentID(value);
          break;
        case 23:
          entry.setItemName(value);
          break;
        default:
          break;
      }
      entry.getIndexData().put(fieldName, value);
    }
  }

  /**
   * Copies the stored fields of a USGS document into the entry. Every
   * lower-cased value is recorded in the entry's index data; selected field
   * positions additionally fill the typed properties.
   *
   * @throws NumberFormatException if a coordinate field is not a number
   */
  private static void populateUsgsEntry(GazetteerEntry entry, Document doc) {
    List<IndexableField> fields = doc.getFields();
    for (int idx = 0; idx < fields.size(); idx++) {
      String fieldName = fields.get(idx).name();
      String value = doc.get(fieldName);
      if (value == null) {
        // Document.get yields null when the field has no string value.
        continue;
      }
      value = value.toLowerCase(Locale.ROOT);
      switch (idx) {
        case 0:
          entry.setItemID(value);
          break;
        case 1:
          entry.setItemName(value);
          break;
        case 2:
          entry.setItemType(value);
          break;
        case 9:
          entry.setLatitude(Double.valueOf(value));
          break;
        case 10:
          entry.setLongitude(Double.valueOf(value));
          break;
        default:
          break;
      }
      entry.getIndexData().put(fieldName, value);
    }
  }

  /**
   * Scores the entry by the ratio (cleaned search term length / item name
   * length) and, when the ratio reaches {@link #scoreCutoff}, stores it under
   * the {@code "normlucene"} score key and appends the entry unless an equal
   * entry is already in the list. Entries without an item name cannot be
   * scored and are skipped.
   */
  private void addIfAboveCutoff(String term, GazetteerEntry entry, List<GazetteerEntry> results) {
    String itemName = entry.getItemName();
    if (itemName == null || itemName.isEmpty()) {
      return;
    }
    double ratio = (double) term.length() / (double) itemName.length();
    if (Double.compare(ratio, scoreCutoff) < 0) {
      return;
    }
    entry.getScoreMap().put("normlucene", ratio);
    // make sure we don't produce a duplicate
    if (!results.contains(entry)) {
      results.add(entry);
    }
  }

  /**
   * Replaces any noise chars (everything that is not a letter or a decimal
   * digit) with a space and trims the result.
   *
   * @param input the raw search string, may be {@code null}
   * @return the cleaned text, never {@code null}; empty when nothing usable remains
   */
  private String cleanInput(String input) {
    if (input == null) {
      return "";
    }
    return NOISE_CHARS.matcher(input).replaceAll(" ").trim();
  }

  /**
   * Lower-cases a cleaned search term and, depending on configuration, wraps
   * it in double quotes for use inside a lucene query string.
   */
  private String toQueryTerm(String cleanedTerm) {
    String lowered = cleanedTerm.toLowerCase(Locale.ROOT);
    return doubleQuoteAllSearchTerms ? "\"" + lowered + "\"" : lowered;
  }

  /**
   * Reads the search settings and opens the USGS and geonames indexes.
   *
   * @throws Exception if a setting cannot be read or parsed, an index location
   * is not configured, or an index cannot be opened
   */
  private void init() throws Exception {
    String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));
    scoreCutoff = Double.valueOf(cutoff);
    String doubleQuote = properties.getProperty("opennlp.geoentitylinker.gaz.doublequote", String.valueOf(doubleQuoteAllSearchTerms));
    doubleQuoteAllSearchTerms = Boolean.valueOf(doubleQuote);

    if (usgsIndex == null) {
      String indexloc = requireIndexLocation("opennlp.geoentitylinker.gaz.usgs", "USGS Gaz location not found");
      usgsIndex = new MMapDirectory(new File(indexloc));
      usgsReader = DirectoryReader.open(usgsIndex);
      usgsSearcher = new IndexSearcher(usgsReader);
      usgsAnalyzer = newAnalyzerWithoutStopWords();
    }
    if (geonamesIndex == null) {
      String indexloc = requireIndexLocation("opennlp.geoentitylinker.gaz.geonames", "Geonames Gaz location not found");
      geonamesIndex = new MMapDirectory(new File(indexloc));
      geonamesReader = DirectoryReader.open(geonamesIndex);
      geonamesSearcher = new IndexSearcher(geonamesReader);
      //TODO: a language code switch statement should be employed here at some point
      geonamesAnalyzer = newAnalyzerWithoutStopWords();
    }
  }

  /**
   * Reads an index location property, failing fast when it is not configured
   * rather than going on to open an empty path.
   */
  private String requireIndexLocation(String propertyKey, String notFoundMessage) throws Exception {
    String indexloc = properties.getProperty(propertyKey, "");
    if (indexloc.equals("")) {
      LOGGER.error(notFoundMessage);
      throw new IOException(notFoundMessage + " (property " + propertyKey + ")");
    }
    return indexloc;
  }

  /** Builds a standard analyzer with an empty, case-insensitive stop word set. */
  private static Analyzer newAnalyzerWithoutStopWords() {
    return new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList<String>(), true));
  }

  /** Closes every opened resource, reporting the first failure and suppressing the rest. */
  private void releaseResources() throws IOException {
    Closeable[] resources = {
      usgsReader, usgsIndex, usgsAnalyzer,
      geonamesReader, geonamesIndex, geonamesAnalyzer
    };
    IOException failure = null;
    for (Closeable resource : resources) {
      if (resource == null) {
        continue;
      }
      try {
        resource.close();
      } catch (IOException ex) {
        if (failure == null) {
          failure = ex;
        } else {
          failure.addSuppressed(ex);
        }
      }
    }
    if (failure != null) {
      throw failure;
    }
  }
}