/*
 * Copyright 2013 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package opennlp.addons.geoentitylinker;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.logging.Level;
import opennlp.tools.entitylinker.EntityLinkerProperties;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.util.Version;

/** | |
* | |
* Searches Gazetteers stored in a MMapDirectory Lucene index. The structure of | |
* these indices are based on loading the indexes using the GazetteerIndexer | |
* | |
*/ | |
public class GazetteerSearcher { | |
//private static final String boostedTerms = " AND loctype(ADM1^1 ADM1H^1 ADM2^1 ADM2H^1 ADM3^1 ADM3H^1 ADM4^1 ADM4H^1 ADM5^1 ADMD^1 ADMDH^1 PCLD^1 PCLH^1 PCLI^1 PCLIX^1 TERR^1 PCLIX^1 PPL^1 PPLA^1 PPLA2^1 PPLA3^1 PPLA4^1 PPLC^1 PPLCH^1 PPLF^1 PPLG^1 PPLH^1 PPLL^1 PPLQ^1 PPLR^1 PPLS^1 PPLX^1 STLMT^1) "; | |
private final String REGEX_CLEAN = "[^\\p{L}\\p{Nd}]"; | |
private static final Logger LOGGER = Logger.getLogger(GazetteerSearcher.class); | |
private double scoreCutoff = .70; | |
private boolean doubleQuoteAllSearchTerms = false; | |
private boolean useHierarchyField = false; | |
private IndexSearcher geonamesSearcher;// = new IndexSearcher(geonamesReader); | |
private Analyzer geonamesAnalyzer; | |
//usgs US gazateer | |
private IndexSearcher usgsSearcher;// = new IndexSearcher(geonamesReader); | |
private Analyzer usgsAnalyzer; | |
private EntityLinkerProperties properties; | |
private Directory opennlpIndex;//= new MMapDirectory(new File(indexloc)); | |
private IndexReader opennlpReader;// = DirectoryReader.open(geonamesIndex); | |
private IndexSearcher opennlpSearcher;// = new IndexSearcher(geonamesReader); | |
private Analyzer opennlpAnalyzer; | |
public static void main(String[] args) { | |
try { | |
boolean b = Boolean.valueOf("true"); | |
new GazetteerSearcher(new EntityLinkerProperties(new File("c:\\temp\\entitylinker.properties"))).find("italy", 5, " countrycode:it AND gazsource:geonames"); | |
} catch (IOException ex) { | |
java.util.logging.Logger.getLogger(GazetteerSearcher.class.getName()).log(Level.SEVERE, null, ex); | |
} catch (Exception ex) { | |
java.util.logging.Logger.getLogger(GazetteerSearcher.class.getName()).log(Level.SEVERE, null, ex); | |
} | |
} | |
public GazetteerSearcher(EntityLinkerProperties properties) throws Exception { | |
this.properties = properties; | |
init(); | |
} | |
/** | |
* Searches the single lucene index that includes the location hierarchy. | |
* | |
* @param searchString the location name to search for | |
* @param rowsReturned how many index entries to return (top N...) | |
* @param whereClause the conditional statement that defines the index type | |
* and the country oode. | |
* @return | |
*/ | |
public ArrayList<GazetteerEntry> find(String searchString, int rowsReturned, String whereClause) { | |
ArrayList<GazetteerEntry> linkedData = new ArrayList<>(); | |
searchString = cleanInput(searchString); | |
if (searchString.isEmpty()) { | |
return linkedData; | |
} | |
try { | |
/** | |
* build the search string Sometimes no country context is found. In this | |
* case the code variables will be empty strings | |
*/ | |
String placeNameQueryString = "placename:(" + searchString.toLowerCase() + ")" + "AND " + whereClause; | |
if (searchString.trim().contains(" ") && useHierarchyField) { | |
placeNameQueryString = "(placename:(" + searchString.toLowerCase() + ") AND hierarchy:(" + formatForHierarchy(searchString) + "))" | |
+ " AND " + whereClause; | |
} | |
/** | |
* check the cache and go no further if the records already exist | |
*/ | |
ArrayList<GazetteerEntry> get = GazetteerSearchCache.get(placeNameQueryString); | |
if (get != null) { | |
return get; | |
} | |
/** | |
* search the placename | |
*/ | |
QueryParser parser = new QueryParser(Version.LUCENE_48, placeNameQueryString, opennlpAnalyzer); | |
Query q = parser.parse(placeNameQueryString); | |
//Filter filter = new QueryWrapperFilter(new QueryParser(Version.LUCENE_48, whereClause, opennlpAnalyzer).parse(whereClause)); | |
TopDocs bestDocs = opennlpSearcher.search(q, rowsReturned); | |
for (int i = 0; i < bestDocs.scoreDocs.length; ++i) { | |
GazetteerEntry entry = new GazetteerEntry(); | |
int docId = bestDocs.scoreDocs[i].doc; | |
double sc = bestDocs.scoreDocs[i].score; | |
entry.getScoreMap().put("lucene", sc); | |
entry.setIndexID(docId + ""); | |
Document d = opennlpSearcher.doc(docId); | |
List<IndexableField> fields = d.getFields(); | |
String lat = d.get("latitude"); | |
String lon = d.get("longitude"); | |
String placename = d.get("placename"); | |
String parentid = d.get("countrycode").toLowerCase(); | |
String provid = d.get("admincode"); | |
String itemtype = d.get("loctype"); | |
String source = d.get("gazsource"); | |
String hier = d.get("hierarchy"); | |
entry.setSource(source); | |
entry.setItemID(docId + ""); | |
entry.setLatitude(Double.valueOf(lat)); | |
entry.setLongitude(Double.valueOf(lon)); | |
entry.setItemType(itemtype); | |
entry.setItemParentID(parentid); | |
entry.setProvinceCode(provid); | |
entry.setCountryCode(parentid); | |
entry.setItemName(placename); | |
entry.setHierarchy(hier); | |
for (int idx = 0; idx < fields.size(); idx++) { | |
entry.getIndexData().put(fields.get(idx).name(), d.get(fields.get(idx).name())); | |
} | |
/** | |
* norm the levenstein distance | |
*/ | |
int maxLen = searchString.length() > entry.getItemName().length() ? searchString.length() : entry.getItemName().length(); | |
Double normLev = Math.abs(1 - (sc / (double) maxLen));//searchString.length() / (double) entry.getItemName().length(); | |
/** | |
* only want hits above the levenstein thresh. This should be a low | |
* thresh due to the use of the hierarchy field in the index | |
*/ | |
// if (normLev > scoreCutoff) { | |
if (entry.getItemParentID().toLowerCase().equals(parentid.toLowerCase()) || parentid.toLowerCase().equals("")) { | |
entry.getScoreMap().put("normlucene", normLev); | |
//make sure we don't produce a duplicate | |
if (!linkedData.contains(entry)) { | |
linkedData.add(entry); | |
/** | |
* add the records to the cache for this query | |
*/ | |
GazetteerSearchCache.put(placeNameQueryString, linkedData); | |
} | |
} | |
//} | |
} | |
} catch (IOException | ParseException ex) { | |
LOGGER.error(ex); | |
} | |
return linkedData; | |
} | |
/** | |
* Replaces any noise chars with a space, and depending on configuration adds | |
* double quotes to the string | |
* | |
* @param input | |
* @return | |
*/ | |
private String cleanInput(String input) { | |
String output = input.replaceAll(REGEX_CLEAN, " ").trim(); | |
output = output.replace(" ", " "); | |
if (doubleQuoteAllSearchTerms) { | |
return "\"" + output + "\""; | |
} else { | |
return output; | |
} | |
} | |
private void init() throws Exception { | |
if (opennlpIndex == null) { | |
String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz", ""); | |
if (indexloc.equals("")) { | |
LOGGER.error(new Exception("Opennlp combined Gaz directory location not found")); | |
} | |
opennlpIndex = new MMapDirectory(new File(indexloc)); | |
opennlpReader = DirectoryReader.open(opennlpIndex); | |
opennlpSearcher = new IndexSearcher(opennlpReader); | |
//TODO: a language code switch statement should be employed here at some point | |
opennlpAnalyzer | |
= //new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true)); | |
new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true)); | |
Map<String, Analyzer> analyMap = new HashMap<>(); | |
analyMap.put("countrycode", new KeywordAnalyzer()); | |
analyMap.put("admincode", new KeywordAnalyzer()); | |
analyMap.put("loctype", new KeywordAnalyzer()); | |
analyMap.put("countycode", new KeywordAnalyzer()); | |
analyMap.put("gazsource", new KeywordAnalyzer()); | |
String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff)); | |
String usehierarchy = properties.getProperty("opennlp.geoentitylinker.gaz.hierarchyfield", String.valueOf("0")); | |
if (cutoff != null && !cutoff.isEmpty()) { | |
scoreCutoff = Double.valueOf(cutoff); | |
} | |
if (usehierarchy != null && !usehierarchy.isEmpty()) { | |
useHierarchyField = Boolean.valueOf(usehierarchy); | |
} | |
// opennlp.geoentitylinker.gaz.doublequote=false | |
//opennlp.geoentitylinker.gaz.hierarchyfield=false | |
} | |
} | |
private String formatForHierarchy(String searchTerm) { | |
String[] parts = cleanInput(searchTerm).split(" "); | |
String out = ""; | |
if (parts.length != 0) { | |
for (String string : parts) { | |
out += string + " AND "; | |
} | |
out = out.substring(0, out.lastIndexOf(" AND ")); | |
} else { | |
out = cleanInput(searchTerm); | |
} | |
return out; | |
} | |
} |