blob: 43be5d52729408f1bc6a9f55127eb398d7f550cf [file] [log] [blame]
/*
* Copyright 2013 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.addons.geoentitylinker;
import java.io.IOException;
import opennlp.addons.geoentitylinker.scoring.ModelBasedScorer;
import opennlp.addons.geoentitylinker.scoring.LinkedEntityScorer;
import opennlp.addons.geoentitylinker.scoring.CountryProximityScorer;
import opennlp.addons.geoentitylinker.scoring.GeoHashBinningScorer;
import opennlp.addons.geoentitylinker.scoring.FuzzyStringMatchScorer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import opennlp.addons.geoentitylinker.scoring.PlacetypeScorer;
import opennlp.addons.geoentitylinker.scoring.ProvinceProximityScorer;
import opennlp.tools.entitylinker.BaseLink;
import opennlp.tools.entitylinker.LinkedSpan;
import opennlp.tools.util.Span;
import opennlp.tools.entitylinker.EntityLinkerProperties;
import opennlp.tools.entitylinker.EntityLinker;
/**
 * Links location entities to the USGS and GeoNames gazetteers, and uses several
 * scoring techniques to enable resolution. The gazetteers are stored in lucene
 * indexes. The indexes can be built using the GeoEntityLinkerSetupUtils class
 * in this same package.
 * <p>
 * {@link #init(EntityLinkerProperties)} must be called before
 * {@link #find(String, Span[], Span[][], Span[][])}; the linker needs the whole
 * document at once, so the per-sentence {@code find} overload is unsupported.
 */
public class GeoEntityLinker implements EntityLinker<LinkedSpan> {

  /** Number of gazetteer rows requested/kept per name when the property is missing or invalid. */
  private static final int DEFAULT_TOP_N = 2;
  /** Query restriction used when the document yields no where clauses; searches the entire index. */
  private static final String DEFAULT_WHERE_CLAUSE = " gaztype:usgs geonames regions ";
  /** Score map key under which the raw lucene score is read. */
  private static final String LUCENE_SCORE_KEY = "lucene";
  /** Score map key under which the normalized lucene score is written. */
  private static final String NORMALIZED_LUCENE_SCORE_KEY = "normlucene";
  /** Score map keys whose values are summed to rank the candidates of a span. */
  private static final String[] RANKING_SCORE_KEYS = {
    "typescore",
    "countrycontext",
    "placenamedicecoef",
    "provincecontext",
    "geohashbin",
    NORMALIZED_LUCENE_SCORE_KEY
  };

  /**
   * Orders links by the ascending sum of their ranking scores. A score that is
   * absent from a link's map counts as zero, which keeps the ordering total and
   * transitive even when two links carry different sets of scores.
   */
  private static final Comparator<BaseLink> BY_SUMMED_SCORE = new Comparator<BaseLink>() {
    @Override
    public int compare(BaseLink o1, BaseLink o2) {
      return Double.compare(sumRankingScores(o1), sumRankingScores(o2));
    }
  };

  // Per-instance on purpose: init() of one linker must not change the pruning of another.
  private int topN = DEFAULT_TOP_N;
  private AdminBoundaryContextGenerator countryContext;
  private EntityLinkerProperties linkerProperties;
  private GazetteerSearcher gazateerSearcher;
  private List<LinkedEntityScorer<AdminBoundaryContext>> scorers = new ArrayList<>();

  /**
   * Looks every name of the document up in the gazetteer, scores the candidates
   * with all loaded scorers and returns, per name, at most {@code topN}
   * candidates ordered best first.
   *
   * @param doctext the full document text
   * @param sentences the sentence spans of the document
   * @param tokensBySentence the token spans, indexed by sentence
   * @param namesBySentence the name spans (in token offsets), indexed by sentence
   * @return one linked span for every name that produced at least one gazetteer hit
   * @throws IllegalArgumentException if the linker has not been initialized with properties
   */
  @Override
  public List<LinkedSpan> find(String doctext, Span[] sentences, Span[][] tokensBySentence, Span[][] namesBySentence) {
    if (linkerProperties == null) {
      throw new IllegalArgumentException("EntityLinkerProperties cannot be null");
    }
    List<LinkedSpan> spans = new ArrayList<LinkedSpan>();
    AdminBoundaryContext context = countryContext.getContext(doctext);

    for (int s = 0; s < sentences.length; s++) {
      Span[] names = namesBySentence[s];
      Span[] tokenSpans = tokensBySentence[s];
      String[] tokens = Span.spansToStrings(tokenSpans, sentences[s].getCoveredText(doctext));
      String[] matches = Span.spansToStrings(names, tokens);

      for (int i = 0; i < matches.length; i++) {
        ArrayList<BaseLink> geoNamesEntries = searchGazetteer(matches[i], context);
        if (geoNamesEntries.isEmpty()) {
          continue;
        }
        // Normalize for every name (including the first one found) so that all
        // candidates carry a "normlucene" score when they are ranked.
        normalizeLuceneScores(geoNamesEntries);

        LinkedSpan<BaseLink> newspan = new LinkedSpan<BaseLink>(geoNamesEntries, names[i], 0);
        newspan.setSearchTerm(matches[i]);
        newspan.setLinkedEntries(geoNamesEntries);
        newspan.setSentenceid(s);
        spans.add(newspan);
      }
    }

    for (LinkedEntityScorer<AdminBoundaryContext> scorer : scorers) {
      scorer.score(spans, doctext, sentences, linkerProperties, context);
    }

    rankAndPrune(spans);
    return spans;
  }

  /**
   * Queries the gazetteer for a name, once per where clause of the context, or
   * once against the entire index when the context produced no where clauses.
   *
   * @param name the name to search for
   * @param context the administrative boundary context of the document
   * @return the distinct gazetteer hits in the order they were first returned
   */
  private ArrayList<BaseLink> searchGazetteer(String name, AdminBoundaryContext context) {
    ArrayList<BaseLink> entries = new ArrayList<>();
    if (context.getWhereClauses().isEmpty()) {
      addDistinct(entries, gazateerSearcher.find(name, topN, DEFAULT_WHERE_CLAUSE));
    } else {
      for (String whereclause : context.getWhereClauses()) {
        addDistinct(entries, gazateerSearcher.find(name, topN, whereclause));
      }
    }
    return entries;
  }

  /** Appends every element of {@code found} that {@code target} does not contain yet. */
  private static void addDistinct(List<BaseLink> target, List<? extends BaseLink> found) {
    for (BaseLink entry : found) {
      if (!target.contains(entry)) {
        target.add(entry);
      }
    }
  }

  /**
   * Stores, for every entry, its lucene score rescaled to [0, max] -> [0, 1]
   * under the "normlucene" key. An entry without a lucene score is treated as
   * having a score of zero.
   */
  private void normalizeLuceneScores(List<BaseLink> entries) {
    double maxScore = 0d;
    for (BaseLink entry : entries) {
      Double rawScore = entry.getScoreMap().get(LUCENE_SCORE_KEY);
      if (rawScore != null && Double.compare(rawScore, maxScore) > 0) {
        maxScore = rawScore;
      }
    }
    for (BaseLink entry : entries) {
      Double rawScore = entry.getScoreMap().get(LUCENE_SCORE_KEY);
      double score = rawScore == null ? 0d : rawScore;
      entry.getScoreMap().put(NORMALIZED_LUCENE_SCORE_KEY, normalize(score, 0d, maxScore));
    }
  }

  /**
   * Sorts the linked entries of every span with the highest summed score on top
   * and then drops everything after the first {@code topN} entries.
   */
  @SuppressWarnings("unchecked") // spans holds raw LinkedSpan because the EntityLinker contract is raw
  private void rankAndPrune(List<LinkedSpan> spans) {
    for (LinkedSpan<BaseLink> span : spans) {
      ArrayList<BaseLink> linkedData = span.getLinkedEntries();
      Collections.sort(linkedData, Collections.reverseOrder(BY_SUMMED_SCORE));
      if (linkedData.size() > topN) {
        // clearing the tail view removes the lowest ranked entries in one step
        linkedData.subList(topN, linkedData.size()).clear();
      }
    }
  }

  /** Sums the ranking scores present in the link's score map; missing scores count as zero. */
  private static double sumRankingScores(BaseLink link) {
    HashMap<String, Double> scoreMap = link.getScoreMap();
    double sum = 0d;
    for (String key : RANKING_SCORE_KEYS) {
      Double score = scoreMap.get(key);
      if (score != null) {
        sum += score;
      }
    }
    return sum;
  }

  /**
   * Transposes a value within one range to a relative value in the range
   * [0, 1]. Used to normalize the lucene scores in this class.
   *
   * @param valueToNormalize the value to place within the new range
   * @param minimum the min of the set to be transposed
   * @param maximum the max of the set to be transposed
   * @return the position of the value between minimum and maximum, or 0 when
   * the range is empty or the result is not a number
   */
  private Double normalize(Double valueToNormalize, Double minimum, Double maximum) {
    double range = maximum - minimum;
    if (range == 0d) {
      // avoids 0/0 (NaN) and x/0 (infinity) when all scores are identical
      return 0d;
    }
    double normalized = (valueToNormalize - minimum) / range;
    // NaN never equals anything, itself included, so it has to be tested with isNaN
    return Double.isNaN(normalized) ? 0d : normalized;
  }

  /** Registers the scorers applied by find; does nothing when scorers are already present. */
  private void loadScorers() {
    if (scorers.isEmpty()) {
      scorers.add(new ProvinceProximityScorer());
      scorers.add(new GeoHashBinningScorer());
      scorers.add(new CountryProximityScorer());
      scorers.add(new ModelBasedScorer());
      scorers.add(new FuzzyStringMatchScorer());
      scorers.add(new PlacetypeScorer());
    }
  }

  /**
   * Creates the context generator and the gazetteer searcher from the
   * properties, reads the number of rows to return per name from
   * {@code opennlp.geoentitylinker.gaz.rowsreturned} and loads the scorers.
   *
   * @param properties the entity linker configuration
   * @throws IOException declared by the EntityLinker contract; presumably raised
   * when the gazetteer resources cannot be opened
   */
  @Override
  public void init(EntityLinkerProperties properties) throws IOException {
    this.linkerProperties = properties;
    countryContext = new AdminBoundaryContextGenerator(this.linkerProperties);
    gazateerSearcher = new GazetteerSearcher(this.linkerProperties);
    String rowsRetStr = this.linkerProperties.getProperty("opennlp.geoentitylinker.gaz.rowsreturned", "2");
    topN = parseTopN(rowsRetStr);
    loadScorers();
  }

  /**
   * Parses the configured row count.
   *
   * @param value the raw property value, may be null
   * @return the parsed value, or the default of 2 when the value is missing,
   * not a number or smaller than one
   */
  private static int parseTopN(String value) {
    if (value == null) {
      return DEFAULT_TOP_N;
    }
    try {
      int rows = Integer.parseInt(value.trim());
      // a non-positive row count can neither be searched for nor pruned to
      return rows < 1 ? DEFAULT_TOP_N : rows;
    } catch (NumberFormatException e) {
      return DEFAULT_TOP_N;
    }
  }

  /**
   * Unsupported, the linker scores against the entire document.
   *
   * @throws UnsupportedOperationException always
   */
  @Override
  public List<LinkedSpan> find(String doctext, Span[] sentences, Span[][] tokensBySentence,
          Span[][] namesBySentence, int sentenceIndex) {
    throw new UnsupportedOperationException("The GeoEntityLinker requires the entire document "
            + "for proper scoring. This method is unsupported");
  }
}