/* | |
* Copyright 2013 The Apache Software Foundation. | |
* | |
* Licensed under the Apache License, Version 2.0 (the "License"); | |
* you may not use this file except in compliance with the License. | |
* You may obtain a copy of the License at | |
* | |
* http://www.apache.org/licenses/LICENSE-2.0 | |
* | |
* Unless required by applicable law or agreed to in writing, software | |
* distributed under the License is distributed on an "AS IS" BASIS, | |
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
* See the License for the specific language governing permissions and | |
* limitations under the License. | |
*/ | |
package opennlp.addons.geoentitylinker; | |
import java.io.IOException; | |
import opennlp.addons.geoentitylinker.scoring.ModelBasedScorer; | |
import opennlp.addons.geoentitylinker.scoring.LinkedEntityScorer; | |
import opennlp.addons.geoentitylinker.scoring.CountryProximityScorer; | |
import opennlp.addons.geoentitylinker.scoring.GeoHashBinningScorer; | |
import opennlp.addons.geoentitylinker.scoring.FuzzyStringMatchScorer; | |
import java.util.ArrayList; | |
import java.util.Collections; | |
import java.util.Comparator; | |
import java.util.HashMap; | |
import java.util.Iterator; | |
import java.util.List; | |
import opennlp.addons.geoentitylinker.scoring.PlacetypeScorer; | |
import opennlp.addons.geoentitylinker.scoring.ProvinceProximityScorer; | |
import opennlp.tools.entitylinker.BaseLink; | |
import opennlp.tools.entitylinker.LinkedSpan; | |
import opennlp.tools.util.Span; | |
import opennlp.tools.entitylinker.EntityLinkerProperties; | |
import opennlp.tools.entitylinker.EntityLinker; | |
/** | |
* Links location entities to the USGS and GeoNames gazetteers, and uses several | |
* scoring techniques to enable resolution. The gazetteers are stored in lucene | |
* indexes. The indexes can be built using the GeoEntityLinkerSetupUtils class | |
* in this same package. | |
*/ | |
public class GeoEntityLinker implements EntityLinker<LinkedSpan> { | |
private static Integer topN = 2; | |
private AdminBoundaryContextGenerator countryContext; | |
private EntityLinkerProperties linkerProperties; | |
private GazetteerSearcher gazateerSearcher; | |
private List<LinkedEntityScorer<AdminBoundaryContext>> scorers = new ArrayList<>(); | |
@Override | |
public List<LinkedSpan> find(String doctext, Span[] sentences, Span[][] tokensBySentence, Span[][] namesBySentence) { | |
ArrayList<LinkedSpan> spans = new ArrayList<LinkedSpan>(); | |
if (linkerProperties == null) { | |
throw new IllegalArgumentException("EntityLinkerProperties cannot be null"); | |
} | |
//countryMentions = countryContext.regexfind(doctext); | |
AdminBoundaryContext context = countryContext.getContext(doctext); | |
for (int s = 0; s < sentences.length; s++) { | |
Span[] names = namesBySentence[s]; | |
Span[] tokenSpans = tokensBySentence[s]; | |
String[] tokens = Span.spansToStrings(tokenSpans, sentences[s].getCoveredText(doctext)); | |
String[] matches = Span.spansToStrings(names, tokens); | |
for (int i = 0; i < matches.length; i++) { | |
ArrayList<BaseLink> geoNamesEntries = new ArrayList<>(); | |
if (!context.getWhereClauses().isEmpty()) { | |
for (String whereclause : context.getWhereClauses()) { | |
ArrayList<GazetteerEntry> find = gazateerSearcher.find(matches[i], topN, whereclause); | |
for (GazetteerEntry gazetteerEntry : find) { | |
if (!geoNamesEntries.contains(gazetteerEntry)) { | |
geoNamesEntries.add(gazetteerEntry); | |
} | |
} | |
} | |
} else {//this means there were no where clauses generated so the where clause will default to look at the entire index | |
ArrayList<GazetteerEntry> find = gazateerSearcher.find(matches[i], topN, " gaztype:usgs geonames regions "); | |
for (GazetteerEntry gazetteerEntry : find) { | |
if (!geoNamesEntries.contains(gazetteerEntry)) { | |
geoNamesEntries.add(gazetteerEntry); | |
} | |
} | |
} | |
if (geoNamesEntries.isEmpty()) { | |
continue; | |
} | |
/** | |
* Normalize the returned scores for this name... this will assist the | |
* sort | |
*/ | |
if (!spans.isEmpty()) { | |
Double maxscore = 0d; | |
for (BaseLink gazetteerEntry : geoNamesEntries) { | |
Double deNormScore = gazetteerEntry.getScoreMap().get("lucene"); | |
if (deNormScore.compareTo(maxscore) > 0) { | |
maxscore = deNormScore; | |
} | |
} | |
for (BaseLink gazetteerEntry : geoNamesEntries) { | |
Double deNormScore = gazetteerEntry.getScoreMap().get("lucene"); | |
Double normalize = normalize(deNormScore, 0d, maxscore); | |
gazetteerEntry.getScoreMap().put("normlucene", normalize); | |
} | |
} | |
LinkedSpan newspan = new LinkedSpan(geoNamesEntries, names[i], 0); | |
newspan.setSearchTerm(matches[i]); | |
newspan.setLinkedEntries(geoNamesEntries); | |
newspan.setSentenceid(s); | |
spans.add(newspan); | |
} | |
} | |
if (!scorers.isEmpty()) { | |
for (LinkedEntityScorer scorer : scorers) { | |
scorer.score(spans, doctext, sentences, linkerProperties, context); | |
} | |
} | |
/** | |
* sort the data with the best score on top based on the sum of the scores | |
* below from the score map for each baselink object | |
*/ | |
for (LinkedSpan<BaseLink> s : spans) { | |
ArrayList<BaseLink> linkedData = s.getLinkedEntries(); | |
Collections.sort(linkedData, Collections.reverseOrder(new Comparator<BaseLink>() { | |
@Override | |
public int compare(BaseLink o1, BaseLink o2) { | |
HashMap<String, Double> o1scoreMap = o1.getScoreMap(); | |
HashMap<String, Double> o2scoreMap = o2.getScoreMap(); | |
if (o1scoreMap.size() != o2scoreMap.size()) { | |
return 0; | |
} | |
double sumo1 = 0d; | |
double sumo2 = 0d; | |
for (String object : o1scoreMap.keySet()) { | |
if (object.equals("typescore") | |
|| object.equals("countrycontext") | |
|| object.equals("placenamedicecoef") | |
|| object.equals("provincecontext") | |
|| object.equals("geohashbin") | |
|| object.equals("normlucene")) { | |
sumo1 += o1scoreMap.get(object); | |
sumo2 += o2scoreMap.get(object); | |
} | |
} | |
return Double.compare(sumo1, | |
sumo2); | |
} | |
})); | |
//prune the list to topN | |
Iterator iterator = linkedData.iterator(); | |
int n = 0; | |
while (iterator.hasNext()) { | |
if (n >= topN) { | |
iterator.remove(); | |
} | |
iterator.next(); | |
n++; | |
} | |
} | |
return spans; | |
} | |
/** | |
* transposes a value within one range to a relative value in a different | |
* range. Used to normalize distances in this class. | |
* | |
* @param valueToNormalize the value to place within the new range | |
* @param minimum the min of the set to be transposed | |
* @param maximum the max of the set to be transposed | |
* @return | |
*/ | |
private Double normalize(Double valueToNormalize, Double minimum, Double maximum) { | |
Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - minimum) + 0; | |
d = d == Double.NaN ? 0d : d; | |
return d; | |
} | |
private void loadScorers() { | |
if (scorers.isEmpty()) { | |
scorers.add(new ProvinceProximityScorer()); | |
scorers.add(new GeoHashBinningScorer()); | |
scorers.add(new CountryProximityScorer()); | |
scorers.add(new ModelBasedScorer()); | |
scorers.add(new FuzzyStringMatchScorer()); | |
scorers.add(new PlacetypeScorer()); | |
} | |
} | |
@Override | |
public void init(EntityLinkerProperties properties) throws IOException { | |
this.linkerProperties = properties; | |
countryContext = new AdminBoundaryContextGenerator(this.linkerProperties); | |
gazateerSearcher = new GazetteerSearcher(this.linkerProperties); | |
String rowsRetStr = this.linkerProperties.getProperty("opennlp.geoentitylinker.gaz.rowsreturned", "2"); | |
Integer rws = 2; | |
try { | |
rws = Integer.valueOf(rowsRetStr); | |
} catch (NumberFormatException e) { | |
rws = 2; | |
} | |
topN = rws; | |
loadScorers(); | |
} | |
@Override | |
public List<LinkedSpan> find(String doctext, Span[] sentences, Span[][] tokensBySentence, | |
Span[][] namesBySentence, int sentenceIndex) { | |
throw new UnsupportedOperationException("The GeoEntityLinker requires the entire document " | |
+ "for proper scoring. This method is unsupported"); | |
} | |
} |