blob: 367a0829b5d9c48c297d9738ada89b111ebd0c60 [file] [log] [blame]
/*
* Copyright 2013 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.addons.geoentitylinker;
import opennlp.addons.geoentitylinker.scoring.ModelBasedScorer;
import opennlp.addons.geoentitylinker.scoring.LinkedEntityScorer;
import opennlp.addons.geoentitylinker.scoring.CountryProximityScorer;
import opennlp.addons.geoentitylinker.scoring.GeoHashBinningScorer;
import opennlp.addons.geoentitylinker.scoring.FuzzyStringMatchScorer;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import opennlp.tools.entitylinker.BaseLink;
import opennlp.tools.entitylinker.LinkedSpan;
import opennlp.tools.util.Span;
import opennlp.tools.entitylinker.EntityLinkerProperties;
import opennlp.tools.entitylinker.EntityLinker;
/**
 * Links location entities to the USGS and GeoNames gazetteers, and uses several
 * scoring techniques to enable resolution. The gazetteers are stored in Lucene
 * indexes. The indexes can be built using the GeoEntityLinkerSetupUtils class
 * in this same package.
 * <p>
 * {@link #init(EntityLinkerProperties)} must be called before
 * {@link #find(String, Span[], String[][], Span[][])}. Only that document-level
 * {@code find} overload is implemented; the remaining overloads throw
 * {@link UnsupportedOperationException}.
 */
public class GeoEntityLinker implements EntityLinker<LinkedSpan> {

  /** Message used by every {@code find} overload that is not supported. */
  private static final String UNSUPPORTED_MESSAGE =
      "The GeoEntityLinker requires the entire document for proper scoring. This method is unsupported";

  /** Where clause used when the document context produced none, so the entire index is searched. */
  private static final String SEARCH_ENTIRE_INDEX = " gaztype:* ";

  /**
   * Second argument handed to the gazetteer searcher for every query.
   * NOTE(review): presumably the maximum number of hits returned per query - confirm
   * against GazetteerSearcher.
   */
  private static final int ROWS_PER_SEARCH = 3;

  private AdminBoundaryContextGenerator countryContext;
  // NOTE(review): this field is neither read nor written anywhere in this class.
  private Map<String, Set<Integer>> countryMentions;
  private EntityLinkerProperties linkerProperties;
  private GazetteerSearcher gazateerSearcher;
  private final List<LinkedEntityScorer<AdminBoundaryContext>> scorers = new ArrayList<>();

  /**
   * Links every name span of the document to gazetteer entries and then runs all
   * registered scorers over the resulting spans.
   *
   * @param doctext the full text of the document
   * @param sentences the sentence spans of the document; one entry per sentence
   * @param tokensBySentence the tokens of each sentence, indexed like {@code sentences}
   * @param namesBySentence the name spans of each sentence, indexed like {@code sentences}
   * @return one linked span per name span, in sentence order
   * @throws IllegalArgumentException if no properties have been supplied through
   *         {@link #init(EntityLinkerProperties)}
   */
  @Override
  public List<LinkedSpan> find(String doctext, Span[] sentences, String[][] tokensBySentence, Span[][] namesBySentence) {
    if (linkerProperties == null) {
      throw new IllegalArgumentException("EntityLinkerProperties cannot be null");
    }
    List<LinkedSpan> spans = new ArrayList<>();
    AdminBoundaryContext context = countryContext.getContext(doctext);

    for (int s = 0; s < sentences.length; s++) {
      Span[] names = namesBySentence[s];
      String[] matches = Span.spansToStrings(names, tokensBySentence[s]);
      for (int i = 0; i < matches.length; i++) {
        ArrayList<BaseLink> geoNamesEntries = searchGazetteers(matches[i], context);
        LinkedSpan<BaseLink> newspan = new LinkedSpan<>(geoNamesEntries, names[i], 0);
        newspan.setSearchTerm(matches[i]);
        newspan.setLinkedEntries(geoNamesEntries);
        newspan.setSentenceid(s);
        spans.add(newspan);
      }
    }

    // Scorers see the complete list of spans at once, not one sentence at a time.
    for (LinkedEntityScorer<AdminBoundaryContext> scorer : scorers) {
      scorer.score(spans, doctext, sentences, linkerProperties, context);
    }
    return spans;
  }

  /**
   * Collects the gazetteer entries for one search term, querying once per where
   * clause of the context, or once against the entire index when the context has
   * no where clauses.
   */
  private ArrayList<BaseLink> searchGazetteers(String searchTerm, AdminBoundaryContext context) {
    ArrayList<BaseLink> entries = new ArrayList<>();
    if (context.getWhereClauses().isEmpty()) {
      entries.addAll(gazateerSearcher.find(searchTerm, ROWS_PER_SEARCH, SEARCH_ENTIRE_INDEX));
    } else {
      for (String whereclause : context.getWhereClauses()) {
        entries.addAll(gazateerSearcher.find(searchTerm, ROWS_PER_SEARCH, whereclause));
      }
    }
    return entries;
  }

  /** Registers the default scorers once; does nothing when scorers are already present. */
  private void loadScorers() {
    if (!scorers.isEmpty()) {
      return;
    }
    scorers.add(new GeoHashBinningScorer());
    scorers.add(new CountryProximityScorer());
    scorers.add(new ModelBasedScorer());
    scorers.add(new FuzzyStringMatchScorer());
    // ProvinceProximityScorer is currently not registered.
  }

  /**
   * Prepares this linker for use: builds the context generator and the gazetteer
   * searcher from the given properties and registers the default scorers.
   * <p>
   * The fields are only assigned after every collaborator was created, so a
   * failed call does not leave the linker half-initialised.
   *
   * @param properties the configuration handed to the context generator and searcher
   * @throws RuntimeException wrapping any exception raised while initialising
   */
  @Override
  public void init(EntityLinkerProperties properties) {
    try {
      AdminBoundaryContextGenerator newContext = new AdminBoundaryContextGenerator(properties);
      GazetteerSearcher newSearcher = new GazetteerSearcher(properties);
      loadScorers();
      this.linkerProperties = properties;
      this.countryContext = newContext;
      this.gazateerSearcher = newSearcher;
    } catch (Exception ex) {
      throw new RuntimeException(ex);
    }
  }

  /**
   * Unsupported.
   *
   * @throws UnsupportedOperationException always
   */
  @Override
  public List<LinkedSpan> find(String text, Span[] sentences, Span[] tokens, Span[] nameSpans) {
    throw new UnsupportedOperationException(UNSUPPORTED_MESSAGE);
  }

  /**
   * Unsupported.
   *
   * @throws UnsupportedOperationException always
   */
  @Override
  public List<LinkedSpan> find(String text, Span[] sentences, Span[] tokens, Span[] nameSpans, int sentenceIndex) {
    throw new UnsupportedOperationException(UNSUPPORTED_MESSAGE);
  }

  /**
   * Unsupported.
   *
   * @throws UnsupportedOperationException always
   */
  @Override
  public List<LinkedSpan> find(String text, Span[] sentences, String[] tokens, Span[] nameSpans) {
    throw new UnsupportedOperationException(UNSUPPORTED_MESSAGE);
  }
}