blob: 6e4bd60d13d63f1abadf42b09bde45d3e3fe97c4 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.entitylinking.impl;
import static org.apache.stanbol.enhancer.engines.entitylinking.impl.Suggestion.ENTITY_RANK_COMPARATOR;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.Set;
import java.util.TreeMap;
import org.apache.clerezza.rdf.core.PlainLiteral;
import org.apache.clerezza.rdf.core.Triple;
import org.apache.clerezza.rdf.core.TripleCollection;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.clerezza.rdf.core.impl.TripleImpl;
import org.apache.commons.lang.StringUtils;
import org.apache.stanbol.enhancer.engines.entitylinking.Entity;
import org.apache.stanbol.enhancer.engines.entitylinking.EntitySearcher;
import org.apache.stanbol.enhancer.engines.entitylinking.EntitySearcherException;
import org.apache.stanbol.enhancer.engines.entitylinking.LabelTokenizer;
import org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig;
import org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.RedirectProcessingMode;
import org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig;
import org.apache.stanbol.enhancer.engines.entitylinking.impl.Suggestion.MATCH;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.nlp.model.Section;
import org.apache.stanbol.enhancer.nlp.model.Token;
import org.apache.stanbol.enhancer.servicesapi.rdf.NamespaceEnum;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class EntityLinker {
private static final int MIN_SEARCH_LIMIT = 10;
private final Logger log = LoggerFactory.getLogger(EntityLinker.class);
private final EntityLinkerConfig linkerConfig;
private final LanguageProcessingConfig textProcessingConfig;
//private final AnalysedText analysedText;
private final EntitySearcher entitySearcher;
/**
* The state of the current processing
*/
private final ProcessingState state;
/**
* The map holding the results of the linking process
*/
private final Map<String,LinkedEntity> linkedEntities = new HashMap<String,LinkedEntity>();
//private Integer lookupLimit;
private LabelTokenizer labelTokenizer;
private LinkingStateAware linkingStateAware;
private int minSearchResults;
//Language configuration
final String documentLang;
final String defaultLang;
final String documentMainLang;
private Statistic textProcessingStats = new Statistic("Text Processing");
private Statistic lookupStats = new Statistic("Vocabulary Lookup");
private int numQueryResults = 0;
private int numFilteredResults = 0;
private Statistic matchingStats = new Statistic("Label Matching");
private Statistic rankingStats = new Statistic("Suggestion Ranking");
// private Statistic test = new Statistic("test1");
// private Statistic test2_ = new Statistic("test2");
private int numLabels = 0;
private long processingTime = -1;
public EntityLinker(AnalysedText analysedText, String language,
LanguageProcessingConfig textProcessingConfig,
EntitySearcher entitySearcher,
EntityLinkerConfig linkerConfig,
LabelTokenizer labelTokenizer) {
this(analysedText,language,textProcessingConfig,entitySearcher,linkerConfig,labelTokenizer,null);
}
public EntityLinker(AnalysedText analysedText, String language,
LanguageProcessingConfig textProcessingConfig,
EntitySearcher entitySearcher,
EntityLinkerConfig linkerConfig,
LabelTokenizer labelTokenizer, LinkingStateAware linkingStateAware) {
//this.analysedText = analysedText;
this.entitySearcher = entitySearcher;
this.linkerConfig = linkerConfig;
this.textProcessingConfig = textProcessingConfig;
this.labelTokenizer = labelTokenizer;
this.state = new ProcessingState(analysedText,language,textProcessingConfig);
minSearchResults = entitySearcher.getLimit() == null ? MIN_SEARCH_LIMIT :
Math.max(MIN_SEARCH_LIMIT,entitySearcher.getLimit());
//this.lookupLimit = Math.max(minResults,linkerConfig.getMaxSuggestions()*3);
this.linkingStateAware = linkingStateAware;
//init the language settings
this.documentLang = state.getLanguage();
this.defaultLang = linkerConfig.getDefaultLanguage();
int countryCodeIndex = documentLang == null ? -1 : documentLang.indexOf('-');
if(countryCodeIndex >= 2){
documentMainLang = documentLang.substring(0,countryCodeIndex);
} else {
documentMainLang = null;
}
}
/**
* Steps over the sentences, chunks, tokens of the {@link #sentences}
*/
public void process() throws EntitySearcherException {
long startTime = System.currentTimeMillis();
//int debugedIndex = 0;
Section sentence = null;
textProcessingStats.begin();
while(state.next()) {
//STANBOL-1070: added linkingStateAware callbacks for components that
// need to react on the state of the Linking process
if(linkingStateAware != null){
if(!state.getSentence().equals(sentence)){
if(sentence != null){
linkingStateAware.endSection(sentence);
}
sentence = state.getSentence(); //set the next sentence
linkingStateAware.startSection(sentence); //notify its start
}
linkingStateAware.startToken(state.getToken().token); //notify the current token
}
TokenData token = state.getToken();
if(log.isDebugEnabled()){
log.debug("--- preocess Token {}: {} (lemma: {}) linkable={}, matchable={} | chunk: {}",
new Object[]{token.index,token.getTokenText(),token.getTokenLemma(),
token.isLinkable, token.isMatchable, token.inChunk != null ?
(token.inChunk.chunk + " "+ token.inChunk.chunk.getSpan()) : "none"});
}
List<TokenData> searchStrings = new ArrayList<TokenData>(linkerConfig.getMaxSearchTokens());
getSearchString(token);
searchStrings.add(token);
//Determine the range we are allowed to search for tokens
final int minIncludeIndex;
final int maxIndcludeIndex;
//NOTE: testing has shown that using Chunks to restrict search for
// additional matchable tokens does have an negative impact on
// recall. Because of that this restriction is for now deactivated
//TODO: maybe make configurable via an own property
boolean restrirctContextByChunks = textProcessingConfig.isIgnoreChunks();
int consumedIndex = state.getConsumedIndex();
if(token.inChunk != null && !textProcessingConfig.isIgnoreChunks() &&
restrirctContextByChunks){
minIncludeIndex = token.inChunk.getStartTokenIndex();
// minIncludeIndex = Math.max(
// state.getConsumedIndex()+1,
// token.inChunk.getStartTokenIndex());
maxIndcludeIndex = token.inChunk.getEndTokenIndex();
} else {
maxIndcludeIndex = state.getTokens().size() - 1;
// minIncludeIndex = state.getConsumedIndex() + 1;
minIncludeIndex = 0;
}
int prevIndex = token.index;
int pastIndex = token.index;
int pastNonMatchable = 0;
int prevNonMatchable = 0;
int distance = 0;
do {
distance++;//keep track of the distance
//get the past token at the given distance (However ignore
//non AlphaNumeric tokens when calculating the distance)
pastIndex++;
TokenData pastToken = null;
while(pastToken == null && maxIndcludeIndex >= pastIndex &&
pastNonMatchable <= 1){
TokenData td = state.getTokens().get(pastIndex);
if(td.hasAlphaNumeric){
pastToken = td;
} else {
pastIndex++;
}
}
//get the previous token at the given distance (However ignore
//non AlphaNumeric tokens when calculating the distance)
prevIndex--;
TokenData prevToken = null;
while(prevToken == null && minIncludeIndex <= prevIndex &&
//allow one nonMatchable token if prevIndex > the last
//consumed one and zero nonMatchable if prevIndex is <=
//the last consumed one
((prevIndex > consumedIndex && prevNonMatchable <= 1) ||
prevIndex <= consumedIndex && prevNonMatchable < 1)){
TokenData td = state.getTokens().get(prevIndex);
if(td.hasAlphaNumeric){
prevToken = td;
} else {
prevIndex--;
}
}
//now that we know the tokens at this distance check if they are matchable
//Fist the past token
if(pastToken != null){
if(log.isDebugEnabled()){
log.debug(" {} {}:'{}' (lemma: {}) linkable={}, matchable={}",new Object[]{
pastToken.isMatchable? '+':'-',pastToken.index,
pastToken.getTokenText(), pastToken.getTokenLemma(),
pastToken.isLinkable, pastToken.isMatchable
});
}
if(pastToken.isMatchable){
searchStrings.add(pastToken);
} else {
pastNonMatchable++;
}
}
//Second in the previous token
if(prevToken != null){
if(log.isDebugEnabled()){
log.debug(" {} {}:'{}' (lemma: {}) linkable={}, matchable={}",new Object[]{
prevToken.isMatchable? '+':'-',prevToken.index,
prevToken.getTokenText(), prevToken.getTokenLemma(),
prevToken.isLinkable, prevToken.isMatchable
});
}
if(prevToken.isMatchable){
getSearchString(prevToken);
searchStrings.add(0,prevToken);
} else {
prevNonMatchable++;
}
}
} while(searchStrings.size() < linkerConfig.getMaxSearchTokens() && distance <
linkerConfig.getMaxSearchDistance() &&
(prevIndex > minIncludeIndex || pastIndex < maxIndcludeIndex) &&
(prevNonMatchable <= 1 || pastNonMatchable <= 1));
//we might have an additional element in the list
if(searchStrings.size() > linkerConfig.getMaxSearchTokens()){
searchStrings = searchStrings.subList( //the last part of the list
searchStrings.size()-linkerConfig.getMaxSearchTokens(),
searchStrings.size());
}
if(log.isDebugEnabled()){
List<String> list = new ArrayList<String>(searchStrings.size());
for(TokenData dt : searchStrings){
list.add(dt.token.getSpan());
}
log.debug(" >> searchStrings {}",list);
}
textProcessingStats.complete();
//search for Entities
List<Suggestion> suggestions = lookupEntities(searchStrings);
//Treat partial matches that do match more as the best FULL match
//differently
List<Suggestion> partialMatches = new ArrayList<Suggestion>();
if(!suggestions.isEmpty()){
rankingStats.begin();
//update the suggestions based on the best match
int bestMatchCount = suggestions.get(0).getLabelMatch().getMatchCount();
Iterator<Suggestion> it = suggestions.iterator();
while(it.hasNext()){
Suggestion suggestion = it.next();
//suggestions that match less tokens as the best match
//need to be updated to PARTIAL
int matchCount = suggestion.getLabelMatch().getMatchCount();
if(matchCount < bestMatchCount){
suggestion.setMatch(MATCH.PARTIAL);
} else if( matchCount > bestMatchCount){ //selects more tokens
partialMatches.add(suggestion); //but only a PARTIAL MATCH
it.remove(); //remove from the main suggestion list
}
//Filter matches with less than config.getMinFoundTokens()
//if matchcount is less than of the best match
if(matchCount < bestMatchCount &&
matchCount < linkerConfig.getMinFoundTokens()){
it.remove();
} else { //calculate the score
//how good is the current match in relation to the best one
double spanScore = matchCount >= bestMatchCount ? 1.0d :
matchCount/(double)bestMatchCount;
suggestion.setScore(spanScore*spanScore*suggestion.getLabelMatch().getMatchScore());
}
}
Suggestion oldBestRanked = suggestions.get(0); //for debugging
//resort by score
Collections.sort(suggestions, Suggestion.SCORE_COMPARATOR);
Collections.sort(partialMatches, Suggestion.SCORE_COMPARATOR);
//this should never happen ... but the
//matchcount of the best match MUST NOT change
//after the sort by score!
if(bestMatchCount != suggestions.get(0).getLabelMatch().getMatchCount()){
log.warn("The match count for the top Ranked Suggestion for {} " +
"changed after resorting based on Scores!",
state.getTokenText(suggestions.get(0).getLabelMatch().getStart(),bestMatchCount));
log.warn(" originalbest : {}",oldBestRanked);
log.warn(" currnet ranking : {}",suggestions);
log.warn(" ... this will result in worng confidence values relative to the best match");
}
//adapt equals rankings based on the entity rank
if(linkerConfig.isRankEqualScoresBasedOnEntityRankings()){
adaptScoresForEntityRankings(suggestions);
adaptScoresForEntityRankings(partialMatches);
}
//remove all suggestions > config.maxSuggestions
if(suggestions.size() > linkerConfig.getMaxSuggestions()){
suggestions.subList(linkerConfig.getMaxSuggestions(),suggestions.size()).clear();
}
if(log.isDebugEnabled()){
log.debug(" >> Suggestions:");
int i=0;
for(Suggestion s : suggestions){
log.debug(" - {}: {}",i,s);
i++;
}
}
//process redirects
if(linkerConfig.getRedirectProcessingMode() != RedirectProcessingMode.IGNORE){
for(Suggestion suggestion : suggestions){
processRedirects(suggestion);
}
for(Suggestion suggestion : partialMatches){
processRedirects(suggestion);
}
}
//create LinkedEntities for the main suggestions
int start = suggestions.get(0).getLabelMatch().getStart();
int span = suggestions.get(0).getLabelMatch().getSpan();
//Store the linking results
String selectedText = state.getTokenText(start,span);
//float score;
LinkedEntity linkedEntity = linkedEntities.get(selectedText);
if(linkedEntity == null){
linkedEntity = new LinkedEntity(selectedText,
suggestions, getLinkedEntityTypes(suggestions));
linkedEntities.put(selectedText, linkedEntity);
} // else Assumption: The list of suggestions is the SAME
linkedEntity.addOccurrence(state.getSentence(),
//NOTE: The end Token is "start+span-1"
state.getTokens().get(start).token, state.getTokens().get(start+span-1).token);
//In case of a FULL or EXACT MATCH we can set the next token to process to the next
//word after the currently found suggestion
if(suggestions.get(0).getMatch().ordinal() >= MATCH.FULL.ordinal()){
state.setConsumed(start+span-1);
}
//create LinkedEntities for partial matches
//TODO: maybe we need to group partial matches based on their
// selected Tokens and only group those suggestions that do
// select the same span in the Text. Currently all are grouped
// based on those that does select the most tokens.
if(!partialMatches.isEmpty()){
start = partialMatches.get(0).getLabelMatch().getStart();
span = partialMatches.get(0).getLabelMatch().getSpan();
selectedText = state.getTokenText(start, span);
linkedEntity = linkedEntities.get(selectedText);
if(linkedEntity == null){
linkedEntity = new LinkedEntity(selectedText,
partialMatches, getLinkedEntityTypes(suggestions));
linkedEntities.put(selectedText, linkedEntity);
} // else Assumption: The list of suggestions is the SAME
linkedEntity.addOccurrence(state.getSentence(),
//NOTE: The end Token is "start+span-1"
state.getTokens().get(start).token, state.getTokens().get(start+span-1).token);
}
rankingStats.complete();
} // else suggestions are empty
if(linkingStateAware != null){
linkingStateAware.endToken(state.getToken().token);
}
textProcessingStats.begin();
}
textProcessingStats.cancel(); //do not count the last call
if(linkingStateAware != null && sentence != null){
linkingStateAware.endSection(sentence);
}
this.processingTime = System.currentTimeMillis()-startTime;
}
/**
* @param suggestions
*/
private void adaptScoresForEntityRankings(List<Suggestion> suggestions) {
List<Suggestion> equalScoreList = new ArrayList<Suggestion>(4);
double score = 2f;
for(Suggestion s : suggestions){
double actScore = s.getScore();
if(score == actScore){
equalScoreList.add(s);
} else {
if(equalScoreList.size() > 1){
adaptScoreForEntityRankings(equalScoreList, actScore);
}
score = actScore;
equalScoreList.clear();
equalScoreList.add(s);
}
}
if(equalScoreList.size() > 1){
adaptScoreForEntityRankings(equalScoreList,0);
}
//resort by score
Collections.sort(suggestions, Suggestion.SCORE_COMPARATOR);
}
/**
* Helper that extracts the
* @param token
*/
private String getSearchString(TokenData token) {
String searchString = linkerConfig.isLemmaMatching() ? token.getTokenLemma() :
token.getTokenText();
if(searchString == null){
searchString = token.getTokenText();
}
return searchString;
}
/**
* This method slightly adapts scores of Suggestions based on the Entity ranking.
* It is used for Suggestions that would have the exact same score (e.g. 1.0) to
* ensure ordering of the suggestions based on the rankings of the Entities
* within the knowledge base linked against
* @param equalScoreList Entities with the same {@link Suggestion#getScore()}
* values. If this is not the case this method will change scores in unintended
* ways
* @param nextScore the score of the {@link Suggestion} with a lower score as the
* list of suggestions parsed in the first parameter
*/
private void adaptScoreForEntityRankings(List<Suggestion> equalScoreList, double nextScore) {
double score = equalScoreList.get(0).getScore();
log.debug(" > Adapt Score of multiple Suggestions "
+ "with '{}' based on EntityRanking",score);
//Adapt the score to reflect the entity ranking
//but do not change order with entities of different
//score. Also do not change the score more that 0.1
//TODO: make the max change (0.1) configurable
double dif = (Math.min(0.1, score-nextScore))/equalScoreList.size();
Collections.sort(equalScoreList,ENTITY_RANK_COMPARATOR);
log.debug(" - keep socre of {} at {}", equalScoreList.get(0).getEntity().getId(), score);
for(int i=1;i<equalScoreList.size();i++){
score = score-dif;
if(ENTITY_RANK_COMPARATOR.compare(equalScoreList.get(i-1),
equalScoreList.get(i)) != 0){
equalScoreList.get(i).setScore(score);
log.debug(" - set score of {} at {}", equalScoreList.get(i).getEntity().getId(), score);
} else {
double lastScore = equalScoreList.get(i-1).getScore();
equalScoreList.get(i).setScore(lastScore);
log.debug(" - set score of {} at {}", equalScoreList.get(i).getEntity().getId(), lastScore);
}
}
}
/**
* After {@link #process()}ing this returns the entities linked for the
* parsed {@link AnalysedContent}.
* @return the linked entities
*/
public final Map<String,LinkedEntity> getLinkedEntities() {
return linkedEntities;
}
/**
* Retrieves all {@link EntitySearcher#getTypeField()} values of the parsed
* {@link Suggestion}s and than lookup the {@link NamespaceEnum#dcTerms dc}:type
* values for the {@link LinkedEntity#getTypes()} by using the configured
* {@link EntityLinkerConfig#getTypeMappings() types mappings} (and if
* no mapping is found the {@link EntityLinkerConfig#getDefaultDcType()
* default} type.
* @param conceptTypes The list of suggestions
* @return the types values for the {@link LinkedEntity}
*/
private Set<UriRef> getLinkedEntityTypes(Collection<Suggestion> suggestions){
Collection<UriRef> conceptTypes = new HashSet<UriRef>();
double score = -1; //only consider types of the best ranked Entities
for(Suggestion suggestion : suggestions){
double actScore = suggestion.getScore();
if(actScore < score){
break;
}
for(Iterator<UriRef> types =
suggestion.getEntity().getReferences(linkerConfig.getTypeField());
types.hasNext();conceptTypes.add(types.next()));
}
Map<UriRef,UriRef> typeMappings = linkerConfig.getTypeMappings();
Set<UriRef> dcTypes = new HashSet<UriRef>();
for(UriRef conceptType : conceptTypes){
UriRef dcType = typeMappings.get(conceptType);
if(dcType != null){
dcTypes.add(dcType);
}
}
if(dcTypes.isEmpty() && linkerConfig.getDefaultDcType() != null){
dcTypes.add(linkerConfig.getDefaultDcType());
}
return dcTypes;
}
/**
* Processes {@link EntitySearcher#getRedirectField() redirect field} values for
* the parsed suggestions based on the {@link RedirectProcessingMode}
* as configured in the {@link #config}.<p>
* The results of this method are stored within the parsed {@link Suggestion}s
* @param suggestion The suggestion to process.
* @throws EntitySearcherException
*/
private void processRedirects(Suggestion suggestion) throws EntitySearcherException {
//if mode is IGNORE -> nothing to do
if(linkerConfig.getRedirectProcessingMode() == RedirectProcessingMode.IGNORE){
return;
}
//in case results for queries are locally cached it might be the case
//that some/all of the results do already redirects processed.
//therefore there is a small internal state that stores this information
if(suggestion.isRedirectedProcessed()){
return; //Redirects for ResultMatch are already processed ... ignore
}
Entity result = suggestion.getResult();
Iterator<UriRef> redirects = result.getReferences(linkerConfig.getRedirectField());
switch (linkerConfig.getRedirectProcessingMode()) {
case ADD_VALUES:
TripleCollection entityData = result.getData();
UriRef entityUri = result.getUri();
while(redirects.hasNext()){
UriRef redirect = redirects.next();
if(redirect != null){
Entity redirectedEntity = entitySearcher.get(redirect,
linkerConfig.getSelectedFields());
if(redirectedEntity != null){
for(Iterator<Triple> data = redirectedEntity.getData().filter(
redirectedEntity.getUri(), null, null);data.hasNext();){
Triple t = data.next();
entityData.add(new TripleImpl(entityUri,t.getPredicate(),t.getObject()));
}
}
//set that the redirects where searched for this result
suggestion.setRedirectProcessed(true);
}
}
case FOLLOW:
while(redirects.hasNext()){
UriRef redirect = redirects.next();
if(redirect != null){
Entity redirectedEntity = entitySearcher.get(redirect,
linkerConfig.getSelectedFields());
if(redirectedEntity != null){
suggestion.setRedirect(redirectedEntity);
}
}
}
default: //nothing to do
}
}
/**
* Searches for Entities in the {@link #entitySearcher} corresponding to the
* {@link Token#getText() words} of the current {@link #state position} in
* the text.
* @param searchTokens the list of {@link Token#getText() words} to search
* entities for.
* @return The sorted list with the suggestions.
* If there are no suggestions an empty list will be returned.
* @throws EntitySearcherException
*/
private List<Suggestion> lookupEntities(List<TokenData> searchTokens) throws EntitySearcherException {
Set<String> languages = new HashSet<String>();
languages.add(linkerConfig.getDefaultLanguage());
languages.add(state.getLanguage());
int countryCodeIndex = state.getLanguage() == null ? -1 : state.getLanguage().indexOf('-');
if(countryCodeIndex >= 2){
languages.add(state.getLanguage().substring(0,countryCodeIndex));
}
List<String> searchStrings = new ArrayList<String>(searchTokens.size());
for(Iterator<TokenData> it = searchTokens.iterator();it.hasNext();){
searchStrings.add(getSearchString(it.next()));
}
String[] languageArray = languages.toArray(new String[languages.size()]);
List<Suggestion> suggestions = new ArrayList<Suggestion>();
//perform the lookup with the parsed parameter
int numResults = performLookup(searchStrings, languageArray, suggestions, searchTokens);
//if no match where found in the result .. fallback to a search for the
//current token
if(suggestions.isEmpty() && numResults > 0 && searchStrings.size() > 1){
//there where results, but no one matched ...
// ... it is most likely a case where the used search terms are
// not releated. So try to query for the active token only
searchTokens = Collections.singletonList(state.getToken());
log.debug(" > No match for '{}' searchStrings ... ", searchStrings);
searchStrings = Collections.singletonList(state.getToken().token.getSpan());
log.debug(" ... fallback to search for active token '{}' ...",searchStrings);
performLookup(searchStrings, languageArray, suggestions, searchTokens);
}
//sort the suggestions
if(suggestions.size()>1){
Collections.sort(suggestions,Suggestion.MATCH_TYPE_SUGGESTION_COMPARATOR);
}
return suggestions;
}
/**
* @param searchStrings
* @param languageArray
* @param suggestions
* @param searchTokens
* @return
* @throws EntitySearcherException
*/
private int performLookup(List<String> searchStrings, String[] languageArray,
List<Suggestion> suggestions, List<TokenData> searchTokens) throws EntitySearcherException {
int minProcessedResults = linkerConfig.getMaxSuggestions()*3;
int lookupLimit = Math.max(MIN_SEARCH_LIMIT, linkerConfig.getMaxSuggestions()*2*searchTokens.size());
int maxResults = lookupLimit*2;
int offset = 0;
int numFiltered = 0;
boolean moreResultsAvailable = true;
int numResults = 0;
//search for entities until
// (1) we have more as MAX_SUGGESTION results
// (2) no more results are available
// (3) the number of processed Entities is smaller as two times the
// suggestions
// (4) the number of requested Entities is smaller as two times the
// lookup limit.
//NOTE: making multiple requests can decrease the performance a lot.
// Because of that those limits assure that no more than two
// requests are made for the same lookup.
while(suggestions.size() < linkerConfig.getMaxSuggestions() &&
moreResultsAvailable && (numResults-numFiltered) < (minProcessedResults) &&
numResults < maxResults){
Collection<? extends Entity> results;
log.debug(" > request entities [{}-{}] entities ...",offset,(offset+lookupLimit));
lookupStats.begin(); //keep statistics
results = entitySearcher.lookup(linkerConfig.getNameField(),
linkerConfig.getSelectedFields(), searchStrings, languageArray,
lookupLimit, offset);
lookupStats.complete();
log.debug(" < found {} entities ...",results.size());
//queries might return more as the requested results
moreResultsAvailable = results.size() >= lookupLimit;
numResults = numResults + results.size();
offset = numResults;
matchingStats.begin();
numFiltered = numFiltered + processLookupResults(searchTokens, results, suggestions);
matchingStats.complete();
//sort the suggestions
}
return numResults;
}
/**
* Processes the parsed entity lookup results and adds suggestions to the
* parsed suggestion list
* @param results the results
* @param suggestions the suggestions
* @return the number of filtered results
*/
private int processLookupResults(List<TokenData> searchTokens, Collection<? extends Entity> results, List<Suggestion> suggestions) {
int numFiltered = 0;
for(Entity result : results){
if(log.isDebugEnabled()){
log.debug(" > {} (ranking: {})",result.getId(),result.getEntityRanking());
}
numQueryResults++;
//white/black list based entity type filtering (STANBOL-1111)
boolean filtered = false;
if(linkerConfig.isEntityTypeFilteringActive()){
filtered = filterEntity(result.getReferences(linkerConfig.getTypeField()));
}
if(!filtered){
Suggestion suggestion = matchLabels(searchTokens, result);
if(suggestion.getMatch() != MATCH.NONE){
if(log.isDebugEnabled()){
log.debug(" + {}",suggestion);
}
suggestions.add(suggestion);
} else {
log.debug(" - no match");
}
} else {//do not process Entities with a filtered type
numFilteredResults++; //global statistics
numFiltered++;
}
}
return numFiltered;
}
public boolean filterEntity(Iterator<UriRef> entityTypes){
Map<UriRef, Integer> whiteList = linkerConfig.getWhitelistedTypes();
Map<UriRef, Integer> blackList = linkerConfig.getBlacklistedTypes();
Integer w = null;
Integer b = null;
while(entityTypes.hasNext()){
UriRef type = entityTypes.next();
Integer act = whiteList.get(type);
if(act != null){
if(w == null || act.compareTo(w) < 0){
w = act;
}
if(act.intValue() == 0){
break;
}
}
act = blackList.get(type);
if(act != null){
if(b == null || act.compareTo(b) < 0){
b = act;
}
if(act.intValue() == 0){
break;
}
}
}
if(w == null && b == null){
return !linkerConfig.isDefaultWhitelistTypes();
} else if(w != null){
return b == null || w.compareTo(b) < 0 ? false : true;
} else { //w == null && b != null
return true; //filter
}
}
/**
* Matches the labels of the parsed {@link Representation} with the Tokens of
* the texts (beginning with the currently active
* {@link ProcessingState#getToken() token}).<p>
* The field used to get the labels is retrieved from
* {@link EntitySearcher#getNameField()}. Only labels with no language or the
* language of the current sentence are considered. If less than
* {@link EntityLinkerConfig#getMinFoundTokens()} tokens match with an
* label the Concept is only considered to match if the label is
* {@link String#equalsIgnoreCase(String)} to the text covered by the
* matched token(s). Otherwise also {@link MATCH#FULL} and {@link MATCH#PARTIAL}
* results are allowed.
* @param entity The entity including at least the data for the
* {@link EntitySearcher#getNameField()} property.
* @return The result of the matching.
*/
private Suggestion matchLabels(List<TokenData> searchTokens, Entity entity) {
String curLang = documentLang; //language of the current sentence
String defLang = defaultLang; //configured default language
String mainLang = documentMainLang;
Collection<PlainLiteral> mainLangLabels;
if(documentMainLang != null){
mainLang = documentMainLang;
mainLangLabels = new ArrayList<PlainLiteral>();
} else {
mainLang = documentLang;
mainLangLabels = Collections.emptyList();
}
Iterator<PlainLiteral> labels = entity.getText(linkerConfig.getNameField());
Suggestion match = new Suggestion(entity);
Collection<PlainLiteral> defaultLabels = new ArrayList<PlainLiteral>();
boolean matchedLangLabel = false;
//avoid matching multiple labels with the exact same lexical.
Set<String> matchedLabels = new HashSet<String>();
while(labels.hasNext()){
PlainLiteral label = labels.next();
numLabels++;
String lang = label.getLanguage() != null ? label.getLanguage().toString() : null;
if((lang == null && curLang == null) ||
(lang != null && curLang != null && lang.equalsIgnoreCase(curLang))){
if(!matchedLabels.contains(label.getLexicalForm())){
matchLabel(searchTokens, match, label);
matchedLabels.add(label.getLexicalForm());
matchedLangLabel = true;
}
} else if((lang == null && mainLang == null) ||
(lang != null && mainLang != null && lang.equalsIgnoreCase(mainLang))){
mainLangLabels.add(label);
} else if((lang == null && defLang == null) ||
(lang != null && defLang != null && lang.startsWith(defLang))){
defaultLabels.add(label);
}
}
//try to match main language labels
if(!matchedLangLabel || match.getMatch() == MATCH.NONE){
for(PlainLiteral mainLangLabel : mainLangLabels){
if(!matchedLabels.contains(mainLangLabel.getLexicalForm())){
matchLabel(searchTokens, match, mainLangLabel);
matchedLabels.add(mainLangLabel.getLexicalForm());
matchedLangLabel = true;
}
}
}
//use only labels in the default language if there is
// * no label in the current language or
// * no MATCH was found in the current language
if(!matchedLangLabel || match.getMatch() == MATCH.NONE){
for(PlainLiteral defaultLangLabel : defaultLabels){
if(!matchedLabels.contains(defaultLangLabel.getLexicalForm())){
matchLabel(searchTokens, match, defaultLangLabel);
matchedLabels.add(defaultLangLabel.getLexicalForm());
}
}
}
return match;
}
/**
* @param suggestion
* @param label
*/
private void matchLabel(List<TokenData> searchTokens, Suggestion suggestion, PlainLiteral label) {
// test.begin();
String text = label.getLexicalForm();
String lang = label.getLanguage() == null ? null : label.getLanguage().toString();
if(!linkerConfig.isCaseSensitiveMatching()){
text = text.toLowerCase(); //TODO use language of label for Locale
}
//Tokenize the label and remove remove tokens without alpha numerical chars
String[] unprocessedLabelTokens = labelTokenizer != null ?
labelTokenizer.tokenize(text, lang) : null;
if(unprocessedLabelTokens == null){ //no tokenizer available
log.info("Unable to tokenize {} language texts. Will process untokenized label {}",
state.getLanguage(),text);
unprocessedLabelTokens = new String[]{text}; //there is already a warning
}
int offset = 0;
for(int i=0;i<unprocessedLabelTokens.length;i++){
boolean hasAlphaNumericChar = Utils.hasAlphaNumericChar(unprocessedLabelTokens[i]);
if(!hasAlphaNumericChar){
offset++;
} else if(offset > 0){
String token = unprocessedLabelTokens[i];
token = StringUtils.replaceChars(token,".","");
unprocessedLabelTokens[i-offset] = token;
}
}
String[] labelTokens;
if(offset == 0){
labelTokens = unprocessedLabelTokens;
} else {
labelTokens = new String[unprocessedLabelTokens.length-offset];
System.arraycopy(unprocessedLabelTokens, 0, labelTokens, 0, labelTokens.length);
}
//holds the tokens and their position within the label. NOTE that the same
//token may appear multiple times in the label (e.g. "Da Da Bing"
Map<String,List<Integer>> labelTokenMap = new HashMap<String, List<Integer>>();
for(int i=0;i < labelTokens.length; i++){
List<Integer> tokenIndexes = labelTokenMap.get(labelTokens[i]);
if(tokenIndexes == null){
tokenIndexes = new ArrayList<Integer>(2);
labelTokenMap.put(labelTokens[i], tokenIndexes);
}
tokenIndexes.add(Integer.valueOf(i));
}
NavigableMap<Integer, String> matchedLabelTokens = new TreeMap<Integer,String>();
int foundProcessableTokens = 0;
int foundTokens = 0;
float foundTokenMatch = 0;
//ensure the correct order of the tokens in the suggested entity
boolean search = true;
boolean activeTokenNotMatched = true;
int firstFoundIndex = -1;
int firstProcessableFoundIndex = -1;
int lastFoundIndex = -1;
int lastProcessableFoundIndex = -1;
int firstFoundLabelIndex = -1;
int lastfoundLabelIndex = -1;
TokenData currentToken;
String currentTokenText;
int currentTokenLength;
int notFound = 0;
int matchedTokensNotWithinProcessableTokenSpan = 0;
int foundTokensWithinCoveredProcessableTokens = 0;
float minTokenMatchFactor = linkerConfig.getMinTokenMatchFactor();
//search for matches within the correct order
for(int currentIndex = state.getToken().index;
currentIndex < state.getTokens().size()
&& search ;currentIndex++){
currentToken = state.getTokens().get(currentIndex);
if(currentToken.hasAlphaNumeric){
currentTokenText = linkerConfig.isLemmaMatching() ?
currentToken.getTokenLemma() : currentToken.getTokenText();
if(currentTokenText == null) { //no lemma available
currentTokenText = currentToken.getTokenText(); //fallback to text
}
//ignore '.' in tokens to ensure that 'D.C.' matches 'DC' ...
currentTokenText = StringUtils.replaceChars(currentTokenText,".","");
if(!linkerConfig.isCaseSensitiveMatching()){
currentTokenText = currentTokenText.toLowerCase();
}
currentTokenLength = currentTokenText.length();
boolean found = false;
float matchFactor = 0f;
//iteration starts at the next token after the last matched one
//so it is OK to skip tokens in the label, but not within the text
for(int i = lastfoundLabelIndex+1;!found && i < labelTokens.length;i ++){
String labelTokenText = labelTokens[i];
int labelTokenLength = labelTokenText.length();
float maxLength = currentTokenLength > labelTokenLength ? currentTokenLength : labelTokenLength;
float lengthDif = Math.abs(currentTokenLength - labelTokenLength);
if((lengthDif/maxLength)<=(1-minTokenMatchFactor)){ //this prevents unnecessary string comparison
int matchCount = compareTokens(currentTokenText, labelTokenText);
if(matchCount/maxLength >= minTokenMatchFactor){
lastfoundLabelIndex = i; //set the last found index to the current position
found = true; //set found to true -> stops iteration
matchFactor = matchCount/maxLength; //how good is the match
//remove matched labels from the set to disable them for
//a later random oder search
Integer labelTokenIndex = getLabelTokenIndex(labelTokenText, i, labelTokenMap);
matchedLabelTokens.put(labelTokenIndex, labelTokenText);
}
}
}
if(!found){
//search for a match in the wrong order
//currently only exact matches (for testing)
Integer index = getLabelTokenIndex(currentTokenText, lastfoundLabelIndex+1, labelTokenMap);
if(index != null){
matchedLabelTokens.put(index, currentTokenText);
found = true;
matchFactor = 0.7f;
}
}
//int found = text.indexOf(currentToken.getText().toLowerCase());
if(found){ //found
if(currentToken.isMatchable){
foundProcessableTokens++; //only count processable Tokens
if(firstProcessableFoundIndex < 0){
firstProcessableFoundIndex = currentIndex;
}
lastProcessableFoundIndex = currentIndex;
foundTokensWithinCoveredProcessableTokens++;
if(matchedTokensNotWithinProcessableTokenSpan > 0){
foundTokensWithinCoveredProcessableTokens = foundTokensWithinCoveredProcessableTokens +
matchedTokensNotWithinProcessableTokenSpan;
matchedTokensNotWithinProcessableTokenSpan = 0;
}
} else {
matchedTokensNotWithinProcessableTokenSpan++;
}
foundTokens++;
foundTokenMatch = foundTokenMatch + matchFactor; //sum up the matches
if(firstFoundIndex < 0){
firstFoundIndex = currentIndex;
firstFoundLabelIndex = lastfoundLabelIndex;
}
lastFoundIndex = currentIndex;
} else { //not found
if(state.getToken().index == currentToken.index){
//the currently active Token MUST BE matched
search = false;
activeTokenNotMatched = true;
}
notFound++;
//stop forward search if
// if(currentToken.isMatchable || notFound > linkerConfig.getMaxNotFound()){
if(!searchTokens.contains(currentToken)){
//stop as soon as a token that needs to be processed is
//not found in the label or the maximum number of tokens
//that are not processable are not found
search = false;
}
}
} // else token without alpha or numeric characters are not processed
}
//search backwards for label tokens until firstFoundLabelIndex if there
//are unconsumed Tokens in the sentence before state.getTokenIndex
int currentIndex = state.getToken().index-1;
int labelIndex = firstFoundLabelIndex-1;
notFound = 0;
matchedTokensNotWithinProcessableTokenSpan = 0;
if(activeTokenNotMatched){ //do not search backwards if the active token
//was not found
search = true;
}
while(search && labelIndex >= 0 && currentIndex >= 0){// && currentIndex > state.getConsumedIndex()){
String labelTokenText = labelTokens[labelIndex];
if(labelTokenMap.containsKey(labelTokenText)){ //still not matched
currentToken = state.getTokens().get(currentIndex);
currentTokenText = linkerConfig.isLemmaMatching() ?
currentToken.getTokenLemma() : currentToken.getTokenText();
if(currentTokenText == null) { //no lemma available
currentTokenText = currentToken.getTokenText(); //fallback to text
}
if(!linkerConfig.isCaseSensitiveMatching()){
currentTokenText = currentTokenText.toLowerCase();
}
currentTokenText = StringUtils.replaceChars(currentTokenText,".","");
currentTokenLength = currentTokenText.length();
boolean found = false;
float matchFactor = 0f;
int labelTokenLength = labelTokenText.length();
float maxLength = currentTokenLength > labelTokenLength ? currentTokenLength : labelTokenLength;
float lengthDif = Math.abs(currentTokenLength - labelTokenLength);
if((lengthDif/maxLength)<=(1-minTokenMatchFactor)){ //this prevents unnecessary string comparison
int matchCount = compareTokens(currentTokenText, labelTokenText);
if(matchCount/maxLength >= minTokenMatchFactor){
found = true; //set found to true -> stops iteration
matchFactor = matchCount/maxLength; //how good is the match
}
}
if(found){ //found
if(currentToken.isMatchable){
foundProcessableTokens++; //only count processable Tokens
if(lastProcessableFoundIndex < 0){ //if last is not yet set
lastProcessableFoundIndex = currentIndex;
}
firstProcessableFoundIndex = currentIndex;
foundTokensWithinCoveredProcessableTokens++;
if(matchedTokensNotWithinProcessableTokenSpan > 0){
foundTokensWithinCoveredProcessableTokens = foundTokensWithinCoveredProcessableTokens +
matchedTokensNotWithinProcessableTokenSpan;
matchedTokensNotWithinProcessableTokenSpan = 0;
}
} else {
matchedTokensNotWithinProcessableTokenSpan++;
}
foundTokens++;
foundTokenMatch = foundTokenMatch + matchFactor; //sum up the matches
firstFoundIndex = currentIndex;
labelIndex--;
Integer foundIndex = getLabelTokenIndex(labelTokenText, currentIndex, labelTokenMap);
matchedLabelTokens.put(foundIndex, labelTokenText);
} else {
notFound++;
if(currentToken.isMatchable || notFound > linkerConfig.getMaxNotFound()){
//stop as soon as a token that needs to be processed is
//not found in the label or the maximum number of tokens
//that are not processable are not found
search = false;
}
}
currentIndex --;
} else { //this token is already matched ...
labelIndex--; //try the next one
}
}
if(foundProcessableTokens > 0) { //if any Token has matched
//Now we make a second round to search tokens that match in the wrong order
//e.g. if given and family name of persons are switched
final LabelMatch labelMatch;
int coveredTokens = lastFoundIndex-firstFoundIndex+1;
int coveredProcessableTokens = lastProcessableFoundIndex-firstProcessableFoundIndex+1;
//matched tokens only within the span of the first/last processable token
//Matching rules
// - if less than config#minTokenFound() than accept only EXACT
// - override PARTIAL matches with FULL/EXACT matches only if
// foundTokens of the PARTIAL match is > than of the FULL/EXACT
// match (this will be very rare
String currentText = state.getTokenText(firstFoundIndex,coveredTokens);
if(linkerConfig.isCaseSensitiveMatching() ? currentText.equals(text) : currentText.equalsIgnoreCase(text)){
labelMatch = new LabelMatch(firstFoundIndex, coveredTokens, label);
} else {
int coveredLabelTokens = matchedLabelTokens.lastKey().intValue()-matchedLabelTokens.firstKey().intValue()+1;
if(foundTokens == labelTokens.length && foundTokens == coveredTokens){
//if all token matched set found to covered: May be lower because only
//processable tokens are counted, but FULL also checks
//of non-processable!
foundTokens = coveredTokens;
foundProcessableTokens = coveredProcessableTokens;
}
labelMatch = new LabelMatch(firstProcessableFoundIndex, coveredProcessableTokens,
foundProcessableTokens,foundTokensWithinCoveredProcessableTokens,
foundTokenMatch/(float)foundTokens,label,labelTokens.length, coveredLabelTokens);
}
if(labelMatch.getLabelScore() >= linkerConfig.getMinLabelScore() &&
labelMatch.getTextScore() >= linkerConfig.getMinTextScore() &&
labelMatch.getMatchScore() >= linkerConfig.getMinMatchScore()){
suggestion.addLabelMatch(labelMatch);
}
} //else NO tokens found -> nothing to do
// test.complete();
}
/**
* Utility Method that searches for the Index of the parsed label token text
* within the labelTokenMap. Matched tokens are removed from the parsed
* LabelTokenMap <p>
* NOTE: This is necessary, because in cases where Labels do contain the same
* token twice, it might not be always clear which token is the matching one.
* Especially if the order of the Tokens in the Text does not exactly match
* the order within the Label. This Method tries always to find the matching
* token closest to the parsed currentIndex.
* It iterates backwards to prefer Tokens that occur later as the current index
* in the tokenized label.
* @param labelTokenText the text of the current labelToken
* @param currentIndex the current index of the processing (or if not known
* the last matched index of an token within the label
* @param labelTokenMap the Map holding tokens as key and a list of occurrences
* as values or <code>null</code> if no Token with the parsed labelTokenText
* was present as key in the parsed labelTokenMap
* @return the index of the selected label token
*/
private Integer getLabelTokenIndex(String labelTokenText, int currentIndex,
Map<String,List<Integer>> labelTokenMap) {
List<Integer> tokenIndexes = labelTokenMap.get(labelTokenText);
if(tokenIndexes == null){
return null;
}
//try to remove the closest index in the map
Integer labelTokenIndex = Integer.valueOf(currentIndex);
//search the closest position
int closest = Integer.MAX_VALUE;
int closestIndex = -1;
for(int p = tokenIndexes.size()-1; p >= 0; p--){
Integer index = tokenIndexes.get(p);
int dif = Math.abs(index.intValue()-currentIndex);
if(Math.abs(index.intValue()-currentIndex) < closest){
closest = dif;
closestIndex = p;
labelTokenIndex = index;
if(closest == 0){
break;
}
}
}
tokenIndexes.remove(closestIndex);
if(tokenIndexes.isEmpty()){
labelTokenMap.remove(labelTokenText);
}
return labelTokenIndex;
}
/**
* Compares to token with each other and returns the longest match. The
* tokens are compared from the beginning and from the end.
* @param token1 the first token
* @param token2 the second token
* @return the number of matching chars
*/
private int compareTokens(String token1,String token2){
int l1 = token1.length(); //length of the first token
int l2 = token2.length(); //length of the second token
//in case of same length check for equals first
if(l1 == l2 && token1.equals(token2)){
return l1;
}
int ml = l1>l2?l2:l1; //minimum length of a token
if(ml == 0){
return ml;
}
int f = 0; //forward match count + 1
int b = 0; //backward match count + 1
boolean match = true; //still matches
while(match && f < ml){
match = token1.charAt(f) == token2.charAt(f);
f++;
}
if(!match){
f--;
}
if(f < ml){
match = true;
while(match && b < ml){
b++;
match = token1.charAt(l1-b) == token2.charAt(l2-b);
}
if(!match){
b--;
}
}
return f > b ? f : b;
}
/**
* This logs the statistics about the processing process
* @param log the logger used to log the statistics
*/
public void logStatistics(Logger log){
log.info("EntityLinking Statistics:");
double textProcessingDuration = textProcessingStats.getDuration();
double lookupDuration = lookupStats.getDuration();
double matchingDuration = matchingStats.getDuration();
double rankingDuration = rankingStats.getDuration();
double other = processingTime-textProcessingDuration-lookupDuration-matchingDuration;
log.info(" - overal: {}ms (text processing: {}%, lookup: {}%, matching {}%, ranking {}%, other {}%)", new Object[]{
processingTime,
Math.round(textProcessingDuration*100/(double)processingTime),
Math.round(lookupDuration*100/(double)processingTime),
Math.round(matchingDuration*100/(double)processingTime),
Math.round(rankingDuration*100/(double)processingTime),
Math.round(other*100/(double)processingTime),
});
textProcessingStats.printStatistics(log);
lookupStats.printStatistics(log);
log.info(" - {} query results ({} filtered - {}%)",
new Object[]{numQueryResults,numFilteredResults,
numFilteredResults*100f/(float)numQueryResults});
matchingStats.printStatistics(log);
rankingStats.printStatistics(log);
// test.printStatistics(log);
// test2.printStatistics(log);
}
}