enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java - stanbol - Git at Google

 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.stanbol.enhancer.engines.entitylinking.impl;

 import static org.apache.stanbol.enhancer.engines.entitylinking.impl.Suggestion.ENTITY_RANK_COMPARATOR;

 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.NavigableMap;
 import java.util.Set;
 import java.util.TreeMap;

 import org.apache.clerezza.rdf.core.PlainLiteral;
 import org.apache.clerezza.rdf.core.Triple;
 import org.apache.clerezza.rdf.core.TripleCollection;
 import org.apache.clerezza.rdf.core.UriRef;
 import org.apache.clerezza.rdf.core.impl.TripleImpl;
 import org.apache.commons.lang.StringUtils;
 import org.apache.stanbol.enhancer.engines.entitylinking.Entity;
 import org.apache.stanbol.enhancer.engines.entitylinking.EntitySearcher;
 import org.apache.stanbol.enhancer.engines.entitylinking.EntitySearcherException;
 import org.apache.stanbol.enhancer.engines.entitylinking.LabelTokenizer;
 import org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig;
 import org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.RedirectProcessingMode;
 import org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig;
 import org.apache.stanbol.enhancer.engines.entitylinking.impl.Suggestion.MATCH;
 import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
 import org.apache.stanbol.enhancer.nlp.model.Section;
 import org.apache.stanbol.enhancer.nlp.model.Token;
 import org.apache.stanbol.enhancer.servicesapi.rdf.NamespaceEnum;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 public class EntityLinker {

     private static final int MIN_SEARCH_LIMIT = 10;

     private final Logger log = LoggerFactory.getLogger(EntityLinker.class);

     private final EntityLinkerConfig linkerConfig;
     private final LanguageProcessingConfig textProcessingConfig;
     //private final AnalysedText analysedText;
     private final EntitySearcher entitySearcher;
     /**
      * The state of the current processing
      */
     private final ProcessingState state;
     /**
      * The map holding the results of the linking process
      */
     private final Map<String,LinkedEntity> linkedEntities = new HashMap<String,LinkedEntity>();

     //private Integer lookupLimit;

     private LabelTokenizer labelTokenizer;

     private LinkingStateAware linkingStateAware;

     private int minSearchResults;

     //Language configuration
     final String documentLang;
     final String defaultLang;
     final String documentMainLang;

     private Statistic textProcessingStats = new Statistic("Text Processing");
     private Statistic lookupStats = new Statistic("Vocabulary Lookup");
     private int numQueryResults = 0;
     private int numFilteredResults = 0;
     private Statistic matchingStats = new Statistic("Label Matching");
     private Statistic rankingStats = new Statistic("Suggestion Ranking");
 //    private Statistic test = new Statistic("test1");
 //    private Statistic test2_ = new Statistic("test2");
     private int numLabels = 0;
     private long processingTime = -1;


     public EntityLinker(AnalysedText analysedText, String language,
                         LanguageProcessingConfig textProcessingConfig,
                         EntitySearcher entitySearcher,
                         EntityLinkerConfig linkerConfig,
                         LabelTokenizer labelTokenizer) {
         this(analysedText,language,textProcessingConfig,entitySearcher,linkerConfig,labelTokenizer,null);
     }
     public EntityLinker(AnalysedText analysedText, String language,
                 LanguageProcessingConfig textProcessingConfig,
                 EntitySearcher entitySearcher,
                 EntityLinkerConfig linkerConfig,
                 LabelTokenizer labelTokenizer, LinkingStateAware linkingStateAware) {
         //this.analysedText = analysedText;
         this.entitySearcher = entitySearcher;
         this.linkerConfig = linkerConfig;
         this.textProcessingConfig = textProcessingConfig;
         this.labelTokenizer = labelTokenizer;
         this.state = new ProcessingState(analysedText,language,textProcessingConfig);
         minSearchResults = entitySearcher.getLimit() == null ? MIN_SEARCH_LIMIT :
             Math.max(MIN_SEARCH_LIMIT,entitySearcher.getLimit());
         //this.lookupLimit  = Math.max(minResults,linkerConfig.getMaxSuggestions()*3);
         this.linkingStateAware = linkingStateAware;
         //init the language settings
         this.documentLang = state.getLanguage();
         this.defaultLang = linkerConfig.getDefaultLanguage();
         int countryCodeIndex = documentLang == null ? -1 : documentLang.indexOf('-');
         if(countryCodeIndex >= 2){
             documentMainLang = documentLang.substring(0,countryCodeIndex);
         } else {
             documentMainLang = null;
         }

     }
     /**
      * Steps over the sentences, chunks, tokens of the {@link #sentences}
      */
     public void process() throws EntitySearcherException {
         long startTime = System.currentTimeMillis();
         //int debugedIndex = 0;
         Section sentence = null;
         textProcessingStats.begin();
         while(state.next()) {
             //STANBOL-1070: added linkingStateAware callbacks for components that
             //   need to react on the state of the Linking process
             if(linkingStateAware != null){
                 if(!state.getSentence().equals(sentence)){
                     if(sentence != null){
                         linkingStateAware.endSection(sentence);
                     }
                     sentence = state.getSentence(); //set the next sentence
                     linkingStateAware.startSection(sentence); //notify its start
                 }
                 linkingStateAware.startToken(state.getToken().token); //notify the current token
             }
             TokenData token = state.getToken();
             if(log.isDebugEnabled()){
                 log.debug("--- preocess Token {}: {} (lemma: {}) linkable={}, matchable={} | chunk: {}",
                     new Object[]{token.index,token.getTokenText(),token.getTokenLemma(),
                         token.isLinkable, token.isMatchable, token.inChunk != null ?
                                 (token.inChunk.chunk + " "+ token.inChunk.chunk.getSpan()) : "none"});
             }
             List<TokenData> searchStrings = new ArrayList<TokenData>(linkerConfig.getMaxSearchTokens());
             getSearchString(token);
             searchStrings.add(token);
             //Determine the range we are allowed to search for tokens
             final int minIncludeIndex;
             final int maxIndcludeIndex;
             //NOTE: testing has shown that using Chunks to restrict search for
             //      additional matchable tokens does have an negative impact on
             //      recall. Because of that this restriction is for now deactivated
            //TODO: maybe make configurable via an own property
             boolean restrirctContextByChunks = textProcessingConfig.isIgnoreChunks();
             int consumedIndex = state.getConsumedIndex();
             if(token.inChunk != null && !textProcessingConfig.isIgnoreChunks() &&
                     restrirctContextByChunks){
                 minIncludeIndex = token.inChunk.getStartTokenIndex();
 //                minIncludeIndex = Math.max(
 //                    state.getConsumedIndex()+1,
 //                    token.inChunk.getStartTokenIndex());
                 maxIndcludeIndex = token.inChunk.getEndTokenIndex();
             } else {
                 maxIndcludeIndex = state.getTokens().size() - 1;
 //                minIncludeIndex = state.getConsumedIndex() + 1;
                 minIncludeIndex = 0;
             }
             int prevIndex = token.index;
             int pastIndex = token.index;
             int pastNonMatchable = 0;
             int prevNonMatchable = 0;
             int distance = 0;
             do {
                 distance++;//keep track of the distance
                 //get the past token at the given distance (However ignore
                 //non AlphaNumeric tokens when calculating the distance)
                 pastIndex++;
                 TokenData pastToken = null;
                 while(pastToken == null && maxIndcludeIndex >= pastIndex &&
                         pastNonMatchable <= 1){
                     TokenData td = state.getTokens().get(pastIndex);
                     if(td.hasAlphaNumeric){
                         pastToken = td;
                     } else {
                         pastIndex++;
                     }
                 }
                 //get the previous token at the given distance (However ignore
                 //non AlphaNumeric tokens when calculating the distance)
                 prevIndex--;
                 TokenData prevToken = null;
                 while(prevToken == null && minIncludeIndex <= prevIndex &&
                         //allow one nonMatchable token if prevIndex > the last
                         //consumed one and zero nonMatchable if prevIndex is <=
                         //the last consumed one
                         ((prevIndex > consumedIndex && prevNonMatchable <= 1) ||
                                 prevIndex <= consumedIndex && prevNonMatchable < 1)){
                     TokenData td = state.getTokens().get(prevIndex);
                     if(td.hasAlphaNumeric){
                         prevToken = td;
                     } else {
                         prevIndex--;
                     }
                 }
                 //now that we know the tokens at this distance check if they are matchable
                 //Fist the past token
                 if(pastToken != null){
                     if(log.isDebugEnabled()){
                         log.debug("    {} {}:'{}' (lemma: {}) linkable={}, matchable={}",new Object[]{
                                 pastToken.isMatchable? '+':'-',pastToken.index,
                                 pastToken.getTokenText(), pastToken.getTokenLemma(),
                                 pastToken.isLinkable, pastToken.isMatchable
                         });
                     }
                     if(pastToken.isMatchable){
                         searchStrings.add(pastToken);
                     } else {
                         pastNonMatchable++;
                     }
                 }
                 //Second in the previous token
                 if(prevToken != null){
                     if(log.isDebugEnabled()){
                         log.debug("    {} {}:'{}' (lemma: {}) linkable={}, matchable={}",new Object[]{
                             prevToken.isMatchable? '+':'-',prevToken.index,
                             prevToken.getTokenText(), prevToken.getTokenLemma(),
                             prevToken.isLinkable, prevToken.isMatchable
                         });
                     }
                     if(prevToken.isMatchable){
                         getSearchString(prevToken);
                         searchStrings.add(0,prevToken);
                     } else {
                         prevNonMatchable++;
                     }
                 }
             } while(searchStrings.size() < linkerConfig.getMaxSearchTokens() && distance <
                     linkerConfig.getMaxSearchDistance() &&
                     (prevIndex > minIncludeIndex || pastIndex < maxIndcludeIndex) &&
                     (prevNonMatchable <= 1 || pastNonMatchable <= 1));
             //we might have an additional element in the list
             if(searchStrings.size() > linkerConfig.getMaxSearchTokens()){
                 searchStrings = searchStrings.subList( //the last part of the list
                     searchStrings.size()-linkerConfig.getMaxSearchTokens(),
                     searchStrings.size());
             }
             if(log.isDebugEnabled()){
                 List<String> list = new ArrayList<String>(searchStrings.size());
                 for(TokenData dt : searchStrings){
                     list.add(dt.token.getSpan());
                 }
                 log.debug("  >> searchStrings {}",list);
             }
             textProcessingStats.complete();
             //search for Entities
             List<Suggestion> suggestions = lookupEntities(searchStrings);
             //Treat partial matches that do match more as the best FULL match
             //differently
             List<Suggestion> partialMatches = new ArrayList<Suggestion>();
             if(!suggestions.isEmpty()){
                 rankingStats.begin();
                 //update the suggestions based on the best match
                 int bestMatchCount = suggestions.get(0).getLabelMatch().getMatchCount();
                 Iterator<Suggestion> it = suggestions.iterator();
                 while(it.hasNext()){
                     Suggestion suggestion = it.next();
                     //suggestions that match less tokens as the best match
                     //need to be updated to PARTIAL
                     int matchCount = suggestion.getLabelMatch().getMatchCount();
                     if(matchCount < bestMatchCount){
                         suggestion.setMatch(MATCH.PARTIAL);
                     } else if( matchCount > bestMatchCount){ //selects more tokens
                         partialMatches.add(suggestion); //but only a PARTIAL MATCH
                         it.remove(); //remove from the main suggestion list
                     }
                     //Filter matches with less than config.getMinFoundTokens()
                     //if matchcount is less than of the best match
                     if(matchCount < bestMatchCount &&
                             matchCount < linkerConfig.getMinFoundTokens()){
                         it.remove();
                     } else { //calculate the score
                         //how good is the current match in relation to the best one
                         double spanScore = matchCount >= bestMatchCount ? 1.0d :
                             matchCount/(double)bestMatchCount;
                         suggestion.setScore(spanScore*spanScore*suggestion.getLabelMatch().getMatchScore());
                     }
                 }
                 Suggestion oldBestRanked = suggestions.get(0); //for debugging
                 //resort by score
                 Collections.sort(suggestions, Suggestion.SCORE_COMPARATOR);
                 Collections.sort(partialMatches, Suggestion.SCORE_COMPARATOR);
                 //this should never happen ... but the
                 //matchcount of the best match MUST NOT change
                 //after the sort by score!
                 if(bestMatchCount != suggestions.get(0).getLabelMatch().getMatchCount()){
                     log.warn("The match count for the top Ranked Suggestion for {} " +
                             "changed after resorting based on Scores!",
                         state.getTokenText(suggestions.get(0).getLabelMatch().getStart(),bestMatchCount));
                     log.warn("  originalbest   : {}",oldBestRanked);
                     log.warn(" currnet ranking : {}",suggestions);
                     log.warn("  ... this will result in worng confidence values relative to the best match");
                 }
                 //adapt equals rankings based on the entity rank
                 if(linkerConfig.isRankEqualScoresBasedOnEntityRankings()){
                     adaptScoresForEntityRankings(suggestions);
                     adaptScoresForEntityRankings(partialMatches);
                 }
                 //remove all suggestions > config.maxSuggestions
                 if(suggestions.size() > linkerConfig.getMaxSuggestions()){
                     suggestions.subList(linkerConfig.getMaxSuggestions(),suggestions.size()).clear();
                 }
                 if(log.isDebugEnabled()){
                     log.debug("  >> Suggestions:");
                     int i=0;
                     for(Suggestion s : suggestions){
                         log.debug("   - {}: {}",i,s);
                         i++;
                     }
                 }
                 //process redirects
                 if(linkerConfig.getRedirectProcessingMode() != RedirectProcessingMode.IGNORE){
                     for(Suggestion suggestion : suggestions){
                         processRedirects(suggestion);
                     }
                     for(Suggestion suggestion : partialMatches){
                         processRedirects(suggestion);
                     }
                 }
                 //create LinkedEntities for the main suggestions
                 int start = suggestions.get(0).getLabelMatch().getStart();
                 int span = suggestions.get(0).getLabelMatch().getSpan();
                 //Store the linking results
                 String selectedText = state.getTokenText(start,span);
                 //float score;
                 LinkedEntity linkedEntity = linkedEntities.get(selectedText);
                 if(linkedEntity == null){
                     linkedEntity = new LinkedEntity(selectedText,
                         suggestions, getLinkedEntityTypes(suggestions));
                     linkedEntities.put(selectedText, linkedEntity);
                 } // else Assumption: The list of suggestions is the SAME
                 linkedEntity.addOccurrence(state.getSentence(),
                     //NOTE: The end Token is "start+span-1"
                     state.getTokens().get(start).token, state.getTokens().get(start+span-1).token);
                 //In case of a FULL or EXACT MATCH we can set the next token to process to the next
                 //word after the currently found suggestion
                 if(suggestions.get(0).getMatch().ordinal() >= MATCH.FULL.ordinal()){
                     state.setConsumed(start+span-1);
                 }
                 //create LinkedEntities for partial matches
                 //TODO: maybe we need to group partial matches based on their
                 //      selected Tokens and only group those suggestions that do
                 //      select the same span in the Text. Currently all are grouped
                 //      based on those that does select the most tokens.
                 if(!partialMatches.isEmpty()){
                     start = partialMatches.get(0).getLabelMatch().getStart();
                     span = partialMatches.get(0).getLabelMatch().getSpan();
                     selectedText = state.getTokenText(start, span);
                     linkedEntity = linkedEntities.get(selectedText);
                     if(linkedEntity == null){
                         linkedEntity = new LinkedEntity(selectedText,
                             partialMatches, getLinkedEntityTypes(suggestions));
                         linkedEntities.put(selectedText, linkedEntity);
                     } // else Assumption: The list of suggestions is the SAME
                     linkedEntity.addOccurrence(state.getSentence(),
                         //NOTE: The end Token is "start+span-1"
                         state.getTokens().get(start).token, state.getTokens().get(start+span-1).token);
                 }
                 rankingStats.complete();
             } // else suggestions are empty
             if(linkingStateAware != null){
                 linkingStateAware.endToken(state.getToken().token);
             }
             textProcessingStats.begin();
         }
         textProcessingStats.cancel(); //do not count the last call
         if(linkingStateAware != null && sentence != null){
             linkingStateAware.endSection(sentence);
         }
         this.processingTime = System.currentTimeMillis()-startTime;
     }
     /**
      * @param suggestions
      */
     private void adaptScoresForEntityRankings(List<Suggestion> suggestions) {
         List<Suggestion> equalScoreList = new ArrayList<Suggestion>(4);
         double score = 2f;
         for(Suggestion s : suggestions){
             double actScore = s.getScore();
             if(score == actScore){
                 equalScoreList.add(s);
             } else {
                 if(equalScoreList.size() > 1){
                     adaptScoreForEntityRankings(equalScoreList, actScore);
                 }
                 score = actScore;
                 equalScoreList.clear();
                 equalScoreList.add(s);
             }
         }
         if(equalScoreList.size() > 1){
             adaptScoreForEntityRankings(equalScoreList,0);
         }
         //resort by score
         Collections.sort(suggestions, Suggestion.SCORE_COMPARATOR);
     }
     /**
      * Helper that extracts the
      * @param token
      */
     private String getSearchString(TokenData token) {
         String searchString = linkerConfig.isLemmaMatching() ? token.getTokenLemma() :
             token.getTokenText();
         if(searchString == null){
             searchString = token.getTokenText();
         }
         return searchString;
     }
     /**
      * This method slightly adapts scores of Suggestions based on the Entity ranking.
      * It is used for Suggestions that would have the exact same score (e.g. 1.0) to
      * ensure ordering of the suggestions based on the rankings of the Entities
      * within the knowledge base linked against
      * @param equalScoreList Entities with the same {@link Suggestion#getScore()}
      * values. If this is not the case this method will change scores in unintended
      * ways
      * @param nextScore the score of the {@link Suggestion} with a lower score as the
      * list of suggestions parsed in the first parameter
      */
     private void adaptScoreForEntityRankings(List<Suggestion> equalScoreList, double nextScore) {
         double score = equalScoreList.get(0).getScore();
         log.debug("  > Adapt Score of multiple Suggestions "
             + "with '{}' based on EntityRanking",score);
         //Adapt the score to reflect the entity ranking
         //but do not change order with entities of different
         //score. Also do not change the score more that 0.1
         //TODO: make the max change (0.1) configurable
         double dif = (Math.min(0.1, score-nextScore))/equalScoreList.size();
         Collections.sort(equalScoreList,ENTITY_RANK_COMPARATOR);
         log.debug("    - keep socre of {} at {}", equalScoreList.get(0).getEntity().getId(), score);
         for(int i=1;i<equalScoreList.size();i++){
             score = score-dif;
             if(ENTITY_RANK_COMPARATOR.compare(equalScoreList.get(i-1),
                 equalScoreList.get(i)) != 0){
                 equalScoreList.get(i).setScore(score);
                 log.debug("    - set score of {} at {}", equalScoreList.get(i).getEntity().getId(), score);
             } else {
                 double lastScore = equalScoreList.get(i-1).getScore();
                 equalScoreList.get(i).setScore(lastScore);
                 log.debug("    - set score of {} at {}", equalScoreList.get(i).getEntity().getId(), lastScore);
             }
         }
     }
     /**
      * After {@link #process()}ing this returns the entities linked for the
      * parsed {@link AnalysedContent}.
      * @return the linked entities
      */
     public final Map<String,LinkedEntity> getLinkedEntities() {
         return linkedEntities;
     }
     /**
      * Retrieves all {@link EntitySearcher#getTypeField()} values of the parsed
      * {@link Suggestion}s and than lookup the {@link NamespaceEnum#dcTerms dc}:type
      * values for the {@link LinkedEntity#getTypes()} by using the configured
      * {@link EntityLinkerConfig#getTypeMappings() types mappings} (and if
      * no mapping is found the {@link EntityLinkerConfig#getDefaultDcType()
      * default} type.
      * @param conceptTypes The list of suggestions
      * @return the types values for the {@link LinkedEntity}
      */
     private Set<UriRef> getLinkedEntityTypes(Collection<Suggestion> suggestions){
         Collection<UriRef> conceptTypes = new HashSet<UriRef>();
         double score = -1; //only consider types of the best ranked Entities
         for(Suggestion suggestion : suggestions){
             double actScore = suggestion.getScore();
             if(actScore < score){
                 break;
             }
             for(Iterator<UriRef> types =
                 suggestion.getEntity().getReferences(linkerConfig.getTypeField());
                 types.hasNext();conceptTypes.add(types.next()));
         }
         Map<UriRef,UriRef> typeMappings = linkerConfig.getTypeMappings();
         Set<UriRef> dcTypes = new HashSet<UriRef>();
         for(UriRef conceptType : conceptTypes){
             UriRef dcType = typeMappings.get(conceptType);
             if(dcType != null){
                 dcTypes.add(dcType);
             }
         }
         if(dcTypes.isEmpty() && linkerConfig.getDefaultDcType() != null){
             dcTypes.add(linkerConfig.getDefaultDcType());
         }
         return dcTypes;
     }
     /**
      * Processes {@link EntitySearcher#getRedirectField() redirect field} values for
      * the parsed suggestions based on the {@link RedirectProcessingMode}
      * as configured in the {@link #config}.<p>
      * The results of this method are stored within the parsed {@link Suggestion}s
      * @param suggestion The suggestion to process.
      * @throws EntitySearcherException
      */
     private void processRedirects(Suggestion suggestion) throws EntitySearcherException {
         //if mode is IGNORE -> nothing to do
         if(linkerConfig.getRedirectProcessingMode() == RedirectProcessingMode.IGNORE){
             return;
         }
         //in case results for queries are locally cached it might be the case
         //that some/all of the results do already redirects processed.
         //therefore there is a small internal state that stores this information
         if(suggestion.isRedirectedProcessed()){
             return; //Redirects for ResultMatch are already processed ... ignore
         }
         Entity result = suggestion.getResult();
         Iterator<UriRef> redirects = result.getReferences(linkerConfig.getRedirectField());
         switch (linkerConfig.getRedirectProcessingMode()) {
             case ADD_VALUES:
                 TripleCollection entityData = result.getData();
                 UriRef entityUri = result.getUri();
                 while(redirects.hasNext()){
                     UriRef redirect = redirects.next();
                     if(redirect != null){
                         Entity redirectedEntity = entitySearcher.get(redirect,
                             linkerConfig.getSelectedFields());
                         if(redirectedEntity != null){
                             for(Iterator<Triple> data = redirectedEntity.getData().filter(
                                 redirectedEntity.getUri(), null, null);data.hasNext();){
                                 Triple t = data.next();
                                 entityData.add(new TripleImpl(entityUri,t.getPredicate(),t.getObject()));
                             }
                         }
                         //set that the redirects where searched for this result
                         suggestion.setRedirectProcessed(true);
                     }
                 }
             case FOLLOW:
                 while(redirects.hasNext()){
                     UriRef redirect = redirects.next();
                     if(redirect != null){
                         Entity redirectedEntity = entitySearcher.get(redirect,
                             linkerConfig.getSelectedFields());
                         if(redirectedEntity != null){
                             suggestion.setRedirect(redirectedEntity);
                         }
                     }
                 }
             default: //nothing to do
         }
     }
     /**
      * Searches for Entities in the {@link #entitySearcher} corresponding to the
      * {@link Token#getText() words} of the current {@link #state position} in
      * the text.
      * @param searchTokens the list of {@link Token#getText() words} to search
      * entities for.
      * @return The sorted list with the suggestions.
      * If there are no suggestions an empty list will be returned.
      * @throws EntitySearcherException
      */
     private List<Suggestion> lookupEntities(List<TokenData> searchTokens) throws EntitySearcherException {
         Set<String> languages = new HashSet<String>();
         languages.add(linkerConfig.getDefaultLanguage());
         languages.add(state.getLanguage());
         int countryCodeIndex = state.getLanguage() == null ? -1 : state.getLanguage().indexOf('-');
         if(countryCodeIndex >= 2){
             languages.add(state.getLanguage().substring(0,countryCodeIndex));
         }
         List<String> searchStrings = new ArrayList<String>(searchTokens.size());
         for(Iterator<TokenData> it = searchTokens.iterator();it.hasNext();){
             searchStrings.add(getSearchString(it.next()));
         }
         String[] languageArray = languages.toArray(new String[languages.size()]);
         List<Suggestion> suggestions = new ArrayList<Suggestion>();
         //perform the lookup with the parsed parameter
         int numResults = performLookup(searchStrings, languageArray, suggestions, searchTokens);
         //if no match where found in the result .. fallback to a search for the
         //current token
         if(suggestions.isEmpty() && numResults > 0 && searchStrings.size() > 1){
             //there where results, but no one matched ...
             //   ... it is most likely a case where the used search terms are
             //       not releated. So try to query for the active token only
             searchTokens = Collections.singletonList(state.getToken());
             log.debug("   > No match for '{}' searchStrings ... ", searchStrings);
             searchStrings = Collections.singletonList(state.getToken().token.getSpan());
             log.debug("     ... fallback to search for active token '{}' ...",searchStrings);
             performLookup(searchStrings, languageArray, suggestions, searchTokens);
         }
         //sort the suggestions
         if(suggestions.size()>1){
             Collections.sort(suggestions,Suggestion.MATCH_TYPE_SUGGESTION_COMPARATOR);
         }
         return suggestions;
     }
     /**
      * @param searchStrings
      * @param languageArray
      * @param suggestions
      * @param searchTokens
      * @return
      * @throws EntitySearcherException
      */
     private int performLookup(List<String> searchStrings, String[] languageArray,
             List<Suggestion> suggestions, List<TokenData> searchTokens) throws EntitySearcherException {
         int minProcessedResults = linkerConfig.getMaxSuggestions()*3;
         int lookupLimit = Math.max(MIN_SEARCH_LIMIT, linkerConfig.getMaxSuggestions()*2*searchTokens.size());
         int maxResults = lookupLimit*2;
         int offset = 0;
         int numFiltered = 0;
         boolean moreResultsAvailable = true;
         int numResults = 0;
         //search for entities until
         // (1) we have more as MAX_SUGGESTION results
         // (2) no more results are available
         // (3) the number of processed Entities is smaller as two times the
         //     suggestions
         // (4) the number of requested Entities is smaller as two times the
         //     lookup limit.
         //NOTE: making multiple requests can decrease the performance a lot.
         //      Because of that those limits assure that no more than two
         //      requests are made for the same lookup.
         while(suggestions.size() < linkerConfig.getMaxSuggestions() &&
                 moreResultsAvailable && (numResults-numFiltered) < (minProcessedResults) &&
                 numResults < maxResults){
             Collection<? extends Entity> results;
             log.debug("   > request entities [{}-{}] entities ...",offset,(offset+lookupLimit));
             lookupStats.begin(); //keep statistics
             results = entitySearcher.lookup(linkerConfig.getNameField(),
                 linkerConfig.getSelectedFields(), searchStrings, languageArray,
                 lookupLimit, offset);
             lookupStats.complete();
             log.debug("      < found {} entities ...",results.size());
             //queries might return more as the requested results
             moreResultsAvailable = results.size() >= lookupLimit;
             numResults = numResults + results.size();
             offset = numResults;
             matchingStats.begin();
             numFiltered = numFiltered + processLookupResults(searchTokens, results, suggestions);
             matchingStats.complete();
             //sort the suggestions
         }
         return numResults;
     }
     /**
      * Processes the parsed entity lookup results and adds suggestions to the
      * parsed suggestion list
      * @param results the results
      * @param suggestions the suggestions
      * @return the number of filtered results
      */
     private int processLookupResults(List<TokenData> searchTokens, Collection<? extends Entity> results, List<Suggestion> suggestions) {
         int numFiltered = 0;
         for(Entity result : results){
             if(log.isDebugEnabled()){
                 log.debug("    > {} (ranking: {})",result.getId(),result.getEntityRanking());
             }
             numQueryResults++;
             //white/black list based entity type filtering (STANBOL-1111)
             boolean filtered = false;
             if(linkerConfig.isEntityTypeFilteringActive()){
                 filtered = filterEntity(result.getReferences(linkerConfig.getTypeField()));
             }
             if(!filtered){
                 Suggestion suggestion = matchLabels(searchTokens, result);
                 if(suggestion.getMatch() != MATCH.NONE){
                     if(log.isDebugEnabled()){
                         log.debug("      + {}",suggestion);
                     }
                     suggestions.add(suggestion);
                 } else {
                     log.debug("      - no match");
                 }
             } else {//do not process Entities with a filtered type
                 numFilteredResults++; //global statistics
                 numFiltered++;
             }
         }
         return numFiltered;
     }

     public boolean filterEntity(Iterator<UriRef> entityTypes){
         Map<UriRef, Integer> whiteList = linkerConfig.getWhitelistedTypes();
         Map<UriRef, Integer> blackList = linkerConfig.getBlacklistedTypes();
         Integer w = null;
         Integer b = null;
         while(entityTypes.hasNext()){
             UriRef type = entityTypes.next();
             Integer act = whiteList.get(type);
             if(act != null){
                 if(w == null || act.compareTo(w) < 0){
                     w = act;
                 }
                 if(act.intValue() == 0){
                     break;
                 }
             }
             act = blackList.get(type);
             if(act != null){
                 if(b == null || act.compareTo(b) < 0){
                     b = act;
                 }
                 if(act.intValue() == 0){
                     break;
                 }
             }
         }
         if(w == null && b == null){
             return !linkerConfig.isDefaultWhitelistTypes();
         } else if(w != null){
             return b == null || w.compareTo(b) < 0 ? false : true;
         } else { //w == null && b != null
             return true; //filter
         }
     }

     /**
      * Matches the labels of the parsed {@link Representation} with the Tokens of
      * the texts (beginning with the currently active
      * {@link ProcessingState#getToken() token}).<p>
      * The field used to get the labels is retrieved from
      * {@link EntitySearcher#getNameField()}. Only labels with no language or the
      * language of the current sentence are considered. If less than
      * {@link EntityLinkerConfig#getMinFoundTokens()} tokens match with an
      * label the Concept is only considered to match if the label is
      * {@link String#equalsIgnoreCase(String)} to the text covered by the
      * matched token(s). Otherwise also {@link MATCH#FULL} and {@link MATCH#PARTIAL}
      * results are allowed.
      * @param entity The entity including at least the data for the
      * {@link EntitySearcher#getNameField()} property.
      * @return The result of the matching.
      */
     private Suggestion matchLabels(List<TokenData> searchTokens, Entity entity) {
         String curLang = documentLang; //language of the current sentence
         String defLang = defaultLang; //configured default language
         String mainLang = documentMainLang;
         Collection<PlainLiteral> mainLangLabels;
         if(documentMainLang != null){
             mainLang = documentMainLang;
             mainLangLabels = new ArrayList<PlainLiteral>();
         } else {
             mainLang = documentLang;
             mainLangLabels = Collections.emptyList();
         }
         Iterator<PlainLiteral> labels = entity.getText(linkerConfig.getNameField());
         Suggestion match = new Suggestion(entity);
         Collection<PlainLiteral> defaultLabels = new ArrayList<PlainLiteral>();
         boolean matchedLangLabel = false;
         //avoid matching multiple labels with the exact same lexical.
         Set<String> matchedLabels = new HashSet<String>();
         while(labels.hasNext()){
             PlainLiteral label = labels.next();
             numLabels++;
             String lang = label.getLanguage() != null ? label.getLanguage().toString() : null;
             if((lang == null && curLang == null) ||
                     (lang != null && curLang != null && lang.equalsIgnoreCase(curLang))){
                 if(!matchedLabels.contains(label.getLexicalForm())){
                     matchLabel(searchTokens, match, label);
                     matchedLabels.add(label.getLexicalForm());
                     matchedLangLabel = true;
                 }
             } else if((lang == null && mainLang == null) ||
                     (lang != null && mainLang != null && lang.equalsIgnoreCase(mainLang))){
                 mainLangLabels.add(label);
             } else if((lang == null && defLang == null) ||
                     (lang != null && defLang != null && lang.startsWith(defLang))){
                 defaultLabels.add(label);
             }
         }
         //try to match main language labels
         if(!matchedLangLabel || match.getMatch() == MATCH.NONE){
             for(PlainLiteral mainLangLabel : mainLangLabels){
                 if(!matchedLabels.contains(mainLangLabel.getLexicalForm())){
                     matchLabel(searchTokens, match, mainLangLabel);
                     matchedLabels.add(mainLangLabel.getLexicalForm());
                     matchedLangLabel = true;
                 }
             }
         }
         //use only labels in the default language if there is
         // * no label in the current language or
         // * no MATCH was found in the current language
         if(!matchedLangLabel || match.getMatch() == MATCH.NONE){
             for(PlainLiteral defaultLangLabel : defaultLabels){
                 if(!matchedLabels.contains(defaultLangLabel.getLexicalForm())){
                     matchLabel(searchTokens, match, defaultLangLabel);
                     matchedLabels.add(defaultLangLabel.getLexicalForm());
                 }
             }
         }
         return match;
     }

     /**
      * @param suggestion
      * @param label
      */
     private void matchLabel(List<TokenData> searchTokens, Suggestion suggestion, PlainLiteral label) {
 //        test.begin();
         String text = label.getLexicalForm();
         String lang = label.getLanguage() == null ? null : label.getLanguage().toString();
         if(!linkerConfig.isCaseSensitiveMatching()){
             text = text.toLowerCase(); //TODO use language of label for Locale
         }
         //Tokenize the label and remove remove tokens without alpha numerical chars
         String[] unprocessedLabelTokens = labelTokenizer != null ?
                 labelTokenizer.tokenize(text, lang) : null;
         if(unprocessedLabelTokens == null){ //no tokenizer available
             log.info("Unable to tokenize {} language texts. Will process untokenized label {}",
                 state.getLanguage(),text);
             unprocessedLabelTokens = new String[]{text}; //there is already a warning
         }
         int offset = 0;
         for(int i=0;i<unprocessedLabelTokens.length;i++){
             boolean hasAlphaNumericChar = Utils.hasAlphaNumericChar(unprocessedLabelTokens[i]);
             if(!hasAlphaNumericChar){
                 offset++;
             } else if(offset > 0){
                 String token = unprocessedLabelTokens[i];
                 token = StringUtils.replaceChars(token,".","");
                 unprocessedLabelTokens[i-offset] = token;
             }
         }
         String[] labelTokens;
         if(offset == 0){
             labelTokens = unprocessedLabelTokens;
         } else {
             labelTokens = new String[unprocessedLabelTokens.length-offset];
             System.arraycopy(unprocessedLabelTokens, 0, labelTokens, 0, labelTokens.length);
         }
         //holds the tokens and their position within the label. NOTE that the same
         //token may appear multiple times in the label (e.g. "Da Da Bing"
         Map<String,List<Integer>> labelTokenMap = new HashMap<String, List<Integer>>();
         for(int i=0;i < labelTokens.length; i++){
             List<Integer> tokenIndexes = labelTokenMap.get(labelTokens[i]);
             if(tokenIndexes == null){
                 tokenIndexes = new ArrayList<Integer>(2);
                 labelTokenMap.put(labelTokens[i], tokenIndexes);
             }
             tokenIndexes.add(Integer.valueOf(i));
         }
         NavigableMap<Integer, String> matchedLabelTokens = new TreeMap<Integer,String>();
         int foundProcessableTokens = 0;
         int foundTokens = 0;
         float foundTokenMatch = 0;
         //ensure the correct order of the tokens in the suggested entity
         boolean search = true;
         boolean activeTokenNotMatched = true;
         int firstFoundIndex = -1;
         int firstProcessableFoundIndex = -1;
         int lastFoundIndex = -1;
         int lastProcessableFoundIndex = -1;
         int firstFoundLabelIndex = -1;
         int lastfoundLabelIndex = -1;
         TokenData currentToken;
         String currentTokenText;
         int currentTokenLength;
         int notFound = 0;

         int matchedTokensNotWithinProcessableTokenSpan = 0;
         int foundTokensWithinCoveredProcessableTokens = 0;
         float minTokenMatchFactor = linkerConfig.getMinTokenMatchFactor();
         //search for matches within the correct order
         for(int currentIndex = state.getToken().index;
                 currentIndex < state.getTokens().size()
                 && search ;currentIndex++){
             currentToken = state.getTokens().get(currentIndex);
             if(currentToken.hasAlphaNumeric){
                 currentTokenText = linkerConfig.isLemmaMatching() ?
                         currentToken.getTokenLemma() : currentToken.getTokenText();
                 if(currentTokenText == null) { //no lemma available
                     currentTokenText = currentToken.getTokenText(); //fallback to text
                 }
                 //ignore '.' in tokens to ensure that 'D.C.' matches 'DC' ...
                 currentTokenText = StringUtils.replaceChars(currentTokenText,".","");
                 if(!linkerConfig.isCaseSensitiveMatching()){
                     currentTokenText = currentTokenText.toLowerCase();
                 }
                 currentTokenLength = currentTokenText.length();
                 boolean found = false;
                 float matchFactor = 0f;
                 //iteration starts at the next token after the last matched one
                 //so it is OK to skip tokens in the label, but not within the text
                 for(int i = lastfoundLabelIndex+1;!found && i < labelTokens.length;i ++){
                     String labelTokenText = labelTokens[i];
                     int labelTokenLength = labelTokenText.length();
                     float maxLength = currentTokenLength > labelTokenLength ? currentTokenLength : labelTokenLength;
                     float lengthDif = Math.abs(currentTokenLength - labelTokenLength);
                     if((lengthDif/maxLength)<=(1-minTokenMatchFactor)){ //this prevents unnecessary string comparison
                         int matchCount = compareTokens(currentTokenText, labelTokenText);
                         if(matchCount/maxLength >= minTokenMatchFactor){
                             lastfoundLabelIndex = i; //set the last found index to the current position
                             found = true; //set found to true -> stops iteration
                             matchFactor = matchCount/maxLength; //how good is the match
                             //remove matched labels from the set to disable them for
                             //a later random oder search
                             Integer labelTokenIndex = getLabelTokenIndex(labelTokenText, i, labelTokenMap);
                             matchedLabelTokens.put(labelTokenIndex, labelTokenText);
                         }
                     }
                 }
                 if(!found){
                     //search for a match in the wrong order
                     //currently only exact matches (for testing)
                     Integer index = getLabelTokenIndex(currentTokenText, lastfoundLabelIndex+1, labelTokenMap);
                     if(index != null){
                         matchedLabelTokens.put(index, currentTokenText);
                         found = true;
                         matchFactor = 0.7f;
                     }
                 }
                 //int found = text.indexOf(currentToken.getText().toLowerCase());
                 if(found){ //found
                     if(currentToken.isMatchable){
                         foundProcessableTokens++; //only count processable Tokens
                         if(firstProcessableFoundIndex < 0){
                             firstProcessableFoundIndex = currentIndex;
                         }
                         lastProcessableFoundIndex = currentIndex;
                         foundTokensWithinCoveredProcessableTokens++;
                         if(matchedTokensNotWithinProcessableTokenSpan > 0){
                             foundTokensWithinCoveredProcessableTokens = foundTokensWithinCoveredProcessableTokens +
                                     matchedTokensNotWithinProcessableTokenSpan;
                             matchedTokensNotWithinProcessableTokenSpan = 0;
                         }
                     } else {
                         matchedTokensNotWithinProcessableTokenSpan++;
                     }
                     foundTokens++;
                     foundTokenMatch = foundTokenMatch + matchFactor; //sum up the matches
                     if(firstFoundIndex < 0){
                         firstFoundIndex = currentIndex;
                         firstFoundLabelIndex = lastfoundLabelIndex;
                     }
                     lastFoundIndex = currentIndex;
                 } else { //not found
                     if(state.getToken().index == currentToken.index){
                         //the currently active Token MUST BE matched
                         search = false;
                         activeTokenNotMatched = true;
                     }
                     notFound++;
                     //stop forward search if
 //                    if(currentToken.isMatchable || notFound > linkerConfig.getMaxNotFound()){
                     if(!searchTokens.contains(currentToken)){
                         //stop as soon as a token that needs to be processed is
                         //not found in the label or the maximum number of tokens
                         //that are not processable are not found
                         search = false;
                     }
                 }
             } // else token without alpha or numeric characters are not processed
         }
         //search backwards for label tokens until firstFoundLabelIndex if there
         //are unconsumed Tokens in the sentence before state.getTokenIndex
         int currentIndex = state.getToken().index-1;
         int labelIndex = firstFoundLabelIndex-1;
         notFound = 0;
         matchedTokensNotWithinProcessableTokenSpan = 0;
         if(activeTokenNotMatched){ //do not search backwards if the active token
             //was not found
             search = true;
         }
         while(search && labelIndex >= 0 && currentIndex >= 0){// && currentIndex > state.getConsumedIndex()){
             String labelTokenText = labelTokens[labelIndex];
             if(labelTokenMap.containsKey(labelTokenText)){ //still not matched
                 currentToken = state.getTokens().get(currentIndex);
                 currentTokenText = linkerConfig.isLemmaMatching() ?
                         currentToken.getTokenLemma() : currentToken.getTokenText();
                 if(currentTokenText == null) { //no lemma available
                     currentTokenText = currentToken.getTokenText(); //fallback to text
                 }
                 if(!linkerConfig.isCaseSensitiveMatching()){
                     currentTokenText = currentTokenText.toLowerCase();
                 }
                 currentTokenText = StringUtils.replaceChars(currentTokenText,".","");
                 currentTokenLength = currentTokenText.length();
                 boolean found = false;
                 float matchFactor = 0f;
                 int labelTokenLength = labelTokenText.length();
                 float maxLength = currentTokenLength > labelTokenLength ? currentTokenLength : labelTokenLength;
                 float lengthDif = Math.abs(currentTokenLength - labelTokenLength);
                 if((lengthDif/maxLength)<=(1-minTokenMatchFactor)){ //this prevents unnecessary string comparison
                     int matchCount = compareTokens(currentTokenText, labelTokenText);
                     if(matchCount/maxLength >= minTokenMatchFactor){
                         found = true; //set found to true -> stops iteration
                         matchFactor = matchCount/maxLength; //how good is the match
                     }
                 }
                 if(found){ //found
                     if(currentToken.isMatchable){
                         foundProcessableTokens++; //only count processable Tokens
                         if(lastProcessableFoundIndex < 0){ //if last is not yet set
                             lastProcessableFoundIndex = currentIndex;
                         }
                         firstProcessableFoundIndex = currentIndex;
                         foundTokensWithinCoveredProcessableTokens++;
                         if(matchedTokensNotWithinProcessableTokenSpan > 0){
                             foundTokensWithinCoveredProcessableTokens = foundTokensWithinCoveredProcessableTokens +
                                     matchedTokensNotWithinProcessableTokenSpan;
                             matchedTokensNotWithinProcessableTokenSpan = 0;
                         }
                     } else {
                         matchedTokensNotWithinProcessableTokenSpan++;
                     }
                     foundTokens++;
                     foundTokenMatch = foundTokenMatch + matchFactor; //sum up the matches
                     firstFoundIndex = currentIndex;
                     labelIndex--;
                     Integer foundIndex = getLabelTokenIndex(labelTokenText, currentIndex, labelTokenMap);
                     matchedLabelTokens.put(foundIndex, labelTokenText);
                 } else {
                     notFound++;
                     if(currentToken.isMatchable || notFound > linkerConfig.getMaxNotFound()){
                         //stop as soon as a token that needs to be processed is
                         //not found in the label or the maximum number of tokens
                         //that are not processable are not found
                         search = false;
                     }
                 }
                 currentIndex --;
             } else { //this token is already matched ...
                 labelIndex--; //try the next one
             }
         }
         if(foundProcessableTokens > 0) { //if any Token has matched
             //Now we make a second round to search tokens that match in the wrong order
             //e.g. if given and family name of persons are switched
             final LabelMatch labelMatch;
             int coveredTokens = lastFoundIndex-firstFoundIndex+1;
             int coveredProcessableTokens = lastProcessableFoundIndex-firstProcessableFoundIndex+1;
             //matched tokens only within the span of the first/last processable token
             //Matching rules
             // - if less than config#minTokenFound() than accept only EXACT
             // - override PARTIAL matches with FULL/EXACT matches only if
             //   foundTokens of the PARTIAL match is > than of the FULL/EXACT
             //   match (this will be very rare
             String currentText = state.getTokenText(firstFoundIndex,coveredTokens);
             if(linkerConfig.isCaseSensitiveMatching() ? currentText.equals(text) : currentText.equalsIgnoreCase(text)){
                 labelMatch = new LabelMatch(firstFoundIndex, coveredTokens, label);
             } else {
                 int coveredLabelTokens = matchedLabelTokens.lastKey().intValue()-matchedLabelTokens.firstKey().intValue()+1;
                 if(foundTokens == labelTokens.length && foundTokens == coveredTokens){
                     //if all token matched set found to covered: May be lower because only
                     //processable tokens are counted, but FULL also checks
                     //of non-processable!
                     foundTokens = coveredTokens;
                     foundProcessableTokens = coveredProcessableTokens;
                 }
                 labelMatch = new LabelMatch(firstProcessableFoundIndex, coveredProcessableTokens,
                     foundProcessableTokens,foundTokensWithinCoveredProcessableTokens,
                     foundTokenMatch/(float)foundTokens,label,labelTokens.length, coveredLabelTokens);
             }
             if(labelMatch.getLabelScore() >= linkerConfig.getMinLabelScore() &&
                     labelMatch.getTextScore() >= linkerConfig.getMinTextScore() &&
                     labelMatch.getMatchScore() >= linkerConfig.getMinMatchScore()){
                 suggestion.addLabelMatch(labelMatch);
             }
         } //else NO tokens found -> nothing to do
 //        test.complete();
     }
     /**
      * Utility Method that searches for the Index of the parsed label token text
      * within the labelTokenMap. Matched tokens are removed from the parsed
      * LabelTokenMap <p>
      * NOTE: This is necessary, because in cases where Labels do contain the same
      * token twice, it might not be always clear which token is the matching one.
      * Especially if the order of the Tokens in the Text does not exactly match
      * the order within the Label. This Method tries always to find the matching
      * token closest to the parsed currentIndex.
      * It iterates backwards to prefer Tokens that occur later as the current index
      * in the tokenized label.
      * @param labelTokenText the text of the current labelToken
      * @param currentIndex the current index of the processing (or if not known
      * the last matched index of an token within the label
      * @param labelTokenMap the Map holding tokens as key and a list of occurrences
      * as values or <code>null</code> if no Token with the parsed labelTokenText
      * was present as key in the parsed labelTokenMap
      * @return the index of the selected label token
      */
     private Integer getLabelTokenIndex(String labelTokenText, int currentIndex,
             Map<String,List<Integer>> labelTokenMap) {
         List<Integer> tokenIndexes = labelTokenMap.get(labelTokenText);
         if(tokenIndexes == null){
             return null;
         }
         //try to remove the closest index in the map
         Integer labelTokenIndex = Integer.valueOf(currentIndex);
         //search the closest position
         int closest = Integer.MAX_VALUE;
         int closestIndex = -1;
         for(int p = tokenIndexes.size()-1; p >= 0; p--){
             Integer index = tokenIndexes.get(p);
             int dif = Math.abs(index.intValue()-currentIndex);
             if(Math.abs(index.intValue()-currentIndex) < closest){
                 closest = dif;
                 closestIndex = p;
                 labelTokenIndex = index;
                 if(closest == 0){
                     break;
                 }
             }
         }
         tokenIndexes.remove(closestIndex);
         if(tokenIndexes.isEmpty()){
             labelTokenMap.remove(labelTokenText);
         }
         return labelTokenIndex;
     }
     /**
      * Compares to token with each other and returns the longest match. The
      * tokens are compared from the beginning and from the end.
      * @param token1 the first token
      * @param token2 the second token
      * @return the number of matching chars
      */
     private int compareTokens(String token1,String token2){
         int l1 = token1.length(); //length of the first token
         int l2 = token2.length(); //length of the second token
         //in case of same length check for equals first
         if(l1 == l2 && token1.equals(token2)){
             return l1;
         }
         int ml = l1>l2?l2:l1; //minimum length of a token
         if(ml == 0){
             return ml;
         }
         int f = 0; //forward match count + 1
         int b = 0; //backward match count + 1
         boolean match = true; //still matches
         while(match && f < ml){
             match = token1.charAt(f) == token2.charAt(f);
             f++;
         }
         if(!match){
             f--;
         }
         if(f < ml){
             match = true;
             while(match && b < ml){
                 b++;
                 match = token1.charAt(l1-b) == token2.charAt(l2-b);
             }
             if(!match){
                 b--;
             }
         }
         return f > b ? f : b;
     }
     /**
      * This logs the statistics about the processing process
      * @param log the logger used to log the statistics
      */
     public void logStatistics(Logger log){
         log.info("EntityLinking Statistics:");
         double textProcessingDuration = textProcessingStats.getDuration();
         double lookupDuration = lookupStats.getDuration();
         double matchingDuration = matchingStats.getDuration();
         double rankingDuration = rankingStats.getDuration();
         double other = processingTime-textProcessingDuration-lookupDuration-matchingDuration;
         log.info("    - overal: {}ms (text processing: {}%, lookup: {}%, matching {}%, ranking {}%, other {}%)", new Object[]{
                 processingTime,
                 Math.round(textProcessingDuration*100/(double)processingTime),
                 Math.round(lookupDuration*100/(double)processingTime),
                 Math.round(matchingDuration*100/(double)processingTime),
                  Math.round(rankingDuration*100/(double)processingTime),
                 Math.round(other*100/(double)processingTime),
         });
         textProcessingStats.printStatistics(log);
         lookupStats.printStatistics(log);
         log.info("      - {} query results ({} filtered - {}%)",
             new Object[]{numQueryResults,numFilteredResults,
                 numFilteredResults*100f/(float)numQueryResults});
         matchingStats.printStatistics(log);
         rankingStats.printStatistics(log);
 //        test.printStatistics(log);
 //        test2.printStatistics(log);
     }

 }