| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.stanbol.enhancer.engines.keywordextraction.impl; |
| |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Collection; |
| import java.util.Collections; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.Iterator; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Set; |
| |
| import org.apache.clerezza.rdf.core.UriRef; |
| import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText.Token; |
| import org.apache.stanbol.enhancer.engines.keywordextraction.impl.EntityLinkerConfig.RedirectProcessingMode; |
| import org.apache.stanbol.enhancer.engines.keywordextraction.impl.Suggestion.MATCH; |
| import org.apache.stanbol.enhancer.engines.keywordextraction.linking.AnalysedContent; |
| import org.apache.stanbol.enhancer.engines.keywordextraction.linking.EntitySearcher; |
| import org.apache.stanbol.enhancer.servicesapi.EngineException; |
| import org.apache.stanbol.entityhub.servicesapi.defaults.NamespaceEnum; |
| import org.apache.stanbol.entityhub.servicesapi.model.Reference; |
| import org.apache.stanbol.entityhub.servicesapi.model.Representation; |
| import org.apache.stanbol.entityhub.servicesapi.model.Text; |
| import org.apache.stanbol.entityhub.servicesapi.model.rdf.RdfResourceEnum; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
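| /** |
| * Links the {@link Token}s of an {@link AnalysedContent} with entities |
| * provided by an {@link EntitySearcher}, using the rules defined by the |
| * parsed {@link EntityLinkerConfig}.<p> |
| * A minimal usage sketch (the variable names are illustrative only; |
| * content, searcher and config are assumed to be already initialised): |
| * <pre>{@code |
| *   EntityLinker linker = new EntityLinker(content, searcher, config); |
| *   linker.process(); //step over sentences, chunks and tokens |
| *   Map<String,LinkedEntity> linked = linker.getLinkedEntities(); |
| * }</pre> |
| */ |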
| public class EntityLinker { |
| |
| private final Logger log = LoggerFactory.getLogger(EntityLinker.class); |
| |
| private final EntityLinkerConfig config; |
| private final AnalysedContent content; |
| private final EntitySearcher entitySearcher; |
| /** |
| * The state of the current processing |
| */ |
| private final ProcessingState state; |
| /** |
| * The map holding the results of the linking process |
| */ |
| private final Map<String,LinkedEntity> linkedEntities = new HashMap<String,LinkedEntity>(); |
| |
| private Integer lookupLimit; |
| /** |
| * After {@link #process()} was called, this returns the entities linked |
| * for the parsed {@link AnalysedContent}. |
| * @return the linked entities |
| */ |
| public final Map<String,LinkedEntity> getLinkedEntities() { |
| return linkedEntities; |
| } |
| public EntityLinker(AnalysedContent content,EntitySearcher taxonomy,EntityLinkerConfig config){ |
| if(config == null){ |
| throw new IllegalArgumentException("The parsed TaxonomyLinkerConfig MUST NOT be NULL!"); |
| } |
| if(taxonomy == null){ |
| throw new IllegalArgumentException("The parsed Taxonomy MUST NOT be NULL!"); |
| } |
| if(content == null){ |
| throw new IllegalArgumentException("The parsed AnalysedContent MUST NOT be NULL!"); |
| } |
| this.content = content; |
| this.entitySearcher = taxonomy; |
| this.config = config; |
| this.state = new ProcessingState(content.getAnalysedText()); |
| this.lookupLimit = Math.max(10,config.getMaxSuggestions()*2); |
| } |
| /** |
| * Steps over the sentences, chunks and tokens of the parsed |
| * {@link AnalysedContent} and links them with entities. |
| */ |
| public void process() throws EngineException { |
| int debuggedIndex = 0; |
| while(state.next()) { |
| if(log.isDebugEnabled() && (state.getTokenIndex() > debuggedIndex || state.getTokenIndex() == 0)){ |
| debuggedIndex = state.getTokenIndex(); |
| Token token = state.getToken(); |
| log.debug(" {} {} (pos:{}|prob:{})",new Object[]{ |
| isProcessableToken(token)? '+':'-', |
| token.getText(),token.getPosTags(),token.getPosProbabilities() |
| }); |
| } |
| if(isProcessableToken(state.getToken())){ |
| List<String> searchStrings = new ArrayList<String>(config.getMaxSearchTokens()); |
| searchStrings.add(state.getToken().getText()); |
| //get the list of all tokens that can possibly be matched |
| int includeTokenIndex = state.getTokenIndex(); |
| includeTokenIndex++; |
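| //e.g. with maxSearchTokens=2 and the processable tokens "Barack" |
| //and "Obama", searchStrings would become ["Barack", "Obama"] |
| //(hypothetical example) |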
| while(searchStrings.size() < config.getMaxSearchTokens() && //more search strings |
| (includeTokenIndex <= (state.getChunk() != null ? //still within |
| state.getChunk().getEnd() : //the chunk |
| state.getSentence().getTokens().size()-1))){ //or sentence |
| Token included = state.getSentence().getTokens().get(includeTokenIndex); |
| if(log.isDebugEnabled() && includeTokenIndex > debuggedIndex){ |
| debuggedIndex = includeTokenIndex; |
| log.debug(" {} {} (pos:{}|prob:{})",new Object[]{ |
| isProcessableToken(included)? '+':'-', |
| included.getText(),included.getPosTags(),included.getPosProbabilities() |
| }); |
| } |
| includeTokenIndex++; |
| if(isProcessableToken(included)){ |
| searchStrings.add(included.getText()); |
| } |
| } |
| //search for Entities |
| List<Suggestion> suggestions = lookupEntities(searchStrings); |
| if(!suggestions.isEmpty()){ |
| //update the suggestions based on the best match |
| int bestMatchCount = suggestions.get(0).getMatchCount(); |
| Iterator<Suggestion> it = suggestions.iterator(); |
| while(it.hasNext()){ |
| Suggestion suggestion = it.next(); |
| //suggestions that match fewer tokens than the best match |
| //need to be updated to PARTIAL |
| if(suggestion.getMatchCount() < bestMatchCount){ |
| suggestion.setMatch(MATCH.PARTIAL); |
| } |
| //filter out matches with fewer than config.getMinFoundTokens() |
| //if their match count is lower than that of the best match |
| if(suggestion.getMatchCount() < bestMatchCount && |
| suggestion.getMatchCount() < config.getMinFoundTokens()){ |
| it.remove(); |
| } else { //calculate the score |
| double suggestionMatchScore = suggestion.getMatchCount()*suggestion.getMatchScore(); |
| //how good is the current match in relation to the best one |
| double spanScore = (double)suggestion.getMatchCount()/(double)bestMatchCount; //cast to avoid integer division |
| //how good is the match to the span selected by this suggestion |
| double textScore = suggestionMatchScore/suggestion.getSpan(); |
| //how good is the match in relation to the tokens of the suggested label |
| double labelScore = suggestionMatchScore/suggestion.getLabelTokenCount(); |
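| //worked example (hypothetical values): matchCount=2, matchScore=0.9, |
| //bestMatchCount=3, span=3, labelTokenCount=2 -> suggestionMatchScore=1.8, |
| //spanScore~0.67, textScore=0.6, labelScore=0.9 and the resulting |
| //score ~ 0.67*0.67*0.6*0.9 ~ 0.24 |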
| suggestion.setScore(spanScore*spanScore*textScore*labelScore); |
| } |
| } |
| Suggestion oldBestRanked = suggestions.get(0); //for debugging |
| //resort by score |
| Collections.sort(suggestions, Suggestion.SCORE_COMPARATOR); |
| //this should never happen ... but the |
| //match count of the best match MUST NOT change |
| //after the sort by score! |
| if(bestMatchCount != suggestions.get(0).getMatchCount()){ |
| log.warn("The match count for the top Ranked Suggestion for {} " + |
| "changed after resorting based on Scores!", |
| state.getTokenText(suggestions.get(0).getStart(),bestMatchCount)); |
| log.warn(" originalbest : {}",oldBestRanked); |
| log.warn(" currnet ranking : {}",suggestions); |
| log.warn(" ... this will result in worng confidence values relative to the best match"); |
| } |
| //remove all suggestions > config.maxSuggestions |
| if(suggestions.size() > config.getMaxSuggestions()){ |
| suggestions.subList(config.getMaxSuggestions(),suggestions.size()).clear(); |
| } |
| |
| //process redirects |
| if(config.getRedirectProcessingMode() != RedirectProcessingMode.IGNORE){ |
| for(Suggestion suggestion : suggestions){ |
| processRedirects(suggestion); |
| } |
| } |
| int start = suggestions.get(0).getStart(); |
| int span = suggestions.get(0).getSpan(); |
| //Store the linking results |
| String selectedText = state.getTokenText(start,span); |
| //float score; |
| LinkedEntity linkedEntity = linkedEntities.get(selectedText); |
| if(linkedEntity == null){ |
| linkedEntity = new LinkedEntity(selectedText, |
| suggestions, getLinkedEntityTypes(suggestions.subList(0, 1))); |
| linkedEntities.put(selectedText, linkedEntity); |
| } |
| linkedEntity.addOccurrence( |
| state.getSentence(), start, span); |
| //set the next token to process to the next word after the |
| //currently found suggestion |
| state.setConsumed(start+span-1); |
| } |
| |
| } //else do not process this token |
| } |
| } |
| /** |
| * Retrieves all {@link EntitySearcher#getTypeField()} values of the parsed |
| * {@link Suggestion}s and then looks up the {@link NamespaceEnum#dcTerms dc}:type |
| * values for the {@link LinkedEntity#getTypes()} by using the configured |
| * {@link EntityLinkerConfig#getTypeMappings() type mappings} (and, if |
| * no mapping is found, the {@link EntityLinkerConfig#getDefaultDcType() |
| * default} type). |
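| * <p>Example (hypothetical type URIs): if a suggestion has the concept type |
| * <code>foaf:Person</code> and the type mappings map it to a dc:type value, |
| * that value is added; concept types without a mapping are dropped, and the |
| * default dc:type is only used if no mapping applied at all. |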
| * @param suggestions The list of suggestions |
| * @return the type values for the {@link LinkedEntity} |
| */ |
| private Set<UriRef> getLinkedEntityTypes(Collection<Suggestion> suggestions){ |
| Collection<String> conceptTypes = new HashSet<String>(); |
| for(Suggestion suggestion : suggestions){ |
| for(Iterator<Reference> types = |
| suggestion.getRepresentation().getReferences(config.getTypeField()); |
| types.hasNext();conceptTypes.add(types.next().getReference())); |
| } |
| Map<String,UriRef> typeMappings = config.getTypeMappings(); |
| Set<UriRef> dcTypes = new HashSet<UriRef>(); |
| for(String conceptType : conceptTypes){ |
| UriRef dcType = typeMappings.get(conceptType); |
| if(dcType != null){ |
| dcTypes.add(dcType); |
| } |
| } |
| if(dcTypes.isEmpty() && config.getDefaultDcType() != null){ |
| dcTypes.add(config.getDefaultDcType()); |
| } |
| return dcTypes; |
| } |
| /** |
| * Processes {@link EntitySearcher#getRedirectField() redirect field} values for |
| * the parsed suggestions based on the {@link RedirectProcessingMode} |
| * as configured in the {@link #config}.<p> |
| * The results of this method are stored within the parsed {@link Suggestion}s |
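| * <p>Mode semantics (as implemented below): IGNORE leaves the suggestion |
| * untouched; ADD_VALUES copies all fields of the redirected entities into |
| * the result; FOLLOW replaces the result with the redirected entity while |
| * keeping the original result score. |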
| * @param suggestion The suggestion to process. |
| */ |
| private void processRedirects(Suggestion suggestion) { |
| //if mode is IGNORE -> nothing to do |
| if(config.getRedirectProcessingMode() == RedirectProcessingMode.IGNORE){ |
| return; |
| } |
| //in case results for queries are locally cached it might be that |
| //redirects are already processed for some/all of the results. |
| //therefore a small internal state stores this information |
| if(suggestion.isRedirectedProcessed()){ |
| return; //redirects for this Suggestion are already processed ... ignore |
| } |
| Representation result = suggestion.getResult(); |
| Iterator<Reference> redirects = result.getReferences(config.getRedirectField()); |
| switch (config.getRedirectProcessingMode()) { |
| case ADD_VALUES: |
| while(redirects.hasNext()){ |
| Reference redirect = redirects.next(); |
| if(redirect != null){ |
| Representation redirectedEntity = entitySearcher.get(redirect.getReference(), |
| config.getSelectedFields()); |
| if(redirectedEntity != null){ |
| for(Iterator<String> fields = redirectedEntity.getFieldNames();fields.hasNext();){ |
| String field = fields.next(); |
| result.add(field, redirectedEntity.get(field)); |
| } |
| } |
| } |
| } |
| //remember that the redirects were searched for this result |
| suggestion.setRedirectProcessed(true); |
| break; //avoid falling through to FOLLOW |
| case FOLLOW: |
| while(redirects.hasNext()){ |
| Reference redirect = redirects.next(); |
| if(redirect != null){ |
| Representation redirectedEntity = entitySearcher.get(redirect.getReference(), |
| config.getSelectedFields()); |
| if(redirectedEntity != null){ |
| //copy the original result score |
| redirectedEntity.set(RdfResourceEnum.resultScore.getUri(), |
| result.get(RdfResourceEnum.resultScore.getUri())); |
| //set the redirect |
| suggestion.setRedirect(redirectedEntity); |
| } |
| } |
| } |
| break; |
| default: //nothing to do |
| } |
| } |
| /** |
| * Searches for Entities in the {@link #entitySearcher} corresponding to the |
| * {@link Token#getText() words} of the current {@link #state position} in |
| * the text. |
| * @param searchStrings the list of {@link Token#getText() words} to search |
| * entities for. |
| * @return The sorted list with the suggestions. |
| * If there are no suggestions an empty list will be returned. |
| */ |
| private List<Suggestion> lookupEntities(List<String> searchStrings) throws EngineException { |
| Collection<? extends Representation> results; |
| try { |
| results = entitySearcher.lookup(config.getNameField(),config.getSelectedFields(), |
| searchStrings, state.getSentence().getLanguage(),config.getDefaultLanguage()); |
| } catch (RuntimeException e) { |
| throw new EngineException(e.getMessage(),e); |
| } |
| List<Suggestion> suggestions = new ArrayList<Suggestion>(); |
| for(Representation result : results){ |
| Suggestion match = matchLabels(result); |
| if(match.getMatch() != MATCH.NONE){ |
| suggestions.add(match); |
| } |
| } |
| //sort the suggestions |
| if(suggestions.size()>1){ |
| Collections.sort(suggestions,Suggestion.DEFAULT_SUGGESTION_COMPARATOR); |
| } |
| //TODO: Work-in-progress feature ... allows refining the search if no |
| // suggestion is found but results were present |
| // However this would need full limit/offset support for the |
| // EntitySearcher. (rwesten 2012-05-21) |
| // Integer maxResults = entitySearcher.getLimit(); |
| // if(maxResults == null){ |
| // maxResults = 1; //fall back to 1 if limit is not known |
| // } |
| // if(suggestions.isEmpty() && //if no suggestions were found |
| // results.size() >= maxResults && //but the query had max results |
| // //then the actual entity might not be within the first LIMIT results |
| // searchStrings.size() > 1){ //if multiple words were used for the search |
| // //try again with only a single word |
| // suggestions = lookupEntities(Collections.singletonList(searchStrings.get(0))); |
| // |
| // } |
| //NOTE: trimming to config.getMaxSuggestions() is done by the caller |
| return suggestions; |
| } |
| /** |
| * Matches the labels of the parsed {@link Representation} with the Tokens of |
| * the text (beginning with the currently active |
| * {@link ProcessingState#getToken() token}).<p> |
| * The field used to get the labels is retrieved from |
| * {@link EntitySearcher#getNameField()}. Only labels with no language or the |
| * language of the current sentence are considered. If fewer than |
| * {@link EntityLinkerConfig#getMinFoundTokens()} tokens match with a |
| * label, the entity is only considered to match if the label is |
| * {@link String#equalsIgnoreCase(String)} to the text covered by the |
| * matched token(s). Otherwise also {@link MATCH#FULL} and {@link MATCH#PARTIAL} |
| * results are allowed. |
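| * <p>For example (hypothetical labels): with sentence language "en" the |
| * labels "Paris"@en and "Paris"@en-GB are matched directly, "Paris"@de is |
| * ignored, and labels in the configured default language are only |
| * considered if no label in the sentence language was present. |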
| * @param rep The representation including at least the data for the |
| * {@link EntitySearcher#getNameField()} property. |
| * @return The result of the matching. |
| */ |
| private Suggestion matchLabels(Representation rep) { |
| String curLang = state.getLanguage(); //language of the current sentence |
| String defLang = config.getDefaultLanguage(); //configured default language |
| // Iterator<Text> labels = rep.get(config.getNameField(), //get all labels |
| // state.getLanguage(), //in the current language |
| // config.getDefaultLanguage()); //and the default language |
| Iterator<Text> labels = rep.getText(config.getNameField()); |
| Suggestion match = new Suggestion(rep); |
| Collection<Text> defaultLabels = new ArrayList<Text>(); |
| boolean matchedCurLangLabel = false; |
| while(labels.hasNext()){ |
| Text label = labels.next(); |
| String lang = label.getLanguage(); |
| if((lang == null && curLang == null) || |
| (lang != null && curLang != null && lang.startsWith(curLang))){ |
| matchLabel(match, label); |
| matchedCurLangLabel = true; |
| } else if((lang ==null && defLang == null) || |
| (lang != null && defLang != null && lang.startsWith(defLang))){ |
| defaultLabels.add(label); |
| } |
| } |
| //fall back to labels in the default language if no label in the |
| //current language was matched |
| if(!matchedCurLangLabel){// || match.getMatch() == MATCH.NONE){ |
| for(Text defaultLangLabel : defaultLabels){ |
| matchLabel(match, defaultLangLabel); |
| } |
| } |
| return match; |
| } |
| |
| /** |
| * Matches the tokens of the parsed label against the tokens of the text |
| * (starting with the current token) and updates the parsed |
| * {@link Suggestion} if this label is a better match. |
| * @param match the suggestion to update |
| * @param label the label to match against the text |
| */ |
| private void matchLabel(Suggestion match, Text label) { |
| String text = label.getText(); |
| if(!config.isCaseSensitiveMatching()){ |
| text = text.toLowerCase(); //TODO use language of label for Locale |
| } |
| //tokenize the label and remove tokens without alphanumeric chars |
| String[] unprocessedLabelTokens = content.tokenize(text); |
| int offset = 0; |
| for(int i=0;i<unprocessedLabelTokens.length;i++){ |
| boolean hasAlpha = false; |
| for(int j=0;!hasAlpha && j<unprocessedLabelTokens[i].length();j++){ |
| hasAlpha = Character.isLetterOrDigit(unprocessedLabelTokens[i].charAt(j)); |
| } |
| if(!hasAlpha){ |
| offset++; |
| } else if(offset > 0){ |
| unprocessedLabelTokens[i-offset] = unprocessedLabelTokens[i]; |
| } |
| } |
| String[] labelTokens; |
| if(offset == 0){ |
| labelTokens = unprocessedLabelTokens; |
| } else { |
| labelTokens = new String[unprocessedLabelTokens.length-offset]; |
| System.arraycopy(unprocessedLabelTokens, 0, labelTokens, 0, labelTokens.length); |
| } |
| Set<String> labelTokenSet = new HashSet<String>( |
| Arrays.asList(labelTokens)); |
| int foundProcessableTokens = 0; |
| int foundTokens = 0; |
| float foundTokenMatch = 0; |
| //ensure the correct order of the tokens in the suggested entity |
| boolean search = true; |
| int firstFoundIndex = -1; |
| int lastFoundIndex = -1; |
| int firstFoundLabelIndex = -1; |
| int lastFoundLabelIndex = -1; |
| Token currentToken; |
| String currentTokenText; |
| int currentTokenLength; |
| int notFound = 0; |
| float minTokenMatchFactor = config.getMinTokenMatchFactor(); |
| //search for matches within the correct order |
| for(int currentIndex = state.getTokenIndex(); |
| currentIndex < state.getSentence().getTokens().size() |
| && search ;currentIndex++){ |
| currentToken = state.getSentence().getTokens().get(currentIndex); |
| if(currentToken.hasAplhaNumericChar()){ |
| currentTokenText = currentToken.getText(); |
| if(!config.isCaseSensitiveMatching()){ |
| currentTokenText = currentTokenText.toLowerCase(); |
| } |
| currentTokenLength = currentTokenText.length(); |
| boolean isProcessable = isProcessableToken(currentToken); |
| boolean found = false; |
| float matchFactor = 0f; |
| //iteration starts at the next token after the last matched one |
| //so it is OK to skip tokens in the label, but not within the text |
| for(int i = lastFoundLabelIndex+1;!found && i < labelTokens.length;i++){ |
| String labelTokenText = labelTokens[i]; |
| int labelTokenLength = labelTokenText.length(); |
| float maxLength = currentTokenLength > labelTokenLength ? currentTokenLength : labelTokenLength; |
| float lengthDif = Math.abs(currentTokenLength - labelTokenLength); |
| if((lengthDif/maxLength)<=(1-minTokenMatchFactor)){ //this prevents unnecessary string comparison |
| int matchCount = compareTokens(currentTokenText, labelTokenText); |
| if(matchCount/maxLength >= minTokenMatchFactor){ |
| lastFoundLabelIndex = i; //set the last found index to the current position |
| found = true; //set found to true -> stops iteration |
| matchFactor = matchCount/maxLength; //how good is the match |
| //remove matched label tokens from the set to exclude them from |
| //the later out-of-order search |
| labelTokenSet.remove(labelTokenText); |
| } |
| } |
| } |
| if(!found){ |
| //search for a match in the wrong order |
| //currently only exact matches (for testing) |
| if(found = labelTokenSet.remove(currentTokenText)){ |
| matchFactor = 0.7f; |
| } |
| } |
| if(found){ //found |
| if(isProcessable){ |
| foundProcessableTokens++; //only count processable Tokens |
| } |
| foundTokens++; |
| foundTokenMatch = foundTokenMatch + matchFactor; //sum up the matches |
| if(firstFoundIndex < 0){ |
| firstFoundIndex = currentIndex; |
| firstFoundLabelIndex = lastFoundLabelIndex; |
| } |
| lastFoundIndex = currentIndex; |
| } else { //not found |
| notFound++; |
| if(isProcessable || notFound > config.getMaxNotFound()){ |
| //stop as soon as a processable token is not found in the |
| //label, or the maximum number of not-found (non-processable) |
| //tokens is exceeded |
| search = false; |
| } |
| } |
| } // else tokens without alpha or numeric characters are not processed |
| } |
| //search backwards for label tokens until firstFoundLabelIndex if there |
| //are unconsumed Tokens in the sentence before state.getTokenIndex |
| int currentIndex = state.getTokenIndex()-1; |
| int labelIndex = firstFoundLabelIndex-1; |
| notFound = 0; |
| search = true; |
| while(search && labelIndex >= 0 && currentIndex > state.getConsumedIndex()){ |
| String labelTokenText = labelTokens[labelIndex]; |
| if(labelTokenSet.remove(labelTokenText)){ //still not matched |
| currentToken = state.getSentence().getTokens().get(currentIndex); |
| boolean isProcessable = isProcessableToken(currentToken); |
| currentTokenText = currentToken.getText(); |
| if(!config.isCaseSensitiveMatching()){ |
| currentTokenText = currentTokenText.toLowerCase(); |
| } |
| currentTokenLength = currentTokenText.length(); |
| boolean found = false; |
| float matchFactor = 0f; |
| int labelTokenLength = labelTokenText.length(); |
| float maxLength = currentTokenLength > labelTokenLength ? currentTokenLength : labelTokenLength; |
| float lengthDif = Math.abs(currentTokenLength - labelTokenLength); |
| if((lengthDif/maxLength)<=(1-minTokenMatchFactor)){ //this prevents unnecessary string comparison |
| int matchCount = compareTokens(currentTokenText, labelTokenText); |
| if(matchCount/maxLength >= minTokenMatchFactor){ |
| found = true; //set found to true -> stops iteration |
| matchFactor = matchCount/maxLength; //how good is the match |
| } |
| } |
| if(found){ //found |
| if(isProcessable){ |
| foundProcessableTokens++; //only count processable Tokens |
| } |
| foundTokens++; |
| foundTokenMatch = foundTokenMatch + matchFactor; //sum up the matches |
| firstFoundIndex = currentIndex; |
| currentIndex --; |
| } else { |
| notFound++; |
| if(isProcessable || notFound > config.getMaxNotFound()){ |
| //stop as soon as a processable token is not found in the |
| //label, or the maximum number of not-found (non-processable) |
| //tokens is exceeded |
| search = false; |
| } |
| } |
| } |
| labelIndex--; |
| } |
| //evaluate the collected match; out-of-order matches (e.g. switched |
| //given and family names of persons) were already covered above |
| MATCH labelMatch; |
| int coveredTokens = lastFoundIndex-firstFoundIndex+1; |
| float labelMatchScore = (foundTokenMatch/(float)labelTokens.length); |
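| //e.g. (hypothetical values): firstFoundIndex=3, lastFoundIndex=5 -> |
| //coveredTokens=3; labelTokens.length=3 and foundTokenMatch=2.7 -> |
| //labelMatchScore=0.9 |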
| //Matching rules |
| // - if fewer than config#getMinFoundTokens() tokens match, accept only EXACT |
| // - override PARTIAL matches with FULL/EXACT matches only if |
| // foundTokens of the PARTIAL match is greater than that of the |
| // FULL/EXACT match (this will be very rare) |
| if(foundProcessableTokens > 0 && match.getMatchCount() <= foundProcessableTokens) { |
| String currentText = state.getTokenText(firstFoundIndex,coveredTokens); |
| if(config.isCaseSensitiveMatching() ? currentText.equals(text) : currentText.equalsIgnoreCase(text)){ |
| labelMatch = MATCH.EXACT; |
| //set found to covered: it may be lower because only |
| //processable tokens are counted, but EXACT also checks |
| //non-processable ones! |
| foundTokens = coveredTokens; |
| } else if((foundProcessableTokens >= config.getMinFoundTokens() || |
| //NOTE (rwesten, 2012-05-21): Do not check if all covered |
| // Tokens are found, but if all Tokens of the Label are |
| // matched! (STANBOL-622) |
| //foundTokens == coveredTokens) && |
| foundTokens >= labelTokens.length) && |
| labelMatchScore >= 0.6f){ |
| //same as above |
| //if(foundTokens == coveredTokens){ |
| if(foundTokens == labelTokens.length && foundTokens == coveredTokens){ |
| labelMatch = MATCH.FULL; |
| } else { |
| labelMatch = MATCH.PARTIAL; |
| } |
| } else { |
| labelMatch = MATCH.NONE; |
| } |
| if(labelMatch != MATCH.NONE){ |
| if(match.getMatchCount() < foundProcessableTokens || |
| (match.getMatchCount() == foundProcessableTokens && |
| labelMatch.ordinal() > match.getMatch().ordinal())){ |
| match.updateMatch(labelMatch, firstFoundIndex, coveredTokens, foundTokens, |
| foundTokenMatch/foundTokens,label,labelTokens.length); |
| } //else this match is not better than the existing one |
| } //else ignore labels with MATCH.NONE |
| } //else NO tokens found -> nothing to do |
| } |
| /** |
| * Compares two tokens with each other and returns the longest match. The |
| * tokens are compared from the beginning and from the end. |
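| * <p>For example <code>compareTokens("traffic", "trafic")</code> returns |
| * <code>4</code>: "traf" matches from the beginning and "fic" from the end. |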
| * @param token1 the first token |
| * @param token2 the second token |
| * @return the number of matching chars |
| */ |
| private int compareTokens(String token1,String token2){ |
| int l1 = token1.length(); //length of the first token |
| int l2 = token2.length(); //length of the second token |
| //in case of same length check for equals first |
| if(l1 == l2 && token1.equals(token2)){ |
| return l1; |
| } |
| int ml = l1>l2?l2:l1; //minimum length of a token |
| if(ml == 0){ |
| return ml; |
| } |
| int f = 0; //forward match count |
| int b = 0; //backward match count |
| boolean match = true; //still matches |
| while(match && f < ml){ |
| match = token1.charAt(f) == token2.charAt(f); |
| f++; |
| } |
| if(!match){ |
| f--; |
| } |
| if(f < ml){ |
| match = true; |
| while(match && b < ml){ |
| b++; |
| match = token1.charAt(l1-b) == token2.charAt(l2-b); |
| } |
| if(!match){ |
| b--; |
| } |
| } |
| return f > b ? f : b; |
| } |
| |
| /** |
| * Checks if the parsed {@link Token} needs to be processed (linked with entities). |
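| * <p>The decision is first based on the POS tags (via the |
| * {@link AnalysedContent} processPOS method); if no POS tag yields a |
| * decision, it falls back to comparing the token length with |
| * {@link EntityLinkerConfig#getMinSearchTokenLength()}. |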
| * @param token the {@link Token} to check. |
| * @return <code>true</code> if the parsed token needs to be processed. |
| * Otherwise <code>false</code> |
| */ |
| private boolean isProcessableToken(Token token) { |
| Boolean processToken = null; |
| String[] posTags = token.getPosTags(); |
| double[] posProb = token.getPosProbabilities(); |
| if(posTags != null && posTags.length > 0){ //guard against empty POS tag arrays |
| int i=0; |
| do { |
| processToken = content.processPOS(posTags[i],posProb[i]); |
| i++; |
| } while(processToken == null && i<posTags.length); |
| } |
| if(processToken == null) { |
| processToken = token.getText().length() >= config.getMinSearchTokenLength(); |
| } |
| return processToken; |
| } |
| } |