blob: 3b18f09cc8e4bcc3a9928cfcd23ffc812dd71545 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.entitylinking.impl;
import static java.util.Collections.disjoint;
import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.POS_ANNOTATION;
import java.util.Collections;
import java.util.List;
import org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig;
import org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig;
import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
import org.apache.stanbol.enhancer.nlp.model.Token;
import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
import org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures;
import org.apache.stanbol.enhancer.nlp.pos.PosTag;
/**
* Internally used to store additional Metadata for Tokens of the current Sentence
* <p>
* Checks if the parsed {@link Token} is processable. This decision is based first on the POS
* annotation (lexical category, POS tag) and second on the
* {@link EntityLinkerConfig#getMinSearchTokenLength()} if no POS annotations are available or the
* probability of the POS annotations is too low.
* <p>
* Since STANBOL-685 two POS probabilities are used: <ul>
* <li> {@link LanguageProcessingConfig#getMinPosAnnotationProbability()} for accepting POS tags that are
* processed - included in {@link LanguageProcessingConfig#getLinkedLexicalCategories()} or
* {@link LanguageProcessingConfig#getLinkedPosTags()}.
* <li> {@link LanguageProcessingConfig#getMinExcludePosAnnotationProbability()} for those that are not
* processed. By default the exclusion probability is set to half of the inclusion one.
* </ul>
* Assuming that nouns are linked, verbs are not, and the minimum POS probability is
* <code>0.667</code> (so that the default exclusion probability is half of it), a
* <ul>
* <li>noun with the probability 0.8 would result in <code>true</code>
* <li>noun with the probability 0.5 would result in <code>null</code>
* <li>verb with the probability 0.4 would result in <code>false</code>
* <li>verb with the probability 0.3 would result in <code>null</code>
* </ul>
* This algorithm makes it less likely that the {@link EntityLinkerConfig#getMinSearchTokenLength()} needs
* to be used as fallback for Tokens (POS annotations typically still provide better estimations than
* the token length).
* <p>
* (See also STANBOL-685, even though that issue refers to a version of this Engine that did not yet
* use the Stanbol NLP processing chain.)
*
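* The outcome for the {@link Token} is stored in {@link #isLinkablePos} and {@link #isMatchablePos}:
* <code>true</code> if the POS annotations indicate that the token needs to be processed,
* <code>false</code> if they indicate that it must not be processed and <code>null</code> if no
* POS annotation allowed a decision.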
*/
public class TokenData {
    /** The Token */
    public final Token token;
    /** The index of the Token within the current Section (Sentence) */
    public final int index;
    /** If this Token should be linked with the Vocabulary (not set by the constructor) */
    public boolean isLinkable;
    /** If this Token should be used for multi word searches in the Vocabulary (not set by the constructor) */
    public boolean isMatchable;
    /** if this Token has an alpha or numeric char */
    public final boolean hasAlphaNumeric;
    /** the chunk of this Token or <code>null</code> if none was parsed to the constructor */
    public final ChunkData inChunk;
    /**
     * the morphological features of the Token (selected based on the POS Tag)
     * or <code>null</code> if no fitting morpho annotation is present
     */
    public final MorphoFeatures morpho;
    /**
     * if this token is not empty and starts with an upperCase letter
     */
    public final boolean upperCase;
    /**
     * if the length of the token is &gt;= {@link LanguageProcessingConfig#getMinSearchTokenLength()}
     */
    public boolean hasSearchableLength;
    /**
     * If the POS type of this word matches a linkable category: <code>true</code> if a
     * linkable POS annotation was accepted, <code>false</code> if a non-linkable POS
     * annotation was accepted and <code>null</code> if no POS annotation was decisive.
     */
    public final Boolean isLinkablePos;
    /**
     * if the POS type of this word matches a matchable category: <code>true</code> if
     * the token is linkable or a matchable POS annotation was accepted, <code>false</code>
     * if a non-matchable POS annotation was accepted and <code>null</code> if no POS
     * annotation was decisive.
     */
    public final Boolean isMatchablePos;
    /**
     * if this Token represents the start of an sub-sentence such as an
     * starting ending quote
     * @see ProcessingState#SUB_SENTENCE_START_POS
     */
    public final boolean isSubSentenceStart;

    /**
     * Constructs and initializes meta data needed for linking based
     * on the current tokens (and its NLP annotation)
     * @param tpc the language processing configuration providing the linked/matched
     * POS types, the minimum POS probabilities and the minimum search token length
     * @param index the index of the Token within the current section
     * @param token the token
     * @param chunk the current chunk or <code>null</code> if none
     */
    public TokenData(LanguageProcessingConfig tpc, int index,Token token, ChunkData chunk) {
        //(0) init fields
        this.token = token;
        this.index = index;
        this.inChunk = chunk;
        final String span = token.getSpan();
        this.hasAlphaNumeric = Utils.hasAlphaNumericChar(span);
        this.hasSearchableLength = span.length() >= tpc.getMinSearchTokenLength();
        this.upperCase = token.getEnd() > token.getStart() && //not an empty token
                Character.isUpperCase(span.codePointAt(0)); //and upper case

        final List<Value<PosTag>> posAnnotations = token.getAnnotations(POS_ANNOTATION);
        //the POS tag the linkable/matchable decision was based on (used to select the morpho features)
        PosTag selectedPosTag = null;

        //(1) check if this Token should be linked against the Vocabulary
        //    the first annotation that is accepted (for inclusion or exclusion) decides
        Boolean linkablePos = null; //null: no POS annotation was decisive
        for(Value<PosTag> posAnnotation : posAnnotations){
            PosTag posTag = posAnnotation.value();
            if(isLinkedPosTag(tpc, posTag)){
                if(hasMinProbability(posAnnotation, tpc.getMinPosAnnotationProbability())){
                    selectedPosTag = posTag;
                    linkablePos = Boolean.TRUE;
                    break;
                } // else probability too low for inclusion
            } else if(hasMinProbability(posAnnotation, tpc.getMinExcludePosAnnotationProbability())){
                selectedPosTag = posTag; //also rejected PosTags are selected
                linkablePos = Boolean.FALSE;
                break;
            } // else probability too low for exclusion
        }
        this.isLinkablePos = linkablePos;

        //(2) check if this token should be considered to match labels of suggestions
        if(Boolean.TRUE.equals(linkablePos)){ //linkable tokens are also matchable
            this.isMatchablePos = Boolean.TRUE;
        } else { //check the mapped POS annotations against the matched lexical categories
            Boolean matchablePos = null; //null: no POS annotation was decisive
            for(Value<PosTag> posAnnotation : posAnnotations){
                PosTag posTag = posAnnotation.value();
                if(!posTag.isMapped()){
                    continue; //not matched ... search next one
                }
                if(!Collections.disjoint(tpc.getMatchedLexicalCategories(), posTag.getCategories())){
                    if(hasMinProbability(posAnnotation, tpc.getMinPosAnnotationProbability())){
                        //override selectedPosTag if present
                        selectedPosTag = posTag; //mark the matchable as selected PosTag
                        matchablePos = Boolean.TRUE;
                        break;
                    } // else probability too low for inclusion
                } else if(hasMinProbability(posAnnotation, tpc.getMinExcludePosAnnotationProbability())){
                    if(selectedPosTag == null){ //do not override existing values
                        selectedPosTag = posTag; //also rejected PosTags are selected
                    }
                    matchablePos = Boolean.FALSE;
                    break;
                } // else probability too low for exclusion
            }
            this.isMatchablePos = matchablePos;
        }

        //(3) check if the POS tag indicates the start/end of an sub-sentence
        this.isSubSentenceStart = detectSubSentenceStart(tpc, posAnnotations);

        //(4) check for morpho analyses
        if(selectedPosTag == null){ //token is not processable or matchable
            //we need to set the selected POS tag to the first POS annotation
            Value<PosTag> firstPosAnnotation = token.getAnnotation(POS_ANNOTATION);
            if(firstPosAnnotation != null) {
                selectedPosTag = firstPosAnnotation.value();
            }
        }
        List<Value<MorphoFeatures>> morphoAnnotations = token.getAnnotations(NlpAnnotations.MORPHO_ANNOTATION);
        this.morpho = selectMorphoFeatures(morphoAnnotations, selectedPosTag);
    }

    /**
     * Checks if the probability of the parsed annotation is unknown or at
     * least the parsed minimum.
     * @param posAnnotation the POS annotation to check
     * @param minProbability the required minimum probability
     * @return <code>true</code> if the annotation is accepted
     */
    private static boolean hasMinProbability(Value<PosTag> posAnnotation, double minProbability) {
        return posAnnotation.probability() == Value.UNKNOWN_PROBABILITY ||
                posAnnotation.probability() >= minProbability;
    }

    /**
     * Checks the three possible matches of a POS tag with the linked POS types
     * of the configuration: (1) the lexical category, (2) the Pos and (3) the
     * String tag.
     * @param tpc the language processing configuration
     * @param posTag the POS tag to check
     * @return <code>true</code> if any of the three matches
     */
    private static boolean isLinkedPosTag(LanguageProcessingConfig tpc, PosTag posTag) {
        return !disjoint(tpc.getLinkedLexicalCategories(), posTag.getCategories()) ||
                !disjoint(tpc.getLinkedPos(), posTag.getPosHierarchy()) ||
                tpc.getLinkedPosTags().contains(posTag.getTag());
    }

    /**
     * Checks if the POS annotations indicate the start of a sub-sentence.
     * Unlike the linkable/matchable checks this does not stop at the first
     * accepted annotation: all annotations are evaluated in order, so a later
     * accepted annotation overrides the state set by an earlier one.
     * @param tpc the language processing configuration
     * @param posAnnotations the POS annotations of the token
     * @return the sub-sentence start state
     * @see ProcessingState#SUB_SENTENCE_START_POS
     */
    private static boolean detectSubSentenceStart(LanguageProcessingConfig tpc,
            List<Value<PosTag>> posAnnotations) {
        boolean subSentenceStart = false;
        for(Value<PosTag> posAnnotation : posAnnotations){
            PosTag posTag = posAnnotation.value();
            if(!disjoint(ProcessingState.SUB_SENTENCE_START_POS, posTag.getPosHierarchy())){
                if(hasMinProbability(posAnnotation, tpc.getMinPosAnnotationProbability())){
                    subSentenceStart = true;
                } // else probability too low for inclusion
            } else if(hasMinProbability(posAnnotation, tpc.getMinExcludePosAnnotationProbability())){
                subSentenceStart = false;
            }
        }
        return subSentenceStart;
    }

    /**
     * Selects the morphological features for the token.
     * @param morphoAnnotations the morpho annotations of the token
     * @param selectedPosTag the POS tag selected for the token or
     * <code>null</code> if no POS information is available
     * @return if no POS tag is parsed the value of the first morpho annotation;
     * otherwise the first morpho annotation having a POS tag that shares a
     * category with the parsed one. <code>null</code> if there is none.
     */
    private static MorphoFeatures selectMorphoFeatures(List<Value<MorphoFeatures>> morphoAnnotations,
            PosTag selectedPosTag) {
        if(selectedPosTag == null){ //no POS information ... use the first morpho annotation
            return morphoAnnotations.isEmpty() ? null : morphoAnnotations.get(0).value();
        }
        //select the correct morpho annotation based on the POS tag
        for(Value<MorphoFeatures> morphoAnnotation : morphoAnnotations){
            for(PosTag posTag : morphoAnnotation.value().getPosList()){
                if(!disjoint(selectedPosTag.getCategories(), posTag.getCategories())){
                    return morphoAnnotation.value(); //stop after finding the first one
                }
            }
        }
        return null;
    }

    /**
     * Getter for token text
     * @return the text of the token
     */
    public String getTokenText(){
        return token.getSpan();
    }

    /**
     * Getter for the Lemma of the token.
     * @return the Lemma of the Token or <code>null</code> if not available
     */
    public String getTokenLemma(){
        return morpho != null ? morpho.getLemma() : null;
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder("TokenData: '");
        sb.append(getTokenText());
        sb.append("'[linkable=").append(isLinkable);
        sb.append("(linkabkePos=").append(isLinkablePos);
        sb.append(")| matchable=").append(isMatchable);
        sb.append("(matchablePos=").append(isMatchablePos);
        sb.append(")| alpha=").append(hasAlphaNumeric);
        sb.append("| seachLength=").append(hasSearchableLength);
        sb.append("| upperCase=").append(upperCase);
        sb.append("]");
        return sb.toString();
    }
}