| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.stanbol.enhancer.engines.lucenefstlinking; |
| |
| import static org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig.UNICASE_SCRIPT_LANUAGES; |
| |
| import java.io.IOException; |
| import java.security.AccessController; |
| import java.security.PrivilegedAction; |
| import java.util.Arrays; |
| import java.util.Collections; |
| import java.util.EnumSet; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.Iterator; |
| import java.util.LinkedList; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Set; |
| |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| import org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig; |
| import org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig; |
| import org.apache.stanbol.enhancer.engines.entitylinking.engine.EntityLinkingEngine; |
| import org.apache.stanbol.enhancer.engines.entitylinking.impl.ChunkData; |
| import org.apache.stanbol.enhancer.engines.entitylinking.impl.ProcessingState; |
| import org.apache.stanbol.enhancer.engines.entitylinking.impl.SectionData; |
| import org.apache.stanbol.enhancer.engines.entitylinking.impl.TokenData; |
| import org.apache.stanbol.enhancer.nlp.model.AnalysedText; |
| import org.apache.stanbol.enhancer.nlp.model.Chunk; |
| import org.apache.stanbol.enhancer.nlp.model.Section; |
| import org.apache.stanbol.enhancer.nlp.model.Sentence; |
| import org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum; |
| import org.apache.stanbol.enhancer.nlp.model.Token; |
| import org.opensextant.solrtexttagger.TagClusterReducer; |
| import org.opensextant.solrtexttagger.TagLL; |
| import org.opensextant.solrtexttagger.TaggingAttribute; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| /** |
| * Class that ensures that only {@link TokenData#isLinkable linkable} Tokens |
| * are processed.<p> |
 * This is ensured in two places:<ol>
 * <li> Classifies Tokens in the Solr {@link TokenStream} with the {@link TaggingAttribute}
 * based on NLP processing results present in the {@link AnalysedText}. This
 * implementation classifies Tokens similarly to the {@link EntityLinkingEngine}.
 * It uses the {@link TextProcessingConfig} for its configuration.<p>
 * <li> Implements {@link TagClusterReducer} to ensure that all {@link TagLL tags}
 * that do not overlap with any {@link TokenData#isLinkable linkable} token are
 * removed from the Cluster.
| * </ol> |
| * <b> Implementation Details</b><p> |
| * The {@link TokenStream} implementation of this class serves a similar |
| * purpose as the {@link ProcessingState} used by the EntityLinkingEngine. |
| * The main differences are:<p> |
| * <ul> |
 * <li>This code needs to deal with potentially different tokenization present
 * in the {@link AnalysedText} and the {@link TokenStream}. The implemented
 * semantics mark Tokens in the {@link TokenStream} as
 * <code>{@link TaggingAttribute#isTaggable()} == true</code> if they do overlap
 * with a {@link TokenData#isLinkable} token in the {@link AnalysedText}.
 * <li> {@link TokenData#isMatchable} tokens are also considered as
 * <code>{@link TaggingAttribute#isTaggable()} == true</code> if a
 * {@link TokenData#isLinkable} token follows within two tokens of the
 * {@link AnalysedText}. This range is extended if other matchable tokens are
 * within the lookahead range. However, the range is never extended over a
 * section border. For example, in "University of Salzburg" the matchable
 * token "University" is marked as taggable because the linkable token
 * "Salzburg" follows within the lookahead range.
 * </ul>
| * The {@link TagClusterReducer} implementation keeps track of linkable tokens |
| * while iterating over the {@link TokenStream} and adds them to the end of a |
 * List. When {@link TagClusterReducer#reduce(TagLL[])} is called, the tags of
 * the cluster are checked for overlap with any linkable Token at the start
 * of the list. Tokens that end before the start of the tags are removed
 * from the list.
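 * <p>
 * <b>Usage sketch</b> (illustrative only; the filter is actually created and
 * wired up by the FST linking code of this package, and the surrounding
 * variable names used below are assumptions):
 * <pre>{@code
 *   TokenStream tokens = analyzer.tokenStream(null,
 *       new StringReader(at.getText().toString()));
 *   LinkableTokenFilter linkableTokenFilter = new LinkableTokenFilter(
 *       tokens, at, "en", lpc, 0.6d, 2); //score and count are example values
 *   //use the filter both as the TokenStream passed to the tagger and as
 *   //TagClusterReducer to drop tags that do not cover a linkable token
 * }</pre>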
| * @author Rupert Westenthaler |
| * |
| */ |
| public final class LinkableTokenFilter extends TokenFilter implements TagClusterReducer{ |
| |
| private final Logger log = LoggerFactory.getLogger(LinkableTokenFilter.class); |
| |
| /** |
     * The {@link SpanTypeEnum span types} that need to be processed when
     * creating {@link SectionData} instances
| */ |
| private static final Set<SpanTypeEnum> PROCESSED_SPAN_TYPES = EnumSet.of( |
| SpanTypeEnum.Chunk,SpanTypeEnum.Token); |
| /** |
| * The NLP processing results |
| */ |
| private AnalysedText at; |
| /** |
| * The language of the text |
| */ |
| //private String lang; |
| /** |
| * If the language is unicase or not |
| */ |
| private boolean isUnicaseLanguage; |
| /** |
| * Defines how NLP processing results are processed to determine Words that |
| * need to be looked-up in the vocabulary |
| */ |
| private LanguageProcessingConfig lpc; |
| |
| /** |
| * Iterator over all sections of the {@link AnalysedText} |
| */ |
| private Iterator<? extends Section> sections; |
| /** |
| * The current section |
| */ |
| private SectionData sectionData; |
| /** |
| * Iterator over all {@link Token}s in the current section |
| */ |
| private Iterator<TokenData> tokenIt; |
| /** |
     * The current Token(s). {@link #incrementToken()} will add tokens to the
     * end of the list and {@link #nextToken(boolean)} with <code>true</code>
     * will remove tokens that end before the current {@link #offset} from
     * the list.<p>
     * We need to hold multiple tokens because the TokenStream might parse
     * multiple tokens with
     * <code>{@link PositionIncrementAttribute#getPositionIncrement() posInc} == 0</code>
     * covering multiple {@link TokenData tokens}.
| */ |
| private List<TokenData> tokens = new LinkedList<TokenData>(); |
| /** |
| * The cursor within the {@link #tokens} list of the currently active Token |
| */ |
| private int tokensCursor = -1; //the cursor within the tokens list |
| |
| private int lookupCount = 0; |
| private int incrementCount = 0; |
| |
| protected final CharTermAttribute termAtt; |
| protected final OffsetAttribute offset; |
| protected final TaggingAttribute taggable; |
| /** |
| * List with {@link TokenData#isLinkable linkable} {@link Token}s used by |
| * the {@link #reduce(TagLL[])} method to check if {@link TagLL tags} |
| * do overlap with any linkable token. |
| */ |
| private final List<LinkableTokenContext> linkableTokens = new LinkedList<LinkableTokenContext>(); |
| /** |
     * The minimum score a tag needs to match processable tokens within a
     * {@link Chunk} so that it is not omitted.
| */ |
| private double minChunkMatchScore; |
| /** |
     * The minimum number of matched (matchable) Tokens required for an Entity
     * to be considered. Only used within processable chunks.
| */ |
| private int minFoundTokens; |
| |
| protected LinkableTokenFilter(TokenStream input, AnalysedText at, |
| String lang, LanguageProcessingConfig lpc, double minChunkMatchScore, int minFoundTokens) { |
| super(input); |
| //STANBOL-1177: add attributes in doPrivileged to avoid |
| //AccessControlException: access denied ("java.lang.RuntimePermission" "getClassLoader") |
| termAtt = AccessController.doPrivileged(new PrivilegedAction<CharTermAttribute>() { |
| @Override public CharTermAttribute run() { |
| return addAttribute(CharTermAttribute.class); |
| }}); |
| offset = AccessController.doPrivileged(new PrivilegedAction<OffsetAttribute>() { |
| @Override public OffsetAttribute run() { |
| return addAttribute(OffsetAttribute.class); |
| }}); |
| taggable = AccessController.doPrivileged(new PrivilegedAction<TaggingAttribute>() { |
| @Override public TaggingAttribute run() { |
| return addAttribute(TaggingAttribute.class); |
| }}); |
| this.at = at; |
| //this.lang = lang; |
| this.lpc = lpc; |
| this.isUnicaseLanguage = lang != null && !lang.isEmpty() && |
| UNICASE_SCRIPT_LANUAGES.contains(lang); |
| this.minChunkMatchScore = minChunkMatchScore; |
| this.minFoundTokens = minFoundTokens; |
| } |
| |
| @Override |
| public void reset() throws IOException { |
| super.reset(); |
| Iterator<Sentence> sentences = at.getSentences(); |
| this.sections = sentences.hasNext() ? sentences : Collections.singleton(at).iterator(); |
| sectionData = null; |
| tokenIt = null; |
| incrementCount = 0; |
| lookupCount = 0; |
| } |
| |
| @Override |
| public boolean incrementToken() throws IOException { |
| if(input.incrementToken()){ |
| incrementCount++; |
| boolean first = true; |
| TokenData token; |
| boolean lookup = false; |
| int lastMatchable = -1; |
| int lastIndex = -1; |
| log.trace("> solr:[{},{}] {}",new Object[]{ |
| offset.startOffset(), offset.endOffset(), termAtt}); |
| while((token = nextToken(first)) != null){ |
| log.trace(" < [{},{}]:{} (link {}, match; {})",new Object[]{ |
| token.token.getStart(), token.token.getEnd(),token.getTokenText(), |
| token.isLinkable, token.isMatchable}); |
| first = false; |
| if(token.isLinkable){ |
| log.trace(" + lookup because {} is linkable", token); |
| lookup = true; |
| } else if (token.isMatchable){ |
| lastMatchable = token.index; |
| lastIndex = lastMatchable; |
| } |
| //special rules for processable chunks (typically noun phrases) |
| //accept all tokens in processable chunks with a linkable or |
| //multiple matchable tokens. |
| if(!lookup && (!lpc.isIgnoreChunks()) && token.inChunk != null |
| && token.inChunk.isProcessable){ |
| if(token.inChunk.isNamedEntity()){ |
| if(log.isTraceEnabled()){ |
| log.trace(" + lookup because {} is part of Named Entity '{}'", |
| token.token, token.inChunk.chunk.getSpan()); |
| } |
| lookup = true; |
| } |
| if(token.inChunk.hasLinkable() || |
| (lpc.isLinkMultiMatchableTokensInChunk() && |
| token.inChunk.getMatchableCount() > 1)){ |
| if(log.isTraceEnabled()){ |
| log.trace(" + lookup because {} is part of a linkable chunk '{}'", |
| token.token, token.inChunk.chunk.getSpan()); |
| } |
| lookup = true; |
| } |
| } |
| } |
            //lookahead: mark a matchable token as taggable if a linkable token
            //follows within the next two tokens. The range is extended if
            //further matchable tokens are found at its end, but never over
            //the section border.
| if(!lookup && lastIndex >= 0 && sectionData != null){ |
| List<TokenData> tokens = sectionData.getTokens(); |
| int maxLookahead = Math.max(lastIndex, lastMatchable+3); |
| for(int i = lastIndex+1;!lookup && i < maxLookahead && i < tokens.size(); i++){ |
| token = tokens.get(i); |
| if(token.isLinkable){ |
| lookup = true; |
| } else if(token.isMatchable && (i+1) == maxLookahead){ |
| maxLookahead++; //increase lookahead for matchable tokens |
| } |
| } |
| } |
| this.taggable.setTaggable(lookup); |
| if(lookup){ |
| if(log.isTraceEnabled()){ |
| TokenData t = getToken(); |
| log.trace("lookup: token [{},{}]: {} | word [{},{}]:{}", new Object[]{ |
| offset.startOffset(), offset.endOffset(), termAtt, |
| t.token.getStart(), t.token.getEnd(), |
| t.getTokenText()}); |
| } |
| lookupCount++; |
| } |
| return true; |
| } else { |
| log.debug("lookup percentage: {}",lookupCount*100/(float)incrementCount); |
| return false; |
| } |
| } |
| |
| /** |
     * Iterating over TokenData requires iterating over two hierarchy levels:
     * (1) sections (likely Sentences) and (2) Tokens <p>
     * <b>NOTE</b> that this method modifies a lot of fields to update the
     * state of the iteration accordingly. A <code>null</code> return value
     * indicates that the end of the {@link Token}s in the
     * {@link AnalysedText} was reached.
| * @param first is this the first call for the current {@link #offset} state? |
| * @return the token or <code>null</code> if there are no more tokens for |
| * the current {@link #offset} |
| */ |
| private TokenData nextToken(boolean first){ |
| int startOffset = offset.startOffset(); |
| int endOffset = offset.endOffset(); |
| if(first){ //on the first call for a token |
            tokensCursor = -1; //reset the cursor to before the first token
| while(!tokens.isEmpty()){ |
                //remove tokens that end before the current offset
| if(tokens.get(0).token.getEnd() <= startOffset){ |
| tokens.remove(0); |
| } else { //stop on the first overlapping token |
| break; |
| } |
            }
        } //if not the first call there is nothing to do
| if(tokensCursor >= tokens.size()-1){ |
| if(!incrementTokenData()){ //adds a new token to the list |
| return null; //EoF |
| } |
| } |
| TokenData cursorToken = tokens.get(tokensCursor+1); |
| if(cursorToken.token.getStart() < endOffset){ |
| tokensCursor++; //set the next token as current |
| return cursorToken; //and return it |
| } else { |
| return null; |
| } |
| } |
| /** |
     * Adds the next {@link TokenData token} to the {@link #tokens} list and -
     * if necessary - also advances to the next {@link #sectionData section}.
| * @return <code>true</code> unless there are no more tokens |
| */ |
| private boolean incrementTokenData(){ |
| if(tokenIt == null || !tokenIt.hasNext()){ |
| sectionData = null; |
| tokenIt = null; |
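            //skip sections that do not contain any tokens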
| while(sections.hasNext() && (tokenIt == null || !tokenIt.hasNext())){ |
| //analyse NLP results for the next Section |
| sectionData = new SectionData(lpc, sections.next(), |
| PROCESSED_SPAN_TYPES, isUnicaseLanguage); |
| tokenIt = sectionData.getTokens().iterator(); |
| } |
| if(tokenIt != null && tokenIt.hasNext()){ |
| addToken(tokenIt.next()); |
| return true; |
| } else { //reached the end .. clean up |
| sectionData = null; |
| tokenIt = null; |
| return false; |
| } |
| } else { //more token in the same section |
| addToken(tokenIt.next()); |
| return true; |
| } |
| } |
| /** |
     * Adds a token to the {@link #tokens} list. Also cares about adding
     * linkable (and matchable in-chunk) tokens to {@link #linkableTokens}.
     * @param token the token - MUST NOT be <code>null</code>.
| */ |
| private void addToken(TokenData token){ |
| tokens.add(token); |
| if(token.isLinkable){ |
| //add to the list of linkable for #reduce(TagLL[]) |
| linkableTokens.add(new LinkableTokenContext(token,sectionData.getTokens())); |
        } else if(token.isMatchable && !lpc.isIgnoreChunks() && //matchable token
                token.inChunk != null && //within a processable chunk
                token.inChunk.isProcessable && //that has more than one
                token.inChunk.getMatchableCount() > 1){ //matchable token
| linkableTokens.add(new LinkableTokenContext(token, sectionData.getTokens())); |
| } |
| } |
| /** |
     * Getter for the currently active Token
     * @return the current token or <code>null</code> if there is none
| */ |
| private TokenData getToken(){ |
| return tokens.isEmpty() ? null : tokens.get(tokensCursor); |
| } |
| @Override |
| public void reduce(TagLL[] head) { |
| //this implements a two phase reduce |
| //(1) reduce Tags with no linkable tokens and not matching enough of the |
| // current chunk. |
| //(2) reduce remaining Tags in the cluster similar to TagClusterReducer |
| // but only considering the "matchable span" of the Tags. Meaning the |
| // span over matchable Tokens and not the full Text. |
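        //   (illustrative example: for a tag covering "the University of
        //   Salzburg", where "the" and "of" are not matchable, the matchable
        //   span would only cover "University" to "Salzburg")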
| |
| //this map holds the matchable spans for Tags. Filled during phase (1) and |
| //used for phase(2) |
| Map<TagLL,int[]> matchableTagSpan = new HashMap<TagLL,int[]>(); |
| |
| //(1) reduce Tags based on link-/matchable tokens as well as chunks. |
| LinkableTokenContext linkableTokenContext; |
| for(TagLL tag = head[0]; tag != null; tag = tag.getNextTag()) { |
| int start = tag.getStartOffset(); |
| int end = tag.getEndOffset(); |
| linkableTokenContext = linkableTokens.isEmpty() ? null : linkableTokens.get(0); |
| while(linkableTokenContext != null && linkableTokenContext.linkableToken.token.getEnd() <= start){ |
| linkableTokens.remove(0); |
| linkableTokenContext = linkableTokens.isEmpty() ? null : linkableTokens.get(0); |
| } |
| if(linkableTokenContext == null || linkableTokenContext.linkableToken.token.getStart() >= end){ |
| //does not overlap any linkable token |
| tag.removeLL(); //remove the tag from the cluster |
| if(log.isTraceEnabled()){ |
| CharSequence tagSequence = at.getText().subSequence(start, end); |
| log.trace(" > reduce tag {} - no overlapp with linkable token", tagSequence); |
| } |
| } else { //if the tag overlaps a linkable token |
| TokenData linkableToken = linkableTokenContext.linkableToken; |
| List<TokenData> tokens = linkableTokenContext.context; |
| //calculate the matchable start/end span of the current TagLL |
| int[] mSpan = new int[]{ |
| Math.max(start,linkableToken.token.getStart()), |
| Math.min(end,linkableToken.token.getEnd())}; |
| if(mSpan[0] > start){ |
| for(int i = linkableToken.index-1; i >= 0; i--){ |
| TokenData token = tokens.get(i); |
| int tStart = token.token.getStart(); |
| if(tStart < start){ |
| break; |
| } else if(token.isMatchable){ |
| mSpan[0] = tStart; |
| } |
| } |
| } |
| if(mSpan[1] < end){ |
| for(int i= linkableToken.index+1; i < tokens.size();i++){ |
| TokenData token = tokens.get(i); |
| int tEnd = token.token.getEnd(); |
| if(tEnd > end){ |
| break; |
| } else if(token.isMatchable){ |
| mSpan[1] = tEnd; |
| } |
| } |
| } |
| if(log.isTraceEnabled()){ |
| CharSequence text = at.getText(); |
| log.trace(" - matchable Span {}{} for Tag {}[{},{}]", |
| new Object[]{ text.subSequence(mSpan[0],mSpan[1]), |
| Arrays.toString(mSpan), text.subSequence(start, end), |
| start, end}); |
| } |
| matchableTagSpan.put(tag, mSpan); |
                ChunkData cd = linkableToken.inChunk; //check if the tag matches enough of the chunk
| if(!lpc.isIgnoreChunks() && cd != null && cd.isProcessable){ |
| int cstart = cd.getMatchableStartChar() >= 0 ? cd.getMatchableStartChar() : |
| start; |
| int cend = cd.getMatchableEndChar(); |
| if(cstart < start || cend > end){ //if the tag does not cover the whole chunk |
| int num = 0; |
| int match = 0; |
| for(int i = cd.getMatchableStart(); i <= cd.getMatchableEnd(); i++){ |
| TokenData td = tokens.get(i); |
| if(td.isMatchable){ |
| num++; |
| if(match < 1 && td.token.getStart() >= start || |
| match > 0 && td.token.getEnd() <= end){ |
| match++; |
| } |
| } |
| } |
                        //remove tags that match less than the configured
                        //minimum ratio of the matchable tokens in the Chunk
                        //and also less than the minimum number of found tokens
| if(((float)match/(float)num) < minChunkMatchScore && |
| match < minFoundTokens){ |
| tag.removeLL(); //ignore |
| matchableTagSpan.remove(tag); |
| if(log.isTraceEnabled()){ |
| CharSequence text = at.getText(); |
| log.trace(" - reduce tag {}[{},{}] - does only match " |
| + "{} of {} of matchable Chunk {}[{},{}]", |
| new Object[]{text.subSequence(start, end), start, end, match, |
| num, text.subSequence(cstart, cend), cstart, cend}); |
| } |
| } else if(log.isTraceEnabled()){ |
| CharSequence text = at.getText(); |
| log.trace(" + keep tag {}[{},{}] - matches {} of {} " |
| + "matchable Tokens for matchable Chunk {}[{},{}]", |
| new Object[]{text.subSequence(start, end), start, end, match, |
| num, text.subSequence(cstart, cend), cstart, cend}); |
| } |
| } else { |
| if(log.isTraceEnabled()){ |
| CharSequence text = at.getText(); |
| log.trace(" + keep tag {}[{},{}] - matches whole Chunk {}[{},{}]", |
| new Object[]{text.subSequence(start, end), start, end, |
| text.subSequence(cstart, cend), cstart, cend}); |
| } |
| } |
| } else if(log.isTraceEnabled()){ |
| CharSequence tagSequence = at.getText().subSequence(start, end); |
| log.trace(" + keep tag {} - not in processable chunk", tagSequence); |
| } |
| } |
| } |
        //(2) reduce the remaining Tags using the longest-dominant-right
        //    strategy applied to the matchable spans
| //NOTE: This is the same code as TagClusterReducer#LONGEST_DOMINANT_RIGHT |
| // but adapted to use the matchable spans instead of the full Tag |
| // spans |
| if (head.length == 0 || head[0] == null || head[0].getNextTag() == null) { |
| return; //no tag left from phase one or single token optimization |
| } |
        Set<TagLL> marked = new HashSet<TagLL>(); //cannot use TagLL#mark
| while (true) { |
| // --Find longest not already marked |
| TagLL longest = null; |
| int longestMCharLen = -1; |
| int[] longestMSpan = null; |
| for (TagLL t = head[0]; t != null; t = t.getNextTag()) { |
| int[] mSpan = matchableTagSpan.get(t); |
| int mCharLen = mSpan[1] - mSpan[0]; |
| if (!marked.contains(t) && (longest == null || mCharLen >= longestMCharLen)) { |
| longest = t; |
| longestMSpan = mSpan; |
| longestMCharLen = mCharLen; |
| } |
| } |
| if (longest == null) break; |
| // --Mark longest (so we return it eventually) |
| marked.add(longest); |
| // --Remove tags overlapping this longest |
| for (TagLL t = head[0]; t != null; t = t.getNextTag()) { |
| if (marked.contains(t)) { |
| continue; |
| } |
| int[] mSpan = matchableTagSpan.get(t); |
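                //a tag whose matchable span starts before the longest's is
                //only treated as overlapping if it also ends after the
                //longest's span; a tag starting within the longest's span
                //overlaps if it starts before that span's end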
| boolean overlaps = |
| mSpan[0] < longestMSpan[0] ? mSpan[1] > longestMSpan[1] : mSpan[0] < longestMSpan[1]; |
| if (overlaps) { |
| t.removeLL(); |
| } else if (mSpan[0] >= longestMSpan[1]) { |
| break;// no subsequent can possibly overlap |
| } |
| } |
| }// loop |
| } |
| /** |
     * Holds the context for a linkable {@link Token}. This ensures that the
     * list of Tokens of the current {@link Section} (typically a {@link Sentence})
     * is still available even if the {@link LinkableTokenFilter#sectionData}
     * already holds the tokens of the next section.<p>
| * This is necessary as {@link LinkableTokenFilter#reduce(TagLL[])} can |
| * be called for the previous sentence in cases where a Tag cluster includes |
| * the last {@link Token} of a {@link Section}. |
| * @author Rupert Westenthaler |
| * |
| */ |
| private static class LinkableTokenContext { |
| final TokenData linkableToken; |
| final List<TokenData> context; |
| |
| LinkableTokenContext(TokenData linkableToken, List<TokenData> context){ |
| this.linkableToken = linkableToken; |
| this.context = context; |
| } |
| } |
| |
| } |