| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.stanbol.enhancer.engines.lucenefstlinking; |
| |
| import static org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig.UNICASE_SCRIPT_LANUAGES; |
| |
| import java.io.IOException; |
| import java.util.Collections; |
| import java.util.EnumSet; |
| import java.util.Iterator; |
| import java.util.LinkedList; |
| import java.util.List; |
| import java.util.Set; |
| |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| import org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig; |
| import org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig; |
| import org.apache.stanbol.enhancer.engines.entitylinking.engine.EntityLinkingEngine; |
| import org.apache.stanbol.enhancer.engines.entitylinking.impl.ProcessingState; |
| import org.apache.stanbol.enhancer.engines.entitylinking.impl.SectionData; |
| import org.apache.stanbol.enhancer.engines.entitylinking.impl.TokenData; |
| import org.apache.stanbol.enhancer.nlp.model.AnalysedText; |
| import org.apache.stanbol.enhancer.nlp.model.Section; |
| import org.apache.stanbol.enhancer.nlp.model.Sentence; |
| import org.apache.stanbol.enhancer.nlp.model.Span.SpanTypeEnum; |
| import org.apache.stanbol.enhancer.nlp.model.Token; |
| import org.opensextant.solrtexttagger.TagClusterReducer; |
| import org.opensextant.solrtexttagger.TagLL; |
| import org.opensextant.solrtexttagger.TaggingAttribute; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| /** |
| * Class the ensures that only {@link TokenData#isLinkable linkable} Tokens |
| * are processed.<p> |
| * This is ensured on two places:<ol> |
| * <li> Classifies Tokens in the Solr {@link TokenStream} with the {@link TaggingAttribute} |
| * based on NLP processing results present in the {@link AnalysedText}. This |
| * implementation Classifies Token similar to the {@link EntityLinkingEngine}. |
| * It uses the {@link TextProcessingConfig} for its configuration.<p> |
| * <li> Implements {@link TagClusterReducer} to ensure that all {@link TagLL tags} |
| * that do not overlap with any {@link TokenData#isLinkable linkable} are |
| * removed from the Cluster. |
| * </ol> |
| * <b> Implementation Details</b><p> |
| * The {@link TokenStream} implementation of this class serves a similar |
| * purpose as the {@link ProcessingState} used by the EntityLinkingEngine. |
| * The main differences are:<p> |
| * <ul> |
| * <li>This code needs to deal with potential different tokenization present |
| * in the {@link AnalysedText} and the {@link TokenStream}. The implemented |
| * semantics does mark Tokens in the {@link TokenStream} as |
* <code>{@link TaggingAttribute#isTaggable()} == true</code> if they do overlap
| * with a {@link TokenData#isLinkable} token in the {@link AnalysedText}. |
| * <li> {@link TokenData#isMatchable} tokens are also considered as |
* <code>{@link TaggingAttribute#isTaggable()} == true</code> if a
| * {@link TokenData#isMatchable} token is following within two tokens of the |
| * {@link AnalysedText}. This Range is extended if other matchable tokens are |
| * within the lookahead range. However the range is never extended over a |
| * section border. |
| * </ul> |
| * The {@link TagClusterReducer} implementation keeps track of linkable tokens |
| * while iterating over the {@link TokenStream} and adds them to the end of a |
| * List. When {@link TagClusterReducer#reduce(TagLL[])} is called tags of the |
| * cluster are checked if they do overlap with any linkable Token at the start |
* of the list. Tokens that end before the start of the tags are removed
* from the list.
| * @author Rupert Westenthaler |
| * |
| */ |
public final class LinkableTokenFilter extends TokenFilter implements TagClusterReducer{

    private final Logger log = LoggerFactory.getLogger(LinkableTokenFilter.class);

    /**
     * Span types considered when building {@link SectionData} (Chunks and
     * Tokens). Required to use {@link SectionData}.
     */
    private static final Set<SpanTypeEnum> PROCESSED_SPAN_TYPES = EnumSet.of(
        SpanTypeEnum.Chunk,SpanTypeEnum.Token);
    /**
     * The NLP processing results
     */
    private AnalysedText at;
    /**
     * The language of the text
     */
    //private String lang;
    /**
     * If the language is unicase or not (unicase scripts have no upper/lower
     * case distinction, which affects linkability decisions in SectionData)
     */
    private boolean isUnicaseLanguage;
    /**
     * Defines how NLP processing results are processed to determine Words that
     * need to be looked-up in the vocabulary
     */
    private LanguageProcessingConfig lpc;

    /**
     * Iterator over all sections of the {@link AnalysedText}. Initialised by
     * {@link #reset()} to either the Sentences or - if no Sentences are
     * present - a singleton iterator over the whole {@link AnalysedText}.
     */
    private Iterator<? extends Section> sections;
    /**
     * The current section (<code>null</code> before the first and after the
     * last section)
     */
    private SectionData sectionData;
    /**
     * Iterator over all {@link Token}s in the current section
     */
    private Iterator<TokenData> tokenIt;
    /**
     * The current Token(s). {@link #incrementToken()} will add tokens to the
     * end of the list and {@link #nextToken(boolean)} with <code>true</code>
     * will remove tokens ending earlier as the current {@link #offset} from
     * the list.<p>
     * We need to hold multiple tokens because the TokenStream might parse
     * multiple tokens with
     * <code>{@link PositionIncrementAttribute#getClass() posInc} == 0</code>
     * covering multiple {@link TokenData tokens}.
     */
    private List<TokenData> tokens = new LinkedList<TokenData>();
    /**
     * The cursor within the {@link #tokens} list of the currently active Token
     * (<code>-1</code> if none is active yet)
     */
    private int tokensCursor = -1; //the cursor within the tokens list

    // counters used only to log the percentage of tokens marked for lookup
    private int lookupCount = 0;
    private int incrementCount = 0;

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offset = addAttribute(OffsetAttribute.class);
    private final TaggingAttribute taggable = addAttribute(TaggingAttribute.class);
    /**
     * List with {@link TokenData#isLinkable linkable} {@link Token}s used by
     * the {@link #reduce(TagLL[])} method to check if {@link TagLL tags}
     * do overlap with any linkable token.
     */
    private final List<Token> linkableTokens = new LinkedList<Token>();

    /**
     * Creates a filter that classifies tokens of the parsed {@link TokenStream}
     * with the {@link TaggingAttribute} based on the NLP processing results
     * present in the parsed {@link AnalysedText}.
     * @param input the wrapped {@link TokenStream} (the Solr analyzer chain)
     * @param at the NLP processing results for the same text
     * @param lang the language of the text (used only for the unicase check)
     * @param lpc the language specific text processing configuration
     */
    protected LinkableTokenFilter(TokenStream input, AnalysedText at,
            String lang, LanguageProcessingConfig lpc) {
        super(input);
        this.at = at;
        //this.lang = lang;
        this.lpc = lpc;
        this.isUnicaseLanguage = lang != null && !lang.isEmpty() &&
                UNICASE_SCRIPT_LANUAGES.contains(lang);
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        //iterate over the Sentences if present; otherwise treat the whole
        //AnalysedText as a single section
        Iterator<Sentence> sentences = at.getSentences();
        this.sections = sentences.hasNext() ? sentences : Collections.singleton(at).iterator();
        sectionData = null;
        tokenIt = null;
        incrementCount = 0;
        lookupCount = 0;
    }

    /**
     * Consumes the next token of the wrapped {@link TokenStream} and sets the
     * {@link TaggingAttribute} based on whether any overlapping
     * {@link TokenData} is {@link TokenData#isLinkable linkable} - or
     * {@link TokenData#isMatchable matchable} with a further matchable/linkable
     * token following within the lookahead range of the same section.
     */
    @Override
    public boolean incrementToken() throws IOException {
        if(input.incrementToken()){
            incrementCount++;
            boolean first = true;
            TokenData token;
            boolean lookup = false;
            int lastMatchable = -1; //index of the last matchable token overlapping the offset
            int lastIndex = -1; //index of the last considered token
            log.trace("> solr:[{},{}] {}",new Object[]{
                    offset.startOffset(), offset.endOffset(), termAtt});
            //(1) inspect all AnalysedText tokens overlapping the current offset
            while((token = nextToken(first)) != null){
                log.trace("  < [{},{}]:{} (link {}, match; {})",new Object[]{
                        token.token.getStart(), token.token.getEnd(),token.getTokenText(),
                        token.isLinkable, token.isMatchable});
                first = false;
                if(token.isLinkable){
                    lookup = true;
                } else if (token.isMatchable){
                    lastMatchable = token.index;
                    lastIndex = lastMatchable;
                } //else if(token.hasAlphaNumeric){
                //    lastIndex = token.index;
                //}
            }
            //(2) lookahead: a matchable token is also marked for lookup if a
            //linkable token follows within two tokens; the range is extended
            //by further matchable tokens but never crosses the section border
            if(!lookup && lastIndex >= 0 && sectionData != null){
                List<TokenData> tokens = sectionData.getTokens();
                int maxLookahead = Math.max(lastIndex, lastMatchable+3);
                for(int i = lastIndex+1;!lookup && i < maxLookahead && i < tokens.size(); i++){
                    token = tokens.get(i);
                    if(token.isLinkable){
                        lookup = true;
                    } else if(token.isMatchable && (i+1) == maxLookahead){
                        maxLookahead++; //increase lookahead for matchable tokens
                    }
                }
            }
            this.taggable.setTaggable(lookup);
            if(lookup){
                if(log.isTraceEnabled()){
                    TokenData t = getToken();
                    log.trace("lookup: token [{},{}]: {} | word [{},{}]:{}", new Object[]{
                            offset.startOffset(), offset.endOffset(), termAtt,
                            t.token.getStart(), t.token.getEnd(),
                            t.getTokenText()});
                }
                lookupCount++;
            }
            return true;
        } else { //end of the TokenStream reached
            log.debug("lookup percentage: {}",lookupCount*100/(float)incrementCount);
            return false;
        }
    }

    /**
     * Iterating over TokensData requires to iterate over two hierarchy levels:
     * (1) sections (likely Sentences) and (2) Tokens <p>
     * <b>NOTE</b> that this method modifies a lot of fields to update the
     * state of the iteration accordingly. A <code>null</code> return value
     * indicates that no (further) {@link TokenData token} of the
     * {@link AnalysedText} overlaps the current {@link #offset}.
     * @param first is this the first call for the current {@link #offset} state?
     * @return the token or <code>null</code> if there are no more tokens for
     * the current {@link #offset}
     */
    private TokenData nextToken(boolean first){
        int startOffset = offset.startOffset();
        int endOffset = offset.endOffset();
        if(first){ //on the first call for a token
            tokensCursor = -1; //reset the cursor (advanced to index 0 below)
            while(!tokens.isEmpty()){
                //remove tokens that end before the current offset starts
                if(tokens.get(0).token.getEnd() <= startOffset){
                    tokens.remove(0);
                } else { //stop on the first overlapping token
                    break;
                }
            } //else nothing to do
        }
        if(tokensCursor >= tokens.size()-1){
            if(!incrementTokenData()){ //adds a new token to the list
                return null; //EoF
            }
        }
        TokenData cursorToken = tokens.get(tokensCursor+1);
        if(cursorToken.token.getStart() < endOffset){ //token overlaps the offset
            tokensCursor++; //set the next token as current
            return cursorToken; //and return it
        } else { //token starts after the current offset ends
            return null;
        }
    }
    /**
     * Appends the next {@link TokenData token} to {@link #tokens} and - if
     * necessary - also advances the {@link #sectionData section}.
     * @return <code>true</code> unless there are no more tokens
     */
    private boolean incrementTokenData(){
        if(tokenIt == null || !tokenIt.hasNext()){
            sectionData = null;
            tokenIt = null;
            //skip sections without any tokens
            while(sections.hasNext() && (tokenIt == null || !tokenIt.hasNext())){
                //analyse NLP results for the next Section
                sectionData = new SectionData(lpc, sections.next(),
                    PROCESSED_SPAN_TYPES, isUnicaseLanguage);
                tokenIt = sectionData.getTokens().iterator();
            }
            if(tokenIt != null && tokenIt.hasNext()){
                addToken(tokenIt.next());
                return true;
            } else { //reached the end .. clean up
                sectionData = null;
                tokenIt = null;
                return false;
            }
        } else { //more token in the same section
            addToken(tokenIt.next());
            return true;
        }
    }
    /**
     * Adds a token to {@link #tokens} and - if it is
     * {@link TokenData#isLinkable linkable} - also to {@link #linkableTokens}
     * consumed by {@link #reduce(TagLL[])}.
     * @param token the token to add
     */
    private void addToken(TokenData token){
        tokens.add(token);
        if(token.isLinkable){
            //add to the list of linkable for #reduce(TagLL[])
            linkableTokens.add(token.token);
        }
    }
    /**
     * Getter for the current Token
     * @return the {@link TokenData} at the current {@link #tokensCursor} or
     * <code>null</code> if no token is active
     */
    private TokenData getToken(){
        return tokens.isEmpty() ? null : tokens.get(tokensCursor);
    }

    /**
     * Removes all {@link TagLL tags} of the cluster that do not overlap with
     * any {@link TokenData#isLinkable linkable} token collected while
     * iterating over the {@link TokenStream}. Linkable tokens ending before
     * the start of the inspected tags are dropped from {@link #linkableTokens}
     * (tags are processed in increasing offset order).
     */
    @Override
    public void reduce(TagLL[] head) {
        Token linkableToken;
        for(TagLL tag = head[0]; tag != null; tag = tag.getNextTag()) {
            int start = tag.getStartOffset();
            int end = tag.getEndOffset();
            linkableToken = linkableTokens.isEmpty() ? null : linkableTokens.get(0);
            //drop linkable tokens that end before the tag starts
            while(linkableToken != null && linkableToken.getEnd() <= start){
                linkableTokens.remove(0);
                linkableToken = linkableTokens.isEmpty() ? null : linkableTokens.get(0);
            }
            if(linkableToken == null || linkableToken.getStart() >= end){
                //does not overlap any linkable token
                tag.removeLL(); //remove the tag from the cluster
                if(log.isTraceEnabled()){
                    CharSequence tagSequence = at.getText().subSequence(start, end);
                    log.trace(" > reduce tag {}", tagSequence);
                }
            } else { //overlaps a linkable token -> keep
                if(log.isTraceEnabled()){
                    CharSequence tagSequence = at.getText().subSequence(start, end);
                    log.trace(" > keep tag {}", tagSequence);
                }
            }
        }

    }

}