| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| /** |
| * |
| */ |
| package org.apache.stanbol.enhancer.engines.keywordextraction.impl; |
| |
| import java.util.HashMap; |
| import java.util.Iterator; |
| import java.util.LinkedHashMap; |
| import java.util.Map; |
| |
| import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText; |
| import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText.Chunk; |
| import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText.Token; |
| |
| public class ProcessingState { |
| |
| private final Iterator<AnalysedText> sentences; |
| /** |
| * The sentence currently processed |
| */ |
| private AnalysedText sentence; |
| /** |
| * The index of the current token needed to be linked |
| */ |
| private int tokenIndex = -1; |
| /** |
| * The current token |
| */ |
| private Token token; |
| /** |
| * The iterator over the chunks of the current {@link #sentence} |
| * or <code>null</code> if no {@link Chunk}s are available. |
| */ |
| private Iterator<Chunk> chunks; |
| /** |
| * The current {@link Chunk} |
| */ |
| private Chunk chunk; |
| private static final int MAX_TEXT_CACHE_SIZE = 32; |
| /** |
| * This is a cache over the last {@link #MAX_TEXT_CACHE_SIZE} token texts |
| * requested by {@link #getTokenText(int, int)} |
| */ |
| private Map<String,String> textCache = new LinkedHashMap<String,String>( |
| MAX_TEXT_CACHE_SIZE, 0.75f, true){ |
| private static final long serialVersionUID = 1L; |
| protected boolean removeEldestEntry(Map.Entry<String,String> eldest) { |
| return size() > MAX_TEXT_CACHE_SIZE; |
| }; |
| }; |
| /** |
| * The position for the next token |
| */ |
| private int nextToken = -1; |
| /** |
| * The position of the last consumed position |
| */ |
| private int consumedIndex = -1; |
| |
| public ProcessingState(Iterator<AnalysedText> sentences){ |
| this.sentences = sentences; |
| if(!sentences.hasNext()){ |
| throw new IllegalArgumentException("The parsed AnalysedContent MUST NOT have an empty AnalysedText iterator!"); |
| } |
| } |
| /** |
| * Getter for the current Sentence |
| * @return the sentence |
| */ |
| public final AnalysedText getSentence() { |
| return sentence; |
| } |
| /** |
| * Getter for the index of the current active token within the current |
| * active {@link #getSentence() sentence} |
| * @return the tokenPos the index of the token |
| */ |
| public final int getTokenIndex() { |
| return tokenIndex; |
| } |
| /** |
| * Getter for the last consumed index |
| * @return the index of the last consumed token |
| */ |
| public final int getConsumedIndex() { |
| return consumedIndex; |
| } |
| /** |
| * The currently active token |
| * @return the token |
| */ |
| public final Token getToken() { |
| return token; |
| } |
| /** |
| * Getter for the language of the current Token (based on the current |
| * sentence) |
| * @return the language |
| */ |
| public final String getLanguage() { |
| return sentence.getLanguage(); |
| } |
| /** |
| * The currently active chunk or <code>null</code> if no chunks are |
| * available. If chunks are present this can not be <code>null</code> |
| * because {@link Token}s outside of chunks are skiped. |
| * @return the chunk the current {@link Chunk} or <code>null</code> if |
| * no chunks are present. |
| */ |
| public final Chunk getChunk() { |
| return chunk; |
| } |
| /** |
| * Getter for the next {@link Token} to be processed. Calling {@link #next()} |
| * is guaranteed to skip all tokens in between {@link #getTokenIndex()} |
| * and {@link #getNextToken()}, but it might even skip more tokens (e.g. |
| * in case that the token referenced by {@link #getNextToken()} is not |
| * within a {@link Chunk} |
| * @return the nextToken |
| */ |
| public final int getNextToken() { |
| return nextToken; |
| } |
| // /** |
| // * Allows to manually set to position of the next token to process. |
| // * This can be used to skip some tokens within (e.g. if a Concept |
| // * matching multiple Tokens where found.<p> |
| // * The set token may be greater than the number of tokens in |
| // * {@link #sentence}. This will simple cause the next sentence to be |
| // * activated on the next call to {@link #next()} |
| // * @param pos the position of the next token to process. |
| // */ |
| // public void setNextToken(int pos){ |
| // if(pos > tokenIndex){ |
| // this.nextToken = pos; |
| // } else { |
| // throw new IllegalArgumentException("The nextTokenPos "+pos+ |
| // " MUST BE greater than the current "+tokenIndex); |
| // } |
| // } |
| /** |
| * The index of an consumed Token. The consumed index MUST BE equals or |
| * greater as {@link #getTokenIndex()}. If the consumed index is set to a |
| * value greater that {@link #getTokenIndex()} than consumed tokens are |
| * skipped on the next call to {@link #next()} |
| * @param pos the position of the last consumed token. |
| */ |
| public void setConsumed(int pos){ |
| if(pos >= tokenIndex){ |
| this.consumedIndex = pos; |
| this.nextToken = pos+1; |
| } else { |
| throw new IllegalArgumentException("The lastConsumedPos "+pos+ |
| " MUST BE equals or gerater than the current Pos "+tokenIndex); |
| } |
| } |
| /** |
| * Moves the state to #nextToken this may switch to the next Chunk or |
| * sentence. |
| * @return <code>true</code> if there are further elements to process or |
| * <code>false</code> if there are no further elements to process. |
| */ |
| public boolean next() { |
| //switch to the next token |
| if(nextToken > tokenIndex){ |
| tokenIndex = nextToken; |
| } else { |
| tokenIndex++; |
| nextToken = tokenIndex; |
| } |
| //now init the next element |
| final boolean hasNext; |
| if(chunk != null){ //if chunks are present |
| //get next chunk (may be the current if chunk.getEnd() > tokenPos |
| for(;tokenIndex > chunk.getEnd() && chunks.hasNext();chunk = chunks.next()); |
| if(tokenIndex <= chunk.getEnd()){ //found valid chunk |
| if(chunk.getStart() > tokenIndex) { //skip tokens outside chunks |
| tokenIndex = chunk.getStart(); |
| } |
| if(chunk.getStart() > consumedIndex){ |
| consumedIndex = chunk.getStart()-1; |
| } |
| hasNext = true; |
| } else { //no more valid chunks in this sentence |
| hasNext = initNextSentence(); |
| } |
| } else { //no chunks ... use tokens only |
| if(sentence == null){ //first sentence |
| hasNext = initNextSentence(); |
| } else if(tokenIndex >= sentence.getTokens().size()){ |
| hasNext = initNextSentence(); |
| } else { //more tokens in the sentence |
| //set the token |
| hasNext = true; |
| } |
| } |
| if(hasNext){ //set the Token |
| token = sentence.getTokens().get(tokenIndex); |
| } |
| return hasNext; |
| } |
| |
| /** |
| * Correctly initialise {@link #sentence}, {@link #chunks}, {@link #chunk} |
| * and {@link #tokenIndex} for the next element of {@link #sentences}. If |
| * no further sentences are to process it simple sets {@link #sentence}, |
| * {@link #chunks}, {@link #chunk} and {@link #tokenIndex} to <code>null</code> |
| */ |
| private boolean initNextSentence() { |
| textCache.clear(); |
| sentence = null; |
| while(sentence == null && sentences.hasNext()){ |
| sentence = sentences.next(); |
| if(sentence.getChunks() != null){ |
| chunks = sentence.getChunks().iterator(); |
| if(chunks.hasNext()){ |
| chunk = chunks.next(); |
| tokenIndex = chunk.getStart(); |
| consumedIndex = tokenIndex-1; |
| nextToken = tokenIndex; |
| } else { //no chunks in this sentence |
| sentence = null; //skip this sentence |
| } |
| } else { |
| if(sentence.getTokens().isEmpty()){ //no tokens in this sentence |
| sentence = null; //skip this one |
| } else { |
| chunks = null; |
| chunk = null; |
| tokenIndex = 0; |
| consumedIndex = -1; |
| nextToken = 0; |
| } |
| } |
| } |
| return sentence != null; |
| } |
| /** |
| * Getter for the text covered by the next tokenCount tokens relative to |
| * {@link #token}. It uses the {@link #textCache} to lookup/store such texts. |
| * Given the Tokens |
| * <pre> |
| * [This, is, an, Example] |
| * </pre> |
| * and the parameter <code>3</code> this method will return |
| * <pre> |
| * This is an |
| * </pre> |
| * @param tokenCount the number of tokens to be included relative to |
| * {@link #tokenIndex} |
| * @return the text covered by the span start of {@link #token} to end of |
| * token at <code>{@link #tokenIndex}+tokenCount</code>. |
| */ |
| public String getTokenText(int start, int tokenCount){ |
| String pos = start+","+tokenCount; |
| String text = textCache.get(pos); |
| if(text == null){ |
| text = sentence.getText().substring( |
| sentence.getTokens().get(start).getStart(), |
| sentence.getTokens().get(start+tokenCount-1).getEnd()); |
| textCache.put(pos, text); |
| } |
| return text; |
| } |
| @Override |
| public String toString() { |
| return "["+tokenIndex+","+token+"] chunk: " + |
| (chunk == null?null:chunk.getText())+"| sentence: "+ |
| (sentence == null?null:sentence.getText()); |
| } |
| } |