/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.entitylinking.impl;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig.UNICASE_SCRIPT_LANUAGES;
import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.POS_ANNOTATION;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.EnumSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import org.apache.commons.collections.Predicate;
import org.apache.commons.collections.iterators.FilterIterator;
import org.apache.stanbol.commons.namespaceprefix.service.StanbolNamespacePrefixService;
import org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig;
import org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.nlp.model.Chunk;
import org.apache.stanbol.enhancer.nlp.model.Section;
import org.apache.stanbol.enhancer.nlp.model.Sentence;
import org.apache.stanbol.enhancer.nlp.model.Span;
import org.apache.stanbol.enhancer.nlp.model.Span.SpanTypeEnum;
import org.apache.stanbol.enhancer.nlp.model.Token;
import org.apache.stanbol.enhancer.nlp.pos.Pos;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
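/**
 * Holds the iteration state used while processing an {@link AnalysedText}:
 * the current {@link Section} (sentence), its {@link Token}s and the index
 * of the last consumed token.
 * <p>
 * A typical consumption loop might look like the following sketch, where
 * <code>lookup(..)</code> is a hypothetical stand-in for whatever processing
 * consumes tokens (it is not part of this class):
 * <pre>{@code
 * ProcessingState state = new ProcessingState(at, "en", tpc);
 * while(state.next()){
 *     TokenData token = state.getToken();
 *     int lastConsumed = lookup(state, token); //hypothetical consumer
 *     state.setConsumed(lastConsumed); //consumed tokens are skipped by next()
 * }
 * }</pre>
 */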
public class ProcessingState {
private final Logger log = LoggerFactory.getLogger(ProcessingState.class);
    /**
     * Iterator over the {@link Section}s to process. These are the
     * {@link Sentence}s, or a single section covering the whole
     * {@link AnalysedText} if no sentences are defined.
     */
private final Iterator<? extends Section> sections;
    /**
     * The {@link Section} (typically a {@link Sentence}) currently processed
     */
private Section section;
    /**
     * Holds the {@link Token}s of the current {@link #section}
     * to allow fast index based access.
     */
private List<TokenData> tokens = new ArrayList<TokenData>(64);
    /**
     * Iterator over those {@link #tokens} of the current section that are
     * marked as linkable.
     */
    private Iterator<TokenData> processableTokensIterator = Collections.<TokenData>emptyList().iterator();
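    /**
     * The {@link SpanTypeEnum types} of {@link Span}s to iterate over within
     * a {@link Section}: tokens, and also chunks unless chunks are ignored.
     */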
private final EnumSet<SpanTypeEnum> enclosedSpanTypes;
/**
* The current token
*/
private TokenData token;
    /**
     * The index of the last consumed token
     */
private int consumedIndex = -1;
    /**
     * Ensures that Tokens are not processed twice in case of multiple
     * overlapping Sentence annotations (e.g. if two NLP frameworks that
     * contribute Sentences do not agree with each other).
     */
private int consumedSectionIndex = -1;
/**
* The language of the text
*/
private String language;
protected final LanguageProcessingConfig tpc;
//protected final EntityLinkerConfig elc;
private AnalysedText at;
    /**
     * If the language uses a unicase script, upper-case specific processing
     * rules cannot be used (see STANBOL-1049)
     */
private boolean isUnicaseLanguage;
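    /**
     * {@link Predicate} that accepts only {@link TokenData} instances marked
     * as linkable (see {@link TokenData#isLinkable}).
     */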
    private static final Predicate PROCESSABLE_TOKEN_PREDICATE = new Predicate() {
@Override
public boolean evaluate(Object object) {
return ((TokenData)object).isLinkable;
}
};
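    /**
     * {@link Pos} tags that indicate the start of a sub-sentence
     * (currently only {@link Pos#Quote})
     */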
public static final Collection<Pos> SUB_SENTENCE_START_POS = EnumSet.of(
Pos.Quote);
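    /**
     * Creates a processing state for the given text, language and
     * configuration.
     * @param at the {@link AnalysedText} to process
     * @param language the language of the text
     * @param tpc the {@link LanguageProcessingConfig}
     * @throws IllegalArgumentException if any of the parameters is
     * <code>null</code> or the language is empty
     */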
public ProcessingState(AnalysedText at, String language, LanguageProcessingConfig tpc){
if(at == null){
throw new IllegalArgumentException("The parsed AnalysedText MUST NOT be NULL!");
}
if(language == null || language.isEmpty()){
throw new IllegalArgumentException("The parsed Language MUST NOT be NULL nor empty!");
}
if(tpc == null){
throw new IllegalArgumentException("The parsed TextProcessingConfig MUST NOT be NULL!");
}
this.tpc = tpc;
enclosedSpanTypes = EnumSet.of(SpanTypeEnum.Token);
if(!tpc.isIgnoreChunks()){
enclosedSpanTypes.add(SpanTypeEnum.Chunk);
}
this.at = at; //store as field (just used for logging)
this.language = language;
//STANBOL-1049: we need now to know if a language uses a unicase script
//ensure lower case and only use the language part
String lookupLang = language.toLowerCase(Locale.ROOT).split("[_-]")[0];
this.isUnicaseLanguage = UNICASE_SCRIPT_LANUAGES.contains(lookupLang);
//prefer to iterate over sentences
Iterator<Sentence> sentences = at.getSentences();
this.sections = sentences.hasNext() ? sentences : Collections.singleton(at).iterator();
        //NOTE: the first section is initialised lazily by the first call to next()
}
/**
* Getter for the current section. This is typically a {@link Sentence}
* but might also be the whole {@link AnalysedText} in case no sentence
* annotations are available
* @return the currently processed {@link Section}
*/
public final Section getSentence() {
return section;
}
/**
* Getter for the current token
* @return the token for the currently processed word
*/
public TokenData getToken(){
return token;
}
/**
* Getter for the Tokens of the currently processed section
* @return the Tokens of the currently processed section
*/
public List<TokenData> getTokens(){
return tokens;
}
/**
* Getter for the last consumed index
* @return the index of the last consumed token
*/
public final int getConsumedIndex() {
return consumedIndex;
}
    /**
     * Getter for the language of the processed text
     * @return the language
     */
public final String getLanguage() {
return language;
}
// /**
// * Getter for the next {@link Token} to be processed. Calling {@link #next()}
// * is guaranteed to skip all tokens in between {@link #getTokenIndex()}
// * and {@link #getNextToken()}, but it might even skip more tokens (e.g.
// * in case that the token referenced by {@link #getNextToken()} is not
// * within a {@link Chunk}
// * @return the nextToken
// */
// public final int getNextToken() {
// return nextToken;
// }
    /**
     * Setter for the index of the last consumed token. The consumed index
     * MUST BE equal to or greater than the index of the current token. If it
     * is set to a greater value, the tokens in between are skipped on the
     * next call to {@link #next()}.
     * @param pos the index of the last consumed token.
     */
public void setConsumed(int pos){
if(pos >= token.index){
this.consumedIndex = pos;
// this.nextToken = pos+1;
} else {
throw new IllegalArgumentException("The lastConsumedPos "+pos+
" MUST BE equals or gerater than the current Pos "+token.index);
}
}
    /**
     * Moves the state to the next processable token after the last consumed
     * index
     * @return <code>true</code> if there are further elements to process or
     * <code>false</code> if there are no further elements to process.
     */
public boolean next() {
while(processableTokensIterator.hasNext() || initNextSentence()){
TokenData token = processableTokensIterator.next();
if(token.index > consumedIndex){
this.token = token;
return true;
}
}
return false;
}
    /**
     * Correctly initialises {@link #section}, {@link #tokens} and
     * {@link #processableTokensIterator} for the next element of
     * {@link #sections}. Sections already covered by a previously processed
     * (overlapping) section are skipped.
     * @return <code>true</code> if a section containing a linkable token was
     * found, <code>false</code> if there are no further sections to process.
     */
private boolean initNextSentence() {
section = null;
processableTokensIterator = null;
consumedIndex = -1;
boolean foundLinkableToken = false;
while(!foundLinkableToken && sections.hasNext()){
section = sections.next();
if(consumedSectionIndex > section.getStart()){
                log.debug(" > skipping {} because another section until index {} " +
                    "was already processed. This is not an error, but indicates that " +
                    "multiple NLP frameworks contribute diverging Sentence annotations",
                    section, consumedSectionIndex);
continue; //ignore this section
}
consumedSectionIndex = section.getEnd();
            //pass the unicase flag so that the correct case sensitivity
            //rules are applied (STANBOL-1049)
            SectionData sectionData = new SectionData(tpc, section,
                enclosedSpanTypes, isUnicaseLanguage);
//TODO: It would be better to use a SectionData field instead
tokens = sectionData.getTokens();
section = sectionData.section;
foundLinkableToken = sectionData.hasLinkableToken();
}
        processableTokensIterator = new FilterIterator(tokens.iterator(), PROCESSABLE_TOKEN_PREDICATE);
return foundLinkableToken;
}
    /**
     * Getter for the text covered by <code>tokenCount</code> tokens starting
     * with the token at the index <code>start</code>.
     * Given the Tokens
     * <pre>
     *     [This, is, an, Example]
     * </pre>
     * and the parameters <code>start=0</code> and <code>tokenCount=3</code>
     * this method will return
     * <pre>
     *     This is an
     * </pre>
     * @param start the index of the first token
     * @param tokenCount the number of tokens to include
     * @return the text covered by the span from the start of the token at
     * index <code>start</code> to the end of the token at index
     * <code>start+tokenCount-1</code>.
     */
public String getTokenText(int start, int tokenCount){
int offset = section.getStart();
return section.getSpan().substring(
tokens.get(start).token.getStart()-offset,
tokens.get(start+(tokenCount-1)).token.getEnd()-offset);
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append('[').append(token.index).append(',').append(token.token);
sb.append("] chunk: ");
if(token.inChunk == null){
sb.append("none");
} else {
sb.append(token.inChunk.chunk);
if(token.inChunk.merged != null){
sb.append("(merged with ").append(token.inChunk.merged).append(')');
}
}
sb.append("| sentence: ");
if(section == null){
sb.append("none");
} else if(section.getSpan().length() > 45){
sb.append(section.getSpan().substring(0, 45)).append(" ...");
} else {
sb.append(section.getSpan());
}
return sb.toString();
}
}