/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.entitylinking.impl;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig.UNICASE_SCRIPT_LANUAGES;
import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.POS_ANNOTATION;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.EnumSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import org.apache.commons.collections.Predicate;
import org.apache.commons.collections.iterators.FilterIterator;
import org.apache.stanbol.commons.namespaceprefix.service.StanbolNamespacePrefixService;
import org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig;
import org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.nlp.model.Chunk;
import org.apache.stanbol.enhancer.nlp.model.Section;
import org.apache.stanbol.enhancer.nlp.model.Sentence;
import org.apache.stanbol.enhancer.nlp.model.Span;
import org.apache.stanbol.enhancer.nlp.model.Span.SpanTypeEnum;
import org.apache.stanbol.enhancer.nlp.model.Token;
import org.apache.stanbol.enhancer.nlp.pos.Pos;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
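/**
 * Holds the iteration state used while processing an {@link AnalysedText}:
 * the current {@link Section} (sentence), its {@link Token}s and the index
 * of the last consumed token.
 * <p>
 * A typical consumption loop might look like the following sketch, where
 * <code>lookup(..)</code> is a hypothetical stand-in for whatever processing
 * consumes tokens (it is not part of this class):
 * <pre>{@code
 * ProcessingState state = new ProcessingState(at, "en", tpc);
 * while(state.next()){
 *     TokenData token = state.getToken();
 *     int lastConsumed = lookup(state, token); //hypothetical consumer
 *     state.setConsumed(lastConsumed); //consumed tokens are skipped by next()
 * }
 * }</pre>
 */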
public class ProcessingState {
private final Logger log = LoggerFactory.getLogger(ProcessingState.class);
    /**
     * Iterator over the {@link Section}s to process. These are the
     * {@link Sentence}s, or a single section covering the whole
     * {@link AnalysedText} if no sentences are defined.
     */
private final Iterator<? extends Section> sections;
    /**
     * The {@link Section} (typically a {@link Sentence}) currently processed
     */
private Section section;
    /**
     * Holds the {@link Token}s of the current {@link #section}
     * to allow fast index based access.
     */
private List<TokenData> tokens = new ArrayList<TokenData>(64);
    /**
     * Iterator over those {@link #tokens} of the current section that are
     * marked as linkable.
     */
    private Iterator<TokenData> processableTokensIterator = Collections.<TokenData>emptyList().iterator();
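    /**
     * The {@link SpanTypeEnum types} of {@link Span}s to iterate over within
     * a {@link Section}: tokens, and also chunks unless chunks are ignored.
     */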
private final EnumSet<SpanTypeEnum> enclosedSpanTypes;
/**
* The current token
*/
private TokenData token;
    /**
     * The index of the last consumed token
     */
private int consumedIndex = -1;
    /**
     * Ensures that Tokens are not processed twice in case of multiple
     * overlapping Sentence annotations (e.g. if two NLP frameworks that
     * contribute Sentences do not agree with each other).
     */
private int consumedSectionIndex = -1;
/**
* The language of the text
*/
private String language;
protected final LanguageProcessingConfig tpc;
//protected final EntityLinkerConfig elc;
private AnalysedText at;
    /**
     * If the language uses a unicase script, upper-case specific processing
     * rules cannot be used (see STANBOL-1049)
     */
private boolean isUnicaseLanguage;
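    /**
     * {@link Predicate} that accepts only {@link TokenData} instances marked
     * as linkable (see {@link TokenData#isLinkable}).
     */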
    private static final Predicate PROCESSABLE_TOKEN_PREDICATE = new Predicate() {
@Override
public boolean evaluate(Object object) {
return ((TokenData)object).isLinkable;
}
};
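    /**
     * {@link Pos} tags that indicate the start of a sub-sentence
     * (currently only {@link Pos#Quote})
     */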
public static final Collection<Pos> SUB_SENTENCE_START_POS = EnumSet.of(
Pos.Quote);
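    /**
     * Creates a processing state for the given text, language and
     * configuration.
     * @param at the {@link AnalysedText} to process
     * @param language the language of the text
     * @param tpc the {@link LanguageProcessingConfig}
     * @throws IllegalArgumentException if any of the parameters is
     * <code>null</code> or the language is empty
     */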
public ProcessingState(AnalysedText at, String language, LanguageProcessingConfig tpc){
if(at == null){
throw new IllegalArgumentException("The parsed AnalysedText MUST NOT be NULL!");
}
if(language == null || language.isEmpty()){
throw new IllegalArgumentException("The parsed Language MUST NOT be NULL nor empty!");
}
if(tpc == null){
throw new IllegalArgumentException("The parsed TextProcessingConfig MUST NOT be NULL!");
}
this.tpc = tpc;
enclosedSpanTypes = EnumSet.of(SpanTypeEnum.Token);
if(!tpc.isIgnoreChunks()){
enclosedSpanTypes.add(SpanTypeEnum.Chunk);
}
this.at = at; //store as field (just used for logging)
this.language = language;
//STANBOL-1049: we need now to know if a language uses a unicase script
//ensure lower case and only use the language part
String lookupLang = language.toLowerCase(Locale.ROOT).split("[_-]")[0];
this.isUnicaseLanguage = UNICASE_SCRIPT_LANUAGES.contains(lookupLang);
//prefer to iterate over sentences
Iterator<Sentence> sentences = at.getSentences();
this.sections = sentences.hasNext() ? sentences : Collections.singleton(at).iterator();
        //NOTE: the first section is initialised lazily by the first call to next()
}
/**
* Getter for the current section. This is typically a {@link Sentence}
* but might also be the whole {@link AnalysedText} in case no sentence
* annotations are available
* @return the currently processed {@link Section}
*/
public final Section getSentence() {
return section;
}
/**
* Getter for the current token
* @return the token for the currently processed word
*/
public TokenData getToken(){
return token;
}
/**
* Getter for the Tokens of the currently processed section
* @return the Tokens of the currently processed section
*/
public List<TokenData> getTokens(){
return tokens;
}
/**
* Getter for the last consumed index
* @return the index of the last consumed token
*/
public final int getConsumedIndex() {
return consumedIndex;
}
    /**
     * Getter for the language of the processed text
     * @return the language
     */
public final String getLanguage() {
return language;
}
// /**
// * Getter for the next {@link Token} to be processed. Calling {@link #next()}
// * is guaranteed to skip all tokens in between {@link #getTokenIndex()}
// * and {@link #getNextToken()}, but it might even skip more tokens (e.g.
// * in case that the token referenced by {@link #getNextToken()} is not
// * within a {@link Chunk}
// * @return the nextToken
// */
// public final int getNextToken() {
// return nextToken;
// }
    /**
     * Setter for the index of the last consumed token. The consumed index
     * MUST BE equal to or greater than the index of the current token. If it
     * is set to a greater value, the tokens in between are skipped on the
     * next call to {@link #next()}.
     * @param pos the index of the last consumed token.
     */
public void setConsumed(int pos){
if(pos >= token.index){
this.consumedIndex = pos;
// this.nextToken = pos+1;
} else {
throw new IllegalArgumentException("The lastConsumedPos "+pos+
" MUST BE equals or gerater than the current Pos "+token.index);
}
}
    /**
     * Moves the state to the next processable token after the last consumed
     * index
     * @return <code>true</code> if there are further elements to process or
     * <code>false</code> if there are no further elements to process.
     */
public boolean next() {
while(processableTokensIterator.hasNext() || initNextSentence()){
TokenData token = processableTokensIterator.next();
if(token.index > consumedIndex){
this.token = token;
return true;
}
}
return false;
}
    /**
     * Correctly initialises {@link #section}, {@link #tokens} and
     * {@link #processableTokensIterator} for the next element of
     * {@link #sections}. Sections already covered by a previously processed
     * (overlapping) section are skipped.
     * @return <code>true</code> if a section containing a linkable token was
     * found, <code>false</code> if there are no further sections to process.
     */
private boolean initNextSentence() {
section = null;
processableTokensIterator = null;
consumedIndex = -1;
boolean foundLinkableToken = false;
while(!foundLinkableToken && sections.hasNext()){
section = sections.next();
if(consumedSectionIndex > section.getStart()){
                log.debug(" > skipping {} because another section until index {} " +
                    "was already processed. This is not an error, but indicates that " +
                    "multiple NLP frameworks contribute diverging Sentence annotations",
                    section, consumedSectionIndex);
continue; //ignore this section
}
consumedSectionIndex = section.getEnd();
            //pass the unicase flag so that the correct case sensitivity
            //rules are applied (STANBOL-1049)
            SectionData sectionData = new SectionData(tpc, section,
                enclosedSpanTypes, isUnicaseLanguage);
//TODO: It would be better to use a SectionData field instead
tokens = sectionData.getTokens();
section = sectionData.section;
foundLinkableToken = sectionData.hasLinkableToken();
}
        processableTokensIterator = new FilterIterator(tokens.iterator(), PROCESSABLE_TOKEN_PREDICATE);
return foundLinkableToken;
}
    /**
     * Getter for the text covered by <code>tokenCount</code> tokens starting
     * with the token at the index <code>start</code>.
     * Given the Tokens
     * <pre>
     *     [This, is, an, Example]
     * </pre>
     * and the parameters <code>start=0</code> and <code>tokenCount=3</code>
     * this method will return
     * <pre>
     *     This is an
     * </pre>
     * @param start the index of the first token
     * @param tokenCount the number of tokens to include
     * @return the text covered by the span from the start of the token at
     * index <code>start</code> to the end of the token at index
     * <code>start+tokenCount-1</code>.
     */
public String getTokenText(int start, int tokenCount){
int offset = section.getStart();
return section.getSpan().substring(
tokens.get(start).token.getStart()-offset,
tokens.get(start+(tokenCount-1)).token.getEnd()-offset);
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append('[').append(token.index).append(',').append(token.token);
sb.append("] chunk: ");
if(token.inChunk == null){
sb.append("none");
} else {
sb.append(token.inChunk.chunk);
if(token.inChunk.merged != null){
sb.append("(merged with ").append(token.inChunk.merged).append(')');
}
}
sb.append("| sentence: ");
if(section == null){
sb.append("none");
} else if(section.getSpan().length() > 45){
sb.append(section.getSpan().substring(0, 45)).append(" ...");
} else {
sb.append(section.getSpan());
}
return sb.toString();
}
}