blob: a3fb668dd97249a8ee6a83bede23c59622b20694 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
*
*/
package org.apache.stanbol.enhancer.engines.keywordextraction.linking;
import java.util.Iterator;
import org.apache.stanbol.commons.opennlp.TextAnalyzer;
import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText;
import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText.Token;
/**
* Represents content that has already been analysed with NLP tools and that
* is to be linked with Entities of an {@link EntitySearcher}.<p>
* Note that the linking process only requires the text to be tokenized.
* All other features (sentence detection, POS tags and Chunks) are
* optional, but they do improve the performance and, to a smaller extent,
* also the results of the linking process. <p>
* TODO: <ul>
* <li> Find a better Name
* <li> The API is not optimal. In general the {@link TextAnalyzer} and the
* {@link AnalysedContent} interface do not play well together :(
* </ul>
* @author Rupert Westenthaler
*
*/
public interface AnalysedContent {

    /**
     * Provides the analysed sentences of the content as an Iterator.
     * Implementations are expected to hand out the very same Iterator
     * instance on every call.
     * @return the Iterator over the analysed sentences
     */
    Iterator<AnalysedText> getAnalysedText();

    /**
     * Decides, based on its POS tag, whether a {@link Token} should be used
     * to search for Concepts within the Taxonomy.
     * @param posTag the POS tag to check
     * @param posProb the probability of the POS tag, or <code>1.0</code> if
     * no probability is available
     * @return <code>true</code> if Tokens with this POS tag should be
     * included in searches, <code>false</code> if they should not. If this
     * information is not available (e.g. no set of Tags that need to be
     * processed is defined) this Method MUST return <code>null</code>
     */
    Boolean processPOS(String posTag, double posProb);

    /**
     * Decides whether a chunk should be used to search for Concepts.
     * @param chunkTag the tag (type) of the chunk
     * @param chunkProb the probability of the chunk tag, or <code>1.0</code>
     * if no probability is available
     * @return <code>true</code> if chunks with this tag (type) should be
     * processed (used to search for matches of concepts), <code>false</code>
     * if they should not. If this information is not available (e.g. no set
     * of Tags that need to be processed is defined) this Method MUST return
     * <code>null</code>
     */
    Boolean processChunk(String chunkTag, double chunkProb);

    /**
     * Tokenizes the passed label.
     * @param label the label to tokenize
     * @return the tokens of the label as Strings
     */
    String[] tokenize(String label);

}