enhancement-engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/AnalysedContent.java - stanbol - Git at Google

 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /**
  *
  */
 package org.apache.stanbol.enhancer.engines.keywordextraction.linking;

 import java.util.Iterator;

 import org.apache.stanbol.commons.opennlp.TextAnalyzer;
 import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText;
 import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText.Token;

 /**
  * Represents content that has already been analysed with NLP tools and is to
  * be linked with Entities of an {@link EntitySearcher}.<p>
  * Note that the linking process only requires the text to be tokenized. All
  * other features (sentence detection, POS tags and Chunks) are optional, but
  * they improve the performance and, to a smaller extent, also the results of
  * the linking process. <p>
  * TODO: <ul>
  * <li> Find a better Name
  * <li> The API is not optimal. In general the {@link TextAnalyzer} and the
  * {@link AnalysedContent} interface do not play well together :(
  * </ul>
  * @author Rupert Westenthaler
  *
  */
 public interface AnalysedContent {

     /**
      * Provides the Iterator over the analysed sentences. Implementations are
      * expected to hand out the very same Iterator instance on every call.
      *
      * @return the iterator over the analysed sentences
      */
     Iterator<AnalysedText> getAnalysedText();

     /**
      * Decides, based on its POS tag, whether a {@link Token} should be used
      * to search for Concepts within the Taxonomy.
      *
      * @param posTag the POS tag to check
      * @param posProb the probability of the POS tag, or <code>1.0</code> if
      * no probability is available
      * @return <code>true</code> if Tokens with this POS tag should be included
      * in searches, <code>false</code> if they should not. If this information
      * is not available (e.g. no set of Tags that need to be processed is
      * defined) this Method MUST return <code>null</code>
      */
     Boolean processPOS(String posTag, double posProb);

     /**
      * Decides whether a chunk should be used to search for Concepts.
      *
      * @param chunkTag the tag (type) of the chunk
      * @param chunkProb the probability of the chunk tag, or <code>1.0</code>
      * if no probability is available
      * @return <code>true</code> if chunks with this tag (type) should be
      * processed (used to search for matches of concepts), <code>false</code>
      * if they should not. If this information is not available (e.g. no set
      * of Tags that need to be processed is defined) this Method MUST return
      * <code>null</code>
      */
     Boolean processChunk(String chunkTag, double chunkProb);

     /**
      * Tokenizes the given label.
      *
      * @param label the label to tokenize
      * @return the tokens of the label
      */
     String[] tokenize(String label);
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	/**
	*
	*/
	package org.apache.stanbol.enhancer.engines.keywordextraction.linking;

	import java.util.Iterator;

	import org.apache.stanbol.commons.opennlp.TextAnalyzer;
	import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText;
	import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText.Token;

	/**
	* Represents content that has already been analysed with NLP tools and is to
	* be linked with Entities of an {@link EntitySearcher}.<p>
	* Note that the linking process only requires the text to be tokenized. All
	* other features (sentence detection, POS tags and Chunks) are optional, but
	* they improve the performance and, to a smaller extent, also the results of
	* the linking process. <p>
	* TODO: <ul>
	* <li> Find a better Name
	* <li> The API is not optimal. In general the {@link TextAnalyzer} and the
	* {@link AnalysedContent} interface do not play well together :(
	* </ul>
	* @author Rupert Westenthaler
	*
	*/
	public interface AnalysedContent {

		/**
		 * Provides the Iterator over the analysed sentences. Implementations
		 * are expected to hand out the very same Iterator instance on every
		 * call.
		 *
		 * @return the iterator over the analysed sentences
		 */
		Iterator<AnalysedText> getAnalysedText();

		/**
		 * Decides, based on its POS tag, whether a {@link Token} should be
		 * used to search for Concepts within the Taxonomy.
		 *
		 * @param posTag the POS tag to check
		 * @param posProb the probability of the POS tag, or <code>1.0</code>
		 * if no probability is available
		 * @return <code>true</code> if Tokens with this POS tag should be
		 * included in searches, <code>false</code> if they should not. If this
		 * information is not available (e.g. no set of Tags that need to be
		 * processed is defined) this Method MUST return <code>null</code>
		 */
		Boolean processPOS(String posTag, double posProb);

		/**
		 * Decides whether a chunk should be used to search for Concepts.
		 *
		 * @param chunkTag the tag (type) of the chunk
		 * @param chunkProb the probability of the chunk tag, or
		 * <code>1.0</code> if no probability is available
		 * @return <code>true</code> if chunks with this tag (type) should be
		 * processed (used to search for matches of concepts),
		 * <code>false</code> if they should not. If this information is not
		 * available (e.g. no set of Tags that need to be processed is defined)
		 * this Method MUST return <code>null</code>
		 */
		Boolean processChunk(String chunkTag, double chunkProb);

		/**
		 * Tokenizes the given label.
		 *
		 * @param label the label to tokenize
		 * @return the tokens of the label
		 */
		String[] tokenize(String label);
	}