// enhancement-engines/topic/api/src/main/java/org/apache/stanbol/enhancer/topic/api/TopicClassifier.java - stanbol - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.stanbol.enhancer.topic.api;

 import org.apache.clerezza.commons.rdf.ImmutableGraph;
 import org.apache.clerezza.commons.rdf.IRI;
 import org.apache.stanbol.enhancer.topic.api.training.TrainingSet;
 import org.apache.stanbol.enhancer.topic.api.training.TrainingSetException;
 import org.osgi.framework.InvalidSyntaxException;

 import java.util.Collection;
 import java.util.List;
 import java.util.Set;

 /**
  * Service interface for suggesting hierarchical concepts from a specific scheme (a.k.a. taxonomy, thesaurus
  * or concepts hierarchy) from the text content of a document or part of a document.
  */
 public interface TopicClassifier {

     /**
      * The short name of the training set. Can be used as the URI component to identify the training set in
      * the Web management interface or in RDF descriptions of the service.
      * <p>
      * NOTE(review): this accessor is declared on the classifier, so "training set" presumably refers to
      * the classifier itself here; confirm against the implementations.
      */
     String getName();

     /**
      * @return list of language codes for text that can be automatically classified by the service.
      */
     List<String> getAcceptedLanguages();

     /**
      * Perform automated text categorization based on statistical occurrences of words in the given text.
      *
      * @param text
      *            the text content to analyze
      * @return the most likely concepts related to the text
      * @throws ClassifierException
      */
     List<TopicSuggestion> suggestTopics(String text) throws ClassifierException;

     /**
      * @param broadConceptId
      *            the id of the broader concept
      * @return the set of ids of concepts directly narrower than {@code broadConceptId}
      * @throws ClassifierException
      */
     Set<String> getNarrowerConcepts(String broadConceptId) throws ClassifierException;

     /**
      * @param id
      *            the id of the narrower concept
      * @return the set of ids of concepts directly broader than {@code id}
      * @throws ClassifierException
      */
     Set<String> getBroaderConcepts(String id) throws ClassifierException;

     /**
      * @return the set of ids of concepts without broader concepts.
      * @throws ClassifierException
      */
     Set<String> getRootConcepts() throws ClassifierException;

     /**
      * @return true if the classifier model can be updated with the {@code addConcept} /
      *         {@code removeConcept} / {@code updateModel} methods.
      */
     boolean isUpdatable();

     /**
      * Register a topic and set its ancestors in the taxonomy. Warning: re-adding an already existing topic
      * can delete the underlying statistical model. Calling {@code updateModel} is necessary to rebuild the
      * statistical model based on the hierarchical structure of the concepts and the registered training set.
      *
      * @param conceptUri
      *            the new concept identifier
      * @param primaryTopicUri
      *            optional identifier of a resource best describing that concept.
      * @param broaderConcepts
      *            list of directly broader concepts in the thesaurus
      * @throws ClassifierException
      */
     void addConcept(String conceptUri, String primaryTopicUri, Collection<String> broaderConcepts)
             throws ClassifierException;

     /**
      * Register a topic and set its ancestors in the taxonomy. Warning: re-adding an already existing topic
      * can delete the underlying statistical model. Calling {@code updateModel} is necessary to rebuild the
      * statistical model based on the hierarchical structure of the concepts and the registered training set.
      *
      * @param conceptUri
      *            the new concept identifier
      * @param broaderConcepts
      *            list of directly broader concepts in the thesaurus
      * @throws ClassifierException
      */
     void addConcept(String conceptUri, Collection<String> broaderConcepts) throws ClassifierException;

     /**
      * Remove a topic from the thesaurus. WARNING: it is the caller's responsibility to recursively remove
      * or update any narrower topic that might hold a reference on this topic. Once the tree is updated,
      * {@code updateModel} should be called to re-align the statistical model to match the new hierarchy by
      * drawing examples from the dataset.
      *
      * @param conceptUri
      *            id of the topic to remove from the model, must not be null
      * @throws ClassifierException
      */
     void removeConcept(String conceptUri) throws ClassifierException;

     /**
      * Remove all the concepts from the current model, leaving an empty model.
      *
      * @throws ClassifierException
      */
     void removeAllConcepts() throws ClassifierException;

     /**
      * @return the training set registered for this classifier (either set explicitly using setTrainingSet or
      *         configured through OSGi properties).
      */
     TrainingSet getTrainingSet();

     /**
      * Register a training set to use to build the statistical model of the classifier.
      *
      * @param trainingSet
      *            the training set to register
      */
     void setTrainingSet(TrainingSet trainingSet);

     /**
      * Update (incrementally or from scratch) the statistical model of the classifier. Note: depending on the
      * size of the dataset and the number of concepts to update, this process can take a long time and should
      * probably be wrapped in a dedicated thread if called by the user interface layer.
      *
      * @param incremental
      *            {@code true} to update incrementally, {@code false} to update from scratch
      * @return the number of updated concepts
      * @throws TrainingSetException
      * @throws ClassifierException
      */
     int updateModel(boolean incremental) throws TrainingSetException, ClassifierException;

     /**
      * Perform k-fold cross validation of the model to compute estimates of the precision, recall and f1
      * score.
      *
      * @param incremental
      *            presumably the same incremental / from-scratch switch as {@code updateModel} - TODO confirm
      * @return number of updated concepts
      * @throws ClassifierException
      * @throws TrainingSetException
      */
     int updatePerformanceEstimates(boolean incremental) throws ClassifierException, TrainingSetException;

     /**
      * Tell the classifier which slice of data to keep aside while training for model evaluation using k-folds
      * cross validation.
      *
      * http://en.wikipedia.org/wiki/Cross-validation_%28statistics%29#K-fold_cross-validation
      *
      * @param foldIndex
      *            the fold id used as a training set for this classifier instance.
      * @param foldCount
      *            the number of folds used in the cross validation process (typically 3 or 5). Set to 0 to
      *            disable cross validation for this classifier.
      */
     void setCrossValidationInfo(int foldIndex, int foldCount);

     /**
      * Get a classification report with various accuracy metrics (precision, recall and f1-score) along with
      * the example ids of some mistakes (false positives or false negatives).
      *
      * @param concept
      *            the concept to report on (presumably its id - TODO confirm)
      * @return the classification report for that concept
      * @throws ClassifierException
      */
     ClassificationReport getPerformanceEstimates(String concept) throws ClassifierException;

     /**
      * Return the list of names of chains where the classifier is currently registered as an enhancement
      * engine.
      *
      * @throws InvalidSyntaxException
      */
     List<String> getChainNames() throws InvalidSyntaxException;

     /**
      * Initialize the concept hierarchy of the model using the provided RDF model (e.g. a SKOS taxonomy).
      *
      * @param graph
      *            the RDF model to import the concepts from
      * @param conceptClass
      *            presumably the RDF type of the resources to import as concepts - TODO confirm
      * @param broaderProperty
      *            presumably the property linking a concept to its broader concepts - TODO confirm
      * @return the number of concepts successfully imported (including roots).
      * @throws ClassifierException
      */
     int importConceptsFromGraph(ImmutableGraph graph, IRI conceptClass, IRI broaderProperty)
             throws ClassifierException;
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.stanbol.enhancer.topic.api;

	import org.apache.clerezza.commons.rdf.ImmutableGraph;
	import org.apache.clerezza.commons.rdf.IRI;
	import org.apache.stanbol.enhancer.topic.api.training.TrainingSet;
	import org.apache.stanbol.enhancer.topic.api.training.TrainingSetException;
	import org.osgi.framework.InvalidSyntaxException;

	import java.util.Collection;
	import java.util.List;
	import java.util.Set;

	/**
	 * Service interface for suggesting hierarchical concepts from a specific scheme (a.k.a. taxonomy, thesaurus
	 * or concepts hierarchy) from the text content of a document or part of a document.
	 */
	public interface TopicClassifier {

	    /**
	     * The short name of the training set. Can be used as the URI component to identify the training set in
	     * the Web management interface or in RDF descriptions of the service.
	     * <p>
	     * NOTE(review): this accessor is declared on the classifier, so "training set" presumably refers to
	     * the classifier itself here; confirm against the implementations.
	     */
	    String getName();

	    /**
	     * @return list of language codes for text that can be automatically classified by the service.
	     */
	    List<String> getAcceptedLanguages();

	    /**
	     * Perform automated text categorization based on statistical occurrences of words in the given text.
	     *
	     * @param text
	     *            the text content to analyze
	     * @return the most likely concepts related to the text
	     * @throws ClassifierException
	     */
	    List<TopicSuggestion> suggestTopics(String text) throws ClassifierException;

	    /**
	     * @param broadConceptId
	     *            the id of the broader concept
	     * @return the set of ids of concepts directly narrower than {@code broadConceptId}
	     * @throws ClassifierException
	     */
	    Set<String> getNarrowerConcepts(String broadConceptId) throws ClassifierException;

	    /**
	     * @param id
	     *            the id of the narrower concept
	     * @return the set of ids of concepts directly broader than {@code id}
	     * @throws ClassifierException
	     */
	    Set<String> getBroaderConcepts(String id) throws ClassifierException;

	    /**
	     * @return the set of ids of concepts without broader concepts.
	     * @throws ClassifierException
	     */
	    Set<String> getRootConcepts() throws ClassifierException;

	    /**
	     * @return true if the classifier model can be updated with the {@code addConcept} /
	     *         {@code removeConcept} / {@code updateModel} methods.
	     */
	    boolean isUpdatable();

	    /**
	     * Register a topic and set its ancestors in the taxonomy. Warning: re-adding an already existing topic
	     * can delete the underlying statistical model. Calling {@code updateModel} is necessary to rebuild the
	     * statistical model based on the hierarchical structure of the concepts and the registered training set.
	     *
	     * @param conceptUri
	     *            the new concept identifier
	     * @param primaryTopicUri
	     *            optional identifier of a resource best describing that concept.
	     * @param broaderConcepts
	     *            list of directly broader concepts in the thesaurus
	     * @throws ClassifierException
	     */
	    void addConcept(String conceptUri, String primaryTopicUri, Collection<String> broaderConcepts)
	            throws ClassifierException;

	    /**
	     * Register a topic and set its ancestors in the taxonomy. Warning: re-adding an already existing topic
	     * can delete the underlying statistical model. Calling {@code updateModel} is necessary to rebuild the
	     * statistical model based on the hierarchical structure of the concepts and the registered training set.
	     *
	     * @param conceptUri
	     *            the new concept identifier
	     * @param broaderConcepts
	     *            list of directly broader concepts in the thesaurus
	     * @throws ClassifierException
	     */
	    void addConcept(String conceptUri, Collection<String> broaderConcepts) throws ClassifierException;

	    /**
	     * Remove a topic from the thesaurus. WARNING: it is the caller's responsibility to recursively remove
	     * or update any narrower topic that might hold a reference on this topic. Once the tree is updated,
	     * {@code updateModel} should be called to re-align the statistical model to match the new hierarchy by
	     * drawing examples from the dataset.
	     *
	     * @param conceptUri
	     *            id of the topic to remove from the model, must not be null
	     * @throws ClassifierException
	     */
	    void removeConcept(String conceptUri) throws ClassifierException;

	    /**
	     * Remove all the concepts from the current model, leaving an empty model.
	     *
	     * @throws ClassifierException
	     */
	    void removeAllConcepts() throws ClassifierException;

	    /**
	     * @return the training set registered for this classifier (either set explicitly using setTrainingSet or
	     *         configured through OSGi properties).
	     */
	    TrainingSet getTrainingSet();

	    /**
	     * Register a training set to use to build the statistical model of the classifier.
	     *
	     * @param trainingSet
	     *            the training set to register
	     */
	    void setTrainingSet(TrainingSet trainingSet);

	    /**
	     * Update (incrementally or from scratch) the statistical model of the classifier. Note: depending on the
	     * size of the dataset and the number of concepts to update, this process can take a long time and should
	     * probably be wrapped in a dedicated thread if called by the user interface layer.
	     *
	     * @param incremental
	     *            {@code true} to update incrementally, {@code false} to update from scratch
	     * @return the number of updated concepts
	     * @throws TrainingSetException
	     * @throws ClassifierException
	     */
	    int updateModel(boolean incremental) throws TrainingSetException, ClassifierException;

	    /**
	     * Perform k-fold cross validation of the model to compute estimates of the precision, recall and f1
	     * score.
	     *
	     * @param incremental
	     *            presumably the same incremental / from-scratch switch as {@code updateModel} - TODO confirm
	     * @return number of updated concepts
	     * @throws ClassifierException
	     * @throws TrainingSetException
	     */
	    int updatePerformanceEstimates(boolean incremental) throws ClassifierException, TrainingSetException;

	    /**
	     * Tell the classifier which slice of data to keep aside while training for model evaluation using k-folds
	     * cross validation.
	     *
	     * http://en.wikipedia.org/wiki/Cross-validation_%28statistics%29#K-fold_cross-validation
	     *
	     * @param foldIndex
	     *            the fold id used as a training set for this classifier instance.
	     * @param foldCount
	     *            the number of folds used in the cross validation process (typically 3 or 5). Set to 0 to
	     *            disable cross validation for this classifier.
	     */
	    void setCrossValidationInfo(int foldIndex, int foldCount);

	    /**
	     * Get a classification report with various accuracy metrics (precision, recall and f1-score) along with
	     * the example ids of some mistakes (false positives or false negatives).
	     *
	     * @param concept
	     *            the concept to report on (presumably its id - TODO confirm)
	     * @return the classification report for that concept
	     * @throws ClassifierException
	     */
	    ClassificationReport getPerformanceEstimates(String concept) throws ClassifierException;

	    /**
	     * Return the list of names of chains where the classifier is currently registered as an enhancement
	     * engine.
	     *
	     * @throws InvalidSyntaxException
	     */
	    List<String> getChainNames() throws InvalidSyntaxException;

	    /**
	     * Initialize the concept hierarchy of the model using the provided RDF model (e.g. a SKOS taxonomy).
	     *
	     * @param graph
	     *            the RDF model to import the concepts from
	     * @param conceptClass
	     *            presumably the RDF type of the resources to import as concepts - TODO confirm
	     * @param broaderProperty
	     *            presumably the property linking a concept to its broader concepts - TODO confirm
	     * @return the number of concepts successfully imported (including roots).
	     * @throws ClassifierException
	     */
	    int importConceptsFromGraph(ImmutableGraph graph, IRI conceptClass, IRI broaderProperty)
	            throws ClassifierException;
	}