blob: 6295db88ff8f59df19e4fd82981cdf1fa14876d5 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.topic.api;
import org.apache.clerezza.commons.rdf.ImmutableGraph;
import org.apache.clerezza.commons.rdf.IRI;
import org.apache.stanbol.enhancer.topic.api.training.TrainingSet;
import org.apache.stanbol.enhancer.topic.api.training.TrainingSetException;
import org.osgi.framework.InvalidSyntaxException;
import java.util.Collection;
import java.util.List;
import java.util.Set;
/**
 * Service interface for suggesting hierarchical concepts from a specific scheme (a.k.a. taxonomy, thesaurus
 * or concepts hierarchy) from the text content of a document or part of a document.
 * <p>
 * Besides classification, the interface exposes the concept hierarchy (broader / narrower / root concepts),
 * optional model maintenance operations (see {@link #isUpdatable()}) and cross validation based performance
 * estimates.
 */
public interface TopicClassifier {

    /**
     * The short name of this classifier. Can be used as the URI component to identify it in the Web
     * management interface or in RDF descriptions of the service.
     * <p>
     * NOTE(review): the previous wording described this as the name of "the training set"; presumably it
     * is the name of the classifier itself since it is declared on the classifier - confirm.
     *
     * @return the short name
     */
    String getName();

    /**
     * @return list of language codes for text that can be automatically classified by the service.
     */
    List<String> getAcceptedLanguages();

    /**
     * Perform automated text categorization based on statistical occurrences of words in the given text.
     *
     * @param text
     *            the text content to analyze
     * @return the most likely concepts related to the text
     * @throws ClassifierException
     *             if the classification fails
     */
    List<TopicSuggestion> suggestTopics(String text) throws ClassifierException;

    /**
     * @param broadConceptId
     *            the id of the broader concept
     * @return the set of ids of concepts directly narrower than {@code broadConceptId}
     * @throws ClassifierException
     *             if the lookup fails
     */
    Set<String> getNarrowerConcepts(String broadConceptId) throws ClassifierException;

    /**
     * @param id
     *            the id of the narrower concept
     * @return the set of ids of concepts directly broader than {@code id}
     * @throws ClassifierException
     *             if the lookup fails
     */
    Set<String> getBroaderConcepts(String id) throws ClassifierException;

    /**
     * @return the set of ids of concepts without broader concepts.
     * @throws ClassifierException
     *             if the lookup fails
     */
    Set<String> getRootConcepts() throws ClassifierException;

    /**
     * @return true if the classifier model can be updated with the {@code addConcept} /
     *         {@code removeConcept} / {@code updateModel} methods.
     */
    boolean isUpdatable();

    /**
     * Register a topic and set its ancestors in the taxonomy. Warning: re-adding an already existing topic
     * can delete the underlying statistical model. Calling {@code updateModel} is necessary to rebuild the
     * statistical model based on the hierarchical structure of the concepts and the registered training set.
     *
     * @param conceptUri
     *            the new concept identifier
     * @param primaryTopicUri
     *            optional identifier of a resource best describing that concept.
     * @param broaderConcepts
     *            list of directly broader concepts in the thesaurus
     * @throws ClassifierException
     *             if the concept cannot be registered
     */
    void addConcept(String conceptUri, String primaryTopicUri, Collection<String> broaderConcepts) throws ClassifierException;

    /**
     * Register a topic and set its ancestors in the taxonomy, without providing a primary topic resource.
     * Warning: re-adding an already existing topic can delete the underlying statistical model. Calling
     * {@code updateModel} is necessary to rebuild the statistical model based on the hierarchical structure
     * of the concepts and the registered training set.
     *
     * @param conceptUri
     *            the new concept identifier
     * @param broaderConcepts
     *            list of directly broader concepts in the thesaurus
     * @throws ClassifierException
     *             if the concept cannot be registered
     */
    void addConcept(String conceptUri, Collection<String> broaderConcepts) throws ClassifierException;

    /**
     * Remove a topic from the thesaurus. WARNING: it is the caller's responsibility to recursively remove or
     * update any narrower topic that might hold a reference on this topic. Once the tree is updated,
     * {@code updateModel} should be called to re-align the statistical model to match the new hierarchy by
     * drawing examples from the dataset.
     *
     * @param conceptUri
     *            id of the topic to remove from the model, must not be null
     * @throws ClassifierException
     *             if the concept cannot be removed
     */
    void removeConcept(String conceptUri) throws ClassifierException;

    /**
     * Remove all the concepts from the current model, leaving an empty model.
     *
     * @throws ClassifierException
     *             if the concepts cannot be removed
     */
    void removeAllConcepts() throws ClassifierException;

    /**
     * @return the training set registered for this classifier (either set explicitly using
     *         {@code setTrainingSet} or configured through OSGi properties).
     */
    TrainingSet getTrainingSet();

    /**
     * Register a training set to use to build the statistical model of the classifier.
     *
     * @param trainingSet
     *            the training set to register
     */
    void setTrainingSet(TrainingSet trainingSet);

    /**
     * Update (incrementally or from scratch) the statistical model of the classifier. Note: depending on the
     * size of the dataset and the number of concepts to update, this process can take a long time and should
     * probably be wrapped in a dedicated thread if called by the user interface layer.
     *
     * @param incremental
     *            {@code true} to update the model incrementally, {@code false} to rebuild it from scratch
     * @return the number of updated concepts
     * @throws TrainingSetException
     *             if the training set reports a failure
     * @throws ClassifierException
     *             if the model update fails
     */
    int updateModel(boolean incremental) throws TrainingSetException, ClassifierException;

    /**
     * Perform k-fold cross validation of the model to compute estimates of the precision, recall and f1
     * score.
     *
     * @param incremental
     *            whether to update the estimates incrementally (presumably the same meaning as for
     *            {@code updateModel} - confirm against the implementation)
     * @return number of updated concepts
     * @throws ClassifierException
     *             if the evaluation fails
     * @throws TrainingSetException
     *             if the training set reports a failure
     */
    int updatePerformanceEstimates(boolean incremental) throws ClassifierException, TrainingSetException;

    /**
     * Tell the classifier which slice of data to keep aside while training for model evaluation using k-folds
     * cross validation.
     * <p>
     * See http://en.wikipedia.org/wiki/Cross-validation_%28statistics%29#K-fold_cross-validation
     *
     * @param foldIndex
     *            the fold id used as a training set for this classifier instance.
     * @param foldCount
     *            the number of folds used in the cross validation process (typically 3 or 5). Set to 0 to
     *            disable cross validation for this classifier.
     */
    void setCrossValidationInfo(int foldIndex, int foldCount);

    /**
     * Get a classification report with various accuracy metrics (precision, recall and f1-score) along with
     * the example ids of some mistakes (false positives or false negatives).
     *
     * @param concept
     *            the id of the concept to report on
     * @return the classification report for that concept
     * @throws ClassifierException
     *             if the report cannot be retrieved
     */
    ClassificationReport getPerformanceEstimates(String concept) throws ClassifierException;

    /**
     * Return the list of names of chains where the classifier is currently registered as an enhancement
     * engine.
     *
     * @return the names of the chains referencing this classifier
     * @throws InvalidSyntaxException
     *             presumably raised by the underlying OSGi lookup when a filter expression is invalid -
     *             confirm against the implementation
     */
    List<String> getChainNames() throws InvalidSyntaxException;

    /**
     * Initialize the concept hierarchy of the model using the provided RDF model (e.g. a SKOS taxonomy).
     *
     * @param graph
     *            the RDF model holding the concept hierarchy
     * @param conceptClass
     *            presumably the RDF class identifying the resources to import as concepts - confirm
     * @param broaderProperty
     *            presumably the RDF property linking a concept to its directly broader concepts - confirm
     * @return the number of concepts successfully imported (including roots).
     * @throws ClassifierException
     *             if the import fails
     */
    int importConceptsFromGraph(ImmutableGraph graph, IRI conceptClass, IRI broaderProperty) throws ClassifierException;
}