enhancement-engines/topic/engine/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java - stanbol - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.stanbol.enhancer.engine.topic;

 import java.io.File;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.Date;
 import java.util.Dictionary;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Hashtable;
 import java.util.Iterator;
 import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
 import java.util.Set;
 import java.util.UUID;

 import org.apache.clerezza.commons.rdf.ImmutableGraph;
 import org.apache.clerezza.rdf.core.LiteralFactory;
 import org.apache.clerezza.commons.rdf.Graph;
 import org.apache.clerezza.commons.rdf.RDFTerm;
 import org.apache.clerezza.commons.rdf.Triple;
 import org.apache.clerezza.commons.rdf.IRI;
 import org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl;
 import org.apache.clerezza.commons.rdf.impl.utils.TripleImpl;
 import org.apache.clerezza.rdf.utils.GraphNode;
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.lang.StringUtils;
 import org.apache.felix.scr.annotations.Activate;
 import org.apache.felix.scr.annotations.Component;
 import org.apache.felix.scr.annotations.ConfigurationPolicy;
 import org.apache.felix.scr.annotations.Deactivate;
 import org.apache.felix.scr.annotations.Properties;
 import org.apache.felix.scr.annotations.Property;
 import org.apache.felix.scr.annotations.Reference;
 import org.apache.felix.scr.annotations.ReferenceCardinality;
 import org.apache.felix.scr.annotations.ReferencePolicy;
 import org.apache.felix.scr.annotations.ReferenceStrategy;
 import org.apache.felix.scr.annotations.Service;
 import org.apache.solr.client.solrj.SolrQuery;
 import org.apache.solr.client.solrj.SolrServer;
 import org.apache.solr.client.solrj.SolrServerException;
 import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
 import org.apache.solr.client.solrj.request.UpdateRequest;
 import org.apache.solr.client.solrj.response.QueryResponse;
 import org.apache.solr.client.solrj.util.ClientUtils;
 import org.apache.solr.common.SolrDocument;
 import org.apache.solr.common.SolrDocumentList;
 import org.apache.solr.common.SolrInputDocument;
 import org.apache.solr.common.params.CommonParams;
 import org.apache.solr.common.params.MoreLikeThisParams;
 import org.apache.stanbol.commons.solr.managed.ManagedSolrServer;
 import org.apache.stanbol.commons.solr.utils.StreamQueryRequest;
 import org.apache.stanbol.enhancer.servicesapi.Blob;
 import org.apache.stanbol.enhancer.servicesapi.Chain;
 import org.apache.stanbol.enhancer.servicesapi.ChainException;
 import org.apache.stanbol.enhancer.servicesapi.ContentItem;
 import org.apache.stanbol.enhancer.servicesapi.EngineException;
 import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
 import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
 import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
 import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
 import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
 import org.apache.stanbol.enhancer.servicesapi.rdf.NamespaceEnum;
 import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses;
 import org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses;
 import org.apache.stanbol.enhancer.topic.ConfiguredSolrCoreTracker;
 import org.apache.stanbol.enhancer.topic.EmbeddedSolrHelper;
 import org.apache.stanbol.enhancer.topic.UTCTimeStamper;
 import org.apache.stanbol.enhancer.topic.api.Batch;
 import org.apache.stanbol.enhancer.topic.api.BatchProcessor;
 import org.apache.stanbol.enhancer.topic.api.ClassificationReport;
 import org.apache.stanbol.enhancer.topic.api.ClassifierException;
 import org.apache.stanbol.enhancer.topic.api.TopicClassifier;
 import org.apache.stanbol.enhancer.topic.api.TopicSuggestion;
 import org.apache.stanbol.enhancer.topic.api.training.Example;
 import org.apache.stanbol.enhancer.topic.api.training.TrainingSet;
 import org.apache.stanbol.enhancer.topic.api.training.TrainingSetException;
 import org.apache.stanbol.enhancer.topic.training.SolrTrainingSet;
 import org.apache.stanbol.entityhub.servicesapi.Entityhub;
 import org.apache.stanbol.entityhub.servicesapi.EntityhubException;
 import org.apache.stanbol.entityhub.servicesapi.model.Entity;
 import org.apache.stanbol.entityhub.servicesapi.model.Representation;
 import org.apache.stanbol.entityhub.servicesapi.model.Text;
 import org.apache.stanbol.entityhub.servicesapi.site.SiteManager;
 import org.osgi.framework.BundleContext;
 import org.osgi.framework.Constants;
 import org.osgi.framework.InvalidSyntaxException;
 import org.osgi.framework.ServiceReference;
 import org.osgi.service.cm.ConfigurationException;
 import org.osgi.service.component.ComponentContext;
 import org.osgi.util.tracker.ServiceTracker;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 /**
  * Enhancement Engine that provides the ability to assign a text document to a set of concepts indexed in a
  * dedicated Solr core. The assignment logic comes from terms frequencies match of the text of the document to
  * categorize with the text indexed for each concept.
  *
  * The data model of the concept tree follows the SKOS model: concepts are organized in a hierarchical
  * "scheme" with a "broader" relation (and the inferred "narrower" inverse relation). Concepts can also
  * optionally be grounded in the real world by the mean of a foaf:primaryTopic link to an external resource
  * such as a DBpedia entry.
  *
  * A document is typically classified with the concept by using the dct:subject property to link the document
  * (subject) to the concept (object).
  *
  * The Solr server is expected to be configured with the MoreLikeThisHandler and the matching fields from the
  * engine configuration.
  *
  * This text classifier method sometimes goes by the name of "Rocchio classification" or "Nearest Centroid
  * classification" in the IR and machine learning literature. It is often slightly less accurate than fitting
  * penalized linear model such as linear kernel Support Vector Machines or penalized Logistic Regression but
  * has the advantage to scale to large number of categories (e.g. more that tens of thousands) without having
  * to load the full statistical model in memory thanks to the use of the inverted index datastructure that
  * also provides the feature extraction and TF-IDF weighting for free.
  *
  * Furthermore it could be refined by using a "Learning to Rank" approach by training a RankSVM, Gradient
  * Boosted Trees or Random Forests on the output on the raw output of the Rocchio classifier so as to re-rank
  * candidate classes more finely. The Learning to Rank refinement is not implemented yet.
  */
 @Component(metatype = true, immediate = true, configurationFactory = true, policy = ConfigurationPolicy.REQUIRE)
 @Service
 @Properties(value = {
         @Property(name = EnhancementEngine.PROPERTY_NAME),
         @Property(name = TopicClassificationEngine.SOLR_CORE),
         @Property(name = TopicClassificationEngine.SOLR_CORE_CONFIG,
             value = TopicClassificationEngine.DEFAULT_SOLR_CORE_CONFIG),
         @Property(name = TopicClassificationEngine.LANGUAGES),
      // those properties can still be set via a configuration file, but as most users
      // will not use them exclude those from the configuration form
 //             @Property(name = TopicClassificationEngine.SIMILARTITY_FIELD, value = TopicClassificationEngine.DEFAULT_SIMILARTITY_FIELD),
 //             @Property(name = TopicClassificationEngine.CONCEPT_URI_FIELD, value = TopicClassificationEngine.DEFAULT_CONCEPT_URI_FIELD),
 //             @Property(name = TopicClassificationEngine.PRIMARY_TOPIC_URI_FIELD, value = TopicClassificationEngine.DEFAULT_PRIMARY_TOPIC_URI_FIELD),
 //             @Property(name = TopicClassificationEngine.BROADER_FIELD, value = TopicClassificationEngine.DEFAULT_BROADER_FIELD),
 //             @Property(name = TopicClassificationEngine.MODEL_UPDATE_DATE_FIELD, value = TopicClassificationEngine.DEFAULT_MODEL_UPDATE_DATE_FIELD),
 //             @Property(name = TopicClassificationEngine.PRECISION_FIELD, value = TopicClassificationEngine.DEFAULT_PRECISION_FIELD),
 //             @Property(name = TopicClassificationEngine.RECALL_FIELD, value = TopicClassificationEngine.DEFAULT_RECALL_FIELD),
 //             @Property(name = TopicClassificationEngine.ENTRY_ID_FIELD, value = TopicClassificationEngine.DEFAULT_ENTRY_ID_FIELD),
 //             @Property(name = TopicClassificationEngine.MODEL_ENTRY_ID_FIELD, value = TopicClassificationEngine.DEFAULT_MODEL_ENTRY_ID_FIELD),
 //             @Property(name = TopicClassificationEngine.ENTRY_TYPE_FIELD, value = TopicClassificationEngine.DEFAULT_ENTRY_TYPE_FIELD),
 //             @Property(name = TopicClassificationEngine.MODEL_EVALUATION_DATE_FIELD, value = TopicClassificationEngine.DEFAULT_MODEL_EVALUATION_DATE_FIELD),
 //             @Property(name = TopicClassificationEngine.FALSE_NEGATIVES_FIELD, value = TopicClassificationEngine.DEFAULT_FALSE_NEGATIVES_FIELD),
 //             @Property(name = TopicClassificationEngine.FALSE_POSITIVES_FIELD, value = TopicClassificationEngine.DEFAULT_FALSE_POSITIVES_FIELD),
 //             @Property(name = TopicClassificationEngine.POSITIVE_SUPPORT_FIELD, value = TopicClassificationEngine.DEFAULT_POSITIVE_SUPPORT_FIELD),
 //             @Property(name = TopicClassificationEngine.NEGATIVE_SUPPORT_FIELD, value = TopicClassificationEngine.DEFAULT_NEGATIVE_SUPPORT_FIELD),
 //             @Property(name = TopicClassificationEngine.ORDER, intValue = TopicClassificationEngine.DEFAULT_ENGINE_ORDER),
         @Property(name = TopicClassificationEngine.TRAINING_SET_ID),
         @Property(name = Constants.SERVICE_RANKING, intValue = 0)
 })
 public class TopicClassificationEngine extends ConfiguredSolrCoreTracker implements EnhancementEngine,
         ServiceProperties, TopicClassifier {

     public static final String DEFAULT_SOLR_CORE_CONFIG = "default-topic-model.solrindex.zip";

     public static final String MODEL_ENTRY = "model";

     public static final String METADATA_ENTRY = "metadata";
     /**
      * The reference to the SolrCore used ny the TopicClassificationEngine.
      * The default is the engine name with the suffix '-model'. This also supports
      * the {server-name}:{index-name} syntax. if n
      */
     public static final String SOLR_CORE = "org.apache.stanbol.enhancer.engine.topic.solrCore";
     /**
      * The name of the Solr Index archive (default: "default-topic-model.solrindex.zip").
      * The file is loaded by using the DataFileProvider infrastructure. The archive may
      * also include a pre-trained model.
      */
     public static final String SOLR_CORE_CONFIG = "org.apache.stanbol.enhancer.engine.topic.solrCoreConfig";

     public static final String LANGUAGES = "org.apache.stanbol.enhancer.engine.topic.languages";

     public static final String ORDER = "org.apache.stanbol.enhancer.engine.topic.order";

     public static final Integer DEFAULT_ENGINE_ORDER = ServiceProperties.ORDERING_CONTENT_EXTRACTION;

     public static final String ENTRY_ID_FIELD = "org.apache.stanbol.enhancer.engine.topic.entryIdField";

     public static final String DEFAULT_ENTRY_ID_FIELD = "entry_id";

     public static final String ENTRY_TYPE_FIELD = "org.apache.stanbol.enhancer.engine.topic.entryTypeField";

     public static final String DEFAULT_ENTRY_TYPE_FIELD = "entry_type";

     public static final String SIMILARTITY_FIELD = "org.apache.stanbol.enhancer.engine.topic.similarityField";

     public static final String DEFAULT_SIMILARTITY_FIELD = "classifier_features";

     public static final String CONCEPT_URI_FIELD = "org.apache.stanbol.enhancer.engine.topic.conceptUriField";

     public static final String DEFAULT_CONCEPT_URI_FIELD = "concept";

     public static final String BROADER_FIELD = "org.apache.stanbol.enhancer.engine.topic.broaderField";

     public static final String DEFAULT_BROADER_FIELD = "broader";

     public static final String PRIMARY_TOPIC_URI_FIELD = "org.apache.stanbol.enhancer.engine.topic.primaryTopicField";

     public static final String DEFAULT_PRIMARY_TOPIC_URI_FIELD = "primary_topic";

     public static final String MODEL_UPDATE_DATE_FIELD = "org.apache.stanbol.enhancer.engine.topic.modelUpdateDateField";

     public static final String DEFAULT_MODEL_UPDATE_DATE_FIELD = "last_update_dt";

     public static final String MODEL_EVALUATION_DATE_FIELD = "org.apache.stanbol.enhancer.engine.topic.modelEvaluationDateField";

     public static final String DEFAULT_MODEL_EVALUATION_DATE_FIELD = "last_evaluation_dt";

     public static final String MODEL_ENTRY_ID_FIELD = "org.apache.stanbol.enhancer.engine.topic.modelEntryIdField";

     public static final String DEFAULT_MODEL_ENTRY_ID_FIELD = "model_entry_id";

     public static final String PRECISION_FIELD = "org.apache.stanbol.enhancer.engine.topic.precisionField";

     public static final String DEFAULT_PRECISION_FIELD = "precision";

     public static final String RECALL_FIELD = "org.apache.stanbol.enhancer.engine.topic.recallField";

     public static final String DEFAULT_RECALL_FIELD = "recall";

     public static final String FALSE_POSITIVES_FIELD = "org.apache.stanbol.enhancer.engine.topic.falsePositivesField";

     public static final String DEFAULT_FALSE_POSITIVES_FIELD = "false_positives";

     public static final String FALSE_NEGATIVES_FIELD = "org.apache.stanbol.enhancer.engine.topic.falseNegativesField";

     public static final String DEFAULT_FALSE_NEGATIVES_FIELD = "false_negatives";

     public static final String POSITIVE_SUPPORT_FIELD = "org.apache.stanbol.enhancer.engine.topic.positiveSupportField";

     public static final String DEFAULT_POSITIVE_SUPPORT_FIELD = "positive_support";

     public static final String NEGATIVE_SUPPORT_FIELD = "org.apache.stanbol.enhancer.engine.topic.negativeSupportField";

     public static final String DEFAULT_NEGATIVE_SUPPORT_FIELD = "negative_support";

     public static final String TRAINING_SET_ID = "org.apache.stanbol.enhancer.engine.topic.trainingSetId";

     private static final Logger log = LoggerFactory.getLogger(TopicClassificationEngine.class);

     /**
      * The "text/plain" mime type
      */
     public static final String PLAIN_TEXT_MIMETYPE = "text/plain";

     /**
      * Contains the only supported mime type {@link #PLAIN_TEXT_MIMETYPE}
      */
     public static final Set<String> SUPPORTED_MIMETYPES = Collections.singleton(PLAIN_TEXT_MIMETYPE);

     public static final String SOLR_NON_EMPTY_FIELD = "[\"\" TO *]";

     @Reference
     protected Entityhub entityhub;

     @Reference
     protected SiteManager referencedSiteManager;

     // TODO: make the following fields configurable

     private int MAX_COLLECTED_EXAMPLES = 1000;

     // Limit the evaluation time by computing the performance estimates on a bounded random selection of
     // labeled examples in the training set.
     public int MAX_EVALUATION_SAMPLES = 500;

     // Do not try to compute performance estimates if there is not at least a minimum number of example
     // documents for this concept.
     public int MIN_EVALUATION_SAMPLES = 10;

     public int MAX_CHARS_PER_TOPIC = 100000;

     public Integer MAX_ROOTS = 1000;

     public int MAX_SUGGESTIONS = 5; // never suggest more than this: this is expected to be a reasonable
                                     // estimate of the number of topics occurring in each documents

     protected String engineName;

     protected List<String> acceptedLanguages;
     private Set<String> acceptedLanguageSet;

     protected Integer order = ORDERING_EXTRACTION_ENHANCEMENT;

     protected String similarityField;

     protected String conceptUriField;

     protected String broaderField;

     protected String primaryTopicUriField;

     protected String modelUpdateDateField;

     protected String modelEvaluationDateField;

     protected String precisionField;

     protected String recallField;

     // without OSGi
     protected TrainingSet trainingSet;

     // with OSGi
     protected ServiceTracker trainingSetTracker;

     protected String trainingSetId;

     // the ENTRY_*_FIELD are basically a hack to use a single Solr core to make documents with partially
     // updateable stored fields: the logical document is splitted into two parts joined by entryIdField. The
     // first part has entryTypeField field with value METADATA_ENTRY and the second half has entryTypeField
     // with value MODEL_ENTRY.
     // The logical primary key stays the topic id.
     protected String entryIdField;

     protected String entryTypeField;

     protected String modelEntryIdField;

     protected String positiveSupportField;

     protected String negativeSupportField;

     protected String falsePositivesField;

     protected String falseNegativesField;

     // customize the behavior of the classifier instance for model evaluation
     protected int cvFoldIndex = 0;

     protected int cvFoldCount = 0;

     protected boolean evaluationRunning = false;

     @Reference(cardinality = ReferenceCardinality.OPTIONAL_UNARY, bind = "bindManagedSolrServer", unbind = "unbindManagedSolrServer", strategy = ReferenceStrategy.EVENT, policy = ReferencePolicy.DYNAMIC)
     protected ManagedSolrServer managedSolrServerDummy; // trick to call the super class binders

     /**
      * Only used for testing outside an OSGI environment (see STANBOL-811:
      * the previously used {@link File#createTempFile(String, String)} does not
      * work on some Windows versions.
      */
     private File embeddedSolrServerDir;

     /**
      * Embedded Solr server used for {@link #performCVFold(File, int, int, int, boolean)}
      * if no ManagedSolrServer is present (e.g. when running outside of OSGI).
      * Lazily initialised relative to {@link #embeddedSolrServerDir} on the first
      * call to performCVFold
      */
     private EmbeddedSolrServer __evaluationServer;

     private File __evaluationServerDir;

     void configureEmbeddedSolrServerDir(File directory){
         embeddedSolrServerDir = directory;
     }

     @Activate
     protected void activate(ComponentContext context) throws ConfigurationException, InvalidSyntaxException {
         @SuppressWarnings("unchecked")
         Dictionary<String,Object> config = context.getProperties();
         activate(context, config);
     }

     protected void activate(ComponentContext context, Dictionary<String,Object> config) throws ConfigurationException,
                                                                                        InvalidSyntaxException {
         this.context = context;
         configure(config);

         // if training set is not null, track it
         if (trainingSetId != null) {
             String filter = String.format("(&(%s=%s)(%s=%s))", Constants.OBJECTCLASS,
                 TrainingSet.class.getName(), SolrTrainingSet.TRAINING_SET_NAME, trainingSetId);
             trainingSetTracker = new ServiceTracker(context.getBundleContext(), context.getBundleContext()
                     .createFilter(filter), null);
             trainingSetTracker.open();
         }
         // TODO if training set is null, make it possible to programmatically create a SolrTrainingSet
         // instance using the same managed solr server and register it under the same name as the engine
         // it-self.
     }

     @Deactivate
     public void deactivate(ComponentContext context) {
         if (indexTracker != null) {
             indexTracker.close();
         }
         if (trainingSetTracker != null) {
             trainingSetTracker.close();
         }
         //shutdown the evaluation server and delete the data of the evaluation server
         if(__evaluationServer != null){
             try {
                 __evaluationServer.getCoreContainer().shutdown();
             }catch (Exception e){
               //ignore
             } finally {
                 FileUtils.deleteQuietly(__evaluationServerDir);
             }
         }
         context = null;
     }

     public void configure(Dictionary<String,Object> config) throws ConfigurationException {
         engineName = getRequiredStringParam(config, EnhancementEngine.PROPERTY_NAME);
         entryIdField = getRequiredStringParam(config, ENTRY_ID_FIELD, DEFAULT_ENTRY_ID_FIELD);
         modelEntryIdField = getRequiredStringParam(config, MODEL_ENTRY_ID_FIELD, DEFAULT_MODEL_ENTRY_ID_FIELD);
         conceptUriField = getRequiredStringParam(config, CONCEPT_URI_FIELD, DEFAULT_CONCEPT_URI_FIELD);
         entryTypeField = getRequiredStringParam(config, ENTRY_TYPE_FIELD, DEFAULT_ENTRY_TYPE_FIELD);
         similarityField = getRequiredStringParam(config, SIMILARTITY_FIELD, DEFAULT_SIMILARTITY_FIELD);
         acceptedLanguages = getStringListParan(config, LANGUAGES);
         acceptedLanguageSet = new HashSet<String>(acceptedLanguages);
         precisionField = getRequiredStringParam(config, PRECISION_FIELD, DEFAULT_PRECISION_FIELD);
         recallField = getRequiredStringParam(config, RECALL_FIELD, DEFAULT_RECALL_FIELD);
         modelUpdateDateField = getRequiredStringParam(config, MODEL_UPDATE_DATE_FIELD, DEFAULT_MODEL_UPDATE_DATE_FIELD);
         modelEvaluationDateField = getRequiredStringParam(config, MODEL_EVALUATION_DATE_FIELD, DEFAULT_MODEL_EVALUATION_DATE_FIELD);
         falsePositivesField = getRequiredStringParam(config, FALSE_POSITIVES_FIELD, DEFAULT_FALSE_POSITIVES_FIELD);
         falseNegativesField = getRequiredStringParam(config, FALSE_NEGATIVES_FIELD, DEFAULT_FALSE_NEGATIVES_FIELD);
         positiveSupportField = getRequiredStringParam(config, POSITIVE_SUPPORT_FIELD, DEFAULT_POSITIVE_SUPPORT_FIELD);
         negativeSupportField = getRequiredStringParam(config, NEGATIVE_SUPPORT_FIELD, DEFAULT_NEGATIVE_SUPPORT_FIELD);
         configureSolrCore(config, SOLR_CORE, engineName + "-model",SOLR_CORE_CONFIG);

         // optional fields, can be null
         broaderField = getRequiredStringParam(config, BROADER_FIELD, DEFAULT_BROADER_FIELD);
         primaryTopicUriField = getRequiredStringParam(config, PRIMARY_TOPIC_URI_FIELD, DEFAULT_PRIMARY_TOPIC_URI_FIELD);
         trainingSetId = (String) config.get(TRAINING_SET_ID);
         Object orderParamValue = config.get(ORDER);
         if (orderParamValue instanceof Number) {
             order = ((Number) orderParamValue).intValue();
         } else if(orderParamValue != null){
             try {
                 Integer.parseInt(orderParamValue.toString());
             }catch (NumberFormatException e) {
                 throw new ConfigurationException(ORDER, "The configured EnhancementEngine "
                     + "order MUST BE an Intever value!",e);
             }
         } else {
             order = DEFAULT_ENGINE_ORDER;
         }
     }

     @Override
     public int canEnhance(ContentItem ci) throws EngineException {
         if (ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES) != null && getActiveSolrServer() != null) {
             String language = EnhancementEngineHelper.getLanguage(ci);
             if(acceptedLanguageSet.isEmpty() || acceptedLanguageSet.contains(language) ||
                     acceptedLanguageSet.contains("")){
                 return ENHANCE_SYNCHRONOUS;
             } else {
                 return CANNOT_ENHANCE;
             }
         } else {
             return CANNOT_ENHANCE;
         }
     }

     @Override
     public void computeEnhancements(ContentItem ci) throws EngineException {
         Entry<IRI,Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
         if (contentPart == null) {
             throw new IllegalStateException(
                     "No ContentPart with a supported Mime Type" + "found for ContentItem " + ci.getUri()
                             + "(supported: '" + SUPPORTED_MIMETYPES
                             + "') -> this indicates that canEnhance was"
                             + "NOT called and indicates a bug in the used EnhancementJobManager!");
         }
         String language = EnhancementEngineHelper.getLanguage(ci);
         if(!(acceptedLanguageSet.isEmpty() || acceptedLanguageSet.contains(language) ||
                 acceptedLanguageSet.contains(""))){
             throw new IllegalStateException("The language '"+language+"' of the ContentItem is not configured as "
                 +" active for this Engine (active: "+acceptedLanguageSet+").");
         }
         String text;
         try {
             text = ContentItemHelper.getText(contentPart.getValue());
         } catch (IOException e) {
             throw new InvalidContentException(String.format(
                 "Unable to extract " + " textual content from ContentPart %s of ContentItem %s!",
                 contentPart.getKey(), ci.getUri()), e);
         }
         if (text.trim().isEmpty()) {
             log.warn(
                 "ContentPart {} of ContentItem {} does not contain any " + "text to extract topics from",
                 contentPart.getKey(), ci.getUri());
             return;
         }
         Graph metadata = ci.getMetadata();
         List<TopicSuggestion> topics;
         try {
             topics = suggestTopics(text);
             if (topics.isEmpty()) {
                 return;
             }
         } catch (ClassifierException e) {
             throw new EngineException(e);
         }
         IRI precision = new IRI(NamespaceEnum.fise + "classifier/precision");
         IRI recall = new IRI(NamespaceEnum.fise + "classifier/recall");
         IRI f1 = new IRI(NamespaceEnum.fise + "classifier/f1");

         LiteralFactory lf = LiteralFactory.getInstance();
         ci.getLock().writeLock().lock();
         try {
             // Global text annotation to attach all the topic annotation to it.
             IRI textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
             metadata.add(new TripleImpl(textAnnotation,
                     org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE,
                     OntologicalClasses.SKOS_CONCEPT));
             for (TopicSuggestion topic : topics) {
                 IRI enhancement = EnhancementEngineHelper.createEntityEnhancement(ci, this);
                 metadata.add(new TripleImpl(enhancement,
                         org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE,
                         TechnicalClasses.ENHANCER_TOPICANNOTATION));
                 metadata.add(new TripleImpl(enhancement,
                         org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION, textAnnotation));

                 // add link to entity
                 metadata.add(new TripleImpl(enhancement,
                         org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE,
                         new IRI(topic.conceptUri)));
                 metadata.add(new TripleImpl(enhancement,
                         org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_TYPE,
                         OntologicalClasses.SKOS_CONCEPT));

                 // add confidence information
                 metadata.add(new TripleImpl(enhancement,
                         org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE, lf
                                 .createTypedLiteral(Double.valueOf(topic.score))));

                 // add performance estimates of the classifier if available
                 ClassificationReport perf = getPerformanceEstimates(topic.conceptUri);
                 if (perf.uptodate) {
                     metadata.add(new TripleImpl(enhancement, precision, lf.createTypedLiteral(Double
                             .valueOf(perf.precision))));
                     metadata.add(new TripleImpl(enhancement, recall, lf.createTypedLiteral(Double
                             .valueOf(perf.recall))));
                     metadata.add(new TripleImpl(enhancement, f1, lf.createTypedLiteral(Double
                             .valueOf(perf.f1))));
                 }
                 // fetch concept label from the entityhub or a referenced site if available
                 Entity entity = entityhub.getEntity(topic.conceptUri);
                 if (entity == null) {
                     entity = referencedSiteManager.getEntity(topic.conceptUri);
                 }
                 if (entity != null) {
                     Representation representation = entity.getRepresentation();
                     // TODO: extract all languages based on some configuration instead of hardcoding English
                     Text label = representation.getFirst(NamespaceEnum.skos + "prefLabel", "en", "en-US",
                         "en-GB");
                     if (label == null) {
                         label = representation.getFirst(NamespaceEnum.rdfs + "label", "en", "en-US", "en-GB");
                     }
                     if (label != null) {
                         metadata.add(new TripleImpl(enhancement,
                                 org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_LABEL,
                                 new PlainLiteralImpl(label.getText())));
                     }
                 }
             }
         } catch (ClassifierException e) {
             throw new EngineException(e);
         } catch (IllegalArgumentException e) {
             throw new EngineException(e);
         } catch (EntityhubException e) {
             throw new EngineException(e);
         } finally {
             ci.getLock().writeLock().unlock();
         }
     }

     @Override
     public Map<String,Object> getServiceProperties() {
         return Collections.unmodifiableMap(Collections.singletonMap(ENHANCEMENT_ENGINE_ORDERING,
             (Object) order));
     }

     public static TopicClassificationEngine fromParameters(Dictionary<String,Object> config) throws ConfigurationException {
         TopicClassificationEngine engine = new TopicClassificationEngine();
         engine.configure(config);
         return engine;
     }

     // classifier API

     @Override
     public String getName() {
         return engineName;
     }

     @Override
     public List<String> getAcceptedLanguages() {
         return acceptedLanguages;
     }

     public List<TopicSuggestion> suggestTopics(Collection<Object> contents) throws ClassifierException {
         return suggestTopics(StringUtils.join(contents, "\n\n"));
     }

     public List<TopicSuggestion> suggestTopics(String text) throws ClassifierException {
         List<TopicSuggestion> suggestedTopics = new ArrayList<TopicSuggestion>(MAX_SUGGESTIONS * 3);
         SolrServer solrServer = getActiveSolrServer();
         SolrQuery query = new SolrQuery();
         query.setRequestHandler("/" + MoreLikeThisParams.MLT);
         query.setFilterQueries(entryTypeField + ":" + MODEL_ENTRY);
         query.set(MoreLikeThisParams.MATCH_INCLUDE, false);
         query.set(MoreLikeThisParams.MIN_DOC_FREQ, 1);
         query.set(MoreLikeThisParams.MIN_TERM_FREQ, 1);
         query.set(MoreLikeThisParams.MAX_QUERY_TERMS, 30);
         query.set(MoreLikeThisParams.MAX_NUM_TOKENS_PARSED, 10000);
         // TODO: find a way to parse the interesting terms and report them
         // for debugging / explanation in dedicated RDF data structure.
         // query.set(MoreLikeThisParams.INTERESTING_TERMS, "details");
         query.set(MoreLikeThisParams.SIMILARITY_FIELDS, similarityField);
         query.set(CommonParams.STREAM_BODY, text);
         // over query the number of suggestions to find a statistical cut based on the curve of the scores of
         // the top suggestion
         query.setRows(MAX_SUGGESTIONS * 3);
         query.setFields(conceptUriField);
         query.setIncludeScore(true);
         try {
             StreamQueryRequest request = new StreamQueryRequest(query);
             QueryResponse response = request.process(solrServer);
             SolrDocumentList results = response.getResults();
             for (SolrDocument result : results.toArray(new SolrDocument[0])) {
                 String conceptUri = (String) result.getFirstValue(conceptUriField);
                 if (conceptUri == null) {
                     throw new ClassifierException(String.format(
                         "Solr Core '%s' is missing required field '%s'.", solrCoreId, conceptUriField));
                 }
                 Float score = (Float) result.getFirstValue("score");

                 // fetch metadata
                 SolrQuery metadataQuery = new SolrQuery("*:*");
                 // use filter queries to leverage the Solr cache explicitly
                 metadataQuery.addFilterQuery(entryTypeField + ":" + METADATA_ENTRY);
                 metadataQuery
                         .addFilterQuery(conceptUriField + ":" + ClientUtils.escapeQueryChars(conceptUri));
                 metadataQuery.setFields(conceptUriField, broaderField, primaryTopicUriField);
                 SolrDocument metadata = solrServer.query(metadataQuery).getResults().get(0);
                 String primaryTopicUri = (String) metadata.getFirstValue(primaryTopicUriField);
                 suggestedTopics.add(new TopicSuggestion(conceptUri, primaryTopicUri, metadata
                         .getFieldValues(broaderField), score));
             }
         } catch (SolrServerException e) {
             if ("unknown handler: /mlt".equals(e.getCause().getMessage())) {
                 String message = String.format("SolrServer with id '%s' for topic engine '%s' lacks"
                                                + " configuration for the MoreLikeThisHandler", solrCoreId,
                     engineName);
                 throw new ClassifierException(message, e);
             } else {
                 throw new ClassifierException(e);
             }
         }
         if (suggestedTopics.size() <= 1) {
             // no need to apply the cutting heuristic
             return suggestedTopics;
         }
         // filter out suggestions that are less than some threshold based on the mean of the top scores
         float mean = 0.0f;
         for (TopicSuggestion suggestion : suggestedTopics) {
             mean += suggestion.score / suggestedTopics.size();
         }
         float threshold = 0.25f * suggestedTopics.get(0).score + 0.75f * mean;
         List<TopicSuggestion> filteredSuggestions = new ArrayList<TopicSuggestion>();
         for (TopicSuggestion suggestion : suggestedTopics) {
             if (filteredSuggestions.size() >= MAX_SUGGESTIONS) {
                 return filteredSuggestions;
             }
             if (filteredSuggestions.isEmpty() || suggestion.score > threshold) {
                 filteredSuggestions.add(suggestion);
             } else {
                 break;
             }
         }
         return filteredSuggestions;
     }

     @Override
     public Set<String> getNarrowerConcepts(String broadTopicId) throws ClassifierException {
         LinkedHashSet<String> narrowerConcepts = new LinkedHashSet<String>();
         if (broaderField == null) {
             return narrowerConcepts;
         }
         SolrServer solrServer = getActiveSolrServer();
         SolrQuery query = new SolrQuery("*:*");
         query.addFilterQuery(entryTypeField + ":" + METADATA_ENTRY);
         query.addFilterQuery(broaderField + ":" + ClientUtils.escapeQueryChars(broadTopicId));
         query.addField(conceptUriField);
         query.addSortField(conceptUriField, SolrQuery.ORDER.asc);
         try {
             for (SolrDocument result : solrServer.query(query).getResults()) {
                 narrowerConcepts.add(result.getFirstValue(conceptUriField).toString());
             }
         } catch (SolrServerException e) {
             String msg = String.format("Error while fetching narrower topics of '%s' on Solr Core '%s'.",
                 broadTopicId, solrCoreId);
             throw new ClassifierException(msg, e);
         }
         return narrowerConcepts;
     }

     @Override
     public Set<String> getBroaderConcepts(String id) throws ClassifierException {
         LinkedHashSet<String> broaderConcepts = new LinkedHashSet<String>();
         if (broaderField == null) {
             return broaderConcepts;
         }
         SolrServer solrServer = getActiveSolrServer();
         SolrQuery query = new SolrQuery("*:*");
         query.addFilterQuery(conceptUriField + ":" + ClientUtils.escapeQueryChars(id));
         query.addField(broaderField);
         try {
             for (SolrDocument result : solrServer.query(query).getResults()) {
                 // there should be only one results
                 Collection<Object> broaderFieldValues = result.getFieldValues(broaderField);
                 if (broaderFieldValues == null) {
                     continue;
                 }
                 for (Object value : broaderFieldValues) {
                     broaderConcepts.add(value.toString());
                 }
             }
         } catch (SolrServerException e) {
             String msg = String.format("Error while fetching broader topics of '%s' on Solr Core '%s'.", id,
                 solrCoreId);
             throw new ClassifierException(msg, e);
         }
         return broaderConcepts;
     }

     @Override
     public Set<String> getRootConcepts() throws ClassifierException {
         LinkedHashSet<String> rootConcepts = new LinkedHashSet<String>();
         SolrServer solrServer = getActiveSolrServer();
         SolrQuery query = new SolrQuery("*:*");
         // TODO: this can be very big on flat thesauri: should we enable a paging API instead?
         query.setRows(MAX_ROOTS);
         query.setFields(conceptUriField);
         query.setSortField(conceptUriField, SolrQuery.ORDER.asc);
         query.addFilterQuery(entryTypeField + ":" + METADATA_ENTRY);
         if (broaderField != null) {
             // find any topic with an empty broaderField
             query.addFilterQuery(" -" + broaderField + ":" + SOLR_NON_EMPTY_FIELD);
         }
         try {
             QueryResponse response = solrServer.query(query);
             if (response.getResults().size() >= MAX_ROOTS) {
                 log.warn(String.format("TopicClassifier '%s' has more than %d registered topic roots."
                                        + " Some roots might be ignored.", engineName, MAX_ROOTS));
             }
             for (SolrDocument result : response.getResults()) {
                 rootConcepts.add(result.getFirstValue(conceptUriField).toString());
             }
         } catch (SolrServerException e) {
             String msg = String.format("Error while fetching root topics on Solr Core '%s'.", solrCoreId);
             throw new ClassifierException(msg, e);
         }
         return rootConcepts;
     }

     @Override
     public void addConcept(String conceptUri, String primaryTopicUri, Collection<String> broaderConcepts) throws ClassifierException {
         // ensure that there is no previous topic registered with the same id
         removeConcept(conceptUri);

         SolrInputDocument metadataEntry = new SolrInputDocument();
         String metadataEntryId = UUID.randomUUID().toString();
         String modelEntryId = UUID.randomUUID().toString();
         metadataEntry.addField(conceptUriField, conceptUri);
         metadataEntry.addField(entryIdField, metadataEntryId);
         metadataEntry.addField(modelEntryIdField, modelEntryId);
         metadataEntry.addField(entryTypeField, METADATA_ENTRY);
         if (broaderConcepts != null && broaderField != null) {
             metadataEntry.addField(broaderField, broaderConcepts);
         }
         if (primaryTopicUri != null && primaryTopicUriField != null) {
             metadataEntry.addField(primaryTopicUriField, primaryTopicUri);
         }
         SolrInputDocument modelEntry = new SolrInputDocument();
         modelEntry.addField(entryIdField, modelEntryId);
         modelEntry.addField(conceptUriField, conceptUri);
         modelEntry.addField(entryTypeField, MODEL_ENTRY);
         if (broaderConcepts != null) {
             invalidateModelFields(broaderConcepts, modelUpdateDateField, modelEvaluationDateField);
         }
         SolrServer solrServer = getActiveSolrServer();
         try {
             UpdateRequest request = new UpdateRequest();
             request.add(metadataEntry);
             request.add(modelEntry);
             solrServer.request(request);
             solrServer.commit();
         } catch (Exception e) {
             String msg = String.format("Error adding topic with id '%s' on Solr Core '%s'", conceptUri,
                 solrCoreId);
             throw new ClassifierException(msg, e);
         }
     }

     @Override
     public void addConcept(String conceptId, Collection<String> broaderConcepts) throws ClassifierException {
         addConcept(conceptId, null, broaderConcepts);
     }

     /*
      * The commit is the responsibility of the caller.
      */
     protected void invalidateModelFields(Collection<String> conceptIds, String... fieldNames) throws ClassifierException {
         if (conceptIds.isEmpty() || fieldNames.length == 0) {
             return;
         }
         SolrServer solrServer = getActiveSolrServer();
         List<String> invalidatedFields = Arrays.asList(fieldNames);
         try {
             UpdateRequest request = new UpdateRequest();
             for (String conceptId : conceptIds) {
                 SolrQuery query = new SolrQuery("*:*");
                 query.addFilterQuery(entryTypeField + ":" + METADATA_ENTRY);
                 query.addFilterQuery(conceptUriField + ":" + ClientUtils.escapeQueryChars(conceptId));
                 for (SolrDocument result : solrServer.query(query).getResults()) {
                     // there should be only one (or none: tolerated)
                     SolrInputDocument newEntry = new SolrInputDocument();
                     for (String fieldName : result.getFieldNames()) {
                         if (!invalidatedFields.contains(fieldName)) {
                             newEntry.setField(fieldName, result.getFieldValues(fieldName));
                         }
                     }
                     request.add(newEntry);
                 }
             }
             if (request.getDocuments() != null && request.getDocuments().size() > 0) {
                 solrServer.request(request);
             }
         } catch (Exception e) {
             String msg = String.format("Error invalidating topics [%s] on Solr Core '%s'",
                 StringUtils.join(conceptIds, ", "), solrCoreId);
             throw new ClassifierException(msg, e);
         }
     }

     @Override
     public void removeAllConcepts() throws ClassifierException {
         SolrServer solrServer = getActiveSolrServer();
         try {
             solrServer.deleteByQuery("*:*");
             solrServer.commit();
         } catch (Exception e) {
             String msg = String.format("Error deleting concepts from Solr Core '%s'", solrCoreId);
             throw new ClassifierException(msg, e);
         }
     }

     @Override
     public void removeConcept(String conceptId) throws ClassifierException {
         if (conceptId == null || conceptId.isEmpty()) {
             throw new ClassifierException("conceptId must not be null or empty");
         }
         SolrServer solrServer = getActiveSolrServer();
         try {
             solrServer.deleteByQuery(conceptUriField + ":" + ClientUtils.escapeQueryChars(conceptId));
             solrServer.commit();
         } catch (Exception e) {
             String msg = String
                     .format("Error removing concept '%s' on Solr Core '%s'", conceptId, solrCoreId);
             throw new ClassifierException(msg, e);
         }
     }

     @Override
     public TrainingSet getTrainingSet() {
         if (trainingSet != null) {
             return trainingSet;
         }
         if (trainingSetTracker != null) {
             TrainingSet trainingsSet = (TrainingSet) trainingSetTracker.getService();
             if(trainingsSet == null){
                 for(int i=0; i < 5 && trainingsSet == null; i++){
                     try {
                         trainingsSet = (TrainingSet) trainingSetTracker.waitForService(1000);
                     } catch (InterruptedException e) {/*ignore*/}
                 }
             }
             return trainingsSet;
         }
         return null;
     }

     @Override
     public boolean isUpdatable() {
         return getTrainingSet() != null;
     }

     @Override
     public void setTrainingSet(TrainingSet trainingSet) {
         this.trainingSet = trainingSet;
     }

     protected int batchOverTopics(BatchProcessor<SolrDocument> processor) throws TrainingSetException {
         // TODO: implement incremental update by using the date informations
         int processedCount = 0;
         SolrServer solrServer = getActiveSolrServer();
         SolrQuery query = new SolrQuery("*:*");
         query.addFilterQuery(entryTypeField + ":" + METADATA_ENTRY);
         String offset = null;
         boolean done = false;
         int batchSize = 1000;
         query.addSortField(conceptUriField, SolrQuery.ORDER.asc);
         query.setRows(batchSize + 1);
         try {
             while (!done) {
                 // batch over all the indexed topics
                 if (offset != null) {
                     query.addFilterQuery(conceptUriField + ":["
                                          + ClientUtils.escapeQueryChars(offset) + " TO *]");
                 }
                 QueryResponse response = solrServer.query(query);
                 int count = 0;
                 List<SolrDocument> batchDocuments = new ArrayList<SolrDocument>();
                 for (SolrDocument result : response.getResults()) {
                     String conceptId = result.getFirstValue(conceptUriField).toString();
                     if (count == batchSize) {
                         offset = conceptId;
                     } else {
                         count++;
                         batchDocuments.add(result);
                     }
                 }
                 processedCount += processor.process(batchDocuments);
                 solrServer.commit();
                 if (count < batchSize) {
                     done = true;
                 }
             }
             solrServer.optimize();
         } catch (Exception e) {
             String msg = String.format("Error while updating topics on Solr Core '%s'.", solrCoreId);
             throw new TrainingSetException(msg, e);
         }
         return processedCount;
     }

     @Override
     public int updateModel(boolean incremental) throws TrainingSetException, ClassifierException {
         checkTrainingSet();
         long start = System.currentTimeMillis();
         if (incremental && modelUpdateDateField == null) {
             log.warn(MODEL_UPDATE_DATE_FIELD + " field is not configured: switching to batch update mode.");
             incremental = false;
         }
         final boolean incr = incremental;
         int updatedTopics = batchOverTopics(new BatchProcessor<SolrDocument>() {

             @Override
             public int process(List<SolrDocument> batch) throws ClassifierException, TrainingSetException {
                 int processed = 0;
                 for (SolrDocument result : batch) {
                     String conceptId = result.getFirstValue(conceptUriField).toString();
                     List<String> impactedTopics = new ArrayList<String>();
                     impactedTopics.add(conceptId);
                     impactedTopics.addAll(getNarrowerConcepts(conceptId));
                     if (incr) {
                         Date lastModelUpdate = (Date) result.getFirstValue(modelUpdateDateField);
                         if (lastModelUpdate != null
                             && !getTrainingSet().hasChangedSince(impactedTopics, lastModelUpdate)) {
                             continue;
                         }
                     }
                     String metadataEntryId = result.getFirstValue(entryIdField).toString();
                     String modelEntryId = result.getFirstValue(modelEntryIdField).toString();
                     String primaryTopicUri = null;
                     if (primaryTopicUriField != null) {
                         primaryTopicUri = (String) result.getFirstValue(primaryTopicUriField);
                     }
                     updateTopic(conceptId, metadataEntryId, modelEntryId, impactedTopics, primaryTopicUri,
                         result.getFieldValues(broaderField));
                     processed++;
                 }
                 return processed;
             }
         });
         long stop = System.currentTimeMillis();
         log.info("Sucessfully updated {} topics in {}s", updatedTopics, (double) (stop - start) / 1000.);
         return updatedTopics;
     }

     /**
      * @param conceptUri
      *            the topic model to update
      * @param metadataEntryId
      *            of the metadata entry id of the topic
      * @param modelEntryId
      *            of the model entry id of the topic
      * @param impactedTopics
      *            the list of impacted topics (e.g. the topic node and direct children)
      * @param primaryTopicUri
      * @param broaderConcepts
      *            the collection of broader to re-add in the broader field
      */
     protected void updateTopic(String conceptUri,
                                String metadataId,
                                String modelId,
                                List<String> impactedTopics,
                                String primaryTopicUri,
                                Collection<Object> broaderConcepts) throws TrainingSetException,
                                                                   ClassifierException {
         long start = System.currentTimeMillis();
         Batch<Example> examples = Batch.emtpyBatch(Example.class);
         StringBuffer sb = new StringBuffer();
         int offset = 0;
         do {
             examples = getTrainingSet().getPositiveExamples(impactedTopics, examples.nextOffset);
             for (Example example : examples.items) {
                 if ((cvFoldCount != 0) && (offset % cvFoldCount == cvFoldIndex)) {
                     // we are performing a cross validation session and this example belong to the test
                     // fold hence should be skipped
                     offset++;
                     continue;
                 }
                 offset++;
                 sb.append(StringUtils.join(example.contents, "\n\n"));
                 sb.append("\n\n");
             }
         } while (sb.length() < MAX_CHARS_PER_TOPIC && examples.hasMore);

         // reindex the topic with the new text data collected from the examples
         SolrInputDocument modelEntry = new SolrInputDocument();
         modelEntry.addField(entryIdField, modelId);
         modelEntry.addField(conceptUriField, conceptUri);
         modelEntry.addField(entryTypeField, MODEL_ENTRY);
         if (sb.length() > 0) {
             modelEntry.addField(similarityField, sb);
         }

         // update the metadata of the topic model
         SolrInputDocument metadataEntry = new SolrInputDocument();
         metadataEntry.addField(entryIdField, metadataId);
         metadataEntry.addField(modelEntryIdField, modelId);
         metadataEntry.addField(entryTypeField, METADATA_ENTRY);
         metadataEntry.addField(conceptUriField, conceptUri);
         if (primaryTopicUriField != null) {
             metadataEntry.addField(primaryTopicUriField, primaryTopicUri);
         }
         if (broaderConcepts != null && broaderField != null) {
             metadataEntry.addField(broaderField, broaderConcepts);
         }
         if (modelUpdateDateField != null) {
             metadataEntry.addField(modelUpdateDateField, UTCTimeStamper.nowUtcDate());
         }
         SolrServer solrServer = getActiveSolrServer();
         try {
             UpdateRequest request = new UpdateRequest();
             request.add(metadataEntry);
             request.add(modelEntry);
             solrServer.request(request);
             // the commit is done by the caller in batch
         } catch (Exception e) {
             String msg = String.format("Error updating topic with id '%s' on Solr Core '%s'", conceptUri,
                 solrCoreId);
             throw new ClassifierException(msg, e);
         }
         long stop = System.currentTimeMillis();
         log.debug("Sucessfully updated topic {} in {}s", conceptUri, (double) (stop - start) / 1000.);
     }

     protected void checkTrainingSet() throws TrainingSetException {
         if (getTrainingSet() == null) {
             throw new TrainingSetException(String.format("TopicClassificationEngine %s has no registered"
                                                          + " training set hence cannot be updated.",
                 engineName));
         }
     }

     @Override
     public void setCrossValidationInfo(int foldIndex, int foldCount) {
         if (foldIndex > foldCount - 1) {
             throw new IllegalArgumentException(String.format(
                 "foldIndex=%d should be smaller than foldCount=%d - 1", foldIndex, foldCount));
         }
         cvFoldIndex = foldIndex;
         cvFoldCount = foldCount;
     }

     protected Dictionary<String,Object> getCanonicalConfiguration(Object server, Object coreConfig) {
         Hashtable<String,Object> config = new Hashtable<String,Object>();
         config.put(EnhancementEngine.PROPERTY_NAME, engineName + "-evaluation");
         config.put(TopicClassificationEngine.ENTRY_ID_FIELD, "entry_id");
         config.put(TopicClassificationEngine.ENTRY_TYPE_FIELD, "entry_type");
         config.put(TopicClassificationEngine.MODEL_ENTRY_ID_FIELD, "model_entry_id");
         config.put(TopicClassificationEngine.SOLR_CORE, server);
         config.put(TopicClassificationEngine.SOLR_CORE_CONFIG, coreConfig);
         config.put(TopicClassificationEngine.CONCEPT_URI_FIELD, "concept");
         config.put(TopicClassificationEngine.PRIMARY_TOPIC_URI_FIELD, "primary_topic");
         config.put(TopicClassificationEngine.SIMILARTITY_FIELD, "classifier_features");
         config.put(TopicClassificationEngine.BROADER_FIELD, "broader");
         config.put(TopicClassificationEngine.MODEL_UPDATE_DATE_FIELD, "last_update_dt");
         config.put(TopicClassificationEngine.MODEL_EVALUATION_DATE_FIELD, "last_evaluation_dt");
         config.put(TopicClassificationEngine.PRECISION_FIELD, "precision");
         config.put(TopicClassificationEngine.RECALL_FIELD, "recall");
         config.put(TopicClassificationEngine.POSITIVE_SUPPORT_FIELD, "positive_support");
         config.put(TopicClassificationEngine.NEGATIVE_SUPPORT_FIELD, "negative_support");
         config.put(TopicClassificationEngine.FALSE_POSITIVES_FIELD, "false_positives");
         config.put(TopicClassificationEngine.FALSE_NEGATIVES_FIELD, "false_negatives");
         return config;
     }

     public boolean isEvaluationRunning() {
         return evaluationRunning;
     }

     synchronized public int updatePerformanceEstimates(boolean incremental) throws ClassifierException,
                                                                            TrainingSetException {
         checkTrainingSet();
         if (evaluationRunning) {
             throw new ClassifierException("Another evaluation is already running");
         }
         int updatedTopics = 0;
         // NOTE: The folder used to create the SolrServer used for CVFold
         //       is now created within the #embeddedSolrServerDir
         try {
             evaluationRunning = true;
             int cvFoldCount = 3; // 3-folds CV is hardcoded for now
             int cvIterationCount = 3; // make it possible to limit the number of folds to use

             // We will use the training set quite intensively, ensure that the index is packed and its
             // statistics are up to date
             getTrainingSet().optimize();
             for (int cvFoldIndex = 0; cvFoldIndex < cvIterationCount; cvFoldIndex++) {
                 updatedTopics = performCVFold(cvFoldIndex, cvFoldCount, cvIterationCount,
                     incremental);
             }
             SolrServer solrServer = getActiveSolrServer();
             solrServer.optimize();
         } catch (ConfigurationException e) {
             throw new ClassifierException(e);
         } catch (IOException e) {
             throw new ClassifierException(e);
         } catch (SolrServerException e) {
             throw new ClassifierException(e);
         } finally {
             FileUtils.deleteQuietly(__evaluationServerDir);
             evaluationRunning = false;
         }
         return updatedTopics;
     }

     protected int performCVFold(int cvFoldIndex, int cvFoldCount, int cvIterations, boolean incremental)
             throws ConfigurationException, TrainingSetException, ClassifierException {

         cvIterations = cvIterations <= 0 ? cvFoldCount : cvFoldCount;
         log.info(String.format("Performing evaluation %d-fold CV iteration %d/%d on classifier %s",
             cvFoldCount, cvFoldIndex + 1, cvIterations, engineName));
         long start = System.currentTimeMillis();
         final TopicClassificationEngine classifier = new TopicClassificationEngine();
         try {
             if (managedSolrServer != null) {
                 // OSGi setup: the evaluation server will be generated automatically using the
                 // managedSolrServer
                 classifier.bindManagedSolrServer(managedSolrServer);
                 classifier.activate(context, getCanonicalConfiguration(
                     engineName + "-evaluation", //TODO: maybe we should use the SolrCoreName instead
                     solrCoreConfig));
             } else {
                 if(__evaluationServer == null){
                     __evaluationServerDir = new File(embeddedSolrServerDir,engineName + "-evaluation");
                     if(!__evaluationServerDir.exists()){
                         FileUtils.forceMkdir(__evaluationServerDir);
                     }
                     __evaluationServer = EmbeddedSolrHelper.makeEmbeddedSolrServer(__evaluationServerDir,
                         "evaluationclassifierserver", "default-topic-model", "default-topic-model");
                 }
                 classifier.configure(getCanonicalConfiguration(__evaluationServer,solrCoreConfig));
             }
         } catch (Exception e) {
             throw new ClassifierException(e);
         }

         // clean all previous concepts from the evaluation classifier in case we are reusing an existing solr
         // index from OSGi.
         classifier.removeAllConcepts();

         // iterate over all the topics to register them in the evaluation classifier
         batchOverTopics(new BatchProcessor<SolrDocument>() {
             @Override
             public int process(List<SolrDocument> batch) throws ClassifierException {
                 for (SolrDocument topicEntry : batch) {
                     String conceptId = topicEntry.getFirstValue(conceptUriField).toString();
                     Collection<Object> broader = topicEntry.getFieldValues(broaderField);
                     if (broader == null) {
                         classifier.addConcept(conceptId, null, null);
                     } else {
                         List<String> broaderConcepts = new ArrayList<String>();
                         for (Object broaderConcept : broader) {
                             broaderConcepts.add(broaderConcept.toString());
                         }
                         classifier.addConcept(conceptId, null, broaderConcepts);
                     }
                 }
                 return batch.size();
             }
         });

         // build the model on the for the current train CV folds
         classifier.setCrossValidationInfo(cvFoldIndex, cvFoldCount);
         // bind our new classifier to the same training set at the parent
         classifier.setTrainingSet(getTrainingSet());
         classifier.updateModel(false);

         final int foldCount = cvFoldCount;
         final int foldIndex = cvFoldIndex;

         // iterate over the topics again to compute scores on the test fold
         int updatedTopics = batchOverTopics(new BatchProcessor<SolrDocument>() {

             @Override
             public int process(List<SolrDocument> batch) throws TrainingSetException, ClassifierException {
                 int offset;
                 int updated = 0;
                 for (SolrDocument topicMetadata : batch) {
                     String topic = topicMetadata.getFirstValue(conceptUriField).toString();
                     List<String> topics = Arrays.asList(topic);
                     List<String> falseNegativeExamples = new ArrayList<String>();
                     int truePositives = 0;
                     int falseNegatives = 0;
                     int positiveSupport = 0;
                     offset = 0;
                     Batch<Example> examples = Batch.emtpyBatch(Example.class);
                     boolean skipTopic = false;
                     do {
                         examples = getTrainingSet().getPositiveExamples(topics, examples.nextOffset);
                         if (offset == 0 && examples.items.size() < MIN_EVALUATION_SAMPLES) {
                             // we need a minimum about of examples otherwise it's really not
                             // worth computing statistics
                             skipTopic = true;
                             break;
                         }
                         for (Example example : examples.items) {
                             if (!(offset % foldCount == foldIndex)) {
                                 // this example is not part of the test fold, skip it
                                 offset++;
                                 continue;
                             }
                             positiveSupport++;
                             offset++;
                             List<TopicSuggestion> suggestedTopics = classifier
                                     .suggestTopics(example.contents);
                             boolean match = false;
                             for (TopicSuggestion suggestedTopic : suggestedTopics) {
                                 if (topic.equals(suggestedTopic.conceptUri)) {
                                     match = true;
                                     truePositives++;
                                     break;
                                 }
                             }
                             if (!match) {
                                 falseNegatives++;
                                 if (falseNegativeExamples.size() < MAX_COLLECTED_EXAMPLES / foldCount) {
                                     falseNegativeExamples.add(example.id);
                                 }
                             }
                         }
                     } while (!skipTopic && examples.hasMore && offset < MAX_EVALUATION_SAMPLES);

                     List<String> falsePositiveExamples = new ArrayList<String>();
                     int falsePositives = 0;
                     int negativeSupport = 0;
                     offset = 0;
                     examples = Batch.emtpyBatch(Example.class);
                     do {
                         if (skipTopic) {
                             break;
                         }
                         examples = getTrainingSet().getNegativeExamples(topics, examples.nextOffset);
                         for (Example example : examples.items) {
                             if (!(offset % foldCount == foldIndex)) {
                                 // this example is not part of the test fold, skip it
                                 offset++;
                                 continue;
                             }
                             negativeSupport++;
                             offset++;
                             List<TopicSuggestion> suggestedTopics = classifier
                                     .suggestTopics(example.contents);
                             for (TopicSuggestion suggestedTopic : suggestedTopics) {
                                 if (topic.equals(suggestedTopic.conceptUri)) {
                                     falsePositives++;
                                     if (falsePositiveExamples.size() < MAX_COLLECTED_EXAMPLES / foldCount) {
                                         falsePositiveExamples.add(example.id);
                                     }
                                     break;
                                 }
                             }
                             // we don't need to collect true negatives
                         }
                     } while (examples.hasMore && offset < MAX_EVALUATION_SAMPLES);

                     if (skipTopic) {
                         log.debug("Skipping evaluation of {} because too few positive examples.", topic);
                     } else {
                         // compute precision, recall and f1 score for the current test fold and topic
                         float precision = 0;
                         if (truePositives != 0 || falsePositives != 0) {
                             precision = truePositives / (float) (truePositives + falsePositives);
                         }
                         float recall = 0;
                         if (truePositives != 0 || falseNegatives != 0) {
                             recall = truePositives / (float) (truePositives + falseNegatives);
                         }
                         updatePerformanceMetadata(topic, precision, recall, positiveSupport, negativeSupport,
                             falsePositiveExamples, falseNegativeExamples);
                         updated += 1;
                     }
                 }
                 try {
                     getActiveSolrServer().commit();
                 } catch (Exception e) {
                     throw new ClassifierException(e);
                 }
                 return updated;
             }
         });

         long stop = System.currentTimeMillis();
         log.info(String.format("Finished CV iteration %d/%d on classifier %s in %fs.", cvFoldIndex + 1,
             cvFoldCount, engineName, (stop - start) / 1000.0));
         if (context != null) {
             // close open trackers
             classifier.deactivate(context);
         }
         return updatedTopics;
     }

     /**
      * Update the performance statistics in a metadata entry of a topic. It is the responsibility of the
      * caller to commit.
      */
     protected void updatePerformanceMetadata(String conceptId,
                                              float precision,
                                              float recall,
                                              int positiveSupport,
                                              int negativeSupport,
                                              List<String> falsePositiveExamples,
                                              List<String> falseNegativeExamples) throws ClassifierException {
         SolrServer solrServer = getActiveSolrServer();
         try {
             SolrQuery query = new SolrQuery("*:*");
             query.addFilterQuery(entryTypeField + ":" + METADATA_ENTRY);
             query.addFilterQuery(conceptUriField + ":" + ClientUtils.escapeQueryChars(conceptId));
             for (SolrDocument result : solrServer.query(query).getResults()) {
                 // there should be only one (or none: tolerated)
                 // fetch any old values to update (all metadata fields are assumed to be stored)s
                 Map<String,Collection<Object>> fieldValues = new HashMap<String,Collection<Object>>();
                 for (String fieldName : result.getFieldNames()) {
                     fieldValues.put(fieldName, result.getFieldValues(fieldName));
                 }
                 addToList(fieldValues, precisionField, precision);
                 addToList(fieldValues, recallField, recall);
                 increment(fieldValues, positiveSupportField, positiveSupport);
                 increment(fieldValues, negativeSupportField, negativeSupport);
                 addToList(fieldValues, falsePositivesField, falsePositiveExamples);
                 addToList(fieldValues, falseNegativesField, falseNegativeExamples);
                 SolrInputDocument newEntry = new SolrInputDocument();
                 for (Map.Entry<String,Collection<Object>> entry : fieldValues.entrySet()) {
                     newEntry.addField(entry.getKey(), entry.getValue());
                 }
                 newEntry.setField(modelEvaluationDateField, UTCTimeStamper.nowUtcDate());
                 solrServer.add(newEntry);
             }
             log.info(String.format("Performance for concept '%s': precision=%f, recall=%f,"
                                    + " positiveSupport=%d, negativeSupport=%d", conceptId, precision, recall,
                 positiveSupport, negativeSupport));
         } catch (Exception e) {
             String msg = String
                     .format("Error updating performance metadata for topic '%s' on Solr Core '%s'",
                         conceptId, solrCoreId);
             throw new ClassifierException(msg, e);
         }
     }

     protected void increment(Map<String,Collection<Object>> fieldValues, String fieldName, int count) {
         // this collection is expected to be a singleton for this particular field
         Collection<Object> oldValues = fieldValues.get(fieldName);
         if (oldValues != null && !oldValues.isEmpty()) {
             count += (Integer) oldValues.iterator().next();
         }
         Collection<Object> values = new ArrayList<Object>();
         values.add(count);
         fieldValues.put(fieldName, values);
     }

     @SuppressWarnings("unchecked")
     protected void addToList(Map<String,Collection<Object>> fieldValues, String fieldName, Object value) {
         Collection<Object> values = new ArrayList<Object>();
         if (fieldValues.get(fieldName) != null) {
             values.addAll(fieldValues.get(fieldName));
         }
         if (value instanceof Collection) {
             values.addAll((Collection<Object>) value);
         } else {
             values.add(value);
         }
         fieldValues.put(fieldName, values);
     }

     @Override
     public ClassificationReport getPerformanceEstimates(String conceptId) throws ClassifierException {

         SolrServer solrServer = getActiveSolrServer();
         SolrQuery query = new SolrQuery("*:*");
         query.addFilterQuery(entryTypeField + ":" + METADATA_ENTRY);
         query.addFilterQuery(conceptUriField + ":" + ClientUtils.escapeQueryChars(conceptId));
         try {
             SolrDocumentList results = solrServer.query(query).getResults();
             if (results.isEmpty()) {
                 throw new ClassifierException(String.format("'%s' is not a registered topic", conceptId));
             }
             SolrDocument metadata = results.get(0);
             Float precision = computeMeanValue(metadata, precisionField);
             Float recall = computeMeanValue(metadata, recallField);
             int positiveSupport = computeSumValue(metadata, positiveSupportField);
             int negativeSupport = computeSumValue(metadata, negativeSupportField);
             Date evaluationDate = (Date) metadata.getFirstValue(modelEvaluationDateField);
             boolean uptodate = evaluationDate != null;
             ClassificationReport report = new ClassificationReport(precision, recall, positiveSupport,
                     negativeSupport, uptodate, evaluationDate);
             if (metadata.getFieldValues(falsePositivesField) == null) {
                 metadata.setField(falsePositivesField, new ArrayList<Object>());
             }
             for (Object falsePositiveId : metadata.getFieldValues(falsePositivesField)) {
                 report.falsePositiveExampleIds.add(falsePositiveId.toString());
             }
             if (metadata.getFieldValues(falseNegativesField) == null) {
                 metadata.setField(falseNegativesField, new ArrayList<Object>());
             }
             for (Object falseNegativeId : metadata.getFieldValues(falseNegativesField)) {
                 report.falseNegativeExampleIds.add(falseNegativeId.toString());
             }
             return report;
         } catch (SolrServerException e) {
             throw new ClassifierException(String.format("Error fetching the performance report for topic "
                                                         + conceptId));
         }
     }

     protected Float computeMeanValue(SolrDocument metadata, String fielName) {
         Float mean = 0f;
         Collection<Object> values = metadata.getFieldValues(fielName);
         if (values == null || values.isEmpty()) {
             return mean;
         }
         for (Object v : values) {
             mean += (Float) v / values.size();
         }
         return mean;
     }

     protected Integer computeSumValue(SolrDocument metadata, String fielName) {
         Integer sum = 0;
         Collection<Object> values = metadata.getFieldValues(fielName);
         if (values == null || values.isEmpty()) {
             return sum;
         }
         for (Object v : values) {
             sum += (Integer) v;
         }
         return sum;
     }

     @Override
     public List<String> getChainNames() throws InvalidSyntaxException {
         List<String> chainNames = new ArrayList<String>();
         BundleContext bundleContext = context.getBundleContext();
         ServiceReference[] references = bundleContext.getServiceReferences(Chain.class.getName(), null);
         if (references != null) {
             for (ServiceReference ref : references) {
                 Chain chain = (Chain) bundleContext.getService(ref);
                 try {
                     if (chain.getEngines().contains(getName())) {
                         chainNames.add(chain.getName());
                     }
                 } catch (ChainException e) {
                     // This chain is currently not active ... ignore
                 }
             }
         }
         return chainNames;
     }

     @Override
     public int importConceptsFromGraph(ImmutableGraph graph, IRI conceptClass, IRI broaderProperty) throws ClassifierException {
         int importedCount = 0;
         Iterator<Triple> conceptIterator = graph.filter(null,
             org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE, conceptClass);
         while (conceptIterator.hasNext()) {
             Triple conceptTriple = conceptIterator.next();
             if (!(conceptTriple.getSubject() instanceof IRI)) {
                 continue;
             }
             IRI conceptUri = (IRI) conceptTriple.getSubject();
             GraphNode node = new GraphNode(conceptUri, graph);
             List<String> broaderConcepts = new ArrayList<String>();
             // TODO: use OWL property inference on sub-properties here instead of explicit
             // property filter
             Iterator<GraphNode> broaderIterator = node.getObjectNodes(broaderProperty);
             while (broaderIterator.hasNext()) {
                 RDFTerm node2 = broaderIterator.next().getNode();
                 if (node2 instanceof IRI) {
                     broaderConcepts.add(((IRI) node2).getUnicodeString());
                 }
             }
             addConcept(conceptUri.getUnicodeString(), broaderConcepts);
             importedCount++;
         }
         return importedCount;
     }
 }