| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.stanbol.enhancer.engines.keywordextraction.engine; |
| |
| import static org.apache.stanbol.entityhub.servicesapi.defaults.NamespaceEnum.getFullName; |
| |
| import java.io.IOException; |
| import java.net.URI; |
| import java.net.URISyntaxException; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Collection; |
| import java.util.Collections; |
| import java.util.Dictionary; |
| import java.util.HashSet; |
| import java.util.Iterator; |
| import java.util.Map; |
| import java.util.Map.Entry; |
| import java.util.Set; |
| |
| import org.apache.clerezza.rdf.core.Language; |
| import org.apache.clerezza.rdf.core.Literal; |
| import org.apache.clerezza.rdf.core.LiteralFactory; |
| import org.apache.clerezza.rdf.core.MGraph; |
| import org.apache.clerezza.rdf.core.Triple; |
| import org.apache.clerezza.rdf.core.UriRef; |
| import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl; |
| import org.apache.clerezza.rdf.core.impl.TripleImpl; |
| import org.apache.commons.lang.StringUtils; |
| import org.apache.felix.scr.annotations.Activate; |
| import org.apache.felix.scr.annotations.Component; |
| import org.apache.felix.scr.annotations.ConfigurationPolicy; |
| import org.apache.felix.scr.annotations.Deactivate; |
| import org.apache.felix.scr.annotations.Property; |
| import org.apache.felix.scr.annotations.PropertyOption; |
| import org.apache.felix.scr.annotations.ReferenceCardinality; |
| import org.apache.felix.scr.annotations.ReferencePolicy; |
| import org.apache.felix.scr.annotations.ReferenceStrategy; |
| import org.apache.felix.scr.annotations.Service; |
| import org.apache.stanbol.commons.opennlp.OpenNLP; |
| import org.apache.stanbol.commons.opennlp.TextAnalyzer; |
| import org.apache.stanbol.commons.opennlp.TextAnalyzer.TextAnalyzerConfig; |
| import org.apache.stanbol.commons.stanboltools.offline.OfflineMode; |
| import org.apache.stanbol.enhancer.engines.keywordextraction.linking.AnalysedContent; |
| import org.apache.stanbol.enhancer.engines.keywordextraction.linking.EntityLinker; |
| import org.apache.stanbol.enhancer.engines.keywordextraction.linking.EntityLinkerConfig; |
| import org.apache.stanbol.enhancer.engines.keywordextraction.linking.EntitySearcher; |
| import org.apache.stanbol.enhancer.engines.keywordextraction.linking.LinkedEntity; |
| import org.apache.stanbol.enhancer.engines.keywordextraction.linking.Suggestion; |
| import org.apache.stanbol.enhancer.engines.keywordextraction.linking.EntityLinkerConfig.RedirectProcessingMode; |
| import org.apache.stanbol.enhancer.engines.keywordextraction.linking.LinkedEntity.Occurrence; |
| import org.apache.stanbol.enhancer.engines.keywordextraction.linking.impl.EntityhubSearcher; |
| import org.apache.stanbol.enhancer.engines.keywordextraction.linking.impl.OpenNlpAnalysedContentFactory; |
| import org.apache.stanbol.enhancer.engines.keywordextraction.linking.impl.ReferencedSiteSearcher; |
| import org.apache.stanbol.enhancer.engines.keywordextraction.linking.impl.TrackingEntitySearcher; |
| import org.apache.stanbol.enhancer.servicesapi.Blob; |
| import org.apache.stanbol.enhancer.servicesapi.ContentItem; |
| import org.apache.stanbol.enhancer.servicesapi.EngineException; |
| import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine; |
| import org.apache.stanbol.enhancer.servicesapi.InvalidContentException; |
| import org.apache.stanbol.enhancer.servicesapi.ServiceProperties; |
| import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper; |
| import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper; |
| import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine; |
| import org.apache.stanbol.enhancer.servicesapi.rdf.Properties; |
| import org.apache.stanbol.entityhub.model.clerezza.RdfValueFactory; |
| import org.apache.stanbol.entityhub.servicesapi.Entityhub; |
| import org.apache.stanbol.entityhub.servicesapi.defaults.NamespaceEnum; |
| import org.apache.stanbol.entityhub.servicesapi.model.Reference; |
| import org.apache.stanbol.entityhub.servicesapi.model.Text; |
| import org.apache.stanbol.entityhub.servicesapi.model.rdf.RdfResourceEnum; |
| import org.osgi.framework.Constants; |
| import org.osgi.service.cm.ConfigurationException; |
| import org.osgi.service.component.ComponentContext; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| @Component( |
| configurationFactory = true, |
| policy = ConfigurationPolicy.REQUIRE, // the baseUri is required! |
| specVersion = "1.1", |
| metatype = true, |
| immediate = true, |
| inherit = true) |
| @Service |
| @org.apache.felix.scr.annotations.Properties(value={ |
| @Property(name=EnhancementEngine.PROPERTY_NAME), |
| @Property(name=KeywordLinkingEngine.REFERENCED_SITE_ID), |
| @Property(name=KeywordLinkingEngine.NAME_FIELD,value=EntityLinkerConfig.DEFAULT_NAME_FIELD), |
| @Property(name=KeywordLinkingEngine.CASE_SENSITIVE,boolValue=EntityLinkerConfig.DEFAULT_CASE_SENSITIVE_MATCHING_STATE), |
| @Property(name=KeywordLinkingEngine.TYPE_FIELD,value=EntityLinkerConfig.DEFAULT_TYPE_FIELD), |
| @Property(name=KeywordLinkingEngine.REDIRECT_FIELD,value=EntityLinkerConfig.DEFAULT_REDIRECT_FIELD), |
| @Property(name=KeywordLinkingEngine.REDIRECT_PROCESSING_MODE,options={ |
| @PropertyOption( |
| value='%'+KeywordLinkingEngine.REDIRECT_PROCESSING_MODE+".option.ignore", |
| name="IGNORE"), |
| @PropertyOption( |
| value='%'+KeywordLinkingEngine.REDIRECT_PROCESSING_MODE+".option.addValues", |
| name="ADD_VALUES"), |
| @PropertyOption( |
| value='%'+KeywordLinkingEngine.REDIRECT_PROCESSING_MODE+".option.follow", |
| name="FOLLOW") |
| },value="IGNORE"), |
| @Property(name=KeywordLinkingEngine.MIN_SEARCH_TOKEN_LENGTH, |
| intValue=EntityLinkerConfig.DEFAULT_MIN_SEARCH_TOKEN_LENGTH), |
| @Property(name=KeywordLinkingEngine.MIN_TOKEN_MATCH_FACTOR,floatValue= |
| EntityLinkerConfig.DEFAULT_MIN_TOKEN_MATCH_FACTOR), |
| @Property(name=KeywordLinkingEngine.KEYWORD_TOKENIZER,boolValue=false), |
| @Property(name=KeywordLinkingEngine.MAX_SUGGESTIONS, |
| intValue=EntityLinkerConfig.DEFAULT_SUGGESTIONS), |
| @Property(name=KeywordLinkingEngine.PROCESSED_LANGUAGES,value=""), |
| @Property(name=KeywordLinkingEngine.DEFAULT_MATCHING_LANGUAGE,value=""), |
| @Property(name=KeywordLinkingEngine.TYPE_MAPPINGS,cardinality=1000), |
| @Property(name=KeywordLinkingEngine.DEREFERENCE_ENTITIES, |
| boolValue=KeywordLinkingEngine.DEFAULT_DEREFERENCE_ENTITIES_STATE), |
| @Property(name=Constants.SERVICE_RANKING,intValue=0) |
| }) |
| public class KeywordLinkingEngine |
| extends AbstractEnhancementEngine<RuntimeException,RuntimeException> |
| implements EnhancementEngine, ServiceProperties { |
| |
| private final Logger log = LoggerFactory.getLogger(KeywordLinkingEngine.class); |
| /** |
| * This is used to check the content type of parsed {@link ContentItem}s for |
| * plain text |
| */ |
| protected static final String TEXT_PLAIN_MIMETYPE = "text/plain"; |
| /** |
| * Contains the only supported mime type {@link #TEXT_PLAIN_MIMETYPE} |
| */ |
| protected static final Set<String> SUPPORTED_MIMETYPES = Collections.singleton(TEXT_PLAIN_MIMETYPE); |
| /** |
| * The default value for the Execution of this Engine. |
| * This Engine creates TextAnnotations that should not be processed by other Engines. |
| * Therefore it uses a lower rank than {@link ServiceProperties#ORDERING_DEFAULT} |
| * to ensure that other engines do not get confused |
| */ |
| public static final Integer DEFAULT_ORDER = ServiceProperties.ORDERING_DEFAULT - 10; |
| |
| |
| public static final String REFERENCED_SITE_ID = "org.apache.stanbol.enhancer.engines.keywordextraction.referencedSiteId"; |
| public static final String NAME_FIELD = "org.apache.stanbol.enhancer.engines.keywordextraction.nameField"; |
| public static final String TYPE_FIELD = "org.apache.stanbol.enhancer.engines.keywordextraction.typeField"; |
| public static final String CASE_SENSITIVE = "org.apache.stanbol.enhancer.engines.keywordextraction.caseSensitive"; |
| public static final String REDIRECT_FIELD = "org.apache.stanbol.enhancer.engines.keywordextraction.redirectField"; |
| public static final String REDIRECT_PROCESSING_MODE = "org.apache.stanbol.enhancer.engines.keywordextraction.redirectMode"; |
| public static final String MIN_SEARCH_TOKEN_LENGTH = "org.apache.stanbol.enhancer.engines.keywordextraction.minSearchTokenLength"; |
| public static final String MAX_SUGGESTIONS = "org.apache.stanbol.enhancer.engines.keywordextraction.maxSuggestions"; |
| public static final String PROCESSED_LANGUAGES = "org.apache.stanbol.enhancer.engines.keywordextraction.processedLanguages"; |
| public static final String MIN_FOUND_TOKENS= "org.apache.stanbol.enhancer.engines.keywordextraction.minFoundTokens"; |
| public static final String DEFAULT_MATCHING_LANGUAGE = "org.apache.stanbol.enhancer.engines.keywordextraction.defaultMatchingLanguage"; |
| public static final String MIN_POS_TAG_PROBABILITY = "org.apache.stanbol.enhancer.engines.keywordextraction.minPosTagProbability"; |
| public static final String TYPE_MAPPINGS = "org.apache.stanbol.enhancer.engines.keywordextraction.typeMappings"; |
| public static final String KEYWORD_TOKENIZER = "org.apache.stanbol.enhancer.engines.keywordextraction.keywordTokenizer"; |
| public static final String MIN_TOKEN_MATCH_FACTOR = "org.apache.stanbol.enhancer.engines.keywordextraction.minTokenMatchFactor"; |
| // public static final String ENABLE_CHUNKER = "org.apache.stanbol.enhancer.engines.keywordextraction.enableChunker"; |
| /** |
| * Adds the dereference feature (STANBOL-333) also to this engine. |
| * This will be replaced by STANBOL-336. |
| */ |
| public static final String DEREFERENCE_ENTITIES = "org.apache.stanbol.enhancer.engines.keywordextraction.dereference"; |
| /** |
| * The default state to dereference entities set to <code>true</code>. |
| */ |
| public static final boolean DEFAULT_DEREFERENCE_ENTITIES_STATE = true; |
| /** |
| * Additional fields added for dereferenced entities |
| */ |
| private static final Collection<String> DEREFERENCE_FIELDS = Arrays.asList( |
| getFullName("rdfs:comment"), |
| getFullName("geo:lat"), |
| getFullName("geo:long"), |
| getFullName("foaf:depiction"), |
| getFullName("dbp-ont:thumbnail")); |
| /** |
| * The dereferenceEntitiesState as set in {@link #activateEntityDereference(Dictionary)} |
| */ |
| private boolean dereferenceEntitiesState; |
| /** |
| * Default set of languages. This is an empty set indicating that texts in any |
| * language are processed. |
| */ |
| public static final Set<String> DEFAULT_LANGUAGES = Collections.emptySet(); |
| public static final double DEFAULT_MIN_POS_TAG_PROBABILITY = 0.6667; |
| /** |
| * The languages this engine is configured to enhance. An empty List is |
| * considered as active for any language |
| */ |
| private Set<String> languages = DEFAULT_LANGUAGES; |
| /** |
| * The literal representing the LangIDEngine as creator. |
| */ |
| public static final Literal LANG_ID_ENGINE_NAME = LiteralFactory.getInstance().createTypedLiteral("org.apache.stanbol.enhancer.engines.langid.LangIdEnhancementEngine"); |
| |
| /** |
| * The default value for the LIMIT of the {@link EntitySearcher} |
| */ |
| private static final int DEFAULT_ENTITY_SEARCHER_LIMIT = 10; |
| |
| private EntitySearcher entitySearcher; |
| private EntityLinkerConfig linkerConfig; |
| private TextAnalyzerConfig nlpConfig; |
| |
| /** |
| * The reference to the OpenNLP component |
| */ |
| @org.apache.felix.scr.annotations.Reference |
| private OpenNLP openNLP; |
| //TextAnalyzer was changed to have a scope of a single request ( call to |
| //#computeEnhancement! |
| //private TextAnalyzer textAnalyser; |
| /** |
| * Used to create {@link AnalysedContent} instances for parsed content items |
| */ |
| private OpenNlpAnalysedContentFactory analysedContentFactory; |
| /** |
| * The literalFactory used to create typed literals |
| */ |
| private LiteralFactory literalFactory = LiteralFactory.getInstance(); |
| |
| /** |
| * The {@link OfflineMode} is used by Stanbol to indicate that no external service should be referenced. |
| * For this engine that means it is necessary to check if the used {@link ReferencedSite} can operate |
| * offline or not. |
| * |
| * @see #enableOfflineMode(OfflineMode) |
| * @see #disableOfflineMode(OfflineMode) |
| */ |
| @org.apache.felix.scr.annotations.Reference( |
| cardinality = ReferenceCardinality.OPTIONAL_UNARY, |
| policy = ReferencePolicy.DYNAMIC, |
| bind = "enableOfflineMode", |
| unbind = "disableOfflineMode", |
| strategy = ReferenceStrategy.EVENT) |
| private OfflineMode offlineMode; |
| /** |
| * The name of the reference site ('local' or 'entityhub') if the |
| * Entityhub is used for enhancing |
| */ |
| protected String referencedSiteName; |
| |
| /** |
| * Called by the ConfigurationAdmin to bind the {@link #offlineMode} if the service becomes available |
| * |
| * @param mode |
| */ |
| protected final void enableOfflineMode(OfflineMode mode) { |
| this.offlineMode = mode; |
| } |
| |
| /** |
| * Called by the ConfigurationAdmin to unbind the {@link #offlineMode} if the service becomes unavailable |
| * |
| * @param mode |
| */ |
| protected final void disableOfflineMode(OfflineMode mode) { |
| this.offlineMode = null; |
| } |
| |
| /** |
| * Returns <code>true</code> only if Stanbol operates in {@link OfflineMode}. |
| * |
| * @return the offline state |
| */ |
| protected final boolean isOfflineMode() { |
| return offlineMode != null; |
| } |
| |
| /** |
| * Default constructor as used by OSGI. This expects that |
| * {@link #activate(ComponentContext)} is called before usage |
| */ |
| public KeywordLinkingEngine() { |
| } |
| /** |
| * Internal Constructor used by {@link #createInstance(OpenNLP, EntitySearcher, EntityLinkerConfig)} |
| * @param openNLP |
| * @param entitySearcher |
| * @param config |
| */ |
| protected KeywordLinkingEngine(OpenNLP openNLP,EntitySearcher entitySearcher, |
| TextAnalyzerConfig nlpConfig,EntityLinkerConfig linkingConfig){ |
| this.openNLP = openNLP; |
| this.linkerConfig = linkingConfig != null ? linkingConfig : new EntityLinkerConfig(); |
| this.nlpConfig = nlpConfig != null ? nlpConfig : new TextAnalyzerConfig(); |
| this.analysedContentFactory = OpenNlpAnalysedContentFactory.getInstance(openNLP,nlpConfig); |
| this.entitySearcher = entitySearcher; |
| } |
| /** |
| * Allows to create an instance that can be used outside of an OSGI |
| * environment. This is mainly intended for unit tests. |
| * @param openNLP The {@link OpenNLP} instance used for natural language processing |
| * @param entitySearcher the searcher used to lookup terms |
| * @param config the configuration or <code>null</code> to use the defaults |
| * @return the created engine instance |
| */ |
| public static KeywordLinkingEngine createInstance(OpenNLP openNLP, |
| EntitySearcher entitySearcher, |
| TextAnalyzerConfig nlpConfig, |
| EntityLinkerConfig linkingConfig){ |
| return new KeywordLinkingEngine(openNLP,entitySearcher,nlpConfig,linkingConfig); |
| } |
| |
| |
| /** |
| * Checks if the parsed language is enabled for processing. |
| * @param language The language to process |
| * @return the processing state for the parsed language. |
| */ |
| protected boolean isProcessableLanguages(String language) { |
| return languages.isEmpty() || languages.contains(language); |
| } |
| |
| @Override |
| public Map<String,Object> getServiceProperties() { |
| return Collections.unmodifiableMap(Collections.singletonMap( |
| ENHANCEMENT_ENGINE_ORDERING, |
| (Object) DEFAULT_ORDER)); |
| } |
| |
| @Override |
| public int canEnhance(ContentItem ci) throws EngineException { |
| if(ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES) != null){ |
| return ENHANCE_ASYNC; //KeywordLinking now supports async processing |
| } else { |
| return CANNOT_ENHANCE; |
| } |
| } |
| |
| @Override |
| public void computeEnhancements(ContentItem ci) throws EngineException { |
| if(isOfflineMode() && !entitySearcher.supportsOfflineMode()){ |
| throw new EngineException("Offline mode is not supported by the Component used to lookup Entities"); |
| } |
| Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES); |
| if(contentPart == null){ |
| throw new IllegalStateException("No ContentPart with a supported Mime Type" |
| + "found for ContentItem "+ci.getUri()+"(supported: '" |
| + SUPPORTED_MIMETYPES+"') -> this indicates that canEnhance was" |
| + "NOT called and indicates a bug in the used EnhancementJobManager!"); |
| } |
| String text; |
| try { |
| text = ContentItemHelper.getText(contentPart.getValue()); |
| } catch (IOException e) { |
| throw new InvalidContentException(String.format("Unable to extract " |
| +" text from ContentPart %s of ContentItem %s!", |
| contentPart.getKey(),ci.getUri()),e); |
| } |
| if (text.trim().length() == 0) { |
| // TODO: make the length of the data a field of the ContentItem |
| // interface to be able to filter out empty items in the canEnhance |
| // method |
| log.warn("ContentPart {} of ContentItem does not contain any Text to extract knowledge from", |
| contentPart.getKey(), ci); |
| return; |
| } |
| //Determine the language |
| String language; |
| ci.getLock().readLock().lock(); |
| try { |
| language = extractLanguage(ci); |
| } finally { |
| ci.getLock().readLock().unlock(); |
| } |
| if(isProcessableLanguages(language)){ |
| log.debug("computeEnhancements for ContentItem {} language {} text={}", |
| new Object []{ci.getUri().getUnicodeString(), language, StringUtils.abbreviate(text, 100)}); |
| |
| EntityLinker entityLinker = new EntityLinker( |
| analysedContentFactory.create(text, language), |
| entitySearcher, linkerConfig); |
| //process |
| entityLinker.process(); |
| //write results (requires a write lock) |
| ci.getLock().writeLock().lock(); |
| try { |
| writeEnhancements(ci, entityLinker.getLinkedEntities().values(), language); |
| } finally { |
| ci.getLock().writeLock().unlock(); |
| } |
| } else { |
| log.debug("ignore ContentItem {} because language '{}' is not configured to" + |
| "be processed by this engine.",ci.getUri().getUnicodeString(),language); |
| } |
| |
| } |
| |
| /** |
| * Writes the Enhancements for the {@link LinkedEntity LinkedEntities} |
| * extracted from the parsed ContentItem |
| * @param ci |
| * @param linkedEntities |
| * @param language |
| */ |
| private void writeEnhancements(ContentItem ci, Collection<LinkedEntity> linkedEntities, String language) { |
| Language languageObject = null; |
| if(language != null && !language.isEmpty()){ |
| languageObject = new Language(language); |
| } |
| MGraph metadata = ci.getMetadata(); |
| for(LinkedEntity linkedEntity : linkedEntities){ |
| Collection<UriRef> textAnnotations = new ArrayList<UriRef>(linkedEntity.getOccurrences().size()); |
| //first create the TextAnnotations for the Occurrences |
| for(Occurrence occurrence : linkedEntity.getOccurrences()){ |
| UriRef textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this); |
| textAnnotations.add(textAnnotation); |
| metadata.add(new TripleImpl(textAnnotation, |
| Properties.ENHANCER_START, |
| literalFactory.createTypedLiteral(occurrence.getStart()))); |
| metadata.add(new TripleImpl(textAnnotation, |
| Properties.ENHANCER_END, |
| literalFactory.createTypedLiteral(occurrence.getEnd()))); |
| metadata.add(new TripleImpl(textAnnotation, |
| Properties.ENHANCER_SELECTION_CONTEXT, |
| new PlainLiteralImpl(occurrence.getContext(),languageObject))); |
| metadata.add(new TripleImpl(textAnnotation, |
| Properties.ENHANCER_SELECTED_TEXT, |
| new PlainLiteralImpl(occurrence.getSelectedText(),languageObject))); |
| metadata.add(new TripleImpl(textAnnotation, |
| Properties.ENHANCER_CONFIDENCE, |
| literalFactory.createTypedLiteral(linkedEntity.getScore()))); |
| for(UriRef dcType : linkedEntity.getTypes()){ |
| metadata.add(new TripleImpl( |
| textAnnotation, Properties.DC_TYPE, dcType)); |
| } |
| } |
| //now the EntityAnnotations for the Suggestions |
| for(Suggestion suggestion : linkedEntity.getSuggestions()){ |
| UriRef entityAnnotation = EnhancementEngineHelper.createEntityEnhancement(ci, this); |
| //should we use the label used for the match, or search the |
| //representation for the best label ... currently its the matched one |
| Text label = suggestion.getBestLabel(linkerConfig.getNameField(),language); |
| metadata.add(new TripleImpl(entityAnnotation, |
| Properties.ENHANCER_ENTITY_LABEL, |
| label.getLanguage() == null ? |
| new PlainLiteralImpl(label.getText()) : |
| new PlainLiteralImpl(label.getText(), |
| new Language(label.getLanguage())))); |
| metadata.add(new TripleImpl(entityAnnotation, |
| Properties.ENHANCER_ENTITY_REFERENCE, |
| new UriRef(suggestion.getRepresentation().getId()))); |
| Iterator<Reference> suggestionTypes = suggestion.getRepresentation().getReferences(linkerConfig.getTypeField()); |
| while(suggestionTypes.hasNext()){ |
| metadata.add(new TripleImpl(entityAnnotation, |
| Properties.ENHANCER_ENTITY_TYPE, new UriRef(suggestionTypes.next().getReference()))); |
| } |
| metadata.add(new TripleImpl(entityAnnotation, |
| Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(suggestion.getScore()))); |
| for(UriRef textAnnotation : textAnnotations){ |
| metadata.add(new TripleImpl(entityAnnotation, |
| Properties.DC_RELATION, textAnnotation)); |
| } |
| //add the name of the ReferencedSite providing this suggestion |
| metadata.add(new TripleImpl(entityAnnotation, |
| new UriRef(RdfResourceEnum.site.getUri()), |
| new PlainLiteralImpl(referencedSiteName))); |
| //in case dereferencing of Entities is enabled we need also to |
| //add the RDF data for entities |
| if(dereferenceEntitiesState){ |
| metadata.addAll( |
| RdfValueFactory.getInstance().toRdfRepresentation( |
| suggestion.getRepresentation()).getRdfGraph()); |
| } |
| } |
| } |
| } |
| /** |
| * Extracts the language of the parsed ContentItem by using |
| * {@link EnhancementEngineHelper#getLanguage(ContentItem)} and "en" as |
| * default. |
| * @param ci the content item |
| * @return the language |
| */ |
| private String extractLanguage(ContentItem ci) { |
| String lang = EnhancementEngineHelper.getLanguage(ci); |
| // if(lang != null){ |
| // MGraph metadata = ci.getMetadata(); |
| // Iterator<Triple> langaugeEnhancementCreatorTriples = |
| // metadata.filter(null, Properties.DC_CREATOR, LANG_ID_ENGINE_NAME); |
| // if(langaugeEnhancementCreatorTriples.hasNext()){ |
| // String lang = EnhancementEngineHelper.getString(metadata, |
| // langaugeEnhancementCreatorTriples.next().getSubject(), |
| // Properties.DC_LANGUAGE); |
| if(lang != null){ |
| return lang; |
| } else { |
| log.warn("Unable to extract language for ContentItem %s! The Enhancement of the %s is missing the %s property", |
| new Object[]{ci.getUri().getUnicodeString(),LANG_ID_ENGINE_NAME.getLexicalForm(),Properties.DC_LANGUAGE}); |
| log.warn(" ... return 'en' as default"); |
| return "en"; |
| } |
| // } else { |
| // log.warn("Unable to extract language for ContentItem %s! Is the %s active?", |
| // ci.getUri().getUnicodeString(),LANG_ID_ENGINE_NAME.getLexicalForm()); |
| // log.warn(" ... return 'en' as default"); |
| // return "en"; |
| // } |
| } |
| |
| |
| /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - |
| * Methods for activate() and deactivate() the properties configureable via |
| * OSGI. |
| * |
| * NOTEs: |
| * Directly calling super.activate and super.deactivate |
| * is possible but might not be applicable in all cases. |
| * The activate**(...) and deactivate**() Methods are intended to be |
| * called by subclasses that need more control over the initialisation |
| * process. |
| * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - |
| */ |
| /** |
| * Activates this Engine. Subclasses should not call this method but rather |
| * call<ul> |
| * <li> {@link #activateEntitySearcher(ComponentContext, Dictionary)} |
| * <li> {@link #initEntityLinkerConfig(Dictionary, EntityLinkerConfig)} and |
| * <li> {@link #activateTextAnalyzerConfig(Dictionary)} |
| * <li> {@link #dereferenceEntitiesState} (needs to be called after |
| * {@link #initEntityLinkerConfig(Dictionary, EntityLinkerConfig)}) |
| * </ul> |
| * if applicable. |
| * @param context the Component context |
| * @throws ConfigurationException if the required {@link #REFERENCED_SITE_ID} |
| * configuration is missing or any of the other properties has an illegal value |
| */ |
| @Activate |
| @SuppressWarnings("unchecked") |
| protected void activate(ComponentContext context) throws ConfigurationException { |
| super.activate(context); |
| Dictionary<String,Object> properties = context.getProperties(); |
| activateTextAnalyzerConfig(properties); |
| activateEntitySearcher(context, properties); |
| activateEntityLinkerConfig(properties); |
| activateEntityDereference(properties); |
| } |
| |
| /** |
| * Inits the {@link #dereferenceEntitiesState} based on the |
| * {@link #DEREFERENCE_ENTITIES} configuration. |
| * @param properties the configuration |
| */ |
| protected final void activateEntityDereference(Dictionary<String,Object> properties) { |
| Object value = properties.get(DEREFERENCE_ENTITIES); |
| if(value instanceof Boolean){ |
| dereferenceEntitiesState = ((Boolean)value).booleanValue(); |
| } else if(value != null && !value.toString().isEmpty()){ |
| dereferenceEntitiesState = Boolean.parseBoolean(value.toString()); |
| } else { |
| dereferenceEntitiesState = DEFAULT_DEREFERENCE_ENTITIES_STATE; |
| } |
| if(dereferenceEntitiesState){ |
| linkerConfig.getSelectedFields().addAll(DEREFERENCE_FIELDS); |
| } |
| } |
| |
| /** |
| * Initialise the {@link TextAnalyzer} component.<p> |
| * Currently this includes the following configurations: <ul> |
| * <li>{@link #PROCESSED_LANGUAGES}: If no configuration is present the |
| * default (process all languages) is used. |
| * <li> {@value #MIN_POS_TAG_PROBABILITY}: If no configuration is |
| * present the #DEFAULT_MIN_POS_TAG_PROBABILITY is used |
| * languages based on the value of the |
| * |
| * @param configuration the OSGI component configuration |
| */ |
| protected final void activateTextAnalyzerConfig(Dictionary<String,Object> configuration) throws ConfigurationException { |
| nlpConfig = new TextAnalyzerConfig(); |
| Object value; |
| value = configuration.get(PROCESSED_LANGUAGES); |
| if(value == null){ |
| this.languages = DEFAULT_LANGUAGES; |
| } else if (value.toString().trim().isEmpty()){ |
| this.languages = Collections.emptySet(); |
| } else { |
| String[] languageArray = value.toString().split(","); |
| languages = new HashSet<String>(); |
| for(String language : languageArray){ |
| if(language != null){ |
| language = language.trim(); |
| if(!language.isEmpty()){ |
| languages.add(language); |
| } |
| } |
| } |
| } |
| value = configuration.get(MIN_POS_TAG_PROBABILITY); |
| double minPosTagProb; |
| if(value instanceof Number){ |
| minPosTagProb = ((Number)value).doubleValue(); |
| } else if(value != null && !value.toString().isEmpty()){ |
| try { |
| minPosTagProb = Double.valueOf(value.toString()); |
| } catch (NumberFormatException e) { |
| throw new ConfigurationException(MIN_POS_TAG_PROBABILITY, |
| "Unable to parse the min POS tag probability from the parsed value "+value,e); |
| } |
| } else { |
| minPosTagProb = DEFAULT_MIN_POS_TAG_PROBABILITY; |
| } |
| if(minPosTagProb > 1){ |
| throw new ConfigurationException(MIN_POS_TAG_PROBABILITY, |
| "The configured min POS tag probability MUST BE in the range [0..1] " + |
| "or < 0 to deactivate this feature (parsed value "+value+")!"); |
| } |
| nlpConfig.setMinPosTagProbability(minPosTagProb); |
| value = configuration.get(KEYWORD_TOKENIZER); |
| //the keyword tokenizer config |
| if(value instanceof Boolean){ |
| nlpConfig.forceKeywordTokenizer((Boolean)value); |
| } else if(value != null && !value.toString().isEmpty()){ |
| nlpConfig.forceKeywordTokenizer(Boolean.valueOf(value.toString())); |
| } |
| //nlpConfig.enablePosTypeChunker(false); |
| //nlpConfig.enableChunker(false); |
| analysedContentFactory = OpenNlpAnalysedContentFactory.getInstance(openNLP,nlpConfig); |
| } |
| |
| /** |
| * Configures the parsed {@link EntityLinkerConfig} with the values of the |
| * following properties:<ul> |
| * <li>{@link #NAME_FIELD} |
| * <li>{@link #TYPE_FIELD} |
| * <li>{@link #REDIRECT_FIELD} |
| * <li>{@link #REDIRECT_PROCESSING_MODE} |
| * <li>{@link #MAX_SUGGESTIONS} |
| * <li>{@link #MIN_SEARCH_TOKEN_LENGTH} |
| * <li>{@link #MIN_FOUND_TOKENS} |
| * <li> {@link #MIN_TOKEN_MATCH_FACTOR} |
| * </ul> |
| * This Method create an new {@link EntityLinkerConfig} instance only if |
| * <code>{@link #linkerConfig} == null</code>. If the instance is already initialised |
| * that all current values for keys missing in the parsed configuration are |
| * preserved. |
| * @param configuration the configuration |
| * @throws ConfigurationException In case of an illegal value in the parsed configuration. |
| * Note that all configuration are assumed as optional, therefore missing values will not |
| * case a ConfigurationException. |
| */ |
| protected void activateEntityLinkerConfig(Dictionary<String,Object> configuration) throws ConfigurationException { |
| if(linkerConfig == null){ |
| this.linkerConfig = new EntityLinkerConfig(); |
| } |
| Object value; |
| value = configuration.get(NAME_FIELD); |
| if(value != null){ |
| if(value.toString().isEmpty()){ |
| throw new ConfigurationException(NAME_FIELD,"The configured name field MUST NOT be empty"); |
| } |
| linkerConfig.setNameField(value.toString()); |
| } |
| //init case sensitivity |
| value = configuration.get(CASE_SENSITIVE); |
| if(value instanceof Boolean){ |
| linkerConfig.setCaseSensitiveMatchingState((Boolean)value); |
| } else if(value != null && !value.toString().isEmpty()){ |
| linkerConfig.setCaseSensitiveMatchingState(Boolean.valueOf(value.toString())); |
| } //if NULL or empty use default |
| //init TYPE_FIELD |
| value = configuration.get(TYPE_FIELD); |
| if(value != null){ |
| if(value.toString().isEmpty()){ |
| throw new ConfigurationException(TYPE_FIELD,"The configured name field MUST NOT be empty"); |
| } |
| linkerConfig.setTypeField(value.toString()); |
| } |
| //init REDIRECT_FIELD |
| value = configuration.get(REDIRECT_FIELD); |
| if(value != null){ |
| if(value.toString().isEmpty()){ |
| throw new ConfigurationException(NAME_FIELD,"The configured name field MUST NOT be empty"); |
| } |
| linkerConfig.setRedirectField(value.toString()); |
| } |
| //init MAX_SUGGESTIONS |
| value = configuration.get(MAX_SUGGESTIONS); |
| Integer maxSuggestions; |
| if(value instanceof Integer){ |
| maxSuggestions = (Integer)value; |
| } else if (value != null){ |
| try { |
| maxSuggestions = Integer.valueOf(value.toString()); |
| } catch(NumberFormatException e){ |
| throw new ConfigurationException(MAX_SUGGESTIONS, "Values MUST be valid Integer values > 0",e); |
| } |
| } else { |
| maxSuggestions = null; |
| } |
| if(maxSuggestions != null){ |
| if(maxSuggestions < 1){ |
| throw new ConfigurationException(MAX_SUGGESTIONS, "Values MUST be valid Integer values > 0"); |
| } |
| linkerConfig.setMaxSuggestions(maxSuggestions); |
| } |
| //init MIN_FOUND_TOKENS |
| value = configuration.get(MIN_FOUND_TOKENS); |
| Integer minFoundTokens; |
| if(value instanceof Integer){ |
| minFoundTokens = (Integer)value; |
| } else if(value != null){ |
| try { |
| minFoundTokens = Integer.valueOf(value.toString()); |
| } catch(NumberFormatException e){ |
| throw new ConfigurationException(MIN_FOUND_TOKENS, "Values MUST be valid Integer values > 0",e); |
| } |
| } else { |
| minFoundTokens = null; |
| } |
| if(minFoundTokens != null){ |
| if(minFoundTokens < 1){ |
| throw new ConfigurationException(MIN_FOUND_TOKENS, "Values MUST be valid Integer values > 0"); |
| } |
| linkerConfig.setMinFoundTokens(minFoundTokens); |
| } |
| // init MIN_SEARCH_TOKEN_LENGTH |
| value = configuration.get(MIN_SEARCH_TOKEN_LENGTH); |
| Integer minSearchTokenLength; |
| if(value instanceof Integer){ |
| minSearchTokenLength = (Integer)value; |
| } else if (value != null){ |
| try { |
| minSearchTokenLength = Integer.valueOf(value.toString()); |
| } catch(NumberFormatException e){ |
| throw new ConfigurationException(MIN_SEARCH_TOKEN_LENGTH, "Values MUST be valid Integer values > 0",e); |
| } |
| } else { |
| minSearchTokenLength = null; |
| } |
| if(minSearchTokenLength != null){ |
| if(minSearchTokenLength < 1){ |
| throw new ConfigurationException(MIN_SEARCH_TOKEN_LENGTH, "Values MUST be valid Integer values > 0"); |
| } |
| linkerConfig.setMinSearchTokenLength(minSearchTokenLength); |
| } |
| //init the REDIRECT_PROCESSING_MODE |
| value = configuration.get(REDIRECT_PROCESSING_MODE); |
| if(value != null){ |
| try { |
| linkerConfig.setRedirectProcessingMode(RedirectProcessingMode.valueOf(value.toString())); |
| } catch (IllegalArgumentException e) { |
| throw new ConfigurationException(REDIRECT_PROCESSING_MODE, "Values MUST be one of "+ |
| Arrays.toString(RedirectProcessingMode.values())); |
| } |
| } |
| //init the DEFAULT_LANGUAGE |
| value = configuration.get(DEFAULT_MATCHING_LANGUAGE); |
| if(value != null){ |
| String defaultLang = value.toString().trim(); |
| if(defaultLang.isEmpty()){ |
| linkerConfig.setDefaultLanguage(null); |
| } else if(defaultLang.length() == 1){ |
| throw new ConfigurationException(DEFAULT_MATCHING_LANGUAGE, "Illegal language code '"+ |
| defaultLang+"'! Language Codes MUST BE at least 2 chars long."); |
| } else { |
| linkerConfig.setDefaultLanguage(defaultLang); |
| } |
| } |
| // init MIN_TOKEN_MATCH_FACTOR |
| value=configuration.get(MIN_TOKEN_MATCH_FACTOR); |
| float minTokenMatchFactor; |
| if(value instanceof Number){ |
| minTokenMatchFactor = ((Number)value).floatValue(); |
| } else if(value != null){ |
| try { |
| minTokenMatchFactor = Float.valueOf(value.toString()); |
| } catch (NumberFormatException e) { |
| throw new ConfigurationException(MIN_TOKEN_MATCH_FACTOR, |
| "Unable to parse the minimum token match factor from the parsed value "+value,e); |
| } |
| if(minTokenMatchFactor < 0){ |
| minTokenMatchFactor = EntityLinkerConfig.DEFAULT_MIN_TOKEN_MATCH_FACTOR; |
| } |
| } else { |
| minTokenMatchFactor = EntityLinkerConfig.DEFAULT_MIN_TOKEN_MATCH_FACTOR; |
| } |
| if(minTokenMatchFactor == 0 || minTokenMatchFactor > 1){ |
| throw new ConfigurationException(MIN_TOKEN_MATCH_FACTOR, |
| "The minimum token match factor MUST be > 0 and <= 1 (negative values for the default)"); |
| } |
| linkerConfig.setMinTokenMatchFactor(minTokenMatchFactor); |
| |
| //init type mappings |
| value = configuration.get(TYPE_MAPPINGS); |
| if(value instanceof String[]){ //support array |
| value = Arrays.asList((String[])value); |
| } else if(value instanceof String) { //single value |
| value = Collections.singleton(value); |
| } |
| if(value instanceof Collection<?>){ //and collection |
| log.info("Init Type Mappings"); |
| configs : |
| for(Object o : (Iterable<?>)value){ |
| if(o != null){ |
| StringBuilder usage = new StringBuilder("useages: "); |
| usage.append("a: '{uri}' short for {uri} > {uri} | "); |
| usage.append("b: '{source1};{source2};..;{sourceN} > {target}'"); |
| String[] config = o.toString().split(">"); |
| if(config[0].isEmpty()){ |
| log.warn("Invalid Type Mapping Config '{}': Missing Source Type ({}) -> ignore this config", |
| o,usage); |
| continue configs; |
| } |
| String[] sourceTypes = config[0].split(";"); |
| if(sourceTypes.length > 1 && (config.length < 2 || config[1].isEmpty())){ |
| log.warn("Invalid Type Mapping Config '{}': Missing Target Type '{}' ({}) -> ignore this config", |
| o,usage); |
| continue configs; |
| } |
| String targetType = config.length < 2 ? sourceTypes[0] : config[1]; |
| targetType = getFullName(targetType.trim()); //support for ns:localName |
| try { //validate |
| new URI(targetType); |
| } catch (URISyntaxException e) { |
| log.warn("Invalid URI '{}' in Type Mapping Config '{}' -> ignore this config", |
| sourceTypes[0],o); |
| continue configs; |
| } |
| UriRef targetUri = new UriRef(targetType); |
| for(String sourceType : sourceTypes){ |
| if(!sourceType.isEmpty()){ |
| sourceType = getFullName(sourceType.trim()); //support for ns:localName |
| try { //validate |
| new URI(sourceType); |
| UriRef old = linkerConfig.setTypeMapping(sourceType, targetUri); |
| if(old == null){ |
| log.info(" > add type mapping {} > {}", sourceType,targetType); |
| } else { |
| log.info(" > set type mapping {} > {} (old: {})", |
| new Object[]{sourceType,targetType,old.getUnicodeString()}); |
| } |
| } catch (URISyntaxException e) { |
| log.warn("Invalid URI '{}' in Type Mapping Config '{}' -> ignore this source type", |
| sourceTypes[0],o); |
| } |
| } |
| } |
| } |
| } |
| } else { |
| log.debug("No Type mappings configured"); |
| } |
| } |
| |
| /** |
| * Initialise the {@link #entitySearcher} based on the value of the |
| * {@link #REFERENCED_SITE_ID} property in the parsed configuration |
| * @param context |
| * @param configuration |
| * @throws ConfigurationException |
| */ |
| protected void activateEntitySearcher(ComponentContext context, Dictionary<String,Object> configuration) throws ConfigurationException { |
| Object value = configuration.get(REFERENCED_SITE_ID); |
| //init the EntitySource |
| if (value == null) { |
| throw new ConfigurationException(REFERENCED_SITE_ID, |
| "The ID of the Referenced Site is a required Parameter and MUST NOT be NULL!"); |
| } |
| referencedSiteName = value.toString(); |
| if (referencedSiteName.isEmpty()) { |
| throw new ConfigurationException(REFERENCED_SITE_ID, |
| "The ID of the Referenced Site is a required Parameter and MUST NOT be an empty String!"); |
| } |
| //TODO: make limit configurable! |
| if(Entityhub.ENTITYHUB_IDS.contains(referencedSiteName.toLowerCase())){ |
| entitySearcher = new EntityhubSearcher(context.getBundleContext(),DEFAULT_ENTITY_SEARCHER_LIMIT); |
| } else { |
| entitySearcher = new ReferencedSiteSearcher(context.getBundleContext(),referencedSiteName,DEFAULT_ENTITY_SEARCHER_LIMIT); |
| } |
| } |
| /** |
| * Deactivates this Engine. Subclasses should not call this method but rather |
| * call<ul> |
 * <li> {@link #deactivateEntitySearcher()}
 * <li> {@link #deactivateTextAnalyzerConfig()}
 * <li> {@link #deactivateEntityLinkerConfig()} and
 * <li> {@link #deactivateEntityDereference()}
| * </ul> |
| * @param context the context (not used) |
| */ |
    @Deactivate
    protected void deactivate(ComponentContext context) {
        //let the base class release its resources first
        super.deactivate(context);
        //then reset all state initialised by the activate* methods
        deactivateEntitySearcher();
        deactivateTextAnalyzerConfig();
        deactivateEntityLinkerConfig();
        deactivateEntityDereference();
    }
| /** |
| * Resets the {@link #dereferenceEntitiesState} to |
| * {@link #DEFAULT_DEREFERENCE_ENTITIES_STATE} |
| */ |
    protected final void deactivateEntityDereference() {
        //no resources to release - just restore the default state
        dereferenceEntitiesState = DEFAULT_DEREFERENCE_ENTITIES_STATE;
    }
| |
| /** |
| * Deactivates the {@link TextAnalyzer} as well as resets the set of languages |
| * to process to {@link #DEFAULT_LANGUAGES} |
| */ |
| protected void deactivateTextAnalyzerConfig() { |
| this.nlpConfig = null; |
| this.analysedContentFactory = null; |
| languages = DEFAULT_LANGUAGES; |
| } |
| |
| /** |
| * sets the {@link EntityLinkerConfig} to <code>null</code> |
| */ |
    protected void deactivateEntityLinkerConfig() {
        //a new instance is created on the next call to activateEntityLinkerConfig
        linkerConfig = null;
    }
| |
| /** |
| * Closes and resets the EntitySearcher. Also calls |
| * {@link TrackingEntitySearcher#close()} if applicable. |
| */ |
| protected void deactivateEntitySearcher() { |
| if(entitySearcher instanceof TrackingEntitySearcher<?>){ |
| //close tracking EntitySearcher |
| ((TrackingEntitySearcher<?>)entitySearcher).close(); |
| } |
| entitySearcher = null; |
| referencedSiteName = null; |
| } |
| } |