blob: 98cf07914f88adcc557dce6bd5a8e56f3d10fa21 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.keywordextraction.engine;
import static org.apache.stanbol.entityhub.servicesapi.defaults.NamespaceEnum.getFullName;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Dictionary;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.clerezza.rdf.core.Language;
import org.apache.clerezza.rdf.core.Literal;
import org.apache.clerezza.rdf.core.LiteralFactory;
import org.apache.clerezza.rdf.core.MGraph;
import org.apache.clerezza.rdf.core.Triple;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
import org.apache.clerezza.rdf.core.impl.TripleImpl;
import org.apache.commons.lang.StringUtils;
import org.apache.felix.scr.annotations.Activate;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.ConfigurationPolicy;
import org.apache.felix.scr.annotations.Deactivate;
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.PropertyOption;
import org.apache.felix.scr.annotations.ReferenceCardinality;
import org.apache.felix.scr.annotations.ReferencePolicy;
import org.apache.felix.scr.annotations.ReferenceStrategy;
import org.apache.felix.scr.annotations.Service;
import org.apache.stanbol.commons.opennlp.OpenNLP;
import org.apache.stanbol.commons.opennlp.TextAnalyzer;
import org.apache.stanbol.commons.opennlp.TextAnalyzer.TextAnalyzerConfig;
import org.apache.stanbol.commons.stanboltools.offline.OfflineMode;
import org.apache.stanbol.enhancer.engines.keywordextraction.linking.AnalysedContent;
import org.apache.stanbol.enhancer.engines.keywordextraction.linking.EntityLinker;
import org.apache.stanbol.enhancer.engines.keywordextraction.linking.EntityLinkerConfig;
import org.apache.stanbol.enhancer.engines.keywordextraction.linking.EntitySearcher;
import org.apache.stanbol.enhancer.engines.keywordextraction.linking.LinkedEntity;
import org.apache.stanbol.enhancer.engines.keywordextraction.linking.Suggestion;
import org.apache.stanbol.enhancer.engines.keywordextraction.linking.EntityLinkerConfig.RedirectProcessingMode;
import org.apache.stanbol.enhancer.engines.keywordextraction.linking.LinkedEntity.Occurrence;
import org.apache.stanbol.enhancer.engines.keywordextraction.linking.impl.EntityhubSearcher;
import org.apache.stanbol.enhancer.engines.keywordextraction.linking.impl.OpenNlpAnalysedContentFactory;
import org.apache.stanbol.enhancer.engines.keywordextraction.linking.impl.ReferencedSiteSearcher;
import org.apache.stanbol.enhancer.engines.keywordextraction.linking.impl.TrackingEntitySearcher;
import org.apache.stanbol.enhancer.servicesapi.Blob;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
import org.apache.stanbol.entityhub.model.clerezza.RdfValueFactory;
import org.apache.stanbol.entityhub.servicesapi.Entityhub;
import org.apache.stanbol.entityhub.servicesapi.defaults.NamespaceEnum;
import org.apache.stanbol.entityhub.servicesapi.model.Reference;
import org.apache.stanbol.entityhub.servicesapi.model.Text;
import org.apache.stanbol.entityhub.servicesapi.model.rdf.RdfResourceEnum;
import org.osgi.framework.Constants;
import org.osgi.service.cm.ConfigurationException;
import org.osgi.service.component.ComponentContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@Component(
configurationFactory = true,
policy = ConfigurationPolicy.REQUIRE, // the baseUri is required!
specVersion = "1.1",
metatype = true,
immediate = true,
inherit = true)
@Service
@org.apache.felix.scr.annotations.Properties(value={
@Property(name=EnhancementEngine.PROPERTY_NAME),
@Property(name=KeywordLinkingEngine.REFERENCED_SITE_ID),
@Property(name=KeywordLinkingEngine.NAME_FIELD,value=EntityLinkerConfig.DEFAULT_NAME_FIELD),
@Property(name=KeywordLinkingEngine.CASE_SENSITIVE,boolValue=EntityLinkerConfig.DEFAULT_CASE_SENSITIVE_MATCHING_STATE),
@Property(name=KeywordLinkingEngine.TYPE_FIELD,value=EntityLinkerConfig.DEFAULT_TYPE_FIELD),
@Property(name=KeywordLinkingEngine.REDIRECT_FIELD,value=EntityLinkerConfig.DEFAULT_REDIRECT_FIELD),
@Property(name=KeywordLinkingEngine.REDIRECT_PROCESSING_MODE,options={
@PropertyOption(
value='%'+KeywordLinkingEngine.REDIRECT_PROCESSING_MODE+".option.ignore",
name="IGNORE"),
@PropertyOption(
value='%'+KeywordLinkingEngine.REDIRECT_PROCESSING_MODE+".option.addValues",
name="ADD_VALUES"),
@PropertyOption(
value='%'+KeywordLinkingEngine.REDIRECT_PROCESSING_MODE+".option.follow",
name="FOLLOW")
},value="IGNORE"),
@Property(name=KeywordLinkingEngine.MIN_SEARCH_TOKEN_LENGTH,
intValue=EntityLinkerConfig.DEFAULT_MIN_SEARCH_TOKEN_LENGTH),
@Property(name=KeywordLinkingEngine.MIN_TOKEN_MATCH_FACTOR,floatValue=
EntityLinkerConfig.DEFAULT_MIN_TOKEN_MATCH_FACTOR),
@Property(name=KeywordLinkingEngine.KEYWORD_TOKENIZER,boolValue=false),
@Property(name=KeywordLinkingEngine.MAX_SUGGESTIONS,
intValue=EntityLinkerConfig.DEFAULT_SUGGESTIONS),
@Property(name=KeywordLinkingEngine.PROCESSED_LANGUAGES,value=""),
@Property(name=KeywordLinkingEngine.DEFAULT_MATCHING_LANGUAGE,value=""),
@Property(name=KeywordLinkingEngine.TYPE_MAPPINGS,cardinality=1000),
@Property(name=KeywordLinkingEngine.DEREFERENCE_ENTITIES,
boolValue=KeywordLinkingEngine.DEFAULT_DEREFERENCE_ENTITIES_STATE),
@Property(name=Constants.SERVICE_RANKING,intValue=0)
})
public class KeywordLinkingEngine
extends AbstractEnhancementEngine<RuntimeException,RuntimeException>
implements EnhancementEngine, ServiceProperties {
private final Logger log = LoggerFactory.getLogger(KeywordLinkingEngine.class);
/**
 * This is used to check the content type of parsed {@link ContentItem}s for
 * plain text
 */
protected static final String TEXT_PLAIN_MIMETYPE = "text/plain";
/**
 * Contains the only supported mime type {@link #TEXT_PLAIN_MIMETYPE}
 */
protected static final Set<String> SUPPORTED_MIMETYPES = Collections.singleton(TEXT_PLAIN_MIMETYPE);
/**
 * The default value for the Execution of this Engine.
 * This Engine creates TextAnnotations that should not be processed by other Engines.
 * Therefore it uses a lower rank than {@link ServiceProperties#ORDERING_DEFAULT}
 * to ensure that other engines do not get confused
 */
public static final Integer DEFAULT_ORDER = ServiceProperties.ORDERING_DEFAULT - 10;
//OSGI configuration property keys (see also the @Property annotations on this class)
/** Config key: id of the Entityhub ReferencedSite used to search and link entities */
public static final String REFERENCED_SITE_ID = "org.apache.stanbol.enhancer.engines.keywordextraction.referencedSiteId";
/** Config key: the field holding the labels used for matching */
public static final String NAME_FIELD = "org.apache.stanbol.enhancer.engines.keywordextraction.nameField";
/** Config key: the field holding the types of the entities */
public static final String TYPE_FIELD = "org.apache.stanbol.enhancer.engines.keywordextraction.typeField";
/** Config key: whether label matching is case sensitive */
public static final String CASE_SENSITIVE = "org.apache.stanbol.enhancer.engines.keywordextraction.caseSensitive";
/** Config key: the field holding redirect links (e.g. rdfs:seeAlso) */
public static final String REDIRECT_FIELD = "org.apache.stanbol.enhancer.engines.keywordextraction.redirectField";
/** Config key: how redirects are processed (IGNORE, ADD_VALUES or FOLLOW) */
public static final String REDIRECT_PROCESSING_MODE = "org.apache.stanbol.enhancer.engines.keywordextraction.redirectMode";
/** Config key: minimum number of chars a token needs to be used for searches */
public static final String MIN_SEARCH_TOKEN_LENGTH = "org.apache.stanbol.enhancer.engines.keywordextraction.minSearchTokenLength";
/** Config key: maximum number of suggestions written per linked entity */
public static final String MAX_SUGGESTIONS = "org.apache.stanbol.enhancer.engines.keywordextraction.maxSuggestions";
/** Config key: comma separated list of processed languages (empty = all) */
public static final String PROCESSED_LANGUAGES = "org.apache.stanbol.enhancer.engines.keywordextraction.processedLanguages";
/** Config key: minimum number of matched tokens to accept a suggestion */
public static final String MIN_FOUND_TOKENS= "org.apache.stanbol.enhancer.engines.keywordextraction.minFoundTokens";
/** Config key: language used for matching when the text language is unknown */
public static final String DEFAULT_MATCHING_LANGUAGE = "org.apache.stanbol.enhancer.engines.keywordextraction.defaultMatchingLanguage";
/** Config key: minimum confidence of POS tags to be considered */
public static final String MIN_POS_TAG_PROBABILITY = "org.apache.stanbol.enhancer.engines.keywordextraction.minPosTagProbability";
/** Config key: mappings of entity types to dc:type values of TextAnnotations */
public static final String TYPE_MAPPINGS = "org.apache.stanbol.enhancer.engines.keywordextraction.typeMappings";
/** Config key: forces the use of the simple keyword tokenizer */
public static final String KEYWORD_TOKENIZER = "org.apache.stanbol.enhancer.engines.keywordextraction.keywordTokenizer";
/** Config key: minimum factor tokens of a label need to match */
public static final String MIN_TOKEN_MATCH_FACTOR = "org.apache.stanbol.enhancer.engines.keywordextraction.minTokenMatchFactor";
// public static final String ENABLE_CHUNKER = "org.apache.stanbol.enhancer.engines.keywordextraction.enableChunker";
/**
 * Adds the dereference feature (STANBOL-333) also to this engine.
 * This will be replaced by STANBOL-336.
 */
public static final String DEREFERENCE_ENTITIES = "org.apache.stanbol.enhancer.engines.keywordextraction.dereference";
/**
 * The default state to dereference entities set to <code>true</code>.
 */
public static final boolean DEFAULT_DEREFERENCE_ENTITIES_STATE = true;
/**
 * Additional fields added for dereferenced entities
 */
private static final Collection<String> DEREFERENCE_FIELDS = Arrays.asList(
getFullName("rdfs:comment"),
getFullName("geo:lat"),
getFullName("geo:long"),
getFullName("foaf:depiction"),
getFullName("dbp-ont:thumbnail"));
/**
 * The dereferenceEntitiesState as set in {@link #activateEntityDereference(Dictionary)}
 */
private boolean dereferenceEntitiesState;
/**
 * Default set of languages. This is an empty set indicating that texts in any
 * language are processed.
 */
public static final Set<String> DEFAULT_LANGUAGES = Collections.emptySet();
/** Default for {@link #MIN_POS_TAG_PROBABILITY} (2/3 confidence). */
public static final double DEFAULT_MIN_POS_TAG_PROBABILITY = 0.6667;
/**
 * The languages this engine is configured to enhance. An empty List is
 * considered as active for any language
 */
private Set<String> languages = DEFAULT_LANGUAGES;
/**
 * The literal representing the LangIDEngine as creator.
 */
public static final Literal LANG_ID_ENGINE_NAME = LiteralFactory.getInstance().createTypedLiteral("org.apache.stanbol.enhancer.engines.langid.LangIdEnhancementEngine");
/**
 * The default value for the LIMIT of the {@link EntitySearcher}
 */
private static final int DEFAULT_ENTITY_SEARCHER_LIMIT = 10;
// the component used to lookup entities (set in activateEntitySearcher)
private EntitySearcher entitySearcher;
// the entity linking configuration (set in activateEntityLinkerConfig)
private EntityLinkerConfig linkerConfig;
// the NLP configuration (set in activateTextAnalyzerConfig)
private TextAnalyzerConfig nlpConfig;
/**
 * The reference to the OpenNLP component
 */
@org.apache.felix.scr.annotations.Reference
private OpenNLP openNLP;
//TextAnalyzer was changed to have a scope of a single request ( call to
//#computeEnhancement!
//private TextAnalyzer textAnalyser;
/**
 * Used to create {@link AnalysedContent} instances for parsed content items
 */
private OpenNlpAnalysedContentFactory analysedContentFactory;
/**
 * The literalFactory used to create typed literals
 */
private LiteralFactory literalFactory = LiteralFactory.getInstance();
/**
 * The {@link OfflineMode} is used by Stanbol to indicate that no external service should be referenced.
 * For this engine that means it is necessary to check if the used {@link ReferencedSite} can operate
 * offline or not.
 *
 * @see #enableOfflineMode(OfflineMode)
 * @see #disableOfflineMode(OfflineMode)
 */
@org.apache.felix.scr.annotations.Reference(
cardinality = ReferenceCardinality.OPTIONAL_UNARY,
policy = ReferencePolicy.DYNAMIC,
bind = "enableOfflineMode",
unbind = "disableOfflineMode",
strategy = ReferenceStrategy.EVENT)
private OfflineMode offlineMode;
/**
 * The name of the reference site ('local' or 'entityhub') if the
 * Entityhub is used for enhancing
 */
protected String referencedSiteName;
/**
 * Bind method for the {@link #offlineMode} reference: called by the OSGi
 * service component runtime (see the <code>bind</code> attribute of the
 * {@literal @}Reference annotation on {@link #offlineMode}) when an
 * {@link OfflineMode} service becomes available.
 *
 * @param mode the {@link OfflineMode} service that became available
 */
protected final void enableOfflineMode(OfflineMode mode) {
this.offlineMode = mode;
}
/**
 * Unbind method for the {@link #offlineMode} reference: called by the OSGi
 * service component runtime (see the <code>unbind</code> attribute of the
 * {@literal @}Reference annotation on {@link #offlineMode}) when the
 * {@link OfflineMode} service becomes unavailable.
 *
 * @param mode the {@link OfflineMode} service that went away (ignored)
 */
protected final void disableOfflineMode(OfflineMode mode) {
this.offlineMode = null;
}
/**
 * Returns <code>true</code> only if Stanbol operates in {@link OfflineMode}.
 * This is the case exactly while an {@link OfflineMode} service is bound
 * (see {@link #enableOfflineMode(OfflineMode)}).
 *
 * @return the offline state
 */
protected final boolean isOfflineMode() {
return offlineMode != null;
}
/**
 * Default constructor as used by OSGI. This expects that
 * {@link #activate(ComponentContext)} is called before usage
 */
public KeywordLinkingEngine() {
//intentionally empty: all initialisation happens in activate(..)
}
/**
 * Internal Constructor used by
 * {@link #createInstance(OpenNLP, EntitySearcher, TextAnalyzerConfig, EntityLinkerConfig)}.
 * Falls back to default configurations if <code>null</code> is parsed for
 * the NLP or the entity linking configuration.
 * @param openNLP the OpenNLP component used for natural language processing
 * @param entitySearcher the component used to lookup entities
 * @param nlpConfig the text analyzer configuration or <code>null</code> to use the defaults
 * @param linkingConfig the entity linking configuration or <code>null</code> to use the defaults
 */
protected KeywordLinkingEngine(OpenNLP openNLP,EntitySearcher entitySearcher,
TextAnalyzerConfig nlpConfig,EntityLinkerConfig linkingConfig){
this.openNLP = openNLP;
this.linkerConfig = linkingConfig != null ? linkingConfig : new EntityLinkerConfig();
this.nlpConfig = nlpConfig != null ? nlpConfig : new TextAnalyzerConfig();
//NOTE: use the field and NOT the parameter here, because the parameter may
//be null while the field is guaranteed to hold a (default) configuration
this.analysedContentFactory = OpenNlpAnalysedContentFactory.getInstance(openNLP,this.nlpConfig);
this.entitySearcher = entitySearcher;
}
/**
 * Allows to create an instance that can be used outside of an OSGI
 * environment. This is mainly intended for unit tests.
 * @param openNLP The {@link OpenNLP} instance used for natural language processing
 * @param entitySearcher the searcher used to lookup terms
 * @param nlpConfig the text analyzer configuration or <code>null</code> to use the defaults
 * @param linkingConfig the entity linking configuration or <code>null</code> to use the defaults
 * @return the created engine instance
 */
public static KeywordLinkingEngine createInstance(OpenNLP openNLP,
EntitySearcher entitySearcher,
TextAnalyzerConfig nlpConfig,
EntityLinkerConfig linkingConfig){
return new KeywordLinkingEngine(openNLP,entitySearcher,nlpConfig,linkingConfig);
}
/**
 * Tells whether content in the given language should be processed by this
 * engine. An empty {@link #languages} configuration activates processing
 * for any language.
 * @param language The language to check
 * @return <code>true</code> if content in this language is processed
 */
protected boolean isProcessableLanguages(String language) {
if(languages.isEmpty()){ //empty configuration -> process all languages
return true;
}
return languages.contains(language);
}
/**
 * Returns the service properties containing the
 * {@link ServiceProperties#ENHANCEMENT_ENGINE_ORDERING} set to
 * {@link #DEFAULT_ORDER}.
 * @return an immutable map with the engine ordering
 */
@Override
public Map<String,Object> getServiceProperties() {
//Collections.singletonMap(..) is already immutable - no need to
//additionally wrap it with Collections.unmodifiableMap(..)
return Collections.singletonMap(
ENHANCEMENT_ENGINE_ORDERING,
(Object) DEFAULT_ORDER);
}
/**
 * This engine can (asynchronously) enhance any ContentItem that provides a
 * content part with a "text/plain" mime type.
 * @param ci the content item to check
 * @return {@link #ENHANCE_ASYNC} if a plain text blob is present, otherwise
 * {@link #CANNOT_ENHANCE}
 */
@Override
public int canEnhance(ContentItem ci) throws EngineException {
//KeywordLinking now supports async processing
return ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES) == null
? CANNOT_ENHANCE : ENHANCE_ASYNC;
}
/**
 * Computes the enhancements for the parsed ContentItem: (1) checks the
 * offline mode constraints, (2) reads the plain text and the language of
 * the item, (3) runs the {@link EntityLinker} over the analysed text and
 * (4) writes the results under the items write lock.
 * @param ci the content item to enhance
 * @throws EngineException if the component used to lookup entities does not
 * support offline mode while Stanbol operates offline
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
if(isOfflineMode() && !entitySearcher.supportsOfflineMode()){
throw new EngineException("Offline mode is not supported by the Component used to lookup Entities");
}
Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
if(contentPart == null){
throw new IllegalStateException("No ContentPart with a supported Mime Type "
+ "found for ContentItem "+ci.getUri()+" (supported: '"
+ SUPPORTED_MIMETYPES+"') -> this indicates that canEnhance was "
+ "NOT called and indicates a bug in the used EnhancementJobManager!");
}
String text;
try {
text = ContentItemHelper.getText(contentPart.getValue());
} catch (IOException e) {
throw new InvalidContentException(String.format("Unable to extract "
+"text from ContentPart %s of ContentItem %s!",
contentPart.getKey(),ci.getUri()),e);
}
if (text.trim().length() == 0) {
// TODO: make the length of the data a field of the ContentItem
// interface to be able to filter out empty items in the canEnhance
// method
log.warn("ContentPart {} of ContentItem {} does not contain any Text to extract knowledge from",
contentPart.getKey(), ci.getUri());
return;
}
//Determine the language (reading the metadata requires a read lock)
String language;
ci.getLock().readLock().lock();
try {
language = extractLanguage(ci);
} finally {
ci.getLock().readLock().unlock();
}
if(isProcessableLanguages(language)){
log.debug("computeEnhancements for ContentItem {} language {} text={}",
new Object []{ci.getUri().getUnicodeString(), language, StringUtils.abbreviate(text, 100)});
EntityLinker entityLinker = new EntityLinker(
analysedContentFactory.create(text, language),
entitySearcher, linkerConfig);
//process
entityLinker.process();
//write results (requires a write lock)
ci.getLock().writeLock().lock();
try {
writeEnhancements(ci, entityLinker.getLinkedEntities().values(), language);
} finally {
ci.getLock().writeLock().unlock();
}
} else {
log.debug("ignore ContentItem {} because language '{}' is not configured to " +
"be processed by this engine.",ci.getUri().getUnicodeString(),language);
}
}
/**
 * Writes the Enhancements for the {@link LinkedEntity LinkedEntities}
 * extracted from the parsed ContentItem: one fise:TextAnnotation per
 * {@link Occurrence} and one fise:EntityAnnotation per {@link Suggestion},
 * the latter linked (dc:relation) to all TextAnnotations of the entity.<p>
 * NOTE: callers are expected to hold the write lock of the ContentItem
 * (see {@link #computeEnhancements(ContentItem)}).
 * @param ci the content item the enhancements are added to
 * @param linkedEntities the entities linked in the analysed text
 * @param language the language of the text or <code>null</code> if unknown
 */
private void writeEnhancements(ContentItem ci, Collection<LinkedEntity> linkedEntities, String language) {
Language languageObject = null;
if(language != null && !language.isEmpty()){
languageObject = new Language(language);
}
MGraph metadata = ci.getMetadata();
for(LinkedEntity linkedEntity : linkedEntities){
//collected so the EntityAnnotations can link (dc:relation) to them
Collection<UriRef> textAnnotations = new ArrayList<UriRef>(linkedEntity.getOccurrences().size());
//first create the TextAnnotations for the Occurrences
for(Occurrence occurrence : linkedEntity.getOccurrences()){
UriRef textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
textAnnotations.add(textAnnotation);
//character offsets of the mention within the text
metadata.add(new TripleImpl(textAnnotation,
Properties.ENHANCER_START,
literalFactory.createTypedLiteral(occurrence.getStart())));
metadata.add(new TripleImpl(textAnnotation,
Properties.ENHANCER_END,
literalFactory.createTypedLiteral(occurrence.getEnd())));
//context and selected text are language tagged plain literals
metadata.add(new TripleImpl(textAnnotation,
Properties.ENHANCER_SELECTION_CONTEXT,
new PlainLiteralImpl(occurrence.getContext(),languageObject)));
metadata.add(new TripleImpl(textAnnotation,
Properties.ENHANCER_SELECTED_TEXT,
new PlainLiteralImpl(occurrence.getSelectedText(),languageObject)));
metadata.add(new TripleImpl(textAnnotation,
Properties.ENHANCER_CONFIDENCE,
literalFactory.createTypedLiteral(linkedEntity.getScore())));
//the dc:type values mapped from the types of the linked entity
for(UriRef dcType : linkedEntity.getTypes()){
metadata.add(new TripleImpl(
textAnnotation, Properties.DC_TYPE, dcType));
}
}
//now the EntityAnnotations for the Suggestions
for(Suggestion suggestion : linkedEntity.getSuggestions()){
UriRef entityAnnotation = EnhancementEngineHelper.createEntityEnhancement(ci, this);
//should we use the label used for the match, or search the
//representation for the best label ... currently its the matched one
Text label = suggestion.getBestLabel(linkerConfig.getNameField(),language);
metadata.add(new TripleImpl(entityAnnotation,
Properties.ENHANCER_ENTITY_LABEL,
label.getLanguage() == null ?
new PlainLiteralImpl(label.getText()) :
new PlainLiteralImpl(label.getText(),
new Language(label.getLanguage()))));
metadata.add(new TripleImpl(entityAnnotation,
Properties.ENHANCER_ENTITY_REFERENCE,
new UriRef(suggestion.getRepresentation().getId())));
//write all types of the suggested entity
Iterator<Reference> suggestionTypes = suggestion.getRepresentation().getReferences(linkerConfig.getTypeField());
while(suggestionTypes.hasNext()){
metadata.add(new TripleImpl(entityAnnotation,
Properties.ENHANCER_ENTITY_TYPE, new UriRef(suggestionTypes.next().getReference())));
}
metadata.add(new TripleImpl(entityAnnotation,
Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(suggestion.getScore())));
//link the suggestion to all TextAnnotations of this entity
for(UriRef textAnnotation : textAnnotations){
metadata.add(new TripleImpl(entityAnnotation,
Properties.DC_RELATION, textAnnotation));
}
//add the name of the ReferencedSite providing this suggestion
metadata.add(new TripleImpl(entityAnnotation,
new UriRef(RdfResourceEnum.site.getUri()),
new PlainLiteralImpl(referencedSiteName)));
//in case dereferencing of Entities is enabled we need also to
//add the RDF data for entities
if(dereferenceEntitiesState){
metadata.addAll(
RdfValueFactory.getInstance().toRdfRepresentation(
suggestion.getRepresentation()).getRdfGraph());
}
}
}
}
/**
 * Extracts the language of the parsed ContentItem by using
 * {@link EnhancementEngineHelper#getLanguage(ContentItem)} and "en" as
 * default.
 * @param ci the content item
 * @return the language
 */
private String extractLanguage(ContentItem ci) {
String lang = EnhancementEngineHelper.getLanguage(ci);
if(lang != null){
return lang;
}
//NOTE: SLF4J uses '{}' placeholders; the former '%s' placeholders were
//never substituted in the logged message
log.warn("Unable to extract language for ContentItem {}! The Enhancement of the {} is missing the {} property",
new Object[]{ci.getUri().getUnicodeString(),LANG_ID_ENGINE_NAME.getLexicalForm(),Properties.DC_LANGUAGE});
log.warn(" ... return 'en' as default");
return "en";
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
* Methods for activate() and deactivate() the properties configurable via
* OSGI.
*
* NOTEs:
* Directly calling super.activate and super.deactivate
* is possible but might not be applicable in all cases.
* The activate**(...) and deactivate**() Methods are intended to be
* called by subclasses that need more control over the initialisation
* process.
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/**
 * Activates this Engine. Subclasses should not call this method but rather
 * call<ul>
 * <li> {@link #activateEntitySearcher(ComponentContext, Dictionary)}
 * <li> {@link #activateEntityLinkerConfig(Dictionary)} and
 * <li> {@link #activateTextAnalyzerConfig(Dictionary)}
 * <li> {@link #activateEntityDereference(Dictionary)} (needs to be called after
 * {@link #activateEntityLinkerConfig(Dictionary)})
 * </ul>
 * if applicable.
 * @param context the Component context
 * @throws ConfigurationException if the required {@link #REFERENCED_SITE_ID}
 * configuration is missing or any of the other properties has an illegal value
 */
@Activate
@SuppressWarnings("unchecked")
protected void activate(ComponentContext context) throws ConfigurationException {
super.activate(context);
Dictionary<String,Object> properties = context.getProperties();
activateTextAnalyzerConfig(properties);
activateEntitySearcher(context, properties);
//NOTE: activateEntityDereference(..) adds fields to the linkerConfig and
//therefore MUST BE called after activateEntityLinkerConfig(..)
activateEntityLinkerConfig(properties);
activateEntityDereference(properties);
}
/**
 * Inits the {@link #dereferenceEntitiesState} based on the
 * {@link #DEREFERENCE_ENTITIES} configuration. If dereferencing is enabled
 * the {@link #DEREFERENCE_FIELDS} are added to the selected fields of the
 * {@link #linkerConfig}, so this MUST be called after the linker
 * configuration was initialised.
 * @param properties the configuration
 */
protected final void activateEntityDereference(Dictionary<String,Object> properties) {
Object value = properties.get(DEREFERENCE_ENTITIES);
if(value instanceof Boolean){ //typed configuration value
dereferenceEntitiesState = ((Boolean)value).booleanValue();
} else if(value == null || value.toString().isEmpty()){
//missing or empty configuration -> use the default
dereferenceEntitiesState = DEFAULT_DEREFERENCE_ENTITIES_STATE;
} else { //parse the string representation
dereferenceEntitiesState = Boolean.parseBoolean(value.toString());
}
if(dereferenceEntitiesState){
//select the additional fields required for dereferencing
linkerConfig.getSelectedFields().addAll(DEREFERENCE_FIELDS);
}
}
/**
 * Initialise the {@link TextAnalyzer} component.<p>
 * Currently this parses the following configurations: <ul>
 * <li>{@link #PROCESSED_LANGUAGES}: If no configuration is present the
 * default (process all languages) is used.
 * <li>{@link #MIN_POS_TAG_PROBABILITY}: If no configuration is
 * present the {@link #DEFAULT_MIN_POS_TAG_PROBABILITY} is used.
 * <li>{@link #KEYWORD_TOKENIZER}: optionally forces the keyword tokenizer.
 * </ul>
 * @param configuration the OSGI component configuration
 * @throws ConfigurationException if the min POS tag probability can not be
 * parsed or is &gt; 1
 */
protected final void activateTextAnalyzerConfig(Dictionary<String,Object> configuration) throws ConfigurationException {
nlpConfig = new TextAnalyzerConfig();
//(1) the set of processed languages
Object value = configuration.get(PROCESSED_LANGUAGES);
if(value == null){
this.languages = DEFAULT_LANGUAGES;
} else if (value.toString().trim().isEmpty()){
this.languages = Collections.emptySet();
} else { //parse the comma separated language list
Set<String> parsedLanguages = new HashSet<String>();
for(String language : value.toString().split(",")){
if(language != null){
language = language.trim();
if(!language.isEmpty()){
parsedLanguages.add(language);
}
}
}
languages = parsedLanguages;
}
//(2) the minimum POS tag probability
value = configuration.get(MIN_POS_TAG_PROBABILITY);
double minPosTagProb;
if(value instanceof Number){ //typed configuration value
minPosTagProb = ((Number)value).doubleValue();
} else if(value == null || value.toString().isEmpty()){
minPosTagProb = DEFAULT_MIN_POS_TAG_PROBABILITY;
} else { //parse the string representation
try {
minPosTagProb = Double.valueOf(value.toString());
} catch (NumberFormatException e) {
throw new ConfigurationException(MIN_POS_TAG_PROBABILITY,
"Unable to parse the min POS tag probability from the parsed value "+value,e);
}
}
if(minPosTagProb > 1){ //negative values deactivate the feature
throw new ConfigurationException(MIN_POS_TAG_PROBABILITY,
"The configured min POS tag probability MUST BE in the range [0..1] " +
"or < 0 to deactivate this feature (parsed value "+value+")!");
}
nlpConfig.setMinPosTagProbability(minPosTagProb);
//(3) the keyword tokenizer config (missing/empty values keep the default)
value = configuration.get(KEYWORD_TOKENIZER);
if(value instanceof Boolean){
nlpConfig.forceKeywordTokenizer((Boolean)value);
} else if(value != null && !value.toString().isEmpty()){
nlpConfig.forceKeywordTokenizer(Boolean.valueOf(value.toString()));
}
analysedContentFactory = OpenNlpAnalysedContentFactory.getInstance(openNLP,nlpConfig);
}
/**
* Configures the parsed {@link EntityLinkerConfig} with the values of the
* following properties:<ul>
* <li>{@link #NAME_FIELD}
* <li>{@link #TYPE_FIELD}
* <li>{@link #REDIRECT_FIELD}
* <li>{@link #REDIRECT_PROCESSING_MODE}
* <li>{@link #MAX_SUGGESTIONS}
* <li>{@link #MIN_SEARCH_TOKEN_LENGTH}
* <li>{@link #MIN_FOUND_TOKENS}
* <li> {@link #MIN_TOKEN_MATCH_FACTOR}
* </ul>
* This Method create an new {@link EntityLinkerConfig} instance only if
* <code>{@link #linkerConfig} == null</code>. If the instance is already initialised
* that all current values for keys missing in the parsed configuration are
* preserved.
* @param configuration the configuration
* @throws ConfigurationException In case of an illegal value in the parsed configuration.
* Note that all configuration are assumed as optional, therefore missing values will not
* case a ConfigurationException.
*/
protected void activateEntityLinkerConfig(Dictionary<String,Object> configuration) throws ConfigurationException {
if(linkerConfig == null){
this.linkerConfig = new EntityLinkerConfig();
}
Object value;
value = configuration.get(NAME_FIELD);
if(value != null){
if(value.toString().isEmpty()){
throw new ConfigurationException(NAME_FIELD,"The configured name field MUST NOT be empty");
}
linkerConfig.setNameField(value.toString());
}
//init case sensitivity
value = configuration.get(CASE_SENSITIVE);
if(value instanceof Boolean){
linkerConfig.setCaseSensitiveMatchingState((Boolean)value);
} else if(value != null && !value.toString().isEmpty()){
linkerConfig.setCaseSensitiveMatchingState(Boolean.valueOf(value.toString()));
} //if NULL or empty use default
//init TYPE_FIELD
value = configuration.get(TYPE_FIELD);
if(value != null){
if(value.toString().isEmpty()){
throw new ConfigurationException(TYPE_FIELD,"The configured name field MUST NOT be empty");
}
linkerConfig.setTypeField(value.toString());
}
//init REDIRECT_FIELD
value = configuration.get(REDIRECT_FIELD);
if(value != null){
if(value.toString().isEmpty()){
throw new ConfigurationException(NAME_FIELD,"The configured name field MUST NOT be empty");
}
linkerConfig.setRedirectField(value.toString());
}
//init MAX_SUGGESTIONS
value = configuration.get(MAX_SUGGESTIONS);
Integer maxSuggestions;
if(value instanceof Integer){
maxSuggestions = (Integer)value;
} else if (value != null){
try {
maxSuggestions = Integer.valueOf(value.toString());
} catch(NumberFormatException e){
throw new ConfigurationException(MAX_SUGGESTIONS, "Values MUST be valid Integer values > 0",e);
}
} else {
maxSuggestions = null;
}
if(maxSuggestions != null){
if(maxSuggestions < 1){
throw new ConfigurationException(MAX_SUGGESTIONS, "Values MUST be valid Integer values > 0");
}
linkerConfig.setMaxSuggestions(maxSuggestions);
}
//init MIN_FOUND_TOKENS
value = configuration.get(MIN_FOUND_TOKENS);
Integer minFoundTokens;
if(value instanceof Integer){
minFoundTokens = (Integer)value;
} else if(value != null){
try {
minFoundTokens = Integer.valueOf(value.toString());
} catch(NumberFormatException e){
throw new ConfigurationException(MIN_FOUND_TOKENS, "Values MUST be valid Integer values > 0",e);
}
} else {
minFoundTokens = null;
}
if(minFoundTokens != null){
if(minFoundTokens < 1){
throw new ConfigurationException(MIN_FOUND_TOKENS, "Values MUST be valid Integer values > 0");
}
linkerConfig.setMinFoundTokens(minFoundTokens);
}
// init MIN_SEARCH_TOKEN_LENGTH
value = configuration.get(MIN_SEARCH_TOKEN_LENGTH);
Integer minSearchTokenLength;
if(value instanceof Integer){
minSearchTokenLength = (Integer)value;
} else if (value != null){
try {
minSearchTokenLength = Integer.valueOf(value.toString());
} catch(NumberFormatException e){
throw new ConfigurationException(MIN_SEARCH_TOKEN_LENGTH, "Values MUST be valid Integer values > 0",e);
}
} else {
minSearchTokenLength = null;
}
if(minSearchTokenLength != null){
if(minSearchTokenLength < 1){
throw new ConfigurationException(MIN_SEARCH_TOKEN_LENGTH, "Values MUST be valid Integer values > 0");
}
linkerConfig.setMinSearchTokenLength(minSearchTokenLength);
}
//init the REDIRECT_PROCESSING_MODE
value = configuration.get(REDIRECT_PROCESSING_MODE);
if(value != null){
try {
linkerConfig.setRedirectProcessingMode(RedirectProcessingMode.valueOf(value.toString()));
} catch (IllegalArgumentException e) {
throw new ConfigurationException(REDIRECT_PROCESSING_MODE, "Values MUST be one of "+
Arrays.toString(RedirectProcessingMode.values()));
}
}
//init the DEFAULT_LANGUAGE
value = configuration.get(DEFAULT_MATCHING_LANGUAGE);
if(value != null){
String defaultLang = value.toString().trim();
if(defaultLang.isEmpty()){
linkerConfig.setDefaultLanguage(null);
} else if(defaultLang.length() == 1){
throw new ConfigurationException(DEFAULT_MATCHING_LANGUAGE, "Illegal language code '"+
defaultLang+"'! Language Codes MUST BE at least 2 chars long.");
} else {
linkerConfig.setDefaultLanguage(defaultLang);
}
}
// init MIN_TOKEN_MATCH_FACTOR
value=configuration.get(MIN_TOKEN_MATCH_FACTOR);
float minTokenMatchFactor;
if(value instanceof Number){
minTokenMatchFactor = ((Number)value).floatValue();
} else if(value != null){
try {
minTokenMatchFactor = Float.valueOf(value.toString());
} catch (NumberFormatException e) {
throw new ConfigurationException(MIN_TOKEN_MATCH_FACTOR,
"Unable to parse the minimum token match factor from the parsed value "+value,e);
}
if(minTokenMatchFactor < 0){
minTokenMatchFactor = EntityLinkerConfig.DEFAULT_MIN_TOKEN_MATCH_FACTOR;
}
} else {
minTokenMatchFactor = EntityLinkerConfig.DEFAULT_MIN_TOKEN_MATCH_FACTOR;
}
if(minTokenMatchFactor == 0 || minTokenMatchFactor > 1){
throw new ConfigurationException(MIN_TOKEN_MATCH_FACTOR,
"The minimum token match factor MUST be > 0 and <= 1 (negative values for the default)");
}
linkerConfig.setMinTokenMatchFactor(minTokenMatchFactor);
//init type mappings
value = configuration.get(TYPE_MAPPINGS);
if(value instanceof String[]){ //support array
value = Arrays.asList((String[])value);
} else if(value instanceof String) { //single value
value = Collections.singleton(value);
}
if(value instanceof Collection<?>){ //and collection
log.info("Init Type Mappings");
configs :
for(Object o : (Iterable<?>)value){
if(o != null){
StringBuilder usage = new StringBuilder("useages: ");
usage.append("a: '{uri}' short for {uri} > {uri} | ");
usage.append("b: '{source1};{source2};..;{sourceN} > {target}'");
String[] config = o.toString().split(">");
if(config[0].isEmpty()){
log.warn("Invalid Type Mapping Config '{}': Missing Source Type ({}) -> ignore this config",
o,usage);
continue configs;
}
String[] sourceTypes = config[0].split(";");
if(sourceTypes.length > 1 && (config.length < 2 || config[1].isEmpty())){
log.warn("Invalid Type Mapping Config '{}': Missing Target Type '{}' ({}) -> ignore this config",
o,usage);
continue configs;
}
String targetType = config.length < 2 ? sourceTypes[0] : config[1];
targetType = getFullName(targetType.trim()); //support for ns:localName
try { //validate
new URI(targetType);
} catch (URISyntaxException e) {
log.warn("Invalid URI '{}' in Type Mapping Config '{}' -> ignore this config",
sourceTypes[0],o);
continue configs;
}
UriRef targetUri = new UriRef(targetType);
for(String sourceType : sourceTypes){
if(!sourceType.isEmpty()){
sourceType = getFullName(sourceType.trim()); //support for ns:localName
try { //validate
new URI(sourceType);
UriRef old = linkerConfig.setTypeMapping(sourceType, targetUri);
if(old == null){
log.info(" > add type mapping {} > {}", sourceType,targetType);
} else {
log.info(" > set type mapping {} > {} (old: {})",
new Object[]{sourceType,targetType,old.getUnicodeString()});
}
} catch (URISyntaxException e) {
log.warn("Invalid URI '{}' in Type Mapping Config '{}' -> ignore this source type",
sourceTypes[0],o);
}
}
}
}
}
} else {
log.debug("No Type mappings configured");
}
}
/**
 * Initialise the {@link #entitySearcher} based on the value of the
 * {@link #REFERENCED_SITE_ID} property in the parsed configuration.
 * If the configured site id refers to the Entityhub itself an
 * {@link EntityhubSearcher} is used; otherwise a
 * {@link ReferencedSiteSearcher} tracking the referenced site is created.
 * @param context the component context (used to access the BundleContext
 * for the created searcher)
 * @param configuration the parsed configuration of this component
 * @throws ConfigurationException if the {@link #REFERENCED_SITE_ID}
 * property is missing or its value is an empty String
 */
protected void activateEntitySearcher(ComponentContext context, Dictionary<String,Object> configuration) throws ConfigurationException {
    Object value = configuration.get(REFERENCED_SITE_ID);
    //init the EntitySource
    if (value == null) {
        throw new ConfigurationException(REFERENCED_SITE_ID,
            "The ID of the Referenced Site is a required Parameter and MUST NOT be NULL!");
    }
    referencedSiteName = value.toString();
    if (referencedSiteName.isEmpty()) {
        throw new ConfigurationException(REFERENCED_SITE_ID,
            "The ID of the Referenced Site is a required Parameter and MUST NOT be an empty String!");
    }
    //TODO: make limit configurable!
    //NOTE: use Locale.ROOT for the case conversion so that the membership
    //check is locale independent (the default locale variant would e.g.
    //map 'I' to a dotless 'i' under a Turkish default locale and make the
    //lookup in ENTITYHUB_IDS fail)
    if(Entityhub.ENTITYHUB_IDS.contains(referencedSiteName.toLowerCase(java.util.Locale.ROOT))){
        entitySearcher = new EntityhubSearcher(context.getBundleContext(),DEFAULT_ENTITY_SEARCHER_LIMIT);
    } else {
        entitySearcher = new ReferencedSiteSearcher(context.getBundleContext(),referencedSiteName,DEFAULT_ENTITY_SEARCHER_LIMIT);
    }
}
/**
 * Deactivates this Engine. Subclasses should not call this method but rather
 * call<ul>
 * <li> {@link #deactivateEntitySearcher()}
 * <li> {@link #deactivateTextAnalyzerConfig()}
 * <li> {@link #deactivateEntityLinkerConfig()} and
 * <li> {@link #deactivateEntityDereference()}
 * </ul>
 * @param context the context (not used)
 */
@Deactivate
protected void deactivate(ComponentContext context) {
super.deactivate(context);
deactivateEntitySearcher();
deactivateTextAnalyzerConfig();
deactivateEntityLinkerConfig();
deactivateEntityDereference();
}
/**
 * Resets {@link #dereferenceEntitiesState} back to its default
 * {@link #DEFAULT_DEREFERENCE_ENTITIES_STATE}.
 */
protected final void deactivateEntityDereference() {
    this.dereferenceEntitiesState = DEFAULT_DEREFERENCE_ENTITIES_STATE;
}
/**
 * Releases the NLP analysis configuration and the analysed content
 * factory, and resets the set of languages to process back to
 * {@link #DEFAULT_LANGUAGES}.
 */
protected void deactivateTextAnalyzerConfig() {
    //the three resets are independent of each other
    languages = DEFAULT_LANGUAGES;
    this.analysedContentFactory = null;
    this.nlpConfig = null;
}
/**
 * Releases the {@link EntityLinkerConfig} by setting the
 * {@link #linkerConfig} field to <code>null</code>.
 */
protected void deactivateEntityLinkerConfig() {
    this.linkerConfig = null;
}
/**
 * Closes and resets the EntitySearcher. If the current searcher is a
 * {@link TrackingEntitySearcher} its {@link TrackingEntitySearcher#close()}
 * method is called before the references are cleared.
 */
protected void deactivateEntitySearcher() {
    if(entitySearcher instanceof TrackingEntitySearcher<?>){
        TrackingEntitySearcher<?> tracking = (TrackingEntitySearcher<?>)entitySearcher;
        //shut down the service tracker backing this searcher
        tracking.close();
    }
    this.entitySearcher = null;
    this.referencedSiteName = null;
}
}