blob: a94913674f1d6ed72382f306ba80755edbde77bf [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.entitycomention;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.CASE_SENSITIVE;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.DEFAULT_CASE_SENSITIVE_MATCHING_STATE;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.DEFAULT_DEREFERENCE_ENTITIES_STATE;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.DEFAULT_MATCHING_LANGUAGE;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.DEFAULT_MIN_TOKEN_SCORE;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.DEFAULT_SUGGESTIONS;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.DEREFERENCE_ENTITIES;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.DEREFERENCE_ENTITIES_FIELDS;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.MIN_TOKEN_SCORE;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.NAME_FIELD;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.REDIRECT_FIELD;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.REDIRECT_MODE;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.SUGGESTIONS;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.TYPE_FIELD;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.TYPE_MAPPINGS;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig.DEFAULT_MIN_SEARCH_TOKEN_LENGTH;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig.DEFAULT_PROCESS_ONLY_PROPER_NOUNS_STATE;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig.MIN_SEARCH_TOKEN_LENGTH;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig.PROCESSED_LANGUAGES;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig.PROCESS_ONLY_PROPER_NOUNS_STATE;
import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText;
import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage;
import static org.apache.stanbol.enhancer.servicesapi.EnhancementEngine.PROPERTY_NAME;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_CONTRIBUTOR;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_END;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE;
import static org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses.ENHANCER_TEXTANNOTATION;
import static org.osgi.framework.Constants.SERVICE_RANKING;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Dictionary;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.Map.Entry;
import org.apache.clerezza.rdf.core.Language;
import org.apache.clerezza.rdf.core.Literal;
import org.apache.clerezza.rdf.core.LiteralFactory;
import org.apache.clerezza.rdf.core.MGraph;
import org.apache.clerezza.rdf.core.NonLiteral;
import org.apache.clerezza.rdf.core.PlainLiteral;
import org.apache.clerezza.rdf.core.Resource;
import org.apache.clerezza.rdf.core.Triple;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
import org.apache.clerezza.rdf.core.impl.TripleImpl;
import org.apache.commons.lang.StringUtils;
import org.apache.felix.scr.annotations.Activate;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.ConfigurationPolicy;
import org.apache.felix.scr.annotations.Deactivate;
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.PropertyOption;
import org.apache.felix.scr.annotations.Reference;
import org.apache.felix.scr.annotations.Service;
import org.apache.stanbol.commons.namespaceprefix.NamespacePrefixService;
import org.apache.stanbol.enhancer.engines.entitycomention.impl.ContentItemMentionBuilder;
import org.apache.stanbol.enhancer.engines.entitycomention.impl.InMemoryEntityIndex;
import org.apache.stanbol.enhancer.engines.entitylinking.Entity;
import org.apache.stanbol.enhancer.engines.entitylinking.EntitySearcher;
import org.apache.stanbol.enhancer.engines.entitylinking.EntitySearcherException;
import org.apache.stanbol.enhancer.engines.entitylinking.LabelTokenizer;
import org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig;
import org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.RedirectProcessingMode;
import org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig;
import org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig;
import org.apache.stanbol.enhancer.engines.entitylinking.impl.EntityLinker;
import org.apache.stanbol.enhancer.engines.entitylinking.impl.LinkedEntity;
import org.apache.stanbol.enhancer.engines.entitylinking.impl.Suggestion;
import org.apache.stanbol.enhancer.engines.entitylinking.impl.LinkedEntity.Occurrence;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
import org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses;
import org.osgi.framework.BundleContext;
import org.osgi.service.cm.ConfigurationException;
import org.osgi.service.component.ComponentContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* The Entity Co-Mentiaon Engine builds a local knowledge base already extracted
* <code>fise:TextAnnotation</code>s and suggested
* <code>fise:EntityAnnotation</code>s. This information are then used to perform
* an entity linking process. By doing so this engine will be able to detect
* Co-Mentions of Entities within the processed document. <p>
*
*
*
* @author Rupert Westenthaler
*
*/
@Component(
configurationFactory = true,
policy = ConfigurationPolicy.REQUIRE, // the baseUri is required!
specVersion = "1.1",
metatype = true,
immediate = true,
inherit = true)
@org.apache.felix.scr.annotations.Properties(value={
@Property(name=PROPERTY_NAME),
@Property(name=CASE_SENSITIVE,boolValue=DEFAULT_CASE_SENSITIVE_MATCHING_STATE),
@Property(name=MIN_SEARCH_TOKEN_LENGTH, intValue=DEFAULT_MIN_SEARCH_TOKEN_LENGTH),
@Property(name=PROCESS_ONLY_PROPER_NOUNS_STATE, boolValue=DEFAULT_PROCESS_ONLY_PROPER_NOUNS_STATE),
@Property(name=PROCESSED_LANGUAGES,
cardinality=Integer.MAX_VALUE,
value={"*;lmmtip;uc=LINK;prop=0.75;pprob=0.75", // link multiple matchable tokens in chunks; link upper case words
"de;uc=MATCH", //in German all Nouns are upper case
"es;lc=Noun", //the OpenNLP POS tagger for Spanish does not support ProperNouns
"nl;lc=Noun"}), //same for Dutch
@Property(name=DEFAULT_MATCHING_LANGUAGE,value=""),
@Property(name=TYPE_MAPPINGS,cardinality=Integer.MAX_VALUE, value={
"dbp-ont:Organisation; dbp-ont:Newspaper; schema:Organization > dbp-ont:Organisation",
"dbp-ont:Person; foaf:Person; schema:Person > dbp-ont:Person",
"dbp-ont:Place; schema:Place > dbp-ont:Place",
"dbp-ont:Work; schema:CreativeWork > dbp-ont:Work",
"dbp-ont:Event; schema:Event > dbp-ont:Event",
"schema:Product > schema:Product",
"skos:Concept > skos:Concept"}),
@Property(name=SERVICE_RANKING,intValue=0)
})
@Service(value=EnhancementEngine.class)
public class EntityCoMentionEngine extends AbstractEnhancementEngine<RuntimeException,RuntimeException> implements ServiceProperties {
private static final Integer ENGINE_ORDERING = ServiceProperties.ORDERING_POST_PROCESSING + 90;
private static final Map<String,Object> SERVICE_PROPERTIES =
Collections.unmodifiableMap(Collections.singletonMap(
ServiceProperties.ENHANCEMENT_ENGINE_ORDERING,
(Object)ENGINE_ORDERING));
private final Logger log = LoggerFactory.getLogger(EntityCoMentionEngine.class);
private final LiteralFactory literalFactory = LiteralFactory.getInstance();
@Reference
protected NamespacePrefixService prefixService;
@Reference
protected LabelTokenizer labelTokenizer;
// private BundleContext bundleContext;
/**
* EntityLinking configuration used for Co-Mention extractions
*/
private EntityLinkerConfig linkerConfig;
/**
* TextProcessingConfig used for Co-Mention extraction
*/
private TextProcessingConfig textProcessingConfig;
/**
* Default constructor as used by OSGI. This expects that
* {@link #activate(ComponentContext)} is called before usage
*/
public EntityCoMentionEngine() {
}
@Activate
@SuppressWarnings("unchecked")
protected void activate(ComponentContext ctx) throws ConfigurationException {
super.activate(ctx);
log.info("activate {}[name:{}]",getClass().getSimpleName(),getName());
Dictionary<String,Object> properties = ctx.getProperties();
// bundleContext = ctx.getBundleContext();
//extract TextProcessing and EnityLinking config from the provided properties
textProcessingConfig = TextProcessingConfig.createInstance(properties);
linkerConfig = EntityLinkerConfig.createInstance(properties,prefixService);
//some of the confiugration is predefined
linkerConfig.setNameField(CoMentionConstants.CO_MENTION_LABEL_FIELD);
linkerConfig.setTypeField(CoMentionConstants.CO_MENTION_TYPE_FIELD);
linkerConfig.setMaxSuggestions(5); //there should not be more as 5 suggestions
linkerConfig.setMinFoundTokens(1); //a single token is enough
linkerConfig.setMinLabelScore(0.24); //1/4 of the tokens
linkerConfig.setMinMatchScore( //labelScore * token match factor
linkerConfig.getMinLabelScore()*linkerConfig.getMinTokenMatchFactor());
linkerConfig.setRedirectProcessingMode(RedirectProcessingMode.IGNORE);
//get the metadata later set to the enhancement engine
}
/**
* Deactivates this components.
*/
@Deactivate
protected void deactivate(ComponentContext ctx) {
log.info("deactivate {}[name:{}]",getClass().getSimpleName(),getName());
textProcessingConfig = null;
linkerConfig = null;
super.deactivate(ctx);
}
@Override
public int canEnhance(ContentItem ci) throws EngineException {
String language = getLanguage(this, ci, false);
if(language == null || textProcessingConfig.getConfiguration(language) == null){
log.debug("Engine {} ignores ContentItem {} becuase language {} is not condigured.",
new Object[]{ getName(), ci.getUri(), language});
return CANNOT_ENHANCE;
}
//we need a detected language, the AnalyzedText contentPart with Tokens.
AnalysedText at = getAnalysedText(this, ci, false);
return at != null && at.getTokens().hasNext() ?
ENHANCE_ASYNC : CANNOT_ENHANCE;
}
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
AnalysedText at = getAnalysedText(this, ci, true);
String language = getLanguage(this, ci, true);
LanguageProcessingConfig languageConfig = textProcessingConfig.getConfiguration(language);
if(languageConfig == null){
throw new IllegalStateException("The language '"+language+"' is not configured "
+ "to be processed by this Engine. As this is already checked within the "
+ "canEnhance(..) method this may indicate an bug in the used "
+ "EnhanceemntJobManager implementation!");
}
if(log.isDebugEnabled()){
log.debug("compute co-mentions for ContentItem {} language {} text={}",
new Object []{ci.getUri().getUnicodeString(), language, StringUtils.abbreviate(at.getSpan(), 100)});
}
//create the in-memory database for the mentioned Entities
ContentItemMentionBuilder entityMentionIndex = new ContentItemMentionBuilder(ci,
labelTokenizer, language, linkerConfig.getDefaultLanguage());
EntityLinker entityLinker = new EntityLinker(at,language,
languageConfig, entityMentionIndex, linkerConfig, labelTokenizer,entityMentionIndex);
//process
try {
entityLinker.process();
} catch (EntitySearcherException e) {
log.error("Unable to link Entities with "+entityLinker,e);
throw new EngineException(this, ci, "Unable to link Entities with "+entityLinker, e);
}
//TODO: write results
ci.getLock().writeLock().lock();
try {
writeComentions(ci,entityLinker.getLinkedEntities().values(), language);
} finally {
ci.getLock().writeLock().unlock();
}
log.info("Found co-mentions:");
for(LinkedEntity linkedEntity : entityLinker.getLinkedEntities().values()){
log.info(" > {}",linkedEntity);
}
}
private void writeComentions(ContentItem ci,Collection<LinkedEntity> comentions, String language) {
Language languageObject = null;
if(language != null && !language.isEmpty()){
languageObject = new Language(language);
}
MGraph metadata = ci.getMetadata();
for(LinkedEntity comention : comentions){
//URIs of TextAnnotations for the initial mention of this co-mention
Collection<UriRef> initialMentions = new ArrayList<UriRef>(comention.getOccurrences().size());
for(Suggestion suggestion : comention.getSuggestions()){
Entity entity = suggestion.getEntity();
if(entity.getData().filter(entity.getUri(),RDF_TYPE,ENHANCER_TEXTANNOTATION).hasNext()){
//this is a textAnnotation
initialMentions.add(entity.getUri());
} //else TODO support also Entities!!
}
//first create the TextAnnotations for the co-mention
for(Occurrence occurrence : comention.getOccurrences()){
Literal startLiteral = literalFactory.createTypedLiteral(occurrence.getStart());
Literal endLiteral = literalFactory.createTypedLiteral(occurrence.getEnd());
//search for existing text annotation
boolean ignore = false;
//search for textAnnotations with the same end
UriRef textAnnotation = null;
Iterator<Triple> it = metadata.filter(null, ENHANCER_START, startLiteral);
while(it.hasNext()){
Triple t = it.next();
Integer end = EnhancementEngineHelper.get(metadata, t.getSubject(), ENHANCER_END, Integer.class, literalFactory);
if(end != null &&
metadata.filter(t.getSubject(), RDF_TYPE, ENHANCER_TEXTANNOTATION).hasNext()){
textAnnotation = (UriRef)t.getSubject();
if(end > occurrence.getEnd()){
// there is an other TextAnnotation selecting a bigger Span
//so we should ignore this Occurrence
ignore = true;
}
}
}
it = metadata.filter(null, ENHANCER_END, endLiteral);
while(it.hasNext()){
Triple t = it.next();
Integer start = EnhancementEngineHelper.get(metadata, t.getSubject(), ENHANCER_START, Integer.class, literalFactory);
if(start != null &&
metadata.filter(t.getSubject(), RDF_TYPE, ENHANCER_TEXTANNOTATION).hasNext()){
textAnnotation = (UriRef)t.getSubject();
if(start < occurrence.getStart()){
// there is an other TextAnnotation selecting a bigger Span
//so we should ignore this Occurrence
ignore = true;
}
}
}
if(!ignore){
//collect confidence values of co-mentions
Double maxConfidence = null;
if(textAnnotation == null){ //not found ... create a new TextAnnotation for the co-mention
textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
metadata.add(new TripleImpl(textAnnotation,
Properties.ENHANCER_START,
startLiteral));
metadata.add(new TripleImpl(textAnnotation,
Properties.ENHANCER_END,
endLiteral));
metadata.add(new TripleImpl(textAnnotation,
Properties.ENHANCER_SELECTION_CONTEXT,
new PlainLiteralImpl(occurrence.getContext(),languageObject)));
metadata.add(new TripleImpl(textAnnotation,
Properties.ENHANCER_SELECTED_TEXT,
new PlainLiteralImpl(occurrence.getSelectedText(),languageObject)));
} else { //if existing add this engine as contributor
metadata.add(new TripleImpl(textAnnotation, DC_CONTRIBUTOR,
new PlainLiteralImpl(this.getClass().getName())));
//consider the confidence value of the existing TextAnnotation
maxConfidence = EnhancementEngineHelper.get(metadata, textAnnotation,
ENHANCER_CONFIDENCE, Double.class, literalFactory);
}
//now process initial mention(s) for the co-mention
for(UriRef initialMention : initialMentions){
//check confidence of the initial one
Double confidnece = EnhancementEngineHelper.get(metadata, initialMention,
ENHANCER_CONFIDENCE, Double.class, literalFactory);
if(confidnece != null){
if(maxConfidence == null){
maxConfidence = confidnece;
} else if(maxConfidence.compareTo(confidnece) <= 0){
maxConfidence = confidnece;
}
}
//add suggestions of the initial mention
Set<Resource> values = new HashSet<Resource>();
for(Iterator<Triple> suggestions = metadata.filter(initialMention, DC_TYPE, null); suggestions.hasNext();){
values.add(suggestions.next().getObject());
}
for(Resource dcType : values){
metadata.add(new TripleImpl(textAnnotation, DC_TYPE, dcType));
}
values.clear();
//add the suggestions of the initial mention to this one
for(Iterator<Triple> suggestions = metadata.filter(null, DC_RELATION, initialMention); suggestions.hasNext();){
values.add(suggestions.next().getSubject());
}
for(Resource suggestion : values){
metadata.add(new TripleImpl((NonLiteral)suggestion, DC_RELATION, textAnnotation));
}
//finally link the co-mentation with the initial one
metadata.add(new TripleImpl(textAnnotation, DC_RELATION, initialMention));
//metadata.add(new TripleImpl(initialMention, DC_RELATION, textAnnotation));
}
//TODO: support also Entities
if(maxConfidence != null){ //set the confidence value (if known)
EnhancementEngineHelper.set(metadata, textAnnotation, ENHANCER_CONFIDENCE, maxConfidence, literalFactory);
}
} //else ignore this occurence
}
}
}
@Override
public Map<String,Object> getServiceProperties() {
return SERVICE_PROPERTIES;
}
}