blob: 2db779f0a6a6c20322b77363d743336272c5f57d [file] [log] [blame]
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.stanbol.enhancer.engines.entitytagging.impl;
import static org.apache.commons.lang.StringUtils.getLevenshteinDistance;
import static org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses.DBPEDIA_ORGANISATION;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Dictionary;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.clerezza.rdf.core.LiteralFactory;
import org.apache.clerezza.rdf.core.MGraph;
import org.apache.clerezza.rdf.core.NonLiteral;
import org.apache.clerezza.rdf.core.Triple;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.commons.lang.StringUtils;
import org.apache.felix.scr.annotations.Activate;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.ConfigurationPolicy;
import org.apache.felix.scr.annotations.Deactivate;
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.Reference;
import org.apache.felix.scr.annotations.ReferenceCardinality;
import org.apache.felix.scr.annotations.ReferencePolicy;
import org.apache.felix.scr.annotations.ReferenceStrategy;
import org.apache.felix.scr.annotations.Service;
import org.apache.stanbol.commons.namespaceprefix.NamespaceMappingUtils;
import org.apache.stanbol.commons.namespaceprefix.NamespacePrefixService;
import org.apache.stanbol.commons.stanboltools.offline.OfflineMode;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses;
import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
import org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses;
import org.apache.stanbol.entityhub.model.clerezza.RdfValueFactory;
import org.apache.stanbol.entityhub.servicesapi.Entityhub;
import org.apache.stanbol.entityhub.servicesapi.EntityhubException;
import org.apache.stanbol.entityhub.servicesapi.model.Entity;
import org.apache.stanbol.entityhub.servicesapi.model.Representation;
import org.apache.stanbol.entityhub.servicesapi.model.Text;
import org.apache.stanbol.entityhub.servicesapi.model.rdf.RdfResourceEnum;
import org.apache.stanbol.entityhub.servicesapi.query.Constraint;
import org.apache.stanbol.entityhub.servicesapi.query.FieldQuery;
import org.apache.stanbol.entityhub.servicesapi.query.FieldQueryFactory;
import org.apache.stanbol.entityhub.servicesapi.query.QueryResultList;
import org.apache.stanbol.entityhub.servicesapi.query.ReferenceConstraint;
import org.apache.stanbol.entityhub.servicesapi.query.TextConstraint;
import org.osgi.framework.Constants;
import org.osgi.service.component.ComponentContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Engine that uses a {@link Site} to search for entities for existing TextAnnotations of a Content Item.
 *
 * @author ogrisel, rwesten
 */
@Component(configurationFactory = true,
policy = ConfigurationPolicy.REQUIRE, // the baseUri is required!
specVersion = "1.1", metatype = true, immediate = true, inherit = true)
@org.apache.felix.scr.annotations.Properties(value = {@Property(name = EnhancementEngine.PROPERTY_NAME)})
public class NamedEntityTaggingEngine extends AbstractEnhancementEngine<RuntimeException,RuntimeException>
implements EnhancementEngine, ServiceProperties {
private final Logger log = LoggerFactory.getLogger(getClass());
// Id of the site used for entity lookups. Mandatory: activate() rejects a missing or empty value.
// Ids contained in Entityhub.ENTITYHUB_IDS select the Entityhub instead of a referenced site.
@Property // (value = "dbpedia")
public static final String REFERENCED_SITE_ID = "org.apache.stanbol.enhancer.engines.entitytagging.referencedSiteId";
// Whether annotations of persons are processed.
// NOTE(review): declared with boolValue = false, but activate() falls back to true when the
// property is missing - confirm which default is intended (same for ORG_STATE / PLACE_STATE).
@Property(boolValue = false)
public static final String PERSON_STATE = "org.apache.stanbol.enhancer.engines.entitytagging.personState";
// Optional rdf:type used to restrict person queries; a missing or empty value means no type constraint.
@Property // (value = "dbp-ont:Person")
public static final String PERSON_TYPE = "org.apache.stanbol.enhancer.engines.entitytagging.personType";
// Whether annotations of organisations are processed.
@Property(boolValue = false)
public static final String ORG_STATE = "org.apache.stanbol.enhancer.engines.entitytagging.organisationState";
// Optional rdf:type used to restrict organisation queries; a missing or empty value means no type constraint.
@Property // (value = "dbp-ont:Organisation")
public static final String ORG_TYPE = "org.apache.stanbol.enhancer.engines.entitytagging.organisationType";
// Whether annotations of places are processed.
@Property(boolValue = false)
public static final String PLACE_STATE = "org.apache.stanbol.enhancer.engines.entitytagging.placeState";
// Optional rdf:type used to restrict place queries; a missing or empty value means no type constraint.
@Property // (value = "dbp-ont:Place")
public static final String PLACE_TYPE = "org.apache.stanbol.enhancer.engines.entitytagging.placeType";
/**
 * The field used to match entity labels. Uses the RDFS label as default.
 */
// Configuration key of the label field; activate() reads it and the value is used as the
// query field and as the field the labels of results are read from.
@Property(value = "rdfs:label")
public static final String NAME_FIELD = "org.apache.stanbol.enhancer.engines.entitytagging.nameField";
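/**
 * Whether the representations of suggested entities are collected while writing the
 * enhancement results. Enabled by default.
 */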
// Configuration key for the dereference flag; see the use of dereferenceEntities in computeEnhancements().
@Property(boolValue = true)
public static final String DEREFERENCE_ENTITIES = "org.apache.stanbol.enhancer.engines.entitytagging.dereference";
// Service ranking property of this component; the key is the constant defined by the OSGi framework.
@Property(intValue = 0)
public static final String SERVICE_RANKING = Constants.SERVICE_RANKING;
/**
 * The default language for labels included in the enhancement metadata (used if no language is
 * available for the parsed content).
 */
private static final String DEFAULT_LANGUAGE = "en";
/**
 * Service of the Entityhub that manages all the active referenced Sites. This service is used to look up
 * the configured referenced Site when a content item needs to be enhanced.
 */
protected SiteManager siteManager;
/**
 * Used to look up Entities if the {@link #REFERENCED_SITE_ID} property is set to "entityhub" or "local".
 */
protected Entityhub entityhub;
// Passed to NamespaceMappingUtils.getConfiguredUri(..) in activate() when resolving configured values.
protected NamespacePrefixService nsPrefixService;
/**
 * This holds the id of the {@link Site} used to look up Entities or <code>null</code> if the
 * {@link Entityhub} is used.
 */
protected String referencedSiteID;
/**
 * The default value for the execution order of this Engine. Currently set to
 * {@code ORDERING_EXTRACTION_ENHANCEMENT}.
 */
public static final Integer defaultOrder = ORDERING_EXTRACTION_ENHANCEMENT;
/**
 * State if text annotations of type {@link OntologicalClasses#DBPEDIA_PERSON} are enhanced by this engine.
 */
protected boolean personState;
/**
 * State if text annotations of type {@link OntologicalClasses#DBPEDIA_ORGANISATION} are enhanced by this
 * engine.
 */
protected boolean orgState;
/**
 * State if text annotations of type {@link OntologicalClasses#DBPEDIA_PLACE} are enhanced by this engine.
 */
protected boolean placeState;
/**
 * The rdf:type constraint used to search for persons or <code>null</code> if no type constraint should be
 * used.
 */
protected String personType;
/**
 * The rdf:type constraint used to search for organisations or <code>null</code> if no type constraint
 * should be used.
 */
protected String orgType;
/**
 * The rdf:type constraint used to search for places or <code>null</code> if no type constraint should be
 * used.
 */
protected String placeType;
/**
 * The field used to search for the selected text of the TextAnnotation.
 */
protected String nameField;
/**
 * The maximum number of Suggestions to be added for a named entity.
 */
protected Integer numSuggestions = 3;
// When true, computeEnhancements() collects the Representation of every suggested entity.
// Assigned from the DEREFERENCE_ENTITIES configuration in activate().
protected boolean dereferenceEntities = true;
/**
 * The {@link OfflineMode} is used by Stanbol to indicate that no external service should be referenced.
 * For this engine that means it is necessary to check if the used {@link Site} can operate offline or
 * not.
 *
 * @see #enableOfflineMode(OfflineMode)
 * @see #disableOfflineMode(OfflineMode)
 */
@Reference(cardinality = ReferenceCardinality.OPTIONAL_UNARY, policy = ReferencePolicy.DYNAMIC, bind = "enableOfflineMode", unbind = "disableOfflineMode", strategy = ReferenceStrategy.EVENT)
private OfflineMode offlineMode;
/**
 * Bind method of the {@link #offlineMode} reference; called when the service becomes available.
 *
 * @param mode
 *            the offline mode service to bind
 */
// Registered as the bind callback in the @Reference declared on the offlineMode field.
// Storing the service is what makes isOfflineMode() report true.
protected final void enableOfflineMode(OfflineMode mode) {
this.offlineMode = mode;
/**
 * Unbind method of the {@link #offlineMode} reference; called when the service becomes unavailable.
 *
 * @param mode
 *            the offline mode service that is unbound
 */
// Registered as the unbind callback in the @Reference declared on the offlineMode field.
// The field is cleared unconditionally; the passed instance is not compared with the stored one.
protected final void disableOfflineMode(OfflineMode mode) {
this.offlineMode = null;
/**
 * Returns <code>true</code> only if Stanbol operates in {@link OfflineMode}.
 *
 * @return the offline state
 */
// Offline mode is signalled purely by the presence of a bound OfflineMode service.
protected final boolean isOfflineMode() {
return offlineMode != null;
// Reads the component configuration and initialises the engine state: the referenced site id,
// the per-type enable flags, the optional rdf:type constraints, the label field and the
// dereference flag. Throws ConfigurationException if the referenced site id is missing or empty.
protected void activate(ComponentContext context) throws ConfigurationException {
Dictionary<String,Object> config = context.getProperties();
// the referenced site id is mandatory: reject both a missing and an empty value
Object referencedSiteID = config.get(REFERENCED_SITE_ID);
if (referencedSiteID == null) {
throw new ConfigurationException(REFERENCED_SITE_ID,
"The ID of the Referenced Site is a required Parameter and MUST NOT be NULL!");
this.referencedSiteID = referencedSiteID.toString();
if (this.referencedSiteID.isEmpty()) {
throw new ConfigurationException(REFERENCED_SITE_ID,
"The ID of the Referenced Site is a required Parameter and MUST NOT be an empty String!");
// ids listed in Entityhub.ENTITYHUB_IDS (compared in lower case) select the Entityhub itself;
// this is encoded by resetting the field to null
if (Entityhub.ENTITYHUB_IDS.contains(this.referencedSiteID.toLowerCase())) {
log.debug("Init NamedEntityTaggingEngine instance for the Entityhub");
this.referencedSiteID = null;
// per-type enable flags: a missing property enables the type
// NOTE(review): this fallback (true) differs from the boolValue = false declared on the
// corresponding @Property constants - confirm which default is intended
Object state = config.get(PERSON_STATE);
personState = state == null ? true : Boolean.parseBoolean(state.toString());
state = config.get(ORG_STATE);
orgState = state == null ? true : Boolean.parseBoolean(state.toString());
state = config.get(PLACE_STATE);
placeState = state == null ? true : Boolean.parseBoolean(state.toString());
// optional rdf:type constraints: a missing or empty value results in null (no type constraint),
// otherwise the configured value is handed to NamespaceMappingUtils.getConfiguredUri(..)
Object type = config.get(PERSON_TYPE);
personType = type == null || type.toString().isEmpty() ? null :
NamespaceMappingUtils.getConfiguredUri(nsPrefixService,PERSON_TYPE, type.toString());
type = config.get(ORG_TYPE);
orgType = type == null || type.toString().isEmpty() ? null :
type = config.get(PLACE_TYPE);
placeType = type == null || type.toString().isEmpty() ? null :
Object nameField = config.get(NAME_FIELD);
this.nameField = nameField == null || nameField.toString().isEmpty() ?
"" :
// NOTE(review): the null check below tests 'state' (last assigned from PLACE_STATE) instead of
// 'dereferenceEntities'. This looks like a copy/paste slip: with PLACE_STATE unset the configured
// dereference value is ignored, and with PLACE_STATE set but DEREFERENCE_ENTITIES unset the
// local 'dereferenceEntities' would be null when it is used - confirm and fix.
Object dereferenceEntities = config.get(DEREFERENCE_ENTITIES);
this.dereferenceEntities = state == null ? true : Boolean
// Clears the String-valued configuration state. The boolean flags, numSuggestions and
// dereferenceEntities are not reset here.
protected void deactivate(ComponentContext context) {
referencedSiteID = null;
personType = null;
orgType = null;
placeType = null;
nameField = null;
// Works in three phases:
// 1) collect the named entities described by the TextAnnotations in the metadata of the content item,
// 2) look up suggestions for every collected named entity (see computeEntityRecommentations),
// 3) write the suggestions back to the metadata as entity annotations.
// An EntityhubException raised during the lookup is rethrown wrapped in an EngineException.
public void computeEnhancements(ContentItem ci) throws EngineException {
final Site site;
if (referencedSiteID != null) { // lookup the referenced site
site = siteManager.getSite(referencedSiteID);
// ensure that it is present
if (site == null) {
String msg = String.format(
"Unable to enhance %s because Referenced Site %s is currently not active!", ci.getUri()
.getUnicodeString(), referencedSiteID);
// TODO: throwing Exceptions is currently deactivated. We need a clearer
// policy on what to do in such situations
// throw new EngineException(msg);
// and that it supports offline mode if required
if (isOfflineMode() && !site.supportsLocalMode()) {
"Unable to enhance ci {} because OfflineMode is not supported by ReferencedSite {}.", ci
.getUri().getUnicodeString(), site.getId());
} else { // null indicates to use the Entityhub to lookup Entities
site = null;
MGraph graph = ci.getMetadata();
LiteralFactory literalFactory = LiteralFactory.getInstance();
// Retrieve the existing text annotations (requires read lock)
// key: the named entity, value: the text annotations related to it via dc:relation
Map<NamedEntity,List<UriRef>> textAnnotations = new HashMap<NamedEntity,List<UriRef>>();
// the language extracted for the parsed content or NULL if not
// available
String contentLangauge;
try {
contentLangauge = EnhancementEngineHelper.getLanguage(ci);
for (Iterator<Triple> it = graph.filter(null, RDF_TYPE, TechnicalClasses.ENHANCER_TEXTANNOTATION); it
.hasNext();) {
UriRef uri = (UriRef);
if (graph.filter(uri, Properties.DC_RELATION, null).hasNext()) {
// this is not the most specific occurrence of this name:
// skip
NamedEntity namedEntity = NamedEntity.createFromTextAnnotation(graph, uri);
if (namedEntity != null) {
// This is a first occurrence, collect any subsumed
// annotations
List<UriRef> subsumed = new ArrayList<UriRef>();
for (Iterator<Triple> it2 = graph.filter(null, Properties.DC_RELATION, uri); it2
.hasNext();) {
textAnnotations.put(namedEntity, subsumed);
} finally {
// search the suggestions
Map<NamedEntity,List<Suggestion>> suggestions = new HashMap<NamedEntity,List<Suggestion>>(
for (Entry<NamedEntity,List<UriRef>> entry : textAnnotations.entrySet()) {
try {
List<Suggestion> entitySuggestions = computeEntityRecommentations(site, entry.getKey(),
entry.getValue(), contentLangauge);
// only named entities with at least one suggestion are written in the next phase
if (entitySuggestions != null && !entitySuggestions.isEmpty()) {
suggestions.put(entry.getKey(), entitySuggestions);
} catch (EntityhubException e) {
// lookup failures abort the enhancement of this content item
throw new EngineException(this, ci, e);
// now write the results (requires write lock)
try {
RdfValueFactory factory = RdfValueFactory.getInstance();
// representations of the suggested entities, keyed by entity id (filled only if dereferenceEntities)
Map<String,Representation> entityData = new HashMap<String,Representation>();
for (Entry<NamedEntity,List<Suggestion>> entitySuggestions : suggestions.entrySet()) {
List<UriRef> subsumed = textAnnotations.get(entitySuggestions.getKey());
List<NonLiteral> annotationsToRelate = new ArrayList<NonLiteral>(subsumed);
for (Suggestion suggestion : entitySuggestions.getValue()) {
log.debug("Add Suggestion {} for {}", suggestion.getEntity().getId(),
EnhancementRDFUtils.writeEntityAnnotation(this, literalFactory, graph, ci.getUri(),
annotationsToRelate, suggestion, nameField,
// TODO: maybe we want labels in a different language than the
// language of the content (e.g. Accept-Language header)?!
contentLangauge == null ? DEFAULT_LANGUAGE : contentLangauge);
if (dereferenceEntities) {
entityData.put(suggestion.getEntity().getId(), suggestion.getEntity()
// if dereferenceEntities is true the entityData will also contain all
// Representations to add! If false entityData will be empty
for (Representation rep : entityData.values()) {
} finally {
/**
 * Computes the entity suggestions for a single named entity.
 *
 * @param site
 *            The {@link Site} to query or <code>null</code> to use the {@link Entityhub}
 * @param namedEntity
 *            the named entity (name and type) to search suggestions for
 * @param subsumedAnnotations
 *            other text annotations for the same entity
 * @param language
 *            the language of the analysed text or <code>null</code> if not available.
 * @return the suggestions for the parsed {@link NamedEntity}
 * @throws EntityhubException
 *             On any Error while looking up Entities via the Entityhub
 */
// Builds a field query from the name (and, if configured, the rdf:type) of the named entity,
// runs it against the given site - or the Entityhub if site is null - and converts the results
// into scored Suggestions. At most numSuggestions entries are returned; an empty list is
// returned if the entity type is disabled or the query has no results.
// NOTE(review): 'subsumedAnnotations' is not referenced in the body visible here.
protected final List<Suggestion> computeEntityRecommentations(Site site,
NamedEntity namedEntity,
List<UriRef> subsumedAnnotations,
String language) throws EntityhubException {
// First get the required properties for the parsed textAnnotation
// ... and check the values
log.debug("Process {}", namedEntity);
// if site is NULL use
// the Entityhub
FieldQueryFactory queryFactory = site == null ? entityhub.getQueryFactory() : site.getQueryFactory();
log.trace("Will use a query-factory of type [{}].", queryFactory.getClass().toString());
FieldQuery query = queryFactory.createFieldQuery();
// NOTE(review): an earlier comment here described replacing spaces with '+' to create an AND
// search for all words in the name; no such replacement is performed by the code below.
Constraint labelConstraint;
// TODO: make case sensitivity configurable
boolean casesensitive = false;
// with casesensitive == false the name is lower-cased (default locale) before it is used
// in the query and in the label comparison below
String namedEntityLabel = casesensitive ? namedEntity.getName() : namedEntity.getName().toLowerCase();
if (language != null) {
// search labels in the language and without language
labelConstraint = new TextConstraint(namedEntityLabel, casesensitive, language, null);
} else {
labelConstraint = new TextConstraint(namedEntityLabel, casesensitive);
query.setConstraint(nameField, labelConstraint);
// add the configured rdf:type constraint for the type of the named entity, or return
// early if processing of that type is disabled
if (OntologicalClasses.DBPEDIA_PERSON.equals(namedEntity.getType())) {
if (personState) {
if (personType != null) {
query.setConstraint(RDF_TYPE.getUnicodeString(), new ReferenceConstraint(personType));
// else no type constraint
} else {
// ignore people
return Collections.emptyList();
} else if (DBPEDIA_ORGANISATION.equals(namedEntity.getType())) {
if (orgState) {
if (orgType != null) {
query.setConstraint(RDF_TYPE.getUnicodeString(), new ReferenceConstraint(orgType));
// else no type constraint
} else {
// ignore organisations
return Collections.emptyList();
} else if (OntologicalClasses.DBPEDIA_PLACE.equals(namedEntity.getType())) {
if (this.placeState) {
if (this.placeType != null) {
query.setConstraint(RDF_TYPE.getUnicodeString(), new ReferenceConstraint(placeType));
// else no type constraint
} else {
// ignore places
return Collections.emptyList();
// request more results than suggestions needed: at least 20, or three times numSuggestions
query.setLimit(Math.max(20, this.numSuggestions * 3));
log.trace("A query has been created of type [{}] and the following settings:\n{}", query.getClass()
.toString(), query.toString());
if (null == site) log.trace("A query will be sent to the entity-hub of type [{}].", entityhub
else log.trace("A query will be sent to a site [id :: {}][type :: {}].", site.getId(), site
QueryResultList<Entity> results = site == null ? // if site is NULL
: // use the Entityhub
site.findEntities(query); // else the referenced site
log.debug(" - {} results returned by query {}", results.size(), results.getQuery());
if (results.isEmpty()) { // no results nothing to do
return Collections.emptyList();
// we need to normalise the confidence values from [0..1]
// * levenshtein distance as absolute (1.0 for exact match)
// * Solr scores * levenshtein to rank entities relative to each other
// maxScore: score of the first result; maxExactScore: score of the first exact label match
Float maxScore = null;
Float maxExactScore = null;
List<Suggestion> matches = new ArrayList<Suggestion>(numSuggestions);
// assumes entities are sorted by score
for (Iterator<Entity> guesses = results.iterator(); guesses.hasNext();) {
Suggestion match = new Suggestion(;
Representation rep = match.getEntity().getRepresentation();
// NOTE(review): 'score' is dereferenced below without a null check - presumably every
// result carries a resultScore value; confirm
Float score = rep.getFirst(RdfResourceEnum.resultScore.getUri(), Float.class);
if (maxScore == null) {
maxScore = score;
Iterator<Text> labels = rep.getText(nameField);
// look for the best matching label; stop as soon as an exact match (1.0) was found
while (labels.hasNext() && match.getLevenshtein() < 1.0) {
Text label =;
if (language == null || // if the content language is unknown ->
// accept all labels
label.getLanguage() == null || // accept labels with no
// language
// and labels in the same language as the content
(language != null && label.getLanguage().startsWith(language))) {
double actMatch = levenshtein(
casesensitive ? label.getText() : label.getText().toLowerCase(), namedEntityLabel);
if (actMatch > match.getLevenshtein()) {
// only entities for which a label was matched get a score
if (match.getMatchedLabel() != null) {
if (match.getLevenshtein() == 1.0) {
if (maxExactScore == null) {
maxExactScore = score;
// normalise exact matches against the best exact score
match.setScore(score.doubleValue() / maxExactScore.doubleValue());
} else {
// normalise partial matches against the best match and the
// Levenshtein similarity with the label
match.setScore(score.doubleValue() * match.getLevenshtein() / maxScore.doubleValue());
} else {
log.debug("No value of {} for Entity {}!", nameField, match.getEntity().getId());
// now sort the results
// cut the list to the configured maximum number of suggestions
return matches.subList(0, Math.min(matches.size(), numSuggestions));
/**
 * This EnhancementEngine can enhance any ContentItem as it consumes existing TextAnnotations with the
 * configured dc:type's.
 *
 * @see org.apache.stanbol.enhancer.servicesapi.EnhancementEngine#canEnhance(org.apache.stanbol.enhancer.servicesapi.ContentItem)
 */
// Always reports asynchronous enhancement support; the content item itself is not inspected.
public int canEnhance(ContentItem ci) {
return ENHANCE_ASYNC; // Entity tagging now supports async processing
// Exposes a single service property: the engine ordering, fixed to defaultOrder.
// The returned map is wrapped as unmodifiable.
public Map<String,Object> getServiceProperties() {
return Collections.unmodifiableMap(Collections.singletonMap(ENHANCEMENT_ENGINE_ORDERING,
(Object) defaultOrder));
/**
 * Compares two strings (after {@link StringUtils#trim(String) trimming}) by using the Levenshtein edit
 * distance of the two strings. Does not return the {@link Integer} number of changes but the similarity
 * <code>1-(changes/maxStringSizeAfterTrim)</code>.
 *
 * @param s1
 *            the first string
 * @param s2
 *            the second string
 * @return the similarity: <code>1.0</code> for equal strings, <code>0</code> if either string is empty
 *         after trimming
 * @throws IllegalArgumentException
 *             if any of the two parsed strings is NULL
 */
private static double levenshtein(String s1, String s2) {
if (s1 == null || s2 == null) {
throw new IllegalArgumentException("NONE of the parsed String MUST BE NULL!");
s1 = StringUtils.trim(s1);
s2 = StringUtils.trim(s2);
return s1.isEmpty() || s2.isEmpty() ? 0
: 1.0 - (((double) getLevenshteinDistance(s1, s2)) / ((double) (Math.max(s1.length(),