blob: d72203fbf167a6edb42f31a4ee1d81495cfe688b [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.zemanta.impl;
import static org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper.createTextEnhancement;
import static org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper.createTopicEnhancement;
import static org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper.getReferences;
import static org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses.SKOS_CONCEPT;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_END;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_LABEL;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_TYPE;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTION_CONTEXT;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE;
import static org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses.ENHANCER_CATEGORY;
import static org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses.ENHANCER_TEXTANNOTATION;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.clerezza.rdf.core.Literal;
import org.apache.clerezza.rdf.core.LiteralFactory;
import org.apache.clerezza.rdf.core.MGraph;
import org.apache.clerezza.rdf.core.NonLiteral;
import org.apache.clerezza.rdf.core.PlainLiteral;
import org.apache.clerezza.rdf.core.Triple;
import org.apache.clerezza.rdf.core.TripleCollection;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
import org.apache.clerezza.rdf.core.impl.SimpleMGraph;
import org.apache.clerezza.rdf.core.impl.TripleImpl;
import org.apache.commons.io.IOUtils;
import org.apache.felix.scr.annotations.Activate;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.Deactivate;
import org.apache.felix.scr.annotations.Properties;
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.Reference;
import org.apache.felix.scr.annotations.Service;
import org.apache.stanbol.commons.stanboltools.offline.OnlineMode;
import org.apache.stanbol.enhancer.engines.zemanta.ZemantaOntologyEnum;
import org.apache.stanbol.enhancer.servicesapi.Blob;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses;
import org.osgi.framework.BundleContext;
import org.osgi.service.cm.ConfigurationException;
import org.osgi.service.component.ComponentContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Apache Stanbol Enhancer Zemanta enhancement engine.
* This enhancement engine uses the Zemanta API for enhancing content.
* See http://developer.zemanta.com
* To run this engine you need a Zemanta API key configured (see README)
* <p>
* For detailed information on the mappings of Zemanta annotations to Stanbol
* Enhancer enhancements see
* <a href="http://wiki.iks-project.eu/index.php/ZemantaEnhancementEngine">http://wiki.iks-project.eu/index.php/ZemantaEnhancementEngine</a>
* <p>
* This implementation currently only provides Stanbol Enhancer enhancements for
* Zemanta Recognitions.
*
* @author michaelmarth
* @author Rupert Westenthaler
*/
@Component(immediate = true, metatype = true, inherit = true)
@Service
@Properties(value={
@Property(name=EnhancementEngine.PROPERTY_NAME,value="zemanta")
})
public class ZemantaEnhancementEngine
extends AbstractEnhancementEngine<IOException,RuntimeException>
implements EnhancementEngine, ServiceProperties {
/**
* Name of the configuration property holding the Zemanta API key. The value
* is read during activation and must neither be missing nor blank.
*/
@Property
public static final String API_KEY_PROPERTY = "org.apache.stanbol.enhancer.engines.zemanta.key";
/**
* Base URL that is combined with the title of a category to build the
* entity reference written for that category.
*/
public static final String DMOZ_BASE_URL = "http://www.dmoz.org/";
/**
* Prefix of category titles. An entity reference is only written for titles
* starting with this prefix; the prefix is removed before the remainder is
* appended to {@link #DMOZ_BASE_URL}.
*/
public static final String ZEMANTA_DMOZ_PREFIX = "Top/";
/**
* The mime types used to look up the content part that is sent to Zemanta.
*/
protected static final Set<String> SUPPORTED_MIMETYPES =
Collections.unmodifiableSet(new HashSet<String>(
Arrays.asList("text/plain","text/html")));
/**
* The maximal prefix/suffix size used for the selection context. This is
* required, because Zemanta does only provide the Anchor text, but not the
* exact position within the text. So this engine creates a TextAnnotation
* for each occurrence of the Anchor within the text and uses the surrounding
* as context.
*/
private static final int SELECTION_CONTEXT_PREFIX_SUFFIX_SIZE = 50;
private static final Logger log = LoggerFactory.getLogger(ZemantaEnhancementEngine.class);
/**
* The default value for the Execution of this Engine. Currently set to
* {@link ServiceProperties#ORDERING_EXTRACTION_ENHANCEMENT} + 10. It should run after Metaxa and LangId.
*/
public static final Integer defaultOrder = ServiceProperties.ORDERING_EXTRACTION_ENHANCEMENT + 10;
/**
* The Zemanta API key taken from the component configuration on activation
* and reset to <code>null</code> on deactivation.
*/
private String key;
/**
* Factory used to create the typed literals (positions, confidence values)
* written to the metadata. Set on activation, reset to <code>null</code> on
* deactivation.
*/
public LiteralFactory literalFactory;
/**
* The bundle context of this component. Set on activation, reset to
* <code>null</code> on deactivation.
*/
protected BundleContext bundleContext;
/**
* Only activate this engine in online mode
*/
@SuppressWarnings("unused")
@Reference
private OnlineMode onlineMode;
/**
 * Activates this engine: delegates to the super class, remembers the bundle
 * context, reads and validates the Zemanta API key and finally obtains the
 * {@link LiteralFactory}.
 *
 * @param ce the context of this component
 * @throws IOException if thrown by the activation of the super class
 * @throws ConfigurationException if no Zemanta API key is configured
 */
@Activate
protected void activate(ComponentContext ce) throws IOException,ConfigurationException {
    super.activate(ce);
    this.bundleContext = ce.getBundleContext();
    Object configuredKey = ce.getProperties().get(API_KEY_PROPERTY);
    this.key = (String) configuredKey;
    //fail before any further initialisation if the key is not usable
    checkConfig();
    this.literalFactory = LiteralFactory.getInstance();
}
/**
 * Deactivates this engine: delegates to the super class and releases all
 * state that was initialised during activation.
 *
 * @param ce the context of this component
 */
@Deactivate
protected void deactivate(ComponentContext ce) {
    super.deactivate(ce);
    //reset everything that was set by activate(..)
    this.bundleContext = null;
    this.key = null;
    this.literalFactory = null;
}
/**
 * Checks the configuration of the {@link #API_KEY_PROPERTY}. A key that is
 * <code>null</code> or consists of whitespace only is treated as missing.
 *
 * @throws ConfigurationException if the Zemanta key is not configured
 */
private void checkConfig() throws ConfigurationException {
    if(key == null || key.trim().length() == 0) {
        //NOTE: the trailing space after "by" is required, otherwise the
        //concatenated message reads "byusing"
        throw new ConfigurationException(API_KEY_PROPERTY,String.format(
            "%s : please configure a Zemanta key to use this engine (e.g. by " +
            "using the 'Configuration' tab of the Apache Felix Web Console).",
            getClass().getSimpleName()));
    }
}
/**
 * Checks if the parsed content item provides a content part with one of the
 * {@link #SUPPORTED_MIMETYPES}.
 *
 * @param ci the content item
 * @return <code>ENHANCE_ASYNC</code> if such a content part is present,
 * otherwise <code>CANNOT_ENHANCE</code>
 */
public int canEnhance(ContentItem ci) {
    boolean supportedPartPresent = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES) != null;
    //the ZEMANTA engine now supports async processing!
    return supportedPartPresent ? ENHANCE_ASYNC : CANNOT_ENHANCE;
}
/**
 * Sends the text of the parsed content item to Zemanta and converts the
 * returned recognitions and categories to enhancements that are added to
 * the metadata of the content item.
 * <p>
 * The remote request is done without holding a lock on the content item;
 * the write lock is only acquired while the results are written to the
 * metadata.
 *
 * @param ci the content item to enhance
 * @throws InvalidContentException if the text can not be read from the
 * content part
 * @throws EngineException if the request to the Zemanta service fails
 * @throws IllegalStateException if the content item has no content part
 * with one of the {@link #SUPPORTED_MIMETYPES}
 */
public void computeEnhancements(ContentItem ci) throws EngineException {
    Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
    if(contentPart == null){
        //NOTE: every fragment ends with a space so that the words of the
        //concatenated message are not glued together
        throw new IllegalStateException("No ContentPart with a supported Mime Type "
            + "found for ContentItem "+ci.getUri()+" (supported: '"
            + SUPPORTED_MIMETYPES+"') -> this indicates that canEnhance was "
            + "NOT called and indicates a bug in the used EnhancementJobManager!");
    }
    String text;
    try {
        text = ContentItemHelper.getText(contentPart.getValue());
    } catch (IOException e) {
        throw new InvalidContentException(this, ci, e);
    }
    if (text.trim().length() == 0) {
        log.warn("ContentPart {} of ContentItem {} does not contain any text to enhance",
            contentPart.getKey(),ci.getUri());
        return;
    }
    MGraph graph = ci.getMetadata();
    UriRef ciId = ci.getUri();
    //we need to store the results of Zemanta in an temp graph
    MGraph results = new SimpleMGraph();
    ZemantaAPIWrapper zemanta = new ZemantaAPIWrapper(key);
    try {
        results.addAll(zemanta.enhance(text));
    } catch (IOException e) {
        throw new EngineException("Unable to get Enhancement from remote Zemanta Service",e);
    }
    //now we need to process the results and convert them into the Enhancer
    //annotation structure. The metadata are only modified while holding
    //the write lock of the content item
    ci.getLock().writeLock().lock();
    try {
        processRecognition(results, graph, text, ciId);
        processCategories(results, graph, ciId);
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
/**
 * Provides the service properties of this engine.
 *
 * @return an unmodifiable map with the single entry
 * <code>ENHANCEMENT_ENGINE_ORDERING</code> -&gt; {@link #defaultOrder}
 */
public Map<String, Object> getServiceProperties() {
    Map<String, Object> ordering = Collections.singletonMap(
        ENHANCEMENT_ENGINE_ORDERING, (Object) defaultOrder);
    return Collections.unmodifiableMap(ordering);
}
/**
 * Processes all Zemanta Categories and converts them to TopicAnnotations.
 * <p>
 * Only categories that define a target using the DMOZ categorisation scheme
 * and providing a title are converted; all others are skipped with a
 * warning. All created TopicAnnotations are linked (dc:relation) to a single
 * TextAnnotation that is created together with the first TopicAnnotation.
 *
 * @param results the results of the Zemanta enhancement process
 * @param enhancements the graph the enhancements are added to
 * @param ciId the ID of the content item
 */
protected void processCategories(MGraph results, MGraph enhancements, UriRef ciId) {
    Iterator<Triple> categories = results.filter(null, RDF_TYPE, ZemantaOntologyEnum.Category.getUri());
    //add the root Text annotation as soon as the first TopicAnnotation is added.
    UriRef textAnnotation = null;
    while (categories.hasNext()) {
        NonLiteral category = categories.next().getSubject();
        log.debug("process category {}", category);
        Double confidence = parseConfidence(results, category);
        log.debug(" > confidence :{}", confidence);
        //now we need to follow the Target link
        UriRef target = EnhancementEngineHelper.getReference(results, category, ZemantaOntologyEnum.target.getUri());
        if (target == null) {
            log.warn("Unable to process category {} because no target node was found", category);
            continue;
        }
        //first check the used categorisation
        UriRef categorisationScheme = EnhancementEngineHelper.getReference(results, target, ZemantaOntologyEnum.categorization.getUri());
        if (categorisationScheme == null || !categorisationScheme.equals(ZemantaOntologyEnum.categorization_DMOZ.getUri())) {
            //explicit Object[] so that this compiles regardless of whether
            //the used SLF4J version declares the method with varargs
            log.warn("Unable to process category {} because categorisation scheme != DMOZ ({} != {})",
                new Object[] {category, categorisationScheme, ZemantaOntologyEnum.categorization_DMOZ.getUri()});
            continue;
        }
        String categoryTitle = EnhancementEngineHelper.getString(results, target, ZemantaOntologyEnum.title.getUri());
        if (categoryTitle == null) {
            log.warn("Unable to process category {} because no title is present", category);
            continue;
        }
        if (textAnnotation == null) {
            //this is the first category ... create the TextAnnotation used
            //to link all fise:TopicAnnotations
            textAnnotation = createTextEnhancement(enhancements, this, ciId);
            enhancements.add(new TripleImpl(textAnnotation, DC_TYPE, SKOS_CONCEPT));
        }
        //now write the TopicAnnotation
        UriRef categoryEnhancement = createTopicEnhancement(enhancements, this, ciId);
        //link the TopicAnnotation with the shared TextAnnotation
        enhancements.add(new TripleImpl(categoryEnhancement, DC_RELATION, textAnnotation));
        //write the title
        enhancements.add(new TripleImpl(categoryEnhancement, ENHANCER_ENTITY_LABEL, new PlainLiteralImpl(categoryTitle)));
        //write the reference (only possible for titles using the expected prefix)
        if (categoryTitle.startsWith(ZEMANTA_DMOZ_PREFIX)) {
            String dmozPath = categoryTitle.substring(ZEMANTA_DMOZ_PREFIX.length());
            enhancements.add(
                new TripleImpl(categoryEnhancement, ENHANCER_ENTITY_REFERENCE, new UriRef(DMOZ_BASE_URL + dmozPath)));
        }
        //write the confidence
        if (confidence != null) {
            enhancements.add(new TripleImpl(categoryEnhancement, ENHANCER_CONFIDENCE,
                literalFactory.createTypedLiteral(confidence)));
        }
        //we need to write the fise:entity-type
        //as of STANBOL-617 we use now both the zemanta:Category AND the skos:Concept
        //type. dc:type is no longer used as this is only used by fise:TextAnnotations
        // see http://wiki.iks-project.eu/index.php/ZemantaEnhancementEngine#Mapping_of_Categories
        // for more Information
        enhancements.add(new TripleImpl(categoryEnhancement, ENHANCER_ENTITY_TYPE, SKOS_CONCEPT));
        //Use also Zemanta Category as type for the referred Entity
        enhancements.add(new TripleImpl(categoryEnhancement, ENHANCER_ENTITY_TYPE, ZemantaOntologyEnum.Category.getUri()));
    }
}
/**
 * Processes all Zemanta Recognitions and converts them to the according
 * FISE enhancements.
 * <p>
 * For each recognition the TextAnnotations for the anchor are looked up or
 * created and a single EntityAnnotation is written that refers to all
 * entities linked by owl:sameAs to the object of the recognition.
 * Recognitions without an object are skipped with a warning, and the entity
 * label is only written if a title was found.
 *
 * @param results the results of the Zemanta enhancement process
 * @param enhancements the graph containing the current Stanbol Enhancer
 * enhancements
 * @param text the content of the content item as string
 * @param ciId the ID of the content item
 */
protected void processRecognition(MGraph results, MGraph enhancements, String text, UriRef ciId) {
    Iterator<Triple> recognitions = results.filter(null, RDF_TYPE, ZemantaOntologyEnum.Recognition.getUri());
    while (recognitions.hasNext()) {
        NonLiteral recognition = recognitions.next().getSubject();
        log.debug("process recognition {}", recognition);
        //first get everything we need for the textAnnotations
        Double confidence = parseConfidence(results, recognition);
        log.debug(" > confidence :{}", confidence);
        String anchor = EnhancementEngineHelper.getString(results, recognition, ZemantaOntologyEnum.anchor.getUri());
        log.debug(" > anchor :{}", anchor);
        Collection<NonLiteral> textAnnotations = processTextAnnotation(enhancements, text, ciId, anchor, confidence);
        log.debug(" > number of textAnnotations :{}", textAnnotations.size());
        //second we need to create the EntityAnnotation that represent the
        //recognition
        NonLiteral object = EnhancementEngineHelper.getReference(results, recognition, ZemantaOntologyEnum.object.getUri());
        log.debug(" > object :{}", object);
        if (object == null) {
            //a null subject acts as wildcard when filtering the graph, so
            //the lookups below would return the values of ALL resources
            log.warn("Unable to create an EntityAnnotation for recognition {} because no object is present", recognition);
            continue;
        }
        //The targets represent the linked entities
        // ... and yes there can be more of them!
        //TODO: can we create an EntityAnnotation with several referred entities?
        // Should we use the owl:sameAs to decide that!
        Set<UriRef> sameAsSet = new HashSet<UriRef>();
        Iterator<UriRef> sameAs = getReferences(results, object, ZemantaOntologyEnum.owlSameAs.getUri());
        while (sameAs.hasNext()) {
            sameAsSet.add(sameAs.next());
        }
        log.debug(" > sameAs :{}", sameAsSet);
        //now parse the targets and look if there are others than the one
        //merged by using sameAs
        Iterator<UriRef> targets = EnhancementEngineHelper.getReferences(results, object, ZemantaOntologyEnum.target.getUri());
        String title = null;
        while (targets.hasNext()) {
            //the entityRef is the URL of the target
            UriRef entity = targets.next();
            log.debug(" - target :{}", entity);
            UriRef targetType = EnhancementEngineHelper.getReference(results, entity, ZemantaOntologyEnum.targetType.getUri());
            log.debug(" o type :{}", targetType);
            if (ZemantaOntologyEnum.targetType_RDF.getUri().equals(targetType)) {
                String targetTitle = EnhancementEngineHelper.getString(results, entity, ZemantaOntologyEnum.title.getUri());
                log.debug(" o title :{}", targetTitle);
                if (sameAsSet.contains(entity)) {
                    if (title == null) {
                        title = targetTitle;
                    } else if (!title.equals(targetTitle)) {
                        log.warn("Entities marked with owl:sameAs do use different labels '{}' != '{}'!", title, targetTitle);
                    } //else the same label used by both -> thats expected
                } else {
                    //maybe we should create an second entityEnhancement, but I think, that such a case should
                    //not happen. So write an warning for now
                    log.warn("Found Target with type RDF, that is not linked with owl:sameAs to the others (this: '{} | sameAs: {})", entity, sameAsSet);
                    log.warn(" - no Enhancement for {} will be created", entity);
                }
            } //else -> do not process -> RDF Entities only
            //TODO: targetTypes are not parsed by Zemanta, therefore we can not set
            // any entity types!
        }
        //create the entityEnhancement
        UriRef entityEnhancement = EnhancementEngineHelper.createEntityEnhancement(enhancements, this, ciId);
        if (confidence != null) {
            enhancements.add(
                new TripleImpl(entityEnhancement, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(confidence)));
        }
        for (NonLiteral relatedTextAnnotation : textAnnotations) {
            enhancements.add(
                new TripleImpl(entityEnhancement, DC_RELATION, relatedTextAnnotation));
        }
        for (UriRef entity : sameAsSet) {
            enhancements.add(
                new TripleImpl(entityEnhancement, ENHANCER_ENTITY_REFERENCE, entity));
        }
        if (title != null) {
            enhancements.add(
                new TripleImpl(entityEnhancement, ENHANCER_ENTITY_LABEL, new PlainLiteralImpl(title)));
        } else {
            //a literal can not be created for a missing title, so the
            //label is omitted instead of failing the whole enhancement
            log.warn("No title found for recognition {} -> EntityAnnotation {} is written without a label", recognition, entityEnhancement);
        }
    }
}
/**
 * Helper method to parse the confidence property of a resource. Zemanta
 * does not use the xsd data type, because of that the double value needs to
 * be parsed based on the string.
 *
 * @param tc the graph used to query for the confidence value
 * @param resource the resource holding the confidence property
 *
 * @return the confidence or <code>null</code> if no confidence property is
 * present for the parsed resource or the value can not be converted to a
 * double value.
 * @see ZemantaOntologyEnum#confidence
 */
private static Double parseConfidence(TripleCollection tc, NonLiteral resource) {
    String rawValue = EnhancementEngineHelper.getString(tc, resource, ZemantaOntologyEnum.confidence.getUri());
    if (rawValue == null) {
        //no confidence present for the resource
        return null;
    }
    try {
        return Double.valueOf(rawValue);
    } catch (NumberFormatException e) {
        log.warn("Unable to parse Float confidence for Literal value '" + rawValue + "'");
        return null;
    }
}
/**
 * This Methods searches/creates text annotations for anchor points of Zemanta
 * extractions.
 * <p>
 * First this method searches for text annotations that do use the anchor as
 * selected text. Second it searches for occurrences of the anchor within the
 * content and checks if there is an text annotation for that occurrence. If
 * not it creates an new one.
 * <p>
 * If the anchor is <code>null</code> or empty no text annotations are
 * searched or created and an empty collection is returned.
 *
 * @param enhancements the graph containing the meta data
 * @param text the content as string
 * @param ciId the ID of the content item
 * @param anchor the anchor text
 * @param confidence the confidence to be used for newly created text annotations
 *
 * @return a collection of all existing/created text annotations for the parsed anchor
 */
private Collection<NonLiteral> processTextAnnotation(MGraph enhancements, String text, UriRef ciId, String anchor, Double confidence) {
    Collection<NonLiteral> textAnnotations = new ArrayList<NonLiteral>();
    if (anchor == null || anchor.length() == 0) {
        //a missing anchor can not be searched and an empty anchor is found
        //at every position: String#indexOf would never return -1 and the
        //loop below would create TextAnnotations forever
        log.warn("Unable to process TextAnnotations for ContentItem {} because the anchor is missing or empty", ciId);
        return textAnnotations;
    }
    int anchorLength = anchor.length();
    Literal anchorLiteral = new PlainLiteralImpl(anchor);
    //first search for existing TextAnnotations for the anchor
    Map<Integer, Collection<NonLiteral>> existingTextAnnotationsMap = searchExistingTextAnnotations(enhancements, anchorLiteral);
    for (int current = text.indexOf(anchor); current >= 0; current = text.indexOf(anchor, current + 1)) {
        Collection<NonLiteral> existingTextAnnotations = existingTextAnnotationsMap.get(current);
        if (existingTextAnnotations != null) {
            //use the existing once
            textAnnotations.addAll(existingTextAnnotations);
        } else {
            //we need to create an new one!
            UriRef textAnnotation = EnhancementEngineHelper.createTextEnhancement(enhancements, this, ciId);
            textAnnotations.add(textAnnotation);
            //write the selection
            enhancements.add(
                new TripleImpl(textAnnotation, ENHANCER_START, literalFactory.createTypedLiteral(current)));
            enhancements.add(
                new TripleImpl(textAnnotation, ENHANCER_END, literalFactory.createTypedLiteral(current + anchorLength)));
            enhancements.add(
                new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT, anchorLiteral));
            //extract the selection context
            String selectionContext = extractSelectionContext(text, current, current + anchorLength);
            enhancements.add(
                new TripleImpl(textAnnotation, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(selectionContext)));
            //TODO: Currently I use the confidence of the extraction, but I think this is more
            // related to the annotated Entity rather to the selected text.
            if (confidence != null) {
                enhancements.add(
                    new TripleImpl(textAnnotation, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(confidence)));
            }
            //TODO: No idea about the type of the Annotation, because we do not have an type of the entity!
            // One would need to get the types from the referred Source
        }
    }
    return textAnnotations;
}
/**
 * Extracts the selection context for a selection: the selected text plus up
 * to {@link #SELECTION_CONTEXT_PREFIX_SUFFIX_SIZE} characters before and
 * after it. Where possible the context begins and ends at a space so that
 * words are not cut.
 *
 * @param text the content as string
 * @param selectionStart the start index (inclusive) of the selection
 * @param selectionEnd the end index (exclusive) of the selection
 *
 * @return the selection context
 */
private static String extractSelectionContext(String text, int selectionStart, int selectionEnd) {
    int beginPos;
    if (selectionStart <= SELECTION_CONTEXT_PREFIX_SUFFIX_SIZE) {
        beginPos = 0;
    } else {
        int start = selectionStart - SELECTION_CONTEXT_PREFIX_SUFFIX_SIZE;
        beginPos = text.indexOf(' ', start);
        if (beginPos < 0 || beginPos >= selectionStart) { //no words
            beginPos = start; //begin within a word
        }
    }
    int endPos;
    if (selectionEnd + SELECTION_CONTEXT_PREFIX_SUFFIX_SIZE >= text.length()) {
        endPos = text.length();
    } else {
        int start = selectionEnd + SELECTION_CONTEXT_PREFIX_SUFFIX_SIZE;
        endPos = text.lastIndexOf(' ', start);
        if (endPos <= selectionEnd) {
            endPos = start; //end within a word;
        }
    }
    return text.substring(beginPos, endPos);
}
/**
 * Collects the existing TextAnnotations that select the parsed text and
 * groups them by their start position. Resources that are not typed as
 * TextAnnotation or that define no start position are ignored.
 *
 * @param enhancements the graph containing the enhancements to be searched
 * @param anchorLiteral the Literal representing the selected text
 *
 * @return Map that uses the start position as key and the collection of
 * text annotations starting at that position as value.
 */
private Map<Integer, Collection<NonLiteral>> searchExistingTextAnnotations(MGraph enhancements, Literal anchorLiteral) {
    Map<Integer, Collection<NonLiteral>> annotationsByStart = new HashMap<Integer, Collection<NonLiteral>>();
    for (Iterator<Triple> selections = enhancements.filter(null, ENHANCER_SELECTED_TEXT, anchorLiteral); selections.hasNext();) {
        NonLiteral candidate = selections.next().getSubject();
        //test rdfType
        if (!enhancements.contains(new TripleImpl(candidate, RDF_TYPE, ENHANCER_TEXTANNOTATION))) {
            continue;
        }
        Integer start = EnhancementEngineHelper.get(enhancements, candidate, ENHANCER_START, Integer.class, literalFactory);
        if (start == null) {
            continue;
        }
        Collection<NonLiteral> sameStart = annotationsByStart.get(start);
        if (sameStart == null) {
            sameStart = new ArrayList<NonLiteral>();
            annotationsByStart.put(start, sameStart);
        }
        sameStart.add(candidate);
    }
    return annotationsByStart;
}
}