// blob: e7b52431461823877497eea4d959ff0f664ea6a8 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.servicesapi.helper;
import static java.util.Collections.singleton;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_LANGUAGE;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE;
import static org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses.ENHANCER_TEXTANNOTATION;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.UUID;
import org.apache.clerezza.rdf.core.Literal;
import org.apache.clerezza.rdf.core.LiteralFactory;
import org.apache.clerezza.rdf.core.MGraph;
import org.apache.clerezza.rdf.core.NonLiteral;
import org.apache.clerezza.rdf.core.Resource;
import org.apache.clerezza.rdf.core.Triple;
import org.apache.clerezza.rdf.core.TripleCollection;
import org.apache.clerezza.rdf.core.TypedLiteral;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.clerezza.rdf.core.impl.TripleImpl;
import org.apache.stanbol.enhancer.servicesapi.Chain;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
import org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class EnhancementEngineHelper {
/** Shared random number generator used by {@link #randomUUID()}; can be re-seeded via {@link #setSeed(long)}. */
protected final static Random rng = new Random();
/** Logger of this helper class. */
private final static Logger log = LoggerFactory.getLogger(EnhancementEngineHelper.class);
/** Literal factory obtained once from {@link LiteralFactory#getInstance()}. */
private final static LiteralFactory lf = LiteralFactory.getInstance();
/**
 * Re-seeds the shared random number generator that backs
 * {@link #randomUUID()}.
 *
 * @param seed the new seed
 */
public static void setSeed(long seed) {
    EnhancementEngineHelper.rng.setSeed(seed);
}
/**
 * Creates a new enhancement typed as enhancer:Enhancement and
 * enhancer:TextAnnotation in the metadata graph of the parsed content item.
 * The metadata graph and the URI of the content item are handed over to
 * {@link #createTextEnhancement(MGraph, EnhancementEngine, UriRef)}.
 *
 * @param ci the ContentItem under analysis
 * @param engine the engine performing the analysis
 *
 * @return the URI of the new enhancement instance
 */
public static UriRef createTextEnhancement(ContentItem ci,
        EnhancementEngine engine){
    final MGraph metadata = ci.getMetadata();
    final UriRef contentItemId = new UriRef(ci.getUri().getUnicodeString());
    return createTextEnhancement(metadata, engine, contentItemId);
}
/**
 * Creates a new enhancement in the parsed graph and additionally types it
 * as enhancer:TextAnnotation. The default properties (rdf:type
 * enhancer:Enhancement, dc:creator, dc:created and enhancer:extracted-from)
 * are added by
 * {@link #createEnhancement(MGraph, EnhancementEngine, UriRef)}.
 *
 * @param metadata the graph the enhancement is added to
 * @param engine the engine creating the enhancement
 * @param contentItemId the id of the content item the enhancement was
 * extracted from
 *
 * @return the URI of the new enhancement instance
 */
public static UriRef createTextEnhancement(MGraph metadata,
        EnhancementEngine engine, UriRef contentItemId){
    final UriRef textAnnotation = createEnhancement(metadata, engine, contentItemId);
    // mark the generic enhancement as a text annotation
    final Triple typeTriple = new TripleImpl(textAnnotation, RDF_TYPE, ENHANCER_TEXTANNOTATION);
    metadata.add(typeTriple);
    return textAnnotation;
}
/**
 * Creates a new enhancement typed as enhancer:Enhancement and
 * enhancer:EntityAnnotation in the metadata graph of the parsed content
 * item. The metadata graph and the URI of the content item are handed over
 * to {@link #createEntityEnhancement(MGraph, EnhancementEngine, UriRef)}.
 *
 * @param ci the ContentItem under analysis
 * @param engine the engine performing the analysis
 * @return the URI of the new enhancement instance
 */
public static UriRef createEntityEnhancement(ContentItem ci,
        EnhancementEngine engine){
    final MGraph metadata = ci.getMetadata();
    final UriRef contentItemId = new UriRef(ci.getUri().getUnicodeString());
    return createEntityEnhancement(metadata, engine, contentItemId);
}
/**
 * Creates a new enhancement in the parsed graph and additionally types it
 * as enhancer:EntityAnnotation. The default properties (rdf:type
 * enhancer:Enhancement, dc:creator, dc:created and enhancer:extracted-from)
 * are added by
 * {@link #createEnhancement(MGraph, EnhancementEngine, UriRef)}.
 *
 * @param metadata the graph the enhancement is added to
 * @param engine the engine creating the enhancement
 * @param contentItemId the id of the content item the enhancement was
 * extracted from
 *
 * @return the URI of the new enhancement instance
 */
public static UriRef createEntityEnhancement(MGraph metadata,
        EnhancementEngine engine, UriRef contentItemId){
    final UriRef entityAnnotation = createEnhancement(metadata, engine, contentItemId);
    // mark the generic enhancement as an entity annotation
    final Triple typeTriple = new TripleImpl(entityAnnotation, RDF_TYPE,
        TechnicalClasses.ENHANCER_ENTITYANNOTATION);
    metadata.add(typeTriple);
    return entityAnnotation;
}
/**
 * Creates a new enhancement in the parsed graph and additionally types it
 * as enhancer:TopicAnnotation. The default properties (rdf:type
 * enhancer:Enhancement, dc:creator, dc:created and enhancer:extracted-from)
 * are added by
 * {@link #createEnhancement(MGraph, EnhancementEngine, UriRef)}.
 *
 * @param metadata the graph the enhancement is added to
 * @param engine the engine creating the enhancement
 * @param contentItemId the id of the content item the enhancement was
 * extracted from
 *
 * @return the URI of the new enhancement instance
 */
public static UriRef createTopicEnhancement(MGraph metadata,
        EnhancementEngine engine, UriRef contentItemId){
    final UriRef topicAnnotation = createEnhancement(metadata, engine, contentItemId);
    // mark the generic enhancement as a topic annotation
    final Triple typeTriple = new TripleImpl(topicAnnotation, RDF_TYPE,
        TechnicalClasses.ENHANCER_TOPICANNOTATION);
    metadata.add(typeTriple);
    return topicAnnotation;
}
/**
 * Creates a new enhancement typed as enhancer:Enhancement and
 * enhancer:TopicAnnotation in the metadata graph of the parsed content
 * item. The metadata graph and the URI of the content item are handed over
 * to {@link #createTopicEnhancement(MGraph, EnhancementEngine, UriRef)}.
 *
 * @param ci the ContentItem under analysis
 * @param engine the engine performing the analysis
 * @return the URI of the new enhancement instance
 */
public static UriRef createTopicEnhancement(ContentItem ci,
        EnhancementEngine engine){
    final MGraph metadata = ci.getMetadata();
    final UriRef contentItemId = new UriRef(ci.getUri().getUnicodeString());
    return createTopicEnhancement(metadata, engine, contentItemId);
}
/**
 * Creates a new enhancement instance in the parsed graph and returns its
 * URI so that engines can add further information. The enhancement gets an
 * URI of the form <code>urn:enhancement-{uuid}</code> and the following
 * default properties: rdf:type enhancer:Enhancement,
 * enhancer:extracted-from (the parsed content item id), dc:created (the
 * current date) and dc:creator (the class name of the parsed engine).
 *
 * @param metadata the graph the enhancement is added to
 * @param engine the engine creating the enhancement
 * @param contentItemId the id of the content item the enhancement was
 * extracted from
 *
 * @return the URI of the new enhancement instance
 */
protected static UriRef createEnhancement(MGraph metadata,
        EnhancementEngine engine, UriRef contentItemId){
    final LiteralFactory factory = LiteralFactory.getInstance();
    final UUID id = EnhancementEngineHelper.randomUUID();
    final UriRef enhancement = new UriRef("urn:enhancement-" + id);

    // type: enhancer:Enhancement
    metadata.add(new TripleImpl(enhancement, RDF_TYPE,
        TechnicalClasses.ENHANCER_ENHANCEMENT));

    // link to the content item the enhancement was extracted from
    metadata.add(new TripleImpl(enhancement,
        Properties.ENHANCER_EXTRACTED_FROM, contentItemId));

    // creation date
    final Literal created = factory.createTypedLiteral(new Date());
    metadata.add(new TripleImpl(enhancement, Properties.DC_CREATED, created));

    // the engine that extracted the data
    // TODO: add some kind of versioning info for the extractor?
    // TODO: use a public dereferencing URI instead? that would allow for
    // explicit versioning too
    /* NOTE (Rupert Westenthaler 2010-05-26):
     * The idea is to use the ComponentContext in the activate() method of
     * an Enhancer to get the bundle name/version and use that as an
     * URI for the creator.
     * We would need to add a getEnhancerID() method to the enhancer
     * interface to access this information.
     */
    final String creatorName = engine.getClass().getName();
    metadata.add(new TripleImpl(enhancement, Properties.DC_CREATOR,
        factory.createTypedLiteral(creatorName)));
    return enhancement;
}
/**
 * Registers the parsed {@link EnhancementEngine} as dc:contributor of the
 * enhancement (using the class name of the engine) and replaces the
 * dc:modified value of the enhancement with the current date.
 *
 * @param metadata the {@link ContentItem#getMetadata()}
 * @param enhancement the enhancement
 * @param engine the engine
 */
public static void addContributingEngine(MGraph metadata, UriRef enhancement,
        EnhancementEngine engine){
    final LiteralFactory factory = LiteralFactory.getInstance();
    // TODO: use a public dereferencing URI instead?
    final String contributorName = engine.getClass().getName();
    final Triple contributor = new TripleImpl(enhancement, Properties.DC_CONTRIBUTOR,
        factory.createTypedLiteral(contributorName));
    metadata.add(contributor);
    // the enhancement was just modified -> update the modification date
    set(metadata, enhancement, Properties.DC_MODIFIED, new Date(), factory);
}
/**
 * Creates a new extraction instance (URI of the form
 * <code>urn:extraction-{uuid}</code>) in the metadata graph of the content
 * item and returns its URI so that engines can add further information.
 * The extraction is typed as enhancer:Extraction, related to the content
 * item and gets the default properties dc:created (current date) and
 * dc:creator (class name of the engine).
 *
 * @param ci the ContentItem under analysis
 * @param engine the engine performing the analysis
 * @return the URI of the new extraction instance
 * @deprecated
 * @see EnhancementEngineHelper#createEntityEnhancement(ContentItem, EnhancementEngine)
 * @see EnhancementEngineHelper#createTextEnhancement(ContentItem, EnhancementEngine)
 */
@Deprecated
public static UriRef createNewExtraction(ContentItem ci,
        EnhancementEngine engine) {
    final LiteralFactory factory = LiteralFactory.getInstance();
    final MGraph graph = ci.getMetadata();
    final UUID id = EnhancementEngineHelper.randomUUID();
    final UriRef extraction = new UriRef("urn:extraction-" + id);

    graph.add(new TripleImpl(extraction, RDF_TYPE,
        TechnicalClasses.ENHANCER_EXTRACTION));

    // relate the extraction to the content item
    final UriRef contentItemId = new UriRef(ci.getUri().getUnicodeString());
    graph.add(new TripleImpl(extraction,
        Properties.ENHANCER_RELATED_CONTENT_ITEM, contentItemId));

    // creation date
    final Literal created = factory.createTypedLiteral(new Date());
    graph.add(new TripleImpl(extraction, Properties.DC_CREATED, created));

    // the engine that extracted the data
    // TODO: add some kind of versioning info for the extractor?
    // TODO: use a public dereferencing URI instead? that would allow for
    // explicit versioning too
    final String creatorName = engine.getClass().getName();
    graph.add(new TripleImpl(extraction, Properties.DC_CREATOR,
        factory.createTypedLiteral(creatorName)));
    return extraction;
}
/**
 * Builds a UUID from two random longs drawn from the shared, re-seedable
 * random number generator (see {@link #setSeed(long)}), so that tests can
 * produce reproducible ids.
 *
 * @return a new random UUID
 */
public static UUID randomUUID() {
    // draw order matters for reproducibility: most significant bits first
    final long mostSignificantBits = rng.nextLong();
    final long leastSignificantBits = rng.nextLong();
    return new UUID(mostSignificantBits, leastSignificantBits);
}
/**
 * Getter for the first typed literal value of the property for a resource.
 * Values that are not {@link TypedLiteral}s are ignored.
 *
 * @param <T> the java class the literal value needs to be converted to.
 * Note that the parsed LiteralFactory needs to support this conversion
 * @param graph the graph used to query for the property value
 * @param resource the resource
 * @param property the property
 * @param type the type the literal needs to be converted to
 * @param literalFactory the literalFactory used for the conversion
 * @return the converted value of the first typed literal or
 * <code>null</code> if the resource has no typed literal value for the
 * property
 */
public static <T> T get(TripleCollection graph, NonLiteral resource, UriRef property, Class<T> type,
        LiteralFactory literalFactory){
    final Iterator<Triple> triples = graph.filter(resource, property, null);
    if(!triples.hasNext()){
        log.debug("No Triple found for {} and property {}! -> return null",resource,property);
        return null;
    }
    do {
        final Triple triple = triples.next();
        final Resource object = triple.getObject();
        if(object instanceof TypedLiteral){
            // first typed literal wins
            return literalFactory.createObject(type, (TypedLiteral)object);
        }
        log.debug("Triple {} does not have a TypedLiteral as object! -> ignore",triple);
    } while(triples.hasNext());
    log.info("No value for {} and property {} had the requested Type {} -> return null",
        new Object[]{resource,property,type});
    return null;
}
/**
 * Replaces all current values of the property for the resource
 * with the parsed value. Parsing <code>null</code> only removes the
 * current values.
 *
 * @param graph the graph
 * @param resource the resource
 * @param property the property
 * @param value the new value or <code>null</code> to remove all values
 */
public static void set(MGraph graph, NonLiteral resource, UriRef property, Resource value){
    final Collection<Resource> values;
    if(value == null){
        values = null;
    } else {
        values = singleton(value);
    }
    // no literal factory needed: only Resources are added
    set(graph, resource, property, values, null);
}
/**
 * Replaces all current values of the property for the resource
 * with the parsed values. Parsing <code>null</code> only removes the
 * current values.
 *
 * @param graph the graph
 * @param resource the resource
 * @param property the property
 * @param values the new values or <code>null</code> to remove all values
 */
public static void set(MGraph graph, NonLiteral resource, UriRef property, Collection<Resource> values){
    // no literal factory needed: only Resources are added
    final LiteralFactory noLiteralFactory = null;
    set(graph, resource, property, values, noLiteralFactory);
}
/**
 * Replaces all current values of the property for the resource
 * with the parsed value. Parsing <code>null</code> only removes the
 * current values.
 *
 * @param graph the graph
 * @param resource the resource
 * @param property the property
 * @param value the value. In case it is an instance of {@link Resource} it
 * is directly added to the graph. Otherwise the parsed {@link LiteralFactory}
 * is used to create a {@link TypedLiteral} for the parsed value.
 * @param literalFactory the {@link LiteralFactory} used in case the parsed
 * value is not an {@link Resource}
 */
public static void set(MGraph graph, NonLiteral resource, UriRef property,
        Object value, LiteralFactory literalFactory){
    final Collection<?> values;
    if(value == null){
        values = null;
    } else {
        values = singleton(value);
    }
    set(graph, resource, property, values, literalFactory);
}
/**
 * Replaces all current values of the property for the resource
 * with the parsed values. All existing values are removed first; parsing
 * <code>null</code> (or an empty collection) therefore only removes the
 * current values. <code>null</code> elements are ignored.
 *
 * @param graph the graph
 * @param resource the resource
 * @param property the property
 * @param values the values. Elements that are instances of {@link Resource}
 * are directly added to the graph. For all other elements the
 * {@link LiteralFactory} is used to create a {@link TypedLiteral}.
 * @param literalFactory the {@link LiteralFactory} used for values that are
 * not {@link Resource}s. If <code>null</code>,
 * {@link LiteralFactory#getInstance()} is used instead of failing with a
 * {@link NullPointerException} after the old values were already removed.
 */
public static void set(MGraph graph, NonLiteral resource, UriRef property,
        Collection<?> values, LiteralFactory literalFactory){
    // remove all current values of the property
    Iterator<Triple> currentValues = graph.filter(resource, property, null);
    while(currentValues.hasNext()){
        currentValues.next();
        currentValues.remove();
    }
    if(values == null){
        return;
    }
    // resolved lazily: only needed if a non-Resource value is encountered
    LiteralFactory factory = literalFactory;
    for(Object value : values){
        if(value instanceof Resource){
            graph.add(new TripleImpl(resource, property, (Resource) value));
        } else if (value != null){
            if(factory == null){
                factory = LiteralFactory.getInstance();
            }
            graph.add(new TripleImpl(resource, property,
                factory.createTypedLiteral(value)));
        }
    }
}
/**
 * Getter for the typed literal values of the property for a resource.
 * The returned iterator converts the values lazily and delegates
 * {@link Iterator#remove()} to the underlying triple iterator.
 *
 * @param <T> the java class the literal value needs to be converted to.
 * Note that the parsed LiteralFactory needs to support this conversion
 * @param graph the graph used to query for the property value
 * @param resource the resource
 * @param property the property
 * @param type the type the literal needs to be converted to
 * @param literalFactory the literalFactory used for the conversion
 * @return the iterator over the converted values
 */
public static <T> Iterator<T> getValues(TripleCollection graph, NonLiteral resource,
        UriRef property, final Class<T> type, final LiteralFactory literalFactory){
    final Iterator<Triple> triples = graph.filter(resource, property, null);
    return new Iterator<T>() {
        @Override
        public boolean hasNext() {
            return triples.hasNext();
        }
        @Override
        public T next() {
            final Resource object = triples.next().getObject();
            //TODO: does not check if the object of the triple is a
            //      TypedLiteral; other values cause a ClassCastException
            final TypedLiteral literal = (TypedLiteral)object;
            return literalFactory.createObject(type, literal);
        }
        @Override
        public void remove() {
            triples.remove();
        }
    };
}
/**
 * Getter for the lexical form of the first {@link Literal} value of the
 * property for a resource. Values that are not literals are ignored.
 *
 * @param graph the graph used to query for the property value
 * @param resource the resource
 * @param property the property
 * @return the lexical form of the first literal value or <code>null</code>
 * if the resource has no literal value for the property
 */
public static String getString(TripleCollection graph, NonLiteral resource, UriRef property){
    Iterator<Triple> results = graph.filter(resource, property, null);
    if(!results.hasNext()){
        // parameterized message: nothing is formatted while debug is disabled
        log.debug("No Triple found for {} and property {}! -> return null",resource,property);
        return null;
    }
    while (results.hasNext()){
        Triple result = results.next();
        if(result.getObject() instanceof Literal){
            return ((Literal)result.getObject()).getLexicalForm();
        } else {
            log.debug("Triple {} does not have a literal as object! -> ignore",result);
        }
    }
    log.info("No Literal value for {} and property {} -> return null",
        resource,property);
    return null;
}
/**
 * Getter for the lexical forms of the literal values of the property for
 * a resource. The returned iterator delegates {@link Iterator#remove()}
 * to the underlying triple iterator.
 *
 * @param graph the graph used to query for the property value
 * @param resource the resource
 * @param property the property
 * @return the iterator over the lexical forms of the values
 */
public static Iterator<String> getStrings(TripleCollection graph, NonLiteral resource, UriRef property){
    final Iterator<Triple> triples = graph.filter(resource, property, null);
    return new Iterator<String>() {
        @Override
        public boolean hasNext() {
            return triples.hasNext();
        }
        @Override
        public String next() {
            final Resource object = triples.next().getObject();
            //TODO: does not check if the object of the triple is a
            //      Literal; other values cause a ClassCastException
            final Literal literal = (Literal)object;
            return literal.getLexicalForm();
        }
        @Override
        public void remove() {
            triples.remove();
        }
    };
}
/**
 * Getter for the first {@link UriRef} value of the property for a
 * resource. Values that are not {@link UriRef}s are ignored.
 *
 * @param graph the graph used to query for the property value
 * @param resource the resource
 * @param property the property
 * @return the first UriRef value or <code>null</code> if the resource has
 * no UriRef value for the property
 */
public static UriRef getReference(TripleCollection graph, NonLiteral resource, UriRef property){
    Iterator<Triple> results = graph.filter(resource, property, null);
    if(!results.hasNext()){
        log.debug("No Triple found for {} and property {}! -> return null",resource,property);
        return null;
    }
    while(results.hasNext()){
        Triple result = results.next();
        if(result.getObject() instanceof UriRef){
            return (UriRef)result.getObject();
        } else {
            // parameterized message: nothing is formatted while debug is disabled
            log.debug("Triple {} does not have a UriRef as object! -> ignore",result);
        }
    }
    log.info("No UriRef value for {} and property {} -> return null",resource,property);
    return null;
}
/**
 * Getter for the {@link UriRef} values of the property for a resource.
 * The returned iterator delegates {@link Iterator#remove()} to the
 * underlying triple iterator.
 *
 * @param graph the graph used to query for the property value
 * @param resource the resource
 * @param property the property
 * @return the iterator over all the values
 */
public static Iterator<UriRef> getReferences(TripleCollection graph, NonLiteral resource, UriRef property){
    final Iterator<Triple> triples = graph.filter(resource, property, null);
    return new Iterator<UriRef>() {
        @Override
        public boolean hasNext() {
            return triples.hasNext();
        }
        @Override
        public UriRef next() {
            final Resource object = triples.next().getObject();
            //TODO: does not check if the object of the triple is a
            //      UriRef; other values cause a ClassCastException
            return (UriRef)object;
        }
        @Override
        public void remove() {
            triples.remove();
        }
    };
}
/**
 * Comparator that allows to sort a list/array of {@link EnhancementEngine}s
 * based on their {@link ServiceProperties#ENHANCEMENT_ENGINE_ORDERING}.
 * Engines with higher ordering values are sorted first; engines with
 * equal ordering values compare as equal.
 */
public static final Comparator<EnhancementEngine> EXECUTION_ORDER_COMPARATOR = new Comparator<EnhancementEngine>() {
    @Override
    public int compare(EnhancementEngine engine1, EnhancementEngine engine2) {
        Integer order1 = getEngineOrder(engine1);
        Integer order2 = getEngineOrder(engine2);
        // Start with the highest number, finish with the lowest.
        // Compare by value: '==' on boxed Integers compares references and
        // reports equal orderings outside the Integer cache as different.
        return order2.compareTo(order1);
    }
};
/**
 * Gets the {@link ServiceProperties#ENHANCEMENT_ENGINE_ORDERING} value
 * for the parsed EnhancementEngine. If the Engine does not implement the
 * {@link ServiceProperties} interface, does not provide the
 * {@link ServiceProperties#ENHANCEMENT_ENGINE_ORDERING} or provides a value
 * that is not an {@link Integer}, the
 * {@link ServiceProperties#ORDERING_DEFAULT} is returned <p>
 * This method is guaranteed to NOT return <code>null</code>.
 *
 * @param engine the engine
 * @return the ordering
 */
public static Integer getEngineOrder(EnhancementEngine engine){
    log.debug("getOrder {}", engine);
    if (engine instanceof ServiceProperties){
        log.debug(" ... implements ServiceProperties");
        Map<?,?> properties = ((ServiceProperties)engine).getServiceProperties();
        Object value = properties == null ? null :
            properties.get(ServiceProperties.ENHANCEMENT_ENGINE_ORDERING);
        // a missing ordering is legal -> must not dereference value here
        log.debug(" > value = {} {}", value, value == null ? null : value.getClass());
        if (value instanceof Integer){
            return (Integer)value;
        }
    }
    return ServiceProperties.ORDERING_DEFAULT;
}
/**
 * Getter for the Resources of fise:TextAnnotations that do have a value
 * of the dc:language property. The returned list is sorted by descending
 * 'fise:confidence'. Annotations with missing confidence are ranked last.<p>
 * NOTE that the returned list will likely contain annotations for the same
 * language if multiple language identification engines are used in the
 * same {@link Chain}.
 *
 * @param graph the graph with the enhancement.
 * Typically {@link ContentItem#getMetadata()}
 * @return the sorted list of language annotations or an empty list if none.
 * @throws IllegalArgumentException if <code>null</code> is parsed as graph
 */
public static List<NonLiteral> getLanguageAnnotations(TripleCollection graph){
    if(graph == null){
        throw new IllegalArgumentException("The parsed graph MUST NOT be NULL!");
    }
    // SPARQL is intentionally not used, to avoid instantiating a QueryEngine
    final Map<NonLiteral,Double> confidenceByAnnotation = new HashMap<NonLiteral,Double>();
    final List<NonLiteral> annotations = new ArrayList<NonLiteral>();
    for(Iterator<Triple> it = graph.filter(null, RDF_TYPE, ENHANCER_TEXTANNOTATION); it.hasNext();){
        final NonLiteral annotation = it.next().getSubject();
        if(getString(graph, annotation, DC_LANGUAGE) == null){
            continue; // not a language annotation
        }
        final Double confidence = get(graph, annotation,
            Properties.ENHANCER_CONFIDENCE, Double.class, lf);
        confidenceByAnnotation.put(annotation, confidence);
        annotations.add(annotation);
    }
    if(annotations.size() < 2){
        return annotations; // nothing to sort
    }
    Collections.sort(annotations, new Comparator<NonLiteral>() {
        @Override
        public int compare(NonLiteral o1, NonLiteral o2) {
            final Double first = confidenceByAnnotation.get(o1);
            final Double second = confidenceByAnnotation.get(o2);
            // descending order (values without confidence last)
            if(first != null && second != null){
                return second.compareTo(first);
            }
            if(first == null && second == null){
                return 0;
            }
            return first == null ? 1 : -1;
        }
    });
    return annotations;
}
/**
 * Getter for the language identified for (extracted-from) the parsed
 * ContentItem. The returned value is the dc:language of the annotation
 * with the highest 'fise:confidence' value - or if no language annotations
 * are present - the dc:language value of the {@link ContentItem#getUri()}.<p>
 * Users that want to obtain all language annotations should use
 * {@link #getLanguageAnnotations(TripleCollection)} instead.<p>
 * This method holds the read lock of the {@link ContentItem} while
 * accessing its metadata.
 *
 * @param ci the contentItem
 * @return the identified language of the parsed {@link ContentItem}.
 * <code>null</code> if not available.
 * @throws IllegalArgumentException if <code>null</code> is parsed as content item
 * @see #getLanguageAnnotations(TripleCollection)
 */
public static String getLanguage(ContentItem ci){
    if(ci == null){
        throw new IllegalArgumentException("The parsed ContentItem MUST NOT be NULL!");
    }
    ci.getLock().readLock().lock();
    try {
        final List<NonLiteral> annotations = getLanguageAnnotations(ci.getMetadata());
        // best annotation first; fall back to the content item itself
        final NonLiteral languageSubject = annotations.isEmpty()
            ? ci.getUri()
            : annotations.get(0);
        return getString(ci.getMetadata(), languageSubject, DC_LANGUAGE);
    } finally {
        ci.getLock().readLock().unlock();
    }
}
}