generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/helper/EnhancementEngineHelper.java - stanbol - Git at Google

 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.stanbol.enhancer.servicesapi.helper;

 import static java.util.Collections.singleton;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_LANGUAGE;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses.ENHANCER_TEXTANNOTATION;

 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.Date;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Random;
 import java.util.UUID;

 import org.apache.clerezza.rdf.core.Literal;
 import org.apache.clerezza.rdf.core.LiteralFactory;
 import org.apache.clerezza.rdf.core.MGraph;
 import org.apache.clerezza.rdf.core.NonLiteral;
 import org.apache.clerezza.rdf.core.Resource;
 import org.apache.clerezza.rdf.core.Triple;
 import org.apache.clerezza.rdf.core.TripleCollection;
 import org.apache.clerezza.rdf.core.TypedLiteral;
 import org.apache.clerezza.rdf.core.UriRef;
 import org.apache.clerezza.rdf.core.impl.TripleImpl;
 import org.apache.stanbol.enhancer.servicesapi.Chain;
 import org.apache.stanbol.enhancer.servicesapi.ContentItem;
 import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
 import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
 import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
 import org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;


 /**
  * Static helper methods for {@link EnhancementEngine} implementations.
  * <p>
  * The helpers fall into three groups:
  * <ul>
  * <li>creation of enhancement resources (text-, entity- and topic
  * annotations) including their default properties,</li>
  * <li>typed getters/setters for property values stored in a
  * {@link TripleCollection}, and</li>
  * <li>utilities for ordering engines and for looking up the language
  * detected for a {@link ContentItem}.</li>
  * </ul>
  */
 public class EnhancementEngineHelper {

     /**
      * Random number generator backing {@link #randomUUID()}. It can be
      * re-seeded via {@link #setSeed(long)} so that tests get reproducible ids.
      */
     protected final static Random rng = new Random();

     private final static Logger log = LoggerFactory.getLogger(EnhancementEngineHelper.class);

     /**
      * Default {@link LiteralFactory} used where callers do not provide one.
      */
     private final static LiteralFactory lf = LiteralFactory.getInstance();

     /**
      * Re-seeds the random number generator used by {@link #randomUUID()}.
      *
      * @param seed the new seed
      */
     public static void setSeed(long seed) {
         rng.setSeed(seed);
     }

     /**
      * Create a new instance with the types enhancer:Enhancement and
      * enhancer:TextAnnotation in the metadata-graph of the content
      * item along with default properties (dc:creator, dc:created and
      * enhancer:extracted-from) and return the UriRef of the extraction so
      * that engines can further add.
      *
      * @param ci the ContentItem being under analysis
      * @param engine the Engine performing the analysis
      *
      * @return the URI of the new enhancement instance
      */
     public static UriRef createTextEnhancement(ContentItem ci,
             EnhancementEngine engine){
         return createTextEnhancement(ci.getMetadata(), engine, new UriRef(ci.getUri().getUnicodeString()));
     }
     /**
      * Create a new instance with the types enhancer:Enhancement and
      * enhancer:TextAnnotation in the parsed graph along with default properties
      * (dc:creator, dc:created and enhancer:extracted-from) and return
      * the UriRef of the extraction so that engines can further add.
      *
      * @param metadata the graph the enhancement is added to
      * @param engine the engine creating the enhancement
      * @param contentItemId the id of the content item the enhancement is
      * extracted from
      *
      * @return the URI of the new enhancement instance
      */
     public static UriRef createTextEnhancement(MGraph metadata,
                 EnhancementEngine engine, UriRef contentItemId){
         UriRef enhancement = createEnhancement(metadata, engine,contentItemId);
         //add the Text Annotation Type
         metadata.add(new TripleImpl(enhancement, Properties.RDF_TYPE,
                 TechnicalClasses.ENHANCER_TEXTANNOTATION));
         return enhancement;
     }
     /**
      * Create a new instance with the types enhancer:Enhancement and
      * enhancer:EntityAnnotation in the metadata-graph of the content
      * item along with default properties (dc:creator, dc:created and
      * enhancer:extracted-from) and return the UriRef of the extraction so
      * that engines can further add.
      *
      * @param ci the ContentItem being under analysis
      * @param engine the Engine performing the analysis
      * @return the URI of the new enhancement instance
      */
     public static UriRef createEntityEnhancement(ContentItem ci,
             EnhancementEngine engine){
         return createEntityEnhancement(ci.getMetadata(), engine, new UriRef(ci.getUri().getUnicodeString()));
     }
     /**
      * Create a new instance with the types enhancer:Enhancement and
      * enhancer:EntityAnnotation in the parsed graph along with default properties
      * (dc:creator, dc:created and enhancer:extracted-from) and return
      * the UriRef of the extraction so that engines can further add.
      *
      * @param metadata the graph the enhancement is added to
      * @param engine the engine creating the enhancement
      * @param contentItemId the id of the content item the enhancement is
      * extracted from
      *
      * @return the URI of the new enhancement instance
      */
     public static UriRef createEntityEnhancement(MGraph metadata,
                 EnhancementEngine engine, UriRef contentItemId){
         UriRef enhancement = createEnhancement(metadata, engine, contentItemId);
         //add the Entity Annotation Type
         metadata.add(new TripleImpl(enhancement, Properties.RDF_TYPE,
                 TechnicalClasses.ENHANCER_ENTITYANNOTATION));
         return enhancement;
     }
     /**
      * Create a new instance with the types enhancer:Enhancement and
      * enhancer:TopicAnnotation in the parsed graph along with default properties
      * (dc:creator, dc:created and enhancer:extracted-from) and return
      * the UriRef of the extraction so that engines can further add.
      *
      * @param metadata the graph the enhancement is added to
      * @param engine the engine creating the enhancement
      * @param contentItemId the id of the content item the enhancement is
      * extracted from
      *
      * @return the URI of the new enhancement instance
      */
     public static UriRef createTopicEnhancement(MGraph metadata,
                 EnhancementEngine engine, UriRef contentItemId){
         UriRef enhancement = createEnhancement(metadata, engine, contentItemId);
         //add the Topic Annotation Type
         metadata.add(new TripleImpl(enhancement, Properties.RDF_TYPE,
                 TechnicalClasses.ENHANCER_TOPICANNOTATION));
         return enhancement;
     }
     /**
      * Create a new instance with the types enhancer:Enhancement and
      * enhancer:TopicAnnotation in the metadata-graph of the content
      * item along with default properties (dc:creator, dc:created and
      * enhancer:extracted-from) and return the UriRef of the extraction so
      * that engines can further add.
      *
      * @param ci the ContentItem being under analysis
      * @param engine the Engine performing the analysis
      * @return the URI of the new enhancement instance
      */
     public static UriRef createTopicEnhancement(ContentItem ci,
             EnhancementEngine engine){
         return createTopicEnhancement(ci.getMetadata(), engine, new UriRef(ci.getUri().getUnicodeString()));
     }
     /**
      * Create a new enhancement instance in the parsed graph along with the
      * default properties (rdf:type enhancer:Enhancement,
      * enhancer:extracted-from, dc:created and dc:creator) and return
      * the UriRef of the enhancement so that engines can further add.
      * <p>
      * The URI of the enhancement is <code>"urn:enhancement-"</code> followed by
      * a {@link #randomUUID()}; dc:creator is set to the class name of the
      * parsed engine.
      *
      * @param metadata the graph the enhancement is added to
      * @param engine the Engine performing the analysis
      * @param contentItemId the id of the content item the enhancement is
      * extracted from
      *
      * @return the URI of the new enhancement instance
      */
     protected static UriRef createEnhancement(MGraph metadata,
             EnhancementEngine engine, UriRef contentItemId){
         LiteralFactory literalFactory = LiteralFactory.getInstance();

         UriRef enhancement = new UriRef("urn:enhancement-"
                 + EnhancementEngineHelper.randomUUID());
         //add the Enhancement Type
         metadata.add(new TripleImpl(enhancement, Properties.RDF_TYPE,
                 TechnicalClasses.ENHANCER_ENHANCEMENT));
         //add the extracted from content item
         metadata.add(new TripleImpl(enhancement,
                 Properties.ENHANCER_EXTRACTED_FROM, contentItemId));
         // creation date
         metadata.add(new TripleImpl(enhancement, Properties.DC_CREATED,
                 literalFactory.createTypedLiteral(new Date())));

         // the engines that extracted the data
         // TODO: add some kind of versioning info for the extractor?
         // TODO: use a public dereferencing URI instead? that would allow for
         // explicit versioning too
         /* NOTE (Rupert Westenthaler 2010-05-26):
          * The Idea is to use the  ComponentContext in the activate() method of
          * an Enhancer to get the bundle name/version and use that as an
          * URI for the creator.
          * We would need to add getEnhancerID() method to the enhancer interface
          * to access this information
          */
         metadata.add(new TripleImpl(enhancement, Properties.DC_CREATOR,
                 literalFactory.createTypedLiteral(engine.getClass().getName())));
         return enhancement;
     }
     /**
      * Adds the parsed {@link EnhancementEngine} as dc:contributor to the
      * enhancement and also sets the dc:modified property to the current date
      * (replacing any previous dc:modified value).
      * @param metadata the {@link ContentItem#getMetadata()}
      * @param enhancement the enhancement
      * @param engine the engine
      */
     public static void addContributingEngine(MGraph metadata, UriRef enhancement,
                                              EnhancementEngine engine){
         LiteralFactory literalFactory = LiteralFactory.getInstance();
         // TODO: use a public dereferencing URI instead?
         metadata.add(new TripleImpl(enhancement, Properties.DC_CONTRIBUTOR,
             literalFactory.createTypedLiteral(engine.getClass().getName())));
         //set the modification date to the current date.
         set(metadata,enhancement,Properties.DC_MODIFIED,new Date(),literalFactory);
     }

     /**
      * Create a new extraction instance in the metadata-graph of the content
      * item along with default properties (dc:creator and dc:created) and return
      * the UriRef of the extraction so that engines can further add
      *
      * @param ci the ContentItem being under analysis
      * @param engine the Engine performing the analysis
      * @return the URI of the new extraction instance
      * @deprecated use one of the <code>create*Enhancement</code> methods instead
      * @see EnhancementEngineHelper#createEntityEnhancement(ContentItem, EnhancementEngine)
      * @see EnhancementEngineHelper#createTextEnhancement(ContentItem, EnhancementEngine)
      */
     @Deprecated
     public static UriRef createNewExtraction(ContentItem ci,
             EnhancementEngine engine) {
         LiteralFactory literalFactory = LiteralFactory.getInstance();

         MGraph metadata = ci.getMetadata();
         UriRef extraction = new UriRef("urn:extraction-"
                 + EnhancementEngineHelper.randomUUID());

         metadata.add(new TripleImpl(extraction, Properties.RDF_TYPE,
                 TechnicalClasses.ENHANCER_EXTRACTION));

         // relate the extraction to the content item
         metadata.add(new TripleImpl(extraction,
                 Properties.ENHANCER_RELATED_CONTENT_ITEM, new UriRef(ci.getUri().getUnicodeString())));

         // creation date
         metadata.add(new TripleImpl(extraction, Properties.DC_CREATED,
                 literalFactory.createTypedLiteral(new Date())));

         // the engines that extracted the data
         // TODO: add some kind of versioning info for the extractor?
         // TODO: use a public dereferencing URI instead? that would allow for
         // explicit versioning too
         metadata.add(new TripleImpl(extraction, Properties.DC_CREATOR,
                 literalFactory.createTypedLiteral(engine.getClass().getName())));

         return extraction;
     }

     /**
      * Random UUID generator with re-seedable RNG for the tests.
      * <p>
      * The UUID is built directly from two random longs, so - other than with
      * {@link UUID#randomUUID()} - the version and variant bits are random too.
      *
      * @return a new Random UUID
      * @see #setSeed(long)
      */
     public static UUID randomUUID() {
         return new UUID(rng.nextLong(), rng.nextLong());
     }

     /**
      * Getter for the first typed literal value of the property for a resource.
      * Values that are not {@link TypedLiteral}s are ignored.
      *
      * @param <T> the java class the literal value needs to be converted to.
      * Note that the parsed LiteralFactory needs to support this conversion
      * @param graph the graph used to query for the property value
      * @param resource the resource
      * @param property the property
      * @param type the type the literal needs to be converted to
      * @param literalFactory the literalFactory
      * @return the converted value of the first typed literal or
      * <code>null</code> if the property has no typed literal value
      */
     public static <T> T get(TripleCollection graph, NonLiteral resource, UriRef property, Class<T> type,
             LiteralFactory literalFactory){
         Iterator<Triple> results = graph.filter(resource, property, null);
         if(results.hasNext()){
             while(results.hasNext()){
                 Triple result = results.next();
                 if(result.getObject() instanceof TypedLiteral){
                     return literalFactory.createObject(type, (TypedLiteral)result.getObject());
                 } else {
                     log.debug("Triple {} does not have a TypedLiteral as object! -> ignore",result);
                 }
             }
             log.info("No value for {} and property {} had the requested Type {} -> return null",
                 new Object[]{resource,property,type});
             return null;
         } else {
             log.debug("No Triple found for {} and property {}! -> return null",resource,property);
             return null;
         }
     }
     /**
      * Replaces all current values of the property for the resource
      * with the parsed value
      * @param graph the graph
      * @param resource the resource
      * @param property the property
      * @param value the value. If <code>null</code> the current values are
      * removed without adding a new one.
      */
     public static void set(MGraph graph, NonLiteral resource, UriRef property, Resource value){
         set(graph,resource,property,value == null ? null : singleton(value),null);
     }
     /**
      * Replaces all current values of the property for the resource
      * with the parsed values
      * @param graph the graph
      * @param resource the resource
      * @param property the property
      * @param values the values. If <code>null</code> the current values are
      * removed without adding new ones.
      */
     public static void set(MGraph graph, NonLiteral resource, UriRef property, Collection<Resource> values){
         set(graph,resource,property,values,null);
     }

     /**
      * Replaces all current values of the property for the resource
      * with the parsed value
      * @param graph the graph
      * @param resource the resource
      * @param property the property
      * @param value the value. In case it is an instance of {@link Resource} it
      * is directly added to the graph. Otherwise the parsed {@link LiteralFactory}
      * is used to create a {@link TypedLiteral} for the parsed value. If
      * <code>null</code> the current values are removed without adding a new one.
      * @param literalFactory the {@link LiteralFactory} used in case the parsed
      * value is not an {@link Resource}. If <code>null</code> the default
      * factory is used.
      */
     public static void set(MGraph graph, NonLiteral resource, UriRef property,
                            Object value, LiteralFactory literalFactory){
         set(graph,resource,property,value == null ? null : singleton(value),literalFactory);
     }
     /**
      * Replaces all current values of the property for the resource
      * with the parsed values
      * @param graph the graph
      * @param resource the resource
      * @param property the property
      * @param values the values. Instances of {@link Resource} are directly
      * added to the graph. For all other values the parsed {@link LiteralFactory}
      * is used to create a {@link TypedLiteral}. <code>null</code> elements are
      * ignored. If the collection itself is <code>null</code> the current
      * values are removed without adding new ones.
      * @param literalFactory the {@link LiteralFactory} used in case a parsed
      * value is not an {@link Resource}. If <code>null</code> the default
      * factory is used.
      */
     public static void set(MGraph graph, NonLiteral resource, UriRef property,
                                Collection<?> values, LiteralFactory literalFactory){
         //first remove all current values of the property
         Iterator<Triple> currentValues = graph.filter(resource, property, null);
         while(currentValues.hasNext()){
             currentValues.next();
             currentValues.remove();
         }
         if(values != null){
             //fall back to the default factory so that a missing factory does
             //not fail with a NullPointerException after the old values have
             //already been removed
             LiteralFactory factory = literalFactory != null ? literalFactory : lf;
             for(Object value : values){
                 if(value instanceof Resource){
                     graph.add(new TripleImpl(resource, property, (Resource) value));
                 } else if (value != null){
                     graph.add(new TripleImpl(resource, property,
                         factory.createTypedLiteral(value)));
                 }
             }
         }
     }

     /**
      * Getter for the typed literal values of the property for a resource.
      * <p>
      * The returned iterator converts the values lazily and does not check the
      * type of the triple objects: {@link Iterator#next()} throws a
      * {@link ClassCastException} for values that are not {@link TypedLiteral}s.
      * @param <T> the java class the literal value needs to be converted to.
      * Note that the parsed LiteralFactory needs to support this conversion
      * @param graph the graph used to query for the property value
      * @param resource the resource
      * @param property the property
      * @param type the type the literal needs to be converted to
      * @param literalFactory the literalFactory
      * @return the iterator over the converted values
      */
     public static <T> Iterator<T> getValues(TripleCollection graph, NonLiteral resource,
             UriRef property, final Class<T> type, final  LiteralFactory literalFactory){
         final Iterator<Triple> results = graph.filter(resource, property, null);
         return new Iterator<T>() {
             //TODO: does not check if the object of the triple is of type TypedLiteral
             @Override
             public boolean hasNext() {    return results.hasNext(); }
             @Override
             public T next() {
                 return literalFactory.createObject(type, (TypedLiteral)results.next().getObject());
             }
             @Override
             public void remove() { results.remove(); }
         };
     }
     /**
      * Getter for the lexical form of the first literal value of the property
      * for a resource. Values that are not {@link Literal}s are ignored.
      * @param graph the graph used to query for the property value
      * @param resource the resource
      * @param property the property
      * @return the lexical form of the first literal value or <code>null</code>
      * if the property has no literal value
      */
     public static String getString(TripleCollection graph, NonLiteral resource, UriRef property){
         Iterator<Triple> results = graph.filter(resource, property, null);
         if(results.hasNext()){
             while (results.hasNext()){
                 Triple result = results.next();
                 if(result.getObject() instanceof Literal){
                     return ((Literal)result.getObject()).getLexicalForm();
                 } else {
                     log.debug("Triple {} does not have a literal as object! -> ignore",result);
                 }
             }
             log.info("No Literal value for {} and property {} -> return null",
                 resource,property);
             return null;
         } else {
             log.debug("No Triple found for {} and property {}! -> return null",resource,property);
             return null;
         }
     }
     /**
      * Getter for the lexical forms of the literal values of the property for
      * a resource.
      * <p>
      * The returned iterator does not check the type of the triple objects:
      * {@link Iterator#next()} throws a {@link ClassCastException} for values
      * that are not {@link Literal}s.
      * @param graph the graph used to query for the property value
      * @param resource the resource
      * @param property the property
      * @return the iterator over the lexical forms of the values
      */
     public static Iterator<String> getStrings(TripleCollection graph, NonLiteral resource, UriRef property){
         final Iterator<Triple> results = graph.filter(resource, property, null);
         return new Iterator<String>() {
             //TODO: does not check if the object of the triple is of type Literal
             @Override
             public boolean hasNext() { return results.hasNext(); }
             @Override
             public String next() {
                 return ((Literal)results.next().getObject()).getLexicalForm();
             }
             @Override
             public void remove() { results.remove(); }
         };
     }
     /**
      * Getter for the first {@link UriRef} value of the property for a
      * resource. Values that are not {@link UriRef}s are ignored.
      * @param graph the graph used to query for the property value
      * @param resource the resource
      * @param property the property
      * @return the first UriRef value or <code>null</code> if the property
      * has no UriRef value
      */
     public static UriRef getReference(TripleCollection graph, NonLiteral resource, UriRef property){
         Iterator<Triple> results = graph.filter(resource, property, null);
         if(results.hasNext()){
             while(results.hasNext()){
                 Triple result = results.next();
                 if(result.getObject() instanceof UriRef){
                     return (UriRef)result.getObject();
                 } else {
                     log.debug("Triple {} does not have a UriRef as object! -> ignore",result);
                 }
             }
             log.info("No UriRef value for {} and property {} -> return null",resource,property);
             return null;
         } else {
             log.debug("No Triple found for {} and property {}! -> return null",resource,property);
             return null;
         }
     }
     /**
      * Getter for the {@link UriRef} values of the property for a resource.
      * <p>
      * The returned iterator does not check the type of the triple objects:
      * {@link Iterator#next()} throws a {@link ClassCastException} for values
      * that are not {@link UriRef}s.
      *
      * @param graph the graph used to query for the property value
      * @param resource the resource
      * @param property the property
      * @return The iterator over all the values
      */
     public static Iterator<UriRef> getReferences(TripleCollection graph, NonLiteral resource, UriRef property){
         final Iterator<Triple> results = graph.filter(resource, property, null);
         return new Iterator<UriRef>() {
             //TODO: does not check if the object of the triple is of type UriRef
             @Override
             public boolean hasNext() { return results.hasNext(); }
             @Override
             public UriRef next() { return (UriRef)results.next().getObject(); }
             @Override
             public void remove() { results.remove(); }
         };
     }

     /**
      * Comparator that allows to sort a list/array of {@link EnhancementEngine}s
      * based on their {@link ServiceProperties#ENHANCEMENT_ENGINE_ORDERING}.
      * Engines with a higher ordering value are sorted first.
      */
     public static final Comparator<EnhancementEngine> EXECUTION_ORDER_COMPARATOR = new Comparator<EnhancementEngine>() {

         @Override
         public int compare(EnhancementEngine engine1, EnhancementEngine engine2) {
             Integer order1 = getEngineOrder(engine1);
             Integer order2 = getEngineOrder(engine2);
             //start with the highest number finish with the lowest ...
             //NOTE: compare by value. Using '==' on the boxed Integers would
             //      compare object identity and report equal orderings as
             //      different for values outside of the Integer cache.
             return order2.compareTo(order1);
         }

     };
     /**
      * Gets the {@link ServiceProperties#ENHANCEMENT_ENGINE_ORDERING} value
      * for the parsed EnhancementEngine. If the Engine does not implement the
      * {@link ServiceProperties} interface or does not provide an
      * {@link Integer} value for
      * {@link ServiceProperties#ENHANCEMENT_ENGINE_ORDERING} the
      * {@link ServiceProperties#ORDERING_DEFAULT} is returned <p>
      * This method is guaranteed to NOT return <code>null</code>.
      * @param engine the engine
      * @return the ordering
      */
     public static Integer getEngineOrder(EnhancementEngine engine){
         log.debug("getOrder {}",engine);
         if (engine instanceof ServiceProperties){
             log.debug(" ... implements ServiceProperties");
             Map<?,?> properties = ((ServiceProperties)engine).getServiceProperties();
             Object value = properties == null ? null :
                 properties.get(ServiceProperties.ENHANCEMENT_ENGINE_ORDERING);
             //the value is null if the engine does not define an ordering
             log.debug("   > value = {} {}",value,value == null ? null : value.getClass());
             if (value instanceof Integer){
                 return (Integer)value;
             }
         }
         return ServiceProperties.ORDERING_DEFAULT;
     }

     /**
      * Getter for the Resources of fise:TextAnnotations that do have a value
      * of the dc:language property. The returned list is sorted by 'fise:confidence'
      * (highest confidence first). Annotations with missing confidence are
      * ranked last.<p>
      * NOTE that the returned list will likely contain annotations for the same language
      * if multiple language identification are used in the same {@link Chain}.
      * @param graph the graph with the enhancement.
      * Typically {@link ContentItem#getMetadata()}
      * @return the sorted list of language annotations or an empty list if none.
      * @throws IllegalArgumentException if <code>null</code> is parsed as graph
      */
     public static List<NonLiteral> getLanguageAnnotations(TripleCollection graph){
         if(graph == null){
             throw new IllegalArgumentException("The parsed graph MUST NOT be NULL!");
         }
         // I do not use SPARQL, because I do not want to instantiate a QueryEngine
         final Map<NonLiteral,Double> confidences = new HashMap<NonLiteral,Double>();
         List<NonLiteral> langAnnotations = new ArrayList<NonLiteral>();
         Iterator<Triple> textAnnotations = graph.filter(null, RDF_TYPE, ENHANCER_TEXTANNOTATION);
         while(textAnnotations.hasNext()){
             NonLiteral textAnnotation = textAnnotations.next().getSubject();
             String language = getString(graph, textAnnotation, DC_LANGUAGE);
             if(language != null){
                 //the confidence is null if the annotation does not define one
                 Double confidence = get(graph, textAnnotation, Properties.ENHANCER_CONFIDENCE, Double.class, lf);
                 confidences.put(textAnnotation,confidence);
                 langAnnotations.add(textAnnotation);
             }
         }
         if(langAnnotations.size() > 1){
             Collections.sort(langAnnotations,new Comparator<NonLiteral>() {
                 @Override
                 public int compare(NonLiteral o1, NonLiteral o2) {
                     Double c1 = confidences.get(o1);
                     Double c2 = confidences.get(o2);
                     //decreasing order (values without confidence last)
                     if(c1 == null){
                         return c2 == null ? 0 : 1;
                     } else if(c2 == null){
                         return -1;
                     } else {
                         return c2.compareTo(c1);
                     }
                 }
             });
         }
         return langAnnotations;
     }
     /**
      * Getter for language identified for (extracted-from) the parsed
      * ContentItem. The returned value is the Annotation with the highest
      * 'fise:confidence' value - or if no annotations are present - the
      * 'dc-terms:language' value of the {@link ContentItem#getUri()}.<p>
      * Users that want to obtain all language annotations should use
      * {@link #getLanguageAnnotations(TripleCollection)} instead.<p>
      * This method acquires the read lock of the {@link ContentItem} while
      * accessing its metadata.
      * @param ci the contentItem
      * @return the identified language of the parsed {@link ContentItem}.
      * <code>null</code> if not available.
      * @throws IllegalArgumentException if <code>null</code> is parsed as content item
      * @see #getLanguageAnnotations(TripleCollection)
      */
     public static String getLanguage(ContentItem ci){
         if(ci == null){
             throw new IllegalArgumentException("The parsed ContentItem MUST NOT be NULL!");
         }
         ci.getLock().readLock().lock();
         try {
             List<NonLiteral> langAnnotations = getLanguageAnnotations(ci.getMetadata());
             if(langAnnotations.isEmpty()){ //fallback
                 return getString(ci.getMetadata(), ci.getUri(), DC_LANGUAGE);
             } else {
                 //the list is sorted by confidence -> use the first entry
                 return getString(ci.getMetadata(), langAnnotations.get(0), DC_LANGUAGE);
             }
         } finally {
             ci.getLock().readLock().unlock();
         }
     }
 }