enhancement-engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/MetaxaEngine.java - stanbol - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.stanbol.enhancer.engines.metaxa;

 import static org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper.randomUUID;

 import java.io.BufferedWriter;
 import java.io.IOException;
 import java.io.OutputStreamWriter;
 import java.nio.charset.Charset;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.Dictionary;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Map;
 import java.util.Set;

 import org.apache.clerezza.rdf.core.BNode;
 import org.apache.clerezza.rdf.core.MGraph;
 import org.apache.clerezza.rdf.core.NonLiteral;
 import org.apache.clerezza.rdf.core.Resource;
 import org.apache.clerezza.rdf.core.Triple;
 import org.apache.clerezza.rdf.core.UriRef;
 import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
 import org.apache.clerezza.rdf.core.impl.SimpleMGraph;
 import org.apache.clerezza.rdf.core.impl.TripleImpl;
 import org.apache.clerezza.rdf.core.impl.TypedLiteralImpl;
 import org.apache.commons.io.IOUtils;
 import org.apache.felix.scr.annotations.Component;
 import org.apache.felix.scr.annotations.Property;
 import org.apache.felix.scr.annotations.Reference;
 import org.apache.felix.scr.annotations.Service;
 import org.apache.stanbol.enhancer.engines.metaxa.core.MetaxaCore;
 import org.apache.stanbol.enhancer.engines.metaxa.core.RDF2GoUtils;
 import org.apache.stanbol.enhancer.engines.metaxa.core.html.BundleURIResolver;
 import org.apache.stanbol.enhancer.engines.metaxa.core.html.HtmlExtractorFactory;
 import org.apache.stanbol.enhancer.servicesapi.Blob;
 import org.apache.stanbol.enhancer.servicesapi.ContentItem;
 import org.apache.stanbol.enhancer.servicesapi.ContentItemFactory;
 import org.apache.stanbol.enhancer.servicesapi.ContentSink;
 import org.apache.stanbol.enhancer.servicesapi.EngineException;
 import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
 import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
 import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
 import org.apache.stanbol.enhancer.servicesapi.rdf.NamespaceEnum;
 import org.ontoware.aifbcommons.collection.ClosableIterator;
 import org.ontoware.rdf2go.model.Model;
 import org.ontoware.rdf2go.model.Statement;
 import org.ontoware.rdf2go.model.node.BlankNode;
 import org.ontoware.rdf2go.model.node.DatatypeLiteral;
 import org.ontoware.rdf2go.model.node.Node;
 import org.ontoware.rdf2go.model.node.PlainLiteral;
 import org.ontoware.rdf2go.model.node.URI;
 import org.ontoware.rdf2go.model.node.impl.URIImpl;
 import org.osgi.framework.BundleContext;
 import org.osgi.service.cm.ConfigurationException;
 import org.osgi.service.component.ComponentContext;
 import org.semanticdesktop.aperture.extractor.ExtractorException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;


 /**
  * {@link MetaxaEngine}
  *
  * @author Joerg Steffen, DFKI
  * @version $Id$
  */
 @Component(immediate = true, metatype = true, inherit = true)
 @Service
 @org.apache.felix.scr.annotations.Properties(value={
     @Property(name=EnhancementEngine.PROPERTY_NAME, value="metaxa")
 })
 public class MetaxaEngine
         extends AbstractEnhancementEngine<IOException, RuntimeException>
         implements EnhancementEngine, ServiceProperties {

     // Logger used by this engine.
     private static final Logger log = LoggerFactory.getLogger(MetaxaEngine.class);
     /**
      * The default charset; used to encode the extracted plain text that is
      * written to the plain text {@link Blob}.
      */
     private static final Charset UTF8 = Charset.forName("UTF-8");
     /**
      * Plain text content of a content item.
      */
     public static final UriRef NIE_PLAINTEXTCONTENT = new UriRef(NamespaceEnum.nie + "plainTextContent");
     // RDF2Go counterpart of NIE_PLAINTEXTCONTENT; compared against the predicates of the extracted statements.
     private static final URIImpl NIE_PLAINTEXT_PROPERTY = new URIImpl(NIE_PLAINTEXTCONTENT.getUnicodeString());
     /**
      * The default value for the Execution of this Engine. Currently set to
      * {@link ServiceProperties#ORDERING_PRE_PROCESSING}
      */
     public static final Integer defaultOrder = ORDERING_PRE_PROCESSING;

     /**
      * Name of a file defining the available document extractors for Metaxa. By default, the builtin file 'extractionregistry.xml' is used.
      */
     @Property(value=MetaxaEngine.DEFAULT_EXTRACTION_REGISTRY)
     public static final String GLOBAL_EXTRACTOR_REGISTRY = "org.apache.stanbol.enhancer.engines.metaxa.extractionregistry";

     /**
      * Name of a file that defines the set of extractors for HTML documents. By default, the builtin file 'htmlextractors.xml' is used.
      */
     @Property(value=MetaxaEngine.DEFAULT_HTML_EXTRACTOR_REGISTRY)
     public static final String HTML_EXTRACTOR_REGISTRY = "org.apache.stanbol.enhancer.engines.metaxa.htmlextractors";

     /**
      * Mime types of content items this engine refuses to enhance. If no
      * value is configured, "text/plain" is ignored.
      */
     @Property(value={"text/plain"},cardinality=1000)
     public static final String IGNORE_MIME_TYPES = "org.apache.stanbol.enhancer.engines.metaxa.ignoreMimeTypes";

     /**
      * A boolean option whether extracted text should be included in the metadata as value of the NIE.plainTextContent property.
      */
     @Property(boolValue=false)
     public static final String INCLUDE_TEXT_IN_METADATA = "org.apache.stanbol.enhancer.engines.metaxa.includeText";

     /**
      * Internally used to create additional {@link Blob} for transformed
      * versions of the original content
      */
     @Reference
     private ContentItemFactory ciFactory;

     // The Metaxa extractor; created in activate(..) and released in deactivate(..).
     private MetaxaCore extractor;

     // Bundle context of this component; set in activate(..).
     BundleContext bundleContext;

     // Names of the registry files used when no other file is configured.
     public static final String DEFAULT_EXTRACTION_REGISTRY = "extractionregistry.xml";
     public static final String DEFAULT_HTML_EXTRACTOR_REGISTRY = "htmlextractors.xml";

     // Mime types rejected by canEnhance(..); populated in activate(..).
     private Set<String> ignoredMimeTypes;
     // Whether extracted plain text is also added to the metadata; false unless configured otherwise.
     private boolean includeText = false;

     /**
      * Activates the engine: creates the {@link MetaxaCore} extractor and
      * reads the component configuration.
      * <p>
      * The following properties are evaluated:
      * <ul>
      * <li>{@link #GLOBAL_EXTRACTOR_REGISTRY} and {@link #HTML_EXTRACTOR_REGISTRY}:
      * names of the registry files; blank values fall back to the defaults and
      * surrounding whitespace is removed.</li>
      * <li>{@link #IGNORE_MIME_TYPES}: accepted as <code>String[]</code>,
      * {@link Iterable} or single value; defaults to "text/plain".</li>
      * <li>{@link #INCLUDE_TEXT_IN_METADATA}: accepted as {@link Boolean} or
      * as any value whose string form is "true" (configurations loaded from
      * files frequently provide strings); defaults to <code>false</code>.</li>
      * </ul>
      *
      * @param ce the {@link ComponentContext}
      * @throws ConfigurationException if thrown by the activation of the super class
      * @throws IOException if initializing fails
      */
     protected void activate(ComponentContext ce) throws ConfigurationException, IOException {
         super.activate(ce);
         this.bundleContext = ce.getBundleContext();
         BundleURIResolver.BUNDLE = this.bundleContext.getBundle();
         Dictionary<String, Object> properties = ce.getProperties();

         // Resolve the registry file names. Values are read as Object so that
         // a non-String configuration value can not cause a ClassCastException,
         // and the trimmed name is used (the original only trimmed for the check).
         String extractionRegistry = DEFAULT_EXTRACTION_REGISTRY;
         Object confFile = properties.get(GLOBAL_EXTRACTOR_REGISTRY);
         if (confFile != null && confFile.toString().trim().length() > 0) {
             extractionRegistry = confFile.toString().trim();
         }
         String htmlExtractors = DEFAULT_HTML_EXTRACTOR_REGISTRY;
         confFile = properties.get(HTML_EXTRACTOR_REGISTRY);
         if (confFile != null && confFile.toString().trim().length() > 0) {
             htmlExtractors = confFile.toString().trim();
         }
         try {
             this.extractor = new MetaxaCore(extractionRegistry);
             // NOTE(review): the HTML registry is published only after the core
             // was created (same order as before) - confirm that
             // HtmlExtractorFactory reads it lazily.
             HtmlExtractorFactory.REGISTRY_CONFIGURATION = htmlExtractors;
         } catch (IOException e) {
             log.error(e.getLocalizedMessage(), e);
             throw e;
         }

         // Mime types that must not be processed by this engine.
         Object value = properties.get(IGNORE_MIME_TYPES);
         if (value instanceof String[]) {
             ignoredMimeTypes = new HashSet<String>(Arrays.asList((String[]) value));
         } else if (value instanceof Iterable<?>) {
             ignoredMimeTypes = new HashSet<String>();
             for (Object mimeType : (Iterable<?>) value) {
                 if (mimeType != null) {
                     ignoredMimeTypes.add(mimeType.toString());
                 }
             }
         } else if (value != null && !value.toString().isEmpty()) {
             ignoredMimeTypes = Collections.singleton(value.toString());
         } else {
             ignoredMimeTypes = Collections.singleton("text/plain");
         }

         // Include-text option: always assigned, so that a missing value
         // yields the documented default and String values are honoured.
         value = properties.get(INCLUDE_TEXT_IN_METADATA);
         if (value instanceof Boolean) {
             includeText = ((Boolean) value).booleanValue();
         } else if (value != null) {
             includeText = Boolean.parseBoolean(value.toString().trim());
         } else {
             includeText = false;
         }
         log.info("Include Text set to: {}", includeText);
     }

     /**
      * Deactivates the engine. After the super class was deactivated the
      * reference to the {@link MetaxaCore} created during activation is
      * dropped.
      *
      * @param ce the {@link ComponentContext} of the component
      */
     protected void deactivate(ComponentContext ce) {
         super.deactivate(ce);
         // release the extractor created in activate(..)
         extractor = null;
     }

     /**
      * Checks whether the content item can be processed by Metaxa.
      *
      * @param ci the content item
      * @return {@link #ENHANCE_ASYNC} if the mime type of the content item is
      *         not configured to be ignored and is supported by the
      *         extractor; {@link #CANNOT_ENHANCE} otherwise
      */
     public int canEnhance(ContentItem ci) throws EngineException {
         final String contentType = ci.getMimeType();
         // explicitly ignored types are rejected without asking the extractor
         if (ignoredMimeTypes.contains(contentType)) {
             return CANNOT_ENHANCE;
         }
         if (!extractor.isSupported(contentType)) {
             return CANNOT_ENHANCE;
         }
         // asynchronous execution is supported
         return ENHANCE_ASYNC;
     }

     /**
      * Extracts metadata and plain text from the content item with Metaxa.
      * <p>
      * The RDF2Go model returned by the extractor is converted to Clerezza
      * triples which are added to the metadata of the content item.
      * Statements providing the plain text of the document (predicate
      * {@link #NIE_PLAINTEXTCONTENT} on the content item itself) are written
      * to a separate "text/plain" {@link Blob} that is added as content part;
      * they are copied to the metadata only if the include-text option is
      * enabled. Statements with nodes that can not be converted are skipped.
      *
      * @param ci the content item to process
      * @throws EngineException if the extraction fails or the extracted plain
      *             text can not be written to the Blob
      */
     public void computeEnhancements(ContentItem ci) throws EngineException {
         // get model from the extraction
         URIImpl docId;
         Model m = null;
         ci.getLock().readLock().lock();
         try {
             docId = new URIImpl(ci.getUri().getUnicodeString());
             m = this.extractor.extract(ci.getStream(), docId, ci.getMimeType());
         } catch (ExtractorException e) {
             throw new EngineException("Error while processing ContentItem "
                 + ci.getUri() + " with Metaxa", e);
         } catch (IOException e) {
             throw new EngineException("Error while processing ContentItem "
                 + ci.getUri() + " with Metaxa", e);
         } finally {
             ci.getLock().readLock().unlock();
         }
         // Convert the RDF2go model to a Clerezza Graph and also extract
         // the extracted plain text from the model
         if (null == m) {
             log.debug("Unable to process ContentItem {} (mime type {}) with Metaxa",
                 ci.getUri(), ci.getMimeType());
             return;
         }
         ContentSink plainTextSink;
         try {
             plainTextSink = ciFactory.createContentSink("text/plain");
         } catch (IOException e) {
             m.close();
             throw new EngineException("Unable to initialise Blob for storing "
                 + "the plain text content", e);
         }
         HashMap<BlankNode, BNode> blankNodeMap = new HashMap<BlankNode, BNode>();
         ClosableIterator<Statement> it = null;
         BufferedWriter out = null;
         boolean textExtracted = false; //used to detect if some text was extracted
         try {
             // everything that may fail happens inside the try block, so that
             // the model is closed in any case
             out = new BufferedWriter(new OutputStreamWriter(
                 plainTextSink.getOutputStream(), UTF8));
             RDF2GoUtils.urifyBlankNodes(m);
             it = m.iterator();
             MGraph g = new SimpleMGraph(); //first add to a temporary graph
             while (it.hasNext()) {
                 Statement oneStmt = it.next();
                 //we need to treat triples that provide the plain/text
                 //version differently. Such Objects need to be added to
                 //the plain text Blob!
                 if (oneStmt.getSubject().equals(docId)
                         && oneStmt.getPredicate().equals(NIE_PLAINTEXT_PROPERTY)) {
                     String text = oneStmt.getObject().toString();
                     if (text != null && !text.isEmpty()) {
                         try {
                             out.write(text);
                         } catch (IOException e) {
                             throw new EngineException("Unable to write extracted "
                                 + "plain text to Blob (blob impl: "
                                 + plainTextSink.getBlob().getClass() + ")", e);
                         }
                         textExtracted = true;
                         if (includeText) {
                             // null-checked like all other statements; TripleImpl
                             // rejects null arguments
                             Triple t = asClerezzaTriple(oneStmt, blankNodeMap);
                             if (t != null) {
                                 g.add(t);
                             }
                         }
                     }
                 } else { //add metadata to the metadata of the contentItem
                     Triple t = asClerezzaTriple(oneStmt, blankNodeMap);
                     if (t != null) {
                         g.add(t);
                         log.debug("added {}", t);
                     }
                 }
             }
             // Close the writer explicitly: buffered text is written on close
             // and a failure must not be swallowed, because the Blob would
             // then silently contain truncated text.
             try {
                 out.close();
             } catch (IOException e) {
                 throw new EngineException("Unable to write extracted "
                     + "plain text to Blob (blob impl: "
                     + plainTextSink.getBlob().getClass() + ")", e);
             }
             //add the extracted triples to the metadata of the ContentItem
             ci.getLock().writeLock().lock();
             try {
                 ci.getMetadata().addAll(g);
             } finally {
                 ci.getLock().writeLock().unlock();
             }
         } finally {
             if (it != null) {
                 it.close();
             }
             m.close();
             // no-op if already closed above; cleanup for the error paths
             IOUtils.closeQuietly(out);
         }
         if (textExtracted) {
             //add plain text to the content item
             UriRef blobUri = new UriRef("urn:metaxa:plain-text:" + randomUUID());
             ci.addPart(blobUri, plainTextSink.getBlob());
         }
     }

     /**
      * Converts an RDF2Go statement to a Clerezza triple.
      *
      * @param stmt the statement to convert
      * @param blankNodeMap mapping of already converted blank nodes
      * @return the triple or <code>null</code> if the subject, predicate or
      *         object of the statement can not be converted
      */
     private static Triple asClerezzaTriple(Statement stmt, HashMap<BlankNode, BNode> blankNodeMap) {
         NonLiteral subject = (NonLiteral) asClerezzaResource(stmt.getSubject(), blankNodeMap);
         UriRef predicate = (UriRef) asClerezzaResource(stmt.getPredicate(), blankNodeMap);
         Resource object = asClerezzaResource(stmt.getObject(), blankNodeMap);
         if (subject == null || predicate == null || object == null) {
             return null;
         }
         return new TripleImpl(subject, predicate, object);
     }

     /**
      * Maps an RDF2Go node to the corresponding Clerezza resource.
      * <p>
      * URIs become {@link UriRef}s, typed literals {@link TypedLiteralImpl}s
      * and plain literals {@link PlainLiteralImpl}s. Blank nodes are looked up
      * in the parsed map; a new {@link BNode} is created and registered for
      * blank nodes not yet contained in the map.
      *
      * @param node a {@link Node}
      * @param blankNodeMap the mapping of RDF2Go blank nodes to Clerezza
      *            {@link BNode}s; extended by this method
      * @return the {@link Resource} or <code>null</code> for any other kind
      *         of node
      */
     public static Resource asClerezzaResource(Node node, HashMap<BlankNode, BNode> blankNodeMap) {
         if (node instanceof URI) {
             String uri = node.asURI().toString();
             return new UriRef(uri);
         }
         if (node instanceof BlankNode) {
             BNode mapped = blankNodeMap.get(node);
             if (mapped != null) {
                 return mapped;
             }
             // first occurrence of this blank node: create and remember it
             mapped = new BNode();
             blankNodeMap.put(node.asBlankNode(), mapped);
             return mapped;
         }
         if (node instanceof DatatypeLiteral) {
             DatatypeLiteral typed = node.asDatatypeLiteral();
             String lexicalForm = typed.getValue();
             UriRef datatype = new UriRef(typed.getDatatype().asURI().toString());
             return new TypedLiteralImpl(lexicalForm, datatype);
         }
         if (node instanceof PlainLiteral) {
             String lexicalForm = node.asLiteral().getValue();
             return new PlainLiteralImpl(lexicalForm);
         }
         // unsupported kind of node
         return null;
     }

     /**
      * Provides the service properties of this engine.
      *
      * @return an unmodifiable map with the single entry
      *         <code>ENHANCEMENT_ENGINE_ORDERING</code> set to {@link #defaultOrder}
      */
     public Map<String, Object> getServiceProperties() {
         Map<String, Object> ordering =
                 Collections.singletonMap(ENHANCEMENT_ENGINE_ORDERING, (Object) defaultOrder);
         return Collections.unmodifiableMap(ordering);
     }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.stanbol.enhancer.engines.metaxa;

	import static org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper.randomUUID;

	import java.io.BufferedWriter;
	import java.io.IOException;
	import java.io.OutputStreamWriter;
	import java.nio.charset.Charset;
	import java.util.Arrays;
	import java.util.Collections;
	import java.util.Dictionary;
	import java.util.HashMap;
	import java.util.HashSet;
	import java.util.Map;
	import java.util.Set;

	import org.apache.clerezza.rdf.core.BNode;
	import org.apache.clerezza.rdf.core.MGraph;
	import org.apache.clerezza.rdf.core.NonLiteral;
	import org.apache.clerezza.rdf.core.Resource;
	import org.apache.clerezza.rdf.core.Triple;
	import org.apache.clerezza.rdf.core.UriRef;
	import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
	import org.apache.clerezza.rdf.core.impl.SimpleMGraph;
	import org.apache.clerezza.rdf.core.impl.TripleImpl;
	import org.apache.clerezza.rdf.core.impl.TypedLiteralImpl;
	import org.apache.commons.io.IOUtils;
	import org.apache.felix.scr.annotations.Component;
	import org.apache.felix.scr.annotations.Property;
	import org.apache.felix.scr.annotations.Reference;
	import org.apache.felix.scr.annotations.Service;
	import org.apache.stanbol.enhancer.engines.metaxa.core.MetaxaCore;
	import org.apache.stanbol.enhancer.engines.metaxa.core.RDF2GoUtils;
	import org.apache.stanbol.enhancer.engines.metaxa.core.html.BundleURIResolver;
	import org.apache.stanbol.enhancer.engines.metaxa.core.html.HtmlExtractorFactory;
	import org.apache.stanbol.enhancer.servicesapi.Blob;
	import org.apache.stanbol.enhancer.servicesapi.ContentItem;
	import org.apache.stanbol.enhancer.servicesapi.ContentItemFactory;
	import org.apache.stanbol.enhancer.servicesapi.ContentSink;
	import org.apache.stanbol.enhancer.servicesapi.EngineException;
	import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
	import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
	import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
	import org.apache.stanbol.enhancer.servicesapi.rdf.NamespaceEnum;
	import org.ontoware.aifbcommons.collection.ClosableIterator;
	import org.ontoware.rdf2go.model.Model;
	import org.ontoware.rdf2go.model.Statement;
	import org.ontoware.rdf2go.model.node.BlankNode;
	import org.ontoware.rdf2go.model.node.DatatypeLiteral;
	import org.ontoware.rdf2go.model.node.Node;
	import org.ontoware.rdf2go.model.node.PlainLiteral;
	import org.ontoware.rdf2go.model.node.URI;
	import org.ontoware.rdf2go.model.node.impl.URIImpl;
	import org.osgi.framework.BundleContext;
	import org.osgi.service.cm.ConfigurationException;
	import org.osgi.service.component.ComponentContext;
	import org.semanticdesktop.aperture.extractor.ExtractorException;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;


	/**
	* {@link MetaxaEngine}
	*
	* @author Joerg Steffen, DFKI
	* @version $Id$
	*/
	@Component(immediate = true, metatype = true, inherit = true)
	@Service
	@org.apache.felix.scr.annotations.Properties(value={
	@Property(name=EnhancementEngine.PROPERTY_NAME, value="metaxa")
	})
	public class MetaxaEngine
	extends AbstractEnhancementEngine<IOException, RuntimeException>
	implements EnhancementEngine, ServiceProperties {

	// Logger used by this engine.
	private static final Logger log = LoggerFactory.getLogger(MetaxaEngine.class);
	/**
	* The default charset; used to encode the extracted plain text that is
	* written to the plain text {@link Blob}.
	*/
	private static final Charset UTF8 = Charset.forName("UTF-8");
	/**
	* Plain text content of a content item.
	*/
	public static final UriRef NIE_PLAINTEXTCONTENT = new UriRef(NamespaceEnum.nie + "plainTextContent");
	// RDF2Go counterpart of NIE_PLAINTEXTCONTENT; compared against the predicates of the extracted statements.
	private static final URIImpl NIE_PLAINTEXT_PROPERTY = new URIImpl(NIE_PLAINTEXTCONTENT.getUnicodeString());
	/**
	* The default value for the Execution of this Engine. Currently set to
	* {@link ServiceProperties#ORDERING_PRE_PROCESSING}
	*/
	public static final Integer defaultOrder = ORDERING_PRE_PROCESSING;

	/**
	* Name of a file defining the available document extractors for Metaxa. By default, the builtin file 'extractionregistry.xml' is used.
	*/
	@Property(value=MetaxaEngine.DEFAULT_EXTRACTION_REGISTRY)
	public static final String GLOBAL_EXTRACTOR_REGISTRY = "org.apache.stanbol.enhancer.engines.metaxa.extractionregistry";

	/**
	* Name of a file that defines the set of extractors for HTML documents. By default, the builtin file 'htmlextractors.xml' is used.
	*/
	@Property(value=MetaxaEngine.DEFAULT_HTML_EXTRACTOR_REGISTRY)
	public static final String HTML_EXTRACTOR_REGISTRY = "org.apache.stanbol.enhancer.engines.metaxa.htmlextractors";

	/**
	* Mime types of content items this engine refuses to enhance. If no
	* value is configured, "text/plain" is ignored.
	*/
	@Property(value={"text/plain"},cardinality=1000)
	public static final String IGNORE_MIME_TYPES = "org.apache.stanbol.enhancer.engines.metaxa.ignoreMimeTypes";

	/**
	* A boolean option whether extracted text should be included in the metadata as value of the NIE.plainTextContent property.
	*/
	@Property(boolValue=false)
	public static final String INCLUDE_TEXT_IN_METADATA = "org.apache.stanbol.enhancer.engines.metaxa.includeText";

	/**
	* Internally used to create additional {@link Blob} for transformed
	* versions of the original content
	*/
	@Reference
	private ContentItemFactory ciFactory;

	// The Metaxa extractor; created in activate(..) and released in deactivate(..).
	private MetaxaCore extractor;

	// Bundle context of this component; set in activate(..).
	BundleContext bundleContext;

	// Names of the registry files used when no other file is configured.
	public static final String DEFAULT_EXTRACTION_REGISTRY = "extractionregistry.xml";
	public static final String DEFAULT_HTML_EXTRACTOR_REGISTRY = "htmlextractors.xml";

	// Mime types rejected by canEnhance(..); populated in activate(..).
	private Set<String> ignoredMimeTypes;
	// Whether extracted plain text is also added to the metadata; false unless configured otherwise.
	private boolean includeText = false;

	/**
	 * Activates the engine: creates the {@link MetaxaCore} extractor and
	 * reads the component configuration.
	 * <p>
	 * The following properties are evaluated:
	 * <ul>
	 * <li>{@link #GLOBAL_EXTRACTOR_REGISTRY} and {@link #HTML_EXTRACTOR_REGISTRY}:
	 * names of the registry files; blank values fall back to the defaults and
	 * surrounding whitespace is removed.</li>
	 * <li>{@link #IGNORE_MIME_TYPES}: accepted as <code>String[]</code>,
	 * {@link Iterable} or single value; defaults to "text/plain".</li>
	 * <li>{@link #INCLUDE_TEXT_IN_METADATA}: accepted as {@link Boolean} or
	 * as any value whose string form is "true" (configurations loaded from
	 * files frequently provide strings); defaults to <code>false</code>.</li>
	 * </ul>
	 *
	 * @param ce the {@link ComponentContext}
	 * @throws ConfigurationException if thrown by the activation of the super class
	 * @throws IOException if initializing fails
	 */
	protected void activate(ComponentContext ce) throws ConfigurationException, IOException {
		super.activate(ce);
		this.bundleContext = ce.getBundleContext();
		BundleURIResolver.BUNDLE = this.bundleContext.getBundle();
		Dictionary<String, Object> properties = ce.getProperties();

		// Resolve the registry file names. Values are read as Object so that
		// a non-String configuration value can not cause a ClassCastException,
		// and the trimmed name is used (the original only trimmed for the check).
		String extractionRegistry = DEFAULT_EXTRACTION_REGISTRY;
		Object confFile = properties.get(GLOBAL_EXTRACTOR_REGISTRY);
		if (confFile != null && confFile.toString().trim().length() > 0) {
			extractionRegistry = confFile.toString().trim();
		}
		String htmlExtractors = DEFAULT_HTML_EXTRACTOR_REGISTRY;
		confFile = properties.get(HTML_EXTRACTOR_REGISTRY);
		if (confFile != null && confFile.toString().trim().length() > 0) {
			htmlExtractors = confFile.toString().trim();
		}
		try {
			this.extractor = new MetaxaCore(extractionRegistry);
			// NOTE(review): the HTML registry is published only after the core
			// was created (same order as before) - confirm that
			// HtmlExtractorFactory reads it lazily.
			HtmlExtractorFactory.REGISTRY_CONFIGURATION = htmlExtractors;
		} catch (IOException e) {
			log.error(e.getLocalizedMessage(), e);
			throw e;
		}

		// Mime types that must not be processed by this engine.
		Object value = properties.get(IGNORE_MIME_TYPES);
		if (value instanceof String[]) {
			ignoredMimeTypes = new HashSet<String>(Arrays.asList((String[]) value));
		} else if (value instanceof Iterable<?>) {
			ignoredMimeTypes = new HashSet<String>();
			for (Object mimeType : (Iterable<?>) value) {
				if (mimeType != null) {
					ignoredMimeTypes.add(mimeType.toString());
				}
			}
		} else if (value != null && !value.toString().isEmpty()) {
			ignoredMimeTypes = Collections.singleton(value.toString());
		} else {
			ignoredMimeTypes = Collections.singleton("text/plain");
		}

		// Include-text option: always assigned, so that a missing value
		// yields the documented default and String values are honoured.
		value = properties.get(INCLUDE_TEXT_IN_METADATA);
		if (value instanceof Boolean) {
			includeText = ((Boolean) value).booleanValue();
		} else if (value != null) {
			includeText = Boolean.parseBoolean(value.toString().trim());
		} else {
			includeText = false;
		}
		log.info("Include Text set to: {}", includeText);
	}

	/**
	 * Deactivates the engine. After the super class was deactivated the
	 * reference to the {@link MetaxaCore} created during activation is
	 * dropped.
	 *
	 * @param ce the {@link ComponentContext} of the component
	 */
	protected void deactivate(ComponentContext ce) {
		super.deactivate(ce);
		// release the extractor created in activate(..)
		extractor = null;
	}

	/**
	 * Checks whether the content item can be processed by Metaxa.
	 *
	 * @param ci the content item
	 * @return {@link #ENHANCE_ASYNC} if the mime type of the content item is
	 *         not configured to be ignored and is supported by the
	 *         extractor; {@link #CANNOT_ENHANCE} otherwise
	 */
	public int canEnhance(ContentItem ci) throws EngineException {
		final String contentType = ci.getMimeType();
		// explicitly ignored types are rejected without asking the extractor
		if (ignoredMimeTypes.contains(contentType)) {
			return CANNOT_ENHANCE;
		}
		if (!extractor.isSupported(contentType)) {
			return CANNOT_ENHANCE;
		}
		// asynchronous execution is supported
		return ENHANCE_ASYNC;
	}

	/**
	 * Extracts metadata and plain text from the content item with Metaxa.
	 * <p>
	 * The RDF2Go model returned by the extractor is converted to Clerezza
	 * triples which are added to the metadata of the content item.
	 * Statements providing the plain text of the document (predicate
	 * {@link #NIE_PLAINTEXTCONTENT} on the content item itself) are written
	 * to a separate "text/plain" {@link Blob} that is added as content part;
	 * they are copied to the metadata only if the include-text option is
	 * enabled. Statements with nodes that can not be converted are skipped.
	 *
	 * @param ci the content item to process
	 * @throws EngineException if the extraction fails or the extracted plain
	 *             text can not be written to the Blob
	 */
	public void computeEnhancements(ContentItem ci) throws EngineException {
		// get model from the extraction
		URIImpl docId;
		Model m = null;
		ci.getLock().readLock().lock();
		try {
			docId = new URIImpl(ci.getUri().getUnicodeString());
			m = this.extractor.extract(ci.getStream(), docId, ci.getMimeType());
		} catch (ExtractorException e) {
			throw new EngineException("Error while processing ContentItem "
				+ ci.getUri() + " with Metaxa", e);
		} catch (IOException e) {
			throw new EngineException("Error while processing ContentItem "
				+ ci.getUri() + " with Metaxa", e);
		} finally {
			ci.getLock().readLock().unlock();
		}
		// Convert the RDF2go model to a Clerezza Graph and also extract
		// the extracted plain text from the model
		if (null == m) {
			log.debug("Unable to process ContentItem {} (mime type {}) with Metaxa",
				ci.getUri(), ci.getMimeType());
			return;
		}
		ContentSink plainTextSink;
		try {
			plainTextSink = ciFactory.createContentSink("text/plain");
		} catch (IOException e) {
			m.close();
			throw new EngineException("Unable to initialise Blob for storing "
				+ "the plain text content", e);
		}
		HashMap<BlankNode, BNode> blankNodeMap = new HashMap<BlankNode, BNode>();
		ClosableIterator<Statement> it = null;
		BufferedWriter out = null;
		boolean textExtracted = false; //used to detect if some text was extracted
		try {
			// everything that may fail happens inside the try block, so that
			// the model is closed in any case
			out = new BufferedWriter(new OutputStreamWriter(
				plainTextSink.getOutputStream(), UTF8));
			RDF2GoUtils.urifyBlankNodes(m);
			it = m.iterator();
			MGraph g = new SimpleMGraph(); //first add to a temporary graph
			while (it.hasNext()) {
				Statement oneStmt = it.next();
				//we need to treat triples that provide the plain/text
				//version differently. Such Objects need to be added to
				//the plain text Blob!
				if (oneStmt.getSubject().equals(docId)
						&& oneStmt.getPredicate().equals(NIE_PLAINTEXT_PROPERTY)) {
					String text = oneStmt.getObject().toString();
					if (text != null && !text.isEmpty()) {
						try {
							out.write(text);
						} catch (IOException e) {
							throw new EngineException("Unable to write extracted "
								+ "plain text to Blob (blob impl: "
								+ plainTextSink.getBlob().getClass() + ")", e);
						}
						textExtracted = true;
						if (includeText) {
							// null-checked like all other statements; TripleImpl
							// rejects null arguments
							Triple t = asClerezzaTriple(oneStmt, blankNodeMap);
							if (t != null) {
								g.add(t);
							}
						}
					}
				} else { //add metadata to the metadata of the contentItem
					Triple t = asClerezzaTriple(oneStmt, blankNodeMap);
					if (t != null) {
						g.add(t);
						log.debug("added {}", t);
					}
				}
			}
			// Close the writer explicitly: buffered text is written on close
			// and a failure must not be swallowed, because the Blob would
			// then silently contain truncated text.
			try {
				out.close();
			} catch (IOException e) {
				throw new EngineException("Unable to write extracted "
					+ "plain text to Blob (blob impl: "
					+ plainTextSink.getBlob().getClass() + ")", e);
			}
			//add the extracted triples to the metadata of the ContentItem
			ci.getLock().writeLock().lock();
			try {
				ci.getMetadata().addAll(g);
			} finally {
				ci.getLock().writeLock().unlock();
			}
		} finally {
			if (it != null) {
				it.close();
			}
			m.close();
			// no-op if already closed above; cleanup for the error paths
			IOUtils.closeQuietly(out);
		}
		if (textExtracted) {
			//add plain text to the content item
			UriRef blobUri = new UriRef("urn:metaxa:plain-text:" + randomUUID());
			ci.addPart(blobUri, plainTextSink.getBlob());
		}
	}

	/**
	 * Converts an RDF2Go statement to a Clerezza triple.
	 *
	 * @param stmt the statement to convert
	 * @param blankNodeMap mapping of already converted blank nodes
	 * @return the triple or <code>null</code> if the subject, predicate or
	 *         object of the statement can not be converted
	 */
	private static Triple asClerezzaTriple(Statement stmt, HashMap<BlankNode, BNode> blankNodeMap) {
		NonLiteral subject = (NonLiteral) asClerezzaResource(stmt.getSubject(), blankNodeMap);
		UriRef predicate = (UriRef) asClerezzaResource(stmt.getPredicate(), blankNodeMap);
		Resource object = asClerezzaResource(stmt.getObject(), blankNodeMap);
		if (subject == null || predicate == null || object == null) {
			return null;
		}
		return new TripleImpl(subject, predicate, object);
	}

	/**
	 * Maps an RDF2Go node to the corresponding Clerezza resource.
	 * <p>
	 * URIs become {@link UriRef}s, typed literals {@link TypedLiteralImpl}s
	 * and plain literals {@link PlainLiteralImpl}s. Blank nodes are looked up
	 * in the parsed map; a new {@link BNode} is created and registered for
	 * blank nodes not yet contained in the map.
	 *
	 * @param node a {@link Node}
	 * @param blankNodeMap the mapping of RDF2Go blank nodes to Clerezza
	 *            {@link BNode}s; extended by this method
	 * @return the {@link Resource} or <code>null</code> for any other kind
	 *         of node
	 */
	public static Resource asClerezzaResource(Node node, HashMap<BlankNode, BNode> blankNodeMap) {
		if (node instanceof URI) {
			String uri = node.asURI().toString();
			return new UriRef(uri);
		}
		if (node instanceof BlankNode) {
			BNode mapped = blankNodeMap.get(node);
			if (mapped != null) {
				return mapped;
			}
			// first occurrence of this blank node: create and remember it
			mapped = new BNode();
			blankNodeMap.put(node.asBlankNode(), mapped);
			return mapped;
		}
		if (node instanceof DatatypeLiteral) {
			DatatypeLiteral typed = node.asDatatypeLiteral();
			String lexicalForm = typed.getValue();
			UriRef datatype = new UriRef(typed.getDatatype().asURI().toString());
			return new TypedLiteralImpl(lexicalForm, datatype);
		}
		if (node instanceof PlainLiteral) {
			String lexicalForm = node.asLiteral().getValue();
			return new PlainLiteralImpl(lexicalForm);
		}
		// unsupported kind of node
		return null;
	}

	/**
	 * Provides the service properties of this engine.
	 *
	 * @return an unmodifiable map with the single entry
	 *         <code>ENHANCEMENT_ENGINE_ORDERING</code> set to {@link #defaultOrder}
	 */
	public Map<String, Object> getServiceProperties() {
		Map<String, Object> ordering =
				Collections.singletonMap(ENHANCEMENT_ENGINE_ORDERING, (Object) defaultOrder);
		return Collections.unmodifiableMap(ordering);
	}

	}