| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.stanbol.enhancer.engines.metaxa; |
| |
| import static org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper.randomUUID; |
| |
| import java.io.BufferedWriter; |
| import java.io.IOException; |
| import java.io.OutputStreamWriter; |
| import java.nio.charset.Charset; |
| import java.util.Arrays; |
| import java.util.Collections; |
| import java.util.Dictionary; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.Map; |
| import java.util.Set; |
| |
| import org.apache.clerezza.rdf.core.BNode; |
| import org.apache.clerezza.rdf.core.MGraph; |
| import org.apache.clerezza.rdf.core.NonLiteral; |
| import org.apache.clerezza.rdf.core.Resource; |
| import org.apache.clerezza.rdf.core.Triple; |
| import org.apache.clerezza.rdf.core.UriRef; |
| import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl; |
| import org.apache.clerezza.rdf.core.impl.SimpleMGraph; |
| import org.apache.clerezza.rdf.core.impl.TripleImpl; |
| import org.apache.clerezza.rdf.core.impl.TypedLiteralImpl; |
| import org.apache.commons.io.IOUtils; |
| import org.apache.felix.scr.annotations.Component; |
| import org.apache.felix.scr.annotations.Property; |
| import org.apache.felix.scr.annotations.Reference; |
| import org.apache.felix.scr.annotations.Service; |
| import org.apache.stanbol.enhancer.engines.metaxa.core.MetaxaCore; |
| import org.apache.stanbol.enhancer.engines.metaxa.core.RDF2GoUtils; |
| import org.apache.stanbol.enhancer.engines.metaxa.core.html.BundleURIResolver; |
| import org.apache.stanbol.enhancer.engines.metaxa.core.html.HtmlExtractorFactory; |
| import org.apache.stanbol.enhancer.servicesapi.Blob; |
| import org.apache.stanbol.enhancer.servicesapi.ContentItem; |
| import org.apache.stanbol.enhancer.servicesapi.ContentItemFactory; |
| import org.apache.stanbol.enhancer.servicesapi.ContentSink; |
| import org.apache.stanbol.enhancer.servicesapi.EngineException; |
| import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine; |
| import org.apache.stanbol.enhancer.servicesapi.ServiceProperties; |
| import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine; |
| import org.apache.stanbol.enhancer.servicesapi.rdf.NamespaceEnum; |
| import org.ontoware.aifbcommons.collection.ClosableIterator; |
| import org.ontoware.rdf2go.model.Model; |
| import org.ontoware.rdf2go.model.Statement; |
| import org.ontoware.rdf2go.model.node.BlankNode; |
| import org.ontoware.rdf2go.model.node.DatatypeLiteral; |
| import org.ontoware.rdf2go.model.node.Node; |
| import org.ontoware.rdf2go.model.node.PlainLiteral; |
| import org.ontoware.rdf2go.model.node.URI; |
| import org.ontoware.rdf2go.model.node.impl.URIImpl; |
| import org.osgi.framework.BundleContext; |
| import org.osgi.service.cm.ConfigurationException; |
| import org.osgi.service.component.ComponentContext; |
| import org.semanticdesktop.aperture.extractor.ExtractorException; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| |
| /** |
| * {@link MetaxaEngine} |
| * |
| * @author Joerg Steffen, DFKI |
| * @version $Id$ |
| */ |
| @Component(immediate = true, metatype = true, inherit = true) |
| @Service |
| @org.apache.felix.scr.annotations.Properties(value={ |
| @Property(name=EnhancementEngine.PROPERTY_NAME, value="metaxa") |
| }) |
| public class MetaxaEngine |
| extends AbstractEnhancementEngine<IOException, RuntimeException> |
| implements EnhancementEngine, ServiceProperties { |
| |
| private static final Logger log = LoggerFactory.getLogger(MetaxaEngine.class); |
| /** |
| * The default charset |
| */ |
| private static final Charset UTF8 = Charset.forName("UTF-8"); |
| /** |
| * Plain text content of a content item. |
| */ |
| public static final UriRef NIE_PLAINTEXTCONTENT = new UriRef(NamespaceEnum.nie + "plainTextContent"); |
| private static final URIImpl NIE_PLAINTEXT_PROPERTY = new URIImpl(NIE_PLAINTEXTCONTENT.getUnicodeString()); |
| /** |
| * The default value for the Execution of this Engine. Currently set to |
| * {@link ServiceProperties#ORDERING_PRE_PROCESSING} |
| */ |
| public static final Integer defaultOrder = ORDERING_PRE_PROCESSING; |
| |
| /** |
| * name of a file defining the available docuemnt extractors for Metaxa. By default, the builtin file 'extractionregistry.xml' is used. |
| */ |
| @Property(value=MetaxaEngine.DEFAULT_EXTRACTION_REGISTRY) |
| public static final String GLOBAL_EXTRACTOR_REGISTRY = "org.apache.stanbol.enhancer.engines.metaxa.extractionregistry"; |
| |
| /** |
| * name of a file that defines the set of extractors for HTML documents. By default, the builtin file 'htmlextractors.xml' is used." |
| */ |
| @Property(value=MetaxaEngine.DEFAULT_HTML_EXTRACTOR_REGISTRY) |
| public static final String HTML_EXTRACTOR_REGISTRY = "org.apache.stanbol.enhancer.engines.metaxa.htmlextractors"; |
| |
| @Property(value={"text/plain"},cardinality=1000) |
| public static final String IGNORE_MIME_TYPES = "org.apache.stanbol.enhancer.engines.metaxa.ignoreMimeTypes"; |
| |
| /** |
| * a boolean option whether extracted text should be included in the metadata as value of the NIE.plainTextContent property |
| */ |
| @Property(boolValue=false) |
| public static final String INCLUDE_TEXT_IN_METADATA = "org.apache.stanbol.enhancer.engines.metaxa.includeText"; |
| |
| /** |
| * Internally used to create additional {@link Blob} for transformed |
| * versions af the original content |
| */ |
| @Reference |
| private ContentItemFactory ciFactory; |
| |
| private MetaxaCore extractor; |
| |
| BundleContext bundleContext; |
| |
| public static final String DEFAULT_EXTRACTION_REGISTRY = "extractionregistry.xml"; |
| public static final String DEFAULT_HTML_EXTRACTOR_REGISTRY = "htmlextractors.xml"; |
| |
| private Set<String> ignoredMimeTypes; |
| private boolean includeText = false; |
| |
| /** |
| * The activate method. |
| * |
| * @param ce the {@link ComponentContext} |
| * @throws IOException if initializing fails |
| */ |
| protected void activate(ComponentContext ce) throws ConfigurationException, IOException { |
| super.activate(ce); |
| String extractionRegistry = DEFAULT_EXTRACTION_REGISTRY; |
| String htmlExtractors = DEFAULT_HTML_EXTRACTOR_REGISTRY; |
| this.bundleContext = ce.getBundleContext(); |
| BundleURIResolver.BUNDLE = this.bundleContext.getBundle(); |
| try { |
| Dictionary<String, Object> properties = ce.getProperties(); |
| String confFile = (String)properties.get(GLOBAL_EXTRACTOR_REGISTRY); |
| if (confFile != null && confFile.trim().length() > 0) { |
| extractionRegistry = confFile; |
| } |
| confFile = (String)properties.get(HTML_EXTRACTOR_REGISTRY); |
| if (confFile != null && confFile.trim().length() > 0) { |
| htmlExtractors = confFile; |
| } |
| this.extractor = new MetaxaCore(extractionRegistry); |
| HtmlExtractorFactory.REGISTRY_CONFIGURATION = htmlExtractors; |
| } catch (IOException e) { |
| log.error(e.getLocalizedMessage(), e); |
| throw e; |
| } |
| Object value = ce.getProperties().get(IGNORE_MIME_TYPES); |
| if(value instanceof String[]){ |
| ignoredMimeTypes = new HashSet<String>(Arrays.asList((String[])value)); |
| } else if(value instanceof Iterable<?>){ |
| ignoredMimeTypes = new HashSet<String>(); |
| for(Object mimeType : (Iterable<?>)value){ |
| if(mimeType != null){ |
| ignoredMimeTypes.add(mimeType.toString()); |
| } |
| } |
| } else if(value != null && !value.toString().isEmpty()){ |
| ignoredMimeTypes = Collections.singleton(value.toString()); |
| } else { |
| ignoredMimeTypes = Collections.singleton("text/plain"); |
| } |
| value = ce.getProperties().get(INCLUDE_TEXT_IN_METADATA); |
| if (value instanceof Boolean) { |
| includeText = ((Boolean)value).booleanValue(); |
| log.info("Include Text set to: {}",value); |
| } |
| } |
| |
| /** |
| * The deactivate method. |
| * |
| * @param ce the {@link ComponentContext} |
| */ |
| protected void deactivate(ComponentContext ce) { |
| super.deactivate(ce); |
| this.extractor = null; |
| } |
| |
| public int canEnhance(ContentItem ci) throws EngineException { |
| String mimeType = ci.getMimeType(); |
| if (!ignoredMimeTypes.contains(mimeType) && |
| this.extractor.isSupported(mimeType)) { |
| return ENHANCE_ASYNC; //supports now asynchronous execution! |
| } |
| return CANNOT_ENHANCE; |
| } |
| |
| public void computeEnhancements(ContentItem ci) throws EngineException { |
| // get model from the extraction |
| URIImpl docId; |
| Model m = null; |
| ci.getLock().readLock().lock(); |
| try { |
| docId = new URIImpl(ci.getUri().getUnicodeString()); |
| m = this.extractor.extract(ci.getStream(), docId, ci.getMimeType()); |
| } catch (ExtractorException e) { |
| throw new EngineException("Error while processing ContentItem " |
| + ci.getUri()+" with Metaxa",e); |
| } catch (IOException e) { |
| throw new EngineException("Error while processing ContentItem " |
| + ci.getUri()+" with Metaxa",e); |
| } finally { |
| ci.getLock().readLock().unlock(); |
| } |
| // Convert the RDF2go model to a Clerezza Graph and also extract |
| // the extracted plain text from the model |
| if (null == m) { |
| log.debug("Unable to preocess ContentItem {} (mime type {}) with Metaxa", |
| ci.getUri(),ci.getMimeType()); |
| return; |
| } |
| ContentSink plainTextSink; |
| try { |
| plainTextSink = ciFactory.createContentSink("text/plain"); |
| } catch (IOException e) { |
| m.close(); |
| throw new EngineException("Unable to initialise Blob for storing" + |
| "the plain text content",e); |
| } |
| HashMap<BlankNode, BNode> blankNodeMap = new HashMap<BlankNode, BNode>(); |
| RDF2GoUtils.urifyBlankNodes(m); |
| ClosableIterator<Statement> it = m.iterator(); |
| BufferedWriter out = new BufferedWriter(new OutputStreamWriter( |
| plainTextSink.getOutputStream(), UTF8)); |
| boolean textExtracted = false; //used to detect if some text was extracted |
| try { |
| MGraph g = new SimpleMGraph(); //first add to a temporary graph |
| while (it.hasNext()) { |
| Statement oneStmt = it.next(); |
| //we need to treat triples that provide the plain/text |
| //version differently. Such Objects need to be added to |
| //the plain text Blob! |
| if(oneStmt.getSubject().equals(docId) && |
| oneStmt.getPredicate().equals(NIE_PLAINTEXT_PROPERTY)){ |
| String text = oneStmt.getObject().toString(); |
| if(text != null && !text.isEmpty()){ |
| try { |
| out.write(oneStmt.getObject().toString()); |
| } catch (IOException e) { |
| throw new EngineException("Unable to write extracted" + |
| "plain text to Blob (blob impl: " |
| + plainTextSink.getBlob().getClass()+")",e); |
| } |
| textExtracted = true; |
| if (includeText) { |
| NonLiteral subject = (NonLiteral) asClerezzaResource(oneStmt.getSubject(), blankNodeMap); |
| UriRef predicate = (UriRef) asClerezzaResource(oneStmt.getPredicate(), blankNodeMap); |
| Resource object = asClerezzaResource(oneStmt.getObject(), blankNodeMap); |
| g.add(new TripleImpl(subject, predicate, object)); |
| } |
| } |
| } else { //add metadata to the metadata of the contentItem |
| NonLiteral subject = (NonLiteral) asClerezzaResource(oneStmt.getSubject(), blankNodeMap); |
| UriRef predicate = (UriRef) asClerezzaResource(oneStmt.getPredicate(), blankNodeMap); |
| Resource object = asClerezzaResource(oneStmt.getObject(), blankNodeMap); |
| |
| if (null != subject && null != predicate && null != object) { |
| Triple t = new TripleImpl(subject, predicate, object); |
| g.add(t); |
| log.debug("added " + t.toString()); |
| } |
| } |
| } |
| //add the extracted triples to the metadata of the ContentItem |
| ci.getLock().writeLock().lock(); |
| try { |
| ci.getMetadata().addAll(g); |
| g = null; |
| } finally { |
| ci.getLock().writeLock().unlock(); |
| } |
| } finally { |
| it.close(); |
| m.close(); |
| IOUtils.closeQuietly(out); |
| } |
| if(textExtracted){ |
| //add plain text to the content item |
| UriRef blobUri = new UriRef("urn:metaxa:plain-text:"+randomUUID()); |
| ci.addPart(blobUri, plainTextSink.getBlob()); |
| } |
| } |
| |
| /** |
| * Converts the given RDF2Go node into a corresponding Clerezza object. |
| * |
| * @param node a {@link Node} |
| * @return a {@link Resource} |
| */ |
| public static Resource asClerezzaResource(Node node, HashMap<BlankNode, BNode> blankNodeMap) { |
| |
| if (node instanceof URI) { |
| return new UriRef(node.asURI().toString()); |
| } else if (node instanceof BlankNode) { |
| BNode bNode = blankNodeMap.get(node); |
| if (bNode == null) { |
| bNode = new BNode(); |
| blankNodeMap.put(node.asBlankNode(), bNode); |
| } |
| return bNode; |
| } else if (node instanceof DatatypeLiteral) { |
| DatatypeLiteral dtl = node.asDatatypeLiteral(); |
| return new TypedLiteralImpl(dtl.getValue(), new UriRef(dtl.getDatatype().asURI().toString())); |
| } else if (node instanceof PlainLiteral) { |
| return new PlainLiteralImpl(node.asLiteral().getValue()); |
| } |
| |
| return null; |
| } |
| |
| public Map<String, Object> getServiceProperties() { |
| return Collections.unmodifiableMap(Collections.singletonMap(ENHANCEMENT_ENGINE_ORDERING, (Object) defaultOrder)); |
| } |
| |
| } |