openie/src/main/java/org/apache/any23/plugin/extractor/openie/OpenIEExtractor.java - any23-plugins - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *  http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.any23.plugin.extractor.openie;

 import java.io.IOException;
 import java.util.List;

 import javax.xml.transform.TransformerConfigurationException;
 import javax.xml.transform.TransformerFactoryConfigurationError;

 import org.apache.any23.extractor.Extractor;
 import org.apache.any23.extractor.IssueReport;
 import org.apache.any23.extractor.ExtractionContext;
 import org.apache.any23.extractor.ExtractorDescription;
 import org.apache.any23.plugin.Author;
 import org.apache.any23.rdf.RDFUtils;
 import org.apache.any23.util.StreamUtils;
 import org.apache.tika.Tika;
 import org.apache.tika.exception.TikaException;
 import org.eclipse.rdf4j.model.IRI;
 import org.eclipse.rdf4j.model.Resource;
 import org.eclipse.rdf4j.model.Value;
 import org.eclipse.rdf4j.model.vocabulary.RDF;
 import org.eclipse.rdf4j.model.vocabulary.RDFS;
 import org.apache.any23.extractor.ExtractionException;
 import org.apache.any23.extractor.ExtractionParameters;
 import org.apache.any23.extractor.ExtractionResult;

 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.w3c.dom.Document;

 import edu.knowitall.openie.Argument;
 import edu.knowitall.openie.Instance;
 import edu.knowitall.openie.OpenIE;
 import edu.knowitall.tool.parse.ClearParser;
 import edu.knowitall.tool.postag.ClearPostagger;
 import edu.knowitall.tool.srl.ClearSrl;
 import edu.knowitall.tool.tokenize.ClearTokenizer;
 import scala.collection.JavaConversions;
 import scala.collection.Seq;

 /**
  * An <a href="https://github.com/allenai/openie-standalone">OpenIE</a>
  * extractor able to generate <i>RDF</i> statements from
  * sentences representing relations in the text.
  *
  * <p>The document text is obtained with Tika, handed to OpenIE, and every
  * extraction whose confidence is strictly greater than the configured
  * threshold is written as one triple per second argument.</p>
  */
 @Author(name="Lewis John McGibbney (lewismc@apache.org)")
 public class OpenIEExtractor implements Extractor.TagSoupDOMExtractor {

     private static final Logger LOG = LoggerFactory.getLogger(OpenIEExtractor.class);

     /** Heap (in bytes) that must be available before extraction is attempted: 4096 MB. */
     private static final long REQUIRED_HEAP_BYTES = 4294967296L;

     /** Number of bytes in one megabyte, used when reporting memory figures. */
     private static final long BYTES_PER_MB = 1048576L;

     /** Name of the extraction parameter holding the confidence threshold. */
     private static final String CONFIDENCE_THRESHOLD_PROPERTY =
             "any23.extraction.openie.confidence.threshold";

     /** Threshold used when the property is missing or cannot be parsed. */
     private static final double DEFAULT_CONFIDENCE_THRESHOLD = 0.5;

     /**
      * default constructor
      */
     public OpenIEExtractor() {
         // default constructor
     }

     /**
      * @see org.apache.any23.extractor.Extractor#getDescription()
      */
     @Override
     public ExtractorDescription getDescription() {
         return OpenIEExtractorFactory.getDescriptionInstance();
     }

     /**
      * Runs OpenIE over the textual content of the given document and writes
      * the resulting relations to {@code out}.
      *
      * <p>The method returns without writing any triple, after notifying a
      * {@code FATAL} issue on {@code out}, when less than 4096 MB of heap is
      * available or when the document text could not be extracted.</p>
      *
      * @param extractionParameters parameters consulted for the confidence threshold
      * @param context supplies the document IRI used to build subjects and predicates
      * @param in the DOM of the document to analyse
      * @param out receives namespaces, triples and issue notifications
      * @throws IOException declared by the extractor contract; may be raised while reading the document
      * @throws ExtractionException declared by the extractor contract
      */
     @Override
     public void run(ExtractionParameters extractionParameters,
             ExtractionContext context, Document in, ExtractionResult out)
                     throws IOException, ExtractionException {

         Runtime runtime = Runtime.getRuntime();
         long maxMemory = runtime.maxMemory();
         //free up as much memory as possible before performing this calculation
         runtime.gc();
         long usedMemory = Math.max(0L, runtime.totalMemory() - runtime.freeMemory());
         long availableMemory = maxMemory - usedMemory;
         if (availableMemory < REQUIRED_HEAP_BYTES) {
             out.notifyIssue(IssueReport.IssueLevel.FATAL,
                     "Not enough heap space available to perform OpenIE extraction: "
                             + (availableMemory / BYTES_PER_MB) + "/" + (maxMemory / BYTES_PER_MB)
                             + " MB. Requires 4096 MB.", -1, -1);
             LOG.error("Increase JVM heap size when running OpenIE extractor. max={}; available={}",
                     maxMemory, availableMemory);
             return;
         }

         IRI documentIRI = context.getDocumentIRI();
         // NOTE(review): the result of this call is discarded; it is kept only in case
         // RDFUtils.iri has an effect not visible from this file - confirm and remove.
         RDFUtils.iri(documentIRI.toString() + "root");
         out.writeNamespace(RDF.PREFIX, RDF.NAMESPACE);
         out.writeNamespace(RDFS.PREFIX, RDFS.NAMESPACE);
         LOG.debug("Processing: {}", documentIRI);

         OpenIE openIE = new OpenIE(
                 new ClearParser(
                         new ClearPostagger(
                                 new ClearTokenizer())), new ClearSrl(), false, false);

         Seq<Instance> extractions = null;
         Tika tika = new Tika();
         try {
             extractions = openIE.extract(tika.parseToString(StreamUtils.documentToInputStream(in)));
         } catch (TransformerConfigurationException | TransformerFactoryConfigurationError e) {
             LOG.error("Encountered error during OpenIE extraction.", e);
         } catch (TikaException e) {
             LOG.error("Encountered error whilst parsing InputStream with Tika.", e);
         }

         // A failure above leaves extractions unset; stop here instead of
         // dereferencing null further down.
         if (extractions == null) {
             out.notifyIssue(IssueReport.IssueLevel.FATAL,
                     "Unable to obtain OpenIE extractions for the document; no triples were generated.",
                     -1, -1);
             return;
         }

         List<Instance> listExtractions = JavaConversions.seqAsJavaList(extractions);
         // for each extraction instance we can obtain a number of extraction elements
         // instance.confidence() - a confidence value for the extraction itself
         // instance.extr().context() - an optional representation of the context for this extraction
         // instance.extr().arg1().text() - subject
         // instance.extr().rel().text() - predicate
         // instance.extr().arg2s().text() - object
         double threshold = readConfidenceThreshold(extractionParameters);
         for (Instance instance : listExtractions) {
             if (instance.confidence() > threshold) {
                 List<Argument> listArg2s = JavaConversions.seqAsJavaList(instance.extr().arg2s());
                 for (Argument argument : listArg2s) {
                     Resource subject = RDFUtils.makeIRI(instance.extr().arg1().text(), documentIRI);
                     IRI predicate = (IRI) RDFUtils.makeIRI(instance.extr().rel().text(), documentIRI);
                     Value object = RDFUtils.toValue(argument.text());
                     out.writeTriple(subject, predicate, object);
                 }
             }
         }
     }

     /**
      * Resolves the confidence threshold from the extraction parameters.
      *
      * @param extractionParameters parameters to read the threshold property from
      * @return the configured threshold, or 0.5 when the property cannot be
      *         read, is {@code null}, or is not a parsable number
      */
     private static double readConfidenceThreshold(ExtractionParameters extractionParameters) {
         String thresholdString;
         try {
             thresholdString = extractionParameters.getProperty(CONFIDENCE_THRESHOLD_PROPERTY);
         } catch (RuntimeException ignored) {
             // Any failure to look the property up is treated as "not configured".
             return DEFAULT_CONFIDENCE_THRESHOLD;
         }
         if (thresholdString == null) {
             return DEFAULT_CONFIDENCE_THRESHOLD;
         }
         try {
             return Double.parseDouble(thresholdString);
         } catch (NumberFormatException e) {
             // A malformed setting should not abort an otherwise successful extraction.
             LOG.warn("Ignoring invalid value '{}' for {}; using default {}.",
                     thresholdString, CONFIDENCE_THRESHOLD_PROPERTY, DEFAULT_CONFIDENCE_THRESHOLD);
             return DEFAULT_CONFIDENCE_THRESHOLD;
         }
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.any23.plugin.extractor.openie;

	import java.io.IOException;
	import java.util.List;

	import javax.xml.transform.TransformerConfigurationException;
	import javax.xml.transform.TransformerFactoryConfigurationError;

	import org.apache.any23.extractor.Extractor;
	import org.apache.any23.extractor.IssueReport;
	import org.apache.any23.extractor.ExtractionContext;
	import org.apache.any23.extractor.ExtractorDescription;
	import org.apache.any23.plugin.Author;
	import org.apache.any23.rdf.RDFUtils;
	import org.apache.any23.util.StreamUtils;
	import org.apache.tika.Tika;
	import org.apache.tika.exception.TikaException;
	import org.eclipse.rdf4j.model.IRI;
	import org.eclipse.rdf4j.model.Resource;
	import org.eclipse.rdf4j.model.Value;
	import org.eclipse.rdf4j.model.vocabulary.RDF;
	import org.eclipse.rdf4j.model.vocabulary.RDFS;
	import org.apache.any23.extractor.ExtractionException;
	import org.apache.any23.extractor.ExtractionParameters;
	import org.apache.any23.extractor.ExtractionResult;

	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;
	import org.w3c.dom.Document;

	import edu.knowitall.openie.Argument;
	import edu.knowitall.openie.Instance;
	import edu.knowitall.openie.OpenIE;
	import edu.knowitall.tool.parse.ClearParser;
	import edu.knowitall.tool.postag.ClearPostagger;
	import edu.knowitall.tool.srl.ClearSrl;
	import edu.knowitall.tool.tokenize.ClearTokenizer;
	import scala.collection.JavaConversions;
	import scala.collection.Seq;

	/**
	* An <a href="https://github.com/allenai/openie-standalone">OpenIE</a>
	* extractor able to generate <i>RDF</i> statements from
	* sentences representing relations in the text.
	*/
	@Author(name="Lewis John McGibbney (lewismc@apache.org)")
	public class OpenIEExtractor implements Extractor.TagSoupDOMExtractor {

	private static final Logger LOG = LoggerFactory.getLogger(OpenIEExtractor.class);

	/**
	* default constructor
	*/
	public OpenIEExtractor() {
	// default constructor
	}

	/**
	* @see org.apache.any23.extractor.Extractor#getDescription()
	*/
	@Override
	public ExtractorDescription getDescription() {
	return OpenIEExtractorFactory.getDescriptionInstance();
	}

	@Override
	public void run(ExtractionParameters extractionParameters,
	ExtractionContext context, Document in, ExtractionResult out)
	throws IOException, ExtractionException {

	Runtime runtime = Runtime.getRuntime();
	long maxMemory = runtime.maxMemory();
	//free up as much memory as possible before performing this calculation
	runtime.gc();
	long usedMemory = Math.max(0L, runtime.totalMemory() - runtime.freeMemory());
	long availableMemory = maxMemory - usedMemory;
	if (availableMemory < 4294967296L) {
	out.notifyIssue(IssueReport.IssueLevel.FATAL,
	"Not enough heap space available to perform OpenIE extraction: "
	+ (availableMemory/1048576L) + "/" + (maxMemory / 1048576L)
	+ " MB. Requires 4096 MB.", -1, -1);
	LOG.error("Increase JVM heap size when running OpenIE extractor. max=" + maxMemory + "; available=" + availableMemory);
	return;
	}

	IRI documentIRI = context.getDocumentIRI();
	RDFUtils.iri(documentIRI.toString() + "root");
	out.writeNamespace(RDF.PREFIX, RDF.NAMESPACE);
	out.writeNamespace(RDFS.PREFIX, RDFS.NAMESPACE);
	LOG.debug("Processing: {}", documentIRI.toString());

	OpenIE openIE = new OpenIE(
	new ClearParser(
	new ClearPostagger(
	new ClearTokenizer())), new ClearSrl(), false, false);

	Seq<Instance> extractions = null;
	Tika tika = new Tika();
	try {
	extractions = openIE.extract(tika.parseToString(StreamUtils.documentToInputStream(in)));
	} catch (TransformerConfigurationException \| TransformerFactoryConfigurationError e) {
	LOG.error("Encountered error during OpenIE extraction.", e);
	} catch (TikaException e) {
	LOG.error("Encountered error whilst parsing InputStream with Tika.", e);
	}

	List<Instance> listExtractions = JavaConversions.seqAsJavaList(extractions);
	// for each extraction instance we can obtain a number of extraction elements
	// instance.confidence() - a confidence value for the extraction itself
	// instance.extr().context() - an optional representation of the context for this extraction
	// instance.extr().arg1().text() - subject
	// instance.extr().rel().text() - predicate
	// instance.extr().arg2s().text() - object
	String thresholdString;
	try {
	thresholdString = extractionParameters.getProperty("any23.extraction.openie.confidence.threshold");
	} catch (RuntimeException e) {
	thresholdString = null;
	}
	double threshold = thresholdString == null ? 0.5 : Double.parseDouble(thresholdString);
	for(Instance instance : listExtractions) {
	if (instance.confidence() > threshold) {
	List<Argument> listArg2s = JavaConversions.seqAsJavaList(instance.extr().arg2s());
	for(Argument argument : listArg2s) {
	Resource subject = RDFUtils.makeIRI(instance.extr().arg1().text(), documentIRI);
	IRI predicate = (IRI) RDFUtils.makeIRI(instance.extr().rel().text(), documentIRI);
	Value object = RDFUtils.toValue(argument.text());
	out.writeTriple(subject, predicate, object);
	}
	}
	}
	}
	}