src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java - nutch - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.nutch.any23;

 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.net.URISyntaxException;
 import java.nio.charset.Charset;
 import java.util.Set;
 import java.util.TreeSet;
 import java.util.Collections;
 import java.util.Arrays;

 import org.apache.any23.Any23;
 import org.apache.any23.extractor.ExtractionException;
 import org.apache.any23.writer.BenchmarkTripleHandler;
 import org.apache.any23.writer.NTriplesWriter;
 import org.apache.any23.writer.TripleHandler;
 import org.apache.any23.writer.TripleHandlerException;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.parse.HTMLMetaTags;
 import org.apache.nutch.parse.HtmlParseFilter;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseResult;
 import org.apache.nutch.protocol.Content;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.w3c.dom.DocumentFragment;

 /**
  * <p>This implementation of {@link org.apache.nutch.parse.HtmlParseFilter}
  * uses the <a href="https://any23.apache.org/">Apache Any23</a> library
  * to parse and extract structured data in RDF format from a
  * variety of Web documents. The supported formats are listed on the
  * <a href="https://any23.apache.org/">Apache Any23</a> website.</p>
  * <p>In this implementation triples are written as N-Triples (a line-based
  * subset of Notation3), e.g.
  * {@code <http://www.bbc.co.uk/news/scotland/> <http://iptc.org/std/rNews/2011-10-07#datePublished> "2014/03/31 13:53:03"@en-gb .}
  * and individual triples are identified within the output triple stream by the presence of '\n'.
  * Terminating every triple with '\n' is a characteristic specific to the
  * N-Triples serialization produced by the Any23 {@code NTriplesWriter}.
  * In order to use other writers implementing the
  * <a href="https://any23.apache.org/apidocs/index.html?org/apache/any23/writer/TripleHandler.html">TripleHandler</a>
  * interface, we will most likely need to identify an alternative data characteristic
  * which we can use to split triple streams.</p>
  */
 public class Any23ParseFilter implements HtmlParseFilter {

   /** Logging instance */
   public static final Logger LOG = LoggerFactory.getLogger(Any23ParseFilter.class);

   /**
    * Constant identifier used as a Key for writing and reading
    * triples to and from the metadata Map field.
    */
   public static final String ANY23_TRIPLES = "Any23-Triples";

   /** Configuration property holding the names of the Any23 extractors to run. */
   public static final String ANY_23_EXTRACTORS_CONF = "any23.extractors";

   /** Configuration property holding the Content-Types this filter will process. */
   public static final String ANY_23_CONTENT_TYPES_CONF = "any23.content_types";

   /** Encoding used to decode the fetched content and to read back the serialized triples. */
   private static final String ENCODING = "UTF-8";

   private Configuration conf = null;

   /**
    * Runs Any23 over a single document and collects the serialized triples,
    * one {@link String} per line of writer output.
    */
   private static class Any23Parser {

     /** Extracted triples; a {@link TreeSet} keeps them sorted and de-duplicated. */
     private final Set<String> triples = new TreeSet<>();

     /**
      * Extracts the triples of one document as part of construction.
      *
      * @param url URL of the document, handed to Any23 as the document URI
      * @param htmlContent the document content as text
      * @param contentType Content-Type handed to Any23
      * @param extractorNames names of the Any23 extractors to run
      * @throws TripleHandlerException if a triple handler cannot be closed
      * @throws RuntimeException wrapping a {@link URISyntaxException} raised for <code>url</code>
      */
     Any23Parser(String url, String htmlContent, String contentType, String... extractorNames) throws TripleHandlerException {
       try {
         parse(url, htmlContent, contentType, extractorNames);
       } catch (URISyntaxException e) {
         LOG.error("Error parsing URI: {}", url, e);
         // Keep the original exception as the cause so the stack trace is not lost.
         throw new RuntimeException(e.getReason(), e);
       } catch (IOException e) {
         // Best effort: report through the logger (not stderr) and keep whatever was collected.
         LOG.error("Error while extracting triples from {}", url, e);
       }
     }

     /**
      * Maintains a {@link java.util.Set} containing the triples
      * @return a {@link java.util.Set} of triples.
      */
     private Set<String> getTriples() {
       return triples;
     }

     /**
      * Runs the extraction and adds every non-empty output line to {@link #triples}.
      * Extraction errors are logged and do not abort the crawl; whatever the writer
      * produced up to that point is still collected.
      */
     private void parse(String url, String htmlContent, String contentType, String... extractorNames) throws URISyntaxException, IOException, TripleHandlerException {
       Any23 any23 = new Any23(extractorNames);
       // No MIME type detector is installed; the Content-Type is passed explicitly to extract().
       any23.setMIMETypeDetector(null);
       ByteArrayOutputStream baos = new ByteArrayOutputStream();
       try {
         TripleHandler tHandler = new NTriplesWriter(baos);
         BenchmarkTripleHandler bHandler = new BenchmarkTripleHandler(tHandler);
         try {
           any23.extract(htmlContent, url, contentType, ENCODING, bHandler);
         } catch (IOException e) {
           LOG.error("Error while reading the source", e);
         } catch (ExtractionException e) {
           LOG.error("Error while extracting structured data", e);
         } finally {
           tHandler.close();
           bHandler.close();
         }

         // Only build the benchmark report when it is actually going to be logged.
         if (LOG.isDebugEnabled()) {
           LOG.debug("Any23 BenchmarkTripleHandler.report: {}", bHandler.report());
         }

         // One triple per line. "".split("\n") yields a single empty string, so
         // empty lines are skipped to avoid recording a bogus empty triple.
         for (String triple : baos.toString(ENCODING).split("\n")) {
           if (!triple.isEmpty()) {
             triples.add(triple);
           }
         }
       } catch (IOException e) {
         LOG.error("Unexpected IOException", e);
       }
     }
   }

   /** @return the configuration previously supplied through {@link #setConf(Configuration)} */
   @Override
   public Configuration getConf() {
     return this.conf;
   }

   /** @param conf configuration from which the extractor names and supported Content-Types are read */
   @Override
   public void setConf(Configuration conf) {
     this.conf = conf;
   }

   /**
    * Extracts triples from the content with Any23 and adds each of them to the
    * parse metadata under {@link #ANY23_TRIPLES}. Documents whose Content-Type is
    * not among the configured types, or that have no parse registered under their
    * URL, are returned untouched.
    *
    * @param content the fetched content
    * @param parseResult the parse result to enrich
    * @param metaTags not used by this filter
    * @param doc not used by this filter
    * @return the given <code>parseResult</code>
    * @throws RuntimeException if the Any23 triple handlers fail
    * @see org.apache.nutch.parse.HtmlParseFilter#filter(Content, ParseResult, HTMLMetaTags, DocumentFragment)
    */
   @Override
   public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
     String[] extractorNames = conf.getStrings(ANY_23_EXTRACTORS_CONF, "html-head-meta");
     String[] supportedContentTypes = conf.getStrings(ANY_23_CONTENT_TYPES_CONF, "text/html", "application/xhtml+xml");
     String contentType = content.getContentType();
     if (supportedContentTypes != null && !Arrays.asList(supportedContentTypes).contains(contentType)) {
       LOG.debug("Ignoring document at {} because it has an unsupported Content-Type {}", content.getUrl(), contentType);
       return parseResult;
     }

     // Without a parse for this URL there is nowhere to store triples, so skip the extraction.
     Parse parse = parseResult.get(content.getUrl());
     if (parse == null) {
       LOG.warn("No parse found for {}, skipping Any23 extraction", content.getUrl());
       return parseResult;
     }

     Any23Parser parser;
     try {
       String htmlContent = new String(content.getContent(), Charset.forName(ENCODING));
       parser = new Any23Parser(content.getUrl(), htmlContent, contentType, extractorNames);
     } catch (TripleHandlerException e) {
       throw new RuntimeException("Error running Any23 parser: " + e.getMessage(), e);
     }

     Metadata metadata = parse.getData().getParseMeta();
     for (String triple : parser.getTriples()) {
       metadata.add(ANY23_TRIPLES, triple);
     }

     return parseResult;
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.nutch.any23;

	import java.io.ByteArrayOutputStream;
	import java.io.IOException;
	import java.net.URISyntaxException;
	import java.nio.charset.Charset;
	import java.util.Set;
	import java.util.TreeSet;
	import java.util.Collections;
	import java.util.Arrays;

	import org.apache.any23.Any23;
	import org.apache.any23.extractor.ExtractionException;
	import org.apache.any23.writer.BenchmarkTripleHandler;
	import org.apache.any23.writer.NTriplesWriter;
	import org.apache.any23.writer.TripleHandler;
	import org.apache.any23.writer.TripleHandlerException;
	import org.apache.hadoop.conf.Configuration;
	import org.apache.nutch.metadata.Metadata;
	import org.apache.nutch.parse.HTMLMetaTags;
	import org.apache.nutch.parse.HtmlParseFilter;
	import org.apache.nutch.parse.Parse;
	import org.apache.nutch.parse.ParseResult;
	import org.apache.nutch.protocol.Content;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;
	import org.w3c.dom.DocumentFragment;

	/**
	 * <p>This implementation of {@link org.apache.nutch.parse.HtmlParseFilter}
	 * uses the <a href="https://any23.apache.org/">Apache Any23</a> library
	 * to parse and extract structured data in RDF format from a
	 * variety of Web documents. The supported formats are listed on the
	 * <a href="https://any23.apache.org/">Apache Any23</a> website.</p>
	 * <p>In this implementation triples are written as N-Triples (a line-based
	 * subset of Notation3), e.g.
	 * {@code <http://www.bbc.co.uk/news/scotland/> <http://iptc.org/std/rNews/2011-10-07#datePublished> "2014/03/31 13:53:03"@en-gb .}
	 * and individual triples are identified within the output triple stream by the presence of '\n'.
	 * Terminating every triple with '\n' is a characteristic specific to the
	 * N-Triples serialization produced by the Any23 {@code NTriplesWriter}.
	 * In order to use other writers implementing the
	 * <a href="https://any23.apache.org/apidocs/index.html?org/apache/any23/writer/TripleHandler.html">TripleHandler</a>
	 * interface, we will most likely need to identify an alternative data characteristic
	 * which we can use to split triple streams.</p>
	 */
	public class Any23ParseFilter implements HtmlParseFilter {

	  /** Logging instance */
	  public static final Logger LOG = LoggerFactory.getLogger(Any23ParseFilter.class);

	  /**
	   * Constant identifier used as a Key for writing and reading
	   * triples to and from the metadata Map field.
	   */
	  public static final String ANY23_TRIPLES = "Any23-Triples";

	  /** Configuration property holding the names of the Any23 extractors to run. */
	  public static final String ANY_23_EXTRACTORS_CONF = "any23.extractors";

	  /** Configuration property holding the Content-Types this filter will process. */
	  public static final String ANY_23_CONTENT_TYPES_CONF = "any23.content_types";

	  /** Encoding used to decode the fetched content and to read back the serialized triples. */
	  private static final String ENCODING = "UTF-8";

	  private Configuration conf = null;

	  /**
	   * Runs Any23 over a single document and collects the serialized triples,
	   * one {@link String} per line of writer output.
	   */
	  private static class Any23Parser {

	    /** Extracted triples; a {@link TreeSet} keeps them sorted and de-duplicated. */
	    private final Set<String> triples = new TreeSet<>();

	    /**
	     * Extracts the triples of one document as part of construction.
	     *
	     * @param url URL of the document, handed to Any23 as the document URI
	     * @param htmlContent the document content as text
	     * @param contentType Content-Type handed to Any23
	     * @param extractorNames names of the Any23 extractors to run
	     * @throws TripleHandlerException if a triple handler cannot be closed
	     * @throws RuntimeException wrapping a {@link URISyntaxException} raised for <code>url</code>
	     */
	    Any23Parser(String url, String htmlContent, String contentType, String... extractorNames) throws TripleHandlerException {
	      try {
	        parse(url, htmlContent, contentType, extractorNames);
	      } catch (URISyntaxException e) {
	        LOG.error("Error parsing URI: {}", url, e);
	        // Keep the original exception as the cause so the stack trace is not lost.
	        throw new RuntimeException(e.getReason(), e);
	      } catch (IOException e) {
	        // Best effort: report through the logger (not stderr) and keep whatever was collected.
	        LOG.error("Error while extracting triples from {}", url, e);
	      }
	    }

	    /**
	     * Maintains a {@link java.util.Set} containing the triples
	     * @return a {@link java.util.Set} of triples.
	     */
	    private Set<String> getTriples() {
	      return triples;
	    }

	    /**
	     * Runs the extraction and adds every non-empty output line to {@link #triples}.
	     * Extraction errors are logged and do not abort the crawl; whatever the writer
	     * produced up to that point is still collected.
	     */
	    private void parse(String url, String htmlContent, String contentType, String... extractorNames) throws URISyntaxException, IOException, TripleHandlerException {
	      Any23 any23 = new Any23(extractorNames);
	      // No MIME type detector is installed; the Content-Type is passed explicitly to extract().
	      any23.setMIMETypeDetector(null);
	      ByteArrayOutputStream baos = new ByteArrayOutputStream();
	      try {
	        TripleHandler tHandler = new NTriplesWriter(baos);
	        BenchmarkTripleHandler bHandler = new BenchmarkTripleHandler(tHandler);
	        try {
	          any23.extract(htmlContent, url, contentType, ENCODING, bHandler);
	        } catch (IOException e) {
	          LOG.error("Error while reading the source", e);
	        } catch (ExtractionException e) {
	          LOG.error("Error while extracting structured data", e);
	        } finally {
	          tHandler.close();
	          bHandler.close();
	        }

	        // Only build the benchmark report when it is actually going to be logged.
	        if (LOG.isDebugEnabled()) {
	          LOG.debug("Any23 BenchmarkTripleHandler.report: {}", bHandler.report());
	        }

	        // One triple per line. "".split("\n") yields a single empty string, so
	        // empty lines are skipped to avoid recording a bogus empty triple.
	        for (String triple : baos.toString(ENCODING).split("\n")) {
	          if (!triple.isEmpty()) {
	            triples.add(triple);
	          }
	        }
	      } catch (IOException e) {
	        LOG.error("Unexpected IOException", e);
	      }
	    }
	  }

	  /** @return the configuration previously supplied through {@link #setConf(Configuration)} */
	  @Override
	  public Configuration getConf() {
	    return this.conf;
	  }

	  /** @param conf configuration from which the extractor names and supported Content-Types are read */
	  @Override
	  public void setConf(Configuration conf) {
	    this.conf = conf;
	  }

	  /**
	   * Extracts triples from the content with Any23 and adds each of them to the
	   * parse metadata under {@link #ANY23_TRIPLES}. Documents whose Content-Type is
	   * not among the configured types, or that have no parse registered under their
	   * URL, are returned untouched.
	   *
	   * @param content the fetched content
	   * @param parseResult the parse result to enrich
	   * @param metaTags not used by this filter
	   * @param doc not used by this filter
	   * @return the given <code>parseResult</code>
	   * @throws RuntimeException if the Any23 triple handlers fail
	   * @see org.apache.nutch.parse.HtmlParseFilter#filter(Content, ParseResult, HTMLMetaTags, DocumentFragment)
	   */
	  @Override
	  public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
	    String[] extractorNames = conf.getStrings(ANY_23_EXTRACTORS_CONF, "html-head-meta");
	    String[] supportedContentTypes = conf.getStrings(ANY_23_CONTENT_TYPES_CONF, "text/html", "application/xhtml+xml");
	    String contentType = content.getContentType();
	    if (supportedContentTypes != null && !Arrays.asList(supportedContentTypes).contains(contentType)) {
	      LOG.debug("Ignoring document at {} because it has an unsupported Content-Type {}", content.getUrl(), contentType);
	      return parseResult;
	    }

	    // Without a parse for this URL there is nowhere to store triples, so skip the extraction.
	    Parse parse = parseResult.get(content.getUrl());
	    if (parse == null) {
	      LOG.warn("No parse found for {}, skipping Any23 extraction", content.getUrl());
	      return parseResult;
	    }

	    Any23Parser parser;
	    try {
	      String htmlContent = new String(content.getContent(), Charset.forName(ENCODING));
	      parser = new Any23Parser(content.getUrl(), htmlContent, contentType, extractorNames);
	    } catch (TripleHandlerException e) {
	      throw new RuntimeException("Error running Any23 parser: " + e.getMessage(), e);
	    }

	    Metadata metadata = parse.getData().getParseMeta();
	    for (String triple : parser.getTriples()) {
	      metadata.add(ANY23_TRIPLES, triple);
	    }

	    return parseResult;
	  }
	}