blob: 7fc32a8db0158310e2fc8a0cf88a8dbbc73e227d [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.any23;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.charset.Charset;
import java.util.Set;
import java.util.TreeSet;
import java.util.Collections;
import java.util.Arrays;
import org.apache.any23.Any23;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.writer.BenchmarkTripleHandler;
import org.apache.any23.writer.NTriplesWriter;
import org.apache.any23.writer.TripleHandler;
import org.apache.any23.writer.TripleHandlerException;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.protocol.Content;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.DocumentFragment;
/**
* <p>This implementation of {@link org.apache.nutch.parse.HtmlParseFilter}
* uses the <a href="https://any23.apache.org/">Apache Any23</a> library
* for parsing and extracting structured data in RDF format from a
* variety of Web documents. The supported formats can be found at <a href="https://any23.apache.org/">Apache Any23</a>.
* <p>In this implementation triples are written as Notation3, e.g.
* <code>&lt;http://www.bbc.co.uk/news/scotland/&gt; &lt;http://iptc.org/std/rNews/2011-10-07#datePublished&gt; "2014/03/31 13:53:03"@en-gb .</code>
* Triples are identified within the output triple stream by the presence of '\n'.
* The presence of the '\n' is a characteristic specific to the N3 serialization in Any23.
* In order to use other writers implementing the
* <a href="https://any23.apache.org/apidocs/index.html?org/apache/any23/writer/TripleHandler.html">TripleHandler</a>
* interface, we will most likely need to identify an alternative data characteristic
* which we can use to split triple streams.</p>
*/
public class Any23ParseFilter implements HtmlParseFilter {

  /** Logging instance */
  public static final Logger LOG = LoggerFactory.getLogger(Any23ParseFilter.class);

  /** Configuration supplied through {@link #setConf(Configuration)}. */
  private Configuration conf = null;

  /**
   * Constant identifier used as a Key for writing and reading
   * triples to and from the metadata Map field.
   */
  public static final String ANY23_TRIPLES = "Any23-Triples";

  /** Configuration key holding the names of the Any23 extractors to run. */
  public static final String ANY_23_EXTRACTORS_CONF = "any23.extractors";

  /** Configuration key holding the Content-Types this filter will process. */
  public static final String ANY_23_CONTENT_TYPES_CONF = "any23.content_types";

  /**
   * Runs Any23 over a single document at construction time and keeps the
   * resulting serialized triples, one {@link String} per triple.
   */
  private static class Any23Parser {

    /** Sorted, de-duplicated triples collected by {@code parse}. */
    private final Set<String> triples = new TreeSet<>();

    /**
     * Parses the given document immediately.
     *
     * @param url the document URL handed to Any23
     * @param htmlContent the document content
     * @param contentType the Content-Type handed to Any23
     * @param extractorNames names of the Any23 extractors to use
     * @throws TripleHandlerException if a triple handler fails (e.g. on close)
     * @throws RuntimeException if parsing reports a {@link URISyntaxException}
     */
    Any23Parser(String url, String htmlContent, String contentType, String... extractorNames) throws TripleHandlerException {
      try {
        parse(url, htmlContent, contentType, extractorNames);
      } catch (URISyntaxException e) {
        LOG.error("Error parsing URI: {}", url, e);
        // Keep the original exception as the cause so the stack trace is not lost.
        throw new RuntimeException(e.getReason(), e);
      } catch (IOException e) {
        // Best effort: keep whatever triples were collected, but report through
        // the logger rather than printing to stderr.
        LOG.error("IOException while parsing {}", url, e);
      }
    }

    /**
     * Maintains a {@link java.util.Set} containing the triples
     * @return a {@link java.util.Set} of triples.
     */
    private Set<String> getTriples() {
      return triples;
    }

    /**
     * Extracts structured data with Any23, serializes it through an
     * {@link NTriplesWriter} into memory and stores every non-empty output
     * line in {@link #triples}. Extraction errors are logged and whatever
     * output was produced up to that point is still collected.
     */
    private void parse(String url, String htmlContent, String contentType, String... extractorNames) throws URISyntaxException, IOException, TripleHandlerException {
      Any23 any23 = new Any23(extractorNames);
      any23.setMIMETypeDetector(null);
      ByteArrayOutputStream baos = new ByteArrayOutputStream();
      try {
        TripleHandler tHandler = new NTriplesWriter(baos);
        BenchmarkTripleHandler bHandler = new BenchmarkTripleHandler(tHandler);
        try {
          any23.extract(htmlContent, url, contentType, "UTF-8", bHandler);
        } catch (IOException e) {
          LOG.error("Error while reading the source", e);
        } catch (ExtractionException e) {
          LOG.error("Error while extracting structured data", e);
        } finally {
          tHandler.close();
          bHandler.close();
        }
        // Only build the benchmark report when it will actually be logged.
        if (LOG.isDebugEnabled()) {
          LOG.debug("Any23 BenchmarkTripleHandler.report: {}", bHandler.report());
        }
        String n3 = baos.toString("UTF-8");
        for (String triple : n3.split("\n")) {
          // "".split("\n") yields a single empty string; never store that
          // (or any other blank line) as if it were a triple.
          if (!triple.isEmpty()) {
            triples.add(triple);
          }
        }
      } catch (IOException e) {
        LOG.error("Unexpected IOException", e);
      }
    }
  }

  /**
   * @return the configuration previously passed to {@link #setConf(Configuration)},
   *         or {@code null} if none has been set
   */
  @Override
  public Configuration getConf() {
    return this.conf;
  }

  /**
   * @param conf the configuration this filter reads its settings from
   */
  @Override
  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  /**
   * Extracts triples from the document with Any23 and adds each one to the
   * parse metadata under {@link #ANY23_TRIPLES}. Documents whose Content-Type
   * is not listed under {@link #ANY_23_CONTENT_TYPES_CONF} (default
   * {@code text/html} and {@code application/xhtml+xml}) are returned
   * untouched. The extractors come from {@link #ANY_23_EXTRACTORS_CONF}
   * (default {@code html-head-meta}). The raw content is decoded as UTF-8.
   *
   * @return the same {@code parseResult} instance that was passed in
   * @throws RuntimeException if the Any23 triple handler fails
   * @see org.apache.nutch.parse.HtmlParseFilter#filter(Content, ParseResult, HTMLMetaTags, DocumentFragment)
   */
  @Override
  public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
    String[] extractorNames = conf.getStrings(ANY_23_EXTRACTORS_CONF, "html-head-meta");
    String[] supportedContentTypes = conf.getStrings(ANY_23_CONTENT_TYPES_CONF, "text/html", "application/xhtml+xml");
    String contentType = content.getContentType();
    if (supportedContentTypes != null && !Arrays.asList(supportedContentTypes).contains(contentType)) {
      LOG.debug("Ignoring document at {} because it has an unsupported Content-Type {}", content.getUrl(), contentType);
      return parseResult;
    }

    // Without a parse entry for this URL there is nowhere to store triples;
    // bail out instead of failing later with a NullPointerException.
    Parse parse = parseResult.get(content.getUrl());
    if (parse == null) {
      LOG.warn("No parse found for {}; skipping Any23 extraction", content.getUrl());
      return parseResult;
    }

    Any23Parser parser;
    try {
      String htmlContent = new String(content.getContent(), Charset.forName("UTF-8"));
      parser = new Any23Parser(content.getUrl(), htmlContent, contentType, extractorNames);
    } catch (TripleHandlerException e) {
      // Chain the cause so the underlying failure stays visible to callers.
      throw new RuntimeException("Error running Any23 parser: " + e.getMessage(), e);
    }

    Metadata metadata = parse.getData().getParseMeta();
    for (String triple : parser.getTriples()) {
      metadata.add(ANY23_TRIPLES, triple);
    }
    return parseResult;
  }
}