blob: d3df6996c2c794879be2a663996ab85c4857b086 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.parse.tika;
import org.apache.avro.util.Utf8;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.html.dom.HTMLDocumentImpl;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.*;
import org.apache.nutch.storage.ParseStatus;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.storage.WebPage.Field;
import org.apache.nutch.util.Bytes;
import org.apache.nutch.util.MimeUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.TableUtil;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.html.HtmlMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.DocumentFragment;
import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
/**
 * Wrapper for Tika parsers. Mimics the HTMLParser but uses the XHTML
 * representation returned by Tika as SAX events: the events are collected
 * into a DOM fragment which is then handed to the usual Nutch HTML utilities
 * (meta tag processing, text/title/outlink extraction and parse filters).
 */
public class TikaParser implements org.apache.nutch.parse.Parser {

  public static final Logger LOG = LoggerFactory.getLogger(TikaParser.class);

  /** WebPage fields reported by {@link #getFields()}. */
  private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();

  static {
    FIELDS.add(WebPage.Field.BASE_URL);
    FIELDS.add(WebPage.Field.CONTENT_TYPE);
  }

  private Configuration conf;
  private TikaConfig tikaConfig = null;
  private DOMContentUtils utils;
  private ParseFilters htmlParseFilters;
  private String cachingPolicy;
  /** Optional custom mapper, set only when "tika.htmlmapper.classname" is configured. */
  private HtmlMapper htmlMapper;

  /**
   * Parses the content of a page with the Tika parser registered for the
   * page's content type.
   *
   * @param url
   *          URL of the page; used for log messages and passed to the parse
   *          filters
   * @param page
   *          page providing the base URL, content type and raw content; its
   *          metadata map is updated with the metadata reported by Tika and,
   *          when the page forbids caching, with the caching policy
   * @return the (filtered) parse, or an empty parse when the base URL is
   *         malformed, the content type or content is missing, no Tika parser
   *         is registered for the content type, or Tika fails on the content
   */
  @Override
  public Parse getParse(String url, WebPage page) {
    String baseUrl = TableUtil.toString(page.getBaseUrl());
    URL base;
    try {
      base = new URL(baseUrl);
    } catch (MalformedURLException e) {
      return ParseStatusUtils.getEmptyParse(e, getConf());
    }

    // Without a content type there is no clue for selecting a parser; fail
    // with an empty parse instead of a NullPointerException.
    CharSequence contentType = page.getContentType();
    if (contentType == null) {
      String message = "No content type available for " + url;
      LOG.error(message);
      return ParseStatusUtils.getEmptyParse(ParseStatusCodes.FAILED_EXCEPTION,
          message, getConf());
    }

    // get the right parser using the mime type as a clue
    String mimeType = contentType.toString();
    CompositeParser compositeParser = (CompositeParser) tikaConfig.getParser();
    Parser parser = compositeParser.getParsers().get(MediaType.parse(mimeType));
    if (parser == null) {
      String message = "Can't retrieve Tika parser for mime-type " + mimeType;
      LOG.error(message);
      return ParseStatusUtils.getEmptyParse(ParseStatusCodes.FAILED_EXCEPTION,
          message, getConf());
    }
    LOG.debug("Using Tika parser {} for mime-type {}", parser.getClass()
        .getName(), mimeType);

    ByteBuffer raw = page.getContent();
    if (raw == null) {
      String message = "No content available for " + url;
      LOG.error(message);
      return ParseStatusUtils.getEmptyParse(ParseStatusCodes.FAILED_EXCEPTION,
          message, getConf());
    }

    Metadata tikamd = new Metadata();
    HTMLDocumentImpl doc = new HTMLDocumentImpl();
    doc.setErrorChecking(false);
    DocumentFragment root = doc.createDocumentFragment();
    DOMBuilder domhandler = new DOMBuilder(doc, root);
    ParseContext context = new ParseContext();
    if (htmlMapper != null) {
      context.set(HtmlMapper.class, htmlMapper);
    }
    // to add once available in Tika
    // context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
    tikamd.set(Metadata.CONTENT_TYPE, mimeType);
    try {
      // Expose only the readable window of the buffer (position..limit).
      // raw.array() is evaluated inside the try so that a buffer without an
      // accessible backing array also ends up as an empty parse.
      parser.parse(new ByteArrayInputStream(raw.array(), raw.arrayOffset()
          + raw.position(), raw.remaining()), domhandler, tikamd, context);
    } catch (Exception e) {
      LOG.error("Error parsing " + url, e);
      return ParseStatusUtils.getEmptyParse(e, getConf());
    }

    HTMLMetaTags metaTags = new HTMLMetaTags();
    String text = "";
    String title = "";
    Outlink[] outlinks = new Outlink[0];

    // we have converted the sax events generated by Tika into a DOM object
    // so we can now use the usual HTML resources from Nutch

    // get meta directives
    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
    if (LOG.isTraceEnabled()) {
      LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
    }

    // check meta directives
    if (!metaTags.getNoIndex()) { // okay to index
      StringBuffer sb = new StringBuffer();
      if (LOG.isTraceEnabled()) {
        LOG.trace("Getting text...");
      }
      utils.getText(sb, root); // extract text
      text = sb.toString();
      sb.setLength(0); // reuse the same buffer for the title
      if (LOG.isTraceEnabled()) {
        LOG.trace("Getting title...");
      }
      utils.getTitle(sb, root); // extract title
      title = sb.toString().trim();
    }

    if (!metaTags.getNoFollow()) { // okay to follow links
      ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
      URL baseTag = utils.getBase(root);
      if (LOG.isTraceEnabled()) {
        LOG.trace("Getting links...");
      }
      // a base URL found in the document takes precedence over the page's
      utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
      outlinks = l.toArray(new Outlink[l.size()]);
      if (LOG.isTraceEnabled()) {
        LOG.trace("found " + outlinks.length + " outlinks in " + base);
      }
    }

    // populate Nutch metadata with Tika metadata
    copyTikaMetadata(tikamd, page);

    // no outlinks? try OutlinkExtractor e.g works for mime types where no
    // explicit markup for anchors
    if (outlinks.length == 0) {
      outlinks = OutlinkExtractor.getOutlinks(text, getConf());
    }

    ParseStatus status = buildStatus(metaTags);

    Parse parse = new Parse(text, title, outlinks, status);
    parse = htmlParseFilters.filter(url, page, parse, metaTags, root);

    if (metaTags.getNoCache()) { // not okay to cache
      page.getMetadata().put(new Utf8(Nutch.CACHING_FORBIDDEN_KEY),
          ByteBuffer.wrap(Bytes.toBytes(cachingPolicy)));
    }
    return parse;
  }

  /**
   * Copies the metadata reported by Tika into the page metadata, skipping the
   * title entry. Only the value returned by {@code Metadata.get(name)} is
   * stored for each name.
   */
  private static void copyTikaMetadata(Metadata tikamd, WebPage page) {
    String titleName = TikaCoreProperties.TITLE.toString();
    for (String tikaMDName : tikamd.names()) {
      if (tikaMDName.equalsIgnoreCase(titleName)) {
        continue;
      }
      // TODO what if multivalued?
      String value = tikamd.get(tikaMDName);
      if (value == null) {
        // nothing to store for this name
        continue;
      }
      page.getMetadata().put(new Utf8(tikaMDName),
          ByteBuffer.wrap(Bytes.toBytes(value)));
    }
  }

  /**
   * Builds the status of a successful parse. A new instance is created for
   * every call: the status is modified when the page carries a meta refresh,
   * so the static ParseStatusUtils.STATUS_SUCCESS instance must not be used
   * here, otherwise the redirect code and arguments of one page would show up
   * in the status of every later page.
   */
  private static ParseStatus buildStatus(HTMLMetaTags metaTags) {
    ParseStatus status = ParseStatus.newBuilder().build();
    status.setMajorCode((int) ParseStatusCodes.SUCCESS);
    if (metaTags.getRefresh()) {
      status.setMinorCode((int) ParseStatusCodes.SUCCESS_REDIRECT);
      status.getArgs().add(new Utf8(metaTags.getRefreshHref().toString()));
      status.getArgs().add(
          new Utf8(Integer.toString(metaTags.getRefreshTime())));
    }
    return status;
  }

  /**
   * Stores the configuration and (re)initialises everything derived from it:
   * the default Tika configuration, the optional custom HtmlMapper named by
   * "tika.htmlmapper.classname", the parse filters, the DOM utilities and the
   * caching policy ("parser.caching.forbidden.policy").
   *
   * @param conf
   *          configuration to use
   * @throws RuntimeException
   *           if the default Tika configuration cannot be loaded or the
   *           configured HtmlMapper class cannot be instantiated
   */
  public void setConf(Configuration conf) {
    this.conf = conf;
    this.tikaConfig = null;
    try {
      tikaConfig = TikaConfig.getDefaultConfig();
    } catch (Exception e2) {
      String message = "Problem loading default Tika configuration";
      LOG.error(message, e2);
      throw new RuntimeException(message, e2);
    }

    // use a custom htmlmapper
    String htmlmapperClassName = conf.get("tika.htmlmapper.classname");
    if (StringUtils.isNotBlank(htmlmapperClassName)) {
      htmlMapper = instantiateHtmlMapper(htmlmapperClassName);
    }

    this.htmlParseFilters = new ParseFilters(getConf());
    this.utils = new DOMContentUtils(conf);
    this.cachingPolicy = getConf().get("parser.caching.forbidden.policy",
        Nutch.CACHING_FORBIDDEN_CONTENT);
  }

  /**
   * Loads and instantiates the HtmlMapper implementation with the given class
   * name. Any failure is rethrown as a RuntimeException that keeps the
   * original exception as its cause.
   */
  private static HtmlMapper instantiateHtmlMapper(String htmlmapperClassName) {
    try {
      Class<?> mapperClass = Class.forName(htmlmapperClassName);
      if (!HtmlMapper.class.isAssignableFrom(mapperClass)) {
        throw new RuntimeException("Class " + htmlmapperClassName
            + " does not implement HtmlMapper");
      }
      return (HtmlMapper) mapperClass.newInstance();
    } catch (Exception e) {
      String message = "Can't generate instance for class "
          + htmlmapperClassName;
      LOG.error(message, e);
      throw new RuntimeException(message, e);
    }
  }

  /** Returns the Tika configuration loaded by {@link #setConf(Configuration)}. */
  public TikaConfig getTikaConfig() {
    return this.tikaConfig;
  }

  /** Returns the configuration passed to {@link #setConf(Configuration)}. */
  public Configuration getConf() {
    return this.conf;
  }

  /** Returns the WebPage fields declared by this parser. */
  @Override
  public Collection<Field> getFields() {
    return FIELDS;
  }

  /**
   * Debugging entry point: reads the file named by the first argument, parses
   * it through ParseUtil and prints the content type, title, text and
   * outlinks.
   */
  public static void main(String[] args) throws Exception {
    if (args.length < 1) {
      System.err.println("Usage: TikaParser <file>");
      return;
    }
    String name = args[0];
    String url = "file:" + name;
    File file = new File(name);
    byte[] bytes = new byte[(int) file.length()];
    DataInputStream in = new DataInputStream(new FileInputStream(file));
    try {
      in.readFully(bytes);
    } finally {
      in.close();
    }
    Configuration conf = NutchConfiguration.create();
    // TikaParser parser = new TikaParser();
    // parser.setConf(conf);
    WebPage page = WebPage.newBuilder().build();
    page.setBaseUrl(new Utf8(url));
    page.setContent(ByteBuffer.wrap(bytes));
    MimeUtil mimeutil = new MimeUtil(conf);
    String mtype = mimeutil.getMimeType(file);
    page.setContentType(new Utf8(mtype));
    // Parse parse = parser.getParse(url, page);
    Parse parse = new ParseUtil(conf).parse(url, page);
    System.out.println("content type: " + mtype);
    System.out.println("title: " + parse.getTitle());
    System.out.println("text: " + parse.getText());
    System.out.println("outlinks: " + Arrays.toString(parse.getOutlinks()));
  }
}