src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java - nutch - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.nutch.parse.tika;

 import org.apache.avro.util.Utf8;
 import org.apache.commons.lang.StringUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.html.dom.HTMLDocumentImpl;
 import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.parse.*;
 import org.apache.nutch.storage.ParseStatus;
 import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.storage.WebPage.Field;
 import org.apache.nutch.util.Bytes;
 import org.apache.nutch.util.MimeUtil;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.TableUtil;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.CompositeParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.html.HtmlMapper;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.w3c.dom.DocumentFragment;

 import java.io.ByteArrayInputStream;
 import java.io.DataInputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.HashSet;

 /**
  * Wrapper for Tika parsers. Mimics the HTMLParser but using the XHTML
  * representation returned by Tika as SAX events
  ***/

 public class TikaParser implements org.apache.nutch.parse.Parser {

   public static final Logger LOG = LoggerFactory.getLogger(TikaParser.class);

   private static Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();

   static {
     FIELDS.add(WebPage.Field.BASE_URL);
     FIELDS.add(WebPage.Field.CONTENT_TYPE);
   }

   private Configuration conf;
   private TikaConfig tikaConfig = null;
   private DOMContentUtils utils;
   private ParseFilters htmlParseFilters;
   private String cachingPolicy;

   private HtmlMapper HTMLMapper;

   @Override
   public Parse getParse(String url, WebPage page) {

     String baseUrl = TableUtil.toString(page.getBaseUrl());
     URL base;
     try {
       base = new URL(baseUrl);
     } catch (MalformedURLException e) {
       return ParseStatusUtils.getEmptyParse(e, getConf());
     }

     // get the right parser using the mime type as a clue
     String mimeType = page.getContentType().toString();
     CompositeParser compositeParser = (CompositeParser) tikaConfig.getParser();
     Parser parser = compositeParser.getParsers().get(MediaType.parse(mimeType));
     ByteBuffer raw = page.getContent();

     if (parser == null) {
       String message = "Can't retrieve Tika parser for mime-type " + mimeType;
       LOG.error(message);
       return ParseStatusUtils.getEmptyParse(ParseStatusCodes.FAILED_EXCEPTION,
           message, getConf());
     }

     LOG.debug("Using Tika parser " + parser.getClass().getName()
         + " for mime-type " + mimeType);

     Metadata tikamd = new Metadata();

     HTMLDocumentImpl doc = new HTMLDocumentImpl();
     doc.setErrorChecking(false);
     DocumentFragment root = doc.createDocumentFragment();
     DOMBuilder domhandler = new DOMBuilder(doc, root);
     ParseContext context = new ParseContext();
     if (HTMLMapper != null)
       context.set(HtmlMapper.class, HTMLMapper);
     // to add once available in Tika
     // context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
     tikamd.set(Metadata.CONTENT_TYPE, mimeType);
     try {
       parser.parse(new ByteArrayInputStream(raw.array(), raw.arrayOffset()
           + raw.position(), raw.remaining()), domhandler, tikamd, context);
     } catch (Exception e) {
       LOG.error("Error parsing " + url, e);
       return ParseStatusUtils.getEmptyParse(e, getConf());
     }

     HTMLMetaTags metaTags = new HTMLMetaTags();
     String text = "";
     String title = "";
     Outlink[] outlinks = new Outlink[0];

     // we have converted the sax events generated by Tika into a DOM object
     // so we can now use the usual HTML resources from Nutch
     // get meta directives
     HTMLMetaProcessor.getMetaTags(metaTags, root, base);
     if (LOG.isTraceEnabled()) {
       LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
     }

     // check meta directives
     if (!metaTags.getNoIndex()) { // okay to index
       StringBuffer sb = new StringBuffer();
       if (LOG.isTraceEnabled()) {
         LOG.trace("Getting text...");
       }
       utils.getText(sb, root); // extract text
       text = sb.toString();
       sb.setLength(0);
       if (LOG.isTraceEnabled()) {
         LOG.trace("Getting title...");
       }
       utils.getTitle(sb, root); // extract title
       title = sb.toString().trim();
     }

     if (!metaTags.getNoFollow()) { // okay to follow links
       ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
       URL baseTag = utils.getBase(root);
       if (LOG.isTraceEnabled()) {
         LOG.trace("Getting links...");
       }
       utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
       outlinks = l.toArray(new Outlink[l.size()]);
       if (LOG.isTraceEnabled()) {
         LOG.trace("found " + outlinks.length + " outlinks in " + base);
       }
     }

     // populate Nutch metadata with Tika metadata
     String[] TikaMDNames = tikamd.names();
     for (String tikaMDName : TikaMDNames) {
       if (tikaMDName.equalsIgnoreCase(TikaCoreProperties.TITLE.toString()))
         continue;
       // TODO what if multivalued?
       page.getMetadata().put(new Utf8(tikaMDName),
           ByteBuffer.wrap(Bytes.toBytes(tikamd.get(tikaMDName))));
     }

     // no outlinks? try OutlinkExtractor e.g works for mime types where no
     // explicit markup for anchors

     if (outlinks.length == 0) {
       outlinks = OutlinkExtractor.getOutlinks(text, getConf());
     }

     ParseStatus status = ParseStatusUtils.STATUS_SUCCESS;
     if (metaTags.getRefresh()) {
       status.setMinorCode((int) ParseStatusCodes.SUCCESS_REDIRECT);
       status.getArgs().add(new Utf8(metaTags.getRefreshHref().toString()));
       status.getArgs().add(
           new Utf8(Integer.toString(metaTags.getRefreshTime())));
     }

     Parse parse = new Parse(text, title, outlinks, status);
     parse = htmlParseFilters.filter(url, page, parse, metaTags, root);

     if (metaTags.getNoCache()) { // not okay to cache
       page.getMetadata().put(new Utf8(Nutch.CACHING_FORBIDDEN_KEY),
           ByteBuffer.wrap(Bytes.toBytes(cachingPolicy)));
     }

     return parse;
   }

   public void setConf(Configuration conf) {
     this.conf = conf;
     this.tikaConfig = null;

     try {
       tikaConfig = TikaConfig.getDefaultConfig();
     } catch (Exception e2) {
       String message = "Problem loading default Tika configuration";
       LOG.error(message, e2);
       throw new RuntimeException(e2);
     }

     // use a custom htmlmapper
     String htmlmapperClassName = conf.get("tika.htmlmapper.classname");
     if (StringUtils.isNotBlank(htmlmapperClassName)) {
       try {
         Class HTMLMapperClass = Class.forName(htmlmapperClassName);
         boolean interfaceOK = HtmlMapper.class
             .isAssignableFrom(HTMLMapperClass);
         if (!interfaceOK) {
           throw new RuntimeException("Class " + htmlmapperClassName
               + " does not implement HtmlMapper");
         }
         HTMLMapper = (HtmlMapper) HTMLMapperClass.newInstance();
       } catch (Exception e) {
         LOG.error("Can't generate instance for class " + htmlmapperClassName);
         throw new RuntimeException("Can't generate instance for class "
             + htmlmapperClassName);
       }
     }

     this.htmlParseFilters = new ParseFilters(getConf());
     this.utils = new DOMContentUtils(conf);
     this.cachingPolicy = getConf().get("parser.caching.forbidden.policy",
         Nutch.CACHING_FORBIDDEN_CONTENT);
   }

   public TikaConfig getTikaConfig() {
     return this.tikaConfig;
   }

   public Configuration getConf() {
     return this.conf;
   }

   @Override
   public Collection<Field> getFields() {
     return FIELDS;
   }

   // main class used for debuggin
   public static void main(String[] args) throws Exception {
     String name = args[0];
     String url = "file:" + name;
     File file = new File(name);
     byte[] bytes = new byte[(int) file.length()];
     @SuppressWarnings("resource")
     DataInputStream in = new DataInputStream(new FileInputStream(file));
     in.readFully(bytes);
     Configuration conf = NutchConfiguration.create();
     // TikaParser parser = new TikaParser();
     // parser.setConf(conf);
     WebPage page = WebPage.newBuilder().build();
     page.setBaseUrl(new Utf8(url));
     page.setContent(ByteBuffer.wrap(bytes));
     MimeUtil mimeutil = new MimeUtil(conf);
     String mtype = mimeutil.getMimeType(file);
     page.setContentType(new Utf8(mtype));
     // Parse parse = parser.getParse(url, page);

     Parse parse = new ParseUtil(conf).parse(url, page);

     System.out.println("content type: " + mtype);
     System.out.println("title: " + parse.getTitle());
     System.out.println("text: " + parse.getText());
     System.out.println("outlinks: " + Arrays.toString(parse.getOutlinks()));
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.nutch.parse.tika;

	import org.apache.avro.util.Utf8;
	import org.apache.commons.lang.StringUtils;
	import org.apache.hadoop.conf.Configuration;
	import org.apache.html.dom.HTMLDocumentImpl;
	import org.apache.nutch.metadata.Nutch;
	import org.apache.nutch.parse.*;
	import org.apache.nutch.storage.ParseStatus;
	import org.apache.nutch.storage.WebPage;
	import org.apache.nutch.storage.WebPage.Field;
	import org.apache.nutch.util.Bytes;
	import org.apache.nutch.util.MimeUtil;
	import org.apache.nutch.util.NutchConfiguration;
	import org.apache.nutch.util.TableUtil;
	import org.apache.tika.config.TikaConfig;
	import org.apache.tika.metadata.Metadata;
	import org.apache.tika.metadata.TikaCoreProperties;
	import org.apache.tika.mime.MediaType;
	import org.apache.tika.parser.CompositeParser;
	import org.apache.tika.parser.ParseContext;
	import org.apache.tika.parser.Parser;
	import org.apache.tika.parser.html.HtmlMapper;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;
	import org.w3c.dom.DocumentFragment;

	import java.io.ByteArrayInputStream;
	import java.io.DataInputStream;
	import java.io.File;
	import java.io.FileInputStream;
	import java.net.MalformedURLException;
	import java.net.URL;
	import java.nio.ByteBuffer;
	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.Collection;
	import java.util.HashSet;

	/**
	* Wrapper for Tika parsers. Mimics the HTMLParser but using the XHTML
	* representation returned by Tika as SAX events
	***/

	public class TikaParser implements org.apache.nutch.parse.Parser {

	public static final Logger LOG = LoggerFactory.getLogger(TikaParser.class);

	private static Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();

	static {
	FIELDS.add(WebPage.Field.BASE_URL);
	FIELDS.add(WebPage.Field.CONTENT_TYPE);
	}

	private Configuration conf;
	private TikaConfig tikaConfig = null;
	private DOMContentUtils utils;
	private ParseFilters htmlParseFilters;
	private String cachingPolicy;

	private HtmlMapper HTMLMapper;

	@Override
	public Parse getParse(String url, WebPage page) {

	String baseUrl = TableUtil.toString(page.getBaseUrl());
	URL base;
	try {
	base = new URL(baseUrl);
	} catch (MalformedURLException e) {
	return ParseStatusUtils.getEmptyParse(e, getConf());
	}

	// get the right parser using the mime type as a clue
	String mimeType = page.getContentType().toString();
	CompositeParser compositeParser = (CompositeParser) tikaConfig.getParser();
	Parser parser = compositeParser.getParsers().get(MediaType.parse(mimeType));
	ByteBuffer raw = page.getContent();

	if (parser == null) {
	String message = "Can't retrieve Tika parser for mime-type " + mimeType;
	LOG.error(message);
	return ParseStatusUtils.getEmptyParse(ParseStatusCodes.FAILED_EXCEPTION,
	message, getConf());
	}

	LOG.debug("Using Tika parser " + parser.getClass().getName()
	+ " for mime-type " + mimeType);

	Metadata tikamd = new Metadata();

	HTMLDocumentImpl doc = new HTMLDocumentImpl();
	doc.setErrorChecking(false);
	DocumentFragment root = doc.createDocumentFragment();
	DOMBuilder domhandler = new DOMBuilder(doc, root);
	ParseContext context = new ParseContext();
	if (HTMLMapper != null)
	context.set(HtmlMapper.class, HTMLMapper);
	// to add once available in Tika
	// context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
	tikamd.set(Metadata.CONTENT_TYPE, mimeType);
	try {
	parser.parse(new ByteArrayInputStream(raw.array(), raw.arrayOffset()
	+ raw.position(), raw.remaining()), domhandler, tikamd, context);
	} catch (Exception e) {
	LOG.error("Error parsing " + url, e);
	return ParseStatusUtils.getEmptyParse(e, getConf());
	}

	HTMLMetaTags metaTags = new HTMLMetaTags();
	String text = "";
	String title = "";
	Outlink[] outlinks = new Outlink[0];

	// we have converted the sax events generated by Tika into a DOM object
	// so we can now use the usual HTML resources from Nutch
	// get meta directives
	HTMLMetaProcessor.getMetaTags(metaTags, root, base);
	if (LOG.isTraceEnabled()) {
	LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
	}

	// check meta directives
	if (!metaTags.getNoIndex()) { // okay to index
	StringBuffer sb = new StringBuffer();
	if (LOG.isTraceEnabled()) {
	LOG.trace("Getting text...");
	}
	utils.getText(sb, root); // extract text
	text = sb.toString();
	sb.setLength(0);
	if (LOG.isTraceEnabled()) {
	LOG.trace("Getting title...");
	}
	utils.getTitle(sb, root); // extract title
	title = sb.toString().trim();
	}

	if (!metaTags.getNoFollow()) { // okay to follow links
	ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
	URL baseTag = utils.getBase(root);
	if (LOG.isTraceEnabled()) {
	LOG.trace("Getting links...");
	}
	utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
	outlinks = l.toArray(new Outlink[l.size()]);
	if (LOG.isTraceEnabled()) {
	LOG.trace("found " + outlinks.length + " outlinks in " + base);
	}
	}

	// populate Nutch metadata with Tika metadata
	String[] TikaMDNames = tikamd.names();
	for (String tikaMDName : TikaMDNames) {
	if (tikaMDName.equalsIgnoreCase(TikaCoreProperties.TITLE.toString()))
	continue;
	// TODO what if multivalued?
	page.getMetadata().put(new Utf8(tikaMDName),
	ByteBuffer.wrap(Bytes.toBytes(tikamd.get(tikaMDName))));
	}

	// no outlinks? try OutlinkExtractor e.g works for mime types where no
	// explicit markup for anchors

	if (outlinks.length == 0) {
	outlinks = OutlinkExtractor.getOutlinks(text, getConf());
	}

	ParseStatus status = ParseStatusUtils.STATUS_SUCCESS;
	if (metaTags.getRefresh()) {
	status.setMinorCode((int) ParseStatusCodes.SUCCESS_REDIRECT);
	status.getArgs().add(new Utf8(metaTags.getRefreshHref().toString()));
	status.getArgs().add(
	new Utf8(Integer.toString(metaTags.getRefreshTime())));
	}

	Parse parse = new Parse(text, title, outlinks, status);
	parse = htmlParseFilters.filter(url, page, parse, metaTags, root);

	if (metaTags.getNoCache()) { // not okay to cache
	page.getMetadata().put(new Utf8(Nutch.CACHING_FORBIDDEN_KEY),
	ByteBuffer.wrap(Bytes.toBytes(cachingPolicy)));
	}

	return parse;
	}

	public void setConf(Configuration conf) {
	this.conf = conf;
	this.tikaConfig = null;

	try {
	tikaConfig = TikaConfig.getDefaultConfig();
	} catch (Exception e2) {
	String message = "Problem loading default Tika configuration";
	LOG.error(message, e2);
	throw new RuntimeException(e2);
	}

	// use a custom htmlmapper
	String htmlmapperClassName = conf.get("tika.htmlmapper.classname");
	if (StringUtils.isNotBlank(htmlmapperClassName)) {
	try {
	Class HTMLMapperClass = Class.forName(htmlmapperClassName);
	boolean interfaceOK = HtmlMapper.class
	.isAssignableFrom(HTMLMapperClass);
	if (!interfaceOK) {
	throw new RuntimeException("Class " + htmlmapperClassName
	+ " does not implement HtmlMapper");
	}
	HTMLMapper = (HtmlMapper) HTMLMapperClass.newInstance();
	} catch (Exception e) {
	LOG.error("Can't generate instance for class " + htmlmapperClassName);
	throw new RuntimeException("Can't generate instance for class "
	+ htmlmapperClassName);
	}
	}

	this.htmlParseFilters = new ParseFilters(getConf());
	this.utils = new DOMContentUtils(conf);
	this.cachingPolicy = getConf().get("parser.caching.forbidden.policy",
	Nutch.CACHING_FORBIDDEN_CONTENT);
	}

	public TikaConfig getTikaConfig() {
	return this.tikaConfig;
	}

	public Configuration getConf() {
	return this.conf;
	}

	@Override
	public Collection<Field> getFields() {
	return FIELDS;
	}

	// main class used for debuggin
	public static void main(String[] args) throws Exception {
	String name = args[0];
	String url = "file:" + name;
	File file = new File(name);
	byte[] bytes = new byte[(int) file.length()];
	@SuppressWarnings("resource")
	DataInputStream in = new DataInputStream(new FileInputStream(file));
	in.readFully(bytes);
	Configuration conf = NutchConfiguration.create();
	// TikaParser parser = new TikaParser();
	// parser.setConf(conf);
	WebPage page = WebPage.newBuilder().build();
	page.setBaseUrl(new Utf8(url));
	page.setContent(ByteBuffer.wrap(bytes));
	MimeUtil mimeutil = new MimeUtil(conf);
	String mtype = mimeutil.getMimeType(file);
	page.setContentType(new Utf8(mtype));
	// Parse parse = parser.getParse(url, page);

	Parse parse = new ParseUtil(conf).parse(url, page);

	System.out.println("content type: " + mtype);
	System.out.println("title: " + parse.getTitle());
	System.out.println("text: " + parse.getText());
	System.out.println("outlinks: " + Arrays.toString(parse.getOutlinks()));
	}
	}