// blob: 5852b142cd7c69a750a82833281c6472ddb454b0
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.parse.html;
import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.net.URL;
import java.net.MalformedURLException;
import java.nio.charset.StandardCharsets;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.DOMException;
import org.w3c.dom.DocumentFragment;
import org.apache.hadoop.conf.Configuration;
import org.apache.html.dom.HTMLDocumentImpl;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.HtmlParseFilters;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.EncodingDetector;
import org.apache.nutch.util.NutchConfiguration;
import org.cyberneko.html.parsers.DOMFragmentParser;
public class HtmlParser implements Parser {
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
// I used 1000 bytes at first, but found that some documents have
// meta tag well past the first 1000 bytes.
// (e.g. http://cn.promo.yahoo.com/customcare/music.html)
// NUTCH-2042 (cf. TIKA-357): increased to 8 kB
private static final int CHUNK_SIZE = 8192;
// NUTCH-1006 Meta equiv with single quotes not accepted
private static Pattern metaPattern = Pattern.compile(
"<meta\\s+([^>]*http-equiv=(\"|')?content-type(\"|')?[^>]*)>",
Pattern.CASE_INSENSITIVE);
private static Pattern charsetPattern = Pattern.compile(
"charset=\\s*([a-z][_\\-0-9a-z]*)", Pattern.CASE_INSENSITIVE);
private static Pattern charsetPatternHTML5 = Pattern.compile(
"<meta\\s+charset\\s*=\\s*[\"']?([a-z][_\\-0-9a-z]*)[^>]*>",
Pattern.CASE_INSENSITIVE);
private String parserImpl;
/**
* Given a <code>byte[]</code> representing an html file of an
* <em>unknown</em> encoding, read out 'charset' parameter in the meta tag
* from the first <code>CHUNK_SIZE</code> bytes. If there's no meta tag for
* Content-Type or no charset is specified, the content is checked for a
* Unicode Byte Order Mark (BOM). This will also cover non-byte oriented
* character encodings (UTF-16 only). If no character set can be determined,
* <code>null</code> is returned. <br />
* See also
* http://www.w3.org/International/questions/qa-html-encoding-declarations,
* http://www.w3.org/TR/2011/WD-html5-diff-20110405/#character-encoding, and
* http://www.w3.org/TR/REC-xml/#sec-guessing
*
* @param content
* <code>byte[]</code> representation of an html file
*/
private static String sniffCharacterEncoding(byte[] content) {
int length = content.length < CHUNK_SIZE ? content.length : CHUNK_SIZE;
// We don't care about non-ASCII parts so that it's sufficient
// to just inflate each byte to a 16-bit value by padding.
// For instance, the sequence {0x41, 0x82, 0xb7} will be turned into
// {U+0041, U+0082, U+00B7}.
String str = new String(content, 0, length, StandardCharsets.US_ASCII);
Matcher metaMatcher = metaPattern.matcher(str);
String encoding = null;
if (metaMatcher.find()) {
Matcher charsetMatcher = charsetPattern.matcher(metaMatcher.group(1));
if (charsetMatcher.find())
encoding = charsetMatcher.group(1);
}
if (encoding == null) {
// check for HTML5 meta charset
metaMatcher = charsetPatternHTML5.matcher(str);
if (metaMatcher.find()) {
encoding = metaMatcher.group(1);
}
}
if (encoding == null) {
// check for BOM
if (content.length >= 3 && content[0] == (byte) 0xEF
&& content[1] == (byte) 0xBB && content[2] == (byte) 0xBF) {
encoding = "UTF-8";
} else if (content.length >= 2) {
if (content[0] == (byte) 0xFF && content[1] == (byte) 0xFE) {
encoding = "UTF-16LE";
} else if (content[0] == (byte) 0xFE && content[1] == (byte) 0xFF) {
encoding = "UTF-16BE";
}
}
}
return encoding;
}
private String defaultCharEncoding;
private Configuration conf;
private DOMContentUtils utils;
private HtmlParseFilters htmlParseFilters;
private String cachingPolicy;
public ParseResult getParse(Content content) {
HTMLMetaTags metaTags = new HTMLMetaTags();
URL base;
try {
base = new URL(content.getBaseUrl());
} catch (MalformedURLException e) {
return new ParseStatus(e)
.getEmptyParseResult(content.getUrl(), getConf());
}
String text = "";
String title = "";
Outlink[] outlinks = new Outlink[0];
Metadata metadata = new Metadata();
// parse the content
DocumentFragment root;
try {
byte[] contentInOctets = content.getContent();
InputSource input = new InputSource(new ByteArrayInputStream(
contentInOctets));
EncodingDetector detector = new EncodingDetector(conf);
detector.autoDetectClues(content, true);
detector.addClue(sniffCharacterEncoding(contentInOctets), "sniffed");
String encoding = detector.guessEncoding(content, defaultCharEncoding);
metadata.set(Metadata.ORIGINAL_CHAR_ENCODING, encoding);
metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, encoding);
input.setEncoding(encoding);
if (LOG.isTraceEnabled()) {
LOG.trace("Parsing...");
}
root = parse(input);
} catch (IOException e) {
return new ParseStatus(e)
.getEmptyParseResult(content.getUrl(), getConf());
} catch (DOMException e) {
return new ParseStatus(e)
.getEmptyParseResult(content.getUrl(), getConf());
} catch (SAXException e) {
return new ParseStatus(e)
.getEmptyParseResult(content.getUrl(), getConf());
} catch (Exception e) {
LOG.error("Error: ", e);
return new ParseStatus(e)
.getEmptyParseResult(content.getUrl(), getConf());
}
// get meta directives
HTMLMetaProcessor.getMetaTags(metaTags, root, base);
// populate Nutch metadata with HTML meta directives
metadata.addAll(metaTags.getGeneralTags());
if (LOG.isTraceEnabled()) {
LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
}
// check meta directives
if (!metaTags.getNoIndex()) { // okay to index
StringBuffer sb = new StringBuffer();
if (LOG.isTraceEnabled()) {
LOG.trace("Getting text...");
}
utils.getText(sb, root); // extract text
text = sb.toString();
sb.setLength(0);
if (LOG.isTraceEnabled()) {
LOG.trace("Getting title...");
}
utils.getTitle(sb, root); // extract title
title = sb.toString().trim();
}
if (!metaTags.getNoFollow()) { // okay to follow links
ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
URL baseTag = base;
String baseTagHref = utils.getBase(root);
if (baseTagHref != null) {
try {
baseTag = new URL(base, baseTagHref);
} catch (MalformedURLException e) {
baseTag = base;
}
}
if (LOG.isTraceEnabled()) {
LOG.trace("Getting links...");
}
utils.getOutlinks(baseTag, l, root);
outlinks = l.toArray(new Outlink[l.size()]);
if (LOG.isTraceEnabled()) {
LOG.trace("found " + outlinks.length + " outlinks in "
+ content.getUrl());
}
}
ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
if (metaTags.getRefresh()) {
status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
status.setArgs(new String[] { metaTags.getRefreshHref().toString(),
Integer.toString(metaTags.getRefreshTime()) });
}
ParseData parseData = new ParseData(status, title, outlinks,
content.getMetadata(), metadata);
ParseResult parseResult = ParseResult.createParseResult(content.getUrl(),
new ParseImpl(text, parseData));
// run filters on parse
ParseResult filteredParse = this.htmlParseFilters.filter(content,
parseResult, metaTags, root);
if (metaTags.getNoCache()) { // not okay to cache
for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse)
entry.getValue().getData().getParseMeta()
.set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
}
return filteredParse;
}
private DocumentFragment parse(InputSource input) throws Exception {
if ("tagsoup".equalsIgnoreCase(parserImpl))
return parseTagSoup(input);
else
return parseNeko(input);
}
private DocumentFragment parseTagSoup(InputSource input) throws Exception {
HTMLDocumentImpl doc = new HTMLDocumentImpl();
DocumentFragment frag = doc.createDocumentFragment();
DOMBuilder builder = new DOMBuilder(doc, frag);
org.ccil.cowan.tagsoup.Parser reader = new org.ccil.cowan.tagsoup.Parser();
reader.setContentHandler(builder);
reader.setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);
reader.setFeature(org.ccil.cowan.tagsoup.Parser.bogonsEmptyFeature, false);
reader
.setProperty("http://xml.org/sax/properties/lexical-handler", builder);
reader.parse(input);
return frag;
}
private DocumentFragment parseNeko(InputSource input) throws Exception {
DOMFragmentParser parser = new DOMFragmentParser();
try {
parser
.setFeature(
"http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
true);
parser.setFeature("http://cyberneko.org/html/features/augmentations",
true);
parser.setProperty(
"http://cyberneko.org/html/properties/default-encoding",
defaultCharEncoding);
parser
.setFeature(
"http://cyberneko.org/html/features/scanner/ignore-specified-charset",
true);
parser
.setFeature(
"http://cyberneko.org/html/features/balance-tags/ignore-outside-content",
false);
parser.setFeature(
"http://cyberneko.org/html/features/balance-tags/document-fragment",
true);
parser.setFeature("http://cyberneko.org/html/features/report-errors",
LOG.isTraceEnabled());
} catch (SAXException e) {
}
// convert Document to DocumentFragment
HTMLDocumentImpl doc = new HTMLDocumentImpl();
doc.setErrorChecking(false);
DocumentFragment res = doc.createDocumentFragment();
DocumentFragment frag = doc.createDocumentFragment();
parser.parse(input, frag);
res.appendChild(frag);
try {
while (true) {
frag = doc.createDocumentFragment();
parser.parse(input, frag);
if (!frag.hasChildNodes())
break;
if (LOG.isInfoEnabled()) {
LOG.info(" - new frag, " + frag.getChildNodes().getLength()
+ " nodes.");
}
res.appendChild(frag);
}
} catch (Exception e) {
LOG.error("Error: ", e);
}
;
return res;
}
public static void main(String[] args) throws Exception {
String name = args[0];
String url = "file:" + name;
File file = new File(name);
byte[] bytes = new byte[(int) file.length()];
@SuppressWarnings("resource")
DataInputStream in = new DataInputStream(new FileInputStream(file));
in.readFully(bytes);
Configuration conf = NutchConfiguration.create();
HtmlParser parser = new HtmlParser();
parser.setConf(conf);
Parse parse = parser.getParse(
new Content(url, url, bytes, "text/html", new Metadata(), conf)).get(
url);
System.out.println("data: " + parse.getData());
System.out.println("text: " + parse.getText());
}
@Override
public void setConf(Configuration conf) {
this.conf = conf;
this.htmlParseFilters = new HtmlParseFilters(getConf());
this.parserImpl = getConf().get("parser.html.impl", "neko");
this.defaultCharEncoding = getConf().get(
"parser.character.encoding.default", "windows-1252");
this.utils = new DOMContentUtils(conf);
this.cachingPolicy = getConf().get("parser.caching.forbidden.policy",
Nutch.CACHING_FORBIDDEN_CONTENT);
}
@Override
public Configuration getConf() {
return this.conf;
}
}