| /** |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.nutch.parse.html; |
| |
| import java.io.ByteArrayInputStream; |
| import java.io.DataInputStream; |
| import java.io.File; |
| import java.io.FileInputStream; |
| import java.io.IOException; |
| import java.io.UnsupportedEncodingException; |
| import java.net.MalformedURLException; |
| import java.net.URL; |
| import java.nio.ByteBuffer; |
| import java.nio.charset.Charset; |
| import java.nio.charset.StandardCharsets; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Collection; |
| import java.util.HashSet; |
| import java.util.regex.Matcher; |
| import java.util.regex.Pattern; |
| |
| import org.apache.avro.util.Utf8; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.html.dom.HTMLDocumentImpl; |
| import org.apache.nutch.metadata.Metadata; |
| import org.apache.nutch.metadata.Nutch; |
| import org.apache.nutch.parse.HTMLMetaTags; |
| import org.apache.nutch.parse.ParseFilters; |
| import org.apache.nutch.parse.Outlink; |
| import org.apache.nutch.parse.Parse; |
| import org.apache.nutch.parse.ParseStatusCodes; |
| import org.apache.nutch.parse.ParseStatusUtils; |
| import org.apache.nutch.parse.Parser; |
| import org.apache.nutch.storage.ParseStatus; |
| import org.apache.nutch.storage.WebPage; |
| import org.apache.nutch.util.Bytes; |
| import org.apache.nutch.util.EncodingDetector; |
| import org.apache.nutch.util.NutchConfiguration; |
| import org.apache.nutch.util.TableUtil; |
| import org.cyberneko.html.parsers.DOMFragmentParser; |
| import org.w3c.dom.DOMException; |
| import org.w3c.dom.DocumentFragment; |
| import org.xml.sax.InputSource; |
| import org.xml.sax.SAXException; |
| |
| public class HtmlParser implements Parser { |
| public static final Logger LOG = LoggerFactory |
| .getLogger("org.apache.nutch.parse.html"); |
| |
| // I used 1000 bytes at first, but found that some documents have |
| // meta tag well past the first 1000 bytes. |
| // (e.g. http://cn.promo.yahoo.com/customcare/music.html) |
| // NUTCH-2042 (cf. TIKA-357): increased to 8 kB |
| private static final int CHUNK_SIZE = 8192; |
| |
| // NUTCH-1006 Meta equiv with single quotes not accepted |
| private static Pattern metaPattern = Pattern.compile( |
| "<meta\\s+([^>]*http-equiv=(\"|')?content-type(\"|')?[^>]*)>", |
| Pattern.CASE_INSENSITIVE); |
| private static Pattern charsetPattern = Pattern.compile( |
| "charset=\\s*([a-z][_\\-0-9a-z]*)", Pattern.CASE_INSENSITIVE); |
| private static Pattern charsetPatternHTML5 = Pattern.compile( |
| "<meta\\s+charset\\s*=\\s*[\"']?([a-z][_\\-0-9a-z]*)[^>]*>", |
| Pattern.CASE_INSENSITIVE); |
| |
| private static Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>(); |
| |
| static { |
| FIELDS.add(WebPage.Field.BASE_URL); |
| } |
| |
| private String parserImpl; |
| |
| /** |
| * Given a <code>ByteBuffer</code> representing an html file of an |
| * <em>unknown</em> encoding, read out 'charset' parameter in the meta tag |
| * from the first <code>CHUNK_SIZE</code> bytes. If there's no meta tag for |
| * Content-Type or no charset is specified, the content is checked for a |
| * Unicode Byte Order Mark (BOM). This will also cover non-byte oriented |
| * character encodings (UTF-16 only). If no character set can be determined, |
| * <code>null</code> is returned. <br /> |
| * See also |
| * http://www.w3.org/International/questions/qa-html-encoding-declarations, |
| * http://www.w3.org/TR/2011/WD-html5-diff-20110405/#character-encoding, and |
| * http://www.w3.org/TR/REC-xml/#sec-guessing <br /> |
| * |
| * @param content |
| * <code>ByteBuffer</code> representation of an html file |
| */ |
| |
| private static String sniffCharacterEncoding(ByteBuffer content) { |
| int length = Math.min(content.remaining(), CHUNK_SIZE); |
| |
| // We don't care about non-ASCII parts so that it's sufficient |
| // to just inflate each byte to a 16-bit value by padding. |
| // For instance, the sequence {0x41, 0x82, 0xb7} will be turned into |
| // {U+0041, U+0082, U+00B7}. |
| String str = new String(content.array(), content.arrayOffset() |
| + content.position(), length, StandardCharsets.US_ASCII); |
| |
| Matcher metaMatcher = metaPattern.matcher(str); |
| String encoding = null; |
| if (metaMatcher.find()) { |
| Matcher charsetMatcher = charsetPattern.matcher(metaMatcher.group(1)); |
| if (charsetMatcher.find()) |
| encoding = new String(charsetMatcher.group(1)); |
| } |
| if (encoding == null) { |
| // check for HTML5 meta charset |
| metaMatcher = charsetPatternHTML5.matcher(str); |
| if (metaMatcher.find()) { |
| encoding = new String(metaMatcher.group(1)); |
| } |
| } |
| if (encoding == null) { |
| // check for BOM |
| if (length >= 3 && content.get(0) == (byte) 0xEF |
| && content.get(1) == (byte) 0xBB && content.get(2) == (byte) 0xBF) { |
| encoding = "UTF-8"; |
| } else if (length >= 2) { |
| if (content.get(0) == (byte) 0xFF && content.get(1) == (byte) 0xFE) { |
| encoding = "UTF-16LE"; |
| } else if (content.get(0) == (byte) 0xFE |
| && content.get(1) == (byte) 0xFF) { |
| encoding = "UTF-16BE"; |
| } |
| } |
| } |
| |
| return encoding; |
| } |
| |
| private String defaultCharEncoding; |
| |
| private Configuration conf; |
| |
| private DOMContentUtils utils; |
| |
| private ParseFilters htmlParseFilters; |
| |
| private String cachingPolicy; |
| |
| public Parse getParse(String url, WebPage page) { |
| HTMLMetaTags metaTags = new HTMLMetaTags(); |
| |
| String baseUrl = TableUtil.toString(page.getBaseUrl()); |
| URL base; |
| try { |
| base = new URL(baseUrl); |
| } catch (MalformedURLException e) { |
| return ParseStatusUtils.getEmptyParse(e, getConf()); |
| } |
| |
| String text = ""; |
| String title = ""; |
| Outlink[] outlinks = new Outlink[0]; |
| |
| // parse the content |
| DocumentFragment root; |
| try { |
| ByteBuffer contentInOctets = page.getContent(); |
| InputSource input = new InputSource(new ByteArrayInputStream( |
| contentInOctets.array(), contentInOctets.arrayOffset() |
| + contentInOctets.position(), contentInOctets.remaining())); |
| |
| EncodingDetector detector = new EncodingDetector(conf); |
| detector.autoDetectClues(page, true); |
| detector.addClue(sniffCharacterEncoding(contentInOctets), "sniffed"); |
| String encoding = detector.guessEncoding(page, defaultCharEncoding); |
| |
| page.getMetadata().put(new Utf8(Metadata.ORIGINAL_CHAR_ENCODING), |
| ByteBuffer.wrap(Bytes.toBytes(encoding))); |
| page.getMetadata().put(new Utf8(Metadata.CHAR_ENCODING_FOR_CONVERSION), |
| ByteBuffer.wrap(Bytes.toBytes(encoding))); |
| |
| input.setEncoding(encoding); |
| if (LOG.isTraceEnabled()) { |
| LOG.trace("Parsing..."); |
| } |
| root = parse(input); |
| } catch (IOException e) { |
| LOG.error("Failed with the following IOException: ", e); |
| return ParseStatusUtils.getEmptyParse(e, getConf()); |
| } catch (DOMException e) { |
| LOG.error("Failed with the following DOMException: ", e); |
| return ParseStatusUtils.getEmptyParse(e, getConf()); |
| } catch (SAXException e) { |
| LOG.error("Failed with the following SAXException: ", e); |
| return ParseStatusUtils.getEmptyParse(e, getConf()); |
| } catch (Exception e) { |
| LOG.error("Failed with the following Exception: ", e); |
| return ParseStatusUtils.getEmptyParse(e, getConf()); |
| } |
| |
| // get meta directives |
| HTMLMetaProcessor.getMetaTags(metaTags, root, base); |
| if (LOG.isTraceEnabled()) { |
| LOG.trace("Meta tags for " + base + ": " + metaTags.toString()); |
| } |
| // check meta directives |
| if (!metaTags.getNoIndex()) { // okay to index |
| StringBuilder sb = new StringBuilder(); |
| if (LOG.isTraceEnabled()) { |
| LOG.trace("Getting text..."); |
| } |
| utils.getText(sb, root); // extract text |
| text = sb.toString(); |
| sb.setLength(0); |
| if (LOG.isTraceEnabled()) { |
| LOG.trace("Getting title..."); |
| } |
| utils.getTitle(sb, root); // extract title |
| title = sb.toString().trim(); |
| } |
| |
| if (!metaTags.getNoFollow()) { // okay to follow links |
| ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks |
| URL baseTag = utils.getBase(root); |
| if (LOG.isTraceEnabled()) { |
| LOG.trace("Getting links..."); |
| } |
| utils.getOutlinks(baseTag != null ? baseTag : base, l, root); |
| outlinks = l.toArray(new Outlink[l.size()]); |
| if (LOG.isTraceEnabled()) { |
| LOG.trace("found " + outlinks.length + " outlinks in " + url); |
| } |
| } |
| |
| ParseStatus status = ParseStatus.newBuilder().build(); |
| status.setMajorCode((int) ParseStatusCodes.SUCCESS); |
| if (metaTags.getRefresh()) { |
| status.setMinorCode((int) ParseStatusCodes.SUCCESS_REDIRECT); |
| status.getArgs().add(new Utf8(metaTags.getRefreshHref().toString())); |
| status.getArgs().add( |
| new Utf8(Integer.toString(metaTags.getRefreshTime()))); |
| } |
| |
| Parse parse = new Parse(text, title, outlinks, status); |
| parse = htmlParseFilters.filter(url, page, parse, metaTags, root); |
| |
| if (metaTags.getNoCache()) { // not okay to cache |
| page.getMetadata().put(new Utf8(Nutch.CACHING_FORBIDDEN_KEY), |
| ByteBuffer.wrap(Bytes.toBytes(cachingPolicy))); |
| } |
| |
| return parse; |
| } |
| |
| private DocumentFragment parse(InputSource input) throws Exception { |
| if (parserImpl.equalsIgnoreCase("tagsoup")) |
| return parseTagSoup(input); |
| else |
| return parseNeko(input); |
| } |
| |
| private DocumentFragment parseTagSoup(InputSource input) throws Exception { |
| HTMLDocumentImpl doc = new HTMLDocumentImpl(); |
| DocumentFragment frag = doc.createDocumentFragment(); |
| DOMBuilder builder = new DOMBuilder(doc, frag); |
| org.ccil.cowan.tagsoup.Parser reader = new org.ccil.cowan.tagsoup.Parser(); |
| reader.setContentHandler(builder); |
| reader.setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true); |
| reader.setFeature(org.ccil.cowan.tagsoup.Parser.bogonsEmptyFeature, false); |
| reader |
| .setProperty("http://xml.org/sax/properties/lexical-handler", builder); |
| reader.parse(input); |
| return frag; |
| } |
| |
| private DocumentFragment parseNeko(InputSource input) throws Exception { |
| DOMFragmentParser parser = new DOMFragmentParser(); |
| try { |
| parser |
| .setFeature( |
| "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe", |
| true); |
| parser.setFeature("http://cyberneko.org/html/features/augmentations", |
| true); |
| parser.setProperty( |
| "http://cyberneko.org/html/properties/default-encoding", |
| defaultCharEncoding); |
| parser |
| .setFeature( |
| "http://cyberneko.org/html/features/scanner/ignore-specified-charset", |
| true); |
| parser |
| .setFeature( |
| "http://cyberneko.org/html/features/balance-tags/ignore-outside-content", |
| false); |
| parser.setFeature( |
| "http://cyberneko.org/html/features/balance-tags/document-fragment", |
| true); |
| parser.setFeature("http://cyberneko.org/html/features/report-errors", |
| LOG.isTraceEnabled()); |
| } catch (SAXException e) { |
| } |
| // convert Document to DocumentFragment |
| HTMLDocumentImpl doc = new HTMLDocumentImpl(); |
| doc.setErrorChecking(false); |
| DocumentFragment res = doc.createDocumentFragment(); |
| DocumentFragment frag = doc.createDocumentFragment(); |
| parser.parse(input, frag); |
| res.appendChild(frag); |
| |
| try { |
| while (true) { |
| frag = doc.createDocumentFragment(); |
| parser.parse(input, frag); |
| if (!frag.hasChildNodes()) |
| break; |
| if (LOG.isInfoEnabled()) { |
| LOG.info(" - new frag, " + frag.getChildNodes().getLength() |
| + " nodes."); |
| } |
| res.appendChild(frag); |
| } |
| } catch (Exception x) { |
| LOG.error("Failed with the following Exception: ", x); |
| } |
| ; |
| return res; |
| } |
| |
| public void setConf(Configuration conf) { |
| this.conf = conf; |
| this.htmlParseFilters = new ParseFilters(getConf()); |
| this.parserImpl = getConf().get("parser.html.impl", "neko"); |
| this.defaultCharEncoding = getConf().get( |
| "parser.character.encoding.default", "windows-1252"); |
| this.utils = new DOMContentUtils(conf); |
| this.cachingPolicy = getConf().get("parser.caching.forbidden.policy", |
| Nutch.CACHING_FORBIDDEN_CONTENT); |
| } |
| |
| public Configuration getConf() { |
| return this.conf; |
| } |
| |
| @Override |
| public Collection<WebPage.Field> getFields() { |
| return FIELDS; |
| } |
| |
| public static void main(String[] args) throws Exception { |
| // LOG.setLevel(Level.FINE); |
| String name = args[0]; |
| String url = "file:" + name; |
| File file = new File(name); |
| byte[] bytes = new byte[(int) file.length()]; |
| DataInputStream in = new DataInputStream(new FileInputStream(file)); |
| in.readFully(bytes); |
| Configuration conf = NutchConfiguration.create(); |
| HtmlParser parser = new HtmlParser(); |
| parser.setConf(conf); |
| WebPage page = WebPage.newBuilder().build(); |
| page.setBaseUrl(new Utf8(url)); |
| page.setContent(ByteBuffer.wrap(bytes)); |
| page.setContentType(new Utf8("text/html")); |
| Parse parse = parser.getParse(url, page); |
| System.out.println("title: " + parse.getTitle()); |
| System.out.println("text: " + parse.getText()); |
| System.out.println("outlinks: " + Arrays.toString(parse.getOutlinks())); |
| |
| } |
| |
| } |