src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java - nutch - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.nutch.parse.html;

 import java.lang.invoke.MethodHandles;
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;

 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.parse.html.HtmlParser;
 import org.apache.nutch.parse.Outlink;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.Parser;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.util.NutchConfiguration;
 import org.junit.Assert;
 import org.junit.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 /**
  * Tests for {@link HtmlParser}.
  *
  * <p>
  * {@link #testEncodingDetection()} feeds the parser the same non-ASCII
  * document, encoded and declared in several different ways, and checks that
  * title, meta keywords and body text come back intact.
  * {@link #testResolveBaseUrl()} checks that a relative link is resolved
  * against a protocol-relative {@code <base href>} (NUTCH-2478).
  * </p>
  */
 public class TestHtmlParser {

   private static final Logger LOG = LoggerFactory
       .getLogger(MethodHandles.lookup().lookupClass());

   /** Non-ASCII words used both as page title and as meta keywords. */
   private static final String encodingTestKeywords = "français, español, русский язык, čeština, ελληνικά";

   /** Body markup containing each of the keywords as a list item. */
   private static final String encodingTestBody = "<ul>\n  <li>français\n  <li>español\n  <li>русский язык\n  <li>čeština\n  <li>ελληνικά\n</ul>";

   /**
    * Common tail of every encoding test page: title, meta keywords, end of
    * head, body and closing html tag. Each test page prepends its own
    * doctype / head opening / charset declaration.
    */
   private static final String encodingTestContent = "<title>"
       + encodingTestKeywords + "</title>\n"
       + "<meta name=\"keywords\" content=\"" + encodingTestKeywords + "\" />\n"
       + "</head>\n<body>" + encodingTestBody + "</body>\n</html>";

   /**
    * Encoding test pages. Each entry is
    * <code>{ test name, charset used to encode the page, page markup }</code>.
    */
   private static final String[][] encodingTestPages = {
       {
           "HTML4, utf-8, meta http-equiv, no quotes",
           "utf-8",
           "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
               + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
               + "<html>\n<head>\n"
               + "<meta http-equiv=Content-Type content=\"text/html; charset=utf-8\" />"
               + encodingTestContent },
       {
           "HTML4, utf-8, meta http-equiv, single quotes",
           "utf-8",
           "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
               + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
               + "<html>\n<head>\n"
               + "<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />"
               + encodingTestContent },
       {
           // NOTE(review): this page opens <html> twice (once with the xmlns
           // attribute, once without) -- confirm whether that is intentional
           // before changing the fixture.
           "XHTML, utf-8, meta http-equiv, double quotes",
           "utf-8",
           "<?xml version=\"1.0\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">"
               + "<html>\n<head>\n"
               + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />"
               + encodingTestContent },
       {
           "HTML5, utf-8, meta charset",
           "utf-8",
           "<!DOCTYPE html>\n<html>\n<head>\n" + "<meta charset=\"utf-8\">"
               + encodingTestContent },
       // The two BOM pages carry no charset declaration in the markup; the
       // only in-document hint is the leading U+FEFF.
       { "HTML5, utf-8, BOM", "utf-8",
           "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent },
       // NOTE(review): Java's "UTF-16" encoder is documented to write its own
       // byte-order mark, so the explicit \ufeff presumably yields a second
       // one in the encoded bytes -- confirm this is the intended input.
       { "HTML5, utf-16, BOM", "utf-16",
           "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent } };

   /**
    * Page with a protocol-relative base URL and a single relative outlink.
    */
   private static final String resolveBaseUrlTestContent = //
       "<html>\n<head>\n" + //
       "  <title>Test Resolve Base URLs (NUTCH-2478)</title>\n" + //
       "  <base href=\"//www.example.com/\">\n" + //
       "</head>\n<body>\n" + //
       "  <a href=\"index.html\">outlink</a>\n" + //
       "</body>\n</html>";

   private final Configuration conf;
   private final Parser parser;

   /**
    * Creates a Nutch configuration with <code>plugin.includes</code> set to
    * <code>parse-html</code> and a {@link HtmlParser} configured with it.
    */
   public TestHtmlParser() {
     conf = NutchConfiguration.create();
     conf.set("plugin.includes", "parse-html");
     parser = new HtmlParser();
     parser.setConf(conf);
   }

   /**
    * Parses the given bytes as a <code>text/html</code> document fetched from
    * <code>http://example.com/</code>.
    *
    * @param contentBytes
    *          raw (already encoded) document content
    * @return the parse registered under the dummy URL in the parser's result
    */
   protected Parse parse(byte[] contentBytes) {
     String dummyUrl = "http://example.com/";
     Content content = new Content(dummyUrl, dummyUrl, contentBytes,
         "text/html", new Metadata(), conf);
     return parser.getParse(content).get(dummyUrl);
   }

   /**
    * Encodes every page of {@link #encodingTestPages} with its charset, parses
    * it and verifies that title, keywords and all keyword occurrences in the
    * text are extracted unchanged.
    */
   @Test
   public void testEncodingDetection() {
     for (String[] testPage : encodingTestPages) {
       String name = testPage[0];
       Charset charset = Charset.forName(testPage[1]);
       byte[] contentBytes = testPage[2].getBytes(charset);
       Parse parse = parse(contentBytes);
       // fail with the page name instead of a bare NullPointerException
       Assert.assertNotNull("No parse returned (" + name + ")", parse);
       String text = parse.getText();
       String title = parse.getData().getTitle();
       String keywords = parse.getData().getMeta("keywords");
       LOG.info(name);
       LOG.info("title:\t{}", title);
       LOG.info("keywords:\t{}", keywords);
       LOG.info("text:\t{}", text);
       Assert.assertEquals("Title not extracted properly (" + name + ")",
           encodingTestKeywords, title);
       for (String keyword : encodingTestKeywords.split(",\\s*")) {
         Assert.assertTrue(keyword + " not found in text (" + name + ")",
             text.contains(keyword));
       }
       Assert.assertNotNull("No keywords extracted (" + name + ")", keywords);
       Assert.assertEquals("Keywords not extracted properly (" + name + ")",
           encodingTestKeywords, keywords);
     }
   }

   /**
    * Verifies that the single relative link of
    * {@link #resolveBaseUrlTestContent} is resolved to
    * <code>http://www.example.com/index.html</code>.
    */
   @Test
   public void testResolveBaseUrl() {
     byte[] contentBytes = resolveBaseUrlTestContent
         .getBytes(StandardCharsets.UTF_8);
     // parse using http://example.com/ as "fetch" URL
     Parse parse = parse(contentBytes);
     Assert.assertNotNull("No parse returned", parse);
     LOG.info(parse.getData().toString());
     Outlink[] outlinks = parse.getData().getOutlinks();
     Assert.assertEquals("Unexpected number of outlinks", 1, outlinks.length);
     Assert.assertEquals("http://www.example.com/index.html",
         outlinks[0].getToUrl());
   }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.nutch.parse.html;

	import java.lang.invoke.MethodHandles;
	import java.nio.charset.Charset;
	import java.nio.charset.StandardCharsets;

	import org.apache.hadoop.conf.Configuration;
	import org.apache.nutch.metadata.Metadata;
	import org.apache.nutch.parse.html.HtmlParser;
	import org.apache.nutch.parse.Outlink;
	import org.apache.nutch.parse.Parse;
	import org.apache.nutch.parse.Parser;
	import org.apache.nutch.protocol.Content;
	import org.apache.nutch.util.NutchConfiguration;
	import org.junit.Assert;
	import org.junit.Test;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;

	/**
	 * Tests for {@link HtmlParser}.
	 *
	 * <p>
	 * {@link #testEncodingDetection()} feeds the parser the same non-ASCII
	 * document, encoded and declared in several different ways, and checks that
	 * title, meta keywords and body text come back intact.
	 * {@link #testResolveBaseUrl()} checks that a relative link is resolved
	 * against a protocol-relative {@code <base href>} (NUTCH-2478).
	 * </p>
	 */
	public class TestHtmlParser {

	  private static final Logger LOG = LoggerFactory
	      .getLogger(MethodHandles.lookup().lookupClass());

	  /** Non-ASCII words used both as page title and as meta keywords. */
	  private static final String encodingTestKeywords = "français, español, русский язык, čeština, ελληνικά";

	  /** Body markup containing each of the keywords as a list item. */
	  private static final String encodingTestBody = "<ul>\n <li>français\n <li>español\n <li>русский язык\n <li>čeština\n <li>ελληνικά\n</ul>";

	  /**
	   * Common tail of every encoding test page: title, meta keywords, end of
	   * head, body and closing html tag. Each test page prepends its own
	   * doctype / head opening / charset declaration.
	   */
	  private static final String encodingTestContent = "<title>"
	      + encodingTestKeywords + "</title>\n"
	      + "<meta name=\"keywords\" content=\"" + encodingTestKeywords + "\" />\n"
	      + "</head>\n<body>" + encodingTestBody + "</body>\n</html>";

	  /**
	   * Encoding test pages. Each entry is
	   * <code>{ test name, charset used to encode the page, page markup }</code>.
	   */
	  private static final String[][] encodingTestPages = {
	      {
	          "HTML4, utf-8, meta http-equiv, no quotes",
	          "utf-8",
	          "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
	              + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
	              + "<html>\n<head>\n"
	              + "<meta http-equiv=Content-Type content=\"text/html; charset=utf-8\" />"
	              + encodingTestContent },
	      {
	          "HTML4, utf-8, meta http-equiv, single quotes",
	          "utf-8",
	          "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
	              + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
	              + "<html>\n<head>\n"
	              + "<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />"
	              + encodingTestContent },
	      {
	          // NOTE(review): this page opens <html> twice (once with the xmlns
	          // attribute, once without) -- confirm whether that is intentional
	          // before changing the fixture.
	          "XHTML, utf-8, meta http-equiv, double quotes",
	          "utf-8",
	          "<?xml version=\"1.0\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">"
	              + "<html>\n<head>\n"
	              + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />"
	              + encodingTestContent },
	      {
	          "HTML5, utf-8, meta charset",
	          "utf-8",
	          "<!DOCTYPE html>\n<html>\n<head>\n" + "<meta charset=\"utf-8\">"
	              + encodingTestContent },
	      // The two BOM pages carry no charset declaration in the markup; the
	      // only in-document hint is the leading U+FEFF.
	      { "HTML5, utf-8, BOM", "utf-8",
	          "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent },
	      // NOTE(review): Java's "UTF-16" encoder is documented to write its own
	      // byte-order mark, so the explicit \ufeff presumably yields a second
	      // one in the encoded bytes -- confirm this is the intended input.
	      { "HTML5, utf-16, BOM", "utf-16",
	          "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent } };

	  /**
	   * Page with a protocol-relative base URL and a single relative outlink.
	   */
	  private static final String resolveBaseUrlTestContent = //
	      "<html>\n<head>\n" + //
	      " <title>Test Resolve Base URLs (NUTCH-2478)</title>\n" + //
	      " <base href=\"//www.example.com/\">\n" + //
	      "</head>\n<body>\n" + //
	      " <a href=\"index.html\">outlink</a>\n" + //
	      "</body>\n</html>";

	  private final Configuration conf;
	  private final Parser parser;

	  /**
	   * Creates a Nutch configuration with <code>plugin.includes</code> set to
	   * <code>parse-html</code> and a {@link HtmlParser} configured with it.
	   */
	  public TestHtmlParser() {
	    conf = NutchConfiguration.create();
	    conf.set("plugin.includes", "parse-html");
	    parser = new HtmlParser();
	    parser.setConf(conf);
	  }

	  /**
	   * Parses the given bytes as a <code>text/html</code> document fetched from
	   * <code>http://example.com/</code>.
	   *
	   * @param contentBytes
	   *          raw (already encoded) document content
	   * @return the parse registered under the dummy URL in the parser's result
	   */
	  protected Parse parse(byte[] contentBytes) {
	    String dummyUrl = "http://example.com/";
	    Content content = new Content(dummyUrl, dummyUrl, contentBytes,
	        "text/html", new Metadata(), conf);
	    return parser.getParse(content).get(dummyUrl);
	  }

	  /**
	   * Encodes every page of {@link #encodingTestPages} with its charset, parses
	   * it and verifies that title, keywords and all keyword occurrences in the
	   * text are extracted unchanged.
	   */
	  @Test
	  public void testEncodingDetection() {
	    for (String[] testPage : encodingTestPages) {
	      String name = testPage[0];
	      Charset charset = Charset.forName(testPage[1]);
	      byte[] contentBytes = testPage[2].getBytes(charset);
	      Parse parse = parse(contentBytes);
	      // fail with the page name instead of a bare NullPointerException
	      Assert.assertNotNull("No parse returned (" + name + ")", parse);
	      String text = parse.getText();
	      String title = parse.getData().getTitle();
	      String keywords = parse.getData().getMeta("keywords");
	      LOG.info(name);
	      LOG.info("title:\t{}", title);
	      LOG.info("keywords:\t{}", keywords);
	      LOG.info("text:\t{}", text);
	      Assert.assertEquals("Title not extracted properly (" + name + ")",
	          encodingTestKeywords, title);
	      for (String keyword : encodingTestKeywords.split(",\\s*")) {
	        Assert.assertTrue(keyword + " not found in text (" + name + ")",
	            text.contains(keyword));
	      }
	      Assert.assertNotNull("No keywords extracted (" + name + ")", keywords);
	      Assert.assertEquals("Keywords not extracted properly (" + name + ")",
	          encodingTestKeywords, keywords);
	    }
	  }

	  /**
	   * Verifies that the single relative link of
	   * {@link #resolveBaseUrlTestContent} is resolved to
	   * <code>http://www.example.com/index.html</code>.
	   */
	  @Test
	  public void testResolveBaseUrl() {
	    byte[] contentBytes = resolveBaseUrlTestContent
	        .getBytes(StandardCharsets.UTF_8);
	    // parse using http://example.com/ as "fetch" URL
	    Parse parse = parse(contentBytes);
	    Assert.assertNotNull("No parse returned", parse);
	    LOG.info(parse.getData().toString());
	    Outlink[] outlinks = parse.getData().getOutlinks();
	    Assert.assertEquals("Unexpected number of outlinks", 1, outlinks.length);
	    Assert.assertEquals("http://www.example.com/index.html",
	        outlinks[0].getToUrl());
	  }

	}