src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java - nutch - Git at Google

 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.nutch.parse.html;

 import java.nio.ByteBuffer;
 import java.nio.charset.Charset;

 import org.apache.avro.util.Utf8;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.Parser;
 import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.util.Bytes;
 import org.apache.nutch.util.NutchConfiguration;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.junit.Before;
 import org.junit.Test;

 import static org.junit.Assert.*;

 /**
  * Tests that {@link HtmlParser} decodes non-ASCII documents correctly when the
  * character encoding is announced in different ways: a meta http-equiv
  * element, a meta charset element, or a byte order mark.
  */
 public class TestHtmlParser {

   public static final Logger LOG = LoggerFactory
       .getLogger(TestHtmlParser.class);

   /** Non-ASCII keywords used as document title and as meta keywords. */
   private static final String ENCODING_TEST_KEYWORDS = "français, español, русский язык, čeština, ελληνικά";

   /** Body markup that lists every keyword once. */
   private static final String ENCODING_TEST_BODY = "<ul>\n  <li>français\n  <li>español\n  <li>русский язык\n  <li>čeština\n  <li>ελληνικά\n</ul>";

   /**
    * Tail shared by all test pages: title, meta keywords, end of head, body.
    * The keywords element closes its content attribute and the tag itself so
    * that the attribute value does not run on into the rest of the document.
    */
   private static final String ENCODING_TEST_CONTENT = "<title>"
       + ENCODING_TEST_KEYWORDS + "</title>\n"
       + "<meta name=\"keywords\" content=\"" + ENCODING_TEST_KEYWORDS
       + "\" />\n" + "</head>\n<body>" + ENCODING_TEST_BODY + "</body>\n</html>";

   /**
    * Test pages, one row per page: { test name, charset used to encode the
    * page, page markup }.
    */
   private static final String[][] ENCODING_TEST_PAGES = {
       {
           "HTML4, utf-8, meta http-equiv, no quotes",
           "utf-8",
           "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
               + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
               + "<html>\n<head>\n"
               + "<meta http-equiv=Content-Type content=\"text/html; charset=utf-8\" />"
               + ENCODING_TEST_CONTENT },
       {
           "HTML4, utf-8, meta http-equiv, single quotes",
           "utf-8",
           "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
               + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
               + "<html>\n<head>\n"
               + "<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />"
               + ENCODING_TEST_CONTENT },
       {
           "XHTML, utf-8, meta http-equiv, double quotes",
           "utf-8",
           // a single root element: the namespaced <html> is the only one
           "<?xml version=\"1.0\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">"
               + "\n<head>\n"
               + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />"
               + ENCODING_TEST_CONTENT },
       {
           "HTML5, utf-8, meta charset",
           "utf-8",
           "<!DOCTYPE html>\n<html>\n<head>\n" + "<meta charset=\"utf-8\">"
               + ENCODING_TEST_CONTENT },
       { "HTML5, utf-8, BOM", "utf-8",
           "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + ENCODING_TEST_CONTENT },
       // NOTE(review): Java's "UTF-16" encoder writes a byte order mark by
       // itself, so the explicit \ufeff below results in a second U+FEFF in
       // the encoded bytes -- confirm this is intended.
       { "HTML5, utf-16, BOM", "utf-16",
           "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + ENCODING_TEST_CONTENT } };

   /** URL used both as page base URL and as the URL handed to the parser. */
   private static final String DUMMY_URL = "http://dummy.url/";

   private Configuration conf;
   private Parser parser;

   /** Creates a fresh configuration and parser before every test. */
   @Before
   public void setup() {
     conf = NutchConfiguration.create();
     parser = new HtmlParser();
     parser.setConf(conf);
   }

   /**
    * Wraps raw content in a {@link WebPage} with content type text/html and
    * the dummy base URL.
    *
    * @param contentBytes
    *          encoded page content
    * @return the page to be parsed
    */
   protected WebPage page(byte[] contentBytes) {
     WebPage page = WebPage.newBuilder().build();
     page.setBaseUrl(new Utf8(DUMMY_URL));
     page.setContent(ByteBuffer.wrap(contentBytes));
     page.setContentType(new Utf8("text/html"));
     return page;
   }

   /**
    * Runs the parser under test on the given page.
    *
    * @param page
    *          page holding the content to parse
    * @return the parse result as returned by the parser
    */
   protected Parse parse(WebPage page) {
     return parser.getParse(DUMMY_URL, page);
   }

   /**
    * Encodes every test page with its charset, parses it and checks that
    * title, body text and (when present) keywords are decoded without loss.
    */
   @Test
   public void testEncodingDetection() {
     for (String[] testPage : ENCODING_TEST_PAGES) {
       String name = testPage[0];
       Charset charset = Charset.forName(testPage[1]);
       byte[] contentBytes = testPage[2].getBytes(charset);
       WebPage page = page(contentBytes);
       Parse parse = parse(page);
       // fail with the test page name instead of a bare NullPointerException
       assertNotNull("No parse result (" + name + ")", parse);
       String text = parse.getText();
       String title = parse.getTitle();
       String keywords = Bytes.toString(page.getMetadata().get(
           new Utf8("keywords")));
       LOG.info(name);
       LOG.info("title:\t{}", title);
       LOG.info("keywords:\t{}", keywords);
       LOG.info("text:\t{}", text);
       assertEquals("Title not extracted properly (" + name + ")",
           ENCODING_TEST_KEYWORDS, title);
       for (String keyword : ENCODING_TEST_KEYWORDS.split(",\\s*")) {
         assertTrue(keyword + " not found in text (" + name + ")",
             text.contains(keyword));
       }
       // keywords are compared only if the page metadata holds a value for
       // them after parsing
       if (keywords != null) {
         assertEquals("Keywords not extracted properly (" + name + ")",
             ENCODING_TEST_KEYWORDS, keywords);
       }
     }
   }

 }
	/**
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.nutch.parse.html;

	import java.nio.ByteBuffer;
	import java.nio.charset.Charset;

	import org.apache.avro.util.Utf8;
	import org.apache.hadoop.conf.Configuration;
	import org.apache.nutch.parse.Parse;
	import org.apache.nutch.parse.Parser;
	import org.apache.nutch.storage.WebPage;
	import org.apache.nutch.util.Bytes;
	import org.apache.nutch.util.NutchConfiguration;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;
	import org.junit.Before;
	import org.junit.Test;

	import static org.junit.Assert.*;

	/**
	 * Tests that {@link HtmlParser} decodes non-ASCII documents correctly when the
	 * character encoding is announced in different ways: a meta http-equiv
	 * element, a meta charset element, or a byte order mark.
	 */
	public class TestHtmlParser {

	  public static final Logger LOG = LoggerFactory
	      .getLogger(TestHtmlParser.class);

	  /** Non-ASCII keywords used as document title and as meta keywords. */
	  private static final String ENCODING_TEST_KEYWORDS = "français, español, русский язык, čeština, ελληνικά";

	  /** Body markup that lists every keyword once. */
	  private static final String ENCODING_TEST_BODY = "<ul>\n <li>français\n <li>español\n <li>русский язык\n <li>čeština\n <li>ελληνικά\n</ul>";

	  /**
	   * Tail shared by all test pages: title, meta keywords, end of head, body.
	   * The keywords element closes its content attribute and the tag itself so
	   * that the attribute value does not run on into the rest of the document.
	   */
	  private static final String ENCODING_TEST_CONTENT = "<title>"
	      + ENCODING_TEST_KEYWORDS + "</title>\n"
	      + "<meta name=\"keywords\" content=\"" + ENCODING_TEST_KEYWORDS
	      + "\" />\n" + "</head>\n<body>" + ENCODING_TEST_BODY + "</body>\n</html>";

	  /**
	   * Test pages, one row per page: { test name, charset used to encode the
	   * page, page markup }.
	   */
	  private static final String[][] ENCODING_TEST_PAGES = {
	      {
	          "HTML4, utf-8, meta http-equiv, no quotes",
	          "utf-8",
	          "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
	              + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
	              + "<html>\n<head>\n"
	              + "<meta http-equiv=Content-Type content=\"text/html; charset=utf-8\" />"
	              + ENCODING_TEST_CONTENT },
	      {
	          "HTML4, utf-8, meta http-equiv, single quotes",
	          "utf-8",
	          "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
	              + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
	              + "<html>\n<head>\n"
	              + "<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />"
	              + ENCODING_TEST_CONTENT },
	      {
	          "XHTML, utf-8, meta http-equiv, double quotes",
	          "utf-8",
	          // a single root element: the namespaced <html> is the only one
	          "<?xml version=\"1.0\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">"
	              + "\n<head>\n"
	              + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />"
	              + ENCODING_TEST_CONTENT },
	      {
	          "HTML5, utf-8, meta charset",
	          "utf-8",
	          "<!DOCTYPE html>\n<html>\n<head>\n" + "<meta charset=\"utf-8\">"
	              + ENCODING_TEST_CONTENT },
	      { "HTML5, utf-8, BOM", "utf-8",
	          "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + ENCODING_TEST_CONTENT },
	      // NOTE(review): Java's "UTF-16" encoder writes a byte order mark by
	      // itself, so the explicit \ufeff below results in a second U+FEFF in
	      // the encoded bytes -- confirm this is intended.
	      { "HTML5, utf-16, BOM", "utf-16",
	          "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + ENCODING_TEST_CONTENT } };

	  /** URL used both as page base URL and as the URL handed to the parser. */
	  private static final String DUMMY_URL = "http://dummy.url/";

	  private Configuration conf;
	  private Parser parser;

	  /** Creates a fresh configuration and parser before every test. */
	  @Before
	  public void setup() {
	    conf = NutchConfiguration.create();
	    parser = new HtmlParser();
	    parser.setConf(conf);
	  }

	  /**
	   * Wraps raw content in a {@link WebPage} with content type text/html and
	   * the dummy base URL.
	   *
	   * @param contentBytes
	   *          encoded page content
	   * @return the page to be parsed
	   */
	  protected WebPage page(byte[] contentBytes) {
	    WebPage page = WebPage.newBuilder().build();
	    page.setBaseUrl(new Utf8(DUMMY_URL));
	    page.setContent(ByteBuffer.wrap(contentBytes));
	    page.setContentType(new Utf8("text/html"));
	    return page;
	  }

	  /**
	   * Runs the parser under test on the given page.
	   *
	   * @param page
	   *          page holding the content to parse
	   * @return the parse result as returned by the parser
	   */
	  protected Parse parse(WebPage page) {
	    return parser.getParse(DUMMY_URL, page);
	  }

	  /**
	   * Encodes every test page with its charset, parses it and checks that
	   * title, body text and (when present) keywords are decoded without loss.
	   */
	  @Test
	  public void testEncodingDetection() {
	    for (String[] testPage : ENCODING_TEST_PAGES) {
	      String name = testPage[0];
	      Charset charset = Charset.forName(testPage[1]);
	      byte[] contentBytes = testPage[2].getBytes(charset);
	      WebPage page = page(contentBytes);
	      Parse parse = parse(page);
	      // fail with the test page name instead of a bare NullPointerException
	      assertNotNull("No parse result (" + name + ")", parse);
	      String text = parse.getText();
	      String title = parse.getTitle();
	      String keywords = Bytes.toString(page.getMetadata().get(
	          new Utf8("keywords")));
	      LOG.info(name);
	      LOG.info("title:\t{}", title);
	      LOG.info("keywords:\t{}", keywords);
	      LOG.info("text:\t{}", text);
	      assertEquals("Title not extracted properly (" + name + ")",
	          ENCODING_TEST_KEYWORDS, title);
	      for (String keyword : ENCODING_TEST_KEYWORDS.split(",\\s*")) {
	        assertTrue(keyword + " not found in text (" + name + ")",
	            text.contains(keyword));
	      }
	      // keywords are compared only if the page metadata holds a value for
	      // them after parsing
	      if (keywords != null) {
	        assertEquals("Keywords not extracted properly (" + name + ")",
	            ENCODING_TEST_KEYWORDS, keywords);
	      }
	    }
	  }

	}