src/plugin/headings/src/test/org/apache/nutch/parse/headings/TestHeadingsParseFilter.java - nutch - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.nutch.parse.headings;

 import org.apache.hadoop.conf.Configuration;
 import org.apache.html.dom.HTMLDocumentImpl;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.parse.*;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.util.NutchConfiguration;

 import java.io.ByteArrayInputStream;
 import java.io.IOException;

 import org.w3c.dom.DocumentFragment;
 import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
 import org.cyberneko.html.parsers.DOMFragmentParser;
 import org.junit.Assert;
 import org.junit.Test;

 /**
  * Unit test for {@link HeadingsParseFilter}.
  *
  * <p>Checks that the text stored for a heading element also contains the
  * text of elements nested inside that heading.
  */
 public class TestHeadingsParseFilter {

   /**
    * Charset used for every String-to-byte conversion in this test. Referenced
    * by its fully-qualified name so the file's import block stays untouched.
    */
   private static final java.nio.charset.Charset UTF_8 =
       java.nio.charset.StandardCharsets.UTF_8;

   /** URL used both as the content URL and as the key of the parse result. */
   private static final String URL = "http://www.foo.com/";

   /** Configuration handed to the filter and to the {@link Content} under test. */
   private static final Configuration conf = NutchConfiguration.create();

   /**
    * Parses a small HTML document whose {@code h1} wraps a {@code span}, runs
    * the headings filter over the resulting DOM fragment and asserts that the
    * {@code h1} entry of the parse metadata equals the full heading text,
    * including the text of the nested {@code span}.
    *
    * @throws IOException  if the HTML parser fails to read the input
    * @throws SAXException if the HTML parser rejects the input
    */
   @Test
   public void testExtractHeadingFromNestedNodes()
       throws IOException, SAXException {

     // Ask the filter to look at h1 and h2 elements.
     conf.setStrings("headings", "h1", "h2");
     HtmlParseFilter filter = new HeadingsParseFilter();
     filter.setConf(conf);

     // The raw content is empty: the filter is driven by the DOM fragment below.
     Content content = new Content(URL, URL, "".getBytes(UTF_8),
         "text/html; charset=UTF-8", new Metadata(), conf);
     ParseImpl parse = new ParseImpl("foo bar", new ParseData());
     ParseResult parseResult = ParseResult.createParseResult(URL, parse);
     HTMLMetaTags metaTags = new HTMLMetaTags();

     String html = "<html><head><title>test header with span element</title></head><body><h1>header with <span>span element</span></h1></body></html>";

     // Encode with an explicit charset; the no-argument getBytes() would use
     // the platform default and make the fixture depend on the environment.
     DOMFragmentParser parser = new DOMFragmentParser();
     DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
     parser.parse(
         new InputSource(new ByteArrayInputStream(html.getBytes(UTF_8))), node);

     parseResult = filter.filter(content, parseResult, metaTags, node);

     Assert.assertEquals(
         "The h1 tag must include the content of the inner span node",
         "header with span element",
         parseResult.get(content.getUrl()).getData().getParseMeta().get("h1"));
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.nutch.parse.headings;

	import org.apache.hadoop.conf.Configuration;
	import org.apache.html.dom.HTMLDocumentImpl;
	import org.apache.nutch.metadata.Metadata;
	import org.apache.nutch.parse.*;
	import org.apache.nutch.protocol.Content;
	import org.apache.nutch.util.NutchConfiguration;

	import java.io.ByteArrayInputStream;
	import java.io.IOException;

	import org.w3c.dom.DocumentFragment;
	import org.xml.sax.InputSource;
	import org.xml.sax.SAXException;
	import org.cyberneko.html.parsers.DOMFragmentParser;
	import org.junit.Assert;
	import org.junit.Test;

	/**
	 * Unit test for {@link HeadingsParseFilter}.
	 *
	 * <p>Checks that the text stored for a heading element also contains the
	 * text of elements nested inside that heading.
	 */
	public class TestHeadingsParseFilter {

	  /**
	   * Charset used for every String-to-byte conversion in this test. Referenced
	   * by its fully-qualified name so the file's import block stays untouched.
	   */
	  private static final java.nio.charset.Charset UTF_8 =
	      java.nio.charset.StandardCharsets.UTF_8;

	  /** URL used both as the content URL and as the key of the parse result. */
	  private static final String URL = "http://www.foo.com/";

	  /** Configuration handed to the filter and to the {@link Content} under test. */
	  private static final Configuration conf = NutchConfiguration.create();

	  /**
	   * Parses a small HTML document whose {@code h1} wraps a {@code span}, runs
	   * the headings filter over the resulting DOM fragment and asserts that the
	   * {@code h1} entry of the parse metadata equals the full heading text,
	   * including the text of the nested {@code span}.
	   *
	   * @throws IOException  if the HTML parser fails to read the input
	   * @throws SAXException if the HTML parser rejects the input
	   */
	  @Test
	  public void testExtractHeadingFromNestedNodes()
	      throws IOException, SAXException {

	    // Ask the filter to look at h1 and h2 elements.
	    conf.setStrings("headings", "h1", "h2");
	    HtmlParseFilter filter = new HeadingsParseFilter();
	    filter.setConf(conf);

	    // The raw content is empty: the filter is driven by the DOM fragment below.
	    Content content = new Content(URL, URL, "".getBytes(UTF_8),
	        "text/html; charset=UTF-8", new Metadata(), conf);
	    ParseImpl parse = new ParseImpl("foo bar", new ParseData());
	    ParseResult parseResult = ParseResult.createParseResult(URL, parse);
	    HTMLMetaTags metaTags = new HTMLMetaTags();

	    String html = "<html><head><title>test header with span element</title></head><body><h1>header with <span>span element</span></h1></body></html>";

	    // Encode with an explicit charset; the no-argument getBytes() would use
	    // the platform default and make the fixture depend on the environment.
	    DOMFragmentParser parser = new DOMFragmentParser();
	    DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
	    parser.parse(
	        new InputSource(new ByteArrayInputStream(html.getBytes(UTF_8))), node);

	    parseResult = filter.filter(content, parseResult, metaTags, node);

	    Assert.assertEquals(
	        "The h1 tag must include the content of the inner span node",
	        "header with span element",
	        parseResult.get(content.getUrl()).getData().getParseMeta().get("h1"));
	  }
	}