blob: 0795a9a05b8238fe9e74589cf9f4efc997d5a060 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.parse.headings;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.html.dom.HTMLDocumentImpl;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.*;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NutchConfiguration;
import org.cyberneko.html.parsers.DOMFragmentParser;
import org.junit.Assert;
import org.junit.Test;
import org.w3c.dom.DocumentFragment;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
/**
 * Unit test for {@link HeadingsParseFilter}.
 *
 * <p>
 * The single test parses a small HTML document into a DOM fragment, runs the
 * filter over it and checks the value stored under the {@code h1} key of the
 * resulting parse metadata.
 * </p>
 */
public class TestHeadingsParseFilter {

  /** URL used for the content, its base and as the key of the parse result. */
  private static final String URL = "http://www.foo.com/";

  /** HTML fixture: an {@code h1} whose text is split by a nested span. */
  private static final String HTML = "<html><head><title>test header with span element</title></head>"
      + "<body><h1>header with <span>span element</span></h1></body></html>";

  /** Configuration handed to the filter and to the {@link Content} under test. */
  private static final Configuration conf = NutchConfiguration.create();

  /**
   * Checks that the text of an {@code h1} element is extracted together with
   * the text of a {@code span} nested inside it.
   *
   * @throws IOException
   *           if the HTML fixture cannot be read by the parser
   * @throws SAXException
   *           if the HTML fixture cannot be parsed
   */
  @Test
  public void testExtractHeadingFromNestedNodes()
      throws IOException, SAXException {
    // Set the "headings" property before handing the configuration over.
    conf.setStrings("headings", "h1", "h2");
    HtmlParseFilter filter = new HeadingsParseFilter();
    filter.setConf(conf);

    // The raw content is empty: the filter is given the DOM fragment built
    // below. Bytes are encoded explicitly as UTF-8 so the test does not depend
    // on the platform default charset and matches the declared content type.
    Content content = new Content(URL, URL,
        "".getBytes(StandardCharsets.UTF_8), "text/html; charset=UTF-8",
        new Metadata(), conf);

    ParseImpl parse = new ParseImpl("foo bar", new ParseData());
    ParseResult parseResult = ParseResult.createParseResult(URL, parse);
    HTMLMetaTags metaTags = new HTMLMetaTags();

    // Build the DOM fragment the filter operates on.
    DOMFragmentParser parser = new DOMFragmentParser();
    DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
    InputSource input = new InputSource(
        new ByteArrayInputStream(HTML.getBytes(StandardCharsets.UTF_8)));
    parser.parse(input, node);

    parseResult = filter.filter(content, parseResult, metaTags, node);

    Assert.assertEquals(
        "The h1 tag must include the content of the inner span node",
        "header with span element",
        parseResult.get(content.getUrl()).getData().getParseMeta().get("h1"));
  }
}