blob: e2ae01929f36e7638c6c38c0aab1d210c891eb34 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.sax;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.util.ArrayList;
import java.util.List;
import org.apache.tika.config.TikaConfigTest;
import org.apache.tika.metadata.Metadata;
import org.junit.Before;
import org.junit.Test;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
/**
* Unit tests for the {@link XHTMLContentHandler} class.
*/
public class XHTMLContentHandlerTest {
private ContentHandler output;
private XHTMLContentHandler xhtml;
@Before
public void setUp() {
output = new BodyContentHandler();
xhtml = new XHTMLContentHandler(output, new Metadata());
}
/**
* Test that content in block elements are properly separated in text
* output.
*
* @see <a href="https://issues.apache.org/jira/browse/TIKA-188">TIKA-188</a>
*/
@Test
public void testExtraWhitespace() throws SAXException {
xhtml.startDocument();
xhtml.element("p", "foo");
xhtml.startElement("p");
xhtml.characters("b");
xhtml.element("b", "a"); // inlines should not cause extra whitespace
xhtml.characters("r");
xhtml.endElement("p");
xhtml.startElement("table");
xhtml.startElement("tr");
xhtml.element("th", "x");
xhtml.element("th", "y");
xhtml.endElement("tr");
xhtml.startElement("tr");
xhtml.element("td", "a");
xhtml.element("td", "b");
xhtml.endElement("tr");
xhtml.endElement("table");
xhtml.endDocument();
String[] words = output.toString().split("\\s+");
assertEquals(6, words.length);
assertEquals("foo", words[0]);
assertEquals("bar", words[1]);
assertEquals("x", words[2]);
assertEquals("y", words[3]);
assertEquals("a", words[4]);
assertEquals("b", words[5]);
}
/**
* Test that content in option elements are properly separated in text
* output.
*
* @see <a href="https://issues.apache.org/jira/browse/TIKA-394">TIKA-394</a>
*/
@Test
public void testWhitespaceWithOptions() throws Exception {
xhtml.startDocument();
xhtml.startElement("form");
xhtml.startElement("select");
xhtml.element("option", "opt1");
xhtml.element("option", "opt2");
xhtml.endElement("select");
xhtml.endElement("form");
xhtml.endDocument();
String[] words = output.toString().split("\\s+");
assertEquals(2, words.length);
assertEquals("opt1", words[0]);
assertEquals("opt2", words[1]);
}
@Test
public void testWhitespaceWithMenus() throws Exception {
xhtml.startDocument();
xhtml.startElement("menu");
xhtml.element("li", "one");
xhtml.element("li", "two");
xhtml.endElement("menu");
xhtml.endDocument();
String[] words = getRealWords(output.toString());
assertEquals(2, words.length);
assertEquals("one", words[0]);
assertEquals("two", words[1]);
}
@Test
public void testAttributesOnBody() throws Exception {
ToHTMLContentHandler toHTMLContentHandler = new ToHTMLContentHandler();
XHTMLContentHandler xhtmlContentHandler = new XHTMLContentHandler(toHTMLContentHandler, new Metadata());
AttributesImpl attributes = new AttributesImpl();
attributes.addAttribute(XHTMLContentHandler.XHTML, "itemscope", "itemscope", "", "");
attributes.addAttribute(XHTMLContentHandler.XHTML, "itemtype", "itemtype", "", "http://schema.org/Event");
xhtmlContentHandler.startDocument();
xhtmlContentHandler.startElement(XHTMLContentHandler.XHTML, "body", "body", attributes);
xhtmlContentHandler.endElement("body");
xhtmlContentHandler.endDocument();
assertTrue(toHTMLContentHandler.toString().contains("itemscope"));
}
@Test
public void testAttributesOnHtml() throws Exception {
ToHTMLContentHandler toHTMLContentHandler = new ToHTMLContentHandler();
XHTMLContentHandler xhtmlContentHandler = new XHTMLContentHandler(toHTMLContentHandler, new Metadata());
AttributesImpl attributes = new AttributesImpl();
attributes.addAttribute(XHTMLContentHandler.XHTML, "itemscope", "itemscope", "", "");
attributes.addAttribute(XHTMLContentHandler.XHTML, "itemtype", "itemtype", "", "http://schema.org/Event");
xhtmlContentHandler.startDocument();
xhtmlContentHandler.startElement(XHTMLContentHandler.XHTML, "html", "html", attributes);
xhtmlContentHandler.endElement("html");
xhtmlContentHandler.endDocument();
assertTrue(toHTMLContentHandler.toString().contains("itemscope"));
}
@Test
public void testInvalidControlCharacter0x7F() throws Exception {
xhtml.startDocument();
xhtml.startElement("menu");
xhtml.element("li", "a\u007Fz");
xhtml.endElement("menu");
xhtml.endDocument();
String[] words = getRealWords(output.toString());
System.out.println(words[0]);
assertEquals(1, words.length);
assertEquals("a\ufffdz", words[0]);
}
@Test
public void testInvalidControlCharacter0x9F() throws Exception {
xhtml.startDocument();
xhtml.startElement("menu");
xhtml.element("li", "a\u009Fz");
xhtml.endElement("menu");
xhtml.endDocument();
String[] words = getRealWords(output.toString());
System.out.println(words[0]);
assertEquals(1, words.length);
assertEquals("a\ufffdz", words[0]);
}
@Test
public void testInvalidControlCharacter0x93() throws Exception {
xhtml.startDocument();
xhtml.startElement("menu");
xhtml.element("li", "a\u0093z");
xhtml.endElement("menu");
xhtml.endDocument();
String[] words = getRealWords(output.toString());
System.out.println(words[0]);
assertEquals(1, words.length);
assertEquals("a\ufffdz", words[0]);
}
/**
* Return array of non-zerolength words. Splitting on whitespace will get us
* empty words for emptylines.
*
* @param string some mix of newlines and real words
* @return array of real words.
*/
private static String[] getRealWords(String string) {
String[] possibleWords = string.split("\\s+");
List<String> words = new ArrayList<String>(possibleWords.length);
for (String word : possibleWords) {
if (word.length() > 0) {
words.add(word);
}
}
return words.toArray(new String[words.size()]);
}
}