| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.tika.sax; |
| |
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import java.util.Collections;
import java.util.Set;

import org.junit.Test;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.xml.sax.helpers.DefaultHandler;

import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
| |
| /** |
| * Test cases for the {@link org.apache.tika.sax.BodyContentHandler} class. |
| */ |
| public class BasicContentHandlerFactoryTest { |
| |
| private static final String ENCODING = UTF_8.name(); |
| //default max char len (at least in WriteOutContentHandler is 100k) |
| private static final int OVER_DEFAULT = 120000; |
| |
| //copied from TikaTest in tika-parsers package |
| public static void assertNotContains(String needle, String haystack) { |
| assertFalse(needle + " found in:\n" + haystack, haystack.contains(needle)); |
| } |
| |
| public static void assertNotContains(String needle, byte[] hayStack) |
| throws UnsupportedEncodingException { |
| assertNotContains(needle, new String(hayStack, ENCODING)); |
| } |
| |
| public static void assertContains(String needle, String haystack) { |
| assertTrue(needle + " not found in:\n" + haystack, haystack.contains(needle)); |
| } |
| |
| public static void assertContains(String needle, byte[] hayStack) |
| throws UnsupportedEncodingException { |
| assertContains(needle, new String(hayStack, ENCODING)); |
| } |
| |
| @Test |
| public void testIgnore() throws Exception { |
| Parser p = new MockParser(OVER_DEFAULT); |
| ContentHandler handler = |
| new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1) |
| .getNewContentHandler(); |
| assertTrue(handler instanceof DefaultHandler); |
| p.parse(null, handler, null, null); |
| //unfortunatley, the DefaultHandler does not return "", |
| assertContains("org.xml.sax.helpers.DefaultHandler", handler.toString()); |
| |
| //tests that no write limit exception is thrown |
| p = new MockParser(100); |
| handler = new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, 5) |
| .getNewContentHandler(); |
| assertTrue(handler instanceof DefaultHandler); |
| p.parse(null, handler, null, null); |
| assertContains("org.xml.sax.helpers.DefaultHandler", handler.toString()); |
| } |
| |
| @Test |
| public void testText() throws Exception { |
| Parser p = new MockParser(OVER_DEFAULT); |
| BasicContentHandlerFactory.HANDLER_TYPE type = BasicContentHandlerFactory.HANDLER_TYPE.TEXT; |
| ContentHandler handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(); |
| |
| assertTrue(handler instanceof ToTextContentHandler); |
| p.parse(null, handler, null, null); |
| String extracted = handler.toString(); |
| assertContains("This is the title", extracted); |
| assertContains("aaaaaaaaaa", extracted); |
| assertNotContains("<body", extracted); |
| assertNotContains("<html", extracted); |
| assertTrue(extracted.length() > 110000); |
| //now test write limit |
| p = new MockParser(10); |
| handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(); |
| assertTrue(handler instanceof WriteOutContentHandler); |
| assertWriteLimitReached(p, (WriteOutContentHandler) handler); |
| extracted = handler.toString(); |
| assertContains("This ", extracted); |
| assertNotContains("aaaa", extracted); |
| |
| //now test outputstream call |
| p = new MockParser(OVER_DEFAULT); |
| ByteArrayOutputStream os = new ByteArrayOutputStream(); |
| handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, ENCODING); |
| assertTrue(handler instanceof ToTextContentHandler); |
| p.parse(null, handler, null, null); |
| assertContains("This is the title", os.toByteArray()); |
| assertContains("aaaaaaaaaa", os.toByteArray()); |
| assertTrue(os.toByteArray().length > 110000); |
| assertNotContains("<body", os.toByteArray()); |
| assertNotContains("<html", os.toByteArray()); |
| |
| p = new MockParser(10); |
| os = new ByteArrayOutputStream(); |
| handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(os, ENCODING); |
| assertTrue(handler instanceof WriteOutContentHandler); |
| assertWriteLimitReached(p, (WriteOutContentHandler) handler); |
| //When writing to an OutputStream and a write limit is reached, |
| //currently, nothing is written. |
| assertEquals(0, os.toByteArray().length); |
| } |
| |
| @Test |
| public void testHTML() throws Exception { |
| Parser p = new MockParser(OVER_DEFAULT); |
| BasicContentHandlerFactory.HANDLER_TYPE type = BasicContentHandlerFactory.HANDLER_TYPE.HTML; |
| ContentHandler handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(); |
| |
| assertTrue(handler instanceof ToHTMLContentHandler); |
| p.parse(null, handler, null, null); |
| String extracted = handler.toString(); |
| assertContains("<head><title>This is the title", extracted); |
| assertContains("aaaaaaaaaa", extracted); |
| assertTrue(extracted.length() > 110000); |
| |
| //now test write limit |
| p = new MockParser(10); |
| handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(); |
| assertTrue(handler instanceof WriteOutContentHandler); |
| assertWriteLimitReached(p, (WriteOutContentHandler) handler); |
| extracted = handler.toString(); |
| assertContains("This ", extracted); |
| assertNotContains("aaaa", extracted); |
| |
| //now test outputstream call |
| p = new MockParser(OVER_DEFAULT); |
| ByteArrayOutputStream os = new ByteArrayOutputStream(); |
| handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, ENCODING); |
| assertTrue(handler instanceof ToHTMLContentHandler); |
| p.parse(null, handler, null, null); |
| assertContains("This is the title", os.toByteArray()); |
| assertContains("aaaaaaaaaa", os.toByteArray()); |
| assertContains("<body", os.toByteArray()); |
| assertContains("<html", os.toByteArray()); |
| assertTrue(os.toByteArray().length > 110000); |
| |
| |
| p = new MockParser(10); |
| os = new ByteArrayOutputStream(); |
| handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(os, ENCODING); |
| assertTrue(handler instanceof WriteOutContentHandler); |
| assertWriteLimitReached(p, (WriteOutContentHandler) handler); |
| assertEquals(0, os.toByteArray().length); |
| } |
| |
| @Test |
| public void testXML() throws Exception { |
| Parser p = new MockParser(OVER_DEFAULT); |
| BasicContentHandlerFactory.HANDLER_TYPE type = BasicContentHandlerFactory.HANDLER_TYPE.HTML; |
| ContentHandler handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(); |
| |
| assertTrue(handler instanceof ToXMLContentHandler); |
| p.parse(null, handler, new Metadata(), null); |
| String extracted = handler.toString(); |
| assertContains("<head><title>This is the title", extracted); |
| assertContains("aaaaaaaaaa", extracted); |
| assertTrue(handler.toString().length() > 110000); |
| |
| //now test write limit |
| p = new MockParser(10); |
| handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(); |
| assertTrue(handler instanceof WriteOutContentHandler); |
| assertWriteLimitReached(p, (WriteOutContentHandler) handler); |
| extracted = handler.toString(); |
| assertContains("This ", extracted); |
| assertNotContains("aaaa", extracted); |
| |
| //now test outputstream call |
| p = new MockParser(OVER_DEFAULT); |
| ByteArrayOutputStream os = new ByteArrayOutputStream(); |
| handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, ENCODING); |
| assertTrue(handler instanceof ToXMLContentHandler); |
| p.parse(null, handler, null, null); |
| |
| assertContains("This is the title", os.toByteArray()); |
| assertContains("aaaaaaaaaa", os.toByteArray()); |
| assertContains("<body", os.toByteArray()); |
| assertContains("<html", os.toByteArray()); |
| assertTrue(os.toByteArray().length > 110000); |
| |
| |
| p = new MockParser(10); |
| os = new ByteArrayOutputStream(); |
| handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(os, ENCODING); |
| assertTrue(handler instanceof WriteOutContentHandler); |
| assertWriteLimitReached(p, (WriteOutContentHandler) handler); |
| assertEquals(0, os.toByteArray().length); |
| } |
| |
| @Test |
| public void testBody() throws Exception { |
| Parser p = new MockParser(OVER_DEFAULT); |
| BasicContentHandlerFactory.HANDLER_TYPE type = BasicContentHandlerFactory.HANDLER_TYPE.BODY; |
| ContentHandler handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(); |
| |
| assertTrue(handler instanceof BodyContentHandler); |
| |
| p.parse(null, handler, null, null); |
| String extracted = handler.toString(); |
| assertNotContains("title", extracted); |
| assertContains("aaaaaaaaaa", extracted); |
| assertTrue(extracted.length() > 110000); |
| |
| //now test write limit |
| p = new MockParser(10); |
| handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(); |
| assertTrue(handler instanceof BodyContentHandler); |
| assertWriteLimitReached(p, (BodyContentHandler) handler); |
| extracted = handler.toString(); |
| assertNotContains("This ", extracted); |
| assertContains("aaaa", extracted); |
| |
| //now test outputstream call |
| p = new MockParser(OVER_DEFAULT); |
| ByteArrayOutputStream os = new ByteArrayOutputStream(); |
| handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, ENCODING); |
| assertTrue(handler instanceof BodyContentHandler); |
| p.parse(null, handler, null, null); |
| assertNotContains("title", os.toByteArray()); |
| assertContains("aaaaaaaaaa", os.toByteArray()); |
| assertNotContains("<body", os.toByteArray()); |
| assertNotContains("<html", os.toByteArray()); |
| assertTrue(os.toByteArray().length > 110000); |
| |
| p = new MockParser(10); |
| os = new ByteArrayOutputStream(); |
| handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(os, ENCODING); |
| assertTrue(handler instanceof WriteOutContentHandler); |
| assertWriteLimitReached(p, (WriteOutContentHandler) handler); |
| assertEquals(0, os.toByteArray().length); |
| } |
| |
| private void assertWriteLimitReached(Parser p, WriteOutContentHandler handler) |
| throws Exception { |
| boolean wlr = false; |
| try { |
| p.parse(null, handler, null, null); |
| } catch (SAXException e) { |
| if (!WriteLimitReachedException.isWriteLimitReached(e)) { |
| throw e; |
| } |
| wlr = true; |
| } |
| assertTrue("WriteLimitReached", wlr); |
| } |
| |
| //TODO: is there a better way than to repeat this with diff signature? |
| private void assertWriteLimitReached(Parser p, BodyContentHandler handler) throws Exception { |
| boolean wlr = false; |
| try { |
| p.parse(null, handler, null, null); |
| } catch (SAXException e) { |
| if (! WriteLimitReachedException.isWriteLimitReached(e)) { |
| throw e; |
| } |
| |
| wlr = true; |
| } |
| assertTrue("WriteLimitReached", wlr); |
| } |
| |
| //Simple mockparser that writes a title |
| //and charsToWrite number of 'a' |
| private static class MockParser implements Parser { |
| private final String XHTML = "http://www.w3.org/1999/xhtml"; |
| private final Attributes EMPTY_ATTRIBUTES = new AttributesImpl(); |
| private final char[] TITLE = "This is the title".toCharArray(); |
| |
| private final int charsToWrite; |
| |
| public MockParser(int charsToWrite) { |
| this.charsToWrite = charsToWrite; |
| } |
| |
| @Override |
| public Set<MediaType> getSupportedTypes(ParseContext context) { |
| return null; |
| } |
| |
| @Override |
| public void parse(InputStream stream, ContentHandler handler, Metadata metadata, |
| ParseContext context) throws IOException, SAXException, TikaException { |
| handler.startDocument(); |
| handler.startPrefixMapping("", XHTML); |
| handler.startElement(XHTML, "html", "html", EMPTY_ATTRIBUTES); |
| handler.startElement(XHTML, "head", "head", EMPTY_ATTRIBUTES); |
| handler.startElement(XHTML, "title", "head", EMPTY_ATTRIBUTES); |
| handler.characters(TITLE, 0, TITLE.length); |
| handler.endElement(XHTML, "title", "head"); |
| |
| handler.endElement(XHTML, "head", "head"); |
| handler.startElement(XHTML, "body", "body", EMPTY_ATTRIBUTES); |
| char[] body = new char[charsToWrite]; |
| for (int i = 0; i < charsToWrite; i++) { |
| body[i] = 'a'; |
| } |
| handler.characters(body, 0, body.length); |
| handler.endElement(XHTML, "body", "body"); |
| handler.endElement(XHTML, "html", "html"); |
| handler.endDocument(); |
| } |
| } |
| } |