tika-parsers/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java - tika - Git at Google

 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  * <p>
  * http://www.apache.org/licenses/LICENSE-2.0
  * <p>
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.tika.parser.epub;

 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;

 import java.io.InputStream;
 import java.util.List;

 import org.apache.tika.TikaTest;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.RecursiveParserWrapperHandler;
 import org.junit.Test;

 /**
  * Tests for EPUB parsing, driven through the {@link TikaTest} helper methods.
  * <p>
  * Ordering checks throw {@link AssertionError} directly (see
  * {@link #assertFound} and {@link #assertBefore}) rather than using the Java
  * {@code assert} keyword, so they are enforced whether or not the JVM was
  * started with assertions enabled.
  */
 public class EpubParserTest extends TikaTest {

     /** Substring searched for to locate the table-of-contents heading in the output. */
     private static final String TOC_MARKER = "h3 class=\"toc_heading\">Table of Contents<";

     /** Substring searched for to locate the first chapter heading in the output. */
     private static final String CHAPTER_1_MARKER = "<h1>Chapter 1";

     /** Substring searched for to locate the second chapter heading in the output. */
     private static final String CHAPTER_2_MARKER = "<h1>Chapter 2";

     /**
      * Checks the metadata and XHTML output produced for {@code testEPUB.epub}.
      */
     @Test
     public void testXMLParser() throws Exception {

         XMLResult xmlResult = getXML("testEPUB.epub");

         assertEquals("application/epub+zip",
                 xmlResult.metadata.get(Metadata.CONTENT_TYPE));
         assertEquals("en",
                 xmlResult.metadata.get(TikaCoreProperties.LANGUAGE));
         assertEquals("This is an ePub test publication for Tika.",
                 xmlResult.metadata.get(TikaCoreProperties.DESCRIPTION));
         assertEquals("Apache",
                 xmlResult.metadata.get(TikaCoreProperties.PUBLISHER));

         String content = xmlResult.xml;
         assertContains("Plus a simple div", content);
         assertContains("First item", content);
         assertContains("The previous headings were <strong>subchapters</strong>", content);
         assertContains("Table data", content);
         assertContains("This is the text for chapter Two", content);

         //make sure style/script elements aren't extracted
         assertNotContained("nothing to see here", content);
         assertNotContained("nor here", content);
         assertNotContained("font-style", content);

         //make sure that there is only one of each
         assertContainsCount("<html", content, 1);
         assertContainsCount("<head", content, 1);
         assertContainsCount("<body", content, 1);
     }

     /**
      * Checks where the table of contents appears relative to the two chapters,
      * first without an explicit parser and then with a parser built from the
      * {@code tika-config.xml} test resource.
      */
     @Test
     public void testEpubOrder() throws Exception {
         List<Metadata> metadataList = getRecursiveMetadata("testEPUB.epub");

         //test attachments
         assertEquals(2, metadataList.size());
         assertEquals("image/jpeg", metadataList.get(1).get(Metadata.CONTENT_TYPE));

         // No explicit parser: the table of contents is expected before both chapters.
         String xml = metadataList.get(0).get(RecursiveParserWrapperHandler.TIKA_CONTENT);
         int tocIndex = xml.indexOf(TOC_MARKER);
         int ch1 = xml.indexOf(CHAPTER_1_MARKER);
         int ch2 = xml.indexOf(CHAPTER_2_MARKER);
         assertFound("table of contents", tocIndex);
         assertFound("chapter 1", ch1);
         assertFound("chapter 2", ch2);
         assertBefore("table of contents", tocIndex, "chapter 1", ch1);
         assertBefore("table of contents", tocIndex, "chapter 2", ch2);
         assertBefore("chapter 1", ch1, "chapter 2", ch2);

         // Close the config stream once TikaConfig has been constructed from it.
         Parser p;
         try (InputStream is =
                      getClass().getResourceAsStream("/org/apache/tika/parser/epub/tika-config.xml")) {
             assertNotNull(is);
             p = new AutoDetectParser(new TikaConfig(is));
         }

         // Configured parser: the table of contents is expected after both chapters.
         xml = getXML("testEPUB.epub", p).xml;
         tocIndex = xml.indexOf(TOC_MARKER);
         ch1 = xml.indexOf(CHAPTER_1_MARKER);
         ch2 = xml.indexOf(CHAPTER_2_MARKER);
         assertFound("table of contents", tocIndex);
         assertFound("chapter 1", ch1);
         assertFound("chapter 2", ch2);
         assertBefore("chapter 1", ch1, "table of contents", tocIndex);
         assertBefore("chapter 2", ch2, "table of contents", tocIndex);
         assertBefore("chapter 1", ch1, "chapter 2", ch2);
     }


     /**
      * Parses the stream returned by {@code truncate("testEPUB.epub", 10000)}
      * with a bare {@link EpubParser} and checks the chapter order.
      */
     @Test
     public void testTruncated() throws Exception {
         Parser p = new EpubParser();
         List<Metadata> metadataList;
         try (InputStream is = truncate("testEPUB.epub", 10000)) {
             metadataList = getRecursiveMetadata(is, p, true);
         }
         String xml = metadataList.get(0).get(RecursiveParserWrapperHandler.TIKA_CONTENT);
         int ch1 = xml.indexOf(CHAPTER_1_MARKER);
         int ch2 = xml.indexOf(CHAPTER_2_MARKER);
         // Only the relative order is asserted; neither index is required to be
         // non-negative, so a missing chapter 1 (-1) with chapter 2 present passes.
         assertBefore("chapter 1", ch1, "chapter 2", ch2);
     }

     /**
      * Checks that {@code testEPUB_xml_ext.epub} yields a single metadata object
      * whose content contains the expected text (TIKA-2310).
      */
     @Test
     public void testContentsWXMLExtensions() throws Exception {
         //TIKA-2310
         List<Metadata> metadataList = getRecursiveMetadata("testEPUB_xml_ext.epub");
         assertEquals(1, metadataList.size());
         assertContains("It was a bright cold day in April",
                 metadataList.get(0).get(RecursiveParserWrapperHandler.TIKA_CONTENT));
     }

     /**
      * Fails the test if {@code index} is negative, i.e. the corresponding
      * {@code indexOf} call found no match.
      *
      * @param label human-readable name of the marker, used in the failure message
      * @param index result of the {@code indexOf} search
      * @throws AssertionError if {@code index} is less than zero
      */
     private static void assertFound(String label, int index) {
         if (index < 0) {
             throw new AssertionError(label + " marker not found in extracted content");
         }
     }

     /**
      * Fails the test unless {@code first} is strictly less than {@code second}.
      *
      * @param firstLabel  name of the marker expected earlier, used in the failure message
      * @param first       index of the marker expected earlier
      * @param secondLabel name of the marker expected later, used in the failure message
      * @param second      index of the marker expected later
      * @throws AssertionError if {@code first >= second}
      */
     private static void assertBefore(String firstLabel, int first,
                                      String secondLabel, int second) {
         if (first >= second) {
             throw new AssertionError("expected " + firstLabel + " (index " + first
                     + ") before " + secondLabel + " (index " + second + ")");
         }
     }
 }
	/**
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	* <p>
	* http://www.apache.org/licenses/LICENSE-2.0
	* <p>
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.tika.parser.epub;

	import static org.junit.Assert.assertEquals;
	import static org.junit.Assert.assertNotNull;

	import java.io.InputStream;
	import java.util.List;

	import org.apache.tika.TikaTest;
	import org.apache.tika.config.TikaConfig;
	import org.apache.tika.metadata.Metadata;
	import org.apache.tika.metadata.TikaCoreProperties;
	import org.apache.tika.parser.AutoDetectParser;
	import org.apache.tika.parser.Parser;
	import org.apache.tika.sax.RecursiveParserWrapperHandler;
	import org.junit.Test;

	/**
	 * Tests for EPUB parsing, driven through the {@link TikaTest} helper methods.
	 * <p>
	 * Ordering checks throw {@link AssertionError} directly (see
	 * {@link #assertFound} and {@link #assertBefore}) rather than using the Java
	 * {@code assert} keyword, so they are enforced whether or not the JVM was
	 * started with assertions enabled.
	 */
	public class EpubParserTest extends TikaTest {

		/** Substring searched for to locate the table-of-contents heading in the output. */
		private static final String TOC_MARKER = "h3 class=\"toc_heading\">Table of Contents<";

		/** Substring searched for to locate the first chapter heading in the output. */
		private static final String CHAPTER_1_MARKER = "<h1>Chapter 1";

		/** Substring searched for to locate the second chapter heading in the output. */
		private static final String CHAPTER_2_MARKER = "<h1>Chapter 2";

		/**
		 * Checks the metadata and XHTML output produced for {@code testEPUB.epub}.
		 */
		@Test
		public void testXMLParser() throws Exception {

			XMLResult xmlResult = getXML("testEPUB.epub");

			assertEquals("application/epub+zip",
					xmlResult.metadata.get(Metadata.CONTENT_TYPE));
			assertEquals("en",
					xmlResult.metadata.get(TikaCoreProperties.LANGUAGE));
			assertEquals("This is an ePub test publication for Tika.",
					xmlResult.metadata.get(TikaCoreProperties.DESCRIPTION));
			assertEquals("Apache",
					xmlResult.metadata.get(TikaCoreProperties.PUBLISHER));

			String content = xmlResult.xml;
			assertContains("Plus a simple div", content);
			assertContains("First item", content);
			assertContains("The previous headings were <strong>subchapters</strong>", content);
			assertContains("Table data", content);
			assertContains("This is the text for chapter Two", content);

			//make sure style/script elements aren't extracted
			assertNotContained("nothing to see here", content);
			assertNotContained("nor here", content);
			assertNotContained("font-style", content);

			//make sure that there is only one of each
			assertContainsCount("<html", content, 1);
			assertContainsCount("<head", content, 1);
			assertContainsCount("<body", content, 1);
		}

		/**
		 * Checks where the table of contents appears relative to the two chapters,
		 * first without an explicit parser and then with a parser built from the
		 * {@code tika-config.xml} test resource.
		 */
		@Test
		public void testEpubOrder() throws Exception {
			List<Metadata> metadataList = getRecursiveMetadata("testEPUB.epub");

			//test attachments
			assertEquals(2, metadataList.size());
			assertEquals("image/jpeg", metadataList.get(1).get(Metadata.CONTENT_TYPE));

			// No explicit parser: the table of contents is expected before both chapters.
			String xml = metadataList.get(0).get(RecursiveParserWrapperHandler.TIKA_CONTENT);
			int tocIndex = xml.indexOf(TOC_MARKER);
			int ch1 = xml.indexOf(CHAPTER_1_MARKER);
			int ch2 = xml.indexOf(CHAPTER_2_MARKER);
			assertFound("table of contents", tocIndex);
			assertFound("chapter 1", ch1);
			assertFound("chapter 2", ch2);
			assertBefore("table of contents", tocIndex, "chapter 1", ch1);
			assertBefore("table of contents", tocIndex, "chapter 2", ch2);
			assertBefore("chapter 1", ch1, "chapter 2", ch2);

			// Close the config stream once TikaConfig has been constructed from it.
			Parser p;
			try (InputStream is =
						getClass().getResourceAsStream("/org/apache/tika/parser/epub/tika-config.xml")) {
				assertNotNull(is);
				p = new AutoDetectParser(new TikaConfig(is));
			}

			// Configured parser: the table of contents is expected after both chapters.
			xml = getXML("testEPUB.epub", p).xml;
			tocIndex = xml.indexOf(TOC_MARKER);
			ch1 = xml.indexOf(CHAPTER_1_MARKER);
			ch2 = xml.indexOf(CHAPTER_2_MARKER);
			assertFound("table of contents", tocIndex);
			assertFound("chapter 1", ch1);
			assertFound("chapter 2", ch2);
			assertBefore("chapter 1", ch1, "table of contents", tocIndex);
			assertBefore("chapter 2", ch2, "table of contents", tocIndex);
			assertBefore("chapter 1", ch1, "chapter 2", ch2);
		}


		/**
		 * Parses the stream returned by {@code truncate("testEPUB.epub", 10000)}
		 * with a bare {@link EpubParser} and checks the chapter order.
		 */
		@Test
		public void testTruncated() throws Exception {
			Parser p = new EpubParser();
			List<Metadata> metadataList;
			try (InputStream is = truncate("testEPUB.epub", 10000)) {
				metadataList = getRecursiveMetadata(is, p, true);
			}
			String xml = metadataList.get(0).get(RecursiveParserWrapperHandler.TIKA_CONTENT);
			int ch1 = xml.indexOf(CHAPTER_1_MARKER);
			int ch2 = xml.indexOf(CHAPTER_2_MARKER);
			// Only the relative order is asserted; neither index is required to be
			// non-negative, so a missing chapter 1 (-1) with chapter 2 present passes.
			assertBefore("chapter 1", ch1, "chapter 2", ch2);
		}

		/**
		 * Checks that {@code testEPUB_xml_ext.epub} yields a single metadata object
		 * whose content contains the expected text (TIKA-2310).
		 */
		@Test
		public void testContentsWXMLExtensions() throws Exception {
			//TIKA-2310
			List<Metadata> metadataList = getRecursiveMetadata("testEPUB_xml_ext.epub");
			assertEquals(1, metadataList.size());
			assertContains("It was a bright cold day in April",
					metadataList.get(0).get(RecursiveParserWrapperHandler.TIKA_CONTENT));
		}

		/**
		 * Fails the test if {@code index} is negative, i.e. the corresponding
		 * {@code indexOf} call found no match.
		 *
		 * @param label human-readable name of the marker, used in the failure message
		 * @param index result of the {@code indexOf} search
		 * @throws AssertionError if {@code index} is less than zero
		 */
		private static void assertFound(String label, int index) {
			if (index < 0) {
				throw new AssertionError(label + " marker not found in extracted content");
			}
		}

		/**
		 * Fails the test unless {@code first} is strictly less than {@code second}.
		 *
		 * @param firstLabel  name of the marker expected earlier, used in the failure message
		 * @param first       index of the marker expected earlier
		 * @param secondLabel name of the marker expected later, used in the failure message
		 * @param second      index of the marker expected later
		 * @throws AssertionError if {@code first >= second}
		 */
		private static void assertBefore(String firstLabel, int first,
				String secondLabel, int second) {
			if (first >= second) {
				throw new AssertionError("expected " + firstLabel + " (index " + first
						+ ") before " + secondLabel + " (index " + second + ")");
			}
		}
	}