tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java - tika - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.tika.parser.pkg;

 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;

 import java.io.InputStream;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;

 import org.junit.Test;
 import org.xml.sax.ContentHandler;

 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.BodyContentHandler;

 /**
  * Test case for parsing zip files.
  */
 public class ZipParserTest extends AbstractPkgTest {

     @Test
     public void testZipParsing() throws Exception {
         ContentHandler handler = new BodyContentHandler();
         Metadata metadata = new Metadata();

         try (InputStream stream = getResourceAsStream("/test-documents/test-documents.zip")) {
             AUTO_DETECT_PARSER.parse(stream, handler, metadata, recursingContext);
         }

         assertEquals("application/zip", metadata.get(Metadata.CONTENT_TYPE));
         String content = handler.toString();
         assertContains("testEXCEL.xls", content);
         assertContains("Sample Excel Worksheet", content);
         assertContains("testHTML.html", content);
         assertContains("Test Indexation Html", content);
         assertContains("testOpenOffice2.odt", content);
         assertContains("This is a sample Open Office document", content);
         assertContains("testPDF.pdf", content);
         assertContains("Apache Tika", content);
         assertContains("testPPT.ppt", content);
         assertContains("Sample Powerpoint Slide", content);
         assertContains("testRTF.rtf", content);
         assertContains("indexation Word", content);
         assertContains("testTXT.txt", content);
         assertContains("Test d'indexation de Txt", content);
         assertContains("testWORD.doc", content);
         assertContains("This is a sample Microsoft Word Document", content);
         assertContains("testXML.xml", content);
         assertContains("Rida Benjelloun", content);
     }

     // TIKA-1036
     @Test
     public void testPlaceholders() throws Exception {
         String xml = getXML("testEmbedded.zip").xml;
         assertContains("<div class=\"embedded\" id=\"test1.txt\" />", xml);
         assertContains("<div class=\"embedded\" id=\"test2.txt\" />", xml);

         // Also make sure EMBEDDED_RELATIONSHIP_ID was
         // passed when parsing the embedded docs:
         ParseContext context = new ParseContext();
         GatherRelIDsDocumentExtractor relIDs = new GatherRelIDsDocumentExtractor();
         context.set(EmbeddedDocumentExtractor.class, relIDs);
         try (InputStream input = getResourceAsStream("/test-documents/testEmbedded.zip")) {
             AUTO_DETECT_PARSER.parse(input, new BodyContentHandler(), new Metadata(), context);
         }

         assertTrue(relIDs.allRelIDs.contains("test1.txt"));
         assertTrue(relIDs.allRelIDs.contains("test2.txt"));
     }

     @Test
     public void testZipEncrypted() throws Exception {
         List<Metadata> metadataList = getRecursiveMetadata("testZipEncrypted.zip");
         assertEquals(2, metadataList.size());
         String[] values = metadataList.get(0)
                 .getValues(TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM);
         assertNotNull(values);
         assertEquals(1, values.length);
         assertContains("EncryptedDocumentException: stream (encrypted.txt) is encrypted",
                 values[0]);


         assertContains("hello world", metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT));
     }

     @Test
     public void testDataDescriptorWithEmptyEntry() throws Exception {

         //test that an empty first entry does not cause problems
         List<Metadata> results = getRecursiveMetadata("testZip_with_DataDescriptor2.zip");
         assertEquals(5, results.size());

         //mime is 0 bytes
         assertContains("InputStream must have > 0 bytes",
                 results.get(1).get("X-TIKA:EXCEPTION:embedded_exception"));
         //source.xml is binary, not xml
         assertContains("TikaException: XML parse error",
                 results.get(2).get("X-TIKA:EXCEPTION:embedded_exception"));
         //manifest.xml has malformed xml
         assertContains("TikaException: XML parse error",
                 results.get(4).get("X-TIKA:EXCEPTION:embedded_exception"));
     }

     private static class GatherRelIDsDocumentExtractor implements EmbeddedDocumentExtractor {
         public Set<String> allRelIDs = new HashSet<>();

         public boolean shouldParseEmbedded(Metadata metadata) {
             String relID = metadata.get(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID);
             if (relID != null) {
                 allRelIDs.add(relID);
             }
             return false;
         }

         public void parseEmbedded(InputStream inputStream, ContentHandler contentHandler,
                                   Metadata metadata, boolean outputHtml) {
             throw new UnsupportedOperationException("should never be called");
         }
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.tika.parser.pkg;

	import static org.junit.Assert.assertEquals;
	import static org.junit.Assert.assertNotNull;
	import static org.junit.Assert.assertTrue;

	import java.io.InputStream;
	import java.util.HashSet;
	import java.util.List;
	import java.util.Set;

	import org.junit.Test;
	import org.xml.sax.ContentHandler;

	import org.apache.tika.extractor.EmbeddedDocumentExtractor;
	import org.apache.tika.metadata.Metadata;
	import org.apache.tika.metadata.TikaCoreProperties;
	import org.apache.tika.parser.ParseContext;
	import org.apache.tika.sax.BodyContentHandler;

	/**
	* Test case for parsing zip files.
	*/
	public class ZipParserTest extends AbstractPkgTest {

	@Test
	public void testZipParsing() throws Exception {
	ContentHandler handler = new BodyContentHandler();
	Metadata metadata = new Metadata();

	try (InputStream stream = getResourceAsStream("/test-documents/test-documents.zip")) {
	AUTO_DETECT_PARSER.parse(stream, handler, metadata, recursingContext);
	}

	assertEquals("application/zip", metadata.get(Metadata.CONTENT_TYPE));
	String content = handler.toString();
	assertContains("testEXCEL.xls", content);
	assertContains("Sample Excel Worksheet", content);
	assertContains("testHTML.html", content);
	assertContains("Test Indexation Html", content);
	assertContains("testOpenOffice2.odt", content);
	assertContains("This is a sample Open Office document", content);
	assertContains("testPDF.pdf", content);
	assertContains("Apache Tika", content);
	assertContains("testPPT.ppt", content);
	assertContains("Sample Powerpoint Slide", content);
	assertContains("testRTF.rtf", content);
	assertContains("indexation Word", content);
	assertContains("testTXT.txt", content);
	assertContains("Test d'indexation de Txt", content);
	assertContains("testWORD.doc", content);
	assertContains("This is a sample Microsoft Word Document", content);
	assertContains("testXML.xml", content);
	assertContains("Rida Benjelloun", content);
	}

	// TIKA-1036
	@Test
	public void testPlaceholders() throws Exception {
	String xml = getXML("testEmbedded.zip").xml;
	assertContains("<div class=\"embedded\" id=\"test1.txt\" />", xml);
	assertContains("<div class=\"embedded\" id=\"test2.txt\" />", xml);

	// Also make sure EMBEDDED_RELATIONSHIP_ID was
	// passed when parsing the embedded docs:
	ParseContext context = new ParseContext();
	GatherRelIDsDocumentExtractor relIDs = new GatherRelIDsDocumentExtractor();
	context.set(EmbeddedDocumentExtractor.class, relIDs);
	try (InputStream input = getResourceAsStream("/test-documents/testEmbedded.zip")) {
	AUTO_DETECT_PARSER.parse(input, new BodyContentHandler(), new Metadata(), context);
	}

	assertTrue(relIDs.allRelIDs.contains("test1.txt"));
	assertTrue(relIDs.allRelIDs.contains("test2.txt"));
	}

	@Test
	public void testZipEncrypted() throws Exception {
	List<Metadata> metadataList = getRecursiveMetadata("testZipEncrypted.zip");
	assertEquals(2, metadataList.size());
	String[] values = metadataList.get(0)
	.getValues(TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM);
	assertNotNull(values);
	assertEquals(1, values.length);
	assertContains("EncryptedDocumentException: stream (encrypted.txt) is encrypted",
	values[0]);


	assertContains("hello world", metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT));
	}

	@Test
	public void testDataDescriptorWithEmptyEntry() throws Exception {

	//test that an empty first entry does not cause problems
	List<Metadata> results = getRecursiveMetadata("testZip_with_DataDescriptor2.zip");
	assertEquals(5, results.size());

	//mime is 0 bytes
	assertContains("InputStream must have > 0 bytes",
	results.get(1).get("X-TIKA:EXCEPTION:embedded_exception"));
	//source.xml is binary, not xml
	assertContains("TikaException: XML parse error",
	results.get(2).get("X-TIKA:EXCEPTION:embedded_exception"));
	//manifest.xml has malformed xml
	assertContains("TikaException: XML parse error",
	results.get(4).get("X-TIKA:EXCEPTION:embedded_exception"));
	}

	private static class GatherRelIDsDocumentExtractor implements EmbeddedDocumentExtractor {
	public Set<String> allRelIDs = new HashSet<>();

	public boolean shouldParseEmbedded(Metadata metadata) {
	String relID = metadata.get(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID);
	if (relID != null) {
	allRelIDs.add(relID);
	}
	return false;
	}

	public void parseEmbedded(InputStream inputStream, ContentHandler contentHandler,
	Metadata metadata, boolean outputHtml) {
	throw new UnsupportedOperationException("should never be called");
	}
	}
	}