trunk/src/scratchpad/testcases/org/apache/poi/hslf/extractor/TestExtractor.java - poi - Git at Google

 /* ====================================================================
    Licensed to the Apache Software Foundation (ASF) under one or more
    contributor license agreements.  See the NOTICE file distributed with
    this work for additional information regarding copyright ownership.
    The ASF licenses this file to You under the Apache License, Version 2.0
    (the "License"); you may not use this file except in compliance with
    the License.  You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
 ==================================================================== */

 package org.apache.poi.hslf.extractor;

 import static org.apache.poi.POITestCase.assertContains;
 import static org.apache.poi.POITestCase.assertContainsIgnoreCase;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;

 import java.io.ByteArrayOutputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.List;

 import org.apache.poi.POIDataSamples;
 import org.apache.poi.hslf.model.OLEShape;
 import org.apache.poi.hslf.usermodel.HSLFSlideShow;
 import org.apache.poi.hslf.usermodel.HSLFSlideShowImpl;
 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
 import org.apache.poi.hwpf.HWPFDocument;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
 import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.util.IOUtils;
 import org.junit.Test;

 /**
  * Tests that {@link PowerPointExtractor} correctly gets the text out of our
  * sample PowerPoint files.
  * <p>
  * Extractors and slideshows are closed in {@code finally} blocks so that a
  * failing assertion does not leave them open.
  */
 public final class TestExtractor {
     /**
      * Expected slide text of the 2 page basic test data
      */
     private static final String expectText = "This is a test title\nThis is a test subtitle\nThis is on page 1\nThis is the title on page 2\nThis is page two\nIt has several blocks of text\nNone of them have formatting\n";

     /**
      * Expected slide text of the 1 page but text-box'd test data
      */
     private static final String expectText2 = "Hello, World!!!\nI am just a poor boy\nThis is Times New Roman\nPlain Text \n";

     /**
      * Source of the slideshow sample files
      */
     private static final POIDataSamples slTests = POIDataSamples.getSlideShowInstance();

     /**
      * Opens the named slideshow sample and wraps it in an extractor. The
      * input stream is closed before returning, whether or not the extractor
      * could be created.
      *
      * @param fileName name of the sample file, resolved through {@link #slTests}
      * @return a new extractor; the caller is responsible for closing it
      * @throws IOException propagated from opening, parsing or closing the stream
      */
     private PowerPointExtractor openExtractor(String fileName) throws IOException {
         InputStream is = slTests.openResourceAsStream(fileName);
         try {
             return new PowerPointExtractor(is);
         } finally {
             is.close();
         }
     }

     /**
      * Opens the named slideshow sample as a {@link HSLFSlideShow}. The input
      * stream is closed before returning, whether or not the slideshow could
      * be created.
      *
      * @param fileName name of the sample file, resolved through {@link #slTests}
      * @return a new slideshow; the caller is responsible for closing it
      * @throws IOException propagated from opening, parsing or closing the stream
      */
     private HSLFSlideShow openSlideShow(String fileName) throws IOException {
         InputStream is = slTests.openResourceAsStream(fileName);
         try {
             return new HSLFSlideShow(is);
         } finally {
             is.close();
         }
     }

     /**
      * The slide text of both basic samples must match the expected text exactly.
      */
     @Test
     public void testReadSheetText() throws IOException {
         // Basic 2 page example
         PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt");
         try {
             ensureTwoStringsTheSame(expectText, ppe.getText());
         } finally {
             ppe.close();
         }

         // 1 page example with text boxes
         PowerPointExtractor ppe2 = openExtractor("with_textbox.ppt");
         try {
             ensureTwoStringsTheSame(expectText2, ppe2.getText());
         } finally {
             ppe2.close();
         }
     }

     /**
      * The notes text of both basic samples must match the expected text exactly.
      */
     @Test
     public void testReadNoteText() throws IOException {
         // Basic 2 page example
         PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt");
         try {
             String expText = "These are the notes for page 1\nThese are the notes on page two, again lacking formatting\n";
             ensureTwoStringsTheSame(expText, ppe.getNotes());
         } finally {
             ppe.close();
         }

         // Other one doesn't have notes
         PowerPointExtractor ppe2 = openExtractor("with_textbox.ppt");
         try {
             ensureTwoStringsTheSame("", ppe2.getNotes());
         } finally {
             ppe2.close();
         }
     }

     /**
      * Checks {@code getText()} with slides only, notes only, and both enabled.
      */
     @Test
     public void testReadBoth() throws IOException {
         String[] slText = new String[]{
                 "This is a test title\nThis is a test subtitle\nThis is on page 1\n",
                 "This is the title on page 2\nThis is page two\nIt has several blocks of text\nNone of them have formatting\n"
         };
         String[] ntText = new String[]{
                 "These are the notes for page 1\n",
                 "These are the notes on page two, again lacking formatting\n"
         };

         PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt");
         try {
             ppe.setSlidesByDefault(true);
             ppe.setNotesByDefault(false);
             assertEquals(slText[0] + slText[1], ppe.getText());

             ppe.setSlidesByDefault(false);
             ppe.setNotesByDefault(true);
             assertEquals(ntText[0] + ntText[1], ppe.getText());

             ppe.setSlidesByDefault(true);
             ppe.setNotesByDefault(true);
             assertEquals(slText[0] + slText[1] + "\n" + ntText[0] + ntText[1], ppe.getText());
         } finally {
             ppe.close();
         }
     }

     /**
      * Test that when presented with a PPT file missing the odd
      * core record, we can still get the rest of the text out
      *
      * @throws IOException propagated from opening or closing the sample
      */
     @Test
     public void testMissingCoreRecords() throws IOException {
         PowerPointExtractor ppe = openExtractor("missing_core_records.ppt");
         try {
             String text = ppe.getText(true, false);
             String nText = ppe.getNotes();

             assertNotNull(text);
             assertNotNull(nText);

             // Notes record were corrupt, so don't expect any
             // (expected value first, so a failure message reads correctly)
             assertEquals(0, nText.length());

             // Slide records were fine
             assertTrue(text.startsWith("Using Disease Surveillance and Response"));
         } finally {
             ppe.close();
         }
     }

     /**
      * Asserts that two strings are equal, checking the length first and then
      * each character in turn so that a failure names the first differing index.
      *
      * @param exp the expected text
      * @param act the actual text
      */
     private void ensureTwoStringsTheSame(String exp, String act) {
         assertEquals(exp.length(), act.length());
         for (int i = 0; i < exp.length(); i++) {
             assertEquals("Char " + i, exp.charAt(i), act.charAt(i));
         }
         assertEquals(exp, act);
     }

     /**
      * An Excel file with embedded PowerPoint files
      */
     @Test
     public void testExtractFromEmbeded() throws IOException {
         InputStream is = POIDataSamples.getSpreadSheetInstance().openResourceAsStream("excel_with_embeded.xls");
         POIFSFileSystem fs;
         try {
             fs = new POIFSFileSystem(is);
         } finally {
             // Same pattern as openExtractor: the stream is not needed once the filesystem exists
             is.close();
         }
         try {
             DirectoryNode root = fs.getRoot();
             // Both extractors are created before either is closed, as in the original ordering
             PowerPointExtractor ppe1 = assertExtractFromEmbedded(root, "MBD0000A3B6", "Sample PowerPoint file\nThis is the 1st file\nNot much too it\n");
             PowerPointExtractor ppe2 = assertExtractFromEmbedded(root, "MBD0000A3B3", "Sample PowerPoint file\nThis is the 2nd file\nNot much too it either\n");
             ppe2.close();
             ppe1.close();
         } finally {
             fs.close();
         }
     }

     /**
      * Opens the embedded PowerPoint document stored under {@code entryName}
      * and asserts its slide text.
      *
      * @param root directory that contains the embedded entry
      * @param entryName name of the embedded directory entry
      * @param expected expected result of {@code getText(true, false)}
      * @return the extractor that was checked; the caller closes it
      * @throws IOException propagated from reading the embedded document
      */
     private PowerPointExtractor assertExtractFromEmbedded(DirectoryNode root, String entryName, String expected)
     throws IOException {
         DirectoryNode dir = (DirectoryNode)root.getEntry(entryName);
         assertTrue(dir.hasEntry("PowerPoint Document"));

         // Check the text of this embedded file
         HSLFSlideShowImpl ppt = new HSLFSlideShowImpl(dir);
         PowerPointExtractor ppe = new PowerPointExtractor(ppt);
         assertEquals(expected, ppe.getText(true, false));
         return ppe;
     }

     /**
      * A powerpoint file with embedded Word, Excel and PowerPoint files
      */
     @Test
     public void testExtractFromOwnEmbeded() throws IOException {
         PowerPointExtractor ppe = openExtractor("ppt_with_embeded.ppt");
         try {
             List<OLEShape> shapes = ppe.getOLEShapes();
             assertEquals("Expected 6 ole shapes", 6, shapes.size());
             int numPpt = 0, numDoc = 0, numXls = 0;
             for (OLEShape ole : shapes) {
                 String name = ole.getInstanceName();
                 InputStream data = ole.getObjectData().getData();
                 try {
                     if ("Worksheet".equals(name)) {
                         HSSFWorkbook wb = new HSSFWorkbook(data);
                         numXls++;
                         wb.close();
                     } else if ("Document".equals(name)) {
                         HWPFDocument doc = new HWPFDocument(data);
                         numDoc++;
                         doc.close();
                     } else if ("Presentation".equals(name)) {
                         numPpt++;
                         HSLFSlideShow ppt = new HSLFSlideShow(data);
                         ppt.close();
                     }
                 } finally {
                     // Close the object data even when one of the documents fails to open
                     data.close();
                 }
             }
             assertEquals("Expected 2 embedded Word Documents", 2, numDoc);
             assertEquals("Expected 2 embedded Excel Spreadsheets", 2, numXls);
             assertEquals("Expected 2 embedded PowerPoint Presentations", 2, numPpt);
         } finally {
             ppe.close();
         }
     }

     /**
      * Regression test named after bug 52991 (presumably a Bugzilla id - confirm):
      * the object data of every OLE shape in badzip.ppt must be readable to
      * the end without an exception.
      */
     @Test
     public void test52991() throws IOException {
         PowerPointExtractor ppe = openExtractor("badzip.ppt");
         try {
             for (OLEShape shape : ppe.getOLEShapes()) {
                 InputStream data = shape.getObjectData().getData();
                 try {
                     IOUtils.copy(data, new ByteArrayOutputStream());
                 } finally {
                     data.close();
                 }
             }
         } finally {
             ppe.close();
         }
     }

     /**
      * From bug #45543
      */
     @Test
     public void testWithComments() throws IOException {
         PowerPointExtractor ppe1 = openExtractor("WithComments.ppt");
         try {
             String text = ppe1.getText();
             assertFalse("Comments not in by default", text.contains("This is a test comment"));

             ppe1.setCommentsByDefault(true);

             text = ppe1.getText();
             assertContains(text, "This is a test comment");
         } finally {
             ppe1.close();
         }

         // And another file
         PowerPointExtractor ppe2 = openExtractor("45543.ppt");
         try {
             String text = ppe2.getText();
             assertFalse("Comments not in by default", text.contains("testdoc"));

             ppe2.setCommentsByDefault(true);

             text = ppe2.getText();
             assertContains(text, "testdoc");
         } finally {
             ppe2.close();
         }
     }

     /**
      * From bug #45537
      */
     @Test
     public void testHeaderFooter() throws IOException {
         String text;

         // With a header on the notes
         HSLFSlideShow ppt1 = openSlideShow("45537_Header.ppt");
         try {
             assertNotNull(ppt1.getNotesHeadersFooters());
             assertEquals("testdoc test phrase", ppt1.getNotesHeadersFooters().getHeaderText());

             PowerPointExtractor ppe1 = new PowerPointExtractor(ppt1.getSlideShowImpl());
             try {
                 text = ppe1.getText();
                 assertFalse("Header shouldn't be there by default\n" + text, text.contains("testdoc"));
                 assertFalse("Header shouldn't be there by default\n" + text, text.contains("test phrase"));

                 ppe1.setNotesByDefault(true);
                 text = ppe1.getText();
                 assertContains(text, "testdoc");
                 assertContains(text, "test phrase");
             } finally {
                 ppe1.close();
             }
         } finally {
             ppt1.close();
         }

         // And with a footer, also on notes
         HSLFSlideShow ppt2 = openSlideShow("45537_Footer.ppt");
         try {
             assertNotNull(ppt2.getNotesHeadersFooters());
             assertEquals("testdoc test phrase", ppt2.getNotesHeadersFooters().getFooterText());
         } finally {
             ppt2.close();
         }

         PowerPointExtractor ppe2 = openExtractor("45537_Footer.ppt");
         try {
             text = ppe2.getText();
             assertFalse("Footer shouldn't be there by default\n" + text, text.contains("testdoc"));
             assertFalse("Footer shouldn't be there by default\n" + text, text.contains("test phrase"));

             ppe2.setNotesByDefault(true);
             text = ppe2.getText();
             assertContains(text, "testdoc");
             assertContains(text, "test phrase");
         } finally {
             ppe2.close();
         }
     }

     /**
      * With master text enabled, text placed on the master slide is extracted.
      */
     @Test
     public void testSlideMasterText() throws IOException {
         // NOTE(review): an unused local holding "This is the Master Title" was
         // removed here; it was never asserted - confirm whether it should be.
         String masterRandomText = "This text comes from the Master Slide";
         String masterFooterText = "Footer from the master slide";
         PowerPointExtractor ppe = openExtractor("WithMaster.ppt");
         try {
             ppe.setMasterByDefault(true);

             String text = ppe.getText();
             assertContains(text, masterRandomText);
             assertContains(text, masterFooterText);
         } finally {
             ppe.close();
         }
     }

     /**
      * Master slide text is only extracted once enabled, and master
      * placeholder text is never extracted.
      */
     @Test
     public void testMasterText() throws IOException {
         PowerPointExtractor ppe1 = openExtractor("master_text.ppt");
         try {
             // Initially not there
             String text = ppe1.getText();
             assertFalse(text.contains("Text that I added to the master slide"));

             // Enable, shows up
             ppe1.setMasterByDefault(true);
             text = ppe1.getText();
             assertContains(text, "Text that I added to the master slide");

             // Make sure placeholder text does not come out
             assertFalse(text.contains("Click to edit Master"));
         } finally {
             ppe1.close();
         }

         // Now with another file only containing master text
         // Will always show up
         PowerPointExtractor ppe2 = openExtractor("WithMaster.ppt");
         try {
             String masterText = "Footer from the master slide";

             String text = ppe2.getText();
             assertContainsIgnoreCase(text, "master");
             assertContains(text, masterText);
         } finally {
             ppe2.close();
         }
     }

     /**
      * Bug #54880 Chinese text not extracted properly
      */
     @Test
     public void testChineseText() throws IOException {
         PowerPointExtractor ppe = openExtractor("54880_chinese.ppt");
         try {
             String text = ppe.getText();

             // Check for the english text line
             assertContains(text, "Single byte");

             // Check for the english text in the mixed line
             assertContains(text, "Mix");

             // Check for the chinese text in the mixed line
             assertContains(text, "\u8868");

             // Check for the chinese only text line
             assertContains(text, "\uff8a\uff9d\uff76\uff78");
         } finally {
             ppe.close();
         }
     }

     /**
      * Tests that we can work with both {@link OPOIFSFileSystem}
      * and {@link NPOIFSFileSystem}
      */
     @SuppressWarnings("resource")
     @Test
     public void testDifferentPOIFS() throws IOException {
         // Open the two filesystems
         File pptFile = slTests.getFile("basic_test_ppt_file.ppt");
         InputStream is1 = new FileInputStream(pptFile);
         OPOIFSFileSystem opoifs = new OPOIFSFileSystem(is1);
         is1.close();
         NPOIFSFileSystem npoifs = new NPOIFSFileSystem(pptFile);

         DirectoryNode[] files = { opoifs.getRoot(), npoifs.getRoot() };

         // Open directly
         // NOTE(review): these extractors are never closed; presumably closing
         // them would also close the filesystems reused by the loop below -
         // confirm before changing.
         for (DirectoryNode dir : files) {
             PowerPointExtractor extractor = new PowerPointExtractor(dir);
             assertEquals(expectText, extractor.getText());
         }

         // Open via a HSLFSlideShow
         for (DirectoryNode dir : files) {
             HSLFSlideShowImpl slideshow = new HSLFSlideShowImpl(dir);
             PowerPointExtractor extractor = new PowerPointExtractor(slideshow);
             assertEquals(expectText, extractor.getText());
             extractor.close();
             slideshow.close();
         }

         npoifs.close();
     }

     /**
      * Table cells come out tab separated, with one line per table row.
      */
     @Test
     public void testTable() throws Exception {
         PowerPointExtractor ppe1 = openExtractor("54111.ppt");
         try {
             String text1 = ppe1.getText();
             String target1 = "TH Cell 1\tTH Cell 2\tTH Cell 3\tTH Cell 4\n"+
                              "Row 1, Cell 1\tRow 1, Cell 2\tRow 1, Cell 3\tRow 1, Cell 4\n"+
                              "Row 2, Cell 1\tRow 2, Cell 2\tRow 2, Cell 3\tRow 2, Cell 4\n"+
                              "Row 3, Cell 1\tRow 3, Cell 2\tRow 3, Cell 3\tRow 3, Cell 4\n"+
                              "Row 4, Cell 1\tRow 4, Cell 2\tRow 4, Cell 3\tRow 4, Cell 4\n"+
                              "Row 5, Cell 1\tRow 5, Cell 2\tRow 5, Cell 3\tRow 5, Cell 4\n";
             assertContains(text1, target1);
         } finally {
             ppe1.close();
         }

         PowerPointExtractor ppe2 = openExtractor("54722.ppt");
         try {
             String text2 = ppe2.getText();

             String target2 = "this\tText\tis\twithin\ta\n" +
                     "table\t1\t2\t3\t4";
             assertContains(text2, target2);
         } finally {
             ppe2.close();
         }
     }

     /**
      * Bug 60003: with master text enabled, the extracted text must contain "Prague"
      */
     @Test
     public void testExtractMasterSlideFooterText() throws Exception {
         PowerPointExtractor ppe = openExtractor("60003.ppt");
         try {
             ppe.setMasterByDefault(true);

             String text = ppe.getText();
             assertContains(text, "Prague");
         } finally {
             ppe.close();
         }
     }
 }
	/* ====================================================================
	Licensed to the Apache Software Foundation (ASF) under one or more
	contributor license agreements. See the NOTICE file distributed with
	this work for additional information regarding copyright ownership.
	The ASF licenses this file to You under the Apache License, Version 2.0
	(the "License"); you may not use this file except in compliance with
	the License. You may obtain a copy of the License at

	http://www.apache.org/licenses/LICENSE-2.0

	Unless required by applicable law or agreed to in writing, software
	distributed under the License is distributed on an "AS IS" BASIS,
	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	See the License for the specific language governing permissions and
	limitations under the License.
	==================================================================== */

	package org.apache.poi.hslf.extractor;

	import static org.apache.poi.POITestCase.assertContains;
	import static org.apache.poi.POITestCase.assertContainsIgnoreCase;
	import static org.junit.Assert.assertEquals;
	import static org.junit.Assert.assertFalse;
	import static org.junit.Assert.assertNotNull;
	import static org.junit.Assert.assertTrue;

	import java.io.ByteArrayOutputStream;
	import java.io.File;
	import java.io.FileInputStream;
	import java.io.IOException;
	import java.io.InputStream;
	import java.util.List;

	import org.apache.poi.POIDataSamples;
	import org.apache.poi.hslf.model.OLEShape;
	import org.apache.poi.hslf.usermodel.HSLFSlideShow;
	import org.apache.poi.hslf.usermodel.HSLFSlideShowImpl;
	import org.apache.poi.hssf.usermodel.HSSFWorkbook;
	import org.apache.poi.hwpf.HWPFDocument;
	import org.apache.poi.poifs.filesystem.DirectoryNode;
	import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
	import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
	import org.apache.poi.poifs.filesystem.POIFSFileSystem;
	import org.apache.poi.util.IOUtils;
	import org.junit.Test;

	/**
	 * Tests that {@link PowerPointExtractor} correctly gets the text out of our
	 * sample PowerPoint files.
	 * <p>
	 * Extractors and slideshows are closed in {@code finally} blocks so that a
	 * failing assertion does not leave them open.
	 */
	public final class TestExtractor {
	    /**
	     * Expected slide text of the 2 page basic test data
	     */
	    private static final String expectText = "This is a test title\nThis is a test subtitle\nThis is on page 1\nThis is the title on page 2\nThis is page two\nIt has several blocks of text\nNone of them have formatting\n";

	    /**
	     * Expected slide text of the 1 page but text-box'd test data
	     */
	    private static final String expectText2 = "Hello, World!!!\nI am just a poor boy\nThis is Times New Roman\nPlain Text \n";

	    /**
	     * Source of the slideshow sample files
	     */
	    private static final POIDataSamples slTests = POIDataSamples.getSlideShowInstance();

	    /**
	     * Opens the named slideshow sample and wraps it in an extractor. The
	     * input stream is closed before returning, whether or not the extractor
	     * could be created.
	     *
	     * @param fileName name of the sample file, resolved through {@link #slTests}
	     * @return a new extractor; the caller is responsible for closing it
	     * @throws IOException propagated from opening, parsing or closing the stream
	     */
	    private PowerPointExtractor openExtractor(String fileName) throws IOException {
	        InputStream is = slTests.openResourceAsStream(fileName);
	        try {
	            return new PowerPointExtractor(is);
	        } finally {
	            is.close();
	        }
	    }

	    /**
	     * Opens the named slideshow sample as a {@link HSLFSlideShow}. The input
	     * stream is closed before returning, whether or not the slideshow could
	     * be created.
	     *
	     * @param fileName name of the sample file, resolved through {@link #slTests}
	     * @return a new slideshow; the caller is responsible for closing it
	     * @throws IOException propagated from opening, parsing or closing the stream
	     */
	    private HSLFSlideShow openSlideShow(String fileName) throws IOException {
	        InputStream is = slTests.openResourceAsStream(fileName);
	        try {
	            return new HSLFSlideShow(is);
	        } finally {
	            is.close();
	        }
	    }

	    /**
	     * The slide text of both basic samples must match the expected text exactly.
	     */
	    @Test
	    public void testReadSheetText() throws IOException {
	        // Basic 2 page example
	        PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt");
	        try {
	            ensureTwoStringsTheSame(expectText, ppe.getText());
	        } finally {
	            ppe.close();
	        }

	        // 1 page example with text boxes
	        PowerPointExtractor ppe2 = openExtractor("with_textbox.ppt");
	        try {
	            ensureTwoStringsTheSame(expectText2, ppe2.getText());
	        } finally {
	            ppe2.close();
	        }
	    }

	    /**
	     * The notes text of both basic samples must match the expected text exactly.
	     */
	    @Test
	    public void testReadNoteText() throws IOException {
	        // Basic 2 page example
	        PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt");
	        try {
	            String expText = "These are the notes for page 1\nThese are the notes on page two, again lacking formatting\n";
	            ensureTwoStringsTheSame(expText, ppe.getNotes());
	        } finally {
	            ppe.close();
	        }

	        // Other one doesn't have notes
	        PowerPointExtractor ppe2 = openExtractor("with_textbox.ppt");
	        try {
	            ensureTwoStringsTheSame("", ppe2.getNotes());
	        } finally {
	            ppe2.close();
	        }
	    }

	    /**
	     * Checks {@code getText()} with slides only, notes only, and both enabled.
	     */
	    @Test
	    public void testReadBoth() throws IOException {
	        String[] slText = new String[]{
	                "This is a test title\nThis is a test subtitle\nThis is on page 1\n",
	                "This is the title on page 2\nThis is page two\nIt has several blocks of text\nNone of them have formatting\n"
	        };
	        String[] ntText = new String[]{
	                "These are the notes for page 1\n",
	                "These are the notes on page two, again lacking formatting\n"
	        };

	        PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt");
	        try {
	            ppe.setSlidesByDefault(true);
	            ppe.setNotesByDefault(false);
	            assertEquals(slText[0] + slText[1], ppe.getText());

	            ppe.setSlidesByDefault(false);
	            ppe.setNotesByDefault(true);
	            assertEquals(ntText[0] + ntText[1], ppe.getText());

	            ppe.setSlidesByDefault(true);
	            ppe.setNotesByDefault(true);
	            assertEquals(slText[0] + slText[1] + "\n" + ntText[0] + ntText[1], ppe.getText());
	        } finally {
	            ppe.close();
	        }
	    }

	    /**
	     * Test that when presented with a PPT file missing the odd
	     * core record, we can still get the rest of the text out
	     *
	     * @throws IOException propagated from opening or closing the sample
	     */
	    @Test
	    public void testMissingCoreRecords() throws IOException {
	        PowerPointExtractor ppe = openExtractor("missing_core_records.ppt");
	        try {
	            String text = ppe.getText(true, false);
	            String nText = ppe.getNotes();

	            assertNotNull(text);
	            assertNotNull(nText);

	            // Notes record were corrupt, so don't expect any
	            // (expected value first, so a failure message reads correctly)
	            assertEquals(0, nText.length());

	            // Slide records were fine
	            assertTrue(text.startsWith("Using Disease Surveillance and Response"));
	        } finally {
	            ppe.close();
	        }
	    }

	    /**
	     * Asserts that two strings are equal, checking the length first and then
	     * each character in turn so that a failure names the first differing index.
	     *
	     * @param exp the expected text
	     * @param act the actual text
	     */
	    private void ensureTwoStringsTheSame(String exp, String act) {
	        assertEquals(exp.length(), act.length());
	        for (int i = 0; i < exp.length(); i++) {
	            assertEquals("Char " + i, exp.charAt(i), act.charAt(i));
	        }
	        assertEquals(exp, act);
	    }

	    /**
	     * An Excel file with embedded PowerPoint files
	     */
	    @Test
	    public void testExtractFromEmbeded() throws IOException {
	        InputStream is = POIDataSamples.getSpreadSheetInstance().openResourceAsStream("excel_with_embeded.xls");
	        POIFSFileSystem fs;
	        try {
	            fs = new POIFSFileSystem(is);
	        } finally {
	            // Same pattern as openExtractor: the stream is not needed once the filesystem exists
	            is.close();
	        }
	        try {
	            DirectoryNode root = fs.getRoot();
	            // Both extractors are created before either is closed, as in the original ordering
	            PowerPointExtractor ppe1 = assertExtractFromEmbedded(root, "MBD0000A3B6", "Sample PowerPoint file\nThis is the 1st file\nNot much too it\n");
	            PowerPointExtractor ppe2 = assertExtractFromEmbedded(root, "MBD0000A3B3", "Sample PowerPoint file\nThis is the 2nd file\nNot much too it either\n");
	            ppe2.close();
	            ppe1.close();
	        } finally {
	            fs.close();
	        }
	    }

	    /**
	     * Opens the embedded PowerPoint document stored under {@code entryName}
	     * and asserts its slide text.
	     *
	     * @param root directory that contains the embedded entry
	     * @param entryName name of the embedded directory entry
	     * @param expected expected result of {@code getText(true, false)}
	     * @return the extractor that was checked; the caller closes it
	     * @throws IOException propagated from reading the embedded document
	     */
	    private PowerPointExtractor assertExtractFromEmbedded(DirectoryNode root, String entryName, String expected)
	    throws IOException {
	        DirectoryNode dir = (DirectoryNode)root.getEntry(entryName);
	        assertTrue(dir.hasEntry("PowerPoint Document"));

	        // Check the text of this embedded file
	        HSLFSlideShowImpl ppt = new HSLFSlideShowImpl(dir);
	        PowerPointExtractor ppe = new PowerPointExtractor(ppt);
	        assertEquals(expected, ppe.getText(true, false));
	        return ppe;
	    }

	    /**
	     * A powerpoint file with embedded Word, Excel and PowerPoint files
	     */
	    @Test
	    public void testExtractFromOwnEmbeded() throws IOException {
	        PowerPointExtractor ppe = openExtractor("ppt_with_embeded.ppt");
	        try {
	            List<OLEShape> shapes = ppe.getOLEShapes();
	            assertEquals("Expected 6 ole shapes", 6, shapes.size());
	            int numPpt = 0, numDoc = 0, numXls = 0;
	            for (OLEShape ole : shapes) {
	                String name = ole.getInstanceName();
	                InputStream data = ole.getObjectData().getData();
	                try {
	                    if ("Worksheet".equals(name)) {
	                        HSSFWorkbook wb = new HSSFWorkbook(data);
	                        numXls++;
	                        wb.close();
	                    } else if ("Document".equals(name)) {
	                        HWPFDocument doc = new HWPFDocument(data);
	                        numDoc++;
	                        doc.close();
	                    } else if ("Presentation".equals(name)) {
	                        numPpt++;
	                        HSLFSlideShow ppt = new HSLFSlideShow(data);
	                        ppt.close();
	                    }
	                } finally {
	                    // Close the object data even when one of the documents fails to open
	                    data.close();
	                }
	            }
	            assertEquals("Expected 2 embedded Word Documents", 2, numDoc);
	            assertEquals("Expected 2 embedded Excel Spreadsheets", 2, numXls);
	            assertEquals("Expected 2 embedded PowerPoint Presentations", 2, numPpt);
	        } finally {
	            ppe.close();
	        }
	    }

	    /**
	     * Regression test named after bug 52991 (presumably a Bugzilla id - confirm):
	     * the object data of every OLE shape in badzip.ppt must be readable to
	     * the end without an exception.
	     */
	    @Test
	    public void test52991() throws IOException {
	        PowerPointExtractor ppe = openExtractor("badzip.ppt");
	        try {
	            for (OLEShape shape : ppe.getOLEShapes()) {
	                InputStream data = shape.getObjectData().getData();
	                try {
	                    IOUtils.copy(data, new ByteArrayOutputStream());
	                } finally {
	                    data.close();
	                }
	            }
	        } finally {
	            ppe.close();
	        }
	    }

	    /**
	     * From bug #45543
	     */
	    @Test
	    public void testWithComments() throws IOException {
	        PowerPointExtractor ppe1 = openExtractor("WithComments.ppt");
	        try {
	            String text = ppe1.getText();
	            assertFalse("Comments not in by default", text.contains("This is a test comment"));

	            ppe1.setCommentsByDefault(true);

	            text = ppe1.getText();
	            assertContains(text, "This is a test comment");
	        } finally {
	            ppe1.close();
	        }

	        // And another file
	        PowerPointExtractor ppe2 = openExtractor("45543.ppt");
	        try {
	            String text = ppe2.getText();
	            assertFalse("Comments not in by default", text.contains("testdoc"));

	            ppe2.setCommentsByDefault(true);

	            text = ppe2.getText();
	            assertContains(text, "testdoc");
	        } finally {
	            ppe2.close();
	        }
	    }

	    /**
	     * From bug #45537
	     */
	    @Test
	    public void testHeaderFooter() throws IOException {
	        String text;

	        // With a header on the notes
	        HSLFSlideShow ppt1 = openSlideShow("45537_Header.ppt");
	        try {
	            assertNotNull(ppt1.getNotesHeadersFooters());
	            assertEquals("testdoc test phrase", ppt1.getNotesHeadersFooters().getHeaderText());

	            PowerPointExtractor ppe1 = new PowerPointExtractor(ppt1.getSlideShowImpl());
	            try {
	                text = ppe1.getText();
	                assertFalse("Header shouldn't be there by default\n" + text, text.contains("testdoc"));
	                assertFalse("Header shouldn't be there by default\n" + text, text.contains("test phrase"));

	                ppe1.setNotesByDefault(true);
	                text = ppe1.getText();
	                assertContains(text, "testdoc");
	                assertContains(text, "test phrase");
	            } finally {
	                ppe1.close();
	            }
	        } finally {
	            ppt1.close();
	        }

	        // And with a footer, also on notes
	        HSLFSlideShow ppt2 = openSlideShow("45537_Footer.ppt");
	        try {
	            assertNotNull(ppt2.getNotesHeadersFooters());
	            assertEquals("testdoc test phrase", ppt2.getNotesHeadersFooters().getFooterText());
	        } finally {
	            ppt2.close();
	        }

	        PowerPointExtractor ppe2 = openExtractor("45537_Footer.ppt");
	        try {
	            text = ppe2.getText();
	            assertFalse("Footer shouldn't be there by default\n" + text, text.contains("testdoc"));
	            assertFalse("Footer shouldn't be there by default\n" + text, text.contains("test phrase"));

	            ppe2.setNotesByDefault(true);
	            text = ppe2.getText();
	            assertContains(text, "testdoc");
	            assertContains(text, "test phrase");
	        } finally {
	            ppe2.close();
	        }
	    }

	    /**
	     * With master text enabled, text placed on the master slide is extracted.
	     */
	    @Test
	    public void testSlideMasterText() throws IOException {
	        // NOTE(review): an unused local holding "This is the Master Title" was
	        // removed here; it was never asserted - confirm whether it should be.
	        String masterRandomText = "This text comes from the Master Slide";
	        String masterFooterText = "Footer from the master slide";
	        PowerPointExtractor ppe = openExtractor("WithMaster.ppt");
	        try {
	            ppe.setMasterByDefault(true);

	            String text = ppe.getText();
	            assertContains(text, masterRandomText);
	            assertContains(text, masterFooterText);
	        } finally {
	            ppe.close();
	        }
	    }

	    /**
	     * Master slide text is only extracted once enabled, and master
	     * placeholder text is never extracted.
	     */
	    @Test
	    public void testMasterText() throws IOException {
	        PowerPointExtractor ppe1 = openExtractor("master_text.ppt");
	        try {
	            // Initially not there
	            String text = ppe1.getText();
	            assertFalse(text.contains("Text that I added to the master slide"));

	            // Enable, shows up
	            ppe1.setMasterByDefault(true);
	            text = ppe1.getText();
	            assertContains(text, "Text that I added to the master slide");

	            // Make sure placeholder text does not come out
	            assertFalse(text.contains("Click to edit Master"));
	        } finally {
	            ppe1.close();
	        }

	        // Now with another file only containing master text
	        // Will always show up
	        PowerPointExtractor ppe2 = openExtractor("WithMaster.ppt");
	        try {
	            String masterText = "Footer from the master slide";

	            String text = ppe2.getText();
	            assertContainsIgnoreCase(text, "master");
	            assertContains(text, masterText);
	        } finally {
	            ppe2.close();
	        }
	    }

	    /**
	     * Bug #54880 Chinese text not extracted properly
	     */
	    @Test
	    public void testChineseText() throws IOException {
	        PowerPointExtractor ppe = openExtractor("54880_chinese.ppt");
	        try {
	            String text = ppe.getText();

	            // Check for the english text line
	            assertContains(text, "Single byte");

	            // Check for the english text in the mixed line
	            assertContains(text, "Mix");

	            // Check for the chinese text in the mixed line
	            assertContains(text, "\u8868");

	            // Check for the chinese only text line
	            assertContains(text, "\uff8a\uff9d\uff76\uff78");
	        } finally {
	            ppe.close();
	        }
	    }

	    /**
	     * Tests that we can work with both {@link OPOIFSFileSystem}
	     * and {@link NPOIFSFileSystem}
	     */
	    @SuppressWarnings("resource")
	    @Test
	    public void testDifferentPOIFS() throws IOException {
	        // Open the two filesystems
	        File pptFile = slTests.getFile("basic_test_ppt_file.ppt");
	        InputStream is1 = new FileInputStream(pptFile);
	        OPOIFSFileSystem opoifs = new OPOIFSFileSystem(is1);
	        is1.close();
	        NPOIFSFileSystem npoifs = new NPOIFSFileSystem(pptFile);

	        DirectoryNode[] files = { opoifs.getRoot(), npoifs.getRoot() };

	        // Open directly
	        // NOTE(review): these extractors are never closed; presumably closing
	        // them would also close the filesystems reused by the loop below -
	        // confirm before changing.
	        for (DirectoryNode dir : files) {
	            PowerPointExtractor extractor = new PowerPointExtractor(dir);
	            assertEquals(expectText, extractor.getText());
	        }

	        // Open via a HSLFSlideShow
	        for (DirectoryNode dir : files) {
	            HSLFSlideShowImpl slideshow = new HSLFSlideShowImpl(dir);
	            PowerPointExtractor extractor = new PowerPointExtractor(slideshow);
	            assertEquals(expectText, extractor.getText());
	            extractor.close();
	            slideshow.close();
	        }

	        npoifs.close();
	    }

	    /**
	     * Table cells come out tab separated, with one line per table row.
	     */
	    @Test
	    public void testTable() throws Exception {
	        PowerPointExtractor ppe1 = openExtractor("54111.ppt");
	        try {
	            String text1 = ppe1.getText();
	            String target1 = "TH Cell 1\tTH Cell 2\tTH Cell 3\tTH Cell 4\n"+
	                             "Row 1, Cell 1\tRow 1, Cell 2\tRow 1, Cell 3\tRow 1, Cell 4\n"+
	                             "Row 2, Cell 1\tRow 2, Cell 2\tRow 2, Cell 3\tRow 2, Cell 4\n"+
	                             "Row 3, Cell 1\tRow 3, Cell 2\tRow 3, Cell 3\tRow 3, Cell 4\n"+
	                             "Row 4, Cell 1\tRow 4, Cell 2\tRow 4, Cell 3\tRow 4, Cell 4\n"+
	                             "Row 5, Cell 1\tRow 5, Cell 2\tRow 5, Cell 3\tRow 5, Cell 4\n";
	            assertContains(text1, target1);
	        } finally {
	            ppe1.close();
	        }

	        PowerPointExtractor ppe2 = openExtractor("54722.ppt");
	        try {
	            String text2 = ppe2.getText();

	            String target2 = "this\tText\tis\twithin\ta\n" +
	                    "table\t1\t2\t3\t4";
	            assertContains(text2, target2);
	        } finally {
	            ppe2.close();
	        }
	    }

	    /**
	     * Bug 60003: with master text enabled, the extracted text must contain "Prague"
	     */
	    @Test
	    public void testExtractMasterSlideFooterText() throws Exception {
	        PowerPointExtractor ppe = openExtractor("60003.ppt");
	        try {
	            ppe.setMasterByDefault(true);

	            String text = ppe.getText();
	            assertContains(text, "Prague");
	        } finally {
	            ppe.close();
	        }
	    }
	}