| /* ==================================================================== |
| Licensed to the Apache Software Foundation (ASF) under one or more |
| contributor license agreements. See the NOTICE file distributed with |
| this work for additional information regarding copyright ownership. |
| The ASF licenses this file to You under the Apache License, Version 2.0 |
| (the "License"); you may not use this file except in compliance with |
| the License. You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| ==================================================================== */ |
| |
| package org.apache.poi.hslf.extractor; |
| |
| import static org.apache.poi.POITestCase.assertContains; |
| import static org.apache.poi.POITestCase.assertContainsIgnoreCase; |
| import static org.apache.poi.POITestCase.assertNotContained; |
| import static org.junit.jupiter.api.Assertions.assertEquals; |
| import static org.junit.jupiter.api.Assertions.assertFalse; |
| import static org.junit.jupiter.api.Assertions.assertNotNull; |
| import static org.junit.jupiter.api.Assertions.assertTrue; |
| |
| import java.io.ByteArrayOutputStream; |
| import java.io.File; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.security.MessageDigest; |
| import java.util.BitSet; |
| import java.util.List; |
| |
| import com.zaxxer.sparsebits.SparseBitSet; |
| import org.apache.commons.codec.binary.Base64; |
| import org.apache.commons.codec.binary.Hex; |
| import org.apache.poi.POIDataSamples; |
| import org.apache.poi.hslf.usermodel.HSLFObjectShape; |
| import org.apache.poi.hslf.usermodel.HSLFSlideShow; |
| import org.apache.poi.hssf.usermodel.HSSFWorkbook; |
| import org.apache.poi.hwpf.HWPFDocument; |
| import org.apache.poi.poifs.crypt.CryptoFunctions; |
| import org.apache.poi.poifs.crypt.HashAlgorithm; |
| import org.apache.poi.poifs.filesystem.DirectoryNode; |
| import org.apache.poi.poifs.filesystem.POIFSFileSystem; |
| import org.apache.poi.sl.extractor.SlideShowExtractor; |
| import org.apache.poi.sl.usermodel.ObjectShape; |
| import org.apache.poi.sl.usermodel.SlideShow; |
| import org.apache.poi.sl.usermodel.SlideShowFactory; |
| import org.apache.poi.util.IOUtils; |
| import org.apache.poi.util.NullOutputStream; |
| import org.junit.jupiter.api.Test; |
| |
| /** |
| * Tests that the extractor correctly gets the text out of our sample file |
| */ |
| public final class TestExtractor { |
| /** |
| * Extractor primed on the 2 page basic test data |
| */ |
| private static final String EXPECTED_PAGE1 = |
| "This is a test title\n" + |
| "This is a test subtitle\n\n" + |
| "This is on page 1\n"; |
| |
| private static final String EXPECTED_PAGE2 = |
| "This is the title on page 2\n" + |
| "This is page two\n\n" + |
| "It has several blocks of text\n\n" + |
| "None of them have formatting\n"; |
| |
| private static final String NOTES_PAGE1 = |
| "\nThese are the notes for page 1\n"; |
| |
| private static final String NOTES_PAGE2 = |
| "\nThese are the notes on page two, again lacking formatting\n"; |
| |
| /** |
| * Where our embeded files live |
| */ |
| private static final POIDataSamples slTests = POIDataSamples.getSlideShowInstance(); |
| |
| private SlideShowExtractor<?,?> openExtractor(String fileName) throws IOException { |
| try (InputStream is = slTests.openResourceAsStream(fileName)) { |
| return new SlideShowExtractor<>(SlideShowFactory.create(is)); |
| } |
| } |
| |
| @Test |
| void testReadSheetText() throws IOException { |
| // Basic 2 page example |
| try (SlideShowExtractor<?,?> ppe = openExtractor("basic_test_ppt_file.ppt")) { |
| assertEquals(EXPECTED_PAGE1+EXPECTED_PAGE2, ppe.getText()); |
| } |
| |
| // Extractor primed on the 1 page but text-box'd test data |
| final String expectText2 = |
| "Hello, World!!!\n" + |
| "I am just a poor boy\n" + |
| "This is Times New Roman\n" + |
| "Plain Text \n"; |
| |
| // 1 page example with text boxes |
| try (SlideShowExtractor<?,?> ppe = openExtractor("with_textbox.ppt")) { |
| assertEquals(expectText2, ppe.getText()); |
| } |
| } |
| |
| @Test |
| void testReadNoteText() throws IOException { |
| // Basic 2 page example |
| try (SlideShowExtractor<?,?> ppe = openExtractor("basic_test_ppt_file.ppt")) { |
| ppe.setNotesByDefault(true); |
| ppe.setSlidesByDefault(false); |
| ppe.setMasterByDefault(false); |
| String notesText = ppe.getText(); |
| assertEquals(NOTES_PAGE1+NOTES_PAGE2, notesText); |
| } |
| |
| // Other one doesn't have notes |
| try (SlideShowExtractor<?,?> ppe = openExtractor("with_textbox.ppt")) { |
| ppe.setNotesByDefault(true); |
| ppe.setSlidesByDefault(false); |
| ppe.setMasterByDefault(false); |
| String notesText = ppe.getText(); |
| String expText = ""; |
| assertEquals(expText, notesText); |
| } |
| } |
| |
| @Test |
| void testReadBoth() throws IOException { |
| String[] slText = { EXPECTED_PAGE1, EXPECTED_PAGE2 }; |
| String[] ntText = { NOTES_PAGE1, NOTES_PAGE2 }; |
| |
| try (SlideShowExtractor<?,?> ppe = openExtractor("basic_test_ppt_file.ppt")) { |
| ppe.setSlidesByDefault(true); |
| ppe.setNotesByDefault(false); |
| assertEquals(slText[0] + slText[1], ppe.getText()); |
| |
| ppe.setSlidesByDefault(false); |
| ppe.setNotesByDefault(true); |
| assertEquals(ntText[0] + ntText[1], ppe.getText()); |
| |
| ppe.setSlidesByDefault(true); |
| ppe.setNotesByDefault(true); |
| assertEquals(slText[0] + ntText[0] + slText[1] + ntText[1], ppe.getText()); |
| } |
| } |
| |
| /** |
| * Test that when presented with a PPT file missing the odd |
| * core record, we can still get the rest of the text out |
| */ |
| @Test |
| void testMissingCoreRecords() throws IOException { |
| try (SlideShowExtractor<?,?> ppe = openExtractor("missing_core_records.ppt")) { |
| ppe.setSlidesByDefault(true); |
| ppe.setNotesByDefault(false); |
| String text = ppe.getText(); |
| ppe.setSlidesByDefault(false); |
| ppe.setNotesByDefault(true); |
| String nText = ppe.getText(); |
| |
| assertNotNull(text); |
| assertNotNull(nText); |
| |
| // Notes record were corrupt, so don't expect any |
| assertEquals(nText.length(), 0); |
| |
| // Slide records were fine |
| assertContains(text, "Using Disease Surveillance and Response"); |
| } |
| } |
| |
| @Test |
| void testExtractFromEmbeded() throws IOException { |
| try (final InputStream is = POIDataSamples.getSpreadSheetInstance().openResourceAsStream("excel_with_embeded.xls"); |
| final POIFSFileSystem fs = new POIFSFileSystem(is)) { |
| final DirectoryNode root = fs.getRoot(); |
| |
| final String[] TEST_SET = { |
| "MBD0000A3B6", "Sample PowerPoint file\nThis is the 1st file\n\nNot much too it\n", |
| "MBD0000A3B3", "Sample PowerPoint file\nThis is the 2nd file\n\nNot much too it either\n" |
| }; |
| |
| for (int i=0; i<TEST_SET.length; i+=2) { |
| DirectoryNode dir = (DirectoryNode)root.getEntry(TEST_SET[i]); |
| assertTrue(dir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT)); |
| |
| try (final SlideShow<?,?> ppt = SlideShowFactory.create(dir); |
| final SlideShowExtractor<?,?> ppe = new SlideShowExtractor<>(ppt)) { |
| assertEquals(TEST_SET[i+1], ppe.getText()); |
| } |
| } |
| } |
| } |
| |
| /** |
| * A powerpoint file with embeded powerpoint files |
| */ |
| @Test |
| void testExtractFromOwnEmbeded() throws IOException { |
| try (SlideShowExtractor<?,?> ppe = openExtractor("ppt_with_embeded.ppt")) { |
| List<? extends ObjectShape<?,?>> shapes = ppe.getOLEShapes(); |
| assertEquals(6, shapes.size(), "Expected 6 ole shapes"); |
| int num_ppt = 0, num_doc = 0, num_xls = 0; |
| for (ObjectShape<?,?> ole : shapes) { |
| String name = ((HSLFObjectShape)ole).getInstanceName(); |
| InputStream data = ole.getObjectData().getInputStream(); |
| if ("Worksheet".equals(name)) { |
| HSSFWorkbook wb = new HSSFWorkbook(data); |
| num_xls++; |
| wb.close(); |
| } else if ("Document".equals(name)) { |
| HWPFDocument doc = new HWPFDocument(data); |
| num_doc++; |
| doc.close(); |
| } else if ("Presentation".equals(name)) { |
| num_ppt++; |
| HSLFSlideShow ppt = new HSLFSlideShow(data); |
| ppt.close(); |
| } |
| data.close(); |
| } |
| assertEquals(2, num_doc, "Expected 2 embedded Word Documents"); |
| assertEquals(2, num_xls, "Expected 2 embedded Excel Spreadsheets"); |
| assertEquals(2, num_ppt, "Expected 2 embedded PowerPoint Presentations"); |
| } |
| } |
| |
| /** |
| * A powerpoint file with embeded powerpoint files |
| */ |
| @Test |
| void test52991() throws IOException { |
| try (SlideShowExtractor<?,?> ppe = openExtractor("badzip.ppt")) { |
| List<? extends ObjectShape<?, ?>> shapes = ppe.getOLEShapes(); |
| assertEquals(1, shapes.size()); |
| MessageDigest sha2 = CryptoFunctions.getMessageDigest(HashAlgorithm.sha256); |
| try (InputStream is = shapes.get(0).getObjectData().getInputStream()) { |
| sha2.update(IOUtils.toByteArray(is)); |
| } |
| String exp = "lIRRfGMin6B4++WR4XvA82usdQ3ijeHBHU85j523sKY="; |
| String act = Base64.encodeBase64String(sha2.digest()); |
| assertEquals(exp, act); |
| } |
| } |
| |
| /** |
| * From bug #45543 |
| */ |
| @Test |
| void testWithComments() throws IOException { |
| try (final SlideShowExtractor<?,?> ppe = openExtractor("WithComments.ppt")) { |
| String text = ppe.getText(); |
| assertFalse(text.contains("This is a test comment"), "Comments not in by default"); |
| |
| ppe.setCommentsByDefault(true); |
| |
| text = ppe.getText(); |
| assertContains(text, "This is a test comment"); |
| } |
| |
| |
| // And another file |
| try (SlideShowExtractor<?,?> ppe = openExtractor("45543.ppt")) { |
| String text = ppe.getText(); |
| assertFalse(text.contains("testdoc"), "Comments not in by default"); |
| |
| ppe.setCommentsByDefault(true); |
| |
| text = ppe.getText(); |
| assertContains(text, "testdoc"); |
| } |
| } |
| |
| /** |
| * From bug #45537 |
| */ |
| @Test |
| void testHeaderFooter() throws IOException { |
| // With a header on the notes |
| try (InputStream is = slTests.openResourceAsStream("45537_Header.ppt"); |
| HSLFSlideShow ppt = new HSLFSlideShow(is)) { |
| |
| assertNotNull(ppt.getNotesHeadersFooters()); |
| assertEquals("testdoc test phrase", ppt.getNotesHeadersFooters().getHeaderText()); |
| |
| testHeaderFooterInner(ppt); |
| } |
| |
| // And with a footer, also on notes |
| try (final InputStream is = slTests.openResourceAsStream("45537_Footer.ppt"); |
| final HSLFSlideShow ppt = new HSLFSlideShow(is)) { |
| assertNotNull(ppt.getNotesHeadersFooters()); |
| assertEquals("testdoc test phrase", ppt.getNotesHeadersFooters().getFooterText()); |
| |
| testHeaderFooterInner(ppt); |
| } |
| } |
| |
| private void testHeaderFooterInner(final HSLFSlideShow ppt) throws IOException { |
| try (final SlideShowExtractor<?,?> ppe = new SlideShowExtractor<>(ppt)) { |
| String text = ppe.getText(); |
| assertFalse(text.contains("testdoc"), "Header shouldn't be there by default\n" + text); |
| assertFalse(text.contains("test phrase"), "Header shouldn't be there by default\n" + text); |
| |
| ppe.setNotesByDefault(true); |
| text = ppe.getText(); |
| assertContains(text, "testdoc"); |
| assertContains(text, "test phrase"); |
| } |
| } |
| |
| @Test |
| void testSlideMasterText() throws IOException { |
| String masterTitleText = "This is the Master Title"; |
| String masterRandomText = "This text comes from the Master Slide"; |
| String masterFooterText = "Footer from the master slide"; |
| try (final SlideShowExtractor<?,?> ppe = openExtractor("WithMaster.ppt")) { |
| ppe.setMasterByDefault(true); |
| |
| String text = ppe.getText(); |
| assertContains(text, masterRandomText); |
| assertNotContained(text, masterTitleText); |
| |
| //make sure that the footer only appears once |
| int masterFooters = 0; |
| int offset = text.indexOf(masterFooterText); |
| while (offset > -1) { |
| masterFooters++; |
| offset = text.indexOf(masterFooterText, offset+1); |
| } |
| assertEquals(1, masterFooters); |
| } |
| } |
| |
| @Test |
| void testSlideMasterText2() throws IOException { |
| try (final SlideShowExtractor<?,?> ppe = openExtractor("bug62591.ppt")) { |
| ppe.setMasterByDefault(true); |
| String text = ppe.getText(); |
| assertNotContained(text, "Titelmasterformat"); |
| } |
| } |
| |
| @Test |
| void testMasterText() throws IOException { |
| try (final SlideShowExtractor<?,?> ppe = openExtractor("master_text.ppt")) { |
| // Initially not there |
| String text = ppe.getText(); |
| assertFalse(text.contains("Text that I added to the master slide")); |
| |
| // Enable, shows up |
| ppe.setMasterByDefault(true); |
| text = ppe.getText(); |
| assertContains(text, "Text that I added to the master slide"); |
| |
| // Make sure placeholder text does not come out |
| assertNotContained(text, "Click to edit Master"); |
| } |
| |
| // Now with another file only containing master text |
| // Will always show up |
| try (final SlideShowExtractor<?,?> ppe = openExtractor("WithMaster.ppt")) { |
| String masterText = "Footer from the master slide"; |
| |
| String text = ppe.getText(); |
| assertContainsIgnoreCase(text, "master"); |
| assertContains(text, masterText); |
| } |
| } |
| |
| /** |
| * Bug #54880 Chinese text not extracted properly |
| */ |
| @Test |
| void testChineseText() throws IOException { |
| try (final SlideShowExtractor<?,?> ppe = openExtractor("54880_chinese.ppt")) { |
| String text = ppe.getText(); |
| |
| // Check for the english text line |
| assertContains(text, "Single byte"); |
| |
| // Check for the english text in the mixed line |
| assertContains(text, "Mix"); |
| |
| // Check for the chinese text in the mixed line |
| assertContains(text, "\u8868"); |
| |
| // Check for the chinese only text line |
| assertContains(text, "\uff8a\uff9d\uff76\uff78"); |
| } |
| } |
| |
| /** |
| * Tests that we can work with both {@link POIFSFileSystem} |
| * and {@link POIFSFileSystem} |
| */ |
| @SuppressWarnings("resource") |
| @Test |
| void testDifferentPOIFS() throws IOException { |
| // Open the two filesystems |
| File pptFile = slTests.getFile("basic_test_ppt_file.ppt"); |
| try (final POIFSFileSystem poifs = new POIFSFileSystem(pptFile, true)) { |
| // Open directly |
| try (SlideShow<?,?> ppt = SlideShowFactory.create(poifs.getRoot()); |
| SlideShowExtractor<?,?> extractor = new SlideShowExtractor<>(ppt)) { |
| assertEquals(EXPECTED_PAGE1+EXPECTED_PAGE2, extractor.getText()); |
| } |
| } |
| } |
| |
| @Test |
| void testTable() throws Exception { |
| try (SlideShowExtractor<?,?> ppe = openExtractor("54111.ppt")) { |
| String text = ppe.getText(); |
| String target = "TH Cell 1\tTH Cell 2\tTH Cell 3\tTH Cell 4\n" + |
| "Row 1, Cell 1\tRow 1, Cell 2\tRow 1, Cell 3\tRow 1, Cell 4\n" + |
| "Row 2, Cell 1\tRow 2, Cell 2\tRow 2, Cell 3\tRow 2, Cell 4\n" + |
| "Row 3, Cell 1\tRow 3, Cell 2\tRow 3, Cell 3\tRow 3, Cell 4\n" + |
| "Row 4, Cell 1\tRow 4, Cell 2\tRow 4, Cell 3\tRow 4, Cell 4\n" + |
| "Row 5, Cell 1\tRow 5, Cell 2\tRow 5, Cell 3\tRow 5, Cell 4\n"; |
| assertContains(text, target); |
| } |
| |
| try (SlideShowExtractor<?,?> ppe = openExtractor("54722.ppt")) { |
| String text = ppe.getText(); |
| |
| String target = "this\tText\tis\twithin\ta\n" + |
| "table\t1\t2\t3\t4"; |
| assertContains(text, target); |
| } |
| } |
| |
| // bug 60003 |
| @Test |
| void testExtractMasterSlideFooterText() throws Exception { |
| try (SlideShowExtractor<?,?> ppe = openExtractor("60003.ppt")) { |
| ppe.setMasterByDefault(true); |
| |
| String text = ppe.getText(); |
| assertContains(text, "Prague"); |
| } |
| } |
| |
| @Test |
| void testExtractGroupedShapeText() throws Exception { |
| try (final SlideShowExtractor<?,?> ppe = openExtractor("bug62092.ppt")) { |
| final String text = ppe.getText(); |
| |
| //this tests that we're ignoring text shapes at depth=0 |
| //i.e. POI has already included them in the slide's getTextParagraphs() |
| assertContains(text, "Text box1"); |
| assertEquals(1, countMatches(text,"Text box1")); |
| |
| |
| //the WordArt and text box count tests will fail |
| //if this content is available via getTextParagraphs() of the slide in POI |
| //i.e. when POI is fixed, these tests will fail, and |
| //we'll have to remove the workaround in HSLFExtractor's extractGroupText(...) |
| assertEquals(1, countMatches(text,"WordArt1")); |
| assertEquals(1, countMatches(text,"WordArt2")); |
| assertEquals(1, countMatches(text,"Ungrouped text box"));//should only be 1 |
| assertContains(text, "Text box2"); |
| assertContains(text, "Text box3"); |
| assertContains(text, "Text box4"); |
| assertContains(text, "Text box5"); |
| |
| //see below -- need to extract hyperlinks |
| assertContains(text, "tika"); |
| assertContains(text, "MyTitle"); |
| |
| } |
| } |
| |
| private static int countMatches(final String base, final String find) { |
| return base.split(find).length-1; |
| } |
| |
| @Test |
| void glyphCounting() throws IOException { |
| String[] expected = { |
| "Times New Roman", "\t\n ,-./01234679:ABDEFGILMNOPRSTVWabcdefghijklmnoprstuvwxyz\u00F3\u201C\u201D", |
| "Arial", " Lacdilnost" |
| }; |
| |
| StringBuilder sb = new StringBuilder(); |
| |
| try (SlideShowExtractor<?,?> ppt = openExtractor("45543.ppt")) { |
| for (int i=0; i<expected.length; i+=2) { |
| final String font = expected[i]; |
| final String cps = expected[i+1]; |
| |
| sb.setLength(0); |
| |
| BitSet l1 = ppt.getCodepoints(font, null, null); |
| l1.stream().mapToObj(Character::toChars).forEach(sb::append); |
| assertEquals(cps, sb.toString()); |
| |
| sb.setLength(0); |
| |
| SparseBitSet l2 = ppt.getCodepointsInSparseBitSet(font, null, null); |
| int cp = 0; |
| while ((cp = l2.nextSetBit(cp+1)) != -1) { |
| sb.append(Character.toChars(cp)); |
| } |
| assertEquals(cps, sb.toString()); |
| } |
| } |
| } |
| } |