| /* ==================================================================== |
| Licensed to the Apache Software Foundation (ASF) under one or more |
| contributor license agreements. See the NOTICE file distributed with |
| this work for additional information regarding copyright ownership. |
| The ASF licenses this file to You under the Apache License, Version 2.0 |
| (the "License"); you may not use this file except in compliance with |
| the License. You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| ==================================================================== */ |
| package org.apache.poi.extractor; |
| |
| import static org.apache.poi.POITestCase.assertContains; |
| import static org.junit.Assert.assertEquals; |
| import static org.junit.Assert.assertFalse; |
| import static org.junit.Assert.assertNotNull; |
| import static org.junit.Assert.assertNull; |
| import static org.junit.Assert.assertTrue; |
| import static org.junit.Assert.fail; |
| |
| import java.io.File; |
| import java.io.FileInputStream; |
| import java.io.IOException; |
| |
| import org.apache.poi.POIDataSamples; |
| import org.apache.poi.POIOLE2TextExtractor; |
| import org.apache.poi.POITextExtractor; |
| import org.apache.poi.POIXMLException; |
| import org.apache.poi.POIXMLTextExtractor; |
| import org.apache.poi.UnsupportedFileFormatException; |
| import org.apache.poi.hdgf.extractor.VisioTextExtractor; |
| import org.apache.poi.hpbf.extractor.PublisherTextExtractor; |
| import org.apache.poi.hslf.extractor.PowerPointExtractor; |
| import org.apache.poi.hsmf.extractor.OutlookTextExtactor; |
| import org.apache.poi.hssf.HSSFTestDataSamples; |
| import org.apache.poi.hssf.OldExcelFormatException; |
| import org.apache.poi.hssf.extractor.EventBasedExcelExtractor; |
| import org.apache.poi.hssf.extractor.ExcelExtractor; |
| import org.apache.poi.hwpf.extractor.Word6Extractor; |
| import org.apache.poi.hwpf.extractor.WordExtractor; |
| import org.apache.poi.openxml4j.opc.OPCPackage; |
| import org.apache.poi.openxml4j.opc.PackageAccess; |
| import org.apache.poi.poifs.filesystem.OPOIFSFileSystem; |
| import org.apache.poi.poifs.filesystem.POIFSFileSystem; |
| import org.apache.poi.util.IOUtils; |
| import org.apache.poi.xdgf.extractor.XDGFVisioExtractor; |
| import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor; |
| import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor; |
| import org.apache.poi.xssf.extractor.XSSFExcelExtractor; |
| import org.apache.poi.xwpf.extractor.XWPFWordExtractor; |
| import org.junit.BeforeClass; |
| import org.junit.Test; |
| |
| /** |
| * Test that the extractor factory plays nicely |
| */ |
| public class TestExtractorFactory { |
| private static File txt; |
| |
| private static File xls; |
| private static File xlsx; |
| private static File xlsxStrict; |
| private static File xltx; |
| private static File xlsEmb; |
| |
| private static File doc; |
| private static File doc6; |
| private static File doc95; |
| private static File docx; |
| private static File dotx; |
| private static File docEmb; |
| private static File docEmbOOXML; |
| |
| private static File ppt; |
| private static File pptx; |
| |
| private static File msg; |
| private static File msgEmb; |
| private static File msgEmbMsg; |
| |
| private static File vsd; |
| private static File vsdx; |
| |
| private static File pub; |
| |
| private static File getFileAndCheck(POIDataSamples samples, String name) { |
| File file = samples.getFile(name); |
| |
| assertNotNull("Did not get a file for " + name, file); |
| assertTrue("Did not get a type file for " + name, file.isFile()); |
| assertTrue("File did not exist: " + name, file.exists()); |
| |
| return file; |
| } |
| |
| @BeforeClass |
| public static void setUp() throws Exception { |
| |
| POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance(); |
| xls = getFileAndCheck(ssTests, "SampleSS.xls"); |
| xlsx = getFileAndCheck(ssTests, "SampleSS.xlsx"); |
| xlsxStrict = getFileAndCheck(ssTests, "SampleSS.strict.xlsx"); |
| xltx = getFileAndCheck(ssTests, "test.xltx"); |
| xlsEmb = getFileAndCheck(ssTests, "excel_with_embeded.xls"); |
| |
| POIDataSamples wpTests = POIDataSamples.getDocumentInstance(); |
| doc = getFileAndCheck(wpTests, "SampleDoc.doc"); |
| doc6 = getFileAndCheck(wpTests, "Word6.doc"); |
| doc95 = getFileAndCheck(wpTests, "Word95.doc"); |
| docx = getFileAndCheck(wpTests, "SampleDoc.docx"); |
| dotx = getFileAndCheck(wpTests, "test.dotx"); |
| docEmb = getFileAndCheck(wpTests, "word_with_embeded.doc"); |
| docEmbOOXML = getFileAndCheck(wpTests, "word_with_embeded_ooxml.doc"); |
| |
| POIDataSamples slTests = POIDataSamples.getSlideShowInstance(); |
| ppt = getFileAndCheck(slTests, "SampleShow.ppt"); |
| pptx = getFileAndCheck(slTests, "SampleShow.pptx"); |
| txt = getFileAndCheck(slTests, "SampleShow.txt"); |
| |
| POIDataSamples dgTests = POIDataSamples.getDiagramInstance(); |
| vsd = getFileAndCheck(dgTests, "Test_Visio-Some_Random_Text.vsd"); |
| vsdx = getFileAndCheck(dgTests, "test.vsdx"); |
| |
| POIDataSamples pubTests = POIDataSamples.getPublisherInstance(); |
| pub = getFileAndCheck(pubTests, "Simple.pub"); |
| |
| POIDataSamples olTests = POIDataSamples.getHSMFInstance(); |
| msg = getFileAndCheck(olTests, "quick.msg"); |
| msgEmb = getFileAndCheck(olTests, "attachment_test_msg.msg"); |
| msgEmbMsg = getFileAndCheck(olTests, "attachment_msg_pdf.msg"); |
| } |
| |
| @Test |
| public void testFile() throws Exception { |
| // Excel |
| POITextExtractor xlsExtractor = ExtractorFactory.createExtractor(xls); |
| assertNotNull("Had empty extractor for " + xls, xlsExtractor); |
| assertTrue("Expected instanceof ExcelExtractor, but had: " + xlsExtractor.getClass(), |
| xlsExtractor |
| instanceof ExcelExtractor |
| ); |
| assertTrue( |
| xlsExtractor.getText().length() > 200 |
| ); |
| xlsExtractor.close(); |
| |
| POITextExtractor extractor = ExtractorFactory.createExtractor(xlsx); |
| assertTrue( |
| extractor.getClass().getName(), |
| extractor |
| instanceof XSSFExcelExtractor |
| ); |
| extractor.close(); |
| |
| extractor = ExtractorFactory.createExtractor(xlsx); |
| assertTrue( |
| extractor.getText().length() > 200 |
| ); |
| extractor.close(); |
| |
| extractor = ExtractorFactory.createExtractor(xltx); |
| assertTrue( |
| extractor.getClass().getName(), |
| extractor |
| instanceof XSSFExcelExtractor |
| ); |
| extractor.close(); |
| |
| extractor = ExtractorFactory.createExtractor(xltx); |
| assertTrue( |
| extractor.getText().contains("test") |
| ); |
| extractor.close(); |
| |
| // TODO Support OOXML-Strict, see bug #57699 |
| try { |
| /*extractor =*/ ExtractorFactory.createExtractor(xlsxStrict); |
| fail("OOXML-Strict isn't yet supported"); |
| } catch (POIXMLException e) { |
| // Expected, for now |
| } |
| // extractor = ExtractorFactory.createExtractor(xlsxStrict); |
| // assertTrue( |
| // extractor |
| // instanceof XSSFExcelExtractor |
| // ); |
| // extractor.close(); |
| // |
| // extractor = ExtractorFactory.createExtractor(xlsxStrict); |
| // assertTrue( |
| // extractor.getText().contains("test") |
| // ); |
| // extractor.close(); |
| |
| |
| // Word |
| extractor = ExtractorFactory.createExtractor(doc); |
| assertTrue( |
| extractor |
| instanceof WordExtractor |
| ); |
| assertTrue( |
| extractor.getText().length() > 120 |
| ); |
| extractor.close(); |
| |
| extractor = ExtractorFactory.createExtractor(doc6); |
| assertTrue( |
| extractor |
| instanceof Word6Extractor |
| ); |
| assertTrue( |
| extractor.getText().length() > 20 |
| ); |
| extractor.close(); |
| |
| extractor = ExtractorFactory.createExtractor(doc95); |
| assertTrue( |
| extractor |
| instanceof Word6Extractor |
| ); |
| assertTrue( |
| extractor.getText().length() > 120 |
| ); |
| extractor.close(); |
| |
| extractor = ExtractorFactory.createExtractor(docx); |
| assertTrue( |
| extractor instanceof XWPFWordExtractor |
| ); |
| extractor.close(); |
| |
| extractor = ExtractorFactory.createExtractor(docx); |
| assertTrue( |
| extractor.getText().length() > 120 |
| ); |
| extractor.close(); |
| |
| extractor = ExtractorFactory.createExtractor(dotx); |
| assertTrue( |
| extractor instanceof XWPFWordExtractor |
| ); |
| extractor.close(); |
| |
| extractor = ExtractorFactory.createExtractor(dotx); |
| assertTrue( |
| extractor.getText().contains("Test") |
| ); |
| extractor.close(); |
| |
| // PowerPoint (PPT) |
| extractor = ExtractorFactory.createExtractor(ppt); |
| assertTrue( |
| extractor |
| instanceof PowerPointExtractor |
| ); |
| assertTrue( |
| extractor.getText().length() > 120 |
| ); |
| extractor.close(); |
| |
| // PowerPoint (PPTX) |
| extractor = ExtractorFactory.createExtractor(pptx); |
| assertTrue( |
| extractor |
| instanceof XSLFPowerPointExtractor |
| ); |
| assertTrue( |
| extractor.getText().length() > 120 |
| ); |
| extractor.close(); |
| |
| // Visio - binary |
| extractor = ExtractorFactory.createExtractor(vsd); |
| assertTrue( |
| extractor |
| instanceof VisioTextExtractor |
| ); |
| assertTrue( |
| extractor.getText().length() > 50 |
| ); |
| extractor.close(); |
| |
| // Visio - vsdx |
| extractor = ExtractorFactory.createExtractor(vsdx); |
| assertTrue( |
| extractor |
| instanceof XDGFVisioExtractor |
| ); |
| assertTrue( |
| extractor.getText().length() > 20 |
| ); |
| extractor.close(); |
| |
| // Publisher |
| extractor = ExtractorFactory.createExtractor(pub); |
| assertTrue( |
| extractor |
| instanceof PublisherTextExtractor |
| ); |
| assertTrue( |
| extractor.getText().length() > 50 |
| ); |
| extractor.close(); |
| |
| // Outlook msg |
| extractor = ExtractorFactory.createExtractor(msg); |
| assertTrue( |
| extractor |
| instanceof OutlookTextExtactor |
| ); |
| assertTrue( |
| extractor.getText().length() > 50 |
| ); |
| extractor.close(); |
| |
| // Text |
| try { |
| ExtractorFactory.createExtractor(txt); |
| fail(); |
| } catch(IllegalArgumentException e) { |
| // Good |
| } |
| } |
| |
| @Test |
| public void testInputStream() throws Exception { |
| // Excel |
| POITextExtractor extractor = ExtractorFactory.createExtractor(new FileInputStream(xls)); |
| assertTrue( |
| extractor |
| instanceof ExcelExtractor |
| ); |
| assertTrue( |
| extractor.getText().length() > 200 |
| ); |
| extractor.close(); |
| |
| extractor = ExtractorFactory.createExtractor(new FileInputStream(xlsx)); |
| assertTrue( |
| extractor.getClass().getName(), |
| extractor |
| instanceof XSSFExcelExtractor |
| ); |
| assertTrue( |
| extractor.getText().length() > 200 |
| ); |
| // TODO Support OOXML-Strict, see bug #57699 |
| // assertTrue( |
| // ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict)) |
| // instanceof XSSFExcelExtractor |
| // ); |
| // assertTrue( |
| // ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict)).getText().length() > 200 |
| // ); |
| extractor.close(); |
| |
| // Word |
| extractor = ExtractorFactory.createExtractor(new FileInputStream(doc)); |
| assertTrue( |
| extractor.getClass().getName(), |
| extractor |
| instanceof WordExtractor |
| ); |
| assertTrue( |
| extractor.getText().length() > 120 |
| ); |
| extractor.close(); |
| |
| extractor = ExtractorFactory.createExtractor(new FileInputStream(doc6)); |
| assertTrue( |
| extractor.getClass().getName(), |
| extractor |
| instanceof Word6Extractor |
| ); |
| assertTrue( |
| extractor.getText().length() > 20 |
| ); |
| extractor.close(); |
| |
| extractor = ExtractorFactory.createExtractor(new FileInputStream(doc95)); |
| assertTrue( |
| extractor.getClass().getName(), |
| extractor |
| instanceof Word6Extractor |
| ); |
| assertTrue( |
| extractor.getText().length() > 120 |
| ); |
| extractor.close(); |
| |
| extractor = ExtractorFactory.createExtractor(new FileInputStream(docx)); |
| assertTrue( |
| extractor |
| instanceof XWPFWordExtractor |
| ); |
| assertTrue( |
| extractor.getText().length() > 120 |
| ); |
| extractor.close(); |
| |
| // PowerPoint |
| extractor = ExtractorFactory.createExtractor(new FileInputStream(ppt)); |
| assertTrue( |
| extractor |
| instanceof PowerPointExtractor |
| ); |
| assertTrue( |
| extractor.getText().length() > 120 |
| ); |
| extractor.close(); |
| |
| extractor = ExtractorFactory.createExtractor(new FileInputStream(pptx)); |
| assertTrue( |
| extractor |
| instanceof XSLFPowerPointExtractor |
| ); |
| assertTrue( |
| extractor.getText().length() > 120 |
| ); |
| extractor.close(); |
| |
| // Visio |
| extractor = ExtractorFactory.createExtractor(new FileInputStream(vsd)); |
| assertTrue( |
| extractor |
| instanceof VisioTextExtractor |
| ); |
| assertTrue( |
| extractor.getText().length() > 50 |
| ); |
| extractor.close(); |
| |
| // Visio - vsdx |
| extractor = ExtractorFactory.createExtractor(new FileInputStream(vsdx)); |
| assertTrue( |
| extractor |
| instanceof XDGFVisioExtractor |
| ); |
| assertTrue( |
| extractor.getText().length() > 20 |
| ); |
| extractor.close(); |
| |
| // Publisher |
| extractor = ExtractorFactory.createExtractor(new FileInputStream(pub)); |
| assertTrue( |
| extractor |
| instanceof PublisherTextExtractor |
| ); |
| assertTrue( |
| extractor.getText().length() > 50 |
| ); |
| extractor.close(); |
| |
| // Outlook msg |
| extractor = ExtractorFactory.createExtractor(new FileInputStream(msg)); |
| assertTrue( |
| extractor |
| instanceof OutlookTextExtactor |
| ); |
| assertTrue( |
| extractor.getText().length() > 50 |
| ); |
| extractor.close(); |
| |
| // Text |
| try { |
| FileInputStream stream = new FileInputStream(txt); |
| try { |
| ExtractorFactory.createExtractor(stream); |
| fail(); |
| } finally { |
| IOUtils.closeQuietly(stream); |
| } |
| } catch(IllegalArgumentException e) { |
| // Good |
| } |
| } |
| |
| @Test |
| public void testPOIFS() throws Exception { |
| // Excel |
| assertTrue( |
| ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))) |
| instanceof ExcelExtractor |
| ); |
| assertTrue( |
| ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))).getText().length() > 200 |
| ); |
| |
| // Word |
| assertTrue( |
| ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc))) |
| instanceof WordExtractor |
| ); |
| assertTrue( |
| ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc))).getText().length() > 120 |
| ); |
| |
| assertTrue( |
| ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6))) |
| instanceof Word6Extractor |
| ); |
| assertTrue( |
| ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6))).getText().length() > 20 |
| ); |
| |
| assertTrue( |
| ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95))) |
| instanceof Word6Extractor |
| ); |
| assertTrue( |
| ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95))).getText().length() > 120 |
| ); |
| |
| // PowerPoint |
| assertTrue( |
| ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt))) |
| instanceof PowerPointExtractor |
| ); |
| assertTrue( |
| ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt))).getText().length() > 120 |
| ); |
| |
| // Visio |
| assertTrue( |
| ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))) |
| instanceof VisioTextExtractor |
| ); |
| assertTrue( |
| ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50 |
| ); |
| |
| // Publisher |
| assertTrue( |
| ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub))) |
| instanceof PublisherTextExtractor |
| ); |
| assertTrue( |
| ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub))).getText().length() > 50 |
| ); |
| |
| // Outlook msg |
| assertTrue( |
| ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg))) |
| instanceof OutlookTextExtactor |
| ); |
| assertTrue( |
| ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg))).getText().length() > 50 |
| ); |
| |
| // Text |
| try { |
| ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(txt))); |
| fail(); |
| } catch(IOException e) { |
| // Good |
| } |
| } |
| |
| |
| @Test |
| public void testOPOIFS() throws Exception { |
| // Excel |
| assertTrue( |
| ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(xls))) |
| instanceof ExcelExtractor |
| ); |
| assertTrue( |
| ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(xls))).getText().length() > 200 |
| ); |
| |
| // Word |
| assertTrue( |
| ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc))) |
| instanceof WordExtractor |
| ); |
| assertTrue( |
| ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc))).getText().length() > 120 |
| ); |
| |
| assertTrue( |
| ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc6))) |
| instanceof Word6Extractor |
| ); |
| assertTrue( |
| ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc6))).getText().length() > 20 |
| ); |
| |
| assertTrue( |
| ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc95))) |
| instanceof Word6Extractor |
| ); |
| assertTrue( |
| ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc95))).getText().length() > 120 |
| ); |
| |
| // PowerPoint |
| assertTrue( |
| ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(ppt))) |
| instanceof PowerPointExtractor |
| ); |
| assertTrue( |
| ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(ppt))).getText().length() > 120 |
| ); |
| |
| // Visio |
| assertTrue( |
| ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(vsd))) |
| instanceof VisioTextExtractor |
| ); |
| assertTrue( |
| ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50 |
| ); |
| |
| // Publisher |
| assertTrue( |
| ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(pub))) |
| instanceof PublisherTextExtractor |
| ); |
| assertTrue( |
| ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(pub))).getText().length() > 50 |
| ); |
| |
| // Outlook msg |
| assertTrue( |
| ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(msg))) |
| instanceof OutlookTextExtactor |
| ); |
| assertTrue( |
| ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(msg))).getText().length() > 50 |
| ); |
| |
| // Text |
| try { |
| ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(txt))); |
| fail(); |
| } catch(IOException e) { |
| // Good |
| } |
| } |
| |
| @Test |
| public void testPackage() throws Exception { |
| // Excel |
| POIXMLTextExtractor extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ)); |
| assertTrue(extractor instanceof XSSFExcelExtractor); |
| extractor.close(); |
| extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString())); |
| assertTrue(extractor.getText().length() > 200); |
| extractor.close(); |
| |
| // Word |
| extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString())); |
| assertTrue(extractor instanceof XWPFWordExtractor); |
| extractor.close(); |
| |
| extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString())); |
| assertTrue(extractor.getText().length() > 120); |
| extractor.close(); |
| |
| // PowerPoint |
| extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString())); |
| assertTrue(extractor instanceof XSLFPowerPointExtractor); |
| extractor.close(); |
| |
| extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString())); |
| assertTrue(extractor.getText().length() > 120); |
| extractor.close(); |
| |
| // Visio |
| extractor = ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString())); |
| assertTrue(extractor instanceof XDGFVisioExtractor); |
| assertTrue(extractor.getText().length() > 20); |
| extractor.close(); |
| |
| // Text |
| try { |
| ExtractorFactory.createExtractor(OPCPackage.open(txt.toString())); |
| fail("TestExtractorFactory.testPackage() failed on " + txt); |
| } catch(UnsupportedFileFormatException e) { |
| // Good |
| } catch (Exception e) { |
| System.out.println("TestExtractorFactory.testPackage() failed on " + txt); |
| throw e; |
| } |
| } |
| |
| @Test |
| public void testPreferEventBased() throws Exception { |
| assertFalse(ExtractorFactory.getPreferEventExtractor()); |
| assertFalse(ExtractorFactory.getThreadPrefersEventExtractors()); |
| assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors()); |
| |
| ExtractorFactory.setThreadPrefersEventExtractors(true); |
| |
| assertTrue(ExtractorFactory.getPreferEventExtractor()); |
| assertTrue(ExtractorFactory.getThreadPrefersEventExtractors()); |
| assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors()); |
| |
| ExtractorFactory.setAllThreadsPreferEventExtractors(false); |
| |
| assertFalse(ExtractorFactory.getPreferEventExtractor()); |
| assertTrue(ExtractorFactory.getThreadPrefersEventExtractors()); |
| assertEquals(Boolean.FALSE, ExtractorFactory.getAllThreadsPreferEventExtractors()); |
| |
| ExtractorFactory.setAllThreadsPreferEventExtractors(null); |
| |
| assertTrue(ExtractorFactory.getPreferEventExtractor()); |
| assertTrue(ExtractorFactory.getThreadPrefersEventExtractors()); |
| assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors()); |
| |
| |
| // Check we get the right extractors now |
| POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))); |
| assertTrue( |
| extractor |
| instanceof EventBasedExcelExtractor |
| ); |
| extractor.close(); |
| extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))); |
| assertTrue( |
| extractor.getText().length() > 200 |
| ); |
| extractor.close(); |
| |
| extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ)); |
| assertTrue(extractor instanceof XSSFEventBasedExcelExtractor); |
| extractor.close(); |
| |
| extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ)); |
| assertTrue( |
| extractor.getText().length() > 200 |
| ); |
| extractor.close(); |
| |
| |
| // Put back to normal |
| ExtractorFactory.setThreadPrefersEventExtractors(false); |
| assertFalse(ExtractorFactory.getPreferEventExtractor()); |
| assertFalse(ExtractorFactory.getThreadPrefersEventExtractors()); |
| assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors()); |
| |
| // And back |
| extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))); |
| assertTrue( |
| extractor |
| instanceof ExcelExtractor |
| ); |
| extractor.close(); |
| extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))); |
| assertTrue( |
| extractor.getText().length() > 200 |
| ); |
| extractor.close(); |
| |
| extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ)); |
| assertTrue( |
| extractor |
| instanceof XSSFExcelExtractor |
| ); |
| extractor.close(); |
| extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString())); |
| assertTrue( |
| extractor.getText().length() > 200 |
| ); |
| extractor.close(); |
| } |
| |
| /** |
| * Test embeded docs text extraction. For now, only |
| * does poifs embeded, but will do ooxml ones |
| * at some point. |
| */ |
| @Test |
| public void testEmbeded() throws Exception { |
| POIOLE2TextExtractor ext; |
| POITextExtractor[] embeds; |
| |
| // No embedings |
| ext = (POIOLE2TextExtractor) |
| ExtractorFactory.createExtractor(xls); |
| embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); |
| assertEquals(0, embeds.length); |
| ext.close(); |
| |
| // Excel |
| ext = (POIOLE2TextExtractor) |
| ExtractorFactory.createExtractor(xlsEmb); |
| embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); |
| |
| assertEquals(6, embeds.length); |
| int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX; |
| for (POITextExtractor embed : embeds) { |
| assertTrue(embed.getText().length() > 20); |
| |
| if (embed instanceof PowerPointExtractor) numPpt++; |
| else if (embed instanceof ExcelExtractor) numXls++; |
| else if (embed instanceof WordExtractor) numWord++; |
| else if (embed instanceof OutlookTextExtactor) numMsg++; |
| } |
| assertEquals(2, numPpt); |
| assertEquals(2, numXls); |
| assertEquals(2, numWord); |
| assertEquals(0, numMsg); |
| ext.close(); |
| |
| // Word |
| ext = (POIOLE2TextExtractor) |
| ExtractorFactory.createExtractor(docEmb); |
| embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); |
| |
| numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; |
| assertEquals(4, embeds.length); |
| for (POITextExtractor embed : embeds) { |
| assertTrue(embed.getText().length() > 20); |
| if (embed instanceof PowerPointExtractor) numPpt++; |
| else if (embed instanceof ExcelExtractor) numXls++; |
| else if (embed instanceof WordExtractor) numWord++; |
| else if (embed instanceof OutlookTextExtactor) numMsg++; |
| } |
| assertEquals(1, numPpt); |
| assertEquals(2, numXls); |
| assertEquals(1, numWord); |
| assertEquals(0, numMsg); |
| ext.close(); |
| |
| // Word which contains an OOXML file |
| ext = (POIOLE2TextExtractor) |
| ExtractorFactory.createExtractor(docEmbOOXML); |
| embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); |
| |
| numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; numWordX = 0; |
| assertEquals(3, embeds.length); |
| for (POITextExtractor embed : embeds) { |
| assertTrue(embed.getText().length() > 20); |
| if (embed instanceof PowerPointExtractor) numPpt++; |
| else if (embed instanceof ExcelExtractor) numXls++; |
| else if (embed instanceof WordExtractor) numWord++; |
| else if (embed instanceof OutlookTextExtactor) numMsg++; |
| else if (embed instanceof XWPFWordExtractor) numWordX++; |
| } |
| assertEquals(1, numPpt); |
| assertEquals(1, numXls); |
| assertEquals(0, numWord); |
| assertEquals(1, numWordX); |
| assertEquals(0, numMsg); |
| ext.close(); |
| |
| // Outlook |
| ext = (OutlookTextExtactor) |
| ExtractorFactory.createExtractor(msgEmb); |
| embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); |
| |
| numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; |
| assertEquals(1, embeds.length); |
| for (POITextExtractor embed : embeds) { |
| assertTrue(embed.getText().length() > 20); |
| if (embed instanceof PowerPointExtractor) numPpt++; |
| else if (embed instanceof ExcelExtractor) numXls++; |
| else if (embed instanceof WordExtractor) numWord++; |
| else if (embed instanceof OutlookTextExtactor) numMsg++; |
| } |
| assertEquals(0, numPpt); |
| assertEquals(0, numXls); |
| assertEquals(1, numWord); |
| assertEquals(0, numMsg); |
| ext.close(); |
| |
| // Outlook with another outlook file in it |
| ext = (OutlookTextExtactor) |
| ExtractorFactory.createExtractor(msgEmbMsg); |
| embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); |
| |
| numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; |
| assertEquals(1, embeds.length); |
| for (POITextExtractor embed : embeds) { |
| assertTrue(embed.getText().length() > 20); |
| if (embed instanceof PowerPointExtractor) numPpt++; |
| else if (embed instanceof ExcelExtractor) numXls++; |
| else if (embed instanceof WordExtractor) numWord++; |
| else if (embed instanceof OutlookTextExtactor) numMsg++; |
| } |
| assertEquals(0, numPpt); |
| assertEquals(0, numXls); |
| assertEquals(0, numWord); |
| assertEquals(1, numMsg); |
| ext.close(); |
| |
| // TODO - PowerPoint |
| // TODO - Publisher |
| // TODO - Visio |
| } |
| |
| private static final String[] EXPECTED_FAILURES = new String[] { |
| // password protected files |
| "spreadsheet/password.xls", |
| "spreadsheet/protected_passtika.xlsx", |
| "spreadsheet/51832.xls", |
| "document/PasswordProtected.doc", |
| "slideshow/Password_Protected-hello.ppt", |
| "slideshow/Password_Protected-56-hello.ppt", |
| "slideshow/Password_Protected-np-hello.ppt", |
| "slideshow/cryptoapi-proc2356.ppt", |
| //"document/bug53475-password-is-pass.docx", |
| //"document/bug53475-password-is-solrcell.docx", |
| "spreadsheet/xor-encryption-abc.xls", |
| "spreadsheet/35897-type4.xls", |
| //"poifs/protect.xlsx", |
| //"poifs/protected_sha512.xlsx", |
| //"poifs/extenxls_pwd123.xlsx", |
| //"poifs/protected_agile.docx", |
| "spreadsheet/58616.xlsx", |
| |
| // TODO: fails XMLExportTest, is this ok? |
| "spreadsheet/CustomXMLMapping-singleattributenamespace.xlsx", |
| "spreadsheet/55864.xlsx", |
| "spreadsheet/57890.xlsx", |
| |
| // TODO: these fail now with some NPE/file read error because we now try to compute every value via Cell.toString()! |
| "spreadsheet/44958.xls", |
| "spreadsheet/44958_1.xls", |
| "spreadsheet/testArraysAndTables.xls", |
| |
| // TODO: good to ignore? |
| "spreadsheet/sample-beta.xlsx", |
| |
| // This is actually a spreadsheet! |
| "hpsf/TestRobert_Flaherty.doc", |
| |
| // some files that are broken, eg Word 95, ... |
| "spreadsheet/43493.xls", |
| "spreadsheet/46904.xls", |
| "document/Bug50955.doc", |
| "slideshow/PPT95.ppt", |
| "openxml4j/OPCCompliance_CoreProperties_DCTermsNamespaceLimitedUseFAIL.docx", |
| "openxml4j/OPCCompliance_CoreProperties_DoNotUseCompatibilityMarkupFAIL.docx", |
| "openxml4j/OPCCompliance_CoreProperties_LimitedXSITypeAttribute_NotPresentFAIL.docx", |
| "openxml4j/OPCCompliance_CoreProperties_LimitedXSITypeAttribute_PresentWithUnauthorizedValueFAIL.docx", |
| "openxml4j/OPCCompliance_CoreProperties_OnlyOneCorePropertiesPartFAIL.docx", |
| "openxml4j/OPCCompliance_CoreProperties_UnauthorizedXMLLangAttributeFAIL.docx", |
| "openxml4j/OPCCompliance_DerivedPartNameFAIL.docx", |
| "openxml4j/invalid.xlsx", |
| "spreadsheet/54764-2.xlsx", // see TestXSSFBugs.bug54764() |
| "spreadsheet/54764.xlsx", // see TestXSSFBugs.bug54764() |
| "spreadsheet/Simple.xlsb", |
| "poifs/unknown_properties.msg", // POIFS properties corrupted |
| "poifs/only-zero-byte-streams.ole2", // No actual contents |
| "spreadsheet/poc-xmlbomb.xlsx", // contains xml-entity-expansion |
| "spreadsheet/poc-shared-strings.xlsx", // contains shared-string-entity-expansion |
| |
| // old Excel files, which we only support simple text extraction of |
| "spreadsheet/testEXCEL_2.xls", |
| "spreadsheet/testEXCEL_3.xls", |
| "spreadsheet/testEXCEL_4.xls", |
| "spreadsheet/testEXCEL_5.xls", |
| "spreadsheet/testEXCEL_95.xls", |
| |
| // OOXML Strict is not yet supported, see bug #57699 |
| "spreadsheet/SampleSS.strict.xlsx", |
| "spreadsheet/SimpleStrict.xlsx", |
| "spreadsheet/sample.strict.xlsx", |
| |
| // non-TNEF files |
| "ddf/Container.dat", |
| "ddf/47143.dat", |
| |
| // sheet cloning errors |
| "spreadsheet/47813.xlsx", |
| "spreadsheet/56450.xls", |
| "spreadsheet/57231_MixedGasReport.xls", |
| "spreadsheet/OddStyleRecord.xls", |
| "spreadsheet/WithChartSheet.xlsx", |
| "spreadsheet/chart_sheet.xlsx", |
| }; |
| |
| @Test |
| public void testFileLeak() throws Exception { |
| // run a number of files that might fail in order to catch |
| // leaked file resources when using file-leak-detector while |
| // running the test |
| |
| for(String file : EXPECTED_FAILURES) { |
| try { |
| ExtractorFactory.createExtractor(POIDataSamples.getSpreadSheetInstance().getFile(file)); |
| } catch (Exception e) { |
| // catch all exceptions here as we are only interested in file-handle leaks |
| } |
| } |
| } |
| |
| /** |
| * #59074 - Excel 95 files should give a helpful message, not just |
| * "No supported documents found in the OLE2 stream" |
| */ |
| @Test |
| public void bug59074() throws Exception { |
| try { |
| ExtractorFactory.createExtractor( |
| POIDataSamples.getSpreadSheetInstance().getFile("59074.xls")); |
| fail("Old excel formats not supported via ExtractorFactory"); |
| } catch (OldExcelFormatException e) { |
| // expected here |
| } |
| } |
| |
| @Test |
| public void testGetEmbeddedFromXMLExtractor() { |
| try { |
| // currently not implemented |
| ExtractorFactory.getEmbededDocsTextExtractors((POIXMLTextExtractor)null); |
| fail("Unsupported currently"); |
| } catch (IllegalStateException e) { |
| // expected here |
| } |
| } |
| |
| // This bug is currently open. This test will fail with "expected error not thrown" when the bug has been fixed. |
| // When this happens, change this from @Test(expected=...) to @Test |
| // bug 45565: text within TextBoxes is extracted by ExcelExtractor and WordExtractor |
| @Test(expected=AssertionError.class) |
| public void test45565() throws Exception { |
| POITextExtractor extractor = ExtractorFactory.createExtractor(HSSFTestDataSamples.getSampleFile("45565.xls")); |
| try { |
| String text = extractor.getText(); |
| assertContains(text, "testdoc"); |
| assertContains(text, "test phrase"); |
| } finally { |
| extractor.close(); |
| } |
| } |
| } |