blob: a8755e960121f2bd214f27217c535e62173c0dea [file] [log] [blame]
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.extractor.ooxml;
import static org.apache.poi.POITestCase.assertContains;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Locale;
import org.apache.poi.POIDataSamples;
import org.apache.poi.UnsupportedFileFormatException;
import org.apache.poi.extractor.POIOLE2TextExtractor;
import org.apache.poi.extractor.POITextExtractor;
import org.apache.poi.hssf.HSSFTestDataSamples;
import org.apache.poi.hssf.OldExcelFormatException;
import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.ooxml.extractor.ExtractorFactory;
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.poifs.filesystem.FileMagic;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
import org.apache.xmlbeans.XmlException;
import org.junit.Test;
/**
* Test that the extractor factory plays nicely
*/
public class TestExtractorFactory {
private static final POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance();
private static final File xls = getFileAndCheck(ssTests, "SampleSS.xls");
private static final File xlsx = getFileAndCheck(ssTests, "SampleSS.xlsx");
@SuppressWarnings("unused")
private static final File xlsxStrict = getFileAndCheck(ssTests, "SampleSS.strict.xlsx");
private static final File xltx = getFileAndCheck(ssTests, "test.xltx");
private static final File xlsEmb = getFileAndCheck(ssTests, "excel_with_embeded.xls");
private static final File xlsb = getFileAndCheck(ssTests, "testVarious.xlsb");
private static final POIDataSamples wpTests = POIDataSamples.getDocumentInstance();
private static final File doc = getFileAndCheck(wpTests, "SampleDoc.doc");
private static final File doc6 = getFileAndCheck(wpTests, "Word6.doc");
private static final File doc95 = getFileAndCheck(wpTests, "Word95.doc");
private static final File docx = getFileAndCheck(wpTests, "SampleDoc.docx");
private static final File dotx = getFileAndCheck(wpTests, "test.dotx");
private static final File docEmb = getFileAndCheck(wpTests, "word_with_embeded.doc");
private static final File docEmbOOXML = getFileAndCheck(wpTests, "word_with_embeded_ooxml.doc");
private static final POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
private static final File ppt = getFileAndCheck(slTests, "SampleShow.ppt");
private static final File pptx = getFileAndCheck(slTests, "SampleShow.pptx");
private static final File txt = getFileAndCheck(slTests, "SampleShow.txt");
private static final POIDataSamples olTests = POIDataSamples.getHSMFInstance();
private static final File msg = getFileAndCheck(olTests, "quick.msg");
private static final File msgEmb = getFileAndCheck(olTests, "attachment_test_msg.msg");
private static final File msgEmbMsg = getFileAndCheck(olTests, "attachment_msg_pdf.msg");
private static final POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
private static final File vsd = getFileAndCheck(dgTests, "Test_Visio-Some_Random_Text.vsd");
private static final File vsdx = getFileAndCheck(dgTests, "test.vsdx");
private static POIDataSamples pubTests = POIDataSamples.getPublisherInstance();
private static File pub = getFileAndCheck(pubTests, "Simple.pub");
private static File getFileAndCheck(POIDataSamples samples, String name) {
File file = samples.getFile(name);
assertNotNull("Did not get a file for " + name, file);
assertTrue("Did not get a type file for " + name, file.isFile());
assertTrue("File did not exist: " + name, file.exists());
return file;
}
private static final Object[] TEST_SET = {
"Excel", xls, "ExcelExtractor", 200,
"Excel - xlsx", xlsx, "XSSFExcelExtractor", 200,
"Excel - xltx", xltx, "XSSFExcelExtractor", -1,
"Excel - xlsb", xlsb, "XSSFBEventBasedExcelExtractor", -1,
"Word", doc, "WordExtractor", 120,
"Word - docx", docx, "XWPFWordExtractor", 120,
"Word - dotx", dotx, "XWPFWordExtractor", -1,
"Word 6", doc6, "Word6Extractor", 20,
"Word 95", doc95, "Word6Extractor", 120,
"PowerPoint", ppt, "SlideShowExtractor", 120,
"PowerPoint - pptx", pptx, "SlideShowExtractor", 120,
"Visio", vsd, "VisioTextExtractor", 50,
"Visio - vsdx", vsdx, "XDGFVisioExtractor", 20,
"Publisher", pub, "PublisherTextExtractor", 50,
"Outlook msg", msg, "OutlookTextExtactor", 50,
// TODO Support OOXML-Strict, see bug #57699
// xlsxStrict
};
@FunctionalInterface
interface FunctionEx<T, R> {
R apply(T t) throws IOException, OpenXML4JException, XmlException;
}
@Test
public void testFile() throws Exception {
for (int i = 0; i < TEST_SET.length; i += 4) {
try (POITextExtractor ext = ExtractorFactory.createExtractor((File) TEST_SET[i + 1])) {
testExtractor(ext, (String) TEST_SET[i], (String) TEST_SET[i + 2], (Integer) TEST_SET[i + 3]);
}
}
}
@Test(expected = IllegalArgumentException.class)
public void testFileInvalid() throws Exception {
// Text
try (POITextExtractor ignored = ExtractorFactory.createExtractor(txt)) {
fail("extracting from invalid package");
}
}
@Test
public void testInputStream() throws Exception {
testStream(ExtractorFactory::createExtractor, true);
}
@Test(expected = IllegalArgumentException.class)
public void testInputStreamInvalid() throws Exception {
testInvalid(ExtractorFactory::createExtractor);
}
@Test
public void testPOIFS() throws Exception {
testStream((f) -> ExtractorFactory.createExtractor(new POIFSFileSystem(f)), false);
}
@Test(expected = IOException.class)
public void testPOIFSInvalid() throws Exception {
testInvalid((f) -> ExtractorFactory.createExtractor(new POIFSFileSystem(f)));
}
private void testStream(final FunctionEx<FileInputStream, POITextExtractor> poifsIS, final boolean loadOOXML)
throws IOException, OpenXML4JException, XmlException {
for (int i = 0; i < TEST_SET.length; i += 4) {
File testFile = (File) TEST_SET[i + 1];
if (!loadOOXML && (testFile.getName().endsWith("x") || testFile.getName().endsWith("xlsb"))) {
continue;
}
try (FileInputStream fis = new FileInputStream(testFile);
POITextExtractor ext = poifsIS.apply(fis)) {
testExtractor(ext, (String) TEST_SET[i], (String) TEST_SET[i + 2], (Integer) TEST_SET[i + 3]);
} catch (IllegalArgumentException e) {
fail("failed to process "+testFile);
}
}
}
private void testExtractor(final POITextExtractor ext, final String testcase, final String extrClass, final Integer minLength) {
assertEquals("invalid extractor for " + testcase, extrClass, ext.getClass().getSimpleName());
final String actual = ext.getText();
if (minLength == -1) {
assertContains(actual.toLowerCase(Locale.ROOT), "test");
} else {
assertTrue("extracted content too short for " + testcase, actual.length() > minLength);
}
}
private void testInvalid(FunctionEx<FileInputStream, POITextExtractor> poifs) throws IOException, OpenXML4JException, XmlException {
// Text
try (FileInputStream fis = new FileInputStream(txt);
POITextExtractor ignored = poifs.apply(fis)) {
fail("extracting from invalid package");
} catch (IllegalArgumentException e) {
assertTrue("Had: " + e,
e.getMessage().contains(FileMagic.UNKNOWN.name()));
throw e;
}
}
@Test
public void testPackage() throws Exception {
for (int i = 0; i < TEST_SET.length; i += 4) {
final File testFile = (File) TEST_SET[i + 1];
if (!testFile.getName().endsWith("x")) {
continue;
}
try (final OPCPackage pkg = OPCPackage.open(testFile, PackageAccess.READ);
final POITextExtractor ext = ExtractorFactory.createExtractor(pkg)) {
testExtractor(ext, (String) TEST_SET[i], (String) TEST_SET[i + 2], (Integer) TEST_SET[i + 3]);
pkg.revert();
}
}
}
@Test(expected = UnsupportedFileFormatException.class)
public void testPackageInvalid() throws Exception {
// Text
try (final OPCPackage pkg = OPCPackage.open(txt, PackageAccess.READ);
final POITextExtractor ignored = ExtractorFactory.createExtractor(pkg)) {
fail("extracting from invalid package");
}
}
@Test
public void testPreferEventBased() throws Exception {
assertFalse(ExtractorFactory.getPreferEventExtractor());
assertFalse(ExtractorFactory.getThreadPrefersEventExtractors());
assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
ExtractorFactory.setThreadPrefersEventExtractors(true);
assertTrue(ExtractorFactory.getPreferEventExtractor());
assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
ExtractorFactory.setAllThreadsPreferEventExtractors(false);
assertFalse(ExtractorFactory.getPreferEventExtractor());
assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
assertEquals(Boolean.FALSE, ExtractorFactory.getAllThreadsPreferEventExtractors());
ExtractorFactory.setAllThreadsPreferEventExtractors(null);
assertTrue(ExtractorFactory.getPreferEventExtractor());
assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
// Check we get the right extractors now
POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
assertTrue(
extractor
instanceof EventBasedExcelExtractor
);
extractor.close();
extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
assertTrue(
extractor.getText().length() > 200
);
extractor.close();
extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
assertTrue(extractor instanceof XSSFEventBasedExcelExtractor);
extractor.close();
extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
assertTrue(
extractor.getText().length() > 200
);
extractor.close();
// Put back to normal
ExtractorFactory.setThreadPrefersEventExtractors(false);
assertFalse(ExtractorFactory.getPreferEventExtractor());
assertFalse(ExtractorFactory.getThreadPrefersEventExtractors());
assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
// And back
extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
assertTrue(
extractor
instanceof ExcelExtractor
);
extractor.close();
extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
assertTrue(
extractor.getText().length() > 200
);
extractor.close();
extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
assertTrue(
extractor
instanceof XSSFExcelExtractor
);
extractor.close();
extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()));
assertTrue(
extractor.getText().length() > 200
);
extractor.close();
}
/**
* Test embedded docs text extraction. For now, only
* does poifs embedded, but will do ooxml ones
* at some point.
*/
@Test
public void testEmbedded() throws Exception {
final Object[] testObj = {
"No embeddings", xls, "0-0-0-0-0-0",
"Excel", xlsEmb, "6-2-2-2-0-0",
"Word", docEmb, "4-1-2-1-0-0",
"Word which contains an OOXML file", docEmbOOXML, "3-0-1-1-0-1",
"Outlook", msgEmb, "1-1-0-0-0-0",
"Outlook with another outlook file in it", msgEmbMsg, "1-0-0-0-1-0",
};
for (int i=0; i<testObj.length; i+=3) {
try (final POIOLE2TextExtractor ext = ExtractorFactory.createExtractor((File)testObj[i+1])) {
final POITextExtractor[] embeds = ExtractorFactory.getEmbeddedDocsTextExtractors(ext);
int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX = 0;
for (POITextExtractor embed : embeds) {
assertTrue(embed.getText().length() > 20);
switch (embed.getClass().getSimpleName()) {
case "SlideShowExtractor":
numPpt++;
break;
case "ExcelExtractor":
numXls++;
break;
case "WordExtractor":
numWord++;
break;
case "OutlookTextExtactor":
numMsg++;
break;
case "XWPFWordExtractor":
numWordX++;
break;
}
}
final String actual = embeds.length+"-"+numWord+"-"+numXls+"-"+numPpt+"-"+numMsg+"-"+numWordX;
final String expected = (String)testObj[i+2];
assertEquals("invalid number of embeddings - "+testObj[i], expected, actual);
}
}
// TODO - PowerPoint
// TODO - Publisher
// TODO - Visio
}
private static final String[] EXPECTED_FAILURES = {
// password protected files
"spreadsheet/password.xls",
"spreadsheet/protected_passtika.xlsx",
"spreadsheet/51832.xls",
"document/PasswordProtected.doc",
"slideshow/Password_Protected-hello.ppt",
"slideshow/Password_Protected-56-hello.ppt",
"slideshow/Password_Protected-np-hello.ppt",
"slideshow/cryptoapi-proc2356.ppt",
//"document/bug53475-password-is-pass.docx",
//"document/bug53475-password-is-solrcell.docx",
"spreadsheet/xor-encryption-abc.xls",
"spreadsheet/35897-type4.xls",
//"poifs/protect.xlsx",
//"poifs/protected_sha512.xlsx",
//"poifs/extenxls_pwd123.xlsx",
//"poifs/protected_agile.docx",
"spreadsheet/58616.xlsx",
// TODO: fails XMLExportTest, is this ok?
"spreadsheet/CustomXMLMapping-singleattributenamespace.xlsx",
"spreadsheet/55864.xlsx",
"spreadsheet/57890.xlsx",
// TODO: these fail now with some NPE/file read error because we now try to compute every value via Cell.toString()!
"spreadsheet/44958.xls",
"spreadsheet/44958_1.xls",
"spreadsheet/testArraysAndTables.xls",
// TODO: good to ignore?
"spreadsheet/sample-beta.xlsx",
// This is actually a spreadsheet!
"hpsf/TestRobert_Flaherty.doc",
// some files that are broken, eg Word 95, ...
"spreadsheet/43493.xls",
"spreadsheet/46904.xls",
"document/Bug50955.doc",
"slideshow/PPT95.ppt",
"openxml4j/OPCCompliance_CoreProperties_DCTermsNamespaceLimitedUseFAIL.docx",
"openxml4j/OPCCompliance_CoreProperties_DoNotUseCompatibilityMarkupFAIL.docx",
"openxml4j/OPCCompliance_CoreProperties_LimitedXSITypeAttribute_NotPresentFAIL.docx",
"openxml4j/OPCCompliance_CoreProperties_LimitedXSITypeAttribute_PresentWithUnauthorizedValueFAIL.docx",
"openxml4j/OPCCompliance_CoreProperties_OnlyOneCorePropertiesPartFAIL.docx",
"openxml4j/OPCCompliance_CoreProperties_UnauthorizedXMLLangAttributeFAIL.docx",
"openxml4j/OPCCompliance_DerivedPartNameFAIL.docx",
"openxml4j/invalid.xlsx",
"spreadsheet/54764-2.xlsx", // see TestXSSFBugs.bug54764()
"spreadsheet/54764.xlsx", // see TestXSSFBugs.bug54764()
"spreadsheet/Simple.xlsb",
"poifs/unknown_properties.msg", // POIFS properties corrupted
"poifs/only-zero-byte-streams.ole2", // No actual contents
"spreadsheet/poc-xmlbomb.xlsx", // contains xml-entity-expansion
"spreadsheet/poc-xmlbomb-empty.xlsx", // contains xml-entity-expansion
"spreadsheet/poc-shared-strings.xlsx", // contains shared-string-entity-expansion
// old Excel files, which we only support simple text extraction of
"spreadsheet/testEXCEL_2.xls",
"spreadsheet/testEXCEL_3.xls",
"spreadsheet/testEXCEL_4.xls",
"spreadsheet/testEXCEL_5.xls",
"spreadsheet/testEXCEL_95.xls",
// OOXML Strict is not yet supported, see bug #57699
"spreadsheet/SampleSS.strict.xlsx",
"spreadsheet/SimpleStrict.xlsx",
"spreadsheet/sample.strict.xlsx",
// non-TNEF files
"ddf/Container.dat",
"ddf/47143.dat",
// sheet cloning errors
"spreadsheet/47813.xlsx",
"spreadsheet/56450.xls",
"spreadsheet/57231_MixedGasReport.xls",
"spreadsheet/OddStyleRecord.xls",
"spreadsheet/WithChartSheet.xlsx",
"spreadsheet/chart_sheet.xlsx",
};
@Test
public void testFileLeak() {
// run a number of files that might fail in order to catch
// leaked file resources when using file-leak-detector while
// running the test
for(String file : EXPECTED_FAILURES) {
try {
ExtractorFactory.createExtractor(POIDataSamples.getSpreadSheetInstance().getFile(file));
} catch (Exception e) {
// catch all exceptions here as we are only interested in file-handle leaks
}
}
}
/**
* #59074 - Excel 95 files should give a helpful message, not just
* "No supported documents found in the OLE2 stream"
*/
@Test(expected = OldExcelFormatException.class)
public void bug59074() throws Exception {
ExtractorFactory.createExtractor(
POIDataSamples.getSpreadSheetInstance().getFile("59074.xls"));
}
@SuppressWarnings("deprecation")
@Test(expected = IllegalStateException.class)
public void testGetEmbedFromXMLExtractor() {
// currently not implemented
ExtractorFactory.getEmbededDocsTextExtractors((POIXMLTextExtractor) null);
}
@Test(expected = IllegalStateException.class)
public void testGetEmbeddedFromXMLExtractor() {
// currently not implemented
ExtractorFactory.getEmbeddedDocsTextExtractors((POIXMLTextExtractor)null);
}
// This bug is currently open. This test will fail with "expected error not thrown" when the bug has been fixed.
// When this happens, change this from @Test(expected=...) to @Test
// bug 45565: text within TextBoxes is extracted by ExcelExtractor and WordExtractor
@Test(expected=AssertionError.class)
public void test45565() throws Exception {
try (POITextExtractor extractor = ExtractorFactory.createExtractor(HSSFTestDataSamples.getSampleFile("45565.xls"))) {
String text = extractor.getText();
assertContains(text, "testdoc");
assertContains(text, "test phrase");
}
}
}