TIKA-2356 -- temporary workaround for bug I added to POI (Bug 61034) <face_palm/>
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index 11277d5..de45e28 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -20,9 +20,11 @@
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
+import java.util.Set;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
@@ -125,6 +127,7 @@
try {
xssfReader = new XSSFReader(container);
styles = xssfReader.getStylesTable();
+
iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
strings = new ReadOnlySharedStringsTable(container);
} catch (InvalidFormatException e) {
@@ -133,25 +136,37 @@
throw new XmlException(oe);
}
+ //temporary workaround for POI-61034
+ //remove once POI 3.17-beta1 is released
+ Set<String> seen = new HashSet<>();
+
while (iter.hasNext()) {
- InputStream stream = iter.next();
- PackagePart sheetPart = iter.getSheetPart();
- addDrawingHyperLinks(sheetPart);
- sheetParts.add(sheetPart);
SheetTextAsHTML sheetExtractor = new SheetTextAsHTML(xhtml);
- CommentsTable comments = iter.getSheetComments();
+ PackagePart sheetPart = null;
+ try (InputStream stream = iter.next()) {
+ sheetPart = iter.getSheetPart();
+ final String partName = sheetPart.getPartName().toString();
+ if (seen.contains(partName)) {
+ continue;
+ }
+ seen.add(partName);
- // Start, and output the sheet name
- xhtml.startElement("div");
- xhtml.element("h1", iter.getSheetName());
+ addDrawingHyperLinks(sheetPart);
+ sheetParts.add(sheetPart);
- // Extract the main sheet contents
- xhtml.startElement("table");
- xhtml.startElement("tbody");
+ CommentsTable comments = iter.getSheetComments();
- processSheet(sheetExtractor, comments, styles, strings, stream);
+ // Start, and output the sheet name
+ xhtml.startElement("div");
+ xhtml.element("h1", iter.getSheetName());
+ // Extract the main sheet contents
+ xhtml.startElement("table");
+ xhtml.startElement("tbody");
+
+ processSheet(sheetExtractor, comments, styles, strings, stream);
+ }
xhtml.endElement("tbody");
xhtml.endElement("table");
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 525913f..d2ec7b6 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -35,9 +35,13 @@
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
import org.apache.poi.util.LocaleUtil;
import org.apache.tika.TikaTest;
@@ -1538,6 +1542,22 @@
}
+    @Test
+    public void testPOI61034() throws Exception {
+        //TIKA-2356: covers the temporary POI-61034 workaround (until POI 3.17-beta1); each sheet heading must appear once
+        XMLResult r = getXML("testEXCEL_poi-61034.xlsx");
+        Matcher m = Pattern.compile("<h1>(Sheet\\d+)</h1>").matcher(r.xml);
+        Set<String> seen = new HashSet<>();
+        while (m.find()) {
+            if (!seen.add(m.group(1))) { //add() returns false when this heading was already recorded
+                fail("Should only see each sheet once: "+m.group(1));
+            }
+        }
+        if (seen.isEmpty()) { //guard against a vacuous pass when no sheet heading matched at all
+            fail("Should see at least one sheet heading");
+        }
+    }
+
}
diff --git a/tika-parsers/src/test/resources/test-documents/testEXCEL_poi-61034.xlsx b/tika-parsers/src/test/resources/test-documents/testEXCEL_poi-61034.xlsx
new file mode 100644
index 0000000..cd2c5e5
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testEXCEL_poi-61034.xlsx
Binary files differ