Make modifications for future integration of POI 4.1.x.
Note: commons-codec (1.13) started throwing an exception on
an embedded base64-encoded document, so the base64 handling
in WordMLParser was changed to catch and record it.
TODO: determine whether the exception is correct and/or
whether we should handle it leniently. The WordML file was
generated by MS Word, and its embedded files should not
trigger exceptions.
diff --git a/tika-bundle/pom.xml b/tika-bundle/pom.xml
index 4c04b4d..3c4fcc8 100644
--- a/tika-bundle/pom.xml
+++ b/tika-bundle/pom.xml
@@ -172,6 +172,7 @@
poi|poi-scratchpad|
poi-ooxml|
poi-ooxml-schemas|
+ commons-math3|
curvesapi|
xmlbeans|
jackcess|
@@ -279,6 +280,18 @@
opennlp.tools.namefind;resolution:=optional,
opennlp.tools.authorage;resolution:=optional,
net.didion.jwnl;resolution:=optional,
+ net.sf.saxon;resolution:=optional,
+ net.sf.saxon.dom;resolution:=optional,
+ net.sf.saxon.om;resolution:=optional,
+ net.sf.saxon.query;resolution:=optional,
+ net.sf.saxon.sxpath;resolution:=optional,
+ net.sf.saxon.value;resolution:=optional,
+ org.apache.batik.anim.dom;resolution:=optional,
+ org.apache.batik.bridge;resolution:=optional,
+ org.apache.batik.ext.awt;resolution:=optional,
+ org.apache.batik.ext.awt.image.renderable;resolution:=optional,
+ org.apache.batik.gvt;resolution:=optional,
+ org.apache.batik.util;resolution:=optional,
org.apache.cxf.jaxrs.client;resolution:=optional,
org.apache.cxf.jaxrs.ext.multipart;resolution:=optional,
org.apache.commons.exec;resolution:=optional,
@@ -289,10 +302,6 @@
org.apache.commons.httpclient.params;resolution:=optional,
org.apache.commons.httpclient.protocol;resolution:=optional,
org.apache.commons.httpclient.util;resolution:=optional,
-
- org.apache.commons.math3.exception;resolution:=optional,
- org.apache.commons.math3.linear;resolution:=optional,
- org.apache.commons.math3.stat.regression;resolution:=optional,
org.apache.commons.vfs2;resolution:=optional,
org.apache.commons.vfs2.provider;resolution:=optional,
org.apache.commons.vfs2.util;resolution:=optional,
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 7c239aa..1c2cf91 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -334,8 +334,8 @@
<maven.shade.version>3.2.1</maven.shade.version>
<rat.version>0.13</rat.version>
<!-- NOTE: sync tukaani version with commons-compress in tika-parsers -->
- <poi.version>4.0.1</poi.version>
- <commons.compress.version>1.18</commons.compress.version>
+ <poi.version>4.1.1-SNAPSHOT</poi.version>
+ <commons.compress.version>1.19</commons.compress.version>
<commons.io.version>2.6</commons.io.version>
<commons.lang3.version>3.9</commons.lang3.version>
<gson.version>2.8.5</gson.version>
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index 98bfbe1..ab8a769 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -36,7 +36,7 @@
<properties>
<!-- NOTE: sync codec version with POI -->
- <codec.version>1.12</codec.version>
+ <codec.version>1.13</codec.version>
<!-- NOTE: sync tukaani version with commons-compress in tika-parent-->
<tukaani.version>1.8</tukaani.version>
<!-- NOTE: sync brotli version with commons-compress in tika-parent-->
@@ -78,7 +78,7 @@
<scope>test</scope>
</dependency>
- <!-- for java 10
+ <!-- for java 10
See TIKA-2778 for why we need to do this now.
May the gods of API design fix this in the future.
-->
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java
index 6c967e3..6f28648 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java
@@ -22,13 +22,11 @@
import java.util.Collections;
import java.util.Set;
-import org.apache.poi.hemf.extractor.HemfExtractor;
-import org.apache.poi.hemf.record.AbstractHemfComment;
-import org.apache.poi.hemf.record.HemfCommentPublic;
-import org.apache.poi.hemf.record.HemfCommentRecord;
-import org.apache.poi.hemf.record.HemfRecord;
-import org.apache.poi.hemf.record.HemfRecordType;
-import org.apache.poi.hemf.record.HemfText;
+import org.apache.poi.hemf.record.emf.HemfComment;
+import org.apache.poi.hemf.record.emf.HemfRecord;
+import org.apache.poi.hemf.record.emf.HemfRecordType;
+import org.apache.poi.hemf.record.emf.HemfText;
+import org.apache.poi.hemf.usermodel.HemfPicture;
import org.apache.poi.util.RecordFormatException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
@@ -74,41 +72,46 @@
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
try {
- HemfExtractor ex = new HemfExtractor(stream);
- long lastY = -1;
- long lastX = -1;
+ HemfPicture ex = new HemfPicture(stream);
+ double lastY = -1;
+ double lastX = -1;
long fudgeFactorX = 1000;//derive this from the font or frame/bounds information
StringBuilder buffer = new StringBuilder();
for (HemfRecord record : ex) {
- if (record.getRecordType() == HemfRecordType.comment) {
- AbstractHemfComment comment = ((HemfCommentRecord) record).getComment();
- if (comment instanceof HemfCommentPublic.MultiFormats) {
+ if (record.getEmfRecordType() == HemfRecordType.comment) {
+ HemfComment.EmfCommentData commentData = ((HemfComment.EmfComment) record).getCommentData();
+ if (commentData instanceof HemfComment.EmfCommentDataMultiformats) {
if (embeddedDocumentExtractor == null) {
embeddedDocumentExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
}
- handleMultiFormats((HemfCommentPublic.MultiFormats)comment, xhtml, embeddedDocumentExtractor);
- } else if (comment instanceof HemfCommentPublic.WindowsMetafile) {
+ handleMultiFormats(
+ (HemfComment.EmfCommentDataMultiformats)commentData, xhtml, embeddedDocumentExtractor);
+ } else if (commentData instanceof HemfComment.EmfCommentDataWMF) {
if (embeddedDocumentExtractor == null) {
embeddedDocumentExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
}
- handleWMF((HemfCommentPublic.WindowsMetafile)comment, xhtml, embeddedDocumentExtractor);
+ handleWMF(((HemfComment.EmfCommentDataWMF) commentData).getWMFData(),
+ xhtml, embeddedDocumentExtractor);
}
- } else if (record.getRecordType().equals(HemfRecordType.exttextoutw)) {
- HemfText.ExtTextOutW extTextOutW = (HemfText.ExtTextOutW) record;
- if (lastY > -1 && lastY != extTextOutW.getY()) {
+ } else if (record.getEmfRecordType().equals(HemfRecordType.extTextOutW)) {
+
+ HemfText.EmfExtTextOutW extTextOutW = (HemfText.EmfExtTextOutW) record;
+                //TODO: replace the strict y-coordinate equality check with a tolerance (delta) comparison
+
+ if (lastY > -1 && lastY != extTextOutW.getReference().getY()) {
xhtml.startElement("p");
xhtml.characters(buffer.toString());
xhtml.endElement("p");
buffer.setLength(0);
lastX = -1;
}
- if (lastX > -1 && extTextOutW.getX() - lastX > fudgeFactorX) {
+ if (lastX > -1 && extTextOutW.getReference().getX() - lastX > fudgeFactorX) {
buffer.append(" ");
}
String txt = extTextOutW.getText();
buffer.append(txt);
- lastY = extTextOutW.getY();
- lastX = extTextOutW.getX();
+ lastY = extTextOutW.getReference().getY();
+ lastX = extTextOutW.getReference().getX();
}
}
if (buffer.length() > 0) {
@@ -124,12 +127,12 @@
xhtml.endDocument();
}
- private void handleWMF(HemfCommentPublic.WindowsMetafile comment, ContentHandler contentHandler,
+ private void handleWMF(byte[] bytes, ContentHandler contentHandler,
EmbeddedDocumentExtractor embeddedDocumentExtractor) throws IOException, SAXException, TikaException {
Metadata embeddedMetadata = new Metadata();
embeddedMetadata.set(Metadata.CONTENT_TYPE, WMF_MEDIA_TYPE.toString());
if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
- try (InputStream is = TikaInputStream.get(comment.getWmfInputStream())) {
+ try (InputStream is = TikaInputStream.get(bytes)) {
embeddedDocumentExtractor.parseEmbedded(is,
new EmbeddedContentHandler(contentHandler), embeddedMetadata, false);
@@ -139,11 +142,13 @@
}
- private void handleMultiFormats(HemfCommentPublic.MultiFormats comment, ContentHandler handler,
+ private void handleMultiFormats(HemfComment.EmfCommentDataMultiformats commentData, ContentHandler handler,
EmbeddedDocumentExtractor embeddedDocumentExtractor) throws IOException, TikaException, SAXException {
- for (HemfCommentPublic.HemfMultiFormatsData data :
- ((HemfCommentPublic.MultiFormats) comment).getData()) {
- handleEmbedded(data.getData(), embeddedDocumentExtractor, handler);
+
+ for (HemfComment.EmfCommentDataFormat dataFormat :
+ commentData.getFormats()) {
+                //TODO: verify that getRawData() returns the complete embedded payload for each format
+ handleEmbedded(dataFormat.getRawData(), embeddedDocumentExtractor, handler);
}
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
index 756b7fd..95e7ba0 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
@@ -137,7 +137,7 @@
//We might consider not bothering to check for macros in root,
//if we know we're processing ppt based on content-type identified in metadata
- extractMacros(root.getNFileSystem(), xhtml,
+ extractMacros(root.getFileSystem(), xhtml,
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context));
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WMFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WMFParser.java
index 5343751..82020a9 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WMFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WMFParser.java
@@ -69,18 +69,18 @@
//this is pure hackery for specifying the font
//TODO: do what Graphics does by maintaining the stack, etc.!
//This fix should be done within POI
- if (record.getRecordType().equals(HwmfRecordType.createFontIndirect)) {
+ if (record.getWmfRecordType().equals(HwmfRecordType.createFontIndirect)) {
HwmfFont font = ((HwmfText.WmfCreateFontIndirect) record).getFont();
charset = (font.getCharset() == null || font.getCharset().getCharset() == null)
? LocaleUtil.CHARSET_1252 :
font.getCharset().getCharset();
}
- if (record.getRecordType().equals(HwmfRecordType.extTextOut)) {
+ if (record.getWmfRecordType().equals(HwmfRecordType.extTextOut)) {
HwmfText.WmfExtTextOut textOut = (HwmfText.WmfExtTextOut) record;
xhtml.startElement("p");
xhtml.characters(textOut.getText(charset));
xhtml.endElement("p");
- } else if (record.getRecordType().equals(HwmfRecordType.textOut)) {
+ } else if (record.getWmfRecordType().equals(HwmfRecordType.textOut)) {
HwmfText.WmfTextOut textOut = (HwmfText.WmfTextOut) record;
xhtml.startElement("p");
xhtml.characters(textOut.getText(charset));
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
index 6d1ea8e..202db8e 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
@@ -86,7 +86,7 @@
new WordMLHandler(ch),
new HyperlinkHandler(ch,
WORD_ML_URL),
- new PictHandler(ch,
+ new PictHandler(ch, metadata,
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context)));
}
@@ -180,6 +180,7 @@
private class PictHandler extends DefaultHandler {
final StringBuilder buffer = new StringBuilder();
+ final Metadata parentMetadata;
final ContentHandler handler;
byte[] rawBytes = null;
EmbeddedDocumentExtractor embeddedDocumentExtractor;
@@ -189,8 +190,10 @@
String pictSource = null;
final Base64 base64 = new Base64();
- public PictHandler(ContentHandler handler, EmbeddedDocumentExtractor embeddedDocumentExtractor) {
+ public PictHandler(ContentHandler handler, Metadata metadata,
+ EmbeddedDocumentExtractor embeddedDocumentExtractor) {
this.handler = handler;
+ this.parentMetadata = metadata;
this.embeddedDocumentExtractor = embeddedDocumentExtractor;
}
@@ -263,11 +266,17 @@
handleEmbedded();
} else if (BIN_DATA.equals(localName)) {
inBin = false;
- rawBytes = base64.decode(buffer.toString());
- //reset
- buffer.setLength(0);
-
- if (! inPict) {
+ boolean success = false;
+ try {
+ rawBytes = base64.decode(buffer.toString());
+ success = true;
+ } catch (IllegalArgumentException e) {
+ EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
+ } finally {
+ //reset
+ buffer.setLength(0);
+ }
+ if (success && ! inPict) {
handleEmbedded();
}
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/TestXXEInXML.java b/tika-parsers/src/test/java/org/apache/tika/TestXXEInXML.java
index 5fd3dca..367b3af 100644
--- a/tika-parsers/src/test/java/org/apache/tika/TestXXEInXML.java
+++ b/tika-parsers/src/test/java/org/apache/tika/TestXXEInXML.java
@@ -16,7 +16,9 @@
*/
package org.apache.tika;
+import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
import org.apache.tika.io.IOUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
@@ -28,7 +30,6 @@
import org.junit.Ignore;
import org.junit.Test;
import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
@@ -104,8 +105,9 @@
new AutoDetectParser(), new ParseContext());
}
+
@Test
- public void testXMLInZips() throws Exception {
+ public void testPOIOOXMLs() throws Exception {
for (String fileName : new String[]{
"testWORD.docx",
"testWORD_1img.docx",
@@ -119,15 +121,13 @@
"testPPT_2imgs.pptx",
"testPPT_comment.pptx",
"testPPT_EmbeddedPDF.pptx",
- "testPPT_macros.pptm",
- "testEPUB.epub"
+ "testPPT_macros.pptm"
}) {
- _testOOXML(fileName);
+ _testPOIOOXMLs(fileName);
}
}
- private void _testOOXML(String fileName) throws Exception {
-
+ private void _testPOIOOXMLs(String fileName) throws Exception {
Path originalOOXML = getResourceAsFile("/test-documents/"+fileName).toPath();
Path injected = injectZippedXMLs(originalOOXML, XXE, false);
@@ -138,10 +138,13 @@
Metadata metadata = new Metadata();
try {
p.parse(Files.newInputStream(injected), xhtml, metadata, parseContext);
-
- } catch (FileNotFoundException e) {
- e.printStackTrace();
- fail("problem with: "+fileName + ": "+ e.getMessage());
+ } catch (TikaException e) {
+ Throwable cause = e.getCause();
+ if (!(cause instanceof InvalidFormatException)) {
+ //as of POI 4.1.x
+ fail("POI should have thrown an IFE complaining about " +
+ "not being able to read content types part !");
+ }
} finally {
Files.delete(injected);
}
@@ -166,6 +169,33 @@
}
@Test
+ public void testXMLInZips() throws Exception {
+ for (String fileName : new String[]{
+ "testEPUB.epub"
+ }) {
+ _testXMLInZips(fileName);
+ }
+ }
+
+ private void _testXMLInZips(String fileName) throws Exception {
+ Path originalOOXML = getResourceAsFile("/test-documents/"+fileName).toPath();
+ Path injected = injectZippedXMLs(originalOOXML, XXE, false);
+
+ Parser p = new AutoDetectParser();
+ ContentHandler xhtml = new ToHTMLContentHandler();
+ ParseContext parseContext = new ParseContext();
+ //if the SafeContentHandler is turned off, this will throw an FNFE
+ Metadata metadata = new Metadata();
+ try {
+ p.parse(Files.newInputStream(injected), xhtml, metadata, parseContext);
+ } finally {
+ Files.delete(injected);
+ }
+
+ }
+
+
+ @Test
public void testDOM() throws Exception {
byte[] bytes = "<?xml version=\"1.0\" encoding=\"UTF-8\"?><document>blah</document>".getBytes(StandardCharsets.UTF_8);
byte[] injected = injectXML(bytes, XXE);
@@ -207,9 +237,4 @@
TikaConfig tikaConfig = new TikaConfig(new ByteArrayInputStream(injected));
}
}
-
-
-
-
-
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WMFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WMFParserTest.java
index fb2d631..8d04697 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WMFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WMFParserTest.java
@@ -37,6 +37,11 @@
testTextExtraction("testWMF_charset.wmf", 0, "普林斯");
}
+ @Test
+ public void testOneOff() throws Exception {
+ debug(getRecursiveMetadata("testWMF-bad.wmf"));
+ }
+
private void testTextExtraction(String fileName, int metaDataItemIndex, String expectedText) throws Exception {
List<Metadata> metadataList = getRecursiveMetadata(fileName);
Metadata wmfMetadata = metadataList.get(metaDataItemIndex);
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java
index 4e52792..1ee6d06 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java
@@ -47,8 +47,7 @@
@Test
public void testBasicWord() throws Exception {
List<Metadata> list = getRecursiveMetadata("testWORD2003.xml");
- assertEquals(8, list.size());
-
+ assertEquals(6, list.size());
Metadata m = list.get(0);//container doc
String xml = m.get(RecursiveParserWrapper.TIKA_CONTENT);
xml = xml.replaceAll("\\s+", " ");
@@ -81,7 +80,9 @@
//make sure embedded docs were properly processed
assertContains("moscow-birds",
- Arrays.asList(list.get(7).getValues(TikaCoreProperties.KEYWORDS)));
+ Arrays.asList(list.get(5).getValues(TikaCoreProperties.KEYWORDS)));
+
+ assertEquals("testJPEG_EXIF.jpg", list.get(5).get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
//check that text is extracted with breaks between elements
String txt = getText(getResourceAsStream("/test-documents/testWORD2003.xml"), new AutoDetectParser());
@@ -92,7 +93,6 @@
assertContains("footnote Figure", txt);
assertContains("test space", txt);
- assertEquals("testJPEG_EXIF.jpg", list.get(7).get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
}
@Test