NPOIFS->POIFS and add jackcess shim
diff --git a/tika-eval/pom.xml b/tika-eval/pom.xml
index c7d28fd..9289116 100644
--- a/tika-eval/pom.xml
+++ b/tika-eval/pom.xml
@@ -36,7 +36,7 @@
<properties>
<cli.version>1.4</cli.version> <!--sync version with tika-server or move to parent? -->
<lucene.version>7.4.0</lucene.version>
- <poi.version>3.17</poi.version>
+ <poi.version>4.0.0</poi.version>
</properties>
<dependencies>
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index f279f32..9c6437e 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -35,7 +35,7 @@
<url>http://tika.apache.org/</url>
<properties>
- <poi.version>4.0.0-SNAPSHOT</poi.version>
+ <poi.version>4.0.0</poi.version>
<!-- NOTE: sync codec version with POI -->
<codec.version>1.11</codec.version>
<!-- NOTE: sync tukaani version with commons-compress in tika-parent-->
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
index ff5971a..0dd86ba 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
@@ -64,7 +64,7 @@
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.Entry;
-import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
@@ -139,7 +139,7 @@
* or writing the extracted content
*/
protected void parse(
- NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml,
+ POIFSFileSystem filesystem, XHTMLContentHandler xhtml,
Locale locale) throws IOException, SAXException, TikaException {
parse(filesystem.getRoot(), xhtml, locale);
}
@@ -273,7 +273,7 @@
* @throws IOException on any IO errors.
* @throws SAXException on any SAX parsing errors.
*/
- public void processFile(NPOIFSFileSystem filesystem, boolean listenForAllRecords)
+ public void processFile(POIFSFileSystem filesystem, boolean listenForAllRecords)
throws IOException, SAXException, TikaException {
processFile(filesystem.getRoot(), listenForAllRecords);
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
index 9990f30..5095709 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
@@ -45,7 +45,7 @@
import org.apache.poi.hslf.usermodel.HSLFTextRun;
import org.apache.poi.hslf.usermodel.HSLFTextShape;
import org.apache.poi.poifs.filesystem.DirectoryNode;
-import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.sl.usermodel.Comment;
import org.apache.poi.sl.usermodel.SimpleShape;
import org.apache.tika.exception.TikaException;
@@ -68,7 +68,7 @@
}
protected void parse(
- NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
+ POIFSFileSystem filesystem, XHTMLContentHandler xhtml)
throws IOException, SAXException, TikaException {
parse(filesystem.getRoot(), xhtml);
}
@@ -270,9 +270,9 @@
long persistId = vbaAtom.getPersistIdRef();
for (HSLFObjectData objData : ppt.getEmbeddedObjects()) {
if (objData.getExOleObjStg().getPersistId() == persistId) {
- try (NPOIFSFileSystem npoifsFileSystem = new NPOIFSFileSystem(objData.getInputStream())) {
+ try (POIFSFileSystem poifsFileSystem = new POIFSFileSystem(objData.getInputStream())) {
try {
- OfficeParser.extractMacros(npoifsFileSystem, xhtml,
+ OfficeParser.extractMacros(poifsFileSystem, xhtml,
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context));
} catch (IOException|SAXException inner) {
EmbeddedDocumentUtil.recordException(inner, parentMetadata);
@@ -494,18 +494,18 @@
}
if (mediaType.equals("application/x-tika-msoffice-embedded; format=comp_obj")
|| mediaType.equals("application/x-tika-msoffice")) {
- NPOIFSFileSystem npoifs = null;
+ POIFSFileSystem poifs = null;
try {
- npoifs = new NPOIFSFileSystem(new CloseShieldInputStream(stream));
+ poifs = new POIFSFileSystem(new CloseShieldInputStream(stream));
} catch (RuntimeException e) {
throw new IOExceptionWithCause(e);
}
try {
- handleEmbeddedOfficeDoc(npoifs.getRoot(), objID, xhtml);
+ handleEmbeddedOfficeDoc(poifs.getRoot(), objID, xhtml);
} finally {
- if (npoifs != null) {
- npoifs.close();
+ if (poifs != null) {
+ poifs.close();
}
}
} else {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessCompoundOleUtil.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessCompoundOleUtil.java
new file mode 100644
index 0000000..b09f19d
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessCompoundOleUtil.java
@@ -0,0 +1,268 @@
+/*
+Copyright (c) 2013 James Ahlborn
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package org.apache.tika.parser.microsoft;
+
+import com.healthmarketscience.jackcess.RuntimeIOException;
+import com.healthmarketscience.jackcess.impl.ByteUtil;
+import com.healthmarketscience.jackcess.impl.CustomToStringStyle;
+import com.healthmarketscience.jackcess.util.MemFileChannel;
+import com.healthmarketscience.jackcess.util.OleBlob;
+import org.apache.commons.lang.builder.ToStringBuilder;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.UnsupportedEncodingException;
+import java.net.URLDecoder;
+import java.net.URLEncoder;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+/**
+ * Temporary copy/paste from Jackcess to allow upgrade to POI 4.0.0.
+ * This class will be removed once POI 4.0.0 is released and jackcess
+ * updates to the most recent version of POI.
+ * @deprecated -- this class will be removed in Tika >= 1.20
+ */
+@Deprecated
+class JackcessCompoundOleUtil implements JackcessOleUtil.CompoundPackageFactory {
+ private static final String ENTRY_NAME_CHARSET = "UTF-8";
+ private static final String ENTRY_SEPARATOR = "/";
+ private static final String CONTENTS_ENTRY = "CONTENTS";
+
+ static {
+ // force a poi class to be loaded to ensure that when this class is
+ // loaded, we know that the poi classes are available
+ POIFSFileSystem.class.getName();
+ }
+
+ public JackcessCompoundOleUtil() {
+ }
+
+ /**
+ * Creates a new CompoundContent for the given blob information.
+ */
+ public JackcessOleUtil.ContentImpl createCompoundPackageContent(
+ JackcessOleUtil.OleBlobImpl blob, String prettyName, String className, String typeName,
+ ByteBuffer blobBb, int dataBlockLen) {
+ return new CompoundContentImpl(blob, prettyName, className, typeName,
+ blobBb.position(), dataBlockLen);
+ }
+
+ /**
+ * Gets a DocumentEntry from compound storage based on a fully qualified,
+ * encoded entry name.
+ *
+ * @param entryName fully qualified, encoded entry name
+ * @param dir root directory of the compound storage
+ * @return the relevant DocumentEntry
+ * @throws FileNotFoundException if the entry does not exist
+ * @throws IOException if some other io error occurs
+ */
+ public static DocumentEntry getDocumentEntry(String entryName,
+ DirectoryEntry dir)
+ throws IOException {
+ // split entry name into individual components and decode them
+ List<String> entryNames = new ArrayList<String>();
+ for (String str : entryName.split(ENTRY_SEPARATOR)) {
+ if (str.length() == 0) {
+ continue;
+ }
+ entryNames.add(decodeEntryName(str));
+ }
+
+ DocumentEntry entry = null;
+ Iterator<String> iter = entryNames.iterator();
+ while (iter.hasNext()) {
+ org.apache.poi.poifs.filesystem.Entry tmpEntry = dir.getEntry(iter.next());
+ if (tmpEntry instanceof DirectoryEntry) {
+ dir = (DirectoryEntry) tmpEntry;
+ } else if (!iter.hasNext() && (tmpEntry instanceof DocumentEntry)) {
+ entry = (DocumentEntry) tmpEntry;
+ } else {
+ break;
+ }
+ }
+
+ if (entry == null) {
+ throw new FileNotFoundException("Could not find document " + entryName);
+ }
+
+ return entry;
+ }
+
+ private static String encodeEntryName(String name) {
+ try {
+ return URLEncoder.encode(name, ENTRY_NAME_CHARSET);
+ } catch (UnsupportedEncodingException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ private static String decodeEntryName(String name) {
+ try {
+ return URLDecoder.decode(name, ENTRY_NAME_CHARSET);
+ } catch (UnsupportedEncodingException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ private static final class CompoundContentImpl
+ extends JackcessOleUtil.EmbeddedPackageContentImpl
+ implements OleBlob.CompoundContent {
+ private POIFSFileSystem _fs;
+
+ private CompoundContentImpl(
+ JackcessOleUtil.OleBlobImpl blob, String prettyName, String className,
+ String typeName, int position, int length) {
+ super(blob, prettyName, className, typeName, position, length);
+ }
+
+ public OleBlob.ContentType getType() {
+ return OleBlob.ContentType.COMPOUND_STORAGE;
+ }
+
+ private POIFSFileSystem getFileSystem() throws IOException {
+ if (_fs == null) {
+ _fs = new POIFSFileSystem(MemFileChannel.newChannel(getStream(), "r"));
+ }
+ return _fs;
+ }
+
+ public Iterator<Entry> iterator() {
+ try {
+ return getEntries(new ArrayList<Entry>(), getFileSystem().getRoot(),
+ ENTRY_SEPARATOR).iterator();
+ } catch (IOException e) {
+ throw new RuntimeIOException(e);
+ }
+ }
+
+ public EntryImpl getEntry(String entryName) throws IOException {
+ return new EntryImpl(entryName,
+ getDocumentEntry(entryName, getFileSystem().getRoot()));
+ }
+
+ public boolean hasContentsEntry() throws IOException {
+ return getFileSystem().getRoot().hasEntry(CONTENTS_ENTRY);
+ }
+
+ public EntryImpl getContentsEntry() throws IOException {
+ return getEntry(CONTENTS_ENTRY);
+ }
+
+ // Appends every document found under dir (recursively) to entries, named prefix + encoded path.
+ private List<Entry> getEntries(List<Entry> entries, DirectoryEntry dir, String prefix) {
+ for (org.apache.poi.poifs.filesystem.Entry entry : dir) {
+ String encodedName = prefix + encodeEntryName(entry.getName());
+ if (entry instanceof DirectoryEntry) {
+ // recurse with the directory's own name kept in the prefix, so nested names stay
+ // fully qualified and can be resolved again by getDocumentEntry
+ getEntries(entries, (DirectoryEntry) entry, encodedName + ENTRY_SEPARATOR);
+ } else if (entry instanceof DocumentEntry) {
+ // record the document under its fully qualified, encoded name
+ entries.add(new EntryImpl(encodedName, (DocumentEntry) entry));
+ }
+ }
+ return entries;
+ }
+
+ @Override
+ public void close() {
+ ByteUtil.closeQuietly(_fs);
+ _fs = null;
+ super.close();
+ }
+
+ @Override
+ public String toString() {
+ ToStringBuilder sb = toString(CustomToStringStyle.builder(this));
+
+ try {
+ sb.append("hasContentsEntry", hasContentsEntry());
+ sb.append("entries", getEntries(new ArrayList<Entry>(),
+ getFileSystem().getRoot(),
+ ENTRY_SEPARATOR));
+ } catch (IOException e) {
+ sb.append("entries", "<" + e + ">");
+ }
+
+ return sb.toString();
+ }
+
+ private final class EntryImpl implements OleBlob.CompoundContent.Entry {
+ private final String _name;
+ private final DocumentEntry _docEntry;
+
+ private EntryImpl(String name, DocumentEntry docEntry) {
+ _name = name;
+ _docEntry = docEntry;
+ }
+
+ public OleBlob.ContentType getType() {
+ return OleBlob.ContentType.UNKNOWN;
+ }
+
+ public String getName() {
+ return _name;
+ }
+
+ public CompoundContentImpl getParent() {
+ return CompoundContentImpl.this;
+ }
+
+ public JackcessOleUtil.OleBlobImpl getBlob() {
+ return getParent().getBlob();
+ }
+
+ public long length() {
+ return _docEntry.getSize();
+ }
+
+ public InputStream getStream() throws IOException {
+ return new DocumentInputStream(_docEntry);
+ }
+
+ public void writeTo(OutputStream out) throws IOException {
+ InputStream in = null;
+ try {
+ ByteUtil.copy(in = getStream(), out);
+ } finally {
+ ByteUtil.closeQuietly(in);
+ }
+ }
+
+ @Override
+ public String toString() {
+ return CustomToStringStyle.valueBuilder(this)
+ .append("name", _name)
+ .append("length", length())
+ .toString();
+ }
+ }
+ }
+}
+
+
+
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
index bf5c5d0..3a10346 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
@@ -41,7 +41,7 @@
import com.healthmarketscience.jackcess.Table;
import com.healthmarketscience.jackcess.query.Query;
import com.healthmarketscience.jackcess.util.OleBlob;
-import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.IOUtils;
@@ -302,8 +302,9 @@
}
}
+
private void handleOLE(Row row, String cName, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
- OleBlob blob = row.getBlob(cName);
+ OleBlob blob = getBlob(row, cName);
//lifted shamelessly from Jackcess's OleBlobTest
if (blob == null)
return;
@@ -367,9 +368,21 @@
}
}
+ /*
+ Temporary work around until POI 4.0.0 is released and jackcess upgrades
+ This is copy/pasted from jackcess
+ */
+ private OleBlob getBlob(Row row, String cName) {
+ byte[] bytes = row.getBytes(cName);
+ if (bytes == null) {
+ return null;
+ }
+ return JackcessOleUtil.parseBlob(bytes);
+ }
+
private void handleCompoundContent(OleBlob.CompoundContent cc, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
InputStream is = null;
- NPOIFSFileSystem nfs = null;
+ POIFSFileSystem fileSystem = null;
try {
try {
is = cc.getStream();
@@ -379,18 +392,18 @@
}
try {
- nfs = new NPOIFSFileSystem(is);
+ fileSystem = new POIFSFileSystem(is);
} catch (Exception e) {
EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
return;
}
- handleEmbeddedOfficeDoc(nfs.getRoot(), xhtml);
+ handleEmbeddedOfficeDoc(fileSystem.getRoot(), xhtml);
} finally {
- if (nfs != null) {
+ if (fileSystem != null) {
try {
- nfs.close();
+ fileSystem.close();
} catch (IOException e) {
//swallow
}
@@ -414,5 +427,6 @@
}
return shortDateTimeFormatter.format(d);
}
+
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessOleUtil.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessOleUtil.java
new file mode 100644
index 0000000..a1432d6
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessOleUtil.java
@@ -0,0 +1,813 @@
+/*
+Copyright (c) 2013 James Ahlborn
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package org.apache.tika.parser.microsoft;
+
+import java.io.ByteArrayInputStream;
+import java.io.Closeable;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
+import java.sql.Blob;
+import java.sql.SQLException;
+import java.sql.SQLFeatureNotSupportedException;
+import java.text.Normalizer;
+import java.util.EnumSet;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import com.healthmarketscience.jackcess.DataType;
+import com.healthmarketscience.jackcess.util.OleBlob;
+import static com.healthmarketscience.jackcess.util.OleBlob.*;
+import org.apache.commons.lang.builder.ToStringBuilder;
+
+import com.healthmarketscience.jackcess.impl.ByteUtil;
+import com.healthmarketscience.jackcess.impl.CustomToStringStyle;
+import com.healthmarketscience.jackcess.impl.PageChannel;
+
+/**
+ * Utility code for working with OLE data.
+ * Temporary workaround until POI 4.0.0 is released and Jackcess is updated
+ *
+ *
+ * @author James Ahlborn
+ * @usage _advanced_class_
+ * @deprecated this class will be removed in Tika >= 1.20
+ */
+@Deprecated
+class JackcessOleUtil {
+
+
+ /**
+ * Interface used to allow optional inclusion of the poi library for working
+ * with compound ole data.
+ */
+ interface CompoundPackageFactory
+ {
+ public ContentImpl createCompoundPackageContent(
+ OleBlobImpl blob, String prettyName, String className, String typeName,
+ ByteBuffer blobBb, int dataBlockLen);
+ }
+
+ private static final int PACKAGE_SIGNATURE = 0x1C15;
+ private static final Charset OLE_CHARSET = Charset.forName("US-ASCII");
+ private static final Charset OLE_UTF_CHARSET = Charset.forName("UTF-16LE");
+ private static final byte[] COMPOUND_STORAGE_SIGNATURE =
+ {(byte)0xd0,(byte)0xcf,(byte)0x11,(byte)0xe0,
+ (byte)0xa1,(byte)0xb1,(byte)0x1a,(byte)0xe1};
+ private static final String SIMPLE_PACKAGE_TYPE = "Package";
+ private static final int PACKAGE_OBJECT_TYPE = 0x02;
+ private static final int OLE_VERSION = 0x0501;
+ private static final int OLE_FORMAT = 0x02;
+ private static final int PACKAGE_STREAM_SIGNATURE = 0x02;
+ private static final int PS_EMBEDDED_FILE = 0x030000;
+ private static final int PS_LINKED_FILE = 0x010000;
+ private static final Set<ContentType> WRITEABLE_TYPES = EnumSet.of(
+ ContentType.LINK, ContentType.SIMPLE_PACKAGE, ContentType.OTHER);
+ private static final byte[] NO_DATA = new byte[0];
+ private static final int LINK_HEADER = 0x01;
+ private static final byte[] PACKAGE_FOOTER = {
+ 0x01, 0x05, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x01, (byte)0xAD, 0x05, (byte)0xFE
+ };
+
+ // regex pattern which matches all the crazy extra stuff in unicode
+ private static final Pattern UNICODE_ACCENT_PATTERN =
+ Pattern.compile("[\\p{InCombiningDiacriticalMarks}\\p{IsLm}\\p{IsSk}]+");
+
+ private static final CompoundPackageFactory COMPOUND_FACTORY;
+
+ static {
+ CompoundPackageFactory compoundFactory = null;
+ try {
+ compoundFactory = (CompoundPackageFactory)
+ Class.forName("org.apache.tika.parser.microsoft.JackcessCompoundOleUtil")
+ .newInstance();
+ } catch(Throwable t) {
+ // must not have poi, will load compound ole data as "other"
+ }
+ COMPOUND_FACTORY = compoundFactory;
+ }
+
+ /**
+ * Parses an access database blob structure and returns an appropriate
+ * OleBlob instance.
+ */
+ public static OleBlob parseBlob(byte[] bytes) {
+ return new OleBlobImpl(bytes);
+ }
+
+ /**
+ * Creates a new OleBlob instance using the given information.
+ */
+ public static OleBlob createBlob(Builder oleBuilder)
+ throws IOException
+ {
+ try {
+
+ if(!WRITEABLE_TYPES.contains(oleBuilder.getType())) {
+ throw new IllegalArgumentException(
+ "Cannot currently create ole values of type " +
+ oleBuilder.getType());
+ }
+
+ long contentLen = oleBuilder.getContentLength();
+ byte[] contentBytes = oleBuilder.getBytes();
+ InputStream contentStream = oleBuilder.getStream();
+ byte[] packageStreamHeader = NO_DATA;
+ byte[] packageStreamFooter = NO_DATA;
+
+ switch(oleBuilder.getType()) {
+ case LINK:
+ packageStreamHeader = writePackageStreamHeader(oleBuilder);
+
+ // link "content" is file path
+ contentBytes = getZeroTermStrBytes(oleBuilder.getFilePath());
+ contentLen = contentBytes.length;
+ break;
+
+ case SIMPLE_PACKAGE:
+ packageStreamHeader = writePackageStreamHeader(oleBuilder);
+ packageStreamFooter = writePackageStreamFooter(oleBuilder);
+ break;
+
+ case OTHER:
+ // nothing more to do
+ break;
+ default:
+ throw new RuntimeException("unexpected type " + oleBuilder.getType());
+ }
+
+ long payloadLen = packageStreamHeader.length + packageStreamFooter.length +
+ contentLen;
+ byte[] packageHeader = writePackageHeader(oleBuilder, payloadLen);
+
+ long totalOleLen = packageHeader.length + PACKAGE_FOOTER.length +
+ payloadLen;
+ if(totalOleLen > DataType.OLE.getMaxSize()) {
+ throw new IllegalArgumentException("Content size of " + totalOleLen +
+ " is too large for ole column");
+ }
+
+ byte[] oleBytes = new byte[(int)totalOleLen];
+ ByteBuffer bb = PageChannel.wrap(oleBytes);
+ bb.put(packageHeader);
+ bb.put(packageStreamHeader);
+
+ if(contentLen > 0L) {
+ if(contentBytes != null) {
+ bb.put(contentBytes);
+ } else {
+ byte[] buf = new byte[8192];
+ int numBytes = 0;
+ while((numBytes = contentStream.read(buf)) >= 0) {
+ bb.put(buf, 0, numBytes);
+ }
+ }
+ }
+
+ bb.put(packageStreamFooter);
+ bb.put(PACKAGE_FOOTER);
+
+ return parseBlob(oleBytes);
+
+ } finally {
+ ByteUtil.closeQuietly(oleBuilder.getStream());
+ }
+ }
+
+ private static byte[] writePackageHeader(Builder oleBuilder,
+ long contentLen) {
+
+ byte[] prettyNameBytes = getZeroTermStrBytes(oleBuilder.getPrettyName());
+ String className = oleBuilder.getClassName();
+ String typeName = oleBuilder.getTypeName();
+ if(className == null) {
+ className = typeName;
+ } else if(typeName == null) {
+ typeName = className;
+ }
+ byte[] classNameBytes = getZeroTermStrBytes(className);
+ byte[] typeNameBytes = getZeroTermStrBytes(typeName);
+
+ int packageHeaderLen = 20 + prettyNameBytes.length + classNameBytes.length;
+
+ int oleHeaderLen = 24 + typeNameBytes.length;
+
+ byte[] headerBytes = new byte[packageHeaderLen + oleHeaderLen];
+
+ ByteBuffer bb = PageChannel.wrap(headerBytes);
+
+ // write outer package header
+ bb.putShort((short)PACKAGE_SIGNATURE);
+ bb.putShort((short)packageHeaderLen);
+ bb.putInt(PACKAGE_OBJECT_TYPE);
+ bb.putShort((short)prettyNameBytes.length);
+ bb.putShort((short)classNameBytes.length);
+ int prettyNameOff = bb.position() + 8;
+ bb.putShort((short)prettyNameOff);
+ bb.putShort((short)(prettyNameOff + prettyNameBytes.length));
+ bb.putInt(-1);
+ bb.put(prettyNameBytes);
+ bb.put(classNameBytes);
+
+ // put ole header
+ bb.putInt(OLE_VERSION);
+ bb.putInt(OLE_FORMAT);
+ bb.putInt(typeNameBytes.length);
+ bb.put(typeNameBytes);
+ bb.putLong(0L);
+ bb.putInt((int)contentLen);
+
+ return headerBytes;
+ }
+
+ private static byte[] writePackageStreamHeader(Builder oleBuilder) {
+
+ byte[] fileNameBytes = getZeroTermStrBytes(oleBuilder.getFileName());
+ byte[] filePathBytes = getZeroTermStrBytes(oleBuilder.getFilePath());
+
+ int headerLen = 6 + fileNameBytes.length + filePathBytes.length;
+
+ if(oleBuilder.getType() == ContentType.SIMPLE_PACKAGE) {
+
+ headerLen += 8 + filePathBytes.length;
+
+ } else {
+
+ headerLen += 2;
+ }
+
+ byte[] headerBytes = new byte[headerLen];
+ ByteBuffer bb = PageChannel.wrap(headerBytes);
+ bb.putShort((short)PACKAGE_STREAM_SIGNATURE);
+ bb.put(fileNameBytes);
+ bb.put(filePathBytes);
+
+ if(oleBuilder.getType() == ContentType.SIMPLE_PACKAGE) {
+ bb.putInt(PS_EMBEDDED_FILE);
+ bb.putInt(filePathBytes.length);
+ bb.put(filePathBytes, 0, filePathBytes.length);
+ bb.putInt((int) oleBuilder.getContentLength());
+ } else {
+ bb.putInt(PS_LINKED_FILE);
+ bb.putShort((short)LINK_HEADER);
+ }
+
+ return headerBytes;
+ }
+
+ private static byte[] writePackageStreamFooter(Builder oleBuilder) {
+
+ // note, these are _not_ zero terminated
+ byte[] fileNameBytes = oleBuilder.getFileName().getBytes(OLE_UTF_CHARSET);
+ byte[] filePathBytes = oleBuilder.getFilePath().getBytes(OLE_UTF_CHARSET);
+
+ int footerLen = 12 + (filePathBytes.length * 2) + fileNameBytes.length;
+
+ byte[] footerBytes = new byte[footerLen];
+ ByteBuffer bb = PageChannel.wrap(footerBytes);
+
+ bb.putInt(filePathBytes.length/2);
+ bb.put(filePathBytes);
+ bb.putInt(fileNameBytes.length/2);
+ bb.put(fileNameBytes);
+ bb.putInt(filePathBytes.length/2);
+ bb.put(filePathBytes);
+
+ return footerBytes;
+ }
+
+ /**
+ * creates the appropriate ContentImpl for the given blob.
+ */
+ private static ContentImpl parseContent(OleBlobImpl blob)
+ throws IOException
+ {
+ ByteBuffer bb = PageChannel.wrap(blob.getBytes());
+
+ if((bb.remaining() < 2) || (bb.getShort() != PACKAGE_SIGNATURE)) {
+ return new UnknownContentImpl(blob);
+ }
+
+ // read outer package header
+ int headerSize = bb.getShort();
+ /* int objType = */ bb.getInt();
+ int prettyNameLen = bb.getShort();
+ int classNameLen = bb.getShort();
+ int prettyNameOff = bb.getShort();
+ int classNameOff = bb.getShort();
+ /* int objSize = */ bb.getInt();
+ String prettyName = readStr(bb, prettyNameOff, prettyNameLen);
+ String className = readStr(bb, classNameOff, classNameLen);
+ bb.position(headerSize);
+
+ // read ole header
+ int oleVer = bb.getInt();
+ /* int format = */ bb.getInt();
+
+ if(oleVer != OLE_VERSION) {
+ return new UnknownContentImpl(blob);
+ }
+
+ int typeNameLen = bb.getInt();
+ String typeName = readStr(bb, bb.position(), typeNameLen);
+ bb.getLong(); // unused
+ int dataBlockLen = bb.getInt();
+ int dataBlockPos = bb.position();
+
+
+ if(SIMPLE_PACKAGE_TYPE.equalsIgnoreCase(typeName)) {
+ return createSimplePackageContent(
+ blob, prettyName, className, typeName, bb, dataBlockLen);
+ }
+
+ // if COMPOUND_FACTORY is null, the poi library isn't available, so just
+ // load compound data as "other"
+ if((COMPOUND_FACTORY != null) &&
+ (bb.remaining() >= COMPOUND_STORAGE_SIGNATURE.length) &&
+ ByteUtil.matchesRange(bb, bb.position(), COMPOUND_STORAGE_SIGNATURE)) {
+ return COMPOUND_FACTORY.createCompoundPackageContent(
+ blob, prettyName, className, typeName, bb, dataBlockLen);
+ }
+
+ // this is either some other "special" (as yet unhandled) format, or it is
+ // simply an embedded file (or it is compound data and poi isn't available)
+ return new OtherContentImpl(blob, prettyName, className,
+ typeName, dataBlockPos, dataBlockLen);
+ }
+
+ private static ContentImpl createSimplePackageContent(
+ OleBlobImpl blob, String prettyName, String className, String typeName,
+ ByteBuffer blobBb, int dataBlockLen) {
+
+ int dataBlockPos = blobBb.position();
+ ByteBuffer bb = PageChannel.narrowBuffer(blobBb, dataBlockPos,
+ dataBlockPos + dataBlockLen);
+
+ int packageSig = bb.getShort();
+ if(packageSig != PACKAGE_STREAM_SIGNATURE) {
+ return new OtherContentImpl(blob, prettyName, className,
+ typeName, dataBlockPos, dataBlockLen);
+ }
+
+ String fileName = readZeroTermStr(bb);
+ String filePath = readZeroTermStr(bb);
+ int packageType = bb.getInt();
+
+ if(packageType == PS_EMBEDDED_FILE) {
+
+ int localFilePathLen = bb.getInt();
+ String localFilePath = readStr(bb, bb.position(), localFilePathLen);
+ int dataLen = bb.getInt();
+ int dataPos = bb.position();
+ bb.position(dataLen + dataPos);
+
+ // remaining strings are in "reverse" order (local file path, file name,
+ // file path). these strings use a real utf charset, and therefore can
+ // "fix" problems with ascii based names (so we prefer these strings to
+ // the original strings we found)
+ int strNum = 0;
+ while(true) {
+
+ int rem = bb.remaining();
+ if(rem < 4) {
+ break;
+ }
+
+ int strLen = bb.getInt();
+ String remStr = readStr(bb, bb.position(), strLen * 2, OLE_UTF_CHARSET);
+
+ switch(strNum) {
+ case 0:
+ localFilePath = remStr;
+ break;
+ case 1:
+ fileName = remStr;
+ break;
+ case 2:
+ filePath = remStr;
+ break;
+ default:
+ // ignore
+ }
+
+ ++strNum;
+ }
+
+ return new SimplePackageContentImpl(
+ blob, prettyName, className, typeName, dataPos, dataLen,
+ fileName, filePath, localFilePath);
+ }
+
+ if(packageType == PS_LINKED_FILE) {
+
+ bb.getShort(); //unknown
+ String linkStr = readZeroTermStr(bb);
+
+ return new LinkContentImpl(blob, prettyName, className, typeName,
+ fileName, linkStr, filePath);
+ }
+
+ return new OtherContentImpl(blob, prettyName, className,
+ typeName, dataBlockPos, dataBlockLen);
+ }
+
+ private static String readStr(ByteBuffer bb, int off, int len) {
+ return readStr(bb, off, len, OLE_CHARSET);
+ }
+
+ private static String readZeroTermStr(ByteBuffer bb) {
+ int off = bb.position();
+ while(bb.hasRemaining()) {
+ byte b = bb.get();
+ if(b == 0) {
+ break;
+ }
+ }
+ int len = bb.position() - off;
+ return readStr(bb, off, len);
+ }
+
+ // Decodes len bytes at off in bb's backing array, moves bb past them, and strips one trailing NUL.
+ private static String readStr(ByteBuffer bb, int off, int len, Charset charset) {
+ String str = new String(bb.array(), off, len, charset);
+ bb.position(off + len);
+ if(str.endsWith("\0")) { // unlike charAt(length() - 1), safe when the decoded string is empty
+ str = str.substring(0, str.length() - 1);
+ }
+ return str;
+ }
+
+ private static byte[] getZeroTermStrBytes(String str) {
+ // since we are converting to ascii, try to make "nicer" versions of crazy
+ // chars (e.g. convert "u with an umlaut" to just "u"). this may not
+ // ultimately help anything but it is what ms access does.
+
+ // decompose complex chars into combos of char and accent
+ str = Normalizer.normalize(str, Normalizer.Form.NFD);
+ // strip the accents
+ str = UNICODE_ACCENT_PATTERN.matcher(str).replaceAll("");
+ // (re)normalize what is left
+ str = Normalizer.normalize(str, Normalizer.Form.NFC);
+
+ return (str + '\0').getBytes(OLE_CHARSET);
+ }
+
+
+ static final class OleBlobImpl implements OleBlob
+ {
+ private byte[] _bytes;
+ private ContentImpl _content;
+
+ private OleBlobImpl(byte[] bytes) {
+ _bytes = bytes;
+ }
+
+ public void writeTo(OutputStream out) throws IOException {
+ out.write(_bytes);
+ }
+
+ public Content getContent() throws IOException {
+ if(_content == null) {
+ _content = parseContent(this);
+ }
+ return _content;
+ }
+
+ public InputStream getBinaryStream() throws SQLException {
+ return new ByteArrayInputStream(_bytes);
+ }
+
+ public InputStream getBinaryStream(long pos, long len)
+ throws SQLException
+ {
+ return new ByteArrayInputStream(_bytes, fromJdbcOffset(pos), (int)len);
+ }
+
+ public long length() throws SQLException {
+ return _bytes.length;
+ }
+
+ public byte[] getBytes() throws IOException {
+ if(_bytes == null) {
+ throw new IOException("blob is closed");
+ }
+ return _bytes;
+ }
+
+ public byte[] getBytes(long pos, int len) throws SQLException {
+ return ByteUtil.copyOf(_bytes, fromJdbcOffset(pos), len);
+ }
+
+ public long position(byte[] pattern, long start) throws SQLException {
+ int pos = ByteUtil.findRange(PageChannel.wrap(_bytes),
+ fromJdbcOffset(start), pattern);
+ return((pos >= 0) ? toJdbcOffset(pos) : pos);
+ }
+
+ public long position(Blob pattern, long start) throws SQLException {
+ return position(pattern.getBytes(1L, (int)pattern.length()), start);
+ }
+
+ public OutputStream setBinaryStream(long position) throws SQLException {
+ throw new SQLFeatureNotSupportedException();
+ }
+
+ public void truncate(long len) throws SQLException {
+ throw new SQLFeatureNotSupportedException();
+ }
+
+ public int setBytes(long pos, byte[] bytes) throws SQLException {
+ throw new SQLFeatureNotSupportedException();
+ }
+
+ public int setBytes(long pos, byte[] bytes, int offset, int lesn)
+ throws SQLException {
+ throw new SQLFeatureNotSupportedException();
+ }
+
+ public void free() {
+ close();
+ }
+
+ public void close() {
+ _bytes = null;
+ ByteUtil.closeQuietly(_content);
+ _content = null;
+ }
+
+ private static int toJdbcOffset(int off) {
+ return off + 1;
+ }
+
+ private static int fromJdbcOffset(long off) {
+ return (int)off - 1;
+ }
+
+ /**
+  * Describes the blob: the parsed content once it exists, otherwise the
+  * raw bytes together with a marker that nothing has been parsed yet.
+  */
+ @Override
+ public String toString() {
+   final ToStringBuilder out = CustomToStringStyle.builder(this);
+   if(_content == null) {
+     // nothing parsed so far, show the raw bytes instead
+     out.append("bytes", _bytes);
+     out.append("content", "(uninitialized)");
+   } else {
+     out.append("content", _content);
+   }
+   return out.toString();
+ }
+ }
+
+ /**
+  * Common base of the content views: remembers the blob the content was
+  * created for and reads its bytes through that blob.
+  */
+ static abstract class ContentImpl implements Content, Closeable
+ {
+   /** the blob this content belongs to */
+   protected final OleBlobImpl _blob;
+
+   protected ContentImpl(OleBlobImpl blob) {
+     _blob = blob;
+   }
+
+   /**
+    * @return the blob this content belongs to
+    */
+   public OleBlobImpl getBlob() {
+     return _blob;
+   }
+
+   /**
+    * @return the bytes of the owning blob
+    * @throws IOException if the owning blob refuses to hand out its bytes
+    */
+   protected byte[] getBytes() throws IOException {
+     final OleBlobImpl owner = getBlob();
+     return owner.getBytes();
+   }
+
+   /**
+    * No-op here; subclasses holding resources are expected to override.
+    */
+   public void close() {
+     // base does nothing
+   }
+
+   /**
+    * Adds the fields of this level to the given builder.
+    *
+    * @param sb builder to append to
+    * @return the value returned by the builder's append call
+    */
+   protected ToStringBuilder toString(ToStringBuilder sb) {
+     return sb.append("type", getType());
+   }
+ }
+
+ /**
+  * Content stored inside the blob itself, described by an offset and a
+  * length into the blob's byte array.
+  */
+ static abstract class EmbeddedContentImpl extends ContentImpl
+   implements EmbeddedContent
+ {
+   /** offset of the embedded data within the blob bytes */
+   private final int _position;
+   /** number of bytes of embedded data */
+   private final int _length;
+
+   protected EmbeddedContentImpl(OleBlobImpl blob, int position, int length)
+   {
+     super(blob);
+     _position = position;
+     _length = length;
+   }
+
+   /**
+    * @return the length given at construction time
+    */
+   public long length() {
+     return _length;
+   }
+
+   /**
+    * @return a new stream over the embedded range of the blob bytes
+    * @throws IOException if the blob bytes are no longer available
+    */
+   public InputStream getStream() throws IOException {
+     return new ByteArrayInputStream(getBytes(), _position, _length);
+   }
+
+   /**
+    * Writes the embedded range of the blob bytes to the given stream.
+    *
+    * @param out destination of the bytes
+    * @throws IOException if the blob bytes are no longer available or the
+    *         write fails
+    */
+   public void writeTo(OutputStream out) throws IOException {
+     out.write(getBytes(), _position, _length);
+   }
+
+   /**
+    * Adds the embedded bytes to the description when a position is set
+    * and the blob still holds its bytes.
+    */
+   @Override
+   protected ToStringBuilder toString(ToStringBuilder sb) {
+     super.toString(sb);
+     // the blob's close() clears its byte array; describing the content
+     // afterwards must not fail inside ByteBuffer.wrap
+     final byte[] bytes = _blob._bytes;
+     if((_position >= 0) && (bytes != null)) {
+       sb.append("content", ByteBuffer.wrap(bytes, _position, _length));
+     }
+     return sb;
+   }
+ }
+
+ /**
+  * Embedded content that additionally carries the three package header
+  * strings: pretty name, class name and type name.
+  */
+ static abstract class EmbeddedPackageContentImpl
+   extends EmbeddedContentImpl
+   implements PackageContent
+ {
+   private final String _prettyName;
+   private final String _className;
+   private final String _typeName;
+
+   protected EmbeddedPackageContentImpl(
+       OleBlobImpl blob, String prettyName, String className,
+       String typeName, int position, int length)
+   {
+     super(blob, position, length);
+     _prettyName = prettyName;
+     _className = className;
+     _typeName = typeName;
+   }
+
+   /** @return the pretty name given at construction time */
+   public String getPrettyName() {
+     return _prettyName;
+   }
+
+   /** @return the class name given at construction time */
+   public String getClassName() {
+     return _className;
+   }
+
+   /** @return the type name given at construction time */
+   public String getTypeName() {
+     return _typeName;
+   }
+
+   /**
+    * Appends the package header strings first, then the fields of the
+    * superclass.
+    */
+   @Override
+   protected ToStringBuilder toString(ToStringBuilder sb) {
+     sb.append("prettyName", _prettyName);
+     sb.append("className", _className);
+     sb.append("typeName", _typeName);
+     super.toString(sb);
+     return sb;
+   }
+ }
+
+ /**
+  * Package content that only links to a file. It is created with a
+  * position and length of -1, i.e. without an embedded byte range.
+  */
+ private static final class LinkContentImpl
+   extends EmbeddedPackageContentImpl
+   implements LinkContent
+ {
+   private final String _fileName;
+   private final String _linkPath;
+   private final String _filePath;
+
+   private LinkContentImpl(OleBlobImpl blob, String prettyName,
+                           String className, String typeName,
+                           String fileName, String linkPath,
+                           String filePath)
+   {
+     // -1/-1: there is no embedded byte range for a link
+     super(blob, prettyName, className, typeName, -1, -1);
+     _fileName = fileName;
+     _linkPath = linkPath;
+     _filePath = filePath;
+   }
+
+   public ContentType getType() {
+     return ContentType.LINK;
+   }
+
+   /** @return the file name given at construction time */
+   public String getFileName() {
+     return _fileName;
+   }
+
+   /** @return the link path given at construction time */
+   public String getLinkPath() {
+     return _linkPath;
+   }
+
+   /** @return the file path given at construction time */
+   public String getFilePath() {
+     return _filePath;
+   }
+
+   /**
+    * Opens the linked file from the local file system.
+    *
+    * @return a new stream reading the file at the link path
+    * @throws IOException if the file cannot be opened
+    */
+   public InputStream getLinkStream() throws IOException {
+     // NOTE(review): the link path presumably comes from the parsed blob
+     // data; callers handling untrusted input should verify it before
+     // opening local files through this method
+     return new FileInputStream(_linkPath);
+   }
+
+   @Override
+   public String toString() {
+     final ToStringBuilder out = toString(CustomToStringStyle.builder(this));
+     out.append("fileName", _fileName);
+     out.append("linkPath", _linkPath);
+     out.append("filePath", _filePath);
+     return out.toString();
+   }
+ }
+
+ /**
+  * Package content whose data is embedded in the blob and which carries
+  * a file name, a file path and a local file path.
+  */
+ private static final class SimplePackageContentImpl
+   extends EmbeddedPackageContentImpl
+   implements SimplePackageContent
+ {
+   private final String _fileName;
+   private final String _filePath;
+   private final String _localFilePath;
+
+   private SimplePackageContentImpl(OleBlobImpl blob, String prettyName,
+                                    String className, String typeName,
+                                    int position, int length,
+                                    String fileName, String filePath,
+                                    String localFilePath)
+   {
+     super(blob, prettyName, className, typeName, position, length);
+     _fileName = fileName;
+     _filePath = filePath;
+     _localFilePath = localFilePath;
+   }
+
+   public ContentType getType() {
+     return ContentType.SIMPLE_PACKAGE;
+   }
+
+   /** @return the file name given at construction time */
+   public String getFileName() {
+     return _fileName;
+   }
+
+   /** @return the file path given at construction time */
+   public String getFilePath() {
+     return _filePath;
+   }
+
+   /** @return the local file path given at construction time */
+   public String getLocalFilePath() {
+     return _localFilePath;
+   }
+
+   @Override
+   public String toString() {
+     final ToStringBuilder out = toString(CustomToStringStyle.builder(this));
+     out.append("fileName", _fileName);
+     out.append("filePath", _filePath);
+     out.append("localFilePath", _localFilePath);
+     return out.toString();
+   }
+ }
+
+ /**
+  * Package content reported with type OTHER; adds nothing beyond the
+  * package header strings and the embedded byte range.
+  */
+ private static final class OtherContentImpl
+   extends EmbeddedPackageContentImpl
+   implements OtherContent
+ {
+   private OtherContentImpl(
+       OleBlobImpl blob, String prettyName, String className,
+       String typeName, int position, int length)
+   {
+     super(blob, prettyName, className, typeName, position, length);
+   }
+
+   public ContentType getType() {
+     return ContentType.OTHER;
+   }
+
+   @Override
+   public String toString() {
+     final ToStringBuilder out = toString(CustomToStringStyle.builder(this));
+     return out.toString();
+   }
+ }
+
+ /**
+  * Content reported with type UNKNOWN; its description shows the raw
+  * bytes of the owning blob.
+  */
+ private static final class UnknownContentImpl extends ContentImpl
+ {
+   private UnknownContentImpl(OleBlobImpl blob) {
+     super(blob);
+   }
+
+   public ContentType getType() {
+     return ContentType.UNKNOWN;
+   }
+
+   @Override
+   public String toString() {
+     final ToStringBuilder out = toString(CustomToStringStyle.builder(this));
+     out.append("content", _blob._bytes);
+     return out.toString();
+   }
+ }
+
+ }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
index 133d5e4..779d5ee 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
@@ -36,7 +36,6 @@
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
-import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.poifs.macros.VBAMacroReader;
import org.apache.poi.util.IOUtils;
@@ -105,23 +104,23 @@
final DirectoryNode root;
TikaInputStream tstream = TikaInputStream.cast(stream);
- NPOIFSFileSystem mustCloseFs = null;
+ POIFSFileSystem mustCloseFs = null;
try {
if (tstream == null) {
- mustCloseFs = new NPOIFSFileSystem(new CloseShieldInputStream(stream));
+ mustCloseFs = new POIFSFileSystem(new CloseShieldInputStream(stream));
root = mustCloseFs.getRoot();
} else {
final Object container = tstream.getOpenContainer();
- if (container instanceof NPOIFSFileSystem) {
- root = ((NPOIFSFileSystem) container).getRoot();
+ if (container instanceof POIFSFileSystem) {
+ root = ((POIFSFileSystem) container).getRoot();
} else if (container instanceof DirectoryNode) {
root = (DirectoryNode) container;
} else {
- NPOIFSFileSystem fs = null;
+ POIFSFileSystem fs = null;
if (tstream.hasFile()) {
- fs = new NPOIFSFileSystem(tstream.getFile(), true);
+ fs = new POIFSFileSystem(tstream.getFile(), true);
} else {
- fs = new NPOIFSFileSystem(new CloseShieldInputStream(tstream));
+ fs = new POIFSFileSystem(new CloseShieldInputStream(tstream));
}
//tstream will close the fs, no need to close this below
tstream.setOpenContainer(fs);
@@ -274,10 +273,6 @@
return detectType(fs.getRoot());
}
- public static POIFSDocumentType detectType(NPOIFSFileSystem fs) {
- return detectType(fs.getRoot());
- }
-
public static POIFSDocumentType detectType(DirectoryEntry node) {
Set<String> names = new HashSet<String>();
for (Entry entry : node) {
@@ -313,7 +308,7 @@
* @throws IOException on IOException if it occurs during the extraction of the embedded doc
* @throws SAXException on SAXException for writing to xhtml
*/
- public static void extractMacros(NPOIFSFileSystem fs, ContentHandler xhtml,
+ public static void extractMacros(POIFSFileSystem fs, ContentHandler xhtml,
EmbeddedDocumentExtractor embeddedDocumentExtractor) throws IOException, SAXException {
VBAMacroReader reader = null;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index 0aed803..5d13351 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -51,7 +51,7 @@
import org.apache.poi.hsmf.datatypes.Types;
import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
import org.apache.poi.poifs.filesystem.DirectoryNode;
-import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.CodePageUtil;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
@@ -126,7 +126,7 @@
private final boolean extractAllAlternatives;
- public OutlookExtractor(NPOIFSFileSystem filesystem, ParseContext context) throws TikaException {
+ public OutlookExtractor(POIFSFileSystem filesystem, ParseContext context) throws TikaException {
this(filesystem.getRoot(), context);
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
index 1c98690..1b5a0a9 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
@@ -33,7 +33,7 @@
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.DocumentNode;
import org.apache.poi.poifs.filesystem.Entry;
-import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.tika.detect.Detector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -385,7 +385,7 @@
File file = stream.getFile();
try {
- NPOIFSFileSystem fs = new NPOIFSFileSystem(file, true);
+ POIFSFileSystem fs = new POIFSFileSystem(file, true);
// Optimize a possible later parsing process by keeping
// a reference to the already opened POI file system
@@ -423,8 +423,8 @@
Set<String> names = null;
if (tis != null) {
Object container = tis.getOpenContainer();
- if (container instanceof NPOIFSFileSystem) {
- names = getTopLevelNames(((NPOIFSFileSystem) container).getRoot());
+ if (container instanceof POIFSFileSystem) {
+ names = getTopLevelNames(((POIFSFileSystem) container).getRoot());
} else if (container instanceof DirectoryNode) {
names = getTopLevelNames((DirectoryNode) container);
}
@@ -454,8 +454,8 @@
// Detect based on the names (as available)
if (tis != null &&
tis.getOpenContainer() != null &&
- tis.getOpenContainer() instanceof NPOIFSFileSystem) {
- return detect(names, ((NPOIFSFileSystem) tis.getOpenContainer()).getRoot());
+ tis.getOpenContainer() instanceof POIFSFileSystem) {
+ return detect(names, ((POIFSFileSystem) tis.getOpenContainer()).getRoot());
} else {
return detect(names, null);
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
index 3e2ea26..8017184 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
@@ -32,7 +32,7 @@
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
-import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.MSOffice;
import org.apache.tika.metadata.Metadata;
@@ -63,7 +63,7 @@
this.metadata = metadata;
}
- public void parseSummaries(NPOIFSFileSystem filesystem)
+ public void parseSummaries(POIFSFileSystem filesystem)
throws IOException, TikaException {
parseSummaries(filesystem.getRoot());
}
@@ -94,8 +94,6 @@
// no property stream, just skip it
} catch (UnexpectedPropertySetTypeException e) {
throw new TikaException("Unexpected HPSF document", e);
- } catch (MarkUnsupportedException e) {
- throw new TikaException("Invalid DocumentInputStream", e);
} catch (Exception e) {
LOG.warn("Ignoring unexpected exception while parsing summary entry {}", entryName, e);
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
index 4a80420..30bd4bb 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
@@ -49,7 +49,7 @@
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
-import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
@@ -145,7 +145,7 @@
}
protected void parse(
- NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
+ POIFSFileSystem filesystem, XHTMLContentHandler xhtml)
throws IOException, SAXException, TikaException {
parse(filesystem.getRoot(), xhtml);
}
@@ -661,7 +661,7 @@
}
protected void parseWord6(
- NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
+ POIFSFileSystem filesystem, XHTMLContentHandler xhtml)
throws IOException, SAXException, TikaException {
parseWord6(filesystem.getRoot(), xhtml);
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index ac5abc9..57c38a6 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -41,7 +41,6 @@
import org.apache.poi.openxml4j.opc.TargetMode;
import org.apache.poi.openxml4j.opc.internal.FileHelper;
import org.apache.poi.poifs.filesystem.DirectoryNode;
-import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.Ole10Native;
import org.apache.poi.poifs.filesystem.Ole10NativeException;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
@@ -418,9 +417,9 @@
if (officeParserConfig.getExtractMacros()) {
try (InputStream is = macroPart.getInputStream()) {
- try (NPOIFSFileSystem npoifs = new NPOIFSFileSystem(is)) {
+ try (POIFSFileSystem poifs = new POIFSFileSystem(is)) {
//Macro reading exceptions are already swallowed here
- OfficeParser.extractMacros(npoifs, handler, embeddedExtractor);
+ OfficeParser.extractMacros(poifs, handler, embeddedExtractor);
}
} catch (IOException e) {
throw new TikaException("Broken OOXML file", e);
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
index 4387ca4..90ea58b 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
@@ -33,7 +33,7 @@
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.FileMagic;
-import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.poifs.filesystem.Ole10Native;
import org.apache.poi.poifs.filesystem.Ole10NativeException;
import org.apache.poi.poifs.storage.HeaderBlock;
@@ -141,7 +141,7 @@
throws IOException {
byte[] ret = null;
- try (NPOIFSFileSystem fs = new NPOIFSFileSystem(is)) {
+ try (POIFSFileSystem fs = new POIFSFileSystem(is)) {
DirectoryNode root = fs.getRoot();
diff --git a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
index 57b91ca..ad12517 100644
--- a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
+++ b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
@@ -28,7 +28,7 @@
import java.nio.file.Path;
import java.util.Random;
-import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.tika.MultiThreadedTikaTest;
import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
@@ -43,7 +43,7 @@
import org.junit.Test;
/**
- * Junit test class for {@link ContainerAwareDetector}
+ * Junit test class for {@link org.apache.tika.parser.microsoft.POIFSContainerDetector}
*/
public class TestContainerAwareDetector extends MultiThreadedTikaTest {
private final TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
@@ -184,7 +184,7 @@
assertEquals(
MediaType.parse("application/vnd.ms-powerpoint"),
detector.detect(stream, new Metadata()));
- assertTrue(stream.getOpenContainer() instanceof NPOIFSFileSystem);
+ assertTrue(stream.getOpenContainer() instanceof POIFSFileSystem);
}
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/JackcessParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/JackcessParserTest.java
index f39b961..2ec2a56 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/JackcessParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/JackcessParserTest.java
@@ -65,6 +65,7 @@
IOUtils.closeQuietly(is);
}
List<Metadata> list = handler.getMetadataList();
+ debug(list);
assertEquals(4, list.size());
String mainContent = list.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);