TIKA-3081 -- convert TikaInputStream's skip to the equivalent of skipFully
diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
index c995270..3997f9e 100644
--- a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
+++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
@@ -60,11 +60,19 @@
* associated with a TikaInputStream should first use the
* {@link #get(InputStream)} factory method to cast or wrap a given
* {@link InputStream} into a TikaInputStream instance.
+ * <p>
+ * TikaInputStream includes a few safety features to protect against parsers
+ * that may fail to check for an EOF or may incorrectly rely on the unreliable
+ * value returned from {@link FileInputStream#skip}. These parser failures
+ * can lead to infinite loops. We strongly encourage the use of
+ * TikaInputStream.
*
* @since Apache Tika 0.8
*/
public class TikaInputStream extends TaggedInputStream {
+ private static final int MAX_CONSECUTIVE_EOFS = 1000;
+
/**
* Checks whether the given stream is a TikaInputStream instance.
* The given stream can be <code>null</code>, in which case the return
@@ -731,9 +739,21 @@
return position;
}
+ /**
+ * Skips exactly <code>ln</code> bytes, relying on {@link IOUtils#skip(InputStream, long)}
+ * to ensure that the bytes reported as skipped were actually skipped.
+ *
+ * @param ln the number of bytes to skip
+ * @return the number of bytes skipped; on a normal return this always equals <code>ln</code>
+ * @throws IOException if the number of bytes actually skipped differs from the number requested,
+ * or if an IOException occurs during the underlying read.
+ */
@Override
public long skip(long ln) throws IOException {
- long n = super.skip(ln);
+ long n = IOUtils.skip(super.in, ln);
+ if (n != ln) {
+ throw new IOException("tried to skip "+ln + " but actually skipped: "+n);
+ }
position += n;
return n;
}
@@ -777,7 +797,7 @@
position += n;
} else {
consecutiveEOFs++;
- if (consecutiveEOFs > 1000) {
+ if (consecutiveEOFs > MAX_CONSECUTIVE_EOFS) {
throw new IOException("Read too many -1 (EOFs); there could be an infinite loop." +
"If you think your file is not corrupt, please open an issue on Tika's JIRA");
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNotePtr.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNotePtr.java
index c3fb150..85b20e8 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNotePtr.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNotePtr.java
@@ -337,7 +337,12 @@
// + 4 bytes for the FileNode header
CheckedFileNodePushBack pushBack = new CheckedFileNodePushBack(data);
try {
+ long initialOffset = offset;
FileNode fileNode = deserializeFileNode(data.children.get(data.children.size() - 1), curPath);
+ if (initialOffset == offset) {
+ //nothing read; avoid an infinite loop
+ break;
+ }
if (fileNode.id == FndStructureConstants.ChunkTerminatorFND || fileNode.id == 0) {
terminated = true;
break;
@@ -678,7 +683,6 @@
end = backup.end;
if (reserved != 1) {
- System.exit(1);
throw new TikaException("RESERVED_NONZERO");
}