blob: dfee5a2c8b102f373c2197595824cdc059bd232f [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pdfbox.pdfparser;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.security.GeneralSecurityException;
import java.security.KeyStore;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.LogManager;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSNull;
import org.apache.pdfbox.cos.COSNumber;
import org.apache.pdfbox.cos.COSObject;
import org.apache.pdfbox.cos.COSObjectKey;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.cos.ICOSParser;
import org.apache.pdfbox.io.IOUtils;
import org.apache.pdfbox.io.RandomAccessRead;
import org.apache.pdfbox.io.RandomAccessReadView;
import org.apache.pdfbox.io.RandomAccessStreamCache.StreamCacheCreateFunction;
import org.apache.pdfbox.pdfparser.XrefTrailerResolver.XRefType;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.encryption.DecryptionMaterial;
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
import org.apache.pdfbox.pdmodel.encryption.PDEncryption;
import org.apache.pdfbox.pdmodel.encryption.ProtectionPolicy;
import org.apache.pdfbox.pdmodel.encryption.PublicKeyDecryptionMaterial;
import org.apache.pdfbox.pdmodel.encryption.SecurityHandler;
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.apache.pdfbox.util.StringUtil;
/**
* COS-Parser which first reads startxref and xref tables in order to know valid objects and parse only these objects.
*
* This class is a much enhanced version of <code>QuickParser</code> presented in
* <a href="https://issues.apache.org/jira/browse/PDFBOX-1104">PDFBOX-1104</a> by Jeremy Villalobos.
*/
public class COSParser extends BaseParser implements ICOSParser
{
// Marker strings and fallback versions for PDF and FDF files.
// NOTE(review): none of these four constants is referenced in the visible part of this class;
// presumably they are used by the header parsing further down - confirm.
private static final String PDF_HEADER = "%PDF-";
private static final String FDF_HEADER = "%FDF-";
private static final String PDF_DEFAULT_VERSION = "1.4";
private static final String FDF_DEFAULT_VERSION = "1.0";
// keyword introducing a cross reference table; tested in checkXRefOffset()
private static final char[] XREF_TABLE = { 'x', 'r', 'e', 'f' };
// keyword searched backwards within the trailing bytes of the file in getStartxrefOffset()
private static final char[] STARTXREF = { 's','t','a','r','t','x','r','e','f' };
// keywords terminating a stream/an object, searched for in readUntilEndStream();
// the byte constants E, N, D, ... are declared outside of this class body
private static final byte[] ENDSTREAM = { E, N, D, S, T, R, E, A, M };
private static final byte[] ENDOBJ = { E, N, D, O, B, J };
// NOTE(review): not referenced in the visible part of this class
private static final long MINIMUM_SEARCH_OFFSET = 6;
// parseXref() peeks for this character to decide between a xref table and a xref stream
private static final int X = 'x';
// size of the read buffer used by readUntilEndStream()
private static final int STRMBUFLEN = 2048;
// read buffer used by readUntilEndStream()
private final byte[] strmBuf = new byte[ STRMBUFLEN ];
// set from the current access permission of the security handler, e.g. in retrieveTrailer()
private AccessPermission accessPermission;
// decryption material as passed to the constructor: key store, password and key alias
private InputStream keyStoreInputStream = null;
@SuppressWarnings({"squid:S2068"})
private String password = "";
private String keyAlias = null;
/**
* Name of the system property which defines how many trailing bytes of the file are searched for
* the %%EOF marker, see {@link #setEOFLookupRange(int)}.
* Useful if there are additional characters after %%EOF within the PDF.
*/
public static final String SYSPROP_EOFLOOKUPRANGE =
"org.apache.pdfbox.pdfparser.nonSequentialPDFParser.eofLookupRange";
/**
* Default number of trailing bytes to read when looking for the EOF marker.
*/
private static final int DEFAULT_TRAIL_BYTECOUNT = 2048;
/**
* EOF-marker.
*/
protected static final char[] EOF_MARKER = { '%', '%', 'E', 'O', 'F' };
/**
* obj-marker.
*/
protected static final char[] OBJ_MARKER = { 'o', 'b', 'j' };
/**
* file length, taken from the length of the source in the constructor.
*/
protected long fileLen;
/**
* is parser using auto healing capacity ?
*/
private boolean isLenient = true;
// once set, setLenient() refuses to change the leniency
protected boolean initialParseDone = false;
// set by retrieveTrailer() if the trailer had to be rebuilt by the brute force parser
private boolean trailerWasRebuild = false;
// only accessed directly where no instance shall be created, otherwise via getBruteForceParser()
private BruteForceParser bruteForceParser = null;
// encryption information; retrieveTrailer() takes it over from the brute force parser after a rebuild
private PDEncryption encryption = null;
/**
* Intermediate cache. Contains all objects of already read compressed object streams. Objects are removed after
* dereferencing them. The outer key is the object number of the object stream.
*/
private final Map<Long, Map<COSObjectKey, COSBase>> decompressedObjects = new HashMap<>();
/**
* The security handler.
*/
protected SecurityHandler<? extends ProtectionPolicy> securityHandler = null;
/**
* how many trailing bytes to read for EOF marker, can be changed using {@link #setEOFLookupRange(int)}.
*/
private int readTrailBytes = DEFAULT_TRAIL_BYTECOUNT;
private static final Logger LOG = LogManager.getLogger(COSParser.class);
/**
* Collects all Xref/trailer objects and resolves them into single
* object using startxref reference.
*/
protected XrefTrailerResolver xrefTrailerResolver = new XrefTrailerResolver();
/**
 * Creates a parser without supplying any decryption material or stream cache function.
 *
 * @param source input representing the pdf.
 *
 * @throws IOException if the source data could not be read
 */
public COSParser(RandomAccessRead source) throws IOException
{
    // no password, no key store, no key alias and no stream cache function
    this(source, null, null, null, null);
}
/**
* Constructor for encrypted pdfs. Delegates to the five argument constructor passing <code>null</code> as
* stream cache create function.
*
* @param source input representing the pdf.
* @param password password to be used for decryption.
* @param keyStore key store to be used for decryption when using public key security
* @param keyAlias alias to be used for decryption when using public key security
*
* @throws IOException if the source data could not be read
*/
public COSParser(RandomAccessRead source, String password, InputStream keyStore,
String keyAlias) throws IOException
{
this(source, password, keyStore, keyAlias, null);
}
/**
 * Constructor for encrypted pdfs which additionally takes the function used to create the stream cache.
 *
 * The decryption material is only stored, the length of the source is remembered and the COSDocument to be
 * populated is created.
 *
 * @param source input representing the pdf.
 * @param password password to be used for decryption.
 * @param keyStore key store to be used for decryption when using public key security
 * @param keyAlias alias to be used for decryption when using public key security
 * @param streamCacheCreateFunction a function to create an instance of the stream cache
 *
 * @throws IOException if the source data could not be read
 */
public COSParser(RandomAccessRead source, String password, InputStream keyStore,
        String keyAlias, StreamCacheCreateFunction streamCacheCreateFunction) throws IOException
{
    super(source);
    // keep the decryption material for later use
    this.keyStoreInputStream = keyStore;
    this.password = password;
    this.keyAlias = keyAlias;
    // the file length limits the lookup of the trailing bytes and the validation of stream lengths
    this.fileLen = source.length();
    init(streamCacheCreateFunction);
}
/**
 * Applies the optional system property {@link #SYSPROP_EOFLOOKUPRANGE} and creates the document to be
 * populated by this parser.
 *
 * @param streamCacheCreateFunction a function to create an instance of the stream cache
 */
private void init(StreamCacheCreateFunction streamCacheCreateFunction)
{
    final String configuredRange = System.getProperty(SYSPROP_EOFLOOKUPRANGE);
    if (configuredRange != null)
    {
        try
        {
            final int byteCount = Integer.parseInt(configuredRange);
            setEOFLookupRange(byteCount);
        }
        catch (NumberFormatException nfe)
        {
            // a malformed property value is reported only, the current lookup range remains in effect
            LOG.warn(
                    "System property " + SYSPROP_EOFLOOKUPRANGE + " does not contain an integer value, but: '{}'",
                    configuredRange);
        }
    }
    document = new COSDocument(streamCacheCreateFunction, this);
}
/**
 * Defines the number of trailing bytes of the PDF file which are searched for the EOF marker and the
 * 'startxref' marker. {@link #DEFAULT_TRAIL_BYTECOUNT} is used as long as no other value is set.
 *
 * <p>Values lower than 16 are silently ignored. However for practical use cases this value should not be lower
 * than 1000; even 2000 was found to not be enough in some cases where some trailing garbage like HTML snippets
 * followed the EOF marker.</p>
 *
 * <p>
 * In case system property {@link #SYSPROP_EOFLOOKUPRANGE} is defined this value will be set on initialization but
 * can be overwritten later.
 * </p>
 *
 * @param byteCount number of trailing bytes
 */
public void setEOFLookupRange(int byteCount)
{
    if (byteCount <= 15)
    {
        // too small to be useful, keep the current value
        return;
    }
    readTrailBytes = byteCount;
}
/**
 * Reads the cross reference information and provides the resulting trailer dictionary. In lenient mode the
 * trailer is rebuilt by the brute force parser if the regular parsing fails or if the trailer doesn't contain
 * a root entry.
 *
 * @return a COSDictionary containing the trailer information
 * @throws IOException if something went wrong
 */
protected COSDictionary retrieveTrailer() throws IOException
{
    COSDictionary trailer = null;
    boolean rebuildTrailer = false;
    try
    {
        // locate the startxref keyword and parse the xref chain it points to
        // TODO FDF files don't have a startxref value, so that rebuildTrailer is triggered
        final long startXRefOffset = getStartxrefOffset();
        if (startXRefOffset > -1)
        {
            trailer = parseXref(startXRefOffset);
        }
        else
        {
            rebuildTrailer = isLenient();
        }
    }
    catch (IOException exception)
    {
        if (!isLenient())
        {
            throw exception;
        }
        rebuildTrailer = true;
    }
    // a trailer without a Root object is useless
    if (trailer != null && trailer.getItem(COSName.ROOT) == null)
    {
        rebuildTrailer = isLenient();
    }
    if (!rebuildTrailer)
    {
        // prepare decryption if necessary
        prepareDecryption();
        // access the field directly as the getter creates an instance of BruteForceParser
        if (bruteForceParser != null && bruteForceParser.bfSearchTriggered())
        {
            getBruteForceParser().bfSearchForObjStreams(xrefTrailerResolver, securityHandler);
        }
    }
    else
    {
        trailer = getBruteForceParser().rebuildTrailer(xrefTrailerResolver, null);
        trailerWasRebuild = true;
        // take over the encryption information found by the BruteForceParser
        encryption = getBruteForceParser().getEncryption();
        if (encryption != null)
        {
            securityHandler = encryption.getSecurityHandler();
            accessPermission = securityHandler.getCurrentAccessPermission();
        }
    }
    if (resetTrailerResolver())
    {
        xrefTrailerResolver.reset();
        xrefTrailerResolver = null;
    }
    return trailer;
}
/**
* Indicates whether the xref trailer resolver should be reset or not. Should be overwritten if the xref trailer
* resolver is needed after the initial parsing. If <code>true</code> is returned, retrieveTrailer() resets the
* resolver and sets the reference to <code>null</code>.
*
* @return true if the xref trailer resolver should be reset, this implementation always returns true
*/
protected boolean resetTrailerResolver()
{
return true;
}
/**
 * Parses the whole chain of cross reference tables and/or cross reference streams, starting with the one the
 * startxref value points to and following the /Prev references.
 *
 * @param startXRefOffset offset of the 'startxref' keyword within the file
 * @return the trailer dictionary resolved from the xref chain
 * @throws IOException if a table/stream can't be parsed or if the /Prev references form a loop
 */
private COSDictionary parseXref(long startXRefOffset) throws IOException
{
    source.seek(startXRefOffset);
    long startXrefOffset = Math.max(0, parseStartXref());
    // check the startxref offset
    long fixedOffset = checkXRefOffset(startXrefOffset);
    if (fixedOffset > -1)
    {
        startXrefOffset = fixedOffset;
    }
    document.setStartXref(startXrefOffset);
    long prev = startXrefOffset;
    // ---- parse whole chain of xref tables/object streams using PREV reference
    Set<Long> prevSet = new HashSet<>();
    COSDictionary trailer = null;
    while (prev > 0)
    {
        // save expected position for loop detection
        prevSet.add(prev);
        // seek to xref table
        source.seek(prev);
        // skip white spaces
        skipSpaces();
        // save current position as well due to skipped spaces
        prevSet.add(source.getPosition());
        // -- parse xref
        if (source.peek() == X)
        {
            // xref table and trailer
            // use existing parser to parse xref table
            if (!parseXrefTable(prev) || !parseTrailer())
            {
                throw new IOException("Expected trailer object at offset "
                        + source.getPosition());
            }
            trailer = xrefTrailerResolver.getCurrentTrailer();
            // check for a XRef stream, it may contain some object ids of compressed objects
            if (trailer.containsKey(COSName.XREF_STM))
            {
                // file offsets have to be handled as long values, an int would be truncated
                // for files larger than 2GB
                long streamOffset = trailer.getLong(COSName.XREF_STM);
                // check the xref stream reference
                fixedOffset = checkXRefOffset(streamOffset);
                if (fixedOffset > -1 && fixedOffset != streamOffset)
                {
                    LOG.warn("/XRefStm offset {} is incorrect, corrected to {}", streamOffset,
                            fixedOffset);
                    streamOffset = fixedOffset;
                    trailer.setLong(COSName.XREF_STM, streamOffset);
                }
                if (streamOffset > 0)
                {
                    source.seek(streamOffset);
                    skipSpaces();
                    try
                    {
                        parseXrefObjStream(prev, false);
                        document.setHasHybridXRef();
                    }
                    catch (IOException ex)
                    {
                        if (isLenient)
                        {
                            LOG.error("Failed to parse /XRefStm at offset {}", streamOffset,
                                    ex);
                        }
                        else
                        {
                            throw ex;
                        }
                    }
                }
                else
                {
                    if (isLenient)
                    {
                        LOG.error("Skipped XRef stream due to a corrupt offset:{}",
                                streamOffset);
                    }
                    else
                    {
                        throw new IOException(
                                "Skipped XRef stream due to a corrupt offset:" + streamOffset);
                    }
                }
            }
            prev = trailer.getLong(COSName.PREV);
        }
        else
        {
            // parse xref stream
            prev = parseXrefObjStream(prev, true);
            trailer = xrefTrailerResolver.getCurrentTrailer();
        }
        if (prev > 0)
        {
            // check the xref table reference
            fixedOffset = checkXRefOffset(prev);
            if (fixedOffset > -1 && fixedOffset != prev)
            {
                prev = fixedOffset;
                trailer.setLong(COSName.PREV, prev);
            }
        }
        if (prevSet.contains(prev))
        {
            throw new IOException("/Prev loop at offset " + prev);
        }
    }
    // ---- build valid xrefs out of the xref chain
    xrefTrailerResolver.setStartxref(startXrefOffset);
    trailer = xrefTrailerResolver.getTrailer();
    document.setTrailer(trailer);
    document.setIsXRefStream(XRefType.STREAM == xrefTrailerResolver.getXrefType());
    // check the offsets of all referenced objects
    if (isLenient)
    {
        checkXrefOffsets();
    }
    // copy xref table
    document.addXRefTable(xrefTrailerResolver.getXrefTable());
    // remember the highest XRef object number to avoid it being reused in incremental saving
    Optional<Long> maxValue = document.getXrefTable().keySet().stream() //
            .map(COSObjectKey::getNumber) //
            .reduce(Long::max);
    document.setHighestXRefObjectNumber(maxValue.orElse(0L));
    return trailer;
}
/**
 * Parses a cross reference stream. The source has to be positioned at the head of the indirect object
 * (object number, generation number, 'obj') holding the stream.
 *
 * @param objByteOffset the offset registered for the xref object, only used for a standalone xref stream
 * @param isStandalone <code>false</code> if the stream is part of a hybrid xref table and its entries have to be
 * added to the current xref object, <code>true</code> if the stream starts a new xref object
 * @return value of PREV item in dictionary or <code>-1</code> if no such item exists
 * @throws IOException if the object or the stream can't be parsed
 */
private long parseXrefObjStream(long objByteOffset, boolean isStandalone) throws IOException
{
    // ---- head of the indirect object, the values themselves are of no interest
    readObjectNumber();
    readGenerationNumber();
    readExpectedString(OBJ_MARKER, true);
    final COSDictionary xrefDictionary = parseCOSDictionary(false);
    try (COSStream stream = parseCOSStream(xrefDictionary))
    {
        if (isStandalone)
        {
            // the cross reference stream of a hybrid xref table is added to the existing one,
            // offset and trailer must be overridden for a standalone stream only
            xrefTrailerResolver.nextXrefObj(objByteOffset, XRefType.STREAM);
            xrefTrailerResolver.setTrailer(stream);
        }
        new PDFXrefStreamParser(stream, document).parse(xrefTrailerResolver);
    }
    return xrefDictionary.getLong(COSName.PREV);
}
/**
 * Determines the position of the last <code>startxref</code> keyword. The last '%%EOF' marker is searched
 * within the last {@link #DEFAULT_TRAIL_BYTECOUNT} bytes of the file (or within the range set via
 * {@link #setEOFLookupRange(int)}), the keyword has to precede that marker. The source is positioned at the
 * start of the file afterwards.
 *
 * @return the offset of the startxref keyword within the file
 * @throws IOException if the trailing bytes can't be read, if the keyword is missing or, in non-lenient mode,
 * if the EOF marker is missing
 */
private long getStartxrefOffset() throws IOException
{
    byte[] buf;
    long skipBytes;
    try
    {
        // the lookup range can't exceed the file itself
        final int trailByteCount = (int) Math.min(fileLen, readTrailBytes);
        buf = new byte[trailByteCount];
        skipBytes = fileLen - trailByteCount;
        source.seek(skipBytes);
        int filled = 0;
        while (filled < trailByteCount)
        {
            final int chunk = source.read(buf, filled, trailByteCount - filled);
            // in order to not get stuck in a loop we check the number of bytes read (this should never happen)
            if (chunk < 1)
            {
                throw new IOException(
                        "No more bytes to read for trailing buffer, but expected: "
                                + (trailByteCount - filled));
            }
            filled += chunk;
        }
    }
    finally
    {
        source.seek(0);
    }
    // last '%%EOF' within the buffer
    int eofOffset = lastIndexOf(EOF_MARKER, buf, buf.length);
    if (eofOffset < 0)
    {
        if (!isLenient)
        {
            throw new IOException("Missing end of file marker '" + new String(EOF_MARKER) + "'");
        }
        // in lenient mode the '%%EOF' isn't needed, search the whole buffer instead
        eofOffset = buf.length;
        LOG.debug("Missing end of file marker '{}'", new String(EOF_MARKER));
    }
    // last startxref preceding the EOF marker
    final int startxrefOffset = lastIndexOf(STARTXREF, buf, eofOffset);
    if (startxrefOffset < 0)
    {
        throw new IOException("Missing 'startxref' marker.");
    }
    return skipBytes + startxrefOffset;
}
/**
 * Searches the last appearance of the pattern within the buffer. Only occurrences ending before
 * <code>endOff</code> are taken into account, the lookup goes back until offset 0.
 *
 * <p>Every candidate position is compared completely. The characters of a failed partial match are
 * therefore not lost for another match, e.g. <code>%%EOF</code> is found within <code>%%EOFF</code>.</p>
 *
 * @param pattern pattern to search for
 * @param buf buffer to search pattern in
 * @param endOff offset (exclusive) where lookup starts at, values beyond the end of the buffer are limited to
 * the length of the buffer
 *
 * @return start offset of pattern within buffer or <code>-1</code> if pattern could not be found
 */
protected int lastIndexOf(final char[] pattern, final byte[] buf, final int endOff)
{
    final int patternLength = pattern.length;
    // never read beyond the end of the buffer
    final int searchEnd = Math.min(endOff, buf.length);
    // start with the last position where the whole pattern still fits in front of the end offset
    for (int start = searchEnd - patternLength; start >= 0; start--)
    {
        int matched = 0;
        // bytes are compared as signed values, so that only ASCII characters can match
        while (matched < patternLength && buf[start + matched] == pattern[matched])
        {
            matched++;
        }
        if (matched == patternLength)
        {
            // whole pattern matched
            return start;
        }
    }
    return -1;
}
/**
* Return true if parser is lenient. Meaning auto healing capacity of the parser are used.
* Lenient is the default, see {@link #setLenient(boolean)} to change it.
*
* @return true if parser is lenient
*/
public boolean isLenient()
{
return isLenient;
}
/**
* Change the parser leniency flag.
*
* This method can only be called before the parsing of the file.
*
* @param lenient try to handle malformed PDFs.
*
* @throws IllegalArgumentException if the initial parsing is already done
*/
protected void setLenient(boolean lenient)
{
// the leniency must not change once the initial parsing is done
if (initialParseDone)
{
throw new IllegalArgumentException("Cannot change leniency after parsing");
}
this.isLenient = lenient;
}
/**
 * Parses the object the given indirect object refers to. The position of the source is restored afterwards,
 * provided that it wasn't the start of the file.
 *
 * @param obj the indirect object to be dereferenced
 * @return the parsed object or <code>null</code> if there isn't any
 * @throws IOException if something went wrong when parsing the object
 */
@Override
public COSBase dereferenceCOSObject(COSObject obj) throws IOException
{
    final long positionBefore = source.getPosition();
    final COSObjectKey objectKey = obj.getKey();
    final COSBase dereferenced = parseObjectDynamically(objectKey, false);
    if (dereferenced != null)
    {
        // mark the result as indirect object and remember its key
        dereferenced.setDirect(false);
        dereferenced.setKey(objectKey);
    }
    if (positionBefore > 0)
    {
        source.seek(positionBefore);
    }
    return dereferenced;
}
/**
* Creates a view on the source of this parser by delegating to the createView method of the source.
*
* @param startPosition start position passed on to the source
* @param streamLength length passed on to the source
* @return the view created by the source
* @throws IOException if the source fails to create the view
*/
@Override
public RandomAccessReadView createRandomAccessReadView(long startPosition, long streamLength)
throws IOException
{
return source.createView(startPosition, streamLength);
}
/**
 * Provides the object for the given object key. An object already held by the object pool of the document is
 * returned as is, otherwise it is parsed from the file or from the object stream it is stored in.
 *
 * @param objKey key of object to be parsed
 * @param requireExistingNotCompressedObj if <code>true</code> the object to be parsed must be defined in xref
 * (comment: null objects may be missing from xref) and it must not be a compressed object within object stream
 * (this is used to circumvent being stuck in a loop in a malicious PDF)
 *
 * @return the parsed object (which is also added to document object)
 *
 * @throws IOException If an IO error occurs.
 */
protected synchronized COSBase parseObjectDynamically(COSObjectKey objKey,
        boolean requireExistingNotCompressedObj) throws IOException
{
    final COSObject pooledObject = document.getObjectFromPool(objKey);
    if (!pooledObject.isObjectNull())
    {
        // nothing to parse
        return pooledObject.getObject();
    }
    final Long xrefEntry = getObjectOffset(objKey, requireExistingNotCompressedObj);
    COSBase parsed = null;
    if (xrefEntry != null)
    {
        // a positive value is a file offset, otherwise the negated value is the object number of
        // the object stream containing the object; as our object was not found so far the
        // object stream was not parsed yet
        parsed = xrefEntry > 0 ? parseFileObject(xrefEntry, objKey)
                : parseObjectStreamObject(-xrefEntry, objKey);
    }
    final boolean undefined = parsed == null || parsed instanceof COSNull;
    if (undefined)
    {
        // not defined object -> NULL object (Spec. 1.7, chap. 3.2.9)
        // or some other issue with dereferencing
        // remove parser to avoid endless recursion
        pooledObject.setToNull();
    }
    return parsed;
}
/**
 * Looks up the xref entry of the given object. In lenient mode an entry missing from the xref table is taken
 * from the offsets found by the brute force parser and added to the xref table.
 *
 * @param objKey key of the object
 * @param requireExistingNotCompressedObj if <code>true</code> the object has to be defined and must not be
 * part of an object stream
 * @return the file offset, a value not greater than 0 for an object within an object stream or
 * <code>null</code> if there isn't any entry
 * @throws IOException if the object is required to exist as not compressed object but doesn't
 */
private Long getObjectOffset(COSObjectKey objKey, boolean requireExistingNotCompressedObj)
        throws IOException
{
    // offset or object stream object number as defined by the xref table
    Long xrefEntry = document.getXrefTable().get(objKey);
    if (xrefEntry == null && isLenient)
    {
        // maybe something is wrong with the xref table -> perform brute force search for all objects
        xrefEntry = getBruteForceParser().getBFCOSObjectOffsets().get(objKey);
        if (xrefEntry != null)
        {
            LOG.debug("Set missing offset {} for object {}", xrefEntry, objKey);
            document.getXrefTable().put(objKey, xrefEntry);
        }
    }
    // test to circumvent loops with broken documents
    final boolean missingOrCompressed = xrefEntry == null || xrefEntry <= 0;
    if (requireExistingNotCompressedObj && missingOrCompressed)
    {
        throw new IOException("Object must be defined and must not be compressed object: "
                + objKey.getNumber() + ":" + objKey.getGeneration());
    }
    return xrefEntry;
}
/**
 * Parses the indirect object starting at the given file offset. A dictionary followed by the stream keyword is
 * parsed as stream. The object gets decrypted if a security handler is present.
 *
 * @param objOffset the file offset of the object
 * @param objKey the expected key of the object
 * @return the parsed object
 * @throws IOException if the object found doesn't have the expected key, if a stream isn't preceded by a
 * dictionary or, in non-lenient mode, if the object doesn't end with 'endobj'
 */
private COSBase parseFileObject(Long objOffset, final COSObjectKey objKey)
        throws IOException
{
    source.seek(objOffset);
    // head of the indirect object: object number, generation number and 'obj'
    final long readObjNr = readObjectNumber();
    final int readObjGen = readGenerationNumber();
    readExpectedString(OBJ_MARKER, true);
    // the xref entry has to point to the object we are looking for
    if (!(readObjNr == objKey.getNumber() && readObjGen == objKey.getGeneration()))
    {
        throw new IOException("XREF for " + objKey.getNumber() + ":"
                + objKey.getGeneration() + " points to wrong object: " + readObjNr
                + ":" + readObjGen + " at offset " + objOffset);
    }
    skipSpaces();
    COSBase result = parseDirObject();
    if (result != null)
    {
        result.setDirect(false);
        result.setKey(objKey);
    }
    String closingKeyword = readString();
    if (closingKeyword.equals(STREAM_STRING))
    {
        // unread the keyword, it is consumed when parsing the stream
        source.rewind(closingKeyword.getBytes(StandardCharsets.ISO_8859_1).length);
        if (!(result instanceof COSDictionary))
        {
            // this is not legal
            // the combination of a dict and the stream/endstream
            // forms a complete stream object
            throw new IOException("Stream not preceded by dictionary (offset: "
                    + objOffset + ").");
        }
        final COSStream stream = parseCOSStream((COSDictionary) result);
        if (securityHandler != null)
        {
            securityHandler.decryptStream(stream, objKey.getNumber(), objKey.getGeneration());
        }
        result = stream;
        skipSpaces();
        closingKeyword = readLine();
        // we have case with a second 'endstream' before endobj
        if (!closingKeyword.startsWith(ENDOBJ_STRING)
                && closingKeyword.startsWith(ENDSTREAM_STRING))
        {
            closingKeyword = closingKeyword.substring(9).trim();
            if (closingKeyword.isEmpty())
            {
                // no other characters in extra endstream line
                // read next line
                closingKeyword = readLine();
            }
        }
    }
    else if (securityHandler != null)
    {
        securityHandler.decrypt(result, objKey.getNumber(), objKey.getGeneration());
    }
    if (!closingKeyword.startsWith(ENDOBJ_STRING))
    {
        if (!isLenient)
        {
            throw new IOException("Object (" + readObjNr + ":" + readObjGen
                    + ") at offset " + objOffset
                    + " does not end with 'endobj' but with '" + closingKeyword + "'");
        }
        LOG.warn("Object ({}:{}) at offset {} does not end with 'endobj' but with '{}'",
                readObjNr, readObjGen, objOffset, closingKeyword);
    }
    return result;
}
/**
 * Provides the object with the given key which is stored in the object stream with the given number. All
 * other objects of a parsed object stream are cached until they are requested.
 *
 * @param objstmObjNr the object number of the object stream
 * @param key the key of the object to be parsed
 * @return the parsed object or <code>null</code> if it isn't available
 * @throws IOException if something went wrong when parsing the object
 */
protected COSBase parseObjectStreamObject(long objstmObjNr, COSObjectKey key) throws IOException
{
    final Map<COSObjectKey, COSBase> cachedObjects = decompressedObjects
            .computeIfAbsent(objstmObjNr, n -> new HashMap<>());
    // maybe the object stream was already read
    final COSBase cached = cachedObjects.remove(key);
    if (cached != null)
    {
        return cached;
    }
    final COSObjectKey objstmKey = getObjectKey(objstmObjNr, 0);
    final COSBase objstmBaseObj = document.getObjectFromPool(objstmKey).getObject();
    if (!(objstmBaseObj instanceof COSStream))
    {
        // there is no object stream to read the object from
        return null;
    }
    COSBase requested = null;
    try
    {
        final Map<COSObjectKey, COSBase> parsedObjects = new PDFObjectStreamParser(
                (COSStream) objstmBaseObj, document).parseAllObjects();
        requested = parsedObjects.remove(key);
        // keep the remaining objects without replacing those already cached
        parsedObjects.forEach(cachedObjects::putIfAbsent);
    }
    catch (IOException ex)
    {
        if (!isLenient)
        {
            throw ex;
        }
        LOG.error("object stream {} could not be parsed due to an exception",
                objstmObjNr, ex);
    }
    return requested;
}
/**
 * Returns length value referred to or defined in given object.
 *
 * @param lengthBaseObj the value of the length entry, either a number or a reference to a number
 * @return the length or <code>null</code> if there isn't any length entry or if the referenced object is the
 * null object
 * @throws IOException if the referenced object wasn't read or if the length isn't a number
 */
private COSNumber getLength(final COSBase lengthBaseObj) throws IOException
{
    if (lengthBaseObj == null)
    {
        return null;
    }
    // maybe length was given directly
    if (lengthBaseObj instanceof COSNumber)
    {
        return (COSNumber) lengthBaseObj;
    }
    // length in referenced object
    if (lengthBaseObj instanceof COSObject)
    {
        COSObject lengthObj = (COSObject) lengthBaseObj;
        COSBase length = lengthObj.getObject();
        if (length == null)
        {
            throw new IOException("Length object content was not read.");
        }
        if (COSNull.NULL == length)
        {
            // a single placeholder as there is a single argument, the key of the length object
            LOG.warn("Length object ({}) not found", lengthObj.getKey());
            return null;
        }
        if (length instanceof COSNumber)
        {
            return (COSNumber) length;
        }
        throw new IOException("Wrong type of referenced length object " + lengthObj + ": "
                + length.getClass().getSimpleName());
    }
    throw new IOException(
            "Wrong type of length object: " + lengthBaseObj.getClass().getSimpleName());
}
/**
 * This will read a COSStream from the input stream using length attribute within dictionary. If
 * length attribute is a indirect reference it is first resolved to get the stream length. This
 * means we copy stream data without testing for 'endstream' or 'endobj' and thus it is no
 * problem if these keywords occur within stream. We require 'endstream' to be found after
 * stream data is read.
 *
 * If the length is missing (lenient mode only) or if it doesn't point to the 'endstream' keyword, the length
 * is determined by searching for the end of the stream.
 *
 * @param dic dictionary that goes with this stream.
 *
 * @return parsed pdf stream.
 *
 * @throws IOException if an error occurred reading the stream, like problems with reading
 * length attribute, stream does not end with 'endstream' after data read, stream too short etc.
 */
protected COSStream parseCOSStream(COSDictionary dic) throws IOException
{
    // read 'stream'; the callers have to ensure that the source is positioned at that keyword
    readString();
    skipWhiteSpaces();
    /*
     * This needs to be dic.getItem because when we are parsing, the underlying object might still be null.
     */
    COSNumber streamLengthObj = getLength(dic.getItem(COSName.LENGTH));
    if (streamLengthObj == null)
    {
        if (isLenient)
        {
            LOG.warn(
                    "The stream doesn't provide any stream length, using fallback readUntilEnd, at offset {}",
                    source.getPosition());
        }
        else
        {
            throw new IOException("Missing length for stream.");
        }
    }
    long streamStartPosition = source.getPosition();
    long streamLength;
    if (streamLengthObj != null && validateStreamLength(streamLengthObj.longValue()))
    {
        streamLength = streamLengthObj.longValue();
        // skip stream; the validation left the source at the start of the stream data. The long value
        // has to be used as an int would be truncated for streams larger than 2GB
        source.seek(streamStartPosition + streamLength);
    }
    else
    {
        streamLength = readUntilEndStream(new EndstreamFilterStream());
    }
    String endStream = readString();
    if (endStream.equals("endobj") && isLenient)
    {
        LOG.warn("stream ends with 'endobj' instead of 'endstream' at offset {}",
                source.getPosition());
        // avoid follow-up warning about missing endobj
        source.rewind(ENDOBJ.length);
    }
    else if (endStream.length() > ENDSTREAM.length && isLenient
            && endStream.startsWith(ENDSTREAM_STRING))
    {
        LOG.warn("stream ends with '{}' instead of 'endstream' at offset {}", endStream,
                source.getPosition());
        // unread the "extra" bytes following the keyword
        source.rewind(endStream.substring(ENDSTREAM.length)
                .getBytes(StandardCharsets.ISO_8859_1).length);
    }
    else if (!endStream.equals(ENDSTREAM_STRING))
    {
        throw new IOException(
                "Error reading stream, expected='endstream' actual='"
                        + endStream + "' at offset " + source.getPosition());
    }
    return document.createCOSStream(dic, streamStartPosition, streamLength);
}
/**
* This method will read through the current stream object until
* we find the keyword "endstream" meaning we're at the end of this
* object. Some pdf files, however, forget to write some endstream tags
* and just close off objects with an "endobj" tag so we have to handle
* this case as well.
*
* This method is optimized using buffered IO and reduced number of
* byte compare operations.
*
* After a keyword was found, the source is positioned at the first character of that keyword.
*
* @param out stream we write out to.
*
* @return the length as calculated by the given filter stream, parseCOSStream() uses it as stream length
*
* @throws IOException if something went wrong
*/
private long readUntilEndStream(final EndstreamFilterStream out) throws IOException
{
int bufSize;
// number of characters of the current keyword matched so far
int charMatchCount = 0;
// the keyword we are currently trying to match, either ENDSTREAM or ENDOBJ
byte[] keyw = ENDSTREAM;
// last character position of shortest keyword ('endobj')
final int quickTestOffset = 5;
// read next chunk into buffer; already matched chars are added to beginning of buffer
while ( ( bufSize = source.read( strmBuf, charMatchCount, STRMBUFLEN - charMatchCount ) ) > 0 )
{
// the buffer holds the matched chars of the previous chunk followed by the bytes just read
bufSize += charMatchCount;
int bIdx = charMatchCount;
int quickTestIdx;
// iterate over buffer, trying to find keyword match
for ( int maxQuicktestIdx = bufSize - quickTestOffset; bIdx < bufSize; bIdx++ )
{
// reduce compare operations by first test last character we would have to
// match if current one matches; if it is not a character from keywords
// we can move behind the test character; this shortcut is inspired by the
// Boyer-Moore string search algorithm and can reduce parsing time by approx. 20%
quickTestIdx = bIdx + quickTestOffset;
if (charMatchCount == 0 && quickTestIdx < maxQuicktestIdx)
{
final byte ch = strmBuf[quickTestIdx];
if ( ( ch > 't' ) || ( ch < 'a' ) )
{
// last character we would have to match if current character would match
// is not a character from keywords -> jump behind and start over
bIdx = quickTestIdx;
continue;
}
}
// could be negative - but we only compare to ASCII
final byte ch = strmBuf[bIdx];
if ( ch == keyw[ charMatchCount ] )
{
if ( ++charMatchCount == keyw.length )
{
// match found
bIdx++;
break;
}
}
else
{
// both keywords start with 'end', the fourth character decides which one we may have
if ( ( charMatchCount == 3 ) && ( ch == ENDOBJ[ charMatchCount ] ) )
{
// maybe ENDSTREAM is missing but we could have ENDOBJ
keyw = ENDOBJ;
charMatchCount++;
}
else
{
// no match; incrementing match start by 1 would be dumb since we already know
// matched chars depending on current char read we may already have beginning
// of a new match: 'e': first char matched; 'n': if we are at match position
// idx 7 we already read 'e' thus 2 chars matched for each other char we have
// to start matching first keyword char beginning with next read position
charMatchCount = ( ch == E ) ? 1 : ( ( ch == N ) && ( charMatchCount == 7 ) ) ? 2 : 0;
// search again for 'endstream'
keyw = ENDSTREAM;
}
}
}
// everything in front of the (partially) matched keyword is stream content
int contentBytes = Math.max( 0, bIdx - charMatchCount );
// write buffer content until first matched char to output stream
if ( contentBytes > 0 )
{
out.filter(strmBuf, 0, contentBytes);
}
if ( charMatchCount == keyw.length )
{
// keyword matched; unread matched keyword (endstream/endobj) and following buffered content
source.rewind( bufSize - contentBytes );
break;
}
else
{
// copy matched chars at start of buffer
System.arraycopy( keyw, 0, strmBuf, 0, charMatchCount );
}
}
// this writes a lonely CR or drops trailing CR LF and LF
return out.calculateLength();
}
/**
 * Checks whether the 'endstream' keyword can be found behind a stream of the given length starting at the
 * current position. The position of the source is the same as before when returning.
 *
 * @param streamLength the length of the stream as given by its dictionary
 * @return true if the length is valid, false if it is negative, if it points beyond the end of the file or if
 * the keyword isn't found at the expected position
 * @throws IOException if something went wrong when reading the source
 */
private boolean validateStreamLength(long streamLength) throws IOException
{
    final long originOffset = source.getPosition();
    final long expectedEndOfStream = originOffset + streamLength;
    // compare the length to the remaining bytes instead of comparing the end position to the file
    // length, as the sum overflows for huge length values; a negative length is invalid as well
    if (streamLength < 0 || streamLength > fileLen - originOffset)
    {
        LOG.warn(
                "The end of the stream is out of range, using workaround to read the stream, stream start position: {}, length: {}, expected end position: {}",
                originOffset, streamLength, expectedEndOfStream);
        return false;
    }
    source.seek(expectedEndOfStream);
    skipSpaces();
    final boolean streamLengthIsValid = isString(ENDSTREAM);
    if (!streamLengthIsValid)
    {
        LOG.warn(
                "The end of the stream doesn't point to the correct offset, using workaround to read the stream, stream start position: {}, length: {}, expected end position: {}",
                originOffset, streamLength, expectedEndOfStream);
    }
    // go back to the start of the stream data
    source.seek(originOffset);
    return streamLengthIsValid;
}
/**
 * Checks if a cross reference table or a cross reference stream can be found at the given offset. If not,
 * a corrected offset is searched for. The given offset is returned unchecked in non-lenient mode.
 *
 * @param startXRefOffset the expected offset of the cross reference table/stream
 * @return the given offset if it is valid, otherwise the corrected offset or <code>-1</code> if the given
 * offset is invalid and not greater than 0
 * @throws IOException if something went wrong when reading the source
 */
private long checkXRefOffset(long startXRefOffset) throws IOException
{
    // repair mode isn't available in non-lenient mode
    if (isLenient)
    {
        source.seek(startXRefOffset);
        skipSpaces();
        if (!isString(XREF_TABLE))
        {
            if (startXRefOffset <= 0)
            {
                // can't find a valid offset
                return -1;
            }
            // not a table, either it is a stream or the offset has to be corrected
            return checkXRefStreamOffset(startXRefOffset) ? startXRefOffset
                    : calculateXRefFixedOffset(startXRefOffset);
        }
    }
    return startXRefOffset;
}
/**
 * Check if a cross reference stream can be found at the given offset.
 *
 * @param startXRefOffset the expected start offset of the XRef stream
 * @return true if an object with a dictionary of the type "XRef" starts at the given offset.
 * Always true in non-lenient mode or if the given offset is 0, as nothing is checked then
 * @throws IOException if something went wrong
 */
private boolean checkXRefStreamOffset(long startXRefOffset) throws IOException
{
    // repair mode isn't available in non-lenient mode
    if (!isLenient || startXRefOffset == 0)
    {
        return true;
    }
    // the object has to be preceded by a whitespace, so start one byte earlier
    source.seek(startXRefOffset - 1);
    int precedingValue = source.read();
    if (!isWhitespace(precedingValue))
    {
        return false;
    }
    skipSpaces();
    // an object starts with its object number
    if (!isDigit())
    {
        return false;
    }
    try
    {
        // "<object number> <generation number> obj"
        readObjectNumber();
        readGenerationNumber();
        readExpectedString(OBJ_MARKER, true);
        // check the type of the dictionary to avoid false positives
        COSDictionary xrefCandidate = parseCOSDictionary(false);
        source.seek(startXRefOffset);
        return "XRef".equals(xrefCandidate.getNameAsString(COSName.TYPE));
    }
    catch (IOException exception)
    {
        // there wasn't an object of a xref stream
        LOG.debug("No Xref stream at given location {}", startXRefOffset, exception);
        source.seek(startXRefOffset);
    }
    return false;
}
/**
 * Try to find the real offset of a xref table/stream which can't be found at the given offset.
 * The lookup is delegated to the brute force parser.
 *
 * @param objectOffset the offset where the xref table/stream was expected
 * @return the offset returned by the brute force search, or 0 if the given offset is negative
 * or if the search didn't return an offset greater than -1
 *
 * @throws IOException if something went wrong
 */
private long calculateXRefFixedOffset(long objectOffset) throws IOException
{
    long fixedOffset = 0;
    if (objectOffset < 0)
    {
        LOG.error("Invalid object offset {} when searching for a xref table/stream",
                objectOffset);
    }
    else
    {
        // look up the given offset among the xref tables/streams found by a brute force search
        long bruteForceOffset = getBruteForceParser().bfSearchForXRef(objectOffset);
        if (bruteForceOffset > -1)
        {
            LOG.debug("Fixed reference for xref table/stream {} -> {}", objectOffset,
                    bruteForceOffset);
            fixedOffset = bruteForceOffset;
        }
        else
        {
            LOG.error("Can't find the object xref table/stream at offset {}", objectOffset);
        }
    }
    return fixedOffset;
}
/**
 * Verify that every non-negative offset of the given xref map points to the object it is
 * registered for. Entries for which a different key was found at the offset are re-registered
 * using the found key, unless that key was already confirmed by another entry.
 *
 * @param xrefOffset the map with all xref entries, it is updated in place. May be null
 * @return false as soon as one offset can't be dereferenced, true otherwise
 * @throws IOException if something went wrong
 */
private boolean validateXrefOffsets(Map<COSObjectKey, Long> xrefOffset) throws IOException
{
    if (xrefOffset == null)
    {
        return true;
    }
    // stale key -> key which was really found at the offset of the stale key
    Map<COSObjectKey, COSObjectKey> fixedKeys = new HashMap<>();
    Set<COSObjectKey> confirmedKeys = new HashSet<>();
    for (Entry<COSObjectKey, Long> xrefEntry : xrefOffset.entrySet())
    {
        Long offset = xrefEntry.getValue();
        // a negative offset number represents an object number itself
        // see type 2 entry in xref stream
        if (offset == null || offset < 0)
        {
            continue;
        }
        COSObjectKey expectedKey = xrefEntry.getKey();
        COSObjectKey actualKey = findObjectKey(expectedKey, offset, xrefOffset);
        if (actualKey == null)
        {
            LOG.debug(
                    "Stop checking xref offsets as at least one ({}) couldn't be dereferenced",
                    expectedKey);
            return false;
        }
        // findObjectKey hands back the very same instance if nothing had to be fixed,
        // so comparing the references is sufficient
        if (actualKey == expectedKey)
        {
            confirmedKeys.add(expectedKey);
        }
        else
        {
            // the map can't be changed during the iteration, remember the fix for later
            fixedKeys.put(expectedKey, actualKey);
        }
    }
    Map<COSObjectKey, Long> replacements = new HashMap<>();
    fixedKeys.forEach((staleKey, fixedKey) ->
    {
        // only add the fixed key if there isn't a valid entry for that key already
        if (!confirmedKeys.contains(fixedKey))
        {
            replacements.put(fixedKey, xrefOffset.get(staleKey));
        }
    });
    // all stale keys have to go, even those which aren't replaced
    for (COSObjectKey staleKey : fixedKeys.keySet())
    {
        xrefOffset.remove(staleKey);
    }
    xrefOffset.putAll(replacements);
    return true;
}
/**
 * Validate the offsets of the xref table. If the validation fails and the brute force search
 * provides any object offsets, the content of the xref table is replaced with them.
 *
 * @throws IOException if something went wrong.
 */
private void checkXrefOffsets() throws IOException
{
    Map<COSObjectKey, Long> xrefTable = xrefTrailerResolver.getXrefTable();
    if (validateXrefOffsets(xrefTable))
    {
        return;
    }
    Map<COSObjectKey, Long> bruteForceOffsets = getBruteForceParser().getBFCOSObjectOffsets();
    if (bruteForceOffsets.isEmpty())
    {
        // nothing to replace the table with, keep it as it is
        return;
    }
    LOG.debug("Replaced read xref table with the results of a brute force search");
    xrefTable.clear();
    xrefTable.putAll(bruteForceOffsets);
}
/**
 * Check if the given object can be found at the given offset. Returns the provided object key if everything is ok.
 * If the generation number found at the offset is greater than the expected one, a new object key using the found
 * generation number is returned (lenient mode only). If the object number differs, lenient mode continues with a
 * key using the found object number, non-lenient mode gives up.
 *
 * @param objectKey the key of object we are looking for
 * @param offset the offset where to look
 * @param xrefOffset a map with all known xref entries
 * @return the found/fixed object key, or null if the offset is smaller than MINIMUM_SEARCH_OFFSET or if no matching
 * object could be read at the given offset
 *
 * @throws IOException declared by the signature; IOExceptions raised while probing the offset are caught, logged
 * and lead to a null result
 */
private COSObjectKey findObjectKey(COSObjectKey objectKey, long offset,
Map<COSObjectKey, Long> xrefOffset) throws IOException
{
// there can't be any object at the very beginning of a pdf
if (offset < MINIMUM_SEARCH_OFFSET)
{
return null;
}
try
{
source.seek(offset);
skipWhiteSpaces();
// an unchanged position means that there wasn't any whitespace at the given offset
if (source.getPosition() == offset)
{
// ensure that at least one whitespace is skipped in front of the object number
source.seek(offset - 1);
if (source.getPosition() < offset)
{
if (!isDigit())
{
// anything else but a digit may be some garbage of the previous object -> just ignore it
// consuming that single byte puts the position back to the given offset
source.read();
}
else
{
// the byte in front of the offset is a digit, so the offset may point into the middle of a number.
// Step back until the first non digit is found and read the object and generation number from there
long current = source.getPosition();
source.seek(--current);
while (isDigit())
source.seek(--current);
long newObjNr = readObjectNumber();
int newGenNr = readGenerationNumber();
COSObjectKey newObjKey = new COSObjectKey(newObjNr, newGenNr);
Long existingOffset = xrefOffset.get(newObjKey);
// the found object number belongs to another uncompressed object at the same or nearby offset
// something has to be wrong
if (existingOffset != null && existingOffset > 0
&& Math.abs(offset - existingOffset) < 10)
{
LOG.debug("Found the object {} instead of {} at offset {} - ignoring",
newObjKey, objectKey, offset);
return null;
}
// something seems to be wrong but it's hard to determine what exactly -> simply continue
source.seek(offset);
}
}
}
// try to read the given object/generation number
long foundObjectNumber = readObjectNumber();
if (objectKey.getNumber() != foundObjectNumber)
{
LOG.warn("found wrong object number. expected [{}] found [{}]",
objectKey.getNumber(), foundObjectNumber);
if (!isLenient)
{
return null;
}
else
{
// continue with the object number which was really found, the generation number is checked below
objectKey = new COSObjectKey(foundObjectNumber, objectKey.getGeneration());
}
}
int genNumber = readGenerationNumber();
// finally try to read the object marker
readExpectedString(OBJ_MARKER, true);
if (genNumber == objectKey.getGeneration())
{
// the same instance is returned if neither the object number nor the generation number had to be fixed
return objectKey;
}
else if (isLenient && genNumber > objectKey.getGeneration())
{
// only a greater generation number is accepted as a fix
return new COSObjectKey(objectKey.getNumber(), genNumber);
}
}
catch (IOException exception)
{
// Swallow the exception, obviously there isn't any valid object number
LOG.debug("No valid object at given location {} - ignoring", offset, exception);
}
return null;
}
/**
 * Provides the brute force parser for the current source and document. The parser is created
 * on first use and the same instance is returned afterwards.
 *
 * @return the brute force parser
 * @throws IOException if something went wrong
 */
private BruteForceParser getBruteForceParser() throws IOException
{
    BruteForceParser parser = bruteForceParser;
    if (parser == null)
    {
        parser = new BruteForceParser(source, document);
        bruteForceParser = parser;
    }
    return parser;
}
/**
 * Ensure that the given root dictionary provides a pages dictionary. If the trailer was rebuilt,
 * the kids of the pages dictionary are checked first and those which can't be dereferenced are
 * removed.
 *
 * @param root the root dictionary of the pdf
 * @throws java.io.IOException if the root dictionary doesn't provide a pages dictionary
 */
protected void checkPages(COSDictionary root) throws IOException
{
    if (trailerWasRebuild)
    {
        // a rebuilt trailer may reference page objects which can't be dereferenced
        COSDictionary rebuiltPages = root.getCOSDictionary(COSName.PAGES);
        if (rebuiltPages != null)
        {
            checkPagesDictionary(rebuiltPages, new HashSet<>());
        }
    }
    boolean pageTreeRootMissing = root.getCOSDictionary(COSName.PAGES) == null;
    if (pageTreeRootMissing)
    {
        throw new IOException("Page tree root must be a dictionary");
    }
}
/**
 * Walk through the kids of the given pages dictionary, remove every kid which isn't usable and
 * set the COUNT entry of the dictionary to the number of pages found. Nested pages dictionaries
 * are processed recursively.
 *
 * @param pagesDict the pages dictionary to be checked
 * @param set the nested pages objects which were already visited, a kid found in this set is
 * removed instead of being processed again
 * @return the number of pages found in the given dictionary including all nested ones
 */
private int checkPagesDictionary(COSDictionary pagesDict, Set<COSObject> set)
{
    int pageCount = 0;
    COSArray kids = pagesDict.getCOSArray(COSName.KIDS);
    if (kids != null)
    {
        // NOTE(review): kids are removed from the array while looping over the result of toList(),
        // presumably toList() provides a copy - confirm
        for (COSBase kid : kids.toList())
        {
            boolean unvisitedReference = kid instanceof COSObject
                    && !set.contains((COSObject) kid);
            if (!unvisitedReference)
            {
                // neither direct objects nor already visited references are kept
                kids.remove(kid);
                continue;
            }
            COSObject kidObject = (COSObject) kid;
            COSBase dereferencedKid = kidObject.getObject();
            if (dereferencedKid == null || dereferencedKid.equals(COSNull.NULL))
            {
                // object wasn't dereferenced -> remove it
                LOG.warn("Removed null object {} from pages dictionary", kid);
                kids.remove(kid);
                continue;
            }
            if (!(dereferencedKid instanceof COSDictionary))
            {
                // anything else is neither counted nor removed
                continue;
            }
            COSDictionary kidDictionary = (COSDictionary) dereferencedKid;
            COSName kidType = kidDictionary.getCOSName(COSName.TYPE);
            if (COSName.PAGES.equals(kidType))
            {
                // remember the nested pages dictionary before descending into it
                set.add(kidObject);
                pageCount += checkPagesDictionary(kidDictionary, set);
            }
            else if (COSName.PAGE.equals(kidType))
            {
                pageCount++;
            }
        }
    }
    // fix counter
    pagesDict.setInt(COSName.COUNT, pageCount);
    return pageCount;
}
/**
 * This will parse the startxref section at the current position of the source. Nothing is read
 * if the STARTXREF marker can't be found there.
 *
 * @return the value following the marker, or -1 if the marker wasn't found
 * @throws IOException If an IO error occurs.
 */
private long parseStartXref() throws IOException
{
    if (!isString(STARTXREF))
    {
        return -1;
    }
    // consume the marker itself
    readString();
    skipSpaces();
    // the number following the marker is the value we are looking for
    return readLong();
}
/**
 * Checks if the given bytes can be found at the current offset. The position of the source is
 * reset to the current offset afterwards.
 *
 * @param string the bytes of the string to look for
 * @return true if the bytes are in place, false if not
 * @throws IOException if something went wrong
 */
private boolean isString(byte[] string) throws IOException
{
    boolean bytesMatching = true;
    long originOffset = source.getPosition();
    for (byte c : string)
    {
        // read() provides an unsigned value (or -1), so the signed byte has to be masked.
        // Otherwise values >= 0x80 would be sign extended and could never match
        if (source.read() != (c & 0xFF))
        {
            bytesMatching = false;
            break;
        }
    }
    source.seek(originOffset);
    return bytesMatching;
}
/**
 * Checks if the given characters can be found at the current offset. The position of the source
 * is reset to the current offset afterwards.
 *
 * @param string the characters of the string to look for
 * @return true if all characters are in place, false if not
 * @throws IOException if something went wrong
 */
protected boolean isString(char[] string) throws IOException
{
    long startPosition = source.getPosition();
    int matchingChars = 0;
    // stop reading at the first value which doesn't match
    while (matchingChars < string.length && source.read() == string[matchingChars])
    {
        matchingChars++;
    }
    source.seek(startPosition);
    return matchingChars == string.length;
}
/**
 * This will parse the trailer at the current position of the source and pass the trailer
 * dictionary to the xref trailer resolver.
 *
 * @return false if the keyword "trailer" can't be found, true if the trailer was parsed
 * @throws IOException If an IO error occurs.
 */
private boolean parseTrailer() throws IOException
{
    final String trailerKeyword = "trailer";
    long trailerOffset = source.getPosition();
    // PDFBOX-1739 skip extra xref entries in RegisSTAR documents
    if (isLenient)
    {
        for (int next = source.peek(); next != 't' && isDigit(next); next = source.peek())
        {
            // the position changes once a line was skipped, so this warns only the first time
            if (source.getPosition() == trailerOffset)
            {
                LOG.warn("Expected trailer object at offset {}, keep trying", trailerOffset);
            }
            readLine();
        }
    }
    if (source.peek() != 't')
    {
        return false;
    }
    long keywordOffset = source.getPosition();
    String keywordLine = readLine();
    if (!trailerKeyword.equals(keywordLine.trim()))
    {
        // in some cases the EOL is missing and the trailer immediately
        // continues with "<<" or with a blank character
        // even if this does not comply with PDF reference we want to support as many PDFs as possible
        // Acrobat reader can also deal with this.
        if (!keywordLine.startsWith(trailerKeyword))
        {
            return false;
        }
        // we can't just unread a portion of the read data as we don't know if the EOL consist
        // of 1 or 2 bytes, so jump back right after the keyword
        source.seek(keywordOffset + trailerKeyword.length());
    }
    // there may be blanks between the keyword and the dictionary
    skipSpaces();
    xrefTrailerResolver.setTrailer(parseCOSDictionary(true));
    skipSpaces();
    return true;
}
/**
 * Look for the header of a pdf and read its version, using the default pdf version as fallback.
 *
 * @return true if a PDF header was found
 * @throws IOException if something went wrong
 */
protected boolean parsePDFHeader() throws IOException
{
    boolean pdfHeaderFound = parseHeader(PDF_HEADER, PDF_DEFAULT_VERSION);
    return pdfHeaderFound;
}
/**
 * Look for the header of a fdf and read its version, using the default fdf version as fallback.
 *
 * @return true if a FDF header was found
 * @throws IOException if something went wrong
 */
protected boolean parseFDFHeader() throws IOException
{
    boolean fdfHeaderFound = parseHeader(FDF_HEADER, FDF_DEFAULT_VERSION);
    return fdfHeaderFound;
}
/**
 * Search for the given header marker, read the version following it and set it as version of
 * the document. The source is positioned at the very beginning afterwards, whether a header
 * was found or not.
 *
 * @param headerMarker the marker the header starts with
 * @param defaultVersion the version to be used if the marker isn't followed by a version
 * @return true if a header was found, false if not
 * @throws IOException if something went wrong or, in non-lenient mode, if the version can't be
 * determined
 */
private boolean parseHeader(String headerMarker, String defaultVersion) throws IOException
{
    // read first line
    String header = readLine();
    // some pdf-documents are broken and the pdf-version is in one of the following lines
    if (!header.contains(headerMarker))
    {
        header = readLine();
        while (!header.contains(headerMarker))
        {
            // if a line starts with a digit, it has to be the first one with data in it
            if ((!header.isEmpty()) && (Character.isDigit(header.charAt(0))))
            {
                break;
            }
            header = readLine();
        }
    }
    // nothing found
    if (!header.contains(headerMarker))
    {
        source.seek(0);
        return false;
    }
    //sometimes there is some garbage in the header before the header
    //actually starts, so lets try to find the header first.
    int headerStart = header.indexOf(headerMarker);
    // greater than zero because if it is zero then there is no point of trimming
    if (headerStart > 0)
    {
        //trim off any leading characters
        header = header.substring(headerStart);
    }
    // This is used if there is garbage after the header on the same line
    if (header.startsWith(headerMarker) && !header.matches(headerMarker + "\\d.\\d"))
    {
        if (header.length() < headerMarker.length() + 3)
        {
            // No version number at all, use the given default version
            header = headerMarker + defaultVersion;
            LOG.debug("No version found, set to {} as default.", defaultVersion);
        }
        else
        {
            // keep the three characters following the marker and unread everything else
            String headerGarbage = header.substring(headerMarker.length() + 3) + "\n";
            header = header.substring(0, headerMarker.length() + 3);
            source.rewind(headerGarbage.getBytes(StandardCharsets.ISO_8859_1).length);
        }
    }
    float headerVersion = -1;
    try
    {
        String[] headerParts = header.split("-");
        if (headerParts.length == 2)
        {
            headerVersion = Float.parseFloat(headerParts[1]);
        }
    }
    catch (NumberFormatException exception)
    {
        LOG.debug("Can't parse the header version.", exception);
    }
    // "NaN" is accepted by Float.parseFloat but it isn't a version, so it has to be treated
    // like any other value which can't be parsed
    if (headerVersion < 0 || Float.isNaN(headerVersion))
    {
        if (isLenient)
        {
            headerVersion = 1.7f;
        }
        else
        {
            throw new IOException("Error getting header version: " + header);
        }
    }
    document.setVersion(headerVersion);
    // rewind
    source.seek(0);
    return true;
}
/**
 * This will parse the xref table from the stream and add it to the state.
 * Entries marked as in use ("n") with an offset greater than 0 are passed to the xref trailer resolver,
 * entries marked as free ("f") are skipped.
 * @param startByteOffset the offset to start at
 * @return false if there isn't any xref table at the current position, if the table is empty or if the header of
 * a section is malformed, true otherwise
 * @throws IOException If an IO error occurs or if an entry is corrupt.
 */
protected boolean parseXrefTable(long startByteOffset) throws IOException
{
if(source.peek() != 'x')
{
return false;
}
String xref = readString();
if( !xref.trim().equals( "xref" ) )
{
return false;
}
// check for trailer after xref
// the string is read to have a look at it only, it is unread right away
String str = readString();
byte[] b = str.getBytes(StandardCharsets.ISO_8859_1);
source.rewind(b.length);
// signal start of new XRef
xrefTrailerResolver.nextXrefObj( startByteOffset, XRefType.TABLE );
if (str.startsWith("trailer"))
{
LOG.warn("skipping empty xref table");
return false;
}
// Xref tables can have multiple sections. Each starts with a starting object id and a count.
while(true)
{
String currentLine = readLine();
String[] splitString = StringUtil.splitOnSpace(currentLine);
if (splitString.length != 2)
{
LOG.warn("Unexpected XRefTable Entry: {}", currentLine);
return false;
}
// first obj id
long currObjID;
try
{
currObjID = Long.parseLong(splitString[0]);
}
catch (NumberFormatException exception)
{
LOG.warn("XRefTable: invalid ID for the first object: {}", currentLine);
return false;
}
// the number of objects in the xref table
int count = 0;
try
{
count = Integer.parseInt(splitString[1]);
}
catch (NumberFormatException exception)
{
LOG.warn("XRefTable: invalid number of objects: {}", currentLine);
return false;
}
skipSpaces();
for(int i = 0; i < count; i++)
{
// the section may provide less entries than announced, stop reading in such cases
if (source.isEOF() || isEndOfName(source.peek()))
{
break;
}
if(source.peek() == 't')
{
break;
}
// read the next entry of the current section
currentLine = readLine();
splitString = StringUtil.splitOnSpace(currentLine);
if (splitString.length < 3)
{
LOG.warn("invalid xref line: {}", currentLine);
break;
}
/* This supports the corrupt table as reported in
* PDFBOX-474 (XXXX XXX XX n) */
if(splitString[splitString.length-1].equals("n"))
{
try
{
long currOffset = Long.parseLong(splitString[0]);
// skip 0 offsets
if (currOffset > 0)
{
int currGenID = Integer.parseInt(splitString[1]);
COSObjectKey objKey = new COSObjectKey(currObjID, currGenID);
xrefTrailerResolver.setXRef(objKey, currOffset);
}
}
catch (IllegalArgumentException e)
{
// NumberFormatException is an IllegalArgumentException, so values which can't be parsed end up here as well
throw new IOException(e);
}
}
else if(!splitString[2].equals("f"))
{
throw new IOException("Corrupt XRefTable Entry - ObjID:" + currObjID);
}
// the object number is incremented for every entry read, whether it was stored or not
currObjID++;
skipSpaces();
}
skipSpaces();
// another section starts with a digit, anything else ends the table
if (!isDigit())
{
break;
}
}
return true;
}
/**
 * Provides the encryption dictionary of the parsed document. The document must be parsed before
 * this is called.
 *
 * @return The encryption dictionary of the document that was parsed.
 *
 * @throws IOException If there isn't any document yet.
 */
protected PDEncryption getEncryption() throws IOException
{
    if (document != null)
    {
        return encryption;
    }
    throw new IOException("You must parse the document first before calling getEncryption()");
}
/**
 * Provides the AccessPermission of the parsed document. The document must be parsed before this
 * is called.
 *
 * @return The access permission of document that was parsed.
 *
 * @throws IOException If there isn't any document yet.
 */
protected AccessPermission getAccessPermission() throws IOException
{
    if (document != null)
    {
        return accessPermission;
    }
    throw new IOException(
            "You must parse the document first before calling getAccessPermission()");
}
/**
 * Prepare for decryption. Nothing is done if the encryption was already set up or if the
 * document doesn't provide an encryption dictionary. The key store input stream, if any, is
 * closed once the preparation was attempted.
 *
 * @throws InvalidPasswordException If the password is incorrect.
 * @throws IOException if something went wrong
 */
protected void prepareDecryption() throws IOException
{
    if (encryption != null)
    {
        return;
    }
    COSDictionary encryptionDictionary = document.getEncryptionDictionary();
    if (encryptionDictionary == null)
    {
        return;
    }
    try
    {
        encryption = new PDEncryption(encryptionDictionary);
        DecryptionMaterial decryptionMaterial;
        if (keyStoreInputStream == null)
        {
            // no key store given, the password is all we have
            decryptionMaterial = new StandardDecryptionMaterial(password);
        }
        else
        {
            KeyStore keyStore = KeyStore.getInstance("PKCS12");
            keyStore.load(keyStoreInputStream, password.toCharArray());
            decryptionMaterial = new PublicKeyDecryptionMaterial(keyStore, keyAlias, password);
        }
        securityHandler = encryption.getSecurityHandler();
        securityHandler.prepareForDecryption(encryption, document.getDocumentID(),
                decryptionMaterial);
        accessPermission = securityHandler.getCurrentAccessPermission();
    }
    catch (GeneralSecurityException e)
    {
        // IOExceptions are passed through as they are, anything else is wrapped
        throw new IOException("Error (" + e.getClass().getSimpleName()
                + ") while creating security handler for decryption", e);
    }
    finally
    {
        if (keyStoreInputStream != null)
        {
            IOUtils.closeQuietly(keyStoreInputStream);
        }
    }
}
}