| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.pdfbox.pdfparser; |
| |
| import java.io.IOException; |
| import java.io.OutputStream; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Collection; |
| import java.util.Collections; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.Iterator; |
| import java.util.LinkedList; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Map.Entry; |
| import java.util.Queue; |
| import java.util.Set; |
| import java.util.TreeMap; |
| import java.util.Vector; |
| import org.apache.commons.logging.Log; |
| import org.apache.commons.logging.LogFactory; |
| import org.apache.pdfbox.cos.COSArray; |
| import org.apache.pdfbox.cos.COSBase; |
| import org.apache.pdfbox.cos.COSDictionary; |
| import org.apache.pdfbox.cos.COSDocument; |
| import org.apache.pdfbox.cos.COSName; |
| import org.apache.pdfbox.cos.COSNull; |
| import org.apache.pdfbox.cos.COSNumber; |
| import org.apache.pdfbox.cos.COSObject; |
| import org.apache.pdfbox.cos.COSObjectKey; |
| import org.apache.pdfbox.cos.COSStream; |
| import org.apache.pdfbox.io.RandomAccessRead; |
| import org.apache.pdfbox.pdfparser.XrefTrailerResolver.XRefType; |
| import org.apache.pdfbox.pdmodel.encryption.SecurityHandler; |
| |
| |
| import static org.apache.pdfbox.util.Charsets.ISO_8859_1; |
| |
| /** |
| * PDF-Parser which first reads startxref and xref tables in order to know valid objects and parse only these objects. |
| * |
| * First {@link PDFParser#parse()} or {@link FDFParser#parse()} must be called before page objects |
| * can be retrieved, e.g. {@link PDFParser#getPDDocument()}. |
| * |
| * This class is a much enhanced version of <code>QuickParser</code> presented in <a |
| * href="https://issues.apache.org/jira/browse/PDFBOX-1104">PDFBOX-1104</a> by Jeremy Villalobos. |
| */ |
| public class COSParser extends BaseParser |
| { |
| private static final String PDF_HEADER = "%PDF-"; |
| private static final String FDF_HEADER = "%FDF-"; |
| |
| private static final String PDF_DEFAULT_VERSION = "1.4"; |
| private static final String FDF_DEFAULT_VERSION = "1.0"; |
| |
| private static final char[] XREF_TABLE = new char[] { 'x', 'r', 'e', 'f' }; |
| private static final char[] XREF_STREAM = new char[] { '/', 'X', 'R', 'e', 'f' }; |
| private static final char[] STARTXREF = new char[] { 's','t','a','r','t','x','r','e','f' }; |
| |
| private static final byte[] ENDSTREAM = new byte[] { E, N, D, S, T, R, E, A, M }; |
| |
| private static final byte[] ENDOBJ = new byte[] { E, N, D, O, B, J }; |
| |
| private static final long MINIMUM_SEARCH_OFFSET = 6; |
| |
| private static final int X = 'x'; |
| |
| private static final int STRMBUFLEN = 2048; |
| private final byte[] strmBuf = new byte[ STRMBUFLEN ]; |
| |
| protected final RandomAccessRead source; |
| |
| /** |
| * Only parse the PDF file minimally allowing access to basic information. |
| */ |
| public static final String SYSPROP_PARSEMINIMAL = |
| "org.apache.pdfbox.pdfparser.nonSequentialPDFParser.parseMinimal"; |
| |
| /** |
| * The range within the %%EOF marker will be searched. |
| * Useful if there are additional characters after %%EOF within the PDF. |
| */ |
| public static final String SYSPROP_EOFLOOKUPRANGE = |
| "org.apache.pdfbox.pdfparser.nonSequentialPDFParser.eofLookupRange"; |
| |
| /** |
| * How many trailing bytes to read for EOF marker. |
| */ |
| private static final int DEFAULT_TRAIL_BYTECOUNT = 2048; |
| /** |
| * EOF-marker. |
| */ |
| protected static final char[] EOF_MARKER = new char[] { '%', '%', 'E', 'O', 'F' }; |
| /** |
| * obj-marker. |
| */ |
| protected static final char[] OBJ_MARKER = new char[] { 'o', 'b', 'j' }; |
| |
| private long trailerOffset; |
| |
| /** |
| * file length. |
| */ |
| protected long fileLen; |
| |
| /** |
| * is parser using auto healing capacity ? |
| */ |
| private boolean isLenient = true; |
| |
| protected boolean initialParseDone = false; |
| /** |
| * Contains all found objects of a brute force search. |
| */ |
| private Map<COSObjectKey, Long> bfSearchCOSObjectKeyOffsets = null; |
| private List<Long> bfSearchXRefTablesOffsets = null; |
| private List<Long> bfSearchXRefStreamsOffsets = null; |
| |
| /** |
| * The security handler. |
| */ |
| protected SecurityHandler securityHandler = null; |
| |
| /** |
| * how many trailing bytes to read for EOF marker. |
| */ |
| private int readTrailBytes = DEFAULT_TRAIL_BYTECOUNT; |
| |
| private static final Log LOG = LogFactory.getLog(COSParser.class); |
| |
| /** |
| * Collects all Xref/trailer objects and resolves them into single |
| * object using startxref reference. |
| */ |
| protected XrefTrailerResolver xrefTrailerResolver = new XrefTrailerResolver(); |
| |
| |
| /** |
| * The prefix for the temp file being used. |
| */ |
| public static final String TMP_FILE_PREFIX = "tmpPDF"; |
| |
| /** |
| * Default constructor. |
| */ |
| public COSParser(RandomAccessRead source) |
| { |
| super(new RandomAccessSource(source)); |
| this.source = source; |
| } |
| |
| /** |
| * Sets how many trailing bytes of PDF file are searched for EOF marker and 'startxref' marker. If not set we use |
| * default value {@link #DEFAULT_TRAIL_BYTECOUNT}. |
| * |
| * <p>We check that new value is at least 16. However for practical use cases this value should not be lower than |
| * 1000; even 2000 was found to not be enough in some cases where some trailing garbage like HTML snippets followed |
| * the EOF marker.</p> |
| * |
| * <p> |
| * In case system property {@link #SYSPROP_EOFLOOKUPRANGE} is defined this value will be set on initialization but |
| * can be overwritten later. |
| * </p> |
| * |
| * @param byteCount number of trailing bytes |
| */ |
| public void setEOFLookupRange(int byteCount) |
| { |
| if (byteCount > 15) |
| { |
| readTrailBytes = byteCount; |
| } |
| } |
| |
| /** |
| * Parses cross reference tables. |
| * |
| * @param startXRefOffset start offset of the first table |
| * @return the trailer dictionary |
| * @throws IOException if something went wrong |
| */ |
| protected COSDictionary parseXref(long startXRefOffset) throws IOException |
| { |
| source.seek(startXRefOffset); |
| long startXrefOffset = Math.max(0, parseStartXref()); |
| // check the startxref offset |
| long fixedOffset = checkXRefOffset(startXrefOffset); |
| if (fixedOffset > -1) |
| { |
| startXrefOffset = fixedOffset; |
| } |
| document.setStartXref(startXrefOffset); |
| long prev = startXrefOffset; |
| // ---- parse whole chain of xref tables/object streams using PREV reference |
| long lastPrev = -1; |
| while (prev > 0 && prev != lastPrev) |
| { |
| lastPrev = prev; |
| // seek to xref table |
| source.seek(prev); |
| |
| // skip white spaces |
| skipSpaces(); |
| // -- parse xref |
| if (source.peek() == X) |
| { |
| // xref table and trailer |
| // use existing parser to parse xref table |
| parseXrefTable(prev); |
| // parse the last trailer. |
| trailerOffset = source.getPosition(); |
| // PDFBOX-1739 skip extra xref entries in RegisSTAR documents |
| while (isLenient && source.peek() != 't') |
| { |
| if (source.getPosition() == trailerOffset) |
| { |
| // warn only the first time |
| LOG.warn("Expected trailer object at position " + trailerOffset |
| + ", keep trying"); |
| } |
| readLine(); |
| } |
| if (!parseTrailer()) |
| { |
| throw new IOException("Expected trailer object at position: " |
| + source.getPosition()); |
| } |
| COSDictionary trailer = xrefTrailerResolver.getCurrentTrailer(); |
| // check for a XRef stream, it may contain some object ids of compressed objects |
| if(trailer.containsKey(COSName.XREF_STM)) |
| { |
| int streamOffset = trailer.getInt(COSName.XREF_STM); |
| // check the xref stream reference |
| fixedOffset = checkXRefStreamOffset(streamOffset, false); |
| if (fixedOffset > -1 && fixedOffset != streamOffset) |
| { |
| streamOffset = (int)fixedOffset; |
| trailer.setInt(COSName.XREF_STM, streamOffset); |
| } |
| if (streamOffset > 0) |
| { |
| source.seek(streamOffset); |
| skipSpaces(); |
| parseXrefObjStream(prev, false); |
| } |
| else |
| { |
| if(isLenient) |
| { |
| LOG.error("Skipped XRef stream due to a corrupt offset:"+streamOffset); |
| } |
| else |
| { |
| throw new IOException("Skipped XRef stream due to a corrupt offset:"+streamOffset); |
| } |
| } |
| } |
| prev = trailer.getInt(COSName.PREV); |
| if (prev > 0) |
| { |
| // check the xref table reference |
| fixedOffset = checkXRefOffset(prev); |
| if (fixedOffset > -1 && fixedOffset != prev) |
| { |
| prev = fixedOffset; |
| trailer.setLong(COSName.PREV, prev); |
| } |
| } |
| } |
| else |
| { |
| // parse xref stream |
| prev = parseXrefObjStream(prev, true); |
| if (prev > 0) |
| { |
| // check the xref table reference |
| fixedOffset = checkXRefOffset(prev); |
| if (fixedOffset > -1 && fixedOffset != prev) |
| { |
| prev = fixedOffset; |
| COSDictionary trailer = xrefTrailerResolver.getCurrentTrailer(); |
| trailer.setLong(COSName.PREV, prev); |
| } |
| } |
| } |
| } |
| if (prev == lastPrev) |
| { |
| //TODO better idea needed? PDFBOX-3446 |
| throw new IOException("/Prev loop at offset " + prev); |
| } |
| // ---- build valid xrefs out of the xref chain |
| xrefTrailerResolver.setStartxref(startXrefOffset); |
| COSDictionary trailer = xrefTrailerResolver.getTrailer(); |
| document.setTrailer(trailer); |
| document.setIsXRefStream(XRefType.STREAM == xrefTrailerResolver.getXrefType()); |
| // check the offsets of all referenced objects |
| checkXrefOffsets(); |
| // copy xref table |
| document.addXRefTable(xrefTrailerResolver.getXrefTable()); |
| return trailer; |
| } |
| |
| /** |
| * Parses an xref object stream starting with indirect object id. |
| * |
| * @return value of PREV item in dictionary or <code>-1</code> if no such item exists |
| */ |
| private long parseXrefObjStream(long objByteOffset, boolean isStandalone) throws IOException |
| { |
| // ---- parse indirect object head |
| readObjectNumber(); |
| readGenerationNumber(); |
| readExpectedString(OBJ_MARKER, true); |
| |
| COSDictionary dict = parseCOSDictionary(); |
| COSStream xrefStream = parseCOSStream(dict); |
| parseXrefStream(xrefStream, objByteOffset, isStandalone); |
| xrefStream.close(); |
| |
| return dict.getLong(COSName.PREV); |
| } |
| |
| /** |
| * Looks for and parses startxref. We first look for last '%%EOF' marker (within last |
| * {@link #DEFAULT_TRAIL_BYTECOUNT} bytes (or range set via {@link #setEOFLookupRange(int)}) and go back to find |
| * <code>startxref</code>. |
| * |
| * @return the offset of StartXref |
| * @throws IOException If something went wrong. |
| */ |
| protected final long getStartxrefOffset() throws IOException |
| { |
| byte[] buf; |
| long skipBytes; |
| // read trailing bytes into buffer |
| try |
| { |
| final int trailByteCount = (fileLen < readTrailBytes) ? (int) fileLen : readTrailBytes; |
| buf = new byte[trailByteCount]; |
| skipBytes = fileLen - trailByteCount; |
| source.seek(skipBytes); |
| int off = 0; |
| int readBytes; |
| while (off < trailByteCount) |
| { |
| readBytes = source.read(buf, off, trailByteCount - off); |
| // in order to not get stuck in a loop we check readBytes (this should never happen) |
| if (readBytes < 1) |
| { |
| throw new IOException( |
| "No more bytes to read for trailing buffer, but expected: " |
| + (trailByteCount - off)); |
| } |
| off += readBytes; |
| } |
| } |
| finally |
| { |
| source.seek(0); |
| } |
| // find last '%%EOF' |
| int bufOff = lastIndexOf(EOF_MARKER, buf, buf.length); |
| if (bufOff < 0) |
| { |
| if (isLenient) |
| { |
| // in lenient mode the '%%EOF' isn't needed |
| bufOff = buf.length; |
| LOG.debug("Missing end of file marker '" + new String(EOF_MARKER) + "'"); |
| } |
| else |
| { |
| throw new IOException("Missing end of file marker '" + new String(EOF_MARKER) + "'"); |
| } |
| } |
| // find last startxref preceding EOF marker |
| bufOff = lastIndexOf(STARTXREF, buf, bufOff); |
| long startXRefOffset = skipBytes + bufOff; |
| |
| if (bufOff < 0) |
| { |
| if (isLenient) |
| { |
| LOG.debug("Can't find offset for startxref"); |
| return -1; |
| } |
| else |
| { |
| throw new IOException("Missing 'startxref' marker."); |
| } |
| } |
| return startXRefOffset; |
| } |
| |
| /** |
| * Searches last appearance of pattern within buffer. Lookup before _lastOff and goes back until 0. |
| * |
| * @param pattern pattern to search for |
| * @param buf buffer to search pattern in |
| * @param endOff offset (exclusive) where lookup starts at |
| * |
| * @return start offset of pattern within buffer or <code>-1</code> if pattern could not be found |
| */ |
| protected int lastIndexOf(final char[] pattern, final byte[] buf, final int endOff) |
| { |
| final int lastPatternChOff = pattern.length - 1; |
| |
| int bufOff = endOff; |
| int patOff = lastPatternChOff; |
| char lookupCh = pattern[patOff]; |
| |
| while (--bufOff >= 0) |
| { |
| if (buf[bufOff] == lookupCh) |
| { |
| if (--patOff < 0) |
| { |
| // whole pattern matched |
| return bufOff; |
| } |
| // matched current char, advance to preceding one |
| lookupCh = pattern[patOff]; |
| } |
| else if (patOff < lastPatternChOff) |
| { |
| // no char match but already matched some chars; reset |
| patOff = lastPatternChOff; |
| lookupCh = pattern[patOff]; |
| } |
| } |
| return -1; |
| } |
| |
| /** |
| * Return true if parser is lenient. Meaning auto healing capacity of the parser are used. |
| * |
| * @return true if parser is lenient |
| */ |
| public boolean isLenient() |
| { |
| return isLenient; |
| } |
| |
| /** |
| * Change the parser leniency flag. |
| * |
| * This method can only be called before the parsing of the file. |
| * |
| * @param lenient try to handle malformed PDFs. |
| * |
| */ |
| public void setLenient(boolean lenient) |
| { |
| if (initialParseDone) |
| { |
| throw new IllegalArgumentException("Cannot change leniency after parsing"); |
| } |
| this.isLenient = lenient; |
| } |
| |
| /** |
| * Creates a unique object id using object number and object generation |
| * number. (requires object number < 2^31)) |
| */ |
| private long getObjectId(final COSObject obj) |
| { |
| return obj.getObjectNumber() << 32 | obj.getGenerationNumber(); |
| } |
| |
| /** |
| * Adds all from newObjects to toBeParsedList if it is not an COSObject or |
| * we didn't add this COSObject already (checked via addedObjects). |
| */ |
| private void addNewToList(final Queue<COSBase> toBeParsedList, |
| final Collection<COSBase> newObjects, final Set<Long> addedObjects) |
| { |
| for (COSBase newObject : newObjects) |
| { |
| addNewToList(toBeParsedList, newObject, addedObjects); |
| } |
| } |
| |
| /** |
| * Adds newObject to toBeParsedList if it is not an COSObject or we didn't |
| * add this COSObject already (checked via addedObjects). |
| */ |
| private void addNewToList(final Queue<COSBase> toBeParsedList, final COSBase newObject, |
| final Set<Long> addedObjects) |
| { |
| if (newObject instanceof COSObject) |
| { |
| final long objId = getObjectId((COSObject) newObject); |
| if (!addedObjects.add(objId)) |
| { |
| return; |
| } |
| } |
| toBeParsedList.add(newObject); |
| } |
| |
| /** |
| * Will parse every object necessary to load a single page from the pdf document. We try our |
| * best to order objects according to offset in file before reading to minimize seek operations. |
| * |
| * @param dict the COSObject from the parent pages. |
| * @param excludeObjects dictionary object reference entries with these names will not be parsed |
| * |
| * @throws IOException if something went wrong |
| */ |
| protected void parseDictObjects(COSDictionary dict, COSName... excludeObjects) throws IOException |
| { |
| // ---- create queue for objects waiting for further parsing |
| final Queue<COSBase> toBeParsedList = new LinkedList<COSBase>(); |
| // offset ordered object map |
| final TreeMap<Long, List<COSObject>> objToBeParsed = new TreeMap<Long, List<COSObject>>(); |
| // in case of compressed objects offset points to stmObj |
| final Set<Long> parsedObjects = new HashSet<Long>(); |
| final Set<Long> addedObjects = new HashSet<Long>(); |
| |
| addExcludedToList(excludeObjects, dict, parsedObjects); |
| addNewToList(toBeParsedList, dict.getValues(), addedObjects); |
| |
| // ---- go through objects to be parsed |
| while (!(toBeParsedList.isEmpty() && objToBeParsed.isEmpty())) |
| { |
| // -- first get all COSObject from other kind of objects and |
| // put them in objToBeParsed; afterwards toBeParsedList is empty |
| COSBase baseObj; |
| while ((baseObj = toBeParsedList.poll()) != null) |
| { |
| if (baseObj instanceof COSDictionary) |
| { |
| addNewToList(toBeParsedList, ((COSDictionary) baseObj).getValues(), addedObjects); |
| } |
| else if (baseObj instanceof COSArray) |
| { |
| final Iterator<COSBase> arrIter = ((COSArray) baseObj).iterator(); |
| while (arrIter.hasNext()) |
| { |
| addNewToList(toBeParsedList, arrIter.next(), addedObjects); |
| } |
| } |
| else if (baseObj instanceof COSObject) |
| { |
| COSObject obj = (COSObject) baseObj; |
| long objId = getObjectId(obj); |
| COSObjectKey objKey = new COSObjectKey(obj.getObjectNumber(), obj.getGenerationNumber()); |
| |
| if (!parsedObjects.contains(objId)) |
| { |
| Long fileOffset = xrefTrailerResolver.getXrefTable().get(objKey); |
| // it is allowed that object references point to null, |
| // thus we have to test |
| if (fileOffset != null && fileOffset != 0) |
| { |
| if (fileOffset > 0) |
| { |
| objToBeParsed.put(fileOffset, Collections.singletonList(obj)); |
| } |
| else |
| { |
| // negative offset means we have a compressed |
| // object within object stream; |
| // get offset of object stream |
| fileOffset = xrefTrailerResolver.getXrefTable().get( |
| new COSObjectKey((int)-fileOffset, 0)); |
| if ((fileOffset == null) || (fileOffset <= 0)) |
| { |
| throw new IOException( |
| "Invalid object stream xref object reference for key '" + objKey + "': " |
| + fileOffset); |
| } |
| |
| List<COSObject> stmObjects = objToBeParsed.get(fileOffset); |
| if (stmObjects == null) |
| { |
| stmObjects = new ArrayList<COSObject>(); |
| objToBeParsed.put(fileOffset, stmObjects); |
| } |
| stmObjects.add(obj); |
| } |
| } |
| else |
| { |
| // NULL object |
| COSObject pdfObject = document.getObjectFromPool(objKey); |
| pdfObject.setObject(COSNull.NULL); |
| } |
| } |
| } |
| } |
| |
| // ---- read first COSObject with smallest offset |
| // resulting object will be added to toBeParsedList |
| if (objToBeParsed.isEmpty()) |
| { |
| break; |
| } |
| |
| for (COSObject obj : objToBeParsed.remove(objToBeParsed.firstKey())) |
| { |
| COSBase parsedObj = parseObjectDynamically(obj, false); |
| if (parsedObj != null) |
| { |
| obj.setObject(parsedObj); |
| addNewToList(toBeParsedList, parsedObj, addedObjects); |
| parsedObjects.add(getObjectId(obj)); |
| } |
| } |
| } |
| } |
| |
| // add objects not to be parsed to list of already parsed objects |
| private void addExcludedToList(COSName[] excludeObjects, COSDictionary dict, final Set<Long> parsedObjects) |
| { |
| if (excludeObjects != null) |
| { |
| for (COSName objName : excludeObjects) |
| { |
| COSBase baseObj = dict.getItem(objName); |
| if (baseObj instanceof COSObject) |
| { |
| parsedObjects.add(getObjectId((COSObject) baseObj)); |
| } |
| } |
| } |
| } |
| |
| /** |
| * This will parse the next object from the stream and add it to the local state. |
| * |
| * @param obj object to be parsed (we only take object number and generation number for lookup start offset) |
| * @param requireExistingNotCompressedObj if <code>true</code> object to be parsed must not be contained within |
| * compressed stream |
| * @return the parsed object (which is also added to document object) |
| * |
| * @throws IOException If an IO error occurs. |
| */ |
| protected final COSBase parseObjectDynamically(COSObject obj, |
| boolean requireExistingNotCompressedObj) throws IOException |
| { |
| return parseObjectDynamically(obj.getObjectNumber(), |
| obj.getGenerationNumber(), requireExistingNotCompressedObj); |
| } |
| |
| /** |
| * This will parse the next object from the stream and add it to the local state. |
| * It's reduced to parsing an indirect object. |
| * |
| * @param objNr object number of object to be parsed |
| * @param objGenNr object generation number of object to be parsed |
| * @param requireExistingNotCompressedObj if <code>true</code> the object to be parsed must be defined in xref |
| * (comment: null objects may be missing from xref) and it must not be a compressed object within object stream |
| * (this is used to circumvent being stuck in a loop in a malicious PDF) |
| * |
| * @return the parsed object (which is also added to document object) |
| * |
| * @throws IOException If an IO error occurs. |
| */ |
| protected COSBase parseObjectDynamically(long objNr, int objGenNr, |
| boolean requireExistingNotCompressedObj) throws IOException |
| { |
| // ---- create object key and get object (container) from pool |
| final COSObjectKey objKey = new COSObjectKey(objNr, objGenNr); |
| final COSObject pdfObject = document.getObjectFromPool(objKey); |
| |
| if (pdfObject.getObject() == null) |
| { |
| // not previously parsed |
| // ---- read offset or object stream object number from xref table |
| Long offsetOrObjstmObNr = xrefTrailerResolver.getXrefTable().get(objKey); |
| |
| // sanity test to circumvent loops with broken documents |
| if (requireExistingNotCompressedObj |
| && ((offsetOrObjstmObNr == null) || (offsetOrObjstmObNr <= 0))) |
| { |
| throw new IOException("Object must be defined and must not be compressed object: " |
| + objKey.getNumber() + ":" + objKey.getGeneration()); |
| } |
| |
| // maybe something is wrong with the xref table -> perform brute force search for all objects |
| if (offsetOrObjstmObNr == null && isLenient && bfSearchCOSObjectKeyOffsets == null) |
| { |
| bfSearchForObjects(); |
| if (bfSearchCOSObjectKeyOffsets != null && !bfSearchCOSObjectKeyOffsets.isEmpty()) |
| { |
| LOG.debug("Add all new read objects from brute force search to the xref table"); |
| Map<COSObjectKey, Long> xrefOffset = xrefTrailerResolver.getXrefTable(); |
| final Set<Map.Entry<COSObjectKey, Long>> entries = bfSearchCOSObjectKeyOffsets.entrySet(); |
| for (Entry<COSObjectKey, Long> entry : entries) |
| { |
| COSObjectKey key = entry.getKey(); |
| // add all missing objects to the xref table |
| if (!xrefOffset.containsKey(key)) |
| { |
| xrefOffset.put(key, entry.getValue()); |
| } |
| } |
| offsetOrObjstmObNr = xrefOffset.get(objKey); |
| } |
| } |
| |
| if (offsetOrObjstmObNr == null) |
| { |
| // not defined object -> NULL object (Spec. 1.7, chap. 3.2.9) |
| pdfObject.setObject(COSNull.NULL); |
| } |
| else if (offsetOrObjstmObNr > 0) |
| { |
| // offset of indirect object in file |
| parseFileObject(offsetOrObjstmObNr, objKey, pdfObject); |
| } |
| else |
| { |
| // xref value is object nr of object stream containing object to be parsed |
| // since our object was not found it means object stream was not parsed so far |
| parseObjectStream((int) -offsetOrObjstmObNr); |
| } |
| } |
| return pdfObject.getObject(); |
| } |
| |
| private void parseFileObject(Long offsetOrObjstmObNr, final COSObjectKey objKey, final COSObject pdfObject) throws IOException |
| { |
| // ---- go to object start |
| source.seek(offsetOrObjstmObNr); |
| |
| // ---- we must have an indirect object |
| final long readObjNr = readObjectNumber(); |
| final int readObjGen = readGenerationNumber(); |
| readExpectedString(OBJ_MARKER, true); |
| |
| // ---- consistency check |
| if ((readObjNr != objKey.getNumber()) || (readObjGen != objKey.getGeneration())) |
| { |
| throw new IOException("XREF for " + objKey.getNumber() + ":" |
| + objKey.getGeneration() + " points to wrong object: " + readObjNr |
| + ":" + readObjGen + " at offset " + offsetOrObjstmObNr); |
| } |
| |
| skipSpaces(); |
| COSBase pb = parseDirObject(); |
| String endObjectKey = readString(); |
| |
| if (endObjectKey.equals(STREAM_STRING)) |
| { |
| source.rewind(endObjectKey.getBytes(ISO_8859_1).length); |
| if (pb instanceof COSDictionary) |
| { |
| COSStream stream = parseCOSStream((COSDictionary) pb); |
| |
| if (securityHandler != null) |
| { |
| securityHandler.decryptStream(stream, objKey.getNumber(), objKey.getGeneration()); |
| } |
| pb = stream; |
| } |
| else |
| { |
| // this is not legal |
| // the combination of a dict and the stream/endstream |
| // forms a complete stream object |
| throw new IOException("Stream not preceded by dictionary (offset: " |
| + offsetOrObjstmObNr + ")."); |
| } |
| skipSpaces(); |
| endObjectKey = readLine(); |
| |
| // we have case with a second 'endstream' before endobj |
| if (!endObjectKey.startsWith(ENDOBJ_STRING) && endObjectKey.startsWith(ENDSTREAM_STRING)) |
| { |
| endObjectKey = endObjectKey.substring(9).trim(); |
| if (endObjectKey.length() == 0) |
| { |
| // no other characters in extra endstream line |
| // read next line |
| endObjectKey = readLine(); |
| } |
| } |
| } |
| else if (securityHandler != null) |
| { |
| securityHandler.decrypt(pb, objKey.getNumber(), objKey.getGeneration()); |
| } |
| |
| pdfObject.setObject(pb); |
| |
| if (!endObjectKey.startsWith(ENDOBJ_STRING)) |
| { |
| if (isLenient) |
| { |
| LOG.warn("Object (" + readObjNr + ":" + readObjGen + ") at offset " |
| + offsetOrObjstmObNr + " does not end with 'endobj' but with '" |
| + endObjectKey + "'"); |
| } |
| else |
| { |
| throw new IOException("Object (" + readObjNr + ":" + readObjGen |
| + ") at offset " + offsetOrObjstmObNr |
| + " does not end with 'endobj' but with '" + endObjectKey + "'"); |
| } |
| } |
| } |
| |
| private void parseObjectStream(int objstmObjNr) throws IOException |
| { |
| final COSBase objstmBaseObj = parseObjectDynamically(objstmObjNr, 0, true); |
| if (objstmBaseObj instanceof COSStream) |
| { |
| // parse object stream |
| PDFObjectStreamParser parser = new PDFObjectStreamParser((COSStream) objstmBaseObj, document); |
| try |
| { |
| parser.parse(); |
| } |
| catch(IOException exception) |
| { |
| if (isLenient) |
| { |
| LOG.debug("Stop reading object stream "+objstmObjNr+" due to an exception", exception); |
| // the error is handled in parseDictObjects |
| return; |
| } |
| else |
| { |
| throw exception; |
| } |
| } |
| // register all objects which are referenced to be contained in object stream |
| for (COSObject next : parser.getObjects()) |
| { |
| COSObjectKey stmObjKey = new COSObjectKey(next); |
| Long offset = xrefTrailerResolver.getXrefTable().get(stmObjKey); |
| if (offset != null && offset == -objstmObjNr) |
| { |
| COSObject stmObj = document.getObjectFromPool(stmObjKey); |
| stmObj.setObject(next.getObject()); |
| } |
| } |
| } |
| } |
| |
| /** |
| * Returns length value referred to or defined in given object. |
| */ |
| private COSNumber getLength(final COSBase lengthBaseObj, final COSName streamType) throws IOException |
| { |
| if (lengthBaseObj == null) |
| { |
| return null; |
| } |
| COSNumber retVal = null; |
| // maybe length was given directly |
| if (lengthBaseObj instanceof COSNumber) |
| { |
| retVal = (COSNumber) lengthBaseObj; |
| } |
| // length in referenced object |
| else if (lengthBaseObj instanceof COSObject) |
| { |
| COSObject lengthObj = (COSObject) lengthBaseObj; |
| if (lengthObj.getObject() == null) |
| { |
| // not read so far, keep current stream position |
| final long curFileOffset = source.getPosition(); |
| boolean isObjectStream = COSName.OBJ_STM.equals(streamType); |
| parseObjectDynamically(lengthObj, isObjectStream); |
| // reset current stream position |
| source.seek(curFileOffset); |
| if (lengthObj.getObject() == null) |
| { |
| throw new IOException("Length object content was not read."); |
| } |
| } |
| if (!(lengthObj.getObject() instanceof COSNumber)) |
| { |
| throw new IOException("Wrong type of referenced length object " + lengthObj |
| + ": " + lengthObj.getObject().getClass().getSimpleName()); |
| } |
| retVal = (COSNumber) lengthObj.getObject(); |
| } |
| else |
| { |
| throw new IOException("Wrong type of length object: " |
| + lengthBaseObj.getClass().getSimpleName()); |
| } |
| return retVal; |
| } |
| |
| private static final int STREAMCOPYBUFLEN = 8192; |
| private final byte[] streamCopyBuf = new byte[STREAMCOPYBUFLEN]; |
| |
| /** |
| * This will read a COSStream from the input stream using length attribute within dictionary. If |
| * length attribute is a indirect reference it is first resolved to get the stream length. This |
| * means we copy stream data without testing for 'endstream' or 'endobj' and thus it is no |
| * problem if these keywords occur within stream. We require 'endstream' to be found after |
| * stream data is read. |
| * |
| * @param dic dictionary that goes with this stream. |
| * |
| * @return parsed pdf stream. |
| * |
| * @throws IOException if an error occurred reading the stream, like problems with reading |
| * length attribute, stream does not end with 'endstream' after data read, stream too short etc. |
| */ |
| protected COSStream parseCOSStream(COSDictionary dic) throws IOException |
| { |
| COSStream stream = document.createCOSStream(dic); |
| |
| // read 'stream'; this was already tested in parseObjectsDynamically() |
| readString(); |
| |
| skipWhiteSpaces(); |
| |
| /* |
| * This needs to be dic.getItem because when we are parsing, the underlying object might still be null. |
| */ |
| COSNumber streamLengthObj = getLength(dic.getItem(COSName.LENGTH), dic.getCOSName(COSName.TYPE)); |
| if (streamLengthObj == null) |
| { |
| if (isLenient) |
| { |
| LOG.warn("The stream doesn't provide any stream length, using fallback readUntilEnd, at offset " |
| + source.getPosition()); |
| } |
| else |
| { |
| throw new IOException("Missing length for stream."); |
| } |
| } |
| |
| // get output stream to copy data to |
| if (streamLengthObj != null && validateStreamLength(streamLengthObj.longValue())) |
| { |
| OutputStream out = stream.createRawOutputStream(); |
| try |
| { |
| readValidStream(out, streamLengthObj); |
| } |
| finally |
| { |
| out.close(); |
| // restore original (possibly incorrect) length |
| stream.setItem(COSName.LENGTH, streamLengthObj); |
| } |
| } |
| else |
| { |
| OutputStream out = stream.createRawOutputStream(); |
| try |
| { |
| readUntilEndStream(new EndstreamOutputStream(out)); |
| } |
| finally |
| { |
| out.close(); |
| // restore original (possibly incorrect) length |
| if (streamLengthObj != null) |
| { |
| stream.setItem(COSName.LENGTH, streamLengthObj); |
| } |
| else |
| { |
| stream.removeItem(COSName.LENGTH); |
| } |
| } |
| } |
| String endStream = readString(); |
| if (endStream.equals("endobj") && isLenient) |
| { |
| LOG.warn("stream ends with 'endobj' instead of 'endstream' at offset " |
| + source.getPosition()); |
| // avoid follow-up warning about missing endobj |
| source.rewind(ENDOBJ.length); |
| } |
| else if (endStream.length() > 9 && isLenient && endStream.substring(0,9).equals(ENDSTREAM_STRING)) |
| { |
| LOG.warn("stream ends with '" + endStream + "' instead of 'endstream' at offset " |
| + source.getPosition()); |
| // unread the "extra" bytes |
| source.rewind(endStream.substring(9).getBytes(ISO_8859_1).length); |
| } |
| else if (!endStream.equals(ENDSTREAM_STRING)) |
| { |
| throw new IOException( |
| "Error reading stream, expected='endstream' actual='" |
| + endStream + "' at offset " + source.getPosition()); |
| } |
| |
| return stream; |
| } |
| |
| /** |
| * This method will read through the current stream object until |
| * we find the keyword "endstream" meaning we're at the end of this |
| * object. Some pdf files, however, forget to write some endstream tags |
| * and just close off objects with an "endobj" tag so we have to handle |
| * this case as well. |
| * |
| * This method is optimized using buffered IO and reduced number of |
| * byte compare operations. |
| * |
| * @param out stream we write out to. |
| * |
| * @throws IOException if something went wrong |
| */ |
| private void readUntilEndStream( final OutputStream out ) throws IOException |
| { |
| int bufSize; |
| int charMatchCount = 0; |
| byte[] keyw = ENDSTREAM; |
| |
| // last character position of shortest keyword ('endobj') |
| final int quickTestOffset = 5; |
| |
| // read next chunk into buffer; already matched chars are added to beginning of buffer |
| while ( ( bufSize = source.read( strmBuf, charMatchCount, STRMBUFLEN - charMatchCount ) ) > 0 ) |
| { |
| bufSize += charMatchCount; |
| |
| int bIdx = charMatchCount; |
| int quickTestIdx; |
| |
| // iterate over buffer, trying to find keyword match |
| for ( int maxQuicktestIdx = bufSize - quickTestOffset; bIdx < bufSize; bIdx++ ) |
| { |
| // reduce compare operations by first test last character we would have to |
| // match if current one matches; if it is not a character from keywords |
| // we can move behind the test character; this shortcut is inspired by the |
| // Boyer-Moore string search algorithm and can reduce parsing time by approx. 20% |
| quickTestIdx = bIdx + quickTestOffset; |
| if (charMatchCount == 0 && quickTestIdx < maxQuicktestIdx) |
| { |
| final byte ch = strmBuf[quickTestIdx]; |
| if ( ( ch > 't' ) || ( ch < 'a' ) ) |
| { |
| // last character we would have to match if current character would match |
| // is not a character from keywords -> jump behind and start over |
| bIdx = quickTestIdx; |
| continue; |
| } |
| } |
| |
| // could be negative - but we only compare to ASCII |
| final byte ch = strmBuf[bIdx]; |
| |
| if ( ch == keyw[ charMatchCount ] ) |
| { |
| if ( ++charMatchCount == keyw.length ) |
| { |
| // match found |
| bIdx++; |
| break; |
| } |
| } |
| else |
| { |
| if ( ( charMatchCount == 3 ) && ( ch == ENDOBJ[ charMatchCount ] ) ) |
| { |
| // maybe ENDSTREAM is missing but we could have ENDOBJ |
| keyw = ENDOBJ; |
| charMatchCount++; |
| } |
| else |
| { |
| // no match; incrementing match start by 1 would be dumb since we already know |
| // matched chars depending on current char read we may already have beginning |
| // of a new match: 'e': first char matched; 'n': if we are at match position |
| // idx 7 we already read 'e' thus 2 chars matched for each other char we have |
| // to start matching first keyword char beginning with next read position |
| charMatchCount = ( ch == E ) ? 1 : ( ( ch == N ) && ( charMatchCount == 7 ) ) ? 2 : 0; |
| // search again for 'endstream' |
| keyw = ENDSTREAM; |
| } |
| } |
| } |
| |
| int contentBytes = Math.max( 0, bIdx - charMatchCount ); |
| |
| // write buffer content until first matched char to output stream |
| if ( contentBytes > 0 ) |
| { |
| out.write( strmBuf, 0, contentBytes ); |
| } |
| if ( charMatchCount == keyw.length ) |
| { |
| // keyword matched; unread matched keyword (endstream/endobj) and following buffered content |
| source.rewind( bufSize - contentBytes ); |
| break; |
| } |
| else |
| { |
| // copy matched chars at start of buffer |
| System.arraycopy( keyw, 0, strmBuf, 0, charMatchCount ); |
| } |
| } |
| // this writes a lonely CR or drops trailing CR LF and LF |
| out.flush(); |
| } |
| |
| private void readValidStream(OutputStream out, COSNumber streamLengthObj) throws IOException |
| { |
| long remainBytes = streamLengthObj.longValue(); |
| while (remainBytes > 0) |
| { |
| final int chunk = (remainBytes > STREAMCOPYBUFLEN) ? STREAMCOPYBUFLEN : (int) remainBytes; |
| final int readBytes = source.read(streamCopyBuf, 0, chunk); |
| if (readBytes <= 0) |
| { |
| // shouldn't happen, the stream length has already been validated |
| throw new IOException("read error at offset " + source.getPosition() |
| + ": expected " + chunk + " bytes, but read() returns " + readBytes); |
| } |
| out.write(streamCopyBuf, 0, readBytes); |
| remainBytes -= readBytes; |
| } |
| } |
| |
| private boolean validateStreamLength(long streamLength) throws IOException |
| { |
| boolean streamLengthIsValid = true; |
| long originOffset = source.getPosition(); |
| long expectedEndOfStream = originOffset + streamLength; |
| if (expectedEndOfStream > fileLen) |
| { |
| streamLengthIsValid = false; |
| LOG.warn("The end of the stream is out of range, using workaround to read the stream, " |
| + "stream start position: " + originOffset + ", length: " + streamLength |
| + ", expected end position: " + expectedEndOfStream); |
| } |
| else |
| { |
| source.seek(expectedEndOfStream); |
| skipSpaces(); |
| if (!isString(ENDSTREAM)) |
| { |
| streamLengthIsValid = false; |
| LOG.warn("The end of the stream doesn't point to the correct offset, using workaround to read the stream, " |
| + "stream start position: " + originOffset + ", length: " + streamLength |
| + ", expected end position: " + expectedEndOfStream); |
| } |
| source.seek(originOffset); |
| } |
| return streamLengthIsValid; |
| } |
| |
| /** |
| * Check if the cross reference table/stream can be found at the current offset. |
| * |
| * @param startXRefOffset |
| * @return the revised offset |
| * @throws IOException |
| */ |
| private long checkXRefOffset(long startXRefOffset) throws IOException |
| { |
| // repair mode isn't available in non-lenient mode |
| if (!isLenient) |
| { |
| return startXRefOffset; |
| } |
| source.seek(startXRefOffset); |
| if (source.peek() == X && isString(XREF_TABLE)) |
| { |
| return startXRefOffset; |
| } |
| if (startXRefOffset > 0) |
| { |
| long fixedOffset = checkXRefStreamOffset(startXRefOffset, true); |
| if (fixedOffset > -1) |
| { |
| return fixedOffset; |
| } |
| } |
| // try to find a fixed offset |
| return calculateXRefFixedOffset(startXRefOffset, false); |
| } |
| |
| /** |
| * Check if the cross reference stream can be found at the current offset. |
| * |
| * @param startXRefOffset the expected start offset of the XRef stream |
| * @param checkOnly check only but don't repair the offset if set to true |
| * @return the revised offset |
| * @throws IOException if something went wrong |
| */ |
| private long checkXRefStreamOffset(long startXRefOffset, boolean checkOnly) throws IOException |
| { |
| // repair mode isn't available in non-lenient mode |
| if (!isLenient || startXRefOffset == 0) |
| { |
| return startXRefOffset; |
| } |
| // seek to offset-1 |
| source.seek(startXRefOffset-1); |
| int nextValue = source.read(); |
| // the first character has to be a whitespace, and then a digit |
| if (isWhitespace(nextValue)) |
| { |
| skipSpaces(); |
| if (isDigit()) |
| { |
| try |
| { |
| // it's a XRef stream |
| readObjectNumber(); |
| readGenerationNumber(); |
| readExpectedString(OBJ_MARKER, true); |
| // check the dictionary to avoid false positives |
| COSDictionary dict = parseCOSDictionary(); |
| source.seek(startXRefOffset); |
| if (dict != null && "XRef".equals(dict.getNameAsString(COSName.TYPE))) |
| { |
| return startXRefOffset; |
| } |
| } |
| catch (IOException exception) |
| { |
| // there wasn't an object of a xref stream |
| // try to repair the offset |
| source.seek(startXRefOffset); |
| } |
| } |
| } |
| // try to find a fixed offset |
| return checkOnly ? -1 : calculateXRefFixedOffset(startXRefOffset, true); |
| } |
| |
| /** |
| * Try to find a fixed offset for the given xref table/stream. |
| * |
| * @param objectOffset the given offset where to look at |
| * @param streamsOnly search for xref streams only |
| * @return the fixed offset |
| * |
| * @throws IOException if something went wrong |
| */ |
| private long calculateXRefFixedOffset(long objectOffset, boolean streamsOnly) throws IOException |
| { |
| if (objectOffset < 0) |
| { |
| LOG.error("Invalid object offset " + objectOffset + " when searching for a xref table/stream"); |
| return 0; |
| } |
| // start a brute force search for all xref tables and try to find the offset we are looking for |
| long newOffset = bfSearchForXRef(objectOffset, streamsOnly); |
| if (newOffset > -1) |
| { |
| LOG.debug("Fixed reference for xref table/stream " + objectOffset + " -> " + newOffset); |
| return newOffset; |
| } |
| LOG.error("Can't find the object xref table/stream at offset " + objectOffset); |
| return 0; |
| } |
| |
| /** |
| * Check the XRef table by dereferencing all objects and fixing the offset if necessary. |
| * |
| * @throws IOException if something went wrong. |
| */ |
| private void checkXrefOffsets() throws IOException |
| { |
| // repair mode isn't available in non-lenient mode |
| if (!isLenient) |
| { |
| return; |
| } |
| Map<COSObjectKey, Long> xrefOffset = xrefTrailerResolver.getXrefTable(); |
| if (xrefOffset != null) |
| { |
| boolean bruteForceSearch = false; |
| for (Entry<COSObjectKey, Long> objectEntry : xrefOffset.entrySet()) |
| { |
| COSObjectKey objectKey = objectEntry.getKey(); |
| Long objectOffset = objectEntry.getValue(); |
| // a negative offset number represents a object number itself |
| // see type 2 entry in xref stream |
| if (objectOffset != null && objectOffset >= 0 |
| && !checkObjectKeys(objectKey, objectOffset)) |
| { |
| LOG.debug("Stop checking xref offsets as at least one couldn't be dereferenced"); |
| bruteForceSearch = true; |
| break; |
| } |
| } |
| if (bruteForceSearch) |
| { |
| bfSearchForObjects(); |
| if (bfSearchCOSObjectKeyOffsets != null && !bfSearchCOSObjectKeyOffsets.isEmpty()) |
| { |
| List<COSObjectKey> objStreams = new ArrayList<COSObjectKey>(); |
| // find all object streams |
| for (COSObjectKey key : xrefOffset.keySet()) |
| { |
| Long offset = xrefOffset.get(key); |
| if (offset != null && offset < 0 ) |
| { |
| COSObjectKey objStream = new COSObjectKey(-offset, 0); |
| if (!objStreams.contains(objStream)) |
| { |
| objStreams.add(new COSObjectKey(-offset, 0)); |
| } |
| } |
| } |
| // remove all found object streams |
| for (COSObjectKey key : bfSearchCOSObjectKeyOffsets.keySet()) |
| { |
| objStreams.remove(key); |
| } |
| // remove all objects which are part of an object stream which wasn't found |
| for (COSObjectKey key : objStreams) |
| { |
| Set<Long> objects = xrefTrailerResolver.getContainedObjectNumbers((int)(key.getNumber())); |
| for (Long objNr :objects) |
| { |
| xrefOffset.remove(new COSObjectKey(objNr, 0)); |
| } |
| } |
| LOG.debug("Replaced read xref table with the results of a brute force search"); |
| xrefOffset.putAll(bfSearchCOSObjectKeyOffsets); |
| } |
| } |
| } |
| } |
| |
| /** |
| * Check if the given object can be found at the given offset. |
| * |
| * @param objectKey the object we are looking for |
| * @param offset the offset where to look |
| * @return returns true if the given object can be dereferenced at the given offset |
| * @throws IOException if something went wrong |
| */ |
| private boolean checkObjectKeys(COSObjectKey objectKey, long offset) throws IOException |
| { |
| // there can't be any object at the very beginning of a pdf |
| if (offset < MINIMUM_SEARCH_OFFSET) |
| { |
| return false; |
| } |
| long objectNr = objectKey.getNumber(); |
| int objectGen = objectKey.getGeneration(); |
| long originOffset = source.getPosition(); |
| source.seek(offset); |
| String objectString = createObjectString(objectNr, objectGen); |
| try |
| { |
| if (isString(objectString.getBytes(ISO_8859_1))) |
| { |
| // everything is ok, return origin object key |
| source.seek(originOffset); |
| return true; |
| } |
| } |
| catch (IOException exception) |
| { |
| // Swallow the exception, obviously there isn't any valid object number |
| } |
| finally |
| { |
| source.seek(originOffset); |
| } |
| // no valid object number found |
| return false; |
| } |
| /** |
| * Create a string for the given object id. |
| * |
| * @param objectID the object id |
| * @param genID the generation id |
| * @return the generated string |
| */ |
| private String createObjectString(long objectID, int genID) |
| { |
| return Long.toString(objectID) + " " + Integer.toString(genID) + " obj"; |
| } |
| |
| /** |
| * Brute force search for every object in the pdf. |
| * |
| * @throws IOException if something went wrong |
| */ |
| private void bfSearchForObjects() throws IOException |
| { |
| if (bfSearchCOSObjectKeyOffsets == null) |
| { |
| bfSearchCOSObjectKeyOffsets = new HashMap<COSObjectKey, Long>(); |
| long originOffset = source.getPosition(); |
| long currentOffset = MINIMUM_SEARCH_OFFSET; |
| String objString = " obj"; |
| char[] string = objString.toCharArray(); |
| do |
| { |
| source.seek(currentOffset); |
| if (isString(string)) |
| { |
| long tempOffset = currentOffset - 1; |
| source.seek(tempOffset); |
| int genID = source.peek(); |
| // is the next char a digit? |
| if (isDigit(genID)) |
| { |
| genID -= 48; |
| tempOffset--; |
| source.seek(tempOffset); |
| if (isSpace()) |
| { |
| while (tempOffset > MINIMUM_SEARCH_OFFSET && isSpace()) |
| { |
| source.seek(--tempOffset); |
| } |
| int length = 0; |
| while (tempOffset > MINIMUM_SEARCH_OFFSET && isDigit()) |
| { |
| source.seek(--tempOffset); |
| length++; |
| } |
| if (length > 0) |
| { |
| source.read(); |
| byte[] objIDBytes = source.readFully(length); |
| String objIdString = new String(objIDBytes, 0, |
| objIDBytes.length, ISO_8859_1); |
| Long objectID; |
| try |
| { |
| objectID = Long.valueOf(objIdString); |
| } |
| catch (NumberFormatException exception) |
| { |
| objectID = null; |
| } |
| if (objectID != null) |
| { |
| bfSearchCOSObjectKeyOffsets.put(new COSObjectKey(objectID, genID), tempOffset+1); |
| } |
| } |
| } |
| } |
| } |
| currentOffset++; |
| } |
| while (!source.isEOF()); |
| // reestablish origin position |
| source.seek(originOffset); |
| } |
| } |
| |
| /** |
| * Search for the offset of the given xref table/stream among those found by a brute force search. |
| * |
| * @param streamsOnly search for xref streams only |
| * @return the offset of the xref entry |
| * @throws IOException if something went wrong |
| */ |
| private long bfSearchForXRef(long xrefOffset, boolean streamsOnly) throws IOException |
| { |
| long newOffset = -1; |
| long newOffsetTable = -1; |
| long newOffsetStream = -1; |
| if (!streamsOnly) |
| { |
| bfSearchForXRefTables(); |
| } |
| bfSearchForXRefStreams(); |
| if (!streamsOnly && bfSearchXRefTablesOffsets != null) |
| { |
| // TODO to be optimized, this won't work in every case |
| newOffsetTable = searchNearestValue(bfSearchXRefTablesOffsets, xrefOffset); |
| } |
| if (bfSearchXRefStreamsOffsets != null) |
| { |
| // TODO to be optimized, this won't work in every case |
| newOffsetStream = searchNearestValue(bfSearchXRefStreamsOffsets, xrefOffset); |
| } |
| // choose the nearest value |
| if (newOffsetTable > -1 && newOffsetStream > -1) |
| { |
| long differenceTable = xrefOffset - newOffsetTable; |
| long differenceStream = xrefOffset - newOffsetStream; |
| if (Math.abs(differenceTable) > Math.abs(differenceStream)) |
| { |
| newOffset = newOffsetStream; |
| bfSearchXRefStreamsOffsets.remove(newOffsetStream); |
| } |
| else |
| { |
| newOffset = newOffsetTable; |
| bfSearchXRefTablesOffsets.remove(newOffsetTable); |
| } |
| } |
| else if (newOffsetTable > -1) |
| { |
| newOffset = newOffsetTable; |
| bfSearchXRefTablesOffsets.remove(newOffsetTable); |
| } |
| else if (newOffsetStream > -1) |
| { |
| newOffset = newOffsetStream; |
| bfSearchXRefStreamsOffsets.remove(newOffsetStream); |
| } |
| return newOffset; |
| } |
| |
| private long searchNearestValue(List<Long> values, long offset) |
| { |
| long newValue = -1; |
| long currentDifference = -1; |
| int currentOffsetIndex = -1; |
| int numberOfOffsets = values.size(); |
| // find the nearest value |
| for (int i = 0; i < numberOfOffsets; i++) |
| { |
| long newDifference = offset - values.get(i); |
| // find the nearest offset |
| if (currentDifference == -1 |
| || (Math.abs(currentDifference) > Math.abs(newDifference))) |
| { |
| currentDifference = newDifference; |
| currentOffsetIndex = i; |
| } |
| } |
| if (currentOffsetIndex > -1) |
| { |
| newValue = values.get(currentOffsetIndex); |
| } |
| return newValue; |
| } |
| /** |
| * Brute force search for all xref entries (tables). |
| * |
| * @throws IOException if something went wrong |
| */ |
| private void bfSearchForXRefTables() throws IOException |
| { |
| if (bfSearchXRefTablesOffsets == null) |
| { |
| // a pdf may contain more than one xref entry |
| bfSearchXRefTablesOffsets = new Vector<Long>(); |
| long originOffset = source.getPosition(); |
| source.seek(MINIMUM_SEARCH_OFFSET); |
| // search for xref tables |
| while (!source.isEOF()) |
| { |
| if (isString(XREF_TABLE)) |
| { |
| long newOffset = source.getPosition(); |
| source.seek(newOffset - 1); |
| // ensure that we don't read "startxref" instead of "xref" |
| if (isWhitespace()) |
| { |
| bfSearchXRefTablesOffsets.add(newOffset); |
| } |
| source.seek(newOffset + 4); |
| } |
| source.read(); |
| } |
| source.seek(originOffset); |
| } |
| } |
| |
| /** |
| * Brute force search for all /XRef entries (streams). |
| * |
| * @throws IOException if something went wrong |
| */ |
| private void bfSearchForXRefStreams() throws IOException |
| { |
| if (bfSearchXRefStreamsOffsets == null) |
| { |
| // a pdf may contain more than one /XRef entry |
| bfSearchXRefStreamsOffsets = new Vector<Long>(); |
| long originOffset = source.getPosition(); |
| source.seek(MINIMUM_SEARCH_OFFSET); |
| // search for XRef streams |
| String objString = " obj"; |
| char[] string = objString.toCharArray(); |
| while (!source.isEOF()) |
| { |
| if (isString(XREF_STREAM)) |
| { |
| // search backwards for the beginning of the stream |
| long newOffset = -1; |
| long xrefOffset = source.getPosition(); |
| boolean objFound = false; |
| for (int i = 1; i < 30 && !objFound; i++) |
| { |
| long currentOffset = xrefOffset - (i * 10); |
| if (currentOffset > 0) |
| { |
| source.seek(currentOffset); |
| for (int j = 0; j < 10; j++) |
| { |
| if (isString(string)) |
| { |
| long tempOffset = currentOffset - 1; |
| source.seek(tempOffset); |
| int genID = source.peek(); |
| // is the next char a digit? |
| if (isDigit(genID)) |
| { |
| genID -= 48; |
| tempOffset--; |
| source.seek(tempOffset); |
| if (isSpace()) |
| { |
| int length = 0; |
| source.seek(--tempOffset); |
| while (tempOffset > MINIMUM_SEARCH_OFFSET && isDigit()) |
| { |
| source.seek(--tempOffset); |
| length++; |
| } |
| if (length > 0) |
| { |
| source.read(); |
| newOffset = source.getPosition(); |
| } |
| } |
| } |
| LOG.debug("Fixed reference for xref stream " + xrefOffset |
| + " -> " + newOffset); |
| objFound = true; |
| break; |
| } |
| else |
| { |
| currentOffset++; |
| source.read(); |
| } |
| } |
| } |
| } |
| if (newOffset > -1) |
| { |
| bfSearchXRefStreamsOffsets.add(newOffset); |
| } |
| source.seek(xrefOffset + 5); |
| } |
| source.read(); |
| } |
| source.seek(originOffset); |
| } |
| } |
| |
| /** |
| * Rebuild the trailer dictionary if startxref can't be found. |
| * |
| * @return the rebuild trailer dictionary |
| * |
| * @throws IOException if something went wrong |
| */ |
| protected final COSDictionary rebuildTrailer() throws IOException |
| { |
| COSDictionary trailer = null; |
| bfSearchForObjects(); |
| if (bfSearchCOSObjectKeyOffsets != null) |
| { |
| xrefTrailerResolver.nextXrefObj( 0, XRefType.TABLE ); |
| for (Entry<COSObjectKey, Long> entry : bfSearchCOSObjectKeyOffsets.entrySet()) |
| { |
| xrefTrailerResolver.setXRef(entry.getKey(), entry.getValue()); |
| } |
| xrefTrailerResolver.setStartxref(0); |
| trailer = xrefTrailerResolver.getTrailer(); |
| getDocument().setTrailer(trailer); |
| // search for the different parts of the trailer dictionary |
| for(Entry<COSObjectKey, Long> entry : bfSearchCOSObjectKeyOffsets.entrySet()) |
| { |
| Long offset = entry.getValue(); |
| source.seek(offset); |
| readObjectNumber(); |
| readGenerationNumber(); |
| readExpectedString(OBJ_MARKER, true); |
| try |
| { |
| COSDictionary dictionary = parseCOSDictionary(); |
| if (dictionary != null) |
| { |
| // document catalog |
| if (COSName.CATALOG.equals(dictionary.getCOSName(COSName.TYPE))) |
| { |
| trailer.setItem(COSName.ROOT, document.getObjectFromPool(entry.getKey())); |
| } |
| // info dictionary |
| else if (dictionary.containsKey(COSName.MOD_DATE) && |
| (dictionary.containsKey(COSName.TITLE) |
| || dictionary.containsKey(COSName.AUTHOR) |
| || dictionary.containsKey(COSName.SUBJECT) |
| || dictionary.containsKey(COSName.KEYWORDS) |
| || dictionary.containsKey(COSName.CREATOR) |
| || dictionary.containsKey(COSName.PRODUCER) |
| || dictionary.containsKey(COSName.CREATION_DATE))) |
| { |
| trailer.setItem(COSName.INFO, document.getObjectFromPool(entry.getKey())); |
| } |
| // TODO encryption dictionary |
| } |
| } |
| catch(IOException exception) |
| { |
| LOG.debug("Skipped object " + entry.getKey() + ", either it's corrupt or not a dictionary"); |
| } |
| } |
| } |
| return trailer; |
| } |
| |
| /** |
| * This will parse the startxref section from the stream. |
| * The startxref value is ignored. |
| * |
| * @return the startxref value or -1 on parsing error |
| * @throws IOException If an IO error occurs. |
| */ |
| private long parseStartXref() throws IOException |
| { |
| long startXref = -1; |
| if (isString(STARTXREF)) |
| { |
| readString(); |
| skipSpaces(); |
| // This integer is the byte offset of the first object referenced by the xref or xref stream |
| startXref = readLong(); |
| } |
| return startXref; |
| } |
| |
| /** |
| * Checks if the given string can be found at the current offset. |
| * |
| * @param string the bytes of the string to look for |
| * @return true if the bytes are in place, false if not |
| * @throws IOException if something went wrong |
| */ |
| private boolean isString(byte[] string) throws IOException |
| { |
| boolean bytesMatching = false; |
| if (source.peek() == string[0]) |
| { |
| int length = string.length; |
| byte[] bytesRead = new byte[length]; |
| int numberOfBytes = source.read(bytesRead, 0, length); |
| while (numberOfBytes < length) |
| { |
| int readMore = source.read(bytesRead, numberOfBytes, length - numberOfBytes); |
| if (readMore < 0) |
| { |
| break; |
| } |
| numberOfBytes += readMore; |
| } |
| if (Arrays.equals(string, bytesRead)) |
| { |
| bytesMatching = true; |
| } |
| source.rewind(numberOfBytes); |
| } |
| return bytesMatching; |
| } |
| |
| /** |
| * Checks if the given string can be found at the current offset. |
| * |
| * @param string the bytes of the string to look for |
| * @return true if the bytes are in place, false if not |
| * @throws IOException if something went wrong |
| */ |
| private boolean isString(char[] string) throws IOException |
| { |
| boolean bytesMatching = true; |
| long originOffset = source.getPosition(); |
| for (char c : string) |
| { |
| if (source.read() != c) |
| { |
| bytesMatching = false; |
| } |
| } |
| source.seek(originOffset); |
| return bytesMatching; |
| } |
| |
| /** |
| * This will parse the trailer from the stream and add it to the state. |
| * |
| * @return false on parsing error |
| * @throws IOException If an IO error occurs. |
| */ |
| private boolean parseTrailer() throws IOException |
| { |
| if(source.peek() != 't') |
| { |
| return false; |
| } |
| //read "trailer" |
| long currentOffset = source.getPosition(); |
| String nextLine = readLine(); |
| if( !nextLine.trim().equals( "trailer" ) ) |
| { |
| // in some cases the EOL is missing and the trailer immediately |
| // continues with "<<" or with a blank character |
| // even if this does not comply with PDF reference we want to support as many PDFs as possible |
| // Acrobat reader can also deal with this. |
| if (nextLine.startsWith("trailer")) |
| { |
| // we can't just unread a portion of the read data as we don't know if the EOL consist of 1 or 2 bytes |
| int len = "trailer".length(); |
| // jump back right after "trailer" |
| source.seek(currentOffset + len); |
| } |
| else |
| { |
| return false; |
| } |
| } |
| |
| // in some cases the EOL is missing and the trailer continues with " <<" |
| // even if this does not comply with PDF reference we want to support as many PDFs as possible |
| // Acrobat reader can also deal with this. |
| skipSpaces(); |
| |
| COSDictionary parsedTrailer = parseCOSDictionary(); |
| xrefTrailerResolver.setTrailer( parsedTrailer ); |
| |
| skipSpaces(); |
| return true; |
| } |
| |
| /** |
| * Parse the header of a pdf. |
| * |
| * @return true if a PDF header was found |
| * @throws IOException if something went wrong |
| */ |
| protected boolean parsePDFHeader() throws IOException |
| { |
| return parseHeader(PDF_HEADER, PDF_DEFAULT_VERSION); |
| } |
| |
| /** |
| * Parse the header of a fdf. |
| * |
| * @return true if a FDF header was found |
| * @throws IOException if something went wrong |
| */ |
| protected boolean parseFDFHeader() throws IOException |
| { |
| return parseHeader(FDF_HEADER, FDF_DEFAULT_VERSION); |
| } |
| |
| private boolean parseHeader(String headerMarker, String defaultVersion) throws IOException |
| { |
| // read first line |
| String header = readLine(); |
| // some pdf-documents are broken and the pdf-version is in one of the following lines |
| if (!header.contains(headerMarker)) |
| { |
| header = readLine(); |
| while (!header.contains(headerMarker)) |
| { |
| // if a line starts with a digit, it has to be the first one with data in it |
| if ((header.length() > 0) && (Character.isDigit(header.charAt(0)))) |
| { |
| break; |
| } |
| header = readLine(); |
| } |
| } |
| |
| // nothing found |
| if (!header.contains(headerMarker)) |
| { |
| source.seek(0); |
| return false; |
| } |
| |
| //sometimes there is some garbage in the header before the header |
| //actually starts, so lets try to find the header first. |
| int headerStart = header.indexOf( headerMarker ); |
| |
| // greater than zero because if it is zero then there is no point of trimming |
| if ( headerStart > 0 ) |
| { |
| //trim off any leading characters |
| header = header.substring( headerStart, header.length() ); |
| } |
| |
| // This is used if there is garbage after the header on the same line |
| if (header.startsWith(headerMarker) && !header.matches(headerMarker + "\\d.\\d")) |
| { |
| if (header.length() < headerMarker.length() + 3) |
| { |
| // No version number at all, set to 1.4 as default |
| header = headerMarker + defaultVersion; |
| LOG.debug("No version found, set to " + defaultVersion + " as default."); |
| } |
| else |
| { |
| String headerGarbage = header.substring(headerMarker.length() + 3, header.length()) + "\n"; |
| header = header.substring(0, headerMarker.length() + 3); |
| source.rewind(headerGarbage.getBytes(ISO_8859_1).length); |
| } |
| } |
| float headerVersion = -1; |
| try |
| { |
| String[] headerParts = header.split("-"); |
| if (headerParts.length == 2) |
| { |
| headerVersion = Float.parseFloat(headerParts[1]); |
| } |
| } |
| catch (NumberFormatException exception) |
| { |
| LOG.debug("Can't parse the header version.", exception); |
| } |
| if (headerVersion < 0) |
| { |
| throw new IOException( "Error getting header version: " + header); |
| } |
| document.setVersion(headerVersion); |
| // rewind |
| source.seek(0); |
| return true; |
| } |
| |
| /** |
| * This will parse the xref table from the stream and add it to the state |
| * The XrefTable contents are ignored. |
| * @param startByteOffset the offset to start at |
| * @return false on parsing error |
| * @throws IOException If an IO error occurs. |
| */ |
| protected boolean parseXrefTable(long startByteOffset) throws IOException |
| { |
| if(source.peek() != 'x') |
| { |
| return false; |
| } |
| String xref = readString(); |
| if( !xref.trim().equals( "xref" ) ) |
| { |
| return false; |
| } |
| |
| // check for trailer after xref |
| String str = readString(); |
| byte[] b = str.getBytes(ISO_8859_1); |
| source.rewind(b.length); |
| |
| // signal start of new XRef |
| xrefTrailerResolver.nextXrefObj( startByteOffset, XRefType.TABLE ); |
| |
| if (str.startsWith("trailer")) |
| { |
| LOG.warn("skipping empty xref table"); |
| return false; |
| } |
| |
| // Xref tables can have multiple sections. Each starts with a starting object id and a count. |
| while(true) |
| { |
| // first obj id |
| long currObjID = readObjectNumber(); |
| |
| // the number of objects in the xref table |
| long count = readLong(); |
| |
| skipSpaces(); |
| for(int i = 0; i < count; i++) |
| { |
| if(source.isEOF() || isEndOfName((char)source.peek())) |
| { |
| break; |
| } |
| if(source.peek() == 't') |
| { |
| break; |
| } |
| //Ignore table contents |
| String currentLine = readLine(); |
| String[] splitString = currentLine.split("\\s"); |
| if (splitString.length < 3) |
| { |
| LOG.warn("invalid xref line: " + currentLine); |
| break; |
| } |
| /* This supports the corrupt table as reported in |
| * PDFBOX-474 (XXXX XXX XX n) */ |
| if(splitString[splitString.length-1].equals("n")) |
| { |
| try |
| { |
| long currOffset = Long.parseLong(splitString[0]); |
| int currGenID = Integer.parseInt(splitString[1]); |
| COSObjectKey objKey = new COSObjectKey(currObjID, currGenID); |
| xrefTrailerResolver.setXRef(objKey, currOffset); |
| } |
| catch(NumberFormatException e) |
| { |
| throw new IOException(e); |
| } |
| } |
| else if(!splitString[2].equals("f")) |
| { |
| throw new IOException("Corrupt XRefTable Entry - ObjID:" + currObjID); |
| } |
| currObjID++; |
| skipSpaces(); |
| } |
| skipSpaces(); |
| if (!isDigit()) |
| { |
| break; |
| } |
| } |
| return true; |
| } |
| |
| /** |
| * Fills XRefTrailerResolver with data of given stream. |
| * Stream must be of type XRef. |
| * @param stream the stream to be read |
| * @param objByteOffset the offset to start at |
| * @param isStandalone should be set to true if the stream is not part of a hybrid xref table |
| * @throws IOException if there is an error parsing the stream |
| */ |
| private void parseXrefStream(COSStream stream, long objByteOffset, boolean isStandalone) throws IOException |
| { |
| // the cross reference stream of a hybrid xref table will be added to the existing one |
| // and we must not override the offset and the trailer |
| if ( isStandalone ) |
| { |
| xrefTrailerResolver.nextXrefObj( objByteOffset, XRefType.STREAM ); |
| xrefTrailerResolver.setTrailer( stream ); |
| } |
| PDFXrefStreamParser parser = new PDFXrefStreamParser( stream, document, xrefTrailerResolver ); |
| parser.parse(); |
| } |
| |
| /** |
| * This will get the document that was parsed. parse() must be called before this is called. |
| * When you are done with this document you must call close() on it to release |
| * resources. |
| * |
| * @return The document that was parsed. |
| * |
| * @throws IOException If there is an error getting the document. |
| */ |
| public COSDocument getDocument() throws IOException |
| { |
| if( document == null ) |
| { |
| throw new IOException( "You must call parse() before calling getDocument()" ); |
| } |
| return document; |
| } |
| |
| /** |
| * Parse the values of the trailer dictionary and return the root object. |
| * |
| * @param trailer The trailer dictionary. |
| * @return The parsed root object. |
| * @throws IOException If an IO error occurs or if the root object is |
| * missing in the trailer dictionary. |
| */ |
| protected COSBase parseTrailerValuesDynamically(COSDictionary trailer) throws IOException |
| { |
| // PDFBOX-1557 - ensure that all COSObject are loaded in the trailer |
| // PDFBOX-1606 - after securityHandler has been instantiated |
| for (COSBase trailerEntry : trailer.getValues()) |
| { |
| if (trailerEntry instanceof COSObject) |
| { |
| COSObject tmpObj = (COSObject) trailerEntry; |
| parseObjectDynamically(tmpObj, false); |
| } |
| } |
| // parse catalog or root object |
| COSObject root = (COSObject) trailer.getItem(COSName.ROOT); |
| if (root == null) |
| { |
| throw new IOException("Missing root object specification in trailer."); |
| } |
| return parseObjectDynamically(root, false); |
| } |
| |
| } |