pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java - pdfbox - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.pdfbox.pdfparser;

 import java.io.IOException;
 import java.io.OutputStream;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
 import java.util.Queue;
 import java.util.Set;
 import java.util.TreeMap;
 import java.util.Vector;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.pdfbox.cos.COSArray;
 import org.apache.pdfbox.cos.COSBase;
 import org.apache.pdfbox.cos.COSDictionary;
 import org.apache.pdfbox.cos.COSDocument;
 import org.apache.pdfbox.cos.COSName;
 import org.apache.pdfbox.cos.COSNull;
 import org.apache.pdfbox.cos.COSNumber;
 import org.apache.pdfbox.cos.COSObject;
 import org.apache.pdfbox.cos.COSObjectKey;
 import org.apache.pdfbox.cos.COSStream;
 import org.apache.pdfbox.io.RandomAccessRead;
 import org.apache.pdfbox.pdfparser.XrefTrailerResolver.XRefType;
 import org.apache.pdfbox.pdmodel.encryption.SecurityHandler;


 import static org.apache.pdfbox.util.Charsets.ISO_8859_1;

 /**
  * PDF-Parser which first reads startxref and xref tables in order to know valid objects and parse only these objects.
  *
  * First {@link PDFParser#parse()} or  {@link FDFParser#parse()} must be called before page objects
  * can be retrieved, e.g. {@link PDFParser#getPDDocument()}.
  *
  * This class is a much enhanced version of <code>QuickParser</code> presented in <a
  * href="https://issues.apache.org/jira/browse/PDFBOX-1104">PDFBOX-1104</a> by Jeremy Villalobos.
  */
 public class COSParser extends BaseParser
 {
     // Header prefixes for PDF and FDF files (not referenced in this part of the file;
     // presumably matched against the first line of the input - confirm at the use site).
     private static final String PDF_HEADER = "%PDF-";
     private static final String FDF_HEADER = "%FDF-";

     // Version strings, presumably used as fallback when the header carries none - TODO confirm.
     private static final String PDF_DEFAULT_VERSION = "1.4";
     private static final String FDF_DEFAULT_VERSION = "1.0";

     // Keyword patterns as char arrays; STARTXREF is searched for in the trailing bytes of the
     // file by getStartxrefOffset(), the other two are not referenced in this part of the file.
     private static final char[] XREF_TABLE = new char[] { 'x', 'r', 'e', 'f' };
     private static final char[] XREF_STREAM = new char[] { '/', 'X', 'R', 'e', 'f' };
     private static final char[] STARTXREF = new char[] { 's','t','a','r','t','x','r','e','f' };

     // Byte pattern spelling "ENDSTREAM"; E, N, D, ... are constants declared outside this
     // part of the file (presumably inherited from BaseParser - confirm).
     private static final byte[] ENDSTREAM = new byte[] { E, N, D, S, T, R, E, A, M };

     // Byte pattern spelling "ENDOBJ", built from the same externally declared constants.
     private static final byte[] ENDOBJ = new byte[] { E, N, D, O, B, J };

     // NOTE(review): not referenced in this part of the file; presumably a lower bound for
     // offset searches - confirm against its use site.
     private static final long MINIMUM_SEARCH_OFFSET = 6;

     // First character of the 'xref' keyword; parseXref() peeks for it to tell a classic
     // xref table apart from an xref stream.
     private static final int X = 'x';

     // Size of and storage for a scratch buffer (not referenced in this part of the file;
     // presumably used while reading stream data - TODO confirm).
     private static final int STRMBUFLEN = 2048;
     private final byte[] strmBuf    = new byte[ STRMBUFLEN ];

     // The random access input this parser seeks in and reads from; set once in the constructor.
     protected final RandomAccessRead source;

     /**
      * Only parse the PDF file minimally allowing access to basic information.
      */
     public static final String SYSPROP_PARSEMINIMAL =
             "org.apache.pdfbox.pdfparser.nonSequentialPDFParser.parseMinimal";

     /**
      * The range within the %%EOF marker will be searched.
      * Useful if there are additional characters after %%EOF within the PDF.
      */
     public static final String SYSPROP_EOFLOOKUPRANGE =
             "org.apache.pdfbox.pdfparser.nonSequentialPDFParser.eofLookupRange";

     /**
      * How many trailing bytes to read for EOF marker.
      */
     private static final int DEFAULT_TRAIL_BYTECOUNT = 2048;
     /**
      * EOF-marker.
      */
     protected static final char[] EOF_MARKER = new char[] { '%', '%', 'E', 'O', 'F' };
     /**
      * obj-marker.
      */
     protected static final char[] OBJ_MARKER = new char[] { 'o', 'b', 'j' };

     /**
      * Position in the source directly after the most recently parsed xref table, i.e. where the
      * trailer is expected; recorded by {@link #parseXref(long)}.
      */
     private long trailerOffset;

     /**
      * Length of the file in bytes. It is read by {@link #getStartxrefOffset()} to locate the trailing
      * bytes; the value is assigned outside this part of the file.
      */
     protected long fileLen;

     /**
      * Whether the parser uses its auto healing capacity (lenient mode). Defaults to <code>true</code>.
      */
     private boolean isLenient = true;

     /**
      * Once this flag is <code>true</code>, {@link #setLenient(boolean)} refuses to change the leniency.
      * It is set outside this part of the file.
      */
     protected boolean initialParseDone = false;
     /**
      * Contains all found objects of a brute force search, mapped to their offsets. Remains
      * <code>null</code> until a brute force search was triggered.
      */
     private Map<COSObjectKey, Long> bfSearchCOSObjectKeyOffsets = null;
     // NOTE(review): not referenced in this part of the file; presumably the offsets of xref tables
     // and xref streams found by a brute force search - confirm.
     private List<Long> bfSearchXRefTablesOffsets = null;
     private List<Long> bfSearchXRefStreamsOffsets = null;

     /**
      * The security handler.
      */
     protected SecurityHandler securityHandler = null;

     /**
      *  how many trailing bytes to read for EOF marker.
      */
     private int readTrailBytes = DEFAULT_TRAIL_BYTECOUNT;

     private static final Log LOG = LogFactory.getLog(COSParser.class);

     /**
      * Collects all Xref/trailer objects and resolves them into single
      * object using startxref reference.
      */
     protected XrefTrailerResolver xrefTrailerResolver = new XrefTrailerResolver();


     /**
      * The prefix for the temp file being used.
      */
     public static final String TMP_FILE_PREFIX = "tmpPDF";

     /**
      * Creates a parser reading from the given random access input.
      *
      * The input is wrapped in a <code>RandomAccessSource</code> which is handed to the superclass
      * constructor; the unwrapped input is additionally kept in {@link #source} for direct access.
      *
      * @param source the random access input to be parsed
      */
     public COSParser(RandomAccessRead source)
     {
         super(new RandomAccessSource(source));
         this.source = source;
     }

     /**
      * Defines how many bytes at the end of the PDF file are inspected when looking for the EOF marker and the
      * 'startxref' marker. Without a call to this method {@link #DEFAULT_TRAIL_BYTECOUNT} is used.
      *
      * <p>Values below 16 are ignored and leave the current setting untouched. For practical use cases the value
      * should not be lower than 1000; even 2000 was found to not be enough in some cases where some trailing
      * garbage like HTML snippets followed the EOF marker.</p>
      *
      * <p>
      * In case system property {@link #SYSPROP_EOFLOOKUPRANGE} is defined this value will be set on initialization
      * but can be overwritten later.
      * </p>
      *
      * @param byteCount number of trailing bytes
      */
     public void setEOFLookupRange(int byteCount)
     {
         if (byteCount < 16)
         {
             // too small to be meaningful, keep the current range
             return;
         }
         readTrailBytes = byteCount;
     }

     /**
      * Parses the chain of cross reference sections (classic xref tables with trailer and/or xref streams)
      * and registers the resolved trailer and xref table at the document.
      *
      * <p>The source is positioned at the given offset and <code>parseStartXref()</code> is invoked to obtain
      * the offset of the first cross reference section. From there the <code>/Prev</code> references are
      * followed until none is left. Each offset is passed to <code>checkXRefOffset</code> and replaced by the
      * returned value if that is greater than -1.</p>
      *
      * <p>Every visited offset is remembered, so a <code>/Prev</code> chain that returns to any section already
      * parsed (not only to the one directly preceding it) is reported instead of being followed forever.</p>
      *
      * @param startXRefOffset position the source is moved to before <code>parseStartXref()</code> is invoked
      * @return the trailer dictionary
      * @throws IOException if something went wrong, e.g. a trailer is missing or the <code>/Prev</code>
      * chain contains a loop
      */
     protected COSDictionary parseXref(long startXRefOffset) throws IOException
     {
         source.seek(startXRefOffset);
         long xrefOffset = Math.max(0, parseStartXref());
         // check the startxref offset
         long fixedOffset = checkXRefOffset(xrefOffset);
         if (fixedOffset > -1)
         {
             xrefOffset = fixedOffset;
         }
         document.setStartXref(xrefOffset);
         long prev = xrefOffset;
         // ---- parse whole chain of xref tables/object streams using PREV reference
         // offsets of all sections parsed so far, used to detect loops of any length (PDFBOX-3446)
         final Set<Long> visitedOffsets = new HashSet<Long>();
         while (prev > 0)
         {
             if (!visitedOffsets.add(prev))
             {
                 throw new IOException("/Prev loop at offset " + prev);
             }
             // seek to xref table
             source.seek(prev);

             // skip white spaces
             skipSpaces();
             // -- parse xref
             if (source.peek() == X)
             {
                 // xref table and trailer
                 // use existing parser to parse xref table
                 parseXrefTable(prev);
                 // parse the last trailer.
                 trailerOffset = source.getPosition();
                 // PDFBOX-1739 skip extra xref entries in RegisSTAR documents
                 while (isLenient && source.peek() != 't')
                 {
                     if (source.getPosition() == trailerOffset)
                     {
                         // warn only the first time
                         LOG.warn("Expected trailer object at position " + trailerOffset
                                 + ", keep trying");
                     }
                     readLine();
                 }
                 if (!parseTrailer())
                 {
                     throw new IOException("Expected trailer object at position: "
                             + source.getPosition());
                 }
                 COSDictionary trailer = xrefTrailerResolver.getCurrentTrailer();
                 // check for a XRef stream, it may contain some object ids of compressed objects
                 if (trailer.containsKey(COSName.XREF_STM))
                 {
                     // NOTE(review): the offset is handled as int here, so a hybrid xref stream
                     // located beyond 2 GB cannot be addressed
                     int streamOffset = trailer.getInt(COSName.XREF_STM);
                     // check the xref stream reference
                     fixedOffset = checkXRefStreamOffset(streamOffset, false);
                     if (fixedOffset > -1 && fixedOffset != streamOffset)
                     {
                         streamOffset = (int) fixedOffset;
                         trailer.setInt(COSName.XREF_STM, streamOffset);
                     }
                     if (streamOffset > 0)
                     {
                         source.seek(streamOffset);
                         skipSpaces();
                         parseXrefObjStream(prev, false);
                     }
                     else
                     {
                         if (isLenient)
                         {
                             LOG.error("Skipped XRef stream due to a corrupt offset:" + streamOffset);
                         }
                         else
                         {
                             throw new IOException("Skipped XRef stream due to a corrupt offset:" + streamOffset);
                         }
                     }
                 }
                 // read as long: the value is a file offset and is written back via setLong below
                 prev = trailer.getLong(COSName.PREV);
                 if (prev > 0)
                 {
                     // check the xref table reference
                     fixedOffset = checkXRefOffset(prev);
                     if (fixedOffset > -1 && fixedOffset != prev)
                     {
                         prev = fixedOffset;
                         trailer.setLong(COSName.PREV, prev);
                     }
                 }
             }
             else
             {
                 // parse xref stream
                 prev = parseXrefObjStream(prev, true);
                 if (prev > 0)
                 {
                     // check the xref table reference
                     fixedOffset = checkXRefOffset(prev);
                     if (fixedOffset > -1 && fixedOffset != prev)
                     {
                         prev = fixedOffset;
                         COSDictionary trailer = xrefTrailerResolver.getCurrentTrailer();
                         trailer.setLong(COSName.PREV, prev);
                     }
                 }
             }
         }
         // ---- build valid xrefs out of the xref chain
         xrefTrailerResolver.setStartxref(xrefOffset);
         COSDictionary trailer = xrefTrailerResolver.getTrailer();
         document.setTrailer(trailer);
         document.setIsXRefStream(XRefType.STREAM == xrefTrailerResolver.getXrefType());
         // check the offsets of all referenced objects
         checkXrefOffsets();
         // copy xref table
         document.addXRefTable(xrefTrailerResolver.getXrefTable());
         return trailer;
     }

     /**
      * Parses an xref object stream starting with indirect object id.
      *
      * <p>The object head (object number, generation number and the <code>obj</code> marker) is consumed,
      * then the dictionary and the stream are read and the stream content is handed to
      * <code>parseXrefStream</code>. The stream is closed in any case, also if parsing its content fails.</p>
      *
      * @param objByteOffset offset passed through to <code>parseXrefStream</code>
      * @param isStandalone flag passed through to <code>parseXrefStream</code>
      * @return value of PREV item in dictionary or <code>-1</code> if no such item exists
      * @throws IOException if the object head, the dictionary or the stream could not be parsed
      */
     private long parseXrefObjStream(long objByteOffset, boolean isStandalone) throws IOException
     {
         // ---- parse indirect object head
         readObjectNumber();
         readGenerationNumber();
         readExpectedString(OBJ_MARKER, true);

         COSDictionary dict = parseCOSDictionary();
         COSStream xrefStream = parseCOSStream(dict);
         try
         {
             parseXrefStream(xrefStream, objByteOffset, isStandalone);
         }
         finally
         {
             // release the stream even if its content turned out to be unparsable
             xrefStream.close();
         }

         return dict.getLong(COSName.PREV);
     }

     /**
      * Locates the <code>startxref</code> keyword near the end of the file. The last {@link #DEFAULT_TRAIL_BYTECOUNT}
      * bytes (or the range set via {@link #setEOFLookupRange(int)}) are loaded, the last '%%EOF' marker within them
      * is determined and the last <code>startxref</code> preceding that marker is looked up. The source is
      * repositioned to offset 0 afterwards.
      *
      * <p>In lenient mode a missing '%%EOF' marker is tolerated (the whole buffer is searched then) and a missing
      * <code>startxref</code> results in <code>-1</code>; otherwise both cases raise an exception.</p>
      *
      * @return the offset of StartXref or <code>-1</code> if it could not be found in lenient mode
      * @throws IOException If something went wrong.
      */
     protected final long getStartxrefOffset() throws IOException
     {
         byte[] buf;
         long skipBytes;
         // load the tail of the file
         try
         {
             final int trailByteCount = (int) Math.min(fileLen, readTrailBytes);
             buf = new byte[trailByteCount];
             skipBytes = fileLen - trailByteCount;
             source.seek(skipBytes);
             for (int filled = 0; filled < trailByteCount;)
             {
                 final int count = source.read(buf, filled, trailByteCount - filled);
                 // guard against an endless loop (this should never happen)
                 if (count < 1)
                 {
                     throw new IOException(
                             "No more bytes to read for trailing buffer, but expected: "
                                     + (trailByteCount - filled));
                 }
                 filled += count;
             }
         }
         finally
         {
             source.seek(0);
         }

         // position of the last '%%EOF'
         int eofOff = lastIndexOf(EOF_MARKER, buf, buf.length);
         if (eofOff < 0)
         {
             if (!isLenient)
             {
                 throw new IOException("Missing end of file marker '" + new String(EOF_MARKER) + "'");
             }
             // in lenient mode the '%%EOF' isn't needed
             eofOff = buf.length;
             LOG.debug("Missing end of file marker '" + new String(EOF_MARKER) + "'");
         }

         // position of the last startxref in front of the EOF marker
         final int startxrefOff = lastIndexOf(STARTXREF, buf, eofOff);
         if (startxrefOff >= 0)
         {
             return skipBytes + startxrefOff;
         }
         if (isLenient)
         {
             LOG.debug("Can't find offset for startxref");
             return -1;
         }
         throw new IOException("Missing 'startxref' marker.");
     }

     /**
      * Searches the last occurrence of a pattern within a buffer. Only occurrences lying completely in front of
      * <code>endOff</code> are considered; the search proceeds backwards down to offset 0.
      *
      * <p>Every candidate position is compared against the complete pattern, so occurrences are also found if
      * they directly precede bytes forming a partial match (e.g. <code>ab</code> in <code>abb</code>).</p>
      *
      * @param pattern pattern to search for; an empty pattern matches at the (clamped) end offset
      * @param buf buffer to search pattern in
      * @param endOff offset (exclusive) where lookup starts at; values beyond the buffer length are clamped
      *
      * @return start offset of pattern within buffer or <code>-1</code> if pattern could not be found
      */
     protected int lastIndexOf(final char[] pattern, final byte[] buf, final int endOff)
     {
         final int patternLen = pattern.length;
         // never read past the end of the buffer
         final int searchEnd = Math.min(endOff, buf.length);

         // try every start offset at which the pattern still fits in front of searchEnd, last one first
         for (int start = searchEnd - patternLen; start >= 0; start--)
         {
             int matched = 0;
             while (matched < patternLen && buf[start + matched] == pattern[matched])
             {
                 matched++;
             }
             if (matched == patternLen)
             {
                 // whole pattern matched
                 return start;
             }
         }
         return -1;
     }

     /**
      * Tells whether the parser runs in lenient mode, i.e. whether its auto healing capacity is used.
      *
      * @return true if parser is lenient
      */
     public boolean isLenient()
     {
         return this.isLenient;
     }

     /**
      * Switches the leniency of the parser on or off.
      *
      * This is only permitted as long as the initial parsing has not been done.
      *
      * @param lenient try to handle malformed PDFs.
      * @throws IllegalArgumentException if the initial parsing was already done
      */
     public void setLenient(boolean lenient)
     {
         if (!initialParseDone)
         {
             this.isLenient = lenient;
             return;
         }
         throw new IllegalArgumentException("Cannot change leniency after parsing");
     }

     /**
      * Builds a single id from the object number (shifted left by 32 bits) and the
      * generation number of the given object. (requires object number &lt; 2^31))
      */
     private long getObjectId(final COSObject obj)
     {
         final long shiftedObjectNumber = obj.getObjectNumber() << 32;
         return shiftedObjectNumber | obj.getGenerationNumber();
     }

     /**
      * Offers every element of newObjects to toBeParsedList; an element is queued unless
      * it is a COSObject which was queued before (tracked via addedObjects).
      */
     private void addNewToList(final Queue<COSBase> toBeParsedList,
             final Collection<COSBase> newObjects, final Set<Long> addedObjects)
     {
         final Iterator<COSBase> candidates = newObjects.iterator();
         while (candidates.hasNext())
         {
             addNewToList(toBeParsedList, candidates.next(), addedObjects);
         }
     }

     /**
      * Queues newObject in toBeParsedList. A COSObject is only queued the first time it
      * is seen; its id is recorded in addedObjects for that purpose. All other kinds of
      * objects are always queued.
      */
     private void addNewToList(final Queue<COSBase> toBeParsedList, final COSBase newObject,
             final Set<Long> addedObjects)
     {
         // Set.add returns false if the id was recorded before
         final boolean alreadyQueued = newObject instanceof COSObject
                 && !addedObjects.add(getObjectId((COSObject) newObject));
         if (!alreadyQueued)
         {
             toBeParsedList.add(newObject);
         }
     }

     /**
      * Will parse every object reachable from the values of the given dictionary. We try our best to order
      * objects according to offset in file before reading to minimize seek operations.
      *
      * <p>Objects stored at the same offset (an object stream referenced directly as well as the compressed
      * objects it contains) are collected in one list, independent of the order in which the references are
      * encountered.</p>
      *
      * @param dict the dictionary whose values are the starting points for parsing
      * @param excludeObjects dictionary object reference entries with these names will not be parsed
      *
      * @throws IOException if something went wrong, e.g. a compressed object refers to an object stream
      * which has no valid xref entry
      */
     protected void parseDictObjects(COSDictionary dict, COSName... excludeObjects) throws IOException
     {
         // ---- create queue for objects waiting for further parsing
         final Queue<COSBase> toBeParsedList = new LinkedList<COSBase>();
         // offset ordered object map
         final TreeMap<Long, List<COSObject>> objToBeParsed = new TreeMap<Long, List<COSObject>>();
         // in case of compressed objects offset points to stmObj
         final Set<Long> parsedObjects = new HashSet<Long>();
         final Set<Long> addedObjects = new HashSet<Long>();

         addExcludedToList(excludeObjects, dict, parsedObjects);
         addNewToList(toBeParsedList, dict.getValues(), addedObjects);

         // ---- go through objects to be parsed
         while (!(toBeParsedList.isEmpty() && objToBeParsed.isEmpty()))
         {
             // -- first get all COSObject from other kind of objects and
             // put them in objToBeParsed; afterwards toBeParsedList is empty
             COSBase baseObj;
             while ((baseObj = toBeParsedList.poll()) != null)
             {
                 if (baseObj instanceof COSDictionary)
                 {
                     addNewToList(toBeParsedList, ((COSDictionary) baseObj).getValues(), addedObjects);
                 }
                 else if (baseObj instanceof COSArray)
                 {
                     final Iterator<COSBase> arrIter = ((COSArray) baseObj).iterator();
                     while (arrIter.hasNext())
                     {
                         addNewToList(toBeParsedList, arrIter.next(), addedObjects);
                     }
                 }
                 else if (baseObj instanceof COSObject)
                 {
                     COSObject obj = (COSObject) baseObj;
                     long objId = getObjectId(obj);
                     COSObjectKey objKey = new COSObjectKey(obj.getObjectNumber(), obj.getGenerationNumber());

                     if (!parsedObjects.contains(objId))
                     {
                         Long fileOffset = xrefTrailerResolver.getXrefTable().get(objKey);
                         // it is allowed that object references point to null,
                         // thus we have to test
                         if (fileOffset != null && fileOffset != 0)
                         {
                             if (fileOffset > 0)
                             {
                                 List<COSObject> queued = objToBeParsed.get(fileOffset);
                                 if (queued instanceof ArrayList)
                                 {
                                     // compressed objects of the object stream at this offset are
                                     // already waiting; keep them instead of replacing the list
                                     queued.add(obj);
                                 }
                                 else
                                 {
                                     objToBeParsed.put(fileOffset, Collections.singletonList(obj));
                                 }
                             }
                             else
                             {
                                 // negative offset means we have a compressed
                                 // object within object stream;
                                 // get offset of object stream
                                 fileOffset = xrefTrailerResolver.getXrefTable().get(
                                         new COSObjectKey((int)-fileOffset, 0));
                                 if ((fileOffset == null) || (fileOffset <= 0))
                                 {
                                     throw new IOException(
                                             "Invalid object stream xref object reference for key '" + objKey + "': "
                                                     + fileOffset);
                                 }

                                 List<COSObject> stmObjects = objToBeParsed.get(fileOffset);
                                 if (!(stmObjects instanceof ArrayList))
                                 {
                                     // either nothing is queued for this offset yet or the object
                                     // stream itself was queued as an immutable singleton list;
                                     // switch to a mutable list which keeps that entry
                                     final List<COSObject> mutableObjects = new ArrayList<COSObject>();
                                     if (stmObjects != null)
                                     {
                                         mutableObjects.addAll(stmObjects);
                                     }
                                     stmObjects = mutableObjects;
                                     objToBeParsed.put(fileOffset, stmObjects);
                                 }
                                 stmObjects.add(obj);
                             }
                         }
                         else
                         {
                             // NULL object
                             COSObject pdfObject = document.getObjectFromPool(objKey);
                             pdfObject.setObject(COSNull.NULL);
                         }
                     }
                 }
             }

             // ---- read first COSObject with smallest offset
             // resulting object will be added to toBeParsedList
             if (objToBeParsed.isEmpty())
             {
                 break;
             }

             for (COSObject obj : objToBeParsed.remove(objToBeParsed.firstKey()))
             {
                 COSBase parsedObj = parseObjectDynamically(obj, false);
                 if (parsedObj != null)
                 {
                     obj.setObject(parsedObj);
                     addNewToList(toBeParsedList, parsedObj, addedObjects);
                     parsedObjects.add(getObjectId(obj));
                 }
             }
         }
     }

     // marks the objects referenced by the given dictionary entries as already parsed,
     // which keeps parseDictObjects from parsing them
     private void addExcludedToList(COSName[] excludeObjects, COSDictionary dict, final Set<Long> parsedObjects)
     {
         if (excludeObjects == null)
         {
             // nothing to exclude
             return;
         }
         for (int i = 0; i < excludeObjects.length; i++)
         {
             final COSBase excluded = dict.getItem(excludeObjects[i]);
             if (!(excluded instanceof COSObject))
             {
                 // only object references are tracked
                 continue;
             }
             parsedObjects.add(getObjectId((COSObject) excluded));
         }
     }

     /**
      * Parses the object identified by the given object reference. This is a convenience variant which
      * delegates to {@link #parseObjectDynamically(long, int, boolean)}.
      *
      * @param obj object to be parsed (we only take object number and generation number for lookup start offset)
      * @param requireExistingNotCompressedObj if <code>true</code> object to be parsed must not be contained within
      * compressed stream
      * @return the parsed object (which is also added to document object)
      *
      * @throws IOException If an IO error occurs.
      */
     protected final COSBase parseObjectDynamically(COSObject obj,
             boolean requireExistingNotCompressedObj) throws IOException
     {
         final long objNr = obj.getObjectNumber();
         final int objGenNr = obj.getGenerationNumber();
         return parseObjectDynamically(objNr, objGenNr, requireExistingNotCompressedObj);
     }

     /**
      * Parses the indirect object with the given object and generation number, unless the object pool of the
      * document already holds a parsed value for it.
      *
      * <p>The xref table tells where to find the object: a positive entry is the file offset of the object, a
      * negative entry is the negated number of the object stream containing it and a missing entry leads to the
      * NULL object. In lenient mode a missing entry triggers a brute force search for objects once; its results
      * are added to the xref table for all keys not yet contained.</p>
      *
      * @param objNr object number of object to be parsed
      * @param objGenNr object generation number of object to be parsed
      * @param requireExistingNotCompressedObj if <code>true</code> the object to be parsed must be defined in xref
      * (comment: null objects may be missing from xref) and it must not be a compressed object within object stream
      * (this is used to circumvent being stuck in a loop in a malicious PDF)
      *
      * @return the parsed object (which is also added to document object)
      *
      * @throws IOException If an IO error occurs.
      */
     protected COSBase parseObjectDynamically(long objNr, int objGenNr,
             boolean requireExistingNotCompressedObj) throws IOException
     {
         // ---- look up the object container in the pool
         final COSObjectKey objKey = new COSObjectKey(objNr, objGenNr);
         final COSObject pdfObject = document.getObjectFromPool(objKey);

         if (pdfObject.getObject() != null)
         {
             // parsed before, nothing to do
             return pdfObject.getObject();
         }

         // ---- file offset or (negated) object stream number taken from the xref table
         Long xrefEntry = xrefTrailerResolver.getXrefTable().get(objKey);

         // sanity test to circumvent loops with broken documents
         final boolean missingOrCompressed = (xrefEntry == null) || (xrefEntry <= 0);
         if (requireExistingNotCompressedObj && missingOrCompressed)
         {
             throw new IOException("Object must be defined and must not be compressed object: "
                     + objKey.getNumber() + ":" + objKey.getGeneration());
         }

         // maybe something is wrong with the xref table -> perform brute force search for all objects
         if (xrefEntry == null && isLenient && bfSearchCOSObjectKeyOffsets == null)
         {
             bfSearchForObjects();
             if (bfSearchCOSObjectKeyOffsets != null && !bfSearchCOSObjectKeyOffsets.isEmpty())
             {
                 LOG.debug("Add all new read objects from brute force search to the xref table");
                 final Map<COSObjectKey, Long> xrefTable = xrefTrailerResolver.getXrefTable();
                 for (Entry<COSObjectKey, Long> found : bfSearchCOSObjectKeyOffsets.entrySet())
                 {
                     final COSObjectKey foundKey = found.getKey();
                     // only objects missing in the xref table are taken over
                     if (!xrefTable.containsKey(foundKey))
                     {
                         xrefTable.put(foundKey, found.getValue());
                     }
                 }
                 xrefEntry = xrefTable.get(objKey);
             }
         }

         if (xrefEntry == null)
         {
             // not defined object -> NULL object (Spec. 1.7, chap. 3.2.9)
             pdfObject.setObject(COSNull.NULL);
         }
         else if (xrefEntry > 0)
         {
             // offset of indirect object in file
             parseFileObject(xrefEntry, objKey, pdfObject);
         }
         else
         {
             // the entry is the negated number of the object stream holding the object;
             // as the object is not available yet that stream has not been parsed so far
             parseObjectStream((int) -xrefEntry);
         }
         return pdfObject.getObject();
     }

     /**
      * Reads the indirect object located at the given file offset and stores the result in the given
      * object container.
      *
      * <p>The object head found at the offset has to match the given key. If the object value is followed by
      * the stream keyword, the value has to be a dictionary and is completed to a stream. If a security
      * handler is set, the parsed stream or object is decrypted. A missing <code>endobj</code> is only
      * logged in lenient mode and raises an exception otherwise.</p>
      *
      * @param offsetOrObjstmObNr file offset of the indirect object
      * @param objKey key (object and generation number) the object at the offset is expected to have
      * @param pdfObject container which receives the parsed object
      * @throws IOException if the object at the offset has another key, a stream is not preceded by a
      * dictionary, <code>endobj</code> is missing in non lenient mode or reading fails
      */
     private void parseFileObject(Long offsetOrObjstmObNr, final COSObjectKey objKey, final COSObject pdfObject) throws IOException
     {
         // ---- jump to the start of the object
         source.seek(offsetOrObjstmObNr);

         // ---- an indirect object head is expected here
         final long readObjNr = readObjectNumber();
         final int readObjGen = readGenerationNumber();
         readExpectedString(OBJ_MARKER, true);

         // ---- the head has to belong to the requested object
         final boolean headMatchesKey = (readObjNr == objKey.getNumber())
                 && (readObjGen == objKey.getGeneration());
         if (!headMatchesKey)
         {
             throw new IOException("XREF for " + objKey.getNumber() + ":"
                     + objKey.getGeneration() + " points to wrong object: " + readObjNr
                     + ":" + readObjGen + " at offset " + offsetOrObjstmObNr);
         }

         skipSpaces();
         COSBase pb = parseDirObject();
         String endObjectKey = readString();

         if (endObjectKey.equals(STREAM_STRING))
         {
             // give back the keyword just read, the stream parser consumes it on its own
             source.rewind(endObjectKey.getBytes(ISO_8859_1).length);
             if (!(pb instanceof COSDictionary))
             {
                 // this is not legal
                 // the combination of a dict and the stream/endstream
                 // forms a complete stream object
                 throw new IOException("Stream not preceded by dictionary (offset: "
                         + offsetOrObjstmObNr + ").");
             }
             final COSStream stream = parseCOSStream((COSDictionary) pb);
             if (securityHandler != null)
             {
                 securityHandler.decryptStream(stream, objKey.getNumber(), objKey.getGeneration());
             }
             pb = stream;

             skipSpaces();
             endObjectKey = readLine();

             // we have case with a second 'endstream' before endobj
             final boolean extraEndstream = !endObjectKey.startsWith(ENDOBJ_STRING)
                     && endObjectKey.startsWith(ENDSTREAM_STRING);
             if (extraEndstream)
             {
                 // drop the first 9 characters (presumably the length of the endstream keyword)
                 endObjectKey = endObjectKey.substring(9).trim();
                 if (endObjectKey.length() == 0)
                 {
                     // no other characters in extra endstream line
                     // read next line
                     endObjectKey = readLine();
                 }
             }
         }
         else if (securityHandler != null)
         {
             securityHandler.decrypt(pb, objKey.getNumber(), objKey.getGeneration());
         }

         pdfObject.setObject(pb);

         if (endObjectKey.startsWith(ENDOBJ_STRING))
         {
             return;
         }
         final String message = "Object (" + readObjNr + ":" + readObjGen + ") at offset "
                 + offsetOrObjstmObNr + " does not end with 'endobj' but with '"
                 + endObjectKey + "'";
         if (!isLenient)
         {
             throw new IOException(message);
         }
         LOG.warn(message);
     }

     /**
      * Dereferences the object stream with the given object number, parses the objects it contains
      * and assigns them to the corresponding pool objects of the document.
      *
      * Only those objects are registered whose xref entry points back to this object stream, i.e.
      * whose xref value equals the negated object stream number.
      *
      * @param objstmObjNr the object number of the object stream
      *
      * @throws IOException if dereferencing fails, or if parsing the object stream fails in
      * non-lenient mode
      */
     private void parseObjectStream(int objstmObjNr) throws IOException
     {
         final COSBase candidate = parseObjectDynamically(objstmObjNr, 0, true);
         if (!(candidate instanceof COSStream))
         {
             // the dereferenced object isn't a stream, there is nothing to parse
             return;
         }
         PDFObjectStreamParser objStmParser = new PDFObjectStreamParser((COSStream) candidate, document);
         try
         {
             objStmParser.parse();
         }
         catch (IOException exception)
         {
             if (!isLenient)
             {
                 throw exception;
             }
             LOG.debug("Stop reading object stream "+objstmObjNr+" due to an exception", exception);
             // the error is handled in parseDictObjects
             return;
         }
         // register all objects which are referenced to be contained in object stream
         for (COSObject parsedObject : objStmParser.getObjects())
         {
             COSObjectKey parsedKey = new COSObjectKey(parsedObject);
             Long xrefValue = xrefTrailerResolver.getXrefTable().get(parsedKey);
             if (xrefValue == null || xrefValue != -objstmObjNr)
             {
                 // the xref table doesn't assign this object to the current object stream
                 continue;
             }
             COSObject pooledObject = document.getObjectFromPool(parsedKey);
             pooledObject.setObject(parsedObject.getObject());
         }
     }

     /**
      * Returns the length value which is either given directly or referred to by the given object.
      *
      * If the length is an indirect reference which wasn't dereferenced so far, the referenced
      * object is parsed and the current position of the source is restored afterwards.
      *
      * @param lengthBaseObj the value of the length entry, may be null
      * @param streamType the type of the stream the length belongs to
      *
      * @return the length as number or null if no length object was given
      *
      * @throws IOException if the length object is neither a number nor a reference to a number,
      * or if the referenced object couldn't be read
      */
     private COSNumber getLength(final COSBase lengthBaseObj, final COSName streamType) throws IOException
     {
         if (lengthBaseObj == null)
         {
             return null;
         }
         // the length is given directly
         if (lengthBaseObj instanceof COSNumber)
         {
             return (COSNumber) lengthBaseObj;
         }
         if (!(lengthBaseObj instanceof COSObject))
         {
             throw new IOException("Wrong type of length object: "
                     + lengthBaseObj.getClass().getSimpleName());
         }
         // the length is stored in a referenced object
         final COSObject lengthObj = (COSObject) lengthBaseObj;
         COSBase resolvedLength = lengthObj.getObject();
         if (resolvedLength == null)
         {
             // not read so far, remember the current position before jumping to the referenced object
             final long savedPosition = source.getPosition();
             parseObjectDynamically(lengthObj, COSName.OBJ_STM.equals(streamType));
             // go back to where we came from
             source.seek(savedPosition);
             resolvedLength = lengthObj.getObject();
             if (resolvedLength == null)
             {
                 throw new IOException("Length object content was not read.");
             }
         }
         if (resolvedLength instanceof COSNumber)
         {
             return (COSNumber) resolvedLength;
         }
         throw new IOException("Wrong type of referenced length object " + lengthObj
                 + ": " + resolvedLength.getClass().getSimpleName());
     }

     // size in bytes of the buffer used to copy stream data of a known length
     private static final int STREAMCOPYBUFLEN = 8192;
     // reusable buffer used by readValidStream to copy stream data chunk by chunk
     private final byte[] streamCopyBuf = new byte[STREAMCOPYBUFLEN];

     /**
      * Reads the data of a stream object into a new COSStream based on the given dictionary.
      *
      * If the dictionary provides a length (directly or as an indirect reference) and that length
      * turns out to be valid, exactly that many bytes are copied without looking for 'endstream' or
      * 'endobj', so it is no problem if these keywords occur within the stream data. Otherwise the
      * data is copied until one of these keywords is found. After the data has been read the
      * keyword 'endstream' is expected; some deviations are tolerated in lenient mode.
      *
      * @param dic dictionary that goes with this stream.
      *
      * @return parsed pdf stream.
      *
      * @throws IOException if an error occurred reading the stream, like problems with reading
      * length attribute, stream does not end with 'endstream' after data read, stream too short etc.
      */
     protected COSStream parseCOSStream(COSDictionary dic) throws IOException
     {
         final COSStream cosStream = document.createCOSStream(dic);

         // consume the keyword 'stream'; its presence was already tested by the caller
         readString();
         skipWhiteSpaces();

         // dic.getItem has to be used here as the underlying object might still be null
         // while we are parsing
         final COSNumber declaredLength = getLength(dic.getItem(COSName.LENGTH), dic.getCOSName(COSName.TYPE));
         if (declaredLength == null)
         {
             if (!isLenient)
             {
                 throw new IOException("Missing length for stream.");
             }
             LOG.warn("The stream doesn't provide any stream length, using fallback readUntilEnd, at offset "
                     + source.getPosition());
         }

         // the length is only trusted if it is present and points to 'endstream'
         final boolean lengthIsUsable = declaredLength != null
                 && validateStreamLength(declaredLength.longValue());
         final OutputStream rawOut = cosStream.createRawOutputStream();
         try
         {
             if (lengthIsUsable)
             {
                 readValidStream(rawOut, declaredLength);
             }
             else
             {
                 readUntilEndStream(new EndstreamOutputStream(rawOut));
             }
         }
         finally
         {
             rawOut.close();
             // restore original (possibly incorrect) length
             if (declaredLength == null)
             {
                 cosStream.removeItem(COSName.LENGTH);
             }
             else
             {
                 cosStream.setItem(COSName.LENGTH, declaredLength);
             }
         }

         final String terminator = readString();
         if (isLenient && terminator.equals("endobj"))
         {
             LOG.warn("stream ends with 'endobj' instead of 'endstream' at offset "
                     + source.getPosition());
             // avoid follow-up warning about missing endobj
             source.rewind(ENDOBJ.length);
         }
         else if (isLenient && terminator.length() > 9
                 && terminator.substring(0,9).equals(ENDSTREAM_STRING))
         {
             LOG.warn("stream ends with '" + terminator + "' instead of 'endstream' at offset "
                     + source.getPosition());
             // unread the "extra" bytes following the keyword
             source.rewind(terminator.substring(9).getBytes(ISO_8859_1).length);
         }
         else if (!terminator.equals(ENDSTREAM_STRING))
         {
             throw new IOException(
                     "Error reading stream, expected='endstream' actual='"
                     + terminator + "' at offset " + source.getPosition());
         }

         return cosStream;
     }

     /**
      * This method will read through the current stream object until
      * we find the keyword "endstream" meaning we're at the end of this
      * object. Some pdf files, however, forget to write some endstream tags
      * and just close off objects with an "endobj" tag so we have to handle
      * this case as well.
      *
      * All bytes in front of the matched keyword are written to the given output stream. The
      * matched keyword itself and everything which was buffered behind it are given back to the
      * source, so that the keyword is the next thing to be read by the caller.
      *
      * This method is optimized using buffered IO and reduced number of
      * byte compare operations.
      *
      * @param out  stream we write out to.
      *
      * @throws IOException if something went wrong
      */
     private void readUntilEndStream( final OutputStream out ) throws IOException
     {
         int bufSize;
         // number of keyword characters matched so far; survives the refill of the buffer
         int charMatchCount = 0;
         // keyword we are currently trying to match, switches to ENDOBJ after "end" + 'o'
         byte[] keyw = ENDSTREAM;

         // last character position of shortest keyword ('endobj')
         final int quickTestOffset = 5;

         // read next chunk into buffer; already matched chars are added to beginning of buffer
         while ( ( bufSize = source.read( strmBuf, charMatchCount, STRMBUFLEN - charMatchCount ) ) > 0 )
         {
             // bufSize now is the number of valid bytes in the buffer including the carried over match
             bufSize += charMatchCount;

             // start scanning behind the characters which were already matched in the previous chunk
             int bIdx = charMatchCount;
             int quickTestIdx;

             // iterate over buffer, trying to find keyword match
             for ( int maxQuicktestIdx = bufSize - quickTestOffset; bIdx < bufSize; bIdx++ )
             {
                 // reduce compare operations by first test last character we would have to
                 // match if current one matches; if it is not a character from keywords
                 // we can move behind the test character; this shortcut is inspired by the
                 // Boyer-Moore string search algorithm and can reduce parsing time by approx. 20%
                 quickTestIdx = bIdx + quickTestOffset;
                 // the shortcut is only taken if no match is in progress and the tested index
                 // is far enough from the end of the valid buffer content
                 if (charMatchCount == 0 && quickTestIdx < maxQuicktestIdx)
                 {
                     final byte ch = strmBuf[quickTestIdx];
                     // all characters of 'endstream' and 'endobj' are within the range 'a'..'t'
                     if ( ( ch > 't' ) || ( ch < 'a' ) )
                     {
                         // last character we would have to match if current character would match
                         // is not a character from keywords -> jump behind and start over
                         bIdx = quickTestIdx;
                         continue;
                     }
                 }

                 // could be negative - but we only compare to ASCII
                 final byte ch = strmBuf[bIdx];

                 if ( ch == keyw[ charMatchCount ] )
                 {
                     if ( ++charMatchCount == keyw.length )
                     {
                         // match found
                         // move the index behind the keyword so that the content length
                         // calculation below excludes the keyword
                         bIdx++;
                         break;
                     }
                 }
                 else
                 {
                     if ( ( charMatchCount == 3 ) && ( ch == ENDOBJ[ charMatchCount ] ) )
                     {
                         // maybe ENDSTREAM is missing but we could have ENDOBJ
                         keyw = ENDOBJ;
                         charMatchCount++;
                     }
                     else
                     {
                         // no match; restarting the comparison one character behind the start of
                         // the failed match isn't necessary as we know the characters matched so
                         // far. Depending on the current character we may already have the
                         // beginning of a new match:
                         // 'e': the first keyword character is matched
                         // 'n' at match position 7: the 'e' in front of it was matched as part of
                         //      'endstre', so 2 characters are matched
                         // any other character: start all over with the next read position
                         charMatchCount = ( ch == E ) ? 1 : ( ( ch == N ) && ( charMatchCount == 7 ) ) ? 2 : 0;
                         // search again for 'endstream'
                         keyw = ENDSTREAM;
                     }
                 }
             }

             // number of bytes in front of the (partially) matched keyword
             int contentBytes = Math.max( 0, bIdx - charMatchCount );

             // write buffer content until first matched char to output stream
             if ( contentBytes > 0 )
             {
                 out.write( strmBuf, 0, contentBytes );
             }
             if ( charMatchCount == keyw.length )
             {
                 // keyword matched; unread matched keyword (endstream/endobj) and following buffered content
                 source.rewind( bufSize - contentBytes );
                 break;
             }
             else
             {
                 // copy matched chars at start of buffer
                 // the next read appends the new data right behind them
                 System.arraycopy( keyw, 0, strmBuf, 0, charMatchCount );
             }
         }
         // this writes a lonely CR or drops trailing CR LF and LF
         // NOTE(review): that behaviour is presumably implemented by the EndstreamOutputStream
         // passed in by the caller - confirm
         out.flush();
     }

     /**
      * Copies as many bytes from the source to the given output stream as the given length
      * object demands. The data is copied chunk by chunk using the shared copy buffer.
      *
      * @param out the stream to write the data to
      * @param streamLengthObj the number of bytes to be copied
      *
      * @throws IOException if the source doesn't provide the expected number of bytes or if
      * writing fails
      */
     private void readValidStream(OutputStream out, COSNumber streamLengthObj) throws IOException
     {
         for (long bytesLeft = streamLengthObj.longValue(); bytesLeft > 0; )
         {
             // never request more than the buffer is able to hold
             final int chunk = (int) Math.min(bytesLeft, STREAMCOPYBUFLEN);
             final int readBytes = source.read(streamCopyBuf, 0, chunk);
             if (readBytes <= 0)
             {
                 // shouldn't happen, the stream length has already been validated
                 throw new IOException("read error at offset " + source.getPosition()
                         + ": expected " + chunk + " bytes, but read() returns " + readBytes);
             }
             out.write(streamCopyBuf, 0, readBytes);
             bytesLeft -= readBytes;
         }
     }

     /**
      * Checks if the given stream length can be trusted, i.e. if the keyword 'endstream' can be
      * found (after optional spaces) at the position the length points to.
      *
      * A length which is negative, which makes the end position overflow or which points behind
      * the end of the file is reported as invalid without touching the source, so that the caller
      * can fall back to searching for the end of the stream.
      *
      * The position of the source is the same before and after the call.
      *
      * @param streamLength the length of the stream as given by the stream dictionary
      *
      * @return true if the length points to the end of the stream
      *
      * @throws IOException if reading from the source fails
      */
     private boolean validateStreamLength(long streamLength) throws IOException
     {
         boolean streamLengthIsValid = true;
         long originOffset = source.getPosition();
         long expectedEndOfStream = originOffset + streamLength;
         // a negative length or an overflow of the sum would result in a position in front of
         // the start of the stream; never seek to such a position
         if (streamLength < 0 || expectedEndOfStream < originOffset || expectedEndOfStream > fileLen)
         {
             streamLengthIsValid = false;
             LOG.warn("The end of the stream is out of range, using workaround to read the stream, "
                     + "stream start position: " + originOffset + ", length: " + streamLength
                     + ", expected end position: " + expectedEndOfStream);
         }
         else
         {
             source.seek(expectedEndOfStream);
             skipSpaces();
             if (!isString(ENDSTREAM))
             {
                 streamLengthIsValid = false;
                 LOG.warn("The end of the stream doesn't point to the correct offset, using workaround to read the stream, "
                         + "stream start position: " + originOffset + ", length: " + streamLength
                         + ", expected end position: " + expectedEndOfStream);
             }
             // go back to the start of the stream data
             source.seek(originOffset);
         }
         return streamLengthIsValid;
     }

     /**
      * Check if the cross reference table/stream can be found at the given offset. If neither a
      * table nor a stream is found there, a brute force search is used to find a replacement.
      * Nothing is checked in non-lenient mode.
      *
      * @param startXRefOffset the expected start offset of the cross reference table/stream
      * @return the revised offset
      * @throws IOException if something went wrong
      */
     private long checkXRefOffset(long startXRefOffset) throws IOException
     {
         // repair mode isn't available in non-lenient mode
         if (!isLenient)
         {
             return startXRefOffset;
         }
         source.seek(startXRefOffset);
         final boolean tableAtOffset = source.peek() == X && isString(XREF_TABLE);
         if (tableAtOffset)
         {
             // there is a xref table right at the given offset
             return startXRefOffset;
         }
         if (startXRefOffset > 0)
         {
             // maybe it is a xref stream, check only without repairing
             final long streamOffset = checkXRefStreamOffset(startXRefOffset, true);
             if (streamOffset > -1)
             {
                 return streamOffset;
             }
         }
         // try to find a fixed offset
         return calculateXRefFixedOffset(startXRefOffset, false);
     }

     /**
      * Check if the cross reference stream can be found at the given offset. The offset is accepted
      * if it is preceded by a whitespace and an object whose dictionary is of type XRef starts
      * there. Nothing is checked in non-lenient mode or if the offset is 0.
      *
      * @param startXRefOffset the expected start offset of the XRef stream
      * @param checkOnly check only but don't repair the offset if set to true
      * @return the revised offset, or -1 if checkOnly is set and no XRef stream was found
      * @throws IOException if something went wrong
      */
     private long checkXRefStreamOffset(long startXRefOffset, boolean checkOnly) throws IOException
     {
         // repair mode isn't available in non-lenient mode
         if (!isLenient || startXRefOffset == 0)
         {
             return startXRefOffset;
         }
         // have a look at the character in front of the offset
         source.seek(startXRefOffset-1);
         final int precedingChar = source.read();
         // the first character has to be a whitespace, and then a digit
         if (isWhitespace(precedingChar))
         {
             skipSpaces();
             if (isDigit())
             {
                 try
                 {
                     // looks like an object, read its header
                     readObjectNumber();
                     readGenerationNumber();
                     readExpectedString(OBJ_MARKER, true);
                     // check the dictionary to avoid false positives
                     final COSDictionary xrefDict = parseCOSDictionary();
                     source.seek(startXRefOffset);
                     if (xrefDict != null && "XRef".equals(xrefDict.getNameAsString(COSName.TYPE)))
                     {
                         return startXRefOffset;
                     }
                 }
                 catch (IOException exception)
                 {
                     // there wasn't an object of a xref stream
                     // try to repair the offset
                     source.seek(startXRefOffset);
                 }
             }
         }
         if (checkOnly)
         {
             return -1;
         }
         // try to find a fixed offset
         return calculateXRefFixedOffset(startXRefOffset, true);
     }

     /**
      * Try to find a fixed offset for the given xref table/stream using the results of a brute
      * force search.
      *
      * @param objectOffset the given offset where to look at
      * @param streamsOnly search for xref streams only
      * @return the fixed offset, or 0 if the given offset is negative or no replacement was found
      *
      * @throws IOException if something went wrong
      */
     private long calculateXRefFixedOffset(long objectOffset, boolean streamsOnly) throws IOException
     {
         if (objectOffset < 0)
         {
             LOG.error("Invalid object offset " + objectOffset + " when searching for a xref table/stream");
             return 0;
         }
         // start a brute force search for all xref tables and try to find the offset we are looking for
         final long fixedOffset = bfSearchForXRef(objectOffset, streamsOnly);
         if (fixedOffset <= -1)
         {
             LOG.error("Can't find the object xref table/stream at offset " + objectOffset);
             return 0;
         }
         LOG.debug("Fixed reference for xref table/stream " + objectOffset + " -> " + fixedOffset);
         return fixedOffset;
     }

     /**
      * Check the XRef table by dereferencing all objects and fixing the offset if necessary.
      *
      * As soon as one object can't be found at the offset given by the xref table, a brute force
      * search for all objects is started. If it finds anything, its results replace the
      * corresponding entries of the xref table. Entries of objects which are part of an object
      * stream that wasn't found by the brute force search are removed.
      *
      * Nothing is checked in non-lenient mode.
      *
      * @throws IOException if something went wrong.
      */
     private void checkXrefOffsets() throws IOException
     {
         // repair mode isn't available in non-lenient mode
         if (!isLenient)
         {
             return;
         }
         Map<COSObjectKey, Long> xrefOffset = xrefTrailerResolver.getXrefTable();
         if (xrefOffset == null)
         {
             return;
         }
         boolean bruteForceSearch = false;
         for (Entry<COSObjectKey, Long> objectEntry : xrefOffset.entrySet())
         {
             COSObjectKey objectKey = objectEntry.getKey();
             Long objectOffset = objectEntry.getValue();
             // a negative offset number represents a object number itself
             // see type 2 entry in xref stream
             if (objectOffset != null && objectOffset >= 0
                     && !checkObjectKeys(objectKey, objectOffset))
             {
                 LOG.debug("Stop checking xref offsets as at least one couldn't be dereferenced");
                 bruteForceSearch = true;
                 break;
             }
         }
         if (!bruteForceSearch)
         {
             return;
         }
         bfSearchForObjects();
         if (bfSearchCOSObjectKeyOffsets == null || bfSearchCOSObjectKeyOffsets.isEmpty())
         {
             // the brute force search didn't find anything, keep the xref table as it is
             return;
         }
         // find all object streams; a set avoids duplicates without scanning a list for every entry
         Set<COSObjectKey> missingObjStreams = new HashSet<COSObjectKey>();
         for (Long offset : xrefOffset.values())
         {
             if (offset != null && offset < 0)
             {
                 missingObjStreams.add(new COSObjectKey(-offset, 0));
             }
         }
         // remove all found object streams
         missingObjStreams.removeAll(bfSearchCOSObjectKeyOffsets.keySet());
         // remove all objects which are part of an object stream which wasn't found
         for (COSObjectKey key : missingObjStreams)
         {
             Set<Long> objects = xrefTrailerResolver.getContainedObjectNumbers((int)(key.getNumber()));
             for (Long objNr : objects)
             {
                 xrefOffset.remove(new COSObjectKey(objNr, 0));
             }
         }
         LOG.debug("Replaced read xref table with the results of a brute force search");
         xrefOffset.putAll(bfSearchCOSObjectKeyOffsets);
     }

     /**
      * Check if the given object can be found at the given offset, i.e. if the object header
      * built from the object number and the generation number starts at that offset. The position
      * of the source is restored afterwards.
      *
      * @param objectKey the object we are looking for
      * @param offset the offset where to look
      * @return returns true if the given object can be dereferenced at the given offset
      * @throws IOException if something went wrong
      */
     private boolean checkObjectKeys(COSObjectKey objectKey, long offset) throws IOException
     {
         // there can't be any object at the very beginning of a pdf
         if (offset < MINIMUM_SEARCH_OFFSET)
         {
             return false;
         }
         final String expectedHeader = createObjectString(objectKey.getNumber(),
                 objectKey.getGeneration());
         final long savedPosition = source.getPosition();
         source.seek(offset);
         try
         {
             if (!isString(expectedHeader.getBytes(ISO_8859_1)))
             {
                 // no valid object number found
                 return false;
             }
             // everything is ok, the object starts at the given offset
             source.seek(savedPosition);
             return true;
         }
         catch (IOException exception)
         {
             // Swallow the exception, obviously there isn't any valid object number
             return false;
         }
         finally
         {
             // reestablish the origin position in any case
             source.seek(savedPosition);
         }
     }
     /**
      * Create the header string of an object, i.e. the object id followed by the generation id and
      * the keyword "obj", each separated by a single space.
      *
      * @param objectID the object id
      * @param genID the generation id
      * @return the generated string
      */
     private String createObjectString(long objectID, int genID)
     {
         final StringBuilder header = new StringBuilder();
         header.append(objectID).append(' ').append(genID).append(" obj");
         return header.toString();
     }

     /**
      * Brute force search for every object in the pdf.
      *
      * The source is scanned for the string " obj" starting at MINIMUM_SEARCH_OFFSET. For every
      * hit the characters in front of it are inspected backwards to find the generation number
      * and the object number. The offset of the first digit of the object number is stored in
      * bfSearchCOSObjectKeyOffsets. The search is done only once, the results are kept.
      *
      * @throws IOException if something went wrong
      */
     private void bfSearchForObjects() throws IOException
     {
         // a non-null map indicates that the search was already done
         if (bfSearchCOSObjectKeyOffsets == null)
         {
             bfSearchCOSObjectKeyOffsets = new HashMap<COSObjectKey, Long>();
             long originOffset = source.getPosition();
             long currentOffset = MINIMUM_SEARCH_OFFSET;
             String objString = " obj";
             char[] string = objString.toCharArray();
             do
             {
                 source.seek(currentOffset);
                 if (isString(string))
                 {
                     // the character right in front of " obj" has to be the generation number
                     long tempOffset = currentOffset - 1;
                     source.seek(tempOffset);
                     int genID = source.peek();
                     // is the next char a digit?
                     // NOTE(review): only one character is inspected and a space is required in
                     // front of it, so generation numbers with more than one digit don't seem to
                     // be recognised - confirm
                     if (isDigit(genID))
                     {
                         // convert the ASCII code of the digit to its numeric value ('0' is 48)
                         genID -= 48;
                         tempOffset--;
                         source.seek(tempOffset);
                         if (isSpace())
                         {
                             // skip all spaces between the object number and the generation number
                             while (tempOffset > MINIMUM_SEARCH_OFFSET && isSpace())
                             {
                                 source.seek(--tempOffset);
                             }
                             // walk backwards over the digits of the object number and count them
                             int length = 0;
                             while (tempOffset > MINIMUM_SEARCH_OFFSET && isDigit())
                             {
                                 source.seek(--tempOffset);
                                 length++;
                             }
                             if (length > 0)
                             {
                                 // skip the character in front of the object number and read the digits
                                 source.read();
                                 byte[] objIDBytes = source.readFully(length);
                                 String objIdString = new String(objIDBytes, 0,
                                         objIDBytes.length, ISO_8859_1);
                                 Long objectID;
                                 try
                                 {
                                     objectID = Long.valueOf(objIdString);
                                 }
                                 catch (NumberFormatException exception)
                                 {
                                     // not a valid number, ignore this hit
                                     objectID = null;
                                 }
                                 if (objectID != null)
                                 {
                                     // tempOffset points to the character in front of the object number
                                     bfSearchCOSObjectKeyOffsets.put(new COSObjectKey(objectID, genID), tempOffset+1);
                                 }
                             }
                         }
                     }
                 }
                 currentOffset++;
             }
             // NOTE(review): isEOF() is evaluated at the position the source has after the checks
             // above, which isn't necessarily currentOffset - confirm
             while (!source.isEOF());
             // reestablish origin position
             source.seek(originOffset);
         }
     }

     /**
      * Search for the offset of the given xref table/stream among those found by a brute force
      * search. The offset nearest to the given one is chosen and removed from the list of brute
      * force results it was taken from. If a table and a stream are equally near, the table is
      * preferred.
      *
      * @param xrefOffset the offset of the xref table/stream we are looking for
      * @param streamsOnly search for xref streams only
      * @return the offset of the xref entry or -1 if nothing was found
      * @throws IOException if something went wrong
      */
     private long bfSearchForXRef(long xrefOffset, boolean streamsOnly) throws IOException
     {
         if (!streamsOnly)
         {
             bfSearchForXRefTables();
         }
         bfSearchForXRefStreams();

         long nearestTable = -1;
         if (!streamsOnly && bfSearchXRefTablesOffsets != null)
         {
             // TODO to be optimized, this won't work in every case
             nearestTable = searchNearestValue(bfSearchXRefTablesOffsets, xrefOffset);
         }
         long nearestStream = -1;
         if (bfSearchXRefStreamsOffsets != null)
         {
             // TODO to be optimized, this won't work in every case
             nearestStream = searchNearestValue(bfSearchXRefStreamsOffsets, xrefOffset);
         }

         final boolean tableFound = nearestTable > -1;
         final boolean streamFound = nearestStream > -1;
         // choose the nearest value; the stream wins only if it is strictly nearer than the table
         final boolean preferStream = streamFound && (!tableFound
                 || Math.abs(xrefOffset - nearestTable) > Math.abs(xrefOffset - nearestStream));
         if (preferStream)
         {
             bfSearchXRefStreamsOffsets.remove(nearestStream);
             return nearestStream;
         }
         if (tableFound)
         {
             bfSearchXRefTablesOffsets.remove(nearestTable);
             return nearestTable;
         }
         return -1;
     }

     /**
      * Search the given list for the value which is nearest to the given offset. If several values
      * have the same distance to the offset, the first one in list order is chosen.
      *
      * @param values the candidate values
      * @param offset the offset we are looking for
      * @return the nearest value or -1 if the list is empty
      */
     private long searchNearestValue(List<Long> values, long offset)
     {
         long nearestValue = -1;
         long nearestDistance = -1;
         // a separate flag marks the "nothing chosen yet" state; the difference itself can't be
         // used for that, as -1 is a legal difference between the offset and a candidate
         boolean found = false;
         for (long candidate : values)
         {
             long distance = Math.abs(offset - candidate);
             if (!found || distance < nearestDistance)
             {
                 found = true;
                 nearestDistance = distance;
                 nearestValue = candidate;
             }
         }
         return nearestValue;
     }
     /**
      * Brute force search for all xref entries (tables). The offsets of all occurrences of the
      * xref keyword which are preceded by a whitespace are collected in
      * bfSearchXRefTablesOffsets. The search is done only once, the position of the source is
      * restored afterwards.
      *
      * @throws IOException if something went wrong
      */
     private void bfSearchForXRefTables() throws IOException
     {
         if (bfSearchXRefTablesOffsets != null)
         {
             // the search was already done
             return;
         }
         // a pdf may contain more than one xref entry
         bfSearchXRefTablesOffsets = new Vector<Long>();
         final long savedPosition = source.getPosition();
         source.seek(MINIMUM_SEARCH_OFFSET);
         // search for xref tables
         while (!source.isEOF())
         {
             if (isString(XREF_TABLE))
             {
                 final long keywordOffset = source.getPosition();
                 // have a look at the character in front of the keyword
                 source.seek(keywordOffset - 1);
                 // ensure that we don't read "startxref" instead of "xref"
                 if (isWhitespace())
                 {
                     bfSearchXRefTablesOffsets.add(keywordOffset);
                 }
                 // continue behind the keyword
                 source.seek(keywordOffset + 4);
             }
             source.read();
         }
         source.seek(savedPosition);
     }

     /**
      * Brute force search for all /XRef entries (streams).
      *
      * The search only runs while bfSearchXRefStreamsOffsets is still null. It scans the source
      * from MINIMUM_SEARCH_OFFSET to EOF for the XREF_STREAM marker. For every hit it looks
      * backwards (in 10 byte windows, at most 29 of them) for the " obj" keyword and then walks
      * further back over a single digit, a space and a run of digits to find the position where
      * the object header starts. Every position found that way is added to
      * bfSearchXRefStreamsOffsets. The position of the source is restored before returning.
      *
      * @throws IOException if something went wrong
      */
     private void bfSearchForXRefStreams() throws IOException
     {
         // the search is done only once, a non-null list means it has already been performed
         if (bfSearchXRefStreamsOffsets == null)
         {
             // a pdf may contain more than one /XRef entry
             bfSearchXRefStreamsOffsets = new Vector<Long>();
             // remember where we are so that the position can be restored at the end
             long originOffset = source.getPosition();
             source.seek(MINIMUM_SEARCH_OFFSET);
             // search for XRef streams
             String objString = " obj";
             char[] string = objString.toCharArray();
             while (!source.isEOF())
             {
                 if (isString(XREF_STREAM))
                 {
                     // search backwards for the beginning of the stream
                     // newOffset stays at -1 if no valid object header can be found
                     long newOffset = -1;
                     long xrefOffset = source.getPosition();
                     boolean objFound = false;
                     // step back in 10 byte windows, i.e. at most 290 bytes before the marker
                     for (int i = 1; i < 30 && !objFound; i++)
                     {
                         long currentOffset = xrefOffset - (i * 10);
                         if (currentOffset > 0)
                         {
                             source.seek(currentOffset);
                             // test each of the 10 positions of the current window for " obj"
                             for (int j = 0; j < 10; j++)
                             {
                                 if (isString(string))
                                 {
                                     // look at the byte right in front of " obj"
                                     long tempOffset = currentOffset - 1;
                                     source.seek(tempOffset);
                                     int genID = source.peek();
                                     // is the next char a digit?
                                     // only a single digit is taken into account here
                                     if (isDigit(genID))
                                     {
                                         // ASCII digit to numeric value, the result isn't used any further in this method
                                         genID -= 48;
                                         tempOffset--;
                                         source.seek(tempOffset);
                                         // the digit has to be preceded by a space
                                         // NOTE(review): isSpace()/isDigit() presumably test the byte at the
                                         // current position without consuming it - confirm in the base parser
                                         if (isSpace())
                                         {
                                             int length = 0;
                                             source.seek(--tempOffset);
                                             // walk backwards over the digits in front of the space,
                                             // but never beyond MINIMUM_SEARCH_OFFSET
                                             while (tempOffset > MINIMUM_SEARCH_OFFSET && isDigit())
                                             {
                                                 source.seek(--tempOffset);
                                                 length++;
                                             }
                                             if (length > 0)
                                             {
                                                 // the source is one byte in front of the counted digits,
                                                 // move forward onto the first of them
                                                 source.read();
                                                 newOffset = source.getPosition();
                                             }
                                         }
                                     }
                                     LOG.debug("Fixed reference for xref stream " + xrefOffset
                                             + " -> " + newOffset);
                                     // " obj" was found, stop searching even if no valid header was in front of it
                                     objFound = true;
                                     break;
                                 }
                                 else
                                 {
                                     // no match, advance to the next position within the window
                                     currentOffset++;
                                     source.read();
                                 }
                             }
                         }
                     }
                     if (newOffset > -1)
                     {
                         bfSearchXRefStreamsOffsets.add(newOffset);
                     }
                     // continue the scan behind the marker
                     // NOTE(review): 5 presumably is the length of XREF_STREAM - confirm
                     source.seek(xrefOffset + 5);
                 }
                 source.read();
             }
             // restore the initial position
             source.seek(originOffset);
         }
     }

     /**
      * Rebuild the trailer dictionary if startxref can't be found.
      *
      * All objects found by the brute force search are registered as entries of a single
      * cross reference table. Afterwards every object is parsed in order to detect the
      * document catalog (stored as /Root) and the info dictionary (stored as /Info).
      * Objects which can't be read, either because the header or the dictionary is corrupt,
      * are skipped.
      *
      * @return the rebuilt trailer dictionary or null if bfSearchCOSObjectKeyOffsets is still
      * null after the brute force search
      *
      * @throws IOException if something went wrong
      */
     protected final COSDictionary rebuildTrailer() throws IOException
     {
         COSDictionary trailer = null;
         bfSearchForObjects();
         if (bfSearchCOSObjectKeyOffsets != null)
         {
             // create one artificial xref table containing all objects found
             xrefTrailerResolver.nextXrefObj( 0, XRefType.TABLE );
             for (Entry<COSObjectKey, Long> entry : bfSearchCOSObjectKeyOffsets.entrySet())
             {
                 xrefTrailerResolver.setXRef(entry.getKey(), entry.getValue());
             }
             xrefTrailerResolver.setStartxref(0);
             trailer = xrefTrailerResolver.getTrailer();
             getDocument().setTrailer(trailer);
             // search for the different parts of the trailer dictionary
             for (Entry<COSObjectKey, Long> entry : bfSearchCOSObjectKeyOffsets.entrySet())
             {
                 source.seek(entry.getValue());
                 try
                 {
                     // the header is read within the try block as well, a single corrupt
                     // object must not abort the rebuild of the whole trailer
                     readObjectNumber();
                     readGenerationNumber();
                     readExpectedString(OBJ_MARKER, true);
                     COSDictionary dictionary = parseCOSDictionary();
                     if (dictionary != null)
                     {
                         // document catalog
                         if (COSName.CATALOG.equals(dictionary.getCOSName(COSName.TYPE)))
                         {
                             trailer.setItem(COSName.ROOT, document.getObjectFromPool(entry.getKey()));
                         }
                         // info dictionary, identified by a modification date and at least
                         // one of the other typical entries
                         else if (dictionary.containsKey(COSName.MOD_DATE) &&
                                 (dictionary.containsKey(COSName.TITLE)
                                 || dictionary.containsKey(COSName.AUTHOR)
                                 || dictionary.containsKey(COSName.SUBJECT)
                                 || dictionary.containsKey(COSName.KEYWORDS)
                                 || dictionary.containsKey(COSName.CREATOR)
                                 || dictionary.containsKey(COSName.PRODUCER)
                                 || dictionary.containsKey(COSName.CREATION_DATE)))
                         {
                             trailer.setItem(COSName.INFO, document.getObjectFromPool(entry.getKey()));
                         }
                         // TODO encryption dictionary
                     }
                 }
                 catch(IOException exception)
                 {
                     LOG.debug("Skipped object " + entry.getKey() + ", either it's corrupt or not a dictionary");
                 }
             }
         }
         return trailer;
     }

     /**
      * This will parse the startxref section from the stream, provided that the
      * STARTXREF marker is located at the current position.
      *
      * @return the startxref value or -1 if the marker isn't in place
      * @throws IOException If an IO error occurs.
      */
     private long parseStartXref() throws IOException
     {
         if (!isString(STARTXREF))
         {
             // not a startxref section
             return -1;
         }
         // consume the keyword and the whitespace following it
         readString();
         skipSpaces();
         // This integer is the byte offset of the first object referenced by the xref or xref stream
         return readLong();
     }

     /**
      * Checks if the given sequence of bytes can be found at the current offset.
      * All bytes read for the comparison are pushed back afterwards.
      *
      * @param string the bytes of the string to look for
      * @return true if the bytes are in place, false if not
      * @throws IOException if something went wrong
      */
     private boolean isString(byte[] string) throws IOException
     {
         // cheap pre-check, nothing is read if the first byte already differs
         if (source.peek() != string[0])
         {
             return false;
         }
         final int expected = string.length;
         final byte[] buffer = new byte[expected];
         int total = source.read(buffer, 0, expected);
         // a single read may deliver less than requested, keep reading until the
         // buffer is filled or the end of the source is reached
         while (total < expected)
         {
             int chunk = source.read(buffer, total, expected - total);
             if (chunk < 0)
             {
                 break;
             }
             total += chunk;
         }
         boolean matches = Arrays.equals(string, buffer);
         // push back everything which was read
         source.rewind(total);
         return matches;
     }

     /**
      * Checks if the given string can be found at the current offset.
      * The position of the source is the same before and after the call.
      *
      * @param string the characters of the string to look for
      * @return true if the characters are in place, false if not
      * @throws IOException if something went wrong
      */
     private boolean isString(char[] string) throws IOException
     {
         boolean bytesMatching = true;
         long originOffset = source.getPosition();
         for (char c : string)
         {
             if (source.read() != c)
             {
                 bytesMatching = false;
                 // the first mismatch decides the result, there is no need to read the
                 // remaining characters as the position is restored anyway
                 break;
             }
         }
         source.seek(originOffset);
         return bytesMatching;
     }

     /**
      * This will parse the trailer from the stream and hand the trailer dictionary over
      * to the xref/trailer resolver.
      *
      * @return false if there is no trailer at the current position
      * @throws IOException If an IO error occurs.
      */
     private boolean parseTrailer() throws IOException
     {
         if (source.peek() != 't')
         {
             return false;
         }
         // remember the start of the keyword in case we have to jump back
         long keywordOffset = source.getPosition();
         String line = readLine();
         if (!"trailer".equals(line.trim()))
         {
             if (!line.startsWith("trailer"))
             {
                 return false;
             }
             // The EOL is missing and the trailer immediately continues with "<<" or with a
             // blank character. This does not comply with the PDF reference, but Acrobat reader
             // can deal with it and we want to support as many PDFs as possible.
             // The data read can't simply be unread as it is unknown whether the EOL consists
             // of 1 or 2 bytes, so jump back right after "trailer" instead.
             source.seek(keywordOffset + "trailer".length());
         }

         // skip the whitespace in front of the dictionary, this covers the " <<" case as well
         skipSpaces();

         xrefTrailerResolver.setTrailer( parseCOSDictionary() );

         skipSpaces();
         return true;
     }

     /**
      * Parse the header of a pdf.
      *
      * The work is delegated to parseHeader using PDF_HEADER as marker and
      * PDF_DEFAULT_VERSION as fallback if the header doesn't contain a version number.
      * The version found is stored in the document and the source is positioned at
      * offset 0 afterwards.
      *
      * @return true if a PDF header was found
      * @throws IOException if something went wrong, e.g. if the version can't be parsed
      */
     protected boolean parsePDFHeader() throws IOException
     {
         return parseHeader(PDF_HEADER, PDF_DEFAULT_VERSION);
     }

     /**
      * Parse the header of a fdf.
      *
      * The work is delegated to parseHeader using FDF_HEADER as marker and
      * FDF_DEFAULT_VERSION as fallback if the header doesn't contain a version number.
      * The version found is stored in the document and the source is positioned at
      * offset 0 afterwards.
      *
      * @return true if a FDF header was found
      * @throws IOException if something went wrong, e.g. if the version can't be parsed
      */
     protected boolean parseFDFHeader() throws IOException
     {
         return parseHeader(FDF_HEADER, FDF_DEFAULT_VERSION);
     }

     /**
      * Searches for a header line containing the given marker, extracts the version
      * number and stores it in the document. The source is positioned at offset 0 afterwards.
      *
      * @param headerMarker the marker which introduces the header, including the "-" in front
      * of the version number
      * @param defaultVersion the version to be used if the header doesn't contain a version number
      * @return true if a header was found, false if a line starting with a digit shows up
      * before any line containing the marker
      * @throws IOException if the version can't be parsed or if something went wrong when reading
      */
     private boolean parseHeader(String headerMarker, String defaultVersion) throws IOException
     {
         // read first line
         String header = readLine();
         // some pdf-documents are broken and the pdf-version is in one of the following lines
         if (!header.contains(headerMarker))
         {
             header = readLine();
             while (!header.contains(headerMarker))
             {
                 // if a line starts with a digit, it has to be the first one with data in it
                 if ((header.length() > 0) && (Character.isDigit(header.charAt(0))))
                 {
                     break;
                 }
                 header = readLine();
             }
         }

         // nothing found
         if (!header.contains(headerMarker))
         {
             source.seek(0);
             return false;
         }

         //sometimes there is some garbage in the header before the header
         //actually starts, so lets try to find the header first.
         int headerStart = header.indexOf( headerMarker );

         // greater than zero because if it is zero then there is no point of trimming
         if ( headerStart > 0 )
         {
             //trim off any leading characters
             header = header.substring( headerStart, header.length() );
         }

         // This is used if there is garbage after the header on the same line.
         // The dot has to be escaped, otherwise any character would be accepted as
         // separator between the major and the minor version number.
         if (header.startsWith(headerMarker) && !header.matches(headerMarker + "\\d\\.\\d"))
         {
             if (header.length() < headerMarker.length() + 3)
             {
                 // No version number at all, use the given default
                 header = headerMarker + defaultVersion;
                 LOG.debug("No version found, set to " + defaultVersion + " as default.");
             }
             else
             {
                 // cut off everything behind the 3 characters of the version number and
                 // push it back together with one byte for the EOL
                 String headerGarbage = header.substring(headerMarker.length() + 3, header.length()) + "\n";
                 header = header.substring(0, headerMarker.length() + 3);
                 source.rewind(headerGarbage.getBytes(ISO_8859_1).length);
             }
         }
         // a negative value indicates that the version couldn't be determined
         float headerVersion = -1;
         try
         {
             // the version is expected right behind the one and only "-"
             String[] headerParts = header.split("-");
             if (headerParts.length == 2)
             {
                 headerVersion = Float.parseFloat(headerParts[1]);
             }
         }
         catch (NumberFormatException exception)
         {
             LOG.debug("Can't parse the header version.", exception);
         }
         if (headerVersion < 0)
         {
             throw new IOException( "Error getting header version: " + header);
         }
         document.setVersion(headerVersion);
         // rewind
         source.seek(0);
         return true;
     }

     /**
      * This will parse the xref table from the stream and add it to the state.
      * Entries marked as in use ("n") are handed over to the xref/trailer resolver,
      * free entries ("f") are skipped.
      *
      * @param startByteOffset the offset to start at
      * @return false on parsing error
      * @throws IOException If an IO error occurs.
      */
     protected boolean parseXrefTable(long startByteOffset) throws IOException
     {
         if (source.peek() != 'x')
         {
             return false;
         }
         String keyword = readString();
         if (!"xref".equals(keyword.trim()))
         {
             return false;
         }

         // check for trailer after xref: take a look at the next token and push it back
         String lookahead = readString();
         source.rewind(lookahead.getBytes(ISO_8859_1).length);

         // signal start of new XRef
         xrefTrailerResolver.nextXrefObj( startByteOffset, XRefType.TABLE );

         if (lookahead.startsWith("trailer"))
         {
             LOG.warn("skipping empty xref table");
             return false;
         }

         // Xref tables can have multiple sections. Each starts with a starting object id and a count.
         do
         {
             // first obj id of the section
             long objectNumber = readObjectNumber();

             // the number of objects in the section
             long entryCount = readLong();

             skipSpaces();
             for (int index = 0; index < entryCount; index++)
             {
                 // stop at the end of the data or as soon as the trailer keyword might start
                 if (source.isEOF() || isEndOfName((char) source.peek()) || source.peek() == 't')
                 {
                     break;
                 }
                 String line = readLine();
                 String[] tokens = line.split("\\s");
                 if (tokens.length < 3)
                 {
                     LOG.warn("invalid xref line: " + line);
                     break;
                 }
                 /* The type is taken from the last token, this supports the
                  * corrupt table as reported in PDFBOX-474 (XXXX XXX XX n) */
                 String lastToken = tokens[tokens.length - 1];
                 if ("n".equals(lastToken))
                 {
                     try
                     {
                         long objectOffset = Long.parseLong(tokens[0]);
                         int generation = Integer.parseInt(tokens[1]);
                         COSObjectKey key = new COSObjectKey(objectNumber, generation);
                         xrefTrailerResolver.setXRef(key, objectOffset);
                     }
                     catch (NumberFormatException e)
                     {
                         throw new IOException(e);
                     }
                 }
                 else if (!"f".equals(tokens[2]))
                 {
                     throw new IOException("Corrupt XRefTable Entry - ObjID:" + objectNumber);
                 }
                 objectNumber++;
                 skipSpaces();
             }
             skipSpaces();
             // a digit indicates the start of another section
         }
         while (isDigit());
         return true;
     }

     /**
      * Hands the content of the given cross reference stream over to the XRefTrailerResolver.
      * Stream must be of type XRef.
      *
      * @param stream the stream to be read
      * @param objByteOffset the offset to start at
      * @param isStandalone should be set to true if the stream is not part of a hybrid xref table
      * @throws IOException if there is an error parsing the stream
      */
     private void parseXrefStream(COSStream stream, long objByteOffset, boolean isStandalone) throws IOException
     {
         if (isStandalone)
         {
             // Only a standalone stream starts a new xref section and provides the trailer.
             // The stream of a hybrid xref table is added to the existing section, its
             // offset and trailer must not be overridden.
             xrefTrailerResolver.nextXrefObj( objByteOffset, XRefType.STREAM );
             xrefTrailerResolver.setTrailer( stream );
         }
         new PDFXrefStreamParser( stream, document, xrefTrailerResolver ).parse();
     }

     /**
      * This will get the document that was parsed.  parse() must be called before this is called.
      * When you are done with this document you must call close() on it to release
      * resources.
      *
      * @return The document that was parsed.
      *
      * @throws IOException If there is no document yet.
      */
     public COSDocument getDocument() throws IOException
     {
         if( document != null )
         {
             return document;
         }
         throw new IOException( "You must call parse() before calling getDocument()" );
     }

     /**
      * Parse the values of the trailer dictionary and return the root object.
      *
      * @param trailer The trailer dictionary.
      * @return The parsed root object.
      * @throws IOException If an IO error occurs or if the root object is
      * missing in the trailer dictionary or isn't an indirect object.
      */
     protected COSBase parseTrailerValuesDynamically(COSDictionary trailer) throws IOException
     {
         // PDFBOX-1557 - ensure that all COSObject are loaded in the trailer
         // PDFBOX-1606 - after securityHandler has been instantiated
         for (COSBase trailerEntry : trailer.getValues())
         {
             if (trailerEntry instanceof COSObject)
             {
                 COSObject tmpObj = (COSObject) trailerEntry;
                 parseObjectDynamically(tmpObj, false);
             }
         }
         // parse catalog or root object
         COSBase rootEntry = trailer.getItem(COSName.ROOT);
         if (rootEntry == null)
         {
             throw new IOException("Missing root object specification in trailer.");
         }
         // a malformed file may provide the root as direct object, report that as
         // parsing error instead of running into a ClassCastException
         if (!(rootEntry instanceof COSObject))
         {
             throw new IOException("Expected an indirect root object in trailer, but got: " + rootEntry);
         }
         return parseObjectDynamically((COSObject) rootEntry, false);
     }

 }