pdfbox/src/main/java/org/apache/pdfbox/pdfparser/VisualSignatureParser.java - pdfbox - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.pdfbox.pdfparser;

 import java.io.IOException;
 import java.io.InputStream;
 import java.util.regex.Pattern;

 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.pdfbox.cos.COSBase;
 import org.apache.pdfbox.cos.COSDictionary;
 import org.apache.pdfbox.cos.COSDocument;
 import org.apache.pdfbox.cos.COSObject;
 import org.apache.pdfbox.pdfwriter.COSWriter;
 import org.apache.pdfbox.persistence.util.COSObjectKey;

 /**
  * Parser that reads the objects of a PDF byte stream sequentially into a new
  * {@link COSDocument}. Every successfully parsed object is flagged as needing an
  * update and stored in the document's object pool. Parsing stops at the first
  * xref table, trailer or verified <code>%%EOF</code> marker; the contents of the
  * xref table are not evaluated.
  */
 public class VisualSignatureParser extends BaseParser
 {

     /**
      * Log instance.
      */
     private static final Log LOG = LogFactory.getLog(VisualSignatureParser.class);

     /**
      * Matches a chunk of input that begins with an indirect object header,
      * i.e. "&lt;object number&gt; &lt;generation number&gt; obj". Compiled once
      * because the expression never changes.
      */
     private static final Pattern OBJECT_HEADER = Pattern.compile("\\d+\\s+\\d+\\s+obj.*", Pattern.DOTALL);

     /**
      * Constructor.
      *
      * @param input the inputstream to be read.
      *
      * @throws IOException If something went wrong
      */
     public VisualSignatureParser(InputStream input) throws IOException
     {
         super(input);
     }

     /**
      * Parses the input into a freshly created {@link COSDocument}.
      *
      * Objects are read one after another until an end-of-file marker (or an
      * xref/trailer section) is found or the input is exhausted. An object that
      * fails to parse is logged and skipped.
      *
      * @throws IOException If the input cannot be read or resynchronized and the
      * last parsed object was not the end-of-file marker.
      */
     public void parse() throws IOException
     {
         document = new COSDocument();
         skipToNextObj();

         boolean wasLastParsedObjectEOF = false;
         try
         {
             while(!wasLastParsedObjectEOF && !pdfSource.isEOF())
             {
                 try
                 {
                     wasLastParsedObjectEOF = parseObject();
                 }
                 catch(IOException e)
                 {
                     /*
                      * A broken object must not abort the whole parse: log it and
                      * resynchronize on the next recognizable keyword.
                      */
                     LOG.warn("Parsing Error, Skipping Object", e);
                     skipToNextObj();
                 }
                 skipSpaces();
             }
         }
         catch(IOException e)
         {
             /*
              * PDF files may have random data after the EOF marker. Ignore errors if
              * last object processed is EOF.
              */
             if(!wasLastParsedObjectEOF)
             {
                 throw e;
             }
         }
     }

     /**
      * Advances the input until it is positioned at the start of a known keyword
      * ("trailer", "xref", "startxref", "stream") or of an object header, or until
      * the input is exhausted.
      *
      * @throws IOException If reading from or pushing back to the input fails.
      */
     private void skipToNextObj() throws IOException
     {
         byte[] b = new byte[16];
         /* Read a buffer of data each time to see if it starts with a
          * known keyword. This is not the most efficient design, but we should
          * rarely be needing this function. We could update this to use the
          * circular buffer, like in readUntilEndStream().
          */
         while(!pdfSource.isEOF())
         {
             int l = pdfSource.read(b);
             if(l < 1)
             {
                 break;
             }
             // Only the first l bytes were read in this pass; anything beyond them is
             // left over from an earlier iteration and must be neither inspected nor
             // pushed back.
             String s = new String(b, 0, l, "US-ASCII");
             if(s.startsWith("trailer")
                     || s.startsWith("xref")
                     || s.startsWith("startxref")
                     || s.startsWith("stream")
                     || OBJECT_HEADER.matcher(s).matches())
             {
                 // found a synchronization point: restore exactly what was consumed
                 pdfSource.unread(b, 0, l);
                 break;
             }
             else
             {
                 // no match: drop the first byte and retry one position further on
                 pdfSource.unread(b, 1, l - 1);
             }
         }
     }

     /**
      * Parses the next top-level element of the input.
      *
      * @return <code>true</code> if an xref table, a trailer or a startxref section
      * followed by <code>%%EOF</code> was encountered, <code>false</code> if a regular
      * object was parsed or the input was already exhausted.
      *
      * @throws IOException If the element is malformed or the input cannot be read.
      */
     private boolean parseObject() throws IOException
     {
         boolean isEndOfFile = false;
         skipSpaces();
         //peek at the next character to determine the type of object we are parsing
         char peekedChar = (char) pdfSource.peek();

         //ignore endobj and endstream sections.
         while(peekedChar == 'e')
         {
             //there are times when there are multiple endobj, so lets
             //just read them and move on.
             readString();
             skipSpaces();
             peekedChar = (char) pdfSource.peek();
         }
         if(pdfSource.isEOF())
         {
             // end of file we will return a false and call it a day.
         }
         else if(peekedChar == 'x' || peekedChar == 't')
         {
             //xref table or trailer. Note: The contents of the Xref table are currently ignored
             return true;
         }
         else if(peekedChar == 's')
         {
             // Note: startxref can occur in either a trailer section or by itself
             skipToNextObj();
             //verify that EOF exists
             String eof = readExpectedString("%%EOF");
             if(eof.indexOf("%%EOF") == -1 && !pdfSource.isEOF())
             {
                 throw new IOException("expected='%%EOF' actual='" + eof + "' next=" + readString()
                         + " next=" + readString());
             }
             isEndOfFile = true;
         }
         else
         {
             //we are going to parse an normal object
             long number = -1;
             int genNum = -1;
             boolean missingObjectNumber = false;
             try
             {
                 char peeked = (char) pdfSource.peek();
                 if(peeked == '<')
                 {
                     // a dictionary without "<num> <gen> obj" header; it is stored
                     // under the key (-1, -1)
                     missingObjectNumber = true;
                 }
                 else
                 {
                     number = readObjectNumber();
                 }
             }
             catch(IOException e)
             {
                 //ok for some reason "GNU Ghostscript 5.10" puts two endobj
                 //statements after an object, of course this is nonsense
                 //but because we want to support as many PDFs as possible
                 //we will simply try again
                 number = readObjectNumber();
             }
             if(!missingObjectNumber)
             {
                 skipSpaces();
                 genNum = readGenerationNumber();

                 String objectKey = readString(3);
                 if(!objectKey.equals("obj"))
                 {
                     throw new IOException("expected='obj' actual='" + objectKey + "' " + pdfSource);
                 }
             }

             skipSpaces();
             COSBase pb = parseDirObject();
             String endObjectKey = readString();

             // NOTE(review): the explicit ISO-8859-1 encoding used for push-back below
             // assumes readString() maps every input byte to exactly one char - confirm
             // against BaseParser. It removes the dependency on the platform charset.
             if(endObjectKey.equals("stream"))
             {
                 // put the keyword back so that parseCOSStream() can consume it itself
                 pdfSource.unread(endObjectKey.getBytes("ISO-8859-1"));
                 pdfSource.unread(' ');
                 if(pb instanceof COSDictionary)
                 {
                     pb = parseCOSStream((COSDictionary) pb, getDocument().getScratchFile());
                 }
                 else
                 {
                     // this is not legal
                     // the combination of a dict and the stream/endstream forms a complete stream object
                     throw new IOException("stream not preceded by dictionary");
                 }
                 endObjectKey = readString();
             }

             COSObjectKey key = new COSObjectKey(number, genNum);
             COSObject pdfObject = document.getObjectFromPool(key);
             pb.setNeedToBeUpdate(true);
             pdfObject.setObject(pb);

             if(!endObjectKey.equals("endobj"))
             {
                 if(endObjectKey.startsWith("endobj"))
                 {
                     /*
                      * Some PDF files don't contain a new line after endobj so we
                      * need to make sure that the next object number is getting read separately
                      * and not part of the endobj keyword. Ex. Some files would have "endobj28"
                      * instead of "endobj"
                      */
                     pdfSource.unread(endObjectKey.substring(6).getBytes("ISO-8859-1"));
                 }
                 else if(!pdfSource.isEOF())
                 {
                     try
                     {
                         //It is possible that the endobj  is missing, there
                         //are several PDFs out there that do that so skip it and move on.
                         //parseFloat is only used as a "is this a number" check here.
                         Float.parseFloat(endObjectKey);
                         pdfSource.unread(COSWriter.SPACE);
                         pdfSource.unread(endObjectKey.getBytes("ISO-8859-1"));
                     }
                     catch(NumberFormatException e)
                     {
                         //we will try again incase there was some garbage which
                         //some writers will leave behind.
                         String secondEndObjectKey = readString();
                         if(!secondEndObjectKey.equals("endobj"))
                         {
                             if(isClosing())
                             {
                                 //found a case with 17506.pdf object 41 that was like this
                                 //41 0 obj [/Pattern /DeviceGray] ] endobj
                                 //notice the second array close, here we are reading it
                                 //and ignoring and attempting to continue
                                 pdfSource.read();
                             }
                             skipSpaces();
                             String thirdPossibleEndObj = readString();
                             if(!thirdPossibleEndObj.equals("endobj"))
                             {
                                 throw new IOException("expected='endobj' firstReadAttempt='" + endObjectKey + "' "
                                         + "secondReadAttempt='" + secondEndObjectKey + "' " + pdfSource);
                             }
                         }
                     }
                 }
             }
             skipSpaces();
         }
         return isEndOfFile;
     }

     /**
      * Returns the underlying COSDocument.
      *
      * @return the COSDocument
      *
      * @throws IOException If {@link #parse()} has not been called yet.
      */
     public COSDocument getDocument() throws IOException
     {
         if(document == null)
         {
             throw new IOException("You must call parse() before calling getDocument()");
         }
         return document;
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.pdfbox.pdfparser;

	import java.io.IOException;
	import java.io.InputStream;
	import java.util.regex.Pattern;

	import org.apache.commons.logging.Log;
	import org.apache.commons.logging.LogFactory;
	import org.apache.pdfbox.cos.COSBase;
	import org.apache.pdfbox.cos.COSDictionary;
	import org.apache.pdfbox.cos.COSDocument;
	import org.apache.pdfbox.cos.COSObject;
	import org.apache.pdfbox.pdfwriter.COSWriter;
	import org.apache.pdfbox.persistence.util.COSObjectKey;

	public class VisualSignatureParser extends BaseParser
	{

	/**
	* Log instance.
	*/
	private static final Log LOG = LogFactory.getLog(PDFParser.class);

	/**
	* Constructor.
	*
	* @param input the inputstream to be read.
	*
	* @throws IOException If something went wrong
	*/
	public VisualSignatureParser(InputStream input) throws IOException
	{
	super(input);
	}

	/**
	* {@inheritDoc}
	*/
	public void parse() throws IOException
	{
	document = new COSDocument();
	skipToNextObj();

	boolean wasLastParsedObjectEOF = false;
	try
	{
	while(!wasLastParsedObjectEOF)
	{
	if(pdfSource.isEOF())
	{
	break;
	}
	try
	{
	wasLastParsedObjectEOF = parseObject();
	}
	catch(IOException e)
	{
	/*
	* Warning is sent to the PDFBox.log and to the Console that
	* we skipped over an object
	*/
	LOG.warn("Parsing Error, Skipping Object", e);
	skipToNextObj();
	}
	skipSpaces();
	}
	}
	catch(IOException e)
	{
	/*
	* PDF files may have random data after the EOF marker. Ignore errors if
	* last object processed is EOF.
	*/
	if(!wasLastParsedObjectEOF)
	{
	throw e;
	}
	}
	}

	private void skipToNextObj() throws IOException
	{
	byte[] b = new byte[16];
	Pattern p = Pattern.compile("\\d+\\s+\\d+\\s+obj.*", Pattern.DOTALL);
	/* Read a buffer of data each time to see if it starts with a
	* known keyword. This is not the most efficient design, but we should
	* rarely be needing this function. We could update this to use the
	* circular buffer, like in readUntilEndStream().
	*/
	while(!pdfSource.isEOF())
	{
	int l = pdfSource.read(b);
	if(l < 1)
	{
	break;
	}
	String s = new String(b, "US-ASCII");
	if(s.startsWith("trailer")
	\|\| s.startsWith("xref")
	\|\| s.startsWith("startxref")
	\|\| s.startsWith("stream")
	\|\| p.matcher(s).matches())
	{
	pdfSource.unread(b);
	break;
	}
	else
	{
	pdfSource.unread(b, 1, l - 1);
	}
	}
	}

	private boolean parseObject() throws IOException
	{
	boolean isEndOfFile = false;
	skipSpaces();
	//peek at the next character to determine the type of object we are parsing
	char peekedChar = (char) pdfSource.peek();

	//ignore endobj and endstream sections.
	while(peekedChar == 'e')
	{
	//there are times when there are multiple endobj, so lets
	//just read them and move on.
	readString();
	skipSpaces();
	peekedChar = (char) pdfSource.peek();
	}
	if(pdfSource.isEOF())
	{
	// end of file we will return a false and call it a day.
	}
	else if(peekedChar == 'x')
	{
	//xref table. Note: The contents of the Xref table are currently ignored
	return true;
	}
	else if(peekedChar == 't' \|\| peekedChar == 's')
	{
	// Note: startxref can occur in either a trailer section or by itself
	if(peekedChar == 't')
	{
	return true;
	}
	if(peekedChar == 's')
	{
	skipToNextObj();
	//verify that EOF exists
	String eof = readExpectedString("%%EOF");
	if(eof.indexOf("%%EOF") == -1 && !pdfSource.isEOF())
	{
	throw new IOException("expected='%%EOF' actual='" + eof + "' next=" + readString()
	+ " next=" + readString());
	}
	isEndOfFile = true;
	}
	}
	else
	{
	//we are going to parse an normal object
	long number = -1;
	int genNum = -1;
	String objectKey = null;
	boolean missingObjectNumber = false;
	try
	{
	char peeked = (char) pdfSource.peek();
	if(peeked == '<')
	{
	missingObjectNumber = true;
	}
	else
	{
	number = readObjectNumber();
	}
	}
	catch(IOException e)
	{
	//ok for some reason "GNU Ghostscript 5.10" puts two endobj
	//statements after an object, of course this is nonsense
	//but because we want to support as many PDFs as possible
	//we will simply try again
	number = readObjectNumber();
	}
	if(!missingObjectNumber)
	{
	skipSpaces();
	genNum = readGenerationNumber();

	objectKey = readString(3);
	//System.out.println( "parseObject() num=" + number +
	//" genNumber=" + genNum + " key='" + objectKey + "'" );
	if(!objectKey.equals("obj"))
	{
	throw new IOException("expected='obj' actual='" + objectKey + "' " + pdfSource);
	}
	}
	else
	{
	number = -1;
	genNum = -1;
	}

	skipSpaces();
	COSBase pb = parseDirObject();
	String endObjectKey = readString();

	if(endObjectKey.equals("stream"))
	{
	pdfSource.unread(endObjectKey.getBytes());
	pdfSource.unread(' ');
	if(pb instanceof COSDictionary)
	{
	pb = parseCOSStream((COSDictionary) pb, getDocument().getScratchFile());

	}
	else
	{
	// this is not legal
	// the combination of a dict and the stream/endstream forms a complete stream object
	throw new IOException("stream not preceded by dictionary");
	}
	endObjectKey = readString();
	}

	COSObjectKey key = new COSObjectKey(number, genNum);
	COSObject pdfObject = document.getObjectFromPool(key);
	pb.setNeedToBeUpdate(true);
	pdfObject.setObject(pb);

	if(!endObjectKey.equals("endobj"))
	{
	if(endObjectKey.startsWith("endobj"))
	{
	/*
	* Some PDF files don't contain a new line after endobj so we
	* need to make sure that the next object number is getting read separately
	* and not part of the endobj keyword. Ex. Some files would have "endobj28"
	* instead of "endobj"
	*/
	pdfSource.unread(endObjectKey.substring(6).getBytes());
	}
	else if(!pdfSource.isEOF())
	{
	try
	{
	//It is possible that the endobj is missing, there
	//are several PDFs out there that do that so skip it and move on.
	Float.parseFloat(endObjectKey);
	pdfSource.unread(COSWriter.SPACE);
	pdfSource.unread(endObjectKey.getBytes());
	}
	catch(NumberFormatException e)
	{
	//we will try again incase there was some garbage which
	//some writers will leave behind.
	String secondEndObjectKey = readString();
	if(!secondEndObjectKey.equals("endobj"))
	{
	if(isClosing())
	{
	//found a case with 17506.pdf object 41 that was like this
	//41 0 obj [/Pattern /DeviceGray] ] endobj
	//notice the second array close, here we are reading it
	//and ignoring and attempting to continue
	pdfSource.read();
	}
	skipSpaces();
	String thirdPossibleEndObj = readString();
	if(!thirdPossibleEndObj.equals("endobj"))
	{
	throw new IOException("expected='endobj' firstReadAttempt='" + endObjectKey + "' "
	+ "secondReadAttempt='" + secondEndObjectKey + "' " + pdfSource);
	}
	}
	}
	}
	}
	skipSpaces();
	}
	return isEndOfFile;
	}

	/**
	* Returns the underlying COSDocument.
	*
	* @return the COSDocument
	*
	* @throws IOException If something went wrong
	*/
	public COSDocument getDocument() throws IOException
	{
	if(document == null)
	{
	throw new IOException("You must call parse() before calling getDocument()");
	}
	return document;
	}
	}