blob: 5c002eff35a522e86627d449a6c0b550238d5078 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pdfbox.pdfparser;
import java.io.IOException;
import java.io.InputStream;
import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.cos.COSObject;
import org.apache.pdfbox.pdfwriter.COSWriter;
import org.apache.pdfbox.persistence.util.COSObjectKey;
/**
 * Parser that reads a sequence of PDF objects from an input stream into a
 * {@link COSDocument}.
 *
 * Every object that is parsed is flagged with {@code setNeedToBeUpdate(true)}
 * before it is stored in the document's object pool. Cross reference tables and
 * trailers are not interpreted: parsing stops as soon as one of them is reached.
 * Objects that cannot be parsed are logged and skipped.
 */
public class VisualSignatureParser extends BaseParser
{
    /**
     * Log instance.
     */
    private static final Log LOG = LogFactory.getLog(VisualSignatureParser.class);

    /**
     * Matches text that starts with an indirect object header such as "12 0 obj".
     * Compiled once because the expression never changes.
     */
    private static final Pattern OBJECT_HEADER = Pattern.compile("\\d+\\s+\\d+\\s+obj.*", Pattern.DOTALL);

    /**
     * Charset used to turn already-read tokens back into bytes when they are pushed
     * back onto the source. ISO-8859-1 maps every char up to 0xFF to exactly one byte,
     * independent of the platform default charset.
     */
    private static final String TOKEN_CHARSET = "ISO-8859-1";

    /**
     * Number of bytes inspected at a time while searching for the next keyword.
     */
    private static final int SCAN_WINDOW = 16;

    /**
     * Constructor.
     *
     * @param input the inputstream to be read.
     *
     * @throws IOException If something went wrong
     */
    public VisualSignatureParser(InputStream input) throws IOException
    {
        super(input);
    }

    /**
     * Parses the input into a fresh {@link COSDocument}, which is afterwards
     * available through {@link #getDocument()}.
     *
     * Parsing ends at the end of the input or as soon as {@code parseObject()}
     * reports the end of the object section. An object that fails to parse is
     * logged as a warning and skipped.
     *
     * @throws IOException If reading fails before the end of the object section
     * was reached.
     */
    public void parse() throws IOException
    {
        document = new COSDocument();
        skipToNextObj();

        boolean wasLastParsedObjectEOF = false;
        try
        {
            while (!wasLastParsedObjectEOF && !pdfSource.isEOF())
            {
                try
                {
                    wasLastParsedObjectEOF = parseObject();
                }
                catch (IOException e)
                {
                    // Best effort: report the broken object and resume at the next keyword.
                    LOG.warn("Parsing Error, Skipping Object", e);
                    skipToNextObj();
                }
                skipSpaces();
            }
        }
        catch (IOException e)
        {
            /*
             * PDF files may have random data after the EOF marker. Ignore errors if
             * last object processed is EOF.
             */
            if (!wasLastParsedObjectEOF)
            {
                throw e;
            }
        }
    }

    /**
     * Advances the source until it is positioned at the start of "trailer", "xref",
     * "startxref", "stream" or an object header ("n g obj"), or until the input
     * is exhausted. If the source already starts with one of these, nothing is consumed.
     *
     * The source is inspected through a small window that is moved forward one byte
     * at a time. This is not the most efficient design, but the method is only
     * needed at start-up and for error recovery.
     *
     * @throws IOException If reading from or pushing back to the source fails.
     */
    private void skipToNextObj() throws IOException
    {
        byte[] window = new byte[SCAN_WINDOW];
        while (!pdfSource.isEOF())
        {
            int bytesRead = pdfSource.read(window);
            if (bytesRead < 1)
            {
                break;
            }
            // Only the bytes delivered by this read are valid; the rest of the window
            // may still hold data from a previous iteration.
            String text = new String(window, 0, bytesRead, "US-ASCII");
            if (text.startsWith("trailer")
                    || text.startsWith("xref")
                    || text.startsWith("startxref")
                    || text.startsWith("stream")
                    || OBJECT_HEADER.matcher(text).matches())
            {
                // Found a keyword: restore exactly what was read and stop.
                pdfSource.unread(window, 0, bytesRead);
                break;
            }
            // No keyword here: drop the first byte and look again one byte further on.
            pdfSource.unread(window, 1, bytesRead - 1);
        }
    }

    /**
     * Parses the next top-level item of the source.
     *
     * @return true if an xref section, a trailer or a startxref section was
     * reached, false if an object was parsed or the input is exhausted.
     *
     * @throws IOException If the item cannot be parsed.
     */
    private boolean parseObject() throws IOException
    {
        skipSpaces();
        // peek at the next character to determine the type of item we are parsing
        char peekedChar = (char) pdfSource.peek();
        // there are times when there are multiple endobj/endstream keywords,
        // so just read them and move on
        while (peekedChar == 'e')
        {
            readString();
            skipSpaces();
            peekedChar = (char) pdfSource.peek();
        }

        if (pdfSource.isEOF())
        {
            // nothing left to parse
            return false;
        }
        if (peekedChar == 'x' || peekedChar == 't')
        {
            // xref table or trailer; their contents are not read by this parser
            return true;
        }
        if (peekedChar == 's')
        {
            // Note: startxref can occur in either a trailer section or by itself
            // NOTE(review): skipToNextObj() does not advance when the source already
            // starts with "startxref" - confirm that this call is meant to skip that section.
            skipToNextObj();
            // verify that EOF exists
            String eof = readExpectedString("%%EOF");
            if (eof.indexOf("%%EOF") == -1 && !pdfSource.isEOF())
            {
                throw new IOException("expected='%%EOF' actual='" + eof + "' next=" + readString()
                        + " next=" + readString());
            }
            return true;
        }

        parseBodyObject();
        return false;
    }

    /**
     * Parses one object ("n g obj ... endobj", optionally followed by a stream)
     * and stores it in the document's object pool, flagged as needing an update.
     *
     * An object that starts directly with '&lt;' is accepted without a header and is
     * stored under object number -1 and generation -1.
     *
     * @throws IOException If the object is malformed beyond what the recovery
     * logic tolerates.
     */
    private void parseBodyObject() throws IOException
    {
        long number = -1;
        int genNum = -1;
        boolean missingObjectNumber = false;
        try
        {
            if ((char) pdfSource.peek() == '<')
            {
                missingObjectNumber = true;
            }
            else
            {
                number = readObjectNumber();
            }
        }
        catch (IOException e)
        {
            // ok for some reason "GNU Ghostscript 5.10" puts two endobj
            // statements after an object, of course this is nonsense
            // but because we want to support as many PDFs as possible
            // we will simply try again
            LOG.debug("Reading the object number failed, trying again", e);
            number = readObjectNumber();
        }

        if (!missingObjectNumber)
        {
            skipSpaces();
            genNum = readGenerationNumber();
            String objectKey = readString(3);
            if (!objectKey.equals("obj"))
            {
                throw new IOException("expected='obj' actual='" + objectKey + "' " + pdfSource);
            }
        }

        skipSpaces();
        COSBase pb = parseDirObject();
        String endObjectKey = readString();
        if (endObjectKey.equals("stream"))
        {
            // put the keyword back (preceded by a space) so the stream parser sees it
            pdfSource.unread(endObjectKey.getBytes(TOKEN_CHARSET));
            pdfSource.unread(' ');
            if (pb instanceof COSDictionary)
            {
                pb = parseCOSStream((COSDictionary) pb, getDocument().getScratchFile());
            }
            else
            {
                // this is not legal
                // the combination of a dict and the stream/endstream forms a complete stream object
                throw new IOException("stream not preceded by dictionary");
            }
            endObjectKey = readString();
        }

        COSObjectKey key = new COSObjectKey(number, genNum);
        COSObject pdfObject = document.getObjectFromPool(key);
        pb.setNeedToBeUpdate(true);
        pdfObject.setObject(pb);

        consumeEndObjKeyword(endObjectKey);
        skipSpaces();
    }

    /**
     * Handles the token found where "endobj" is expected, tolerating several
     * kinds of malformed input.
     *
     * @param endObjectKey the token that was read after the object's content.
     *
     * @throws IOException If no "endobj" can be found within the next tokens.
     */
    private void consumeEndObjKeyword(String endObjectKey) throws IOException
    {
        if (endObjectKey.equals("endobj"))
        {
            return;
        }
        if (endObjectKey.startsWith("endobj"))
        {
            /*
             * Some PDF files don't contain a new line after endobj so we
             * need to make sure that the next object number is getting read separately
             * and not part of the endobj keyword. Ex. Some files would have "endobj28"
             * instead of "endobj"
             */
            pdfSource.unread(endObjectKey.substring(6).getBytes(TOKEN_CHARSET));
            return;
        }
        if (pdfSource.isEOF())
        {
            return;
        }

        try
        {
            // It is possible that the endobj is missing, there are several PDFs out
            // there that do that. A numeric token is then taken to be the start of the
            // next object header, so it is pushed back for the next round.
            Float.parseFloat(endObjectKey);
            pdfSource.unread(COSWriter.SPACE);
            pdfSource.unread(endObjectKey.getBytes(TOKEN_CHARSET));
        }
        catch (NumberFormatException e)
        {
            // we will try again in case there was some garbage which
            // some writers will leave behind.
            String secondEndObjectKey = readString();
            if (!secondEndObjectKey.equals("endobj"))
            {
                if (isClosing())
                {
                    // found a case with 17506.pdf object 41 that was like this
                    // 41 0 obj [/Pattern /DeviceGray] ] endobj
                    // notice the second array close, here we are reading it
                    // and ignoring and attempting to continue
                    pdfSource.read();
                }
                skipSpaces();
                String thirdPossibleEndObj = readString();
                if (!thirdPossibleEndObj.equals("endobj"))
                {
                    throw new IOException("expected='endobj' firstReadAttempt='" + endObjectKey + "' "
                            + "secondReadAttempt='" + secondEndObjectKey + "' " + pdfSource);
                }
            }
        }
    }

    /**
     * Returns the underlying COSDocument.
     *
     * @return the COSDocument
     *
     * @throws IOException If {@link #parse()} has not been called yet.
     */
    public COSDocument getDocument() throws IOException
    {
        if (document == null)
        {
            throw new IOException("You must call parse() before calling getDocument()");
        }
        return document;
    }
}