| /***************************************************************************** |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| * |
| ****************************************************************************/ |
| |
| package org.apache.pdfbox.preflight.parser; |
| |
| import java.io.File; |
| import java.io.IOException; |
| import java.nio.charset.Charset; |
| import java.nio.charset.StandardCharsets; |
| import java.util.Arrays; |
| import java.util.Set; |
| import java.util.regex.Matcher; |
| import java.util.regex.Pattern; |
| import org.apache.pdfbox.cos.COSArray; |
| import org.apache.pdfbox.cos.COSBase; |
| import org.apache.pdfbox.cos.COSDictionary; |
| import org.apache.pdfbox.cos.COSFloat; |
| import org.apache.pdfbox.cos.COSName; |
| import org.apache.pdfbox.cos.COSNull; |
| import org.apache.pdfbox.cos.COSNumber; |
| import org.apache.pdfbox.cos.COSObject; |
| import org.apache.pdfbox.cos.COSObjectKey; |
| import org.apache.pdfbox.cos.COSStream; |
| import org.apache.pdfbox.cos.COSString; |
| import org.apache.pdfbox.io.RandomAccessReadBufferedFile; |
| import org.apache.pdfbox.pdfparser.PDFParser; |
| import org.apache.pdfbox.pdfparser.XrefTrailerResolver.XRefType; |
| import org.apache.pdfbox.pdmodel.PDDocument; |
| import org.apache.pdfbox.preflight.Format; |
| import org.apache.pdfbox.preflight.PreflightConfiguration; |
| import org.apache.pdfbox.preflight.PreflightConstants; |
| import org.apache.pdfbox.preflight.PreflightContext; |
| import org.apache.pdfbox.preflight.PreflightDocument; |
| import org.apache.pdfbox.preflight.ValidationResult; |
| import org.apache.pdfbox.preflight.ValidationResult.ValidationError; |
| import org.apache.pdfbox.preflight.exception.SyntaxValidationException; |
| |
| |
| import static org.apache.pdfbox.preflight.PreflightConstants.ERROR_SYNTAX_ARRAY_TOO_LONG; |
| import static org.apache.pdfbox.preflight.PreflightConstants.ERROR_SYNTAX_CROSS_REF; |
| import static org.apache.pdfbox.preflight.PreflightConstants.ERROR_SYNTAX_HEXA_STRING_EVEN_NUMBER; |
| import static org.apache.pdfbox.preflight.PreflightConstants.ERROR_SYNTAX_HEXA_STRING_INVALID; |
| import static org.apache.pdfbox.preflight.PreflightConstants.ERROR_SYNTAX_HEXA_STRING_TOO_LONG; |
| import static org.apache.pdfbox.preflight.PreflightConstants.ERROR_SYNTAX_INVALID_OFFSET; |
| import static org.apache.pdfbox.preflight.PreflightConstants.ERROR_SYNTAX_MISSING_OFFSET; |
| import static org.apache.pdfbox.preflight.PreflightConstants.ERROR_SYNTAX_NAME_TOO_LONG; |
| import static org.apache.pdfbox.preflight.PreflightConstants.ERROR_SYNTAX_NUMERIC_RANGE; |
| import static org.apache.pdfbox.preflight.PreflightConstants.ERROR_SYNTAX_OBJ_DELIMITER; |
| import static org.apache.pdfbox.preflight.PreflightConstants.ERROR_SYNTAX_STREAM_DELIMITER; |
| import static org.apache.pdfbox.preflight.PreflightConstants.ERROR_SYNTAX_STREAM_LENGTH_INVALID; |
| import static org.apache.pdfbox.preflight.PreflightConstants.ERROR_SYNTAX_TOO_MANY_ENTRIES; |
| import static org.apache.pdfbox.preflight.PreflightConstants.ERROR_SYNTAX_TRAILER_EOF; |
| import static org.apache.pdfbox.preflight.PreflightConstants.MAX_ARRAY_ELEMENTS; |
| import static org.apache.pdfbox.preflight.PreflightConstants.MAX_DICT_ENTRIES; |
| import static org.apache.pdfbox.preflight.PreflightConstants.MAX_NAME_SIZE; |
| import static org.apache.pdfbox.preflight.PreflightConstants.MAX_NEGATIVE_FLOAT; |
| import static org.apache.pdfbox.preflight.PreflightConstants.MAX_POSITIVE_FLOAT; |
| import static org.apache.pdfbox.preflight.PreflightConstants.MAX_STRING_LENGTH; |
| |
| public class PreflightParser extends PDFParser |
| { |
| /** |
| * Define a one byte encoding that hasn't specific encoding in UTF-8 charset. Avoid unexpected |
| * error when the encoding is Cp5816 |
| */ |
| private static final Charset ENCODING = StandardCharsets.ISO_8859_1; |
| |
| private Format format = null; |
| |
| private PreflightConfiguration config = null; |
| |
| private PreflightDocument preflightDocument; |
| |
| private ValidationResult validationResult; |
| |
| /** |
| * Constructor. |
| * |
| * @param file |
| * @throws IOException if there is a reading error. |
| */ |
| public PreflightParser(File file) throws IOException |
| { |
| super(new RandomAccessReadBufferedFile(file)); |
| } |
| |
| /** |
| * Constructor. |
| * |
| * @param filename |
| * @throws IOException if there is a reading error. |
| */ |
| public PreflightParser(String filename) throws IOException |
| { |
| this(new File(filename)); |
| } |
| |
| /** |
| * Add a validation error to the ValidationResult. |
| * |
| * @param error the validation error to be added |
| */ |
| private void addValidationError(ValidationError error) |
| { |
| validationResult.addError(error); |
| } |
| |
| @Override |
| public PDDocument parse() throws IOException |
| { |
| return parse(Format.PDF_A1B); |
| } |
| |
| /** |
| * Parse the given file and check if it is a confirming file according to the given format. |
| * |
| * @param format |
| * format that the document should follow (default {@link Format#PDF_A1B}) |
| * @throws IOException |
| */ |
| public PDDocument parse(Format format) throws IOException |
| { |
| return parse(format, null); |
| } |
| |
| /** |
| * Parse the given file and check if it is a confirming file according to the given format. |
| * |
| * @param format |
| * format that the document should follow (default {@link Format#PDF_A1B}) |
| * @param config |
| * Configuration bean that will be used by the PreflightDocument. If null the format is used to determine |
| * the default configuration. |
| * @throws IOException |
| */ |
| public PDDocument parse(Format format, PreflightConfiguration config) throws IOException |
| { |
| this.format = format == null ? Format.PDF_A1B : format; |
| this.config = config; |
| validationResult = new ValidationResult(true); |
| PDDocument pdDocument = null; |
| checkPdfHeader(); |
| try |
| { |
| // disable leniency |
| pdDocument = super.parse(false); |
| } |
| catch (IOException e) |
| { |
| addValidationError(new ValidationError(PreflightConstants.ERROR_SYNTAX_COMMON, e.getMessage())); |
| throw new SyntaxValidationException(e, validationResult); |
| } |
| // create context |
| PreflightContext context = new PreflightContext(); |
| context.setDocument(preflightDocument); |
| preflightDocument.setContext(context); |
| context.setXrefTrailerResolver(xrefTrailerResolver); |
| context.setFileLen(fileLen); |
| preflightDocument.addValidationErrors(validationResult.getErrorsList()); |
| return pdDocument; |
| } |
| |
| @Override |
| protected PDDocument createDocument() throws IOException |
| { |
| preflightDocument = new PreflightDocument(document, format, config); |
| return preflightDocument; |
| } |
| |
| // -------------------------------------------------------- |
| // - Below All methods that adds controls on the PDF syntax |
| // -------------------------------------------------------- |
| |
| @Override |
| protected void initialParse() throws IOException |
| { |
| super.initialParse(); |
| // Dereference each object to trigger parser validation errors |
| Set<COSObjectKey> objectKeys = document.getXrefTable().keySet(); |
| objectKeys.forEach(key -> document.getObjectFromPool(key).getObject()); |
| } |
| |
| @Override |
| protected boolean resetTrailerResolver() |
| { |
| // Don't reset the xref trailer resolver after parsing as it is needed for validation |
| return false; |
| } |
| |
| /** |
| * Check that the PDF header match rules of the PDF/A specification. First line (offset 0) must |
| * be a comment with the PDF version (version 1.0 isn't conform to the PDF/A specification) |
| * Second line is a comment with at least 4 bytes greater than 0x80 |
| */ |
| private void checkPdfHeader() |
| { |
| try |
| { |
| source.seek(0); |
| String firstLine = readLine(); |
| if (firstLine == null || !firstLine.matches("%PDF-1\\.[1-9]")) |
| { |
| addValidationError(new ValidationError(PreflightConstants.ERROR_SYNTAX_HEADER, |
| "First line must match %PDF-1.\\d")); |
| } |
| String secondLine = readLine(); |
| if (secondLine != null) |
| { |
| boolean secondLineFails = false; |
| byte[] secondLineAsBytes = secondLine.getBytes(ENCODING); |
| if (secondLineAsBytes.length == 0 || secondLineAsBytes[0] != '%') |
| { |
| secondLineFails = true; |
| } |
| else if (secondLine.length() >= 5) |
| { |
| for (int i = 1; i < 5; ++i) |
| { |
| secondLineFails |= (secondLineAsBytes[i] & 0xFF) < 0x80; |
| } |
| } |
| if (secondLineFails) |
| { |
| addValidationError(new ValidationError(PreflightConstants.ERROR_SYNTAX_HEADER, |
| "Second line must begin with '%' followed by at least 4 bytes greater than 127")); |
| } |
| } |
| source.seek(0); |
| } |
| catch (IOException e) |
| { |
| addValidationError(new ValidationError(PreflightConstants.ERROR_SYNTAX_HEADER, |
| "Unable to read the PDF file : " + e.getMessage(), e)); |
| } |
| } |
| |
| /** |
| * Same method than the {@linkplain PDFParser#parseXrefTable(long)} with additional controls : - |
| * EOL mandatory after the 'xref' keyword - Cross reference subsection header uses single white |
| * space as separator - and so on |
| * |
| * @param startByteOffset the offset to start at |
| * @return false on parsing error |
| * @throws IOException If an IO error occurs. |
| */ |
| @Override |
| protected boolean parseXrefTable(long startByteOffset) throws IOException |
| { |
| if (source.peek() != 'x') |
| { |
| return false; |
| } |
| String xref = readString(); |
| if (!xref.equals("xref")) |
| { |
| addValidationError(new ValidationError(PreflightConstants.ERROR_SYNTAX_CROSS_REF, |
| "xref must be followed by a EOL character")); |
| return false; |
| } |
| if (!nextIsEOL()) |
| { |
| addValidationError(new ValidationError(PreflightConstants.ERROR_SYNTAX_CROSS_REF, |
| "xref must be followed by EOL")); |
| } |
| |
| // signal start of new XRef |
| xrefTrailerResolver.nextXrefObj(startByteOffset,XRefType.TABLE); |
| |
| // Xref tables can have multiple sections. Each starts with a starting object id and a count. |
| while (true) |
| { |
| // just after the xref<EOL> there are an integer |
| // first obj id |
| long currObjID; |
| // the number of objects in the xref table |
| int count; |
| |
| long offset = source.getPosition(); |
| String line = readLine(); |
| Pattern pattern = Pattern.compile("(\\d+)\\s(\\d+)(\\s*)"); |
| Matcher matcher = pattern.matcher(line); |
| if (matcher.matches()) |
| { |
| currObjID = Long.parseLong(matcher.group(1)); |
| count = Integer.parseInt(matcher.group(2)); |
| } |
| else |
| { |
| addValidationError(new ValidationError(ERROR_SYNTAX_CROSS_REF, |
| "Cross reference subsection header is invalid: '" + line + "' at position " |
| + source.getPosition())); |
| // reset source cursor to read xref information |
| source.seek(offset); |
| // first obj id |
| currObjID = readObjectNumber(); |
| // the number of objects in the xref table |
| count = readInt(); |
| } |
| |
| skipSpaces(); |
| for (int i = 0; i < count; i++) |
| { |
| if (source.isEOF() || isEndOfName((char) source.peek())) |
| { |
| break; |
| } |
| if (source.peek() == 't') |
| { |
| addValidationError(new ValidationError(PreflightConstants.ERROR_SYNTAX_CROSS_REF, |
| "Expected xref line but 't' found")); |
| break; |
| } |
| // Ignore table contents |
| String currentLine = readLine(); |
| String[] splitString = currentLine.split(" "); |
| if (splitString.length < 3) |
| { |
| addValidationError(new ValidationError(PreflightConstants.ERROR_SYNTAX_CROSS_REF, |
| "invalid xref line: " + currentLine)); |
| break; |
| } |
| // This supports the corrupt table as reported in PDFBOX-474 (XXXX XXX XX n) |
| if (splitString[splitString.length - 1].equals("n")) |
| { |
| try |
| { |
| long currOffset = Long.parseLong(splitString[0]); |
| int currGenID = Integer.parseInt(splitString[1]); |
| COSObjectKey objKey = new COSObjectKey(currObjID, currGenID); |
| xrefTrailerResolver.setXRef(objKey, currOffset); |
| } |
| catch (NumberFormatException e) |
| { |
| addValidationError(new ValidationError(PreflightConstants.ERROR_SYNTAX_CROSS_REF, |
| "offset or genid can't be read as number " + e.getMessage(), e)); |
| } |
| } |
| else if (!splitString[2].equals("f")) |
| { |
| addValidationError(new ValidationError(PreflightConstants.ERROR_SYNTAX_CROSS_REF, |
| "Corrupt XRefTable Entry - ObjID:" + currObjID)); |
| } |
| currObjID++; |
| skipSpaces(); |
| } |
| skipSpaces(); |
| if (!isDigit()) |
| { |
| break; |
| } |
| } |
| return true; |
| } |
| |
| /** |
| * Wraps the {@link PDFParser#parseCOSStream} to check rules on 'stream' and 'endstream' |
| * keywords. {@link #checkStreamKeyWord()} and {@link #checkEndstreamKeyWord(org.apache.pdfbox.cos.COSDictionary, long)} |
| * |
| * @param dic dictionary that goes with this stream. |
| * |
| * @return parsed pdf stream. |
| * |
| * @throws IOException if an error occurred reading the stream, like problems with reading |
| * length attribute, stream does not end with 'endstream' after data read, stream too short etc. |
| */ |
| @Override |
| protected COSStream parseCOSStream(COSDictionary dic) throws IOException |
| { |
| long startOffset = checkStreamKeyWord(); |
| COSStream result = super.parseCOSStream(dic); |
| checkEndstreamKeyWord(dic, startOffset); |
| return result; |
| } |
| |
| /** |
| * 'stream' must be followed by <CR><LF> or only <LF> |
| * |
| * @throws IOException |
| */ |
| private long checkStreamKeyWord() throws IOException |
| { |
| String streamV = readString(); |
| if (!streamV.equals("stream")) |
| { |
| addValidationError(new ValidationError(ERROR_SYNTAX_STREAM_DELIMITER, |
| "Expected 'stream' keyword but found '" + streamV + "' at offset "+source.getPosition())); |
| } |
| long startOffset = source.getPosition(); |
| int nextChar = source.read(); |
| if (nextChar == ASCII_CR && source.peek() == ASCII_LF) |
| { |
| startOffset += 2; |
| } |
| else if (nextChar == ASCII_LF) |
| { |
| startOffset++; |
| } |
| else |
| { |
| addValidationError(new ValidationError(ERROR_SYNTAX_STREAM_DELIMITER, |
| "Expected 'EOL' after the stream keyword at offset "+source.getPosition())); |
| } |
| // set the offset before stream |
| source.seek(source.getPosition() - 7); |
| return startOffset; |
| } |
| |
| /** |
| * 'endstream' must be preceded by an EOL |
| * |
| * @throws IOException |
| */ |
| private void checkEndstreamKeyWord(COSDictionary dic, long startOffset) |
| throws IOException |
| { |
| source.seek(source.getPosition() - 10); |
| long endOffset = source.getPosition(); |
| int nextChar = source.read(); |
| boolean eolFound = false; |
| boolean crlfFound = false; |
| // LF found |
| if (nextChar == ASCII_LF) |
| { |
| eolFound = true; |
| // check if the LF is part of a CRLF |
| source.rewind(2); |
| if (source.read() == ASCII_CR) |
| { |
| endOffset--; |
| crlfFound = true; |
| } |
| source.read(); |
| } |
| boolean addStreamLengthErrorMessage = false; |
| long actualLength = endOffset - startOffset; |
| if (!eolFound) |
| { |
| addValidationError(new ValidationError(ERROR_SYNTAX_STREAM_DELIMITER, |
| "Expected 'EOL' before the endstream keyword at offset "+source.getPosition()+" but found '"+source.peek()+"'")); |
| addStreamLengthErrorMessage = true; |
| } |
| String endstreamV = readString(); |
| if (!endstreamV.equals("endstream")) |
| { |
| addValidationError(new ValidationError(ERROR_SYNTAX_STREAM_DELIMITER, |
| "Expected 'endstream' keyword at offset "+source.getPosition()+" but found '" + endstreamV + "'")); |
| addStreamLengthErrorMessage = true; |
| } |
| |
| int length = dic.getInt(COSName.LENGTH); |
| if (addStreamLengthErrorMessage || // |
| (length > -1 && ((!crlfFound && length - actualLength != 0) |
| || (crlfFound && length - actualLength > 1)))) |
| { |
| addValidationError(new ValidationError(ERROR_SYNTAX_STREAM_LENGTH_INVALID, |
| "Stream length is invalid [dic=" + dic + "; defined length=" + length |
| + "; actual length=" + actualLength + ", starting offset=" |
| + startOffset)); |
| } |
| } |
| |
| private boolean nextIsEOL() throws IOException |
| { |
| boolean succeed = false; |
| int nextChar = source.read(); |
| if (ASCII_CR == nextChar && ASCII_LF == source.peek()) |
| { |
| source.read(); |
| succeed = true; |
| } |
| else if (ASCII_CR == nextChar || ASCII_LF == nextChar) |
| { |
| succeed = true; |
| } |
| return succeed; |
| } |
| |
| @Override |
| /** |
| * Call {@link BaseParser#parseCOSArray()} and check the number of element in the array |
| */ |
| protected COSArray parseCOSArray() throws IOException |
| { |
| COSArray result = super.parseCOSArray(); |
| if (result != null && result.size() > MAX_ARRAY_ELEMENTS) |
| { |
| addValidationError(new ValidationError(ERROR_SYNTAX_ARRAY_TOO_LONG, "Array too long : " + result.size())); |
| } |
| return result; |
| } |
| |
| @Override |
| /** |
| * Call {@link BaseParser#parseCOSName()} and check the length of the name |
| */ |
| protected COSName parseCOSName() throws IOException |
| { |
| COSName result = super.parseCOSName(); |
| if (result != null && result.getName().getBytes().length > MAX_NAME_SIZE) |
| { |
| addValidationError(new ValidationError(ERROR_SYNTAX_NAME_TOO_LONG, "Name too long: " + result.getName())); |
| } |
| return result; |
| } |
| |
| /** |
| * Check that the hexa string contains only an even number of |
| * Hexadecimal characters. Once it is done, reset the offset at the beginning of the string and |
| * call {@link PDFParser#parseCOSString()} |
| * |
| * @return The parsed PDF string. |
| * |
| * @throws IOException If there is an error reading from the stream. |
| */ |
| @Override |
| protected COSString parseCOSString() throws IOException |
| { |
| // offset reminder |
| long offset = source.getPosition(); |
| char nextChar = (char) source.read(); |
| int count = 0; |
| if (nextChar == '<') |
| { |
| do |
| { |
| nextChar = (char) source.read(); |
| if (!isWhitespace(nextChar) && nextChar != '>') |
| { |
| if (Character.digit(nextChar, 16) >= 0) |
| { |
| count++; |
| } |
| else |
| { |
| addValidationError(new ValidationError(ERROR_SYNTAX_HEXA_STRING_INVALID, |
| "Hexa String must have only Hexadecimal Characters (found '" + nextChar + "') at offset " + source.getPosition())); |
| break; |
| } |
| } |
| } |
| while (nextChar != '>'); |
| } |
| |
| if (count % 2 != 0) |
| { |
| addValidationError(new ValidationError(ERROR_SYNTAX_HEXA_STRING_EVEN_NUMBER, |
| "Hexa string shall contain even number of non white space char at offset " + source.getPosition())); |
| } |
| |
| // reset the offset to parse the COSString |
| source.seek(offset); |
| COSString result = super.parseCOSString(); |
| |
| if (result.getString().length() > MAX_STRING_LENGTH) |
| { |
| addValidationError(new ValidationError(ERROR_SYNTAX_HEXA_STRING_TOO_LONG, "Hexa string is too long at offset "+source.getPosition())); |
| } |
| return result; |
| } |
| |
| /** |
| * Call {@link PDFParser#parseDirObject()} check limit range for Float, Integer and number of |
| * Dictionary entries. |
| * |
| * @return The parsed object. |
| * @throws java.io.IOException if there is an error during parsing. |
| */ |
| @Override |
| protected COSBase parseDirObject() throws IOException |
| { |
| COSBase result = super.parseDirObject(); |
| |
| if (result instanceof COSNumber) |
| { |
| COSNumber number = (COSNumber) result; |
| if (number instanceof COSFloat) |
| { |
| float real = number.floatValue(); |
| if (real > MAX_POSITIVE_FLOAT || real < MAX_NEGATIVE_FLOAT) |
| { |
| addValidationError(new ValidationError(ERROR_SYNTAX_NUMERIC_RANGE, |
| "Float is too long or too small: " + real+" at offset "+source.getPosition())); |
| } |
| } |
| else |
| { |
| long numAsLong = number.longValue(); |
| if (numAsLong > Integer.MAX_VALUE || numAsLong < Integer.MIN_VALUE) |
| { |
| addValidationError(new ValidationError(ERROR_SYNTAX_NUMERIC_RANGE, |
| "Numeric is too long or too small: " + numAsLong+" at offset "+source.getPosition())); |
| } |
| } |
| } |
| |
| if (result instanceof COSDictionary) |
| { |
| COSDictionary dic = (COSDictionary) result; |
| if (dic.size() > MAX_DICT_ENTRIES) |
| { |
| addValidationError(new ValidationError(ERROR_SYNTAX_TOO_MANY_ENTRIES, "Too Many Entries In Dictionary at offset "+source.getPosition())); |
| } |
| } |
| return result; |
| } |
| |
| @Override |
| protected synchronized COSBase parseObjectDynamically(long objNr, int objGenNr, |
| boolean requireExistingNotCompressedObj) |
| throws IOException |
| { |
| // ---- create object key and get object (container) from pool |
| final COSObjectKey objKey = new COSObjectKey(objNr, objGenNr); |
| final COSObject pdfObject = document.getObjectFromPool(objKey); |
| COSBase referencedObject = !pdfObject.isObjectNull() ? pdfObject.getObject() : null; |
| |
| // not previously parsed |
| if (referencedObject == null) |
| { |
| // read offset or object stream object number from xref table |
| Long offsetOrObjstmObNr = document.getXrefTable().get(objKey); |
| |
| // sanity test to circumvent loops with broken documents |
| if (requireExistingNotCompressedObj && ((offsetOrObjstmObNr == null))) |
| { |
| addValidationError(new ValidationError(ERROR_SYNTAX_MISSING_OFFSET, |
| "Object must be defined and must not be compressed object: " + objKey.getNumber() + ":" |
| + objKey.getGeneration())); |
| throw new SyntaxValidationException("Object must be defined and must not be compressed object: " |
| + objKey.getNumber() + ":" + objKey.getGeneration(), |
| validationResult); |
| } |
| |
| if (offsetOrObjstmObNr == null) |
| { |
| // not defined object -> NULL object (Spec. 1.7, chap. 3.2.9) |
| // remove parser to avoid endless recursion |
| pdfObject.setToNull(); |
| |
| } |
| else if (offsetOrObjstmObNr == 0) |
| { |
| addValidationError(new ValidationError(ERROR_SYNTAX_INVALID_OFFSET, "Object {" + objKey.getNumber() |
| + ":" + objKey.getGeneration() + "} has an offset of 0")); |
| } |
| else if (offsetOrObjstmObNr > 0) |
| { |
| // offset of indirect object in file |
| referencedObject = parseFileObject(offsetOrObjstmObNr, objKey); |
| } |
| else |
| { |
| // xref value is object nr of object stream containing object to be parsed |
| // since our object was not found it means object stream was not parsed so far |
| referencedObject = parseObjectStreamObject((int) -offsetOrObjstmObNr, objKey); |
| } |
| if (referencedObject == null || referencedObject instanceof COSNull) |
| { |
| pdfObject.setToNull(); |
| } |
| } |
| return referencedObject; |
| } |
| |
| private COSBase parseFileObject(Long offsetOrObjstmObNr, final COSObjectKey objKey) |
| throws IOException |
| { |
| // offset of indirect object in file |
| // ---- go to object start |
| source.seek(offsetOrObjstmObNr); |
| // ---- we must have an indirect object |
| long readObjNr; |
| int readObjGen; |
| |
| long offset = source.getPosition(); |
| String line = readLine(); |
| Pattern pattern = Pattern.compile("(\\d+)\\s(\\d+)\\sobj"); |
| Matcher matcher = pattern.matcher(line); |
| if (matcher.matches()) |
| { |
| readObjNr = Long.parseLong(matcher.group(1)); |
| readObjGen = Integer.parseInt(matcher.group(2)); |
| } |
| else |
| { |
| |
| addValidationError(new ValidationError(ERROR_SYNTAX_OBJ_DELIMITER, |
| "Single space expected [offset=" + offset + "; key=" |
| + offsetOrObjstmObNr.toString() + "; line=" + line + "; object=" |
| + objKey.getNumber() + " " + objKey.getGeneration() + "]")); |
| |
| // reset source cursor to read object information |
| source.seek(offset); |
| readObjNr = readObjectNumber(); |
| readObjGen = readGenerationNumber(); |
| skipSpaces(); // skip spaces between Object Generation number and the 'obj' keyword |
| for (char c : OBJ_MARKER) |
| { |
| if (source.read() != c) |
| { |
| addValidationError(new ValidationError(ERROR_SYNTAX_OBJ_DELIMITER, |
| "Expected pattern '" + new String(OBJ_MARKER) |
| + " but missed at character '" + c + "'")); |
| throw new SyntaxValidationException("Expected pattern '" |
| + new String(OBJ_MARKER) + " but missed at character '" + c + "'", |
| validationResult); |
| } |
| } |
| } |
| |
| // ---- consistency check |
| if ((readObjNr != objKey.getNumber()) || (readObjGen != objKey.getGeneration())) |
| { |
| throw new IOException("XREF for " + objKey.getNumber() + ":" + objKey.getGeneration() |
| + " points to wrong object: " + readObjNr + ":" + readObjGen); |
| } |
| |
| skipSpaces(); |
| COSBase referencedObject = parseDirObject(); |
| skipSpaces(); |
| long endObjectOffset = source.getPosition(); |
| String endObjectKey = readString(); |
| |
| if (endObjectKey.equals("stream")) |
| { |
| source.seek(endObjectOffset); |
| if (referencedObject instanceof COSDictionary) |
| { |
| COSStream stream = parseCOSStream((COSDictionary) referencedObject); |
| if (securityHandler != null) |
| { |
| securityHandler.decryptStream(stream, readObjNr, readObjGen); |
| } |
| referencedObject = stream; |
| } |
| else |
| { |
| // this is not legal |
| // the combination of a dict and the stream/endstream forms a complete stream object |
| throw new IOException( |
| "Stream not preceded by dictionary (offset: " + offsetOrObjstmObNr + ")."); |
| } |
| skipSpaces(); |
| endObjectOffset = source.getPosition(); |
| endObjectKey = readString(); |
| |
| // we have case with a second 'endstream' before endobj |
| if (!endObjectKey.startsWith("endobj") && endObjectKey.startsWith("endstream")) |
| { |
| endObjectKey = endObjectKey.substring(9).trim(); |
| if (endObjectKey.length() == 0) |
| { |
| // no other characters in extra endstream line |
| endObjectKey = readString(); // read next line |
| } |
| } |
| } |
| else if (securityHandler != null) |
| { |
| securityHandler.decrypt(referencedObject, readObjNr, readObjGen); |
| } |
| if (!endObjectKey.startsWith("endobj")) |
| { |
| throw new IOException("Object (" + readObjNr + ":" + readObjGen + ") at offset " |
| + offsetOrObjstmObNr + " does not end with 'endobj'."); |
| } |
| else |
| { |
| offset = source.getPosition(); |
| source.seek(endObjectOffset - 1); |
| if (!nextIsEOL()) |
| { |
| addValidationError( |
| new ValidationError(PreflightConstants.ERROR_SYNTAX_OBJ_DELIMITER, |
| "EOL expected before the 'endobj' keyword at offset " |
| + source.getPosition())); |
| } |
| source.seek(offset); |
| } |
| |
| if (!nextIsEOL()) |
| { |
| addValidationError(new ValidationError(PreflightConstants.ERROR_SYNTAX_OBJ_DELIMITER, |
| "EOL expected after the 'endobj' keyword at offset " + source.getPosition())); |
| } |
| return referencedObject; |
| } |
| |
| @Override |
| protected int lastIndexOf(final char[] pattern, final byte[] buf, final int endOff) |
| { |
| int offset = super.lastIndexOf(pattern, buf, endOff); |
| if (offset > 0 && Arrays.equals(pattern, EOF_MARKER)) |
| { |
| // this is the offset of the last %%EOF sequence. |
| // nothing should be present after this sequence. |
| int tmpOffset = offset + pattern.length; |
| int offsetDiff = buf.length - tmpOffset; |
| // EOL is authorized |
| if (offsetDiff > 2 |
| || (offsetDiff == 2 |
| && (buf[tmpOffset] != ASCII_CR || buf[tmpOffset + 1] != ASCII_LF)) |
| || (offsetDiff == 1 |
| && (buf[tmpOffset] != ASCII_CR && buf[tmpOffset] != ASCII_LF))) |
| { |
| long position; |
| try |
| { |
| position = source.getPosition(); |
| } |
| catch (IOException ex) |
| { |
| position = Long.MIN_VALUE; |
| } |
| addValidationError(new ValidationError(ERROR_SYNTAX_TRAILER_EOF, |
| "File contains data after the last %%EOF sequence at offset " + position)); |
| } |
| } |
| return offset; |
| } |
| |
| /** |
| * Load and validate the given file. Returns the validation result and closes the read pdf. |
| * |
| * @param file thew file to be read and validated |
| * @return the validation result |
| * @throws IOException in case of a file reading or parsing error |
| */ |
| public static ValidationResult validate(File file) throws IOException |
| { |
| ValidationResult result; |
| PreflightParser parser = new PreflightParser(file); |
| try (PreflightDocument document = (PreflightDocument) parser.parse()) |
| { |
| result = document.validate(); |
| } |
| catch (SyntaxValidationException e) |
| { |
| result = e.getResult(); |
| } |
| return result; |
| } |
| } |