| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.pdfbox.pdfparser; |
| |
| import static org.junit.Assert.assertEquals; |
| import static org.junit.Assert.fail; |
| |
| import java.io.File; |
| import java.io.IOException; |
| import java.net.URISyntaxException; |
| |
| import org.apache.pdfbox.Loader; |
| import org.apache.pdfbox.pdmodel.PDDocument; |
| import org.apache.pdfbox.pdmodel.PDDocumentInformation; |
| import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline; |
| import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem; |
| import org.apache.pdfbox.rendering.PDFRenderer; |
| import org.apache.pdfbox.util.DateConverter; |
| import org.junit.Test; |
| |
| public class TestPDFParser |
| { |
| private static final File TARGETPDFDIR = new File("target/pdfs"); |
| |
| @Test |
| public void testPDFParserMissingCatalog() throws URISyntaxException |
| { |
| // PDFBOX-3060 |
| try |
| { |
| Loader.loadPDF(new File(TestPDFParser.class.getResource("MissingCatalog.pdf").toURI())) |
| .close(); |
| } |
| catch (Exception exception) |
| { |
| fail("Unexpected Exception"); |
| } |
| } |
| |
| /** |
| * Test whether /Info dictionary is retrieved correctly when rebuilding the trailer of a corrupt |
| * file. An incorrect algorithm would result in an outline dictionary being mistaken for an |
| * /Info. |
| * |
| * @throws IOException |
| */ |
| @Test |
| public void testPDFBox3208() throws IOException |
| { |
| try (PDDocument doc = Loader |
| .loadPDF(new File(TARGETPDFDIR, "PDFBOX-3208-L33MUTT2SVCWGCS6UIYL5TH3PNPXHIS6.pdf"))) |
| { |
| PDDocumentInformation di = doc.getDocumentInformation(); |
| assertEquals("Liquent Enterprise Services", di.getAuthor()); |
| assertEquals("Liquent services server", di.getCreator()); |
| assertEquals("Amyuni PDF Converter version 4.0.0.9", di.getProducer()); |
| assertEquals("", di.getKeywords()); |
| assertEquals("", di.getSubject()); |
| assertEquals("892B77DE781B4E71A1BEFB81A51A5ABC_20140326022424.docx", di.getTitle()); |
| assertEquals(DateConverter.toCalendar("D:20140326142505-02'00'"), di.getCreationDate()); |
| assertEquals(DateConverter.toCalendar("20140326172513Z"), di.getModificationDate()); |
| } |
| } |
| |
| /** |
| * Test whether the /Info is retrieved correctly when rebuilding the trailer of a corrupt file, |
| * despite the /Info dictionary not having a modification date. |
| * |
| * @throws IOException |
| */ |
| @Test |
| public void testPDFBox3940() throws IOException |
| { |
| try (PDDocument doc = Loader.loadPDF(new File(TARGETPDFDIR, "PDFBOX-3940-079977.pdf"))) |
| { |
| PDDocumentInformation di = doc.getDocumentInformation(); |
| assertEquals("Unknown", di.getAuthor()); |
| assertEquals("C:REGULA~1IREGSFR_EQ_EM.WP", di.getCreator()); |
| assertEquals("Acrobat PDFWriter 3.02 for Windows", di.getProducer()); |
| assertEquals("", di.getKeywords()); |
| assertEquals("", di.getSubject()); |
| assertEquals("C:REGULA~1IREGSFR_EQ_EM.PDF", di.getTitle()); |
| assertEquals(DateConverter.toCalendar("Tuesday, July 28, 1998 4:00:09 PM"), di.getCreationDate()); |
| } |
| } |
| |
| /** |
| * PDFBOX-3783: test parsing of file with trash after %%EOF. |
| */ |
| @Test |
| public void testPDFBox3783() |
| { |
| try |
| { |
| Loader.loadPDF( |
| new File(TARGETPDFDIR, "PDFBOX-3783-72GLBIGUC6LB46ELZFBARRJTLN4RBSQM.pdf")) |
| .close(); |
| } |
| catch (Exception exception) |
| { |
| fail("Unexpected IOException"); |
| } |
| |
| } |
| |
| /** |
| * PDFBOX-3785, PDFBOX-3957: |
| * Test whether truncated file with several revisions has correct page count. |
| * |
| * @throws IOException |
| */ |
| @Test |
| public void testPDFBox3785() throws IOException |
| { |
| try (PDDocument doc = Loader.loadPDF(new File(TARGETPDFDIR, "PDFBOX-3785-202097.pdf"))) |
| { |
| assertEquals(11, doc.getNumberOfPages()); |
| } |
| } |
| |
| /** |
| * PDFBOX-3947: test parsing of file with broken object stream. |
| */ |
| @Test |
| public void testPDFBox3947() |
| { |
| try |
| { |
| Loader.loadPDF(new File(TARGETPDFDIR, "PDFBOX-3947-670064.pdf")).close(); |
| } |
| catch (Exception exception) |
| { |
| fail("Unexpected Exception"); |
| } |
| } |
| |
| /** |
| * PDFBOX-3948: test parsing of file with object stream containing some unexpected newlines. |
| */ |
| @Test |
| public void testPDFBox3948() |
| { |
| try |
| { |
| Loader.loadPDF( |
| new File(TARGETPDFDIR, "PDFBOX-3948-EUWO6SQS5TM4VGOMRD3FLXZHU35V2CP2.pdf")) |
| .close(); |
| } |
| catch (Exception exception) |
| { |
| fail("Unexpected Exception"); |
| } |
| } |
| |
| /** |
| * PDFBOX-3949: test parsing of file with incomplete object stream. |
| */ |
| @Test |
| public void testPDFBox3949() |
| { |
| try |
| { |
| Loader.loadPDF( |
| new File(TARGETPDFDIR, "PDFBOX-3949-MKFYUGZWS3OPXLLVU2Z4LWCTVA5WNOGF.pdf")) |
| .close(); |
| } |
| catch (Exception exception) |
| { |
| fail("Unexpected Exception"); |
| } |
| } |
| |
| /** |
| * PDFBOX-3950: test parsing and rendering of truncated file with missing pages. |
| * |
| * @throws IOException |
| */ |
| @Test |
| public void testPDFBox3950() throws IOException |
| { |
| try (PDDocument doc = Loader |
| .loadPDF(new File(TARGETPDFDIR, "PDFBOX-3950-23EGDHXSBBYQLKYOKGZUOVYVNE675PRD.pdf"))) |
| { |
| assertEquals(4, doc.getNumberOfPages()); |
| PDFRenderer renderer = new PDFRenderer(doc); |
| for (int i = 0; i < doc.getNumberOfPages(); ++i) |
| { |
| try |
| { |
| renderer.renderImage(i); |
| } |
| catch (IOException ex) |
| { |
| if (i == 3 && ex.getMessage().equals("Missing descendant font array")) |
| { |
| continue; |
| } |
| throw ex; |
| } |
| } |
| } |
| } |
| |
| /** |
| * PDFBOX-3951: test parsing of truncated file. |
| * |
| * @throws IOException |
| */ |
| @Test |
| public void testPDFBox3951() throws IOException |
| { |
| try (PDDocument doc = Loader |
| .loadPDF(new File(TARGETPDFDIR, "PDFBOX-3951-FIHUZWDDL2VGPOE34N6YHWSIGSH5LVGZ.pdf"))) |
| { |
| assertEquals(143, doc.getNumberOfPages()); |
| } |
| } |
| |
| /** |
| * PDFBOX-3964: test parsing of broken file. |
| * |
| * @throws IOException |
| */ |
| @Test |
| public void testPDFBox3964() throws IOException |
| { |
| try (PDDocument doc = Loader |
| .loadPDF(new File(TARGETPDFDIR, "PDFBOX-3964-c687766d68ac766be3f02aaec5e0d713_2.pdf"))) |
| { |
| assertEquals(10, doc.getNumberOfPages()); |
| } |
| } |
| |
| /** |
| * Test whether /Info dictionary is retrieved correctly in brute force search for the |
| * Info/Catalog dictionaries. |
| * |
| * @throws IOException |
| */ |
| @Test |
| public void testPDFBox3977() throws IOException |
| { |
| try (PDDocument doc = Loader |
| .loadPDF(new File(TARGETPDFDIR, "PDFBOX-3977-63NGFQRI44HQNPIPEJH5W2TBM6DJZWMI.pdf"))) |
| { |
| PDDocumentInformation di = doc.getDocumentInformation(); |
| assertEquals("QuarkXPress(tm) 6.52", di.getCreator()); |
| assertEquals("Acrobat Distiller 7.0 pour Macintosh", di.getProducer()); |
| assertEquals("Fich sal Fabr corr1 (Page 6)", di.getTitle()); |
| assertEquals(DateConverter.toCalendar("D:20070608151915+02'00'"), di.getCreationDate()); |
| assertEquals(DateConverter.toCalendar("D:20080604152122+02'00'"), di.getModificationDate()); |
| } |
| } |
| |
| /** |
| * Test parsing the "genko_oc_shiryo1.pdf" file, which is susceptible to regression. |
| */ |
| @Test |
| public void testParseGenko() |
| { |
| try |
| { |
| Loader.loadPDF(new File(TARGETPDFDIR, "genko_oc_shiryo1.pdf")).close(); |
| } |
| catch (Exception exception) |
| { |
| fail("Unexpected Exception"); |
| } |
| } |
| |
| /** |
| * Test parsing the file from PDFBOX-4338, which brought an |
| * ArrayIndexOutOfBoundsException before the bug was fixed. |
| */ |
| @Test |
| public void testPDFBox4338() |
| { |
| try |
| { |
| Loader.loadPDF(new File(TARGETPDFDIR, "PDFBOX-4338.pdf")).close(); |
| } |
| catch (Exception exception) |
| { |
| fail("Unexpected Exception"); |
| } |
| } |
| |
| /** |
| * Test parsing the file from PDFBOX-4339, which brought a |
| * NullPointerException before the bug was fixed. |
| */ |
| @Test |
| public void testPDFBox4339() |
| { |
| try |
| { |
| Loader.loadPDF(new File(TARGETPDFDIR, "PDFBOX-4339.pdf")).close(); |
| } |
| catch (Exception exception) |
| { |
| fail("Unexpected Exception"); |
| } |
| } |
| |
| /** |
| * Test parsing the "WXMDXCYRWFDCMOSFQJ5OAJIAFXYRZ5OA.pdf" file, which is susceptible to |
| * regression. |
| * |
| * @throws IOException |
| */ |
| @Test |
| public void testPDFBox4153() throws IOException |
| { |
| try (PDDocument doc = Loader.loadPDF(new File(TARGETPDFDIR, "PDFBOX-4153-WXMDXCYRWFDCMOSFQJ5OAJIAFXYRZ5OA.pdf"))) |
| { |
| PDDocumentOutline documentOutline = doc.getDocumentCatalog().getDocumentOutline(); |
| PDOutlineItem firstChild = documentOutline.getFirstChild(); |
| assertEquals("Main Menu", firstChild.getTitle()); |
| } |
| } |
| |
| /** |
| * Test that PDFBOX-4490 has 3 pages. |
| * |
| * @throws IOException |
| */ |
| @Test |
| public void testPDFBox4490() throws IOException |
| { |
| try (PDDocument doc = Loader.loadPDF(new File(TARGETPDFDIR, "PDFBOX-4490.pdf"))) |
| { |
| assertEquals(3, doc.getNumberOfPages()); |
| } |
| } |
| |
| } |