// tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.tika.parser.pdf;

 import static org.junit.Assert.assertArrayEquals;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;

 import java.io.InputStream;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.logging.Level;
 import java.util.logging.Logger;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

 import org.apache.commons.io.IOUtils;
 import org.apache.pdfbox.rendering.ImageType;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
 import org.junit.Ignore;
 import org.junit.Test;
 import org.xml.sax.ContentHandler;

 import org.apache.tika.Tika;
 import org.apache.tika.TikaTest;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.AccessPermissionException;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.exception.ZeroByteFileException;
 import org.apache.tika.extractor.DocumentSelector;
 import org.apache.tika.metadata.Font;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Office;
 import org.apache.tika.metadata.OfficeOpenXMLCore;
 import org.apache.tika.metadata.PDF;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.metadata.XMP;
 import org.apache.tika.metadata.XMPMM;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.CompositeParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.PasswordProvider;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.ContentHandlerDecorator;

 /**
  * Test case for parsing pdf files.
  */
 public class PDFParserTest extends TikaTest {

     /**
      * Level of the "org.apache.pdfbox" logger as recorded by {@link #setup()};
      * {@link #tearDown()} applies it again once the tests are done.
      * Starts out as {@link Level#INFO} until {@code setup()} overwrites it.
      */
     public static Level PDFBOX_LOG_LEVEL = Level.INFO;

     @BeforeClass
     public static void setup() {
         // Silence PDFBox logging for this test class, but keep the level that was
         // in effect so it can be put back afterwards.
         final Logger pdfBoxLogger = Logger.getLogger("org.apache.pdfbox");
         PDFBOX_LOG_LEVEL = pdfBoxLogger.getLevel();
         pdfBoxLogger.setLevel(Level.OFF);
     }

     @AfterClass
     public static void tearDown() {
         // Put the PDFBox logger back to the level remembered before the tests ran.
         final Logger pdfBoxLogger = Logger.getLogger("org.apache.pdfbox");
         pdfBoxLogger.setLevel(PDFBOX_LOG_LEVEL);
     }

     /**
      * Counts how many times {@code needle} occurs in {@code haystack}.
      * Occurrences may overlap: {@code "aa"} is found three times in {@code "aaaa"}.
      *
      * @param needle   the text to look for
      * @param haystack the text to search in
      * @return the number of (possibly overlapping) occurrences, or {@code 0}
      *         when {@code needle} is empty
      */
     private static int substringCount(String needle, String haystack) {
         if (needle.isEmpty()) {
             // An empty needle matches at every index (including haystack.length()),
             // so indexOf would never return -1 and the search would not terminate.
             return 0;
         }
         int count = 0;
         int at = haystack.indexOf(needle);
         while (at != -1) {
             count++;
             // Step by a single character, not by needle.length(), to keep overlaps.
             at = haystack.indexOf(needle, at + 1);
         }
         return count;
     }

     /**
      * Parses testPDF.pdf and checks both the core metadata and the extracted text.
      */
     @Test
     public void testPdfParsing() throws Exception {
         final XMLResult result = getXML("testPDF.pdf");
         final Metadata md = result.metadata;
         final String body = result.xml;

         // Core metadata
         assertEquals("application/pdf", md.get(Metadata.CONTENT_TYPE));
         assertEquals("Bertrand Delacr\u00e9taz", md.get(TikaCoreProperties.CREATOR));
         assertEquals("Firefox", md.get(TikaCoreProperties.CREATOR_TOOL));
         assertEquals("Apache Tika - Apache Tika", md.get(TikaCoreProperties.TITLE));

         // Dates cannot be tested reliably yet (see TIKA-451), so these stay disabled:
         //assertEquals("Sat Sep 15 10:02:31 BST 2007", md.get(Metadata.CREATION_DATE));
         //assertEquals("Sat Sep 15 10:02:31 BST 2007", md.get(Metadata.LAST_MODIFIED));

         // Extracted text
         for (String expected : new String[]{"Apache Tika", "Tika - Content Analysis Toolkit",
                 "incubator", "Apache Software Foundation"}) {
             assertContains(expected, body);
         }

         // The end of one paragraph must be separated from the start of the next one.
         assertFalse("should have word boundary after headline",
                 body.contains("ToolkitApache"));
         assertFalse("should have word boundary between paragraphs",
                 body.contains("libraries.Apache"));
     }

     /**
      * Enables font name extraction through a {@link PDFParserConfig} placed in the
      * parse context and checks that a known font name shows up in the metadata.
      */
     @Test
     public void testFontNameExtraction() throws Exception {
         final PDFParserConfig fontConfig = new PDFParserConfig();
         fontConfig.setExtractFontNames(true);

         final ParseContext context = new ParseContext();
         context.set(PDFParserConfig.class, fontConfig);

         final Metadata md = getXML("testPDFVarious.pdf", context).metadata;
         assertContains("ABCDEE+Calibri", md.get(Font.FONT_NAME));
     }

     /**
      * Same document as {@code testPdfParsing}, but only the metadata is inspected.
      */
     @Test
     public void testPdfParsingMetadataOnly() throws Exception {
         final XMLResult result = getXML("testPDF.pdf");
         final Metadata md = result.metadata;

         assertEquals("application/pdf", md.get(Metadata.CONTENT_TYPE));
         assertEquals("Bertrand Delacr\u00e9taz", md.get(TikaCoreProperties.CREATOR));
         assertEquals("Firefox", md.get(TikaCoreProperties.CREATOR_TOOL));
         assertEquals("Apache Tika - Apache Tika", md.get(TikaCoreProperties.TITLE));
     }

     /**
      * Checks standard metadata plus custom single-valued and multi-valued
      * properties of testPDF-custommetadata.pdf.
      */
     @Test
     public void testCustomMetadata() throws Exception {
         final XMLResult result = getXML("testPDF-custommetadata.pdf");
         final Metadata md = result.metadata;

         assertEquals("application/pdf", md.get(Metadata.CONTENT_TYPE));
         assertEquals("Document author", md.get(TikaCoreProperties.CREATOR));
         assertEquals("Document title", md.get(TikaCoreProperties.TITLE));

         // Single-valued custom property
         assertEquals("Custom Value", md.get("Custom Property"));

         // Multi-valued custom property: get() yields the first entry,
         // getValues() yields all of them.
         assertEquals("Array Entry 1", md.get("Custom Array"));
         final String[] customArray = md.getValues("Custom Array");
         assertEquals(2, customArray.length);
         assertEquals("Array Entry 1", customArray[0]);
         assertEquals("Array Entry 2", customArray[1]);

         assertContains("Hello World!", result.xml);
     }

     /**
      * PDFs can be "protected" with the default password. This means
      * they're encrypted (potentially both text and metadata),
      * but we can decrypt them easily.
      * <p>
      * The document is parsed three times: without any password provider, with a
      * provider returning an empty password, and with a provider returning a
      * wrong password (which must raise an {@link EncryptedDocumentException}).
      */
     @Test
     public void testProtectedPDF() throws Exception {
         final String expectedTitle =
                 "Rethinking the Financial Network, Speech by Andrew G Haldane, " +
                         "Executive Director, Financial Stability delivered at the " +
                         "Financial Student Association, Amsterdam on 28 April 2009";

         // --- no password provider at all ---
         XMLResult result = getXML("testPDF_protected.pdf");
         Metadata md = result.metadata;
         assertEquals("true", md.get("pdf:encrypted"));
         assertEquals("application/pdf", md.get(Metadata.CONTENT_TYPE));
         assertEquals("The Bank of England", md.get(TikaCoreProperties.CREATOR));
         assertEquals("Speeches by Andrew G Haldane", md.get(OfficeOpenXMLCore.SUBJECT));
         assertEquals(expectedTitle, md.get(TikaCoreProperties.TITLE));

         assertContains("RETHINKING THE FINANCIAL NETWORK", result.xml);
         assertContains("On 16 November 2002", result.xml);
         assertContains("In many important respects", result.xml);

         // --- explicit empty password ---
         final ParseContext context = new ParseContext();
         context.set(PasswordProvider.class, new PasswordProvider() {
             public String getPassword(Metadata unused) {
                 return "";
             }
         });
         result = getXML("testPDF_protected.pdf", context);
         md = result.metadata;
         assertEquals("true", md.get("pdf:encrypted"));
         assertEquals("application/pdf", md.get(Metadata.CONTENT_TYPE));
         assertEquals("The Bank of England", md.get(TikaCoreProperties.CREATOR));
         assertEquals("Speeches by Andrew G Haldane", md.get(OfficeOpenXMLCore.SUBJECT));
         assertEquals(expectedTitle, md.get(TikaCoreProperties.TITLE));

         assertContains("RETHINKING THE FINANCIAL NETWORK", result.xml);
         assertContains("On 16 November 2002", result.xml);
         assertContains("In many important respects", result.xml);

         // --- wrong password ---
         context.set(PasswordProvider.class, new PasswordProvider() {
             public String getPassword(Metadata unused) {
                 return "WRONG!!!!";
             }
         });

         boolean rejected = false;
         final ContentHandler handler = new BodyContentHandler();
         md = new Metadata();
         try (InputStream stream = getResourceAsStream("/test-documents/testPDF_protected.pdf")) {
             AUTO_DETECT_PARSER.parse(stream, handler, md, context);
         } catch (EncryptedDocumentException expected) {
             rejected = true;
         }
         assertTrue("encryption exception", rejected);
         assertEquals("application/pdf", md.get(Metadata.CONTENT_TYPE));
         assertEquals("true", md.get("pdf:encrypted"));
         // only pdf:encrypted, X-Parsed-By and Content-Type are expected
         assertEquals("very little metadata should be parsed", 3, md.names().length);
         assertEquals(0, handler.toString().length());
     }

     /**
      * With default settings the text of the left text box is emitted completely
      * before the text of the right one.
      */
     @Test
     public void testTwoTextBoxes() throws Exception {
         final String rawText;
         try (InputStream in = getResourceAsStream(
                 "/test-documents/testPDFTwoTextBoxes.pdf")) {
             rawText = getText(in, AUTO_DETECT_PARSER);
         }
         // Collapse whitespace runs so line breaks do not matter for the comparison.
         final String normalized = rawText.replaceAll("\\s+", " ");
         assertContains(
                 "Left column line 1 Left column line 2 Right column line 1 Right column line 2",
                 normalized);
     }

     /**
      * Walks through the many kinds of content in testPDFVarious.pdf (footnotes,
      * header/footer, text box, formatting, tables, lists, keywords, subject and
      * non-Latin text) and checks each of them in the extracted text or metadata.
      */
     @Test
     public void testVarious() throws Exception {
         final Metadata md = new Metadata();
         final String text;
         try (InputStream in = getResourceAsStream("/test-documents/testPDFVarious.pdf")) {
             text = getText(in, AUTO_DETECT_PARSER, md);
         }

         // Plain phrases, checked against the text exactly as extracted.
         final String[] phrases = {
             "Footnote appears here",
             "This is a footnote.",
             "This is the header text.",
             "This is the footer text.",
             "Here is a text box",
             "Bold",
             "italic",
             "underline",
             "superscript",
             "subscript",
             "Here is a citation:",
             "Figure 1 This is a caption for Figure 1",
             "(Kramer)",
         };
         for (String phrase : phrases) {
             assertContains(phrase, text);
         }

         // Table contents are compared with whitespace runs collapsed to one space.
         final String collapsed = text.replaceAll("\\s+", " ");
         assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row " +
                 "2 Col 1 Row 2 Col 2 Row 2 Col 3", collapsed);
         assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2",
                 collapsed);

         assertContains("This is a hyperlink", text);

         // Bulleted list: only the label is checked, not the bullet glyph or tab.
         assertContains("Here is a list:", text);
         for (int i = 1; i <= 3; i++) {
             assertContains("Bullet " + i, text);
         }

         // Numbered list: number, closing parenthesis, single space, label.
         assertContains("Here is a numbered list:", text);
         for (int i = 1; i <= 3; i++) {
             assertContains(i + ") Number bullet " + i, text);
         }

         // Every cell of the 2x3 table, one by one.
         for (int r = 1; r <= 2; r++) {
             for (int c = 1; c <= 3; c++) {
                 assertContains("Row " + r + " Col " + c, text);
             }
         }

         // Keywords and subject appear both in the text and in the metadata.
         assertContains("Keyword1 Keyword2", text);
         assertEquals("Keyword1 Keyword2", md.get(Office.KEYWORDS));

         assertContains("Subject is here", text);
         assertEquals("Subject is here", md.get(OfficeOpenXMLCore.SUBJECT));

         assertContains("Suddenly some Japanese text:", text);
         // Special version of (GHQ)
         assertContains("\uff08\uff27\uff28\uff31\uff09", text);
         // followed by further Japanese characters
         assertContains("\u30be\u30eb\u30b2\u3068\u5c3e" +
                 "\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", text);

         assertContains("And then some Gothic text:", text);
         // TODO: I saved the word doc as a PDF, but that
         // process somehow, apparently lost the gothic
         // chars, so we cannot test this here:
         //assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44
         // \uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
     }

     /**
      * Annotation text is extracted by default and can be switched off either on
      * the parser's own config or through a config in the parse context.
      */
     @Test
     public void testAnnotations() throws Exception {
         final String resource = "/test-documents/testAnnotations.pdf";
         final String whitespaceRun = "[\\s\u00a0]+";
         final String bodyText = "Here is some text";
         final String commentText = "Here is a comment";

         // Defaults: body text and annotation text are both present.
         String text;
         try (InputStream in = getResourceAsStream(resource)) {
             text = getText(in, AUTO_DETECT_PARSER);
         }
         text = text.replaceAll(whitespaceRun, " ");
         assertContains(bodyText, text);
         assertContains(commentText, text);

         // Test w/ annotation text disabled on the parser itself:
         final PDFParser pdfParser = new PDFParser();
         pdfParser.getPDFParserConfig().setExtractAnnotationText(false);
         try (InputStream in = getResourceAsStream(resource)) {
             text = getText(in, pdfParser);
         }
         text = text.replaceAll(whitespaceRun, " ");
         assertContains(bodyText, text);
         assertEquals(-1, text.indexOf(commentText));

         // annotation text disabled through parsecontext
         final PDFParserConfig config = new PDFParserConfig();
         config.setExtractAnnotationText(false);
         final ParseContext context = new ParseContext();
         context.set(PDFParserConfig.class, config);
         try (InputStream in = getResourceAsStream(resource)) {
             text = getText(in, AUTO_DETECT_PARSER, context);
         }
         text = text.replaceAll(whitespaceRun, " ");
         assertContains(bodyText, text);
         assertEquals(-1, text.indexOf(commentText));

         // TIKA-738: make sure no extra </p> tags
         final String xml = getXML("testAnnotations.pdf").xml;
         final int opened = substringCount("<p>", xml);
         final int closed = substringCount("</p>", xml);
         assertEquals(opened, closed);
     }

     /**
      * TIKA-981: the text of a popup annotation, "this is the note", and the
      * string "igalsh" must both appear in the XHTML output.
      */
     @Test
     public void testPopupAnnotation() throws Exception {
         final String xml = getXML("testPopupAnnotation.pdf").xml;
         assertContains("this is the note", xml);
         assertContains("igalsh", xml);
     }

     /**
      * Parsing testPDFPackage.pdf must yield the markers "PDF1" and "PDF2"
      * in the XHTML output.
      */
     @Test
     public void testEmbeddedPDFs() throws Exception {
         final XMLResult result = getXML("testPDFPackage.pdf");
         for (String marker : new String[]{"PDF1", "PDF2"}) {
             assertContains(marker, result.xml);
         }
     }

     /**
      * The page number "1" must come out as a paragraph of its own.
      */
     @Test
     public void testPageNumber() throws Exception {
         final String xml = getXML("testPageNumber.pdf").xml;
         // Strip all whitespace so the markup can be matched literally.
         final String compact = xml.replaceAll("\\s+", "");
         assertContains("<p>1</p>", compact);
     }

     /**
      * Test to ensure that links are extracted from the text.
      * <p>
      * Note - the PDF contains the text "This is a hyperlink", which has
      * a hyperlink annotation on it that links to the Tika site. This
      * test will need updating when we're able to apply the annotation
      * to the text itself, rather than following on afterwards as now.
      */
     @Test
     public void testLinks() throws Exception {
         final String expectedMarkup =
                 "<div class=\"annotation\"><a href=\"http://tika.apache.org/\">" +
                         "http://tika.apache.org/</a></div>";
         final String xml = getXML("testPDFVarious.pdf").xml;
         assertContains(expectedMarkup, xml);
     }

     /**
      * The phrase "Here is some formatted text" is only extracted intact when
      * autoSpace is disabled; this is checked both with the setting on the parser
      * and with a {@link PDFParserConfig} in the parse context.
      */
     @Test
     public void testDisableAutoSpace() throws Exception {
         final String file = "testExtraSpaces.pdf";
         final String whitespaceRun = "[\\s\u00a0]+";
         final String phrase = "Here is some formatted text";

         // --- setting on the parser instance ---
         final PDFParser parser = new PDFParser();
         parser.getPDFParserConfig().setEnableAutoSpace(false);
         String text = getXML(file, parser).xml.replaceAll(whitespaceRun, " ");
         // Text is correct when autoSpace is off:
         assertContains(phrase, text);

         parser.getPDFParserConfig().setEnableAutoSpace(true);
         text = getXML(file, parser).xml.replaceAll(whitespaceRun, " ");
         // Text has extra spaces when autoSpace is on
         assertEquals(-1, text.indexOf(phrase));

         // --- now try with autodetect and a config in the parse context ---
         final ParseContext context = new ParseContext();
         final PDFParserConfig config = new PDFParserConfig();
         context.set(PDFParserConfig.class, config);
         // default is true
         text = getXML(file, context).xml.replaceAll(whitespaceRun, " ");
         // Text has extra spaces when autoSpace is on
         assertEquals(-1, text.indexOf(phrase));

         config.setEnableAutoSpace(false);
         // NOTE(review): this call passes the PDFParser from above (autoSpace still on)
         // together with the context; presumably the context config wins - confirm.
         text = getXML(file, parser, context).xml.replaceAll(whitespaceRun, " ");
         // Text is correct when autoSpace is off:
         assertContains(phrase, text);
     }

     /**
      * Overlapping duplicate text is kept by default and removed once
      * suppressDuplicateOverlappingText is enabled, whether on the parser's
      * config or on a config passed in the parse context.
      */
     @Test
     public void testDuplicateOverlappingText() throws Exception {
         final String file = "testOverlappingText.pdf";
         final String keptText = "Text the first timeText the second time";
         final String dedupedText = "Text the first timesecond time";

         // --- setting on the parser instance ---
         final PDFParser parser = new PDFParser();
         // Default is false (keep overlapping text):
         String xml = getXML(file, parser).xml;
         assertContains(keptText, xml);

         parser.getPDFParserConfig().setSuppressDuplicateOverlappingText(true);
         xml = getXML(file, parser).xml;
         // "Text the first" was dedup'd:
         assertContains(dedupedText, xml);

         // --- now try with autodetect and a config in the parse context ---
         final PDFParserConfig config = new PDFParserConfig();
         final ParseContext context = new ParseContext();
         context.set(PDFParserConfig.class, config);
         xml = getXML(file, context).xml;
         // Default is false (keep overlapping text):
         assertContains(keptText, xml);

         config.setSuppressDuplicateOverlappingText(true);
         xml = getXML(file, context).xml;
         // "Text the first" was dedup'd:
         assertContains(dedupedText, xml);
     }

     /**
      * Checks the effect of sortByPosition on a two-column document: without
      * sorting the left column is emitted before the right one, with sorting the
      * lines of both columns are interleaved. The setting is exercised both on the
      * parser and through a {@link PDFParserConfig} in the parse context.
      */
     @Test
     public void testSortByPosition() throws Exception {
         PDFParser parser = new PDFParser();
         parser.getPDFParserConfig().setEnableAutoSpace(false);

         String content;
         // Default is false (do not sort):
         // try-with-resources so the stream is closed, as in the other tests here
         try (InputStream stream = getResourceAsStream(
                 "/test-documents/testPDFTwoTextBoxes.pdf")) {
             content = getText(stream, parser);
         }
         content = content.replaceAll("\\s+", " ");
         assertContains(
                 "Left column line 1 Left column line 2 Right column line 1 Right column line 2",
                 content);

         parser.setSortByPosition(true);
         try (InputStream stream = getResourceAsStream(
                 "/test-documents/testPDFTwoTextBoxes.pdf")) {
             content = getText(stream, parser);
         }
         content = content.replaceAll("\\s+", " ");
         // Column text is now interleaved:
         assertContains(
                 "Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2",
                 content);

         //now try setting autodetect via parsecontext
         ParseContext context = new ParseContext();
         PDFParserConfig config = new PDFParserConfig();
         context.set(PDFParserConfig.class, config);
         // Default is false (do not sort):
         content = getText("testPDFTwoTextBoxes.pdf", new Metadata(), context);
         content = content.replaceAll("\\s+", " ");
         assertContains(
                 "Left column line 1 Left column line 2 Right column line 1 Right column line 2",
                 content);

         config.setSortByPosition(true);
         context.set(PDFParserConfig.class, config);
         // The file is loaded by name here, so no stream needs to be opened.
         content = getText("testPDFTwoTextBoxes.pdf", new Metadata(), context);
         content = content.replaceAll("\\s+", " ");
         // Column text is now interleaved:
         assertContains(
                 "Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2",
                 content);
     }

     /**
      * TIKA-1035: the bookmark text must be present in the XHTML output and
      * must be located before the closing body tag.
      */
     @Test
     public void testBookmarks() throws Exception {
         final String xml = getXML("testPDF_bookmarks.pdf").xml;
         final int bookmarkAt = xml.indexOf("Denmark bookmark is here");
         final int bodyEndAt = xml.indexOf("</body>");
         assertTrue(bookmarkAt != -1);
         assertTrue(bodyEndAt != -1);
         assertTrue(bookmarkAt < bodyEndAt);
     }

     /**
      * TIKA-2303: with bookmark text extraction switched off in the config,
      * the bookmark text must not show up in the XHTML output.
      */
     @Test
     public void testTurningOffBookmarks() throws Exception {
         final PDFParserConfig noBookmarks = new PDFParserConfig();
         noBookmarks.setExtractBookmarksText(false);

         final ParseContext context = new ParseContext();
         context.set(PDFParserConfig.class, noBookmarks);

         final XMLResult result = getXML("testPDF_bookmarks.pdf", context);
         assertNotContained("Denmark bookmark is here", result.xml);
     }


     // TIKA-973
     //commented out until test documents that are unambiguously
     //consistent with Apache License v2.0 are contributed.
     //TODO: add back test for AcroForm extraction; test document should include
     //recursive forms
 /*    public void testAcroForm() throws Exception{
        Parser p = new AutoDetectParser();
        ParseContext context = new ParseContext();
        InputStream stream = getResourceAsStream("/test-documents/testPDF_acroForm1.pdf");
        String txt = getText(stream, p, context);
        stream.close();

        //simple first level form contents
        assertContains("to: John Doe", txt);
        //checkbox
        assertContains("xpackaging: Yes", txt);

        //this guarantees that the form processor
        //worked recursively at least once...i.e. it didn't just
        //take the first form
        stream = getResourceAsStream("/test-documents/testPDF_acroForm2.pdf");
        txt = getText(stream, p, context);
        stream.close();
        assertContains("123 Main St.", txt);


        //now test with nonsequential parser
        PDFParserConfig config = new PDFParserConfig();
        config.setUseNonSequentialParser(true);
        context.set(PDFParserConfig.class, config);
        stream = getResourceAsStream("/test-documents/testPDF_acroForm1.pdf");
        txt = getText(stream, p, context);
        stream.close();

        //simple first level form contents
        assertContains("to: John Doe", txt);
        //checkbox
        assertContains("xpackaging: Yes", txt);

        //this guarantees that the form processor
        //worked recursively at least once...i.e. it didn't just
        //take the first form
        stream = getResourceAsStream("/test-documents/testPDF_acroForm2.pdf");
        txt = getText(stream, p, context);
        assertContains("123 Main St.", txt);
        stream.close();
     }
 */

     /**
      * TIKA-1226. The current test document has no content in the signature
      * area, so this mainly verifies that no RuntimeException is thrown.
      * TODO: find a better test file for this issue.
      */
     @Test
     public void testSignatureInAcroForm() throws Exception {
         final XMLResult parsed = getXML("testPDF_acroform3.pdf");
         final Metadata md = parsed.metadata;

         assertEquals("true", md.get(PDF.HAS_XMP));
         assertEquals("true", md.get(PDF.HAS_ACROFORM_FIELDS));
         assertEquals("false", md.get(PDF.HAS_XFA));

         final boolean hasTextField = parsed.xml.contains("<li>aTextField: TIKA-1226</li>");
         assertTrue("found", hasTextField);
     }

     /**
      * TIKA-1341: the handler must see exactly one endDocument event.
      */
     @Test
     public void testSingleCloseDoc() throws Exception {
         final EventCountingHandler counter = new EventCountingHandler();
         final Metadata md = new Metadata();
         final ParseContext context = new ParseContext();
         try (InputStream in = getResourceAsStream(
                 "/test-documents/testPDFTripleLangTitle.pdf")) {
             AUTO_DETECT_PARSER.parse(in, counter, md, context);
         }
         assertEquals(1, counter.getEndDocument());
     }

     /**
      * Checks the version metadata (dc:format, pdf:PDFVersion and, where present,
      * pdf:PDFExtensionVersion) for a series of testPDF_Version.*.pdf files, then
      * checks the full set of dc:format values and the PDF/A identification of
      * the 11.x PDF/A-1b file.
      */
     @Test
     public void testVersions() throws Exception {

         // file name infix -> a dc:format value that must be present
         Map<String, String> dcFormat = new HashMap<>();
         dcFormat.put("4.x", "application/pdf; version=1.3");
         dcFormat.put("5.x", "application/pdf; version=1.4");
         dcFormat.put("6.x", "application/pdf; version=1.5");
         dcFormat.put("7.x", "application/pdf; version=1.6");
         dcFormat.put("8.x", "application/pdf; version=1.7");
         dcFormat.put("9.x", "application/pdf; version=1.7");
         dcFormat.put("10.x", "application/pdf; version=1.7");
         dcFormat.put("11.x.PDFA-1b", "application/pdf; version=1.7");

         // file name infix -> expected pdf:PDFVersion
         Map<String, String> pdfVersions = new HashMap<>();
         pdfVersions.put("4.x", "1.3");
         pdfVersions.put("5.x", "1.4");
         pdfVersions.put("6.x", "1.5");
         pdfVersions.put("7.x", "1.6");
         pdfVersions.put("8.x", "1.7");
         pdfVersions.put("9.x", "1.7");
         pdfVersions.put("10.x", "1.7");
         pdfVersions.put("11.x.PDFA-1b", "1.7");

         // file name infix -> expected pdf:PDFExtensionVersion (only some files have one)
         Map<String, String> pdfExtensionVersions = new HashMap<>();
         pdfExtensionVersions.put("9.x", "1.7 Adobe Extension Level 3");
         pdfExtensionVersions.put("10.x", "1.7 Adobe Extension Level 8");
         pdfExtensionVersions.put("11.x.PDFA-1b", "1.7 Adobe Extension Level 8");

         for (Map.Entry<String, String> e : dcFormat.entrySet()) {
             String fName = "testPDF_Version." + e.getKey() + ".pdf";

             XMLResult r = getXML(fName);
             // dc:format is multi-valued; the expected value only has to be one of them
             boolean foundDC =
                     Arrays.asList(r.metadata.getValues("dc:format")).contains(e.getValue());
             assertTrue("dc:format ::" + e.getValue(), foundDC);

             String extensionVersionTruth = pdfExtensionVersions.get(e.getKey());
             if (extensionVersionTruth != null) {
                 assertEquals("pdf:PDFExtensionVersion :: " + extensionVersionTruth,
                         extensionVersionTruth, r.metadata.get("pdf:PDFExtensionVersion"));
             }
             assertEquals("pdf:PDFVersion", pdfVersions.get(e.getKey()),
                     r.metadata.get("pdf:PDFVersion"));
         }

         //now test full 11.x
         XMLResult r = getXML("testPDF_Version.11.x.PDFA-1b.pdf");
         Set<String> versions = new HashSet<>(Arrays.asList(r.metadata.getValues("dc:format")));

         for (String hit : new String[]{"application/pdf; version=1.7",
                 "application/pdf; version=\"A-1b\"",
                 "application/pdf; version=\"1.7 Adobe Extension Level 8\""}) {
             assertTrue(hit, versions.contains(hit));
         }

         // assertEquals takes (message, expected, actual): the literal is the expectation
         assertEquals("pdfaid:conformance", "B", r.metadata.get("pdfaid:conformance"));
         assertEquals("pdfaid:part", "1", r.metadata.get("pdfaid:part"));
     }

     /**
      * Both authors of testPDF_twoAuthors.pdf must be reported as creators.
      */
     @Test
     public void testMultipleAuthors() throws Exception {
         final Metadata md = getXML("testPDF_twoAuthors.pdf").metadata;
         final String[] creators = md.getValues(TikaCoreProperties.CREATOR);
         final List<String> authors = Arrays.asList(creators);

         assertContains("Sample Author 1", authors);
         assertContains("Sample Author 2", authors);
     }

     /**
      * STUB test for once TIKA-1295 is fixed; for now only dc:title is checked.
      */
     @Test
     public void testMultipleTitles() throws Exception {
         final Metadata md = getXML("testPDFTripleLangTitle.pdf").metadata;
         //TODO: add other tests as part of TIKA-1295
         //dc:title-fr-ca (or whatever we decide) should be "Bonjour World"
         //dc:title-zh-ch is currently hosed...bug in PDFBox while injecting xmp?
         final String title = md.get("dc:title");
         assertEquals("Hello World", title);
     }

     /**
      * With inline image extraction enabled, counts embedded resources tagged as
      * inline and as attachment; then installs an {@code AvoidInlineSelector} as
      * {@link DocumentSelector} and checks that no inline resources remain.
      */
     @Test
     public void testInlineSelector() throws Exception {
         final String inlineType = TikaCoreProperties.EmbeddedResourceType.INLINE.toString();
         final String attachmentType =
                 TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString();

         // Turn on inline image extraction, not restricted to unique images.
         final PDFParserConfig config = new PDFParserConfig();
         config.setExtractInlineImages(true);
         config.setExtractUniqueInlineImagesOnly(false);
         final ParseContext context = new ParseContext();
         context.set(PDFParserConfig.class, config);

         int inline = 0;
         int attach = 0;
         for (Metadata m : getRecursiveMetadata("testPDF_childAttachments.pdf", context)) {
             // type is null for entries without a resource type; equals() then fails
             final String type = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
             if (inlineType.equals(type)) {
                 inline++;
             } else if (attachmentType.equals(type)) {
                 attach++;
             }
         }
         assertEquals(2, inline);
         assertEquals(2, attach);

         //now try turning off inline
         context.set(DocumentSelector.class, new AvoidInlineSelector());
         inline = 0;
         attach = 0;
         for (Metadata m : getRecursiveMetadata("testPDF_childAttachments.pdf", context)) {
             final String type = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
             if (inlineType.equals(type)) {
                 inline++;
             } else if (attachmentType.equals(type)) {
                 attach++;
             }
         }
         assertEquals(0, inline);
         assertEquals(2, attach);
     }


     /**
      * Without any config no inline resources are reported; once inline image
      * extraction is enabled through a {@link PDFParserConfig} in the parse
      * context, two inline resources are reported in addition to the attachments.
      */
     @Test
     public void testInlineConfig() throws Exception {
         final String inlineType = TikaCoreProperties.EmbeddedResourceType.INLINE.toString();
         final String attachmentType =
                 TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString();

         // First pass: no config supplied.
         int inline = 0;
         int attach = 0;
         for (Metadata m : getRecursiveMetadata("testPDF_childAttachments.pdf")) {
             // type is null for entries without a resource type; equals() then fails
             final String type = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
             if (inlineType.equals(type)) {
                 inline++;
             } else if (attachmentType.equals(type)) {
                 attach++;
             }
         }
         assertEquals(0, inline);
         assertEquals(2, attach);

         // Second pass: turn inline image extraction on.
         final PDFParserConfig config = new PDFParserConfig();
         config.setExtractInlineImages(true);
         config.setExtractUniqueInlineImagesOnly(false);
         final ParseContext context = new ParseContext();
         context.set(PDFParserConfig.class, config);

         inline = 0;
         attach = 0;
         for (Metadata m : getRecursiveMetadata("testPDF_childAttachments.pdf", context)) {
             final String type = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
             if (inlineType.equals(type)) {
                 inline++;
             } else if (attachmentType.equals(type)) {
                 attach++;
             }
         }
         assertEquals(2, inline);
         assertEquals(2, attach);
     }

     /**
      * Checks that the entry at index 1 of the recursive metadata for
      * testPDF_multiFormatEmbFiles.pdf carries the resource name "Test.txt".
      */
     @Test //TIKA-1376
     public void testEmbeddedFileNameExtraction() throws Exception {
         List<Metadata> parsed = getRecursiveMetadata("testPDF_multiFormatEmbFiles.pdf");
         assertEquals("metadata size", 5, parsed.size());

         //the entry at index 1 is treated as the first attachment
         String attachmentName = parsed.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY);
         assertEquals("attachment file name", "Test.txt", attachmentName);
     }

     /**
      * Checks the XHTML markup emitted for a regular attachment, an inline image and
      * a file attached through an annotation.
      */
     @Test //TIKA-1427
     public void testEmbeddedFileMarkup() throws Exception {
         PDFParserConfig inlineConfig = new PDFParserConfig();
         inlineConfig.setExtractInlineImages(true);
         inlineConfig.setExtractUniqueInlineImagesOnly(false);

         ParseContext parseContext = new ParseContext();
         parseContext.set(org.apache.tika.parser.Parser.class, AUTO_DETECT_PARSER);
         parseContext.set(org.apache.tika.parser.pdf.PDFParserConfig.class, inlineConfig);

         String attachmentsXml = getXML("testPDF_childAttachments.pdf", parseContext).xml;
         //regular attachment
         assertContains("<div source=\"attachment\" class=\"embedded\" id=\"Unit10.doc\" />",
                 attachmentsXml);
         //inline image
         assertContains("<img src=\"embedded:image1.tif\" alt=\"image1.tif\" />",
                 attachmentsXml);

         //doc embedded inside an annotation (parsed without the custom context)
         String annotationXml = getXML("testPDFFileEmbInAnnotation.pdf").xml;
         assertContains("<div source=\"annotationFileAttachment\" class=\"embedded\" " +
                 "id=\"Excel.xlsx\" />", annotationXml);
     }

     //Access checker tests

     /**
      * With no AccessChecker configured, text must be extracted from the
      * restricted test documents without an AccessPermissionException.
      */
     @Test
     public void testLegacyAccessChecking() throws Exception {
         //documents parsed without supplying any password
         String[] emptyPasswordDocs = {
                 "testPDF_no_extract_no_accessibility_owner_empty.pdf",
                 "testPDF_no_extract_yes_accessibility_owner_empty.pdf"};
         for (String doc : emptyPasswordDocs) {
             assertContains("Hello World", getXML(doc).xml);
         }

         //now the documents parsed with the "user" password
         ParseContext userPasswordContext = new ParseContext();
         userPasswordContext.set(PasswordProvider.class, new PasswordProvider() {
             @Override
             public String getPassword(Metadata metadata) {
                 return "user";
             }
         });

         String[] userPasswordDocs = {
                 "testPDF_no_extract_no_accessibility_owner_user.pdf",
                 "testPDF_no_extract_yes_accessibility_owner_user.pdf"};
         for (String doc : userPasswordDocs) {
             String xml = getXML(doc, userPasswordContext).xml;
             assertContains("Hello World", xml);
         }
     }

     /**
      * Exercises both AccessChecker settings against the two documents that are
      * parsed here without supplying a password.
      */
     @Test
     public void testAccessCheckingEmptyPassword() throws Exception {
         final String noAccessibilityDoc = "testPDF_no_extract_no_accessibility_owner_empty.pdf";
         final String yesAccessibilityDoc = "testPDF_no_extract_yes_accessibility_owner_empty.pdf";

         PDFParserConfig parserConfig = new PDFParserConfig();
         //don't allow extraction, not even for accessibility
         parserConfig.setAccessChecker(new AccessChecker(false));

         ParseContext parseContext = new ParseContext();
         parseContext.set(PDFParserConfig.class, parserConfig);

         //both documents must be refused
         assertException("/test-documents/" + noAccessibilityDoc, AUTO_DETECT_PARSER,
                 parseContext, AccessPermissionException.class);
         assertException("/test-documents/" + yesAccessibilityDoc, AUTO_DETECT_PARSER,
                 parseContext, AccessPermissionException.class);

         //with AccessChecker(true) only the "yes_accessibility" document is readable
         parserConfig.setAccessChecker(new AccessChecker(true));
         assertException("/test-documents/" + noAccessibilityDoc, AUTO_DETECT_PARSER,
                 parseContext, AccessPermissionException.class);

         String xml = getXML(yesAccessibilityDoc, parseContext).xml;
         assertContains("Hello World", xml);
     }

     /**
      * Exercises both AccessChecker settings while the password provider always
      * answers "user".
      */
     @Test
     public void testAccessCheckingUserPassword() throws Exception {
         String[] emptyPasswordDocs = {
                 "testPDF_no_extract_no_accessibility_owner_empty.pdf",
                 "testPDF_no_extract_yes_accessibility_owner_empty.pdf"};
         String[] userPasswordDocs = {
                 "testPDF_no_extract_no_accessibility_owner_user.pdf",
                 "testPDF_no_extract_yes_accessibility_owner_user.pdf"};

         ParseContext parseContext = new ParseContext();

         PDFParserConfig parserConfig = new PDFParserConfig();
         //don't allow extraction, not even for accessibility
         parserConfig.setAccessChecker(new AccessChecker(false));

         parseContext.set(PasswordProvider.class, new PasswordProvider() {
             @Override
             public String getPassword(Metadata metadata) {
                 return "user";
             }
         });
         parseContext.set(PDFParserConfig.class, parserConfig);

         //test bad passwords: "user" is rejected for these documents
         for (String doc : emptyPasswordDocs) {
             assertException("/test-documents/" + doc, AUTO_DETECT_PARSER, parseContext,
                     EncryptedDocumentException.class);
         }

         //bad password is still a bad password, whatever the checker setting
         parserConfig.setAccessChecker(new AccessChecker(true));
         for (String doc : emptyPasswordDocs) {
             assertException("/test-documents/" + doc, AUTO_DETECT_PARSER, parseContext,
                     EncryptedDocumentException.class);
         }

         //now test documents that require this "user" password;
         //AccessChecker(true) is still in effect here
         assertException("/test-documents/" + userPasswordDocs[0], AUTO_DETECT_PARSER,
                 parseContext, AccessPermissionException.class);

         assertContains("Hello World", getXML(userPasswordDocs[1], parseContext).xml);

         //back to AccessChecker(false): both documents must be refused
         parserConfig.setAccessChecker(new AccessChecker(false));
         for (String doc : userPasswordDocs) {
             assertException("/test-documents/" + doc, AUTO_DETECT_PARSER, parseContext,
                     AccessPermissionException.class);
         }
     }

     /**
      * With the "owner" password supplied, text must be extracted from all four
      * restricted documents under both AccessChecker settings.
      */
     @Test
     public void testAccessCheckingOwnerPassword() throws Exception {
         String[] restrictedDocs = {
                 "testPDF_no_extract_no_accessibility_owner_user.pdf",
                 "testPDF_no_extract_yes_accessibility_owner_user.pdf",
                 "testPDF_no_extract_no_accessibility_owner_empty.pdf",
                 "testPDF_no_extract_yes_accessibility_owner_empty.pdf"};

         PDFParserConfig parserConfig = new PDFParserConfig();

         ParseContext parseContext = new ParseContext();
         parseContext.set(PasswordProvider.class, new PasswordProvider() {
             @Override
             public String getPassword(Metadata metadata) {
                 return "owner";
             }
         });
         parseContext.set(PDFParserConfig.class, parserConfig);

         //with owner's password, text can be extracted, no matter the
         //AccessChecker's settings: first AccessChecker(true), then AccessChecker(false)
         for (boolean checkerSetting : new boolean[]{true, false}) {
             parserConfig.setAccessChecker(new AccessChecker(checkerSetting));
             for (String doc : restrictedDocs) {
                 String xml = getXML(doc, parseContext).xml;
                 assertContains("Hello World", xml);
             }
         }
     }

     /**
      * testPDF.pdf must be reported with HAS_XMP set to "false".
      */
     @Test
     public void testNoXMP() throws Exception {
         Metadata metadata = getXML("testPDF.pdf").metadata;
         String hasXmp = metadata.get(PDF.HAS_XMP);
         assertEquals("false", hasXmp);
     }

     /**
      * TIKA-1678: the title of testPDF_PDFEncodedStringInXMP.pdf must come out as "Microsoft".
      */
     @Test
     public void testPDFEncodedStringsInXMP() throws Exception {
         //TIKA-1678
         Metadata metadata = getXML("testPDF_PDFEncodedStringInXMP.pdf").metadata;
         assertEquals("Microsoft", metadata.get(TikaCoreProperties.TITLE));
     }

     /**
      * Default parsing of an XFA document must flag XFA, AcroForm fields and XMP in
      * the metadata and emit both the regular page text and the XFA field data.
      */
     @Test
     public void testXFAExtractionBasic() throws Exception {
         XMLResult parsed = getXML("testPDF_XFA_govdocs1_258578.pdf");
         Metadata metadata = parsed.metadata;
         String xhtml = parsed.xml;

         assertEquals("true", metadata.get(PDF.HAS_XFA));
         assertEquals("true", metadata.get(PDF.HAS_ACROFORM_FIELDS));
         assertEquals("true", metadata.get(PDF.HAS_XMP));

         //contains content existing only in the "regular" pdf
         assertContains("Mount Rushmore National Memorial", xhtml);
         //contains xfa fields and data
         assertContains("<li fieldName=\"School_Name\">School Name: my_school</li>", xhtml);
     }

     /**
      * With setIfXFAExtractOnlyXFA(true) the output must contain the XFA content
      * but not the text of the regular PDF pages.
      */
     @Test
     public void testXFAOnly() throws Exception {
         PDFParserConfig xfaOnlyConfig = new PDFParserConfig();
         xfaOnlyConfig.setIfXFAExtractOnlyXFA(true);

         ParseContext parseContext = new ParseContext();
         parseContext.set(PDFParserConfig.class, xfaOnlyConfig);

         String xhtml = getXML("testPDF_XFA_govdocs1_258578.pdf", parseContext).xml;

         //XFA content is present
         assertContains("<body><div class=\"xfa_content\">", xhtml);
         assertContains("<li fieldName=\"Room_1\">Room [1]: my_room1</li>", xhtml);

         //text from the regular pages is absent
         assertNotContained("Mount Rushmore National Memorial", xhtml);
     }

     /**
      * Checks the XMP Media Management document id of two files and, for the second
      * one, the parallel history arrays (instance id, action, software agent, when).
      */
     @Test
     public void testXMPMM() throws Exception {
         Metadata twoAuthors = getXML("testPDF_twoAuthors.pdf").metadata;
         assertEquals("uuid:0e46913c-72b9-40c0-8232-69e362abcd1e",
                 twoAuthors.get(XMPMM.DOCUMENTID));

         Metadata pdfa = getXML("testPDF_Version.11.x.PDFA-1b.pdf").metadata;
         assertEquals("uuid:cccee1fc-51b3-4b52-ac86-672af3974d25", pdfa.get(XMPMM.DOCUMENTID));

         //now test for 7 elements in each parallel array
         //from the history section
         String[] expectedInstanceIds = {
                 "uuid:0313504b-a0b0-4dac-a9f0-357221f2eadf",
                 "uuid:edc4279e-0d5f-465e-b13e-1298402fd11c",
                 "uuid:f565b775-43f3-4a9a-8541-e98c4115db6d",
                 "uuid:9fd5e0a8-14a5-4920-ad7f-870c0b8ee65f",
                 "uuid:09b6cfba-efde-4e07-a77f-70de858cc0aa",
                 "uuid:1e4ffbd7-dabc-4aae-801c-15b3404ade36",
                 "uuid:c1669773-a6ca-4bdd-aade-519030d0af00"};
         assertArrayEquals(expectedInstanceIds, pdfa.getValues(XMPMM.HISTORY_EVENT_INSTANCEID));

         String[] expectedActions = {
                 "converted", "converted", "converted", "converted",
                 "converted", "converted", "converted"};
         assertArrayEquals(expectedActions, pdfa.getValues(XMPMM.HISTORY_ACTION));

         String[] expectedAgents = {
                 "Preflight", "Preflight", "Preflight", "Preflight",
                 "Preflight", "Preflight", "Preflight"};
         assertArrayEquals(expectedAgents, pdfa.getValues(XMPMM.HISTORY_SOFTWARE_AGENT));

         String[] expectedTimestamps = {
                 "2014-03-04T23:50:41Z", "2014-03-04T23:50:42Z", "2014-03-04T23:51:34Z",
                 "2014-03-04T23:51:36Z", "2014-03-04T23:51:37Z", "2014-03-04T23:52:22Z",
                 "2014-03-04T23:54:48Z"};
         assertArrayEquals(expectedTimestamps, pdfa.getValues(XMPMM.HISTORY_WHEN));
     }

     /**
      * Parses a document with a bad page twice: once with the default settings, where
      * a warning is recorded and text is still collected, and once with
      * setCatchIntermediateIOExceptions(false), where neither is expected.
      * A TikaException is expected in both runs.
      */
     @Test
     public void testSkipBadPage() throws Exception {
         //test file comes from govdocs1
         //can't use TikaTest shortcuts because of exception
         final String badPageDoc = "/test-documents/testPDF_bad_page_303226.pdf";
         ParseContext parseContext = new ParseContext();

         ContentHandler firstHandler = new BodyContentHandler(-1);
         Metadata firstMetadata = new Metadata();
         boolean firstRunThrew = false;
         try (InputStream stream = getResourceAsStream(badPageDoc)) {
             AUTO_DETECT_PARSER.parse(stream, firstHandler, firstMetadata, parseContext);
         } catch (TikaException expected) {
             firstRunThrew = true;
         }
         assertTrue("Should have thrown exception", firstRunThrew);
         String[] warnings = firstMetadata.getValues(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING);
         assertEquals(1, warnings.length);
         assertContains("Unknown dir",
                 firstMetadata.get(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING));
         assertContains("1309.61", firstHandler.toString());

         //now try throwing exception immediately
         PDFParserConfig failFastConfig = new PDFParserConfig();
         failFastConfig.setCatchIntermediateIOExceptions(false);
         parseContext.set(PDFParserConfig.class, failFastConfig);

         ContentHandler secondHandler = new BodyContentHandler(-1);
         Metadata secondMetadata = new Metadata();
         boolean secondRunThrew = false;
         try (InputStream stream = getResourceAsStream(badPageDoc)) {
             AUTO_DETECT_PARSER.parse(stream, secondHandler, secondMetadata, parseContext);
         } catch (TikaException expected) {
             secondRunThrew = true;
         }
         assertTrue("Should have thrown exception", secondRunThrew);
         assertEquals(0,
                 secondMetadata.getValues(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING).length);
         assertNotContained("1309.61", secondHandler.toString());
     }

     /**
      * Builds a parser from tika-config.xml and checks that its settings apply, that a
      * PDFParserConfig in the ParseContext can override sortByPosition, and that a
      * fresh PDFParserConfig touching other settings leaves the configured value in place.
      */
     @Test
     public void testInitializationViaConfig() throws Exception {
         //expected text when the two columns are interleaved
         final String interleavedColumns = "Left column line 1 Right column line 1 " +
                 "Left colu mn line 2 Right column line 2";

         try (InputStream configStream = getResourceAsStream(
                 "/org/apache/tika/parser/pdf/tika-config.xml")) {
             assertNotNull(configStream);
             Parser configuredParser = new AutoDetectParser(new TikaConfig(configStream));

             String extracted = getText(
                     getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf"),
                     configuredParser);
             // Column text is now interleaved:
             assertContains(interleavedColumns, extracted.replaceAll("\\s+", " "));

             //test overriding underlying settings with PDFParserConfig
             ParseContext parseContext = new ParseContext();
             PDFParserConfig override = new PDFParserConfig();
             override.setSortByPosition(false);
             parseContext.set(PDFParserConfig.class, override);
             extracted = getText("testPDFTwoTextBoxes.pdf", configuredParser, new Metadata(),
                     parseContext);
             // Column text is not interleaved:
             assertContains("Left column line 1 Left column line 2 ",
                     extracted.replaceAll("\\s+", " "));

             //test a new PDFParserConfig and setting another value
             //this tests that the underlying "sortByPosition" as set
             //in the config file is still operative
             PDFParserConfig unrelatedSettings = new PDFParserConfig();
             unrelatedSettings.setOcrDPI(10000);
             unrelatedSettings.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR);
             parseContext.set(PDFParserConfig.class, unrelatedSettings);
             extracted = getText("testPDFTwoTextBoxes.pdf", configuredParser, new Metadata(),
                     parseContext);
             // Column text is now interleaved:
             assertContains(interleavedColumns, extracted.replaceAll("\\s+", " "));
         }
     }

     /**
      * Checks that enum-valued settings (OCR strategy, OCR image type) given in
      * tika-config-non-primitives.xml reach the PDFParser's configuration.
      */
     @Test
     public void testInitializationOfNonPrimitivesViaConfig() throws Exception {
         try (InputStream configStream = getResourceAsStream(
                 "/org/apache/tika/parser/pdf/tika-config-non-primitives.xml")) {
             assertNotNull(configStream);
             AutoDetectParser autoDetect = new AutoDetectParser(new TikaConfig(configStream));

             //look up the parser for application/pdf, then the PDF parser inside that composite
             MediaType pdfType = MediaType.application("pdf");
             Map<MediaType, Parser> topLevelParsers = autoDetect.getParsers();
             CompositeParser composite = (CompositeParser) topLevelParsers.get(pdfType);
             Parser pdfParser = composite.getParsers().get(pdfType);
             assertEquals("org.apache.tika.parser.pdf.PDFParser",
                     pdfParser.getClass().getName());

             PDFParserConfig loadedConfig = ((PDFParser) pdfParser).getPDFParserConfig();
             assertEquals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY, loadedConfig.getOcrStrategy());
             assertEquals(ImageType.RGB, loadedConfig.getOcrImageType());
         }
     }

     /**
      * For a file whose XMP and doc info titles differ, checks the values reported
      * under PDF.DOC_INFO_TITLE and TikaCoreProperties.TITLE.
      */
     @Test
     public void testDiffTitles() throws Exception {
         //different titles in xmp vs docinfo
         Metadata metadata = getXML("testPDF_diffTitles.pdf").metadata;

         String docInfoTitle = metadata.get(PDF.DOC_INFO_TITLE);
         assertEquals("this is a new title", docInfoTitle);

         String title = metadata.get(TikaCoreProperties.TITLE);
         assertEquals("Sample Title", title);
     }

     /**
      * Checks that {@code Tika.parseToString} with a maximum length of 100 returns
      * exactly 100 characters and that the expected leading text is among them.
      */
     @Test
     public void testMaxLength() throws Exception {
         String content;
         //try-with-resources releases the stream even if parsing throws;
         //closing an already-closed stream is harmless
         try (InputStream is = getResourceAsStream("/test-documents/testPDF.pdf")) {
             content = new Tika().parseToString(is, new Metadata(), 100);
         }
         //assertEquals reports the actual length on failure, unlike assertTrue(a == b)
         assertEquals(100, content.length());
         assertContains("Tika - Content", content);
     }

     /**
      * Loads tika-inline-config.xml, confirms the resulting parser yields two metadata
      * objects for testOCR.pdf, and checks each configured PDFParserConfig value.
      */
     @Test
     public void testConfiguringMoreParams() throws Exception {
         try (InputStream configStream = getResourceAsStream(
                 "/org/apache/tika/parser/pdf/tika-inline-config.xml")) {
             assertNotNull(configStream);
             AutoDetectParser autoDetect = new AutoDetectParser(new TikaConfig(configStream));

             //make absolutely certain the functionality works!
             List<Metadata> parsed = getRecursiveMetadata("testOCR.pdf", autoDetect);
             assertEquals(2, parsed.size());

             //dig the PDFParser out of the composite registered for application/pdf
             MediaType pdfType = MediaType.application("pdf");
             Map<MediaType, Parser> topLevelParsers = autoDetect.getParsers();
             CompositeParser composite = (CompositeParser) topLevelParsers.get(pdfType);
             Parser pdfParser = composite.getParsers().get(pdfType);
             assertTrue(pdfParser instanceof PDFParser);

             PDFParserConfig loaded = ((PDFParser) pdfParser).getPDFParserConfig();
             assertEquals(new AccessChecker(true), loaded.getAccessChecker());
             assertEquals(true, loaded.isExtractInlineImages());
             assertEquals(false, loaded.isExtractUniqueInlineImagesOnly());
             assertEquals(314, loaded.getOcrDPI());
             assertEquals(2.1f, loaded.getOcrImageQuality(), .01f);
             assertEquals("jpeg", loaded.getOcrImageFormatName());
             assertEquals(524288000, loaded.getMaxMainMemoryBytes());
             assertEquals(false, loaded.isCatchIntermediateIOExceptions());
         }
     }

     //TODO: figure out how to test jp2 embedded with OCR

     /**
      * Asserts that extracting text from the given resource throws an exception whose
      * class is exactly {@code expected}. The assertion fails if no exception is thrown.
      *
      * @param path     resource path handed to {@code getResourceAsStream}
      * @param parser   parser to run
      * @param context  parse context passed to the parser
      * @param expected exact class the thrown exception must have (subclasses do not match)
      */
     private void assertException(String path, Parser parser, ParseContext context,
                                  Class<?> expected) {
         boolean noEx = false;
         InputStream is = getResourceAsStream(path);
         try {
             //only the thrown exception matters; the extracted text is discarded
             getText(is, parser, context);
             noEx = true;
         } catch (Exception e) {
             assertEquals("Not the right exception: " + path, expected, e.getClass());
         } finally {
             //close quietly so a failure on close cannot be mistaken for the parse exception
             IOUtils.closeQuietly(is);
         }
         assertFalse(path + " should have thrown exception", noEx);
     }

     /**
      * Checks the LANGUAGE metadata value reported for two test documents.
      */
     @Test
     public void testLanguageMetadata() throws Exception {
         Metadata customMetadataDoc = getXML("testPDF-custommetadata.pdf").metadata;
         assertEquals("de-CH", customMetadataDoc.get(TikaCoreProperties.LANGUAGE));

         Metadata annotationDoc = getXML("testPDFFileEmbInAnnotation.pdf").metadata;
         assertEquals("zh-CN", annotationDoc.get(TikaCoreProperties.LANGUAGE));
     }

     /**
      * With angle detection enabled, testPDF_angles.pdf must still produce a single
      * page div and contain the expected phrases.
      */
     @Test
     public void testAngles() throws Exception {
         PDFParserConfig angleConfig = new PDFParserConfig();
         angleConfig.setDetectAngles(true);

         ParseContext context = new ParseContext();
         context.set(PDFParserConfig.class, angleConfig);

         String xhtml = getXML("testPDF_angles.pdf", context).xml;

         //make sure there is only one page!
         assertContainsCount("<div class=\"page\">", xhtml, 1);

         String[] expectedPhrases = {
                 "IN-DEMAND", "natural underground", "transport mined materials"};
         for (String phrase : expectedPhrases) {
             assertContains(phrase, xhtml);
         }
     }


     /**
      * Checks the per-page character and unmapped-unicode counts at index 15 of the
      * test document, first with default settings and then with angle detection enabled.
      */
     @Test
     public void testUnmappedUnicodeStats() throws Exception {
         final String badPageDoc = "testPDF_bad_page_303226.pdf";
         final int pageIndex = 15;

         Metadata defaults = getRecursiveMetadata(badPageDoc, true).get(0);
         int[] charsPerPage = defaults.getIntValues(PDF.CHARACTERS_PER_PAGE);
         int[] unmappedPerPage = defaults.getIntValues(PDF.UNMAPPED_UNICODE_CHARS_PER_PAGE);
         //weird issue with pdfbox 2.0.20
         //this test passes in my IDE, but does not pass with mvn clean install from commandline
         //so the counts are only checked when the page reports any characters at all
         if (charsPerPage[pageIndex] > 0) {
             assertEquals(3805, charsPerPage[pageIndex]);
             assertEquals(120, unmappedPerPage[pageIndex]);
         }

         //confirm all works with angles
         PDFParserConfig angleConfig = new PDFParserConfig();
         angleConfig.setDetectAngles(true);
         ParseContext angleContext = new ParseContext();
         angleContext.set(PDFParserConfig.class, angleConfig);

         Metadata withAngles = getRecursiveMetadata(badPageDoc, angleContext, true).get(0);
         charsPerPage = withAngles.getIntValues(PDF.CHARACTERS_PER_PAGE);
         unmappedPerPage = withAngles.getIntValues(PDF.UNMAPPED_UNICODE_CHARS_PER_PAGE);
         if (charsPerPage[pageIndex] > 0) {
             assertEquals(3805, charsPerPage[pageIndex]);
             assertEquals(120, unmappedPerPage[pageIndex]);
         }
     }

     /**
      * TIKA-3091: toString, hashCode and equals on a freshly constructed
      * PDFParserConfig must complete without throwing.
      */
     @Test
     public void testNPEInPDFParserConfig() {
         //TIKA-3091
         PDFParserConfig freshConfig = new PDFParserConfig();
         PDFParserConfig otherConfig = new PDFParserConfig();

         //results are deliberately ignored; the test only guards against an NPE
         freshConfig.toString();
         freshConfig.hashCode();
         freshConfig.equals(otherConfig);
     }

     /**
      * TIKA-3041 (currently ignored): with non-unique inline image extraction enabled,
      * expects 37 metadata objects and a highest "imageN" index of 35.
      */
     @Test //TIKA-3041
     @Ignore("turn back on if we add file from PDFBOX-52")
     public void testPDFBox52() throws Exception {
         PDFParserConfig inlineConfig = new PDFParserConfig();
         inlineConfig.setExtractInlineImages(true);
         inlineConfig.setExtractUniqueInlineImagesOnly(false);
         ParseContext parseContext = new ParseContext();
         parseContext.set(PDFParserConfig.class, inlineConfig);

         List<Metadata> parsed = getRecursiveMetadata("testPDF_PDFBOX-52.pdf", parseContext);

         //find the largest numeric suffix among resource names containing "image<digits>"
         Pattern imageNumber = Pattern.compile("image(\\d+)");
         int highestIndex = 0;
         for (Metadata entry : parsed) {
             String resourceName = entry.get(TikaCoreProperties.RESOURCE_NAME_KEY);
             if (resourceName == null) {
                 continue;
             }
             Matcher found = imageNumber.matcher(resourceName);
             if (found.find()) {
                 highestIndex = Math.max(highestIndex, Integer.parseInt(found.group(1)));
             }
         }
         assertEquals(37, parsed.size());
         assertEquals(35, highestIndex);
     }

     /**
      * TIKA-3101: checks the creator tool and create date reported for
      * testPDF_XMPBasicSchema.pdf.
      */
     @Test
     public void testXMPBasicSchema() throws Exception {
         //TIKA-3101
         Metadata container = getRecursiveMetadata("testPDF_XMPBasicSchema.pdf").get(0);

         //these two fields derive from the basic schema in the XMP, not dublin core
         String creatorTool = container.get(XMP.CREATOR_TOOL);
         String createDate = container.get(XMP.CREATE_DATE);
         assertEquals("Hewlett-Packard MFP", creatorTool);
         assertEquals("1998-08-29T13:53:15Z", createDate);
     }

     /**
      * Checks the PDF.PRODUCER value reported for testPopupAnnotation.pdf.
      */
     @Test
     public void testXMPPDFSchema() throws Exception {
         //as of this writing, we don't currently have any pdfs in our
         //test suite with data that is different btwn pdf doc info and xmp. :(
         String producer = getXML("testPopupAnnotation.pdf").metadata.get(PDF.PRODUCER);
         assertEquals("IBM Lotus Symphony 3.0", producer);
     }

     /**
      * With setExtractInlineImageMetadataOnly(true), testOCR.pdf must yield a second
      * metadata object describing the inline image (type, path, dimensions, name).
      */
     @Test
     public void testExtractInlineImageMetadata() throws Exception {
         PDFParserConfig metadataOnlyConfig = new PDFParserConfig();
         metadataOnlyConfig.setExtractInlineImageMetadataOnly(true);
         ParseContext parseContext = new ParseContext();
         parseContext.set(PDFParserConfig.class, metadataOnlyConfig);

         List<Metadata> parsed = getRecursiveMetadata("testOCR.pdf", parseContext);

         //the context must not be left holding an IgnoreZeroByteFileException entry
         assertNull(parseContext.get(ZeroByteFileException.IgnoreZeroByteFileException.class));
         assertEquals(2, parsed.size());

         //fetched only after the size check so a short list fails on the assertion above
         Metadata image = parsed.get(1);
         assertEquals("image/png", image.get(Metadata.CONTENT_TYPE));
         assertEquals("/image0.png", image.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
         assertEquals(261, (int) image.getInt(Metadata.IMAGE_LENGTH));
         assertEquals(934, (int) image.getInt(Metadata.IMAGE_WIDTH));
         assertEquals("image0.png", image.get(TikaCoreProperties.RESOURCE_NAME_KEY));
     }

     /**
      * Content handler decorator that tallies how many times {@code endDocument()}
      * has been called on it; the override only counts and does not call
      * {@code super.endDocument()}.
      * If functionality is useful, move to org.apache.tika in src/test
      */
     private static class EventCountingHandler extends ContentHandlerDecorator {
         //number of endDocument() calls seen so far; int fields start at 0
         private int endDocument;

         public int getEndDocument() {
             return endDocument;
         }

         @Override
         public void endDocument() {
             endDocument += 1;
         }
     }

     /**
      * Document selector that rejects embedded resources whose type is INLINE and
      * accepts everything else, including resources with no type set.
      */
     private static class AvoidInlineSelector implements DocumentSelector {

         @Override
         public boolean select(Metadata metadata) {
             String inlineType = TikaCoreProperties.EmbeddedResourceType.INLINE.toString();
             String resourceType = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
             //equals(null) is false, so a missing resource type is selected
             return !inlineType.equals(resourceType);
         }
     }

     /**
      * Recursive parsing of testPDF_deeplyEmbeddedAttachments.pdf must yield 21
      * metadata objects.
      */
     @Test
     public void testDeeplyEmbeddedAttachments() throws Exception {
         //test file comes from a pdfcpu issue: https://github.com/pdfcpu/pdfcpu/issues/201
         //(an earlier note said "#120"; the URL and corpus name both say 201 -- TODO confirm)
         //in our regression corpus: pdfcpu-201-0.zip-0.pdf
         final String deeplyNested = "testPDF_deeplyEmbeddedAttachments.pdf";
         int metadataCount = getRecursiveMetadata(deeplyNested).size();
         assertEquals(21, metadataCount);
     }

     /**
      * testFlashInPDF.pdf must yield one embedded document with the expected
      * content type, resource name and content length.
      */
     @Test
     public void testEmbeddedRichMedia() throws Exception {
         List<Metadata> parsed = getRecursiveMetadata("testFlashInPDF.pdf");
         assertEquals(2, parsed.size());

         //fetched after the size check so a short list fails on the assertion above
         Metadata embedded = parsed.get(1);
         assertEquals("application/x-shockwave-flash", embedded.get(Metadata.CONTENT_TYPE));
         assertEquals("TestMovie02.swf", embedded.get(TikaCoreProperties.RESOURCE_NAME_KEY));
         assertEquals("15036", embedded.get(Metadata.CONTENT_LENGTH));
     }

     /*
     NOTE: the write-limit test and its helper below are commented out (disabled).
     @Test
     public void testWriteLimit() throws Exception {
         for (int i = 0; i < 10000; i += 13) {
             Metadata metadata = testWriteLimit("testPDF_childAttachments.pdf", i);
             assertEquals("true", metadata.get(TikaCoreProperties.WRITE_LIMIT_REACHED));
             int len = metadata.get(TikaCoreProperties.TIKA_CONTENT).length();
             System.out.println(len + " : " + i);
             assertTrue(len <= i);
         }
     }

     private Metadata testWriteLimit(String fileName, int limit) throws Exception {
         BasicContentHandlerFactory factory = new BasicContentHandlerFactory(
                 BasicContentHandlerFactory.HANDLER_TYPE.TEXT, limit
         );
         ContentHandler contentHandler = factory.getNewContentHandler();
         Metadata metadata = new Metadata();
         ParseContext parseContext = new ParseContext();
         try (InputStream is = getResourceAsStream("/test-documents/" + fileName)) {
             AUTO_DETECT_PARSER.parse(is, contentHandler, metadata, parseContext);
         } catch (WriteLimitReachedException e) {
             //e.printStackTrace();
         }
         metadata.set(TikaCoreProperties.TIKA_CONTENT, contentHandler.toString());
         return metadata;
     }*/
 }