tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java - tika - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.tika.parser.pdf;

 import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.AUTO;
 import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.NO_OCR;
 import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION;
 import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.OCR_ONLY;

 import javax.xml.stream.XMLStreamException;
 import java.awt.image.BufferedImage;
 import java.io.BufferedInputStream;
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
 import java.lang.reflect.InvocationTargetException;
 import java.lang.reflect.Method;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.text.SimpleDateFormat;
 import java.util.ArrayList;
 import java.util.Calendar;
 import java.util.Collections;
 import java.util.HashSet;
 import java.util.List;
 import java.util.ListIterator;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
 import java.util.TreeMap;

 import org.apache.commons.io.IOUtils;
 import org.apache.pdfbox.cos.COSName;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
 import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
 import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
 import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.PDPageTree;
 import org.apache.pdfbox.pdmodel.common.PDDestinationOrAction;
 import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
 import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
 import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
 import org.apache.pdfbox.pdmodel.common.filespecification.PDFileSpecification;
 import org.apache.pdfbox.pdmodel.common.filespecification.PDSimpleFileSpecification;
 import org.apache.pdfbox.pdmodel.font.PDFont;
 import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
 import org.apache.pdfbox.pdmodel.interactive.action.PDActionImportData;
 import org.apache.pdfbox.pdmodel.interactive.action.PDActionJavaScript;
 import org.apache.pdfbox.pdmodel.interactive.action.PDActionLaunch;
 import org.apache.pdfbox.pdmodel.interactive.action.PDActionRemoteGoTo;
 import org.apache.pdfbox.pdmodel.interactive.action.PDActionURI;
 import org.apache.pdfbox.pdmodel.interactive.action.PDAnnotationAdditionalActions;
 import org.apache.pdfbox.pdmodel.interactive.action.PDDocumentCatalogAdditionalActions;
 import org.apache.pdfbox.pdmodel.interactive.action.PDFormFieldAdditionalActions;
 import org.apache.pdfbox.pdmodel.interactive.action.PDPageAdditionalActions;
 import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
 import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment;
 import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup;
 import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationWidget;
 import org.apache.pdfbox.pdmodel.interactive.digitalsignature.PDSignature;
 import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
 import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
 import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineNode;
 import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
 import org.apache.pdfbox.pdmodel.interactive.form.PDField;
 import org.apache.pdfbox.pdmodel.interactive.form.PDNonTerminalField;
 import org.apache.pdfbox.pdmodel.interactive.form.PDSignatureField;
 import org.apache.pdfbox.pdmodel.interactive.form.PDXFAResource;
 import org.apache.pdfbox.rendering.PDFRenderer;
 import org.apache.pdfbox.text.PDFTextStripper;
 import org.apache.pdfbox.tools.imageio.ImageIOUtil;
 import org.apache.pdfbox.util.Matrix;
 import org.apache.pdfbox.util.Vector;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
 import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Font;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.PDF;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;

 class AbstractPDF2XHTML extends PDFTextStripper {

     /**
      * Names the event or location a PDF action/destination was found under.
      * <p>
      * The constant's name is written as the "trigger" attribute of the div emitted for
      * an action or destination, and is stored under {@code PDF.ACTION_TRIGGER} for
      * JavaScript actions. Declaration order is kept as-is so ordinals do not change.
      */
     enum ActionTrigger {
         // document-level additional actions (after print/save)
         AFTER_DOCUMENT_PRINT,
         AFTER_DOCUMENT_SAVE,
         // triggers reported for widget annotations
         ANNOTATION_CURSOR_ENTERS,
         ANNOTATION_CURSOR_EXIT,
         ANNOTATION_LOSE_INPUT_FOCUS,
         ANNOTATION_MOUSE_CLICK,
         ANNOTATION_MOUSE_RELEASED,
         ANNOTATION_PAGE_CLOSED,
         ANNOTATION_PAGE_NO_LONGER_VISIBLE,
         ANNOTATION_PAGE_OPENED,
         ANNOTATION_PAGE_VISIBLE,
         ANNOTATION_RECEIVES_FOCUS,
         ANNOTATION_WIDGET,
         // document-level additional actions (before close/print/save) and open action
         BEFORE_DOCUMENT_CLOSE,
         BEFORE_DOCUMENT_PRINT,
         BEFORE_DOCUMENT_SAVE,
         DOCUMENT_OPEN,
         // form-field triggers
         FORM_FIELD,
         FORM_FIELD_FORMATTED,
         FORM_FIELD_KEYSTROKE,
         FORM_FIELD_RECALCULATE,
         FORM_FIELD_VALUE_CHANGE,
         // page-level additional actions
         PAGE_CLOSE,
         PAGE_OPEN,
         BOOKMARK
     }

     /**
      * Maximum recursive depth during AcroForm processing.
      * Prevents theoretical AcroForm recursion bomb.
      */
     private final static int MAX_ACROFORM_RECURSIONS = 10;

     // Media types used both to tag embedded XFA/XMP metadata streams and to check
     // whether the parser found in the ParseContext supports them.
     private static final MediaType XFA_MEDIA_TYPE = MediaType.application("vnd.adobe.xdp+xml");
     private static final MediaType XMP_MEDIA_TYPE = MediaType.application("rdf+xml");

     // Values stored under PDF.XMP_LOCATION for extracted XMP packets; the page prefix
     // is followed by a 1-based page number.
     public static final String XMP_DOCUMENT_CATALOG_LOCATION = "documentCatalog";
     public static final String XMP_PAGE_LOCATION_PREFIX = "page ";

     /**
      * Format used for signature dates.
      * SimpleDateFormat is not thread-safe; this is a per-instance formatter.
      * TODO Make this thread-safe
      */
     private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.ROOT);


     // IOExceptions that were caught and recorded (rather than rethrown) by handleCatchableIOE
     final List<IOException> exceptions = new ArrayList<>();
     final PDDocument pdDocument;
     // all XHTML output of this stripper goes through this handler
     final XHTMLContentHandler xhtml;
     final ParseContext context;
     // metadata of the containing PDF; warnings, per-page counts and font names are added here
     final Metadata metadata;
     final EmbeddedDocumentExtractor embeddedDocumentExtractor;
     final PDFParserConfig config;
     // null when the configured OCR strategy is NO_OCR
     final Parser ocrParser;

     //zero-based pageIndex
     int pageIndex = 0;
     int startPage = -1;//private in PDFTextStripper...must have own copy because we override processpages
     // per-page counters; reported and reset to 0 in endPage
     int unmappedUnicodeCharsPerPage = 0;
     int totalCharsPerPage = 0;

     // distinct font names collected in endPage; written to the metadata in endDocument
     private final Set<String> fontNames = new HashSet<>();

     /**
      * Creates a stripper that writes XHTML for {@code pdDocument} to {@code handler}.
      *
      * @param pdDocument the PDF being processed
      * @param handler    receiver of the generated SAX events (wrapped in an XHTMLContentHandler)
      * @param context    parse context; used to look up the embedded document extractor and,
      *                   unless OCR is disabled, the parser used for OCR
      * @param metadata   metadata of the PDF being processed
      * @param config     PDF parser configuration
      * @throws IOException declared for the superclass constructor
      */
     AbstractPDF2XHTML(PDDocument pdDocument, ContentHandler handler, ParseContext context, Metadata metadata,
                       PDFParserConfig config) throws IOException {
         this.pdDocument = pdDocument;
         this.context = context;
         this.metadata = metadata;
         this.config = config;
         this.xhtml = new XHTMLContentHandler(handler, metadata);
         this.embeddedDocumentExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
         // Only resolve an OCR parser when the strategy allows OCR at all.
         this.ocrParser = (config.getOcrStrategy() == NO_OCR)
                 ? null
                 : EmbeddedDocumentUtil.getStatelessParser(context);
     }

     /**
      * Opens the {@code <div class="page">} element for a page and then starts the
      * first paragraph.
      *
      * @param page the page about to be processed
      * @throws IOException if the div cannot be written (wraps the SAXException)
      */
     @Override
     protected void startPage(PDPage page) throws IOException {
         try {
             xhtml.startElement("div", "class", "page");
         } catch (SAXException saxFailure) {
             throw new IOException("Unable to start a page", saxFailure);
         }
         writeParagraphStart();
     }

     /**
      * Passes the document's XMP packets (document catalog first, then each page that
      * has one) and the AcroForm's XFA stream, if present, to the embedded document
      * extractor as METADATA-type embedded resources.
      * <p>
      * Nothing is extracted when the ParseContext holds no {@link Parser} or that parser
      * reports no supported types. XMP is only extracted if the parser supports
      * {@code XMP_MEDIA_TYPE}; XFA only if it supports {@code XFA_MEDIA_TYPE}.
      * IOExceptions raised while opening a metadata stream are recorded on
      * {@code parentMetadata} instead of being thrown.
      *
      * @param pdfDocument    document to read XMP/XFA from
      * @param parentMetadata metadata that receives stream-level exception records
      * @param context        parse context used to look up the embedded parser
      * @throws IOException  if parsing an extracted stream fails and the failure is not catchable
      * @throws SAXException if writing the embedded content fails
      */
     private void extractXMPXFA(PDDocument pdfDocument, Metadata parentMetadata, ParseContext context) throws IOException, SAXException {
         // Collections.emptySet() is the type-safe replacement for the raw EMPTY_SET constant.
         Set<MediaType> supportedTypes = Collections.emptySet();
         Parser embeddedParser = context.get(Parser.class);
         if (embeddedParser != null) {
             supportedTypes = embeddedParser.getSupportedTypes(context);
         }

         if (supportedTypes == null || supportedTypes.isEmpty()) {
             return;
         }

         if (supportedTypes.contains(XMP_MEDIA_TYPE)) {
             //try the main metadata
             if (pdfDocument.getDocumentCatalog().getMetadata() != null) {
                 try (InputStream is = pdfDocument.getDocumentCatalog().getMetadata().exportXMPMetadata()) {
                     extractXMPAsEmbeddedFile(is, XMP_DOCUMENT_CATALOG_LOCATION);
                 } catch (IOException e) {
                     EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
                 }
             }
             //now iterate through the pages; locations use 1-based page numbers
             int pageNumber = 1;
             for (PDPage page : pdfDocument.getPages()) {
                 if (page.getMetadata() != null) {
                     try (InputStream is = page.getMetadata().exportXMPMetadata()) {
                         extractXMPAsEmbeddedFile(is, XMP_PAGE_LOCATION_PREFIX + pageNumber);
                     } catch (IOException e) {
                         EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
                     }
                 }
                 pageNumber++;
             }
         }

         //now try the xfa; look the form and its XFA resource up once instead of repeatedly
         PDAcroForm acroForm = pdfDocument.getDocumentCatalog().getAcroForm(null);
         PDXFAResource xfa = (acroForm == null) ? null : acroForm.getXFA();
         if (xfa == null) {
             return;
         }

         Metadata xfaMetadata = new Metadata();
         xfaMetadata.set(Metadata.CONTENT_TYPE, XFA_MEDIA_TYPE.toString());
         xfaMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.METADATA.toString());
         if (embeddedDocumentExtractor.shouldParseEmbedded(xfaMetadata) &&
                 supportedTypes.contains(XFA_MEDIA_TYPE)) {
             byte[] bytes = null;
             try {
                 bytes = xfa.getBytes();
             } catch (IOException e) {
                 EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
             }
             if (bytes != null) {
                 try (InputStream is = new ByteArrayInputStream(bytes)) {
                     parseMetadata(is, xfaMetadata);
                 }
             }
         }
     }

     /**
      * Parses one XMP stream as a METADATA-type embedded resource.
      * The stream is closed quietly after parsing; it is left untouched when it is
      * {@code null} or when the extractor declines to parse it.
      *
      * @param is       XMP stream, may be {@code null}
      * @param location value stored under {@code PDF.XMP_LOCATION}
      */
     private void extractXMPAsEmbeddedFile(InputStream is, String location) throws IOException, SAXException {
         if (is == null) {
             return;
         }
         Metadata xmpMetadata = new Metadata();
         xmpMetadata.set(Metadata.CONTENT_TYPE, XMP_MEDIA_TYPE.toString());
         xmpMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.METADATA.toString());
         xmpMetadata.set(PDF.XMP_LOCATION, location);

         if (!embeddedDocumentExtractor.shouldParseEmbedded(xmpMetadata)) {
             return;
         }
         try {
             parseMetadata(is, xmpMetadata);
         } finally {
             IOUtils.closeQuietly(is);
         }
     }

     /**
      * Sends a metadata stream to the embedded document extractor, writing into the
      * current XHTML output. IOExceptions are routed through
      * {@link #handleCatchableIOE(IOException)}.
      *
      * @param stream           stream to parse
      * @param embeddedMetadata metadata describing the stream
      */
     private void parseMetadata(InputStream stream, Metadata embeddedMetadata) throws IOException, SAXException {
         ContentHandler embeddedHandler = new EmbeddedContentHandler(xhtml);
         try {
             embeddedDocumentExtractor.parseEmbedded(stream, embeddedHandler, embeddedMetadata, false);
         } catch (IOException ioe) {
             handleCatchableIOE(ioe);
         }
     }

     /**
      * Processes the files listed in the document's embedded-files name tree.
      * <p>
      * Names on the root node are used when present; otherwise the names of each direct
      * kid are processed. Deeper levels are not searched.
      * This follows pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java; a fully recursive
      * search could be added if there is a need.
      *
      * @param document document whose catalog is inspected
      */
     private void extractEmbeddedDocuments(PDDocument document)
             throws IOException, SAXException, TikaException {
         PDEmbeddedFilesNameTreeNode efTree =
                 new PDDocumentNameDictionary(document.getDocumentCatalog()).getEmbeddedFiles();
         if (efTree == null) {
             return;
         }

         Map<String, PDComplexFileSpecification> rootNames = efTree.getNames();
         if (rootNames != null) {
             processEmbeddedDocNames(rootNames);
             return;
         }

         // No names on the root node: fall back to the names held by its direct kids.
         List<PDNameTreeNode<PDComplexFileSpecification>> kids = efTree.getKids();
         if (kids == null) {
             return;
         }
         for (PDNameTreeNode<PDComplexFileSpecification> kid : kids) {
             Map<String, PDComplexFileSpecification> kidNames = kid.getNames();
             if (kidNames != null) {
                 processEmbeddedDocNames(kidNames);
             }
         }
     }

     /**
      * Handles a file specification. A simple specification is reported as an empty
      * {@code <div class="linked">} whose id is the referenced file; a complex one has
      * its embedded files extracted. Any other value (including {@code null}) is ignored.
      *
      * @param name       display name passed on for embedded files
      * @param spec       file specification to handle
      * @param attributes attributes for the emitted div; this method adds to them
      */
     private void processDoc(String name, PDFileSpecification spec, AttributesImpl attributes) throws TikaException, SAXException, IOException {
         if (spec instanceof PDSimpleFileSpecification) {
             attributes.addAttribute("", "class", "class", "CDATA", "linked");
             attributes.addAttribute("", "id", "id", "CDATA", spec.getFile());
             xhtml.startElement("div", attributes);
             xhtml.endElement("div");
             return;
         }
         if (!(spec instanceof PDComplexFileSpecification)) {
             return;
         }
         // Default the source to "attachment" unless the caller already supplied one.
         boolean hasSource = attributes.getIndex("source") >= 0;
         if (!hasSource) {
             attributes.addAttribute("", "source", "source", "CDATA", "attachment");
         }
         extractMultiOSPDEmbeddedFiles(name, (PDComplexFileSpecification) spec, attributes);
     }

     /**
      * Processes every entry of a name-to-file-specification map, each with a fresh
      * attribute set. A {@code null} or empty map is ignored.
      *
      * @param embeddedFileNames map from name to file specification
      */
     private void processEmbeddedDocNames(Map<String, PDComplexFileSpecification> embeddedFileNames)
             throws IOException, SAXException, TikaException {
         if (embeddedFileNames != null && !embeddedFileNames.isEmpty()) {
             for (Map.Entry<String, PDComplexFileSpecification> entry : embeddedFileNames.entrySet()) {
                 String name = entry.getKey();
                 PDComplexFileSpecification spec = entry.getValue();
                 processDoc(name, spec, new AttributesImpl());
             }
         }
     }

     /**
      * Extracts every embedded file variant of a complex file specification
      * (generic, Mac, DOS, Unix), in that order. All non-null variants are extracted,
      * not just the first one found.
      *
      * @param displayName name to fall back on when the specification carries no file name
      * @param spec        specification to read from; {@code null} is ignored
      * @param attributes  attributes passed to each extraction
      */
     private void extractMultiOSPDEmbeddedFiles(String displayName,
                                        PDComplexFileSpecification spec, AttributesImpl attributes) throws IOException,
             SAXException, TikaException {

         if (spec == null) {
             return;
         }
         final String unicodeName = spec.getFileUnicode();

         extractPDEmbeddedFile(displayName, unicodeName,
                 spec.getFile(), spec.getEmbeddedFile(), attributes);
         extractPDEmbeddedFile(displayName, unicodeName,
                 spec.getFileMac(), spec.getEmbeddedFileMac(), attributes);
         extractPDEmbeddedFile(displayName, unicodeName,
                 spec.getFileDos(), spec.getEmbeddedFileDos(), attributes);
         extractPDEmbeddedFile(displayName, unicodeName,
                 spec.getFileUnix(), spec.getEmbeddedFileUnix(), attributes);
     }

     /**
      * Parses one embedded file and then emits an empty {@code <div class="embedded">}
      * marker whose id is the resolved file name.
      * <p>
      * The file name falls back from {@code fileName} to {@code unicodeFileName} to
      * {@code displayName} when blank. A failure to open the embedded stream is
      * recorded on the parent metadata and the file is skipped.
      * <p>
      * The caller's {@code attributes} are copied before "class" and "id" are added.
      * The same instance is passed in for each OS-specific variant of a file
      * specification, so adding to it directly would duplicate "class"/"id" on every
      * div after the first.
      *
      * @param displayName     last-resort name
      * @param unicodeFileName fallback name
      * @param fileName        preferred name
      * @param file            embedded file; {@code null} is skipped silently
      * @param attributes      base attributes for the emitted div (not modified)
      */
     private void extractPDEmbeddedFile(String displayName, String unicodeFileName,
                                        String fileName, PDEmbeddedFile file, AttributesImpl attributes)
             throws SAXException, IOException, TikaException {

         if (file == null) {
             //skip silently
             return;
         }

         fileName = (fileName == null || "".equals(fileName.trim())) ? unicodeFileName : fileName;
         fileName = (fileName == null || "".equals(fileName.trim())) ? displayName : fileName;

         // TODO: other metadata?
         Metadata embeddedMetadata = new Metadata();
         embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, fileName);
         embeddedMetadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
         embeddedMetadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
         embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                 TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
         embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileName);
         if (!embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
             return;
         }
         TikaInputStream stream = null;
         try {
             stream = TikaInputStream.get(file.createInputStream());
         } catch (IOException e) {
             //store this exception in the parent's metadata
             EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
             return;
         }
         try {
             embeddedDocumentExtractor.parseEmbedded(
                     stream,
                     new EmbeddedContentHandler(xhtml),
                     embeddedMetadata, false);

             // Work on a copy so repeated calls with the same attribute set do not accumulate.
             AttributesImpl divAttributes = new AttributesImpl(attributes);
             divAttributes.addAttribute("", "class", "class", "CDATA", "embedded");
             divAttributes.addAttribute("", "id", "id", "CDATA", fileName);
             xhtml.startElement("div", divAttributes);
             xhtml.endElement("div");
         } finally {
             IOUtils.closeQuietly(stream);
         }

     }

     /**
      * Either rethrows an IOException or records it and carries on, depending on
      * {@code config.isCatchIntermediateIOExceptions()}.
      * <p>
      * When catching is enabled the exception's message is added to the metadata as a
      * warning and the exception is kept in {@code exceptions}. An exception whose cause
      * is a SAXException with a message containing "Your document contained more than"
      * is always rethrown.
      *
      * @param e the exception to handle
      * @throws IOException {@code e} itself, when it must not be swallowed
      */
     void handleCatchableIOE(IOException e) throws IOException {
         if (!config.isCatchIntermediateIOExceptions()) {
             throw e;
         }

         //TODO -- is there a cleaner way of checking for:
         // WriteOutContentHandler.WriteLimitReachedException?
         Throwable cause = e.getCause();
         boolean writeLimitReached = cause instanceof SAXException
                 && cause.getMessage() != null
                 && cause.getMessage().contains("Your document contained more than");
         if (writeLimitReached) {
             throw e;
         }

         String warning = (e.getMessage() == null) ? "IOException, no message" : e.getMessage();
         metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, warning);
         exceptions.add(e);
     }

     /**
      * Renders the current page ({@code pageIndex}) to an image, writes it to a temporary
      * file and runs the OCR parser on it, appending the result to the XHTML output.
      * <p>
      * Does nothing for {@code NO_OCR}. If no OCR parser is available for the configured
      * image format, {@code OCR_ONLY} and {@code OCR_AND_TEXT_EXTRACTION} fail with a
      * TikaException while {@code AUTO} skips the page silently. A missing parser
      * ({@code ocrParser == null}, e.g. when this instance was built with NO_OCR) is
      * treated the same way instead of causing a NullPointerException.
      *
      * @param ocrStrategy strategy to apply for this page
      * @throws IOException   if rendering/parsing fails and the failure is not catchable,
      *                       or if the OCR content cannot be written
      * @throws TikaException if OCR is mandatory but no suitable parser is available
      */
     void doOCROnCurrentPage(PDFParserConfig.OCR_STRATEGY ocrStrategy) throws IOException, TikaException, SAXException {
         if (ocrStrategy.equals(NO_OCR)) {
             return;
         }
         MediaType ocrImageMediaType =
                 MediaType.image("ocr-" + config.getOcrImageFormatName());
         boolean ocrAvailable = ocrParser != null
                 && ocrParser.getSupportedTypes(context).contains(ocrImageMediaType);
         if (!ocrAvailable) {
             if (ocrStrategy == OCR_ONLY || ocrStrategy == OCR_AND_TEXT_EXTRACTION) {
                 throw new TikaException(
                         "I regret that I couldn't find an OCR parser to handle " + ocrImageMediaType + ". " +
                         "Please set the OCR_STRATEGY to NO_OCR or configure your " +
                         "OCR parser correctly"
                 );
             } else if (ocrStrategy == AUTO || ocrParser == null) {
                 //silently skip
                 return;
             }
         }

         PDFRenderer renderer = new PDFRenderer(pdDocument);

         try (TemporaryResources tmp = new TemporaryResources()) {

             int dpi = config.getOcrDPI();
             BufferedImage image = renderer.renderImageWithDPI(pageIndex, dpi, config.getOcrImageType());
             Path tmpFile = tmp.createTempFile();
             try (OutputStream os = Files.newOutputStream(tmpFile)) {
                 //TODO: get output format from TesseractConfig
                 ImageIOUtil.writeImage(image, config.getOcrImageFormatName(),
                         os, dpi, config.getOcrImageQuality());
             }
             try (InputStream is = TikaInputStream.get(tmpFile)) {
                 // tells the OCR parser which media type to treat the rendered image as
                 metadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE, ocrImageMediaType.toString());
                 ocrParser.parse(is,
                         new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
                         metadata, context);
             }
         } catch (IOException e) {
             handleCatchableIOE(e);
         } catch (SAXException e) {
             throw new IOException("error writing OCR content from PDF", e);
         }
     }

     /**
      * Finishes a page: records the per-page character counts, processes the page's
      * annotations (file attachments, widget actions and, if configured, annotation
      * text and links), runs OCR according to the configured strategy, reports the
      * page open/close actions and closes the page div. The per-page counters are
      * always reset afterwards. Finally, if configured, the names of the fonts in the
      * page's resources are collected.
      * <p>
      * A page without a resources dictionary is skipped during font collection rather
      * than triggering a NullPointerException.
      *
      * @param page the page that has just been processed
      * @throws IOException on SAX/Tika failures (wrapped) or on IOExceptions that
      *                     {@link #handleCatchableIOE(IOException)} rethrows
      */
     @Override
     protected void endPage(PDPage page) throws IOException {
         metadata.add(PDF.CHARACTERS_PER_PAGE, totalCharsPerPage);
         metadata.add(PDF.UNMAPPED_UNICODE_CHARS_PER_PAGE,
                 unmappedUnicodeCharsPerPage);


         try {
             for (PDAnnotation annotation : page.getAnnotations()) {

                 if (annotation instanceof PDAnnotationFileAttachment) {
                     PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
                     if (fann.getFile() instanceof PDComplexFileSpecification) {
                         PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile();
                         try {
                             AttributesImpl attributes = new AttributesImpl();
                             attributes.addAttribute("", "source", "source", "CDATA", "annotation");
                             extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec, attributes);
                         } catch (SAXException e) {
                             throw new IOException("file embedded in annotation sax exception", e);
                         } catch (TikaException e) {
                             throw new IOException("file embedded in annotation tika exception", e);
                         } catch (IOException e) {
                             handleCatchableIOE(e);
                         }
                     }
                 } else if (annotation instanceof PDAnnotationWidget) {
                     handleWidget((PDAnnotationWidget)annotation);
                 }
                 // TODO: remove once PDFBOX-1143 is fixed:
                 if (config.isExtractAnnotationText()) {
                     PDActionURI uri = getActionURI(annotation);
                     if (uri != null) {
                         String link = uri.getURI();
                         if (link != null && link.trim().length() > 0) {
                             xhtml.startElement("div", "class", "annotation");
                             xhtml.startElement("a", "href", link);
                             xhtml.characters(link);
                             xhtml.endElement("a");
                             xhtml.endElement("div");
                         }
                     }

                     if (annotation instanceof PDAnnotationMarkup) {
                         PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation;
                         String title = annotationMarkup.getTitlePopup();
                         String subject = annotationMarkup.getSubject();
                         String contents = annotationMarkup.getContents();
                         // TODO: maybe also annotationMarkup.getRichContents()?
                         if (title != null || subject != null || contents != null) {
                             xhtml.startElement("div", "class", "annotation");

                             if (title != null) {
                                 xhtml.startElement("div", "class", "annotationTitle");
                                 xhtml.characters(title);
                                 xhtml.endElement("div");
                             }

                             if (subject != null) {
                                 xhtml.startElement("div", "class", "annotationSubject");
                                 xhtml.characters(subject);
                                 xhtml.endElement("div");
                             }

                             if (contents != null) {
                                 xhtml.startElement("div", "class", "annotationContents");
                                 xhtml.characters(contents);
                                 xhtml.endElement("div");
                             }

                             xhtml.endElement("div");
                         }
                     }
                 }
             }
             if (config.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) {
                 doOCROnCurrentPage(OCR_AND_TEXT_EXTRACTION);
             } else if (config.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.AUTO)) {
                 //TODO add more sophistication
                 // heuristic: OCR pages with almost no text or with many unmapped characters
                 if (totalCharsPerPage < 10 || unmappedUnicodeCharsPerPage > 10) {
                     doOCROnCurrentPage(AUTO);
                 }
             }

             PDPageAdditionalActions pageActions = page.getActions();
             if (pageActions != null) {
                 handleDestinationOrAction(pageActions.getC(), ActionTrigger.PAGE_CLOSE);
                 handleDestinationOrAction(pageActions.getO(), ActionTrigger.PAGE_OPEN);
             }
             // closes the page div opened in startPage
             xhtml.endElement("div");
         } catch (SAXException|TikaException e) {
             throw new IOException("Unable to end a page", e);
         } catch (IOException e) {
             handleCatchableIOE(e);
         } finally {
             totalCharsPerPage = 0;
             unmappedUnicodeCharsPerPage = 0;
         }

         // Guard against pages that have no resources: getResources() can be null then.
         if (config.isExtractFontNames() && page.getResources() != null) {

             for (COSName n : page.getResources().getFontNames()) {
                 PDFont font = page.getResources().getFont(n);
                 if (font != null && font.getFontDescriptor() != null) {
                     String fontName = font.getFontDescriptor().getFontName();
                     if (fontName != null) {
                         fontNames.add(fontName);
                     }
                 }
             }
         }
     }

     /**
      * Reports the action of a widget annotation and then each of its additional
      * actions, tagging every one with the matching {@link ActionTrigger}.
      *
      * @param widget the widget annotation; {@code null} is ignored
      */
     private void handleWidget(PDAnnotationWidget widget) throws TikaException, SAXException, IOException {
         if (widget == null) {
             return;
         }
         handleDestinationOrAction(widget.getAction(), ActionTrigger.ANNOTATION_WIDGET);

         PDAnnotationAdditionalActions extra = widget.getActions();
         if (extra == null) {
             return;
         }
         // The call order below determines the order of the emitted divs.
         handleDestinationOrAction(extra.getBl(), ActionTrigger.ANNOTATION_LOSE_INPUT_FOCUS);
         handleDestinationOrAction(extra.getD(), ActionTrigger.ANNOTATION_MOUSE_CLICK);
         handleDestinationOrAction(extra.getE(), ActionTrigger.ANNOTATION_CURSOR_ENTERS);
         handleDestinationOrAction(extra.getFo(), ActionTrigger.ANNOTATION_RECEIVES_FOCUS);
         handleDestinationOrAction(extra.getPC(), ActionTrigger.ANNOTATION_PAGE_CLOSED);
         handleDestinationOrAction(extra.getPI(), ActionTrigger.ANNOTATION_PAGE_NO_LONGER_VISIBLE);
         handleDestinationOrAction(extra.getPO(), ActionTrigger.ANNOTATION_PAGE_OPENED);
         handleDestinationOrAction(extra.getPV(), ActionTrigger.ANNOTATION_PAGE_VISIBLE);
         handleDestinationOrAction(extra.getU(), ActionTrigger.ANNOTATION_MOUSE_RELEASED);
         handleDestinationOrAction(extra.getX(), ActionTrigger.ANNOTATION_CURSOR_EXIT);
     }

     /**
      * Starts the XHTML document and reports the document's open action, if any.
      * An IOException raised while reading or handling the open action is deliberately
      * ignored (see PDFBOX-3773); SAX and Tika failures are rethrown as IOException.
      *
      * @param pdf the document being processed
      * @throws IOException if the document cannot be started
      */
     @Override
     protected void startDocument(PDDocument pdf) throws IOException {
         try {
             xhtml.startDocument();
         } catch (SAXException e) {
             throw new IOException("Unable to start a document", e);
         }

         try {
             handleDestinationOrAction(pdf.getDocumentCatalog().getOpenAction(), ActionTrigger.DOCUMENT_OPEN);
         } catch (IOException ignored) {
             //See PDFBOX-3773
             //swallow -- no need to report this
         } catch (TikaException|SAXException e) {
             throw new IOException("Unable to start a document", e);
         }
     }

     /**
      * Reports a PDF action or destination as an empty div carrying "class"
      * ("action" or "destination"), "type" (simple class name) and "trigger" attributes.
      * <p>
      * Import-data, launch and remote go-to actions are handed to
      * {@link #processDoc(String, PDFileSpecification, AttributesImpl)} with their file
      * specification. JavaScript actions additionally have their script passed to the
      * embedded document extractor as a MACRO resource. Nothing happens when
      * {@code action} is {@code null} or action extraction is disabled.
      *
      * @param action        the action or destination, may be {@code null}
      * @param actionTrigger what triggers the action
      */
     private void handleDestinationOrAction(PDDestinationOrAction action,
                                            ActionTrigger actionTrigger) throws IOException, SAXException, TikaException {
         if (action == null) {
             return;
         }
         if (!config.isExtractActions()) {
             return;
         }

         AttributesImpl attributes = new AttributesImpl();
         String kind = "destination";
         if (action instanceof PDAction) {
             kind = "action";
         }
         addNonNullAttribute("class",  kind, attributes);
         addNonNullAttribute("type", action.getClass().getSimpleName(), attributes);
         addNonNullAttribute("trigger", actionTrigger.name(), attributes);

         if (action instanceof PDActionImportData) {
             PDActionImportData importData = (PDActionImportData) action;
             processDoc("", importData.getFile(), attributes);
             return;
         }
         if (action instanceof PDActionLaunch) {
             PDActionLaunch launch = (PDActionLaunch) action;
             addNonNullAttribute("id", launch.getF(), attributes);
             addNonNullAttribute("defaultDirectory", launch.getD(), attributes);
             addNonNullAttribute("operation", launch.getO(), attributes);
             addNonNullAttribute("parameters", launch.getP(), attributes);
             processDoc(launch.getF(), launch.getFile(), attributes);
             return;
         }
         if (action instanceof PDActionRemoteGoTo) {
             processDoc("", ((PDActionRemoteGoTo) action).getFile(), attributes);
             return;
         }

         if (action instanceof PDActionJavaScript) {
             PDActionJavaScript jsAction = (PDActionJavaScript) action;
             Metadata scriptMetadata = new Metadata();
             scriptMetadata.set(Metadata.CONTENT_TYPE, "application/javascript");
             scriptMetadata.set(Metadata.CONTENT_ENCODING, StandardCharsets.UTF_8.toString());
             scriptMetadata.set(PDF.ACTION_TRIGGER, actionTrigger.toString());
             scriptMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.name());
             String script = jsAction.getAction();
             if (script == null) {
                 script = "";
             }
             if (embeddedDocumentExtractor.shouldParseEmbedded(scriptMetadata)) {
                 try (InputStream is = TikaInputStream.get(script.getBytes(StandardCharsets.UTF_8))) {
                     embeddedDocumentExtractor.parseEmbedded(is, xhtml, scriptMetadata, false);
                 }
             }
             // NOTE(review): "class" and "type" were already added above, so these calls
             // append second attributes with the same names -- confirm this is intended.
             addNonNullAttribute("class", "javascript", attributes);
             addNonNullAttribute("type", jsAction.getType(), attributes);
             addNonNullAttribute("subtype", jsAction.getSubType(), attributes);
         }

         // JavaScript actions and all remaining action/destination kinds end with the marker div.
         xhtml.startElement("div", attributes);
         xhtml.endElement("div");
     }

     /**
      * Appends a CDATA attribute with an empty namespace URI, using {@code name} as both
      * local and qualified name. Does nothing if {@code name} or {@code value} is null.
      *
      * @param name       attribute name
      * @param value      attribute value
      * @param attributes attribute list to append to
      */
     private static void addNonNullAttribute(String name, String value, AttributesImpl attributes) {
         if (name != null && value != null) {
             attributes.addAttribute("", name, name, "CDATA", value);
         }
     }

     /**
      * Emits the end-of-document content and closes the XHTML document.
      * <p>
      * In order: bookmark text (if enabled in the config), embedded documents,
      * XMP/XFA data, AcroForm content (if enabled in the config) and the
      * document-level additional actions. {@code IOException}s raised while
      * extracting embedded documents or the AcroForm are passed to
      * {@code handleCatchableIOE}. After the XHTML document has been ended,
      * every collected font name is added to the metadata.
      *
      * @param pdf the document that has just been processed
      * @throws IOException on I/O failure, or wrapping any {@code TikaException}
      *                     or {@code SAXException} raised while ending the document
      */
     @Override
     protected void endDocument(PDDocument pdf) throws IOException {
         try {
             // Bookmark text is optional.
             if (config.isExtractBookmarksText()) {
                 extractBookmarkText();
             }

             try {
                 extractEmbeddedDocuments(pdf);
             } catch (IOException e) {
                 handleCatchableIOE(e);
             }

             extractXMPXFA(pdf, metadata, context);

             // AcroForm data is written at the end of the document.
             if (config.isExtractAcroFormContent()) {
                 try {
                     extractAcroForm(pdf);
                 } catch (IOException e) {
                     handleCatchableIOE(e);
                 }
             }

             PDDocumentCatalogAdditionalActions catalogActions =
                     pdf.getDocumentCatalog().getActions();
             handleDestinationOrAction(catalogActions.getDP(), ActionTrigger.AFTER_DOCUMENT_PRINT);
             handleDestinationOrAction(catalogActions.getDS(), ActionTrigger.AFTER_DOCUMENT_SAVE);
             handleDestinationOrAction(catalogActions.getWC(), ActionTrigger.BEFORE_DOCUMENT_CLOSE);
             handleDestinationOrAction(catalogActions.getWP(), ActionTrigger.BEFORE_DOCUMENT_PRINT);
             handleDestinationOrAction(catalogActions.getWS(), ActionTrigger.BEFORE_DOCUMENT_SAVE);
             xhtml.endDocument();
         } catch (TikaException | SAXException e) {
             throw new IOException("Unable to end a document", e);
         }
         // Iterating an empty collection is a no-op, so no size check is needed.
         for (String fontName : fontNames) {
             metadata.add(Font.FONT_NAME, fontName);
         }
     }

     /**
      * Writes the text of the document outline (bookmarks), if the document has one.
      *
      * @throws SAXException  if writing to the XHTML handler fails
      * @throws IOException   on I/O failure while handling a bookmark action
      * @throws TikaException if handling a bookmark action fails
      */
     void extractBookmarkText() throws SAXException, IOException, TikaException {
         PDDocumentOutline rootOutline = document.getDocumentCatalog().getDocumentOutline();
         if (rootOutline == null) {
             return;
         }
         extractBookmarkText(rootOutline);
     }

     /**
      * Writes the titles of all outline items below {@code bookmark} as a
      * {@code ul} list, depth first, and passes each item's action to
      * {@code handleDestinationOrAction} with {@link ActionTrigger#BOOKMARK}.
      * <p>
      * Outline items are linked by references inside the PDF, so a malformed
      * file can make an item point back at one that was already visited. Each
      * underlying dictionary is therefore processed at most once; the walk
      * stops at the first repeated item instead of looping or recursing forever.
      *
      * @param bookmark outline node whose children are written
      * @throws SAXException  if writing to the XHTML handler fails
      * @throws IOException   on I/O failure while handling a bookmark action
      * @throws TikaException if handling a bookmark action fails
      */
     void extractBookmarkText(PDOutlineNode bookmark) throws SAXException, IOException, TikaException {
         // Identity semantics: two wrappers around the same dictionary must count as one item.
         java.util.Set<Object> visited =
                 Collections.newSetFromMap(new java.util.IdentityHashMap<Object, Boolean>());
         extractBookmarkText(bookmark, visited);
     }

     /**
      * Recursive worker for {@link #extractBookmarkText(PDOutlineNode)}.
      *
      * @param bookmark outline node whose children are written
      * @param visited  identity set of the COS dictionaries already written
      */
     private void extractBookmarkText(PDOutlineNode bookmark, java.util.Set<Object> visited)
             throws SAXException, IOException, TikaException {
         PDOutlineItem current = bookmark.getFirstChild();

         // Nothing to write, or the first child closes a cycle: do not open an empty list.
         if (current == null || visited.contains(current.getCOSObject())) {
             return;
         }

         xhtml.startElement("ul");
         // add() returns false for an already-visited dictionary, which ends the sibling walk.
         while (current != null && visited.add(current.getCOSObject())) {
             xhtml.startElement("li");
             xhtml.characters(current.getTitle());
             xhtml.endElement("li");
             handleDestinationOrAction(current.getAction(), ActionTrigger.BOOKMARK);
             // Recurse:
             extractBookmarkText(current, visited);
             current = current.getNextSibling();
         }
         xhtml.endElement("ul");
     }

     /**
      * Writes the document's form data to the XHTML handler.
      * <p>
      * If the form carries an XFA resource, XFA extraction is attempted first and,
      * when it succeeds, nothing else is written. If the XFA bytes cannot be read
      * or the XFA XML cannot be parsed, the problem is recorded in the metadata
      * and the traditional AcroForm fields are written instead, as an
      * {@code ol} inside a {@code div} with class {@code acroform}.
      * <p>
      * This code derives from Ben Litchfield's
      * {@code org.apache.pdfbox.examples.fdf.PrintFields}.
      *
      * @param pdf document whose form is extracted
      * @throws IOException   on I/O failure
      * @throws SAXException  if writing to the XHTML handler fails
      * @throws TikaException if processing a field fails
      */
     void extractAcroForm(PDDocument pdf) throws IOException,
             SAXException, TikaException {
         PDDocumentCatalog catalog = pdf.getDocumentCatalog();
         if (catalog == null) {
             return;
         }

         PDAcroForm form = catalog.getAcroForm(null);
         if (form == null) {
             return;
         }

         // Prefer XFA when present; fall through to the AcroForm if it is
         // unreadable or unparseable.
         PDXFAResource xfa = form.getXFA();
         if (xfa != null) {
             InputStream xfaStream = null;
             try {
                 xfaStream = new BufferedInputStream(
                         new ByteArrayInputStream(xfa.getBytes()));
             } catch (IOException e) {
                 EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
             }
             if (xfaStream != null) {
                 try {
                     new XFAExtractor().extract(xfaStream, xhtml, metadata, context);
                     // XFA succeeded, so the AcroForm is not written as well.
                     return;
                 } catch (XMLStreamException e) {
                     // XML parse problem in the XFA: record it and try the AcroForm.
                     EmbeddedDocumentUtil.recordException(e, metadata);
                 } finally {
                     IOUtils.closeQuietly(xfaStream);
                 }
             }
         }

         List<?> fields = form.getFields();
         if (fields == null) {
             return;
         }

         xhtml.startElement("div", "class", "acroform");
         xhtml.startElement("ol");
         for (Object candidate : fields) {
             // instanceof is false for null, so null entries are skipped too.
             if (candidate instanceof PDField) {
                 processAcroField((PDField) candidate, 0);
             }
         }
         xhtml.endElement("ol");
         xhtml.endElement("div");
     }

     /**
      * Writes one form field and, for non-terminal fields, its children as a
      * nested {@code ol}.
      * <p>
      * The field's additional actions and widgets are handled first, then the
      * field itself is written via {@code addFieldString}. Nothing is written
      * once {@code currentRecursiveDepth} has reached
      * {@code MAX_ACROFORM_RECURSIONS}.
      *
      * @param field                 field to write
      * @param currentRecursiveDepth nesting depth of {@code field}; top-level fields use 0
      * @throws SAXException  if writing to the XHTML handler fails
      * @throws IOException   on I/O failure
      * @throws TikaException if handling an action or widget fails
      */
     private void processAcroField(PDField field, final int currentRecursiveDepth)
             throws SAXException, IOException, TikaException {

         if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) {
             return;
         }

         PDFormFieldAdditionalActions fieldActions = field.getActions();
         if (fieldActions != null) {
             handleDestinationOrAction(fieldActions.getC(), ActionTrigger.FORM_FIELD_RECALCULATE);
             handleDestinationOrAction(fieldActions.getF(), ActionTrigger.FORM_FIELD_FORMATTED);
             handleDestinationOrAction(fieldActions.getK(), ActionTrigger.FORM_FIELD_KEYSTROKE);
             handleDestinationOrAction(fieldActions.getV(), ActionTrigger.FORM_FIELD_VALUE_CHANGE);
         }

         List<PDAnnotationWidget> widgets = field.getWidgets();
         if (widgets != null) {
             for (PDAnnotationWidget widget : widgets) {
                 handleWidget(widget);
             }
         }

         addFieldString(field);

         if (!(field instanceof PDNonTerminalField)) {
             return;
         }
         // Children go one level deeper so that the recursion limit applies.
         final int childDepth = currentRecursiveDepth + 1;
         xhtml.startElement("ol");
         for (PDField child : ((PDNonTerminalField) field).getChildren()) {
             processAcroField(child, childDepth);
         }
         xhtml.endElement("ol");
     }

     /**
      * Writes a single form field as an {@code li} element.
      * <p>
      * The element text is the field's partial name followed by {@code ": "}
      * and the field's value; the alternate field name, when present, becomes
      * the {@code altName} attribute. The fully qualified name is not used.
      * Signature fields are delegated to {@code handleSignature} instead.
      * Nothing is written when there is neither text nor an attribute.
      *
      * @param field field to write
      * @throws SAXException if writing to the XHTML handler fails
      */
     private void addFieldString(PDField field) throws SAXException {
         String partName = field.getPartialName();
         String altName = field.getAlternateFieldName();

         AttributesImpl attrs = new AttributesImpl();
         addNonNullAttribute("altName", altName, attrs);

         // Signature fields have their own rendering and never use the text below.
         if (field instanceof PDSignatureField) {
             handleSignature(attrs, (PDSignatureField) field);
             return;
         }

         StringBuilder text = new StringBuilder();
         if (partName != null) {
             text.append(partName).append(": ");
         }
         String value = field.getValueAsString();
         // The literal string "null" is treated the same as a missing value.
         if (value != null && !"null".equals(value)) {
             text.append(value);
         }

         if (attrs.getLength() == 0 && text.length() == 0) {
             return;
         }
         xhtml.startElement("li", attrs);
         xhtml.characters(text.toString());
         xhtml.endElement("li");
     }

     /**
      * Writes the data of a signature field as an {@code li} that contains an
      * {@code ol} of type {@code signaturedata}, with one {@code li} per
      * non-empty value among name, contact info, location, reason and sign date.
      * <p>
      * Nothing is done when the field has no signature. When a signature is
      * present, {@code HAS_SIGNATURE} is always set in the metadata; the XHTML
      * is only written if at least one value is non-empty.
      *
      * @param parentAttributes attributes for the enclosing {@code li}
      * @param sigField         signature field to write
      * @throws SAXException if writing to the XHTML handler fails
      */
     private void handleSignature(AttributesImpl parentAttributes, PDSignatureField sigField)
             throws SAXException {

         PDSignature sig = sigField.getSignature();
         if (sig == null) {
             return;
         }
         // Kept unconditional: the previous data check could never fail, so the
         // flag has always been set for every field that carries a signature.
         metadata.set(TikaCoreProperties.HAS_SIGNATURE, "true");

         // TreeMap keeps the output order stable (sorted by key).
         Map<String, String> vals = new TreeMap<>();
         vals.put("name", sig.getName());
         vals.put("contactInfo", sig.getContactInfo());
         vals.put("location", sig.getLocation());
         vals.put("reason", sig.getReason());

         Calendar cal = sig.getSignDate();
         if (cal != null) {
             dateFormat.setTimeZone(cal.getTimeZone());
             vals.put("date", dateFormat.format(cal.getTime()));
         }

         // See if there is any data. This must look at the VALUES: the keys are
         // constant and never empty, so testing them is always true.
         boolean hasData = false;
         for (String val : vals.values()) {
             if (val != null && !val.isEmpty()) {
                 hasData = true;
                 break;
             }
         }
         if (!hasData) {
             // Avoid emitting an empty <li><ol/></li> for a signature without data.
             return;
         }

         xhtml.startElement("li", parentAttributes);

         AttributesImpl listAttrs = new AttributesImpl();
         listAttrs.addAttribute("", "type", "type", "CDATA", "signaturedata");

         xhtml.startElement("ol", listAttrs);
         for (Map.Entry<String, String> e : vals.entrySet()) {
             String value = e.getValue();
             if (value == null || value.isEmpty()) {
                 continue;
             }
             AttributesImpl itemAttrs = new AttributesImpl();
             itemAttrs.addAttribute("", "signdata", "signdata", "CDATA", e.getKey());
             xhtml.startElement("li", itemAttrs);
             xhtml.characters(value);
             xhtml.endElement("li");
         }
         xhtml.endElement("ol");
         xhtml.endElement("li");
     }


     /**
      * Returns the URI action attached to {@code annot}, or {@code null} if there is none.
      * <p>
      * Adapted from PDFBox's PrintURLs example. Reflection is used so that every
      * annotation class that itself declares a {@code getAction()} method
      * returning {@code PDAction} is covered. The non-reflective alternative is
      * to check for {@code PDAnnotationLink} and {@code PDAnnotationWidget}
      * explicitly and inspect the result of their {@code getAction()}.
      *
      * @param annot annotation to inspect
      * @return the annotation's action if it is a {@code PDActionURI}, otherwise {@code null}
      */
     private static PDActionURI getActionURI(PDAnnotation annot) {
         try {
             Method getter = annot.getClass().getDeclaredMethod("getAction");
             if (!PDAction.class.equals(getter.getReturnType())) {
                 return null;
             }
             Object result = getter.invoke(annot);
             return (result instanceof PDActionURI) ? (PDActionURI) result : null;
         } catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException ignored) {
             // Deliberately ignored: an annotation without a usable getAction()
             // simply has no URI action to report.
             return null;
         }
     }

     /**
      * Returns the number of the page currently being processed, derived from
      * this class's own {@code pageIndex} counter.
      * <p>
      * We need to override this because we are overriding {@link #processPages(PDPageTree)},
      * which advances {@code pageIndex} itself after each page.
      *
      * @return {@code pageIndex + 1}
      */
     @Override
     public int getCurrentPageNo() {
         return pageIndex+1;
     }

     /**
      * Processes every page whose number lies between {@link #getStartPage()}
      * and {@code getEndPage()} inclusive, advancing {@code pageIndex} after
      * each page whether or not it was processed.
      * <p>
      * See TIKA-2845 for why we need to override this.
      *
      * @param pages the pages of the document
      * @throws IOException if processing a page fails
      */
     @Override
     protected void processPages(PDPageTree pages) throws IOException {
         // Hack: we cannot increment the private currentPageNo in PDFTextStripper,
         // and PDFTextStripper's processPage relies on that variable being
         // >= startPage when deciding whether or not to process a page. See:
         // if (currentPageNo >= startPage && currentPageNo <= endPage
         //                && (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber)
         //                && (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber))
         // Setting the superclass start page to -1 keeps that check from
         // rejecting pages; the range is enforced here instead.
         super.setStartPage(-1);
         for (PDPage page : pages) {
             final int pageNo = getCurrentPageNo();
             final boolean withinRange = pageNo >= getStartPage() && pageNo <= getEndPage();
             if (withinRange) {
                 processPage(page);
             }
             pageIndex++;
         }
     }

     /**
      * Not supported by this class.
      *
      * @param pdOutlineItem ignored
      * @throws UnsupportedOperationException always
      */
     @Override
     public void setStartBookmark(PDOutlineItem pdOutlineItem) {
         final String reason =
                 "We don't currently support this -- See PDFTextStripper's processPages() for how to implement this.";
         throw new UnsupportedOperationException(reason);
     }

     /**
      * Not supported by this class.
      *
      * @param pdOutlineItem ignored
      * @throws UnsupportedOperationException always
      */
     @Override
     public void setEndBookmark(PDOutlineItem pdOutlineItem) {
         final String reason =
                 "We don't currently support this -- See PDFTextStripper's processPages() for how to implement this.";
         throw new UnsupportedOperationException(reason);
     }

     /**
      * Stores the first page to process in the {@code startPage} field used by
      * this class. The value is not forwarded to the superclass here;
      * {@code processPages} sets the superclass start page separately.
      *
      * @param startPage number of the first page to process
      */
     @Override
     public void setStartPage(int startPage) {
         this.startPage = startPage;
     }

     /**
      * Returns the start page held in the {@code startPage} field, i.e. the
      * value last passed to {@link #setStartPage(int)}.
      *
      * @return number of the first page to process
      */
     @Override
     public int getStartPage() {
         return startPage;
     }

     /**
      * Delegates to the superclass and then updates the per-page glyph counters:
      * {@code totalCharsPerPage} for every glyph, and
      * {@code unmappedUnicodeCharsPerPage} for glyphs whose {@code unicode}
      * string is {@code null} or empty.
      *
      * @throws IOException if the superclass fails to show the glyph
      */
     @Override
     protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code, String unicode, Vector displacement) throws IOException
     {
         super.showGlyph(textRenderingMatrix, font, code, unicode, displacement);
         totalCharsPerPage++;
         final boolean hasNoUnicodeMapping = unicode == null || unicode.isEmpty();
         if (hasNoUnicodeMapping) {
             unmappedUnicodeCharsPerPage++;
         }
     }
 }