/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.pdf;
import java.io.IOException;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSInteger;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSObject;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureTreeRoot;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.apache.pdfbox.text.PDFMarkedContentExtractor;
import org.apache.pdfbox.text.TextPosition;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
/**
* <p>This was added in Tika 1.24 as an alpha version of a text extractor
* that builds the text from the marked content tree and includes/normalizes
* some of the structural tags.
* </p>
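*
* <p>A minimal usage sketch (illustrative only; the local variable names are
* assumptions -- the caller is expected to have already loaded the
* {@link PDDocument} and built the handler, context, metadata and config):</p>
* <pre>{@code
* PDFMarkedContent2XHTML.process(pdDocument, handler, parseContext, metadata, config);
* }</pre>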
*
* @since 1.24
*/
public class PDFMarkedContent2XHTML extends PDF2XHTML {
private static final int MAX_RECURSION_DEPTH = 1000;
private static final String DIV = "div";
private static final Map<String, HtmlTag> COMMON_TAG_MAP = new HashMap<>();
static {
//code requires these to be all lower case
COMMON_TAG_MAP.put("document", new HtmlTag("body"));
COMMON_TAG_MAP.put("div", new HtmlTag("div"));
COMMON_TAG_MAP.put("p", new HtmlTag("p"));
COMMON_TAG_MAP.put("span", new HtmlTag("span"));
COMMON_TAG_MAP.put("table", new HtmlTag("table"));
COMMON_TAG_MAP.put("thead", new HtmlTag("thead"));
COMMON_TAG_MAP.put("tbody", new HtmlTag("tbody"));
COMMON_TAG_MAP.put("tr", new HtmlTag("tr"));
COMMON_TAG_MAP.put("th", new HtmlTag("th"));
COMMON_TAG_MAP.put("td", new HtmlTag("td"));//TODO -- convert to th if in thead?
COMMON_TAG_MAP.put("l", new HtmlTag("ul"));
COMMON_TAG_MAP.put("li", new HtmlTag("li"));
COMMON_TAG_MAP.put("h1", new HtmlTag("h1"));
COMMON_TAG_MAP.put("h2", new HtmlTag("h2"));
COMMON_TAG_MAP.put("h3", new HtmlTag("h3"));
COMMON_TAG_MAP.put("h4", new HtmlTag("h4"));
COMMON_TAG_MAP.put("h5", new HtmlTag("h5"));
COMMON_TAG_MAP.put("h6", new HtmlTag("h6"));
}
//this stores state as we recurse through the structure tag tree
private State state = new State();
private PDFMarkedContent2XHTML(PDDocument document, ContentHandler handler,
ParseContext context, Metadata metadata, PDFParserConfig config)
throws IOException {
super(document, handler, context, metadata, config);
}
/**
* Converts the given PDF document (and related metadata) to a stream
* of XHTML SAX events sent to the given content handler.
*
* @param pdDocument PDF document
* @param handler SAX content handler
* @param context parse context
* @param metadata PDF metadata
* @param config PDF parser configuration
* @throws SAXException if the content handler fails to process SAX events
* @throws TikaException if there was an exception outside of per-page processing
*/
public static void process(PDDocument pdDocument, ContentHandler handler, ParseContext context,
Metadata metadata, PDFParserConfig config)
throws SAXException, TikaException {
PDFMarkedContent2XHTML pdfMarkedContent2XHTML = null;
try {
pdfMarkedContent2XHTML =
new PDFMarkedContent2XHTML(pdDocument, handler, context, metadata, config);
} catch (IOException e) {
throw new TikaException("couldn't initialize PDFMarkedContent2XHTML", e);
}
try {
pdfMarkedContent2XHTML.writeText(pdDocument, new Writer() {
@Override
public void write(char[] cbuf, int off, int len) {
}
@Override
public void flush() {
}
@Override
public void close() {
}
});
} catch (IOException e) {
if (e.getCause() instanceof SAXException) {
throw (SAXException) e.getCause();
} else {
throw new TikaException("Unable to extract PDF content", e);
}
}
if (pdfMarkedContent2XHTML.exceptions.size() > 0) {
//throw the first
throw new TikaException("Unable to extract PDF content",
pdfMarkedContent2XHTML.exceptions.get(0));
}
}
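/**
* Maps the structure tree root's role map (which maps custom structure
* element types to more standard types) to {@link HtmlTag}s. Values that
* match a common HTML tag are used directly; anything else becomes a
* {@code div} whose class is the lower-cased value.
*/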
private static Map<String, HtmlTag> loadRoleMap(Map<String, Object> roleMap) {
if (roleMap == null) {
return Collections.emptyMap();
}
Map<String, HtmlTag> tags = new HashMap<>();
for (Map.Entry<String, Object> e : roleMap.entrySet()) {
String k = e.getKey();
Object obj = e.getValue();
if (obj instanceof String) {
String v = (String) obj;
String lc = v.toLowerCase(Locale.US);
if (COMMON_TAG_MAP.containsValue(new HtmlTag(lc))) {
tags.put(k, new HtmlTag(lc));
} else {
tags.put(k, new HtmlTag(DIV, lc));
}
}
}
return tags;
}
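/**
* Recursively walks the page tree's /Kids arrays and records an
* {@link ObjectRef} for each /Page object, in document order. These refs are
* later used to tie MCIDs back to their pages.
*/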
private static void findPages(COSBase kidsObj, List<ObjectRef> pageRefs) {
if (kidsObj == null) {
return;
}
if (kidsObj instanceof COSArray) {
for (COSBase kid : ((COSArray) kidsObj)) {
if (kid instanceof COSObject) {
COSBase kidbase = ((COSObject) kid).getObject();
if (kidbase instanceof COSDictionary) {
COSDictionary dict = (COSDictionary) kidbase;
if (dict.containsKey(COSName.TYPE) &&
COSName.PAGE.equals(dict.getCOSName(COSName.TYPE))) {
pageRefs.add(new ObjectRef(((COSObject) kid).getObjectNumber(),
((COSObject) kid).getGenerationNumber()));
continue;
}
if (dict.containsKey(COSName.KIDS)) {
findPages(dict.getItem(COSName.KIDS), pageRefs);
}
}
}
}
}
}
@Override
protected void processPages(PDPageTree pages) throws IOException {
//this is a 0-indexed list of object refs for each page
//we need this to map the mcids later...
//TODO: is there a better way of getting these/doing the mapping?
List<ObjectRef> pageRefs = new ArrayList<>();
//STEP 1: get the page refs
findPages(pdDocument.getPages().getCOSObject().getItem(COSName.KIDS), pageRefs);
//confirm the right number of pages was found
if (pageRefs.size() != pdDocument.getNumberOfPages()) {
throw new IOException(new TikaException(
"Couldn't find the right number of page refs (" + pageRefs.size() +
") for pages (" + pdDocument.getNumberOfPages() + ")"));
}
PDStructureTreeRoot structureTreeRoot =
pdDocument.getDocumentCatalog().getStructureTreeRoot();
//STEP 2: load the roleMap
Map<String, HtmlTag> roleMap = loadRoleMap(structureTreeRoot.getRoleMap());
//STEP 3: load all of the text, mapped to MCIDs
Map<MCID, String> paragraphs = loadTextByMCID(pageRefs);
//STEP 4: now recurse through the structure tree root and output the structure
//and the text bits from paragraphs
try {
recurse(structureTreeRoot.getK(), null, 0, paragraphs, roleMap);
} catch (SAXException e) {
throw new IOException(e);
}
//STEP 5: handle all the potentially unprocessed bits
try {
if (state.hrefAnchorBuilder.length() > 0) {
xhtml.startElement("p");
writeString(state.hrefAnchorBuilder.toString());
xhtml.endElement("p");
}
for (MCID mcid : paragraphs.keySet()) {
if (!state.processedMCIDs.contains(mcid)) {
if (mcid.mcid > -1) {
//TODO: LOG! This is a piece of text that wasn't referenced in the marked
// content tree but should have been. If mcid == -1, this was a known item
// not part of the content tree.
}
xhtml.startElement("p");
writeString(paragraphs.get(mcid));
xhtml.endElement("p");
}
}
} catch (SAXException e) {
throw new IOException(e);
}
//Step 6: for now, iterate through the pages again and do all the other handling
//TODO: figure out when we're crossing page boundaries during the recursion
// step above and do the page by page processing then...rather than dumping this
// all here.
for (PDPage page : pdDocument.getPages()) {
startPage(page);
endPage(page);
}
}
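/**
* Recursively descends the structure tree. A kid may be an array of further
* kids, a reference to another structure element (or an OBJR object
* reference), an integer MCID identifying marked content on the current page,
* or a dictionary (e.g. a link's action dictionary carrying a /URI).
*/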
private void recurse(COSBase kids, ObjectRef currentPageRef, int depth,
Map<MCID, String> paragraphs, Map<String, HtmlTag> roleMap)
throws IOException, SAXException {
if (depth > MAX_RECURSION_DEPTH) {
throw new IOException(
new TikaException("Exceeded max recursion depth " + MAX_RECURSION_DEPTH));
}
if (kids instanceof COSArray) {
for (COSBase k : ((COSArray) kids)) {
recurse(k, currentPageRef, depth, paragraphs, roleMap);
}
} else if (kids instanceof COSObject) {
COSBase cosType = ((COSObject) kids).getItem(COSName.TYPE);
if (cosType instanceof COSName) {
if ("OBJR".equals(((COSName) cosType).getName())) {
recurse(((COSObject) kids).getDictionaryObject(COSName.OBJ), currentPageRef,
depth + 1, paragraphs, roleMap);
}
}
COSBase n = ((COSObject) kids).getItem(COSName.S);
String name = "";
if (n instanceof COSName) {
name = ((COSName) n).getName();
}
COSBase grandkids = ((COSObject) kids).getItem(COSName.K);
if (grandkids == null) {
return;
}
COSBase pageBase = ((COSObject) kids).getItem(COSName.PG);
if (pageBase instanceof COSObject) {
currentPageRef = new ObjectRef(((COSObject) pageBase).getObjectNumber(),
((COSObject) pageBase).getGenerationNumber());
}
HtmlTag tag = getTag(name, roleMap);
boolean startedLink = false;
boolean ignoreTag = false;
if ("link".equals(tag.clazz)) {
state.inLink = true;
startedLink = true;
}
if (!state.inLink) {
//TODO: currently suppressing span and lbody...
// is this what we want to do? What else should we suppress?
if ("span".equals(tag.tag)) {
ignoreTag = true;
} else if ("lbody".equals(tag.clazz)) {
ignoreTag = true;
}
if (!ignoreTag) {
if (tag.clazz != null && tag.clazz.trim().length() > 0) {
xhtml.startElement(tag.tag, "class", tag.clazz);
} else {
xhtml.startElement(tag.tag);
}
}
}
recurse(grandkids, currentPageRef, depth + 1, paragraphs, roleMap);
if (startedLink) {
writeLink();
}
if (!state.inLink && !startedLink && !ignoreTag) {
xhtml.endElement(tag.tag);
}
} else if (kids instanceof COSInteger) {
int mcidInt = ((COSInteger) kids).intValue();
MCID mcid = new MCID(currentPageRef, mcidInt);
if (paragraphs.containsKey(mcid)) {
if (state.inLink) {
state.hrefAnchorBuilder.append(paragraphs.get(mcid));
} else {
try {
//not inside a link -- write the text directly
writeString(paragraphs.get(mcid));
} catch (IOException e) {
handleCatchableIOE(e);
}
}
state.processedMCIDs.add(mcid);
} else {
//TODO: log can't find mcid
}
} else if (kids instanceof COSDictionary) {
//TODO: check for other types of dictionary?
COSDictionary dict = (COSDictionary) kids;
COSDictionary anchor = dict.getCOSDictionary(COSName.A);
//check for subtype /Link ?
//COSName subtype = obj.getCOSName(COSName.SUBTYPE);
if (anchor != null) {
state.uri = anchor.getString(COSName.URI);
} else {
if (dict.containsKey(COSName.K)) {
recurse(dict.getDictionaryObject(COSName.K), currentPageRef, depth + 1,
paragraphs, roleMap);
} else if (dict.containsKey(COSName.OBJ)) {
recurse(dict.getDictionaryObject(COSName.OBJ), currentPageRef, depth + 1,
paragraphs, roleMap);
}
}
} else {
//TODO: handle a different object?
}
}
private void writeLink() throws SAXException, IOException {
//This handles only URI actions.
//If we want to catch within-document references (GOTO actions), we need to cache those in state.
//See testPDF_childAttachments.pdf for examples
if (state.uri != null && state.uri.trim().length() > 0) {
xhtml.startElement("a", "href", state.uri);
xhtml.characters(state.hrefAnchorBuilder.toString());
xhtml.endElement("a");
} else {
try {
//if it isn't a uri, output this anyhow
writeString(state.hrefAnchorBuilder.toString());
} catch (IOException e) {
handleCatchableIOE(e);
}
}
state.hrefAnchorBuilder.setLength(0);
state.inLink = false;
state.uri = null;
}
private HtmlTag getTag(String name, Map<String, HtmlTag> roleMap) {
if (roleMap.containsKey(name)) {
return roleMap.get(name);
}
String lc = name.toLowerCase(Locale.US);
if (COMMON_TAG_MAP.containsKey(lc)) {
return COMMON_TAG_MAP.get(lc);
}
HtmlTag tag = new HtmlTag(DIV, lc);
roleMap.put(name, tag);
return tag;
}
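/**
* Runs PDFBox's {@link PDFMarkedContentExtractor} over each page and
* concatenates the {@link TextPosition}s of each piece of marked content into
* a string keyed by (page ref, MCID).
*/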
private Map<MCID, String> loadTextByMCID(List<ObjectRef> pageRefs) throws IOException {
int pageCount = 1;
Map<MCID, String> paragraphs = new HashMap<>();
for (PDPage page : pdDocument.getPages()) {
ObjectRef pageRef = pageRefs.get(pageCount - 1);
PDFMarkedContentExtractor ex = new PDFMarkedContentExtractor();
try {
ex.processPage(page);
} catch (IOException e) {
handleCatchableIOE(e);
continue;
}
for (PDMarkedContent c : ex.getMarkedContents()) {
//TODO: at some point also handle
// 1. c.getActualText()
// 2. c.getExpandedForm()
// 3. c.getAlternateDescription()
// 4. c.getLanguage()
List<Object> objects = c.getContents();
StringBuilder sb = new StringBuilder();
//TODO: sort text positions? Figure out when to add/remove a newline and/or space?
for (Object o : objects) {
if (o instanceof TextPosition) {
String unicode = ((TextPosition) o).getUnicode();
if (unicode != null) {
sb.append(unicode);
}
}
/*
TODO: do we want to do anything with these?
TODO: Are there other types of objects we need to handle here?
else if (o instanceof PDImageXObject) {
} else if (o instanceof PDTransparencyGroup) {
} else if (o instanceof PDMarkedContent) {
} else if (o instanceof PDFormXObject) {
} else {
throw new RuntimeException("can't handle "+o.getClass());
}*/
}
int mcidInt = c.getMCID();
MCID mcid = new MCID(pageRef, mcidInt);
String p = sb.toString();
if (c.getTag().equals("P")) {
p = p.trim();
}
if (mcidInt < 0) {
//mcidInt == -1 for text bits that do not have an actual
//mcid -- concatenate these bits
if (paragraphs.containsKey(mcid)) {
p = paragraphs.get(mcid) + "\n" + p;
}
}
paragraphs.put(mcid, p);
}
pageCount++;
}
return paragraphs;
}
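/**
* Mutable state carried through the structure tree traversal: which MCIDs
* have already been written, whether we are currently inside a link, the
* link's target URI and the text buffered for its anchor.
*/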
private static class State {
Set<MCID> processedMCIDs = new HashSet<>();
boolean inLink = false;
int tableDepth = 0;
private StringBuilder hrefAnchorBuilder = new StringBuilder();
private String uri = null;
private int tdDepth = 0;
}
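/**
* The XHTML element to emit for a structure type, plus an optional class
* attribute.
*/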
private static class HtmlTag {
private final String tag;
private final String clazz;
HtmlTag() {
this("");
}
HtmlTag(String tag) {
this(tag, "");
}
HtmlTag(String tag, String clazz) {
this.tag = tag;
this.clazz = clazz;
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
HtmlTag htmlTag = (HtmlTag) o;
if (!Objects.equals(tag, htmlTag.tag)) {
return false;
}
return Objects.equals(clazz, htmlTag.clazz);
}
@Override
public int hashCode() {
int result = tag != null ? tag.hashCode() : 0;
result = 31 * result + (clazz != null ? clazz.hashCode() : 0);
return result;
}
}
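/**
* An indirect object reference (object number and generation number), used
* here to identify the page object that a marked content id belongs to.
*/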
private static class ObjectRef {
private final long objId;
private final int version;
public ObjectRef(long objId, int version) {
this.objId = objId;
this.version = version;
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
ObjectRef objectRef = (ObjectRef) o;
return objId == objectRef.objId && version == objectRef.version;
}
@Override
public int hashCode() {
return Objects.hash(objId, version);
}
@Override
public String toString() {
return "ObjectRef{" + "objId=" + objId + ", version=" + version + '}';
}
}
/**
* In PDF land, MCIDs are integers that should be unique _per page_.
* This class combines the object ref of the page with the mcid so that
* the pair should be a unique key within the document for a given
* piece of content.
* <p>
* If the mcid integer == -1, that means that there is text on the page
* not assigned to any marked content.
*/
private static class MCID {
//this is the object ref to the particular page
private final ObjectRef objectRef;
private final int mcid;
public MCID(ObjectRef objectRef, int mcid) {
this.objectRef = objectRef;
this.mcid = mcid;
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
MCID mcid1 = (MCID) o;
return mcid == mcid1.mcid && Objects.equals(objectRef, mcid1.objectRef);
}
@Override
public int hashCode() {
return Objects.hash(objectRef, mcid);
}
@Override
public String toString() {
return "MCID{" + "objectRef=" + objectRef + ", mcid=" + mcid + '}';
}
}
}