blob: 2201c9cef317cba4f22e7a073c6ce91bb58884a8 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.fop.render.pdf.pdfbox;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSInteger;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSNull;
import org.apache.pdfbox.cos.COSObject;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.documentinterchange.taggedpdf.StandardStructureTypes;
import org.apache.fop.pdf.PDFDictionary;
import org.apache.fop.pdf.PDFDocument;
import org.apache.fop.pdf.PDFNumber;
import org.apache.fop.pdf.PDFObject;
import org.apache.fop.pdf.PDFPage;
import org.apache.fop.pdf.PDFReference;
import org.apache.fop.pdf.PDFStructElem;
import org.apache.fop.render.pdf.PDFLogicalStructureHandler;
public class StructureTreeMerger {
/** Adapter used to clone source COS objects for the target document and to cache/look up those clones. */
PDFBoxAdapter adapter;
/** Target page, obtained from the adapter in the constructor; copied /Pg entries are pointed at it. */
PDFPage targetPage;
/** Document of {@link #targetPage}; every structure element created here is registered with it. */
PDFDocument pdfDoc;
/** Source page; its COS dictionary is compared (by identity) with /Pg entries to select content. */
private PDPage srcPage;
/** Role map handed to StructureTreeMergerUtil.findRoleMapKeyByValue when matching structure types. */
private COSDictionary roleMap;
/** Element that receives the top-level copied elements as kids. */
protected PDFStructElem currentSessionElem;
/** Supplies the parent tree, the page parent tree and the next parent tree key. */
private PDFLogicalStructureHandler logicalStructHandler;
/** Structure elements created so far, keyed by the object number of the source COSObject. */
private Map<Integer, PDFStructElem> structElemCache = new HashMap<Integer, PDFStructElem>();
/** Parent element of each marked-content ID; a TreeMap, so values() iterates in ascending MCID order. */
private Map<Integer, PDFStructElem> markedContentMap = new TreeMap<Integer, PDFStructElem>();
/** Offset added to source MCIDs; read once from the adapter in the constructor. */
private int currentMCID;
/** Source elements attached directly under {@link #currentSessionElem}; roots of the leaf search. */
private List<COSObject> topElems = new ArrayList<COSObject>();
/** Source elements first created by findLeafKids (dictionary-kid case); their parents are built afterwards. */
private COSArray extra = new COSArray();
/** The array passed to copyStructure, kept unfiltered so its null slots can be reproduced later. */
private COSArray originalParentTree = new COSArray();
public StructureTreeMerger(PDFStructElem currentSessionElem, PDFLogicalStructureHandler logicalStructHandler,
PDFBoxAdapter adapter, PDPage srcPage) {
this.adapter = adapter;
this.srcPage = srcPage;
this.targetPage = adapter.getTargetPage();
this.pdfDoc = targetPage.getDocument();
this.currentMCID = adapter.getCurrentMCID();
this.logicalStructHandler = logicalStructHandler;
this.currentSessionElem = currentSessionElem;
}
/**
 * Sets the role map that is consulted when checking whether a structure element
 * has a given structure type (see checkIfStructureTypeIsPresent).
 *
 * @param roleMap the role map dictionary to use
 */
public void setRoleMap(COSDictionary roleMap) {
this.roleMap = roleMap;
}
/**
 * Copies the structure described by a page's parent tree array into the target document.
 * <ol>
 * <li>creates an element for every COSObject entry of the array,</li>
 * <li>builds the ancestors of those elements,</li>
 * <li>searches below each top-level element for further leaves and builds
 *     the ancestors of the elements found that way,</li>
 * <li>appends the result to the page parent tree and drops null kid placeholders.</li>
 * </ol>
 *
 * @param pageParentTreeArray parent tree array of the source page
 * @throws IOException if copying an element fails
 */
public void copyStructure(COSArray pageParentTreeArray) throws IOException {
    // The unfiltered array is needed later to reproduce its null slots.
    originalParentTree = pageParentTreeArray;
    COSArray structObjects = removeNonCOSObjects(pageParentTreeArray);

    for (COSBase candidate : structObjects) {
        createPageStructElements((COSObject) candidate);
    }
    createParents(structObjects);

    for (COSObject topLevel : topElems) {
        findLeafNodesInPageFromStructElemObjects(topLevel);
    }
    createParents(extra);

    addToPageParentTreeArray();
    removeNullPlaceholders();
}
/**
 * Recursively copies the structure elements reachable from {@code base} and attaches
 * them below {@code parent}.
 * <ul>
 * <li>A dictionary is followed through its /K entry, if it has one.</li>
 * <li>An array is processed entry by entry.</li>
 * <li>Anything else is expected to be a COSObject: a new element is created, registered,
 *     filled from the source, linked with {@code parent}, and its own /K entry is
 *     descended with the new element as parent.</li>
 * </ul>
 *
 * @param base source dictionary, array or object reference
 * @param parent element that receives the created elements as kids
 * @throws IOException if copying an element fails
 */
public void createDirectDescendants(COSBase base, PDFStructElem parent) throws IOException {
    if (base instanceof COSDictionary) {
        COSDictionary dict = (COSDictionary) base;
        if (dict.containsKey(COSName.K)) {
            createDirectDescendants(dict.getItem(COSName.K), parent);
        }
        return;
    }
    if (base instanceof COSArray) {
        for (COSBase member : (COSArray) base) {
            createDirectDescendants(member, parent);
        }
        return;
    }

    assert base instanceof COSObject;
    COSObject source = (COSObject) base;
    // createAndRegisterStructElem stores the element in the cache and returns that same instance.
    PDFStructElem copy = createAndRegisterStructElem(source);
    copyElemEntries(source, copy);
    parent.addKid(copy);
    copy.setParent(parent);

    COSBase sourceKids = source.getItem(COSName.K);
    if (sourceKids != null) {
        createDirectDescendants(sourceKids, copy);
    }
}
/**
 * If no session element has been set, uses the last entry of the document's
 * structure tree element list as the session element.
 */
public void setCurrentSessionElem() {
    if (currentSessionElem != null) {
        return;
    }
    int lastIndex = pdfDoc.getStructureTreeElements().size() - 1;
    currentSessionElem = pdfDoc.getStructureTreeElements().get(lastIndex);
}
/**
 * Builds the ancestor chain for every entry of the given array.
 * Entries without a /P item are skipped.
 *
 * @param markedContentParents array of COSObject structure elements
 * @throws IOException if copying an ancestor fails
 */
private void createParents(COSArray markedContentParents) throws IOException {
    for (COSBase entry : markedContentParents) {
        COSObject child = (COSObject) entry;
        COSObject parentOfChild = (COSObject) child.getItem(COSName.P);
        if (parentOfChild == null) {
            continue;
        }
        PDFStructElem copiedChild = structElemCache.get((int) child.getObjectNumber());
        createParents(child, parentOfChild, copiedChild);
    }
}
/**
 * Creates an empty structure element, registers it with the target document and
 * caches it under the object number of the given source object.
 *
 * @param entry source object the new element stands for
 * @return the newly created element
 */
private PDFStructElem createAndRegisterStructElem(COSObject entry) {
    final PDFStructElem created = new PDFStructElem();
    pdfDoc.registerStructureElement(created);
    final int sourceObjectNumber = (int) entry.getObjectNumber();
    structElemCache.put(sourceObjectNumber, created);
    return created;
}
/**
 * Copies a fixed set of entries (Type, S, Pg, Alt, Lang, A, ActualText, T, E, C) from the
 * source element to {@code elem}. Only keys present in the source are copied. /Pg is
 * replaced by a reference to the target page; all other values are cloned through the
 * adapter. Finally the pair (source, elem) is recorded in the adapter's clone cache.
 *
 * @param base source element; expected to be a COSObject wrapping a dictionary
 * @param elem element receiving the entries
 * @throws IOException if cloning a value fails
 */
private void copyElemEntries(COSBase base, PDFStructElem elem) throws IOException {
    assert base instanceof COSObject;
    COSDictionary source = (COSDictionary) ((COSObject) base).getObject();
    COSName[] copiedKeys = {
        COSName.TYPE, COSName.S, COSName.PG, COSName.ALT, COSName.LANG, COSName.A,
        COSName.ACTUAL_TEXT, COSName.T, COSName.E, COSName.C
    };
    for (COSName key : copiedKeys) {
        if (!source.containsKey(key)) {
            continue;
        }
        if (key.equals(COSName.PG)) {
            // The page reference must point into the target document, not the source.
            elem.put(COSName.PG.getName(), targetPage.makeReference());
        } else {
            elem.put(key.getName(), adapter.cloneForNewDocument(source.getItem(key)));
        }
    }
    adapter.cacheClonedObject(base, elem);
}
/**
 * Creates the element for one parent tree entry together with its kids.
 *
 * @param entry source structure element
 * @return the new element, or {@code null} if an element for this object number
 *         already exists in the cache
 * @throws IOException if copying fails
 */
private PDFStructElem createPageStructElements(COSObject entry) throws IOException {
    if (structElemCache.containsKey((int) entry.getObjectNumber())) {
        return null;
    }
    PDFStructElem created = createAndRegisterStructElem(entry);
    copyElemEntries(entry, created);
    COSDictionary sourceDict = (COSDictionary) entry.getObject();
    createKids(sourceDict.getItem(COSName.K), sourceDict, created, false);
    return created;
}
/**
 * Links {@code elem} to a copy of its source parent, creating that copy (and, recursively,
 * its own ancestors) when it does not exist yet.
 * <ul>
 * <li>Parent is the structure tree root: {@code elem} is appended to the session element
 *     and recorded as a top-level element.</li>
 * <li>Parent already copied: {@code elem} is inserted at its source position, unless the
 *     parent is a table row. Rows get all their kids at once when the row itself is
 *     created (see below), so adding the cell again would duplicate it.</li>
 * <li>Parent not copied yet and not a Document element: the parent is created and filled
 *     with null placeholders. For a table row all kids are created in one go; otherwise
 *     {@code elem} is inserted at its source position. The walk then continues with the
 *     parent's own /P entry, if it has one.</li>
 * <li>Parent is a Document element: {@code elem} is inserted into the session element at
 *     its source position and recorded as a top-level element.</li>
 * </ul>
 * "Table row" and "Document" are matched through checkIfStructureTypeIsPresent, i.e.
 * either directly by /S or through the role map, in every place.
 *
 * @param cosElem source element
 * @param cosParentElem source parent of {@code cosElem}
 * @param elem copy of {@code cosElem}
 * @throws IOException if copying an ancestor fails
 */
private void createParents(COSObject cosElem, COSObject cosParentElem, PDFStructElem elem) throws IOException {
    int elemObjectID = (int)cosParentElem.getObjectNumber();
    COSDictionary parentElemDictionary = (COSDictionary)cosParentElem.getObject();
    PDFStructElem elemParent = structElemCache.get(elemObjectID);
    if (isStructureTreeRoot(parentElemDictionary)) {
        elem.setParent(currentSessionElem);
        currentSessionElem.addKid(elem);
        topElems.add(cosElem);
    } else if (elemParent != null) {
        if (!checkIfStructureTypeIsPresent(parentElemDictionary, StandardStructureTypes.TR)) {
            elem.setParent(elemParent);
            int position = StructureTreeMergerUtil.findObjectPositionInKidsArray(cosElem);
            elemParent.addKidInSpecificOrder(position, elem);
        }
    } else if (!checkIfStructureTypeIsPresent(parentElemDictionary, StandardStructureTypes.DOCUMENT)) {
        elemParent = createAndRegisterStructElem(cosParentElem);
        copyElemEntries(cosParentElem, elemParent);
        elem.setParent(elemParent);
        fillKidsWithNull(elemParent, parentElemDictionary);
        // Use the same role-map-aware row test as the cached-parent branch above. With a
        // plain /S comparison a role-mapped row would be populated cell by cell here, while
        // the branch above skips every later cell of that row, so those cells were lost.
        if (checkIfStructureTypeIsPresent(parentElemDictionary, StandardStructureTypes.TR)) {
            COSBase rowKids = parentElemDictionary.getItem(COSName.K);
            createKids(rowKids, parentElemDictionary, elemParent, true);
        } else {
            int position = StructureTreeMergerUtil.findObjectPositionInKidsArray(cosElem);
            elemParent.addKidInSpecificOrder(position, elem);
        }
        COSObject parentObj = (COSObject)parentElemDictionary.getItem(COSName.P);
        // Same guard as createParents(COSArray): an element without /P has no ancestors to build.
        if (parentObj != null) {
            createParents(cosParentElem, parentObj, elemParent);
        }
    } else {
        elem.setParent(currentSessionElem);
        int position = StructureTreeMergerUtil.findObjectPositionInKidsArray(cosElem);
        currentSessionElem.addKidInSpecificOrder(position, elem);
        topElems.add(cosElem);
    }
}
/**
 * Creates the kid entries of {@code parent} from a source /K value, dispatching on its type:
 * arrays are processed entry by entry, object references go to createKidFromCOSObject,
 * integers (MCIDs) are added only if the page check on {@code parentDict} succeeds,
 * and dictionaries go to createKidFromCOSDictionary. Other values are ignored.
 *
 * @param baseKid source /K value or one of its entries
 * @param parentDict source dictionary owning the kid
 * @param parent element receiving the kids
 * @param originatedFromTableRow whether the kids of a table row are being created
 * @throws IOException if cloning fails
 */
private void createKids(COSBase baseKid, COSDictionary parentDict, PDFStructElem parent,
        boolean originatedFromTableRow) throws IOException {
    if (baseKid instanceof COSArray) {
        for (COSBase member : (COSArray) baseKid) {
            createKids(member, parentDict, parent, originatedFromTableRow);
        }
        return;
    }
    if (baseKid instanceof COSObject) {
        createKidFromCOSObject((COSObject) baseKid, parentDict, parent, originatedFromTableRow);
        return;
    }
    if (baseKid instanceof COSInteger) {
        if (checkPageEntryInAncestorsRecursively(parentDict)) {
            createKidEntryFromInt((PDFNumber) adapter.cloneForNewDocument(baseKid), parent);
        }
        return;
    }
    if (baseKid instanceof COSDictionary) {
        createKidFromCOSDictionary((COSDictionary) baseKid, parent, parentDict);
    }
}
/**
 * Creates a kid entry from an indirect object.
 * <ul>
 * <li>If it resolves to an integer, it is handled like a direct integer kid.</li>
 * <li>If it resolves to a dictionary without /S, it is handled as a content item dictionary.</li>
 * <li>If it is a structure element and a table row is being populated, the element is
 *     taken from the cache or created, and linked to {@code parent}.</li>
 * <li>Otherwise a null placeholder is added to {@code parent}.</li>
 * </ul>
 *
 * @param baseObj indirect kid object
 * @param parentDict source dictionary owning the kid
 * @param parent element receiving the kid
 * @param originatedFromTableRow whether the kids of a table row are being created
 * @throws IOException if cloning fails
 */
private void createKidFromCOSObject(COSObject baseObj, COSDictionary parentDict, PDFStructElem parent,
        boolean originatedFromTableRow) throws IOException {
    COSBase resolved = baseObj.getObject();
    if (resolved instanceof COSInteger) {
        createKids(resolved, parentDict, parent, originatedFromTableRow);
        return;
    }

    COSDictionary kidDict = (COSDictionary) resolved;
    if (kidDict.getDictionaryObject(COSName.S) == null) {
        createKidFromCOSDictionary(kidDict, parent, parentDict);
    } else if (originatedFromTableRow) {
        // The row helper covers both cases itself: a cached element is only linked to
        // the parent, a missing one is created first and then linked.
        createkidEntryFromCosObjectForRow(baseObj, parent);
    } else {
        parent.addKid(null);
    }
}
/**
 * Adds the element for {@code entree} as a kid of {@code parent} and sets its parent.
 * If no element is cached for the object number yet, one is created, filled from the
 * source and given its own kids (in table-row mode) before it is added.
 *
 * @param entree source structure element
 * @param parent element receiving the kid
 * @throws IOException if copying fails
 */
private void createkidEntryFromCosObjectForRow(COSObject entree, PDFStructElem parent) throws IOException {
    PDFStructElem rowKid = structElemCache.get((int) entree.getObjectNumber());
    if (rowKid == null) {
        rowKid = createAndRegisterStructElem(entree);
        copyElemEntries(entree, rowKid);
        COSDictionary sourceDict = (COSDictionary) entree.getObject();
        createKids(sourceDict.getItem(COSName.K), sourceDict, rowKid, true);
    }
    parent.addKid(rowKid);
    rowKid.setParent(parent);
}
/**
 * Decides whether a source element belongs to the source page by looking for the
 * nearest /Pg entry on the element or, following /P, on its ancestors.
 *
 * @param elem source element dictionary
 * @return whether the first /Pg found is the source page's dictionary (identity
 *         comparison); {@code true} if the chain ends without any /Pg
 */
private boolean checkPageEntryInAncestorsRecursively(COSDictionary elem) {
    if (elem.containsKey(COSName.PG)) {
        COSDictionary owningPage = (COSDictionary) elem.getDictionaryObject(COSName.PG);
        return owningPage == srcPage.getCOSObject();
    }
    if (!elem.containsKey(COSName.P)) {
        return true;
    }
    COSDictionary ancestor = (COSDictionary) elem.getDictionaryObject(COSName.P);
    return checkPageEntryInAncestorsRecursively(ancestor);
}
/**
 * Decides whether a content item dictionary belongs to the source page. Its own /Pg
 * entry is authoritative if present; otherwise the decision is taken from the owning
 * element and its ancestors.
 *
 * @param mrcDict content item dictionary
 * @param parentDict source dictionary of the owning structure element
 * @return {@code true} if the item is considered part of the source page
 */
private boolean isElementFromSourcePage(COSDictionary mrcDict, COSDictionary parentDict) {
    if (!mrcDict.containsKey(COSName.PG)) {
        return checkPageEntryInAncestorsRecursively(parentDict);
    }
    COSDictionary owningPage = (COSDictionary) mrcDict.getDictionaryObject(COSName.PG);
    return owningPage == srcPage.getCOSObject();
}
/**
 * Creates a kid from a content item dictionary.
 * If the item does not belong to the source page a null placeholder is added instead.
 * Otherwise the dictionary is cloned without its /Pg entry and, depending on /Type:
 * <ul>
 * <li>"OBJR": /Obj is set to a reference to the cached clone of the referenced object
 *     (or null if there is none) and the parent tree is updated;</li>
 * <li>"MCR": the MCID is offset and the parent is recorded for that MCID.</li>
 * </ul>
 * A reference to the target page is then stored on the clone if the source item had a
 * /Pg entry, otherwise on {@code parent}, and the clone is added as a kid.
 *
 * @param mcrDict source content item dictionary
 * @param parent element receiving the kid
 * @param baseDict source dictionary of the owning structure element
 * @throws IOException if cloning fails
 */
private void createKidFromCOSDictionary(COSDictionary mcrDict, PDFStructElem parent, COSDictionary baseDict)
        throws IOException {
    if (!isElementFromSourcePage(mcrDict, baseDict)) {
        parent.addKid(null);
        return;
    }

    Collection<COSName> excludedKeys = Arrays.asList(COSName.PG);
    PDFDictionary contentItem = (PDFDictionary) adapter.cloneForNewDocument(mcrDict, mcrDict, excludedKeys);

    if (mcrDict.containsKey(COSName.TYPE)) {
        String type = ((COSName) mcrDict.getDictionaryObject(COSName.TYPE)).getName();
        if (type.equals("OBJR")) {
            COSObject referenced = (COSObject) mcrDict.getItem(COSName.OBJ);
            Object cachedClone = adapter.getCachedClone(referenced);
            PDFReference referenceObj = null;
            if (cachedClone != null) {
                referenceObj = ((PDFObject) cachedClone).makeReference();
            }
            contentItem.put(COSName.OBJ.getName(), referenceObj);
            updateStructParentAndAddToPageParentTree(referenceObj, parent);
        } else if (type.equals("MCR")) {
            updateMCIDEntry(contentItem);
            PDFNumber mcid = (PDFNumber) contentItem.get(COSName.MCID.getName());
            markedContentMap.put(mcid.getNumber().intValue(), parent);
        }
    }

    if (mcrDict.containsKey(COSName.PG)) {
        contentItem.put(COSName.PG.getName(), targetPage.makeReference());
    } else {
        parent.put(COSName.PG.getName(), targetPage.makeReference());
    }
    parent.addKid(contentItem);
}
/**
 * Offsets the given MCID by {@code currentMCID}, adds it as a kid of {@code parent}
 * and records {@code parent} as the owner of the offset MCID.
 *
 * @param num number holding the source MCID; modified in place
 * @param parent element receiving the kid
 */
private void createKidEntryFromInt(PDFNumber num, PDFStructElem parent) {
    int shiftedMcid = num.getNumber().intValue() + currentMCID;
    num.setNumber(shiftedMcid);
    parent.addKid(num);
    markedContentMap.put(num.getNumber().intValue(), parent);
}
/**
 * Replaces the MCID entry of the given dictionary with the value offset by
 * {@code currentMCID}. Does nothing when the offset is not positive.
 *
 * @param mcrDictionary cloned content item dictionary holding an MCID number
 */
private void updateMCIDEntry(PDFDictionary mcrDictionary) {
    if (currentMCID <= 0) {
        return;
    }
    String mcidKey = COSName.MCID.getName();
    PDFNumber sourceMcid = (PDFNumber) mcrDictionary.get(mcidKey);
    PDFNumber shifted = new PDFNumber();
    shifted.setNumber(sourceMcid.getNumber().intValue() + currentMCID);
    mcrDictionary.put(mcidKey, shifted);
}
/**
 * Removes every null entry from the kid lists of all cached structure elements.
 * Elements whose kid list is null are left alone.
 */
private void removeNullPlaceholders() {
    List<PDFStructElem> cachedElems = new ArrayList<PDFStructElem>(structElemCache.values());
    for (int i = 0; i < cachedElems.size(); i++) {
        List<PDFObject> kids = cachedElems.get(i).getKids();
        if (kids == null) {
            continue;
        }
        kids.removeAll(Collections.singleton(null));
    }
}
/**
 * Tells whether the given dictionary declares itself as the structure tree root.
 *
 * @param elem source dictionary
 * @return {@code true} if it has a /Type entry equal to StructTreeRoot;
 *         {@code false} if it has no /Type entry or a different one
 */
private boolean isStructureTreeRoot(COSDictionary elem) {
    if (!elem.containsKey(COSName.TYPE)) {
        return false;
    }
    COSName declaredType = (COSName) elem.getDictionaryObject(COSName.TYPE);
    return declaredType.equals(COSName.STRUCT_TREE_ROOT);
}
/**
 * Appends the collected marked-content parents, with the null slots of the original
 * parent tree array restored, to the handler's page parent tree.
 */
public void addToPageParentTreeArray() {
    for (PDFStructElem contentParent : restoreNullValuesInParentTree()) {
        logicalStructHandler.getPageParentTree().add(contentParent);
    }
}
/**
 * Builds the list to append to the page parent tree: one entry per slot of the original
 * parent tree array. A null (or COSNull) slot yields {@code null}; every other slot takes
 * the next marked-content parent in ascending MCID order. Once the parents run out,
 * remaining non-null slots produce no entry.
 *
 * @return the parents aligned with the original parent tree array
 */
private List<PDFStructElem> restoreNullValuesInParentTree() {
    List<PDFStructElem> parentsByMcid = new ArrayList<PDFStructElem>(markedContentMap.values());
    // The result has at most one entry per original slot, so size it from that array.
    List<PDFStructElem> complete = new ArrayList<PDFStructElem>(originalParentTree.size());
    // Walk the parents with a cursor; removing the head of an ArrayList for every slot
    // shifts all remaining entries each time and makes the loop quadratic.
    int next = 0;
    for (COSBase base : originalParentTree) {
        if (base == null || base instanceof COSNull) {
            complete.add(null);
        } else if (next < parentsByMcid.size()) {
            complete.add(parentsByMcid.get(next));
            next++;
        }
    }
    return complete;
}
/**
 * Obtains the next parent tree key, stores it as the StructParent entry of the referenced
 * object (when a reference is given) and maps the key to {@code elem} in the parent tree.
 *
 * @param obj reference to the cloned object, or {@code null} if there is none
 * @param elem structure element to register under the new key
 */
private void updateStructParentAndAddToPageParentTree(PDFReference obj, PDFStructElem elem) {
    int parentTreeKey = logicalStructHandler.getNextParentTreeKey();
    if (obj != null) {
        PDFObject referenced = obj.getObject();
        assert referenced instanceof PDFDictionary;
        ((PDFDictionary) referenced).put(COSName.STRUCT_PARENT.getName(), parentTreeKey);
    }
    logicalStructHandler.getParentTree().addToNums(parentTreeKey, elem);
}
/**
 * Starts the leaf search below a source structure element by walking its /K entry.
 * Anything that is not a COSObject is ignored.
 *
 * @param entry source structure element
 * @throws IOException if copying a found element fails
 */
private void findLeafNodesInPageFromStructElemObjects(COSBase entry) throws IOException {
    if (!(entry instanceof COSObject)) {
        return;
    }
    COSObject elemRef = (COSObject) entry;
    COSDictionary elemDict = (COSDictionary) elemRef.getObject();
    findLeafKids(elemDict.getItem(COSName.K), elemRef);
}
/**
 * Walks a source /K value looking for content leaves that belong to the source page
 * and whose owning element has not been copied yet.
 * <ul>
 * <li>Arrays are walked entry by entry.</li>
 * <li>An object reference resolving to a dictionary with both /S and /K is descended
 *     into, becoming the new owner; any other resolved dictionary is examined as a
 *     leaf of the current owner.</li>
 * <li>A dictionary leaf on the source page causes the owner to be created (if not
 *     cached), queued in {@code extra} and given its kids.</li>
 * <li>Any other value is expected to be an integer; if the owner passes the page check
 *     and is not cached, it is created and given its kids (it is not queued in
 *     {@code extra}).</li>
 * </ul>
 *
 * @param kid source /K value or one of its entries
 * @param parent source element owning {@code kid}
 * @throws IOException if copying fails
 */
private void findLeafKids(COSBase kid, COSObject parent) throws IOException {
    if (kid instanceof COSArray) {
        for (COSBase member : (COSArray) kid) {
            findLeafKids(member, parent);
        }
        return;
    }

    if (kid instanceof COSObject) {
        COSObject kidRef = (COSObject) kid;
        COSDictionary resolved = (COSDictionary) kidRef.getObject();
        boolean isElementWithKids = resolved.getDictionaryObject(COSName.S) != null
                && resolved.getItem(COSName.K) != null;
        if (isElementWithKids) {
            findLeafKids(resolved.getItem(COSName.K), kidRef);
        } else {
            findLeafKids(resolved, parent);
        }
        return;
    }

    if (kid instanceof COSDictionary) {
        COSDictionary ownerDict = (COSDictionary) parent.getObject();
        if (!isElementFromSourcePage((COSDictionary) kid, ownerDict)) {
            return;
        }
        if (structElemCache.get((int) parent.getObjectNumber()) != null) {
            return;
        }
        PDFStructElem created = createAndRegisterStructElem(parent);
        copyElemEntries(parent, created);
        extra.add(parent);
        createKids(kid, ownerDict, created, false);
        return;
    }

    assert kid instanceof COSInteger;
    COSDictionary ownerDict = (COSDictionary) parent.getObject();
    if (!checkPageEntryInAncestorsRecursively(ownerDict)) {
        return;
    }
    if (structElemCache.get((int) parent.getObjectNumber()) != null) {
        return;
    }
    PDFStructElem created = createAndRegisterStructElem(parent);
    copyElemEntries(parent, created);
    createKids(kid, ownerDict, created, false);
}
/**
 * Adds one null kid to {@code elem} for every entry of the source /K array.
 * Does nothing when /K is not an array.
 *
 * @param elem element receiving the placeholders
 * @param baseElem source dictionary whose /K entry is inspected
 */
private void fillKidsWithNull(PDFStructElem elem, COSDictionary baseElem) {
    COSBase sourceKids = baseElem.getItem(COSName.K);
    if (!(sourceKids instanceof COSArray)) {
        return;
    }
    for (int remaining = ((COSArray) sourceKids).size(); remaining > 0; remaining--) {
        elem.addKid(null);
    }
}
/**
 * Tells whether a source element has the given structure type, either directly through
 * its /S name or because its /S name is among the role map keys that
 * StructureTreeMergerUtil.findRoleMapKeyByValue returns for that type.
 *
 * @param elemDictionary source element dictionary; must carry an /S name
 * @param type structure type to test for
 * @return {@code true} if the element has that type
 */
private boolean checkIfStructureTypeIsPresent(COSDictionary elemDictionary, String type) {
    COSName declared = (COSName) elemDictionary.getDictionaryObject(COSName.S);
    String declaredName = declared.getName();
    // The role map is consulted only when the direct comparison fails.
    return type.equals(declaredName)
            || StructureTreeMergerUtil.findRoleMapKeyByValue(type, roleMap).contains(declaredName);
}
/**
 * Returns a new array holding only those entries of the given array that are
 * COSObject instances, in their original order. The input array is not modified.
 *
 * @param pageParentTreeArray array to filter
 * @return a new array with the COSObject entries
 */
private COSArray removeNonCOSObjects(COSArray pageParentTreeArray) {
    COSArray onlyObjects = new COSArray();
    for (COSBase candidate : pageParentTreeArray) {
        if (!(candidate instanceof COSObject)) {
            continue;
        }
        onlyObjects.add((COSObject) candidate);
    }
    return onlyObjects;
}
/**
 * Adds a kid number built from a fresh PDFNumber (offset by {@code currentMCID}) to the
 * current session element, records the session element for that MCID, and then runs
 * {@link #addToPageParentTreeArray()}.
 */
public void setCurrentSessionElemKid() {
    createKidEntryFromInt(new PDFNumber(), currentSessionElem);
    addToPageParentTreeArray();
}
}