blob: 7994046091c8dcba6e1b9ef3c35be2fe48cd6ea2 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft.ooxml;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import javax.xml.namespace.QName;
import org.apache.poi.common.usermodel.Hyperlink;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackagePartName;
import org.apache.poi.openxml4j.opc.PackageRelationship;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.openxml4j.opc.PackagingURIHelper;
import org.apache.poi.openxml4j.opc.TargetMode;
import org.apache.poi.sl.usermodel.Placeholder;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFComment;
import org.apache.poi.xslf.usermodel.XSLFCommentAuthors;
import org.apache.poi.xslf.usermodel.XSLFGraphicFrame;
import org.apache.poi.xslf.usermodel.XSLFGroupShape;
import org.apache.poi.xslf.usermodel.XSLFHyperlink;
import org.apache.poi.xslf.usermodel.XSLFNotes;
import org.apache.poi.xslf.usermodel.XSLFNotesMaster;
import org.apache.poi.xslf.usermodel.XSLFPictureShape;
import org.apache.poi.xslf.usermodel.XSLFRelation;
import org.apache.poi.xslf.usermodel.XSLFShape;
import org.apache.poi.xslf.usermodel.XSLFSheet;
import org.apache.poi.xslf.usermodel.XSLFSlide;
import org.apache.poi.xslf.usermodel.XSLFSlideLayout;
import org.apache.poi.xslf.usermodel.XSLFSlideShow;
import org.apache.poi.xslf.usermodel.XSLFTable;
import org.apache.poi.xslf.usermodel.XSLFTableCell;
import org.apache.poi.xslf.usermodel.XSLFTableRow;
import org.apache.poi.xslf.usermodel.XSLFTextParagraph;
import org.apache.poi.xslf.usermodel.XSLFTextRun;
import org.apache.poi.xslf.usermodel.XSLFTextShape;
import org.apache.xmlbeans.XmlException;
import org.apache.xmlbeans.XmlObject;
import org.openxmlformats.schemas.presentationml.x2006.main.CTPicture;
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdList;
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
/** Relationship type used to look up handout master parts from the main document part. */
private static final String HANDOUT_MASTER =
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/handoutMaster";

/**
 * Metadata passed along when the diagram-data and chart parts of a slide are handled.
 * Assigned exactly once, in the constructor.
 */
private final Metadata metadata;

/**
 * Creates a decorator around the given POI PowerPoint extractor.
 *
 * @param metadata  metadata object handed to the handling of diagram and chart parts
 * @param context   parse context, forwarded to the superclass constructor
 * @param extractor POI extractor, forwarded to the superclass constructor
 */
public XSLFPowerPointExtractorDecorator(Metadata metadata, ParseContext context,
        XSLFPowerPointExtractor extractor) {
    super(context, extractor);
    this.metadata = metadata;
}
/**
 * Creates a decorator backed by a newly created {@link Metadata} instance.
 *
 * @param context   parse context, passed through to the three-argument constructor
 * @param extractor POI PowerPoint extractor, passed through to the three-argument constructor
 * @deprecated use {@link XSLFPowerPointExtractorDecorator#XSLFPowerPointExtractorDecorator(Metadata,
 * ParseContext, XSLFPowerPointExtractor)} so that the caller's metadata object is used
 */
@Deprecated
public XSLFPowerPointExtractorDecorator(ParseContext context,
XSLFPowerPointExtractor extractor) {
this(new Metadata(), context, extractor);
}
/**
 * Emits the XHTML for every slide of the presentation, in slide order.
 * <p>
 * For each slide the output is, in this order:
 * <ol>
 * <li>a {@code slide-content} div holding the slide's own shapes;</li>
 * <li>if the config includes slide master content: a {@code slide-master-content} div
 *     holding the slide layout's shapes, followed (after that div has been closed) by the
 *     shapes of the layout's master sheet, placeholders skipped in both cases;</li>
 * <li>if the config includes slide notes and the slide has notes: a {@code slide-notes}
 *     div holding the notes shapes and, when present, the notes master's shapes;</li>
 * <li>one {@code slide-comment} paragraph per comment, with the author and initials in
 *     bold when available;</li>
 * <li>the text of the diagram-data and chart parts related to the slide.</li>
 * </ol>
 *
 * @param xhtml handler that receives the generated XHTML events
 * @throws SAXException if writing to {@code xhtml} fails
 * @throws IOException  declared by the signature; presumably raised while related parts
 *                      are read -- TODO confirm against the superclass helpers
 * @see org.apache.poi.xslf.extractor.XSLFPowerPointExtractor#getText()
 */
protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, IOException {
    XMLSlideShow slideShow = (XMLSlideShow) extractor.getDocument();

    List<XSLFSlide> slides = slideShow.getSlides();
    for (XSLFSlide slide : slides) {
        // Prefix built from the slide's part name; it is prepended to the relationship
        // ids of embedded objects and pictures written for this slide.
        String slideDesc;
        if (slide.getPackagePart() != null && slide.getPackagePart().getPartName() != null) {
            slideDesc = getJustFileName(slide.getPackagePart().getPartName().toString());
            slideDesc += "_";
        } else {
            slideDesc = null;
        }

        // slide content
        xhtml.startElement("div", "class", "slide-content");
        extractContent(slide.getShapes(), false, xhtml, slideDesc);
        xhtml.endElement("div");

        if (config.isIncludeSlideMasterContent()) {
            // slide layout which is the master sheet for this slide
            xhtml.startElement("div", "class", "slide-master-content");
            XSLFSlideLayout slideLayout = slide.getMasterSheet();
            extractContent(slideLayout.getShapes(), true, xhtml, null);
            xhtml.endElement("div");

            // slide master which is the master sheet for all text layouts;
            // note that it is written after the slide-master-content div is closed
            XSLFSheet slideMaster = slideLayout.getMasterSheet();
            extractContent(slideMaster.getShapes(), true, xhtml, null);
        }

        if (config.isIncludeSlideNotes()) {
            // notes (if present)
            XSLFNotes slideNotes = slide.getNotes();
            if (slideNotes != null) {
                xhtml.startElement("div", "class", "slide-notes");
                extractContent(slideNotes.getShapes(), false, xhtml, slideDesc);

                // master sheet for this notes
                XSLFNotesMaster notesMaster = slideNotes.getMasterSheet();
                if (notesMaster != null) {
                    extractContent(notesMaster.getShapes(), true, xhtml, null);
                }
                xhtml.endElement("div");
            }
        }

        // comments (if present)
        List<XSLFComment> comments = slide.getComments();
        if (comments != null) {
            // one builder reused for every comment; reset at the top of each iteration
            StringBuilder authorStringBuilder = new StringBuilder();
            for (XSLFComment comment : comments) {
                authorStringBuilder.setLength(0);
                xhtml.startElement("p", "class", "slide-comment");

                if (comment.getAuthor() != null) {
                    authorStringBuilder.append(comment.getAuthor());
                }
                if (comment.getAuthorInitials() != null) {
                    if (authorStringBuilder.length() > 0) {
                        authorStringBuilder.append(" ");
                    }
                    authorStringBuilder.append("(").append(comment.getAuthorInitials())
                            .append(")");
                }
                // separator only when both an author label and a comment text exist
                if (comment.getText() != null && authorStringBuilder.length() > 0) {
                    authorStringBuilder.append(" - ");
                }
                if (authorStringBuilder.length() > 0) {
                    xhtml.startElement("b");
                    xhtml.characters(authorStringBuilder.toString());
                    xhtml.endElement("b");
                }
                xhtml.characters(comment.getText());
                xhtml.endElement("p");
            }
        }

        // now dump diagram data
        handleGeneralTextContainingPart(RELATION_DIAGRAM_DATA, "diagram-data",
                slide.getPackagePart(), metadata,
                new OOXMLWordAndPowerPointTextHandler(new OOXMLTikaBodyPartHandler(xhtml),
                        new HashMap<>()//empty
                ));

        // now dump chart data
        handleGeneralTextContainingPart(XSLFRelation.CHART.getRelation(), "chart",
                slide.getPackagePart(), metadata,
                new OOXMLWordAndPowerPointTextHandler(new OOXMLTikaBodyPartHandler(xhtml),
                        new HashMap<>()//empty
                ));
    }
}
/**
 * Writes the given shapes to {@code xhtml}, recursing into group shapes.
 * <ul>
 * <li>Text shapes: one paragraph element per text paragraph; runs that carry a hyperlink
 *     address (other than a {@code #_ftn} reference) are wrapped in an anchor element.</li>
 * <li>Group shapes: handled recursively with the same arguments.</li>
 * <li>Tables: delegated to {@link #extractTable(XSLFTable, XHTMLContentHandler)}.</li>
 * <li>Other graphic frames: an empty {@code embedded} div for every {@code p:oleObj}
 *     descendant that has a relationship id attribute.</li>
 * <li>Pictures: an empty {@code embedded} div for the embedded blip, unless
 *     {@code skipPlaceholders} is set.</li>
 * </ul>
 *
 * @param shapes           shapes to write
 * @param skipPlaceholders when {@code true}, text shapes that report a placeholder type
 *                         and all picture shapes are skipped
 * @param xhtml            handler that receives the generated XHTML events
 * @param slideDesc        prefix prepended to relationship ids in the {@code id}
 *                         attribute of embedded markers; may be {@code null} for no prefix
 * @throws SAXException if writing to {@code xhtml} fails
 */
private void extractContent(List<? extends XSLFShape> shapes, boolean skipPlaceholders,
        XHTMLContentHandler xhtml, String slideDesc) throws SAXException {
    for (XSLFShape sh : shapes) {
        if (sh instanceof XSLFTextShape) {
            XSLFTextShape txt = (XSLFTextShape) sh;
            Placeholder ph = txt.getTextType();
            if (skipPlaceholders && ph != null) {
                continue;
            }
            for (XSLFTextParagraph p : txt.getTextParagraphs()) {
                // Decide whether to skip the paragraph BEFORE opening <p>; opening it
                // first would leave an unclosed element for every skipped paragraph.
                if (!config.isIncludeHeadersAndFooters() && p.isHeaderOrFooter()) {
                    continue;
                }
                xhtml.startElement("p");
                for (XSLFTextRun run : p.getTextRuns()) {
                    //TODO: add check for targetmode=external into POI
                    //then check to confirm that the urls are actually
                    //external and not footnote refs via the current hack
                    Hyperlink hyperlink = run.getHyperlink();
                    String address = (hyperlink == null) ? null : hyperlink.getAddress();
                    boolean inHyperlink = address != null && !address.contains("#_ftn");
                    if (inHyperlink) {
                        xhtml.startElement("a", "href", address);
                    }
                    xhtml.characters(run.getRawText());
                    if (inHyperlink) {
                        xhtml.endElement("a");
                    }
                }
                xhtml.endElement("p");
            }
        } else if (sh instanceof XSLFGroupShape) {
            // recurse into groups of shapes
            XSLFGroupShape group = (XSLFGroupShape) sh;
            extractContent(group.getShapes(), skipPlaceholders, xhtml, slideDesc);
        } else if (sh instanceof XSLFTable) {
            // must be tested before XSLFGraphicFrame so tables get table markup
            //unlike tables in Word, ppt/x can't have recursive tables...I don't think
            extractTable((XSLFTable) sh, xhtml);
        } else if (sh instanceof XSLFGraphicFrame) {
            XSLFGraphicFrame frame = (XSLFGraphicFrame) sh;
            XmlObject[] sp = frame.getXmlObject().selectPath(
                    "declare namespace p='http://schemas.openxmlformats.org/presentationml/2006/main' .//*/p:oleObj");
            if (sp != null) {
                for (XmlObject emb : sp) {
                    XmlObject relIDAtt = emb.selectAttribute(new QName(
                            "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
                            "id"));
                    if (relIDAtt != null) {
                        String relID = relIDAtt.getDomNode().getNodeValue();
                        writeEmbeddedMarker(xhtml, slideDesc, relID);
                    }
                }
            }
        } else if (sh instanceof XSLFPictureShape) {
            if (!skipPlaceholders && (sh.getXmlObject() instanceof CTPicture)) {
                CTPicture ctPic = ((CTPicture) sh.getXmlObject());
                if (ctPic.getBlipFill() != null && ctPic.getBlipFill().getBlip() != null) {
                    String relID = ctPic.getBlipFill().getBlip().getEmbed();
                    if (relID != null) {
                        writeEmbeddedMarker(xhtml, slideDesc, relID);
                    }
                }
            }
        }
    }
}

/**
 * Emits an empty {@code div} with {@code class="embedded"} whose {@code id} is
 * {@code relID}, prefixed with {@code slideDesc} when that is not {@code null}.
 */
private static void writeEmbeddedMarker(XHTMLContentHandler xhtml, String slideDesc,
        String relID) throws SAXException {
    String id = (slideDesc == null) ? relID : slideDesc + relID;
    AttributesImpl attributes = new AttributesImpl();
    attributes.addAttribute("", "class", "class", "CDATA", "embedded");
    attributes.addAttribute("", "id", "id", "CDATA", id);
    xhtml.startElement("div", attributes);
    xhtml.endElement("div");
}
/**
 * Renders a table shape as an XHTML {@code table}: one {@code tr} per row and one
 * {@code td} per cell. A cell's text is wrapped in an anchor when the cell reports a
 * hyperlink with a non-null address.
 *
 * @param tbl   table shape to render
 * @param xhtml handler that receives the generated XHTML events
 * @throws SAXException if writing to {@code xhtml} fails
 */
private void extractTable(XSLFTable tbl, XHTMLContentHandler xhtml) throws SAXException {
    xhtml.startElement("table");
    for (XSLFTableRow tableRow : tbl) {
        xhtml.startElement("tr");
        for (XSLFTableCell cell : tableRow.getCells()) {
            xhtml.startElement("td");

            //TODO: Need to wait for fix in POI to test for hyperlink first
            //shouldn't need to catch NPE...
            XSLFHyperlink link;
            try {
                link = cell.getHyperlink();
            } catch (NullPointerException e) {
                // treated the same as "no hyperlink on this cell"
                link = null;
            }

            String href = (link == null) ? null : link.getAddress();
            boolean wrapInAnchor = href != null;

            if (wrapInAnchor) {
                xhtml.startElement("a", "href", href);
            }
            xhtml.characters(cell.getText());
            if (wrapInAnchor) {
                xhtml.endElement("a");
            }

            xhtml.endElement("td");
        }
        xhtml.endElement("tr");
    }
    xhtml.endElement("table");
}
/**
 * Collects the package parts that make up the presentation.
 * <p>
 * The returned list contains, in order: for every slide referenced by the document, the
 * parts gathered by {@code addSlideParts} (related drawings, layout, notes master and
 * notes, then the slide itself); the main document part (added so macros are included);
 * and the parts reached from the main document part through slide master and handout
 * master relationships.
 *
 * @return the collected parts
 * @throws TikaException if the slide show cannot be opened from the package, or if a
 *                       slide part cannot be resolved
 */
@Override
protected List<PackagePart> getMainDocumentParts() throws TikaException {
    List<PackagePart> parts = new ArrayList<>();
    XSLFSlideShow document;
    try {
        document = new XSLFSlideShow(extractor.getPackage());
    } catch (Exception e) {
        // Shouldn't happen; keep the original message but preserve the cause so the
        // underlying failure is not lost.
        throw new TikaException(e.getMessage(), e);
    }

    CTSlideIdList ctSlideIdList = document.getSlideReferences();
    if (ctSlideIdList != null) {
        for (int i = 0; i < ctSlideIdList.sizeOfSldIdArray(); i++) {
            CTSlideIdListEntry ctSlide = ctSlideIdList.getSldIdArray(i);
            // Add the slide
            PackagePart slidePart;
            try {
                slidePart = document.getSlidePart(ctSlide);
            } catch (IOException | XmlException e) {
                throw new TikaException("Broken OOXML file", e);
            }
            addSlideParts(slidePart, parts);
        }
    }

    //add full document to include macros
    PackagePart mainPart = document.getPackagePart();
    parts.add(mainPart);

    for (String rel : new String[]{XSLFRelation.SLIDE_MASTER.getRelation(), HANDOUT_MASTER}) {
        try {
            PackageRelationshipCollection prc = mainPart.getRelationshipsByType(rel);
            for (int i = 0; i < prc.size(); i++) {
                PackagePart pp = mainPart.getRelatedPart(prc.getRelationship(i));
                if (pp != null) {
                    parts.add(pp);
                }
            }
        } catch (InvalidFormatException ignored) {
            // Best effort: a relationship type that cannot be resolved is skipped and the
            // remaining types are still processed. Not logged; this class has no logger.
        }
    }
    return parts;
}
/**
 * Appends to {@code parts} the parts related to a slide, followed by the slide itself.
 * <p>
 * The related parts are the internal targets of the slide's VML drawing, slide layout,
 * notes master and notes relationships. Targets that do not resolve to a part in the
 * package are skipped rather than added as {@code null}.
 *
 * @param slidePart the slide's package part
 * @param parts     list that receives the related parts and then {@code slidePart}
 */
private void addSlideParts(PackagePart slidePart, List<PackagePart> parts) {
    for (String relation : new String[]{XSLFRelation.VML_DRAWING.getRelation(),
            XSLFRelation.SLIDE_LAYOUT.getRelation(), XSLFRelation.NOTES_MASTER.getRelation(),
            XSLFRelation.NOTES.getRelation()}) {
        try {
            for (PackageRelationship packageRelationship : slidePart
                    .getRelationshipsByType(relation)) {
                if (packageRelationship.getTargetMode() == TargetMode.INTERNAL) {
                    PackagePartName relName = PackagingURIHelper
                            .createPartName(packageRelationship.getTargetURI());
                    PackagePart related = packageRelationship.getPackage().getPart(relName);
                    // same guard as for the master parts: never hand out null entries
                    if (related != null) {
                        parts.add(related);
                    }
                }
            }
        } catch (InvalidFormatException ignored) {
            // Best effort: skip the rest of this relationship type and continue with
            // the next one.
        }
    }
    //and slide of course
    parts.add(slidePart);
}
}