blob: 433804a51de5db47695018e6c2a0cc3319dd89bf [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft.ooxml;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.zip.ZipException;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackagePartName;
import org.apache.poi.openxml4j.opc.PackageRelationship;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.openxml4j.opc.PackagingURIHelper;
import org.apache.poi.openxml4j.opc.TargetMode;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xslf.usermodel.XSLFRelation;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.OfflineContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.ExceptionUtils;
import org.apache.tika.utils.XMLReaderUtils;
/**
* SAX/Streaming pptx extractor
*/
public class SXSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
/**
 * Relationship type URI used to look up handout master parts related to the
 * main presentation part.
 */
private final static String HANDOUT_MASTER =
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/handoutMaster";
//a pptx file should have one of these "main story" parts
//NOTE: despite the name, every entry is a content type (getContentType()), not a
//relationship type. The constructor tries them in this order and keeps the first
//part it finds.
private final static String[] MAIN_STORY_PART_RELATIONS =
new String[]{XSLFRelation.MAIN.getContentType(),
XSLFRelation.PRESENTATION_MACRO.getContentType(),
XSLFRelation.PRESENTATIONML.getContentType(),
XSLFRelation.PRESENTATIONML_TEMPLATE.getContentType(),
XSLFRelation.MACRO.getContentType(),
XSLFRelation.MACRO_TEMPLATE.getContentType(),
XSLFRelation.THEME_MANAGER.getContentType()
//TODO: what else
};
//package obtained from the extractor handed to the constructor
private final OPCPackage opcPackage;
//parse context; also handed to XMLReaderUtils.parseSAX when parsing parts
private final ParseContext context;
//receives TIKA_META_EXCEPTION_WARNING entries for recoverable per-part failures
private final Metadata metadata;
//filled by XSLFCommentAuthorHandler, read by XSLFCommentsHandler
private final CommentAuthors commentAuthors = new CommentAuthors();
//first part matching MAIN_STORY_PART_RELATIONS; stays null if the package has none
private PackagePart mainDocument = null;
/**
 * Creates the decorator and locates the main presentation part.
 * <p>
 * The candidate content types in {@code MAIN_STORY_PART_RELATIONS} are tried in
 * declaration order; the first part of the first content type that has any parts
 * becomes the main document. If nothing matches, the main document stays
 * {@code null}.
 *
 * @param metadata  metadata object that receives warnings raised while extracting
 * @param context   parse context passed on to the superclass and to the SAX parser
 * @param extractor event based extractor that supplies the OPC package
 */
public SXSLFPowerPointExtractorDecorator(Metadata metadata, ParseContext context,
        XSLFEventBasedPowerPointExtractor extractor) {
    super(context, extractor);
    this.metadata = metadata;
    this.context = context;
    this.opcPackage = extractor.getPackage();

    for (String candidateType : MAIN_STORY_PART_RELATIONS) {
        List<PackagePart> matches = opcPackage.getPartsByContentType(candidateType);
        if (matches.isEmpty()) {
            continue;
        }
        //first hit wins
        mainDocument = matches.get(0);
        break;
    }
    //TODO: if mainDocument == null, throw exception
}
/**
 * Writes the presentation content to the given handler: comment authors are
 * loaded first, then every slide related to the main document is processed in
 * relationship order and, when slide master content is enabled in the config,
 * the slide masters and handout masters are appended.
 * <p>
 * An {@link InvalidFormatException} or {@link ZipException} raised for a single
 * slide is recorded as a warning in the metadata and the remaining slides are
 * still processed.
 *
 * @param xhtml handler that receives the extracted content
 * @throws SAXException if writing to the handler fails
 * @throws IOException  if reading a part fails with something other than a
 *                      {@link ZipException}
 * @see XSLFPowerPointExtractor#getText()
 */
protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, IOException {
    loadCommentAuthors();

    PackageRelationshipCollection slideRels;
    try {
        slideRels = mainDocument.getRelationshipsByType(XSLFRelation.SLIDE.getRelation());
    } catch (InvalidFormatException e) {
        metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
                ExceptionUtils.getStackTrace(e));
        slideRels = null;
    }

    int slideCount = (slideRels == null) ? 0 : slideRels.size();
    for (int idx = 0; idx < slideCount; idx++) {
        try {
            PackagePart slide = mainDocument.getRelatedPart(slideRels.getRelationship(idx));
            handleSlidePart(slide, xhtml);
        } catch (InvalidFormatException | ZipException e) {
            //one bad slide must not prevent the others from being extracted
            metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
                    ExceptionUtils.getStackTrace(e));
        }
    }

    if (!config.isIncludeSlideMasterContent()) {
        return;
    }

    //placeholder shapes on the slide master are filtered out by PlaceHolderSkipper
    PlaceHolderSkipper slideMasterHandler = new PlaceHolderSkipper(
            new OOXMLWordAndPowerPointTextHandler(
                    new OOXMLTikaBodyPartHandler(xhtml),
                    new HashMap<>()));
    handleGeneralTextContainingPart(XSLFRelation.SLIDE_MASTER.getRelation(), "slide-master",
            mainDocument, metadata, slideMasterHandler);

    OOXMLWordAndPowerPointTextHandler handoutMasterHandler =
            new OOXMLWordAndPowerPointTextHandler(new OOXMLTikaBodyPartHandler(xhtml),
                    new HashMap<>());
    handleGeneralTextContainingPart(HANDOUT_MASTER, "slide-handout-master", mainDocument,
            metadata, handoutMasterHandler);
}
/**
 * Parses every comment-authors part related to the main document so that
 * {@code commentAuthors} is populated before slide comments are rendered.
 * <p>
 * This is best effort: any failure to resolve or parse a part is recorded as a
 * warning in the metadata and does not propagate.
 */
private void loadCommentAuthors() {
    PackageRelationshipCollection authorRels;
    try {
        authorRels =
                mainDocument.getRelationshipsByType(XSLFRelation.COMMENT_AUTHORS.getRelation());
    } catch (InvalidFormatException e) {
        metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
                ExceptionUtils.getStackTrace(e));
        return;
    }
    if (authorRels == null) {
        return;
    }

    final int total = authorRels.size();
    for (int idx = 0; idx < total; idx++) {
        PackagePart authorsPart;
        try {
            authorsPart = mainDocument.getRelatedPart(authorRels.getRelationship(idx));
        } catch (InvalidFormatException e) {
            metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
                    ExceptionUtils.getStackTrace(e));
            continue;
        }
        if (authorsPart == null) {
            continue;
        }

        try (InputStream in = authorsPart.getInputStream()) {
            //the parser gets a shielded stream; try-with-resources does the real close
            XMLReaderUtils.parseSAX(new CloseShieldInputStream(in),
                    new OfflineContentHandler(new XSLFCommentAuthorHandler()), context);
        } catch (TikaException | SAXException | IOException e) {
            metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
                    ExceptionUtils.getStackTrace(e));
        }
    }
}
/**
 * Extracts a single slide and the parts hanging off it.
 * <p>
 * The slide body is written inside a {@code div} with class
 * {@code slide-content}. Afterwards, in this order: the slide layout (only when
 * slide master content is enabled), the notes and notes master (only when slide
 * notes are enabled; the notes master additionally requires slide master
 * content), the comments, the diagram data and the charts.
 * <p>
 * A {@link TikaException} or {@link IOException} raised while parsing the slide
 * body is recorded as a warning in the metadata; a {@link SAXException}
 * propagates.
 *
 * @param slidePart the slide to process
 * @param xhtml     handler that receives the extracted content
 * @throws IOException  declared for callers; raised by the helpers invoked here
 * @throws SAXException if a handler fails
 */
private void handleSlidePart(PackagePart slidePart, XHTMLContentHandler xhtml)
        throws IOException, SAXException {
    Map<String, String> linkedRelationships =
            loadLinkedRelationships(slidePart, false, metadata);

    xhtml.startElement("div", "class", "slide-content");
    try (InputStream in = slidePart.getInputStream()) {
        OfflineContentHandler slideHandler = new OfflineContentHandler(
                new EmbeddedContentHandler(new OOXMLWordAndPowerPointTextHandler(
                        new OOXMLTikaBodyPartHandler(xhtml), linkedRelationships)));
        XMLReaderUtils.parseSAX(new CloseShieldInputStream(in), slideHandler, context);
    } catch (TikaException | IOException e) {
        metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
                ExceptionUtils.getStackTrace(e));
    }
    xhtml.endElement("div");

    final boolean withMasters = config.isIncludeSlideMasterContent();
    if (withMasters) {
        //layout placeholders are filtered out by PlaceHolderSkipper
        PlaceHolderSkipper layoutHandler = new PlaceHolderSkipper(
                new OOXMLWordAndPowerPointTextHandler(
                        new OOXMLTikaBodyPartHandler(xhtml), linkedRelationships));
        handleGeneralTextContainingPart(XSLFRelation.SLIDE_LAYOUT.getRelation(),
                "slide-master-content", slidePart, metadata, layoutHandler);
    }

    if (config.isIncludeSlideNotes()) {
        OOXMLWordAndPowerPointTextHandler notesHandler =
                new OOXMLWordAndPowerPointTextHandler(new OOXMLTikaBodyPartHandler(xhtml),
                        linkedRelationships);
        handleGeneralTextContainingPart(XSLFRelation.NOTES.getRelation(), "slide-notes",
                slidePart, metadata, notesHandler);

        if (config.isIncludeSlideMasterContent()) {
            OOXMLWordAndPowerPointTextHandler notesMasterHandler =
                    new OOXMLWordAndPowerPointTextHandler(
                            new OOXMLTikaBodyPartHandler(xhtml), linkedRelationships);
            handleGeneralTextContainingPart(XSLFRelation.NOTES_MASTER.getRelation(),
                    "slide-notes-master", slidePart, metadata, notesMasterHandler);
        }
    }

    XSLFCommentsHandler commentsHandler = new XSLFCommentsHandler(xhtml);
    handleGeneralTextContainingPart(XSLFRelation.COMMENTS.getRelation(), null, slidePart,
            metadata, commentsHandler);

    OOXMLWordAndPowerPointTextHandler diagramHandler =
            new OOXMLWordAndPowerPointTextHandler(new OOXMLTikaBodyPartHandler(xhtml),
                    linkedRelationships);
    handleGeneralTextContainingPart(AbstractOOXMLExtractor.RELATION_DIAGRAM_DATA,
            "diagram-data", slidePart, metadata, diagramHandler);

    OOXMLWordAndPowerPointTextHandler chartHandler =
            new OOXMLWordAndPowerPointTextHandler(new OOXMLTikaBodyPartHandler(xhtml),
                    linkedRelationships);
    handleGeneralTextContainingPart(XSLFRelation.CHART.getRelation(), "chart", slidePart,
            metadata, chartHandler);
}
/**
 * In PowerPoint files, slides have things embedded in them,
 * and slide drawings which have the images.
 * <p>
 * The returned list contains, in order: for every slide related to the main
 * document the parts collected by {@code addSlideParts} (ending with the slide
 * itself), then the main document, then every slide master and handout master
 * part related to the main document.
 * <p>
 * Lookup failures are recorded as warnings in the metadata. A slide whose part
 * cannot be resolved is skipped instead of aborting the whole collection.
 *
 * @return the parts to scan for embedded content
 */
@Override
protected List<PackagePart> getMainDocumentParts() {
    List<PackagePart> parts = new ArrayList<>();
    //TODO: consider: getPackage().getPartsByName(Pattern.compile("/ppt/embeddings/.*?
    //TODO: consider: getPackage().getPartsByName(Pattern.compile("/ppt/media/.*?
    PackageRelationshipCollection slidePRC = null;
    try {
        slidePRC = mainDocument.getRelationshipsByType(XSLFRelation.SLIDE.getRelation());
    } catch (InvalidFormatException e) {
        metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
                ExceptionUtils.getStackTrace(e));
    }
    if (slidePRC != null) {
        for (int i = 0; i < slidePRC.size(); i++) {
            PackagePart slidePart = null;
            try {
                slidePart = mainDocument.getRelatedPart(slidePRC.getRelationship(i));
            } catch (InvalidFormatException e) {
                metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
                        ExceptionUtils.getStackTrace(e));
            }
            //when the lookup above failed the problem is already recorded; skip the
            //slide rather than pass null to addSlideParts, which dereferences it
            if (slidePart != null) {
                addSlideParts(slidePart, parts);
            }
        }
    }
    parts.add(mainDocument);
    for (String rel : new String[]{XSLFRelation.SLIDE_MASTER.getRelation(), HANDOUT_MASTER}) {
        PackageRelationshipCollection prc = null;
        try {
            prc = mainDocument.getRelationshipsByType(rel);
        } catch (InvalidFormatException e) {
            metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
                    ExceptionUtils.getStackTrace(e));
        }
        if (prc != null) {
            for (int i = 0; i < prc.size(); i++) {
                PackagePart pp = null;
                try {
                    pp = mainDocument.getRelatedPart(prc.getRelationship(i));
                } catch (InvalidFormatException e) {
                    metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
                            ExceptionUtils.getStackTrace(e));
                }
                if (pp != null) {
                    parts.add(pp);
                }
            }
        }
    }
    return parts;
}
/**
 * Appends to {@code parts} the internal targets of the slide's VML drawing,
 * slide layout, notes master and notes relationships (in that order), followed
 * by the slide itself.
 * <p>
 * Failures to read relationships or to build a part name are recorded as
 * warnings in the metadata. A {@code null} slide is ignored, and relationship
 * targets that do not resolve to a part in the package are skipped so that the
 * list never receives {@code null} entries from this method.
 *
 * @param slidePart the slide whose related parts should be collected; may be
 *                  {@code null}, in which case nothing is added
 * @param parts     list that receives the collected parts
 */
private void addSlideParts(PackagePart slidePart, List<PackagePart> parts) {
    if (slidePart == null) {
        //nothing to collect; callers may end up here after a failed lookup
        return;
    }
    for (String relation : new String[]{XSLFRelation.VML_DRAWING.getRelation(),
            XSLFRelation.SLIDE_LAYOUT.getRelation(), XSLFRelation.NOTES_MASTER.getRelation(),
            XSLFRelation.NOTES.getRelation()}) {
        PackageRelationshipCollection prc = null;
        try {
            prc = slidePart.getRelationshipsByType(relation);
        } catch (InvalidFormatException e) {
            metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
                    ExceptionUtils.getStackTrace(e));
        }
        if (prc == null) {
            continue;
        }
        for (PackageRelationship packageRelationship : prc) {
            //external targets live outside the package and cannot be resolved to a part
            if (packageRelationship.getTargetMode() != TargetMode.INTERNAL) {
                continue;
            }
            PackagePartName relName = null;
            try {
                relName = PackagingURIHelper
                        .createPartName(packageRelationship.getTargetURI());
            } catch (InvalidFormatException e) {
                metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
                        ExceptionUtils.getStackTrace(e));
            }
            if (relName == null) {
                continue;
            }
            //a relationship may point at a part that is missing from the package
            PackagePart target = packageRelationship.getPackage().getPart(relName);
            if (target != null) {
                parts.add(target);
            }
        }
    }
    //and slide of course
    parts.add(slidePart);
}
/**
 * Handler that forwards SAX events to a wrapped handler but drops everything
 * between the start of an element whose local name is {@code ph} and the end of
 * the next element whose local name is {@code sp}. Both of those boundary
 * events are dropped as well.
 */
private static class PlaceHolderSkipper extends DefaultHandler {
    private final ContentHandler delegate;
    //true while events are being dropped
    boolean suppressing = false;

    PlaceHolderSkipper(ContentHandler wrappedHandler) {
        this.delegate = wrappedHandler;
    }

    @Override
    public void startElement(String uri, String localName, String qName, Attributes atts)
            throws SAXException {
        if ("ph".equals(localName)) {
            suppressing = true;
        }
        if (suppressing) {
            return;
        }
        delegate.startElement(uri, localName, qName, atts);
    }

    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        //forward first: the closing sp of a suppressed shape must itself be dropped
        if (!suppressing) {
            delegate.endElement(uri, localName, qName);
        }
        if ("sp".equals(localName)) {
            suppressing = false;
        }
    }

    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        if (suppressing) {
            return;
        }
        delegate.characters(ch, start, length);
    }

    @Override
    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
        if (suppressing) {
            return;
        }
        //NOTE(review): forwarded as characters(), not ignorableWhitespace() --
        //confirm this is intentional
        delegate.characters(ch, start, length);
    }
}
/**
 * Handler for a slide comments part. For every {@code cm} element it writes a
 * {@code p} with class {@code slide-comment} containing the author (name and/or
 * initials in a {@code b} element, when known to {@code commentAuthors})
 * followed by all character data seen since the previous comment ended.
 */
private class XSLFCommentsHandler extends DefaultHandler {
    private String commentAuthorId = null;
    private StringBuilder commentBuffer = new StringBuilder();
    private XHTMLContentHandler xhtml;

    XSLFCommentsHandler(XHTMLContentHandler xhtml) {
        this.xhtml = xhtml;
    }

    @Override
    public void startElement(String uri, String localName, String qName, Attributes atts)
            throws SAXException {
        if (!"cm".equals(localName)) {
            return;
        }
        commentAuthorId = atts.getValue("", "authorId");
        //get date (dt)?
    }

    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        //TODO: require that we're in <p:text>?
        commentBuffer.append(ch, start, length);
    }

    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        if (!"cm".equals(localName)) {
            return;
        }
        xhtml.startElement("p", "class", "slide-comment");

        String name = commentAuthors.getName(commentAuthorId);
        String initials = commentAuthors.getInitials(commentAuthorId);
        boolean hasName = name != null;
        boolean hasInitials = initials != null;
        if (hasName || hasInitials) {
            xhtml.startElement("b");
            if (hasName && hasInitials) {
                //rendered as: name (initials)
                xhtml.characters(name);
                xhtml.characters(" (");
                xhtml.characters(initials);
                xhtml.characters(")");
            } else if (hasName) {
                xhtml.characters(name);
            } else {
                xhtml.characters(initials);
            }
            xhtml.endElement("b");
        }

        xhtml.characters(commentBuffer.toString());
        xhtml.endElement("p");

        //reset state for the next comment
        commentBuffer.setLength(0);
        commentAuthorId = null;
    }
}
/**
 * Handler for a comment-authors part. Every {@code cmAuthor} element has its
 * {@code id}, {@code name} and {@code initials} attributes (matched by local
 * name) registered with {@code commentAuthors}.
 */
private class XSLFCommentAuthorHandler extends DefaultHandler {
    @Override
    public void startElement(String uri, String localName, String qName, Attributes atts)
            throws SAXException {
        if (!"cmAuthor".equals(localName)) {
            return;
        }
        String authorId = null;
        String authorName = null;
        String authorInitials = null;
        final int attrCount = atts.getLength();
        for (int idx = 0; idx < attrCount; idx++) {
            String attrName = atts.getLocalName(idx);
            if ("id".equals(attrName)) {
                authorId = atts.getValue(idx);
            } else if ("name".equals(attrName)) {
                authorName = atts.getValue(idx);
            } else if ("initials".equals(attrName)) {
                authorInitials = atts.getValue(idx);
            }
        }
        commentAuthors.add(authorId, authorName, authorInitials);
    }
}
/**
 * Lookup of comment author names and initials keyed by author id.
 */
private static class CommentAuthors {
    Map<String, String> nameMap = new HashMap<>();
    Map<String, String> initialMap = new HashMap<>();

    /**
     * Registers an author. Nothing is stored when {@code id} is null; a null
     * {@code name} or {@code initials} leaves any existing value for that id
     * untouched.
     */
    void add(String id, String name, String initials) {
        if (id != null) {
            if (name != null) {
                nameMap.put(id, name);
            }
            if (initials != null) {
                initialMap.put(id, initials);
            }
        }
    }

    /**
     * @return the name stored for {@code id}, or null if the id is null or unknown
     */
    String getName(String id) {
        return (id == null) ? null : nameMap.get(id);
    }

    /**
     * @return the initials stored for {@code id}, or null if the id is null or unknown
     */
    String getInitials(String id) {
        return (id == null) ? null : initialMap.get(id);
    }
}
}