blob: 7d1797bb41da6b4d31bdb966f368bc2c4fc8220a [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.iwork;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import javax.xml.namespace.QName;
import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipFile;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.detect.XmlRootExtractor;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.OfflineContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.XMLReaderUtils;
/**
* A parser for the IWork container files. This includes *.key, *.pages and *.numbers files.
* This parser delegates the relevant entries to a {@link ContentHandler} that parses the content.
* <p>
* Currently supported formats:
* <ol>
* <li>Keynote format version 2.x. Currently only tested with Keynote version 5.x
* <li>Pages format version 1.x. Currently only tested with Pages version 4.0.x
* <li>Numbers format version 1.x. Currently only tested with Numbers version 2.0.x
* </ol>
*/
public class IWorkPackageParser extends AbstractParser {
/**
* Which files within an iWork file contain the actual content?
*/
public final static Set<String> IWORK_CONTENT_ENTRIES = Collections.unmodifiableSet(
new HashSet<>(Arrays.asList("index.apxl", "index.xml", "presentation.apxl")));
/**
* All iWork files contain one of these, so we can detect based on it
*/
public final static String IWORK_COMMON_ENTRY = "buildVersionHistory.plist";
/**
* Serial version UID
*/
private static final long serialVersionUID = -2160322853809682372L;
/**
* This parser handles all iWorks formats.
*/
private final static Set<MediaType> supportedTypes = Collections.unmodifiableSet(
new HashSet<>(Arrays.asList(MediaType.application("vnd.apple.iwork"),
IWORKDocumentType.KEYNOTE.getType(), IWORKDocumentType.NUMBERS.getType(),
IWORKDocumentType.PAGES.getType())));
public Set<MediaType> getSupportedTypes(ParseContext context) {
return supportedTypes;
}
public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
ParseContext context) throws IOException, SAXException, TikaException {
ZipArchiveInputStream zip = new ZipArchiveInputStream(stream);
ZipArchiveEntry entry = zip.getNextZipEntry();
while (entry != null) {
if (!IWORK_CONTENT_ENTRIES.contains(entry.getName())) {
entry = zip.getNextZipEntry();
continue;
}
InputStream entryStream = new BufferedInputStream(zip, 4096);
entryStream.mark(4096);
IWORKDocumentType type = IWORKDocumentType.detectType(entryStream);
entryStream.reset();
if (type != null) {
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
ContentHandler contentHandler;
switch (type) {
case KEYNOTE:
contentHandler = new KeynoteContentHandler(xhtml, metadata);
break;
case NUMBERS:
contentHandler = new NumbersContentHandler(xhtml, metadata);
break;
case PAGES:
contentHandler = new PagesContentHandler(xhtml, metadata);
break;
case ENCRYPTED:
// We can't do anything for the file right now
contentHandler = null;
break;
default:
throw new TikaException("Unhandled iWorks file " + type);
}
metadata.add(Metadata.CONTENT_TYPE, type.getType().toString());
xhtml.startDocument();
if (contentHandler != null) {
XMLReaderUtils.parseSAX(new CloseShieldInputStream(entryStream),
new OfflineContentHandler(contentHandler), context);
}
xhtml.endDocument();
}
entry = zip.getNextZipEntry();
}
// Don't close the zip InputStream (TIKA-1117).
}
public enum IWORKDocumentType {
KEYNOTE("http://developer.apple.com/namespaces/keynote2", "presentation",
MediaType.application("vnd.apple.keynote")),
NUMBERS("http://developer.apple.com/namespaces/ls", "document",
MediaType.application("vnd.apple.numbers")),
PAGES("http://developer.apple.com/namespaces/sl", "document",
MediaType.application("vnd.apple.pages")),
ENCRYPTED(null, null, MediaType.application("x-tika-iworks-protected"));
private final String namespace;
private final String part;
private final MediaType type;
IWORKDocumentType(String namespace, String part, MediaType type) {
this.namespace = namespace;
this.part = part;
this.type = type;
}
public static IWORKDocumentType detectType(ZipArchiveEntry entry, ZipFile zip) {
try {
if (entry == null) {
return null;
}
try (InputStream stream = zip.getInputStream(entry)) {
return detectType(stream);
}
} catch (IOException e) {
return null;
}
}
public static IWORKDocumentType detectType(ZipArchiveEntry entry,
ZipArchiveInputStream zip) {
if (entry == null) {
return null;
}
return detectType(zip);
}
public static IWORKDocumentType detectType(InputStream stream) {
QName qname = new XmlRootExtractor().extractRootElement(stream);
if (qname != null) {
String uri = qname.getNamespaceURI();
String local = qname.getLocalPart();
for (IWORKDocumentType type : values()) {
if (type.getNamespace().equals(uri) && type.getPart().equals(local)) {
return type;
}
}
} else {
// There was a problem with extracting the root type
// Password Protected iWorks files are funny, but we can usually
// spot them because they encrypt part of the zip stream
try {
stream.read();
} catch (UnsupportedZipFeatureException e) {
// Compression field was likely encrypted
return ENCRYPTED;
} catch (Exception ignored) {
}
}
return null;
}
public String getNamespace() {
return namespace;
}
public String getPart() {
return part;
}
public MediaType getType() {
return type;
}
}
}