blob: 54314fe1d18e7a89370db88eb6bdc1d2a450e0d1 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.pkg;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.zip.GZIPInputStream;
import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.ar.ArArchiveInputStream;
import org.apache.commons.compress.archivers.cpio.CpioArchiveInputStream;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.commons.compress.compressors.gzip.GzipUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
 * Extractor for packaging and compression formats.
 * <p>
 * The format is selected by sniffing the first two bytes of the stream.
 * Compressed streams (bzip2, gzip) are decompressed and handed to the
 * embedded document extractor as a single document; archives (zip, cpio,
 * ar, tar) are iterated and every non-directory entry is handed to the
 * embedded document extractor. Streams that match none of the known
 * signatures are treated as tar.
 */
class PackageExtractor {

    private final ContentHandler handler;

    private final Metadata metadata;

    private final EmbeddedDocumentExtractor extractor;

    /**
     * Creates an extractor that writes to the given handler and metadata.
     * The {@link EmbeddedDocumentExtractor} is taken from the parse context
     * if one is registered there, otherwise a
     * {@link ParsingEmbeddedDocumentExtractor} is created for the context.
     *
     * @param handler content handler that receives the XHTML output
     * @param metadata metadata of the package document; its content type
     *                 is set by {@link #parse(InputStream)}
     * @param context parse context
     */
    public PackageExtractor(
            ContentHandler handler, Metadata metadata, ParseContext context) {
        this.handler = handler;
        this.metadata = metadata;

        EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
        if (ex == null) {
            this.extractor = new ParsingEmbeddedDocumentExtractor(context);
        } else {
            this.extractor = ex;
        }
    }

    /**
     * Detects the packaging/compression format of the given stream, records
     * the detected content type in the metadata and extracts the contained
     * document(s).
     *
     * @param stream package or compressed stream; it is not closed by
     *               this method
     * @throws IOException if an IO error occurs
     * @throws SAXException if a SAX error occurs
     * @throws TikaException if the compressed document can not be processed
     */
    public void parse(InputStream stream)
            throws IOException, SAXException, TikaException {
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        // At the end we want to close the package/compression stream to
        // release any associated resources, but the underlying document
        // stream should not be closed
        stream = new CloseShieldInputStream(stream);

        // Capture two bytes to determine the packaging/compression format
        if (!stream.markSupported()) {
            stream = new BufferedInputStream(stream);
        }
        stream.mark(2);
        int a = stream.read();
        int b = stream.read();
        stream.reset();

        // Select decompression or unpacking mechanism based on the two bytes.
        // A short or empty stream yields -1 for a and/or b and therefore
        // ends up in the tar fallback.
        if (a == 'B' && b == 'Z') {
            metadata.set(Metadata.CONTENT_TYPE, "application/x-bzip");
            decompress(new BZip2CompressorInputStream(stream), xhtml);
        } else if (a == 0x1f && b == 0x8b) {
            metadata.set(Metadata.CONTENT_TYPE, "application/x-gzip");
            decompress(new GZIPInputStream(stream), xhtml);
        } else if (a == 'P' && b == 'K') {
            metadata.set(Metadata.CONTENT_TYPE, "application/zip");
            unpack(new ZipArchiveInputStream(stream), xhtml);
        } else if ((a == '0' && b == '7')
                || (a == 0x71 && b == 0xc7)
                || (a == 0xc7 && b == 0x71)) {
            // ASCII cpio ("07...") or binary cpio in either byte order
            metadata.set(Metadata.CONTENT_TYPE, "application/x-cpio");
            unpack(new CpioArchiveInputStream(stream), xhtml);
        } else if (a == '!' && b == '<') {
            // ar archives start with the global header "!<arch>\n". The
            // previous check looked for a leading '=' byte, which is the
            // exact-match operator of magic(5) patterns and not part of
            // the file data, so real ar archives were never detected.
            metadata.set(Metadata.CONTENT_TYPE, "application/x-archive");
            unpack(new ArArchiveInputStream(stream), xhtml);
        } else {
            // No recognisable two-byte signature: assume tar
            metadata.set(Metadata.CONTENT_TYPE, "application/x-tar");
            unpack(new TarArchiveInputStream(stream), xhtml);
        }

        xhtml.endDocument();
    }

    /**
     * Passes the given decompressing stream to the embedded document
     * extractor as a single embedded document. If the package metadata
     * carries a resource name, the name of the embedded document is derived
     * from it by stripping or mapping the compression suffix
     * ({@code .tbz}/{@code .tbz2} become {@code .tar}, {@code .bz}/
     * {@code .bz2} are removed, anything else is mapped with
     * {@link GzipUtils#getUncompressedFilename(String)}).
     *
     * @param stream decompressing stream; always closed by this method
     * @param xhtml content handler
     * @throws IOException if an IO error occurs
     * @throws SAXException if a SAX error occurs
     * @throws TikaException declared for callers; not thrown directly here
     */
    private void decompress(InputStream stream, XHTMLContentHandler xhtml)
            throws IOException, SAXException, TikaException {
        try {
            Metadata entrydata = new Metadata();
            String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
            if (name != null) {
                if (name.endsWith(".tbz")) {
                    name = name.substring(0, name.length() - 4) + ".tar";
                } else if (name.endsWith(".tbz2")) {
                    name = name.substring(0, name.length() - 5) + ".tar";
                } else if (name.endsWith(".bz")) {
                    name = name.substring(0, name.length() - 3);
                } else if (name.endsWith(".bz2")) {
                    name = name.substring(0, name.length() - 4);
                } else if (name.length() > 0) {
                    name = GzipUtils.getUncompressedFilename(name);
                }
                entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
            }

            // Use the delegate parser to parse the compressed document
            if (extractor.shouldParseEmbedded(entrydata)) {
                extractor.parseEmbedded(stream, xhtml, entrydata, true);
            }
        } finally {
            stream.close();
        }
    }

    /**
     * Parses the given stream as a package of multiple underlying files.
     * Every non-directory entry is offered to the embedded document
     * extractor, with the entry name (if given) set as its resource name.
     * How an entry that can not be parsed is handled is up to the
     * configured extractor.
     *
     * @param archive package stream; always closed by this method
     * @param xhtml content handler
     * @throws IOException if an IO error occurs
     * @throws SAXException if a SAX error occurs
     */
    public void unpack(ArchiveInputStream archive, XHTMLContentHandler xhtml)
            throws IOException, SAXException {
        try {
            ArchiveEntry entry = archive.getNextEntry();
            while (entry != null) {
                if (!entry.isDirectory()) {
                    Metadata entrydata = new Metadata();
                    String name = entry.getName();
                    if (name != null && name.length() > 0) {
                        entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
                    }
                    // The archive stream itself is positioned on the
                    // current entry, so it is passed as the entry content
                    if (extractor.shouldParseEmbedded(entrydata)) {
                        extractor.parseEmbedded(archive, xhtml, entrydata, true);
                    }
                }
                entry = archive.getNextEntry();
            }
        } finally {
            archive.close();
        }
    }

}