| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.tika.parser.pkg; |
| |
| import java.io.BufferedInputStream; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.util.zip.GZIPInputStream; |
| |
| import org.apache.commons.compress.archivers.ArchiveEntry; |
| import org.apache.commons.compress.archivers.ArchiveInputStream; |
| import org.apache.commons.compress.archivers.ar.ArArchiveInputStream; |
| import org.apache.commons.compress.archivers.cpio.CpioArchiveInputStream; |
| import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; |
| import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream; |
| import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; |
| import org.apache.commons.compress.compressors.gzip.GzipUtils; |
| import org.apache.tika.exception.TikaException; |
| import org.apache.tika.extractor.EmbeddedDocumentExtractor; |
| import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; |
| import org.apache.tika.io.CloseShieldInputStream; |
| import org.apache.tika.metadata.Metadata; |
| import org.apache.tika.parser.ParseContext; |
| import org.apache.tika.sax.XHTMLContentHandler; |
| import org.xml.sax.ContentHandler; |
| import org.xml.sax.SAXException; |
| |
| /** |
| * Extractor for packaging and compression formats. |
| */ |
| class PackageExtractor { |
| |
| private final ContentHandler handler; |
| |
| private final Metadata metadata; |
| |
| private final EmbeddedDocumentExtractor extractor; |
| |
| public PackageExtractor( |
| ContentHandler handler, Metadata metadata, ParseContext context) { |
| this.handler = handler; |
| this.metadata = metadata; |
| |
| EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class); |
| |
| if (ex==null) { |
| this.extractor = new ParsingEmbeddedDocumentExtractor(context); |
| } else { |
| this.extractor = ex; |
| } |
| |
| } |
| |
| public void parse(InputStream stream) |
| throws IOException, SAXException, TikaException { |
| XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); |
| xhtml.startDocument(); |
| |
| // At the end we want to close the package/compression stream to |
| // release any associated resources, but the underlying document |
| // stream should not be closed |
| stream = new CloseShieldInputStream(stream); |
| |
| // Capture two bytes to determine the packaging/compression format |
| if (!stream.markSupported()) { |
| stream = new BufferedInputStream(stream); |
| } |
| stream.mark(2); |
| int a = stream.read(); |
| int b = stream.read(); |
| stream.reset(); |
| |
| // Select decompression or unpacking mechanism based on the two bytes |
| if (a == 'B' && b == 'Z') { |
| metadata.set(Metadata.CONTENT_TYPE, "application/x-bzip"); |
| decompress(new BZip2CompressorInputStream(stream), xhtml); |
| } else if (a == 0x1f && b == 0x8b) { |
| metadata.set(Metadata.CONTENT_TYPE, "application/x-gzip"); |
| decompress(new GZIPInputStream(stream), xhtml); |
| } else if (a == 'P' && b == 'K') { |
| metadata.set(Metadata.CONTENT_TYPE, "application/zip"); |
| unpack(new ZipArchiveInputStream(stream), xhtml); |
| } else if ((a == '0' && b == '7') |
| || (a == 0x71 && b == 0xc7) |
| || (a == 0xc7 && b == 0x71)) { |
| metadata.set(Metadata.CONTENT_TYPE, "application/x-cpio"); |
| unpack(new CpioArchiveInputStream(stream), xhtml); |
| } else if (a == '=' && (b == '<' || b == '!')) { |
| metadata.set(Metadata.CONTENT_TYPE, "application/x-archive"); |
| unpack(new ArArchiveInputStream(stream), xhtml); |
| } else { |
| metadata.set(Metadata.CONTENT_TYPE, "application/x-tar"); |
| unpack(new TarArchiveInputStream(stream), xhtml); |
| } |
| |
| xhtml.endDocument(); |
| } |
| |
| private void decompress(InputStream stream, XHTMLContentHandler xhtml) |
| throws IOException, SAXException, TikaException { |
| try { |
| Metadata entrydata = new Metadata(); |
| String name = metadata.get(Metadata.RESOURCE_NAME_KEY); |
| if (name != null) { |
| if (name.endsWith(".tbz")) { |
| name = name.substring(0, name.length() - 4) + ".tar"; |
| } else if (name.endsWith(".tbz2")) { |
| name = name.substring(0, name.length() - 5) + ".tar"; |
| } else if (name.endsWith(".bz")) { |
| name = name.substring(0, name.length() - 3); |
| } else if (name.endsWith(".bz2")) { |
| name = name.substring(0, name.length() - 4); |
| } else if (name.length() > 0) { |
| name = GzipUtils.getUncompressedFilename(name); |
| } |
| entrydata.set(Metadata.RESOURCE_NAME_KEY, name); |
| } |
| |
| // Use the delegate parser to parse the compressed document |
| if (extractor.shouldParseEmbedded(entrydata)) { |
| extractor.parseEmbedded(stream, xhtml, entrydata, true); |
| } |
| } finally { |
| stream.close(); |
| } |
| } |
| |
| /** |
| * Parses the given stream as a package of multiple underlying files. |
| * The package entries are parsed using the delegate parser instance. |
| * It is not an error if the entry can not be parsed, in that case |
| * just the entry name (if given) is emitted. |
| * |
| * @param archive package stream |
| * @param xhtml content handler |
| * @throws IOException if an IO error occurs |
| * @throws SAXException if a SAX error occurs |
| */ |
| public void unpack(ArchiveInputStream archive, XHTMLContentHandler xhtml) |
| throws IOException, SAXException { |
| try { |
| ArchiveEntry entry = archive.getNextEntry(); |
| while (entry != null) { |
| if (!entry.isDirectory()) { |
| Metadata entrydata = new Metadata(); |
| String name = entry.getName(); |
| if (name != null && name.length() > 0) { |
| entrydata.set(Metadata.RESOURCE_NAME_KEY, name); |
| } |
| |
| if (extractor.shouldParseEmbedded(entrydata)) { |
| extractor.parseEmbedded(archive, xhtml, entrydata, true); |
| } |
| } |
| entry = archive.getNextEntry(); |
| } |
| } finally { |
| archive.close(); |
| } |
| } |
| |
| } |