blob: 00d31d499d2b92fad288b60ad577771a2e392453 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.detect.zip;
import java.io.ByteArrayInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import org.apache.commons.compress.archivers.ArchiveException;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipFile;
import org.apache.commons.compress.compressors.CompressorException;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.tika.config.Field;
import org.apache.tika.config.ServiceLoader;
import org.apache.tika.detect.Detector;
import org.apache.tika.io.BoundedInputStream;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
/**
 * Detector that sniffs the first bytes of a stream to recognise archive and
 * compressor formats and, for zip/jar archives, delegates to the configured
 * {@link ZipContainerDetector}s to find the specific container type.
 * <p>
 * Zip archives are inspected either through a {@link ZipFile} (when the
 * {@link TikaInputStream} is backed by a file, or when {@code markLimit} is
 * negative and the stream is spooled to a file) or by streaming through the
 * entries, reading at most {@code markLimit} bytes.
 */
public class DefaultZipContainerDetector implements Detector {

    //Regrettably, some tiff files can be incorrectly identified
    //as tar files.  We need this ugly workaround to rule out TIFF.
    //If commons-compress ever chooses to take over TIFF detection
    //we can remove all of this. See TIKA-2591.
    static final MediaType TIFF = MediaType.image("tiff");

    static final byte[][] TIFF_SIGNATURES = new byte[3][];

    /**
     * Serial version UID
     */
    private static final long serialVersionUID = 2891763938430295453L;

    /**
     * Number of leading bytes read from the stream for magic detection;
     * enough for all known archive formats.
     */
    private static final int PREFIX_LENGTH = 1024;

    static {
        TIFF_SIGNATURES[0] = new byte[]{'M', 'M', 0x00, 0x2a};
        TIFF_SIGNATURES[1] = new byte[]{'I', 'I', 0x2a, 0x00};
        TIFF_SIGNATURES[2] = new byte[]{'M', 'M', 0x00, 0x2b};
    }

    //this has to be > 100,000 to handle some of the iworks files
    //in our unit tests
    @Field
    int markLimit = 16 * 1024 * 1024;

    List<ZipContainerDetector> zipDetectors;

    /**
     * Creates a detector whose zip container detectors are loaded through a
     * {@link ServiceLoader} built on this class's class loader.
     */
    public DefaultZipContainerDetector() {
        this(new ServiceLoader(DefaultZipContainerDetector.class.getClassLoader()));
    }

    /**
     * Creates a detector whose zip container detectors are the
     * {@link ZipContainerDetector} service providers found by the given loader.
     *
     * @param loader service loader used to look up the zip container detectors
     */
    public DefaultZipContainerDetector(ServiceLoader loader) {
        this(loader.loadServiceProviders(ZipContainerDetector.class));
    }

    /**
     * Creates a detector that consults the given detectors, in list order.
     *
     * @param zipDetectors detectors to consult for zip-based containers
     */
    public DefaultZipContainerDetector(List<ZipContainerDetector> zipDetectors) {
        //TODO: OPCBased needs to be last!!!
        this.zipDetectors = zipDetectors;
    }

    /**
     * @param type media type to test
     * @return {@code true} if the type is the zip or the jar package type
     */
    static boolean isZipArchive(MediaType type) {
        return type.equals(PackageConstants.ZIP) || type.equals(PackageConstants.JAR);
    }

    /**
     * Checks whether the valid part of the buffer starts with one of the
     * {@link #TIFF_SIGNATURES}.
     *
     * @param prefix buffer holding the first bytes of the stream
     * @param length number of valid bytes in {@code prefix}
     * @return {@code true} if a complete TIFF signature is present
     */
    private static boolean isTiff(byte[] prefix, int length) {
        // Never look past the bytes that were actually read: the remainder of
        // the buffer is padding (or stale data) and must not complete a signature.
        int available = Math.min(length, prefix.length);
        for (byte[] sig : TIFF_SIGNATURES) {
            if (arrayStartWith(sig, prefix, available)) {
                return true;
            }
        }
        return false;
    }

    /**
     * @param needle         bytes expected at the start of {@code haystack}
     * @param haystack       buffer to test
     * @param haystackLength number of valid bytes in {@code haystack}
     * @return {@code true} if the first {@code needle.length} valid bytes of
     * {@code haystack} equal {@code needle}
     */
    private static boolean arrayStartWith(byte[] needle, byte[] haystack, int haystackLength) {
        if (haystackLength < needle.length) {
            return false;
        }
        for (int i = 0; i < needle.length; i++) {
            if (haystack[i] != needle[i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Detects the archive format from the leading bytes of a stream.
     * TIFF is ruled out first (see TIKA-2591), then detection is handed to
     * {@link ArchiveStreamFactory#detect(InputStream)}.
     *
     * @param prefix buffer holding the first bytes of the stream
     * @param length number of valid bytes in {@code prefix}
     * @return {@link #TIFF} for a TIFF signature, the media type mapped from the
     * detected archiver name, or {@link MediaType#OCTET_STREAM} if no
     * archive format was recognised
     */
    static MediaType detectArchiveFormat(byte[] prefix, int length) {
        if (isTiff(prefix, length)) {
            return TIFF;
        }
        try {
            String name = ArchiveStreamFactory.detect(new ByteArrayInputStream(prefix, 0, length));
            return PackageConstants.getMediaType(name);
        } catch (ArchiveException e) {
            // no archiver recognised these bytes
            return MediaType.OCTET_STREAM;
        }
    }

    /**
     * Detects the compressor format from the leading bytes of a stream using
     * {@link CompressorStreamFactory#detect(InputStream)}.
     *
     * @param prefix buffer holding the first bytes of the stream
     * @param length number of valid bytes in {@code prefix}
     * @return the media type mapped from the detected compressor name, or
     * {@link MediaType#OCTET_STREAM} if no compressor format was recognised
     */
    static MediaType detectCompressorFormat(byte[] prefix, int length) {
        try {
            String type =
                    CompressorStreamFactory.detect(new ByteArrayInputStream(prefix, 0, length));
            return CompressorConstants.getMediaType(type);
        } catch (CompressorException e) {
            // no compressor recognised these bytes
            return MediaType.OCTET_STREAM;
        }
    }

    /**
     * If this is less than 0 and the input is a {@link TikaInputStream},
     * the stream will be spooled to disk and detection will run on the full file.
     * Otherwise, streaming detection reads the zip entries only up to the markLimit.
     *
     * @param markLimit mark limit for streaming detection
     */
    @Field
    public void setMarkLimit(int markLimit) {
        this.markLimit = markLimit;
    }

    /**
     * Detects the type of the given stream from its first bytes. The stream is
     * marked before reading and reset afterwards, so it has to support
     * mark/reset.
     *
     * @param input    stream to inspect; may be {@code null}
     * @param metadata metadata of the document (passed on to streaming detection)
     * @return {@link MediaType#OCTET_STREAM} for a {@code null} stream or an
     * unrecognised format; otherwise TIFF, the specific zip container
     * type, or the detected archive/compressor type
     * @throws IOException if the stream cannot be read or reset
     */
    @Override
    public MediaType detect(InputStream input, Metadata metadata) throws IOException {
        // Check if we have access to the document
        if (input == null) {
            return MediaType.OCTET_STREAM;
        }

        byte[] prefix = new byte[PREFIX_LENGTH];
        input.mark(PREFIX_LENGTH);
        int length;
        try {
            length = IOUtils.read(input, prefix, 0, PREFIX_LENGTH);
        } finally {
            input.reset();
        }

        MediaType type = detectArchiveFormat(prefix, length);

        // compare by value rather than by reference
        if (TIFF.equals(type)) {
            return TIFF;
        } else if (isZipArchive(type)) {
            if (TikaInputStream.isTikaInputStream(input)) {
                TikaInputStream tis = TikaInputStream.cast(input);
                if (markLimit < 0) {
                    // negative limit: force spooling to a file so the whole zip is visible
                    tis.getFile();
                }
                if (tis.hasFile()) {
                    return detectZipFormatOnFile(tis);
                }
            }
            return detectStreaming(input, metadata);
        } else if (!type.equals(MediaType.OCTET_STREAM)) {
            return type;
        } else {
            return detectCompressorFormat(prefix, length);
        }
    }

    /**
     * This will call TikaInputStream's getFile() and open it as a
     * {@link ZipFile}. Once the ZipFile has been opened, it is always placed
     * in TikaInputStream's openContainer and left open, whether or not a
     * detector recognised it.
     *
     * @param tis stream whose backing file is inspected
     * @return the first non-null type returned by the zip detectors, or
     * {@link MediaType#APPLICATION_ZIP} if none matched or an
     * {@link IOException} occurred
     */
    private MediaType detectZipFormatOnFile(TikaInputStream tis) {
        try {
            ZipFile zip = new ZipFile(tis.getFile()); // TODO: hasFile()?
            try {
                for (ZipContainerDetector zipDetector : zipDetectors) {
                    MediaType type = zipDetector.detect(zip, tis);
                    if (type != null) {
                        return type;
                    }
                }
            } finally {
                // hand the open ZipFile over to the stream
                tis.setOpenContainer(zip);
            }
        } catch (IOException e) {
            // ignore
        }
        // Fallback: it's still a zip file, we just don't know what kind of one
        return MediaType.APPLICATION_ZIP;
    }

    /**
     * Runs streaming detection on at most {@code markLimit} bytes of the input
     * and resets the stream afterwards.
     *
     * @param input    stream positioned at the start of the zip
     * @param metadata metadata of the document
     * @return the detected container type, or the generic zip type
     * @throws IOException if the stream cannot be reset
     */
    MediaType detectStreaming(InputStream input, Metadata metadata) throws IOException {
        BoundedInputStream boundedInputStream = new BoundedInputStream(markLimit, input);
        boundedInputStream.mark(markLimit);
        try {
            return detectStreaming(boundedInputStream, metadata, false);
        } finally {
            boundedInputStream.reset();
        }
    }

    /**
     * Streams through the zip entries, offering each one to the zip detectors
     * until one of them returns a type. The input is wrapped in a
     * {@link CloseShieldInputStream}, so it is not closed by this method.
     * <p>
     * If the first pass ({@code allowStoredEntries == false}) fails on an
     * unsupported data descriptor, the input is reset and detection is retried
     * once with {@code allowStoredEntries == true}. Other I/O problems
     * (e.g. a truncated zip) end the iteration, and the final detection step
     * is run on whatever was collected so far.
     *
     * @param input              stream positioned at the start of the zip; must
     *                           have been marked by the caller for the retry to work
     * @param metadata           metadata of the document
     * @param allowStoredEntries passed to {@link ZipArchiveInputStream} as its
     *                           last constructor argument
     * @return the detected container type, or the generic zip type
     * @throws IOException if resetting the input for the retry fails
     */
    MediaType detectStreaming(InputStream input, Metadata metadata, boolean allowStoredEntries)
            throws IOException {
        StreamingDetectContext detectContext = new StreamingDetectContext();
        try (ZipArchiveInputStream zis = new ZipArchiveInputStream(
                new CloseShieldInputStream(input), "UTF8", false, allowStoredEntries)) {
            ZipArchiveEntry zae = zis.getNextZipEntry();
            while (zae != null) {
                MediaType mt = detect(zae, zis, detectContext);
                if (mt != null) {
                    return mt;
                }
                zae = zis.getNextZipEntry();
            }
        } catch (UnsupportedZipFeatureException zfe) {
            if (!allowStoredEntries &&
                    zfe.getFeature() == UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) {
                // start over from the mark, this time allowing stored entries
                input.reset();
                return detectStreaming(input, metadata, true);
            }
        } catch (SecurityException e) {
            throw e;
        } catch (EOFException e) {
            //truncated zip -- swallow
        } catch (IOException e) {
            //another option for a truncated zip
        }

        return finalDetect(detectContext);
    }

    /**
     * Offers one zip entry to every detector in turn.
     *
     * @return the first non-null type returned by a detector, or {@code null}
     */
    private MediaType detect(ZipArchiveEntry zae, ZipArchiveInputStream zis,
                             StreamingDetectContext detectContext) throws IOException {
        for (ZipContainerDetector d : zipDetectors) {
            MediaType mt = d.streamingDetectUpdate(zae, zis, detectContext);
            if (mt != null) {
                return mt;
            }
        }
        return null;
    }

    /**
     * Asks every detector for a final decision based on the collected context.
     *
     * @return the first non-null type returned by a detector, or
     * {@link MediaType#APPLICATION_ZIP} if none decided
     */
    private MediaType finalDetect(StreamingDetectContext detectContext) {
        for (ZipContainerDetector d : zipDetectors) {
            MediaType mt = d.streamingDetectFinal(detectContext);
            if (mt != null) {
                return mt;
            }
        }
        return MediaType.APPLICATION_ZIP;
    }
}