blob: aefac501bf31aafd724a752e28b59d21aca3b831 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.detect.ole;
import static org.apache.tika.mime.MediaType.application;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.tika.config.Field;
import org.apache.tika.detect.Detector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
/*
* TODO: refactor this copy/paste from POIFSContainerDetector
*/
/**
* A detector that works on a POIFS OLE2 document
* to figure out exactly what the file is.
* This should work for all OLE2 documents, whether
* they are ones supported by POI or not.
*/
public class MiscOLEDetector implements Detector {
/**
* The OLE base file format
*/
public static final MediaType OLE = application("x-tika-msoffice");
/**
* Hangul Word Processor (Korean)
*/
public static final MediaType HWP = application("x-hwp-v5");
/**
* Base QuattroPro mime
*/
public static final MediaType QUATTROPRO = application("x-quattro-pro");
@Field
private int markLimit = 16 * 1024 * 1024;
/**
* Internal detection of the specific kind of OLE2 document, based on the
* names of the top level streams within the file.
*
* @deprecated Use {@link #detect(Set, DirectoryEntry)} and pass the root
* entry of the filesystem whose type is to be detected, as a
* second argument.
*/
protected static MediaType detect(Set<String> names) {
return detect(names, null);
}
/**
* Internal detection of the specific kind of OLE2 document, based on the
* names of the top-level streams within the file. In some cases the
* detection may need access to the root {@link DirectoryEntry} of that file
* for best results. The entry can be given as a second, optional argument.
*
* @param names
* @param root
* @return
*/
protected static MediaType detect(Set<String> names, DirectoryEntry root) {
if (names == null || names.size() == 0) {
return OLE;
} else if (names.contains("\u0005HwpSummaryInformation")) {
// Hangul Word Processor v5+ (previous aren't OLE2-based)
return HWP;
} else if (names.contains("PerfectOffice_MAIN")) {
if (names.contains("SlideShow")) {
return MediaType.application("x-corelpresentations"); // .shw
} else if (names.contains("PerfectOffice_OBJECTS")) {
return new MediaType(QUATTROPRO, "version", "7-8"); // .wb?
}
} else if (names.contains("NativeContent_MAIN")) {
return new MediaType(QUATTROPRO, "version", "9"); // .qpw
// Couldn't detect a more specific type
}
return OLE;
}
private static Set<String> getTopLevelNames(DirectoryNode root) {
Set<String> names = new HashSet<>();
for (Entry entry : root) {
names.add(entry.getName());
}
return names;
}
/**
* If a TikaInputStream is passed in to {@link #detect(InputStream, Metadata)},
* and there is not an underlying file, this detector will spool up to {@link #markLimit}
* to disk. If the stream was read in entirety (e.g. the spooled file is not truncated),
* this detector will open the file with POI and perform detection.
* If the spooled file is truncated, the detector will return {@link #OLE} (or
* {@link MediaType#OCTET_STREAM} if there's no OLE header).
* <p>
* As of Tika 1.21, this detector respects the legacy behavior of not performing detection
* on a non-TikaInputStream.
*
* @param markLimit
*/
public void setMarkLimit(int markLimit) {
this.markLimit = markLimit;
}
private Set<String> getTopLevelNames(TikaInputStream stream) throws IOException {
// Force the document stream to a (possibly temporary) file
// so we don't modify the current position of the stream.
//If the markLimit is < 0, this will spool the entire file
//to disk if there is not an underlying file.
Path file = stream.getPath(markLimit);
//if the stream was longer than markLimit, don't detect
if (file == null) {
return Collections.emptySet();
}
try {
POIFSFileSystem fs = new POIFSFileSystem(file.toFile(), true);
// Optimize a possible later parsing process by keeping
// a reference to the already opened POI file system
stream.setOpenContainer(fs);
return getTopLevelNames(fs.getRoot());
} catch (IOException e) {
// Parse error in POI, so we don't know the file type
return Collections.emptySet();
} catch (RuntimeException e) {
// Another problem in POI
return Collections.emptySet();
}
}
public MediaType detect(InputStream input, Metadata metadata) throws IOException {
// Check if we have access to the document
if (input == null) {
return MediaType.OCTET_STREAM;
}
// If this is a TikaInputStream wrapping an already
// parsed NPOIFileSystem/DirectoryNode, just get the
// names from the root:
TikaInputStream tis = TikaInputStream.cast(input);
Set<String> names = null;
if (tis != null) {
Object container = tis.getOpenContainer();
if (container instanceof POIFSFileSystem) {
names = getTopLevelNames(((POIFSFileSystem) container).getRoot());
} else if (container instanceof DirectoryNode) {
names = getTopLevelNames((DirectoryNode) container);
}
}
if (names == null) {
// Check if the document starts with the OLE header
input.mark(8);
try {
if (input.read() != 0xd0 || input.read() != 0xcf || input.read() != 0x11 ||
input.read() != 0xe0 || input.read() != 0xa1 || input.read() != 0xb1 ||
input.read() != 0x1a || input.read() != 0xe1) {
return MediaType.OCTET_STREAM;
}
} catch (IOException e) {
return MediaType.OCTET_STREAM;
} finally {
input.reset();
}
}
// We can only detect the exact type when given a TikaInputStream
if (names == null && tis != null) {
// Look for known top level entry names to detect the document type
names = getTopLevelNames(tis);
}
// Detect based on the names (as available)
if (tis != null && tis.getOpenContainer() != null &&
tis.getOpenContainer() instanceof POIFSFileSystem) {
return detect(names, ((POIFSFileSystem) tis.getOpenContainer()).getRoot());
} else {
return detect(names, null);
}
}
}