blob: f8e5bf7499c8e85d031d90c3dd7dae526bfa44bb [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft.rtf;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.Locale;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.FileMagic;
import org.apache.poi.poifs.filesystem.Ole10Native;
import org.apache.poi.poifs.filesystem.Ole10NativeException;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.TikaMemoryLimitException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.BoundedInputStream;
import org.apache.tika.io.EndianUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.RTFMetadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
/**
* Many thanks to Simon Mourier for:
* http://stackoverflow.com/questions/14779647/extract-embedded-image-object-in-rtf
* and for granting permission to use his code in Tika.
*/
class RTFObjDataParser {

    /** Charset name used to decode the "ANSI" strings found in the object data. */
    private static final String WIN_ASCII = "WINDOWS-1252";

    /**
     * Maximum number of kilobytes this parser will buffer for a single item.
     * A negative value disables the limit (see {@link #initByteArray(long)}).
     */
    private final int memoryLimitInKb;

    RTFObjDataParser(int memoryLimitInKb) {
        this.memoryLimitInKb = memoryLimitInKb;
    }

    /**
     * Tests whether the stream starts with the OLE2 (POIFS) magic bytes.
     *
     * @param is stream to test; it is handed to {@link FileMagic#valueOf(InputStream)}
     * @return true if the detected magic is {@link FileMagic#OLE2}
     * @throws IOException if the magic cannot be read
     */
    private static boolean hasPOIFSHeader(InputStream is) throws IOException {
        return FileMagic.valueOf(is) == FileMagic.OLE2;
    }

    /**
     * Parses the bytes of an embedded object and extracts the payload.
     * The app version, class name, topic name and item name found in the
     * header are added to the metadata as they are read.
     *
     * @param bytes                actual bytes of the object data
     * @param metadata             incoming metadata; updated in place
     * @param unknownFilenameCount counter used to build a file name when an embedded
     *                             POIFS container has no recognized wrapper
     * @return byte[] for contents of obj data; null if the format id marks a link
     *         rather than an embedded object, or if a handler found nothing to extract
     * @throws IOException   if the header or payload cannot be read
     * @throws TikaException if a declared length exceeds the configured memory limit
     */
    protected byte[] parse(byte[] bytes, Metadata metadata, AtomicInteger unknownFilenameCount)
            throws IOException, TikaException {
        UnsynchronizedByteArrayInputStream is = new UnsynchronizedByteArrayInputStream(bytes);

        long version = readUInt(is);
        metadata.add(RTFMetadata.EMB_APP_VERSION, Long.toString(version));

        long formatId = readUInt(is);
        //2 is an embedded object. 1 is a link.
        if (formatId != 2L) {
            return null;
        }

        String className = readLengthPrefixedAnsiString(is).trim();
        String topicName = readLengthPrefixedAnsiString(is).trim();
        String itemName = readLengthPrefixedAnsiString(is).trim();

        if (!className.isEmpty()) {
            metadata.add(RTFMetadata.EMB_CLASS, className);
        }
        if (!topicName.isEmpty()) {
            metadata.add(RTFMetadata.EMB_TOPIC, topicName);
        }
        if (!itemName.isEmpty()) {
            metadata.add(RTFMetadata.EMB_ITEM, itemName);
        }

        long dataSz = readUInt(is);
        //readBytes tests for reading too many bytes
        byte[] embObjBytes = readBytes(is, dataSz);

        String normalizedClassName = className.toLowerCase(Locale.ROOT);
        if ("package".equals(normalizedClassName)) {
            return handlePackage(embObjBytes, metadata);
        }
        if ("pbrush".equals(normalizedClassName)) {
            //simple bitmap bytes
            return embObjBytes;
        }

        UnsynchronizedByteArrayInputStream embIs =
                new UnsynchronizedByteArrayInputStream(embObjBytes);
        boolean hasPoifs;
        try {
            hasPoifs = hasPOIFSHeader(embIs);
        } catch (IOException e) {
            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
            return embObjBytes;
        }

        if (hasPoifs) {
            try {
                return handleEmbeddedPOIFS(embIs, metadata, unknownFilenameCount);
            } catch (Exception e) {
                //best effort: record the problem and fall back to the raw bytes
                EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
            }
        }
        return embObjBytes;
    }

    /**
     * Unwraps the payload of an embedded POIFS container.
     * <ul>
     * <li>a "Package" entry is copied out as is;</li>
     * <li>an OLE10Native record is unwrapped via {@link Ole10Native};</li>
     * <li>a CompObj container yields the bytes of its "CONTENTS" entry;</li>
     * <li>anything else is returned as the whole container, and the resource name
     * and content type are set on the metadata from the detected type.</li>
     * </ul>
     *
     * @param is                   stream positioned at the start of the container; it must
     *                             support {@code reset()} back to that start
     * @param metadata             metadata to update
     * @param unknownFilenameCount counter used to name containers of unwrapped types
     * @return the extracted bytes; can be null
     * @throws IOException   if the stream is not actually POIFS or cannot be read
     * @throws TikaException if the payload exceeds the configured memory limit
     */
    private byte[] handleEmbeddedPOIFS(InputStream is, Metadata metadata,
                                       AtomicInteger unknownFilenameCount)
            throws TikaException, IOException {
        try (POIFSFileSystem fs = new POIFSFileSystem(is)) {
            DirectoryNode root = fs.getRoot();
            if (root == null) {
                return null;
            }

            if (root.hasEntry("Package")) {
                Entry ooxml = root.getEntry("Package");
                try (DocumentInputStream packageStream =
                             new DocumentInputStream((DocumentEntry) ooxml)) {
                    return copyWithMemoryLimit(packageStream);
                }
            }

            //try poifs
            POIFSDocumentType type = POIFSDocumentType.detectType(root);
            if (type == POIFSDocumentType.OLE10_NATIVE) {
                try {
                    // Try to un-wrap the OLE10Native record:
                    Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(root);
                    return ole.getDataBuffer();
                } catch (Ole10NativeException ignored) {
                    // Not a valid OLE10Native record, skip it
                    return null;
                }
            }

            if (type == POIFSDocumentType.COMP_OBJ) {
                DocumentEntry contentsEntry;
                try {
                    contentsEntry = (DocumentEntry) root.getEntry("CONTENTS");
                } catch (FileNotFoundException ioe) {
                    //no contents
                    EmbeddedDocumentUtil.recordEmbeddedStreamException(ioe, metadata);
                    return null;
                }
                try (DocumentInputStream inp = new DocumentInputStream(contentsEntry)) {
                    //the declared size comes from the file, so check it against the
                    //memory limit before allocating
                    byte[] contents = initByteArray(contentsEntry.getSize());
                    inp.readFully(contents);
                    return contents;
                }
            }

            //unrecognized wrapper: hand back the whole container
            is.reset();
            byte[] container = copyWithMemoryLimit(is);
            metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
                    "file_" + unknownFilenameCount.getAndIncrement() + "." +
                            type.getExtension());
            metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
            return container;
        }
    }

    /**
     * Copies a stream into memory while enforcing the configured memory limit.
     * As in {@link #initByteArray(long)}, a negative limit means "no limit".
     *
     * @param in stream to drain; not closed by this method
     * @return the bytes read
     * @throws IOException              if reading fails
     * @throws TikaMemoryLimitException if the bounded stream reports that the limit was hit
     */
    private byte[] copyWithMemoryLimit(InputStream in) throws IOException, TikaException {
        UnsynchronizedByteArrayOutputStream out = new UnsynchronizedByteArrayOutputStream();
        if (memoryLimitInKb < 0) {
            IOUtils.copy(in, out);
            return out.toByteArray();
        }
        long maxBytes = memoryLimitInBytes();
        BoundedInputStream bounded = new BoundedInputStream(maxBytes, in);
        //copy through the bounded wrapper, otherwise the limit is never applied
        IOUtils.copy(bounded, out);
        if (bounded.hasHitBound()) {
            throw new TikaMemoryLimitException(maxBytes + 1, maxBytes);
        }
        return out.toByteArray();
    }

    /**
     * Parses the native data of a "Package" object.
     * Can return null if there is a linked object
     * instead of an embedded file.
     *
     * @param pkgBytes native data of the package object
     * @param metadata metadata that receives the original resource name, the resource
     *                 name and the embedded relationship id
     * @return the embedded file's bytes, or null for a link
     * @throws IOException   if the package header or payload cannot be read
     * @throws TikaException if the declared payload length exceeds the memory limit
     */
    private byte[] handlePackage(byte[] pkgBytes, Metadata metadata)
            throws IOException, TikaException {
        //now parse the package header
        UnsynchronizedByteArrayInputStream is = new UnsynchronizedByteArrayInputStream(pkgBytes);
        readUShort(is);
        String displayName = readAnsiString(is);
        //should we add this to the metadata?
        readAnsiString(is); //iconFilePath
        readUShort(is); //iconIndex; the value is not used

        int type = readUShort(is);
        //1 is link, 3 is embedded object
        //this only handles embedded objects
        if (type != 3) {
            return null;
        }

        //should we really be ignoring this filePathLen?
        readUInt(is); //filePathLen
        String ansiFilePath = readAnsiString(is); //filePath
        long bytesLen = readUInt(is);
        byte[] objBytes = readBytes(is, bytesLen);

        String unicodeFilePath = readOptionalUnicodeFilePath(is);

        String fileNameToUse;
        String pathToUse;
        if (unicodeFilePath.isEmpty()) {
            fileNameToUse = displayName;
            pathToUse = ansiFilePath;
        } else {
            fileNameToUse = unicodeFilePath;
            pathToUse = unicodeFilePath;
        }
        metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileNameToUse);
        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, FilenameUtils.getName(fileNameToUse));
        metadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, pathToUse);
        return objBytes;
    }

    /**
     * Reads the optional trailing file path stored as a count followed by that many
     * two-byte little-endian code units.
     *
     * @param is stream positioned right after the package payload
     * @return the path, or an empty string if it is absent or truncated
     */
    private String readOptionalUnicodeFilePath(InputStream is) {
        StringBuilder path = new StringBuilder();
        try {
            long unicodeLen = readUInt(is);
            for (long i = 0; i < unicodeLen; i++) {
                int lo = is.read();
                int hi = is.read();
                if (lo == -1 || hi == -1) {
                    //stream ran out; discard the partial path
                    return "";
                }
                path.append((char) (lo + 256 * hi));
            }
        } catch (IOException ignored) {
            //swallow; the unicode file path is optional and might not happen
            return "";
        }
        return path.toString();
    }

    /**
     * Reads a little-endian unsigned 16-bit value.
     *
     * @throws IOException if the stream ends early or cannot be read
     */
    private int readUShort(InputStream is) throws IOException {
        try {
            return EndianUtils.readUShortLE(is);
        } catch (EndianUtils.BufferUnderrunException e) {
            throw new IOException(e);
        }
    }

    /**
     * Reads a little-endian unsigned 32-bit value.
     *
     * @throws IOException if the stream ends early or cannot be read
     */
    private long readUInt(InputStream is) throws IOException {
        try {
            return EndianUtils.readUIntLE(is);
        } catch (EndianUtils.BufferUnderrunException e) {
            throw new IOException(e);
        }
    }

    /**
     * Reads a zero-terminated string and decodes it with {@link #WIN_ASCII}, the same
     * charset used for the length-prefixed strings. The terminator is consumed.
     *
     * @throws IOException if the stream ends before the terminating zero byte
     */
    private String readAnsiString(InputStream is) throws IOException {
        UnsynchronizedByteArrayOutputStream raw = new UnsynchronizedByteArrayOutputStream();
        int c = is.read();
        while (c > 0) {
            raw.write(c);
            c = is.read();
        }
        if (c == -1) {
            throw new IOException("Hit end of stream before end of AnsiString");
        }
        return decodeAnsi(raw.toByteArray());
    }

    /**
     * Reads a string stored as an unsigned 32-bit length followed by that many bytes,
     * decoded with {@link #WIN_ASCII}.
     *
     * @throws IOException   if the stream ends early
     * @throws TikaException if the declared length exceeds the memory limit
     */
    private String readLengthPrefixedAnsiString(InputStream is) throws IOException, TikaException {
        long len = readUInt(is);
        return decodeAnsi(readBytes(is, len));
    }

    /** Decodes bytes with {@link #WIN_ASCII}. */
    private static String decodeAnsi(byte[] bytes) throws IOException {
        try {
            return new String(bytes, WIN_ASCII);
        } catch (UnsupportedEncodingException e) {
            //shouldn't ever happen
            throw new IOException("Unsupported encoding", e);
        }
    }

    /**
     * Reads exactly {@code len} bytes.
     *
     * @throws IOException   if {@code len} is negative or the stream ends early
     * @throws TikaException if {@code len} exceeds the memory limit
     */
    private byte[] readBytes(InputStream is, long len) throws IOException, TikaException {
        //initByteArray tests for "reading of too many bytes"
        byte[] bytes = initByteArray(len);
        IOUtils.readFully(is, bytes);
        return bytes;
    }

    /**
     * Allocates a buffer of the requested length after validating it.
     *
     * @param len requested length in bytes
     * @return a new zero-filled array of that length
     * @throws IOException              if {@code len} is negative
     * @throws TikaMemoryLimitException if {@code len} exceeds the configured limit (when
     *                                  the limit is non-negative) or {@link Integer#MAX_VALUE}
     */
    private byte[] initByteArray(long len) throws IOException, TikaException {
        if (len < 0) {
            throw new IOException("Requested length for reading bytes < 0?!: " + len);
        } else if (memoryLimitInKb > -1 && len > memoryLimitInBytes()) {
            throw new TikaMemoryLimitException(len, memoryLimitInBytes());
        } else if (len > Integer.MAX_VALUE) {
            throw new TikaMemoryLimitException(len, Integer.MAX_VALUE);
        }
        return new byte[(int) len];
    }

    /** Converts the limit to bytes using long arithmetic so large limits cannot overflow. */
    private long memoryLimitInBytes() {
        return 1024L * memoryLimitInKb;
    }
}