blob: afe7b69ec848ab92945bf510088ac3009ad7df87 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.apple;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Set;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.TikaMemoryLimitException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.EndianUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
/**
 * Parser that strips the header off of AppleSingle and AppleDouble
 * files.
 * <p>
 * See <a href="http://kaiser-edv.de/documents/AppleSingle_AppleDouble.pdf">spec document</a>.
 * <p>
 * The parser reads the fixed-size header and the entry descriptors, extracts
 * the "real name" entry (if present) as the original resource name of the
 * embedded document, and then hands the stream, positioned at the start of the
 * data fork, to the {@link EmbeddedDocumentExtractor}.
 */
public class AppleSingleFileParser extends AbstractParser {

    /** Upper bound on the number of bytes buffered for a single field (real name). */
    private static final int MAX_FIELD_LENGTH = 1_073_741_824;

    /**
     * Bytes consumed by {@link #readThroughNumEntries(InputStream)}:
     * 4 (magic) + 4 (version) + 16 (filler) + 2 (number of entries).
     */
    private static final int HEADER_LENGTH = 26;

    /** Each entry descriptor is three 32-bit values: entry id, offset, length. */
    private static final int ENTRY_DESCRIPTOR_LENGTH = 12;

    /**
     * Entry types
     */
    private static final int DATA_FORK = 1;
    private static final int RESOURCE_FORK = 2;
    private static final int REAL_NAME = 3;
    private static final int COMMENT = 4;
    private static final int ICON_BW = 5;
    private static final int ICON_COLOR = 6;
    //no constant is defined for entry id 7
    private static final int FILE_DATES_INFO = 8;
    private static final int FINDER_INFO = 9;
    private static final int MACINTOSH_FILE_INFO = 10;
    private static final int PRODOS_FILE_INFO = 11;
    private static final int MSDOS_FILE_INFO = 12;
    private static final int SHORT_NAME = 13;
    private static final int AFP_FILE_INFO = 14;
    private static final int DIRECTORY_ID = 15;

    private static final Set<MediaType> SUPPORTED_TYPES =
            Collections.singleton(MediaType.application("applefile"));

    /**
     * @param context the parse context (not consulted)
     * @return the single supported media type, {@code application/applefile}
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * Reads the header and entry descriptors, records the real name (if any)
     * in the embedded document's metadata and passes the data fork (if any) to
     * the embedded document extractor.
     *
     * @param stream   the AppleSingle/AppleDouble stream, positioned at its first byte
     * @param handler  receives the XHTML SAX events
     * @param metadata metadata of the container document
     * @param context  parse context used to look up the embedded document extractor
     * @throws IOException   if the stream cannot be read or ends prematurely
     * @throws SAXException  if the handler fails
     * @throws TikaException if the header or the entry table is invalid
     */
    @Override
    public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
                      ParseContext context) throws IOException, SAXException, TikaException {
        EmbeddedDocumentExtractor ex = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);

        int numEntries = readThroughNumEntries(stream);
        long bytesRead = HEADER_LENGTH;
        List<FieldInfo> fieldInfoList = getSortedFieldInfoList(stream, numEntries);
        bytesRead += (long) ENTRY_DESCRIPTOR_LENGTH * numEntries;

        Metadata embeddedMetadata = new Metadata();
        bytesRead = processFieldEntries(stream, fieldInfoList, embeddedMetadata, bytesRead);
        FieldInfo contentFieldInfo = getContentFieldInfo(fieldInfoList);

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        if (contentFieldInfo != null) {
            //processFieldEntries stops at the data fork, so this is normally a no-op;
            //it is kept so that a bad offset surfaces as a TikaException
            skipToOffset(stream, contentFieldInfo.offset, bytesRead);
            if (ex.shouldParseEmbedded(embeddedMetadata)) {
                // TODO: we should probably add a readlimiting wrapper around this
                // stream to ensure that not more than contentFieldInfo.length bytes
                // are read
                ex.parseEmbedded(new CloseShieldInputStream(stream), xhtml, embeddedMetadata,
                        false);
            }
        }
        xhtml.endDocument();
    }

    /**
     * @param fieldInfoList entry descriptors, sorted by offset
     * @return the first data fork entry in the list, or {@code null} if there is none
     */
    private FieldInfo getContentFieldInfo(List<FieldInfo> fieldInfoList) {
        for (FieldInfo fieldInfo : fieldInfoList) {
            if (fieldInfo.entryId == DATA_FORK) {
                return fieldInfo;
            }
        }
        return null;
    }

    /**
     * Walks the entries in offset order up to (and not including the content of)
     * the first data fork. The real name entry is decoded into
     * {@link TikaCoreProperties#ORIGINAL_RESOURCE_NAME}; every other entry is skipped.
     * <p>
     * Iteration stops at the data fork so that the stream is left positioned at
     * the start of its content. Entries stored after the data fork are not read.
     *
     * @param stream           stream positioned {@code bytesRead} bytes into the file
     * @param fieldInfoList    entry descriptors, sorted by offset
     * @param embeddedMetadata metadata object that receives the real name
     * @param bytesRead        number of bytes consumed from the stream so far
     * @return the number of bytes consumed from the stream after processing
     * @throws IOException   if the stream cannot be read or ends prematurely
     * @throws TikaException if an entry starts before the current stream position
     *                       or the real name exceeds {@link #MAX_FIELD_LENGTH}
     */
    private long processFieldEntries(InputStream stream, List<FieldInfo> fieldInfoList,
                                     Metadata embeddedMetadata, long bytesRead)
            throws IOException, TikaException {
        for (FieldInfo f : fieldInfoList) {
            //entries need not be contiguous; skip any gap before this one
            skipToOffset(stream, f.offset, bytesRead);
            bytesRead = f.offset;

            if (f.entryId == DATA_FORK) {
                //leave the stream at the start of the content
                break;
            }
            if (f.entryId == REAL_NAME) {
                if (f.length > MAX_FIELD_LENGTH) {
                    throw new TikaMemoryLimitException(f.length, MAX_FIELD_LENGTH);
                }
                byte[] buffer = new byte[(int) f.length];
                IOUtils.readFully(stream, buffer);
                String originalFileName =
                        new String(buffer, 0, buffer.length, StandardCharsets.US_ASCII);
                embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, originalFileName);
            } else {
                IOUtils.skipFully(stream, f.length);
            }
            bytesRead += f.length;
        }
        return bytesRead;
    }

    /**
     * Skips forward from the current position to {@code targetOffset}.
     *
     * @param stream       the stream to advance
     * @param targetOffset absolute offset (from the start of the file) to reach
     * @param bytesRead    number of bytes consumed from the stream so far
     * @throws IOException   if the stream ends before the target is reached
     * @throws TikaException if the target lies before the current position
     */
    private static void skipToOffset(InputStream stream, long targetOffset, long bytesRead)
            throws IOException, TikaException {
        long diff = targetOffset - bytesRead;
        if (diff < 0) {
            //IOUtils.skipFully would throw an unchecked IllegalArgumentException here
            throw new TikaException(
                    "AppleSingleFile entry offset (" + targetOffset +
                            ") is before the current stream position (" + bytesRead + ")");
        }
        IOUtils.skipFully(stream, diff);
    }

    /**
     * Reads {@code numEntries} entry descriptors and returns them sorted by offset.
     *
     * @param stream     stream positioned at the first entry descriptor
     * @param numEntries number of descriptors to read
     * @return the descriptors, ordered by ascending offset
     * @throws IOException   if the stream cannot be read or ends prematurely
     * @throws TikaException if there are no entries
     */
    private List<FieldInfo> getSortedFieldInfoList(InputStream stream, int numEntries)
            throws IOException, TikaException {
        //this is probably overkill. I'd hope that these were already
        //in order. This ensures it.
        List<FieldInfo> fieldInfoList = new ArrayList<>(numEntries);
        for (int i = 0; i < numEntries; i++) {
            //convert 32-bit unsigned ints to longs
            long entryId = EndianUtils.readUIntBE(stream);
            long offset = EndianUtils.readUIntBE(stream);
            long length = EndianUtils.readUIntBE(stream);
            fieldInfoList.add(new FieldInfo(entryId, offset, length));
        }
        if (fieldInfoList.isEmpty()) {
            throw new TikaException("AppleSingleFile missing field info");
        }
        //make absolutely sure these are in order!
        fieldInfoList.sort(Comparator.comparingLong(fieldInfo -> fieldInfo.offset));
        return fieldInfoList;
    }

    /**
     * Reads through the header until the number of entries has been consumed.
     *
     * @param stream stream positioned at the first byte of the file
     * @return the number of entries, interpreted as an unsigned 16-bit value
     * @throws TikaException if the version field is not {@code 0x00020000}
     * @throws IOException   if the stream cannot be read or ends prematurely
     */
    private int readThroughNumEntries(InputStream stream) throws TikaException, IOException {
        //magic number; read and ignored
        EndianUtils.readIntBE(stream);
        //version
        long version = EndianUtils.readIntBE(stream);
        if (version != 0x00020000) {
            throw new TikaException("Version should have been 0x00020000, but was:" + version);
        }
        IOUtils.skipFully(stream, 16);//filler
        //treat the count as unsigned so that a high bit does not yield a negative size
        return Short.toUnsignedInt(EndianUtils.readShortBE(stream));//number of entries
    }

    /**
     * Immutable entry descriptor: the entry id, the offset of the entry's data
     * from the start of the file, and the length of that data in bytes.
     */
    private static class FieldInfo {
        private final long entryId;
        private final long offset;
        private final long length;

        private FieldInfo(long entryId, long offset, long length) {
            this.entryId = entryId;
            this.offset = offset;
            this.length = length;
        }
    }
}