blob: 8f0537de45df2ed3dc447781899b7f6de88b3c59 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.apple;
import java.io.IOException;
import java.io.InputStream;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import javax.xml.parsers.ParserConfigurationException;
import com.dd.plist.NSArray;
import com.dd.plist.NSData;
import com.dd.plist.NSDate;
import com.dd.plist.NSDictionary;
import com.dd.plist.NSNumber;
import com.dd.plist.NSObject;
import com.dd.plist.NSSet;
import com.dd.plist.NSString;
import com.dd.plist.PropertyListFormatException;
import com.dd.plist.PropertyListParser;
import com.dd.plist.UID;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.detect.apple.BPListDetector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
/**
* Parser for Apple's plist and bplist. This is a wrapper around
* com.googlecode.plist:dd-plist
* <p>
* As of 1.25, Tika does not have detection for the text based plist,
* so those files will not be directed to this parser
*
* @since 1.25
*/
public class PListParser extends AbstractParser {

    // Names of the XHTML elements emitted for each kind of plist node.
    private static final String ARR = "array";
    private static final String DATA = "data";
    private static final String DATE = "date";
    private static final String DICT = "dict";
    private static final String KEY = "key";
    private static final String NUMBER = "number";
    private static final String PLIST = "plist";
    private static final String SET = "set";
    private static final String STRING = "string";
    private static final String UID = "uid";

    /** Media types this parser declares support for. */
    private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(new HashSet<>(
            Arrays.asList(BPListDetector.BITUNES, BPListDetector.BMEMGRAPH, BPListDetector.BPLIST,
                    BPListDetector.BWEBARCHIVE, BPListDetector.PLIST)));

    /**
     * Returns the fixed, unmodifiable set of media types handled by this parser.
     *
     * @param context the parse context (not consulted)
     * @return the supported media types
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * Parses a plist and writes its structure to {@code handler} as XHTML, wrapped in a
     * single {@code plist} element.
     * <p>
     * The root object is taken from the stream's open container when one is available as an
     * {@link NSObject}; otherwise it is parsed from the backing file (if the stream has one)
     * or from the stream itself. When the incoming content type is the generic plist type and
     * the root is a dictionary, the content type in {@code metadata} is replaced with the
     * subtype detected from the dictionary's keys.
     *
     * @param stream   the plist content
     * @param handler  receives the XHTML SAX events
     * @param metadata read for the content type; may have its content type overwritten
     * @param context  used to look up the embedded document extractor
     * @throws IOException   on failure reading the stream or handling embedded data
     * @throws SAXException  on failure writing to the handler
     * @throws TikaException if the underlying plist library cannot parse the root object
     */
    @Override
    public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
                      ParseContext context) throws IOException, SAXException, TikaException {
        EmbeddedDocumentExtractor embeddedDocumentExtractor =
                EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
        // SimpleDateFormat is not thread-safe, so a fresh instance is created per parse.
        DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US);

        TikaInputStream tis =
                (stream instanceof TikaInputStream) ? (TikaInputStream) stream : null;

        NSObject rootObj = null;
        //if this already went through the PListDetector,
        //there should be an NSObject in the open container
        if (tis != null) {
            Object container = tis.getOpenContainer();
            // Only reuse the container if it really is a plist object; anything else
            // falls through to a regular parse instead of failing on a cast.
            if (container instanceof NSObject) {
                rootObj = (NSObject) container;
            }
        }
        if (rootObj == null) {
            try {
                if (tis != null && tis.hasFile()) {
                    rootObj = PropertyListParser.parse(tis.getFile());
                } else {
                    rootObj = PropertyListParser.parse(stream);
                }
            } catch (PropertyListFormatException | ParseException |
                    ParserConfigurationException e) {
                throw new TikaException("problem parsing root", e);
            }
        }

        // Refine the generic plist type based on the keys of the root dictionary.
        String contentType = metadata.get(Metadata.CONTENT_TYPE);
        if (BPListDetector.PLIST.toString().equals(contentType)
                && rootObj instanceof NSDictionary) {
            MediaType subtype =
                    BPListDetector.detectXMLOnKeys(((NSDictionary) rootObj).keySet());
            metadata.set(Metadata.CONTENT_TYPE, subtype.toString());
        }

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        State state = new State(xhtml, metadata, embeddedDocumentExtractor, df);
        xhtml.startDocument();
        xhtml.startElement(PLIST);
        parseObject(rootObj, state);
        xhtml.endElement(PLIST);
        xhtml.endDocument();
    }

    /**
     * Writes a single plist node, recursing into containers.
     *
     * @param obj   the node to write; {@code null} nodes are skipped
     * @param state shared output handler, extractor and date format
     * @throws SAXException                  on failure writing to the handler
     * @throws IOException                   on failure handling embedded data
     * @throws UnsupportedOperationException if the node is of a type not handled here
     */
    private void parseObject(NSObject obj, State state) throws SAXException, IOException {
        if (obj == null) {
            // Nothing to emit; also avoids a NullPointerException in the fallback branch.
            return;
        }
        if (obj instanceof NSDictionary) {
            parseDict((NSDictionary) obj, state);
        } else if (obj instanceof NSArray) {
            NSArray nsArray = (NSArray) obj;
            state.xhtml.startElement(ARR);
            for (NSObject child : nsArray.getArray()) {
                parseObject(child, state);
            }
            state.xhtml.endElement(ARR);
        } else if (obj instanceof NSString) {
            state.xhtml.startElement(STRING);
            state.xhtml.characters(((NSString) obj).getContent());
            state.xhtml.endElement(STRING);
        } else if (obj instanceof NSNumber) {
            state.xhtml.startElement(NUMBER);
            state.xhtml.characters(((NSNumber) obj).toString());
            state.xhtml.endElement(NUMBER);
        } else if (obj instanceof NSData) {
            state.xhtml.startElement(DATA);
            handleData((NSData) obj, state);
            state.xhtml.endElement(DATA);
        } else if (obj instanceof NSDate) {
            state.xhtml.startElement(DATE);
            String dateString = state.dateFormat.format(((NSDate) obj).getDate());
            state.xhtml.characters(dateString);
            state.xhtml.endElement(DATE);
        } else if (obj instanceof NSSet) {
            // parseSet emits the surrounding "set" element itself (as parseDict does for
            // "dict"); wrapping it here as well would nest two "set" elements.
            parseSet((NSSet) obj, state);
        } else if (obj instanceof UID) {
            //do we want to do anything with obj.getBytes()
            state.xhtml.element(UID, ((UID) obj).getName());
        } else {
            throw new UnsupportedOperationException(
                    "don't yet support this type of object: " + obj.getClass() +
                            " Please open an issue on our tracker");
        }
    }

    /**
     * Writes a set as one {@code set} element containing each member in turn.
     *
     * @param obj   the set to write
     * @param state shared output handler, extractor and date format
     * @throws SAXException on failure writing to the handler
     * @throws IOException  on failure handling embedded data
     */
    private void parseSet(NSSet obj, State state) throws SAXException, IOException {
        state.xhtml.startElement(SET);
        for (NSObject child : obj.allObjects()) {
            parseObject(child, state);
        }
        state.xhtml.endElement(SET);
    }

    /**
     * Writes a dictionary as one {@code dict} element; each entry is written as a
     * {@code key} element followed by the entry's value.
     *
     * @param obj   the dictionary to write
     * @param state shared output handler, extractor and date format
     * @throws SAXException on failure writing to the handler
     * @throws IOException  on failure handling embedded data
     */
    private void parseDict(NSDictionary obj, State state) throws SAXException, IOException {
        state.xhtml.startElement(DICT);
        for (Map.Entry<String, NSObject> mapEntry : obj.getHashMap().entrySet()) {
            state.xhtml.element(KEY, mapEntry.getKey());
            parseObject(mapEntry.getValue(), state);
        }
        state.xhtml.endElement(DICT);
    }

    /**
     * Writes the base64 form of a data node as character content and then, if the embedded
     * document extractor agrees, hands the raw bytes to it as an embedded document.
     *
     * @param value the data node
     * @param state shared output handler, extractor and date format
     * @throws IOException  on failure reading or parsing the embedded bytes
     * @throws SAXException on failure writing to the handler
     */
    private void handleData(NSData value, State state) throws IOException, SAXException {
        state.xhtml.characters(value.getBase64EncodedData());
        // The embedded document starts with empty metadata; nothing is known about it here.
        Metadata embeddedMetadata = new Metadata();
        if (!state.embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
            return;
        }
        try (TikaInputStream tis = TikaInputStream.get(value.bytes())) {
            state.embeddedDocumentExtractor
                    .parseEmbedded(tis, state.xhtml, embeddedMetadata, false);
        }
    }

    /** Bundle of per-parse collaborators passed down the recursive descent. */
    private static class State {
        final XHTMLContentHandler xhtml;
        final Metadata metadata;
        final EmbeddedDocumentExtractor embeddedDocumentExtractor;
        final DateFormat dateFormat;

        public State(XHTMLContentHandler xhtml, Metadata metadata,
                     EmbeddedDocumentExtractor embeddedDocumentExtractor, DateFormat df) {
            this.xhtml = xhtml;
            this.metadata = metadata;
            this.embeddedDocumentExtractor = embeddedDocumentExtractor;
            this.dateFormat = df;
        }
    }
}