blob: d11430c2d5c09291f479464cf8c7281eeb66eabb [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.mail;
import org.apache.commons.io.IOUtils;
import org.apache.james.mime4j.MimeException;
import org.apache.james.mime4j.codec.DecodeMonitor;
import org.apache.james.mime4j.codec.DecoderUtil;
import org.apache.james.mime4j.dom.address.Address;
import org.apache.james.mime4j.dom.address.AddressList;
import org.apache.james.mime4j.dom.address.Mailbox;
import org.apache.james.mime4j.dom.address.MailboxList;
import org.apache.james.mime4j.dom.field.AddressListField;
import org.apache.james.mime4j.dom.field.DateTimeField;
import org.apache.james.mime4j.dom.field.MailboxListField;
import org.apache.james.mime4j.dom.field.ParsedField;
import org.apache.james.mime4j.dom.field.UnstructuredField;
import org.apache.james.mime4j.field.LenientFieldParser;
import org.apache.james.mime4j.message.MaximalBodyDescriptor;
import org.apache.james.mime4j.parser.ContentHandler;
import org.apache.james.mime4j.stream.BodyDescriptor;
import org.apache.james.mime4j.stream.Field;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Message;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.csv.TextAndCSVParser;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.parser.mailcommons.MailUtil;
import org.apache.tika.parser.txt.TXTParser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.text.DateFormat;
import java.text.DateFormatSymbols;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Stack;
import java.util.TimeZone;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static org.apache.tika.utils.DateUtils.MIDDAY;
import static org.apache.tika.utils.DateUtils.UTC;
/**
* Bridge between mime4j's content handler and the generic SAX content handler
* used by Tika. See
* http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/ContentHandler.html
*/
class MailContentHandler implements ContentHandler {
// Mime type string that is compared (case-insensitively) against a multipart's
// mime type to decide whether its children must be buffered and scored.
private static final String MULTIPART_ALTERNATIVE = "multipart/alternative";
//TIKA-1970 Mac Mail's format
// Matches a trailing "UTC" or "GMT" followed by a sign and a one- or two-digit
// hour offset with no minutes (e.g. "GMT+1"); tryOtherDateFormats rewrites
// such a suffix to "GMT+1:00" before parsing.
private static final Pattern GENERAL_TIME_ZONE_NO_MINUTES_PATTERN =
Pattern.compile("(?:UTC|GMT)([+-])(\\d?\\d)\\Z");
//find a time ending in am/pm without a space: 10:30am and
//use this pattern to insert space: 10:30 am
private static final Pattern AM_PM = Pattern.compile("(?i)(\\d)([ap]m)\\b");
// Fallback formats that are tried, in array order, when mime4j could not
// parse the Date header. The entries are shared SimpleDateFormat instances,
// which are not thread-safe; the only reader, tryOtherDateFormats, is
// declared static synchronized.
private static final DateFormat[] ALTERNATE_DATE_FORMATS = new DateFormat[] {
//note that the string is "cleaned" before processing:
//1) condense multiple whitespace to single space
//2) trim()
//3) strip out commas
//4) insert space before am/pm
//May 16 2016 1:32am
createDateFormat("MMM dd yy hh:mm a", null),
//this is a standard pattern handled by mime4j;
//but mime4j fails with leading whitespace
createDateFormat("EEE d MMM yy HH:mm:ss Z", UTC),
createDateFormat("EEE d MMM yy HH:mm:ss z", UTC),
createDateFormat("EEE d MMM yy HH:mm:ss", null),// no timezone
createDateFormat("EEEEE MMM d yy hh:mm a", null),// Sunday, May 15 2016 1:32 PM
//16 May 2016 at 09:30:32 GMT+1 (Mac Mail TIKA-1970)
createDateFormat("d MMM yy 'at' HH:mm:ss z", UTC), // UTC/Zulu
createDateFormat("yy-MM-dd HH:mm:ss", null),
// the remaining formats are created non-lenient (third argument false)
createDateFormat("MM/dd/yy hh:mm a", null, false),
//now dates without times
createDateFormat("MMM d yy", MIDDAY, false),
createDateFormat("EEE d MMM yy", MIDDAY, false),
createDateFormat("d MMM yy", MIDDAY, false),
createDateFormat("yy/MM/dd", MIDDAY, false),
createDateFormat("MM/dd/yy", MIDDAY, false)
};
/**
 * Creates a lenient date format that uses US date symbols.
 *
 * @param format   a {@link SimpleDateFormat} pattern
 * @param timezone time zone to apply, or {@code null} to keep the format's default
 * @return the configured format
 */
private static DateFormat createDateFormat(String format, TimeZone timezone) {
    final boolean lenient = true;
    return createDateFormat(format, timezone, lenient);
}

/**
 * Creates a date format that uses US date symbols.
 *
 * @param format    a {@link SimpleDateFormat} pattern
 * @param timezone  time zone to apply, or {@code null} to keep the format's default
 * @param isLenient whether the returned format parses leniently
 * @return the configured format
 */
private static DateFormat createDateFormat(String format, TimeZone timezone, boolean isLenient) {
    final DateFormatSymbols usSymbols = new DateFormatSymbols(Locale.US);
    final SimpleDateFormat dateFormat = new SimpleDateFormat(format, usSymbols);
    dateFormat.setLenient(isLenient);
    if (timezone != null) {
        dateFormat.setTimeZone(timezone);
    }
    return dateFormat;
}
// SAX sink that receives the XHTML events produced while handling the message
private final XHTMLContentHandler handler;
// metadata of the whole message; header fields are written here by field()
private final Metadata metadata;
// context passed on to the parsers used for inline body parts
private final ParseContext parseContext;
// when true, RuntimeExceptions raised while handling a header field are rethrown
// instead of being swallowed
private boolean strictParsing = false;
// when true, no multipart/alternative buffering is done and body parts are
// passed straight to the embedded document extractor
private final boolean extractAllAlternatives;
// receives the parts that are treated as embedded documents
private final EmbeddedDocumentExtractor extractor;
// used to detect the media type of a body part that declares no content type
private final Detector detector;
//this is used to buffer a multipart body that
//keeps track of multipart/alternative and its children
private Stack<Part> alternativePartBuffer = new Stack<>();
// descriptors of the multiparts that are currently open, innermost on top
private Stack<BodyDescriptor> parts = new Stack<>();
/**
 * Creates a handler that writes the message content to {@code xhtml} and the
 * message headers to {@code metadata}.
 *
 * @param xhtml                  sink for the generated XHTML events
 * @param detector               detector used for body parts without a declared content type
 * @param metadata               metadata object of the whole message
 * @param context                parse context; also used to look up the
 *                               {@link EmbeddedDocumentExtractor} for attachments
 * @param strictParsing          whether runtime failures while handling a header are rethrown
 * @param extractAllAlternatives whether every alternative of a multipart/alternative
 *                               is extracted instead of only the best one
 */
MailContentHandler(XHTMLContentHandler xhtml, Detector detector, Metadata metadata,
                   ParseContext context, boolean strictParsing, boolean extractAllAlternatives) {
    this.handler = xhtml;
    this.detector = detector;
    this.metadata = metadata;
    this.parseContext = context;
    this.strictParsing = strictParsing;
    this.extractAllAlternatives = extractAllAlternatives;
    // the extractor that processes parts/attachments is obtained via the parse context
    this.extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
}
/**
 * Receives the content of one leaf (non-multipart) entity.
 * <p>
 * A fresh {@link Metadata} object is filled for the part (content type, charset,
 * subtype/boundary of the enclosing multipart, content disposition) and the part
 * is then routed one of three ways:
 * <ul>
 * <li>if alternatives are not all extracted and a multipart/alternative block is
 * currently being buffered, the bytes are stored as a child of the innermost
 * buffered part so that the best alternative can be chosen in
 * {@code endMultipart()};</li>
 * <li>if alternatives are not all extracted and fewer than two multiparts are
 * open, a part that is (or is detected as) text is handled as an inline body
 * part, anything else as an embedded document;</li>
 * <li>otherwise the stream is handed directly to the embedded document handling.</li>
 * </ul>
 *
 * @param body descriptor of the entity
 * @param is   stream over the entity's content as supplied by mime4j
 * @throws MimeException if handling the part fails with a SAX or parse error
 * @throws IOException   if the content cannot be read
 */
@Override
public void body(BodyDescriptor body, InputStream is) throws MimeException,
        IOException {
    // use a different metadata object
    // in order to specify the mime type of the
    // sub part without damaging the main metadata
    Metadata submd = new Metadata();
    submd.set(Metadata.CONTENT_TYPE, body.getMimeType());
    submd.set(Metadata.CONTENT_ENCODING, body.getCharset());
    // TIKA-2455: flag the containing type.
    if (parts.size() > 0) {
        submd.set(Message.MULTIPART_SUBTYPE, parts.peek().getSubType());
        submd.set(Message.MULTIPART_BOUNDARY, parts.peek().getBoundary());
    }
    if (body instanceof MaximalBodyDescriptor) {
        addContentDisposition((MaximalBodyDescriptor) body, submd);
    }
    //if we're in a multipart/alternative or any one of its children
    //add the bodypart to the latest that was added
    if (!extractAllAlternatives && alternativePartBuffer.size() > 0) {
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        IOUtils.copy(is, bos);
        alternativePartBuffer.peek().children.add(new BodyContents(submd, bos.toByteArray()));
    } else if (!extractAllAlternatives && parts.size() < 2) {
        //if you're at the first level of embedding
        //and you're not in an alternative part block
        //and you're text/html, put that in the body of the email
        //otherwise treat as a regular attachment
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        IOUtils.copy(is, bos);
        byte[] bytes = bos.toByteArray();
        if (detectTextOrHtml(submd, bytes)) {
            // reuse the array that was already materialized instead of
            // copying the buffer a second time
            handleInlineBodyPart(new BodyContents(submd, bytes));
        } else {
            //else handle as you would any other embedded content
            try (TikaInputStream tis = TikaInputStream.get(bytes)) {
                handleEmbedded(tis, submd);
            }
        }
    } else {
        //else handle as you would any other embedded content
        try (TikaInputStream tis = TikaInputStream.get(is)) {
            handleEmbedded(tis, submd);
        }
    }
}

/**
 * Copies the content disposition of {@code body} into {@code submd}: the
 * disposition type followed by its parameters (as {@code ; key="value"}) is
 * stored under {@link Metadata#CONTENT_DISPOSITION}, and the disposition file
 * name, when present, under {@link TikaCoreProperties#RESOURCE_NAME_KEY}.
 * Nothing is written when the disposition type is missing or empty.
 */
private static void addContentDisposition(MaximalBodyDescriptor body, Metadata submd) {
    String contentDispositionType = body.getContentDispositionType();
    if (contentDispositionType == null || contentDispositionType.isEmpty()) {
        return;
    }
    StringBuilder contentDisposition = new StringBuilder(contentDispositionType);
    Map<String, String> contentDispositionParameters = body.getContentDispositionParameters();
    for (Entry<String, String> param : contentDispositionParameters.entrySet()) {
        contentDisposition.append("; ")
                .append(param.getKey()).append("=\"").append(param.getValue()).append('"');
    }
    String contentDispositionFileName = body.getContentDispositionFilename();
    if (contentDispositionFileName != null) {
        submd.set(TikaCoreProperties.RESOURCE_NAME_KEY, contentDispositionFileName);
    }
    submd.set(Metadata.CONTENT_DISPOSITION, contentDisposition.toString());
}
/**
 * Decides whether a body part should be treated as text.
 * <p>
 * A declared content type wins: the answer is whether it starts with
 * {@code "text"}. Only when no content type is declared is the detector run
 * on the bytes; the detected type is then recorded as the parser override so
 * that detection happens only once.
 *
 * @param submd metadata of the body part; may receive the detected type
 * @param bytes content of the body part
 * @return {@code true} if the declared or detected type starts with {@code "text"}
 */
private boolean detectTextOrHtml(Metadata submd, byte[] bytes) {
    String declaredType = submd.get(Metadata.CONTENT_TYPE);
    if (declaredType != null) {
        return declaredType.startsWith("text");
    }
    try (TikaInputStream tis = TikaInputStream.get(bytes)) {
        MediaType detected = detector.detect(tis, submd);
        if (detected == null) {
            return false;
        }
        String detectedType = detected.toString();
        //detect only once
        submd.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE, detectedType);
        return detectedType.startsWith("text");
    } catch (IOException ignored) {
        // detection is best effort: a failure means "not text"
        return false;
    }
}
/**
 * Passes a body part to the embedded document extractor.
 * <p>
 * The part is tagged as INLINE when its content disposition contains
 * "inline" (case-insensitively) and as ATTACHMENT otherwise; it is parsed only
 * if the extractor agrees to parse it.
 *
 * @param tis      content of the part
 * @param metadata metadata of the part; receives the embedded resource type
 * @throws MimeException wrapping any {@link SAXException} raised by the extractor
 * @throws IOException   if the extractor fails to read the content
 */
private void handleEmbedded(TikaInputStream tis, Metadata metadata) throws MimeException, IOException {
    String disposition = metadata.get(Metadata.CONTENT_DISPOSITION);
    boolean isInline = disposition != null
            && disposition.toLowerCase(Locale.US).contains("inline");
    String resourceType = isInline
            ? TikaCoreProperties.EmbeddedResourceType.INLINE.toString()
            : TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString();
    metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, resourceType);
    try {
        if (!extractor.shouldParseEmbedded(metadata)) {
            return;
        }
        extractor.parseEmbedded(tis, handler, metadata, false);
    } catch (SAXException e) {
        throw new MimeException(e);
    }
}
/**
 * Closes the {@code <p>} and {@code <div>} that were opened for a body part,
 * unless a multipart/alternative block is being buffered, in which case
 * nothing is written.
 *
 * @throws MimeException wrapping any {@link SAXException} from the handler
 */
@Override
public void endBodyPart() throws MimeException {
    if (alternativePartBuffer.isEmpty()) {
        try {
            handler.endElement("p");
            handler.endElement("div");
        } catch (SAXException e) {
            throw new MimeException(e);
        }
    }
}
@Override
public void endHeader() throws MimeException {
// intentionally empty: this handler does nothing at the end of a header block
}
@Override
public void startMessage() throws MimeException {
// intentionally empty: this handler does nothing at the start of a message
}
@Override
public void endMessage() throws MimeException {
// intentionally empty: this handler does nothing at the end of a message
}
/**
 * Closes the innermost open multipart.
 * <p>
 * If parts are being buffered for a multipart/alternative block, the innermost
 * buffered part is popped; when that part was the root of the buffer, the best
 * alternative(s) below it are emitted. Afterwards the matching descriptor is
 * removed from the stack of open multiparts.
 *
 * @throws MimeException if emitting the buffered parts fails
 */
@Override
public void endMultipart() throws MimeException {
    if (!alternativePartBuffer.isEmpty()) {
        Part finished = alternativePartBuffer.pop();
        // an empty buffer after the pop means the alternative root was just closed
        if (alternativePartBuffer.isEmpty()) {
            try {
                handleBestParts(finished);
            } catch (IOException e) {
                throw new MimeException(e);
            }
        }
    }
    //an empty stack here means there's a problem with the file,
    //e.g. more endMultipart than startMultipart;
    //we're currently silently swallowing this
    if (!parts.isEmpty()) {
        parts.pop();
    }
}
@Override
public void epilogue(InputStream is) throws MimeException, IOException {
// intentionally empty: the epilogue stream is not read
}
/**
 * Receives one header field and copies it into the message metadata.
 * <p>
 * Only headers of the whole message are recorded; fields that arrive while a
 * multipart is open are skipped. From, Subject, To, Cc, Bcc, Content-Type and
 * Date are mapped to dedicated metadata keys; every other header is stored
 * under the raw-header prefix followed by the header name. A Content-Type that
 * cannot be parsed into a media type is treated as non-multipart and stored as
 * a raw header.
 * <p>
 * Runtime exceptions raised while handling a field are swallowed unless strict
 * parsing is enabled, in which case they are rethrown.
 *
 * @param field the header field as reported by mime4j
 * @throws MimeException declared by the callback contract
 * @see <a href="http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/ContentHandler.html">
 * mime4j ContentHandler</a>
 */
@Override
public void field(Field field) throws MimeException {
    // if we're in a part, skip.
    // We want to gather only the metadata for the whole msg.
    if (parts.size() > 0) {
        return;
    }
    try {
        String fieldname = field.getName();
        ParsedField parsedField = LenientFieldParser.getParser().parse(
                field, DecodeMonitor.SILENT);
        if (fieldname.equalsIgnoreCase("From")) {
            MailboxListField fromField = (MailboxListField) parsedField;
            MailboxList mailboxList = fromField.getMailboxList();
            if (fromField.isValidField() && mailboxList != null) {
                for (Address address : mailboxList) {
                    String from = getDisplayString(address);
                    MailUtil.setPersonAndEmail(from, Message.MESSAGE_FROM_NAME,
                            Message.MESSAGE_FROM_EMAIL, metadata);
                    metadata.add(Metadata.MESSAGE_FROM, from);
                    metadata.add(TikaCoreProperties.CREATOR, from);
                }
            } else {
                // the field could not be parsed: fall back to the raw header text
                String from = stripOutFieldPrefix(field, "From:");
                MailUtil.setPersonAndEmail(from, Message.MESSAGE_FROM_NAME,
                        Message.MESSAGE_FROM_EMAIL, metadata);
                if (from.startsWith("<")) {
                    from = from.substring(1);
                }
                if (from.endsWith(">")) {
                    from = from.substring(0, from.length() - 1);
                }
                metadata.add(Metadata.MESSAGE_FROM, from);
                metadata.add(TikaCoreProperties.CREATOR, from);
            }
        } else if (fieldname.equalsIgnoreCase("Subject")) {
            String subject = ((UnstructuredField) parsedField).getValue();
            metadata.set(TikaCoreProperties.TITLE, subject);
            metadata.set(TikaCoreProperties.SUBJECT, subject);
        } else if (fieldname.equalsIgnoreCase("To")) {
            processAddressList(parsedField, "To:", Metadata.MESSAGE_TO);
        } else if (fieldname.equalsIgnoreCase("CC")) {
            processAddressList(parsedField, "Cc:", Metadata.MESSAGE_CC);
        } else if (fieldname.equalsIgnoreCase("BCC")) {
            processAddressList(parsedField, "Bcc:", Metadata.MESSAGE_BCC);
        } else if (fieldname.equalsIgnoreCase("Content-Type")) {
            // MediaType.parse returns null for a value it cannot parse; such a
            // header is recorded as a raw header instead of failing with an NPE
            final MediaType contentType = MediaType.parse(parsedField.getBody());
            if (contentType != null && contentType.getType().equalsIgnoreCase("multipart")) {
                metadata.set(Message.MULTIPART_SUBTYPE, contentType.getSubtype());
                metadata.set(Message.MULTIPART_BOUNDARY, contentType.getParameters().get("boundary"));
            } else {
                metadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX + parsedField.getName(),
                        field.getBody());
            }
        } else if (fieldname.equalsIgnoreCase("Date")) {
            DateTimeField dateField = (DateTimeField) parsedField;
            Date date = dateField.getDate();
            if (date == null) {
                // mime4j could not parse the date: try the alternate formats
                date = tryOtherDateFormats(field.getBody());
            }
            metadata.set(TikaCoreProperties.CREATED, date);
        } else {
            metadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX + parsedField.getName(),
                    field.getBody());
        }
    } catch (RuntimeException me) {
        if (strictParsing) {
            throw me;
        }
    }
}
/**
 * Tries to parse a date string with the alternate date formats, in order.
 * <p>
 * Before parsing, the text is cleaned: runs of whitespace are collapsed to a
 * single space, the text is trimmed, commas are removed, a trailing
 * {@code UTC+h}/{@code GMT+h} zone without minutes is rewritten to
 * {@code GMT+h:00}, and a space is inserted before the first am/pm marker that
 * directly follows a digit.
 *
 * @param text the raw date text, may be {@code null}
 * @return the first successfully parsed date, or {@code null} if {@code text}
 * is {@code null} or no format matches
 */
private static synchronized Date tryOtherDateFormats(String text) {
    if (text == null) {
        return null;
    }
    String cleaned = text.replaceAll("\\s+", " ").trim();
    //strip out commas
    cleaned = cleaned.replace(",", "");
    // replaceFirst leaves the text unchanged when the pattern does not match
    cleaned = GENERAL_TIME_ZONE_NO_MINUTES_PATTERN.matcher(cleaned).replaceFirst("GMT$1$2:00");
    cleaned = AM_PM.matcher(cleaned).replaceFirst("$1 $2");
    for (int i = 0; i < ALTERNATE_DATE_FORMATS.length; i++) {
        try {
            return ALTERNATE_DATE_FORMATS[i].parse(cleaned);
        } catch (ParseException ignored) {
            // this format does not match; try the next one
        }
    }
    return null;
}
/**
 * Adds every address of an address-list header (To/Cc/Bcc) to the metadata.
 * <p>
 * When the field is valid and yields an address list, each address is added
 * in its display form. Otherwise the raw header text after the prefix is split
 * on commas and every non-blank, trimmed piece is added. The
 * address list is null-checked in the same way the From header's mailbox list
 * is, so a missing list falls back to the raw text instead of failing.
 *
 * @param field           the parsed field; must be an {@link AddressListField}
 * @param addressListType the header prefix to strip from the raw text, e.g. {@code "To:"}
 * @param metadataField   the metadata key that receives the addresses
 * @throws MimeException declared for callers; not thrown by the code in this method
 */
private void processAddressList(ParsedField field, String addressListType,
                                String metadataField) throws MimeException {
    AddressListField addressField = (AddressListField) field;
    AddressList addressList = addressField.isValidField() ? addressField.getAddressList() : null;
    if (addressList != null) {
        for (Address address : addressList) {
            metadata.add(metadataField, getDisplayString(address));
        }
        return;
    }
    String rawAddresses = stripOutFieldPrefix(field, addressListType);
    for (String rawAddress : rawAddresses.split(",")) {
        String trimmed = rawAddress.trim();
        // skip blank pieces, e.g. from an empty header or doubled commas
        if (!trimmed.isEmpty()) {
            metadata.add(metadataField, trimmed);
        }
    }
}
/**
 * Renders an address for the metadata.
 * <p>
 * A mailbox with a non-empty name becomes {@code name <address>}, with encoded
 * words in the name decoded; a mailbox without a name becomes just its
 * address; any other kind of address is rendered with {@code toString()}.
 *
 * @param address the address to render
 * @return the display string
 */
private String getDisplayString(Address address) {
    if (!(address instanceof Mailbox)) {
        return address.toString();
    }
    Mailbox mailbox = (Mailbox) address;
    String rawName = mailbox.getName();
    if (rawName == null || rawName.isEmpty()) {
        return mailbox.getAddress();
    }
    String decodedName = DecoderUtil.decodeEncodedWords(rawName, DecodeMonitor.SILENT);
    return decodedName + " <" + mailbox.getAddress() + ">";
}
@Override
public void preamble(InputStream is) throws MimeException, IOException {
// intentionally empty: the preamble stream is not read
}
@Override
public void raw(InputStream is) throws MimeException, IOException {
// intentionally empty: the raw stream is not read
}
/**
 * Opens a {@code <div class="email-entry">} and a {@code <p>} for a body part,
 * unless a multipart/alternative block is being buffered, in which case
 * nothing is written.
 *
 * @throws MimeException wrapping any {@link SAXException} from the handler
 */
@Override
public void startBodyPart() throws MimeException {
    if (alternativePartBuffer.isEmpty()) {
        try {
            handler.startElement("div", "class", "email-entry");
            handler.startElement("p");
        } catch (SAXException e) {
            throw new MimeException(e);
        }
    }
}
@Override
public void startHeader() throws MimeException {
// intentionally empty: this handler does nothing at the start of a header block
}
/**
 * Records the start of a multipart.
 * <p>
 * The descriptor is always pushed onto the stack of open multiparts. Unless
 * all alternatives are extracted, a multipart/alternative that starts while
 * nothing is buffered becomes the root of the alternative buffer, and any
 * multipart that starts while buffering is active is pushed onto the buffer
 * and attached as a child of the previously innermost buffered part.
 *
 * @param descr descriptor of the multipart that starts
 * @throws MimeException declared by the callback contract
 */
@Override
public void startMultipart(BodyDescriptor descr) throws MimeException {
    parts.push(descr);
    if (extractAllAlternatives) {
        return;
    }
    if (alternativePartBuffer.isEmpty()) {
        // only a multipart/alternative can start the buffering
        if (MULTIPART_ALTERNATIVE.equalsIgnoreCase(descr.getMimeType())) {
            alternativePartBuffer.push(new Part(descr));
        }
        return;
    }
    //add the part to the stack and hook it up to its parent
    Part enclosing = alternativePartBuffer.peek();
    Part nested = new Part(descr);
    alternativePartBuffer.push(nested);
    if (enclosing != null) {
        enclosing.children.add(nested);
    }
}
/**
 * Returns the raw text of a header with its prefix and the spaces that follow
 * the prefix removed.
 * <p>
 * The prefix is removed by length only: the first {@code fieldname.length()}
 * characters of the raw text are dropped without being compared to
 * {@code fieldname}. If the raw text ends right after the prefix or contains
 * only spaces after it, an empty string is returned instead of running past
 * the end of the text.
 *
 * @param field     the header field whose raw text is read
 * @param fieldname the prefix to skip, e.g. {@code "From:"}
 * @return the remaining raw text, possibly empty
 */
private String stripOutFieldPrefix(Field field, String fieldname) {
    String raw = field.getRaw().toString();
    int length = raw.length();
    // never start beyond the end of the raw text
    int loc = Math.min(fieldname.length(), length);
    while (loc < length && raw.charAt(loc) == ' ') {
        loc++;
    }
    return raw.substring(loc);
}
/**
 * Emits the buffered content below {@code part}.
 * <p>
 * A buffered body is handled as an inline body part. For a
 * multipart/alternative only the highest-scoring child is followed (the first
 * one wins on ties); for any other multipart every child is followed in order.
 *
 * @param part root of the buffered subtree; {@code null} is ignored
 * @throws MimeException if handling a body fails
 * @throws IOException   if reading a body fails
 */
private void handleBestParts(Part part) throws MimeException, IOException {
    if (part == null) {
        return;
    }
    if (part instanceof BodyContents) {
        handleInlineBodyPart((BodyContents) part);
        return;
    }
    boolean isAlternative =
            MULTIPART_ALTERNATIVE.equalsIgnoreCase(part.bodyDescriptor.getMimeType());
    if (!isAlternative) {
        for (Part child : part.children) {
            handleBestParts(child);
        }
        return;
    }
    Part winner = null;
    int winningScore = -1;
    for (Part candidate : part.children) {
        int candidateScore = score(candidate);
        if (candidateScore > winningScore) {
            winningScore = candidateScore;
            winner = candidate;
        }
    }
    handleBestParts(winner);
}
/**
 * Writes a buffered body into the main document.
 * <p>
 * For text/html an existing {@link HtmlParser} is looked up; for text/plain an
 * existing {@link TXTParser} and, failing that, a {@link TextAndCSVParser}
 * (which is then told via the parser override that the content is text/plain).
 * If no parser is found, the body is handled as an embedded document instead.
 *
 * @param part the buffered body and its metadata
 * @throws MimeException wrapping any {@link SAXException} or {@link TikaException}
 * @throws IOException   if reading the body fails
 */
private void handleInlineBodyPart(BodyContents part) throws MimeException, IOException {
    String contentType = part.metadata.get(Metadata.CONTENT_TYPE);
    Parser inlineParser = null;
    boolean overrideAsPlainText = false;
    if (MediaType.TEXT_HTML.toString().equalsIgnoreCase(contentType)) {
        inlineParser =
                EmbeddedDocumentUtil.tryToFindExistingLeafParser(HtmlParser.class, parseContext);
    } else if (MediaType.TEXT_PLAIN.toString().equalsIgnoreCase(contentType)) {
        inlineParser =
                EmbeddedDocumentUtil.tryToFindExistingLeafParser(TXTParser.class, parseContext);
        if (inlineParser == null) {
            inlineParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(
                    TextAndCSVParser.class, parseContext);
            overrideAsPlainText = true;
        }
    }
    if (inlineParser == null) {
        //back off and treat it as an embedded chunk
        try (TikaInputStream tis = TikaInputStream.get(part.bytes)) {
            handleEmbedded(tis, part.metadata);
        }
        return;
    }
    //parse inline
    Metadata inlineMetadata = new Metadata();
    if (overrideAsPlainText) {
        inlineMetadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE,
                MediaType.TEXT_PLAIN.toString());
    }
    try {
        inlineParser.parse(
                new ByteArrayInputStream(part.bytes),
                new EmbeddedContentHandler(new BodyContentHandler(handler)),
                inlineMetadata, parseContext
        );
    } catch (SAXException | TikaException e) {
        throw new MimeException(e);
    }
}
/**
 * Ranks a buffered part for the multipart/alternative selection; higher wins.
 * <p>
 * {@code null} and bodies without a content type score 0, text/plain 1,
 * application/rtf 2, text/html 3, and everything else (other body types and
 * nested multiparts) 4.
 *
 * @param part the candidate, may be {@code null}
 * @return the score of the candidate
 */
private int score(Part part) {
    if (part == null) {
        return 0;
    }
    if (!(part instanceof BodyContents)) {
        return 4;
    }
    String contentType = ((BodyContents) part).metadata.get(Metadata.CONTENT_TYPE);
    if (contentType == null) {
        return 0;
    }
    if (contentType.equalsIgnoreCase(MediaType.TEXT_PLAIN.toString())) {
        return 1;
    }
    //TODO -- is this the right definition in rfc822 for rich text?!
    if (contentType.equalsIgnoreCase("application/rtf")) {
        return 2;
    }
    if (contentType.equalsIgnoreCase(MediaType.TEXT_HTML.toString())) {
        return 3;
    }
    return 4;
}
/**
 * Node of the tree that buffers a multipart/alternative block: a multipart
 * descriptor together with the parts nested below it.
 */
private static class Part {
    private final BodyDescriptor bodyDescriptor;
    private final List<Part> children;

    public Part(BodyDescriptor bodyDescriptor) {
        this.children = new ArrayList<>();
        this.bodyDescriptor = bodyDescriptor;
    }
}
/**
 * Leaf of the buffered tree: the bytes of one body together with the metadata
 * collected for it. It carries no body descriptor.
 */
private static class BodyContents extends Part {
    private final byte[] bytes;
    private final Metadata metadata;

    private BodyContents(Metadata metadata, byte[] bytes) {
        super(null);
        this.bytes = bytes;
        this.metadata = metadata;
    }
}
}