blob: 1a6410cf0207c26082370ff5134b0edbdf959908 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.mail;
import static java.nio.charset.StandardCharsets.US_ASCII;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import static org.mockito.Matchers.any;
import static org.mockito.Matchers.eq;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.never;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.text.DateFormat;
import java.text.DateFormatSymbols;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.TimeZone;

import org.apache.james.mime4j.stream.MimeConfig;
import org.junit.BeforeClass;
import org.junit.Test;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.helpers.DefaultHandler;

import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.ContainerExtractor;
import org.apache.tika.extractor.ParserContainerExtractor;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Message;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ocr.TesseractOCRParserTest;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
/**
 * Tests for {@link RFC822Parser}.
 * <p>
 * Most tests parse through an {@link AutoDetectParser} built from
 * {@code tika-config-extract-all-alternatives.xml}; tests that call
 * {@code getXML(...)} / {@code getRecursiveMetadata(...)} without an explicit
 * parser use whatever default the {@link TikaTest} helpers supply.
 */
public class RFC822ParserTest extends TikaTest {

    //legacy RFC822 behavior...extract every alternative part
    private static Parser EXTRACT_ALL_ALTERNATIVES_PARSER;

    private static TikaConfig TIKA_CONFIG;

    /**
     * Opens a classpath resource through the context class loader.
     * Fails the calling test if the resource cannot be found.
     *
     * @param name classpath-relative resource name
     * @return an open stream; the caller is responsible for closing it
     */
    private static InputStream getStream(String name) {
        InputStream stream =
                Thread.currentThread().getContextClassLoader().getResourceAsStream(name);
        assertNotNull("Test file not found " + name, stream);
        return stream;
    }

    /**
     * Loads the "extract all alternatives" config once and builds the shared parser from it.
     */
    @BeforeClass
    public static void setUp() throws Exception {
        try (InputStream is = getStream(
                "org/apache/tika/parser/mail/tika-config-extract-all-alternatives.xml")) {
            TIKA_CONFIG = new TikaConfig(is);
        }
        EXTRACT_ALL_ALTERNATIVES_PARSER = new AutoDetectParser(TIKA_CONFIG);
    }

    /**
     * A plain single-part message: one {@code p}, no {@code div}, and the
     * creator/title/subject metadata taken from the headers.
     */
    @Test
    public void testSimple() throws Exception {
        Metadata metadata = new Metadata();
        ContentHandler handler = mock(DefaultHandler.class);
        try (InputStream stream = getStream("test-documents/testRFC822")) {
            EXTRACT_ALL_ALTERNATIVES_PARSER.parse(stream, handler, metadata, new ParseContext());
        }

        verify(handler).startDocument();
        //just one body
        verify(handler).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"),
                any(Attributes.class));
        verify(handler).endElement(XHTMLContentHandler.XHTML, "p", "p");
        //no multi-part body parts
        verify(handler, never())
                .startElement(eq(XHTMLContentHandler.XHTML), eq("div"), eq("div"),
                        any(Attributes.class));
        verify(handler, never()).endElement(XHTMLContentHandler.XHTML, "div", "div");
        verify(handler).endDocument();

        //note no leading spaces, and no quotes
        assertEquals("Julien Nioche (JIRA) <jira@apache.org>",
                metadata.get(TikaCoreProperties.CREATOR));
        assertEquals("[jira] Commented: (TIKA-461) RFC822 messages not parsed",
                metadata.get(TikaCoreProperties.TITLE));
        assertEquals("[jira] Commented: (TIKA-461) RFC822 messages not parsed",
                metadata.get(TikaCoreProperties.SUBJECT));
    }

    /**
     * Checks the split-out sender name and sender e-mail metadata for several sample messages.
     */
    @Test
    public void testExtendedToFromMetadata() throws Exception {
        Metadata m = getXML("testRFC822").metadata;
        assertEquals("Julien Nioche (JIRA)", m.get(Message.MESSAGE_FROM_NAME));
        assertEquals("jira@apache.org", m.get(Message.MESSAGE_FROM_EMAIL));

        m = getXML("testRFC822-multipart").metadata;
        assertEquals("DigitalPebble", m.get(Message.MESSAGE_FROM_NAME));
        assertEquals("julien@digitalpebble.com", m.get(Message.MESSAGE_FROM_EMAIL));

        m = getXML("testRFC822_quoted").metadata;
        assertEquals("Another Person", m.get(Message.MESSAGE_FROM_NAME));
        assertEquals("another.person@another-example.com", m.get(Message.MESSAGE_FROM_EMAIL));

        m = getXML("testRFC822_i18nheaders").metadata;
        assertEquals("Keld Jørn Simonsen", m.get(Message.MESSAGE_FROM_NAME));
        assertEquals("keld@dkuug.dk", m.get(Message.MESSAGE_FROM_EMAIL));

        //this is currently detected as mbox!!!
        m = getXML("testEmailWithPNGAtt.eml", new RFC822Parser()).metadata;
        assertEquals("Tika Test", m.get(Message.MESSAGE_FROM_NAME));
        assertEquals("XXXX@apache.org", m.get(Message.MESSAGE_FROM_EMAIL));
    }

    /**
     * Parses the multipart sample twice: first against a mock handler to count
     * SAX events, then against a {@link BodyContentHandler} to inspect the text.
     */
    @Test
    public void testMultipart() throws Exception {
        final int divExpectedTimes = 4;
        final int paragraphExpectedTimes = 5;

        ParseContext context = new ParseContext();
        context.set(Parser.class, EXTRACT_ALL_ALTERNATIVES_PARSER);

        ContentHandler mockHandler = mock(XHTMLContentHandler.class);
        try (InputStream stream = getStream("test-documents/testRFC822-multipart")) {
            EXTRACT_ALL_ALTERNATIVES_PARSER.parse(stream, mockHandler, new Metadata(), context);
        }
        verify(mockHandler).startDocument();
        // TIKA-1422. TesseractOCRParser interferes with the
        // number of times the handler is invoked. But, different versions of Tesseract lead
        // to a different number of invocations. So, we
        // only verify the handler if Tesseract cannot run.
        if (!TesseractOCRParserTest.canRun()) {
            verify(mockHandler, times(divExpectedTimes))
                    .startElement(eq(XHTMLContentHandler.XHTML), eq("div"), eq("div"),
                            any(Attributes.class));
            verify(mockHandler, times(divExpectedTimes))
                    .endElement(XHTMLContentHandler.XHTML, "div", "div");
        }
        verify(mockHandler, times(paragraphExpectedTimes))
                .startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"),
                        any(Attributes.class));
        verify(mockHandler, times(paragraphExpectedTimes))
                .endElement(XHTMLContentHandler.XHTML, "p", "p");
        verify(mockHandler).endDocument();

        //repeat, this time looking at content
        ContentHandler bodyHandler = new BodyContentHandler();
        try (InputStream stream = getStream("test-documents/testRFC822-multipart")) {
            EXTRACT_ALL_ALTERNATIVES_PARSER.parse(stream, bodyHandler, new Metadata(), context);
        }
        //tests correct decoding of quoted printable text, including UTF-8 bytes into Unicode
        String bodyText = bodyHandler.toString();
        assertTrue(bodyText.contains("body 1"));
        assertTrue(bodyText.contains("body 2"));
        assertFalse(bodyText.contains("R0lGODlhNgE8AMQAA")); //part of encoded gif
    }

    /**
     * Quoted-printable body text must come out decoded, with no escape sequences left.
     */
    @Test
    public void testQuotedPrintable() throws Exception {
        ContentHandler handler = new BodyContentHandler();
        try (InputStream stream = getStream("test-documents/testRFC822_quoted")) {
            EXTRACT_ALL_ALTERNATIVES_PARSER
                    .parse(stream, handler, new Metadata(), new ParseContext());
        }
        //tests correct decoding of quoted printable text, including UTF-8 bytes into Unicode
        String bodyText = handler.toString();
        assertTrue(bodyText.contains("D\u00FCsseldorf has non-ascii."));
        assertTrue(bodyText.contains("Lines can be split like this."));
        assertTrue(bodyText.contains("Spaces at the end of a line \r\nmust be encoded.\r\n"));
        assertFalse(bodyText.contains("=")); //there should be no escape sequences
    }

    /**
     * Base64-encoded body text must come out decoded.
     */
    @Test
    public void testBase64() throws Exception {
        Metadata metadata = new Metadata();
        //need to pass in hint. Autodetects text/plain
        metadata.set(Metadata.CONTENT_TYPE, "message/rfc822");

        ParseContext context = new ParseContext();
        context.set(Parser.class, EXTRACT_ALL_ALTERNATIVES_PARSER);

        ContentHandler handler = new BodyContentHandler();
        try (InputStream stream = getStream("test-documents/testRFC822_base64")) {
            EXTRACT_ALL_ALTERNATIVES_PARSER.parse(stream, handler, metadata, context);
        }
        //tests correct decoding of base64 text, including ISO-8859-1 bytes into Unicode
        assertContains("Here is some text, with international characters, voil\u00E0!",
                handler.toString());
    }

    /**
     * Internationalized headers, both quoted-printable (Q) and Base64 (B), must be decoded.
     */
    @Test
    public void testI18NHeaders() throws Exception {
        Metadata metadata = new Metadata();
        ContentHandler handler = mock(DefaultHandler.class);
        try (InputStream stream = getStream("test-documents/testRFC822_i18nheaders")) {
            EXTRACT_ALL_ALTERNATIVES_PARSER.parse(stream, handler, metadata, new ParseContext());
        }
        assertEquals("Keld J\u00F8rn Simonsen <keld@dkuug.dk>",
                metadata.get(TikaCoreProperties.CREATOR));
        assertEquals("If you can read this you understand the example.",
                metadata.get(TikaCoreProperties.TITLE));
        assertEquals("If you can read this you understand the example.",
                metadata.get(TikaCoreProperties.SUBJECT));
    }

    /**
     * The from isn't in the usual form.
     * See TIKA-618
     */
    @Test
    public void testUnusualFromAddress() throws Exception {
        Metadata metadata = new Metadata();
        ContentHandler handler = mock(DefaultHandler.class);
        try (InputStream stream = getStream("test-documents/testRFC822_oddfrom")) {
            EXTRACT_ALL_ALTERNATIVES_PARSER.parse(stream, handler, metadata, new ParseContext());
        }
        assertEquals("Saved by Windows Internet Explorer 7",
                metadata.get(TikaCoreProperties.CREATOR));
        assertEquals("Air Permit Programs | Air & Radiation | US EPA",
                metadata.get(TikaCoreProperties.TITLE));
        assertEquals("Air Permit Programs | Air & Radiation | US EPA",
                metadata.get(TikaCoreProperties.SUBJECT));
    }

    /**
     * Test for TIKA-640, increase header max beyond 10k bytes
     */
    @Test
    public void testLongHeader() throws Exception {
        StringBuilder inputBuilder = new StringBuilder();
        for (int i = 0; i < 2000; ++i) {
            inputBuilder.append( //len > 50
                    "really really really really really really long name ");
        }
        String name = inputBuilder.toString();
        byte[] data = ("Status: 520\r\nFrom: " + name + "\r\n\r\n").getBytes(US_ASCII);

        ContentHandler handler = new DefaultHandler();
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();

        try {
            EXTRACT_ALL_ALTERNATIVES_PARSER
                    .parse(new ByteArrayInputStream(data), handler, metadata, context);
            fail("expected a TikaException when no MimeConfig lifts the header limits");
        } catch (TikaException expected) {
            //expected: this first parse runs without a MimeConfig in the context
        }

        //-1 is passed for both limits; the same input must now parse
        MimeConfig config = new MimeConfig.Builder().setMaxHeaderLen(-1).setMaxLineLen(-1).build();
        context.set(MimeConfig.class, config);
        EXTRACT_ALL_ALTERNATIVES_PARSER
                .parse(new ByteArrayInputStream(data), handler, metadata, context);
        assertEquals(name.trim(), metadata.get(TikaCoreProperties.CREATOR));
    }

    /**
     * Test for TIKA-678 - not all headers may be present
     */
    @Test
    public void testSomeMissingHeaders() throws Exception {
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        ParseContext context = new ParseContext();
        context.set(Parser.class, EXTRACT_ALL_ALTERNATIVES_PARSER);

        try (InputStream stream = getStream("test-documents/testRFC822-limitedheaders")) {
            EXTRACT_ALL_ALTERNATIVES_PARSER.parse(stream, handler, metadata, context);
        }

        assertTrue(metadata.isMultiValued(TikaCoreProperties.CREATOR));
        assertEquals("xyz", metadata.getValues(TikaCoreProperties.CREATOR)[0]);
        assertEquals("abc", metadata.getValues(TikaCoreProperties.CREATOR)[1]);

        assertTrue(metadata.isMultiValued(Metadata.MESSAGE_FROM));
        assertEquals("xyz", metadata.getValues(Metadata.MESSAGE_FROM)[0]);
        assertEquals("abc", metadata.getValues(Metadata.MESSAGE_FROM)[1]);

        assertTrue(metadata.isMultiValued(Metadata.MESSAGE_TO));
        assertEquals("abc", metadata.getValues(Metadata.MESSAGE_TO)[0]);
        assertEquals("def", metadata.getValues(Metadata.MESSAGE_TO)[1]);

        assertEquals("abcd", metadata.get(TikaCoreProperties.TITLE));
        assertEquals("abcd", metadata.get(TikaCoreProperties.SUBJECT));
        assertContains("bar biz bat", handler.toString());
    }

    /**
     * TIKA-1222 When requested, ensure that the various attachments of
     * the mail come through properly as embedded resources
     */
    @Test
    public void testGetAttachmentsAsEmbeddedResources() throws Exception {
        TrackingHandler tracker = new TrackingHandler();
        ContainerExtractor ex = new ParserContainerExtractor(TIKA_CONFIG);
        try (TikaInputStream tis = TikaInputStream
                .get(getStream("test-documents/testRFC822-multipart"))) {
            assertTrue(ex.isSupported(tis));
            ex.extract(tis, ex, tracker);
        }

        // Check we found all 3 parts
        assertEquals(3, tracker.filenames.size());
        assertEquals(3, tracker.mediaTypes.size());

        // No filenames available
        assertNull(tracker.filenames.get(0));
        assertNull(tracker.filenames.get(1));
        // Except for this using Content-Disposition filename field
        assertEquals("logo.gif", tracker.filenames.get(2));

        // Types are available
        assertEquals(MediaType.TEXT_PLAIN, tracker.mediaTypes.get(0));
        assertEquals(MediaType.TEXT_HTML, tracker.mediaTypes.get(1));
        assertEquals(MediaType.image("gif"), tracker.mediaTypes.get(2));
    }

    /**
     * Both sample files must be reported as {@code message/rfc822}.
     */
    @Test
    public void testDetection() throws Exception {
        //test simple text file
        XMLResult r = getXML("testRFC822_date_utf8");
        assertEquals("message/rfc822", r.metadata.get(Metadata.CONTENT_TYPE));

        //test without extension
        r = getXML("testRFC822_eml");
        assertEquals("message/rfc822", r.metadata.get(Metadata.CONTENT_TYPE));
    }

    /**
     * Date headers in assorted formats must all land on the expected UTC timestamp.
     */
    @Test
    public void testDates() throws Exception {
        //tests non-standard dates that mime4j can't parse
        XMLResult r = getXML("testRFC822_date_utf8");
        assertEquals("2016-05-16T08:30:32Z", r.metadata.get(TikaCoreProperties.CREATED));

        r = getXML("testRFC822_eml");
        assertEquals("2016-05-16T08:30:32Z", r.metadata.get(TikaCoreProperties.CREATED));

        String expected = "2016-05-15T01:32:00Z";
        for (String dateString : new String[]{"Sun, 15 May 2016 01:32:00 UTC",
                //make sure this test basically works
                "Sun, 15 May 2016 01:32:00", //no timezone
                "Sunday, May 15 2016 1:32 AM", "May 15 2016 1:32am", "May 15 2016 1:32 am",
                "2016-05-15 01:32:00", " Sun, 15 May 2016 3:32:00 +0200",
                //format correctly handled by mime4j if no leading whitespace
                " Sun, 14 May 2016 20:32:00 EST",}) {
            testDate(dateString, expected);
        }

        //now try days without times
        expected = "2016-05-15T12:00:00Z";
        for (String dateString : new String[]{"May 15, 2016", "Sun, 15 May 2016", "15 May 2016",}) {
            testDate(dateString, expected);
        }
    }

    /**
     * Two-digit and ambiguous years must not be parsed to dates before 1980.
     * Date strings that do not parse at all are skipped rather than failed.
     */
    @Test
    public void testTrickyDates() throws Exception {
        DateFormat df = new SimpleDateFormat("yyyy-MM-dd", new DateFormatSymbols(Locale.US));
        //make sure there are no mis-parses of e.g. 90 = year 90 A.D, not 1990
        Date date1980 = df.parse("1980-01-01");
        for (String dateString : new String[]{"Mon, 29 Jan 96 14:02 GMT", "7/20/95 1:12pm",
                "08/14/2000 12:48 AM", "06/24/2008, Tuesday, 11 AM", "11/14/08", "12/02/1996",
                "96/12/02",}) {
            Date parsedDate = getDate(dateString);
            if (parsedDate != null) {
                assertTrue("date must be after 1980:" + dateString,
                        parsedDate.getTime() > date1980.getTime());
            }
        }
        //TODO: mime4j misparses these to pre 1980 dates
        //"Wed, 27 Dec 95 11:20:40 EST",
        //"26 Aug 00 11:14:52 EDT"
        //
        //We are still misparsing: 8/1/03 to a pre 1980 date
    }

    /**
     * Parses {@code dateString} as a Date header and compares the result, rendered
     * in UTC, against {@code expected}.
     *
     * @param dateString raw value to place in the Date header
     * @param expected   expected timestamp in {@code yyyy-MM-dd'T'HH:mm:ss'Z'} form
     */
    private void testDate(String dateString, String expected) throws Exception {
        Date parsedDate = getDate(dateString);
        assertNotNull("couldn't parse " + dateString, parsedDate);
        DateFormat df =
                new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", new DateFormatSymbols(Locale.US));
        //the pattern appends a literal 'Z', so format in UTC rather than the JVM default zone
        df.setTimeZone(TimeZone.getTimeZone("UTC"));
        String parsedDateString = df.format(parsedDate);
        assertEquals("failed to match: " + dateString, expected, parsedDateString);
    }

    /**
     * Builds a minimal two-header message around {@code dateString}, parses it with a
     * fresh {@link RFC822Parser} and returns the resulting created date.
     *
     * @param dateString raw value to place in the Date header
     * @return the value of {@link TikaCoreProperties#CREATED} as a date; callers
     *         check for {@code null}
     */
    private Date getDate(String dateString) throws Exception {
        String mail = "From: dev@tika.apache.org\n" + "Date: " + dateString + "\n";
        Parser p = new RFC822Parser();
        Metadata m = new Metadata();
        try (InputStream is = TikaInputStream.get(mail.getBytes(StandardCharsets.UTF_8))) {
            p.parse(is, new DefaultHandler(), m, new ParseContext());
        }
        return m.getDate(TikaCoreProperties.CREATED);
    }

    /**
     * With two Subject headers, the title is expected to be the second one.
     */
    @Test
    public void testMultipleSubjects() throws Exception {
        //adapted from govdocs1 303710.txt
        String s = "From: Shawn Jones [chiroshawn@yahoo.com]\n" + "Subject: 2006N-3502\n" +
                "Subject: I Urge You to Require Notice of Mercury";
        Parser p = new RFC822Parser();
        Metadata m = new Metadata();
        try (InputStream is = TikaInputStream.get(s.getBytes(StandardCharsets.UTF_8))) {
            p.parse(is, new DefaultHandler(), m, new ParseContext());
        }
        assertEquals("I Urge You to Require Notice of Mercury", m.get(TikaCoreProperties.TITLE));
    }

    /**
     * Checks per-part metadata for a message with a PNG attachment.
     */
    @Test
    public void testExtractAttachments() throws Exception {
        List<Metadata> metadataList =
                getRecursiveMetadata("testEmailWithPNGAtt.eml", EXTRACT_ALL_ALTERNATIVES_PARSER);
        // Check the count first so a short list fails cleanly instead of on get(3)
        assertEquals(4, metadataList.size());

        // Check we get the metadata
        assertEquals("Tika Test <XXXX@apache.org>", metadataList.get(3).get(Metadata.MESSAGE_FROM));
        assertEquals("Test Attachment Email", metadataList.get(3).get(TikaCoreProperties.TITLE));

        // Check attachments
        assertEquals("text/plain; charset=UTF-8", metadataList.get(1).get(Metadata.CONTENT_TYPE));
        assertEquals("image/png", metadataList.get(2).get(Metadata.CONTENT_TYPE));
        assertEquals("testPNG.png", metadataList.get(2).get(TikaCoreProperties.RESOURCE_NAME_KEY));
        assertContains("This email has a PNG attachment included in it",
                metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT));
        assertNull(metadataList.get(1).get(Metadata.CONTENT_DISPOSITION));
        assertEquals("attachment; filename=\"testPNG.png\"",
                metadataList.get(2).get(Metadata.CONTENT_DISPOSITION));
    }

    /**
     * Checks content type, encoding and disposition of each embedded part of the
     * multipart sample.
     */
    @Test
    public void testEmbeddedMetadata() throws Exception {
        List<Metadata> seenMetadata =
                getRecursiveMetadata("testRFC822-multipart", EXTRACT_ALL_ALTERNATIVES_PARSER);
        assertEquals(4, seenMetadata.size());

        assertNull(seenMetadata.get(1).get(Metadata.CONTENT_DISPOSITION));
        assertEquals("text/plain; charset=UTF-8", seenMetadata.get(1).get(Metadata.CONTENT_TYPE));
        assertEquals("UTF-8", seenMetadata.get(1).get(Metadata.CONTENT_ENCODING));

        assertNull(seenMetadata.get(2).get(Metadata.CONTENT_DISPOSITION));
        assertEquals("text/html; charset=UTF-8", seenMetadata.get(2).get(Metadata.CONTENT_TYPE));
        assertEquals("UTF-8", seenMetadata.get(2).get(Metadata.CONTENT_ENCODING));

        assertEquals("attachment; filename=\"logo.gif\"",
                seenMetadata.get(3).get(Metadata.CONTENT_DISPOSITION));
        assertEquals("logo.gif", seenMetadata.get(3).get(TikaCoreProperties.RESOURCE_NAME_KEY));
        assertEquals("image/gif", seenMetadata.get(3).get(Metadata.CONTENT_TYPE));
    }

    /**
     * Checks the multipart subtype and boundary recorded on the root and on each part.
     */
    @Test
    public void testMultipartFlags() throws Exception {
        List<Metadata> metadataList =
                getRecursiveMetadata("testRFC822-multipart", EXTRACT_ALL_ALTERNATIVES_PARSER);

        // Check the root metadata.
        Metadata root = metadataList.get(0);
        assertEquals("mixed", root.get(Message.MULTIPART_SUBTYPE));
        assertEquals("0016e64606800312ee04913db790", root.get(Message.MULTIPART_BOUNDARY));

        // Check the metadata of the first alternative.
        Metadata firstAlternative = metadataList.get(1);
        assertEquals("text/plain; charset=UTF-8", firstAlternative.get(Metadata.CONTENT_TYPE));
        assertEquals("alternative", firstAlternative.get(Message.MULTIPART_SUBTYPE));
        assertEquals("0016e64606800312ea04913db78e",
                firstAlternative.get(Message.MULTIPART_BOUNDARY));

        // Check the metadata of the second alternative.
        Metadata secondAlternative = metadataList.get(2);
        assertEquals("text/html; charset=UTF-8", secondAlternative.get(Metadata.CONTENT_TYPE));
        assertEquals("alternative", secondAlternative.get(Message.MULTIPART_SUBTYPE));
        assertEquals("0016e64606800312ea04913db78e",
                secondAlternative.get(Message.MULTIPART_BOUNDARY));

        // Check the metadata of the attached GIF.
        Metadata gif = metadataList.get(3);
        assertEquals("image/gif", gif.get(Metadata.CONTENT_TYPE));
        assertEquals("mixed", gif.get(Message.MULTIPART_SUBTYPE));
        assertEquals("0016e64606800312ee04913db790", gif.get(Message.MULTIPART_BOUNDARY));
    }

    /**
     * With the default recursive-metadata helper, only "body 2" appears in the
     * main body and the GIF is the single embedded document.
     */
    @Test
    public void testBasicAlternativeBodyHandling() throws Exception {
        /*
        multi-part/mixed
            multi-part/alternative
                text
                html
            gif
         */
        List<Metadata> metadataList = getRecursiveMetadata("testRFC822-multipart");
        assertEquals(2, metadataList.size());

        String body = metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT);
        assertContains("body 2", body);
        assertNotContained("body 1", body);
        assertEquals("message/rfc822", metadataList.get(0).get(Metadata.CONTENT_TYPE));

        assertEquals("image/gif", metadataList.get(1).get(Metadata.CONTENT_TYPE));
        assertEquals("/logo.gif",
                metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
    }

    /**
     * Inline and attached JPEGs in a mixed/related/alternative message must be
     * reported with the right path and embedded resource type.
     */
    @Test
    public void testMixedRelatedMultipart() throws Exception {
        /*
        multipart/mixed (..6)
            multipart/related (..5)
                multipart/alternative (..4)
                    text/plain
                    text/html
                image/jpeg (inline) Mary with cooler.jpeg (..5)
            image/jpeg (attachment) mary-coffee.jpg (..6)
         */
        List<Metadata> metadataList = getRecursiveMetadata("testRFC822-mixed-simple");
        assertEquals(3, metadataList.size());

        Metadata main = metadataList.get(0);
        assertContains("body 2", main.get(TikaCoreProperties.TIKA_CONTENT));
        assertNotContained("body 1", main.get(TikaCoreProperties.TIKA_CONTENT));
        assertEquals("message/rfc822", main.get(Metadata.CONTENT_TYPE));

        Metadata inlineImage = metadataList.get(1);
        assertEquals("image/jpeg", inlineImage.get(Metadata.CONTENT_TYPE));
        assertEquals("/Mary with cooler.jpeg",
                inlineImage.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
        assertEquals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString(),
                inlineImage.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));

        Metadata attachedImage = metadataList.get(2);
        assertEquals("image/jpeg", attachedImage.get(Metadata.CONTENT_TYPE));
        assertEquals("/mary-coffee.jpg",
                attachedImage.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
        assertEquals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString(),
                attachedImage.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
    }

    /**
     * Both html body chunks around an inline PDF must end up in the main body text.
     */
    @Test
    public void testAlternativeWithComplexMixedChild() throws Exception {
        /*
        This tests that both html body chunks are stitched back
        together in the body text for the main email.
        multi-part/alternative
            text
            multipart/mixed
                html body chunk 1
                pdf
                html body chunk 2
         */
        List<Metadata> metadataList = getRecursiveMetadata("testRFC822-mixed-with-pdf-inline");
        assertEquals(2, metadataList.size());

        String body = metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT);
        assertContains("body 2", body);
        assertContains("body 3", body);
        assertNotContained("body 1", body);
        assertEquals("message/rfc822", metadataList.get(0).get(Metadata.CONTENT_TYPE));

        assertEquals("application/pdf", metadataList.get(1).get(Metadata.CONTENT_TYPE));
        assertEquals("/tzora-titan-4-hummer-xl-manual.pdf",
                metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
    }

    /**
     * A simple message yields a single metadata entry whose content holds the body text.
     */
    @Test
    public void testSimpleBodyInlined() throws Exception {
        List<Metadata> metadataList = getRecursiveMetadata("testRFC822_simple_inline_body.txt");
        assertEquals(1, metadataList.size());
        assertContains("asked", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
    }

    /**
     * The GroupWise sample currently yields a single metadata entry.
     */
    @Test
    public void testGroupwise() throws Exception {
        //TODO -- this should treat attachments as attachments, no?
        List<Metadata> metadataList = getRecursiveMetadata("testGroupWiseEml.eml");
        assertEquals(1, metadataList.size());
        assertContains("ssssss", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
    }
}