| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.tika.parser.mbox; |
| |
| import static org.junit.Assert.assertEquals; |
| |
| import java.io.InputStream; |
| import java.util.List; |
| import java.util.Map; |
| |
| import org.junit.Before; |
| import org.junit.Test; |
| import org.xml.sax.ContentHandler; |
| |
| import org.apache.tika.TikaTest; |
| import org.apache.tika.detect.TypeDetector; |
| import org.apache.tika.metadata.Metadata; |
| import org.apache.tika.metadata.TikaCoreProperties; |
| import org.apache.tika.parser.AutoDetectParser; |
| import org.apache.tika.parser.ParseContext; |
| import org.apache.tika.parser.Parser; |
| import org.apache.tika.sax.BodyContentHandler; |
| |
| public class MboxParserTest extends TikaTest { |
| |
| protected ParseContext recursingContext; |
| private Parser autoDetectParser; |
| private TypeDetector typeDetector; |
| private MboxParser mboxParser; |
| |
| @Before |
| public void setUp() throws Exception { |
| typeDetector = new TypeDetector(); |
| autoDetectParser = new AutoDetectParser(typeDetector); |
| recursingContext = new ParseContext(); |
| recursingContext.set(Parser.class, autoDetectParser); |
| |
| mboxParser = new MboxParser(); |
| mboxParser.setTracking(true); |
| } |
| |
| @Test |
| public void testSimple() throws Exception { |
| ContentHandler handler = new BodyContentHandler(); |
| Metadata metadata = new Metadata(); |
| |
| try (InputStream stream = getResourceAsStream("/test-documents/simple.mbox")) { |
| mboxParser.parse(stream, handler, metadata, recursingContext); |
| } |
| |
| String content = handler.toString(); |
| assertContains("Test content 1", content); |
| assertContains("Test content 2", content); |
| assertEquals("application/mbox", metadata.get(Metadata.CONTENT_TYPE)); |
| |
| Map<Integer, Metadata> mailsMetadata = mboxParser.getTrackingMetadata(); |
| assertEquals("Nb. Of mails", 2, mailsMetadata.size()); |
| |
| Metadata mail1 = mailsMetadata.get(0); |
| assertEquals("message/rfc822", mail1.get(Metadata.CONTENT_TYPE)); |
| assertEquals("envelope-sender-mailbox-name Mon Jun 01 10:00:00 2009", |
| mail1.get("MboxParser-from")); |
| |
| Metadata mail2 = mailsMetadata.get(1); |
| assertEquals("message/rfc822", mail2.get(Metadata.CONTENT_TYPE)); |
| assertEquals("envelope-sender-mailbox-name Mon Jun 01 11:00:00 2010", |
| mail2.get("MboxParser-from")); |
| } |
| |
| @Test |
| public void testHeaders() throws Exception { |
| ContentHandler handler = new BodyContentHandler(); |
| Metadata metadata = new Metadata(); |
| |
| try (InputStream stream = getResourceAsStream("/test-documents/headers.mbox")) { |
| mboxParser.parse(stream, handler, metadata, recursingContext); |
| } |
| |
| assertContains("Test content", handler.toString()); |
| assertEquals("Nb. Of mails", 1, mboxParser.getTrackingMetadata().size()); |
| |
| Metadata mailMetadata = mboxParser.getTrackingMetadata().get(0); |
| |
| assertEquals("2009-06-10T03:58:45Z", mailMetadata.get(TikaCoreProperties.CREATED)); |
| assertEquals("<author@domain.com>", mailMetadata.get(TikaCoreProperties.CREATOR)); |
| assertEquals("subject", mailMetadata.get(TikaCoreProperties.SUBJECT)); |
| assertEquals("message/rfc822", mailMetadata.get(Metadata.CONTENT_TYPE)); |
| assertEquals("author@domain.com", mailMetadata.get("Message-From")); |
| assertEquals("<name@domain.com>", mailMetadata.get("MboxParser-return-path")); |
| } |
| |
| @Test |
| public void testMultilineHeader() throws Exception { |
| ContentHandler handler = new BodyContentHandler(); |
| Metadata metadata = new Metadata(); |
| |
| try (InputStream stream = getResourceAsStream("/test-documents/multiline.mbox")) { |
| mboxParser.parse(stream, handler, metadata, recursingContext); |
| } |
| |
| assertEquals("Nb. Of mails", 1, mboxParser.getTrackingMetadata().size()); |
| |
| Metadata mailMetadata = mboxParser.getTrackingMetadata().get(0); |
| assertEquals("from xxx by xxx with xxx; date", mailMetadata.get("MboxParser-received")); |
| } |
| |
| @Test |
| public void testQuoted() throws Exception { |
| ContentHandler handler = new BodyContentHandler(); |
| Metadata metadata = new Metadata(); |
| |
| try (InputStream stream = getResourceAsStream("/test-documents/quoted.mbox")) { |
| mboxParser.parse(stream, handler, metadata, recursingContext); |
| } |
| |
| assertContains("Test content", handler.toString()); |
| assertContains("> quoted stuff", handler.toString()); |
| } |
| |
| @Test |
| public void testComplex() throws Exception { |
| ContentHandler handler = new BodyContentHandler(); |
| Metadata metadata = new Metadata(); |
| |
| try (InputStream stream = getResourceAsStream("/test-documents/complex.mbox")) { |
| mboxParser.parse(stream, handler, metadata, recursingContext); |
| } |
| |
| assertEquals("Nb. Of mails", 3, mboxParser.getTrackingMetadata().size()); |
| |
| Metadata firstMail = mboxParser.getTrackingMetadata().get(0); |
| assertEquals("Re: question about when shuffle/sort start working", |
| firstMail.get(TikaCoreProperties.SUBJECT)); |
| assertEquals("Re: question about when shuffle/sort start working", |
| firstMail.get(TikaCoreProperties.TITLE)); |
| assertEquals("Jothi Padmanabhan <jothipn@yahoo-inc.com>", |
| firstMail.get(TikaCoreProperties.CREATOR)); |
| assertEquals("core-user@hadoop.apache.org", |
| firstMail.get(Metadata.MESSAGE_RECIPIENT_ADDRESS)); |
| |
| assertContains("When a Mapper completes", handler.toString()); |
| } |
| |
| @Test |
| public void testTika2478() throws Exception { |
| List<Metadata> metadataList = getRecursiveMetadata("testMBOX_complex.mbox"); |
| assertEquals(2, metadataList.size()); |
| assertEquals("application/mbox", metadataList.get(0).get(Metadata.CONTENT_TYPE)); |
| assertEquals("message/rfc822", metadataList.get(1).get(Metadata.CONTENT_TYPE)); |
| assertContains("body 2", metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT)); |
| assertNotContained("body 1", metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT)); |
| } |
| } |