blob: 144f9813d3208ae415625068c59b21e624d03bd4 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
import java.io.InputStream;
import java.io.StringWriter;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Message;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;
/**
* Test case for parsing Outlook files.
*/
public class OutlookParserTest extends TikaTest {
/**
 * Parses {@code test-outlook.msg} through the auto-detecting parser and checks
 * the reported content type, the sender/recipient/date metadata and the
 * extracted body text.
 */
@Test
public void testOutlookParsing() throws Exception {
    Metadata meta = new Metadata();
    ContentHandler bodyHandler = new BodyContentHandler();
    try (InputStream in = getResourceAsStream("/test-documents/test-outlook.msg")) {
        AUTO_DETECT_PARSER.parse(in, bodyHandler, meta, new ParseContext());
    }

    // Values that are expected both in the metadata and in the body text.
    final String subject = "Microsoft Outlook Express 6";
    final String recipient = "Nouvel utilisateur de Outlook Express";
    final String sender = "L'\u00C9quipe Microsoft Outlook Express";

    assertEquals("application/vnd.ms-outlook", meta.get(Metadata.CONTENT_TYPE));
    assertEquals(subject, meta.get(TikaCoreProperties.TITLE));
    assertEquals(recipient, meta.get(Metadata.MESSAGE_RECIPIENT_ADDRESS));
    assertEquals(sender, meta.get(TikaCoreProperties.CREATOR));

    // The "raw" From header must be correctly decoded.
    assertEquals(sender + " <msoe@microsoft.com>",
            meta.get(Metadata.MESSAGE_RAW_HEADER_PREFIX + "From"));

    // Recipient fields: e-mail and display name carry the same text, name is empty.
    assertEquals(recipient, meta.get(Message.MESSAGE_TO_EMAIL));
    assertEquals("", meta.get(Message.MESSAGE_TO_NAME));
    assertEquals(recipient, meta.get(Message.MESSAGE_TO_DISPLAY_NAME));

    // Stored as Thu, 5 Apr 2007 09:26:06 -0700
    assertEquals("2007-04-05T16:26:06Z", meta.get(TikaCoreProperties.CREATED));

    String text = bodyHandler.toString();
    String[] expectedSnippets = {
        subject, sender, recipient, "Messagerie et groupes de discussion"
    };
    for (String snippet : expectedSnippets) {
        assertContains(snippet, text);
    }
}
/**
 * Test case for TIKA-197.
 * <p>
 * Parses {@code testMSG.msg} and checks that the text "From" occurs exactly
 * once in the extracted body, that the raw headers (including a folded,
 * multi-line one) are exposed as metadata, and that the sender/recipient
 * metadata is populated.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-197">TIKA-197</a>
 */
@Test
public void testMultipleCopies() throws Exception {
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    try (InputStream stream = getResourceAsStream("/test-documents/testMSG.msg")) {
        AUTO_DETECT_PARSER.parse(stream, handler, metadata, new ParseContext());
    }
    assertEquals("application/vnd.ms-outlook", metadata.get(Metadata.CONTENT_TYPE));

    // "From" has to show up once and only once: the first find() must hit,
    // the second one must not.
    String content = handler.toString();
    Matcher matcher = Pattern.compile("From").matcher(content);
    assertTrue("\"From\" is missing from the extracted text", matcher.find());
    assertFalse("\"From\" occurs more than once in the extracted text", matcher.find());

    // test that last header is added
    // (keys are built from the shared prefix constant, as in testOutlookParsing)
    assertContains("29 Jan 2009 19:17:10.0163 (UTC) FILETIME=[2ED25E30:01C98246]",
            Arrays.asList(metadata.getValues(
                    Metadata.MESSAGE_RAW_HEADER_PREFIX + "X-OriginalArrivalTime")));
    // confirm next line is added correctly
    assertContains("from athena.apache.org (HELO athena.apache.org) (140.211.11.136)\n" +
                    "    by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 29 Jan 2009 11:17:08 -0800",
            Arrays.asList(metadata.getValues(
                    Metadata.MESSAGE_RAW_HEADER_PREFIX + "Received")));

    assertEquals("EX", metadata.get(Office.MAPI_SENT_BY_SERVER_TYPE));
    assertEquals("NOTE", metadata.get(Office.MAPI_MESSAGE_CLASS));

    assertEquals("Jukka Zitting", metadata.get(Message.MESSAGE_FROM_NAME));
    assertEquals("jukka.zitting@gmail.com", metadata.get(Message.MESSAGE_FROM_EMAIL));
    assertEquals("Jukka Zitting", metadata.get(Office.MAPI_FROM_REPRESENTING_NAME));
    assertEquals("jukka.zitting@gmail.com", metadata.get(Office.MAPI_FROM_REPRESENTING_EMAIL));

    assertEquals("tika-dev@lucene.apache.org", metadata.get(Message.MESSAGE_TO_EMAIL));
    assertEquals("tika-dev@lucene.apache.org", metadata.get(Message.MESSAGE_TO_DISPLAY_NAME));
    // to-name is empty, make sure that we get an empty string.
    assertEquals("", metadata.get(Message.MESSAGE_TO_NAME));
}
/**
 * Test case for TIKA-395: makes sure the parser copes with the newer
 * Outlook message format by parsing {@code test-outlook2003.msg}.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-395">TIKA-395</a>
 */
@Test
public void testOutlookNew() throws Exception {
    Metadata meta = new Metadata();
    ContentHandler bodyHandler = new BodyContentHandler();
    try (InputStream in = getResourceAsStream("/test-documents/test-outlook2003.msg")) {
        AUTO_DETECT_PARSER.parse(in, bodyHandler, meta, new ParseContext());
    }

    assertEquals("application/vnd.ms-outlook", meta.get(Metadata.CONTENT_TYPE));
    assertEquals("Welcome to Microsoft Office Outlook 2003",
            meta.get(TikaCoreProperties.TITLE));

    // Spot-check a few phrases from the message body.
    String text = bodyHandler.toString();
    String[] expectedPhrases = {
        "Outlook 2003", "Streamlined Mail Experience", "Navigation Pane"
    };
    for (String phrase : expectedPhrases) {
        assertContains(phrase, text);
    }

    // make sure these are parallel
    final String recipientName = "New Outlook User";
    assertEquals("", meta.get(Message.MESSAGE_TO_EMAIL));
    assertEquals(recipientName, meta.get(Message.MESSAGE_TO_NAME));
    assertEquals(recipientName, meta.get(Message.MESSAGE_TO_DISPLAY_NAME));
}
/**
 * Parses {@code testMSG_chinese.msg} into serialized XML and checks that the
 * output contains the expected recipient, paragraph and list markup, that
 * there is a single body element, and that the Chinese text survives in both
 * the content and the metadata.
 */
@Test
public void testOutlookHTMLVersion() throws Exception {
    Metadata metadata = new Metadata();

    // Serialize the SAX events produced by the parser as indented XML
    StringWriter sw = new StringWriter();
    SAXTransformerFactory factory = (SAXTransformerFactory)
            SAXTransformerFactory.newInstance();
    TransformerHandler handler = factory.newTransformerHandler();
    handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
    handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
    handler.setResult(new StreamResult(sw));

    try (InputStream stream = getResourceAsStream("/test-documents/testMSG_chinese.msg")) {
        AUTO_DETECT_PARSER.parse(stream, handler, metadata, new ParseContext());
    }

    // As the HTML version should have been processed, ensure
    // we got the recipient entry, the paragraph and the list items
    String content = sw.toString();
    assertContains("<dd>tests.chang@fengttt.com</dd>", content);
    assertContains("<p>Alfresco MSG format testing", content);
    assertContains("<li>1", content);
    assertContains("<li>2", content);

    // Make sure we don't have nested html docs
    // (splitting on a tag that occurs exactly once yields two pieces)
    assertEquals(2, content.split("<body>").length);
    assertEquals(2, content.split("<\\/body>").length);

    // Make sure that the Chinese actually came through
    assertContains("\u5F35\u6BD3\u502B", metadata.get(TikaCoreProperties.CREATOR));
    assertContains("\u9673\u60E0\u73CD", content);

    assertEquals("tests.chang@fengttt.com", metadata.get(Message.MESSAGE_TO_EMAIL));

    // Non-ASCII characters are written as Unicode escapes, like the CREATOR
    // check above, so the expected value does not depend on the source
    // encoding the compiler is run with.
    assertEquals("Tests Chang@FT (\u5F35\u6BD3\u502B)",
            metadata.get(Office.MAPI_FROM_REPRESENTING_NAME));
    assertEquals("/O=FT GROUP/OU=FT/CN=RECIPIENTS/CN=LYDIACHANG",
            metadata.get(Office.MAPI_FROM_REPRESENTING_EMAIL));
}
/**
 * Parses {@code testMSG_forwarded.msg} into serialized XML and verifies that
 * the output holds exactly one opening and one closing body tag.
 */
@Test
public void testOutlookForwarded() throws Exception {
    // Sink that turns the parser's SAX events into an indented XML string.
    StringWriter xmlOut = new StringWriter();
    TransformerHandler xmlHandler =
            ((SAXTransformerFactory) SAXTransformerFactory.newInstance()).newTransformerHandler();
    xmlHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
    xmlHandler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
    xmlHandler.setResult(new StreamResult(xmlOut));

    Metadata meta = new Metadata();
    try (InputStream in = getResourceAsStream("/test-documents/testMSG_forwarded.msg")) {
        AUTO_DETECT_PARSER.parse(in, xmlHandler, meta, new ParseContext());
    }

    // Make sure we don't have nested docs: one occurrence of each tag
    // splits the output into exactly two pieces.
    String xml = xmlOut.toString();
    int piecesAroundOpenTag = xml.split("<body>").length;
    int piecesAroundCloseTag = xml.split("<\\/body>").length;
    assertEquals(2, piecesAroundOpenTag);
    assertEquals(2, piecesAroundCloseTag);
}
/**
 * Parses {@code test-outlook2003.msg} into serialized XML, normalizes the
 * whitespace, and checks the recipient entry, inline formatting, links and
 * the absence of nested body elements.
 */
@Test
public void testOutlookHTMLfromRTF() throws Exception {
    // Sink that turns the parser's SAX events into an indented XML string.
    StringWriter xmlOut = new StringWriter();
    TransformerHandler xmlHandler =
            ((SAXTransformerFactory) SAXTransformerFactory.newInstance()).newTransformerHandler();
    xmlHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
    xmlHandler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
    xmlHandler.setResult(new StreamResult(xmlOut));

    Metadata meta = new Metadata();
    try (InputStream in = getResourceAsStream("/test-documents/test-outlook2003.msg")) {
        AUTO_DETECT_PARSER.parse(in, xmlHandler, meta, new ParseContext());
    }

    // Collapse line breaks/tabs and runs of spaces so that the checks below
    // do not depend on how the serializer wrapped or indented the output.
    String raw = xmlOut.toString();
    String unwrapped = raw.replaceAll("[\\r\\n\\t]+", " ");
    String xml = unwrapped.replaceAll(" +", " ");

    // As the HTML version should have been processed, ensure
    // we got some of the links
    assertContains("<dd>New Outlook User</dd>", xml);
    assertContains("designed <i>to help you", xml);
    assertContains("<p> <a href=\"http://r.office.microsoft.com/r/rlidOutlookWelcomeMail10?clid=1033\">Cached Exchange Mode</a>", xml);

    // Link - check text around it, and the link itself
    String[] newsletterSnippets = {
        "sign up for a free subscription",
        "Office Newsletter",
        "newsletter will be sent to you",
        "http://r.office.microsoft.com/r/rlidNewsletterSignUp?clid=1033"
    };
    for (String snippet : newsletterSnippets) {
        assertContains(snippet, xml);
    }

    // Make sure we don't have nested html docs
    int piecesAroundOpenTag = xml.split("<body>").length;
    int piecesAroundCloseTag = xml.split("<\\/body>").length;
    assertEquals(2, piecesAroundOpenTag);
    assertEquals(2, piecesAroundCloseTag);
}
/**
 * Checks the reported MAPI message class for one sample file per class name,
 * plus the plain "NOTE" class of {@code test-outlook2003.msg}.
 */
@Test
public void testMAPIMessageClasses() throws Exception {
    // Each name doubles as the file-name suffix and the expected class.
    String[] classNames = {"Appointment", "Contact", "Post", "StickyNote", "Task"};
    for (int i = 0; i < classNames.length; i++) {
        String className = classNames[i];
        Metadata parsed = getXML("testMSG_" + className + ".msg").metadata;
        testMsgClass(className, parsed);
    }

    Metadata noteMetadata = getXML("test-outlook2003.msg").metadata;
    testMsgClass("NOTE", noteMetadata);
}
/**
 * Asserts that the MAPI message class stored in {@code metadata} matches
 * {@code expected}, ignoring case and any underscores in the stored value.
 * <p>
 * A missing value is reported as an ordinary assertion failure carrying the
 * usual "expected, but got" message rather than as a NullPointerException.
 *
 * @param expected the expected message class, without underscores
 * @param metadata the metadata to read {@code Office.MAPI_MESSAGE_CLASS} from
 */
private void testMsgClass(String expected, Metadata metadata) {
    // Look the value up once and reuse it for both the message and the check.
    String actual = metadata.get(Office.MAPI_MESSAGE_CLASS);
    assertTrue(expected + ", but got: " + actual,
            actual != null && expected.equalsIgnoreCase(actual.replace("_", "")));
}
/**
 * Compares the recursive-metadata output for {@code testMSG.msg} under the
 * default parser and under the parser configured from
 * {@code tika-config-extract-all-alternatives-msg.xml}.
 * <p>
 * By default a single entry holding the body text is expected. With the
 * custom configuration three entries are expected: the message itself
 * without the body text, followed by an RTF and a plain-text entry that
 * each contain it.
 */
@Test
public void testHandlingAllAlternativesBodies() throws Exception {
    //test that default only has one body
    List<Metadata> metadataList = getRecursiveMetadata("testMSG.msg");
    assertEquals(1, metadataList.size());
    assertContains("breaking your application",
            metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
    assertEquals("application/vnd.ms-outlook",
            metadataList.get(0).get(Metadata.CONTENT_TYPE));

    //now try extracting all bodies
    //they should each appear as standalone attachments
    //with no content in the body of the msg level
    try (InputStream is = getResourceAsStream("tika-config-extract-all-alternatives-msg.xml")) {
        TikaConfig tikaConfig = new TikaConfig(is);
        Parser p = new AutoDetectParser(tikaConfig);
        metadataList = getRecursiveMetadata("testMSG.msg", p);
        assertEquals(3, metadataList.size());

        // entry 0: the message itself, without the body text
        assertNotContained("breaking your application",
                metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
        assertEquals("application/vnd.ms-outlook",
                metadataList.get(0).get(Metadata.CONTENT_TYPE));

        // entry 1: the RTF body
        assertContains("breaking your application",
                metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT));
        assertEquals("application/rtf",
                metadataList.get(1).get(Metadata.CONTENT_TYPE));

        // entry 2: the plain-text body; only the prefix of the content type
        // is compared. A missing or unexpected type fails with the actual
        // value in the message instead of a NullPointerException.
        assertContains("breaking your application",
                metadataList.get(2).get(TikaCoreProperties.TIKA_CONTENT));
        String plainTextType = metadataList.get(2).get(Metadata.CONTENT_TYPE);
        assertTrue("expected a text/plain content type, but got: " + plainTextType,
                plainTextType != null && plainTextType.startsWith("text/plain"));
    }
}
}