TIKA-4249 -- allow utf8 bom at start of rfc822 detection
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 09bbd96..94318a0 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -7035,6 +7035,7 @@
<magic priority="45">
<!-- be a bit more flexible, but require one from each of these -->
<match minShouldMatch="2">
+
<match minShouldMatch="1">
<match value="Content-ID:" type="stringignorecase" offset="0"/>
<match value="Content-Location:" type="stringignorecase" offset="0"/>
@@ -7055,6 +7056,27 @@
<match value="User-Agent:" type="string" offset="0"/>
<match value="X-Mailer:" type="string" offset="0"/>
<match value="X-Originating-IP:" type="stringignorecase" offset="0"/>
+ <match value="0xefbbbf" type="string" offset="0"> <!-- UTF-8 BOM (3 bytes: EF BB BF) at the start; the header names below are therefore matched at offset 3 -->
+ <match value="Content-ID:" type="stringignorecase" offset="3"/>
+ <match value="Content-Location:" type="stringignorecase" offset="3"/>
+ <match value="Content-Transfer-Encoding:" type="stringignorecase" offset="3"/>
+ <match value="Content-Type:" type="stringignorecase" offset="3"/>
+ <match value="Date:" type="stringignorecase" offset="3"/>
+ <match value="Delivered-To:" type="string" offset="3"/>
+ <match value="From:" type="stringignorecase" offset="3"/>
+ <match value="Message-ID:" type="stringignorecase" offset="3"/>
+ <match value="MIME-Version:" type="stringignorecase" offset="3"/>
+ <match value="Received:" type="stringignorecase" offset="3"/>
+ <match value="Relay-Version:" type="stringignorecase" offset="3"/>
+ <match value="Return-Path:" type="stringignorecase" offset="3"/>
+ <match value="Sent:" type="string" offset="3"/>
+ <match value="Status:" type="string" offset="3"/>
+ <match value="Subject:" type="string" offset="3"/>
+ <match value="To:" type="string" offset="3"/>
+ <match value="User-Agent:" type="string" offset="3"/>
+ <match value="X-Mailer:" type="string" offset="3"/>
+ <match value="X-Originating-IP:" type="stringignorecase" offset="3"/>
+ </match>
</match>
<match minShouldMatch="1">
<match value="\nContent-ID:" type="stringignorecase" offset="0:1024"/>
diff --git a/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java b/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
index 434ff6c..84820ac 100644
--- a/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
+++ b/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
@@ -28,6 +28,8 @@
import java.io.InputStream;
import java.net.URL;
+import org.apache.commons.io.ByteOrderMark;
+import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
@@ -110,6 +112,27 @@
}
@Test
+ public void testRFC822WithBOM() throws Exception {
+     // An rfc822 header block must be detected the same way with or without a leading UTF-8 BOM.
+     byte[] header = ("From: blah <blah@blah.com>\r\n" + "Received: Friday, January 24, 2020 3:24 PM\r\n" +
+             "To: someone@somewhere.com\r\n" + "Cc: someone-else@other.com\r\n" +
+             "Subject: Received\r\n").getBytes(UTF_8);
+     MediaType rfc822 = MediaType.parse("message/rfc822");
+
+     // Baseline: the plain header bytes, no BOM.
+     assertEquals(rfc822, MIME_TYPES.detect(
+             UnsynchronizedByteArrayInputStream.builder().setByteArray(header).get(), new Metadata()));
+
+     // Same bytes with the BOM prepended; offsets come from the BOM length, not a literal 3.
+     byte[] bom = ByteOrderMark.UTF_8.getBytes();
+     byte[] bytes = new byte[bom.length + header.length];
+     System.arraycopy(bom, 0, bytes, 0, bom.length);
+     System.arraycopy(header, 0, bytes, bom.length, header.length);
+     assertEquals(rfc822, MIME_TYPES.detect(
+             UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get(), new Metadata()));
+ }
+
+ @Test
public void testSuperTypes() {
assertTrue(REGISTRY.isSpecializationOf(MediaType.parse("text/something; charset=UTF-8"),
MediaType.parse("text/something")));