MIME4J-218: Content-Type fallback character set
contributed by Wolfgang Fahl <wf at bitplan.com>
git-svn-id: https://svn.apache.org/repos/asf/james/mime4j/trunk@1635743 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/core/src/main/java/org/apache/james/mime4j/stream/DefaultFieldBuilder.java b/core/src/main/java/org/apache/james/mime4j/stream/DefaultFieldBuilder.java
index 8b335f0..6c65613 100644
--- a/core/src/main/java/org/apache/james/mime4j/stream/DefaultFieldBuilder.java
+++ b/core/src/main/java/org/apache/james/mime4j/stream/DefaultFieldBuilder.java
@@ -60,7 +60,7 @@
}
int len = line.length();
if (this.maxlen > 0 && this.buf.length() + len >= this.maxlen) {
- throw new MaxHeaderLengthLimitException("Maximum header length limit exceeded");
+ throw new MaxHeaderLengthLimitException("Maximum header length limit (" + this.maxlen + ") exceeded");
}
this.buf.append(line.buffer(), 0, line.length());
}
diff --git a/dom/src/main/java/org/apache/james/mime4j/message/BasicBodyFactory.java b/dom/src/main/java/org/apache/james/mime4j/message/BasicBodyFactory.java
index 0f406ec..6d24141 100644
--- a/dom/src/main/java/org/apache/james/mime4j/message/BasicBodyFactory.java
+++ b/dom/src/main/java/org/apache/james/mime4j/message/BasicBodyFactory.java
@@ -26,6 +26,7 @@
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import org.apache.james.mime4j.Charsets;
@@ -42,12 +43,44 @@
public static final BasicBodyFactory INSTANCE = new BasicBodyFactory();
- private static Charset resolveCharset(final String mimeCharset) throws UnsupportedEncodingException {
- try {
- return mimeCharset != null ? Charset.forName(mimeCharset) : null;
- } catch (UnsupportedCharsetException ex) {
- throw new UnsupportedEncodingException(mimeCharset);
+ private final boolean lenient;
+
+ public BasicBodyFactory() {
+ this(true);
+ }
+
+ public BasicBodyFactory(final boolean lenient) {
+ this.lenient = lenient;
+ }
+
+ /**
+ * select the Charset for the given mimeCharset string
+ *
+ * if you need support for non-standard or invalid mimeCharset specifications
+ * you might want to create your own derived BodyFactory extending BasicBodyFactory and
+ * override this method, as suggested by:
+ * https://issues.apache.org/jira/browse/MIME4J-218
+ *
+ * the default behavior is lenient: an unsupported or illegal mimeCharset (like a null one) returns Charset.defaultCharset()
+ *
+ * @param mimeCharset - the string specification for a Charset e.g. "UTF-8"
+ * @throws UnsupportedEncodingException if the mimeCharset is invalid and this factory is not lenient
+ */
+ protected Charset resolveCharset(final String mimeCharset) throws UnsupportedEncodingException {
+ if (mimeCharset != null) {
+ try {
+ return Charset.forName(mimeCharset);
+ } catch (UnsupportedCharsetException ex) {
+ if (!lenient) {
+ throw new UnsupportedEncodingException(mimeCharset);
+ }
+ } catch (IllegalCharsetNameException ex) {
+ if (!lenient) {
+ throw new UnsupportedEncodingException(mimeCharset);
+ }
+ }
}
+ return Charset.defaultCharset();
}
public TextBody textBody(final String text, final String mimeCharset) throws UnsupportedEncodingException {
diff --git a/dom/src/main/java/org/apache/james/mime4j/message/DefaultMessageBuilder.java b/dom/src/main/java/org/apache/james/mime4j/message/DefaultMessageBuilder.java
index 6221a30..323e25c 100644
--- a/dom/src/main/java/org/apache/james/mime4j/message/DefaultMessageBuilder.java
+++ b/dom/src/main/java/org/apache/james/mime4j/message/DefaultMessageBuilder.java
@@ -302,7 +302,7 @@
BodyDescriptorBuilder bdb = bodyDescBuilder != null ? bodyDescBuilder :
new DefaultBodyDescriptorBuilder(null, fieldParser != null ? fieldParser :
strict ? DefaultFieldParser.getParser() : LenientFieldParser.getParser(), mon);
- BodyFactory bf = bodyFactory != null ? bodyFactory : new BasicBodyFactory();
+ BodyFactory bf = bodyFactory != null ? bodyFactory : new BasicBodyFactory(!strict);
MimeStreamParser parser = new MimeStreamParser(cfg, mon, bdb);
parser.setContentHandler(new ParserStreamContentHandler(message, bf));
parser.setContentDecoding(contentDecoding);
diff --git a/dom/src/main/java/org/apache/james/mime4j/message/MessageBuilder.java b/dom/src/main/java/org/apache/james/mime4j/message/MessageBuilder.java
index b51f11f..5504cda 100644
--- a/dom/src/main/java/org/apache/james/mime4j/message/MessageBuilder.java
+++ b/dom/src/main/java/org/apache/james/mime4j/message/MessageBuilder.java
@@ -309,7 +309,7 @@
/**
* Sets binary content of this message with the given MIME type.
*
- * @param body
+ * @param bin
* the body.
* @param mimeType
* the MIME media type of the specified body
@@ -898,7 +898,7 @@
BodyDescriptorBuilder currentBodyDescBuilder = bodyDescBuilder != null ? bodyDescBuilder :
new DefaultBodyDescriptorBuilder(null, fieldParser != null ? fieldParser :
strict ? DefaultFieldParser.getParser() : LenientFieldParser.getParser(), currentMonitor);
- BodyFactory currentBodyFactory = bodyFactory != null ? bodyFactory : new BasicBodyFactory();
+ BodyFactory currentBodyFactory = bodyFactory != null ? bodyFactory : new BasicBodyFactory(!strict);
MimeStreamParser parser = new MimeStreamParser(currentConfig, currentMonitor, currentBodyDescBuilder);
Message message = new MessageImpl();
diff --git a/dom/src/test/java/org/apache/james/mime4j/dom/MessageCharsetLenientTest.java b/dom/src/test/java/org/apache/james/mime4j/dom/MessageCharsetLenientTest.java
new file mode 100644
index 0000000..f6b15fa
--- /dev/null
+++ b/dom/src/test/java/org/apache/james/mime4j/dom/MessageCharsetLenientTest.java
@@ -0,0 +1,108 @@
+package org.apache.james.mime4j.dom;
+
+import java.io.ByteArrayInputStream;
+import java.io.UnsupportedEncodingException;
+
+import org.apache.james.mime4j.message.BasicBodyFactory;
+import org.apache.james.mime4j.message.DefaultMessageBuilder;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * check that the Charset handling of BasicBodyFactory can be influenced with
+ * the boolean lenient flag
+ *
+ * @author wf
+ *
+ */
+public class MessageCharsetLenientTest {
+
+ /**
+ * parse messages that declare an invalid charset, with lenient handling on and off
+ *
+ * @throws Exception
+ */
+ @Test
+ public void testLenientCharsetHandling() throws Exception {
+ // this list of invalidCharsets is taken from parsing a sample of some 1/4 million e-mails
+ // so all of them showed up in real-world e-mails
+ String invalidCharsets[] = {
+ "%CHARSET",
+ "'iso-8859-1'",
+ "'utf-8'",
+ "0",
+ "238",
+ "DEFAULT_CHARSET",
+ "DIN_66003",
+ "ISO 8859-1",
+ "None",
+ "Standard",
+ "UTF-7",
+ "X-CTEXT",
+ "X-UNKNOWN",
+ "\\iso-8859-1\"",
+ "\\us-ascii\"",
+ "ansi_x3.110-1983",
+ "charset=us-ascii",
+ "en",
+ "iso-0-250-250-250-25-0-25",
+ "iso-10646",
+ "iso-1149-1",
+ "iso-2191-1",
+ "iso-3817-4",
+ "iso-4736-8",
+ "iso-5266-7",
+ "iso-5666-3",
+ "iso-5978-6",
+ "iso-6558-5",
+ "iso-7708-8",
+ "iso-8085-5",
+ "iso-8589-0",
+ "iso-8814-4",
+ "iso-8859-1 name=FAQ.htm",
+ "iso-8859-16",
+ "iso-8859-1?",
+ "iso-8859-8-i",
+ "iso-9284-4",
+ "latin-iso8859-1",
+ "unicode-1-1-utf-7",
+ "unknown-8bit",
+ "utf-7",
+ "windows-1250 reply-type=original",
+ "windows-1252 <!DOCTYPE HTML PUBLIC -//W3C//DTD HTML 4.01 Transitional//EN>",
+ "x-user-defined", " {$RND_CHARSET$}" };
+
+ // check with lenient charset handling on and off
+ boolean[] lenientstates = { true, false };
+ // create the message builder
+ DefaultMessageBuilder builder = new DefaultMessageBuilder();
+ // count how many Exception hits we got
+ int invalidCount=0;
+ // test in both states
+ for (boolean lenient : lenientstates) {
+ // set how lenient we are
+ builder.setBodyFactory(new BasicBodyFactory(lenient));
+ // check the list of invalid Charsets
+ for (String invalidCharset : invalidCharsets) {
+ // create a message with the charset
+ String charsetContent = "Subject: my subject\r\n"
+ + "Content-Type: text/plain; charset=" + invalidCharset + "\r\n"
+ + "Strange charset isn't it?\r" + "\r\n";
+ // try parsing it
+ try {
+ Message message = builder.parseMessage(new ByteArrayInputStream(
+ charsetContent.getBytes("UTF-8")));
+ // check some message attribute
+ Assert.assertEquals("text/plain", message.getMimeType());
+ // if we get here we are in lenient mode - in non-lenient mode an exception would have been thrown
+ Assert.assertTrue("Charset:"+invalidCharset+" should not be allowed when lenient is "+lenient,lenient);
+ } catch (UnsupportedEncodingException ex) {
+ Assert.assertFalse("Charset:"+invalidCharset+" should not throw an exception when lenient is "+lenient,lenient);
+ invalidCount++;
+ }
+ }
+ } // for
+ Assert.assertEquals(invalidCharsets.length,invalidCount);
+ }
+
+}
diff --git a/dom/src/test/java/org/apache/james/mime4j/message/BodyPartBuilderTest.java b/dom/src/test/java/org/apache/james/mime4j/message/BodyPartBuilderTest.java
index 154cf7c..55d2aaf 100644
--- a/dom/src/test/java/org/apache/james/mime4j/message/BodyPartBuilderTest.java
+++ b/dom/src/test/java/org/apache/james/mime4j/message/BodyPartBuilderTest.java
@@ -20,13 +20,10 @@
package org.apache.james.mime4j.message;
import java.io.InputStream;
-import java.util.List;
import org.apache.james.mime4j.Charsets;
import org.apache.james.mime4j.dom.Body;
-import org.apache.james.mime4j.dom.TextBody;
import org.apache.james.mime4j.dom.field.ContentTypeField;
-import org.apache.james.mime4j.stream.Field;
import org.junit.Assert;
import org.junit.Test;
import org.mockito.Mockito;
diff --git a/dom/src/test/java/org/apache/james/mime4j/message/EntityImplTest.java b/dom/src/test/java/org/apache/james/mime4j/message/EntityImplTest.java
index 9ef8412..2d0b31d 100644
--- a/dom/src/test/java/org/apache/james/mime4j/message/EntityImplTest.java
+++ b/dom/src/test/java/org/apache/james/mime4j/message/EntityImplTest.java
@@ -23,9 +23,6 @@
import org.apache.james.mime4j.dom.Entity;
import org.apache.james.mime4j.dom.Header;
import org.apache.james.mime4j.field.DefaultFieldParser;
-import org.apache.james.mime4j.message.BasicBodyFactory;
-import org.apache.james.mime4j.message.BodyPart;
-import org.apache.james.mime4j.message.HeaderImpl;
import org.junit.Assert;
import org.junit.Test;
diff --git a/dom/src/test/java/org/apache/james/mime4j/message/MessageBuilderTest.java b/dom/src/test/java/org/apache/james/mime4j/message/MessageBuilderTest.java
index 35533f8..7e9a296 100644
--- a/dom/src/test/java/org/apache/james/mime4j/message/MessageBuilderTest.java
+++ b/dom/src/test/java/org/apache/james/mime4j/message/MessageBuilderTest.java
@@ -23,7 +23,6 @@
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Date;
-import java.util.List;
import java.util.TimeZone;
import org.apache.james.mime4j.Charsets;
@@ -39,7 +38,6 @@
import org.apache.james.mime4j.field.DefaultFieldParser;
import org.apache.james.mime4j.field.Fields;
import org.apache.james.mime4j.field.address.AddressBuilder;
-import org.apache.james.mime4j.stream.Field;
import org.junit.Assert;
import org.junit.Test;
import org.mockito.Mockito;