MIME4J-218: Content-Type fallback character set
contributed by Wolfgang Fahl <wf at bitplan.com>

git-svn-id: https://svn.apache.org/repos/asf/james/mime4j/trunk@1635743 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/core/src/main/java/org/apache/james/mime4j/stream/DefaultFieldBuilder.java b/core/src/main/java/org/apache/james/mime4j/stream/DefaultFieldBuilder.java
index 8b335f0..6c65613 100644
--- a/core/src/main/java/org/apache/james/mime4j/stream/DefaultFieldBuilder.java
+++ b/core/src/main/java/org/apache/james/mime4j/stream/DefaultFieldBuilder.java
@@ -60,7 +60,7 @@
         }
         int len = line.length();
         if (this.maxlen > 0 && this.buf.length() + len >= this.maxlen) {
-            throw new MaxHeaderLengthLimitException("Maximum header length limit exceeded");
+            throw new MaxHeaderLengthLimitException("Maximum header length limit (" + this.maxlen + ") exceeded");
         }
         this.buf.append(line.buffer(), 0, line.length());
     }
diff --git a/dom/src/main/java/org/apache/james/mime4j/message/BasicBodyFactory.java b/dom/src/main/java/org/apache/james/mime4j/message/BasicBodyFactory.java
index 0f406ec..6d24141 100644
--- a/dom/src/main/java/org/apache/james/mime4j/message/BasicBodyFactory.java
+++ b/dom/src/main/java/org/apache/james/mime4j/message/BasicBodyFactory.java
@@ -26,6 +26,7 @@
 import java.io.StringReader;
 import java.io.UnsupportedEncodingException;
 import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
 import java.nio.charset.UnsupportedCharsetException;
 
 import org.apache.james.mime4j.Charsets;
@@ -42,12 +43,44 @@
 
     public static final BasicBodyFactory INSTANCE = new BasicBodyFactory();
 
-    private static Charset resolveCharset(final String mimeCharset) throws UnsupportedEncodingException {
-        try {
-            return mimeCharset != null ? Charset.forName(mimeCharset) : null;
-        } catch (UnsupportedCharsetException ex) {
-            throw new UnsupportedEncodingException(mimeCharset);
+    private final boolean lenient;
+
+    public BasicBodyFactory() {
+        this(true);
+    }
+
+    public BasicBodyFactory(final boolean lenient) {
+        this.lenient = lenient;
+    }
+
+    /**
+     * Selects the Charset for the given mimeCharset string.
+     * 
+     *  If you need support for non-standard or invalid mimeCharset specifications
+     *  you might want to create your own derived BodyFactory extending BasicBodyFactory and
+     *  overriding this method as suggested by:
+     *    https://issues.apache.org/jira/browse/MIME4J-218
+     *  
+     *  A null mimeCharset, or an invalid one in lenient mode (the default), yields Charset.defaultCharset().
+     * 
+     *  @param mimeCharset - the string specification for a Charset e.g. "UTF-8"
+     *  @throws UnsupportedEncodingException if the mimeCharset is invalid and this factory is not lenient
+     */ 
+    protected Charset resolveCharset(final String mimeCharset) throws UnsupportedEncodingException {
+        if (mimeCharset != null) {
+            try {
+                return Charset.forName(mimeCharset);
+            } catch (UnsupportedCharsetException ex) {
+                if (!lenient) {
+                    throw new UnsupportedEncodingException(mimeCharset);
+                }
+            } catch (IllegalCharsetNameException ex) {
+                if (!lenient) {
+                    throw new UnsupportedEncodingException(mimeCharset);
+                }
+            }
         }
+        return Charset.defaultCharset();
     }
 
     public TextBody textBody(final String text, final String mimeCharset) throws UnsupportedEncodingException {
diff --git a/dom/src/main/java/org/apache/james/mime4j/message/DefaultMessageBuilder.java b/dom/src/main/java/org/apache/james/mime4j/message/DefaultMessageBuilder.java
index 6221a30..323e25c 100644
--- a/dom/src/main/java/org/apache/james/mime4j/message/DefaultMessageBuilder.java
+++ b/dom/src/main/java/org/apache/james/mime4j/message/DefaultMessageBuilder.java
@@ -302,7 +302,7 @@
             BodyDescriptorBuilder bdb = bodyDescBuilder != null ? bodyDescBuilder :
                 new DefaultBodyDescriptorBuilder(null, fieldParser != null ? fieldParser :
                     strict ? DefaultFieldParser.getParser() : LenientFieldParser.getParser(), mon);
-            BodyFactory bf = bodyFactory != null ? bodyFactory : new BasicBodyFactory();
+            BodyFactory bf = bodyFactory != null ? bodyFactory : new BasicBodyFactory(!strict);
             MimeStreamParser parser = new MimeStreamParser(cfg, mon, bdb);
             parser.setContentHandler(new ParserStreamContentHandler(message, bf));
             parser.setContentDecoding(contentDecoding);
diff --git a/dom/src/main/java/org/apache/james/mime4j/message/MessageBuilder.java b/dom/src/main/java/org/apache/james/mime4j/message/MessageBuilder.java
index b51f11f..5504cda 100644
--- a/dom/src/main/java/org/apache/james/mime4j/message/MessageBuilder.java
+++ b/dom/src/main/java/org/apache/james/mime4j/message/MessageBuilder.java
@@ -309,7 +309,7 @@
     /**
      * Sets binary content of this message with the given MIME type.
      *
-     * @param body
+     * @param bin
      *            the body.
      * @param mimeType
      *            the MIME media type of the specified body
@@ -898,7 +898,7 @@
         BodyDescriptorBuilder currentBodyDescBuilder = bodyDescBuilder != null ? bodyDescBuilder :
                 new DefaultBodyDescriptorBuilder(null, fieldParser != null ? fieldParser :
                         strict ? DefaultFieldParser.getParser() : LenientFieldParser.getParser(), currentMonitor);
-        BodyFactory currentBodyFactory = bodyFactory != null ? bodyFactory : new BasicBodyFactory();
+        BodyFactory currentBodyFactory = bodyFactory != null ? bodyFactory : new BasicBodyFactory(!strict);
         MimeStreamParser parser = new MimeStreamParser(currentConfig, currentMonitor, currentBodyDescBuilder);
 
         Message message = new MessageImpl();
diff --git a/dom/src/test/java/org/apache/james/mime4j/dom/MessageCharsetLenientTest.java b/dom/src/test/java/org/apache/james/mime4j/dom/MessageCharsetLenientTest.java
new file mode 100644
index 0000000..f6b15fa
--- /dev/null
+++ b/dom/src/test/java/org/apache/james/mime4j/dom/MessageCharsetLenientTest.java
@@ -0,0 +1,108 @@
+package org.apache.james.mime4j.dom;
+
+import java.io.ByteArrayInputStream;
+import java.io.UnsupportedEncodingException;
+
+import org.apache.james.mime4j.message.BasicBodyFactory;
+import org.apache.james.mime4j.message.DefaultMessageBuilder;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * check that the Charset handling of BasicBodyFactory can be influenced with
+ * the boolean lenient flag
+ * 
+ * @author wf
+ *
+ */
+public class MessageCharsetLenientTest {
+
+	/**
+	 * set up a message with an invalid charset
+	 * 
+	 * @throws Exception
+	 */
+	@Test
+	public void testLenientCharsetHandling() throws Exception {
+		// this list of invalidCharsets is taken from parsing a sample of some 1/4 million e-mails,
+		// so all of them showed up in real-world e-mails
+		String invalidCharsets[] = {
+				"%CHARSET",
+				"'iso-8859-1'",
+				"'utf-8'",
+				"0",
+				"238",
+				"DEFAULT_CHARSET",
+				"DIN_66003",
+				"ISO 8859-1",
+				"None",
+				"Standard",
+				"UTF-7",
+				"X-CTEXT",
+				"X-UNKNOWN",
+				"\\iso-8859-1\"",
+				"\\us-ascii\"",
+				"ansi_x3.110-1983",
+				"charset=us-ascii",
+				"en",
+				"iso-0-250-250-250-25-0-25",
+				"iso-10646",
+				"iso-1149-1",
+				"iso-2191-1",
+				"iso-3817-4",
+				"iso-4736-8",
+				"iso-5266-7",
+				"iso-5666-3",
+				"iso-5978-6",
+				"iso-6558-5",
+				"iso-7708-8",
+				"iso-8085-5",
+				"iso-8589-0",
+				"iso-8814-4",
+				"iso-8859-1 name=FAQ.htm",
+				"iso-8859-16",
+				"iso-8859-1?",
+				"iso-8859-8-i",
+				"iso-9284-4",
+				"latin-iso8859-1",
+				"unicode-1-1-utf-7",
+				"unknown-8bit",
+				"utf-7",
+				"windows-1250 reply-type=original",
+				"windows-1252 <!DOCTYPE HTML PUBLIC -//W3C//DTD HTML 4.01 Transitional//EN>",
+				"x-user-defined", " {$RND_CHARSET$}" };
+		
+		// check with lenient charset handling on and off
+		boolean[] lenientstates = { true, false };
+		// create the message builder
+		DefaultMessageBuilder builder = new DefaultMessageBuilder();
+		// count how many Exception hits we got
+		int invalidCount=0;
+		// test in both states
+		for (boolean lenient : lenientstates) {
+			// set how lenient we are
+            builder.setBodyFactory(new BasicBodyFactory(lenient));
+			// check the list of invalid Charsets
+			for (String invalidCharset : invalidCharsets) {
+				// create a message with the charset 
+				String charsetContent = "Subject: my subject\r\n"
+						+ "Content-Type: text/plain; charset=" + invalidCharset + "\r\n"
+						+ "Strange charset isn't it?\r" + "\r\n";
+        // try parsing it
+				try {
+					Message message = builder.parseMessage(new ByteArrayInputStream(
+							charsetContent.getBytes("UTF-8")));
+					// check some message attribute
+					Assert.assertEquals("text/plain", message.getMimeType());
+					// if we get here we had a lenient mode - in non lenient an exception would have been thrown
+					Assert.assertTrue("Charset:"+invalidCharset+" should not be allowed when lenient is "+lenient,lenient);
+				} catch (UnsupportedEncodingException ex) {
+					Assert.assertFalse("Charset:"+invalidCharset+" should not throw an exception when lenient is "+lenient,lenient);
+					invalidCount++;
+				}
+			}
+		} // for
+		Assert.assertEquals(invalidCharsets.length,invalidCount);
+	}
+
+}
diff --git a/dom/src/test/java/org/apache/james/mime4j/message/BodyPartBuilderTest.java b/dom/src/test/java/org/apache/james/mime4j/message/BodyPartBuilderTest.java
index 154cf7c..55d2aaf 100644
--- a/dom/src/test/java/org/apache/james/mime4j/message/BodyPartBuilderTest.java
+++ b/dom/src/test/java/org/apache/james/mime4j/message/BodyPartBuilderTest.java
@@ -20,13 +20,10 @@
 package org.apache.james.mime4j.message;
 
 import java.io.InputStream;
-import java.util.List;
 
 import org.apache.james.mime4j.Charsets;
 import org.apache.james.mime4j.dom.Body;
-import org.apache.james.mime4j.dom.TextBody;
 import org.apache.james.mime4j.dom.field.ContentTypeField;
-import org.apache.james.mime4j.stream.Field;
 import org.junit.Assert;
 import org.junit.Test;
 import org.mockito.Mockito;
diff --git a/dom/src/test/java/org/apache/james/mime4j/message/EntityImplTest.java b/dom/src/test/java/org/apache/james/mime4j/message/EntityImplTest.java
index 9ef8412..2d0b31d 100644
--- a/dom/src/test/java/org/apache/james/mime4j/message/EntityImplTest.java
+++ b/dom/src/test/java/org/apache/james/mime4j/message/EntityImplTest.java
@@ -23,9 +23,6 @@
 import org.apache.james.mime4j.dom.Entity;
 import org.apache.james.mime4j.dom.Header;
 import org.apache.james.mime4j.field.DefaultFieldParser;
-import org.apache.james.mime4j.message.BasicBodyFactory;
-import org.apache.james.mime4j.message.BodyPart;
-import org.apache.james.mime4j.message.HeaderImpl;
 import org.junit.Assert;
 import org.junit.Test;
 
diff --git a/dom/src/test/java/org/apache/james/mime4j/message/MessageBuilderTest.java b/dom/src/test/java/org/apache/james/mime4j/message/MessageBuilderTest.java
index 35533f8..7e9a296 100644
--- a/dom/src/test/java/org/apache/james/mime4j/message/MessageBuilderTest.java
+++ b/dom/src/test/java/org/apache/james/mime4j/message/MessageBuilderTest.java
@@ -23,7 +23,6 @@
 import java.text.SimpleDateFormat;
 import java.util.Arrays;
 import java.util.Date;
-import java.util.List;
 import java.util.TimeZone;
 
 import org.apache.james.mime4j.Charsets;
@@ -39,7 +38,6 @@
 import org.apache.james.mime4j.field.DefaultFieldParser;
 import org.apache.james.mime4j.field.Fields;
 import org.apache.james.mime4j.field.address.AddressBuilder;
-import org.apache.james.mime4j.stream.Field;
 import org.junit.Assert;
 import org.junit.Test;
 import org.mockito.Mockito;