src/main/java/org/apache/commons/fileupload/util/mime/MimeUtility.java - commons-fileupload - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.commons.fileupload.util.mime;

 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.UnsupportedEncodingException;
 import java.util.HashMap;
 import java.util.Locale;
 import java.util.Map;

 /**
  * Utility class to decode MIME texts.
  *
  * @since 1.3
  */
 public final class MimeUtility {

     /**
      * The linear whitespace chars sequence.
      */
     private static final String LINEAR_WHITESPACE = " \t\r\n";

     /**
      * Mappings between MIME and Java charset.
      */
     private static final Map<String, String> MIME2JAVA = new HashMap<String, String>();

     /**
      * The Base64 decoder.
      */
     private static final Base64Decoder BASE64_DECODER = new Base64Decoder();

     static {
         MIME2JAVA.put("iso-2022-cn", "ISO2022CN");
         MIME2JAVA.put("iso-2022-kr", "ISO2022KR");
         MIME2JAVA.put("utf-8", "UTF8");
         MIME2JAVA.put("utf8", "UTF8");
         MIME2JAVA.put("ja_jp.iso2022-7", "ISO2022JP");
         MIME2JAVA.put("ja_jp.eucjp", "EUCJIS");
         MIME2JAVA.put("euc-kr", "KSC5601");
         MIME2JAVA.put("euckr", "KSC5601");
         MIME2JAVA.put("us-ascii", "ISO-8859-1");
         MIME2JAVA.put("x-us-ascii", "ISO-8859-1");
     }

     /**
      * Hidden constructor, this class must not be instantiated.
      */
     private MimeUtility() {
         // do nothing
     }

     /**
      * Decode a string of text obtained from a mail header into
      * it's proper form.  The text generally will consist of a
      * string of tokens, some of which may be encoded using
      * base64 encoding.
      *
      * @param text   The text to decode.
      *
      * @return The decoded test string.
      * @throws UnsupportedEncodingException
      */
     public static String decodeText(String text) throws UnsupportedEncodingException {
         // if the text contains any encoded tokens, those tokens will be marked with "=?".  If the
         // source string doesn't contain that sequent, no decoding is required.
         if (text.indexOf("=?") < 0) {
             return text;
         }

         int offset = 0;
         int endOffset = text.length();

         int startWhiteSpace = -1;
         int endWhiteSpace = -1;

         StringBuffer decodedText = new StringBuffer(text.length());

         boolean previousTokenEncoded = false;

         while (offset < endOffset) {
             char ch = text.charAt(offset);

             // is this a whitespace character?
             if (LINEAR_WHITESPACE.indexOf(ch) != -1) {
                 startWhiteSpace = offset;
                 while (offset < endOffset) {
                     // step over the white space characters.
                     ch = text.charAt(offset);
                     if (LINEAR_WHITESPACE.indexOf(ch) != -1) {
                         offset++;
                     } else {
                         // record the location of the first non lwsp and drop down to process the
                         // token characters.
                         endWhiteSpace = offset;
                         break;
                     }
                 }
             } else {
                 // we have a word token.  We need to scan over the word and then try to parse it.
                 int wordStart = offset;

                 while (offset < endOffset) {
                     // step over the white space characters.
                     ch = text.charAt(offset);
                     if (LINEAR_WHITESPACE.indexOf(ch) == -1) {
                         offset++;
                     } else {
                         break;
                     }

                     //NB:  Trailing whitespace on these header strings will just be discarded.
                 }
                 // pull out the word token.
                 String word = text.substring(wordStart, offset);
                 // is the token encoded?  decode the word
                 if (word.startsWith("=?")) {
                     try {
                         // if this gives a parsing failure, treat it like a non-encoded word.
                         String decodedWord = decodeWord(word);

                         // are any whitespace characters significant?  Append 'em if we've got 'em.
                         if (!previousTokenEncoded && startWhiteSpace != -1) {
                             decodedText.append(text.substring(startWhiteSpace, endWhiteSpace));
                             startWhiteSpace = -1;
                         }
                         // this is definitely a decoded token.
                         previousTokenEncoded = true;
                         // and add this to the text.
                         decodedText.append(decodedWord);
                         // we continue parsing from here...we allow parsing errors to fall through
                         // and get handled as normal text.
                         continue;

                     } catch (ParseException e) {
                         // just ignore it, skip to next word
                     }
                 }
                 // this is a normal token, so it doesn't matter what the previous token was.  Add the white space
                 // if we have it.
                 if (startWhiteSpace != -1) {
                     decodedText.append(text.substring(startWhiteSpace, endWhiteSpace));
                     startWhiteSpace = -1;
                 }
                 // this is not a decoded token.
                 previousTokenEncoded = false;
                 decodedText.append(word);
             }
         }

         return decodedText.toString();
     }

     /**
      * Parse a string using the RFC 2047 rules for an "encoded-word"
      * type.  This encoding has the syntax:
      *
      * encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
      *
      * @param word   The possibly encoded word value.
      *
      * @return The decoded word.
      * @throws ParseException
      * @throws UnsupportedEncodingException
      */
     private static String decodeWord(String word) throws ParseException, UnsupportedEncodingException {
         // encoded words start with the characters "=?".  If this not an encoded word, we throw a
         // ParseException for the caller.

         if (!word.startsWith("=?")) {
             throw new ParseException("Invalid RFC 2047 encoded-word: " + word);
         }

         int charsetPos = word.indexOf('?', 2);
         if (charsetPos == -1) {
             throw new ParseException("Missing charset in RFC 2047 encoded-word: " + word);
         }

         // pull out the character set information (this is the MIME name at this point).
         String charset = word.substring(2, charsetPos).toLowerCase();

         // now pull out the encoding token the same way.
         int encodingPos = word.indexOf('?', charsetPos + 1);
         if (encodingPos == -1) {
             throw new ParseException("Missing encoding in RFC 2047 encoded-word: " + word);
         }

         String encoding = word.substring(charsetPos + 1, encodingPos);

         // and finally the encoded text.
         int encodedTextPos = word.indexOf("?=", encodingPos + 1);
         if (encodedTextPos == -1) {
             throw new ParseException("Missing encoded text in RFC 2047 encoded-word: " + word);
         }

         String encodedText = word.substring(encodingPos + 1, encodedTextPos);

         // seems a bit silly to encode a null string, but easy to deal with.
         if (encodedText.length() == 0) {
             return "";
         }

         try {
             // the decoder writes directly to an output stream.
             ByteArrayOutputStream out = new ByteArrayOutputStream(encodedText.length());

             byte[] encodedData = encodedText.getBytes("US-ASCII");

             // Base64 encoded?
             if (encoding.equals("B")) {
                 BASE64_DECODER.decode(encodedData, 0, encodedData.length, out);
             } else if (encoding.equals("Q")) { // maybe quoted printable.
                 QuotedPrintableDecoder.decodeWord(encodedData, 0, encodedData.length, out);
             } else {
                 throw new UnsupportedEncodingException("Unknown RFC 2047 encoding: " + encoding);
             }
             // get the decoded byte data and convert into a string.
             byte[] decodedData = out.toByteArray();
             return new String(decodedData, javaCharset(charset));
         } catch (IOException e) {
             throw new UnsupportedEncodingException("Invalid RFC 2047 encoding");
         }
     }

     /**
      * Translate a MIME standard character set name into the Java
      * equivalent.
      *
      * @param charset The MIME standard name.
      *
      * @return The Java equivalent for this name.
      */
     private static String javaCharset(String charset) {
         // nothing in, nothing out.
         if (charset == null) {
             return null;
         }

         String mappedCharset = MIME2JAVA.get(charset.toLowerCase(Locale.ENGLISH));
         // if there is no mapping, then the original name is used.  Many of the MIME character set
         // names map directly back into Java.  The reverse isn't necessarily true.
         if (mappedCharset == null) {
             return charset;
         }
         return mappedCharset;
     }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.commons.fileupload.util.mime;

	import java.io.ByteArrayOutputStream;
	import java.io.IOException;
	import java.io.UnsupportedEncodingException;
	import java.util.HashMap;
	import java.util.Locale;
	import java.util.Map;

	/**
	* Utility class to decode MIME texts.
	*
	* @since 1.3
	*/
	public final class MimeUtility {

	/**
	* The linear whitespace chars sequence.
	*/
	private static final String LINEAR_WHITESPACE = " \t\r\n";

	/**
	* Mappings between MIME and Java charset.
	*/
	private static final Map<String, String> MIME2JAVA = new HashMap<String, String>();

	/**
	* The Base64 decoder.
	*/
	private static final Base64Decoder BASE64_DECODER = new Base64Decoder();

	static {
	MIME2JAVA.put("iso-2022-cn", "ISO2022CN");
	MIME2JAVA.put("iso-2022-kr", "ISO2022KR");
	MIME2JAVA.put("utf-8", "UTF8");
	MIME2JAVA.put("utf8", "UTF8");
	MIME2JAVA.put("ja_jp.iso2022-7", "ISO2022JP");
	MIME2JAVA.put("ja_jp.eucjp", "EUCJIS");
	MIME2JAVA.put("euc-kr", "KSC5601");
	MIME2JAVA.put("euckr", "KSC5601");
	MIME2JAVA.put("us-ascii", "ISO-8859-1");
	MIME2JAVA.put("x-us-ascii", "ISO-8859-1");
	}

	/**
	* Hidden constructor, this class must not be instantiated.
	*/
	private MimeUtility() {
	// do nothing
	}

	/**
	* Decode a string of text obtained from a mail header into
	* it's proper form. The text generally will consist of a
	* string of tokens, some of which may be encoded using
	* base64 encoding.
	*
	* @param text The text to decode.
	*
	* @return The decoded test string.
	* @throws UnsupportedEncodingException
	*/
	public static String decodeText(String text) throws UnsupportedEncodingException {
	// if the text contains any encoded tokens, those tokens will be marked with "=?". If the
	// source string doesn't contain that sequent, no decoding is required.
	if (text.indexOf("=?") < 0) {
	return text;
	}

	int offset = 0;
	int endOffset = text.length();

	int startWhiteSpace = -1;
	int endWhiteSpace = -1;

	StringBuffer decodedText = new StringBuffer(text.length());

	boolean previousTokenEncoded = false;

	while (offset < endOffset) {
	char ch = text.charAt(offset);

	// is this a whitespace character?
	if (LINEAR_WHITESPACE.indexOf(ch) != -1) {
	startWhiteSpace = offset;
	while (offset < endOffset) {
	// step over the white space characters.
	ch = text.charAt(offset);
	if (LINEAR_WHITESPACE.indexOf(ch) != -1) {
	offset++;
	} else {
	// record the location of the first non lwsp and drop down to process the
	// token characters.
	endWhiteSpace = offset;
	break;
	}
	}
	} else {
	// we have a word token. We need to scan over the word and then try to parse it.
	int wordStart = offset;

	while (offset < endOffset) {
	// step over the white space characters.
	ch = text.charAt(offset);
	if (LINEAR_WHITESPACE.indexOf(ch) == -1) {
	offset++;
	} else {
	break;
	}

	//NB: Trailing whitespace on these header strings will just be discarded.
	}
	// pull out the word token.
	String word = text.substring(wordStart, offset);
	// is the token encoded? decode the word
	if (word.startsWith("=?")) {
	try {
	// if this gives a parsing failure, treat it like a non-encoded word.
	String decodedWord = decodeWord(word);

	// are any whitespace characters significant? Append 'em if we've got 'em.
	if (!previousTokenEncoded && startWhiteSpace != -1) {
	decodedText.append(text.substring(startWhiteSpace, endWhiteSpace));
	startWhiteSpace = -1;
	}
	// this is definitely a decoded token.
	previousTokenEncoded = true;
	// and add this to the text.
	decodedText.append(decodedWord);
	// we continue parsing from here...we allow parsing errors to fall through
	// and get handled as normal text.
	continue;

	} catch (ParseException e) {
	// just ignore it, skip to next word
	}
	}
	// this is a normal token, so it doesn't matter what the previous token was. Add the white space
	// if we have it.
	if (startWhiteSpace != -1) {
	decodedText.append(text.substring(startWhiteSpace, endWhiteSpace));
	startWhiteSpace = -1;
	}
	// this is not a decoded token.
	previousTokenEncoded = false;
	decodedText.append(word);
	}
	}

	return decodedText.toString();
	}

	/**
	* Parse a string using the RFC 2047 rules for an "encoded-word"
	* type. This encoding has the syntax:
	*
	* encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
	*
	* @param word The possibly encoded word value.
	*
	* @return The decoded word.
	* @throws ParseException
	* @throws UnsupportedEncodingException
	*/
	private static String decodeWord(String word) throws ParseException, UnsupportedEncodingException {
	// encoded words start with the characters "=?". If this not an encoded word, we throw a
	// ParseException for the caller.

	if (!word.startsWith("=?")) {
	throw new ParseException("Invalid RFC 2047 encoded-word: " + word);
	}

	int charsetPos = word.indexOf('?', 2);
	if (charsetPos == -1) {
	throw new ParseException("Missing charset in RFC 2047 encoded-word: " + word);
	}

	// pull out the character set information (this is the MIME name at this point).
	String charset = word.substring(2, charsetPos).toLowerCase();

	// now pull out the encoding token the same way.
	int encodingPos = word.indexOf('?', charsetPos + 1);
	if (encodingPos == -1) {
	throw new ParseException("Missing encoding in RFC 2047 encoded-word: " + word);
	}

	String encoding = word.substring(charsetPos + 1, encodingPos);

	// and finally the encoded text.
	int encodedTextPos = word.indexOf("?=", encodingPos + 1);
	if (encodedTextPos == -1) {
	throw new ParseException("Missing encoded text in RFC 2047 encoded-word: " + word);
	}

	String encodedText = word.substring(encodingPos + 1, encodedTextPos);

	// seems a bit silly to encode a null string, but easy to deal with.
	if (encodedText.length() == 0) {
	return "";
	}

	try {
	// the decoder writes directly to an output stream.
	ByteArrayOutputStream out = new ByteArrayOutputStream(encodedText.length());

	byte[] encodedData = encodedText.getBytes("US-ASCII");

	// Base64 encoded?
	if (encoding.equals("B")) {
	BASE64_DECODER.decode(encodedData, 0, encodedData.length, out);
	} else if (encoding.equals("Q")) { // maybe quoted printable.
	QuotedPrintableDecoder.decodeWord(encodedData, 0, encodedData.length, out);
	} else {
	throw new UnsupportedEncodingException("Unknown RFC 2047 encoding: " + encoding);
	}
	// get the decoded byte data and convert into a string.
	byte[] decodedData = out.toByteArray();
	return new String(decodedData, javaCharset(charset));
	} catch (IOException e) {
	throw new UnsupportedEncodingException("Invalid RFC 2047 encoding");
	}
	}

	/**
	* Translate a MIME standard character set name into the Java
	* equivalent.
	*
	* @param charset The MIME standard name.
	*
	* @return The Java equivalent for this name.
	*/
	private static String javaCharset(String charset) {
	// nothing in, nothing out.
	if (charset == null) {
	return null;
	}

	String mappedCharset = MIME2JAVA.get(charset.toLowerCase(Locale.ENGLISH));
	// if there is no mapping, then the original name is used. Many of the MIME character set
	// names map directly back into Java. The reverse isn't necessarily true.
	if (mappedCharset == null) {
	return charset;
	}
	return mappedCharset;
	}

	}