| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.commons.fileupload.util.mime; |
| |
| import java.io.ByteArrayOutputStream; |
| import java.io.IOException; |
| import java.io.UnsupportedEncodingException; |
| import java.util.HashMap; |
| import java.util.Locale; |
| import java.util.Map; |
| |
| /** |
| * Utility class to decode MIME texts. |
| * |
| * @since 1.3 |
| */ |
| public final class MimeUtility { |
| |
| /** |
| * The linear whitespace chars sequence. |
| */ |
| private static final String LINEAR_WHITESPACE = " \t\r\n"; |
| |
| /** |
| * Mappings between MIME and Java charset. |
| */ |
| private static final Map<String, String> MIME2JAVA = new HashMap<String, String>(); |
| |
| /** |
| * The Base64 decoder. |
| */ |
| private static final Base64Decoder BASE64_DECODER = new Base64Decoder(); |
| |
| static { |
| MIME2JAVA.put("iso-2022-cn", "ISO2022CN"); |
| MIME2JAVA.put("iso-2022-kr", "ISO2022KR"); |
| MIME2JAVA.put("utf-8", "UTF8"); |
| MIME2JAVA.put("utf8", "UTF8"); |
| MIME2JAVA.put("ja_jp.iso2022-7", "ISO2022JP"); |
| MIME2JAVA.put("ja_jp.eucjp", "EUCJIS"); |
| MIME2JAVA.put("euc-kr", "KSC5601"); |
| MIME2JAVA.put("euckr", "KSC5601"); |
| MIME2JAVA.put("us-ascii", "ISO-8859-1"); |
| MIME2JAVA.put("x-us-ascii", "ISO-8859-1"); |
| } |
| |
| /** |
| * Hidden constructor, this class must not be instantiated. |
| */ |
| private MimeUtility() { |
| // do nothing |
| } |
| |
| /** |
| * Decode a string of text obtained from a mail header into |
| * it's proper form. The text generally will consist of a |
| * string of tokens, some of which may be encoded using |
| * base64 encoding. |
| * |
| * @param text The text to decode. |
| * |
| * @return The decoded test string. |
| * @throws UnsupportedEncodingException |
| */ |
| public static String decodeText(String text) throws UnsupportedEncodingException { |
| // if the text contains any encoded tokens, those tokens will be marked with "=?". If the |
| // source string doesn't contain that sequent, no decoding is required. |
| if (text.indexOf("=?") < 0) { |
| return text; |
| } |
| |
| int offset = 0; |
| int endOffset = text.length(); |
| |
| int startWhiteSpace = -1; |
| int endWhiteSpace = -1; |
| |
| StringBuffer decodedText = new StringBuffer(text.length()); |
| |
| boolean previousTokenEncoded = false; |
| |
| while (offset < endOffset) { |
| char ch = text.charAt(offset); |
| |
| // is this a whitespace character? |
| if (LINEAR_WHITESPACE.indexOf(ch) != -1) { |
| startWhiteSpace = offset; |
| while (offset < endOffset) { |
| // step over the white space characters. |
| ch = text.charAt(offset); |
| if (LINEAR_WHITESPACE.indexOf(ch) != -1) { |
| offset++; |
| } else { |
| // record the location of the first non lwsp and drop down to process the |
| // token characters. |
| endWhiteSpace = offset; |
| break; |
| } |
| } |
| } else { |
| // we have a word token. We need to scan over the word and then try to parse it. |
| int wordStart = offset; |
| |
| while (offset < endOffset) { |
| // step over the white space characters. |
| ch = text.charAt(offset); |
| if (LINEAR_WHITESPACE.indexOf(ch) == -1) { |
| offset++; |
| } else { |
| break; |
| } |
| |
| //NB: Trailing whitespace on these header strings will just be discarded. |
| } |
| // pull out the word token. |
| String word = text.substring(wordStart, offset); |
| // is the token encoded? decode the word |
| if (word.startsWith("=?")) { |
| try { |
| // if this gives a parsing failure, treat it like a non-encoded word. |
| String decodedWord = decodeWord(word); |
| |
| // are any whitespace characters significant? Append 'em if we've got 'em. |
| if (!previousTokenEncoded && startWhiteSpace != -1) { |
| decodedText.append(text.substring(startWhiteSpace, endWhiteSpace)); |
| startWhiteSpace = -1; |
| } |
| // this is definitely a decoded token. |
| previousTokenEncoded = true; |
| // and add this to the text. |
| decodedText.append(decodedWord); |
| // we continue parsing from here...we allow parsing errors to fall through |
| // and get handled as normal text. |
| continue; |
| |
| } catch (ParseException e) { |
| // just ignore it, skip to next word |
| } |
| } |
| // this is a normal token, so it doesn't matter what the previous token was. Add the white space |
| // if we have it. |
| if (startWhiteSpace != -1) { |
| decodedText.append(text.substring(startWhiteSpace, endWhiteSpace)); |
| startWhiteSpace = -1; |
| } |
| // this is not a decoded token. |
| previousTokenEncoded = false; |
| decodedText.append(word); |
| } |
| } |
| |
| return decodedText.toString(); |
| } |
| |
| /** |
| * Parse a string using the RFC 2047 rules for an "encoded-word" |
| * type. This encoding has the syntax: |
| * |
| * encoded-word = "=?" charset "?" encoding "?" encoded-text "?=" |
| * |
| * @param word The possibly encoded word value. |
| * |
| * @return The decoded word. |
| * @throws ParseException |
| * @throws UnsupportedEncodingException |
| */ |
| private static String decodeWord(String word) throws ParseException, UnsupportedEncodingException { |
| // encoded words start with the characters "=?". If this not an encoded word, we throw a |
| // ParseException for the caller. |
| |
| if (!word.startsWith("=?")) { |
| throw new ParseException("Invalid RFC 2047 encoded-word: " + word); |
| } |
| |
| int charsetPos = word.indexOf('?', 2); |
| if (charsetPos == -1) { |
| throw new ParseException("Missing charset in RFC 2047 encoded-word: " + word); |
| } |
| |
| // pull out the character set information (this is the MIME name at this point). |
| String charset = word.substring(2, charsetPos).toLowerCase(); |
| |
| // now pull out the encoding token the same way. |
| int encodingPos = word.indexOf('?', charsetPos + 1); |
| if (encodingPos == -1) { |
| throw new ParseException("Missing encoding in RFC 2047 encoded-word: " + word); |
| } |
| |
| String encoding = word.substring(charsetPos + 1, encodingPos); |
| |
| // and finally the encoded text. |
| int encodedTextPos = word.indexOf("?=", encodingPos + 1); |
| if (encodedTextPos == -1) { |
| throw new ParseException("Missing encoded text in RFC 2047 encoded-word: " + word); |
| } |
| |
| String encodedText = word.substring(encodingPos + 1, encodedTextPos); |
| |
| // seems a bit silly to encode a null string, but easy to deal with. |
| if (encodedText.length() == 0) { |
| return ""; |
| } |
| |
| try { |
| // the decoder writes directly to an output stream. |
| ByteArrayOutputStream out = new ByteArrayOutputStream(encodedText.length()); |
| |
| byte[] encodedData = encodedText.getBytes("US-ASCII"); |
| |
| // Base64 encoded? |
| if (encoding.equals("B")) { |
| BASE64_DECODER.decode(encodedData, 0, encodedData.length, out); |
| } else if (encoding.equals("Q")) { // maybe quoted printable. |
| QuotedPrintableDecoder.decodeWord(encodedData, 0, encodedData.length, out); |
| } else { |
| throw new UnsupportedEncodingException("Unknown RFC 2047 encoding: " + encoding); |
| } |
| // get the decoded byte data and convert into a string. |
| byte[] decodedData = out.toByteArray(); |
| return new String(decodedData, javaCharset(charset)); |
| } catch (IOException e) { |
| throw new UnsupportedEncodingException("Invalid RFC 2047 encoding"); |
| } |
| } |
| |
| /** |
| * Translate a MIME standard character set name into the Java |
| * equivalent. |
| * |
| * @param charset The MIME standard name. |
| * |
| * @return The Java equivalent for this name. |
| */ |
| private static String javaCharset(String charset) { |
| // nothing in, nothing out. |
| if (charset == null) { |
| return null; |
| } |
| |
| String mappedCharset = MIME2JAVA.get(charset.toLowerCase(Locale.ENGLISH)); |
| // if there is no mapping, then the original name is used. Many of the MIME character set |
| // names map directly back into Java. The reverse isn't necessarily true. |
| if (mappedCharset == null) { |
| return charset; |
| } |
| return mappedCharset; |
| } |
| |
| } |