| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| package javax.mail.internet; |
| |
| /** |
| * @version $Rev$ $Date$ |
| */ |
| public class HeaderTokenizer { |
| public static class Token { |
| // Constant values from J2SE 1.4 API Docs (Constant values) |
| public static final int ATOM = -1; |
| public static final int COMMENT = -3; |
| public static final int EOF = -4; |
| public static final int QUOTEDSTRING = -2; |
| private final int _type; |
| private final String _value; |
| |
| public Token(final int type, final String value) { |
| _type = type; |
| _value = value; |
| } |
| |
| public int getType() { |
| return _type; |
| } |
| |
| public String getValue() { |
| return _value; |
| } |
| } |
| |
| private static final char NUL = '\0'; |
| private static final Token EOF = new Token(Token.EOF, null); |
| // characters not allowed in MIME |
| public static final String MIME = "()<>@,;:\\\"\t []/?="; |
| // characters not allowed in RFC822 |
| public static final String RFC822 = "()<>@,;:\\\"\t .[]"; |
| private static final String WHITE = " \t\n\r"; |
| private final String _delimiters; |
| private final String _header; |
| private final int _headerLength; |
| private final boolean _skip; |
| private int pos; |
| |
| public HeaderTokenizer(final String header) { |
| this(header, RFC822); |
| } |
| |
| public HeaderTokenizer(final String header, final String delimiters) { |
| this(header, delimiters, true); |
| } |
| |
| public HeaderTokenizer(final String header, |
| final String delimiters, |
| final boolean skipComments) { |
| _skip = skipComments; |
| _header = header; |
| _delimiters = delimiters; |
| _headerLength=header.length(); |
| } |
| |
| //Return the rest of the Header. |
| //null is returned if we are already at end of header |
| public String getRemainder() { |
| |
| if(pos > _headerLength) { |
| return null; |
| } |
| |
| return _header.substring(pos); |
| } |
| |
| public Token next() throws ParseException { |
| return readToken(NUL, false); |
| } |
| |
| /** |
| * Parses the next token from this String. |
| * If endOfAtom is not NUL, the token extends until the |
| * endOfAtom character is seen, or to the end of the header. |
| * This method is useful when parsing headers that don't |
| * obey the MIME specification, e.g., by failing to quote |
| * parameter values that contain spaces. |
| * |
| * @param endOfAtom if not NUL, character marking end of token |
| * @return the next Token |
| * @exception ParseException if the parse fails |
| * @since JavaMail 1.5 |
| */ |
| public Token next(final char endOfAtom) throws ParseException { |
| return next(endOfAtom, false); |
| } |
| |
| /** |
| * Parses the next token from this String. |
| * endOfAtom is handled as above. If keepEscapes is true, |
| * any backslash escapes are preserved in the returned string. |
| * This method is useful when parsing headers that don't |
| * obey the MIME specification, e.g., by failing to escape |
| * backslashes in the filename parameter. |
| * |
| * @param endOfAtom if not NUL, character marking end of token |
| * @param keepEscapes keep all backslashes in returned string? |
| * @return the next Token |
| * @exception ParseException if the parse fails |
| * @since JavaMail 1.5 |
| */ |
| public Token next(final char endOfAtom, final boolean keepEscapes) |
| throws ParseException { |
| return readToken(endOfAtom, keepEscapes); |
| } |
| |
| |
| public Token peek() throws ParseException { |
| final int start = pos; |
| try { |
| return readToken(NUL, false); |
| } finally { |
| pos = start; |
| } |
| } |
| |
| /** |
| * Read an ATOM token from the parsed header. |
| * |
| * @return A token containing the value of the atom token. |
| */ |
| private Token readAtomicToken() { |
| // skip to next delimiter |
| final int start = pos; |
| final StringBuilder sb = new StringBuilder(); |
| sb.append(_header.charAt(pos)); |
| while (++pos < _headerLength) { |
| // break on the first non-atom character. |
| final char ch = _header.charAt(pos); |
| |
| if ((_delimiters.indexOf(_header.charAt(pos)) != -1 || ch < 32 || ch >= 127)) { |
| break; |
| } |
| } |
| |
| return new Token(Token.ATOM, _header.substring(start, pos)); |
| } |
| |
| /** |
| * Read the next token from the header. |
| * |
| * @return The next token from the header. White space is skipped, and comment |
| * tokens are also skipped if indicated. |
| * @exception ParseException |
| */ |
| private Token readToken(final char endOfAtom, final boolean keepEscapes) throws ParseException { |
| if (pos >= _headerLength) { |
| return EOF; |
| } else { |
| final char c = _header.charAt(pos); |
| // comment token...read and skip over this |
| if (c == '(') { |
| final Token comment = readComment(keepEscapes); |
| if (_skip) { |
| return readToken(endOfAtom, keepEscapes); |
| } else { |
| return comment; |
| } |
| |
| // quoted literal |
| } else if (c == '\"') { |
| return readQuotedString('"', keepEscapes, 1); |
| |
| // white space, eat this and find a real token. |
| } else if (WHITE.indexOf(c) != -1) { |
| eatWhiteSpace(); |
| return readToken(endOfAtom, keepEscapes); |
| |
| // either a CTL or special. These characters have a self-defining token type. |
| } else if (c < 32 || c >= 127 || _delimiters.indexOf(c) != -1) { |
| |
| if (endOfAtom != NUL && c != endOfAtom) { |
| return readQuotedString(endOfAtom, keepEscapes, 0); |
| } |
| |
| |
| pos++; |
| return new Token(c, String.valueOf(c)); |
| |
| } else { |
| // start of an atom, parse it off. |
| if (endOfAtom != NUL && c != endOfAtom) { |
| return readQuotedString(endOfAtom, keepEscapes, 0); |
| } |
| |
| return readAtomicToken(); |
| } |
| } |
| } |
| |
| /** |
| * Extract a substring from the header string and apply any |
| * escaping/folding rules to the string. |
| * |
| * @param start The starting offset in the header. |
| * @param end The header end offset + 1. |
| * |
| * @return The processed string value. |
| * @exception ParseException |
| */ |
| private String getEscapedValue(final int start, final int end, final boolean keepEscapes) throws ParseException { |
| final StringBuffer value = new StringBuffer(); |
| |
| for (int i = start; i < end; i++) { |
| final char ch = _header.charAt(i); |
| // is this an escape character? |
| if (ch == '\\') { |
| i++; |
| if (i == end) { |
| throw new ParseException("Invalid escape character"); |
| } |
| |
| if(keepEscapes) { |
| value.append("\\"); |
| } |
| |
| value.append(_header.charAt(i)); |
| } |
| // line breaks are ignored, except for naked '\n' characters, which are consider |
| // parts of linear whitespace. |
| else if (ch == '\r') { |
| // see if this is a CRLF sequence, and skip the second if it is. |
| if (i < end - 1 && _header.charAt(i + 1) == '\n') { |
| i++; |
| } |
| } |
| else { |
| |
| // just append the ch value. |
| value.append(ch); |
| } |
| } |
| return value.toString(); |
| } |
| |
| /** |
| * Read a comment from the header, applying nesting and escape |
| * rules to the content. |
| * |
| * @return A comment token with the token value. |
| * @exception ParseException |
| */ |
| private Token readComment(final boolean keepEscapes) throws ParseException { |
| final int start = pos + 1; |
| int nesting = 1; |
| |
| boolean requiresEscaping = false; |
| |
| // skip to end of comment/string |
| while (++pos < _headerLength) { |
| final char ch = _header.charAt(pos); |
| if (ch == ')') { |
| nesting--; |
| if (nesting == 0) { |
| break; |
| } |
| } |
| else if (ch == '(') { |
| nesting++; |
| } |
| else if (ch == '\\') { |
| pos++; |
| requiresEscaping = true; |
| } |
| // we need to process line breaks also |
| else if (ch == '\r') { |
| requiresEscaping = true; |
| } |
| } |
| |
| if (nesting != 0) { |
| throw new ParseException("Unbalanced comments"); |
| } |
| |
| String value; |
| if (requiresEscaping) { |
| value = getEscapedValue(start, pos, keepEscapes); |
| } |
| else { |
| value = _header.substring(start, pos++); |
| } |
| return new Token(Token.COMMENT, value); |
| } |
| |
| /** |
| * Parse out a quoted string from the header, applying escaping |
| * rules to the value. |
| * |
| * @return The QUOTEDSTRING token with the value. |
| * @exception ParseException |
| */ |
| private Token readQuotedString(final char endChar, final boolean keepEscapes, final int offset) throws ParseException { |
| final int start = pos+offset; |
| boolean requiresEscaping = false; |
| |
| // skip to end of comment/string |
| while (++pos < _headerLength) { |
| final char ch = _header.charAt(pos); |
| |
| if (ch == endChar) { |
| String value; |
| if (requiresEscaping) { |
| value = getEscapedValue(start, pos++, keepEscapes); |
| } |
| else { |
| value = _header.substring(start, pos++); |
| } |
| return new Token(Token.QUOTEDSTRING, value); |
| } |
| else if (ch == '\\') { |
| pos++; |
| requiresEscaping = true; |
| } |
| // we need to process line breaks also |
| else if (ch == '\r') { |
| requiresEscaping = true; |
| } |
| } |
| |
| // we ran out of chars in the string. If the end char is a quote, then there |
| // is a missing quote somewhere |
| if (endChar == '"') { |
| throw new ParseException("Missing '\"'"); |
| } |
| |
| // otherwise, we can just return whatever is left |
| String value; |
| if (requiresEscaping) { |
| value = getEscapedValue(start, pos, keepEscapes); |
| |
| } else { |
| value = _header.substring(start, pos); |
| } |
| return new Token(Token.QUOTEDSTRING, trimWhiteSpace(value)); |
| } |
| |
| /** |
| * Skip white space in the token string. |
| */ |
| private void eatWhiteSpace() { |
| // skip to end of whitespace |
| while (++pos < _headerLength |
| && WHITE.indexOf(_header.charAt(pos)) != -1) { |
| ; |
| } |
| } |
| |
| /** |
| * linear white spaces must be removed from quoted text or text |
| * |
| LWSP-char = SPACE / HTAB ; semantics = SPACE |
| |
| linear-white-space = 1*([CRLF] LWSP-char) ; semantics = SPACE |
| ; CRLF => folding |
| |
| text = <any CHAR, including bare ; => atoms, specials, |
| CR & bare LF, but NOT ; comments and |
| including CRLF> ; quoted-strings are |
| ; NOT recognized. |
| |
| atom = 1*<any CHAR except specials, SPACE and CTLs> |
| |
| quoted-string = <"> *(qtext/quoted-pair) <">; Regular qtext or |
| ; quoted chars. |
| |
| qtext = <any CHAR excepting <">, ; => may be folded |
| "\" & CR, and including |
| linear-white-space> |
| |
| domain-literal = "[" *(dtext / quoted-pair) "]" |
| */ |
| private static String trimWhiteSpace(final String s) { |
| char c; |
| int i; |
| for (i = s.length() - 1; i >= 0; i--) { |
| if (( |
| (c = s.charAt(i)) != ' ') && // space |
| (c != '\t') && // tab |
| (c != '\r') && // CR |
| (c != '\n')) { // LF |
| |
| break; |
| } |
| } |
| |
| if (i <= 0) { |
| return ""; |
| |
| } else { |
| return s.substring(0, i + 1); |
| } |
| } |
| |
| } |