| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| package javax.mail.internet; |
| |
| import java.io.UnsupportedEncodingException; |
| import java.util.ArrayList; |
| import java.util.List; |
| |
| class AddressParser { |
| |
| // the validation strictness levels, from most lenient to most conformant. |
| static public final int NONSTRICT = 0; |
| static public final int PARSE_HEADER = 1; |
| static public final int STRICT = 2; |
| |
| // different mailbox types |
| static protected final int UNKNOWN = 0; |
| static protected final int ROUTE_ADDR = 1; |
| static protected final int GROUP_ADDR = 2; |
| static protected final int SIMPLE_ADDR = 3; |
| |
| // constants for token types. |
| static protected final int END_OF_TOKENS = '\0'; |
| static protected final int PERIOD = '.'; |
| static protected final int LEFT_ANGLE = '<'; |
| static protected final int RIGHT_ANGLE = '>'; |
| static protected final int COMMA = ','; |
| static protected final int AT_SIGN = '@'; |
| static protected final int SEMICOLON = ';'; |
| static protected final int COLON = ':'; |
| static protected final int QUOTED_LITERAL = '"'; |
| static protected final int DOMAIN_LITERAL = '['; |
| static protected final int COMMENT = '('; |
| static protected final int ATOM = 'A'; |
| static protected final int WHITESPACE = ' '; |
| |
| |
| // the string we're parsing |
| private final String addresses; |
| // the current parsing position |
| private int position; |
| // the end position of the string |
| private int end; |
| // the strictness flag |
| private final int validationLevel; |
| |
/**
 * Create a parser over an address string.
 *
 * @param addresses  The string to be parsed; the reference is stored as given.
 * @param validation The validation level to apply (compared against the
 *                   NONSTRICT / PARSE_HEADER constants during parsing).
 */
public AddressParser(final String addresses, final int validation) {
    this.validationLevel = validation;
    this.addresses = addresses;
}
| |
| |
| /** |
| * Parse an address list into an array of internet addresses. |
| * |
| * @return An array containing all of the non-null addresses in the list. |
| * @exception AddressException |
| * Thrown for any validation errors. |
| */ |
| public InternetAddress[] parseAddressList() throws AddressException |
| { |
| // get the address as a set of tokens we can process. |
| final TokenStream tokens = tokenizeAddress(); |
| |
| // get an array list accumulator. |
| final ArrayList addressList = new ArrayList(); |
| |
| // we process sections of the token stream until we run out of tokens. |
| while (true) { |
| // parse off a single address. Address lists can have null elements, |
| // so this might return a null value. The null value does not get added |
| // to the address accumulator. |
| addressList.addAll(parseSingleAddress(tokens, false)); |
| // This token should be either a "," delimiter or a stream terminator. If we're |
| // at the end, time to get out. |
| final AddressToken token = tokens.nextToken(); |
| if (token.type == END_OF_TOKENS) { |
| break; |
| } |
| } |
| |
| return (InternetAddress [])addressList.toArray(new InternetAddress[0]); |
| } |
| |
| |
| /** |
| * Parse a single internet address. This must be a single address, |
| * not an address list. |
| * |
| * @exception AddressException |
| */ |
| public InternetAddress parseAddress() throws AddressException |
| { |
| // get the address as a set of tokens we can process. |
| final TokenStream tokens = tokenizeAddress(); |
| |
| // parse off a single address. Address lists can have null elements, |
| // so this might return a null value. The null value does not get added |
| // to the address accumulator. |
| final List addressList = parseSingleAddress(tokens, false); |
| // we must get exactly one address back from this. |
| if (addressList.isEmpty()) { |
| throw new AddressException("Null address", addresses, 0); |
| } |
| // this could be a simple list of blank delimited tokens. Ensure we only got one back. |
| if (addressList.size() > 1) { |
| throw new AddressException("Illegal Address", addresses, 0); |
| } |
| |
| // This token must be a stream stream terminator, or we have an error. |
| final AddressToken token = tokens.nextToken(); |
| if (token.type != END_OF_TOKENS) { |
| illegalAddress("Illegal Address", token); |
| } |
| |
| return (InternetAddress)addressList.get(0); |
| } |
| |
| |
| /** |
| * Validate an internet address. This must be a single address, |
| * not a list of addresses. The address also must not contain |
| * and personal information to be valid. |
| * |
| * @exception AddressException |
| */ |
| public void validateAddress() throws AddressException |
| { |
| // get the address as a set of tokens we can process. |
| final TokenStream tokens = tokenizeAddress(); |
| |
| // parse off a single address. Address lists can have null elements, |
| // so this might return a null value. The null value does not get added |
| // to the address accumulator. |
| final List addressList = parseSingleAddress(tokens, false); |
| if (addressList.isEmpty()) { |
| throw new AddressException("Null address", addresses, 0); |
| } |
| |
| // this could be a simple list of blank delimited tokens. Ensure we only got one back. |
| if (addressList.size() > 1) { |
| throw new AddressException("Illegal Address", addresses, 0); |
| } |
| |
| final InternetAddress address = (InternetAddress)addressList.get(0); |
| |
| // validation occurs on an address that's already been split into personal and address |
| // data. |
| if (address.personal != null) { |
| throw new AddressException("Illegal Address", addresses, 0); |
| } |
| // This token must be a stream stream terminator, or we have an error. |
| final AddressToken token = tokens.nextToken(); |
| if (token.type != END_OF_TOKENS) { |
| illegalAddress("Illegal Address", token); |
| } |
| } |
| |
| |
| /** |
| * Extract the set of address from a group Internet specification. |
| * |
| * @return An array containing all of the non-null addresses in the list. |
| * @exception AddressException |
| */ |
| public InternetAddress[] extractGroupList() throws AddressException |
| { |
| // get the address as a set of tokens we can process. |
| final TokenStream tokens = tokenizeAddress(); |
| |
| // get an array list accumulator. |
| final ArrayList addresses = new ArrayList(); |
| |
| AddressToken token = tokens.nextToken(); |
| |
| // scan forward to the ':' that starts the group list. If we don't find one, |
| // this is an exception. |
| while (token.type != COLON) { |
| if (token.type == END_OF_TOKENS) { |
| illegalAddress("Missing ':'", token); |
| } |
| token = tokens.nextToken(); |
| } |
| |
| // we process sections of the token stream until we run out of tokens. |
| while (true) { |
| // parse off a single address. Address lists can have null elements, |
| // so this might return a null value. The null value does not get added |
| // to the address accumulator. |
| addresses.addAll(parseSingleAddress(tokens, true)); |
| // This token should be either a "," delimiter or a group terminator. If we're |
| // at the end, this is an error. |
| token = tokens.nextToken(); |
| if (token.type == SEMICOLON) { |
| break; |
| } |
| else if (token.type == END_OF_TOKENS) { |
| illegalAddress("Missing ';'", token); |
| } |
| } |
| |
| return (InternetAddress [])addresses.toArray(new InternetAddress[0]); |
| } |
| |
| |
| /** |
| * Parse out a single address from a string from a string |
| * of address tokens, returning an InternetAddress object that |
| * represents the address. |
| * |
| * @param tokens The token source for this address. |
| * |
| * @return A parsed out and constructed InternetAddress object for |
| * the next address. Returns null if this is an "empty" |
| * address in a list. |
| * @exception AddressException |
| */ |
| private List parseSingleAddress(final TokenStream tokens, final boolean inGroup) throws AddressException |
| { |
| final List parsedAddresses = new ArrayList(); |
| |
| // index markers for personal information |
| AddressToken personalStart = null; |
| AddressToken personalEnd = null; |
| |
| // and similar bits for the address information. |
| AddressToken addressStart = null; |
| AddressToken addressEnd = null; |
| |
| // there is a fall-back set of rules allowed that will parse the address as a set of blank delimited |
| // tokens. However, we do NOT allow this if we encounter any tokens that fall outside of these |
| // rules. For example, comment fields and quoted strings will disallow the very lenient rule set. |
| boolean nonStrictRules = true; |
| |
| // we don't know the type of address yet |
| int addressType = UNKNOWN; |
| |
| // the parsing goes in two stages. Stage one runs through the tokens locating the bounds |
| // of the address we're working on, resolving the personal information, and also validating |
| // some of the larger scale syntax features of an address (matched delimiters for routes and |
| // groups, invalid nesting checks, etc.). |
| |
| // get the next token from the queue and save this. We're going to scan ahead a bit to |
| // figure out what type of address we're looking at, then reset to do the actually parsing |
| // once we've figured out a form. |
| final AddressToken first = tokens.nextToken(); |
| // push it back on before starting processing. |
| tokens.pushToken(first); |
| |
| // scan ahead for a trigger token that tells us what we've got. |
| while (addressType == UNKNOWN) { |
| |
| final AddressToken token = tokens.nextToken(); |
| switch (token.type) { |
| // skip these for now...after we've processed everything and found that this is a simple |
| // address form, then we'll check for a leading comment token in the first position and use |
| // if as personal information. |
| case COMMENT: |
| // comments do, however, denote that this must be parsed according to RFC822 rules. |
| nonStrictRules = false; |
| break; |
| |
| // a semi-colon when processing a group is an address terminator. we need to |
| // process this like a comma then |
| case SEMICOLON: |
| if (inGroup) { |
| // we need to push the terminator back on for the caller to see. |
| tokens.pushToken(token); |
| // if we've not tagged any tokens as being the address beginning, so this must be a |
| // null address. |
| if (addressStart == null) { |
| // just return the empty list from this. |
| return parsedAddresses; |
| } |
| // the end token is the back part. |
| addressEnd = tokens.previousToken(token); |
| // without a '<' for a route addr, we can't distinguish address tokens from personal data. |
| // We'll use a leading comment, if there is one. |
| personalStart = null; |
| // this is just a simple form. |
| addressType = SIMPLE_ADDR; |
| break; |
| } |
| |
| // NOTE: The above falls through if this is not a group. |
| |
| // any of these tokens are a real token that can be the start of an address. Many of |
| // them are not valid as first tokens in this context, but we flag them later if validation |
| // has been requested. For now, we just mark these as the potential address start. |
| case DOMAIN_LITERAL: |
| case QUOTED_LITERAL: |
| // this set of tokens require fuller RFC822 parsing, so turn off the flag. |
| nonStrictRules = false; |
| |
| case ATOM: |
| case AT_SIGN: |
| case PERIOD: |
| // if we're not determined the start of the address yet, then check to see if we |
| // need to consider this the personal start. |
| if (addressStart == null) { |
| if (personalStart == null) { |
| personalStart = token; |
| } |
| // This is the first real token of the address, which at this point can |
| // be either the personal info or the first token of the address. If we hit |
| // an address terminator without encountering either a route trigger or group |
| // trigger, then this is the real address. |
| addressStart = token; |
| } |
| break; |
| |
| // a LEFT_ANGLE indicates we have a full RFC822 mailbox form. The leading phrase |
| // is the personal info. The address is inside the brackets. |
| case LEFT_ANGLE: |
| // a route address automatically switches off the blank-delimited token mode. |
| nonStrictRules = false; |
| // this is a route address |
| addressType = ROUTE_ADDR; |
| // the address is placed in the InternetAddress object without the route |
| // brackets, so our start is one past this. |
| addressStart = tokens.nextRealToken(); |
| // push this back on the queue so the scanner picks it up properly. |
| tokens.pushToken(addressStart); |
| // make sure we flag the end of the personal section too. |
| if (personalStart != null) { |
| personalEnd = tokens.previousToken(token); |
| } |
| // scan the rest of a route address. |
| addressEnd = scanRouteAddress(tokens, false); |
| break; |
| |
| // a COLON indicates this is a group specifier...parse the group. |
| case COLON: |
| // Colons would not be valid in simple lists, so turn it off. |
| nonStrictRules = false; |
| // if we're scanning a group, we shouldn't encounter a ":". This is a |
| // recursion error if found. |
| if (inGroup) { |
| illegalAddress("Nested group element", token); |
| } |
| addressType = GROUP_ADDR; |
| // groups don't have any personal sections. |
| personalStart = null; |
| // our real start was back at the beginning |
| addressStart = first; |
| addressEnd = scanGroupAddress(tokens); |
| break; |
| |
| // a semi colon can the same as a comma if we're processing a group. |
| |
| |
| // reached the end of string...this might be a null address, or one of the very simple name |
| // forms used for non-strict RFC822 versions. Reset, and try that form |
| case END_OF_TOKENS: |
| // if we're scanning a group, we shouldn't encounter an end token. This is an |
| // error if found. |
| if (inGroup) { |
| illegalAddress("Missing ';'", token); |
| } |
| |
| // NOTE: fall through from above. |
| |
| // this is either a terminator for an address list or a a group terminator. |
| case COMMA: |
| // we need to push the terminator back on for the caller to see. |
| tokens.pushToken(token); |
| // if we've not tagged any tokens as being the address beginning, so this must be a |
| // null address. |
| if (addressStart == null) { |
| // just return the empty list from this. |
| return parsedAddresses; |
| } |
| // the end token is the back part. |
| addressEnd = tokens.previousToken(token); |
| // without a '<' for a route addr, we can't distinguish address tokens from personal data. |
| // We'll use a leading comment, if there is one. |
| personalStart = null; |
| // this is just a simple form. |
| addressType = SIMPLE_ADDR; |
| break; |
| |
| // right angle tokens are pushed, because parsing of the bracketing is not necessarily simple. |
| // we need to flag these here. |
| case RIGHT_ANGLE: |
| illegalAddress("Unexpected '>'", token); |
| |
| } |
| } |
| |
| String personal = null; |
| |
| // if we have personal data, then convert it to a string value. |
| if (personalStart != null) { |
| final TokenStream personalTokens = tokens.section(personalStart, personalEnd); |
| personal = personalToString(personalTokens); |
| } |
| // if we have a simple address, then check the first token to see if it's a comment. For simple addresses, |
| // we'll accept the first comment token as the personal information. |
| else { |
| if (addressType == SIMPLE_ADDR && first.type == COMMENT) { |
| personal = first.value; |
| } |
| } |
| |
| final TokenStream addressTokens = tokens.section(addressStart, addressEnd); |
| |
| // if this is one of the strictly RFC822 types, then we always validate the address. If this is a |
| // a simple address, then we only validate if strict parsing rules are in effect or we've been asked |
| // to validate. |
| if (validationLevel != PARSE_HEADER) { |
| switch (addressType) { |
| case GROUP_ADDR: |
| validateGroup(addressTokens); |
| break; |
| |
| case ROUTE_ADDR: |
| validateRouteAddr(addressTokens, false); |
| break; |
| |
| case SIMPLE_ADDR: |
| // this is a conditional validation |
| validateSimpleAddress(addressTokens); |
| break; |
| } |
| } |
| |
| // more complex addresses and addresses containing tokens other than just simple addresses |
| // need proper handling. |
| if (validationLevel != NONSTRICT || addressType != SIMPLE_ADDR || !nonStrictRules) { |
| // we might have traversed this already when we validated, so reset the |
| // position before using this again. |
| addressTokens.reset(); |
| final String address = addressToString(addressTokens); |
| |
| // get the parsed out sections as string values. |
| final InternetAddress result = new InternetAddress(); |
| result.setAddress(address); |
| try { |
| result.setPersonal(personal); |
| } catch (final UnsupportedEncodingException e) { |
| } |
| // even though we have a single address, we return this as an array. Simple addresses |
| // can be produce an array of items, so we need to return everything. |
| parsedAddresses.add(result); |
| return parsedAddresses; |
| } |
| else { |
| addressTokens.reset(); |
| |
| TokenStream nextAddress = addressTokens.getBlankDelimitedToken(); |
| while (nextAddress != null) { |
| final String address = addressToString(nextAddress); |
| // get the parsed out sections as string values. |
| final InternetAddress result = new InternetAddress(); |
| result.setAddress(address); |
| parsedAddresses.add(result); |
| nextAddress = addressTokens.getBlankDelimitedToken(); |
| } |
| return parsedAddresses; |
| } |
| } |
| |
| |
| /** |
| * Scan the token stream, parsing off a route addr spec. This |
| * will do some basic syntax validation, but will not actually |
| * validate any of the address information. Comments will be |
| * discarded. |
| * |
| * @param tokens The stream of tokens. |
| * |
| * @return The last token of the route address (the one preceeding the |
| * terminating '>'. |
| */ |
| private AddressToken scanRouteAddress(final TokenStream tokens, final boolean inGroup) throws AddressException { |
| // get the first token and ensure we have something between the "<" and ">". |
| AddressToken token = tokens.nextRealToken(); |
| // the last processed non-whitespace token, which is the actual address end once the |
| // right angle bracket is encountered. |
| |
| AddressToken previous = null; |
| |
| // if this route-addr has route information, the first token after the '<' must be a '@'. |
| // this determines if/where a colon or comma can appear. |
| boolean inRoute = token.type == AT_SIGN; |
| |
| // now scan until we reach the terminator. The only validation is done on illegal characters. |
| while (true) { |
| switch (token.type) { |
| // The following tokens are all valid between the brackets, so just skip over them. |
| case ATOM: |
| case QUOTED_LITERAL: |
| case DOMAIN_LITERAL: |
| case PERIOD: |
| case AT_SIGN: |
| break; |
| |
| case COLON: |
| // if not processing route information, this is illegal. |
| if (!inRoute) { |
| illegalAddress("Unexpected ':'", token); |
| } |
| // this is the end of the route information, the rules now change. |
| inRoute = false; |
| break; |
| |
| case COMMA: |
| // if not processing route information, this is illegal. |
| if (!inRoute) { |
| illegalAddress("Unexpected ','", token); |
| } |
| break; |
| |
| case RIGHT_ANGLE: |
| // if previous is null, we've had a route address which is "<>". That's illegal. |
| if (previous == null) { |
| illegalAddress("Illegal address", token); |
| } |
| // step to the next token..this had better be either a comma for another address or |
| // the very end of the address list . |
| token = tokens.nextRealToken(); |
| // if we're scanning part of a group, then the allowed terminators are either ',' or ';'. |
| if (inGroup) { |
| if (token.type != COMMA && token.type != SEMICOLON) { |
| illegalAddress("Illegal address", token); |
| } |
| } |
| // a normal address should have either a ',' for a list or the end. |
| else { |
| if (token.type != COMMA && token.type != END_OF_TOKENS) { |
| illegalAddress("Illegal address", token); |
| } |
| } |
| // we need to push the termination token back on. |
| tokens.pushToken(token); |
| // return the previous token as the updated position. |
| return previous; |
| |
| case END_OF_TOKENS: |
| illegalAddress("Missing '>'", token); |
| |
| // now for the illegal ones in this context. |
| case SEMICOLON: |
| illegalAddress("Unexpected ';'", token); |
| |
| case LEFT_ANGLE: |
| illegalAddress("Unexpected '<'", token); |
| } |
| // remember the previous token. |
| previous = token; |
| token = tokens.nextRealToken(); |
| } |
| } |
| |
| |
| /** |
| * Scan the token stream, parsing off a group address. This |
| * will do some basic syntax validation, but will not actually |
| * validate any of the address information. Comments will be |
| * ignored. |
| * |
| * @param tokens The stream of tokens. |
| * |
| * @return The last token of the group address (the terminating ':"). |
| */ |
| private AddressToken scanGroupAddress(final TokenStream tokens) throws AddressException { |
| // A group does not require that there be anything between the ':' and ';". This is |
| // just a group with an empty list. |
| AddressToken token = tokens.nextRealToken(); |
| |
| // now scan until we reach the terminator. The only validation is done on illegal characters. |
| while (true) { |
| switch (token.type) { |
| // The following tokens are all valid in group addresses, so just skip over them. |
| case ATOM: |
| case QUOTED_LITERAL: |
| case DOMAIN_LITERAL: |
| case PERIOD: |
| case AT_SIGN: |
| case COMMA: |
| break; |
| |
| case COLON: |
| illegalAddress("Nested group", token); |
| |
| // route address within a group specifier....we need to at least verify the bracket nesting |
| // and higher level syntax of the route. |
| case LEFT_ANGLE: |
| scanRouteAddress(tokens, true); |
| break; |
| |
| // the only allowed terminator is the ';' |
| case END_OF_TOKENS: |
| illegalAddress("Missing ';'", token); |
| |
| // now for the illegal ones in this context. |
| case SEMICOLON: |
| // verify there's nothing illegal after this. |
| final AddressToken next = tokens.nextRealToken(); |
| if (next.type != COMMA && next.type != END_OF_TOKENS) { |
| illegalAddress("Illegal address", token); |
| } |
| // don't forget to put this back on...our caller will need it. |
| tokens.pushToken(next); |
| return token; |
| |
| case RIGHT_ANGLE: |
| illegalAddress("Unexpected '>'", token); |
| } |
| token = tokens.nextRealToken(); |
| } |
| } |
| |
| |
| /** |
| * Parse the provided internet address into a set of tokens. This |
| * phase only does a syntax check on the tokens. The interpretation |
| * of the tokens is the next phase. |
| * |
| * @exception AddressException |
| */ |
| private TokenStream tokenizeAddress() throws AddressException { |
| |
| // get a list for the set of tokens |
| final TokenStream tokens = new TokenStream(); |
| |
| end = addresses.length(); // our parsing end marker |
| |
| // now scan along the string looking for the special characters in an internet address. |
| while (moreCharacters()) { |
| final char ch = currentChar(); |
| |
| switch (ch) { |
| // start of a comment bit...ignore everything until we hit a closing paren. |
| case '(': |
| scanComment(tokens); |
| break; |
| // a closing paren found outside of normal processing. |
| case ')': |
| syntaxError("Unexpected ')'", position); |
| |
| |
| // start of a quoted string |
| case '"': |
| scanQuotedLiteral(tokens); |
| break; |
| // domain literal |
| case '[': |
| scanDomainLiteral(tokens); |
| break; |
| |
| // a naked closing bracket...not valid except as part of a domain literal. |
| case ']': |
| syntaxError("Unexpected ']'", position); |
| |
| // special character delimiters |
| case '<': |
| tokens.addToken(new AddressToken(LEFT_ANGLE, position)); |
| nextChar(); |
| break; |
| |
| // a naked closing bracket...not valid without a starting one, but |
| // we need to handle this in context. |
| case '>': |
| tokens.addToken(new AddressToken(RIGHT_ANGLE, position)); |
| nextChar(); |
| break; |
| case ':': |
| tokens.addToken(new AddressToken(COLON, position)); |
| nextChar(); |
| break; |
| case ',': |
| tokens.addToken(new AddressToken(COMMA, position)); |
| nextChar(); |
| break; |
| case '.': |
| tokens.addToken(new AddressToken(PERIOD, position)); |
| nextChar(); |
| break; |
| case ';': |
| tokens.addToken(new AddressToken(SEMICOLON, position)); |
| nextChar(); |
| break; |
| case '@': |
| tokens.addToken(new AddressToken(AT_SIGN, position)); |
| nextChar(); |
| break; |
| |
| // white space characters. These are mostly token delimiters, but there are some relaxed |
| // situations where they get processed, so we need to add a white space token for the first |
| // one we encounter in a span. |
| case ' ': |
| case '\t': |
| case '\r': |
| case '\n': |
| // add a single white space token |
| tokens.addToken(new AddressToken(WHITESPACE, position)); |
| |
| nextChar(); |
| // step over any space characters, leaving us positioned either at the end |
| // or the first |
| while (moreCharacters()) { |
| final char nextChar = currentChar(); |
| if (nextChar == ' ' || nextChar == '\t' || nextChar == '\r' || nextChar == '\n') { |
| nextChar(); |
| } |
| else { |
| break; |
| } |
| } |
| break; |
| |
| // potentially an atom...if it starts with an allowed atom character, we |
| // parse out the token, otherwise this is invalid. |
| default: |
| if (ch < 040 || ch >= 0177) { |
| syntaxError("Illegal character in address", position); |
| } |
| |
| scanAtom(tokens); |
| break; |
| } |
| } |
| |
| // for this end marker, give an end position. |
| tokens.addToken(new AddressToken(END_OF_TOKENS, addresses.length())); |
| return tokens; |
| } |
| |
| |
| /** |
| * Step to the next character position while parsing. |
| */ |
| private void nextChar() { |
| position++; |
| } |
| |
| |
| /** |
| * Retrieve the character at the current parsing position. |
| * |
| * @return The current character. |
| */ |
| private char currentChar() { |
| return addresses.charAt(position); |
| } |
| |
| /** |
| * Test if there are more characters left to parse. |
| * |
| * @return True if we've hit the last character, false otherwise. |
| */ |
| private boolean moreCharacters() { |
| return position < end; |
| } |
| |
| |
| /** |
| * Parse a quoted string as specified by the RFC822 specification. |
| * |
| * @param tokens The TokenStream where the parsed out token is added. |
| */ |
| private void scanQuotedLiteral(final TokenStream tokens) throws AddressException { |
| final StringBuffer value = new StringBuffer(); |
| |
| // step over the quote delimiter. |
| nextChar(); |
| |
| while (moreCharacters()) { |
| final char ch = currentChar(); |
| |
| // is this an escape char? |
| if (ch == '\\') { |
| // step past this, and grab the following character |
| nextChar(); |
| if (!moreCharacters()) { |
| syntaxError("Missing '\"'", position); |
| } |
| value.append(currentChar()); |
| } |
| // end of the string? |
| else if (ch == '"') { |
| // return the constructed string. |
| tokens.addToken(new AddressToken(value.toString(), QUOTED_LITERAL, position)); |
| // step over the close delimiter for the benefit of the next token. |
| nextChar(); |
| return; |
| } |
| // the RFC822 spec disallows CR characters. |
| else if (ch == '\r') { |
| syntaxError("Illegal line end in literal", position); |
| } |
| else |
| { |
| value.append(ch); |
| } |
| nextChar(); |
| } |
| // missing delimiter |
| syntaxError("Missing '\"'", position); |
| } |
| |
| |
| /** |
| * Parse a domain literal as specified by the RFC822 specification. |
| * |
| * @param tokens The TokenStream where the parsed out token is added. |
| */ |
| private void scanDomainLiteral(final TokenStream tokens) throws AddressException { |
| final StringBuffer value = new StringBuffer(); |
| |
| final int startPosition = position; |
| // step over the quote delimiter. |
| nextChar(); |
| |
| while (moreCharacters()) { |
| final char ch = currentChar(); |
| |
| // is this an escape char? |
| if (ch == '\\') { |
| // because domain literals don't get extra escaping, we render them |
| // with the escaped characters intact. Therefore, append the '\' escape |
| // first, then append the escaped character without examination. |
| value.append(currentChar()); |
| // step past this, and grab the following character |
| nextChar(); |
| if (!moreCharacters()) { |
| syntaxError("Missing '\"'", position); |
| } |
| value.append(currentChar()); |
| } |
| // end of the string? |
| else if (ch == ']') { |
| // return the constructed string. |
| tokens.addToken(new AddressToken(value.toString(), DOMAIN_LITERAL, startPosition)); |
| // step over the close delimiter for the benefit of the next token. |
| nextChar(); |
| return; |
| } |
| // the RFC822 spec says no nesting |
| else if (ch == '[') { |
| syntaxError("Unexpected '['", position); |
| } |
| // carriage returns are similarly illegal. |
| else if (ch == '\r') { |
| syntaxError("Illegal line end in domain literal", position); |
| } |
| else |
| { |
| value.append(ch); |
| } |
| nextChar(); |
| } |
| // missing delimiter |
| syntaxError("Missing ']'", position); |
| } |
| |
| /** |
| * Scan an atom in an internet address, using the RFC822 rules |
| * for atom delimiters. |
| * |
| * @param tokens The TokenStream where the parsed out token is added. |
| */ |
| private void scanAtom(final TokenStream tokens) throws AddressException { |
| final int start = position; |
| nextChar(); |
| while (moreCharacters()) { |
| |
| final char ch = currentChar(); |
| if (isAtom(ch)) { |
| nextChar(); |
| } |
| else { |
| break; |
| } |
| } |
| |
| // return the scanned part of the string. |
| tokens.addToken(new AddressToken(addresses.substring(start, position), ATOM, start)); |
| } |
| |
| |
| /** |
| * Parse an internet address comment field as specified by |
| * RFC822. Includes support for quoted characters and nesting. |
| * |
| * @param tokens The TokenStream where the parsed out token is added. |
| */ |
| private void scanComment(final TokenStream tokens) throws AddressException { |
| final StringBuffer value = new StringBuffer(); |
| |
| final int startPosition = position; |
| // step past the start character |
| nextChar(); |
| |
| // we're at the top nesting level on the comment. |
| int nest = 1; |
| |
| // scan while we have more characters. |
| while (moreCharacters()) { |
| final char ch = currentChar(); |
| // escape character? |
| if (ch == '\\') { |
| // step over this...if escaped, we must have at least one more character |
| // in the string. |
| nextChar(); |
| if (!moreCharacters()) { |
| syntaxError("Missing ')'", position); |
| } |
| value.append(currentChar()); |
| } |
| // nested comment? |
| else if (ch == '(') { |
| // step the nesting level...we treat the comment as a single unit, with the delimiters |
| // for the nested comments embedded in the middle |
| nest++; |
| value.append(ch); |
| } |
| // is this the comment close? |
| else if (ch == ')') { |
| // reduce the nesting level. If we still have more to process, add the delimiter character |
| // and keep going. |
| nest--; |
| if (nest > 0) { |
| value.append(ch); |
| } |
| else { |
| // step past this and return. The outermost comment delimiter is not included in |
| // the string value, since this is frequently used as personal data on the |
| // InternetAddress objects. |
| nextChar(); |
| tokens.addToken(new AddressToken(value.toString(), COMMENT, startPosition)); |
| return; |
| } |
| } |
| else if (ch == '\r') { |
| syntaxError("Illegal line end in comment", position); |
| } |
| else { |
| value.append(ch); |
| } |
| // step to the next character. |
| nextChar(); |
| } |
| // ran out of data before seeing the closing bit, not good |
| syntaxError("Missing ')'", position); |
| } |
| |
| |
| /** |
| * Validate the syntax of an RFC822 group internet address specification. |
| * |
| * @param tokens The stream of tokens for the address. |
| * |
| * @exception AddressException |
| */ |
| private void validateGroup(final TokenStream tokens) throws AddressException { |
| // we know already this is an address in the form "phrase:group;". Now we need to validate the |
| // elements. |
| |
| int phraseCount = 0; |
| |
| AddressToken token = tokens.nextRealToken(); |
| // now scan to the semi color, ensuring we have only word or comment tokens. |
| while (token.type != COLON) { |
| // only these tokens are allowed here. |
| if (token.type != ATOM && token.type != QUOTED_LITERAL) { |
| invalidToken(token); |
| } |
| phraseCount++; |
| token = tokens.nextRealToken(); |
| } |
| |
| |
| // RFC822 groups require a leading phrase in group specifiers. |
| if (phraseCount == 0) { |
| illegalAddress("Missing group identifier phrase", token); |
| } |
| |
| // now we do the remainder of the parsing using the initial phrase list as the sink...the entire |
| // address will be converted to a string later. |
| |
| // ok, we only know this has been valid up to the ":", now we have some real checks to perform. |
| while (true) { |
| // go scan off a mailbox. if everything goes according to plan, we should be positioned at either |
| // a comma or a semicolon. |
| validateGroupMailbox(tokens); |
| |
| token = tokens.nextRealToken(); |
| |
| // we're at the end of the group. Make sure this is truely the end. |
| if (token.type == SEMICOLON) { |
| token = tokens.nextRealToken(); |
| if (token.type != END_OF_TOKENS) { |
| illegalAddress("Illegal group address", token); |
| } |
| return; |
| } |
| |
| // if not a semicolon, this better be a comma. |
| else if (token.type != COMMA) { |
| illegalAddress("Illegal group address", token); |
| } |
| } |
| } |
| |
| |
| /** |
| * Validate the syntax of single mailbox within a group address. |
| * |
| * @param tokens The stream of tokens representing the address. |
| * |
| * @exception AddressException |
| */ |
| private void validateGroupMailbox(final TokenStream tokens) throws AddressException { |
| final AddressToken first = tokens.nextRealToken(); |
| // is this just a null address in the list? then push the terminator back and return. |
| if (first.type == COMMA || first.type == SEMICOLON) { |
| tokens.pushToken(first); |
| return; |
| } |
| |
| // now we need to scan ahead to see if we can determine the type. |
| AddressToken token = first; |
| |
| |
| // we need to scan forward to figure out what sort of address this is. |
| while (first != null) { |
| switch (token.type) { |
| // until we know the context, these are all just ignored. |
| case QUOTED_LITERAL: |
| case ATOM: |
| break; |
| |
| // a LEFT_ANGLE indicates we have a full RFC822 mailbox form. The leading phrase |
| // is the personal info. The address is inside the brackets. |
| case LEFT_ANGLE: |
| tokens.pushToken(first); |
| validatePhrase(tokens, false); |
| validateRouteAddr(tokens, true); |
| return; |
| |
| // we've hit a period as the first non-word token. This should be part of a local-part |
| // of an address. |
| case PERIOD: |
| // we've hit an "@" as the first non-word token. This is probably a simple address in |
| // the form "user@domain". |
| case AT_SIGN: |
| tokens.pushToken(first); |
| validateAddressSpec(tokens); |
| return; |
| |
| // reached the end of string...this might be a null address, or one of the very simple name |
| // forms used for non-strict RFC822 versions. Reset, and try that form |
| case COMMA: |
| // this is the end of the group...handle it like a comma for now. |
| case SEMICOLON: |
| tokens.pushToken(first); |
| validateAddressSpec(tokens); |
| return; |
| |
| case END_OF_TOKENS: |
| illegalAddress("Missing ';'", token); |
| |
| } |
| token = tokens.nextRealToken(); |
| } |
| } |
| |
| |
| /** |
| * Utility method for throwing an AddressException caused by an |
| * unexpected primitive token. |
| * |
| * @param token The token causing the problem (must not be a value type token). |
| * |
| * @exception AddressException |
| */ |
| private void invalidToken(final AddressToken token) throws AddressException { |
| illegalAddress("Unexpected '" + token.type + "'", token); |
| } |
| |
| |
| /** |
| * Raise an error about illegal syntax. |
| * |
| * @param message The message used in the thrown exception. |
| * @param position The parsing position within the string. |
| * |
| * @exception AddressException |
| */ |
| private void syntaxError(final String message, final int position) throws AddressException |
| { |
| throw new AddressException(message, addresses, position); |
| } |
| |
| |
| /** |
| * Throw an exception based on the position of an invalid token. |
| * |
| * @param message The exception message. |
| * @param token The token causing the error. This tokens position is used |
| * in the exception information. |
| */ |
| private void illegalAddress(final String message, final AddressToken token) throws AddressException { |
| throw new AddressException(message, addresses, token.position); |
| } |
| |
| |
| /** |
| * Validate that a required phrase exists. |
| * |
| * @param tokens The set of tokens to validate. positioned at the phrase start. |
| * @param required A flag indicating whether the phrase is optional or required. |
| * |
| * @exception AddressException |
| */ |
| private void validatePhrase(final TokenStream tokens, final boolean required) throws AddressException { |
| // we need to have at least one WORD token in the phrase...everything is optional |
| // after that. |
| AddressToken token = tokens.nextRealToken(); |
| if (token.type != ATOM && token.type != QUOTED_LITERAL) { |
| if (required) { |
| illegalAddress("Missing group phrase", token); |
| } |
| } |
| |
| // now scan forward to the end of the phrase |
| token = tokens.nextRealToken(); |
| while (token.type == ATOM || token.type == QUOTED_LITERAL) { |
| token = tokens.nextRealToken(); |
| } |
| } |
| |
| |
| /** |
| * validate a routeaddr specification |
| * |
| * @param tokens The tokens representing the address portion (personal information |
| * already removed). |
| * @param ingroup true indicates we're validating a route address inside a |
| * group list. false indicates we're validating a standalone |
| * address. |
| * |
| * @exception AddressException |
| */ |
| private void validateRouteAddr(final TokenStream tokens, final boolean ingroup) throws AddressException { |
| // get the next real token. |
| AddressToken token = tokens.nextRealToken(); |
| // if this is an at sign, then we have a list of domains to parse. |
| if (token.type == AT_SIGN) { |
| // push the marker token back in for the route parser, and step past that part. |
| tokens.pushToken(token); |
| validateRoute(tokens); |
| } |
| else { |
| // we need to push this back on to validate the local part. |
| tokens.pushToken(token); |
| } |
| |
| // now we expect to see an address spec. |
| validateAddressSpec(tokens); |
| |
| token = tokens.nextRealToken(); |
| if (ingroup) { |
| // if we're validating within a group specification, the angle brackets are still there (and |
| // required). |
| if (token.type != RIGHT_ANGLE) { |
| illegalAddress("Missing '>'", token); |
| } |
| } |
| else { |
| // the angle brackets were removed to make this an address, so we should be done. Make sure we |
| // have a terminator here. |
| if (token.type != END_OF_TOKENS) { |
| illegalAddress("Illegal Address", token); |
| } |
| } |
| } |
| |
| |
| |
| /** |
| * Validate a simple address in the form "user@domain". |
| * |
| * @param tokens The stream of tokens representing the address. |
| */ |
| private void validateSimpleAddress(final TokenStream tokens) throws AddressException { |
| |
| // the validation routines occur after addresses have been split into |
| // personal and address forms. Therefore, our validation begins directly |
| // with the first token. |
| validateAddressSpec(tokens); |
| |
| // get the next token and see if there is something here...anything but the terminator is an error |
| final AddressToken token = tokens.nextRealToken(); |
| if (token.type != END_OF_TOKENS) { |
| illegalAddress("Illegal Address", token); |
| } |
| } |
| |
| /** |
| * Validate the addr-spec portion of an address. RFC822 requires |
| * this be of the form "local-part@domain". However, javamail also |
| * allows simple address of the form "local-part". We only require |
| * the domain if an '@' is encountered. |
| * |
| * @param tokens |
| */ |
| private void validateAddressSpec(final TokenStream tokens) throws AddressException { |
| // all addresses, even the simple ones, must have at least a local part. |
| validateLocalPart(tokens); |
| |
| // now see if we have a domain portion to look at. |
| final AddressToken token = tokens.nextRealToken(); |
| if (token.type == AT_SIGN) { |
| validateDomain(tokens); |
| } |
| else { |
| // put this back for termination |
| tokens.pushToken(token); |
| } |
| |
| } |
| |
| |
| /** |
| * Validate the route portion of a route-addr. This is a list |
| * of domain values in the form 1#("@" domain) ":". |
| * |
| * @param tokens The token stream holding the address information. |
| */ |
| private void validateRoute(final TokenStream tokens) throws AddressException { |
| while (true) { |
| final AddressToken token = tokens.nextRealToken(); |
| // if this is the first part of the list, go parse off a domain |
| if (token.type == AT_SIGN) { |
| validateDomain(tokens); |
| } |
| // another element in the list? Go around again |
| else if (token.type == COMMA) { |
| continue; |
| } |
| // the list is terminated by a colon...stop this part of the validation once we hit one. |
| else if (token.type == COLON) { |
| return; |
| } |
| // the list is terminated by a colon. If this isn't one of those, we have an error. |
| else { |
| illegalAddress("Missing ':'", token); |
| } |
| } |
| } |
| |
| |
| /** |
| * Parse the local part of an address spec. The local part |
| * is a series of "words" separated by ".". |
| */ |
| private void validateLocalPart(final TokenStream tokens) throws AddressException { |
| while (true) { |
| // get the token. |
| AddressToken token = tokens.nextRealToken(); |
| |
| // this must be either an atom or a literal. |
| if (token.type != ATOM && token.type != QUOTED_LITERAL) { |
| illegalAddress("Invalid local part", token); |
| } |
| |
| // get the next token (white space and comments ignored) |
| token = tokens.nextRealToken(); |
| // if this is a period, we continue parsing |
| if (token.type != PERIOD) { |
| tokens.pushToken(token); |
| // return the token |
| return; |
| } |
| } |
| } |
| |
| |
| |
| /** |
| * Parse a domain name of the form sub-domain *("." sub-domain). |
| * a sub-domain is either an atom or a domain-literal. |
| */ |
| private void validateDomain(final TokenStream tokens) throws AddressException { |
| while (true) { |
| // get the token. |
| AddressToken token = tokens.nextRealToken(); |
| |
| // this must be either an atom or a domain literal. |
| if (token.type != ATOM && token.type != DOMAIN_LITERAL) { |
| illegalAddress("Invalid domain", token); |
| } |
| |
| // get the next token (white space is ignored) |
| token = tokens.nextRealToken(); |
| // if this is a period, we continue parsing |
| if (token.type != PERIOD) { |
| // return the token |
| tokens.pushToken(token); |
| return; |
| } |
| } |
| } |
| |
| /** |
| * Convert a list of word tokens into a phrase string. The |
| * rules for this are a little hard to puzzle out, but there |
| * is a logic to it. If the list is empty, the phrase is |
| * just a null value. |
| * |
| * If we have a phrase, then the quoted strings need to |
| * handled appropriately. In multi-token phrases, the |
| * quoted literals are concatenated with the quotes intact, |
| * regardless of content. Thus a phrase that comes in like this: |
| * |
| * "Geronimo" Apache |
| * |
| * gets converted back to the same string. |
| * |
| * If there is just a single token in the phrase, AND the token |
| * is a quoted string AND the string does not contain embedded |
| * special characters ("\.,@<>()[]:;), then the phrase |
| * is expressed as an atom. Thus the literal |
| * |
| * "Geronimo" |
| * |
| * becomes |
| * |
| * Geronimo |
| * |
| * but |
| * |
| * "(Geronimo)" |
| * |
| * remains |
| * |
| * "(Geronimo)" |
| * |
| * Note that we're generating a canonical form of the phrase, |
| * which removes comments and reduces linear whitespace down |
| * to a single separator token. |
| * |
| * @param phrase An array list of phrase tokens (which may be empty). |
| */ |
| private String personalToString(final TokenStream tokens) { |
| |
| // no tokens in the stream? This is a null value. |
| AddressToken token = tokens.nextToken(); |
| |
| if (token.type == END_OF_TOKENS) { |
| return null; |
| } |
| |
| final AddressToken next = tokens.nextToken(); |
| |
| // single element phrases get special treatment. |
| if (next.type == END_OF_TOKENS) { |
| // this can be used directly...if it contains special characters, quoting will be |
| // performed when it's converted to a string value. |
| return token.value; |
| } |
| |
| // reset to the beginning |
| tokens.pushToken(token); |
| |
| // have at least two tokens, |
| final StringBuffer buffer = new StringBuffer(); |
| |
| // get the first token. After the first, we add these as blank delimited values. |
| token = tokens.nextToken(); |
| addTokenValue(token, buffer); |
| |
| token = tokens.nextToken(); |
| while (token.type != END_OF_TOKENS) { |
| // add a blank separator |
| buffer.append(' '); |
| // now add the next tokens value |
| addTokenValue(token, buffer); |
| token = tokens.nextToken(); |
| } |
| // and return the canonicalized value |
| return buffer.toString(); |
| } |
| |
| |
| /** |
| * take a canonicalized set of address tokens and reformat it back into a string value, |
| * inserting whitespace where appropriate. |
| * |
| * @param tokens The set of tokens representing the address. |
| * |
| * @return The string value of the tokens. |
| */ |
| private String addressToString(final TokenStream tokens) { |
| final StringBuffer buffer = new StringBuffer(); |
| |
| // this flag controls whether we insert a blank delimiter between tokens as |
| // we advance through the list. Blanks are only inserted between consequtive value tokens. |
| // Initially, this is false, then we flip it to true whenever we add a value token, and |
| // back to false for any special character token. |
| boolean spaceRequired = false; |
| |
| // we use nextToken rather than nextRealToken(), since we need to process the comments also. |
| AddressToken token = tokens.nextToken(); |
| |
| // now add each of the tokens |
| while (token.type != END_OF_TOKENS) { |
| switch (token.type) { |
| // the word tokens are the only ones where we need to worry about adding |
| // whitespace delimiters. |
| case ATOM: |
| case QUOTED_LITERAL: |
| // was the last token also a word? Insert a blank first. |
| if (spaceRequired) { |
| buffer.append(' '); |
| } |
| addTokenValue(token, buffer); |
| // let the next iteration know we just added a word to the list. |
| spaceRequired = true; |
| break; |
| |
| // these special characters are just added in. The constants for the character types |
| // were carefully selected to be the character value in question. This allows us to |
| // just append the value. |
| case LEFT_ANGLE: |
| case RIGHT_ANGLE: |
| case COMMA: |
| case COLON: |
| case AT_SIGN: |
| case SEMICOLON: |
| case PERIOD: |
| buffer.append((char)token.type); |
| // no spaces around specials |
| spaceRequired = false; |
| break; |
| |
| // Domain literals self delimiting...we can just append them and turn off the space flag. |
| case DOMAIN_LITERAL: |
| addTokenValue(token, buffer); |
| spaceRequired = false; |
| break; |
| |
| // Comments are also self delimitin. |
| case COMMENT: |
| addTokenValue(token, buffer); |
| spaceRequired = false; |
| break; |
| } |
| token = tokens.nextToken(); |
| } |
| return buffer.toString(); |
| } |
| |
| |
| /** |
| * Append a value token on to a string buffer used to create |
| * the canonicalized string value. |
| * |
| * @param token The token we're adding. |
| * @param buffer The target string buffer. |
| */ |
| private void addTokenValue(final AddressToken token, final StringBuffer buffer) { |
| // atom values can be added directly. |
| if (token.type == ATOM) { |
| buffer.append(token.value); |
| } |
| // a literal value? Add this as a quoted string |
| else if (token.type == QUOTED_LITERAL) { |
| buffer.append(formatQuotedString(token.value)); |
| } |
| // could be a domain literal of the form "[value]" |
| else if (token.type == DOMAIN_LITERAL) { |
| buffer.append('['); |
| buffer.append(token.value); |
| buffer.append(']'); |
| } |
| // comments also have values |
| else if (token.type == COMMENT) { |
| buffer.append('('); |
| buffer.append(token.value); |
| buffer.append(')'); |
| } |
| } |
| |
| |
| |
| private static final byte[] CHARMAP = { |
| 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x06, 0x02, 0x06, 0x02, 0x02, 0x06, 0x02, 0x02, |
| 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, |
| 0x04, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x00, 0x01, 0x00, |
| 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x00, 0x01, 0x00, |
| |
| 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
| 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x00, 0x00, |
| 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
| 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, |
| }; |
| |
| private static final byte FLG_SPECIAL = 1; |
| private static final byte FLG_CONTROL = 2; |
| |
| /** |
| * Quick test to see if a character is an allowed atom character |
| * or not. |
| * |
| * @param ch The test character. |
| * |
| * @return true if this character is allowed in atoms, false for any |
| * control characters, special characters, or blanks. |
| */ |
| public static boolean isAtom(final char ch) { |
| if (ch > '\u007f') { |
| return false; |
| } |
| else if (ch == ' ') { |
| return false; |
| } |
| else { |
| return (CHARMAP[ch] & (FLG_SPECIAL | FLG_CONTROL)) == 0; |
| } |
| } |
| |
| /** |
| * Tests one string to determine if it contains any of the |
| * characters in a supplied test string. |
| * |
| * @param s The string we're testing. |
| * @param chars The set of characters we're testing against. |
| * |
| * @return true if any of the characters is found, false otherwise. |
| */ |
| public static boolean containsCharacters(final String s, final String chars) |
| { |
| for (int i = 0; i < s.length(); i++) { |
| if (chars.indexOf(s.charAt(i)) >= 0) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| |
| /** |
| * Tests if a string contains any non-special characters that |
| * would require encoding the value as a quoted string rather |
| * than a simple atom value. |
| * |
| * @param s The test string. |
| * |
| * @return True if the string contains only blanks or allowed atom |
| * characters. |
| */ |
| public static boolean containsSpecials(final String s) |
| { |
| for (int i = 0; i < s.length(); i++) { |
| final char ch = s.charAt(i); |
| // must be either a blank or an allowed atom char. |
| if (ch == ' ' || isAtom(ch)) { |
| continue; |
| } |
| else { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| |
| /** |
| * Tests if a string contains any non-special characters that |
| * would require encoding the value as a quoted string rather |
| * than a simple atom value. |
| * |
| * @param s The test string. |
| * |
| * @return True if the string contains only blanks or allowed atom |
| * characters. |
| */ |
| public static boolean isAtom(final String s) |
| { |
| for (int i = 0; i < s.length(); i++) { |
| final char ch = s.charAt(i); |
| // must be an allowed atom character |
| if (!isAtom(ch)) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| /** |
| * Apply RFC822 quoting rules to a literal string value. This |
| * will search the string to see if there are any characters that |
| * require special escaping, and apply the escapes. If the |
| * string is just a string of blank-delimited atoms, the string |
| * value is returned without quotes. |
| * |
| * @param s The source string. |
| * |
| * @return A version of the string as a valid RFC822 quoted literal. |
| */ |
| public static String quoteString(final String s) { |
| |
| // only backslash and double quote require escaping. If the string does not |
| // contain any of these, then we can just slap on some quotes and go. |
| if (s.indexOf('\\') == -1 && s.indexOf('"') == -1) { |
| // if the string is an atom (or a series of blank-delimited atoms), we can just return it directly. |
| if (!containsSpecials(s)) { |
| return s; |
| } |
| final StringBuffer buffer = new StringBuffer(s.length() + 2); |
| buffer.append('"'); |
| buffer.append(s); |
| buffer.append('"'); |
| return buffer.toString(); |
| } |
| |
| // get a buffer sufficiently large for the string, two quote characters, and a "reasonable" |
| // number of escaped values. |
| final StringBuffer buffer = new StringBuffer(s.length() + 10); |
| buffer.append('"'); |
| |
| // now check all of the characters. |
| for (int i = 0; i < s.length(); i++) { |
| final char ch = s.charAt(i); |
| // character requiring escaping? |
| if (ch == '\\' || ch == '"') { |
| // add an extra backslash |
| buffer.append('\\'); |
| } |
| // and add on the character |
| buffer.append(ch); |
| } |
| buffer.append('"'); |
| return buffer.toString(); |
| } |
| |
| /** |
| * Apply RFC822 quoting rules to a literal string value. This |
| * will search the string to see if there are any characters that |
| * require special escaping, and apply the escapes. The returned |
| * value is enclosed in quotes. |
| * |
| * @param s The source string. |
| * |
| * @return A version of the string as a valid RFC822 quoted literal. |
| */ |
| public static String formatQuotedString(final String s) { |
| // only backslash and double quote require escaping. If the string does not |
| // contain any of these, then we can just slap on some quotes and go. |
| if (s.indexOf('\\') == -1 && s.indexOf('"') == -1) { |
| final StringBuffer buffer = new StringBuffer(s.length() + 2); |
| buffer.append('"'); |
| buffer.append(s); |
| buffer.append('"'); |
| return buffer.toString(); |
| } |
| |
| // get a buffer sufficiently large for the string, two quote characters, and a "reasonable" |
| // number of escaped values. |
| final StringBuffer buffer = new StringBuffer(s.length() + 10); |
| buffer.append('"'); |
| |
| // now check all of the characters. |
| for (int i = 0; i < s.length(); i++) { |
| final char ch = s.charAt(i); |
| // character requiring escaping? |
| if (ch == '\\' || ch == '"') { |
| // add an extra backslash |
| buffer.append('\\'); |
| } |
| // and add on the character |
| buffer.append(ch); |
| } |
| buffer.append('"'); |
| return buffer.toString(); |
| } |
| |
| public class TokenStream { |
| // the set of tokens in the parsed address list, as determined by RFC822 syntax rules. |
| private final List tokens; |
| |
| // the current token position |
| int currentToken = 0; |
| |
| |
| /** |
| * Default constructor for a TokenStream. This creates an |
| * empty TokenStream for purposes of tokenizing an address. |
| * It is the creator's responsibility to terminate the stream |
| * with a terminator token. |
| */ |
| public TokenStream() { |
| tokens = new ArrayList(); |
| } |
| |
| |
| /** |
| * Construct a TokenStream from a list of tokens. A terminator |
| * token is added to the end. |
| * |
| * @param tokens An existing token list. |
| */ |
| public TokenStream(final List tokens) { |
| this.tokens = tokens; |
| tokens.add(new AddressToken(END_OF_TOKENS, -1)); |
| } |
| |
| /** |
| * Add an address token to the token list. |
| * |
| * @param t The new token to add to the list. |
| */ |
| public void addToken(final AddressToken token) { |
| tokens.add(token); |
| } |
| |
| /** |
| * Get the next token at the cursor position, advancing the |
| * position accordingly. |
| * |
| * @return The token at the current token position. |
| */ |
| public AddressToken nextToken() { |
| AddressToken token = (AddressToken)tokens.get(currentToken++); |
| // we skip over white space tokens when operating in this mode, so |
| // check the token and iterate until we get a non-white space. |
| while (token.type == WHITESPACE) { |
| token = (AddressToken)tokens.get(currentToken++); |
| } |
| return token; |
| } |
| |
| |
| /** |
| * Get the next token at the cursor position, without advancing the |
| * position. |
| * |
| * @return The token at the current token position. |
| */ |
| public AddressToken currentToken() { |
| // return the current token and step the cursor |
| return (AddressToken)tokens.get(currentToken); |
| } |
| |
| |
| /** |
| * Get the next non-comment token from the string. Comments are ignored, except as personal information |
| * for very simple address specifications. |
| * |
| * @return A token guaranteed not to be a whitespace token. |
| */ |
| public AddressToken nextRealToken() |
| { |
| AddressToken token = nextToken(); |
| if (token.type == COMMENT) { |
| token = nextToken(); |
| } |
| return token; |
| } |
| |
| /** |
| * Push a token back on to the queue, making the index of this |
| * token the current cursor position. |
| * |
| * @param token The token to push. |
| */ |
| public void pushToken(final AddressToken token) { |
| // just reset the cursor to the token's index position. |
| currentToken = tokenIndex(token); |
| } |
| |
| /** |
| * Get the next token after a given token, without advancing the |
| * token position. |
| * |
| * @param token The token we're retrieving a token relative to. |
| * |
| * @return The next token in the list. |
| */ |
| public AddressToken nextToken(final AddressToken token) { |
| return (AddressToken)tokens.get(tokenIndex(token) + 1); |
| } |
| |
| |
| /** |
| * Return the token prior to a given token. |
| * |
| * @param token The token used for the index. |
| * |
| * @return The token prior to the index token in the list. |
| */ |
| public AddressToken previousToken(final AddressToken token) { |
| return (AddressToken)tokens.get(tokenIndex(token) - 1); |
| } |
| |
| |
| /** |
| * Retrieve a token at a given index position. |
| * |
| * @param index The target index. |
| */ |
| public AddressToken getToken(final int index) |
| { |
| return (AddressToken)tokens.get(index); |
| } |
| |
| |
| /** |
| * Retrieve the index of a particular token in the stream. |
| * |
| * @param token The target token. |
| * |
| * @return The index of the token within the stream. Returns -1 if this |
| * token is somehow not in the stream. |
| */ |
| public int tokenIndex(final AddressToken token) { |
| return tokens.indexOf(token); |
| } |
| |
| |
| /** |
| * Extract a new TokenStream running from the start token to the |
| * token preceeding the end token. |
| * |
| * @param start The starting token of the section. |
| * @param end The last token (+1) for the target section. |
| * |
| * @return A new TokenStream object for processing this section of tokens. |
| */ |
| public TokenStream section(final AddressToken start, final AddressToken end) { |
| final int startIndex = tokenIndex(start); |
| final int endIndex = tokenIndex(end); |
| |
| // List.subList() returns a list backed by the original list. Since we need to add a |
| // terminator token to this list when we take the sublist, we need to manually copy the |
| // references so we don't end up munging the original list. |
| final ArrayList list = new ArrayList(endIndex - startIndex + 2); |
| |
| for (int i = startIndex; i <= endIndex; i++) { |
| list.add(tokens.get(i)); |
| } |
| return new TokenStream(list); |
| } |
| |
| |
| /** |
| * Reset the token position back to the beginning of the |
| * stream. |
| */ |
| public void reset() { |
| currentToken = 0; |
| } |
| |
| /** |
| * Scan forward looking for a non-blank token. |
| * |
| * @return The first non-blank token in the stream. |
| */ |
| public AddressToken getNonBlank() |
| { |
| AddressToken token = currentToken(); |
| while (token.type == WHITESPACE) { |
| currentToken++; |
| token = currentToken(); |
| } |
| return token; |
| } |
| |
| |
| /** |
| * Extract a blank delimited token from a TokenStream. A blank |
| * delimited token is the set of tokens up to the next real whitespace |
| * token (comments not included). |
| * |
| * @return A TokenStream object with the new set of tokens. |
| */ |
| public TokenStream getBlankDelimitedToken() |
| { |
| // get the next non-whitespace token. |
| final AddressToken first = getNonBlank(); |
| // if this is the end, we return null. |
| if (first.type == END_OF_TOKENS) { |
| return null; |
| } |
| |
| AddressToken last = first; |
| |
| // the methods for retrieving tokens skip over whitespace, so we're going to process this |
| // by index. |
| currentToken++; |
| |
| AddressToken token = currentToken(); |
| while (true) { |
| // if this is our marker, then pluck out the section and return it. |
| if (token.type == END_OF_TOKENS || token.type == WHITESPACE) { |
| return section(first, last); |
| } |
| last = token; |
| currentToken++; |
| // we accept any and all tokens here. |
| token = currentToken(); |
| } |
| } |
| |
| /** |
| * Return the index of the current cursor position. |
| * |
| * @return The integer index of the current token. |
| */ |
| public int currentIndex() { |
| return currentToken; |
| } |
| |
| public void dumpTokens() |
| { |
| System.out.println(">>>>>>>>> Start dumping TokenStream tokens"); |
| for (int i = 0; i < tokens.size(); i++) { |
| System.out.println("-------- Token: " + tokens.get(i)); |
| } |
| |
| System.out.println("++++++++ cursor position=" + currentToken); |
| System.out.println(">>>>>>>>> End dumping TokenStream tokens"); |
| } |
| } |
| |
| |
| /** |
| * Simple utility class for representing address tokens. |
| */ |
| public class AddressToken { |
| |
| // the token type |
| int type; |
| |
| // string value of the token (can be null) |
| String value; |
| |
| // position of the token within the address string. |
| int position; |
| |
| AddressToken(final int type, final int position) |
| { |
| this.type = type; |
| this.value = null; |
| this.position = position; |
| } |
| |
| AddressToken(final String value, final int type, final int position) |
| { |
| this.type = type; |
| this.value = value; |
| this.position = position; |
| } |
| |
| @Override |
| public String toString() |
| { |
| if (type == END_OF_TOKENS) { |
| return "AddressToken: type=END_OF_TOKENS"; |
| } |
| if (value == null) { |
| return "AddressToken: type=" + (char)type; |
| } |
| else { |
| return "AddressToken: type=" + (char)type + " value=" + value; |
| } |
| } |
| } |
| } |
| |