| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.commons.lang2.text; |
| |
| import java.util.ArrayList; |
| import java.util.Collections; |
| import java.util.List; |
| import java.util.ListIterator; |
| import java.util.NoSuchElementException; |
| |
| /** |
| * Tokenizes a string based on delimiters (separators), |
| * supporting quoting and ignored-character concepts. |
| * <p> |
| * This class can split a String into many smaller strings. It aims |
| * to do a similar job to {@link java.util.StringTokenizer StringTokenizer}, |
| * but it offers much more control and flexibility, including implementing |
| * the <code>ListIterator</code> interface. By default, it is set up |
| * like <code>StringTokenizer</code>. |
| * <p> |
| * The input String is split into a number of <i>tokens</i>. |
| * Each token is separated from the next by a <i>delimiter</i>. |
| * One or more delimiter characters must be specified. |
| * <p> |
| * Each token may be surrounded by quotes. |
| * The <i>quote</i> matcher specifies the quote character(s). |
| * A quote may be escaped within a quoted section by duplicating itself. |
| * <p> |
| * Between each token and the delimiter there may be characters that need trimming. |
| * The <i>trimmer</i> matcher specifies these characters. |
| * One usage might be to trim whitespace characters. |
| * <p> |
| * At any point outside the quotes there may be invalid characters. |
| * The <i>ignored</i> matcher specifies these characters to be removed. |
| * One usage might be to remove new line characters. |
| * <p> |
| * Empty tokens may be removed or returned as null. |
| * <pre> |
| * "a,b,c" - Three tokens "a","b","c" (comma delimiter) |
| * " a, b , c " - Three tokens "a","b","c" (default CSV processing trims whitespace) |
| * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched) |
| * </pre> |
| * <p> |
| * |
| * This tokenizer has the following properties and options: |
| * |
| * <table> |
| * <tr> |
| * <th>Property</th><th>Type</th><th>Default</th> |
| * </tr> |
| * <tr> |
| * <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td> |
| * </tr> |
| * <tr> |
| * <td>quote</td><td>NoneMatcher</td><td>{}</td> |
| * </tr> |
| * <tr> |
| * <td>ignore</td><td>NoneMatcher</td><td>{}</td> |
| * </tr> |
| * <tr> |
| * <td>emptyTokenAsNull</td><td>boolean</td><td>false</td> |
| * </tr> |
| * <tr> |
| * <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td> |
| * </tr> |
| * </table> |
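| * |
| * <p> |
| * A minimal usage sketch, relying only on the defaults listed above |
| * (the expected tokens are shown as comments): |
| * <pre> |
| * StrTokenizer tok = new StrTokenizer("one two three"); |
| * while (tok.hasNext()) { |
| * System.out.println(tok.next()); // "one", then "two", then "three" |
| * } |
| * </pre> |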
| * |
| * @author Matthew Inger |
| * @author Stephen Colebourne |
| * @author Gary D. Gregory |
| * @since 2.2 |
| * @version $Id$ |
| */ |
| public class StrTokenizer implements ListIterator, Cloneable { |
| |
| private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE; |
| private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE; |
| static { |
| CSV_TOKENIZER_PROTOTYPE = new StrTokenizer(); |
| CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher()); |
| CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher()); |
| CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher()); |
| CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher()); |
| CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false); |
| CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false); |
| |
| TSV_TOKENIZER_PROTOTYPE = new StrTokenizer(); |
| TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher()); |
| TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher()); |
| TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher()); |
| TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher()); |
| TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false); |
| TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false); |
| } |
| |
| /** The text to work on. */ |
| private char[] chars; |
| /** The parsed tokens */ |
| private String[] tokens; |
| /** The current iteration position */ |
| private int tokenPos; |
| |
| /** The delimiter matcher */ |
| private StrMatcher delimMatcher = StrMatcher.splitMatcher(); |
| /** The quote matcher */ |
| private StrMatcher quoteMatcher = StrMatcher.noneMatcher(); |
| /** The ignored matcher */ |
| private StrMatcher ignoredMatcher = StrMatcher.noneMatcher(); |
| /** The trimmer matcher */ |
| private StrMatcher trimmerMatcher = StrMatcher.noneMatcher(); |
| |
| /** Whether to return empty tokens as null */ |
| private boolean emptyAsNull = false; |
| /** Whether to ignore empty tokens */ |
| private boolean ignoreEmptyTokens = true; |
| |
| //----------------------------------------------------------------------- |
| |
| /** |
| * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>. |
| * |
| * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>. |
| */ |
| private static StrTokenizer getCSVClone() { |
| return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone(); |
| } |
| |
| /** |
| * Gets a new tokenizer instance which parses Comma Separated Value strings. |
| * The default for CSV processing is to trim whitespace from both ends |
| * (which can be overridden with the setTrimmer method). |
| * <p> |
| * You must call a "reset" method to set the string which you want to parse. |
| * @return a new tokenizer instance which parses Comma Separated Value strings |
| */ |
| public static StrTokenizer getCSVInstance() { |
| return getCSVClone(); |
| } |
| |
| /** |
| * Gets a new tokenizer instance which parses Comma Separated Value strings, |
| * initializing it with the given input. The default for CSV processing |
| * is to trim whitespace from both ends (which can be overridden with |
| * the setTrimmer method). |
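| * <p> |
| * For example, a sketch assuming the default CSV settings described above: |
| * <pre> |
| * StrTokenizer tok = StrTokenizer.getCSVInstance(" a, \"b,c\" , d "); |
| * String[] tokens = tok.getTokenArray(); // {"a", "b,c", "d"} |
| * </pre> |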
| * |
| * @param input the text to parse |
| * @return a new tokenizer instance which parses Comma Separated Value strings |
| */ |
| public static StrTokenizer getCSVInstance(String input) { |
| StrTokenizer tok = getCSVClone(); |
| tok.reset(input); |
| return tok; |
| } |
| |
| /** |
| * Gets a new tokenizer instance which parses Comma Separated Value strings, |
| * initializing it with the given input. The default for CSV processing |
| * is to trim whitespace from both ends (which can be overridden with |
| * the setTrimmer method). |
| * |
| * @param input the text to parse |
| * @return a new tokenizer instance which parses Comma Separated Value strings |
| */ |
| public static StrTokenizer getCSVInstance(char[] input) { |
| StrTokenizer tok = getCSVClone(); |
| tok.reset(input); |
| return tok; |
| } |
| |
| /** |
| * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>. |
| * |
| * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>. |
| */ |
| private static StrTokenizer getTSVClone() { |
| return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone(); |
| } |
| |
| |
| /** |
| * Gets a new tokenizer instance which parses Tab Separated Value strings. |
| * The default for TSV processing is to trim whitespace from both ends |
| * (which can be overridden with the setTrimmer method). |
| * <p> |
| * You must call a "reset" method to set the string which you want to parse. |
| * @return a new tokenizer instance which parses Tab Separated Value strings. |
| */ |
| public static StrTokenizer getTSVInstance() { |
| return getTSVClone(); |
| } |
| |
| /** |
| * Gets a new tokenizer instance which parses Tab Separated Value strings. |
| * The default for TSV processing is to trim whitespace from both ends |
| * (which can be overridden with the setTrimmer method). |
| * @param input the string to parse |
| * @return a new tokenizer instance which parses Tab Separated Value strings. |
| */ |
| public static StrTokenizer getTSVInstance(String input) { |
| StrTokenizer tok = getTSVClone(); |
| tok.reset(input); |
| return tok; |
| } |
| |
| /** |
| * Gets a new tokenizer instance which parses Tab Separated Value strings. |
| * The default for TSV processing is to trim whitespace from both ends |
| * (which can be overridden with the setTrimmer method). |
| * @param input the string to parse |
| * @return a new tokenizer instance which parses Tab Separated Value strings. |
| */ |
| public static StrTokenizer getTSVInstance(char[] input) { |
| StrTokenizer tok = getTSVClone(); |
| tok.reset(input); |
| return tok; |
| } |
| |
| //----------------------------------------------------------------------- |
| /** |
| * Constructs a tokenizer splitting on space, tab, newline and formfeed |
| * as per StringTokenizer, but with no text to tokenize. |
| * <p> |
| * This constructor is normally used with {@link #reset(String)}. |
| */ |
| public StrTokenizer() { |
| super(); |
| this.chars = null; |
| } |
| |
| /** |
| * Constructs a tokenizer splitting on space, tab, newline and formfeed |
| * as per StringTokenizer. |
| * |
| * @param input the string which is to be parsed |
| */ |
| public StrTokenizer(String input) { |
| super(); |
| if (input != null) { |
| chars = input.toCharArray(); |
| } else { |
| chars = null; |
| } |
| } |
| |
| /** |
| * Constructs a tokenizer splitting on the specified delimiter character. |
| * |
| * @param input the string which is to be parsed |
| * @param delim the field delimiter character |
| */ |
| public StrTokenizer(String input, char delim) { |
| this(input); |
| setDelimiterChar(delim); |
| } |
| |
| /** |
| * Constructs a tokenizer splitting on the specified delimiter string. |
| * |
| * @param input the string which is to be parsed |
| * @param delim the field delimiter string |
| */ |
| public StrTokenizer(String input, String delim) { |
| this(input); |
| setDelimiterString(delim); |
| } |
| |
| /** |
| * Constructs a tokenizer splitting using the specified delimiter matcher. |
| * |
| * @param input the string which is to be parsed |
| * @param delim the field delimiter matcher |
| */ |
| public StrTokenizer(String input, StrMatcher delim) { |
| this(input); |
| setDelimiterMatcher(delim); |
| } |
| |
| /** |
| * Constructs a tokenizer splitting on the specified delimiter character |
| * and handling quotes using the specified quote character. |
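| * <p> |
| * For example, a sketch showing a quoted section containing the delimiter |
| * and an escaped (doubled) quote: |
| * <pre> |
| * StrTokenizer tok = new StrTokenizer("a;'b;''c'", ';', '\''); |
| * String[] tokens = tok.getTokenArray(); // {"a", "b;'c"} |
| * </pre> |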
| * |
| * @param input the string which is to be parsed |
| * @param delim the field delimiter character |
| * @param quote the field quoted string character |
| */ |
| public StrTokenizer(String input, char delim, char quote) { |
| this(input, delim); |
| setQuoteChar(quote); |
| } |
| |
| /** |
| * Constructs a tokenizer splitting using the specified delimiter matcher |
| * and handling quotes using the specified quote matcher. |
| * |
| * @param input the string which is to be parsed |
| * @param delim the field delimiter matcher |
| * @param quote the field quoted string matcher |
| */ |
| public StrTokenizer(String input, StrMatcher delim, StrMatcher quote) { |
| this(input, delim); |
| setQuoteMatcher(quote); |
| } |
| |
| /** |
| * Constructs a tokenizer splitting on space, tab, newline and formfeed |
| * as per StringTokenizer. |
| * <p> |
| * The input character array is not cloned, and must not be altered after |
| * passing in to this method. |
| * |
| * @param input the string which is to be parsed, not cloned |
| */ |
| public StrTokenizer(char[] input) { |
| super(); |
| this.chars = input; |
| } |
| |
| /** |
| * Constructs a tokenizer splitting on the specified character. |
| * <p> |
| * The input character array is not cloned, and must not be altered after |
| * passing in to this method. |
| * |
| * @param input the string which is to be parsed, not cloned |
| * @param delim the field delimiter character |
| */ |
| public StrTokenizer(char[] input, char delim) { |
| this(input); |
| setDelimiterChar(delim); |
| } |
| |
| /** |
| * Constructs a tokenizer splitting on the specified string. |
| * <p> |
| * The input character array is not cloned, and must not be altered after |
| * passing in to this method. |
| * |
| * @param input the string which is to be parsed, not cloned |
| * @param delim the field delimiter string |
| */ |
| public StrTokenizer(char[] input, String delim) { |
| this(input); |
| setDelimiterString(delim); |
| } |
| |
| /** |
| * Constructs a tokenizer splitting using the specified delimiter matcher. |
| * <p> |
| * The input character array is not cloned, and must not be altered after |
| * passing in to this method. |
| * |
| * @param input the string which is to be parsed, not cloned |
| * @param delim the field delimiter matcher |
| */ |
| public StrTokenizer(char[] input, StrMatcher delim) { |
| this(input); |
| setDelimiterMatcher(delim); |
| } |
| |
| /** |
| * Constructs a tokenizer splitting on the specified delimiter character |
| * and handling quotes using the specified quote character. |
| * <p> |
| * The input character array is not cloned, and must not be altered after |
| * passing in to this method. |
| * |
| * @param input the string which is to be parsed, not cloned |
| * @param delim the field delimiter character |
| * @param quote the field quoted string character |
| */ |
| public StrTokenizer(char[] input, char delim, char quote) { |
| this(input, delim); |
| setQuoteChar(quote); |
| } |
| |
| /** |
| * Constructs a tokenizer splitting using the specified delimiter matcher |
| * and handling quotes using the specified quote matcher. |
| * <p> |
| * The input character array is not cloned, and must not be altered after |
| * passing in to this method. |
| * |
| * @param input the string which is to be parsed, not cloned |
| * @param delim the field delimiter character |
| * @param quote the field quoted string character |
| */ |
| public StrTokenizer(char[] input, StrMatcher delim, StrMatcher quote) { |
| this(input, delim); |
| setQuoteMatcher(quote); |
| } |
| |
| // API |
| //----------------------------------------------------------------------- |
| /** |
| * Gets the number of tokens found in the String. |
| * |
| * @return the number of matched tokens |
| */ |
| public int size() { |
| checkTokenized(); |
| return tokens.length; |
| } |
| |
| /** |
| * Gets the next token from the String. |
| * |
| * @return the next sequential token, or null when no more tokens are found |
| */ |
| public String nextToken() { |
| if (hasNext()) { |
| return tokens[tokenPos++]; |
| } |
| return null; |
| } |
| |
| /** |
| * Gets the previous token from the String. |
| * |
| * @return the previous sequential token, or null when no more tokens are found |
| */ |
| public String previousToken() { |
| if (hasPrevious()) { |
| return tokens[--tokenPos]; |
| } |
| return null; |
| } |
| |
| /** |
| * Gets a copy of the full token list as an independent modifiable array. |
| * |
| * @return the tokens as a String array |
| */ |
| public String[] getTokenArray() { |
| checkTokenized(); |
| return (String[]) tokens.clone(); |
| } |
| |
| /** |
| * Gets a copy of the full token list as an independent modifiable list. |
| * |
| * @return the tokens as a List of String objects |
| */ |
| public List getTokenList() { |
| checkTokenized(); |
| List list = new ArrayList(tokens.length); |
| for (int i = 0; i < tokens.length; i++) { |
| list.add(tokens[i]); |
| } |
| return list; |
| } |
| |
| /** |
| * Resets this tokenizer, forgetting all parsing and iteration already completed. |
| * <p> |
| * This method allows the same tokenizer to be reused for the same String. |
| * |
| * @return this, to enable chaining |
| */ |
| public StrTokenizer reset() { |
| tokenPos = 0; |
| tokens = null; |
| return this; |
| } |
| |
| /** |
| * Reset this tokenizer, giving it a new input string to parse. |
| * In this manner you can re-use a tokenizer with the same settings |
| * on multiple input lines. |
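| * <p> |
| * For example, a sketch reusing one configured tokenizer (the input strings |
| * are illustrative only): |
| * <pre> |
| * StrTokenizer tok = StrTokenizer.getCSVInstance(); |
| * tok.reset("a,b,c"); |
| * String[] first = tok.getTokenArray(); // {"a", "b", "c"} |
| * tok.reset("d,e,f"); |
| * String[] second = tok.getTokenArray(); // {"d", "e", "f"} |
| * </pre> |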
| * |
| * @param input the new string to tokenize, null sets no text to parse |
| * @return this, to enable chaining |
| */ |
| public StrTokenizer reset(String input) { |
| reset(); |
| if (input != null) { |
| this.chars = input.toCharArray(); |
| } else { |
| this.chars = null; |
| } |
| return this; |
| } |
| |
| /** |
| * Reset this tokenizer, giving it a new input string to parse. |
| * In this manner you can re-use a tokenizer with the same settings |
| * on multiple input lines. |
| * <p> |
| * The input character array is not cloned, and must not be altered after |
| * passing in to this method. |
| * |
| * @param input the new character array to tokenize, not cloned, null sets no text to parse |
| * @return this, to enable chaining |
| */ |
| public StrTokenizer reset(char[] input) { |
| reset(); |
| this.chars = input; |
| return this; |
| } |
| |
| // ListIterator |
| //----------------------------------------------------------------------- |
| /** |
| * Checks whether there are any more tokens. |
| * |
| * @return true if there are more tokens |
| */ |
| public boolean hasNext() { |
| checkTokenized(); |
| return tokenPos < tokens.length; |
| } |
| |
| /** |
| * Gets the next token. This method is equivalent to {@link #nextToken()}, |
| * except that it throws {@link NoSuchElementException} when there are |
| * no more tokens. |
| * |
| * @return the next String token |
| */ |
| public Object next() { |
| if (hasNext()) { |
| return tokens[tokenPos++]; |
| } |
| throw new NoSuchElementException(); |
| } |
| |
| /** |
| * Gets the index of the next token to return. |
| * |
| * @return the next token index |
| */ |
| public int nextIndex() { |
| return tokenPos; |
| } |
| |
| /** |
| * Checks whether there are any previous tokens that can be iterated to. |
| * |
| * @return true if there are previous tokens |
| */ |
| public boolean hasPrevious() { |
| checkTokenized(); |
| return tokenPos > 0; |
| } |
| |
| /** |
| * Gets the token previous to the last returned token. |
| * |
| * @return the previous token |
| */ |
| public Object previous() { |
| if (hasPrevious()) { |
| return tokens[--tokenPos]; |
| } |
| throw new NoSuchElementException(); |
| } |
| |
| /** |
| * Gets the index of the previous token. |
| * |
| * @return the previous token index |
| */ |
| public int previousIndex() { |
| return tokenPos - 1; |
| } |
| |
| /** |
| * Unsupported ListIterator operation. |
| * |
| * @throws UnsupportedOperationException always |
| */ |
| public void remove() { |
| throw new UnsupportedOperationException("remove() is unsupported"); |
| } |
| |
| /** |
| * Unsupported ListIterator operation. |
| * @param obj this parameter ignored. |
| * @throws UnsupportedOperationException always |
| */ |
| public void set(Object obj) { |
| throw new UnsupportedOperationException("set() is unsupported"); |
| } |
| |
| /** |
| * Unsupported ListIterator operation. |
| * @param obj this parameter ignored. |
| * @throws UnsupportedOperationException always |
| */ |
| public void add(Object obj) { |
| throw new UnsupportedOperationException("add() is unsupported"); |
| } |
| |
| // Implementation |
| //----------------------------------------------------------------------- |
| /** |
| * Checks if tokenization has been done, and if not then does it. |
| */ |
| private void checkTokenized() { |
| if (tokens == null) { |
| if (chars == null) { |
| // still call tokenize as subclass may do some work |
| List split = tokenize(null, 0, 0); |
| tokens = (String[]) split.toArray(new String[split.size()]); |
| } else { |
| List split = tokenize(chars, 0, chars.length); |
| tokens = (String[]) split.toArray(new String[split.size()]); |
| } |
| } |
| } |
| |
| /** |
| * Internal method to perform the tokenization. |
| * <p> |
| * Most users of this class do not need to call this method. This method |
| * will be called automatically by other (public) methods when required. |
| * <p> |
| * This method exists to allow subclasses to add code before or after the |
| * tokenization. For example, a subclass could alter the character array, |
| * offset or count to be parsed, or call the tokenizer multiple times on |
| * multiple strings. It is also possible to filter the results. |
| * <p> |
| * <code>StrTokenizer</code> will always pass a zero offset and a count |
| * equal to the length of the array to this method; however, a subclass |
| * may pass other values, or even an entirely different array. |
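| * <p> |
| * A sketch of one possible subclass; the class name and the filtering rule |
| * are purely illustrative: |
| * <pre> |
| * class CommentSkippingTokenizer extends StrTokenizer { |
| * protected List tokenize(char[] chars, int offset, int count) { |
| * List tokens = super.tokenize(chars, offset, count); |
| * List filtered = new ArrayList(tokens.size()); |
| * for (java.util.Iterator it = tokens.iterator(); it.hasNext();) { |
| * String token = (String) it.next(); |
| * if (token == null || !token.startsWith("#")) { |
| * filtered.add(token); |
| * } |
| * } |
| * return filtered; |
| * } |
| * } |
| * </pre> |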
| * |
| * @param chars the character array being tokenized, may be null |
| * @param offset the start position within the character array, must be valid |
| * @param count the number of characters to tokenize, must be valid |
| * @return the modifiable list of String tokens, unmodifiable if null array or zero count |
| */ |
| protected List tokenize(char[] chars, int offset, int count) { |
| if (chars == null || count == 0) { |
| return Collections.EMPTY_LIST; |
| } |
| StrBuilder buf = new StrBuilder(); |
| List tokens = new ArrayList(); |
| int pos = offset; |
| |
| // loop around the entire buffer |
| while (pos >= 0 && pos < count) { |
| // find next token |
| pos = readNextToken(chars, pos, count, buf, tokens); |
| |
| // handle case where end of string is a delimiter |
| if (pos >= count) { |
| addToken(tokens, ""); |
| } |
| } |
| return tokens; |
| } |
| |
| /** |
| * Adds a token to a list, paying attention to the parameters we've set. |
| * |
| * @param list the list to add to |
| * @param tok the token to add |
| */ |
| private void addToken(List list, String tok) { |
| if (tok == null || tok.length() == 0) { |
| if (isIgnoreEmptyTokens()) { |
| return; |
| } |
| if (isEmptyTokenAsNull()) { |
| tok = null; |
| } |
| } |
| list.add(tok); |
| } |
| |
| /** |
| * Reads character by character through the String to get the next token. |
| * |
| * @param chars the character array being tokenized |
| * @param start the first character of field |
| * @param len the length of the character array being tokenized |
| * @param workArea a temporary work area |
| * @param tokens the list of parsed tokens |
| * @return the starting position of the next field (the character |
| * immediately after the delimiter), or -1 if end of string found |
| */ |
| private int readNextToken(char[] chars, int start, int len, StrBuilder workArea, List tokens) { |
| // skip all leading ignored and trimmed characters, unless one is |
| // also the field delimiter or the quote character |
| while (start < len) { |
| int removeLen = Math.max( |
| getIgnoredMatcher().isMatch(chars, start, start, len), |
| getTrimmerMatcher().isMatch(chars, start, start, len)); |
| if (removeLen == 0 || |
| getDelimiterMatcher().isMatch(chars, start, start, len) > 0 || |
| getQuoteMatcher().isMatch(chars, start, start, len) > 0) { |
| break; |
| } |
| start += removeLen; |
| } |
| |
| // handle reaching end |
| if (start >= len) { |
| addToken(tokens, ""); |
| return -1; |
| } |
| |
| // handle empty token |
| int delimLen = getDelimiterMatcher().isMatch(chars, start, start, len); |
| if (delimLen > 0) { |
| addToken(tokens, ""); |
| return start + delimLen; |
| } |
| |
| // handle found token |
| int quoteLen = getQuoteMatcher().isMatch(chars, start, start, len); |
| if (quoteLen > 0) { |
| return readWithQuotes(chars, start + quoteLen, len, workArea, tokens, start, quoteLen); |
| } |
| return readWithQuotes(chars, start, len, workArea, tokens, 0, 0); |
| } |
| |
| /** |
| * Reads a possibly quoted string token. |
| * |
| * @param chars the character array being tokenized |
| * @param start the first character of field |
| * @param len the length of the character array being tokenized |
| * @param workArea a temporary work area |
| * @param tokens the list of parsed tokens |
| * @param quoteStart the start position of the matched quote, 0 if no quoting |
| * @param quoteLen the length of the matched quote, 0 if no quoting |
| * @return the starting position of the next field (the character |
| * immediately after the delimiter), or -1 if the end of the string |
| * was reached |
| */ |
| private int readWithQuotes(char[] chars, int start, int len, StrBuilder workArea, |
| List tokens, int quoteStart, int quoteLen) |
| { |
| // Loop until we've found the end of the quoted |
| // string or the end of the input |
| workArea.clear(); |
| int pos = start; |
| boolean quoting = (quoteLen > 0); |
| int trimStart = 0; |
| |
| while (pos < len) { |
| // quoting mode can occur several times throughout a string |
| // we must switch between quoting and non-quoting until we |
| // encounter a non-quoted delimiter, or end of string |
| if (quoting) { |
| // In quoting mode |
| |
| // If we've found a quote character, see if it's |
| // followed by a second quote. If so, then we need |
| // to actually put the quote character into the token |
| // rather than end the token. |
| if (isQuote(chars, pos, len, quoteStart, quoteLen)) { |
| if (isQuote(chars, pos + quoteLen, len, quoteStart, quoteLen)) { |
| // matched pair of quotes, thus an escaped quote |
| workArea.append(chars, pos, quoteLen); |
| pos += (quoteLen * 2); |
| trimStart = workArea.size(); |
| continue; |
| } |
| |
| // end of quoting |
| quoting = false; |
| pos += quoteLen; |
| continue; |
| } |
| |
| // copy regular character from inside quotes |
| workArea.append(chars[pos++]); |
| trimStart = workArea.size(); |
| |
| } else { |
| // Not in quoting mode |
| |
| // check for delimiter, and thus end of token |
| int delimLen = getDelimiterMatcher().isMatch(chars, pos, start, len); |
| if (delimLen > 0) { |
| // return condition when end of token found |
| addToken(tokens, workArea.substring(0, trimStart)); |
| return pos + delimLen; |
| } |
| |
| // check for quote, and thus back into quoting mode |
| if (quoteLen > 0) { |
| if (isQuote(chars, pos, len, quoteStart, quoteLen)) { |
| quoting = true; |
| pos += quoteLen; |
| continue; |
| } |
| } |
| |
| // check for ignored (outside quotes), and ignore |
| int ignoredLen = getIgnoredMatcher().isMatch(chars, pos, start, len); |
| if (ignoredLen > 0) { |
| pos += ignoredLen; |
| continue; |
| } |
| |
| // check for trimmed character |
| // don't yet know if it's at the end, so copy to workArea |
| // use trimStart to keep track of trim at the end |
| int trimmedLen = getTrimmerMatcher().isMatch(chars, pos, start, len); |
| if (trimmedLen > 0) { |
| workArea.append(chars, pos, trimmedLen); |
| pos += trimmedLen; |
| continue; |
| } |
| |
| // copy regular character from outside quotes |
| workArea.append(chars[pos++]); |
| trimStart = workArea.size(); |
| } |
| } |
| |
| // return condition when end of string found |
| addToken(tokens, workArea.substring(0, trimStart)); |
| return -1; |
| } |
| |
| /** |
| * Checks if the characters at the index specified match the quote |
| * already matched in readNextToken(). |
| * |
| * @param chars the character array being tokenized |
| * @param pos the position to check for a quote |
| * @param len the length of the character array being tokenized |
| * @param quoteStart the start position of the matched quote, 0 if no quoting |
| * @param quoteLen the length of the matched quote, 0 if no quoting |
| * @return true if a quote is matched |
| */ |
| private boolean isQuote(char[] chars, int pos, int len, int quoteStart, int quoteLen) { |
| for (int i = 0; i < quoteLen; i++) { |
| if ((pos + i) >= len || chars[pos + i] != chars[quoteStart + i]) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| // Delimiter |
| //----------------------------------------------------------------------- |
| /** |
| * Gets the field delimiter matcher. |
| * |
| * @return the delimiter matcher in use |
| */ |
| public StrMatcher getDelimiterMatcher() { |
| return this.delimMatcher; |
| } |
| |
| /** |
| * Sets the field delimiter matcher. |
| * <p> |
| * The delimiter is used to separate one token from another. |
| * |
| * @param delim the delimiter matcher to use |
| * @return this, to enable chaining |
| */ |
| public StrTokenizer setDelimiterMatcher(StrMatcher delim) { |
| if (delim == null) { |
| this.delimMatcher = StrMatcher.noneMatcher(); |
| } else { |
| this.delimMatcher = delim; |
| } |
| return this; |
| } |
| |
| /** |
| * Sets the field delimiter character. |
| * |
| * @param delim the delimiter character to use |
| * @return this, to enable chaining |
| */ |
| public StrTokenizer setDelimiterChar(char delim) { |
| return setDelimiterMatcher(StrMatcher.charMatcher(delim)); |
| } |
| |
| /** |
| * Sets the field delimiter string. |
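| * <p> |
| * For example, a sketch using a multi-character delimiter: |
| * <pre> |
| * StrTokenizer tok = new StrTokenizer("a::b::c"); |
| * tok.setDelimiterString("::"); |
| * String[] tokens = tok.getTokenArray(); // {"a", "b", "c"} |
| * </pre> |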
| * |
| * @param delim the delimiter string to use |
| * @return this, to enable chaining |
| */ |
| public StrTokenizer setDelimiterString(String delim) { |
| return setDelimiterMatcher(StrMatcher.stringMatcher(delim)); |
| } |
| |
| // Quote |
| //----------------------------------------------------------------------- |
| /** |
| * Gets the quote matcher currently in use. |
| * <p> |
| * The quote character is used to wrap data between the tokens. |
| * This enables delimiters to be entered as data. |
| * By default, quoting is not used. |
| * |
| * @return the quote matcher in use |
| */ |
| public StrMatcher getQuoteMatcher() { |
| return quoteMatcher; |
| } |
| |
| /** |
| * Set the quote matcher to use. |
| * <p> |
| * The quote character is used to wrap data between the tokens. |
| * This enables delimiters to be entered as data. |
| * |
| * @param quote the quote matcher to use, null ignored |
| * @return this, to enable chaining |
| */ |
| public StrTokenizer setQuoteMatcher(StrMatcher quote) { |
| if (quote != null) { |
| this.quoteMatcher = quote; |
| } |
| return this; |
| } |
| |
| /** |
| * Sets the quote character to use. |
| * <p> |
| * The quote character is used to wrap data between the tokens. |
| * This enables delimiters to be entered as data. |
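| * <p> |
| * For example, a sketch where the delimiter appears inside quoted data: |
| * <pre> |
| * StrTokenizer tok = new StrTokenizer("a,'b,c',d", ','); |
| * tok.setQuoteChar('\''); |
| * String[] tokens = tok.getTokenArray(); // {"a", "b,c", "d"} |
| * </pre> |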
| * |
| * @param quote the quote character to use |
| * @return this, to enable chaining |
| */ |
| public StrTokenizer setQuoteChar(char quote) { |
| return setQuoteMatcher(StrMatcher.charMatcher(quote)); |
| } |
| |
| // Ignored |
| //----------------------------------------------------------------------- |
| /** |
| * Gets the ignored character matcher. |
| * <p> |
| * These characters are ignored when parsing the String, unless they are |
| * within a quoted region. |
| * The default value is not to ignore anything. |
| * |
| * @return the ignored matcher in use |
| */ |
| public StrMatcher getIgnoredMatcher() { |
| return ignoredMatcher; |
| } |
| |
| /** |
| * Set the matcher for characters to ignore. |
| * <p> |
| * These characters are ignored when parsing the String, unless they are |
| * within a quoted region. |
| * |
| * @param ignored the ignored matcher to use, null ignored |
| * @return this, to enable chaining |
| */ |
| public StrTokenizer setIgnoredMatcher(StrMatcher ignored) { |
| if (ignored != null) { |
| this.ignoredMatcher = ignored; |
| } |
| return this; |
| } |
| |
| /** |
| * Set the character to ignore. |
| * <p> |
| * This character is ignored when parsing the String, unless it is |
| * within a quoted region. |
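| * <p> |
| * For example, a sketch dropping newline characters outside quotes: |
| * <pre> |
| * StrTokenizer tok = new StrTokenizer("a,b\nc", ','); |
| * tok.setIgnoredChar('\n'); |
| * String[] tokens = tok.getTokenArray(); // {"a", "bc"} |
| * </pre> |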
| * |
| * @param ignored the ignored character to use |
| * @return this, to enable chaining |
| */ |
| public StrTokenizer setIgnoredChar(char ignored) { |
| return setIgnoredMatcher(StrMatcher.charMatcher(ignored)); |
| } |
| |
| // Trimmer |
| //----------------------------------------------------------------------- |
| /** |
| * Gets the trimmer character matcher. |
| * <p> |
| * These characters are trimmed off on each side of the delimiter |
| * until the token or quote is found. |
| * The default value is not to trim anything. |
| * |
| * @return the trimmer matcher in use |
| */ |
| public StrMatcher getTrimmerMatcher() { |
| return trimmerMatcher; |
| } |
| |
| /** |
| * Sets the matcher for characters to trim. |
| * <p> |
| * These characters are trimmed off on each side of the delimiter |
| * until the token or quote is found. |
| * |
| * @param trimmer the trimmer matcher to use, null ignored |
| * @return this, to enable chaining |
| */ |
| public StrTokenizer setTrimmerMatcher(StrMatcher trimmer) { |
| if (trimmer != null) { |
| this.trimmerMatcher = trimmer; |
| } |
| return this; |
| } |
| |
| //----------------------------------------------------------------------- |
| /** |
| * Gets whether the tokenizer currently returns empty tokens as null. |
| * The default for this property is false. |
| * |
| * @return true if empty tokens are returned as null |
| */ |
| public boolean isEmptyTokenAsNull() { |
| return this.emptyAsNull; |
| } |
| |
| /** |
| * Sets whether the tokenizer should return empty tokens as null. |
| * The default for this property is false. |
| * |
| * @param emptyAsNull whether empty tokens are returned as null |
| * @return this, to enable chaining |
| */ |
| public StrTokenizer setEmptyTokenAsNull(boolean emptyAsNull) { |
| this.emptyAsNull = emptyAsNull; |
| return this; |
| } |
| |
| //----------------------------------------------------------------------- |
| /** |
| * Gets whether the tokenizer currently ignores empty tokens. |
| * The default for this property is true. |
| * |
| * @return true if empty tokens are not returned |
| */ |
| public boolean isIgnoreEmptyTokens() { |
| return ignoreEmptyTokens; |
| } |
| |
| /** |
| * Sets whether the tokenizer should ignore and not return empty tokens. |
| * The default for this property is true. |
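| * <p> |
| * For example, a sketch contrasting the effect of this flag: |
| * <pre> |
| * StrTokenizer tok = new StrTokenizer("a,,c", ','); |
| * tok.setIgnoreEmptyTokens(true); |
| * tok.getTokenArray(); // {"a", "c"} |
| * tok.reset("a,,c"); |
| * tok.setIgnoreEmptyTokens(false); |
| * tok.getTokenArray(); // {"a", "", "c"} |
| * </pre> |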
| * |
| * @param ignoreEmptyTokens whether empty tokens are not returned |
| * @return this, to enable chaining |
| */ |
| public StrTokenizer setIgnoreEmptyTokens(boolean ignoreEmptyTokens) { |
| this.ignoreEmptyTokens = ignoreEmptyTokens; |
| return this; |
| } |
| |
| //----------------------------------------------------------------------- |
| /** |
| * Gets the String content that the tokenizer is parsing. |
| * |
| * @return the string content being parsed |
| */ |
| public String getContent() { |
| if (chars == null) { |
| return null; |
| } |
| return new String(chars); |
| } |
| |
| //----------------------------------------------------------------------- |
| /** |
| * Creates a new instance of this Tokenizer. The new instance is reset so |
| * that it will be at the start of the token list. |
| * If a {@link CloneNotSupportedException} is caught, return <code>null</code>. |
| * |
| * @return a new instance of this Tokenizer which has been reset. |
| */ |
| public Object clone() { |
| try { |
| return cloneReset(); |
| } catch (CloneNotSupportedException ex) { |
| return null; |
| } |
| } |
| |
| /** |
| * Creates a new instance of this Tokenizer. The new instance is reset so that |
| * it will be at the start of the token list. |
| * |
| * @return a new instance of this Tokenizer which has been reset. |
| * @throws CloneNotSupportedException if there is a problem cloning |
| */ |
| Object cloneReset() throws CloneNotSupportedException { |
| // this method exists to enable 100% test coverage |
| StrTokenizer cloned = (StrTokenizer) super.clone(); |
| if (cloned.chars != null) { |
| cloned.chars = (char[]) cloned.chars.clone(); |
| } |
| cloned.reset(); |
| return cloned; |
| } |
| |
| //----------------------------------------------------------------------- |
| /** |
| * Gets a String representation of this tokenizer, including the list of |
| * parsed tokens once tokenization has occurred. |
| * |
| * @return the string representation of the tokenizer |
| */ |
| public String toString() { |
| if (tokens == null) { |
| return "StrTokenizer[not tokenized yet]"; |
| } |
| return "StrTokenizer" + getTokenList(); |
| } |
| |
| } |