src/main/java/org/apache/commons/csv/Lexer.java - commons-csv - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   https://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 package org.apache.commons.csv;

 import static org.apache.commons.io.IOUtils.EOF;

 import java.io.Closeable;
 import java.io.IOException;

 import org.apache.commons.io.IOUtils;

 /**
  * Lexical analyzer.
  */
 final class Lexer implements Closeable {

     /** String form of CR; stored as the first EOL when a lone CR is the first line terminator read. */
     private static final String CR_STRING = Character.toString(Constants.CR);
     /** String form of LF; stored as the first EOL when LF is the first line terminator read. */
     private static final String LF_STRING = Character.toString(Constants.LF);

     /** The delimiter characters taken from the format; may be more than one character long. */
     private final char[] delimiter;
     /** Scratch buffer of delimiter.length - 1 chars used by isDelimiter for the characters after the first delimiter char. */
     private final char[] delimiterBuf;
     /** Scratch buffer of 2 * delimiter.length - 1 chars used by isEscapeDelimiter. */
     private final char[] escapeDelimiterBuf;
     /** The escape character, or Constants.UNDEFINED when the format defines none. */
     private final int escape;
     /** The quote (encapsulator) character, or Constants.UNDEFINED when the format defines none. */
     private final int quoteChar;
     /** The comment marker, or Constants.UNDEFINED when the format defines none. */
     private final int commentStart;
     /** Whether whitespace around tokens is skipped and trimmed. */
     private final boolean ignoreSurroundingSpaces;
     /** Whether empty lines are skipped by nextToken. */
     private final boolean ignoreEmptyLines;
     /** Whether EOF inside an encapsulated token yields an EOF token instead of a CSVException. */
     private final boolean lenientEof;
     /** Whether characters after a closing quote are appended to the token instead of being rejected. */
     private final boolean trailingData;

     /** The buffered reader. */
     private final ExtendedBufferedReader reader;
     /** The first line terminator seen by readEndOfLine; null until one has been read. */
     private String firstEol;

     /** Result of the most recent isDelimiter call; consulted by nextToken when deciding whether EOF ends the input. */
     private boolean isLastTokenDelimiter;

     Lexer(final CSVFormat format, final ExtendedBufferedReader reader) {
         this.reader = reader;
         this.delimiter = format.getDelimiterCharArray();
         this.escape = nullToDisabled(format.getEscapeCharacter());
         this.quoteChar = nullToDisabled(format.getQuoteCharacter());
         this.commentStart = nullToDisabled(format.getCommentMarker());
         this.ignoreSurroundingSpaces = format.getIgnoreSurroundingSpaces();
         this.ignoreEmptyLines = format.getIgnoreEmptyLines();
         this.lenientEof = format.getLenientEof();
         this.trailingData = format.getTrailingData();
         this.delimiterBuf = new char[delimiter.length - 1];
         this.escapeDelimiterBuf = new char[2 * delimiter.length - 1];
     }

     /**
      * Appends the character(s) denoted by the escape sequence at the current position to the token's content.
      * <p>
      * An escaped delimiter appends the whole delimiter. Otherwise the escape is decoded by {@link #readEscape()};
      * when the character after the escape is not a recognized escape, the escape character and that character are
      * both appended unchanged.
      * </p>
      *
      * @param token the current token
      * @throws IOException  on stream access error
      * @throws CSVException Thrown on invalid input.
      */
     private void appendNextEscapedCharacterToToken(final Token token) throws IOException {
         if (isEscapeDelimiter()) {
             token.content.append(delimiter);
             return;
         }
         final int decoded = readEscape();
         if (decoded != EOF) {
             token.content.append((char) decoded);
             return;
         }
         // Not a known escape: keep the escape character and the character that followed it.
         token.content.append((char) escape).append((char) reader.getLastChar());
     }

     /**
      * Closes the underlying reader.
      *
      * @throws IOException
      *             If an I/O error occurs
      */
     @Override
     public void close() throws IOException {
         reader.close();
     }

     /**
      * Gets the number of bytes read, as reported by the underlying reader.
      *
      * @return the number of bytes read
      */
     long getBytesRead() {
         return reader.getBytesRead();
     }

     /**
      * Returns the current character position, as reported by the underlying reader.
      *
      * @return the current character position
      */
     long getCharacterPosition() {
         return reader.getPosition();
     }

     /**
      * Returns the current line number, as reported by the underlying reader.
      *
      * @return the current line number
      */
     long getCurrentLineNumber() {
         return reader.getLineNumber();
     }

     /**
      * Gets the first line terminator recorded by {@link #readEndOfLine(int)}.
      *
      * @return the first line terminator read, or {@code null} if none has been recorded yet
      */
     String getFirstEol() {
         return firstEol;
     }

     /**
      * Tests whether the underlying reader reports itself as closed.
      *
      * @return the value of the reader's {@code isClosed()}
      */
     boolean isClosed() {
         return reader.isClosed();
     }

     /**
      * Tests if the given character is the comment marker.
      *
      * @param ch the character to check
      * @return true if the given character equals the configured comment marker
      */
     boolean isCommentStart(final int ch) {
         return ch == commentStart;
     }

     /**
      * Tests whether the given character, together with the characters that follow it in the reader, forms the delimiter.
      * <p>
      * For a multi-character delimiter the following characters are inspected through
      * {@link ExtendedBufferedReader#peek(char[])}; only when they all match are they read from the reader. The outcome
      * is also stored in {@code isLastTokenDelimiter}.
      * </p>
      *
      * @param ch
      *             the current character.
      * @return true if the current and next characters constitute a delimiter.
      * @throws IOException If an I/O error occurs.
      */
     boolean isDelimiter(final int ch) throws IOException {
         // Start pessimistic; only a complete match flips the flag.
         isLastTokenDelimiter = false;
         if (ch == delimiter[0]) {
             if (delimiter.length == 1) {
                 isLastTokenDelimiter = true;
             } else {
                 reader.peek(delimiterBuf);
                 int matched = 0;
                 while (matched < delimiterBuf.length && delimiterBuf[matched] == delimiter[matched + 1]) {
                     matched++;
                 }
                 if (matched == delimiterBuf.length) {
                     // Full match: consume the remaining delimiter characters that were only peeked so far.
                     final int consumed = reader.read(delimiterBuf, 0, delimiterBuf.length);
                     isLastTokenDelimiter = consumed != EOF;
                 }
             }
         }
         return isLastTokenDelimiter;
     }

     /**
      * Tests if the given character indicates the end of the file.
      *
      * @param ch the character to check
      * @return true if the given character equals {@code EOF}.
      */
     boolean isEndOfFile(final int ch) {
         return ch == EOF;
     }

     /**
      * Tests if the given character is the escape character.
      *
      * @param ch the character to check
      * @return true if the given character equals the configured escape character.
      */
     boolean isEscape(final int ch) {
         return ch == escape;
     }

     /**
      * Tests if the next characters constitute an escaped delimiter, inspecting them through
      * {@link ExtendedBufferedReader#peek(char[])}.
      * <p>
      * The peeked characters must be the first delimiter character followed, for every remaining delimiter character,
      * by the escape character and then that delimiter character. For example, for delimiter "[|]" and escape '!', the
      * peeked characters must be "[!|!]". On a match the peeked characters are read from the reader.
      * </p>
      *
      * @return true if the next characters constitute an escape delimiter.
      * @throws IOException If an I/O error occurs.
      */
     boolean isEscapeDelimiter() throws IOException {
         reader.peek(escapeDelimiterBuf);
         boolean matches = escapeDelimiterBuf[0] == delimiter[0];
         for (int i = 1; matches && i < delimiter.length; i++) {
             final int delimiterPos = 2 * i;
             matches = escapeDelimiterBuf[delimiterPos - 1] == escape && escapeDelimiterBuf[delimiterPos] == delimiter[i];
         }
         if (!matches) {
             return false;
         }
         // Consume what was only peeked so far.
         return reader.read(escapeDelimiterBuf, 0, escapeDelimiterBuf.length) != EOF;
     }

     /**
      * Tests if the given character is one of the configured meta characters: escape, quote or comment marker.
      *
      * @param ch the character to check
      * @return true if the character equals the escape character, the quote character or the comment marker
      */
     private boolean isMetaChar(final int ch) {
         if (ch == escape) {
             return true;
         }
         if (ch == quoteChar) {
             return true;
         }
         return ch == commentStart;
     }

     /**
      * Tests if the given character is the quote character.
      *
      * @param ch the character to check
      * @return true if the given character equals the configured quote character
      */
     boolean isQuoteChar(final int ch) {
         return ch == quoteChar;
     }

     /**
      * Tests if the given character marks the start of a line: it is LF, CR, or {@code Constants.UNDEFINED}
      * (treated as the start of the file).
      *
      * @param ch the character to check
      * @return true if the character is at the start of a line.
      */
     boolean isStartOfLine(final int ch) {
         if (ch == Constants.LF || ch == Constants.CR) {
             return true;
         }
         return ch == Constants.UNDEFINED;
     }

     /**
      * Returns the next token.
      * <p>
      * A token corresponds to a term, a record change or an end-of-file indicator. Depending on the configuration,
      * empty lines are skipped first, and a line that starts with the comment marker is returned as a COMMENT token
      * holding the trimmed rest of the line.
      * </p>
      *
      * @param token an existing Token object to reuse. The caller is responsible for initializing the Token.
      * @return the next token found.
      * @throws IOException  on stream access error.
      * @throws CSVException Thrown on invalid input.
      */
     Token nextToken(final Token token) throws IOException {
         // Get the last read char (required for empty line detection)
         int lastChar = reader.getLastChar();
         // read the next char and set eol
         int c = reader.read();
         // Note: The following call will swallow LF if c == CR. But we don't need to know if the last char was CR or LF - they are equivalent here.
         boolean eol = readEndOfLine(c);
         // empty line detection: eol AND (last char was EOL or beginning)
         if (ignoreEmptyLines) {
             while (eol && isStartOfLine(lastChar)) {
                 // Go one char ahead ...
                 lastChar = c;
                 c = reader.read();
                 eol = readEndOfLine(c);
                 // reached the end of the file without any content (empty line at the end)
                 if (isEndOfFile(c)) {
                     token.type = Token.Type.EOF;
                     // don't set token.isReady here because no content
                     return token;
                 }
             }
         }
         // Did we reach EOF during the last iteration already? EOF
         // (EOF right after a delimiter is not handled here: it falls through so the loop below can emit a final empty token.)
         if (isEndOfFile(lastChar) || !isLastTokenDelimiter && isEndOfFile(c)) {
             token.type = Token.Type.EOF;
             // don't set token.isReady here because no content
             return token;
         }
         // A comment marker only counts at the start of a line; the rest of the line becomes the comment text.
         if (isStartOfLine(lastChar) && isCommentStart(c)) {
             final String line = reader.readLine();
             if (line == null) {
                 token.type = Token.Type.EOF;
                 // don't set token.isReady here because no content
                 return token;
             }
             final String comment = line.trim();
             token.content.append(comment);
             token.type = Token.Type.COMMENT;
             return token;
         }
         // Important: make sure a new char gets consumed in each iteration
         while (token.type == Token.Type.INVALID) {
             // ignore whitespaces at beginning of a token
             if (ignoreSurroundingSpaces) {
                 // NOTE(review): isDelimiter(c) reads the rest of a multi-character delimiter when it matches, and it is
                 // evaluated again just below; confirm the behavior for multi-character delimiters that start with whitespace.
                 while (Character.isWhitespace((char) c) && !isDelimiter(c) && !eol) {
                     c = reader.read();
                     eol = readEndOfLine(c);
                 }
             }
             // ok, start of token reached: encapsulated, or token
             if (isDelimiter(c)) {
                 // empty token return TOKEN("")
                 token.type = Token.Type.TOKEN;
             } else if (eol) {
                 // empty token return EORECORD("")
                 // noop: token.content.append("");
                 token.type = Token.Type.EORECORD;
             } else if (isQuoteChar(c)) {
                 // consume encapsulated token
                 parseEncapsulatedToken(token);
             } else if (isEndOfFile(c)) {
                 // end of file return EOF()
                 // noop: token.content.append("");
                 token.type = Token.Type.EOF;
                 token.isReady = true; // there is data at EOF
             } else {
                 // next token must be a simple token
                 // add removed blanks when not ignoring whitespace chars...
                 parseSimpleToken(token, c);
             }
         }
         return token;
     }

     /**
      * Converts an optional character setting to its int form.
      *
      * @param c the configured character, may be {@code null}
      * @return {@code Constants.UNDEFINED} if {@code c} is {@code null}, otherwise the character's value
      */
     private int nullToDisabled(final Character c) {
         return c == null ? Constants.UNDEFINED : c.charValue(); // Explicit unboxing
     }

     /**
      * Parses an encapsulated token.
      * <p>
      * Encapsulated tokens are surrounded by the quote character. The quote character itself can be included in the
      * token by doubling it (as "", ''), and escape sequences are decoded through
      * {@link #appendNextEscapedCharacterToToken(Token)}. After the closing quote, whitespace is skipped up to the next
      * delimiter, end of line or end of stream; any other character raises a {@link CSVException} unless
      * {@code trailingData} is set, in which case every character after the closing quote is appended to the token.
      * The token is finished when one of the following conditions becomes true:
      * </p>
      * <ul>
      * <li>An unescaped closing quote has been reached and is followed, after optional whitespace, by:
      * <ul>
      * <li>a delimiter (TOKEN)</li>
      * <li>an end of line (EORECORD)</li>
      * <li>the end of the stream (EOF)</li>
      * </ul>
      * </li>
      * <li>The end of the stream has been reached before the closing quote and {@code lenientEof} is set (EOF)</li>
      * </ul>
      *
      * @param token
      *            the current token
      * @return a valid token object
      * @throws IOException
      *             on stream access error
      * @throws CSVException Thrown on invalid input: EOF before the closing quote (unless {@code lenientEof} is set) or
      *             an invalid character between the closing quote and the delimiter or EOL (unless
      *             {@code trailingData} is set).
      */
     private Token parseEncapsulatedToken(final Token token) throws IOException {
         token.isQuoted = true;
         // Save current line number in case needed for the EOF error message
         final long startLineNumber = getCurrentLineNumber();
         int c;
         while (true) {
             c = reader.read();
             if (isQuoteChar(c)) {
                 if (isQuoteChar(reader.peek())) {
                     // doubled encapsulator -> add single encapsulator to token
                     c = reader.read();
                     token.content.append((char) c);
                 } else {
                     // token finish mark (encapsulator) reached: ignore whitespace till delimiter
                     while (true) {
                         c = reader.read();
                         if (isDelimiter(c)) {
                             token.type = Token.Type.TOKEN;
                             return token;
                         }
                         if (isEndOfFile(c)) {
                             token.type = Token.Type.EOF;
                             token.isReady = true; // There is data at EOF
                             return token;
                         }
                         if (readEndOfLine(c)) {
                             token.type = Token.Type.EORECORD;
                             return token;
                         }
                         if (trailingData) {
                             // keep whatever follows the closing quote, whitespace included
                             token.content.append((char) c);
                         } else if (!Character.isWhitespace((char) c)) {
                             // error invalid char between token and next delimiter
                             throw new CSVException("Invalid character between encapsulated token and delimiter at line: %,d, position: %,d",
                                     getCurrentLineNumber(), getCharacterPosition());
                         }
                     }
                 }
             } else if (isEscape(c)) {
                 appendNextEscapedCharacterToToken(token);
             } else if (isEndOfFile(c)) {
                 if (lenientEof) {
                     token.type = Token.Type.EOF;
                     token.isReady = true; // There is data at EOF
                     return token;
                 }
                 // error condition (end of file before end of token)
                 throw new CSVException("(startline %,d) EOF reached before encapsulated token finished", startLineNumber);
             } else {
                 // consume character
                 token.content.append((char) c);
             }
         }
     }

     /**
      * Parses a simple token.
      * <p>
      * Simple tokens are tokens that are not surrounded by encapsulators. A simple token might contain escaped
      * delimiters (as \, or \;). Starting with the given character, characters are appended to the token until one of
      * the following ends it:
      * </p>
      * <ul>
      * <li>The end of line has been reached (EORECORD)</li>
      * <li>The end of stream has been reached (EOF)</li>
      * <li>An unescaped delimiter has been reached (TOKEN)</li>
      * </ul>
      * <p>
      * When surrounding spaces are ignored, trailing whitespace is trimmed from the token content afterwards.
      * </p>
      *
      * @param token the current token
      * @param ch     the current character
      * @return the filled token
      * @throws IOException  on stream access error
      * @throws CSVException Thrown on invalid input.
      */
     private Token parseSimpleToken(final Token token, final int ch) throws IOException {
         // The loop header reads the next character only after the current one has been appended to the token.
         for (int current = ch;; current = reader.read()) {
             if (readEndOfLine(current)) {
                 token.type = Token.Type.EORECORD;
                 break;
             }
             if (isEndOfFile(current)) {
                 token.type = Token.Type.EOF;
                 token.isReady = true; // There is data at EOF
                 break;
             }
             if (isDelimiter(current)) {
                 token.type = Token.Type.TOKEN;
                 break;
             }
             if (!isEscape(current)) {
                 token.content.append((char) current);
             } else {
                 appendNextEscapedCharacterToToken(token);
             }
         }

         if (ignoreSurroundingSpaces) {
             trimTrailingSpaces(token.content);
         }
         return token;
     }

     /**
      * Greedily accepts \n, \r and \r\n. For \r\n the trailing \n is silently consumed from the reader. The first line
      * terminator encountered is remembered and exposed through {@link #getFirstEol()}.
      *
      * @param ch the character to check
      * @return true if the given or next character is a line-terminator
      * @throws IOException If an I/O error occurs.
      */
     boolean readEndOfLine(final int ch) throws IOException {
         // Only look ahead when the current character is CR.
         final boolean crLf = ch == Constants.CR && reader.peek() == Constants.LF;
         // For CR LF, consume the LF; the caller's character is not changed.
         final int last = crLf ? reader.read() : ch;
         if (firstEol == null) {
             if (crLf) {
                 this.firstEol = Constants.CRLF;
             } else if (last == Constants.LF) {
                 this.firstEol = LF_STRING;
             } else if (last == Constants.CR) {
                 this.firstEol = CR_STRING;
             }
         }
         return last == Constants.LF || last == Constants.CR;
     }

     // TODO escape handling needs more work
     /**
      * Handles an escape sequence. The escape character must have just been read. On return, the character that followed
      * it is available by calling {@link ExtendedBufferedReader#getLastChar()} on the input stream.
      *
      * @return the unescaped character (as an int) or {@link IOUtils#EOF} if char following the escape is invalid.
      * @throws IOException  if there is a problem reading the stream
      * @throws CSVException Thrown on invalid input: the escape character is not allowed at end of stream.
      */
     int readEscape() throws IOException {
         // the escape char has just been read (normally a backslash)
         final int next = reader.read();
         if (next == EOF) {
             throw new CSVException("EOF while processing escape sequence");
         }
         // Letter codes that stand for control characters.
         if (next == 'r') {
             return Constants.CR;
         }
         if (next == 'n') {
             return Constants.LF;
         }
         if (next == 't') {
             return Constants.TAB;
         }
         if (next == 'b') {
             return Constants.BACKSPACE;
         }
         if (next == 'f') {
             return Constants.FF;
         }
         // Control characters written literally after the escape are passed through.
         // TODO is this correct for FF, TAB and BACKSPACE? Do tabs need to be escaped?
         final boolean literalControl = next == Constants.CR || next == Constants.LF || next == Constants.FF
                 || next == Constants.TAB || next == Constants.BACKSPACE;
         if (literalControl || isMetaChar(next)) {
             return next;
         }
         // indicate unexpected char - available from in.getLastChar()
         return EOF;
     }

     /**
      * Removes trailing whitespace, as defined by {@link Character#isWhitespace(char)}, from the given buffer in place.
      *
      * @param buffer the buffer to trim
      */
     void trimTrailingSpaces(final StringBuilder buffer) {
         // Walk back from the end to the last non-whitespace character (-1 when there is none).
         int lastKept = buffer.length() - 1;
         for (; lastKept >= 0; lastKept--) {
             if (!Character.isWhitespace(buffer.charAt(lastKept))) {
                 break;
             }
         }
         final int trimmedLength = lastKept + 1;
         if (trimmedLength < buffer.length()) {
             buffer.setLength(trimmedLength);
         }
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* https://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	package org.apache.commons.csv;

	import static org.apache.commons.io.IOUtils.EOF;

	import java.io.Closeable;
	import java.io.IOException;

	import org.apache.commons.io.IOUtils;

	/**
	* Lexical analyzer.
	*/
	final class Lexer implements Closeable {

	/** String form of CR; stored as the first EOL when a lone CR is the first line terminator read. */
	private static final String CR_STRING = Character.toString(Constants.CR);
	/** String form of LF; stored as the first EOL when LF is the first line terminator read. */
	private static final String LF_STRING = Character.toString(Constants.LF);

	/** The delimiter characters taken from the format; may be more than one character long. */
	private final char[] delimiter;
	/** Scratch buffer of delimiter.length - 1 chars used by isDelimiter for the characters after the first delimiter char. */
	private final char[] delimiterBuf;
	/** Scratch buffer of 2 * delimiter.length - 1 chars used by isEscapeDelimiter. */
	private final char[] escapeDelimiterBuf;
	/** The escape character, or Constants.UNDEFINED when the format defines none. */
	private final int escape;
	/** The quote (encapsulator) character, or Constants.UNDEFINED when the format defines none. */
	private final int quoteChar;
	/** The comment marker, or Constants.UNDEFINED when the format defines none. */
	private final int commentStart;
	/** Whether whitespace around tokens is skipped and trimmed. */
	private final boolean ignoreSurroundingSpaces;
	/** Whether empty lines are skipped by nextToken. */
	private final boolean ignoreEmptyLines;
	/** Whether EOF inside an encapsulated token yields an EOF token instead of a CSVException. */
	private final boolean lenientEof;
	/** Whether characters after a closing quote are appended to the token instead of being rejected. */
	private final boolean trailingData;

	/** The buffered reader. */
	private final ExtendedBufferedReader reader;
	/** The first line terminator seen by readEndOfLine; null until one has been read. */
	private String firstEol;

	/** Result of the most recent isDelimiter call; consulted by nextToken when deciding whether EOF ends the input. */
	private boolean isLastTokenDelimiter;

	Lexer(final CSVFormat format, final ExtendedBufferedReader reader) {
	this.reader = reader;
	this.delimiter = format.getDelimiterCharArray();
	this.escape = nullToDisabled(format.getEscapeCharacter());
	this.quoteChar = nullToDisabled(format.getQuoteCharacter());
	this.commentStart = nullToDisabled(format.getCommentMarker());
	this.ignoreSurroundingSpaces = format.getIgnoreSurroundingSpaces();
	this.ignoreEmptyLines = format.getIgnoreEmptyLines();
	this.lenientEof = format.getLenientEof();
	this.trailingData = format.getTrailingData();
	this.delimiterBuf = new char[delimiter.length - 1];
	this.escapeDelimiterBuf = new char[2 * delimiter.length - 1];
	}

	/**
	 * Appends the character(s) denoted by the escape sequence at the current position to the token's content.
	 * <p>
	 * An escaped delimiter appends the whole delimiter. Otherwise the escape is decoded by {@link #readEscape()};
	 * when the character after the escape is not a recognized escape, the escape character and that character are
	 * both appended unchanged.
	 * </p>
	 *
	 * @param token the current token
	 * @throws IOException on stream access error
	 * @throws CSVException Thrown on invalid input.
	 */
	private void appendNextEscapedCharacterToToken(final Token token) throws IOException {
		if (isEscapeDelimiter()) {
			token.content.append(delimiter);
			return;
		}
		final int decoded = readEscape();
		if (decoded != EOF) {
			token.content.append((char) decoded);
			return;
		}
		// Not a known escape: keep the escape character and the character that followed it.
		token.content.append((char) escape).append((char) reader.getLastChar());
	}

	/**
	* Closes the underlying reader.
	*
	* @throws IOException
	* If an I/O error occurs
	*/
	@Override
	public void close() throws IOException {
	reader.close();
	}

	/**
	* Gets the number of bytes read, as reported by the underlying reader.
	*
	* @return the number of bytes read
	*/
	long getBytesRead() {
	return reader.getBytesRead();
	}

	/**
	* Returns the current character position, as reported by the underlying reader.
	*
	* @return the current character position
	*/
	long getCharacterPosition() {
	return reader.getPosition();
	}

	/**
	* Returns the current line number, as reported by the underlying reader.
	*
	* @return the current line number
	*/
	long getCurrentLineNumber() {
	return reader.getLineNumber();
	}

	/**
	* Gets the first line terminator recorded by {@link #readEndOfLine(int)}.
	*
	* @return the first line terminator read, or {@code null} if none has been recorded yet
	*/
	String getFirstEol() {
	return firstEol;
	}

	/**
	* Tests whether the underlying reader reports itself as closed.
	*
	* @return the value of the reader's {@code isClosed()}
	*/
	boolean isClosed() {
	return reader.isClosed();
	}

	/**
	* Tests if the given character is the comment marker.
	*
	* @param ch the character to check
	* @return true if the given character equals the configured comment marker
	*/
	boolean isCommentStart(final int ch) {
	return ch == commentStart;
	}

	/**
	 * Tests whether the given character, together with the characters that follow it in the reader, forms the delimiter.
	 * <p>
	 * For a multi-character delimiter the following characters are inspected through
	 * {@link ExtendedBufferedReader#peek(char[])}; only when they all match are they read from the reader. The outcome
	 * is also stored in {@code isLastTokenDelimiter}.
	 * </p>
	 *
	 * @param ch
	 *            the current character.
	 * @return true if the current and next characters constitute a delimiter.
	 * @throws IOException If an I/O error occurs.
	 */
	boolean isDelimiter(final int ch) throws IOException {
		// Start pessimistic; only a complete match flips the flag.
		isLastTokenDelimiter = false;
		if (ch == delimiter[0]) {
			if (delimiter.length == 1) {
				isLastTokenDelimiter = true;
			} else {
				reader.peek(delimiterBuf);
				int matched = 0;
				while (matched < delimiterBuf.length && delimiterBuf[matched] == delimiter[matched + 1]) {
					matched++;
				}
				if (matched == delimiterBuf.length) {
					// Full match: consume the remaining delimiter characters that were only peeked so far.
					final int consumed = reader.read(delimiterBuf, 0, delimiterBuf.length);
					isLastTokenDelimiter = consumed != EOF;
				}
			}
		}
		return isLastTokenDelimiter;
	}

	/**
	* Tests if the given character indicates the end of the file.
	*
	* @param ch the character to check
	* @return true if the given character equals {@code EOF}.
	*/
	boolean isEndOfFile(final int ch) {
	return ch == EOF;
	}

	/**
	* Tests if the given character is the escape character.
	*
	* @param ch the character to check
	* @return true if the given character equals the configured escape character.
	*/
	boolean isEscape(final int ch) {
	return ch == escape;
	}

	/**
	* Tests if the next characters constitute a escape delimiter through {@link ExtendedBufferedReader#peek(char[])}.
	*
	* For example, for delimiter "[\|]" and escape '!', return true if the next characters constitute "![!\|!]".
	*
	* @return true if the next characters constitute an escape delimiter.
	* @throws IOException If an I/O error occurs.
	*/
	boolean isEscapeDelimiter() throws IOException {
	reader.peek(escapeDelimiterBuf);
	if (escapeDelimiterBuf[0] != delimiter[0]) {
	return false;
	}
	for (int i = 1; i < delimiter.length; i++) {
	if (escapeDelimiterBuf[2 * i] != delimiter[i] \|\| escapeDelimiterBuf[2 * i - 1] != escape) {
	return false;
	}
	}
	final int count = reader.read(escapeDelimiterBuf, 0, escapeDelimiterBuf.length);
	return count != EOF;
	}

	private boolean isMetaChar(final int ch) {
	return ch == escape \|\| ch == quoteChar \|\| ch == commentStart;
	}

	/**
	* Tests if the given character is the quote character.
	*
	* @param ch the character to check
	* @return true if the given character equals the configured quote character
	*/
	boolean isQuoteChar(final int ch) {
	return ch == quoteChar;
	}

	/**
	* Tests if the current character represents the start of a line: a CR, LF, or is at the start of the file.
	*
	* @param ch the character to check
	* @return true if the character is at the start of a line.
	*/
	boolean isStartOfLine(final int ch) {
	return ch == Constants.LF \|\| ch == Constants.CR \|\| ch == Constants.UNDEFINED;
	}

	/**
	* Returns the next token.
	* <p>
	* A token corresponds to a term, a record change or an end-of-file indicator.
	* </p>
	*
	* @param token an existing Token object to reuse. The caller is responsible for initializing the Token.
	* @return the next token found.
	* @throws IOException on stream access error.
	* @throws CSVException Thrown on invalid input.
	*/
	Token nextToken(final Token token) throws IOException {
	// Get the last read char (required for empty line detection)
	int lastChar = reader.getLastChar();
	// read the next char and set eol
	int c = reader.read();
	// Note: The following call will swallow LF if c == CR. But we don't need to know if the last char was CR or LF - they are equivalent here.
	boolean eol = readEndOfLine(c);
	// empty line detection: eol AND (last char was EOL or beginning)
	if (ignoreEmptyLines) {
	while (eol && isStartOfLine(lastChar)) {
	// Go on char ahead ...
	lastChar = c;
	c = reader.read();
	eol = readEndOfLine(c);
	// reached the end of the file without any content (empty line at the end)
	if (isEndOfFile(c)) {
	token.type = Token.Type.EOF;
	// don't set token.isReady here because no content
	return token;
	}
	}
	}
	// Did we reach EOF during the last iteration already? EOF
	if (isEndOfFile(lastChar) \|\| !isLastTokenDelimiter && isEndOfFile(c)) {
	token.type = Token.Type.EOF;
	// don't set token.isReady here because no content
	return token;
	}
	if (isStartOfLine(lastChar) && isCommentStart(c)) {
	final String line = reader.readLine();
	if (line == null) {
	token.type = Token.Type.EOF;
	// don't set token.isReady here because no content
	return token;
	}
	final String comment = line.trim();
	token.content.append(comment);
	token.type = Token.Type.COMMENT;
	return token;
	}
	// Important: make sure a new char gets consumed in each iteration
	while (token.type == Token.Type.INVALID) {
	// ignore whitespaces at beginning of a token
	if (ignoreSurroundingSpaces) {
	while (Character.isWhitespace((char) c) && !isDelimiter(c) && !eol) {
	c = reader.read();
	eol = readEndOfLine(c);
	}
	}
	// ok, start of token reached: encapsulated, or token
	if (isDelimiter(c)) {
	// empty token return TOKEN("")
	token.type = Token.Type.TOKEN;
	} else if (eol) {
	// empty token return EORECORD("")
	// noop: token.content.append("");
	token.type = Token.Type.EORECORD;
	} else if (isQuoteChar(c)) {
	// consume encapsulated token
	parseEncapsulatedToken(token);
	} else if (isEndOfFile(c)) {
	// end of file return EOF()
	// noop: token.content.append("");
	token.type = Token.Type.EOF;
	token.isReady = true; // there is data at EOF
	} else {
	// next token must be a simple token
	// add removed blanks when not ignoring whitespace chars...
	parseSimpleToken(token, c);
	}
	}
	return token;
	}

	/**
	* Converts an optional character setting to its int form.
	*
	* @param c the configured character, may be {@code null}
	* @return {@code Constants.UNDEFINED} if {@code c} is {@code null}, otherwise the character's value
	*/
	private int nullToDisabled(final Character c) {
	return c == null ? Constants.UNDEFINED : c.charValue(); // Explicit unboxing
	}

	/**
	* Parses an encapsulated token.
	* <p>
	* Encapsulated tokens are surrounded by the quote character. The quote character itself can be included in the
	* token by doubling it (as "", ''), and escape sequences are decoded through
	* {@link #appendNextEscapedCharacterToToken(Token)}. After the closing quote, whitespace is skipped up to the next
	* delimiter, end of line or end of stream; any other character raises a {@link CSVException} unless
	* {@code trailingData} is set, in which case every character after the closing quote is appended to the token.
	* The token is finished when one of the following conditions becomes true:
	* </p>
	* <ul>
	* <li>An unescaped closing quote has been reached and is followed, after optional whitespace, by:
	* <ul>
	* <li>a delimiter (TOKEN)</li>
	* <li>an end of line (EORECORD)</li>
	* <li>the end of the stream (EOF)</li>
	* </ul>
	* </li>
	* <li>The end of the stream has been reached before the closing quote and {@code lenientEof} is set (EOF)</li>
	* </ul>
	*
	* @param token
	* the current token
	* @return a valid token object
	* @throws IOException
	* on stream access error
	* @throws CSVException Thrown on invalid input: EOF before the closing quote (unless {@code lenientEof} is set) or
	* an invalid character between the closing quote and the delimiter or EOL (unless {@code trailingData} is set).
	*/
	private Token parseEncapsulatedToken(final Token token) throws IOException {
	token.isQuoted = true;
	// Save current line number in case needed for the EOF error message
	final long startLineNumber = getCurrentLineNumber();
	int c;
	while (true) {
	c = reader.read();
	if (isQuoteChar(c)) {
	if (isQuoteChar(reader.peek())) {
	// doubled encapsulator -> add single encapsulator to token
	c = reader.read();
	token.content.append((char) c);
	} else {
	// token finish mark (encapsulator) reached: ignore whitespace till delimiter
	while (true) {
	c = reader.read();
	if (isDelimiter(c)) {
	token.type = Token.Type.TOKEN;
	return token;
	}
	if (isEndOfFile(c)) {
	token.type = Token.Type.EOF;
	token.isReady = true; // There is data at EOF
	return token;
	}
	if (readEndOfLine(c)) {
	token.type = Token.Type.EORECORD;
	return token;
	}
	if (trailingData) {
	// keep whatever follows the closing quote, whitespace included
	token.content.append((char) c);
	} else if (!Character.isWhitespace((char) c)) {
	// error invalid char between token and next delimiter
	throw new CSVException("Invalid character between encapsulated token and delimiter at line: %,d, position: %,d",
	getCurrentLineNumber(), getCharacterPosition());
	}
	}
	}
	} else if (isEscape(c)) {
	appendNextEscapedCharacterToToken(token);
	} else if (isEndOfFile(c)) {
	if (lenientEof) {
	token.type = Token.Type.EOF;
	token.isReady = true; // There is data at EOF
	return token;
	}
	// error condition (end of file before end of token)
	throw new CSVException("(startline %,d) EOF reached before encapsulated token finished", startLineNumber);
	} else {
	// consume character
	token.content.append((char) c);
	}
	}
	}

	/**
	 * Parses a simple token.
	 * <p>
	 * Simple tokens are tokens that are not surrounded by encapsulators. A simple token might contain escaped
	 * delimiters (as \, or \;). Starting with the given character, characters are appended to the token until one of
	 * the following ends it:
	 * </p>
	 * <ul>
	 * <li>The end of line has been reached (EORECORD)</li>
	 * <li>The end of stream has been reached (EOF)</li>
	 * <li>An unescaped delimiter has been reached (TOKEN)</li>
	 * </ul>
	 * <p>
	 * When surrounding spaces are ignored, trailing whitespace is trimmed from the token content afterwards.
	 * </p>
	 *
	 * @param token the current token
	 * @param ch the current character
	 * @return the filled token
	 * @throws IOException on stream access error
	 * @throws CSVException Thrown on invalid input.
	 */
	private Token parseSimpleToken(final Token token, final int ch) throws IOException {
		// The loop header reads the next character only after the current one has been appended to the token.
		for (int current = ch;; current = reader.read()) {
			if (readEndOfLine(current)) {
				token.type = Token.Type.EORECORD;
				break;
			}
			if (isEndOfFile(current)) {
				token.type = Token.Type.EOF;
				token.isReady = true; // There is data at EOF
				break;
			}
			if (isDelimiter(current)) {
				token.type = Token.Type.TOKEN;
				break;
			}
			if (!isEscape(current)) {
				token.content.append((char) current);
			} else {
				appendNextEscapedCharacterToToken(token);
			}
		}

		if (ignoreSurroundingSpaces) {
			trimTrailingSpaces(token.content);
		}
		return token;
	}

	/**
	* Greedily accepts \n, \r and \r\n This checker consumes silently the second control-character...
	*
	* @return true if the given or next character is a line-terminator
	*/
	boolean readEndOfLine(final int ch) throws IOException {
	// check if we have \r\n...
	int cur = ch;
	if (cur == Constants.CR && reader.peek() == Constants.LF) {
	// note: does not change ch outside of this method!
	cur = reader.read();
	// Save the EOL state
	if (firstEol == null) {
	this.firstEol = Constants.CRLF;
	}
	}
	// save EOL state here.
	if (firstEol == null) {
	if (cur == Constants.LF) {
	this.firstEol = LF_STRING;
	} else if (cur == Constants.CR) {
	this.firstEol = CR_STRING;
	}
	}

	return cur == Constants.LF \|\| cur == Constants.CR;
	}

	// TODO escape handling needs more work
	/**
	 * Handles an escape sequence. The escape character must have just been read. On return, the character that followed
	 * it is available by calling {@link ExtendedBufferedReader#getLastChar()} on the input stream.
	 *
	 * @return the unescaped character (as an int) or {@link IOUtils#EOF} if char following the escape is invalid.
	 * @throws IOException if there is a problem reading the stream
	 * @throws CSVException Thrown on invalid input: the escape character is not allowed at end of stream.
	 */
	int readEscape() throws IOException {
		// the escape char has just been read (normally a backslash)
		final int next = reader.read();
		if (next == EOF) {
			throw new CSVException("EOF while processing escape sequence");
		}
		// Letter codes that stand for control characters.
		if (next == 'r') {
			return Constants.CR;
		}
		if (next == 'n') {
			return Constants.LF;
		}
		if (next == 't') {
			return Constants.TAB;
		}
		if (next == 'b') {
			return Constants.BACKSPACE;
		}
		if (next == 'f') {
			return Constants.FF;
		}
		// Control characters written literally after the escape are passed through.
		// TODO is this correct for FF, TAB and BACKSPACE? Do tabs need to be escaped?
		final boolean literalControl = next == Constants.CR || next == Constants.LF || next == Constants.FF
				|| next == Constants.TAB || next == Constants.BACKSPACE;
		if (literalControl || isMetaChar(next)) {
			return next;
		}
		// indicate unexpected char - available from in.getLastChar()
		return EOF;
	}

	/**
	 * Removes trailing whitespace, as defined by {@link Character#isWhitespace(char)}, from the given buffer in place.
	 *
	 * @param buffer the buffer to trim
	 */
	void trimTrailingSpaces(final StringBuilder buffer) {
		// Walk back from the end to the last non-whitespace character (-1 when there is none).
		int lastKept = buffer.length() - 1;
		for (; lastKept >= 0; lastKept--) {
			if (!Character.isWhitespace(buffer.charAt(lastKept))) {
				break;
			}
		}
		final int trimmedLength = lastKept + 1;
		if (trimmedLength < buffer.length()) {
			buffer.setLength(trimmedLength);
		}
	}
	}