solr/core/src/java/org/apache/solr/internal/csv/CSVParser.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.solr.internal.csv;

 import java.io.IOException;
 import java.io.Reader;
 import java.util.ArrayList;


 /**
  * Parses CSV files according to the specified configuration.
  *
  * Because CSV appears in many different dialects, the parser supports many
  * configuration settings by allowing the specification of a {@link CSVStrategy}.
  *
  * <p>Parsing of a csv-string having tabs as separators,
  * '"' as an optional value encapsulator, and comments starting with '#':</p>
  * <pre>
  *  String[][] data =
  *   (new CSVParser(new StringReader("a\tb\nc\td"), new CSVStrategy('\t','"','#'))).getAllValues();
  * </pre>
  *
  * <p>Parsing of a csv-string in Excel CSV format</p>
  * <pre>
  *  String[][] data =
  *   (new CSVParser(new StringReader("a;b\nc;d"), CSVStrategy.EXCEL_STRATEGY)).getAllValues();
  * </pre>
  *
  * <p>
  * Internal parser state is completely covered by the strategy
  * and the reader-state.</p>
  *
  * <p>see <a href="package-summary.html">package documentation</a>
  * for more details</p>
  */
 public class CSVParser {

   /** Length passed to the CharBuffer constructor of every token's content buffer. */
   private static final int INITIAL_TOKEN_LENGTH = 50;

   // the token types
   /** Token has no valid content, i.e. is in its initialized state. */
   protected static final int TT_INVALID = -1;
   /** Token with content, at beginning or in the middle of a line. */
   protected static final int TT_TOKEN = 0;
   /** Token (which can have content) when end of file is reached. */
   protected static final int TT_EOF = 1;
   /** Token with content when end of a line is reached. */
   protected static final int TT_EORECORD = 2;

   /** Zero-length String array; the initial result value inside getLine(). */
   private static final String[] EMPTY_STRING_ARRAY = new String[0];

   /** The input stream: wraps the Reader handed to the constructor. */
   private final ExtendedBufferedReader in;

   /** The CSV dialect settings (delimiter, encapsulator, escape, ...) consulted by the lexers. */
   private final CSVStrategy strategy;

   // the following objects are shared to reduce garbage
   /** A record buffer for getLine(). Grows as necessary and is reused. */
   @SuppressWarnings({"rawtypes"})
   private final ArrayList record = new ArrayList();
   /** The token instance that getLine() resets and refills for every value. */
   private final Token reusableToken = new Token();
   /** Collects the leading whitespace skipped at the start of a token in nextToken(Token). */
   private final CharBuffer wsBuf = new CharBuffer();
   /** Holds the four hex digits of a unicode escape while unicodeEscapeLexer decodes it. */
   private final CharBuffer code = new CharBuffer(4);


   /**
    * Internal value object handed from the lexer methods to the parser methods.
    *
    * It carries the token type, the characters collected for the token and a
    * flag telling whether a lexer finished producing it.
    */
   static class Token {
     /** Buffer collecting the characters of the token. */
     CharBuffer content = new CharBuffer(INITIAL_TOKEN_LENGTH);
     /** One of the TT_xxx constants; TT_INVALID until a lexer assigns a type. */
     int type = TT_INVALID;
     /** Set to true by a lexer once the token is complete (ready for the parser). */
     boolean isReady;

     /**
      * Puts the token back into its initial state so that it can be reused.
      *
      * @return this token
      */
     Token reset() {
       content.clear();
       isReady = false;
       type = TT_INVALID;
       return this;
     }
   }

   // ======================================================
   //  the constructor
   // ======================================================

   /**
    * CSV parser using the default {@link CSVStrategy}.
    *
    * Delegates to {@link #CSVParser(Reader, CSVStrategy)} with
    * {@link CSVStrategy#DEFAULT_STRATEGY}.
    *
    * @param input a Reader containing "csv-formatted" input
    */
   public CSVParser(Reader input) {
     this(input, CSVStrategy.DEFAULT_STRATEGY);
   }

   /**
    * Customized CSV parser using the given {@link CSVStrategy}.
    *
    * The reader is wrapped in an ExtendedBufferedReader; the strategy reference is
    * stored as given (no copy is made).
    *
    * @param input a Reader containing "csv-formatted" input
    * @param strategy the CSVStrategy used for CSV parsing
    */
   public CSVParser(Reader input, CSVStrategy strategy) {
     this.in = new ExtendedBufferedReader(input);
     this.strategy = strategy;
   }

   // ======================================================
   //  the parser
   // ======================================================

   /**
    * Reads all remaining records from the input.
    * <p>
    * Records are collected by calling {@link #getLine()} repeatedly, starting at
    * the current parse-position in the stream, until it returns <code>null</code>.
    *
    * @return matrix of records x values, or <code>null</code> when no record
    *         could be read (end of file)
    * @throws IOException on parse error or input read-failure
    */
   public String[][] getAllValues() throws IOException {
     ArrayList<String[]> rows = new ArrayList<String[]>();
     for (String[] row = getLine(); row != null; row = getLine()) {
       rows.add(row);
     }
     if (rows.isEmpty()) {
       // nothing read at all: callers get null, not an empty matrix
       return null;
     }
     return rows.toArray(new String[rows.size()][]);
   }

   /**
    * Parses the CSV according to the given strategy
    * and returns the next csv-value as string.
    * <p>
    * When the input does not end with a line terminator, the last value arrives
    * as a ready end-of-file token; it is returned like any other value and the
    * following call returns <code>null</code>.
    *
    * @return next value in the input stream ('null' when end of file)
    * @throws IOException on parse error or input read-failure
    */
   public String nextValue() throws IOException {
     Token tkn = nextToken();
     switch (tkn.type) {
       case TT_TOKEN:
       case TT_EORECORD:
         return tkn.content.toString();
       case TT_EOF:
         // A ready EOF token still carries a value (same rule getLine() applies);
         // only an EOF token that is not ready means "nothing left".
         return tkn.isReady ? tkn.content.toString() : null;
       case TT_INVALID:
       default:
         // error no token available (or error)
         throw new IOException("(line " + getLineNumber() + ") invalid parse sequence");
     }
   }

   /**
    * Reads the values from the current stream position up to the end of the
    * current record.
    *
    * Tokens are pulled from the lexer until one arrives that is not a plain
    * TT_TOKEN (i.e. end of record or end of file).
    *
    * @return array of values til end of line
    *        ('null' when end of file has been reached)
    * @throws IOException on parse error or input read-failure
    */
   @SuppressWarnings({"unchecked"})
   public String[] getLine() throws IOException {
     record.clear();
     String[] line = EMPTY_STRING_ARRAY;
     boolean recordFinished = false;
     while (!recordFinished) {
       reusableToken.reset();
       nextToken(reusableToken);
       final int type = reusableToken.type;
       if (type == TT_TOKEN || type == TT_EORECORD) {
         record.add(reusableToken.content.toString());
       } else if (type == TT_EOF) {
         if (reusableToken.isReady) {
           // last value of an input that has no trailing line terminator
           record.add(reusableToken.content.toString());
         } else {
           line = null;
         }
       } else {
         // TT_INVALID or an unknown type
         throw new IOException("(line " + getLineNumber() + ") invalid parse sequence");
       }
       // only a plain token means more values follow in this record
       recordFinished = type != TT_TOKEN;
     }
     if (record.isEmpty()) {
       return line;
     }
     return (String[]) record.toArray(new String[record.size()]);
   }

   /**
    * Returns the current line number in the input stream, as reported by the
    * underlying ExtendedBufferedReader.
    *
    * ATTENTION: in case your csv has multiline-values the returned
    *            number does not correspond to the record-number
    *
    * @return  current line number
    */
   public int getLineNumber() {
     return in.getLineNumber();
   }

   // ======================================================
   //  the lexer(s)
   // ======================================================

   /**
    * Convenience method for {@link #nextToken(Token)} called with a newly
    * allocated Token.
    *
    * @return the next token found
    * @throws IOException on stream access error
    */
   protected Token nextToken() throws IOException {
       return nextToken(new Token());
   }

  /**
    * Returns the next token.
    *
    * A token corresponds to a term, a record change or an
    * end-of-file indicator.
    * <p>
    * Depending on the strategy, empty lines and leading whitespace are skipped.
    * When a token starts with the comment-start character, the rest of that line
    * is dropped and lexing continues with the input that follows.
    *
    * @param tkn an existing Token object to reuse. The caller is responsible for
    * initializing the Token (see <code>Token.reset()</code>) before passing it in.
    * @return the next token found (the object passed in as <code>tkn</code>)
    * @throws IOException on stream access error
    */
   protected Token nextToken(Token tkn) throws IOException {
     wsBuf.clear(); // reuse the shared whitespace buffer

     // get the last read char (required for empty line detection)
     int lastChar = in.readAgain();

     //  read the next char and set eol
     /* note: isEndOfLine may silently consume one more character (the '\n' that
      *       follows a '\r'). The local c would then no longer match the last
      *       char read from the stream, so c is refreshed with 'readAgain'.
      */
     int c = in.read();
     boolean eol = isEndOfLine(c);
     c = in.readAgain();

     //  empty line detection: eol AND (last char was EOL or beginning)
     while (strategy.getIgnoreEmptyLines() && eol
       && (lastChar == '\n'
       || lastChar == ExtendedBufferedReader.UNDEFINED)
       && !isEndOfFile(lastChar)) {
       // go one char ahead ...
       lastChar = c;
       c = in.read();
       eol = isEndOfLine(c);
       c = in.readAgain();
       // reached end of file without any content (empty line at the end)
       if (isEndOfFile(c)) {
         // isReady stays false: this EOF token carries no value
         tkn.type = TT_EOF;
         return tkn;
       }
     }

     // did we reach eof during the last iteration already ? TT_EOF
     // (eof directly after a delimiter falls through instead, so that the empty
     //  last value is produced by the loop below)
     if (isEndOfFile(lastChar) || (lastChar != strategy.getDelimiter() && isEndOfFile(c))) {
       tkn.type = TT_EOF;
       return tkn;
     }

     //  important: make sure a new char gets consumed in each iteration
     while (!tkn.isReady && tkn.type != TT_EOF) {
       // ignore whitespaces at beginning of a token
       while (strategy.getIgnoreLeadingWhitespaces() && isWhitespace(c) && !eol) {
         wsBuf.append((char) c);
         c = in.read();
         eol = isEndOfLine(c);
       }
       // ok, start of token reached: comment, encapsulated, or token
       if (c == strategy.getCommentStart()) {
         // ignore everything till end of line and continue (incr linecount)
         in.readLine();
         // start over on the following input with a freshly reset token
         tkn = nextToken(tkn.reset());
       } else if (c == strategy.getDelimiter()) {
         // empty token return TT_TOKEN("")
         tkn.type = TT_TOKEN;
         tkn.isReady = true;
       } else if (eol) {
         // empty token return TT_EORECORD("")
         //noop: tkn.content.append("");
         tkn.type = TT_EORECORD;
         tkn.isReady = true;
       } else if (c == strategy.getEncapsulator()) {
         // consume encapsulated token
         encapsulatedTokenLexer(tkn, c);
       } else if (isEndOfFile(c)) {
         // end of file return TT_EOF()
         //noop: tkn.content.append("");
         tkn.type = TT_EOF;
         tkn.isReady = true;
       } else {
         // next token must be a simple token
         // add removed blanks when not ignoring whitespace chars...
         // NOTE(review): wsBuf is only filled by the loop above, which runs only when
         // leading whitespace IS ignored, so this append looks like a no-op - confirm.
         if (!strategy.getIgnoreLeadingWhitespaces()) {
           tkn.content.append(wsBuf);
         }
         simpleTokenLexer(tkn, c);
       }
     }
     return tkn;
   }

   /**
    * Lexer for simple tokens, i.e. tokens that are not surrounded by encapsulators.
    *
    * Characters are collected into the token (escape sequences are decoded on the
    * way) until one of the following ends the token:
    * <ul>
    *   <li>end of line (token type TT_EORECORD)</li>
    *   <li>end of stream (token type TT_EOF)</li>
    *   <li>an unescaped delimiter (token type TT_TOKEN)</li>
    * </ul>
    * Trailing whitespace is trimmed afterwards when the strategy asks for it.
    *
    * @param tkn  the current token
    * @param c    the current character
    * @return the filled token
    *
    * @throws IOException on stream access error
    */
   private Token simpleTokenLexer(Token tkn, int c) throws IOException {
     boolean finished = false;
     while (!finished) {
       if (isEndOfLine(c)) {
         tkn.type = TT_EORECORD;
         finished = true;
       } else if (isEndOfFile(c)) {
         tkn.type = TT_EOF;
         finished = true;
       } else if (c == strategy.getDelimiter()) {
         tkn.type = TT_TOKEN;
         finished = true;
       } else {
         final int decoded;
         if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead() == 'u') {
           // backslash-u escape: decode the four hex digits into one char
           decoded = unicodeEscapeLexer(c);
         } else if (c == strategy.getEscape()) {
           decoded = readEscape(c);
         } else {
           decoded = c;
         }
         tkn.content.append((char) decoded);
         c = in.read();
       }
     }
     // every way out of the loop produced a complete token
     tkn.isReady = true;

     if (strategy.getIgnoreTrailingWhitespaces()) {
       tkn.content.trimTrailingWhitespace();
     }
     return tkn;
   }


   /**
    * An encapsulated token lexer
    *
    * Encapsulated tokens are surrounded by the strategy's encapsulator character.
    * The encapsulator itself might be included in the token using a
    * doubling syntax (as "", '') or using escaping (as in \", \').
    * Whitespace after the closing encapsulator is ignored; any other character
    * found before the next delimiter, line end or end of file is an error.
    *
    * @param tkn    the current token
    * @param c      the current character (the opening encapsulator; it is not
    *               added to the token)
    * @return a valid token object
    * @throws IOException if the input ends inside the encapsulated token, if a
    *         non-whitespace character follows the closing encapsulator, or on
    *         stream access error
    */
   private Token encapsulatedTokenLexer(Token tkn, int c) throws IOException {
     // remember the line the token starts on (used in the eof error message)
     int startLineNumber = getLineNumber();
     // c is the opening encapsulator: it is skipped, because each iteration
     // starts by reading the next char
     for (;;) {
       c = in.read();

       if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead()=='u') {
         tkn.content.append((char) unicodeEscapeLexer(c));
       } else if (c == strategy.getEscape()) {
         // checked before the encapsulator test, so an escaped encapsulator
         // ends up in the content instead of closing the token
         tkn.content.append((char)readEscape(c));
       } else if (c == strategy.getEncapsulator()) {
         if (in.lookAhead() == strategy.getEncapsulator()) {
           // doubled encapsulator -> add single encapsulator to token
           c = in.read();
           tkn.content.append((char) c);
         } else {
           // token finish mark (encapsulator) reached: ignore whitespace till delimiter
           for (;;) {
             c = in.read();
             if (c == strategy.getDelimiter()) {
               tkn.type = TT_TOKEN;
               tkn.isReady = true;
               return tkn;
             } else if (isEndOfFile(c)) {
               tkn.type = TT_EOF;
               tkn.isReady = true;
               return tkn;
             } else if (isEndOfLine(c)) {
               // ok, end of token reached
               tkn.type = TT_EORECORD;
               tkn.isReady = true;
               return tkn;
             } else if (!isWhitespace(c)) {
               // error invalid char between token and next delimiter
               throw new IOException(
                       "(line " + getLineNumber()
                               + ") invalid char between encapsulated token end delimiter"
               );
             }
           }
         }
       } else if (isEndOfFile(c)) {
         // error condition (end of file before end of token)
         throw new IOException(
                 "(startline " + startLineNumber + ")"
                         + "eof reached before encapsulated token finished"
         );
       } else {
         // consume character (line terminators included: values may span lines)
         tkn.content.append((char) c);
       }
     }
   }


   /**
    * Decodes Unicode escapes.
    *
    * Interpretation of "\\uXXXX" escape sequences
    * where XXXX is a hex-number. Exactly four characters are read after the 'u';
    * a leading sign is not accepted as part of the number.
    *
    * @param c current char which is discarded because it's the "\\" of "\\uXXXX"
    * @return the decoded character
    * @throws IOException on wrong unicode escape sequence or read error; a wrong
    *         sequence is reported with the NumberFormatException as its cause
    */
   protected int unicodeEscapeLexer(int c) throws IOException {
     // skip the 'u' (c is the backslash on entry), then read 4 hex digits
     c = in.read();
     code.clear();
     try {
       for (int i = 0; i < 4; i++) {
         c = in.read();
         if (isEndOfFile(c) || isEndOfLine(c)) {
           throw new NumberFormatException("number too short");
         }
         code.append((char) c);
       }
       String hex = code.toString();
       // Integer.parseInt accepts a leading '+' or '-', which is not a hex digit
       // and must not be decoded as part of an escape sequence
       char first = hex.charAt(0);
       if (first == '+' || first == '-') {
         throw new NumberFormatException("For input string: \"" + hex + "\"");
       }
       return Integer.parseInt(hex, 16);
     } catch (NumberFormatException e) {
       // keep the original exception as cause so that the stack trace is not lost
       throw new IOException(
         "(line " + getLineNumber() + ") Wrong unicode escape sequence found '"
         + code.toString() + "'" + e.toString(), e);
     }
   }

   /**
    * Decodes the character that follows an escape character.
    *
    * The letters r, n, t, b and f map to the corresponding control characters;
    * every other character stands for itself.
    *
    * @param c the escape char itself (normally a backslash); it is discarded
    * @return the decoded character
    * @throws IOException on read error, or when the input ends directly after
    *         the escape character
    */
   private int readEscape(int c) throws IOException {
     // assume c is the escape char (normally a backslash)
     c = in.read();
     if (isEndOfFile(c)) {
       // without this check the end-of-stream marker would be cast to a char
       // by the callers and appended to the token content
       throw new IOException(
         "(line " + getLineNumber() + ") EOF whilst processing escape sequence");
     }
     switch (c) {
       case 'r': return '\r';
       case 'n': return '\n';
       case 't': return '\t';
       case 'b': return '\b';
       case 'f': return '\f';
       default : return c;
     }
   }

   // ======================================================
   //  strategies
   // ======================================================

   /**
    * Obtain the specified CSV Strategy.  This should not be modified.
    * The parser's own instance is returned, not a copy.
    *
    * @return strategy currently being used
    */
   public CSVStrategy getStrategy() {
     return this.strategy;
   }

   // ======================================================
   //  Character class checker
   // ======================================================

   /**
    * Whitespace check that never treats the delimiter as whitespace
    * (relevant e.g. for tab-separated input).
    *
    * @return true if the given char is a whitespace character other than the delimiter
    */
   private boolean isWhitespace(int c) {
     if (c == strategy.getDelimiter()) {
       return false;
     }
     return Character.isWhitespace((char) c);
   }

   /**
    * Greedy line-terminator check - accepts \n and \r\n.
    * For \r\n the \n is silently consumed from the stream; the caller's
    * variable is not changed by that.
    *
    * @return true if the given character is a line-terminator
    */
   private boolean isEndOfLine(int c) throws IOException {
     if (c == '\r' && in.lookAhead() == '\n') {
       // swallow the second control-character of the pair
       return in.read() == '\n';
     }
     return c == '\n';
   }

   /**
    * Compares the given value with ExtendedBufferedReader.END_OF_STREAM.
    *
    * @return true if the given character indicates end of file
    */
   private boolean isEndOfFile(int c) {
     return c == ExtendedBufferedReader.END_OF_STREAM;
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.solr.internal.csv;

	import java.io.IOException;
	import java.io.Reader;
	import java.util.ArrayList;


	/**
	* Parses CSV files according to the specified configuration.
	*
	* Because CSV appears in many different dialects, the parser supports many
	* configuration settings by allowing the specification of a {@link CSVStrategy}.
	*
	* <p>Parsing of a csv-string having tabs as separators,
	* '"' as an optional value encapsulator, and comments starting with '#':</p>
	* <pre>
	* String[][] data =
	* (new CSVParser(new StringReader("a\tb\nc\td"), new CSVStrategy('\t','"','#'))).getAllValues();
	* </pre>
	*
	* <p>Parsing of a csv-string in Excel CSV format</p>
	* <pre>
	* String[][] data =
	* (new CSVParser(new StringReader("a;b\nc;d"), CSVStrategy.EXCEL_STRATEGY)).getAllValues();
	* </pre>
	*
	* <p>
	* Internal parser state is completely covered by the strategy
	* and the reader-state.</p>
	*
	* <p>see <a href="package-summary.html">package documentation</a>
	* for more details</p>
	*/
	public class CSVParser {

	/** Length passed to the CharBuffer constructor of every token's content buffer. */
	private static final int INITIAL_TOKEN_LENGTH = 50;

	// the token types
	/** Token has no valid content, i.e. is in its initialized state. */
	protected static final int TT_INVALID = -1;
	/** Token with content, at beginning or in the middle of a line. */
	protected static final int TT_TOKEN = 0;
	/** Token (which can have content) when end of file is reached. */
	protected static final int TT_EOF = 1;
	/** Token with content when end of a line is reached. */
	protected static final int TT_EORECORD = 2;

	/** Zero-length String array; the initial result value inside getLine(). */
	private static final String[] EMPTY_STRING_ARRAY = new String[0];

	/** The input stream: wraps the Reader handed to the constructor. */
	private final ExtendedBufferedReader in;

	/** The CSV dialect settings (delimiter, encapsulator, escape, ...) consulted by the lexers. */
	private final CSVStrategy strategy;

	// the following objects are shared to reduce garbage
	/** A record buffer for getLine(). Grows as necessary and is reused. */
	@SuppressWarnings({"rawtypes"})
	private final ArrayList record = new ArrayList();
	/** The token instance that getLine() resets and refills for every value. */
	private final Token reusableToken = new Token();
	/** Collects the leading whitespace skipped at the start of a token in nextToken(Token). */
	private final CharBuffer wsBuf = new CharBuffer();
	/** Holds the four hex digits of a unicode escape while unicodeEscapeLexer decodes it. */
	private final CharBuffer code = new CharBuffer(4);


	/**
	 * Internal value object handed from the lexer methods to the parser methods.
	 *
	 * It carries the token type, the characters collected for the token and a
	 * flag telling whether a lexer finished producing it.
	 */
	static class Token {
	  /** Buffer collecting the characters of the token. */
	  CharBuffer content = new CharBuffer(INITIAL_TOKEN_LENGTH);
	  /** One of the TT_xxx constants; TT_INVALID until a lexer assigns a type. */
	  int type = TT_INVALID;
	  /** Set to true by a lexer once the token is complete (ready for the parser). */
	  boolean isReady;

	  /**
	   * Puts the token back into its initial state so that it can be reused.
	   *
	   * @return this token
	   */
	  Token reset() {
	    content.clear();
	    isReady = false;
	    type = TT_INVALID;
	    return this;
	  }
	}

	// ======================================================
	// the constructor
	// ======================================================

	/**
	* CSV parser using the default {@link CSVStrategy}.
	*
	* Delegates to {@link #CSVParser(Reader, CSVStrategy)} with
	* {@link CSVStrategy#DEFAULT_STRATEGY}.
	*
	* @param input a Reader containing "csv-formatted" input
	*/
	public CSVParser(Reader input) {
	this(input, CSVStrategy.DEFAULT_STRATEGY);
	}

	/**
	* Customized CSV parser using the given {@link CSVStrategy}.
	*
	* The reader is wrapped in an ExtendedBufferedReader; the strategy reference is
	* stored as given (no copy is made).
	*
	* @param input a Reader containing "csv-formatted" input
	* @param strategy the CSVStrategy used for CSV parsing
	*/
	public CSVParser(Reader input, CSVStrategy strategy) {
	this.in = new ExtendedBufferedReader(input);
	this.strategy = strategy;
	}

	// ======================================================
	// the parser
	// ======================================================

	/**
	 * Reads all remaining records from the input.
	 * <p>
	 * Records are collected by calling {@link #getLine()} repeatedly, starting at
	 * the current parse-position in the stream, until it returns <code>null</code>.
	 *
	 * @return matrix of records x values, or <code>null</code> when no record
	 *         could be read (end of file)
	 * @throws IOException on parse error or input read-failure
	 */
	public String[][] getAllValues() throws IOException {
	  ArrayList<String[]> rows = new ArrayList<String[]>();
	  for (String[] row = getLine(); row != null; row = getLine()) {
	    rows.add(row);
	  }
	  if (rows.isEmpty()) {
	    // nothing read at all: callers get null, not an empty matrix
	    return null;
	  }
	  return rows.toArray(new String[rows.size()][]);
	}

	/**
	 * Parses the CSV according to the given strategy
	 * and returns the next csv-value as string.
	 * <p>
	 * When the input does not end with a line terminator, the last value arrives
	 * as a ready end-of-file token; it is returned like any other value and the
	 * following call returns <code>null</code>.
	 *
	 * @return next value in the input stream ('null' when end of file)
	 * @throws IOException on parse error or input read-failure
	 */
	public String nextValue() throws IOException {
	  Token tkn = nextToken();
	  switch (tkn.type) {
	    case TT_TOKEN:
	    case TT_EORECORD:
	      return tkn.content.toString();
	    case TT_EOF:
	      // A ready EOF token still carries a value (same rule getLine() applies);
	      // only an EOF token that is not ready means "nothing left".
	      return tkn.isReady ? tkn.content.toString() : null;
	    case TT_INVALID:
	    default:
	      // error no token available (or error)
	      throw new IOException("(line " + getLineNumber() + ") invalid parse sequence");
	  }
	}

	/**
	 * Reads the values from the current stream position up to the end of the
	 * current record.
	 *
	 * Tokens are pulled from the lexer until one arrives that is not a plain
	 * TT_TOKEN (i.e. end of record or end of file).
	 *
	 * @return array of values til end of line
	 *        ('null' when end of file has been reached)
	 * @throws IOException on parse error or input read-failure
	 */
	@SuppressWarnings({"unchecked"})
	public String[] getLine() throws IOException {
	  record.clear();
	  String[] line = EMPTY_STRING_ARRAY;
	  boolean recordFinished = false;
	  while (!recordFinished) {
	    reusableToken.reset();
	    nextToken(reusableToken);
	    final int type = reusableToken.type;
	    if (type == TT_TOKEN || type == TT_EORECORD) {
	      record.add(reusableToken.content.toString());
	    } else if (type == TT_EOF) {
	      if (reusableToken.isReady) {
	        // last value of an input that has no trailing line terminator
	        record.add(reusableToken.content.toString());
	      } else {
	        line = null;
	      }
	    } else {
	      // TT_INVALID or an unknown type
	      throw new IOException("(line " + getLineNumber() + ") invalid parse sequence");
	    }
	    // only a plain token means more values follow in this record
	    recordFinished = type != TT_TOKEN;
	  }
	  if (record.isEmpty()) {
	    return line;
	  }
	  return (String[]) record.toArray(new String[record.size()]);
	}

	/**
	* Returns the current line number in the input stream, as reported by the
	* underlying ExtendedBufferedReader.
	*
	* ATTENTION: in case your csv has multiline-values the returned
	* number does not correspond to the record-number
	*
	* @return current line number
	*/
	public int getLineNumber() {
	return in.getLineNumber();
	}

	// ======================================================
	// the lexer(s)
	// ======================================================

	/**
	* Convenience method for {@link #nextToken(Token)} called with a newly
	* allocated Token.
	*
	* @return the next token found
	* @throws IOException on stream access error
	*/
	protected Token nextToken() throws IOException {
	return nextToken(new Token());
	}

	/**
	* Returns the next token.
	*
	* A token corresponds to a term, a record change or an
	* end-of-file indicator.
	*
	* @param tkn an existing Token object to reuse. The caller is responsible to initialize the
	* Token.
	* @return the next token found
	* @throws IOException on stream access error
	*/
	protected Token nextToken(Token tkn) throws IOException {
	wsBuf.clear(); // resuse

	// get the last read char (required for empty line detection)
	int lastChar = in.readAgain();

	// read the next char and set eol
	/* note: unfortunately isEndOfLine may consumes a character silently.
	* this has no effect outside of the method. so a simple workaround
	* is to call 'readAgain' on the stream...
	* uh: might using objects instead of base-types (jdk1.5 autoboxing!)
	*/
	int c = in.read();
	boolean eol = isEndOfLine(c);
	c = in.readAgain();

	// empty line detection: eol AND (last char was EOL or beginning)
	while (strategy.getIgnoreEmptyLines() && eol
	&& (lastChar == '\n'
	\|\| lastChar == ExtendedBufferedReader.UNDEFINED)
	&& !isEndOfFile(lastChar)) {
	// go on char ahead ...
	lastChar = c;
	c = in.read();
	eol = isEndOfLine(c);
	c = in.readAgain();
	// reached end of file without any content (empty line at the end)
	if (isEndOfFile(c)) {
	tkn.type = TT_EOF;
	return tkn;
	}
	}

	// did we reached eof during the last iteration already ? TT_EOF
	if (isEndOfFile(lastChar) \|\| (lastChar != strategy.getDelimiter() && isEndOfFile(c))) {
	tkn.type = TT_EOF;
	return tkn;
	}

	// important: make sure a new char gets consumed in each iteration
	while (!tkn.isReady && tkn.type != TT_EOF) {
	// ignore whitespaces at beginning of a token
	while (strategy.getIgnoreLeadingWhitespaces() && isWhitespace(c) && !eol) {
	wsBuf.append((char) c);
	c = in.read();
	eol = isEndOfLine(c);
	}
	// ok, start of token reached: comment, encapsulated, or token
	if (c == strategy.getCommentStart()) {
	// ignore everything till end of line and continue (incr linecount)
	in.readLine();
	tkn = nextToken(tkn.reset());
	} else if (c == strategy.getDelimiter()) {
	// empty token return TT_TOKEN("")
	tkn.type = TT_TOKEN;
	tkn.isReady = true;
	} else if (eol) {
	// empty token return TT_EORECORD("")
	//noop: tkn.content.append("");
	tkn.type = TT_EORECORD;
	tkn.isReady = true;
	} else if (c == strategy.getEncapsulator()) {
	// consume encapsulated token
	encapsulatedTokenLexer(tkn, c);
	} else if (isEndOfFile(c)) {
	// end of file return TT_EOF()
	//noop: tkn.content.append("");
	tkn.type = TT_EOF;
	tkn.isReady = true;
	} else {
	// next token must be a simple token
	// add removed blanks when not ignoring whitespace chars...
	if (!strategy.getIgnoreLeadingWhitespaces()) {
	tkn.content.append(wsBuf);
	}
	simpleTokenLexer(tkn, c);
	}
	}
	return tkn;
	}

	/**
	 * Lexer for simple tokens, i.e. tokens that are not surrounded by encapsulators.
	 *
	 * Characters are collected into the token (escape sequences are decoded on the
	 * way) until one of the following ends the token:
	 * <ul>
	 * <li>end of line (token type TT_EORECORD)</li>
	 * <li>end of stream (token type TT_EOF)</li>
	 * <li>an unescaped delimiter (token type TT_TOKEN)</li>
	 * </ul>
	 * Trailing whitespace is trimmed afterwards when the strategy asks for it.
	 *
	 * @param tkn the current token
	 * @param c the current character
	 * @return the filled token
	 *
	 * @throws IOException on stream access error
	 */
	private Token simpleTokenLexer(Token tkn, int c) throws IOException {
	  boolean finished = false;
	  while (!finished) {
	    if (isEndOfLine(c)) {
	      tkn.type = TT_EORECORD;
	      finished = true;
	    } else if (isEndOfFile(c)) {
	      tkn.type = TT_EOF;
	      finished = true;
	    } else if (c == strategy.getDelimiter()) {
	      tkn.type = TT_TOKEN;
	      finished = true;
	    } else {
	      final int decoded;
	      if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead() == 'u') {
	        // backslash-u escape: decode the four hex digits into one char
	        decoded = unicodeEscapeLexer(c);
	      } else if (c == strategy.getEscape()) {
	        decoded = readEscape(c);
	      } else {
	        decoded = c;
	      }
	      tkn.content.append((char) decoded);
	      c = in.read();
	    }
	  }
	  // every way out of the loop produced a complete token
	  tkn.isReady = true;

	  if (strategy.getIgnoreTrailingWhitespaces()) {
	    tkn.content.trimTrailingWhitespace();
	  }
	  return tkn;
	}


	/**
	* An encapsulated token lexer
	*
	* Encapsulated tokens are surrounded by the strategy's encapsulator character.
	* The encapsulator itself might be included in the token using a
	* doubling syntax (as "", '') or using escaping (as in \", \').
	* Whitespace after the closing encapsulator is ignored; any other character
	* found before the next delimiter, line end or end of file is an error.
	*
	* @param tkn the current token
	* @param c the current character (the opening encapsulator; it is not
	* added to the token)
	* @return a valid token object
	* @throws IOException if the input ends inside the encapsulated token, if a
	* non-whitespace character follows the closing encapsulator, or on
	* stream access error
	*/
	private Token encapsulatedTokenLexer(Token tkn, int c) throws IOException {
	// remember the line the token starts on (used in the eof error message)
	int startLineNumber = getLineNumber();
	// c is the opening encapsulator: it is skipped, because each iteration
	// starts by reading the next char
	for (;;) {
	c = in.read();

	if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead()=='u') {
	tkn.content.append((char) unicodeEscapeLexer(c));
	} else if (c == strategy.getEscape()) {
	// checked before the encapsulator test, so an escaped encapsulator
	// ends up in the content instead of closing the token
	tkn.content.append((char)readEscape(c));
	} else if (c == strategy.getEncapsulator()) {
	if (in.lookAhead() == strategy.getEncapsulator()) {
	// doubled encapsulator -> add single encapsulator to token
	c = in.read();
	tkn.content.append((char) c);
	} else {
	// token finish mark (encapsulator) reached: ignore whitespace till delimiter
	for (;;) {
	c = in.read();
	if (c == strategy.getDelimiter()) {
	tkn.type = TT_TOKEN;
	tkn.isReady = true;
	return tkn;
	} else if (isEndOfFile(c)) {
	tkn.type = TT_EOF;
	tkn.isReady = true;
	return tkn;
	} else if (isEndOfLine(c)) {
	// ok, end of token reached
	tkn.type = TT_EORECORD;
	tkn.isReady = true;
	return tkn;
	} else if (!isWhitespace(c)) {
	// error invalid char between token and next delimiter
	throw new IOException(
	"(line " + getLineNumber()
	+ ") invalid char between encapsulated token end delimiter"
	);
	}
	}
	}
	} else if (isEndOfFile(c)) {
	// error condition (end of file before end of token)
	throw new IOException(
	"(startline " + startLineNumber + ")"
	+ "eof reached before encapsulated token finished"
	);
	} else {
	// consume character (line terminators included: values may span lines)
	tkn.content.append((char) c);
	}
	}
	}


	/**
	* Decodes Unicode escapes.
	*
	* Interpretation of "\\uXXXX" escape sequences
	* where XXXX is a hex-number.
	* @param c current char which is discarded because it's the "\\" of "\\uXXXX"
	* @return the decoded character
	* @throws IOException on wrong unicode escape sequence or read error
	*/
	protected int unicodeEscapeLexer(int c) throws IOException {
	int ret = 0;
	// ignore 'u' (assume c==\ now) and read 4 hex digits
	c = in.read();
	code.clear();
	try {
	for (int i = 0; i < 4; i++) {
	c = in.read();
	if (isEndOfFile(c) \|\| isEndOfLine(c)) {
	throw new NumberFormatException("number too short");
	}
	code.append((char) c);
	}
	ret = Integer.parseInt(code.toString(), 16);
	} catch (NumberFormatException e) {
	throw new IOException(
	"(line " + getLineNumber() + ") Wrong unicode escape sequence found '"
	+ code.toString() + "'" + e.toString());
	}
	return ret;
	}

	/**
	 * Decodes the character that follows an escape character.
	 *
	 * The letters r, n, t, b and f map to the corresponding control characters;
	 * every other character stands for itself.
	 *
	 * @param c the escape char itself (normally a backslash); it is discarded
	 * @return the decoded character
	 * @throws IOException on read error, or when the input ends directly after
	 *         the escape character
	 */
	private int readEscape(int c) throws IOException {
	  // assume c is the escape char (normally a backslash)
	  c = in.read();
	  if (isEndOfFile(c)) {
	    // without this check the end-of-stream marker would be cast to a char
	    // by the callers and appended to the token content
	    throw new IOException(
	      "(line " + getLineNumber() + ") EOF whilst processing escape sequence");
	  }
	  switch (c) {
	    case 'r': return '\r';
	    case 'n': return '\n';
	    case 't': return '\t';
	    case 'b': return '\b';
	    case 'f': return '\f';
	    default : return c;
	  }
	}

	// ======================================================
	// strategies
	// ======================================================

	/**
	* Obtain the specified CSV Strategy. This should not be modified.
	* The parser's own instance is returned, not a copy.
	*
	* @return strategy currently being used
	*/
	public CSVStrategy getStrategy() {
	return this.strategy;
	}

	// ======================================================
	// Character class checker
	// ======================================================

	/**
	 * Whitespace check that never treats the delimiter as whitespace
	 * (relevant e.g. for tab-separated input).
	 *
	 * @return true if the given char is a whitespace character other than the delimiter
	 */
	private boolean isWhitespace(int c) {
	  if (c == strategy.getDelimiter()) {
	    return false;
	  }
	  return Character.isWhitespace((char) c);
	}

	/**
	 * Greedy line-terminator check - accepts \n and \r\n.
	 * For \r\n the \n is silently consumed from the stream; the caller's
	 * variable is not changed by that.
	 *
	 * @return true if the given character is a line-terminator
	 */
	private boolean isEndOfLine(int c) throws IOException {
	  if (c == '\r' && in.lookAhead() == '\n') {
	    // swallow the second control-character of the pair
	    return in.read() == '\n';
	  }
	  return c == '\n';
	}

	/**
	* Compares the given value with ExtendedBufferedReader.END_OF_STREAM.
	*
	* @return true if the given character indicates end of file
	*/
	private boolean isEndOfFile(int c) {
	return c == ExtendedBufferedReader.END_OF_STREAM;
	}
	}