// src/java/org/apache/turbine/util/parser/DataStreamParser.java - turbine-core - Git at Google

 package org.apache.turbine.util.parser;

 /* ====================================================================
  * The Apache Software License, Version 1.1
  *
  * Copyright (c) 2001-2003 The Apache Software Foundation.  All rights
  * reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  *
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in
  *    the documentation and/or other materials provided with the
  *    distribution.
  *
  * 3. The end-user documentation included with the redistribution,
  *    if any, must include the following acknowledgment:
  *       "This product includes software developed by the
  *        Apache Software Foundation (http://www.apache.org/)."
  *    Alternately, this acknowledgment may appear in the software itself,
  *    if and wherever such third-party acknowledgments normally appear.
  *
  * 4. The names "Apache" and "Apache Software Foundation" and
  *    "Apache Turbine" must not be used to endorse or promote products
  *    derived from this software without prior written permission. For
  *    written permission, please contact apache@apache.org.
  *
  * 5. Products derived from this software may not be called "Apache",
  *    "Apache Turbine", nor may "Apache" appear in their name, without
  *    prior written permission of the Apache Software Foundation.
  *
  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * ====================================================================
  *
  * This software consists of voluntary contributions made by many
  * individuals on behalf of the Apache Software Foundation.  For more
  * information on the Apache Software Foundation, please see
  * <http://www.apache.org/>.
  */

 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.io.Reader;
 import java.io.StreamTokenizer;

 import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
 import java.util.NoSuchElementException;

 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;

 /**
  * DataStreamParser is used to parse a stream with a fixed format and
  * generate ValueParser objects which can be used to extract the values
  * in the desired type.
  *
  * <p>The class itself is abstract - a concrete subclass such as CSVParser
  * or TSVParser is required to use the functionality. Subclasses can
  * override {@link #initTokenizer(StreamTokenizer)} to adjust the tokenizing
  * rules; the character that separates fields is chosen with
  * {@link #setFieldSeparator(char)}.
  *
  * <p>Each input line is one row. Fields are matched to column names by
  * position: an empty field contributes no value for its column, and
  * values beyond the last column name are discarded.
  *
  * <p>The class implements the java.util.Iterator interface for convenience.
  * This allows simple use in a Velocity template for example:
  *
  * <pre>
  * #foreach ($row in $datastream)
  *   Name: $row.Name
  *   Description: $row.Description
  * #end
  * </pre>
  *
  * <p>Note that every row is delivered through the same ValueParser
  * instance, which is cleared and refilled on each call to
  * {@link #nextRow()}; copy the values if they must outlive the next call.
  *
  * @author <a href="mailto:sean@informage.net">Sean Legassick</a>
  * @author <a href="mailto:martin@mvdb.net">Martin van den Bemt</a>
  * @version $Id$
  */
 public abstract class DataStreamParser implements Iterator
 {
     /** Logging */
     private static final Log log = LogFactory.getLog(DataStreamParser.class);

     /**
      * Conditional compilation flag.
      */
     private static final boolean DEBUG = false;

     /**
      * The prefix used to make up a name for a column whose header field
      * is empty; the 1-based field number is appended to it.
      */
     protected static final String EMPTYFIELDNAME = "UNKNOWNFIELD";

     /**
      * The list of column names.
      */
     private List columnNames;

     /**
      * The stream tokenizer for reading values from the input reader.
      */
     private StreamTokenizer tokenizer;

     /**
      * The parameter parser holding the values of columns for the current
      * line. Created lazily and reused for every row.
      */
     private ValueParser lineValues;

     /**
      * Indicates whether or not the tokenizer has read anything yet.
      */
     private boolean neverRead = true;

     /**
      * The character encoding of the input
      */
     private String characterEncoding;

     /**
      * The field separator, which can be almost any char
      */
     private char fieldSeparator;

     /**
      * Create a new DataStreamParser instance. Requires a Reader to read the
      * separated values from, a list of column names and a
      * character encoding.
      *
      * <p>When no encoding is given, the encoding reported by the reader is
      * used if the reader is an InputStreamReader that can report one;
      * otherwise "US-ASCII" is assumed.
      *
      * <p>The constructor calls {@link #initTokenizer(StreamTokenizer)}, so
      * an overriding implementation runs before the subclass constructor
      * body has executed.
      *
      * @param in the input reader.
      * @param columnNames a list of column names.
      * @param characterEncoding the character encoding of the input.
      */
     public DataStreamParser(Reader in, List columnNames,
                             String characterEncoding)
     {
         this.columnNames = columnNames;
         this.characterEncoding = characterEncoding;

         if (this.characterEncoding == null)
         {
             // try and get the characterEncoding from the reader
             this.characterEncoding = "US-ASCII";
             if (in instanceof InputStreamReader)
             {
                 // getEncoding() yields null for a closed stream; keep the
                 // default in that case instead of storing null
                 String detected = ((InputStreamReader) in).getEncoding();
                 if (detected != null)
                 {
                     this.characterEncoding = detected;
                 }
             }
         }

         tokenizer = new StreamTokenizer(new BufferedReader(in));
         initTokenizer(tokenizer);
     }

     /**
      * Initialize the StreamTokenizer instance used to read the lines
      * from the input reader. Subclasses may override this method to
      * set up other tokenizing properties.
      *
      * @param tokenizer the tokenizer to adjust
      */
     protected void initTokenizer(StreamTokenizer tokenizer)
     {
         // set all numeric characters as ordinary characters
         // (switches off number parsing)
         tokenizer.ordinaryChars('0', '9');
         tokenizer.ordinaryChars('-', '-');
         tokenizer.ordinaryChars('.', '.');

         // the field separator is excluded from the word characters later,
         // by setFieldSeparator; it is needed to detect empty fields

         tokenizer.wordChars(' ', Integer.MAX_VALUE);

         // and  set the quote mark as the quoting character
         tokenizer.quoteChar('"');

         // and finally say that end of line is significant
         tokenizer.eolIsSignificant(true);
     }

     /**
      * This method must be called to set up the field separator.
      *
      * @param fieldSeparator the char which separates the fields
      */
     public void setFieldSeparator(char fieldSeparator)
     {
         this.fieldSeparator = fieldSeparator;
         // make the separator an ordinary char so that the tokenizer
         // reports it as a token of its own
         tokenizer.ordinaryChar(fieldSeparator);
     }

     /**
      * Set the list of column names explicitly.
      *
      * @param columnNames A list of column names.
      */
     public void setColumnNames(List columnNames)
     {
         this.columnNames = columnNames;
     }

     /**
      * Read the list of column names from the input reader using the
      * tokenizer. If a field name is empty (including a leading or a
      * trailing one), the EMPTYFIELDNAME followed by the current field
      * number is used to make one up.
      *
      * @exception IOException an IOException occurred.
      */
     public void readColumnNames()
             throws IOException
     {
         columnNames = new ArrayList();
         int fieldCounter = 1;
         // no token of the header line has been handled yet
         boolean atLineStart = true;
         // the previously handled token was a field separator
         boolean afterSeparator = false;

         neverRead = false;
         tokenizer.nextToken();
         while (isValueToken() || tokenizer.ttype == StreamTokenizer.TT_EOL
                 || tokenizer.ttype == fieldSeparator)
         {
             if (tokenizer.ttype == StreamTokenizer.TT_EOL)
             {
                 if (afterSeparator)
                 {
                     // the line ends with a separator: trailing empty name
                     columnNames.add(EMPTYFIELDNAME + fieldCounter);
                 }
                 break;
             }

             if (tokenizer.ttype == fieldSeparator)
             {
                 if (atLineStart || afterSeparator)
                 {
                     // nothing between line start / previous separator and
                     // this separator: we have an empty field name
                     columnNames.add(EMPTYFIELDNAME + fieldCounter);
                     fieldCounter++;
                 }
                 afterSeparator = true;
             }
             else
             {
                 columnNames.add(tokenizer.sval);
                 fieldCounter++;
                 afterSeparator = false;
             }
             atLineStart = false;
             tokenizer.nextToken();
         }
     }

     /**
      * Determine whether a further row of values exists in the input.
      *
      * @return true if the input has more rows.
      * @exception IOException an IOException occurred.
      */
     public boolean hasNextRow()
             throws IOException
     {
         // check for end of line ensures that an empty last line doesn't
         // give a false positive for hasNextRow
         if (neverRead || tokenizer.ttype == StreamTokenizer.TT_EOL)
         {
             // peek at the next token without consuming it
             tokenizer.nextToken();
             tokenizer.pushBack();
             neverRead = false;
         }
         return tokenizer.ttype != StreamTokenizer.TT_EOF;
     }

     /**
      * Returns a ValueParser object containing the next row of values.
      * The same ValueParser instance is reused for every row.
      *
      * <p>Values are assigned to column names by field position. Empty
      * fields add no value but still occupy their column, so later values
      * stay aligned with their column names. If there are more values than
      * column names, the extra values are discarded.
      *
      * @return a ValueParser object.
      * @exception IOException an IOException occurred.
      * @exception NoSuchElementException there are no more rows in the input.
      */
     public ValueParser nextRow()
             throws IOException, NoSuchElementException
     {
         if (!hasNextRow())
         {
             throw new NoSuchElementException();
         }

         if (lineValues == null)
         {
             lineValues = new BaseValueParser(characterEncoding);
         }
         else
         {
             lineValues.clear();
         }

         Iterator it = columnNames.iterator();
         // true while the current field position has not received a value;
         // the start of the row behaves like the position after a separator
         boolean fieldEmpty = true;

         tokenizer.nextToken();
         while (isValueToken() || tokenizer.ttype == fieldSeparator)
         {
             if (tokenizer.ttype == fieldSeparator)
             {
                 if (fieldEmpty && it.hasNext())
                 {
                     // the field closed by this separator was empty:
                     // skip its column so later values stay aligned
                     it.next();
                 }
                 fieldEmpty = true;
             }
             else
             {
                 if (it.hasNext())
                 {
                     String colname = it.next().toString();
                     String colval = tokenizer.sval;
                     if (DEBUG)
                     {
                         log.debug("DataStreamParser.nextRow(): " +
                                 colname + "=" + colval);
                     }
                     lineValues.add(colname, colval);
                 }
                 fieldEmpty = false;
             }
             tokenizer.nextToken();
         }

         return lineValues;
     }

     /**
      * Determine whether a further row of values exists in the input.
      * An IOException is logged and reported as "no further row".
      *
      * @return true if the input has more rows.
      */
     public boolean hasNext()
     {
         boolean hasNext = false;

         try
         {
             hasNext = hasNextRow();
         }
         catch (IOException e)
         {
             log.error("IOException in DataStreamParser.hasNext", e);
         }

         return hasNext;
     }

     /**
      * Returns a ValueParser object containing the next row of values.
      *
      * @return a ValueParser object as an Object.
      * @exception NoSuchElementException there are no more rows in the input
      *                                   or an IOException occurred (the
      *                                   IOException is attached as cause).
      */
     public Object next()
             throws NoSuchElementException
     {
         Object nextRow = null;

         try
         {
             nextRow = nextRow();
         }
         catch (IOException e)
         {
             log.error("IOException in DataStreamParser.next", e);
             NoSuchElementException failure =
                     new NoSuchElementException(e.getMessage());
             failure.initCause(e);
             throw failure;
         }

         return nextRow;
     }

     /**
      * The optional Iterator.remove method is not supported.
      *
      * @exception UnsupportedOperationException the operation is not supported.
      */
     public void remove()
             throws UnsupportedOperationException
     {
         throw new UnsupportedOperationException();
     }

     /**
      * Tells whether the tokenizer's current token carries a field value,
      * i.e. is a word or a quoted string.
      *
      * @return true if the current token is a word or quoted string.
      */
     private boolean isValueToken()
     {
         return tokenizer.ttype == StreamTokenizer.TT_WORD
                 || tokenizer.ttype == '"';
     }
 }
	package org.apache.turbine.util.parser;

	/* ====================================================================
	* The Apache Software License, Version 1.1
	*
	* Copyright (c) 2001-2003 The Apache Software Foundation. All rights
	* reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	*
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	*
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in
	* the documentation and/or other materials provided with the
	* distribution.
	*
	* 3. The end-user documentation included with the redistribution,
	* if any, must include the following acknowledgment:
	* "This product includes software developed by the
	* Apache Software Foundation (http://www.apache.org/)."
	* Alternately, this acknowledgment may appear in the software itself,
	* if and wherever such third-party acknowledgments normally appear.
	*
	* 4. The names "Apache" and "Apache Software Foundation" and
	* "Apache Turbine" must not be used to endorse or promote products
	* derived from this software without prior written permission. For
	* written permission, please contact apache@apache.org.
	*
	* 5. Products derived from this software may not be called "Apache",
	* "Apache Turbine", nor may "Apache" appear in their name, without
	* prior written permission of the Apache Software Foundation.
	*
	* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
	* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
	* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
	* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	* SUCH DAMAGE.
	* ====================================================================
	*
	* This software consists of voluntary contributions made by many
	* individuals on behalf of the Apache Software Foundation. For more
	* information on the Apache Software Foundation, please see
	* <http://www.apache.org/>.
	*/

	import java.io.BufferedReader;
	import java.io.IOException;
	import java.io.InputStreamReader;
	import java.io.Reader;
	import java.io.StreamTokenizer;

	import java.util.ArrayList;
	import java.util.Iterator;
	import java.util.List;
	import java.util.NoSuchElementException;

	import org.apache.commons.logging.Log;
	import org.apache.commons.logging.LogFactory;

	/**
	* DataStreamParser is used to parse a stream with a fixed format and
	* generate ValueParser objects which can be used to extract the values
	* in the desired type.
	*
	* <p>The class itself is abstract - a concrete subclass which implements
	* the initTokenizer method such as CSVParser or TSVParser is required
	* to use the functionality.
	*
	* <p>The class implements the java.util.Iterator interface for convenience.
	* This allows simple use in a Velocity template for example:
	*
	* <pre>
	* #foreach ($row in $datastream)
	* Name: $row.Name
	* Description: $row.Description
	* #end
	* </pre>
	*
	* @author <a href="mailto:sean@informage.net">Sean Legassick</a>
	* @author <a href="mailto:martin@mvdb.net">Martin van den Bemt</a>
	* @version $Id$
	*/
	public abstract class DataStreamParser implements Iterator
	{
	/** Logging */
	private static Log log = LogFactory.getLog(DataStreamParser.class);

	/**
	* Conditional compilation flag.
	*/
	private static final boolean DEBUG = false;

	/**
	* The constant for empty fields
	*/
	protected static final String EMPTYFIELDNAME = "UNKNOWNFIELD";

	/**
	* The list of column names.
	*/
	private List columnNames;

	/**
	* The stream tokenizer for reading values from the input reader.
	*/
	private StreamTokenizer tokenizer;

	/**
	* The parameter parser holding the values of columns for the current line.
	*/
	private ValueParser lineValues;

	/**
	* Indicates whether or not the tokenizer has read anything yet.
	*/
	private boolean neverRead = true;

	/**
	* The character encoding of the input
	*/
	private String characterEncoding;

	/**
	* The fieldseperator, which can be almost any char
	*/
	private char fieldSeparator;

	/**
	* Create a new DataStreamParser instance. Requires a Reader to read the
	* comma-separated values from, a list of column names and a
	* character encoding.
	*
	* @param in the input reader.
	* @param columnNames a list of column names.
	* @param characterEncoding the character encoding of the input.
	*/
	public DataStreamParser(Reader in, List columnNames,
	String characterEncoding)
	{
	this.columnNames = columnNames;
	this.characterEncoding = characterEncoding;

	if (this.characterEncoding == null)
	{
	// try and get the characterEncoding from the reader
	this.characterEncoding = "US-ASCII";
	try
	{
	this.characterEncoding = ((InputStreamReader) in).getEncoding();
	}
	catch (ClassCastException e)
	{
	}
	}

	tokenizer = new StreamTokenizer(new BufferedReader(in));
	initTokenizer(tokenizer);
	}

	/**
	* Initialize the StreamTokenizer instance used to read the lines
	* from the input reader. This must be implemented in subclasses to
	* set up other tokenizing properties.
	*
	* @param tokenizer the tokenizer to adjust
	*/
	protected void initTokenizer(StreamTokenizer tokenizer)
	{
	// set all numeric characters as ordinary characters
	// (switches off number parsing)
	tokenizer.ordinaryChars('0', '9');
	tokenizer.ordinaryChars('-', '-');
	tokenizer.ordinaryChars('.', '.');

	// leave out the comma sign (,), we need it for empty fields

	tokenizer.wordChars(' ', Integer.MAX_VALUE);

	// and set the quote mark as the quoting character
	tokenizer.quoteChar('"');

	// and finally say that end of line is significant
	tokenizer.eolIsSignificant(true);
	}

	/**
	* This method must be called to setup the field seperator
	* @param fieldSeparator the char which separates the fields
	*/
	public void setFieldSeparator(char fieldSeparator)
	{
	this.fieldSeparator = fieldSeparator;
	// make this field also an ordinary char by default.
	tokenizer.ordinaryChar(fieldSeparator);
	}

	/**
	* Set the list of column names explicitly.
	*
	* @param columnNames A list of column names.
	*/
	public void setColumnNames(List columnNames)
	{
	this.columnNames = columnNames;
	}

	/**
	* Read the list of column names from the input reader using the
	* tokenizer. If fieldNames are empty, we use the current fieldNumber
	* + the EMPTYFIELDNAME to make one up.
	*
	* @exception IOException an IOException occurred.
	*/
	public void readColumnNames()
	throws IOException
	{
	columnNames = new ArrayList();
	int lastTtype = 0;
	int fieldCounter = 1;

	neverRead = false;
	tokenizer.nextToken();
	while (tokenizer.ttype == StreamTokenizer.TT_WORD \|\| tokenizer.ttype == StreamTokenizer.TT_EOL
	\|\| tokenizer.ttype == '"' \|\| tokenizer.ttype == fieldSeparator)
	{
	if (tokenizer.ttype != fieldSeparator && tokenizer.ttype != StreamTokenizer.TT_EOL)
	{
	columnNames.add(tokenizer.sval);
	fieldCounter++;
	}
	else if (tokenizer.ttype == fieldSeparator && lastTtype == fieldSeparator)
	{
	// we have an empty field name
	columnNames.add(EMPTYFIELDNAME + fieldCounter);
	fieldCounter++;
	}
	else if (lastTtype == fieldSeparator && tokenizer.ttype == StreamTokenizer.TT_EOL)
	{
	columnNames.add(EMPTYFIELDNAME + fieldCounter);
	break;
	}
	else if (tokenizer.ttype == StreamTokenizer.TT_EOL)
	{
	break;
	}
	lastTtype = tokenizer.ttype;
	tokenizer.nextToken();
	}
	}

	/**
	* Determine whether a further row of values exists in the input.
	*
	* @return true if the input has more rows.
	* @exception IOException an IOException occurred.
	*/
	public boolean hasNextRow()
	throws IOException
	{
	// check for end of line ensures that an empty last line doesn't
	// give a false positive for hasNextRow
	if (neverRead \|\| tokenizer.ttype == StreamTokenizer.TT_EOL)
	{
	tokenizer.nextToken();
	tokenizer.pushBack();
	neverRead = false;
	}
	return tokenizer.ttype != StreamTokenizer.TT_EOF;
	}

	/**
	* Returns a ValueParser object containing the next row of values.
	*
	* @return a ValueParser object.
	* @exception IOException an IOException occurred.
	* @exception NoSuchElementException there are no more rows in the input.
	*/
	public ValueParser nextRow()
	throws IOException, NoSuchElementException
	{
	if (!hasNextRow())
	{
	throw new NoSuchElementException();
	}

	if (lineValues == null)
	{
	lineValues = new BaseValueParser(characterEncoding);
	}
	else
	{
	lineValues.clear();
	}

	Iterator it = columnNames.iterator();
	tokenizer.nextToken();
	while (tokenizer.ttype == StreamTokenizer.TT_WORD
	\|\| tokenizer.ttype == '"' \|\| tokenizer.ttype == fieldSeparator)
	{
	int lastTtype = 0;
	// note this means that if there are more values than
	// column names, the extra values are discarded.
	if (it.hasNext())
	{
	String colname = it.next().toString();
	String colval = tokenizer.sval;
	if (tokenizer.ttype != fieldSeparator && lastTtype != fieldSeparator)
	{
	if (DEBUG)
	{
	log.debug("DataStreamParser.nextRow(): " +
	colname + "=" + colval);
	}
	lineValues.add(colname, colval);
	}
	else if (tokenizer.ttype == fieldSeparator && lastTtype != fieldSeparator)
	{
	lastTtype = tokenizer.ttype;
	tokenizer.nextToken();
	if (tokenizer.ttype != fieldSeparator && tokenizer.sval != null)
	{
	lineValues.add(colname, tokenizer.sval);
	}
	else if (tokenizer.ttype == StreamTokenizer.TT_EOL)
	{
	tokenizer.pushBack();
	}
	}
	}
	tokenizer.nextToken();
	}

	return lineValues;
	}

	/**
	* Determine whether a further row of values exists in the input.
	*
	* @return true if the input has more rows.
	*/
	public boolean hasNext()
	{
	boolean hasNext = false;

	try
	{
	hasNext = hasNextRow();
	}
	catch (IOException e)
	{
	log.error("IOException in CSVParser.hasNext", e);
	}

	return hasNext;
	}

	/**
	* Returns a ValueParser object containing the next row of values.
	*
	* @return a ValueParser object as an Object.
	* @exception NoSuchElementException there are no more rows in the input
	* or an IOException occurred.
	*/
	public Object next()
	throws NoSuchElementException
	{
	Object nextRow = null;

	try
	{
	nextRow = nextRow();
	}
	catch (IOException e)
	{
	log.error("IOException in CSVParser.next", e);
	throw new NoSuchElementException();
	}

	return nextRow;
	}

	/**
	* The optional Iterator.remove method is not supported.
	*
	* @exception UnsupportedOperationException the operation is not supported.
	*/
	public void remove()
	throws UnsupportedOperationException
	{
	throw new UnsupportedOperationException();
	}
	}