blob: e8ffde9bacabf2074b4e10a3ee70d566b5c9aede [file] [log] [blame]
package org.apache.turbine.util.parser;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001-2003 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Turbine" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Turbine", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StreamTokenizer;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
 * DataStreamParser is used to parse a stream with a fixed format and
 * generate ValueParser objects which can be used to extract the values
 * in the desired type.
 *
 * <p>The class itself is abstract - a concrete subclass (such as
 * CSVParser or TSVParser) is required to use the functionality. A
 * subclass may override {@link #initTokenizer(StreamTokenizer)} to adjust
 * the tokenizing rules, and the field separator has to be announced
 * through {@link #setFieldSeparator(char)}.
 *
 * <p>The class implements the java.util.Iterator interface for convenience.
 * This allows simple use in a Velocity template for example:
 *
 * <pre>
 * #foreach ($row in $datastream)
 *   Name: $row.Name
 *   Description: $row.Description
 * #end
 * </pre>
 *
 * <p>Note that one ValueParser instance is reused for every row: it is
 * cleared and refilled on each call to {@link #nextRow()}, so a row must
 * be consumed before the next one is requested.
 *
 * @author <a href="mailto:sean@informage.net">Sean Legassick</a>
 * @author <a href="mailto:martin@mvdb.net">Martin van den Bemt</a>
 * @version $Id$
 */
public abstract class DataStreamParser implements Iterator
{
    /** Logging */
    private static final Log log = LogFactory.getLog(DataStreamParser.class);

    /**
     * Conditional compilation flag.
     */
    private static final boolean DEBUG = false;

    /**
     * The prefix of the names generated for columns whose header field
     * is empty; the 1-based field number is appended to it.
     */
    protected static final String EMPTYFIELDNAME = "UNKNOWNFIELD";

    /**
     * The list of column names.
     */
    private List columnNames;

    /**
     * The stream tokenizer for reading values from the input reader.
     */
    private StreamTokenizer tokenizer;

    /**
     * The parameter parser holding the values of columns for the current line.
     */
    private ValueParser lineValues;

    /**
     * Indicates whether or not the tokenizer has read anything yet.
     */
    private boolean neverRead = true;

    /**
     * The character encoding of the input
     */
    private String characterEncoding;

    /**
     * The field separator, which can be almost any char
     */
    private char fieldSeparator;

    /**
     * Create a new DataStreamParser instance. Requires a Reader to read the
     * separated values from, a list of column names and a
     * character encoding.
     *
     * <p>If no encoding is given, the encoding reported by the reader is
     * used when the reader is an InputStreamReader; otherwise "US-ASCII"
     * is assumed.
     *
     * @param in the input reader.
     * @param columnNames a list of column names; may be supplied later via
     *        {@link #setColumnNames(List)} or {@link #readColumnNames()}.
     * @param characterEncoding the character encoding of the input.
     */
    public DataStreamParser(Reader in, List columnNames,
                            String characterEncoding)
    {
        this.columnNames = columnNames;
        this.characterEncoding = characterEncoding;

        if (this.characterEncoding == null)
        {
            this.characterEncoding = "US-ASCII";

            // try and get the characterEncoding from the reader
            if (in instanceof InputStreamReader)
            {
                // getEncoding() yields null once the stream has been
                // closed; keep the default in that case
                String readerEncoding = ((InputStreamReader) in).getEncoding();
                if (readerEncoding != null)
                {
                    this.characterEncoding = readerEncoding;
                }
            }
        }

        tokenizer = new StreamTokenizer(new BufferedReader(in));
        initTokenizer(tokenizer);
    }

    /**
     * Initialize the StreamTokenizer instance used to read the lines
     * from the input reader. Subclasses may override this to set up
     * other tokenizing properties.
     *
     * <p>The field separator is not handled here; it becomes an ordinary
     * character in {@link #setFieldSeparator(char)}. That call has to
     * happen after this method has run, because the wordChars range
     * below would otherwise turn a printable separator back into a
     * word character.
     *
     * @param tokenizer the tokenizer to adjust
     */
    protected void initTokenizer(StreamTokenizer tokenizer)
    {
        // set all numeric characters as ordinary characters
        // (switches off number parsing)
        tokenizer.ordinaryChars('0', '9');
        tokenizer.ordinaryChars('-', '-');
        tokenizer.ordinaryChars('.', '.');

        // everything from the space character upwards is part of a word
        tokenizer.wordChars(' ', Integer.MAX_VALUE);

        // and set the quote mark as the quoting character
        tokenizer.quoteChar('"');

        // and finally say that end of line is significant
        tokenizer.eolIsSignificant(true);
    }

    /**
     * This method must be called to set up the field separator.
     *
     * @param fieldSeparator the char which separates the fields
     */
    public void setFieldSeparator(char fieldSeparator)
    {
        this.fieldSeparator = fieldSeparator;
        // make this field also an ordinary char by default.
        tokenizer.ordinaryChar(fieldSeparator);
    }

    /**
     * Set the list of column names explicitly.
     *
     * @param columnNames A list of column names.
     */
    public void setColumnNames(List columnNames)
    {
        this.columnNames = columnNames;
    }

    /**
     * Read the list of column names from the input reader using the
     * tokenizer. If a field name is empty, a name is made up from
     * EMPTYFIELDNAME followed by the 1-based number of the field. This
     * covers empty fields at the start of the line, between two
     * separators and between the last separator and the end of the line.
     *
     * @exception IOException an IOException occurred.
     */
    public void readColumnNames()
        throws IOException
    {
        columnNames = new ArrayList();
        neverRead = false;

        int fieldCounter = 1;
        // true at the start of the line and directly after a separator,
        // i.e. whenever another separator would close an empty field
        boolean fieldOpen = true;
        // true only if the previous token was a separator
        boolean afterSeparator = false;

        tokenizer.nextToken();
        while (isValueToken(tokenizer.ttype)
               || tokenizer.ttype == fieldSeparator)
        {
            if (tokenizer.ttype == fieldSeparator)
            {
                if (fieldOpen)
                {
                    // we have an empty field name
                    columnNames.add(EMPTYFIELDNAME + fieldCounter);
                    fieldCounter++;
                }
                fieldOpen = true;
                afterSeparator = true;
            }
            else
            {
                columnNames.add(tokenizer.sval);
                fieldCounter++;
                fieldOpen = false;
                afterSeparator = false;
            }
            tokenizer.nextToken();
        }

        // a separator directly before the end of line leaves one more
        // (empty) field behind it
        if (afterSeparator && tokenizer.ttype == StreamTokenizer.TT_EOL)
        {
            columnNames.add(EMPTYFIELDNAME + fieldCounter);
        }
    }

    /**
     * Determine whether a further row of values exists in the input.
     *
     * @return true if the input has more rows.
     * @exception IOException an IOException occurred.
     */
    public boolean hasNextRow()
        throws IOException
    {
        // check for end of line ensures that an empty last line doesn't
        // give a false positive for hasNextRow
        if (neverRead || tokenizer.ttype == StreamTokenizer.TT_EOL)
        {
            // peek at the next token; pushBack() keeps ttype at the
            // peeked value, which is what gets tested below
            tokenizer.nextToken();
            tokenizer.pushBack();
            neverRead = false;
        }
        return tokenizer.ttype != StreamTokenizer.TT_EOF;
    }

    /**
     * Returns a ValueParser object containing the next row of values.
     * Values are assigned to the column names in order; an empty field
     * (at the start of the line, between two separators or at the end of
     * the line) adds no value for its column. The column names must have
     * been provided before this method is called.
     *
     * <p>The returned object is the same instance on every call; its
     * previous contents are cleared first.
     *
     * @return a ValueParser object.
     * @exception IOException an IOException occurred.
     * @exception NoSuchElementException there are no more rows in the input.
     */
    public ValueParser nextRow()
        throws IOException, NoSuchElementException
    {
        if (!hasNextRow())
        {
            throw new NoSuchElementException();
        }

        if (lineValues == null)
        {
            lineValues = new BaseValueParser(characterEncoding);
        }
        else
        {
            lineValues.clear();
        }

        Iterator it = columnNames.iterator();
        // true at the start of the row and directly after a separator,
        // i.e. whenever another separator would close an empty field
        boolean fieldOpen = true;

        tokenizer.nextToken();
        while (isValueToken(tokenizer.ttype)
               || tokenizer.ttype == fieldSeparator)
        {
            if (tokenizer.ttype == fieldSeparator)
            {
                if (fieldOpen && it.hasNext())
                {
                    // empty field: use up its column without a value
                    it.next();
                }
                fieldOpen = true;
            }
            else
            {
                // note this means that if there are more values than
                // column names, the extra values are discarded.
                if (it.hasNext())
                {
                    String colname = it.next().toString();
                    String colval = tokenizer.sval;
                    if (DEBUG)
                    {
                        log.debug("DataStreamParser.nextRow(): " +
                                  colname + "=" + colval);
                    }
                    lineValues.add(colname, colval);
                }
                fieldOpen = false;
            }
            tokenizer.nextToken();
        }

        return lineValues;
    }

    /**
     * Determine whether a further row of values exists in the input.
     * An IOException while reading is logged and reported as "no more
     * rows".
     *
     * @return true if the input has more rows.
     */
    public boolean hasNext()
    {
        boolean hasNext = false;

        try
        {
            hasNext = hasNextRow();
        }
        catch (IOException e)
        {
            log.error("IOException in DataStreamParser.hasNext", e);
        }

        return hasNext;
    }

    /**
     * Returns a ValueParser object containing the next row of values.
     *
     * @return a ValueParser object as an Object.
     * @exception NoSuchElementException there are no more rows in the input
     * or an IOException occurred (the IOException is attached as the cause).
     */
    public Object next()
        throws NoSuchElementException
    {
        try
        {
            return nextRow();
        }
        catch (IOException e)
        {
            log.error("IOException in DataStreamParser.next", e);

            // keep the original failure reachable for the caller
            NoSuchElementException failure =
                new NoSuchElementException(e.getMessage());
            failure.initCause(e);
            throw failure;
        }
    }

    /**
     * The optional Iterator.remove method is not supported.
     *
     * @exception UnsupportedOperationException the operation is not supported.
     */
    public void remove()
        throws UnsupportedOperationException
    {
        throw new UnsupportedOperationException();
    }

    /**
     * Tells whether a token type carries a field value, i.e. is either
     * a plain word or a quoted string.
     *
     * @param ttype the token type reported by the tokenizer
     * @return true if the token's sval holds a field value
     */
    private static boolean isValueToken(int ttype)
    {
        return ttype == StreamTokenizer.TT_WORD || ttype == '"';
    }
}