| /* =============================================================================== * |
| * Copyright (C) 1999-2004, The Apache Software Foundation. All rights reserved. * |
| * * |
| * Licensed under the Apache License, Version 2.0 (the "License"). You may not use * |
| * this file except in compliance with the License. You may obtain a copy of the * |
| * License at <http://www.apache.org/licenses/LICENSE-2.0>. * |
| * * |
| * Unless required by applicable law or agreed to in writing, software distributed * |
| * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR * |
| * CONDITIONS OF ANY KIND, either express or implied. See the License for the * |
| * specific language governing permissions and limitations under the License. * |
| * =============================================================================== */ |
| package org.apache.cocoon.generation; |
| |
| import java.io.BufferedReader; |
| import java.io.ByteArrayInputStream; |
| import java.io.CharArrayWriter; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.InputStreamReader; |
| import java.io.Reader; |
| import java.io.Serializable; |
| import java.util.HashMap; |
| import java.util.Map; |
| |
| import org.apache.avalon.framework.parameters.Parameters; |
| import org.apache.cocoon.ProcessingException; |
| import org.apache.cocoon.environment.SourceResolver; |
| import org.apache.excalibur.source.Source; |
| import org.xml.sax.Attributes; |
| import org.xml.sax.Locator; |
| import org.xml.sax.SAXException; |
| import org.xml.sax.helpers.AttributesImpl; |
| |
| /** |
| * <p>A simple parser converting a Comma Separated Values (CSV) file into XML.</p> |
| * |
| * <p>This parser is controlled by the following sitemap parameters:</p> |
| * |
| * <ul> |
| * <li> |
| * <b>process-headers</b>: whether the first line in the CSV is considered |
| * to be the header defining column names (the resulting output will be |
| * different if this is <i>true</i> or <i>false</i> (default: <i>false</i>). |
| * </li> |
| * <li> |
| * <b>encoding</b>: the character encoding (UTF-8, ISO8859-1, ...) used to |
| * interpret the input CSV source file (default: <i>system default</i>). |
| * </li> |
| * <li> |
| * <b>separator</b>: the field-separator character in the CSV file (comma, |
| * tab, ...) (default: <i>,</i> <small>comma</small>). |
| * </li> |
| * <li> |
| * <b>escape</b>: the character used to escape fields, or part of them, in |
| * the CSV file (default: <i>"</i> <small>quote</small>). |
| * </li> |
| * <li> |
| * <b>buffer-size</b>: the size of the buffer used for reading the source |
| * CSV file (default: <i>4096 bytes</i>). |
| * </li> |
| * </ul> |
| * |
| * <p>The generated output will look something like the following:</p> |
| * |
| * <pre> |
| * <?xml version="1.0" encoding="ISO-8859-1"?> |
| * <csv:document xmlns:csv="http://apache.org/cocoon/csv/1.0"> |
| * <csv:header> |
| * <csv:column number="1">Column A</csv:column> |
| * <csv:column number="2">Column B</csv:column> |
| * <csv:column number="3">Column C</csv:column> |
| * </csv:header> |
| * <csv:record number="1"> |
| * <csv:field number="1" column="Column A">Field A1</csv:field> |
| * <csv:field number="2" column="Column B">Field B1</csv:field> |
| * <csv:field number="3" column="Column C">Field C1</csv:field> |
| * </csv:record> |
| * <csv:record number="2"> |
| * <csv:field number="1" column="Column A">Field A2</csv:field> |
| * <csv:field number="2" column="Column B">Field B2</csv:field> |
| * <csv:field number="3" column="Column C">Field C2</csv:field> |
| * </csv:record> |
| * </csv:document> |
| * </pre> |
| * |
| * <p>Note that this generator has been thoroughly tested with CSV files generated |
| * by <a href="http://office.microsoft.com/" target="_new">Microsoft Excel</a>. |
| * Unfortunately no official CSV specification has ever been published by |
| * any standard body, so the interpretation of the format might be slightly |
| * different in cases.</p> |
| * |
| */ |
| public class CSVGenerator extends FileGenerator { |
| |
| /** <p>The namespace URI of XML generated by this instance.</p> */ |
| public static final String NAMESPACE_URI = "http://apache.org/cocoon/csv/1.0"; |
| /** <p>The namespace prefix of XML generated by this instance.</p> */ |
| public static final String NAMESPACE_PREFIX = "csv"; |
| |
| /** <p>The default encoding configured in the Java VM.</p> */ |
| private static final String DEFAULT_ENCODING = |
| new InputStreamReader(new ByteArrayInputStream(new byte[0])).getEncoding(); |
| /** <p>The default field separator character.</p> */ |
| private static final String DEFAULT_SEPARATOR = ","; |
| /** <p>The default field separator character.</p> */ |
| private static final String DEFAULT_ESCAPE = "\""; |
| /** <p>The default field separator character.</p> */ |
| private static final int DEFAULT_BUFFER_SIZE = 4096; |
| /** <p>A string used for indenting.</p> */ |
| private static final char INDENT_STRING[] = "\n ".toCharArray(); |
| |
| /** <p>The encoding used to read the CSV resource from a stream.</p> */ |
| private String encoding = DEFAULT_ENCODING; |
| /** <p>The character used to separate fields.</p> */ |
| private char separator = DEFAULT_SEPARATOR.charAt(0); |
| /** <p>The character used to initiate and terminate esacaped sequences.</p> */ |
| private char escape = DEFAULT_ESCAPE.charAt(0); |
| /** <p>The size of the buffer used to read the input.</p> */ |
| private int buffersize = DEFAULT_BUFFER_SIZE; |
| /** <p>The current field (column) number in the current record.</p> */ |
| private int fieldnumber = 1; |
| /** <p>The current record (line) number in the current CSV.</p> */ |
| private int recordnumber = 1; |
| /** <p>A flag indicating whether the <record> tag was opened.</p> */ |
| private boolean openrecord = false; |
| /** <p>The character buffer for the current field.</p> */ |
| private CharArrayWriter buffer = null; |
| /** <p>A map of all known columns or null if no headers are processed.</p> */ |
| private Map columns = null; |
| |
| /** |
| * <p>Create a new {@link CSVGenerator} instance.</p> |
| */ |
| public CSVGenerator() { |
| super(); |
| } |
| |
| /** |
| * <p>Recycle this component.</p>. |
| */ |
| public void recycle() { |
| super.recycle(); |
| |
| this.encoding = DEFAULT_ENCODING; |
| this.separator = DEFAULT_SEPARATOR.charAt(0); |
| this.escape = DEFAULT_ESCAPE.charAt(0); |
| this.buffersize = DEFAULT_BUFFER_SIZE; |
| this.buffer = null; |
| this.columns = null; |
| this.recordnumber = 1; |
| this.fieldnumber = 1; |
| this.openrecord = false; |
| } |
| |
| /** |
| * <p>Setup this {@link CSVGenerator} instance.</p> |
| */ |
| public void setup(SourceResolver resolver, Map object_model, String source, |
| Parameters parameters) |
| throws ProcessingException, SAXException, IOException { |
| super.setup(resolver, object_model, source, parameters); |
| |
| boolean header = parameters.getParameterAsBoolean("process-header", false); |
| |
| this.encoding = parameters.getParameter("encoding", DEFAULT_ENCODING); |
| this.separator = parameters.getParameter("separator", DEFAULT_SEPARATOR).charAt(0); |
| this.escape = parameters.getParameter("escape", DEFAULT_ESCAPE).charAt(0); |
| this.buffersize = parameters.getParameterAsInteger("buffer-size", DEFAULT_BUFFER_SIZE); |
| this.buffer = new CharArrayWriter(); |
| this.columns = (header ? new HashMap() : null); |
| this.recordnumber = (header ? 0 : 1); |
| this.fieldnumber = 1; |
| this.openrecord = false; |
| } |
| |
| /** |
| * <p>Generate the unique key.</p> |
| */ |
| public Serializable getKey() { |
| String key = this.inputSource.getURI(); |
| if (this.columns != null) return (key + "+headers"); |
| return key; |
| } |
| |
| /** |
| * <p>Generate XML data from a Comma Separated Value resource.</p>. |
| */ |
| public void generate() |
| throws IOException, SAXException, ProcessingException { |
| |
| /* Create a new Reader correctly decoding the source stream */ |
| CSVReader csv = new CSVReader(this.inputSource, this.encoding, this.buffersize); |
| |
| try { |
| /* Start the document */ |
| this.contentHandler.setDocumentLocator(csv); |
| this.contentHandler.startDocument(); |
| this.contentHandler.startPrefixMapping(NAMESPACE_PREFIX, NAMESPACE_URI); |
| this.indent(0); |
| this.startElement("document"); |
| |
| /* Allocate buffer and status for parsing */ |
| boolean unescaped = true; |
| int prev = -1; |
| int curr = -1; |
| |
| /* Parse the file reading characters one-by-one */ |
| while ((curr = csv.read()) >= 0) { |
| |
| /* Process any occurrence of the escape character */ |
| if (curr == this.escape) { |
| if ((unescaped) && (prev == this.escape)) { |
| this.buffer.write(this.escape); |
| } |
| unescaped = ! unescaped; |
| prev = curr; |
| continue; |
| } |
| |
| /* Process any occurrence of the field separator */ |
| if ((unescaped) && (curr == this.separator)) { |
| this.dumpField(); |
| prev = curr; |
| continue; |
| } |
| |
| /* Process newline characters */ |
| if ((unescaped) && ((curr == '\r') || (curr == '\n'))) { |
| this.dumpField(); |
| this.dumpRecord(); |
| |
| /* Record numbering */ |
| if (((curr == '\n') && (prev != '\r')) || (curr == '\r')) { |
| this.recordnumber ++; |
| } |
| |
| /* Nothing else to do */ |
| prev = curr; |
| continue; |
| } |
| |
| /* Any other character simply gets added to the buffer */ |
| this.buffer.write(curr); |
| prev = curr; |
| } |
| |
| /* Terminate any hanging open record element (just in case) */ |
| this.dumpField(); |
| this.dumpRecord(); |
| |
| /* Terminate the document */ |
| this.indent(0); |
| this.endElement("document"); |
| this.contentHandler.endPrefixMapping(NAMESPACE_PREFIX); |
| this.contentHandler.endDocument(); |
| |
| } finally { |
| csv.close(); |
| } |
| } |
| |
| |
| private void dumpField() |
| throws SAXException { |
| if (this.buffer.size() < 1) { |
| this.fieldnumber ++; |
| return; |
| } |
| |
| if (! this.openrecord) { |
| this.indent(4); |
| |
| if (this.recordnumber > 0) { |
| AttributesImpl attributes = new AttributesImpl(); |
| String value = Integer.toString(this.recordnumber); |
| attributes.addAttribute("", "number", "number", "CDATA", value); |
| this.startElement("record", attributes); |
| } else { |
| this.startElement("header"); |
| } |
| this.openrecord = true; |
| } |
| |
| /* Enclode the field in the proper element */ |
| String element = "field"; |
| char array[] = this.buffer.toCharArray(); |
| this.indent(8); |
| |
| AttributesImpl attributes = new AttributesImpl(); |
| String value = Integer.toString(this.fieldnumber); |
| attributes.addAttribute("", "number", "number", "CDATA", value); |
| |
| if (this.recordnumber < 1) { |
| this.columns.put(new Integer(this.fieldnumber), new String(array)); |
| element = "column"; |
| } else if (this.columns != null) { |
| String header = (String) this.columns.get(new Integer(this.fieldnumber)); |
| if (header != null) { |
| attributes.addAttribute("", "column", "column", "CDATA", header); |
| } |
| } |
| |
| this.startElement(element, attributes); |
| this.contentHandler.characters(array, 0, array.length); |
| this.endElement(element); |
| this.buffer.reset(); |
| |
| this.fieldnumber ++; |
| } |
| |
| private void dumpRecord() |
| throws SAXException { |
| if (this.openrecord) { |
| this.indent(4); |
| if (this.recordnumber > 0) { |
| this.endElement("record"); |
| } else { |
| this.endElement("header"); |
| } |
| this.openrecord = false; |
| } |
| this.fieldnumber = 1; |
| } |
| |
| private void indent(int level) |
| throws SAXException { |
| this.contentHandler.characters(INDENT_STRING, 0, level + 1); |
| } |
| |
| private void startElement(String name) |
| throws SAXException { |
| this.startElement(name, new AttributesImpl()); |
| } |
| |
| private void startElement(String name, Attributes atts) |
| throws SAXException { |
| if (name == null) throw new NullPointerException("Null name"); |
| if (atts == null) atts = new AttributesImpl(); |
| String qual = NAMESPACE_PREFIX + ':' + name; |
| this.contentHandler.startElement(NAMESPACE_URI, name, qual, atts); |
| } |
| |
| private void endElement(String name) |
| throws SAXException { |
| String qual = NAMESPACE_PREFIX + ':' + name; |
| this.contentHandler.endElement(NAMESPACE_URI, name, qual); |
| } |
| |
| private static final class CSVReader extends Reader implements Locator { |
| |
| private String uri = null; |
| private Reader input = null; |
| private int column = 1; |
| private int line = 1; |
| private int last = -1; |
| |
| private CSVReader(Source source, String encoding, int buffer) |
| throws IOException { |
| InputStream stream = source.getInputStream(); |
| Reader reader = new InputStreamReader(stream, encoding); |
| this.input = new BufferedReader(reader, buffer); |
| this.uri = source.getURI(); |
| } |
| |
| public String getPublicId() { |
| return null; |
| } |
| |
| public String getSystemId() { |
| return this.uri; |
| } |
| |
| public int getLineNumber() { |
| return this.line; |
| } |
| |
| public int getColumnNumber() { |
| return this.column; |
| } |
| |
| public void close() |
| throws IOException { |
| this.input.close(); |
| } |
| |
| public int read() |
| throws IOException { |
| int c = this.input.read(); |
| if (c < 0) return c; |
| |
| if (((c == '\n') && (this.last != '\r')) || (c == '\r')) { |
| this.column = 1; |
| this.line ++; |
| } |
| |
| this.last = c; |
| return c; |
| } |
| |
| public int read(char b[], int o, int l) |
| throws IOException { |
| if (b == null) throw new NullPointerException(); |
| if ((o<0)||(o>b.length)||(l<0)||((o+l)>b.length)||((o+l)<0)) { |
| throw new IndexOutOfBoundsException(); |
| } |
| if (l == 0) return 0; |
| |
| int c = read(); |
| if (c == -1) return -1; |
| b[o] = (char)c; |
| |
| int i = 1; |
| try { |
| for (i = 1; i < l ; i++) { |
| c = read(); |
| if (c == -1) break; |
| if (b != null) b[o + i] = (char)c; |
| } |
| } catch (IOException ee) { |
| return i; |
| } |
| return i; |
| } |
| } |
| } |