blob: c189dc4309cc3ba8c7f3073f3ddc479e0fe17e2d [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.csv;
import static org.apache.commons.csv.Token.Type.TOKEN;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.TreeMap;
/**
* Parses CSV files according to the specified format.
*
* Because CSV appears in many different dialects, the parser supports many formats by allowing the
* specification of a {@link CSVFormat}.
*
* The parser works record wise. It is not possible to go back, once a record has been parsed from the input stream.
*
* <h2>Creating instances</h2>
* <p>
* There are several static factory methods that can be used to create instances for various types of resources:
* </p>
* <ul>
* <li>{@link #parse(java.io.File, Charset, CSVFormat)}</li>
* <li>{@link #parse(String, CSVFormat)}</li>
* <li>{@link #parse(java.net.URL, java.nio.charset.Charset, CSVFormat)}</li>
* </ul>
* <p>
* Alternatively parsers can also be created by passing a {@link Reader} directly to the sole constructor.
*
* For those who like fluent APIs, parsers can be created using {@link CSVFormat#parse(java.io.Reader)} as a shortcut:
* </p>
* <pre>
* for(CSVRecord record : CSVFormat.EXCEL.parse(in)) {
* ...
* }
* </pre>
*
* <h2>Parsing record wise</h2>
* <p>
* To parse a CSV input from a file, you write:
* </p>
*
* <pre>
* File csvData = new File(&quot;/path/to/csv&quot;);
* CSVParser parser = CSVParser.parse(csvData, CSVFormat.RFC4180);
* for (CSVRecord csvRecord : parser) {
* ...
* }
* </pre>
*
* <p>
* This will read the parse the contents of the file using the
* <a href="http://tools.ietf.org/html/rfc4180" target="_blank">RFC 4180</a> format.
* </p>
*
* <p>
* To parse CSV input in a format like Excel, you write:
* </p>
*
* <pre>
* CSVParser parser = CSVParser.parse(csvData, CSVFormat.EXCEL);
* for (CSVRecord csvRecord : parser) {
* ...
* }
* </pre>
*
* <p>
* If the predefined formats don't match the format at hands, custom formats can be defined. More information about
* customising CSVFormats is available in {@link CSVFormat CSVFormat Javadoc}.
* </p>
*
* <h2>Parsing into memory</h2>
* <p>
* If parsing record wise is not desired, the contents of the input can be read completely into memory.
* </p>
*
* <pre>
* Reader in = new StringReader(&quot;a;b\nc;d&quot;);
* CSVParser parser = new CSVParser(in, CSVFormat.EXCEL);
* List&lt;CSVRecord&gt; list = parser.getRecords();
* </pre>
*
* <p>
* There are two constraints that have to be kept in mind:
* </p>
*
* <ol>
* <li>Parsing into memory starts at the current position of the parser. If you have already parsed records from
* the input, those records will not end up in the in memory representation of your CSV data.</li>
* <li>Parsing into memory may consume a lot of system resources depending on the input. For example if you're
* parsing a 150MB file of CSV data the contents will be read completely into memory.</li>
* </ol>
*
* <h2>Notes</h2>
* <p>
* Internal parser state is completely covered by the format and the reader-state.
* </p>
*
* @see <a href="package-summary.html">package documentation for more details</a>
*/
public final class CSVParser implements Iterable<CSVRecord>, Closeable {
class CSVRecordIterator implements Iterator<CSVRecord> {
private CSVRecord current;
private CSVRecord getNextRecord() {
try {
return CSVParser.this.nextRecord();
} catch (final IOException e) {
throw new IllegalStateException(
e.getClass().getSimpleName() + " reading next record: " + e.toString(), e);
}
}
@Override
public boolean hasNext() {
if (CSVParser.this.isClosed()) {
return false;
}
if (this.current == null) {
this.current = this.getNextRecord();
}
return this.current != null;
}
@Override
public CSVRecord next() {
if (CSVParser.this.isClosed()) {
throw new NoSuchElementException("CSVParser has been closed");
}
CSVRecord next = this.current;
this.current = null;
if (next == null) {
// hasNext() wasn't called before
next = this.getNextRecord();
if (next == null) {
throw new NoSuchElementException("No more CSV records available");
}
}
return next;
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
}
/**
* Creates a parser for the given {@link File}.
*
* @param file
* a CSV file. Must not be null.
* @param charset
* A Charset
* @param format
* the CSVFormat used for CSV parsing. Must not be null.
* @return a new parser
* @throws IllegalArgumentException
* If the parameters of the format are inconsistent or if either file or format are null.
* @throws IOException
* If an I/O error occurs
*/
@SuppressWarnings("resource")
public static CSVParser parse(final File file, final Charset charset, final CSVFormat format) throws IOException {
Assertions.notNull(file, "file");
Assertions.notNull(format, "format");
return new CSVParser(new InputStreamReader(new FileInputStream(file), charset), format);
}
/**
* Creates a CSV parser using the given {@link CSVFormat}.
*
* <p>
* If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
* unless you close the {@code reader}.
* </p>
*
* @param inputStream
* an InputStream containing CSV-formatted input. Must not be null.
* @param charset
* a Charset.
* @param format
* the CSVFormat used for CSV parsing. Must not be null.
* @return a new CSVParser configured with the given reader and format.
* @throws IllegalArgumentException
* If the parameters of the format are inconsistent or if either reader or format are null.
* @throws IOException
* If there is a problem reading the header or skipping the first record
* @since 1.5
*/
@SuppressWarnings("resource")
public static CSVParser parse(final InputStream inputStream, final Charset charset, final CSVFormat format)
throws IOException {
Assertions.notNull(inputStream, "inputStream");
Assertions.notNull(format, "format");
return parse(new InputStreamReader(inputStream, charset), format);
}
/**
* Creates a parser for the given {@link Path}.
*
* @param path
* a CSV file. Must not be null.
* @param charset
* A Charset
* @param format
* the CSVFormat used for CSV parsing. Must not be null.
* @return a new parser
* @throws IllegalArgumentException
* If the parameters of the format are inconsistent or if either file or format are null.
* @throws IOException
* If an I/O error occurs
* @since 1.5
*/
public static CSVParser parse(final Path path, final Charset charset, final CSVFormat format) throws IOException {
Assertions.notNull(path, "path");
Assertions.notNull(format, "format");
return parse(Files.newInputStream(path), charset, format);
}
/**
* Creates a CSV parser using the given {@link CSVFormat}
*
* <p>
* If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
* unless you close the {@code reader}.
* </p>
*
* @param reader
* a Reader containing CSV-formatted input. Must not be null.
* @param format
* the CSVFormat used for CSV parsing. Must not be null.
* @return a new CSVParser configured with the given reader and format.
* @throws IllegalArgumentException
* If the parameters of the format are inconsistent or if either reader or format are null.
* @throws IOException
* If there is a problem reading the header or skipping the first record
* @since 1.5
*/
public static CSVParser parse(final Reader reader, final CSVFormat format) throws IOException {
return new CSVParser(reader, format);
}
/**
* Creates a parser for the given {@link String}.
*
* @param string
* a CSV string. Must not be null.
* @param format
* the CSVFormat used for CSV parsing. Must not be null.
* @return a new parser
* @throws IllegalArgumentException
* If the parameters of the format are inconsistent or if either string or format are null.
* @throws IOException
* If an I/O error occurs
*/
public static CSVParser parse(final String string, final CSVFormat format) throws IOException {
Assertions.notNull(string, "string");
Assertions.notNull(format, "format");
return new CSVParser(new StringReader(string), format);
}
// the following objects are shared to reduce garbage
/**
* Creates a parser for the given URL.
*
* <p>
* If you do not read all records from the given {@code url}, you should call {@link #close()} on the parser, unless
* you close the {@code url}.
* </p>
*
* @param url
* a URL. Must not be null.
* @param charset
* the charset for the resource. Must not be null.
* @param format
* the CSVFormat used for CSV parsing. Must not be null.
* @return a new parser
* @throws IllegalArgumentException
* If the parameters of the format are inconsistent or if either url, charset or format are null.
* @throws IOException
* If an I/O error occurs
*/
public static CSVParser parse(final URL url, final Charset charset, final CSVFormat format) throws IOException {
Assertions.notNull(url, "url");
Assertions.notNull(charset, "charset");
Assertions.notNull(format, "format");
return new CSVParser(new InputStreamReader(url.openStream(), charset), format);
}
private final CSVFormat format;
/** A mapping of column names to column indices */
private final Map<String, Integer> headerMap;
/** The column order to avoid re-computing it. */
private final List<String> headerNames;
private final Lexer lexer;
private final CSVRecordIterator csvRecordIterator;
/** A record buffer for getRecord(). Grows as necessary and is reused. */
private final List<String> recordList = new ArrayList<>();
/**
* The next record number to assign.
*/
private long recordNumber;
/**
* Lexer offset when the parser does not start parsing at the beginning of the source. Usually used in combination
* with {@link #recordNumber}.
*/
private final long characterOffset;
private final Token reusableToken = new Token();
/**
* Customized CSV parser using the given {@link CSVFormat}
*
* <p>
* If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
* unless you close the {@code reader}.
* </p>
*
* @param reader
* a Reader containing CSV-formatted input. Must not be null.
* @param format
* the CSVFormat used for CSV parsing. Must not be null.
* @throws IllegalArgumentException
* If the parameters of the format are inconsistent or if either reader or format are null.
* @throws IOException
* If there is a problem reading the header or skipping the first record
*/
public CSVParser(final Reader reader, final CSVFormat format) throws IOException {
this(reader, format, 0, 1);
}
/**
* Customized CSV parser using the given {@link CSVFormat}
*
* <p>
* If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
* unless you close the {@code reader}.
* </p>
*
* @param reader
* a Reader containing CSV-formatted input. Must not be null.
* @param format
* the CSVFormat used for CSV parsing. Must not be null.
* @param characterOffset
* Lexer offset when the parser does not start parsing at the beginning of the source.
* @param recordNumber
* The next record number to assign
* @throws IllegalArgumentException
* If the parameters of the format are inconsistent or if either reader or format are null.
* @throws IOException
* If there is a problem reading the header or skipping the first record
* @since 1.1
*/
@SuppressWarnings("resource")
public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber)
throws IOException {
Assertions.notNull(reader, "reader");
Assertions.notNull(format, "format");
this.format = format;
this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
this.csvRecordIterator = new CSVRecordIterator();
final Headers headers = createHeaders();
this.headerMap = headers.headerMap;
this.headerNames = headers.headerNames;
this.characterOffset = characterOffset;
this.recordNumber = recordNumber - 1;
}
private void addRecordValue(final boolean lastRecord) {
final String input = this.reusableToken.content.toString();
final String inputClean = this.format.getTrim() ? input.trim() : input;
if (lastRecord && inputClean.isEmpty() && this.format.getTrailingDelimiter()) {
return;
}
final String nullString = this.format.getNullString();
this.recordList.add(inputClean.equals(nullString) ? null : inputClean);
}
/**
* Closes resources.
*
* @throws IOException
* If an I/O error occurs
*/
@Override
public void close() throws IOException {
if (this.lexer != null) {
this.lexer.close();
}
}
private Map<String, Integer> createEmptyHeaderMap() {
return this.format.getIgnoreHeaderCase() ?
new TreeMap<>(String.CASE_INSENSITIVE_ORDER) :
new LinkedHashMap<>();
}
/**
* Header information based on name and position.
*/
private static final class Headers {
/**
* Header column positions (0-based)
*/
final Map<String, Integer> headerMap;
/**
* Header names in column order
*/
final List<String> headerNames;
Headers(final Map<String, Integer> headerMap, final List<String> headerNames) {
this.headerMap = headerMap;
this.headerNames = headerNames;
}
}
/**
* Creates the name to index mapping if the format defines a header.
*
* @return null if the format has no header.
* @throws IOException if there is a problem reading the header or skipping the first record
*/
private Headers createHeaders() throws IOException {
Map<String, Integer> hdrMap = null;
List<String> headerNames = null;
final String[] formatHeader = this.format.getHeader();
if (formatHeader != null) {
hdrMap = createEmptyHeaderMap();
String[] headerRecord = null;
if (formatHeader.length == 0) {
// read the header from the first line of the file
final CSVRecord nextRecord = this.nextRecord();
if (nextRecord != null) {
headerRecord = nextRecord.values();
}
} else {
if (this.format.getSkipHeaderRecord()) {
this.nextRecord();
}
headerRecord = formatHeader;
}
// build the name to index mappings
if (headerRecord != null) {
for (int i = 0; i < headerRecord.length; i++) {
final String header = headerRecord[i];
final boolean emptyHeader = header == null || header.trim().isEmpty();
if (emptyHeader && !this.format.getAllowMissingColumnNames()) {
throw new IllegalArgumentException(
"A header name is missing in " + Arrays.toString(headerRecord));
}
// Note: This will always allow a duplicate header if the header is empty
final boolean containsHeader = header != null && hdrMap.containsKey(header);
if (containsHeader && !emptyHeader && !this.format.getAllowDuplicateHeaderNames()) {
throw new IllegalArgumentException(
String.format(
"The header contains a duplicate name: \"%s\" in %s. If this is valid then use CSVFormat.withAllowDuplicateHeaderNames().",
header, Arrays.toString(headerRecord)));
}
if (header != null) {
hdrMap.put(header, Integer.valueOf(i));
if (headerNames == null) {
headerNames = new ArrayList<>(headerRecord.length);
}
headerNames.add(header);
}
}
}
}
if (headerNames == null) {
headerNames = Collections.emptyList(); //immutable
} else {
headerNames = Collections.unmodifiableList(headerNames);
}
return new Headers(hdrMap, headerNames);
}
/**
* Returns the current line number in the input stream.
*
* <p>
* <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
* the record number.
* </p>
*
* @return current line number
*/
public long getCurrentLineNumber() {
return this.lexer.getCurrentLineNumber();
}
/**
* Gets the first end-of-line string encountered.
*
* @return the first end-of-line string
* @since 1.5
*/
public String getFirstEndOfLine() {
return lexer.getFirstEol();
}
/**
* Returns a copy of the header map.
* <p>
* The map keys are column names. The map values are 0-based indices.
* </p>
* <p>
* Note: The map can only provide a one-to-one mapping when the format did not
* contain null or duplicate column names.
* </p>
*
* @return a copy of the header map.
*/
public Map<String, Integer> getHeaderMap() {
if (this.headerMap == null) {
return null;
}
final Map<String, Integer> map = createEmptyHeaderMap();
map.putAll(this.headerMap);
return map;
}
/**
* Returns the header map.
*
* @return the header map.
*/
Map<String, Integer> getHeaderMapRaw() {
return this.headerMap;
}
/**
* Returns a read-only list of header names that iterates in column order.
* <p>
* Note: The list provides strings that can be used as keys in the header map.
* The list will not contain null column names if they were present in the input
* format.
* </p>
*
* @return read-only list of header names that iterates in column order.
* @see #getHeaderMap()
* @since 1.7
*/
public List<String> getHeaderNames() {
return headerNames;
}
/**
* Returns the current record number in the input stream.
*
* <p>
* <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
* the line number.
* </p>
*
* @return current record number
*/
public long getRecordNumber() {
return this.recordNumber;
}
/**
* Parses the CSV input according to the given format and returns the content as a list of
* {@link CSVRecord CSVRecords}.
*
* <p>
* The returned content starts at the current parse-position in the stream.
* </p>
*
* @return list of {@link CSVRecord CSVRecords}, may be empty
* @throws IOException
* on parse error or input read-failure
*/
public List<CSVRecord> getRecords() throws IOException {
CSVRecord rec;
final List<CSVRecord> records = new ArrayList<>();
while ((rec = this.nextRecord()) != null) {
records.add(rec);
}
return records;
}
/**
* Gets whether this parser is closed.
*
* @return whether this parser is closed.
*/
public boolean isClosed() {
return this.lexer.isClosed();
}
/**
* Returns an iterator on the records.
*
* <p>
* An {@link IOException} caught during the iteration are re-thrown as an
* {@link IllegalStateException}.
* </p>
* <p>
* If the parser is closed a call to {@link Iterator#next()} will throw a
* {@link NoSuchElementException}.
* </p>
*/
@Override
public Iterator<CSVRecord> iterator() {
return csvRecordIterator;
}
/**
* Parses the next record from the current point in the stream.
*
* @return the record as an array of values, or {@code null} if the end of the stream has been reached
* @throws IOException
* on parse error or input read-failure
*/
CSVRecord nextRecord() throws IOException {
CSVRecord result = null;
this.recordList.clear();
StringBuilder sb = null;
final long startCharPosition = lexer.getCharacterPosition() + this.characterOffset;
do {
this.reusableToken.reset();
this.lexer.nextToken(this.reusableToken);
switch (this.reusableToken.type) {
case TOKEN:
this.addRecordValue(false);
break;
case EORECORD:
this.addRecordValue(true);
break;
case EOF:
if (this.reusableToken.isReady) {
this.addRecordValue(true);
}
break;
case INVALID:
throw new IOException("(line " + this.getCurrentLineNumber() + ") invalid parse sequence");
case COMMENT: // Ignored currently
if (sb == null) { // first comment for this record
sb = new StringBuilder();
} else {
sb.append(Constants.LF);
}
sb.append(this.reusableToken.content);
this.reusableToken.type = TOKEN; // Read another token
break;
default:
throw new IllegalStateException("Unexpected Token type: " + this.reusableToken.type);
}
} while (this.reusableToken.type == TOKEN);
if (!this.recordList.isEmpty()) {
this.recordNumber++;
final String comment = sb == null ? null : sb.toString();
result = new CSVRecord(this, this.recordList.toArray(new String[this.recordList.size()]),
comment, this.recordNumber, startCharPosition);
}
return result;
}
}