| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.commons.csv; |
| |
| import static org.apache.commons.csv.Token.Type.TOKEN; |
| |
| import java.io.Closeable; |
| import java.io.File; |
| import java.io.FileInputStream; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.InputStreamReader; |
| import java.io.Reader; |
| import java.io.StringReader; |
| import java.net.URL; |
| import java.nio.charset.Charset; |
| import java.nio.file.Files; |
| import java.nio.file.Path; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Collections; |
| import java.util.Iterator; |
| import java.util.LinkedHashMap; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.NoSuchElementException; |
| import java.util.TreeMap; |
| |
| /** |
| * Parses CSV files according to the specified format. |
| * |
| * Because CSV appears in many different dialects, the parser supports many formats by allowing the |
| * specification of a {@link CSVFormat}. |
| * |
| * The parser works record wise. It is not possible to go back, once a record has been parsed from the input stream. |
| * |
| * <h2>Creating instances</h2> |
| * <p> |
| * There are several static factory methods that can be used to create instances for various types of resources: |
| * </p> |
| * <ul> |
| * <li>{@link #parse(java.io.File, Charset, CSVFormat)}</li> |
| * <li>{@link #parse(String, CSVFormat)}</li> |
| * <li>{@link #parse(java.net.URL, java.nio.charset.Charset, CSVFormat)}</li> |
| * </ul> |
| * <p> |
| * Alternatively parsers can also be created by passing a {@link Reader} directly to one of the constructors. |
| * |
| * For those who like fluent APIs, parsers can be created using {@link CSVFormat#parse(java.io.Reader)} as a shortcut: |
| * </p> |
| * <pre> |
| * for(CSVRecord record : CSVFormat.EXCEL.parse(in)) { |
| * ... |
| * } |
| * </pre> |
| * |
| * <h2>Parsing record wise</h2> |
| * <p> |
| * To parse a CSV input from a file, you write: |
| * </p> |
| * |
| * <pre> |
| * File csvData = new File("/path/to/csv"); |
| * CSVParser parser = CSVParser.parse(csvData, StandardCharsets.UTF_8, CSVFormat.RFC4180); |
| * for (CSVRecord csvRecord : parser) { |
| * ... |
| * } |
| * </pre> |
| * |
| * <p> |
| * This will read and parse the contents of the file using the |
| * <a href="http://tools.ietf.org/html/rfc4180" target="_blank">RFC 4180</a> format. |
| * </p> |
| * |
| * <p> |
| * To parse CSV input in a format like Excel, you write: |
| * </p> |
| * |
| * <pre> |
| * CSVParser parser = CSVParser.parse(csvData, StandardCharsets.UTF_8, CSVFormat.EXCEL); |
| * for (CSVRecord csvRecord : parser) { |
| * ... |
| * } |
| * </pre> |
| * |
| * <p> |
| * If the predefined formats don't match the format at hand, custom formats can be defined. More information about |
| * customising CSVFormats is available in {@link CSVFormat CSVFormat Javadoc}. |
| * </p> |
| * |
| * <h2>Parsing into memory</h2> |
| * <p> |
| * If parsing record wise is not desired, the contents of the input can be read completely into memory. |
| * </p> |
| * |
| * <pre> |
| * Reader in = new StringReader("a;b\nc;d"); |
| * CSVParser parser = new CSVParser(in, CSVFormat.EXCEL); |
| * List&lt;CSVRecord&gt; list = parser.getRecords(); |
| * </pre> |
| * |
| * <p> |
| * There are two constraints that have to be kept in mind: |
| * </p> |
| * |
| * <ol> |
| * <li>Parsing into memory starts at the current position of the parser. If you have already parsed records from |
| * the input, those records will not end up in the in memory representation of your CSV data.</li> |
| * <li>Parsing into memory may consume a lot of system resources depending on the input. For example if you're |
| * parsing a 150MB file of CSV data the contents will be read completely into memory.</li> |
| * </ol> |
| * |
| * <h2>Notes</h2> |
| * <p> |
| * Internal parser state is completely covered by the format and the reader-state. |
| * </p> |
| * |
| * @see <a href="package-summary.html">package documentation for more details</a> |
| */ |
| public final class CSVParser implements Iterable<CSVRecord>, Closeable { |
| |
| class CSVRecordIterator implements Iterator<CSVRecord> { |
| private CSVRecord current; |
| |
| private CSVRecord getNextRecord() { |
| try { |
| return CSVParser.this.nextRecord(); |
| } catch (final IOException e) { |
| throw new IllegalStateException( |
| e.getClass().getSimpleName() + " reading next record: " + e.toString(), e); |
| } |
| } |
| |
| @Override |
| public boolean hasNext() { |
| if (CSVParser.this.isClosed()) { |
| return false; |
| } |
| if (this.current == null) { |
| this.current = this.getNextRecord(); |
| } |
| |
| return this.current != null; |
| } |
| |
| @Override |
| public CSVRecord next() { |
| if (CSVParser.this.isClosed()) { |
| throw new NoSuchElementException("CSVParser has been closed"); |
| } |
| CSVRecord next = this.current; |
| this.current = null; |
| |
| if (next == null) { |
| // hasNext() wasn't called before |
| next = this.getNextRecord(); |
| if (next == null) { |
| throw new NoSuchElementException("No more CSV records available"); |
| } |
| } |
| |
| return next; |
| } |
| |
| @Override |
| public void remove() { |
| throw new UnsupportedOperationException(); |
| } |
| } |
| |
| /** |
| * Creates a parser for the given {@link File}. |
| * |
| * @param file |
| * a CSV file. Must not be null. |
| * @param charset |
| * A Charset |
| * @param format |
| * the CSVFormat used for CSV parsing. Must not be null. |
| * @return a new parser |
| * @throws IllegalArgumentException |
| * If the parameters of the format are inconsistent or if either file or format are null. |
| * @throws IOException |
| * If an I/O error occurs |
| */ |
| @SuppressWarnings("resource") |
| public static CSVParser parse(final File file, final Charset charset, final CSVFormat format) throws IOException { |
| Assertions.notNull(file, "file"); |
| Assertions.notNull(format, "format"); |
| return new CSVParser(new InputStreamReader(new FileInputStream(file), charset), format); |
| } |
| |
| /** |
| * Creates a CSV parser using the given {@link CSVFormat}. |
| * |
| * <p> |
| * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser, |
| * unless you close the {@code reader}. |
| * </p> |
| * |
| * @param inputStream |
| * an InputStream containing CSV-formatted input. Must not be null. |
| * @param charset |
| * a Charset. |
| * @param format |
| * the CSVFormat used for CSV parsing. Must not be null. |
| * @return a new CSVParser configured with the given reader and format. |
| * @throws IllegalArgumentException |
| * If the parameters of the format are inconsistent or if either reader or format are null. |
| * @throws IOException |
| * If there is a problem reading the header or skipping the first record |
| * @since 1.5 |
| */ |
| @SuppressWarnings("resource") |
| public static CSVParser parse(final InputStream inputStream, final Charset charset, final CSVFormat format) |
| throws IOException { |
| Assertions.notNull(inputStream, "inputStream"); |
| Assertions.notNull(format, "format"); |
| return parse(new InputStreamReader(inputStream, charset), format); |
| } |
| |
| /** |
| * Creates a parser for the given {@link Path}. |
| * |
| * @param path |
| * a CSV file. Must not be null. |
| * @param charset |
| * A Charset |
| * @param format |
| * the CSVFormat used for CSV parsing. Must not be null. |
| * @return a new parser |
| * @throws IllegalArgumentException |
| * If the parameters of the format are inconsistent or if either file or format are null. |
| * @throws IOException |
| * If an I/O error occurs |
| * @since 1.5 |
| */ |
| public static CSVParser parse(final Path path, final Charset charset, final CSVFormat format) throws IOException { |
| Assertions.notNull(path, "path"); |
| Assertions.notNull(format, "format"); |
| return parse(Files.newInputStream(path), charset, format); |
| } |
| |
| /** |
| * Creates a CSV parser using the given {@link CSVFormat} |
| * |
| * <p> |
| * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser, |
| * unless you close the {@code reader}. |
| * </p> |
| * |
| * @param reader |
| * a Reader containing CSV-formatted input. Must not be null. |
| * @param format |
| * the CSVFormat used for CSV parsing. Must not be null. |
| * @return a new CSVParser configured with the given reader and format. |
| * @throws IllegalArgumentException |
| * If the parameters of the format are inconsistent or if either reader or format are null. |
| * @throws IOException |
| * If there is a problem reading the header or skipping the first record |
| * @since 1.5 |
| */ |
| public static CSVParser parse(final Reader reader, final CSVFormat format) throws IOException { |
| return new CSVParser(reader, format); |
| } |
| |
| /** |
| * Creates a parser for the given {@link String}. |
| * |
| * @param string |
| * a CSV string. Must not be null. |
| * @param format |
| * the CSVFormat used for CSV parsing. Must not be null. |
| * @return a new parser |
| * @throws IllegalArgumentException |
| * If the parameters of the format are inconsistent or if either string or format are null. |
| * @throws IOException |
| * If an I/O error occurs |
| */ |
| public static CSVParser parse(final String string, final CSVFormat format) throws IOException { |
| Assertions.notNull(string, "string"); |
| Assertions.notNull(format, "format"); |
| |
| return new CSVParser(new StringReader(string), format); |
| } |
| |
| // the following objects are shared to reduce garbage |
| |
| /** |
| * Creates a parser for the given URL. |
| * |
| * <p> |
| * If you do not read all records from the given {@code url}, you should call {@link #close()} on the parser, unless |
| * you close the {@code url}. |
| * </p> |
| * |
| * @param url |
| * a URL. Must not be null. |
| * @param charset |
| * the charset for the resource. Must not be null. |
| * @param format |
| * the CSVFormat used for CSV parsing. Must not be null. |
| * @return a new parser |
| * @throws IllegalArgumentException |
| * If the parameters of the format are inconsistent or if either url, charset or format are null. |
| * @throws IOException |
| * If an I/O error occurs |
| */ |
| public static CSVParser parse(final URL url, final Charset charset, final CSVFormat format) throws IOException { |
| Assertions.notNull(url, "url"); |
| Assertions.notNull(charset, "charset"); |
| Assertions.notNull(format, "format"); |
| |
| return new CSVParser(new InputStreamReader(url.openStream(), charset), format); |
| } |
| |
| private final CSVFormat format; |
| |
| /** A mapping of column names to column indices */ |
| private final Map<String, Integer> headerMap; |
| |
| /** The column order to avoid re-computing it. */ |
| private final List<String> headerNames; |
| |
| private final Lexer lexer; |
| |
| private final CSVRecordIterator csvRecordIterator; |
| |
| /** A record buffer for getRecord(). Grows as necessary and is reused. */ |
| private final List<String> recordList = new ArrayList<>(); |
| |
| /** |
| * The next record number to assign. |
| */ |
| private long recordNumber; |
| |
| /** |
| * Lexer offset when the parser does not start parsing at the beginning of the source. Usually used in combination |
| * with {@link #recordNumber}. |
| */ |
| private final long characterOffset; |
| |
| private final Token reusableToken = new Token(); |
| |
/**
 * Creates a parser that starts at character offset 0 and numbers its first record 1.
 *
 * <p>
 * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
 * unless you close the {@code reader}.
 * </p>
 *
 * @param reader
 *            a Reader containing CSV-formatted input. Must not be null.
 * @param format
 *            the CSVFormat used for CSV parsing. Must not be null.
 * @throws IllegalArgumentException
 *             If the parameters of the format are inconsistent or if either reader or format are null.
 * @throws IOException
 *             If there is a problem reading the header or skipping the first record
 */
public CSVParser(final Reader reader, final CSVFormat format) throws IOException {
    this(reader, format, 0L, 1L);
}
| |
/**
 * Creates a parser with an explicit starting character offset and record number.
 *
 * <p>
 * Header processing happens during construction and may already read from the {@code reader}. If you do not
 * read all records from the given {@code reader}, you should call {@link #close()} on the parser, unless you
 * close the {@code reader}.
 * </p>
 *
 * @param reader
 *            a Reader containing CSV-formatted input. Must not be null.
 * @param format
 *            the CSVFormat used for CSV parsing. Must not be null.
 * @param characterOffset
 *            Lexer offset when the parser does not start parsing at the beginning of the source.
 * @param recordNumber
 *            The number assigned to the first record returned after construction
 * @throws IllegalArgumentException
 *             If the parameters of the format are inconsistent or if either reader or format are null.
 * @throws IOException
 *             If there is a problem reading the header or skipping the first record
 * @since 1.1
 */
@SuppressWarnings("resource")
public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber)
        throws IOException {
    Assertions.notNull(reader, "reader");
    Assertions.notNull(format, "format");

    this.format = format;
    final ExtendedBufferedReader bufferedSource = new ExtendedBufferedReader(reader);
    this.lexer = new Lexer(format, bufferedSource);
    this.csvRecordIterator = new CSVRecordIterator();

    // Header processing may consume a record through nextRecord(). It must run before the two
    // assignments at the end so that such a record does not count towards the record numbering.
    final Headers parsedHeaders = createHeaders();
    this.headerMap = parsedHeaders.headerMap;
    this.headerNames = parsedHeaders.headerNames;

    this.characterOffset = characterOffset;
    // nextRecord() increments before assigning, hence the counter starts one below the first number.
    this.recordNumber = recordNumber - 1;
}
| |
| private void addRecordValue(final boolean lastRecord) { |
| final String input = this.reusableToken.content.toString(); |
| final String inputClean = this.format.getTrim() ? input.trim() : input; |
| if (lastRecord && inputClean.isEmpty() && this.format.getTrailingDelimiter()) { |
| return; |
| } |
| final String nullString = this.format.getNullString(); |
| this.recordList.add(inputClean.equals(nullString) ? null : inputClean); |
| } |
| |
| /** |
| * Closes resources. |
| * |
| * @throws IOException |
| * If an I/O error occurs |
| */ |
| @Override |
| public void close() throws IOException { |
| if (this.lexer != null) { |
| this.lexer.close(); |
| } |
| } |
| |
| private Map<String, Integer> createEmptyHeaderMap() { |
| return this.format.getIgnoreHeaderCase() ? |
| new TreeMap<>(String.CASE_INSENSITIVE_ORDER) : |
| new LinkedHashMap<>(); |
| } |
| |
| /** |
| * Header information based on name and position. |
| */ |
| private static final class Headers { |
| /** |
| * Header column positions (0-based) |
| */ |
| final Map<String, Integer> headerMap; |
| |
| /** |
| * Header names in column order |
| */ |
| final List<String> headerNames; |
| |
| Headers(final Map<String, Integer> headerMap, final List<String> headerNames) { |
| this.headerMap = headerMap; |
| this.headerNames = headerNames; |
| } |
| } |
| |
| /** |
| * Creates the name to index mapping if the format defines a header. |
| * |
| * @return null if the format has no header. |
| * @throws IOException if there is a problem reading the header or skipping the first record |
| */ |
| private Headers createHeaders() throws IOException { |
| Map<String, Integer> hdrMap = null; |
| List<String> headerNames = null; |
| final String[] formatHeader = this.format.getHeader(); |
| if (formatHeader != null) { |
| hdrMap = createEmptyHeaderMap(); |
| String[] headerRecord = null; |
| if (formatHeader.length == 0) { |
| // read the header from the first line of the file |
| final CSVRecord nextRecord = this.nextRecord(); |
| if (nextRecord != null) { |
| headerRecord = nextRecord.values(); |
| } |
| } else { |
| if (this.format.getSkipHeaderRecord()) { |
| this.nextRecord(); |
| } |
| headerRecord = formatHeader; |
| } |
| |
| // build the name to index mappings |
| if (headerRecord != null) { |
| for (int i = 0; i < headerRecord.length; i++) { |
| final String header = headerRecord[i]; |
| final boolean emptyHeader = header == null || header.trim().isEmpty(); |
| if (emptyHeader && !this.format.getAllowMissingColumnNames()) { |
| throw new IllegalArgumentException( |
| "A header name is missing in " + Arrays.toString(headerRecord)); |
| } |
| // Note: This will always allow a duplicate header if the header is empty |
| final boolean containsHeader = header != null && hdrMap.containsKey(header); |
| if (containsHeader && !emptyHeader && !this.format.getAllowDuplicateHeaderNames()) { |
| throw new IllegalArgumentException( |
| String.format( |
| "The header contains a duplicate name: \"%s\" in %s. If this is valid then use CSVFormat.withAllowDuplicateHeaderNames().", |
| header, Arrays.toString(headerRecord))); |
| } |
| if (header != null) { |
| hdrMap.put(header, Integer.valueOf(i)); |
| if (headerNames == null) { |
| headerNames = new ArrayList<>(headerRecord.length); |
| } |
| headerNames.add(header); |
| } |
| } |
| } |
| } |
| if (headerNames == null) { |
| headerNames = Collections.emptyList(); //immutable |
| } else { |
| headerNames = Collections.unmodifiableList(headerNames); |
| } |
| return new Headers(hdrMap, headerNames); |
| } |
| |
| /** |
| * Returns the current line number in the input stream. |
| * |
| * <p> |
| * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to |
| * the record number. |
| * </p> |
| * |
| * @return current line number |
| */ |
| public long getCurrentLineNumber() { |
| return this.lexer.getCurrentLineNumber(); |
| } |
| |
| /** |
| * Gets the first end-of-line string encountered. |
| * |
| * @return the first end-of-line string |
| * @since 1.5 |
| */ |
| public String getFirstEndOfLine() { |
| return lexer.getFirstEol(); |
| } |
| |
| /** |
| * Returns a copy of the header map. |
| * <p> |
| * The map keys are column names. The map values are 0-based indices. |
| * </p> |
| * <p> |
| * Note: The map can only provide a one-to-one mapping when the format did not |
| * contain null or duplicate column names. |
| * </p> |
| * |
| * @return a copy of the header map. |
| */ |
| public Map<String, Integer> getHeaderMap() { |
| if (this.headerMap == null) { |
| return null; |
| } |
| final Map<String, Integer> map = createEmptyHeaderMap(); |
| map.putAll(this.headerMap); |
| return map; |
| } |
| |
| /** |
| * Returns the header map. |
| * |
| * @return the header map. |
| */ |
| Map<String, Integer> getHeaderMapRaw() { |
| return this.headerMap; |
| } |
| |
| /** |
| * Returns a read-only list of header names that iterates in column order. |
| * <p> |
| * Note: The list provides strings that can be used as keys in the header map. |
| * The list will not contain null column names if they were present in the input |
| * format. |
| * </p> |
| * |
| * @return read-only list of header names that iterates in column order. |
| * @see #getHeaderMap() |
| * @since 1.7 |
| */ |
| public List<String> getHeaderNames() { |
| return headerNames; |
| } |
| |
| /** |
| * Returns the current record number in the input stream. |
| * |
| * <p> |
| * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to |
| * the line number. |
| * </p> |
| * |
| * @return current record number |
| */ |
| public long getRecordNumber() { |
| return this.recordNumber; |
| } |
| |
| /** |
| * Parses the CSV input according to the given format and returns the content as a list of |
| * {@link CSVRecord CSVRecords}. |
| * |
| * <p> |
| * The returned content starts at the current parse-position in the stream. |
| * </p> |
| * |
| * @return list of {@link CSVRecord CSVRecords}, may be empty |
| * @throws IOException |
| * on parse error or input read-failure |
| */ |
| public List<CSVRecord> getRecords() throws IOException { |
| CSVRecord rec; |
| final List<CSVRecord> records = new ArrayList<>(); |
| while ((rec = this.nextRecord()) != null) { |
| records.add(rec); |
| } |
| return records; |
| } |
| |
| /** |
| * Gets whether this parser is closed. |
| * |
| * @return whether this parser is closed. |
| */ |
| public boolean isClosed() { |
| return this.lexer.isClosed(); |
| } |
| |
| /** |
| * Returns an iterator on the records. |
| * |
| * <p> |
| * An {@link IOException} caught during the iteration are re-thrown as an |
| * {@link IllegalStateException}. |
| * </p> |
| * <p> |
| * If the parser is closed a call to {@link Iterator#next()} will throw a |
| * {@link NoSuchElementException}. |
| * </p> |
| */ |
| @Override |
| public Iterator<CSVRecord> iterator() { |
| return csvRecordIterator; |
| } |
| |
| /** |
| * Parses the next record from the current point in the stream. |
| * |
| * @return the record as an array of values, or {@code null} if the end of the stream has been reached |
| * @throws IOException |
| * on parse error or input read-failure |
| */ |
| CSVRecord nextRecord() throws IOException { |
| CSVRecord result = null; |
| this.recordList.clear(); |
| StringBuilder sb = null; |
| final long startCharPosition = lexer.getCharacterPosition() + this.characterOffset; |
| do { |
| this.reusableToken.reset(); |
| this.lexer.nextToken(this.reusableToken); |
| switch (this.reusableToken.type) { |
| case TOKEN: |
| this.addRecordValue(false); |
| break; |
| case EORECORD: |
| this.addRecordValue(true); |
| break; |
| case EOF: |
| if (this.reusableToken.isReady) { |
| this.addRecordValue(true); |
| } |
| break; |
| case INVALID: |
| throw new IOException("(line " + this.getCurrentLineNumber() + ") invalid parse sequence"); |
| case COMMENT: // Ignored currently |
| if (sb == null) { // first comment for this record |
| sb = new StringBuilder(); |
| } else { |
| sb.append(Constants.LF); |
| } |
| sb.append(this.reusableToken.content); |
| this.reusableToken.type = TOKEN; // Read another token |
| break; |
| default: |
| throw new IllegalStateException("Unexpected Token type: " + this.reusableToken.type); |
| } |
| } while (this.reusableToken.type == TOKEN); |
| |
| if (!this.recordList.isEmpty()) { |
| this.recordNumber++; |
| final String comment = sb == null ? null : sb.toString(); |
| result = new CSVRecord(this, this.recordList.toArray(new String[this.recordList.size()]), |
| comment, this.recordNumber, startCharPosition); |
| } |
| return result; |
| } |
| |
| } |