| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * https://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| package org.apache.commons.csv; |
| |
| import static org.apache.commons.csv.Constants.CR; |
| import static org.apache.commons.csv.Constants.LF; |
| import static org.apache.commons.csv.Constants.UNDEFINED; |
| import static org.apache.commons.io.IOUtils.EOF; |
| |
| import java.io.IOException; |
| import java.io.Reader; |
| import java.nio.CharBuffer; |
| import java.nio.charset.CharacterCodingException; |
| import java.nio.charset.Charset; |
| import java.nio.charset.CharsetEncoder; |
| |
| import org.apache.commons.io.IOUtils; |
| import org.apache.commons.io.input.UnsynchronizedBufferedReader; |
| |
| /** |
| * A special buffered reader which supports sophisticated read access. |
| * <p> |
| * In particular the reader supports a look-ahead option, which allows you to see the next char returned by |
| * {@link #read()}. This reader also tracks how many characters have been read with {@link #getPosition()}. |
| * </p> |
| */ |
| final class ExtendedBufferedReader extends UnsynchronizedBufferedReader { |
| |
| /** The last char returned */ |
| private int lastChar = UNDEFINED; |
| private int lastCharMark = UNDEFINED; |
| |
| /** The count of EOLs (CR/LF/CRLF) seen so far */ |
| private long lineNumber; |
| private long lineNumberMark; |
| |
| /** The position, which is the number of characters read so far */ |
| private long position; |
| private long positionMark; |
| |
| /** The number of bytes read so far. */ |
| private long bytesRead; |
| private long bytesReadMark; |
| |
| /** Encoder for calculating the number of bytes for each character read. */ |
| private final CharsetEncoder encoder; |
| |
| /** |
| * Constructs a new instance using the default buffer size. |
| */ |
| ExtendedBufferedReader(final Reader reader) { |
| this(reader, null, false); |
| } |
| |
| /** |
| * Constructs a new instance with the specified reader, character set, |
| * and byte tracking option. Initializes an encoder if byte tracking is enabled |
| * and a character set is provided. |
| * |
| * @param reader the reader supports a look-ahead option. |
| * @param charset the character set for encoding, or {@code null} if not applicable. |
| * @param trackBytes {@code true} to enable byte tracking; {@code false} to disable it. |
| */ |
| ExtendedBufferedReader(final Reader reader, final Charset charset, final boolean trackBytes) { |
| super(reader); |
| encoder = charset != null && trackBytes ? charset.newEncoder() : null; |
| } |
| |
| /** |
| * Closes the stream. |
| * |
| * @throws IOException |
| * If an I/O error occurs |
| */ |
| @Override |
| public void close() throws IOException { |
| // Set ivars before calling super close() in case close() throws an IOException. |
| lastChar = EOF; |
| super.close(); |
| } |
| |
| /** |
| * Gets the number of bytes read by the reader. |
| * |
| * @return the number of bytes read by the read |
| */ |
| long getBytesRead() { |
| return this.bytesRead; |
| } |
| |
| /** |
| * Gets the byte length of the given character based on the original Unicode |
| * specification, which defined characters as fixed-width 16-bit entities. |
| * <p> |
| * The Unicode characters are divided into two main ranges: |
| * <ul> |
| * <li><strong>U+0000 to U+FFFF (Basic Multilingual Plane, BMP):</strong> |
| * <ul> |
| * <li>Represented using a single 16-bit {@code char}.</li> |
| * <li>Includes UTF-8 encodings of 1-byte, 2-byte, and some 3-byte characters.</li> |
| * </ul> |
| * </li> |
| * <li><strong>U+10000 to U+10FFFF (Supplementary Characters):</strong> |
| * <ul> |
| * <li>Represented as a pair of {@code char}s:</li> |
| * <li>The first {@code char} is from the high-surrogates range (\uD800-\uDBFF).</li> |
| * <li>The second {@code char} is from the low-surrogates range (\uDC00-\uDFFF).</li> |
| * <li>Includes UTF-8 encodings of some 3-byte characters and all 4-byte characters.</li> |
| * </ul> |
| * </li> |
| * </ul> |
| * |
| * @param current the current character to process. |
| * @return the byte length of the character. |
| * @throws CharacterCodingException if the character cannot be encoded. |
| */ |
| private int getEncodedCharLength(final int current) throws CharacterCodingException { |
| final char cChar = (char) current; |
| final char lChar = (char) lastChar; |
| if (!Character.isSurrogate(cChar)) { |
| return encoder.encode(CharBuffer.wrap(new char[] { cChar })).limit(); |
| } |
| if (Character.isHighSurrogate(cChar)) { |
| // Move on to the next char (low surrogate) |
| return 0; |
| } |
| if (Character.isSurrogatePair(lChar, cChar)) { |
| return encoder.encode(CharBuffer.wrap(new char[] { lChar, cChar })).limit(); |
| } |
| throw new CharacterCodingException(); |
| } |
| |
| /** |
| * Returns the last character that was read as an integer (0 to 65535). This will be the last character returned by |
| * any of the read methods. This will not include a character read using the {@link #peek()} method. If no |
| * character has been read then this will return {@link Constants#UNDEFINED}. If the end of the stream was reached |
| * on the last read then this will return {@link IOUtils#EOF}. |
| * |
| * @return the last character that was read |
| */ |
| int getLastChar() { |
| return lastChar; |
| } |
| |
| /** |
| * Returns the current line number |
| * |
| * @return the current line number |
| */ |
| long getLineNumber() { |
| // Check if we are at EOL or EOF or just starting |
| if (lastChar == CR || lastChar == LF || lastChar == UNDEFINED || lastChar == EOF) { |
| return lineNumber; // counter is accurate |
| } |
| return lineNumber + 1; // Allow for counter being incremented only at EOL |
| } |
| |
| /** |
| * Gets the character position in the reader. |
| * |
| * @return the current position in the reader (counting characters, not bytes since this is a Reader) |
| */ |
| long getPosition() { |
| return this.position; |
| } |
| |
| @Override |
| public void mark(final int readAheadLimit) throws IOException { |
| lineNumberMark = lineNumber; |
| lastCharMark = lastChar; |
| positionMark = position; |
| bytesReadMark = bytesRead; |
| super.mark(readAheadLimit); |
| } |
| |
| @Override |
| public int read() throws IOException { |
| final int current = super.read(); |
| if (current == CR || current == LF && lastChar != CR || |
| current == EOF && lastChar != CR && lastChar != LF && lastChar != EOF) { |
| lineNumber++; |
| } |
| if (encoder != null) { |
| this.bytesRead += getEncodedCharLength(current); |
| } |
| lastChar = current; |
| position++; |
| return lastChar; |
| } |
| |
| @Override |
| public int read(final char[] buf, final int offset, final int length) throws IOException { |
| if (length == 0) { |
| return 0; |
| } |
| final int len = super.read(buf, offset, length); |
| if (len > 0) { |
| for (int i = offset; i < offset + len; i++) { |
| final char ch = buf[i]; |
| if (ch == LF) { |
| if (CR != (i > offset ? buf[i - 1] : lastChar)) { |
| lineNumber++; |
| } |
| } else if (ch == CR) { |
| lineNumber++; |
| } |
| } |
| lastChar = buf[offset + len - 1]; |
| } else if (len == EOF) { |
| lastChar = EOF; |
| } |
| position += len; |
| return len; |
| } |
| |
| /** |
| * Gets the next line, dropping the line terminator(s). This method should only be called when processing a |
| * comment, otherwise, information can be lost. |
| * <p> |
| * Increments {@link #lineNumber} and updates {@link #position}. |
| * </p> |
| * <p> |
| * Sets {@link #lastChar} to {@code Constants.EOF} at EOF, otherwise the last EOL character. |
| * </p> |
| * |
| * @return the line that was read, or null if reached EOF. |
| */ |
| @Override |
| public String readLine() throws IOException { |
| if (peek() == EOF) { |
| return null; |
| } |
| final StringBuilder buffer = new StringBuilder(); |
| while (true) { |
| final int current = read(); |
| if (current == CR) { |
| final int next = peek(); |
| if (next == LF) { |
| read(); |
| } |
| } |
| if (current == EOF || current == LF || current == CR) { |
| break; |
| } |
| buffer.append((char) current); |
| } |
| return buffer.toString(); |
| } |
| |
| @Override |
| public void reset() throws IOException { |
| lineNumber = lineNumberMark; |
| lastChar = lastCharMark; |
| position = positionMark; |
| bytesRead = bytesReadMark; |
| super.reset(); |
| } |
| |
| } |