| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.commons.io.input; |
| |
| import static org.apache.commons.io.IOUtils.EOF; |
| |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.Reader; |
| import java.nio.ByteBuffer; |
| import java.nio.CharBuffer; |
| import java.nio.charset.Charset; |
| import java.nio.charset.CharsetEncoder; |
| import java.nio.charset.CoderResult; |
| import java.nio.charset.CodingErrorAction; |
| import java.util.Objects; |
| |
| import org.apache.commons.io.Charsets; |
| import org.apache.commons.io.IOUtils; |
| import org.apache.commons.io.build.AbstractOrigin; |
| import org.apache.commons.io.build.AbstractStreamBuilder; |
| import org.apache.commons.io.charset.CharsetEncoders; |
| |
| /** |
| * {@link InputStream} implementation that reads a character stream from a {@link Reader} and transforms it to a byte stream using a specified charset encoding. |
| * The stream is transformed using a {@link CharsetEncoder} object, guaranteeing that all charset encodings supported by the JRE are handled correctly. In |
| * particular for charsets such as UTF-16, the implementation ensures that one and only one byte order marker is produced. |
| * <p> |
| * Since in general it is not possible to predict the number of characters to be read from the {@link Reader} to satisfy a read request on the |
| * {@link ReaderInputStream}, all reads from the {@link Reader} are buffered. There is therefore no well defined correlation between the current position of the |
| * {@link Reader} and that of the {@link ReaderInputStream}. This also implies that in general there is no need to wrap the underlying {@link Reader} in a |
| * {@link java.io.BufferedReader}. |
| * </p> |
| * <p> |
| * {@link ReaderInputStream} implements the inverse transformation of {@link java.io.InputStreamReader}; in the following example, reading from {@code in2} |
| * would return the same byte sequence as reading from {@code in} (provided that the initial byte sequence is legal with respect to the charset encoding): |
| * </p> |
| * <p> |
| * To build an instance, see {@link Builder}. |
| * </p> |
| * <pre> |
| * InputStream inputStream = ... |
| * Charset cs = ... |
| * InputStreamReader reader = new InputStreamReader(inputStream, cs); |
| * ReaderInputStream in2 = ReaderInputStream.builder() |
| * .setReader(reader) |
| * .setCharset(cs) |
| * .get(); |
| * </pre> |
| * <p> |
| * {@link ReaderInputStream} implements the same transformation as {@link java.io.OutputStreamWriter}, except that the control flow is reversed: both classes |
| * transform a character stream into a byte stream, but {@link java.io.OutputStreamWriter} pushes data to the underlying stream, while {@link ReaderInputStream} |
| * pulls it from the underlying stream. |
| * </p> |
| * <p> |
| * Note that while there are use cases where there is no alternative to using this class, very often the need to use this class is an indication of a flaw in |
| * the design of the code. This class is typically used in situations where an existing API only accepts an {@link InputStream}, but where the most natural way |
| * to produce the data is as a character stream, i.e. by providing a {@link Reader} instance. An example of a situation where this problem may appear is when |
| * implementing the {@code javax.activation.DataSource} interface from the Java Activation Framework. |
| * </p> |
| * <p> |
| * The {@link #available()} method of this class always returns 0. The methods {@link #mark(int)} and {@link #reset()} are not supported. |
| * </p> |
| * <p> |
| * Instances of {@link ReaderInputStream} are not thread safe. |
| * </p> |
| * |
| * @see org.apache.commons.io.output.WriterOutputStream |
| * @since 2.0 |
| */ |
| public class ReaderInputStream extends InputStream { |
| |
| /** |
| * Builds a new {@link ReaderInputStream} instance. |
| * <p> |
| * For example: |
| * </p> |
| * <pre>{@code |
| * ReaderInputStream s = ReaderInputStream.builder() |
| * .setPath(path) |
| * .setCharsetEncoder(Charset.defaultCharset().newEncoder()) |
| * .get();} |
| * </pre> |
| * |
| * @since 2.12.0 |
| */ |
| public static class Builder extends AbstractStreamBuilder<ReaderInputStream, Builder> { |
| |
| private CharsetEncoder charsetEncoder = super.getCharset().newEncoder(); |
| |
| /** |
| * Constructs a new instance. |
| * <p> |
| * This builder use the aspects Reader, Charset, CharsetEncoder, buffer size. |
| * </p> |
| * <p> |
| * You must provide an origin that can be converted to a Reader by this builder, otherwise, this call will throw an |
| * {@link UnsupportedOperationException}. |
| * </p> |
| * |
| * @return a new instance. |
| * @throws UnsupportedOperationException if the origin cannot provide a Reader. |
| * @throws IllegalStateException if the {@code origin} is {@code null}. |
| * @see AbstractOrigin#getReader(Charset) |
| */ |
| @SuppressWarnings("resource") |
| @Override |
| public ReaderInputStream get() throws IOException { |
| return new ReaderInputStream(checkOrigin().getReader(getCharset()), charsetEncoder, getBufferSize()); |
| } |
| |
| CharsetEncoder getCharsetEncoder() { |
| return charsetEncoder; |
| } |
| |
| @Override |
| public Builder setCharset(final Charset charset) { |
| super.setCharset(charset); |
| charsetEncoder = getCharset().newEncoder(); |
| return this; |
| } |
| |
| /** |
| * Sets the charset encoder. |
| * |
| * @param charsetEncoder the charset encoder, null resets to a default encoder. |
| * @return this |
| */ |
| public Builder setCharsetEncoder(final CharsetEncoder charsetEncoder) { |
| this.charsetEncoder = CharsetEncoders.toCharsetEncoder(charsetEncoder); |
| super.setCharset(this.charsetEncoder.charset()); |
| return this; |
| } |
| |
| } |
| |
| /** |
| * Constructs a new {@link Builder}. |
| * |
| * @return a new {@link Builder}. |
| * @since 2.12.0 |
| */ |
| public static Builder builder() { |
| return new Builder(); |
| } |
| |
| static int checkMinBufferSize(final CharsetEncoder charsetEncoder, final int bufferSize) { |
| final float minRequired = minBufferSize(charsetEncoder); |
| if (bufferSize < minRequired) { |
| throw new IllegalArgumentException(String.format("Buffer size %,d must be at least %s for a CharsetEncoder %s.", bufferSize, minRequired, |
| charsetEncoder.charset().displayName())); |
| } |
| return bufferSize; |
| } |
| |
| static float minBufferSize(final CharsetEncoder charsetEncoder) { |
| return charsetEncoder.maxBytesPerChar() * 2; |
| } |
| |
| private final Reader reader; |
| |
| private final CharsetEncoder charsetEncoder; |
| |
| /** |
| * CharBuffer used as input for the decoder. It should be reasonably large as we read data from the underlying Reader into this buffer. |
| */ |
| private final CharBuffer encoderIn; |
| /** |
| * ByteBuffer used as output for the decoder. This buffer can be small as it is only used to transfer data from the decoder to the buffer provided by the |
| * caller. |
| */ |
| private final ByteBuffer encoderOut; |
| |
| private CoderResult lastCoderResult; |
| |
| private boolean endOfInput; |
| |
| /** |
| * Constructs a new {@link ReaderInputStream} that uses the default character encoding with a default input buffer size of |
| * {@value IOUtils#DEFAULT_BUFFER_SIZE} characters. |
| * |
| * @param reader the target {@link Reader} |
| * @deprecated Use {@link ReaderInputStream#builder()} instead |
| */ |
| @Deprecated |
| public ReaderInputStream(final Reader reader) { |
| this(reader, Charset.defaultCharset()); |
| } |
| |
| /** |
| * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters. |
| * |
| * <p> |
| * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters. |
| * </p> |
| * |
| * @param reader the target {@link Reader} |
| * @param charset the charset encoding |
| * @deprecated Use {@link ReaderInputStream#builder()} instead, will be protected for subclasses. |
| */ |
| @Deprecated |
| public ReaderInputStream(final Reader reader, final Charset charset) { |
| this(reader, charset, IOUtils.DEFAULT_BUFFER_SIZE); |
| } |
| |
| /** |
| * Constructs a new {@link ReaderInputStream}. |
| * |
| * <p> |
| * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters. |
| * </p> |
| * |
| * @param reader the target {@link Reader}. |
| * @param charset the charset encoding. |
| * @param bufferSize the size of the input buffer in number of characters. |
| * @deprecated Use {@link ReaderInputStream#builder()} instead |
| */ |
| @Deprecated |
| public ReaderInputStream(final Reader reader, final Charset charset, final int bufferSize) { |
| // @formatter:off |
| this(reader, |
| Charsets.toCharset(charset).newEncoder() |
| .onMalformedInput(CodingErrorAction.REPLACE) |
| .onUnmappableCharacter(CodingErrorAction.REPLACE), |
| bufferSize); |
| // @formatter:on |
| } |
| |
| /** |
| * Constructs a new {@link ReaderInputStream}. |
| * |
| * <p> |
| * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing |
| * an encoder which had already been in use. |
| * </p> |
| * |
| * @param reader the target {@link Reader} |
| * @param charsetEncoder the charset encoder |
| * @since 2.1 |
| * @deprecated Use {@link ReaderInputStream#builder()} instead |
| */ |
| @Deprecated |
| public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder) { |
| this(reader, charsetEncoder, IOUtils.DEFAULT_BUFFER_SIZE); |
| } |
| |
| /** |
| * Constructs a new {@link ReaderInputStream}. |
| * |
| * <p> |
| * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing |
| * an encoder which had already been in use. |
| * </p> |
| * |
| * @param reader the target {@link Reader} |
| * @param charsetEncoder the charset encoder, null defaults to the default Charset encoder. |
| * @param bufferSize the size of the input buffer in number of characters |
| * @since 2.1 |
| * @deprecated Use {@link ReaderInputStream#builder()} instead |
| */ |
| @Deprecated |
| public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder, final int bufferSize) { |
| this.reader = reader; |
| this.charsetEncoder = CharsetEncoders.toCharsetEncoder(charsetEncoder); |
| this.encoderIn = CharBuffer.allocate(checkMinBufferSize(this.charsetEncoder, bufferSize)); |
| this.encoderIn.flip(); |
| this.encoderOut = ByteBuffer.allocate(128); |
| this.encoderOut.flip(); |
| } |
| |
| /** |
| * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters. |
| * |
| * <p> |
| * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters. |
| * </p> |
| * |
| * @param reader the target {@link Reader} |
| * @param charsetName the name of the charset encoding |
| * @deprecated Use {@link ReaderInputStream#builder()} instead |
| */ |
| @Deprecated |
| public ReaderInputStream(final Reader reader, final String charsetName) { |
| this(reader, charsetName, IOUtils.DEFAULT_BUFFER_SIZE); |
| } |
| |
| /** |
| * Constructs a new {@link ReaderInputStream}. |
| * |
| * <p> |
| * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters. |
| * </p> |
| * |
| * @param reader the target {@link Reader} |
| * @param charsetName the name of the charset encoding, null maps to the default Charset. |
| * @param bufferSize the size of the input buffer in number of characters |
| * @deprecated Use {@link ReaderInputStream#builder()} instead |
| */ |
| @Deprecated |
| public ReaderInputStream(final Reader reader, final String charsetName, final int bufferSize) { |
| this(reader, Charsets.toCharset(charsetName), bufferSize); |
| } |
| |
| /** |
| * Closes the stream. This method will cause the underlying {@link Reader} to be closed. |
| * |
| * @throws IOException if an I/O error occurs. |
| */ |
| @Override |
| public void close() throws IOException { |
| reader.close(); |
| } |
| |
| /** |
| * Fills the internal char buffer from the reader. |
| * |
| * @throws IOException If an I/O error occurs |
| */ |
| private void fillBuffer() throws IOException { |
| if (!endOfInput && (lastCoderResult == null || lastCoderResult.isUnderflow())) { |
| encoderIn.compact(); |
| final int position = encoderIn.position(); |
| // We don't use Reader#read(CharBuffer) here because it is more efficient |
| // to write directly to the underlying char array (the default implementation |
| // copies data to a temporary char array). |
| final int c = reader.read(encoderIn.array(), position, encoderIn.remaining()); |
| if (c == EOF) { |
| endOfInput = true; |
| } else { |
| encoderIn.position(position + c); |
| } |
| encoderIn.flip(); |
| } |
| encoderOut.compact(); |
| lastCoderResult = charsetEncoder.encode(encoderIn, encoderOut, endOfInput); |
| if (endOfInput) { |
| lastCoderResult = charsetEncoder.flush(encoderOut); |
| } |
| if (lastCoderResult.isError()) { |
| lastCoderResult.throwException(); |
| } |
| encoderOut.flip(); |
| } |
| |
| /** |
| * Gets the CharsetEncoder. |
| * |
| * @return the CharsetEncoder. |
| */ |
| CharsetEncoder getCharsetEncoder() { |
| return charsetEncoder; |
| } |
| |
| /** |
| * Reads a single byte. |
| * |
| * @return either the byte read or {@code -1} if the end of the stream has been reached |
| * @throws IOException if an I/O error occurs. |
| */ |
| @Override |
| public int read() throws IOException { |
| for (;;) { |
| if (encoderOut.hasRemaining()) { |
| return encoderOut.get() & 0xFF; |
| } |
| fillBuffer(); |
| if (endOfInput && !encoderOut.hasRemaining()) { |
| return EOF; |
| } |
| } |
| } |
| |
| /** |
| * Reads the specified number of bytes into an array. |
| * |
| * @param b the byte array to read into |
| * @return the number of bytes read or {@code -1} if the end of the stream has been reached |
| * @throws IOException if an I/O error occurs. |
| */ |
| @Override |
| public int read(final byte[] b) throws IOException { |
| return read(b, 0, b.length); |
| } |
| |
| /** |
| * Reads the specified number of bytes into an array. |
| * |
| * @param array the byte array to read into |
| * @param off the offset to start reading bytes into |
| * @param len the number of bytes to read |
| * @return the number of bytes read or {@code -1} if the end of the stream has been reached |
| * @throws IOException if an I/O error occurs. |
| */ |
| @Override |
| public int read(final byte[] array, int off, int len) throws IOException { |
| Objects.requireNonNull(array, "array"); |
| if (len < 0 || off < 0 || off + len > array.length) { |
| throw new IndexOutOfBoundsException("Array size=" + array.length + ", offset=" + off + ", length=" + len); |
| } |
| int read = 0; |
| if (len == 0) { |
| return 0; // Always return 0 if len == 0 |
| } |
| while (len > 0) { |
| if (encoderOut.hasRemaining()) { // Data from the last read not fully copied |
| final int c = Math.min(encoderOut.remaining(), len); |
| encoderOut.get(array, off, c); |
| off += c; |
| len -= c; |
| read += c; |
| } else if (endOfInput) { // Already reach EOF in the last read |
| break; |
| } else { // Read again |
| fillBuffer(); |
| } |
| } |
| return read == 0 && endOfInput ? EOF : read; |
| } |
| } |