// blob: 0278500f19b6a49ed8cc59bc4252f5522aefa5d2 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.io.input;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.Random;
import java.util.Set;
import org.apache.commons.io.Charsets;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
/**
 * Tests {@link CharSequenceInputStream}: buffered and single-byte reads, mark/reset, skip,
 * zero-length reads and {@code available()}, across the required charsets and (where they
 * can be exercised) all charsets available on the running JVM.
 */
public class CharSequenceInputStreamTest {

    private static final String ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

    /** {@link #TEST_STRING} repeated 100 times; built in the static initializer below. */
    private static final String LARGE_TEST_STRING;

    /** French sample text containing non-ASCII characters. */
    private static final String TEST_STRING = "\u00e0 peine arriv\u00e9s nous entr\u00e2mes dans sa chambre";

    static {
        final StringBuilder buffer = new StringBuilder();
        for (int i = 0; i < 100; i++) {
            buffer.append(TEST_STRING);
        }
        LARGE_TEST_STRING = buffer.toString();
    }

    /** Source of the random offsets and lengths used by {@link #testBufferedRead(String, String)}. */
    private final Random random = new Random();

    /**
     * Gets the names of the charsets reported by {@link Charsets#requiredCharsets()}.
     *
     * @return the key set of the required charsets map.
     */
    private Set<String> getRequiredCharsetNames() {
        return Charsets.requiredCharsets().keySet();
    }

    /**
     * Reads the whole stream using {@code read(byte[], int, int)} with random offsets and lengths
     * and checks every byte against {@code testString.getBytes(charsetName)}.
     *
     * @param testString the text to wrap in the stream.
     * @param charsetName the charset used both by the stream and to compute the expected bytes.
     * @throws IOException if reading fails.
     */
    private void testBufferedRead(final String testString, final String charsetName) throws IOException {
        final byte[] expected = testString.getBytes(charsetName);
        try (InputStream in = new CharSequenceInputStream(testString, charsetName, 512)) {
            final byte[] buffer = new byte[128];
            int offset = 0;
            while (true) {
                // offset (< 64) + length (< 64) always fits inside the 128-byte buffer
                int bufferOffset = random.nextInt(64);
                final int bufferLength = random.nextInt(64);
                int read = in.read(buffer, bufferOffset, bufferLength);
                if (read == -1) {
                    assertEquals(expected.length, offset, "EOF: offset should equal length for charset " + charsetName);
                    break;
                }
                assertTrue(read <= bufferLength, "Read " + read + " <= " + bufferLength);
                while (read > 0) {
                    assertTrue(offset < expected.length,
                            "offset for " + charsetName + " " + offset + " < " + expected.length);
                    assertEquals(expected[offset], buffer[bufferOffset], "bytes should agree for " + charsetName);
                    offset++;
                    bufferOffset++;
                    read--;
                }
            }
        }
    }

    // Unfortunately checking canEncode does not seem to work for all charsets:
    // testBufferedRead_AvailableCharset(org.apache.commons.io.input.CharSequenceInputStreamTest) Time elapsed: 0.682 sec <<< ERROR!
    // java.lang.UnsupportedOperationException: null
    // at java.nio.CharBuffer.array(CharBuffer.java:940)
    // at sun.nio.cs.ext.COMPOUND_TEXT_Encoder.encodeLoop(COMPOUND_TEXT_Encoder.java:75)
    // at java.nio.charset.CharsetEncoder.encode(CharsetEncoder.java:544)
    // at org.apache.commons.io.input.CharSequenceInputStream.fillBuffer(CharSequenceInputStream.java:111)
    @Test
    public void testBufferedRead_AvailableCharset() throws IOException {
        for (final String csName : Charset.availableCharsets().keySet()) {
            // prevent java.lang.UnsupportedOperationException at sun.nio.cs.ext.ISO2022_CN.newEncoder.
            if (isAvailabilityTestableForCharset(csName)) {
                testBufferedRead(TEST_STRING, csName);
            }
        }
    }

    @Test
    public void testBufferedRead_RequiredCharset() throws IOException {
        for (final String csName : getRequiredCharsetNames()) {
            testBufferedRead(TEST_STRING, csName);
        }
    }

    @Test
    public void testBufferedRead_UTF8() throws IOException {
        testBufferedRead(TEST_STRING, "UTF-8");
    }

    /**
     * Reads a stream over three chars (0xE0, 0xB2, 0xA0) to EOF using the given charset.
     * There is no explicit assertion: the check is that the read loop terminates.
     *
     * @param csName the charset name to encode with.
     * @throws IOException if reading fails.
     */
    private void testCharsetMismatchInfiniteLoop(final String csName) throws IOException {
        // Input is UTF-8 bytes: 0xE0 0xB2 0xA0
        final char[] inputChars = new char[] { (char) 0xE0, (char) 0xB2, (char) 0xA0 };
        final Charset charset = Charset.forName(csName); // infinite loop for US-ASCII, UTF-8 OK
        try (InputStream stream = new CharSequenceInputStream(new String(inputChars), charset, 512)) {
            while (stream.read() != -1) {
                // intentionally empty: drain the stream; the test passes if this loop ends
            }
        }
    }

    @Test
    public void testCharsetMismatchInfiniteLoop_RequiredCharsets() throws IOException {
        for (final String csName : getRequiredCharsetNames()) {
            testCharsetMismatchInfiniteLoop(csName);
        }
    }

    // Test is broken if readFirst > 0
    // This is because the initial read fills the buffer from the CharSequence
    // so data1 gets the first buffer full; data2 will get the next buffer full
    /**
     * Marks the stream, reads {@code dataSize} bytes, resets, reads {@code dataSize} bytes again
     * and checks that both reads returned the same data.
     *
     * @param bufferSize the buffer size passed to the stream constructor.
     * @param dataSize the mark read-limit and the number of bytes read before and after the reset.
     * @param readFirst the number of single bytes consumed before the mark is set.
     * @param csName the charset name to encode with.
     * @throws Exception if reading fails.
     */
    private void testIO_356(final int bufferSize, final int dataSize, final int readFirst, final String csName) throws Exception {
        // try-with-resources so the stream is closed even when an assertion fails
        try (CharSequenceInputStream is = new CharSequenceInputStream(ALPHABET, csName, bufferSize)) {
            for (int i = 0; i < readFirst; i++) {
                final int ch = is.read();
                assertFalse(ch == -1);
            }
            is.mark(dataSize);
            final byte[] data1 = new byte[dataSize];
            final int readCount1 = is.read(data1);
            assertEquals(dataSize, readCount1);
            is.reset(); // should allow data to be re-read
            final byte[] data2 = new byte[dataSize];
            final int readCount2 = is.read(data2);
            assertEquals(dataSize, readCount2);
            // data buffers should be identical
            assertArrayEquals(data1, data2, "bufferSize=" + bufferSize + " dataSize=" + dataSize);
        }
    }

    @Test
    public void testIO_356_B10_D10_S0_UTF16() throws Exception {
        testIO_356(10, 10, 0, "UTF-16");
    }

    @Test
    public void testIO_356_B10_D10_S0_UTF8() throws Exception {
        testIO_356(10, 10, 0, "UTF-8");
    }

    @Test
    public void testIO_356_B10_D10_S1_UTF8() throws Exception {
        testIO_356(10, 10, 1, "UTF-8");
    }

    @Test
    public void testIO_356_B10_D10_S2_UTF8() throws Exception {
        testIO_356(10, 10, 2, "UTF-8");
    }

    @Test
    public void testIO_356_B10_D13_S0_UTF8() throws Exception {
        testIO_356(10, 13, 0, "UTF-8");
    }

    @Test
    public void testIO_356_B10_D13_S1_UTF8() throws Exception {
        testIO_356(10, 13, 1, "UTF-8");
    }

    @Test
    public void testIO_356_B10_D20_S0_UTF8() throws Exception {
        testIO_356(10, 20, 0, "UTF-8");
    }

    /**
     * Runs {@link #testIO_356(int, int, int, String)} with no initial reads for every buffer size
     * from {@code maxBytesPerChar} to 10 and every data size from 1 to 20.
     *
     * @param csName the charset name to encode with.
     * @param maxBytesPerChar the smallest buffer size to try.
     * @throws Exception if reading fails.
     */
    private void testIO_356_Loop(final String csName, final int maxBytesPerChar) throws Exception {
        for (int bufferSize = maxBytesPerChar; bufferSize <= 10; bufferSize++) {
            for (int dataSize = 1; dataSize <= 20; dataSize++) {
                testIO_356(bufferSize, dataSize, 0, csName);
            }
        }
    }

    @Test
    public void testIO_356_Loop_UTF16() throws Exception {
        testIO_356_Loop("UTF-16", 4);
    }

    @Test
    public void testIO_356_Loop_UTF8() throws Exception {
        testIO_356_Loop("UTF-8", 4);
    }

    @Test
    public void testLargeBufferedRead_RequiredCharsets() throws IOException {
        for (final String csName : getRequiredCharsetNames()) {
            testBufferedRead(LARGE_TEST_STRING, csName);
        }
    }

    @Test
    public void testLargeBufferedRead_UTF8() throws IOException {
        testBufferedRead(LARGE_TEST_STRING, "UTF-8");
    }

    @Test
    public void testLargeSingleByteRead_RequiredCharsets() throws IOException {
        for (final String csName : getRequiredCharsetNames()) {
            testSingleByteRead(LARGE_TEST_STRING, csName);
        }
    }

    @Test
    public void testLargeSingleByteRead_UTF8() throws IOException {
        testSingleByteRead(LARGE_TEST_STRING, "UTF-8");
    }

    // This test is broken for charsets that don't create a single byte for each char
    /**
     * Skips two bytes of {@code "test"}, marks, reads to EOF, resets and reads to EOF again,
     * then checks that repeated resets are accepted.
     *
     * @param csName the charset name to encode with.
     * @throws Exception if reading fails.
     */
    private void testMarkReset(final String csName) throws Exception {
        try (InputStream r = new CharSequenceInputStream("test", csName)) {
            assertEquals(2, r.skip(2));
            r.mark(0);
            assertEquals('s', r.read(), csName);
            assertEquals('t', r.read(), csName);
            assertEquals(-1, r.read(), csName);
            r.reset();
            assertEquals('s', r.read(), csName);
            assertEquals('t', r.read(), csName);
            assertEquals(-1, r.read(), csName);
            r.reset();
            r.reset();
        }
    }

    @Test
    @Disabled("Test broken for charsets that create multiple bytes for a single char")
    public void testMarkReset_RequiredCharsets() throws Exception {
        for (final String csName : getRequiredCharsetNames()) {
            testMarkReset(csName);
        }
    }

    @Test
    public void testMarkReset_USASCII() throws Exception {
        testMarkReset("US-ASCII");
    }

    @Test
    public void testMarkReset_UTF8() throws Exception {
        testMarkReset("UTF-8");
    }

    @Test
    public void testMarkSupported() throws Exception {
        try (InputStream r = new CharSequenceInputStream("test", "UTF-8")) {
            assertTrue(r.markSupported());
        }
    }

    /**
     * Checks that a zero-length read on a non-empty stream returns 0.
     *
     * @param csName the charset name to encode with.
     * @throws Exception if reading fails.
     */
    private void testReadZero(final String csName) throws Exception {
        try (InputStream r = new CharSequenceInputStream("test", csName)) {
            final byte[] bytes = new byte[30];
            assertEquals(0, r.read(bytes, 0, 0));
        }
    }

    @Test
    public void testReadZero_EmptyString() throws Exception {
        try (InputStream r = new CharSequenceInputStream("", "UTF-8")) {
            final byte[] bytes = new byte[30];
            assertEquals(0, r.read(bytes, 0, 0));
        }
    }

    @Test
    public void testReadZero_RequiredCharsets() throws Exception {
        for (final String csName : getRequiredCharsetNames()) {
            testReadZero(csName);
        }
    }

    /**
     * Reads the stream one byte at a time and checks each value against
     * {@code testString.getBytes(charsetName)}, then checks that EOF follows.
     *
     * @param testString the text to wrap in the stream.
     * @param charsetName the charset used both by the stream and to compute the expected bytes.
     * @throws IOException if reading fails.
     */
    private void testSingleByteRead(final String testString, final String charsetName) throws IOException {
        final byte[] bytes = testString.getBytes(charsetName);
        try (InputStream in = new CharSequenceInputStream(testString, charsetName, 512)) {
            for (final byte b : bytes) {
                final int read = in.read();
                assertTrue(read >= 0, "read " + read + " >=0 ");
                assertTrue(read <= 255, "read " + read + " <= 255");
                assertEquals(b, (byte) read, "Should agree with input");
            }
            assertEquals(-1, in.read());
        }
    }

    @Test
    public void testSingleByteRead_RequiredCharsets() throws IOException {
        for (final String csName : getRequiredCharsetNames()) {
            testSingleByteRead(TEST_STRING, csName);
        }
    }

    @Test
    public void testSingleByteRead_UTF16() throws IOException {
        testSingleByteRead(TEST_STRING, "UTF-16");
    }

    @Test
    public void testSingleByteRead_UTF8() throws IOException {
        testSingleByteRead(TEST_STRING, "UTF-8");
    }

    // This is broken for charsets that don't map each char to a byte
    /**
     * Skips three bytes of {@code "test"} in two steps, reads the final {@code 't'}, then skips
     * past the end and checks for EOF.
     *
     * @param csName the charset name to encode with.
     * @throws Exception if reading fails.
     */
    private void testSkip(final String csName) throws Exception {
        try (InputStream r = new CharSequenceInputStream("test", csName)) {
            assertEquals(1, r.skip(1));
            assertEquals(2, r.skip(2));
            assertEquals('t', r.read(), csName);
            r.skip(100);
            assertEquals(-1, r.read(), csName);
        }
    }

    @Test
    @Disabled("test is broken for charsets that generate multiple bytes per char.")
    public void testSkip_RequiredCharsets() throws Exception {
        for (final String csName : getRequiredCharsetNames()) {
            testSkip(csName);
        }
    }

    @Test
    public void testSkip_USASCII() throws Exception {
        testSkip("US-ASCII");
    }

    @Test
    public void testSkip_UTF8() throws Exception {
        testSkip("UTF-8");
    }

    /**
     * Asserts that {@code is.available()} is at least {@code min}.
     *
     * @param is the stream to query.
     * @param min the minimum acceptable value.
     * @return the value reported by {@code available()}.
     * @throws Exception if the stream fails.
     */
    private int checkAvail(final InputStream is, final int min) throws Exception {
        final int available = is.available();
        assertTrue(available >= min, "avail should be >= " + min + ", but was " + available);
        return available;
    }

    /**
     * Checks that the number of bytes reported by {@code available()} can actually be skipped.
     *
     * @param csName the charset name to encode with.
     * @throws Exception if the stream fails.
     */
    private void testAvailableSkip(final String csName) throws Exception {
        final String input = "test";
        try (InputStream r = new CharSequenceInputStream(input, csName)) {
            int available = checkAvail(r, input.length());
            assertEquals(available - 1, r.skip(available - 1)); // skip all but one
            available = checkAvail(r, 1);
            assertEquals(1, r.skip(1));
            checkAvail(r, 0);
        }
    }

    /**
     * Checks that, after skipping all but one of the initially available bytes, the number of
     * bytes then reported by {@code available()} can actually be read.
     *
     * @param csName the charset name to encode with.
     * @throws Exception if the stream fails.
     */
    private void testAvailableRead(final String csName) throws Exception {
        final String input = "test";
        try (InputStream r = new CharSequenceInputStream(input, csName)) {
            int available = checkAvail(r, input.length());
            assertEquals(available - 1, r.skip(available - 1)); // skip all but one
            available = checkAvail(r, 1);
            final byte[] buff = new byte[available];
            assertEquals(available, r.read(buff, 0, available));
        }
    }

    @Test
    public void testAvailable() throws Exception {
        for (final String csName : Charset.availableCharsets().keySet()) {
            // prevent java.lang.UnsupportedOperationException at sun.nio.cs.ext.ISO2022_CN.newEncoder.
            // also try and avoid the following Error on Continuum
            // java.lang.UnsupportedOperationException: null
            // at java.nio.CharBuffer.array(CharBuffer.java:940)
            // at sun.nio.cs.ext.COMPOUND_TEXT_Encoder.encodeLoop(COMPOUND_TEXT_Encoder.java:75)
            // at java.nio.charset.CharsetEncoder.encode(CharsetEncoder.java:544)
            // at org.apache.commons.io.input.CharSequenceInputStream.fillBuffer(CharSequenceInputStream.java:120)
            // at org.apache.commons.io.input.CharSequenceInputStream.read(CharSequenceInputStream.java:151)
            // at org.apache.commons.io.input.CharSequenceInputStreamTest.testAvailableRead(CharSequenceInputStreamTest.java:412)
            // at org.apache.commons.io.input.CharSequenceInputStreamTest.testAvailable(CharSequenceInputStreamTest.java:424)
            try {
                if (isAvailabilityTestableForCharset(csName)) {
                    testAvailableSkip(csName);
                    testAvailableRead(csName);
                }
            } catch (final UnsupportedOperationException e) {
                // keep the original exception as the cause so its stack trace is reported
                fail("Operation not supported for " + csName, e);
            }
        }
    }

    /**
     * Tests whether the named charset can be exercised by the "available charset" tests.
     *
     * @param csName the charset name.
     * @return {@code true} if the charset can encode, is not a COMPOUND_TEXT variant and is not
     *         one of the charsets excluded by
     *         {@link #isOddBallLegacyCharsetThatDoesNotSupportFrenchCharacters(String)}.
     */
    private boolean isAvailabilityTestableForCharset(final String csName) {
        return Charset.forName(csName).canEncode()
                && !"COMPOUND_TEXT".equalsIgnoreCase(csName) && !"x-COMPOUND_TEXT".equalsIgnoreCase(csName)
                && !isOddBallLegacyCharsetThatDoesNotSupportFrenchCharacters(csName);
    }

    /**
     * Tests whether the named charset is one of the legacy charsets excluded from the
     * "available charset" tests.
     *
     * @param csName the charset name, compared case-insensitively.
     * @return {@code true} for x-IBM1388, ISO-2022-CN, ISO-2022-JP and Shift_JIS.
     */
    private boolean isOddBallLegacyCharsetThatDoesNotSupportFrenchCharacters(final String csName) {
        return "x-IBM1388".equalsIgnoreCase(csName) ||
                "ISO-2022-CN".equalsIgnoreCase(csName) ||
                "ISO-2022-JP".equalsIgnoreCase(csName) ||
                "Shift_JIS".equalsIgnoreCase(csName);
    }
}