blob: 2b3c7198e1e5ac676f167dbac5dd6bb62f643a74 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tomcat.util.buf;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import org.junit.Assert;
import org.junit.Test;
/**
* These tests have been written with reference to
* <a href="http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf">Unicode 6.2,
* chapter 3, section 3.9</a>.
*/
public class TestUtf8 {
// The constants below are bit flags. They are OR-ed together per test case
// (see Utf8TestCase.addForJvm) and tested individually with a bit mask in
// doTest, so several of them may apply to the same test case.
//
// Indicates that an invalid sequence is detected one byte later than the
// earliest possible moment (doTest adds 1 to the expected error index)
private static final int ERROR_POS_PLUS1 = 1;
// Indicates that an invalid sequence is detected two bytes later than the
// earliest possible moment (doTest adds 2 to the expected error index)
private static final int ERROR_POS_PLUS2 = 2;
// Indicates that an invalid sequence is detected four bytes later than the
// earliest possible moment (doTest adds 4 to the expected error index)
private static final int ERROR_POS_PLUS4 = 4;
// Indicates that the trailing valid byte is included in replacement of the
// previous error (doTest drops the last character of the expected output)
private static final int REPLACE_SWALLOWS_TRAILER = 8;
// Indicates that one replacement character is missing (doTest removes one
// character, starting at index 1, from the expected output)
private static final int REPLACE_MISSING1 = 16;
// Indicates that two replacement characters are missing (doTest removes two
// characters, starting at index 1, from the expected output)
private static final int REPLACE_MISSING2 = 32;
// Indicates that four replacement characters are missing (doTest removes
// four characters, starting at index 1, from the expected output)
private static final int REPLACE_MISSING4 = 64;
// The shared list of test cases, populated by the static initializer of
// this class.
public static final List<Utf8TestCase> TEST_CASES = new ArrayList<>();
// Number of test cases that carry at least one JVM work-around flag.
// Incremented by Utf8TestCase.addForJvm and reported by testJvmDecoder.
private static int workAroundCount = 0;
static {
// All known issues have been fixed in Java 8
// https://bugs.openjdk.java.net/browse/JDK-8039751
// Base assumption in Java 7
int javaVersion = 7;
try {
Class.forName("java.util.stream.Collector");
javaVersion = 8;
} catch (Exception e) {
// Ignore
}
Utf8TestCase testCase = null;
TEST_CASES.add(new Utf8TestCase(
"Zero length input",
new int[] {},
-1,
""));
TEST_CASES.add(new Utf8TestCase(
"Valid one byte sequence",
new int[] {0x41},
-1,
"A"));
TEST_CASES.add(new Utf8TestCase(
"Valid two byte sequence",
new int[] {0xC2, 0xA9},
-1,
"\u00A9"));
TEST_CASES.add(new Utf8TestCase(
"Valid three byte sequence",
new int[] {0xE0, 0xA4, 0x87},
-1,
"\u0907"));
TEST_CASES.add(new Utf8TestCase(
"Valid four byte sequence",
new int[] {0xF0, 0x90, 0x90, 0x80},
-1,
"\uD801\uDC00"));
// Java 7 JVM decoder does not report error until all 4 bytes are
// available
testCase = new Utf8TestCase(
"Invalid code point - out of range",
new int[] {0x41, 0xF4, 0x90, 0x80, 0x80, 0x41},
2,
"A\uFFFD\uFFFD\uFFFD\uFFFDA");
if (javaVersion < 8) {
testCase.addForJvm(ERROR_POS_PLUS2);
}
TEST_CASES.add(testCase);
// Java 7 JVM decoder does not report error until all 2 bytes are available
testCase = new Utf8TestCase(
"Valid sequence padded from one byte to two",
new int[] {0x41, 0xC0, 0xC1, 0x41},
1,
"A\uFFFD\uFFFDA");
if (javaVersion < 8) {
testCase.addForJvm(ERROR_POS_PLUS1);
}
TEST_CASES.add(testCase);
// Java 7 JVM decoder does not report error until all 3 bytes are available
testCase = new Utf8TestCase(
"Valid sequence padded from one byte to three",
new int[] {0x41, 0xE0, 0x80, 0xC1, 0x41},
2,
"A\uFFFD\uFFFD\uFFFDA");
if (javaVersion < 8) {
testCase.addForJvm(ERROR_POS_PLUS1);
}
TEST_CASES.add(testCase);
// Java 7 JVM decoder does not report error until all 4 bytes are
// available
testCase = new Utf8TestCase(
"Valid sequence padded from one byte to four",
new int[] {0x41, 0xF0, 0x80, 0x80, 0xC1, 0x41},
2,
"A\uFFFD\uFFFD\uFFFD\uFFFDA");
if (javaVersion < 8) {
testCase.addForJvm(ERROR_POS_PLUS2);
}
TEST_CASES.add(testCase);
TEST_CASES.add(new Utf8TestCase(
"Invalid one byte 1111 1111",
new int[] {0x41, 0xFF, 0x41},
1,
"A\uFFFDA"));
testCase = new Utf8TestCase(
"Invalid one byte 1111 0000",
new int[] {0x41, 0xF0, 0x41},
2,
"A\uFFFDA");
if (javaVersion < 8) {
testCase.addForJvm(REPLACE_SWALLOWS_TRAILER);
}
TEST_CASES.add(testCase);
testCase = new Utf8TestCase(
"Invalid one byte 1110 0000",
new int[] {0x41, 0xE0, 0x41},
2,
"A\uFFFDA");
if (javaVersion < 8) {
testCase.addForJvm(REPLACE_SWALLOWS_TRAILER);
}
TEST_CASES.add(testCase);
testCase = new Utf8TestCase(
"Invalid one byte 1100 0000",
new int[] {0x41, 0xC0, 0x41},
1,
"A\uFFFDA");
if (javaVersion < 8) {
testCase.addForJvm(ERROR_POS_PLUS1);
}
TEST_CASES.add(testCase);
TEST_CASES.add(new Utf8TestCase(
"Invalid one byte 1000 000",
new int[] {0x41, 0x80, 0x41},
1,
"A\uFFFDA"));
TEST_CASES.add(new Utf8TestCase(
"Invalid sequence from unicode 6.2 spec, table 3-8",
new int[] {0x61, 0xF1, 0x80, 0x80, 0xE1, 0x80, 0xC2, 0x62, 0x80,
0x63, 0x80, 0xBF, 0x64},
4,
"a\uFFFD\uFFFD\uFFFDb\uFFFDc\uFFFD\uFFFDd"));
TEST_CASES.add(new Utf8TestCase(
"Valid 4-byte sequence truncated to 3 bytes",
new int[] {0x61, 0xF0, 0x90, 0x90},
3,
"a\uFFFD"));
TEST_CASES.add(new Utf8TestCase(
"Valid 4-byte sequence truncated to 2 bytes",
new int[] {0x61, 0xF0, 0x90},
2,
"a\uFFFD"));
TEST_CASES.add(new Utf8TestCase(
"Valid 4-byte sequence truncated to 1 byte",
new int[] {0x61, 0xF0},
1,
"a\uFFFD"));
TEST_CASES.add(new Utf8TestCase(
"Valid 4-byte sequence truncated to 3 bytes with trailer",
new int[] {0x61, 0xF0, 0x90, 0x90, 0x61},
4,
"a\uFFFDa"));
testCase = new Utf8TestCase(
"Valid 4-byte sequence truncated to 2 bytes with trailer",
new int[] {0x61, 0xF0, 0x90, 0x61},
3,
"a\uFFFDa");
if (javaVersion < 8) {
testCase.addForJvm(REPLACE_SWALLOWS_TRAILER);
}
TEST_CASES.add(testCase);
testCase = new Utf8TestCase(
"Valid 4-byte sequence truncated to 1 byte with trailer",
new int[] {0x61, 0xF0, 0x61},
2,
"a\uFFFDa");
if (javaVersion < 8) {
testCase.addForJvm(REPLACE_SWALLOWS_TRAILER);
}
TEST_CASES.add(testCase);
testCase = new Utf8TestCase(
"U+0000 zero-padded to two bytes",
new int[] {0x61, 0xC0, 0x80, 0x61},
1,
"a\uFFFD\uFFFDa");
if (javaVersion < 8) {
testCase.addForJvm(ERROR_POS_PLUS1);
}
TEST_CASES.add(testCase);
testCase = new Utf8TestCase(
"U+007F zero-padded to two bytes",
new int[] {0x61, 0xC1, 0xBF, 0x61},
1,
"a\uFFFD\uFFFDa");
if (javaVersion < 8) {
testCase.addForJvm(ERROR_POS_PLUS1);
}
TEST_CASES.add(testCase);
TEST_CASES.add(new Utf8TestCase(
"Two bytes, all 1's",
new int[] {0x61, 0xFF, 0xFF, 0x61},
1,
"a\uFFFD\uFFFDa"));
testCase = new Utf8TestCase(
"Two bytes, 1110 first byte first nibble",
new int[] {0x61, 0xE0, 0x80, 0x61},
2,
"a\uFFFD\uFFFDa");
if (javaVersion < 8) {
testCase.addForJvm(ERROR_POS_PLUS1);
}
TEST_CASES.add(testCase);
TEST_CASES.add(new Utf8TestCase(
"Two bytes, 101x first byte first nibble",
new int[] {0x61, 0xA0, 0x80, 0x61},
1,
"a\uFFFD\uFFFDa"));
TEST_CASES.add(new Utf8TestCase(
"Two bytes, invalid second byte",
new int[] {0x61, 0xC2, 0x00, 0x61},
2,
"a\uFFFD\u0000a"));
TEST_CASES.add(new Utf8TestCase(
"Two bytes, invalid second byte",
new int[] {0x61, 0xC2, 0xC0, 0x61},
2,
"a\uFFFD\uFFFDa"));
testCase = new Utf8TestCase(
"Three bytes, U+0000 zero-padded",
new int[] {0x61, 0xE0, 0x80, 0x80, 0x61},
2,
"a\uFFFD\uFFFD\uFFFDa");
if (javaVersion < 8) {
testCase.addForJvm(ERROR_POS_PLUS1);
}
TEST_CASES.add(testCase);
testCase = new Utf8TestCase(
"Three bytes, U+007F zero-padded",
new int[] {0x61, 0xE0, 0x81, 0xBF, 0x61},
2,
"a\uFFFD\uFFFD\uFFFDa");
if (javaVersion < 8) {
testCase.addForJvm(ERROR_POS_PLUS1);
}
TEST_CASES.add(testCase);
testCase = new Utf8TestCase(
"Three bytes, U+07FF zero-padded",
new int[] {0x61, 0xE0, 0x9F, 0xBF, 0x61},
2,
"a\uFFFD\uFFFD\uFFFDa");
if (javaVersion < 8) {
testCase.addForJvm(ERROR_POS_PLUS1);
}
TEST_CASES.add(testCase);
TEST_CASES.add(new Utf8TestCase(
"Three bytes, all 1's",
new int[] {0x61, 0xFF, 0xFF, 0xFF, 0x61},
1,
"a\uFFFD\uFFFD\uFFFDa"));
testCase = new Utf8TestCase(
"Three bytes, invalid first byte",
new int[] {0x61, 0xF8, 0x80, 0x80, 0x61},
1,
"a\uFFFD\uFFFD\uFFFDa");
if (javaVersion < 8) {
testCase.addForJvm(REPLACE_MISSING2).addForJvm(
REPLACE_SWALLOWS_TRAILER);
}
TEST_CASES.add(testCase);
testCase = new Utf8TestCase(
"Three bytes, invalid second byte",
new int[] {0x61, 0xE0, 0xC0, 0x80, 0x61},
2,
"a\uFFFD\uFFFD\uFFFDa");
if (javaVersion < 8) {
testCase.addForJvm(ERROR_POS_PLUS1);
}
TEST_CASES.add(testCase);
TEST_CASES.add(new Utf8TestCase(
"Three bytes, invalid third byte",
new int[] {0x61, 0xE1, 0x80, 0xC0, 0x61},
3,
"a\uFFFD\uFFFDa"));
testCase = new Utf8TestCase(
"Four bytes, U+0000 zero-padded",
new int[] {0x61, 0xF0, 0x80, 0x80, 0x80, 0x61},
2,
"a\uFFFD\uFFFD\uFFFD\uFFFDa");
if (javaVersion < 8) {
testCase.addForJvm(ERROR_POS_PLUS2);
}
TEST_CASES.add(testCase);
testCase = new Utf8TestCase(
"Four bytes, U+007F zero-padded",
new int[] {0x61, 0xF0, 0x80, 0x81, 0xBF, 0x61},
2,
"a\uFFFD\uFFFD\uFFFD\uFFFDa");
if (javaVersion < 8) {
testCase.addForJvm(ERROR_POS_PLUS2);
}
TEST_CASES.add(testCase);
testCase = new Utf8TestCase(
"Four bytes, U+07FF zero-padded",
new int[] {0x61, 0xF0, 0x80, 0x9F, 0xBF, 0x61},
2,
"a\uFFFD\uFFFD\uFFFD\uFFFDa");
if (javaVersion < 8) {
testCase.addForJvm(ERROR_POS_PLUS2);
}
TEST_CASES.add(testCase);
testCase = new Utf8TestCase(
"Four bytes, U+FFFF zero-padded",
new int[] {0x61, 0xF0, 0x8F, 0xBF, 0xBF, 0x61},
2,
"a\uFFFD\uFFFD\uFFFD\uFFFDa");
if (javaVersion < 8) {
testCase.addForJvm(ERROR_POS_PLUS2);
}
TEST_CASES.add(testCase);
TEST_CASES.add(new Utf8TestCase(
"Four bytes, all 1's",
new int[] {0x61, 0xFF, 0xFF, 0xFF, 0xFF, 0x61},
1,
"a\uFFFD\uFFFD\uFFFD\uFFFDa"));
testCase = new Utf8TestCase(
"Four bytes, invalid first byte",
new int[] {0x61, 0xF8, 0x80, 0x80, 0x80, 0x61},
1,
"a\uFFFD\uFFFD\uFFFD\uFFFDa");
if (javaVersion < 8) {
testCase.addForJvm(ERROR_POS_PLUS4).addForJvm(
REPLACE_MISSING2).addForJvm(REPLACE_MISSING1);
}
TEST_CASES.add(testCase);
testCase = new Utf8TestCase(
"Four bytes, invalid second byte",
new int[] {0x61, 0xF1, 0xC0, 0x80, 0x80, 0x61},
2,
"a\uFFFD\uFFFD\uFFFD\uFFFDa");
if (javaVersion < 8) {
testCase.addForJvm(ERROR_POS_PLUS2);
}
TEST_CASES.add(testCase);
testCase = new Utf8TestCase(
"Four bytes, invalid third byte",
new int[] {0x61, 0xF1, 0x80, 0xC0, 0x80, 0x61},
3,
"a\uFFFD\uFFFD\uFFFDa");
if (javaVersion < 8) {
testCase.addForJvm(ERROR_POS_PLUS1);
}
TEST_CASES.add(testCase);
TEST_CASES.add(new Utf8TestCase(
"Four bytes, invalid fourth byte",
new int[] {0x61, 0xF1, 0x80, 0x80, 0xC0, 0x61},
4,
"a\uFFFD\uFFFDa"));
testCase = new Utf8TestCase(
"Five bytes, U+0000 zero padded",
new int[] {0x61, 0xF8, 0x80, 0x80, 0x80, 0x80, 0x61},
1,
"a\uFFFD\uFFFD\uFFFD\uFFFD\uFFFDa");
if (javaVersion < 8) {
testCase.addForJvm(ERROR_POS_PLUS4).addForJvm(REPLACE_MISSING4);
}
TEST_CASES.add(testCase);
testCase = new Utf8TestCase(
"Five bytes, U+007F zero padded",
new int[] {0x61, 0xF8, 0x80, 0x80, 0x81, 0xBF, 0x61},
1,
"a\uFFFD\uFFFD\uFFFD\uFFFD\uFFFDa");
if (javaVersion < 8) {
testCase.addForJvm(ERROR_POS_PLUS4).addForJvm(REPLACE_MISSING4);
}
TEST_CASES.add(testCase);
testCase = new Utf8TestCase(
"Five bytes, U+07FF zero padded",
new int[] {0x61, 0xF8, 0x80, 0x80, 0x9F, 0xBF, 0x61},
1,
"a\uFFFD\uFFFD\uFFFD\uFFFD\uFFFDa");
if (javaVersion < 8) {
testCase.addForJvm(ERROR_POS_PLUS4).addForJvm(REPLACE_MISSING4);
}
TEST_CASES.add(testCase);
testCase = new Utf8TestCase(
"Five bytes, U+FFFF zero padded",
new int[] {0x61, 0xF8, 0x80, 0x8F, 0xBF, 0xBF, 0x61},
1,
"a\uFFFD\uFFFD\uFFFD\uFFFD\uFFFDa");
if (javaVersion < 8) {
testCase.addForJvm(ERROR_POS_PLUS4).addForJvm(REPLACE_MISSING4);
}
TEST_CASES.add(testCase);
testCase = new Utf8TestCase(
"Six bytes, U+0000 zero padded",
new int[] {0x61, 0xFC, 0x80, 0x80, 0x80, 0x80, 0x80, 0x61},
1,
"a\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFDa");
if (javaVersion < 8) {
testCase.addForJvm(ERROR_POS_PLUS4).addForJvm(
ERROR_POS_PLUS1).addForJvm(REPLACE_MISSING4).addForJvm(
REPLACE_MISSING1);
}
TEST_CASES.add(testCase);
testCase = new Utf8TestCase(
"Six bytes, U+007F zero padded",
new int[] {0x61, 0xFC, 0x80, 0x80, 0x80, 0x81, 0xBF, 0x61},
1,
"a\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFDa");
if (javaVersion < 8) {
testCase.addForJvm(ERROR_POS_PLUS4).addForJvm(
ERROR_POS_PLUS1).addForJvm(REPLACE_MISSING4).addForJvm(
REPLACE_MISSING1);
}
TEST_CASES.add(testCase);
testCase = new Utf8TestCase(
"Six bytes, U+07FF zero padded",
new int[] {0x61, 0xFC, 0x80, 0x80, 0x80, 0x9F, 0xBF, 0x61},
1,
"a\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFDa");
if (javaVersion < 8) {
testCase.addForJvm(ERROR_POS_PLUS4).addForJvm(
ERROR_POS_PLUS1).addForJvm(REPLACE_MISSING4).addForJvm(
REPLACE_MISSING1);
}
TEST_CASES.add(testCase);
testCase = new Utf8TestCase(
"Six bytes, U+FFFF zero padded",
new int[] {0x61, 0xFC, 0x80, 0x80, 0x8F, 0xBF, 0xBF, 0x61},
1,
"a\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFDa");
if (javaVersion < 8) {
testCase.addForJvm(ERROR_POS_PLUS4).addForJvm(
ERROR_POS_PLUS1).addForJvm(REPLACE_MISSING4).addForJvm(
REPLACE_MISSING1);
}
TEST_CASES.add(testCase);
testCase = new Utf8TestCase(
"Original test case - derived from Autobahn?",
new int[] {0xCE, 0xBA, 0xE1, 0xDB, 0xB9, 0xCF, 0x83, 0xCE,
0xBC, 0xCE, 0xB5, 0xED, 0x80, 0x65, 0x64, 0x69,
0x74, 0x65, 0x64},
3,
"\u03BA\uFFFD\u06F9\u03C3\u03BC\u03B5\uFFFDedited");
if (javaVersion < 8) {
testCase.addForJvm(ERROR_POS_PLUS1);
}
TEST_CASES.add(testCase);
}
/**
 * Runs every entry of TEST_CASES through the Utf8Decoder of this package.
 * No work-around flags are applied, so the decoder has to match the
 * expected error position and replacement output exactly.
 */
@Test
public void testHarmonyDecoder() {
    final CharsetDecoder harmonyDecoder = new Utf8Decoder();
    for (int idx = 0; idx < TEST_CASES.size(); idx++) {
        doTest(harmonyDecoder, TEST_CASES.get(idx), 0);
    }
}
/**
 * Runs every entry of TEST_CASES through the JVM's own UTF-8 decoder,
 * applying each test case's JVM work-around flags. Whatever the outcome, a
 * summary of the number of work-arounds in use is written to System.err,
 * along with the number of cases completed if a failure cut the run short.
 */
@Test
public void testJvmDecoder() {
    final CharsetDecoder jvmDecoder = StandardCharsets.UTF_8.newDecoder();
    // Number of test cases that completed without throwing
    int completed = 0;
    try {
        // The counter only advances once doTest has returned normally
        for (; completed < TEST_CASES.size(); completed++) {
            final Utf8TestCase current = TEST_CASES.get(completed);
            doTest(jvmDecoder, current, current.flagsJvm);
        }
    } finally {
        System.err.println("Workarounds added to " + workAroundCount +
                " tests to account for known JVM bugs");
        if (completed < TEST_CASES.size()) {
            System.err.println("Executed " + completed + " of " +
                    TEST_CASES.size() + " UTF-8 tests before " +
                    "encountering a failure");
        }
    }
}
/**
 * Feeds the input of a test case to the decoder one byte at a time, twice.
 * <ol>
 * <li>With the decoder set to REPORT errors: if an error is reported, the
 *     index of the byte that triggered it must equal the expected index
 *     (after adjusting for any ERROR_POS_* flags).</li>
 * <li>With the decoder set to REPLACE errors: no error may be reported and
 *     the decoded text must equal the expected output (after adjusting for
 *     any REPLACE_* flags).</li>
 * </ol>
 *
 * @param decoder  the decoder under test; it is reset and reconfigured here
 * @param testCase the input bytes and expected results
 * @param flags    bit mask of ERROR_POS_* / REPLACE_* flags describing known
 *                 deviations of the decoder, or 0 for none
 */
private void doTest(CharsetDecoder decoder, Utf8TestCase testCase,
        int flags) {
    final int[] bytes = testCase.input;
    final int total = bytes.length;
    final ByteBuffer in = ByteBuffer.allocate(total);
    final CharBuffer out = CharBuffer.allocate(total);

    // Pass 1: the decoder fails on an error
    decoder.reset();
    decoder.onMalformedInput(CodingErrorAction.REPORT);
    decoder.onUnmappableCharacter(CodingErrorAction.REPORT);

    // Supply a single byte per decode call. The decoder should fail as soon
    // as an invalid sequence has been provided
    for (int pos = 0; pos < total; pos++) {
        in.put((byte) bytes[pos]);
        in.flip();
        if (decoder.decode(in, out, false).isError()) {
            Assert.assertEquals(testCase.description,
                    expectedErrorIndex(testCase.invalidIndex, flags), pos);
            break;
        }
        // Keep any undecoded bytes and switch back to filling the buffer
        in.compact();
    }

    // Pass 2: the decoder replaces on an error
    decoder.reset();
    decoder.onMalformedInput(CodingErrorAction.REPLACE);
    decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
    in.clear();
    out.clear();

    // Again supply a single byte per decode call
    for (int value : bytes) {
        in.put((byte) value);
        in.flip();
        if (decoder.decode(in, out, false).isError()) {
            Assert.fail(testCase.description);
        }
        in.compact();
    }

    // An incomplete sequence at the end of the input is only handled once
    // the decoder is told that the input has ended
    in.flip();
    if (decoder.decode(in, out, true).isError()) {
        Assert.fail(testCase.description);
    }
    out.flip();

    Assert.assertEquals(testCase.description,
            expectedReplacement(testCase.outputReplaced, flags),
            out.toString());
}

/**
 * Applies the ERROR_POS_* flags to the nominal error index. The numeric
 * value of each ERROR_POS_* flag is also the number of bytes it adds.
 */
private static int expectedErrorIndex(int invalidIndex, int flags) {
    int adjusted = invalidIndex;
    final int[] delays = {ERROR_POS_PLUS1, ERROR_POS_PLUS2, ERROR_POS_PLUS4};
    for (int delay : delays) {
        if ((flags & delay) != 0) {
            adjusted += delay;
        }
    }
    return adjusted;
}

/**
 * Applies the REPLACE_* flags to the nominal replacement output: optionally
 * drops the final character, then removes 1, 2 and/or 4 characters starting
 * at index 1 for the corresponding REPLACE_MISSING* flags.
 */
private static String expectedReplacement(String outputReplaced, int flags) {
    String adjusted = outputReplaced;
    if ((flags & REPLACE_SWALLOWS_TRAILER) != 0) {
        adjusted = adjusted.substring(0, adjusted.length() - 1);
    }
    // Each pair is {flag, number of characters to remove}
    final int[][] removals = {
        {REPLACE_MISSING1, 1},
        {REPLACE_MISSING2, 2},
        {REPLACE_MISSING4, 4},
    };
    for (int[] removal : removals) {
        if ((flags & removal[0]) != 0) {
            adjusted = adjusted.substring(0, 1) +
                    adjusted.substring(1 + removal[1]);
        }
    }
    return adjusted;
}
/**
 * Holds the data for a single UTF-8 test case: the input bytes, the
 * expected results and any work-around flags for the JVM decoder.
 */
public static class Utf8TestCase {

    /** Text used in assertion messages to identify this test case. */
    public final String description;

    /** The input bytes, one byte value per element. */
    public final int[] input;

    /**
     * Index into {@link #input} at which an error is expected to be
     * reported. The valid-input test cases in this file use -1.
     */
    public final int invalidIndex;

    /** Expected decoder output when errors are replaced. */
    public final String outputReplaced;

    /** Bit mask of work-around flags for the JVM decoder; 0 means none. */
    public int flagsJvm = 0;

    public Utf8TestCase(String description, int[] input, int invalidIndex,
            String outputReplaced) {
        this.outputReplaced = outputReplaced;
        this.invalidIndex = invalidIndex;
        this.input = input;
        this.description = description;
    }

    /**
     * Adds a work-around flag for the JVM decoder to this test case. The
     * first call on a test case that has no flags yet also increments the
     * work-around counter of the enclosing test class.
     *
     * @param flag the flag to OR into {@link #flagsJvm}
     * @return this test case, so that calls can be chained
     */
    public Utf8TestCase addForJvm(int flag) {
        final boolean hadNoFlags = (flagsJvm == 0);
        flagsJvm |= flag;
        if (hadNoFlags) {
            TestUtf8.workAroundCount++;
        }
        return this;
    }
}
}