| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.tomcat.util.buf; |
| |
| import java.nio.ByteBuffer; |
| import java.nio.CharBuffer; |
| import java.nio.charset.CharsetDecoder; |
| import java.nio.charset.CoderResult; |
| import java.nio.charset.StandardCharsets; |
| |
/**
 * Decodes UTF-8 encoded bytes to characters. Extracted from Apache Harmony and
 * modified to reject code points from U+D800 to U+DFFF as per RFC3629. The
 * standard Java decoder does not reject these. It has also been modified to
 * reject code points greater than U+10FFFF which the standard Java decoder
 * rejects but the Harmony one does not.
 * <p>
 * Whenever a decode method returns a malformed / unmappable result or
 * {@link CoderResult#OVERFLOW}, the input buffer's position is left on the
 * first byte of the sequence that could not be processed so that the
 * {@link CharsetDecoder} framework (or the caller) can skip or retry it.
 */
public class Utf8Decoder extends CharsetDecoder {

    // The next table contains information about UTF-8 charset and
    // correspondence of 1st byte to the length of sequence
    // For information please visit http://www.ietf.org/rfc/rfc3629.txt
    //
    // Please note, o means 0, actually.
    // -------------------------------------------------------------------
    // 0        1        2        3        Value
    // -------------------------------------------------------------------
    // oxxxxxxx                            00000000 00000000 0xxxxxxx
    // 11oyyyyy 1oxxxxxx                   00000000 00000yyy yyxxxxxx
    // 111ozzzz 1oyyyyyy 1oxxxxxx          00000000 zzzzyyyy yyxxxxxx
    // 1111ouuu 1ouuzzzz 1oyyyyyy 1oxxxxxx 000uuuuu zzzzyyyy yyxxxxxx

    /**
     * Number of continuation bytes expected after a lead byte, indexed by
     * {@code leadByte & 0x7F}. {@code -1} marks a byte that is not a valid
     * lead byte.
     */
    private static final int[] REMAINING_BYTES = {
            // 1owwwwww
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            // 11oyyyyy
            -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            // 111ozzzz
            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
            // 1111ouuu
            3, 3, 3, 3, 3, -1, -1, -1,
            // > 11110111
            -1, -1, -1, -1, -1, -1, -1, -1};

    /**
     * Value to subtract from the accumulated bits to strip the marker bits of
     * the lead byte and of each continuation byte, indexed by the number of
     * continuation bytes.
     */
    private static final int[] REMAINING_NUMBERS = {
            0,
            4224,     // (01o00000b << 6) + (1o000000b)
            401536,   // (011o0000b << 12) + (1o000000b << 6) + (1o000000b)
            29892736  // (0111o000b << 18) + (1o000000b << 12) + (1o000000b << 6) + (1o000000b)
    };

    /**
     * Smallest code point that may legally be encoded with the given number
     * of continuation bytes; anything lower is an overlong encoding.
     */
    private static final int[] LOWER_ENCODING_LIMIT = {-1, 0x80, 0x800, 0x10000};


    /**
     * Creates a decoder for {@link StandardCharsets#UTF_8} that reports an
     * average and maximum of one char per input byte.
     */
    public Utf8Decoder() {
        super(StandardCharsets.UTF_8, 1.0f, 1.0f);
    }


    /**
     * Decodes as many bytes as possible from {@code in} into {@code out},
     * using direct array access when both buffers expose a backing array.
     *
     * @param in  the input bytes
     * @param out the output characters
     * @return the coder result describing why decoding stopped
     */
    @Override
    protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
        if (in.hasArray() && out.hasArray()) {
            return decodeHasArray(in, out);
        }
        return decodeNotHasArray(in, out);
    }


    /**
     * Decodes using only the relative {@code get()}/{@code put()} buffer
     * operations, for buffers without an accessible backing array.
     *
     * @param in  the input bytes
     * @param out the output characters
     * @return the coder result describing why decoding stopped
     */
    private CoderResult decodeNotHasArray(ByteBuffer in, CharBuffer out) {
        int outRemaining = out.remaining();
        // pos always addresses the first byte of the sequence currently being
        // decoded. It only moves forward once that sequence has been written
        // to the output, so every early return below leaves the input
        // positioned at the start of the sequence that caused it.
        int pos = in.position();
        final int limit = in.limit();
        try {
            while (pos < limit) {
                if (outRemaining == 0) {
                    return CoderResult.OVERFLOW;
                }
                int jchar = in.get();
                int tail = 0;
                if (jchar < 0) {
                    jchar &= 0x7F;
                    tail = REMAINING_BYTES[jchar];
                    if (tail == -1) {
                        return CoderResult.malformedForLength(1);
                    }
                    if (limit - pos < 1 + tail) {
                        // No early test for invalid sequences here as peeking
                        // at the next byte is harder
                        return CoderResult.UNDERFLOW;
                    }
                    for (int i = 0; i < tail; i++) {
                        int nextByte = in.get() & 0xFF;
                        if ((nextByte & 0xC0) != 0x80) {
                            return CoderResult.malformedForLength(1 + i);
                        }
                        jchar = (jchar << 6) + nextByte;
                    }
                    jchar -= REMAINING_NUMBERS[tail];
                    if (jchar < LOWER_ENCODING_LIMIT[tail]) {
                        // Should have been encoded in fewer octets
                        return CoderResult.malformedForLength(1);
                    }
                }
                // Apache Tomcat added test
                if (jchar >= 0xD800 && jchar <= 0xDFFF) {
                    return CoderResult.unmappableForLength(3);
                }
                // Apache Tomcat added test
                if (jchar > 0x10FFFF) {
                    return CoderResult.unmappableForLength(4);
                }
                if (jchar <= 0xFFFF) {
                    out.put((char) jchar);
                    outRemaining--;
                } else {
                    if (outRemaining < 2) {
                        // Not enough room for the surrogate pair. The
                        // sequence is re-read on the next call.
                        return CoderResult.OVERFLOW;
                    }
                    out.put((char) ((jchar >> 0xA) + 0xD7C0));
                    out.put((char) ((jchar & 0x3FF) + 0xDC00));
                    outRemaining -= 2;
                }
                pos += 1 + tail;
            }
            return CoderResult.UNDERFLOW;
        } finally {
            // The relative get() calls above may have moved the position into
            // the middle of a sequence; reset it to the last fully consumed
            // boundary.
            in.position(pos);
        }
    }


    /**
     * Decodes by reading and writing the buffers' backing arrays directly.
     * Invalid sequences are detected as early as possible, even when the
     * sequence is not yet complete in the input.
     *
     * @param in  the input bytes; must have an accessible backing array
     * @param out the output characters; must have an accessible backing array
     * @return the coder result describing why decoding stopped
     */
    private CoderResult decodeHasArray(ByteBuffer in, CharBuffer out) {
        final byte[] bArr = in.array();
        final char[] cArr = out.array();
        final int inOffset = in.arrayOffset();
        final int outOffset = out.arrayOffset();
        final int inIndexLimit = in.limit() + inOffset;
        // inIndex addresses the first byte of the sequence being decoded
        int inIndex = in.position() + inOffset;
        int outIndex = out.position() + outOffset;
        int outRemaining = out.remaining();

        while (inIndex < inIndexLimit && outRemaining > 0) {
            int jchar = bArr[inIndex];
            int tail = 0;
            if (jchar < 0) {
                jchar &= 0x7F;
                // If first byte is invalid, tail will be set to -1
                tail = REMAINING_BYTES[jchar];
                if (tail == -1) {
                    return finish(in, out, inIndex - inOffset, outIndex - outOffset,
                            CoderResult.malformedForLength(1));
                }
                // Additional checks to detect invalid sequences ASAP
                // Checks derived from Unicode 6.2, Chapter 3, Table 3-7
                int tailAvailable = inIndexLimit - inIndex - 1;
                // Check second byte if present
                if (tailAvailable > 0 && !isValidSecondByte(jchar, bArr[inIndex + 1] & 0xFF)) {
                    return finish(in, out, inIndex - inOffset, outIndex - outOffset,
                            CoderResult.malformedForLength(1));
                }
                // Check third byte if present and expected
                if (tailAvailable > 1 && tail > 1 && (bArr[inIndex + 2] & 0xC0) != 0x80) {
                    return finish(in, out, inIndex - inOffset, outIndex - outOffset,
                            CoderResult.malformedForLength(2));
                }
                // Check fourth byte if present and expected
                if (tailAvailable > 2 && tail > 2 && (bArr[inIndex + 3] & 0xC0) != 0x80) {
                    return finish(in, out, inIndex - inOffset, outIndex - outOffset,
                            CoderResult.malformedForLength(3));
                }
                if (tailAvailable < tail) {
                    // Incomplete sequence: wait for more input
                    break;
                }
                for (int i = 0; i < tail; i++) {
                    int nextByte = bArr[inIndex + i + 1] & 0xFF;
                    if ((nextByte & 0xC0) != 0x80) {
                        return finish(in, out, inIndex - inOffset, outIndex - outOffset,
                                CoderResult.malformedForLength(1 + i));
                    }
                    jchar = (jchar << 6) + nextByte;
                }
                jchar -= REMAINING_NUMBERS[tail];
                if (jchar < LOWER_ENCODING_LIMIT[tail]) {
                    // Should have been encoded in fewer octets
                    return finish(in, out, inIndex - inOffset, outIndex - outOffset,
                            CoderResult.malformedForLength(1));
                }
            }
            // Apache Tomcat added tests. The second byte checks above already
            // reject these sequences on this code path; the tests are kept as
            // a safety net and update the buffer positions like every other
            // exit.
            if (jchar >= 0xD800 && jchar <= 0xDFFF) {
                return finish(in, out, inIndex - inOffset, outIndex - outOffset,
                        CoderResult.unmappableForLength(3));
            }
            if (jchar > 0x10FFFF) {
                return finish(in, out, inIndex - inOffset, outIndex - outOffset,
                        CoderResult.unmappableForLength(4));
            }
            if (jchar <= 0xFFFF) {
                cArr[outIndex++] = (char) jchar;
                outRemaining--;
            } else {
                if (outRemaining < 2) {
                    // Not enough room for the surrogate pair. Leave the input
                    // on the first byte of the 4 byte sequence.
                    return finish(in, out, inIndex - inOffset, outIndex - outOffset,
                            CoderResult.OVERFLOW);
                }
                cArr[outIndex++] = (char) ((jchar >> 0xA) + 0xD7C0);
                cArr[outIndex++] = (char) ((jchar & 0x3FF) + 0xDC00);
                outRemaining -= 2;
            }
            inIndex += 1 + tail;
        }
        CoderResult result = (outRemaining == 0 && inIndex < inIndexLimit) ?
                CoderResult.OVERFLOW : CoderResult.UNDERFLOW;
        return finish(in, out, inIndex - inOffset, outIndex - outOffset, result);
    }


    /**
     * Tests the second byte of a multi-byte sequence against the ranges
     * permitted for the given lead byte.
     *
     * @param lead   the lead byte masked with {@code 0x7F}; must be a valid
     *               multi-byte lead (C2..F4 before masking)
     * @param second the second byte as an unsigned value (0..255)
     * @return {@code true} if the second byte is allowed after the lead byte
     */
    private static boolean isValidSecondByte(int lead, int second) {
        switch (lead) {
            case 0x60:
                // First byte E0, second byte A0..BF
                return (second & 0xE0) == 0xA0;
            case 0x6D:
                // First byte ED, second byte 80..9F
                return (second & 0xE0) == 0x80;
            case 0x70:
                // First byte F0, second byte 90..BF
                return second >= 0x90 && second <= 0xBF;
            case 0x74:
                // First byte F4, second byte 80..8F
                return (second & 0xF0) == 0x80;
            default:
                // First byte C2..DF, E1..EC, EE..EF or F1..F3,
                // second byte 80..BF
                return (second & 0xC0) == 0x80;
        }
    }


    /**
     * Stores the given positions in both buffers and hands back the result.
     *
     * @param in     the input buffer
     * @param out    the output buffer
     * @param inPos  the new position for {@code in}
     * @param outPos the new position for {@code out}
     * @param result the result to return
     * @return {@code result}
     */
    private static CoderResult finish(ByteBuffer in, CharBuffer out, int inPos, int outPos,
            CoderResult result) {
        in.position(inPos);
        out.position(outPos);
        return result;
    }
}