| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package io; |
| |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.InputStreamReader; |
| import java.io.Reader; |
| |
| import org.apache.xerces.impl.io.UTF16Reader; |
| import org.apache.xerces.util.XMLChar; |
| |
| /** |
| * This program tests the customized UTF-16 reader for the parser, |
| * comparing it with the Java UTF-16 reader. |
| * |
| * @version $Id$ |
| */ |
| public class UTF16 { |
| |
| // |
| // MAIN |
| // |
| |
| /** Main program entry. */ |
| public static void main(String[] argv) throws Exception { |
| testUTF16Decoder(true); |
| testUTF16Decoder(false); |
| } // main(String[]) |
| |
| // |
| // Public static methods |
| // |
| |
| public static void testUTF16Decoder(boolean isBigEndian) throws Exception { |
| |
| final int BLOCK_READ_SIZE = 2048; |
| final String encoding = isBigEndian ? "UnicodeBig" : "UnicodeLittle"; |
| final String shortName = isBigEndian ? "BE" : "LE"; |
| |
| // |
| // Test Java reference implementation of UTF-16 decoder |
| // |
| |
| System.err.println("#"); |
| System.err.println("# Testing Java UTF-16" + shortName + " decoder"); |
| System.err.println("#"); |
| |
| // test character by character |
| try { |
| InputStream stream = new UTF16Producer(isBigEndian); |
| Reader reader = new InputStreamReader(stream, encoding); |
| long time = testCharByChar(reader); |
| System.err.println("PASS ("+time+" ms)"); |
| reader.close(); |
| } |
| catch (IOException e) { |
| System.err.println("FAIL: "+e.getMessage()); |
| } |
| |
| // test character array |
| try { |
| InputStream stream = new UTF16Producer(isBigEndian); |
| Reader reader = new InputStreamReader(stream, encoding); |
| long time = testCharArray(reader, BLOCK_READ_SIZE); |
| System.err.println("PASS ("+time+" ms)"); |
| reader.close(); |
| } |
| catch (IOException e) { |
| System.err.println("FAIL: "+e.getMessage()); |
| } |
| |
| // |
| // Test custom implementation of UTF-16 decoder |
| // |
| |
| System.err.println("#"); |
| System.err.println("# Testing custom UTF-16" + shortName + " decoder"); |
| System.err.println("#"); |
| |
| // test character by character |
| try { |
| InputStream stream = new UTF16Producer(isBigEndian); |
| Reader reader = new UTF16Reader(stream, isBigEndian); |
| long time = testCharByChar(reader); |
| System.err.println("PASS ("+time+" ms)"); |
| reader.close(); |
| } |
| catch (IOException e) { |
| System.err.println("FAIL: "+e.getMessage()); |
| } |
| |
| // test character array |
| try { |
| InputStream stream = new UTF16Producer(isBigEndian); |
| Reader reader = new UTF16Reader(stream, isBigEndian); |
| long time = testCharArray(reader, BLOCK_READ_SIZE); |
| System.err.println("PASS ("+time+" ms)"); |
| reader.close(); |
| } |
| catch (IOException e) { |
| System.err.println("FAIL: "+e.getMessage()); |
| } |
| } |
| |
| /** This function tests the specified reader character by character. */ |
| public static long testCharByChar(Reader reader) throws Exception { |
| |
| long before = System.currentTimeMillis(); |
| System.err.println("# Testing character by character"); |
| |
| System.err.println("testing 0x000000 -> 0x00D7FF"); |
| for (int i = 0; i < 0xD800; i++) { |
| int c = reader.read(); |
| if (c != i) { |
| UTF8.expectedChar(null, i, c); |
| } |
| } |
| System.err.println("testing 0x00E000 -> 0x00FFFD"); |
| for (int i = 0xE000; i < 0xFFFE; i++) { |
| int c = reader.read(); |
| if (c != i) { |
| UTF8.expectedChar(null, i, c); |
| } |
| } |
| System.err.println("testing 0x010000 -> 0x10FFFF"); |
| for (int i = 0x10000; i < 0x110000; i++) { |
| // vars |
| int uuuuu = (i >> 16) & 0x001F; |
| int wwww = uuuuu - 1; |
| int zzzz = (i >> 12) & 0x000F; |
| int yyyyyy = (i >> 6) & 0x003F; |
| int xxxxxx = i & 0x003F; |
| int hs = 0xD800 | (wwww << 6) | (zzzz << 2) | (yyyyyy >> 4); |
| int ls = 0xDC00 | ((yyyyyy << 6) & 0x03C0) | xxxxxx; |
| // high surrogate |
| int c = reader.read(); |
| if (c != hs) { |
| UTF8.expectedChar("high surrogate", hs, c); |
| } |
| // low surrogate |
| c = reader.read(); |
| if (c != ls) { |
| UTF8.expectedChar("low surrogate", ls, c); |
| } |
| } |
| System.err.println("checking EOF"); |
| int c = reader.read(); |
| if (c != -1) { |
| UTF8.extraChar(c); |
| } |
| long after = System.currentTimeMillis(); |
| |
| return after - before; |
| |
| } // testCharByChar(Reader):long |
| |
| /** |
| * This function tests the given reader by performing block character |
| * reads of the specified size. |
| */ |
| public static long testCharArray(Reader reader, int size) throws Exception { |
| |
| long before = System.currentTimeMillis(); |
| System.err.println("# Testing character array of size "+size); |
| |
| char[] ch = new char[size]; |
| int count = 0; |
| int position = 0; |
| |
| System.err.println("testing 0x000000 -> 0x00D7FF"); |
| for (int i = 0; i < 0xD800; i++) { |
| if (position == count) { |
| count = UTF8.load(reader, ch); |
| position = 0; |
| } |
| int c = ch[position++]; |
| if (c != i) { |
| UTF8.expectedChar(null, i, c); |
| } |
| } |
| System.err.println("testing 0x00E000 -> 0x00FFFD"); |
| for (int i = 0xE000; i < 0xFFFE; i++) { |
| if (position == count) { |
| count = UTF8.load(reader, ch); |
| position = 0; |
| } |
| int c = ch[position++]; |
| if (c != i) { |
| UTF8.expectedChar(null, i, c); |
| } |
| } |
| System.err.println("testing 0x010000 -> 0x110000"); |
| for (int i = 0x10000; i < 0x110000; i++) { |
| // vars |
| int uuuuu = (i >> 16) & 0x001F; |
| int wwww = uuuuu - 1; |
| int zzzz = (i >> 12) & 0x000F; |
| int yyyyyy = (i >> 6) & 0x003F; |
| int xxxxxx = i & 0x003F; |
| int hs = 0xD800 | (wwww << 6) | (zzzz << 2) | (yyyyyy >> 4); |
| int ls = 0xDC00 | ((yyyyyy << 6) & 0x03C0) | xxxxxx; |
| // high surrogate |
| if (position == count) { |
| count = UTF8.load(reader, ch); |
| position = 0; |
| } |
| int c = ch[position++]; |
| if (c != hs) { |
| UTF8.expectedChar("high surrogate", hs, c); |
| } |
| // low surrogate |
| if (position == count) { |
| count = UTF8.load(reader, ch); |
| position = 0; |
| } |
| c = ch[position++]; |
| if (c != ls) { |
| UTF8.expectedChar("low surrogate", ls, c); |
| } |
| } |
| System.err.println("checking EOF"); |
| if (position == count) { |
| count = UTF8.load(reader, ch); |
| position = 0; |
| } |
| if (count != -1) { |
| UTF8.extraChar(ch[position]); |
| } |
| long after = System.currentTimeMillis(); |
| |
| return after - before; |
| |
| } // testCharArray(Reader):long |
| |
| // |
| // Classes |
| // |
| |
| /** |
| * This classes produces a stream of UTF-16 byte sequences for all |
| * valid Unicode characters. |
| */ |
| public static class UTF16Producer |
| extends InputStream { |
| |
| // |
| // Data |
| // |
| |
| /** The current code point. */ |
| private int fCodePoint; |
| |
| /** The current byte of the current code point. */ |
| private int fByte; |
| |
| /** Endianness. */ |
| private final boolean fIsBigEndian; |
| |
| // |
| // Constructors |
| // |
| |
| public UTF16Producer(boolean isBigEndian) { |
| fIsBigEndian = isBigEndian; |
| } |
| |
| // |
| // InputStream methods |
| // |
| |
| /** Reads the next character. */ |
| public int read() throws IOException { |
| |
| if (fCodePoint < 0xFFFE) { |
| // skip surrogate blocks |
| if (fCodePoint == 0xD800) { |
| fCodePoint = 0xE000; |
| } |
| switch (fByte) { |
| case 0: { |
| final int b; |
| if (fIsBigEndian) { |
| b = fCodePoint >> 8; |
| } |
| else { |
| b = fCodePoint & 0xff; |
| } |
| fByte++; |
| return b; |
| } |
| case 1: { |
| final int b; |
| if (fIsBigEndian) { |
| b = fCodePoint & 0xff; |
| } |
| else { |
| b = fCodePoint >> 8; |
| } |
| fCodePoint++; |
| fByte = 0; |
| return b; |
| } |
| default: { |
| throw new RuntimeException("byte "+fByte+" of 2 byte UTF-8 sequence"); |
| } |
| } |
| } |
| if (fCodePoint == 0xFFFE) { |
| fCodePoint = 0x10000; |
| } |
| if (fCodePoint < 0x110000) { |
| switch (fByte) { |
| case 0: { |
| final int b; |
| if (fIsBigEndian) { |
| b = XMLChar.highSurrogate(fCodePoint) >> 8; |
| } |
| else { |
| b = XMLChar.highSurrogate(fCodePoint) & 0xff; |
| } |
| fByte++; |
| return b; |
| } |
| case 1: { |
| final int b; |
| if (fIsBigEndian) { |
| b = XMLChar.highSurrogate(fCodePoint) & 0xff; |
| } |
| else { |
| b = XMLChar.highSurrogate(fCodePoint) >> 8; |
| } |
| fByte++; |
| return b; |
| } |
| case 2: { |
| final int b; |
| if (fIsBigEndian) { |
| b = XMLChar.lowSurrogate(fCodePoint) >> 8; |
| } |
| else { |
| b = XMLChar.lowSurrogate(fCodePoint) & 0xff; |
| } |
| fByte++; |
| return b; |
| } |
| case 3: { |
| final int b; |
| if (fIsBigEndian) { |
| b = XMLChar.lowSurrogate(fCodePoint) & 0xff; |
| } |
| else { |
| b = XMLChar.lowSurrogate(fCodePoint) >> 8; |
| } |
| fCodePoint++; |
| fByte = 0; |
| return b; |
| } |
| default: { |
| throw new RuntimeException("byte "+fByte+" of 2 byte UTF-8 sequence"); |
| } |
| } |
| } |
| return -1; |
| } |
| } |
| } |