|  | /* | 
|  | * Licensed to the Apache Software Foundation (ASF) under one or more | 
|  | * contributor license agreements.  See the NOTICE file distributed with | 
|  | * this work for additional information regarding copyright ownership. | 
|  | * The ASF licenses this file to You under the Apache License, Version 2.0 | 
|  | * (the "License"); you may not use this file except in compliance with | 
|  | * the License.  You may obtain a copy of the License at | 
|  | * | 
|  | *      http://www.apache.org/licenses/LICENSE-2.0 | 
|  | * | 
|  | * Unless required by applicable law or agreed to in writing, software | 
|  | * distributed under the License is distributed on an "AS IS" BASIS, | 
|  | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
|  | * See the License for the specific language governing permissions and | 
|  | * limitations under the License. | 
|  | */ | 
|  |  | 
|  | package io; | 
|  |  | 
|  | import java.io.IOException; | 
|  | import java.io.InputStream; | 
|  | import java.io.InputStreamReader; | 
|  | import java.io.Reader; | 
|  |  | 
|  | import org.apache.xerces.impl.io.UTF16Reader; | 
|  | import org.apache.xerces.util.XMLChar; | 
|  |  | 
|  | /** | 
|  | * This program tests the customized UTF-16 reader for the parser, | 
|  | * comparing it with the Java UTF-16 reader. | 
|  | * | 
|  | * @version $Id$ | 
|  | */ | 
|  | public class UTF16 { | 
|  |  | 
|  | // | 
|  | // MAIN | 
|  | // | 
|  |  | 
|  | /** Main program entry. */ | 
|  | public static void main(String[] argv) throws Exception { | 
|  | testUTF16Decoder(true); | 
|  | testUTF16Decoder(false); | 
|  | } // main(String[]) | 
|  |  | 
|  | // | 
|  | // Public static methods | 
|  | // | 
|  |  | 
|  | public static void testUTF16Decoder(boolean isBigEndian) throws Exception { | 
|  |  | 
|  | final int BLOCK_READ_SIZE = 2048; | 
|  | final String encoding = isBigEndian ? "UnicodeBig" : "UnicodeLittle"; | 
|  | final String shortName = isBigEndian ? "BE" : "LE"; | 
|  |  | 
|  | // | 
|  | // Test Java reference implementation of UTF-16 decoder | 
|  | // | 
|  |  | 
|  | System.err.println("#"); | 
|  | System.err.println("# Testing Java UTF-16" + shortName + " decoder"); | 
|  | System.err.println("#"); | 
|  |  | 
|  | // test character by character | 
|  | try { | 
|  | InputStream stream = new UTF16Producer(isBigEndian); | 
|  | Reader reader = new InputStreamReader(stream, encoding); | 
|  | long time = testCharByChar(reader); | 
|  | System.err.println("PASS ("+time+" ms)"); | 
|  | reader.close(); | 
|  | } | 
|  | catch (IOException e) { | 
|  | System.err.println("FAIL: "+e.getMessage()); | 
|  | } | 
|  |  | 
|  | // test character array | 
|  | try { | 
|  | InputStream stream = new UTF16Producer(isBigEndian); | 
|  | Reader reader = new InputStreamReader(stream, encoding); | 
|  | long time = testCharArray(reader, BLOCK_READ_SIZE); | 
|  | System.err.println("PASS ("+time+" ms)"); | 
|  | reader.close(); | 
|  | } | 
|  | catch (IOException e) { | 
|  | System.err.println("FAIL: "+e.getMessage()); | 
|  | } | 
|  |  | 
|  | // | 
|  | // Test custom implementation of UTF-16 decoder | 
|  | // | 
|  |  | 
|  | System.err.println("#"); | 
|  | System.err.println("# Testing custom UTF-16" + shortName + " decoder"); | 
|  | System.err.println("#"); | 
|  |  | 
|  | // test character by character | 
|  | try { | 
|  | InputStream stream = new UTF16Producer(isBigEndian); | 
|  | Reader reader = new UTF16Reader(stream, isBigEndian); | 
|  | long time = testCharByChar(reader); | 
|  | System.err.println("PASS ("+time+" ms)"); | 
|  | reader.close(); | 
|  | } | 
|  | catch (IOException e) { | 
|  | System.err.println("FAIL: "+e.getMessage()); | 
|  | } | 
|  |  | 
|  | // test character array | 
|  | try { | 
|  | InputStream stream = new UTF16Producer(isBigEndian); | 
|  | Reader reader = new UTF16Reader(stream, isBigEndian); | 
|  | long time = testCharArray(reader, BLOCK_READ_SIZE); | 
|  | System.err.println("PASS ("+time+" ms)"); | 
|  | reader.close(); | 
|  | } | 
|  | catch (IOException e) { | 
|  | System.err.println("FAIL: "+e.getMessage()); | 
|  | } | 
|  | } | 
|  |  | 
|  | /** This function tests the specified reader character by character. */ | 
|  | public static long testCharByChar(Reader reader) throws Exception { | 
|  |  | 
|  | long before = System.currentTimeMillis(); | 
|  | System.err.println("# Testing character by character"); | 
|  |  | 
|  | System.err.println("testing 0x000000 -> 0x00D7FF"); | 
|  | for (int i = 0; i < 0xD800; i++) { | 
|  | int c = reader.read(); | 
|  | if (c != i) { | 
|  | UTF8.expectedChar(null, i, c); | 
|  | } | 
|  | } | 
|  | System.err.println("testing 0x00E000 -> 0x00FFFD"); | 
|  | for (int i = 0xE000; i < 0xFFFE; i++) { | 
|  | int c = reader.read(); | 
|  | if (c != i) { | 
|  | UTF8.expectedChar(null, i, c); | 
|  | } | 
|  | } | 
|  | System.err.println("testing 0x010000 -> 0x10FFFF"); | 
|  | for (int i = 0x10000; i < 0x110000; i++) { | 
|  | // vars | 
|  | int uuuuu = (i >> 16) & 0x001F; | 
|  | int wwww = uuuuu - 1; | 
|  | int zzzz = (i >> 12) & 0x000F; | 
|  | int yyyyyy = (i >> 6) & 0x003F; | 
|  | int xxxxxx = i & 0x003F; | 
|  | int hs = 0xD800 | (wwww << 6) | (zzzz << 2) | (yyyyyy >> 4); | 
|  | int ls = 0xDC00 | ((yyyyyy << 6) & 0x03C0) | xxxxxx; | 
|  | // high surrogate | 
|  | int c = reader.read(); | 
|  | if (c != hs) { | 
|  | UTF8.expectedChar("high surrogate", hs, c); | 
|  | } | 
|  | // low surrogate | 
|  | c = reader.read(); | 
|  | if (c != ls) { | 
|  | UTF8.expectedChar("low surrogate", ls, c); | 
|  | } | 
|  | } | 
|  | System.err.println("checking EOF"); | 
|  | int c = reader.read(); | 
|  | if (c != -1) { | 
|  | UTF8.extraChar(c); | 
|  | } | 
|  | long after = System.currentTimeMillis(); | 
|  |  | 
|  | return after - before; | 
|  |  | 
|  | } // testCharByChar(Reader):long | 
|  |  | 
|  | /** | 
|  | * This function tests the given reader by performing block character | 
|  | * reads of the specified size. | 
|  | */ | 
|  | public static long testCharArray(Reader reader, int size) throws Exception { | 
|  |  | 
|  | long before = System.currentTimeMillis(); | 
|  | System.err.println("# Testing character array of size "+size); | 
|  |  | 
|  | char[] ch = new char[size]; | 
|  | int count = 0; | 
|  | int position = 0; | 
|  |  | 
|  | System.err.println("testing 0x000000 -> 0x00D7FF"); | 
|  | for (int i = 0; i < 0xD800; i++) { | 
|  | if (position == count) { | 
|  | count = UTF8.load(reader, ch); | 
|  | position = 0; | 
|  | } | 
|  | int c = ch[position++]; | 
|  | if (c != i) { | 
|  | UTF8.expectedChar(null, i, c); | 
|  | } | 
|  | } | 
|  | System.err.println("testing 0x00E000 -> 0x00FFFD"); | 
|  | for (int i = 0xE000; i < 0xFFFE; i++) { | 
|  | if (position == count) { | 
|  | count = UTF8.load(reader, ch); | 
|  | position = 0; | 
|  | } | 
|  | int c = ch[position++]; | 
|  | if (c != i) { | 
|  | UTF8.expectedChar(null, i, c); | 
|  | } | 
|  | } | 
|  | System.err.println("testing 0x010000 -> 0x110000"); | 
|  | for (int i = 0x10000; i < 0x110000; i++) { | 
|  | // vars | 
|  | int uuuuu = (i >> 16) & 0x001F; | 
|  | int wwww = uuuuu - 1; | 
|  | int zzzz = (i >> 12) & 0x000F; | 
|  | int yyyyyy = (i >> 6) & 0x003F; | 
|  | int xxxxxx = i & 0x003F; | 
|  | int hs = 0xD800 | (wwww << 6) | (zzzz << 2) | (yyyyyy >> 4); | 
|  | int ls = 0xDC00 | ((yyyyyy << 6) & 0x03C0) | xxxxxx; | 
|  | // high surrogate | 
|  | if (position == count) { | 
|  | count = UTF8.load(reader, ch); | 
|  | position = 0; | 
|  | } | 
|  | int c = ch[position++]; | 
|  | if (c != hs) { | 
|  | UTF8.expectedChar("high surrogate", hs, c); | 
|  | } | 
|  | // low surrogate | 
|  | if (position == count) { | 
|  | count = UTF8.load(reader, ch); | 
|  | position = 0; | 
|  | } | 
|  | c = ch[position++]; | 
|  | if (c != ls) { | 
|  | UTF8.expectedChar("low surrogate", ls, c); | 
|  | } | 
|  | } | 
|  | System.err.println("checking EOF"); | 
|  | if (position == count) { | 
|  | count = UTF8.load(reader, ch); | 
|  | position = 0; | 
|  | } | 
|  | if (count != -1) { | 
|  | UTF8.extraChar(ch[position]); | 
|  | } | 
|  | long after = System.currentTimeMillis(); | 
|  |  | 
|  | return after - before; | 
|  |  | 
|  | } // testCharArray(Reader):long | 
|  |  | 
|  | // | 
|  | // Classes | 
|  | // | 
|  |  | 
|  | /** | 
|  | * This classes produces a stream of UTF-16 byte sequences for all | 
|  | * valid Unicode characters. | 
|  | */ | 
|  | public static class UTF16Producer | 
|  | extends InputStream { | 
|  |  | 
|  | // | 
|  | // Data | 
|  | // | 
|  |  | 
|  | /** The current code point. */ | 
|  | private int fCodePoint; | 
|  |  | 
|  | /** The current byte of the current code point. */ | 
|  | private int fByte; | 
|  |  | 
|  | /** Endianness. */ | 
|  | private final boolean fIsBigEndian; | 
|  |  | 
|  | // | 
|  | // Constructors | 
|  | // | 
|  |  | 
|  | public UTF16Producer(boolean isBigEndian) { | 
|  | fIsBigEndian = isBigEndian; | 
|  | } | 
|  |  | 
|  | // | 
|  | // InputStream methods | 
|  | // | 
|  |  | 
|  | /** Reads the next character. */ | 
|  | public int read() throws IOException { | 
|  |  | 
|  | if (fCodePoint < 0xFFFE) { | 
|  | // skip surrogate blocks | 
|  | if (fCodePoint == 0xD800) { | 
|  | fCodePoint = 0xE000; | 
|  | } | 
|  | switch (fByte) { | 
|  | case 0: { | 
|  | final int b; | 
|  | if (fIsBigEndian) { | 
|  | b = fCodePoint >> 8; | 
|  | } | 
|  | else { | 
|  | b = fCodePoint & 0xff; | 
|  | } | 
|  | fByte++; | 
|  | return b; | 
|  | } | 
|  | case 1: { | 
|  | final int b; | 
|  | if (fIsBigEndian) { | 
|  | b = fCodePoint & 0xff; | 
|  | } | 
|  | else { | 
|  | b = fCodePoint >> 8; | 
|  | } | 
|  | fCodePoint++; | 
|  | fByte = 0; | 
|  | return b; | 
|  | } | 
|  | default: { | 
|  | throw new RuntimeException("byte "+fByte+" of 2 byte UTF-8 sequence"); | 
|  | } | 
|  | } | 
|  | } | 
|  | if (fCodePoint == 0xFFFE) { | 
|  | fCodePoint = 0x10000; | 
|  | } | 
|  | if (fCodePoint < 0x110000) { | 
|  | switch (fByte) { | 
|  | case 0: { | 
|  | final int b; | 
|  | if (fIsBigEndian) { | 
|  | b = XMLChar.highSurrogate(fCodePoint) >> 8; | 
|  | } | 
|  | else { | 
|  | b = XMLChar.highSurrogate(fCodePoint) & 0xff; | 
|  | } | 
|  | fByte++; | 
|  | return b; | 
|  | } | 
|  | case 1: { | 
|  | final int b; | 
|  | if (fIsBigEndian) { | 
|  | b = XMLChar.highSurrogate(fCodePoint) & 0xff; | 
|  | } | 
|  | else { | 
|  | b = XMLChar.highSurrogate(fCodePoint) >> 8; | 
|  | } | 
|  | fByte++; | 
|  | return b; | 
|  | } | 
|  | case 2: { | 
|  | final int b; | 
|  | if (fIsBigEndian) { | 
|  | b = XMLChar.lowSurrogate(fCodePoint) >> 8; | 
|  | } | 
|  | else { | 
|  | b = XMLChar.lowSurrogate(fCodePoint) & 0xff; | 
|  | } | 
|  | fByte++; | 
|  | return b; | 
|  | } | 
|  | case 3: { | 
|  | final int b; | 
|  | if (fIsBigEndian) { | 
|  | b = XMLChar.lowSurrogate(fCodePoint) & 0xff; | 
|  | } | 
|  | else { | 
|  | b = XMLChar.lowSurrogate(fCodePoint) >> 8; | 
|  | } | 
|  | fCodePoint++; | 
|  | fByte = 0; | 
|  | return b; | 
|  | } | 
|  | default: { | 
|  | throw new RuntimeException("byte "+fByte+" of 2 byte UTF-8 sequence"); | 
|  | } | 
|  | } | 
|  | } | 
|  | return -1; | 
|  | } | 
|  | } | 
|  | } |