| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package io; |
| |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.InputStreamReader; |
| import java.io.Reader; |
| |
| import org.apache.xerces.impl.io.UTF8Reader; |
| |
| /** |
| * This program tests the customized UTF-8 reader for the parser, |
| * comparing it with the Java UTF-8 reader. Interestingly, when |
| * reading character by character (as opposed to block character |
| * reads), the Java reader silently skips surrogate characters |
| * on the input! I've seen this behavior in 1.1.8, 1.2, and 1.3 |
| * under the Windows platform. |
| * |
| * @author Andy Clark, IBM |
| * |
| * @version $Id$ |
| */ |
| public class UTF8 { |
| |
| // |
| // MAIN |
| // |
| |
| /** Main program entry. */ |
| public static void main(String[] argv) throws Exception { |
| |
| final int BLOCK_READ_SIZE = 2048; |
| |
| // |
| // Test Java reference implementation of UTF-8 decoder |
| // |
| |
| System.err.println("#"); |
| System.err.println("# Testing Java UTF-8 decoder"); |
| System.err.println("#"); |
| |
| // test character by character |
| try { |
| InputStream stream = new UTF8Producer(); |
| Reader reader = new InputStreamReader(stream, "UTF8"); |
| long time = testCharByChar(reader); |
| System.err.println("PASS ("+time+" ms)"); |
| reader.close(); |
| } |
| catch (IOException e) { |
| System.err.println("FAIL: "+e.getMessage()); |
| } |
| |
| // test character array |
| try { |
| InputStream stream = new UTF8Producer(); |
| Reader reader = new InputStreamReader(stream, "UTF8"); |
| long time = testCharArray(reader, BLOCK_READ_SIZE); |
| System.err.println("PASS ("+time+" ms)"); |
| reader.close(); |
| } |
| catch (IOException e) { |
| System.err.println("FAIL: "+e.getMessage()); |
| } |
| |
| // |
| // Test custom implementation of UTF-8 decoder |
| // |
| |
| System.err.println("#"); |
| System.err.println("# Testing custom UTF-8 decoder"); |
| System.err.println("#"); |
| |
| // test character by character |
| try { |
| InputStream stream = new UTF8Producer(); |
| Reader reader = new UTF8Reader(stream); |
| long time = testCharByChar(reader); |
| System.err.println("PASS ("+time+" ms)"); |
| reader.close(); |
| } |
| catch (IOException e) { |
| System.err.println("FAIL: "+e.getMessage()); |
| } |
| |
| // test character array |
| try { |
| InputStream stream = new UTF8Producer(); |
| Reader reader = new UTF8Reader(stream); |
| long time = testCharArray(reader, BLOCK_READ_SIZE); |
| System.err.println("PASS ("+time+" ms)"); |
| reader.close(); |
| } |
| catch (IOException e) { |
| System.err.println("FAIL: "+e.getMessage()); |
| } |
| |
| } // main(String[]) |
| |
| // |
| // Public static methods |
| // |
| |
| /** This function tests the specified reader character by character. */ |
| public static long testCharByChar(Reader reader) throws Exception { |
| |
| long before = System.currentTimeMillis(); |
| System.err.println("# Testing character by character"); |
| |
| System.err.println("testing 0x000000 -> 0x00007F"); |
| for (int i = 0; i < 0x0080; i++) { |
| int c = reader.read(); |
| if (c != i) { |
| expectedChar(null, i, c); |
| } |
| } |
| System.err.println("testing 0x000080 -> 0x0007FF"); |
| for (int i = 0x0080; i < 0x0800; i++) { |
| int c = reader.read(); |
| if (c != i) { |
| expectedChar(null, i, c); |
| } |
| } |
| System.err.println("testing 0x000800 -> 0x00D7FF"); |
| for (int i = 0x0800; i < 0xD800; i++) { |
| int c = reader.read(); |
| if (c != i) { |
| expectedChar(null, i, c); |
| } |
| } |
| System.err.println("testing 0x00E000 -> 0x00FFFF"); |
| for (int i = 0xE000; i < 0x010000; i++) { |
| int c = reader.read(); |
| if (c != i) { |
| expectedChar(null, i, c); |
| } |
| } |
| System.err.println("testing 0x010000 -> 0x110000"); |
| for (int i = 0x10000; i < 0x110000; i++) { |
| // vars |
| int uuuuu = (i >> 16) & 0x001F; |
| int wwww = uuuuu - 1; |
| int zzzz = (i >> 12) & 0x000F; |
| int yyyyyy = (i >> 6) & 0x003F; |
| int xxxxxx = i & 0x003F; |
| int hs = 0xD800 | (wwww << 6) | (zzzz << 2) | (yyyyyy >> 4); |
| int ls = 0xDC00 | ((yyyyyy << 6) & 0x03C0) | xxxxxx; |
| // high surrogate |
| int c = reader.read(); |
| if (c != hs) { |
| expectedChar("high surrogate", hs, c); |
| } |
| // low surrogate |
| c = reader.read(); |
| if (c != ls) { |
| expectedChar("low surrogate", ls, c); |
| } |
| } |
| System.err.println("checking EOF"); |
| int c = reader.read(); |
| if (c != -1) { |
| extraChar(c); |
| } |
| long after = System.currentTimeMillis(); |
| |
| return after - before; |
| |
| } // testCharByChar(Reader):long |
| |
| /** |
| * This function tests the given reader by performing block character |
| * reads of the specified size. |
| */ |
| public static long testCharArray(Reader reader, int size) throws Exception { |
| |
| long before = System.currentTimeMillis(); |
| System.err.println("# Testing character array of size "+size); |
| |
| char[] ch = new char[size]; |
| int count = 0; |
| int position = 0; |
| |
| System.err.println("testing 0x000000 -> 0x00007F"); |
| for (int i = 0; i < 0x0080; i++) { |
| if (position == count) { |
| count = load(reader, ch); |
| position = 0; |
| } |
| int c = ch[position++]; |
| if (c != i) { |
| expectedChar(null, i, c); |
| } |
| } |
| System.err.println("testing 0x000080 -> 0x0007FF"); |
| for (int i = 0x0080; i < 0x0800; i++) { |
| if (position == count) { |
| count = load(reader, ch); |
| position = 0; |
| } |
| int c = ch[position++]; |
| if (c != i) { |
| expectedChar(null, i, c); |
| } |
| } |
| System.err.println("testing 0x000800 -> 0x00D7FF"); |
| for (int i = 0x0800; i < 0xD800; i++) { |
| if (position == count) { |
| count = load(reader, ch); |
| position = 0; |
| } |
| int c = ch[position++]; |
| if (c != i) { |
| expectedChar(null, i, c); |
| } |
| } |
| System.err.println("testing 0x00E000 -> 0x00FFFF"); |
| for (int i = 0xE000; i < 0x010000; i++) { |
| if (position == count) { |
| count = load(reader, ch); |
| position = 0; |
| } |
| int c = ch[position++]; |
| if (c != i) { |
| expectedChar(null, i, c); |
| } |
| } |
| System.err.println("testing 0x010000 -> 0x10FFFF"); |
| for (int i = 0x10000; i < 0x110000; i++) { |
| // vars |
| int uuuuu = (i >> 16) & 0x001F; |
| int wwww = uuuuu - 1; |
| int zzzz = (i >> 12) & 0x000F; |
| int yyyyyy = (i >> 6) & 0x003F; |
| int xxxxxx = i & 0x003F; |
| int hs = 0xD800 | (wwww << 6) | (zzzz << 2) | (yyyyyy >> 4); |
| int ls = 0xDC00 | ((yyyyyy << 6) & 0x03C0) | xxxxxx; |
| // high surrogate |
| if (position == count) { |
| count = load(reader, ch); |
| position = 0; |
| } |
| int c = ch[position++]; |
| if (c != hs) { |
| expectedChar("high surrogate", hs, c); |
| } |
| // low surrogate |
| if (position == count) { |
| count = load(reader, ch); |
| position = 0; |
| } |
| c = ch[position++]; |
| if (c != ls) { |
| expectedChar("low surrogate", ls, c); |
| } |
| } |
| System.err.println("checking EOF"); |
| if (position == count) { |
| count = load(reader, ch); |
| position = 0; |
| } |
| if (count != -1) { |
| extraChar(ch[position]); |
| } |
| long after = System.currentTimeMillis(); |
| |
| return after - before; |
| |
| } // testCharArray(Reader):long |
| |
| // |
| // Package private static methods |
| // |
| |
| /** Loads another block of characters from the reader. */ |
| static int load(Reader reader, char[] ch) throws IOException { |
| int count = reader.read(ch, 0, ch.length); |
| return count; |
| } // load(Reader,char[]):int |
| |
| /** Creates an I/O exception for expected character. */ |
| static void expectedChar(String prefix, int ec, int fc) throws IOException { |
| StringBuffer str = new StringBuffer(); |
| str.append("expected "); |
| if (prefix != null) { |
| str.append(prefix); |
| str.append(' '); |
| } |
| str.append("0x"); |
| str.append(Integer.toHexString(ec)); |
| str.append(" but found 0x"); |
| if (fc != -1) { |
| str.append(Integer.toHexString(fc)); |
| } |
| else { |
| str.append("EOF"); |
| } |
| String message = str.toString(); |
| throw new IOException(message); |
| } // expectedChar(String,int,int) |
| |
| /** Creates an I/O exception for extra character. */ |
| static void extraChar(int c) throws IOException { |
| StringBuffer str = new StringBuffer(); |
| str.append("found extra character 0x"); |
| str.append(Integer.toHexString(c)); |
| String message = str.toString(); |
| throw new IOException(message); |
| } // extraChar(int) |
| |
| // |
| // Classes |
| // |
| |
| /** |
| * This classes produces a stream of UTF-8 byte sequences for all |
| * valid Unicode characters. |
| * |
| * @author Andy Clark, IBM |
| */ |
| public static class UTF8Producer |
| extends InputStream { |
| |
| // |
| // Data |
| // |
| |
| /** The current code point. */ |
| private int fCodePoint; |
| |
| /** The current byte of the current code point. */ |
| private int fByte; |
| |
| // |
| // InputStream methods |
| // |
| |
| /** Reads the next character. */ |
| public int read() throws IOException { |
| |
| // UTF-8: [0xxx xxxx] |
| // Unicode: [0000 0000] [0xxx xxxx] |
| if (fCodePoint < 0x0080) { |
| int b = fCodePoint; |
| fCodePoint++; |
| fByte = 0; |
| return b; |
| } |
| |
| // UTF-8: [110y yyyy] [10xx xxxx] |
| // Unicode: [0000 0yyy] [yyxx xxxx] |
| if (fCodePoint < 0x0800) { |
| switch (fByte) { |
| case 0: { |
| int b = 0x00C0 | ((fCodePoint >> 6) & 0x001F); |
| fByte++; |
| return b; |
| } |
| case 1: { |
| int b = 0x0080 | (fCodePoint & 0x003F); |
| fCodePoint++; |
| fByte = 0; |
| return b; |
| } |
| default: { |
| throw new RuntimeException("byte "+fByte+" of 2 byte UTF-8 sequence"); |
| } |
| } |
| } |
| |
| // UTF-8: [1110 zzzz] [10yy yyyy] [10xx xxxx] |
| // Unicode: [zzzz yyyy] [yyxx xxxx]* |
| if (fCodePoint < 0x10000) { |
| switch (fByte) { |
| case 0: { |
| int b = 0x00E0 | ((fCodePoint >> 12) & 0x000F); |
| fByte++; |
| return b; |
| } |
| case 1: { |
| int b = 0x0080 | ((fCodePoint >> 6) & 0x003F); |
| fByte++; |
| return b; |
| } |
| case 2: { |
| int b = 0x0080 | (fCodePoint & 0x003F); |
| fCodePoint++; |
| // skip surrogate blocks |
| if (fCodePoint == 0xD800) { |
| fCodePoint = 0xE000; |
| } |
| fByte = 0; |
| return b; |
| } |
| default: { |
| throw new RuntimeException("byte "+fByte+" of 3 byte UTF-8 sequence"); |
| } |
| } |
| } |
| |
| // UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]* |
| // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate) |
| // [1101 11yy] [yyxx xxxx] (low surrogate) |
| // * uuuuu = wwww + 1 |
| // [0000 0000] [000u uuuu] [zzzz yyyy] [yyxx xxxx] |
| if (fCodePoint < 0x110000) { |
| switch (fByte) { |
| case 0: { |
| int uuuuu = (fCodePoint >> 16) & 0x001F; |
| int b = 0x00F0 | (uuuuu >> 2); |
| fByte++; |
| return b; |
| } |
| case 1: { |
| int uuuuu = (fCodePoint >> 16) & 0x001F; |
| int zzzz = (fCodePoint >> 12) & 0x000F; |
| int b = 0x0080 | ((uuuuu << 4) & 0x0030) | zzzz; |
| fByte++; |
| return b; |
| } |
| case 2: { |
| int yyyyyy = (fCodePoint >> 6) & 0x003F; |
| int b = 0x0080 | yyyyyy; |
| fByte++; |
| return b; |
| } |
| case 3: { |
| int xxxxxx = fCodePoint & 0x003F; |
| int b = 0x0080 | xxxxxx; |
| fCodePoint++; |
| fByte = 0; |
| return b; |
| } |
| default: { |
| throw new RuntimeException("byte "+fByte+" of 4 byte UTF-8 sequence"); |
| } |
| } |
| } |
| |
| // done |
| return -1; |
| |
| } // read():int |
| |
| } // class UTF8Producer |
| |
| } // class UTF8 |