| /* |
| * The Apache Software License, Version 1.1 |
| * |
| * |
| * Copyright (c) 2000 The Apache Software Foundation. All rights |
| * reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in |
| * the documentation and/or other materials provided with the |
| * distribution. |
| * |
| * 3. The end-user documentation included with the redistribution, |
| * if any, must include the following acknowledgment: |
| * "This product includes software developed by the |
| * Apache Software Foundation (http://www.apache.org/)." |
| * Alternately, this acknowledgment may appear in the software itself, |
| * if and wherever such third-party acknowledgments normally appear. |
| * |
| * 4. The names "Xerces" and "Apache Software Foundation" must |
| * not be used to endorse or promote products derived from this |
| * software without prior written permission. For written |
| * permission, please contact apache@apache.org. |
| * |
| * 5. Products derived from this software may not be called "Apache", |
| * nor may "Apache" appear in their name, without prior written |
| * permission of the Apache Software Foundation. |
| * |
| * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED |
| * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
| * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR |
| * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF |
| * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
| * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT |
| * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| * SUCH DAMAGE. |
| * ==================================================================== |
| * |
| * This software consists of voluntary contributions made by many |
| * individuals on behalf of the Apache Software Foundation and was |
| * originally based on software copyright (c) 1999, International |
| * Business Machines, Inc., http://www.apache.org. For more |
| * information on the Apache Software Foundation, please see |
| * <http://www.apache.org/>. |
| */ |
| |
| package io; |
| |
| import java.io.EOFException; |
| import java.io.InputStream; |
| import java.io.InputStreamReader; |
| import java.io.IOException; |
| import java.io.Reader; |
| |
| import org.apache.xerces.impl.io.UTF8Reader; |
| |
| /** |
| * This program tests the customized UTF-8 reader for the parser, |
| * comparing it with the Java UTF-8 reader. Interestingly, when |
| * reading character by character (as opposed to block character |
| * reads), the Java reader silently skips surrogate characters |
| * on the input! I've seen this behavior in 1.1.8, 1.2, and 1.3 |
| * under the Windows platform. |
| * |
| * @author Andy Clark, IBM |
| * |
| * @version $Id$ |
| */ |
| public class UTF8 { |
| |
| // |
| // MAIN |
| // |
| |
| /** Main program entry. */ |
| public static void main(String[] argv) throws Exception { |
| |
| final int BLOCK_READ_SIZE = 2048; |
| |
| // |
| // Test Java reference implementation of UTF-8 decoder |
| // |
| |
| System.err.println("#"); |
| System.err.println("# Testing Java UTF-8 decoder"); |
| System.err.println("#"); |
| |
| // test character by character |
| try { |
| InputStream stream = new UTF8Producer(); |
| Reader reader = new InputStreamReader(stream, "UTF8"); |
| long time = testCharByChar(reader); |
| System.err.println("PASS ("+time+" ms)"); |
| reader.close(); |
| } |
| catch (IOException e) { |
| System.err.println("FAIL: "+e.getMessage()); |
| } |
| |
| // test character array |
| try { |
| InputStream stream = new UTF8Producer(); |
| Reader reader = new InputStreamReader(stream, "UTF8"); |
| long time = testCharArray(reader, BLOCK_READ_SIZE); |
| System.err.println("PASS ("+time+" ms)"); |
| reader.close(); |
| } |
| catch (IOException e) { |
| System.err.println("FAIL: "+e.getMessage()); |
| } |
| |
| // |
| // Test custom implementation of UTF-8 decoder |
| // |
| |
| System.err.println("#"); |
| System.err.println("# Testing custom UTF-8 decoder"); |
| System.err.println("#"); |
| |
| // test character by character |
| try { |
| InputStream stream = new UTF8Producer(); |
| Reader reader = new UTF8Reader(stream); |
| long time = testCharByChar(reader); |
| System.err.println("PASS ("+time+" ms)"); |
| reader.close(); |
| } |
| catch (IOException e) { |
| System.err.println("FAIL: "+e.getMessage()); |
| } |
| |
| // test character array |
| try { |
| InputStream stream = new UTF8Producer(); |
| Reader reader = new UTF8Reader(stream); |
| long time = testCharArray(reader, BLOCK_READ_SIZE); |
| System.err.println("PASS ("+time+" ms)"); |
| reader.close(); |
| } |
| catch (IOException e) { |
| System.err.println("FAIL: "+e.getMessage()); |
| } |
| |
| } // main(String[]) |
| |
| // |
| // Public static methods |
| // |
| |
| /** This function tests the specified reader character by character. */ |
| public static long testCharByChar(Reader reader) throws Exception { |
| |
| long before = System.currentTimeMillis(); |
| System.err.println("# Testing character by character"); |
| |
| System.err.println("testing 0x000000 -> 0x00007F"); |
| for (int i = 0; i < 0x0080; i++) { |
| int c = reader.read(); |
| if (c != i) { |
| expectedChar(null, i, c); |
| } |
| } |
| System.err.println("testing 0x000080 -> 0x0007FF"); |
| for (int i = 0x0080; i < 0x0800; i++) { |
| int c = reader.read(); |
| if (c != i) { |
| expectedChar(null, i, c); |
| } |
| } |
| System.err.println("testing 0x000800 -> 0x00D7FF"); |
| for (int i = 0x0800; i < 0xD800; i++) { |
| int c = reader.read(); |
| if (c != i) { |
| expectedChar(null, i, c); |
| } |
| } |
| System.err.println("testing 0x00E000 -> 0x00FFFF"); |
| for (int i = 0xE000; i < 0x010000; i++) { |
| int c = reader.read(); |
| if (c != i) { |
| expectedChar(null, i, c); |
| } |
| } |
| System.err.println("testing 0x010000 -> 0x110000"); |
| for (int i = 0x10000; i < 0x110000; i++) { |
| // vars |
| int uuuuu = (i >> 16) & 0x001F; |
| int wwww = uuuuu - 1; |
| int zzzz = (i >> 12) & 0x000F; |
| int yyyyyy = (i >> 6) & 0x003F; |
| int xxxxxx = i & 0x003F; |
| int hs = 0xD800 | (wwww << 6) | (zzzz << 2) | (yyyyyy >> 4); |
| int ls = 0xDC00 | ((yyyyyy << 6) & 0x03C0) | xxxxxx; |
| // high surrogate |
| int c = reader.read(); |
| if (c != hs) { |
| expectedChar("high surrogate", hs, c); |
| } |
| // low surrogate |
| c = reader.read(); |
| if (c != ls) { |
| expectedChar("low surrogate", ls, c); |
| } |
| } |
| System.err.println("checking EOF"); |
| int c = reader.read(); |
| if (c != -1) { |
| extraChar(c); |
| } |
| long after = System.currentTimeMillis(); |
| |
| return after - before; |
| |
| } // testCharByChar(Reader):long |
| |
| /** |
| * This function tests the given reader by performing block character |
| * reads of the specified size. |
| */ |
| public static long testCharArray(Reader reader, int size) throws Exception { |
| |
| long before = System.currentTimeMillis(); |
| System.err.println("# Testing character array of size "+size); |
| |
| char[] ch = new char[size]; |
| int count = 0; |
| int position = 0; |
| |
| System.err.println("testing 0x000000 -> 0x00007F"); |
| for (int i = 0; i < 0x0080; i++) { |
| if (position == count) { |
| count = load(reader, ch); |
| position = 0; |
| } |
| int c = ch[position++]; |
| if (c != i) { |
| expectedChar(null, i, c); |
| } |
| } |
| System.err.println("testing 0x000080 -> 0x0007FF"); |
| for (int i = 0x0080; i < 0x0800; i++) { |
| if (position == count) { |
| count = load(reader, ch); |
| position = 0; |
| } |
| int c = ch[position++]; |
| if (c != i) { |
| expectedChar(null, i, c); |
| } |
| } |
| System.err.println("testing 0x000800 -> 0x00D7FF"); |
| for (int i = 0x0800; i < 0xD800; i++) { |
| if (position == count) { |
| count = load(reader, ch); |
| position = 0; |
| } |
| int c = ch[position++]; |
| if (c != i) { |
| expectedChar(null, i, c); |
| } |
| } |
| System.err.println("testing 0x00E000 -> 0x00FFFF"); |
| for (int i = 0xE000; i < 0x010000; i++) { |
| if (position == count) { |
| count = load(reader, ch); |
| position = 0; |
| } |
| int c = ch[position++]; |
| if (c != i) { |
| expectedChar(null, i, c); |
| } |
| } |
| System.err.println("testing 0x010000 -> 0x110000"); |
| for (int i = 0x10000; i < 0x110000; i++) { |
| // vars |
| int uuuuu = (i >> 16) & 0x001F; |
| int wwww = uuuuu - 1; |
| int zzzz = (i >> 12) & 0x000F; |
| int yyyyyy = (i >> 6) & 0x003F; |
| int xxxxxx = i & 0x003F; |
| int hs = 0xD800 | (wwww << 6) | (zzzz << 2) | (yyyyyy >> 4); |
| int ls = 0xDC00 | ((yyyyyy << 6) & 0x03C0) | xxxxxx; |
| // high surrogate |
| if (position == count) { |
| count = load(reader, ch); |
| position = 0; |
| } |
| int c = ch[position++]; |
| if (c != hs) { |
| expectedChar("high surrogate", hs, c); |
| } |
| // low surrogate |
| if (position == count) { |
| count = load(reader, ch); |
| position = 0; |
| } |
| c = ch[position++]; |
| if (c != ls) { |
| expectedChar("low surrogate", ls, c); |
| } |
| } |
| System.err.println("checking EOF"); |
| if (position == count) { |
| count = load(reader, ch); |
| position = 0; |
| } |
| if (count != -1) { |
| extraChar(ch[position]); |
| } |
| long after = System.currentTimeMillis(); |
| |
| return after - before; |
| |
| } // testCharArray(Reader):long |
| |
| // |
| // Private static methods |
| // |
| |
| /** Loads another block of characters from the reader. */ |
| private static int load(Reader reader, char[] ch) throws IOException { |
| int count = reader.read(ch, 0, ch.length); |
| return count; |
| } // load(Reader,char[]):int |
| |
| /** Creates an I/O exception for expected character. */ |
| private static void expectedChar(String prefix, int ec, int fc) throws IOException { |
| StringBuffer str = new StringBuffer(); |
| str.append("expected "); |
| if (prefix != null) { |
| str.append(prefix); |
| str.append(' '); |
| } |
| str.append("0x"); |
| str.append(Integer.toHexString(ec)); |
| str.append(" but found 0x"); |
| if (fc != -1) { |
| str.append(Integer.toHexString(fc)); |
| } |
| else { |
| str.append("EOF"); |
| } |
| String message = str.toString(); |
| throw new IOException(message); |
| } // expectedChar(String,int,int) |
| |
| /** Creates an I/O exception for extra character. */ |
| private static void extraChar(int c) throws IOException { |
| StringBuffer str = new StringBuffer(); |
| str.append("found extra character 0x"); |
| str.append(Integer.toHexString(c)); |
| String message = str.toString(); |
| throw new IOException(message); |
| } // extraChar(int) |
| |
| // |
| // Classes |
| // |
| |
| /** |
| * This classes produces a stream of UTF-8 byte sequences for all |
| * valid Unicode characters. |
| * |
| * @author Andy Clark, IBM |
| */ |
| public static class UTF8Producer |
| extends InputStream { |
| |
| // |
| // Data |
| // |
| |
| /** The current code point. */ |
| private int fCodePoint; |
| |
| /** The current byte of the current code point. */ |
| private int fByte; |
| |
| // |
| // InputStream methods |
| // |
| |
| /** Reads the next character. */ |
| public int read() throws IOException { |
| |
| // UTF-8: [0xxx xxxx] |
| // Unicode: [0000 0000] [0xxx xxxx] |
| if (fCodePoint < 0x0080) { |
| int b = fCodePoint; |
| fCodePoint++; |
| fByte = 0; |
| return b; |
| } |
| |
| // UTF-8: [110y yyyy] [10xx xxxx] |
| // Unicode: [0000 0yyy] [yyxx xxxx] |
| if (fCodePoint < 0x0800) { |
| switch (fByte) { |
| case 0: { |
| int b = 0x00C0 | ((fCodePoint >> 6) & 0x001F); |
| fByte++; |
| return b; |
| } |
| case 1: { |
| int b = 0x0080 | (fCodePoint & 0x003F); |
| fCodePoint++; |
| fByte = 0; |
| return b; |
| } |
| default: { |
| throw new RuntimeException("byte "+fByte+" of 2 byte UTF-8 sequence"); |
| } |
| } |
| } |
| |
| // UTF-8: [1110 zzzz] [10yy yyyy] [10xx xxxx] |
| // Unicode: [zzzz yyyy] [yyxx xxxx]* |
| if (fCodePoint < 0x10000) { |
| switch (fByte) { |
| case 0: { |
| int b = 0x00E0 | ((fCodePoint >> 12) & 0x000F); |
| fByte++; |
| return b; |
| } |
| case 1: { |
| int b = 0x0080 | ((fCodePoint >> 6) & 0x003F); |
| fByte++; |
| return b; |
| } |
| case 2: { |
| int b = 0x0080 | (fCodePoint & 0x003F); |
| fCodePoint++; |
| // skip surrogate blocks |
| if (fCodePoint == 0xD800) { |
| fCodePoint = 0xE000; |
| } |
| fByte = 0; |
| return b; |
| } |
| default: { |
| throw new RuntimeException("byte "+fByte+" of 3 byte UTF-8 sequence"); |
| } |
| } |
| } |
| |
| // UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]* |
| // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate) |
| // [1101 11yy] [yyxx xxxx] (low surrogate) |
| // * uuuuu = wwww + 1 |
| // [0000 0000] [000u uuuu] [zzzz yyyy] [yyxx xxxx] |
| if (fCodePoint < 0x110000) { |
| switch (fByte) { |
| case 0: { |
| int uuuuu = (fCodePoint >> 16) & 0x001F; |
| int b = 0x00F0 | (uuuuu >> 2); |
| fByte++; |
| return b; |
| } |
| case 1: { |
| int uuuuu = (fCodePoint >> 16) & 0x001F; |
| int zzzz = (fCodePoint >> 12) & 0x000F; |
| int b = 0x0080 | ((uuuuu << 4) & 0x0030) | zzzz; |
| fByte++; |
| return b; |
| } |
| case 2: { |
| int yyyyyy = (fCodePoint >> 6) & 0x003F; |
| int b = 0x0080 | yyyyyy; |
| fByte++; |
| return b; |
| } |
| case 3: { |
| int xxxxxx = fCodePoint & 0x003F; |
| int b = 0x0080 | xxxxxx; |
| fCodePoint++; |
| fByte = 0; |
| return b; |
| } |
| default: { |
| throw new RuntimeException("byte "+fByte+" of 4 byte UTF-8 sequence"); |
| } |
| } |
| } |
| |
| // done |
| return -1; |
| |
| } // read():int |
| |
| } // class UTF8Producer |
| |
| } // class UTF8 |