jena-base/src/main/java/org/apache/jena/atlas/io/InStreamUTF8.java - jena - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.jena.atlas.io;

 import java.io.ByteArrayInputStream ;
 import java.io.IOException ;
 import java.io.InputStream ;
 import java.io.Reader ;

 import org.apache.jena.atlas.AtlasException ;

 /** Fast and streaming UTF-8 */
 public final class InStreamUTF8 extends Reader implements CharStream
 {
     // TODO Add line and col counts.
     // See arq.utf8.
     // TODO Better ready()/available() in InputStreamBuffered
     // TODO: chars > 16 bits -> convert to surrogate pairs.

     // The standard Java way of doing this is via charset decoders.
     // One small disadvantage is that bad UTF-8 does not get flagged as to
     // the byte position of the error.

     // This class collects knowledge of how UTF-8 encoding works;
     // the Java classes are usually slightly faster compared to using
     // this class with an InputStreamBuffered but the difference is small.
     // This class generated meaningful error messages (when line/col added).

     // The Java classes copy-convert a byte buffer into a char buffer.
     // Sometimes, for example in a parser, this isn't a convenient model
     // because the app is looking one character at a time and accumulating
     // the chars until it sees the end of a token of arbitrary length
     // or processes escape sequences.
     //
     // The app might use a StringBuilder so the bytes get copied into
     // a char buffer and out again.  Instead, this code assumes the
     // app is in charge of that.

     // UTF-8 (UTF-16) is different from other character sets because
     // the relationship with Java's internal character representation is
     // arithmetic, not a character mapping.

     /*
      * http://en.wikipedia.org/wiki/UTF-8
      * http://tools.ietf.org/html/rfc3629
      * http://www.ietf.org/rfc/rfc3629.txt
      *
      * Unicode                                  Byte1       Byte2       Byte3       Byte4
      * U+0000–U+007F    0 to 127                0xxxxxxx
      * U+0080–U+07FF    128 to 2,047            110yyyxx    10xxxxxx
      * U+0800–U+FFFF    2,048 to 65,535         1110yyyy    10yyyyxx    10xxxxxx
      * U+10000–U+10FFFF 65,536 to 1,114,111     11110zzz    10zzyyyy    10yyyyxx    10xxxxxx
      *
      * Restricted cases (RFC 3629)
      * 11110101-11110111    F5-F7   245-247     start of 4-byte sequence for codepoint above 10FFFF
      * 11111000-11111011    F8-FB   248-251     start of 5-byte sequence
      * 11111100-11111101    FC-FD   252-253     start of 6-byte sequence
      *
      * Illegal:
      * 11000000-11000001    C0-C1   192-193     Overlong encoding: start of a 2-byte sequence, but code point <= 127
      * 11111110-11111111    FE-FF   254-255     Invalid: not defined by original UTF-8 specification
      */

     // There is some sort of stream decoder backing the Sun implementation
     // of CharsetDecoder (sun.io.StreamDecoder) but it's not on all platforms
     // I want a known decoder specifically for UTF8

     private InputStreamBuffered input ;
     //private long count = 0 ;

     public InStreamUTF8(InputStream in)
     {
         if ( in instanceof InputStreamBuffered )
         {
             input = (InputStreamBuffered)in ;
             return ;
         }
         input = new InputStreamBuffered(in) ;
     }

     public InStreamUTF8(InputStreamBuffered in) { input = in ; }

     @Override
     public boolean ready() throws IOException
     {
         return input.available() > 0 ;
     }

     @Override
     public void close() throws IOException
     { input.close() ; }

     @Override
     public void closeStream()
     { IO.close(input) ; }

     @Override
     public int read(char[] cbuf, int off, int len) {
         // Doing this on a block of bytes may be faster.
         for ( int i = off ; i < off + len ; i++ ) {
             int x = read();
             if ( x == -1 ) {
                 if ( i == off )
                     return -1;
                 return (i - off);
             }
             cbuf[i] = (char)x;
         }
         return len;
     }

     @Override
     public final int read() {
         int ch = advance(input);
         // if ( ! Character.isDefined(ch) ) throw new
         // AtlasException(String.format("Undefined codepoint: 0x%04X", ch)) ;
         return ch;
     }

     /** Next codepoint, given the first byte of any UTF-8 byte sequence is already known.
      *  Not necessarily a valid char (this function can be used a straight UTF8 decoder
      */
     @Override
     public final int advance()
     { return advance(input) ; }

     /** Next codepoint */
     public static final int advance(InputStreamBuffered input) {
         int x = input.advance() ;
         if ( x == -1 ) return -1 ;
         return advance(input, x) ;
     }

     /** Next codepoint, given the first byte of any UTF-8 byte sequence is already known.
      * Not necessarily a valid char (this function can be used as a straight UTF8 decoder).
      */

     private static final int advance(InputStreamBuffered input, int x) {
         //count++ ;
         // ASCII Fastpath
         if ( x == -1 || (x >= 0 && x <= 127) ) {
             // count++ ;
             return x;
         }

         // 10 => extension byte
         // 110..... => 2 bytes
         if ( (x & 0xE0) == 0xC0 ) {
             int ch = readMultiBytes(input, x & 0x1F, 2);
             // count += 2 ;
             return ch;

         }
         // 1110.... => 3 bytes : 16 bits : not outside 16bit chars
         if ( (x & 0xF0) == 0xE0 ) {
             int ch = readMultiBytes(input, x & 0x0F, 3);
             // count += 3 ;
             // if ( ! Character.isDefined(ch) ) throw new
             // AtlasException(String.format("Undefined codepoint: 0x%04X", ch))
             // ;
             return ch;
         }

         // Looking like 4 byte character.
         int ch = -2;
         // 11110zzz => 4 bytes.
         if ( (x & 0xF8) == 0xF0 ) {
             ch = readMultiBytes(input, x & 0x08, 4);
             // Opps - need two returns. Character.toChars(ch, chars, 0) ;
             // count += 4 ;
         }

         else
             IO.exception(new IOException("Illegal UTF-8: " + x));

         // This test will go off. We're processing a 4 byte sequence but Java
         // only supports 16 bit chars.
         if ( ch > Character.MAX_VALUE )
             throw new AtlasException("Out of range character (must use a surrogate pair)");
         if ( !Character.isDefined(ch) )
             throw new AtlasException(String.format("Undefined codepoint: 0x%04X", ch));
         return ch;
     }

     private static int readMultiBytes(InputStreamBuffered input, int start, int len) {
         int x = start ;
         for ( int i = 0 ; i < len-1 ; i++ ) {
             int x2 = input.advance() ;
             if ( x2 == -1 )
                 throw new AtlasException("Premature end to UTF-8 sequence at end of input") ;

             if ( (x2 & 0xC0) != 0x80 )
                 //throw new AtlasException("Illegal UTF-8 processing character "+count+": "+x2) ;
                 throw new AtlasException(String.format("Illegal UTF-8 processing character: 0x%04X",x2)) ;
             // 6 bits of x2
             x = (x << 6) | (x2 & 0x3F);
         }
         return x ;
     }

     public static String decode(byte[] bytes) {
         try {
             char[] chars = new char[bytes.length];
             InputStream in = new ByteArrayInputStream(bytes);
             Reader r = new InStreamUTF8(in);
             int len;
             len = r.read(chars);
             IO.close(r);
             return new String(chars, 0, len);
         }
         catch (IOException ex) {
             IO.exception(ex);
             return null;
         }
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.jena.atlas.io;

	import java.io.ByteArrayInputStream ;
	import java.io.IOException ;
	import java.io.InputStream ;
	import java.io.Reader ;

	import org.apache.jena.atlas.AtlasException ;

	/** Fast and streaming UTF-8 */
	public final class InStreamUTF8 extends Reader implements CharStream
	{
	// TODO Add line and col counts.
	// See arq.utf8.
	// TODO Better ready()/available() in InputStreamBuffered
	// TODO: chars > 16 bits -> convert to surrogate pairs.

	// The standard Java way of doing this is via charset decoders.
	// One small disadvantage is that bad UTF-8 does not get flagged as to
	// the byte position of the error.

	// This class collects knowledge of how UTF-8 encoding works;
	// the Java classes are usually slightly faster compared to using
	// this class with an InputStreamBuffered but the difference is small.
	// This class generated meaningful error messages (when line/col added).

	// The Java classes copy-convert a byte buffer into a char buffer.
	// Sometimes, for example in a parser, this isn't a convenient model
	// because the app is looking one character at a time and accumulating
	// the chars until it sees the end of a token of arbitrary length
	// or processes escape sequences.
	//
	// The app might use a StringBuilder so the bytes get copied into
	// a char buffer and out again. Instead, this code assumes the
	// app is in charge of that.

	// UTF-8 (UTF-16) is different from other character sets because
	// the relationship with Java's internal character representation is
	// arithmetic, not a character mapping.

	/*
	* http://en.wikipedia.org/wiki/UTF-8
	* http://tools.ietf.org/html/rfc3629
	* http://www.ietf.org/rfc/rfc3629.txt
	*
	* Unicode Byte1 Byte2 Byte3 Byte4
	* U+0000–U+007F 0 to 127 0xxxxxxx
	* U+0080–U+07FF 128 to 2,047 110yyyxx 10xxxxxx
	* U+0800–U+FFFF 2,048 to 65,535 1110yyyy 10yyyyxx 10xxxxxx
	* U+10000–U+10FFFF 65,536 to 1,114,111 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
	*
	* Restricted cases (RFC 3629)
	* 11110101-11110111 F5-F7 245-247 start of 4-byte sequence for codepoint above 10FFFF
	* 11111000-11111011 F8-FB 248-251 start of 5-byte sequence
	* 11111100-11111101 FC-FD 252-253 start of 6-byte sequence
	*
	* Illegal:
	* 11000000-11000001 C0-C1 192-193 Overlong encoding: start of a 2-byte sequence, but code point <= 127
	* 11111110-11111111 FE-FF 254-255 Invalid: not defined by original UTF-8 specification
	*/

	// There is some sort of stream decoder backing the Sun implementation
	// of CharsetDecoder (sun.io.StreamDecoder) but it's not on all platforms
	// I want a known decoder specifically for UTF8

	private InputStreamBuffered input ;
	//private long count = 0 ;

	public InStreamUTF8(InputStream in)
	{
	if ( in instanceof InputStreamBuffered )
	{
	input = (InputStreamBuffered)in ;
	return ;
	}
	input = new InputStreamBuffered(in) ;
	}

	public InStreamUTF8(InputStreamBuffered in) { input = in ; }

	@Override
	public boolean ready() throws IOException
	{
	return input.available() > 0 ;
	}

	@Override
	public void close() throws IOException
	{ input.close() ; }

	@Override
	public void closeStream()
	{ IO.close(input) ; }

	@Override
	public int read(char[] cbuf, int off, int len) {
	// Doing this on a block of bytes may be faster.
	for ( int i = off ; i < off + len ; i++ ) {
	int x = read();
	if ( x == -1 ) {
	if ( i == off )
	return -1;
	return (i - off);
	}
	cbuf[i] = (char)x;
	}
	return len;
	}

	@Override
	public final int read() {
	int ch = advance(input);
	// if ( ! Character.isDefined(ch) ) throw new
	// AtlasException(String.format("Undefined codepoint: 0x%04X", ch)) ;
	return ch;
	}

	/** Next codepoint, given the first byte of any UTF-8 byte sequence is already known.
	* Not necessarily a valid char (this function can be used a straight UTF8 decoder
	*/
	@Override
	public final int advance()
	{ return advance(input) ; }

	/** Next codepoint */
	public static final int advance(InputStreamBuffered input) {
	int x = input.advance() ;
	if ( x == -1 ) return -1 ;
	return advance(input, x) ;
	}

	/** Next codepoint, given the first byte of any UTF-8 byte sequence is already known.
	* Not necessarily a valid char (this function can be used as a straight UTF8 decoder).
	*/

	private static final int advance(InputStreamBuffered input, int x) {
	//count++ ;
	// ASCII Fastpath
	if ( x == -1 \|\| (x >= 0 && x <= 127) ) {
	// count++ ;
	return x;
	}

	// 10 => extension byte
	// 110..... => 2 bytes
	if ( (x & 0xE0) == 0xC0 ) {
	int ch = readMultiBytes(input, x & 0x1F, 2);
	// count += 2 ;
	return ch;

	}
	// 1110.... => 3 bytes : 16 bits : not outside 16bit chars
	if ( (x & 0xF0) == 0xE0 ) {
	int ch = readMultiBytes(input, x & 0x0F, 3);
	// count += 3 ;
	// if ( ! Character.isDefined(ch) ) throw new
	// AtlasException(String.format("Undefined codepoint: 0x%04X", ch))
	// ;
	return ch;
	}

	// Looking like 4 byte character.
	int ch = -2;
	// 11110zzz => 4 bytes.
	if ( (x & 0xF8) == 0xF0 ) {
	ch = readMultiBytes(input, x & 0x08, 4);
	// Opps - need two returns. Character.toChars(ch, chars, 0) ;
	// count += 4 ;
	}

	else
	IO.exception(new IOException("Illegal UTF-8: " + x));

	// This test will go off. We're processing a 4 byte sequence but Java
	// only supports 16 bit chars.
	if ( ch > Character.MAX_VALUE )
	throw new AtlasException("Out of range character (must use a surrogate pair)");
	if ( !Character.isDefined(ch) )
	throw new AtlasException(String.format("Undefined codepoint: 0x%04X", ch));
	return ch;
	}

	private static int readMultiBytes(InputStreamBuffered input, int start, int len) {
	int x = start ;
	for ( int i = 0 ; i < len-1 ; i++ ) {
	int x2 = input.advance() ;
	if ( x2 == -1 )
	throw new AtlasException("Premature end to UTF-8 sequence at end of input") ;

	if ( (x2 & 0xC0) != 0x80 )
	//throw new AtlasException("Illegal UTF-8 processing character "+count+": "+x2) ;
	throw new AtlasException(String.format("Illegal UTF-8 processing character: 0x%04X",x2)) ;
	// 6 bits of x2
	x = (x << 6) \| (x2 & 0x3F);
	}
	return x ;
	}

	public static String decode(byte[] bytes) {
	try {
	char[] chars = new char[bytes.length];
	InputStream in = new ByteArrayInputStream(bytes);
	Reader r = new InStreamUTF8(in);
	int len;
	len = r.read(chars);
	IO.close(r);
	return new String(chars, 0, len);
	}
	catch (IOException ex) {
	IO.exception(ex);
	return null;
	}
	}
	}