src/org/apache/xerces/impl/io/UCSReader.java - xerces2-j - Git at Google

 /*
  * The Apache Software License, Version 1.1
  *
  *
  * Copyright (c) 2000 The Apache Software Foundation.  All rights
  * reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  *
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in
  *    the documentation and/or other materials provided with the
  *    distribution.
  *
  * 3. The end-user documentation included with the redistribution,
  *    if any, must include the following acknowledgment:
  *       "This product includes software developed by the
  *        Apache Software Foundation (http://www.apache.org/)."
  *    Alternately, this acknowledgment may appear in the software itself,
  *    if and wherever such third-party acknowledgments normally appear.
  *
  * 4. The names "Xerces" and "Apache Software Foundation" must
  *    not be used to endorse or promote products derived from this
  *    software without prior written permission. For written
  *    permission, please contact apache@apache.org.
  *
  * 5. Products derived from this software may not be called "Apache",
  *    nor may "Apache" appear in their name, without prior written
  *    permission of the Apache Software Foundation.
  *
  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * ====================================================================
  *
  * This software consists of voluntary contributions made by many
  * individuals on behalf of the Apache Software Foundation and was
  * originally based on software copyright (c) 1999, International
  * Business Machines, Inc., http://www.apache.org.  For more
  * information on the Apache Software Foundation, please see
  * <http://www.apache.org/>.
  */

 package org.apache.xerces.impl.io;

 import java.io.InputStream;
 import java.io.IOException;
 import java.io.Reader;

 /**
  * Reader for UCS-2 and UCS-4 encodings.
  * (i.e., encodings from ISO-10646-UCS-(2|4)).
  *
  * <p>Bytes are pulled from an underlying {@link InputStream} and assembled
  * into characters according to the byte order and width selected at
  * construction time.
  *
  * <p>NOTE: the bulk read stores each decoded value with a cast to
  * <code>char</code>, so UCS-4 code points above U+FFFF are truncated to
  * their low 16 bits rather than expanded into surrogate pairs.
  *
  * @author Neil Graham, IBM
  *
  * @version $Id$
  */
 public class UCSReader extends Reader {

     //
     // Constants
     //

     /** Default byte buffer size (8192, larger than that of ASCIIReader
      * since it's reasonable to surmise that the average UCS-4-encoded
      * file should be 4 times as large as the average ASCII-encoded file).
      */
     public static final int DEFAULT_BUFFER_SIZE = 8192;

     /** UCS-2, little-endian byte order. */
     public static final short UCS2LE = 1;
     /** UCS-2, big-endian byte order. */
     public static final short UCS2BE = 2;
     /** UCS-4, little-endian byte order. */
     public static final short UCS4LE = 4;
     /** UCS-4, big-endian byte order. */
     public static final short UCS4BE = 8;

     //
     // Data
     //

     /** Input stream. */
     protected InputStream fInputStream;

     /** Byte buffer. */
     protected byte[] fBuffer;

     /** What kind of data we're dealing with (one of the UCS* constants). */
     protected short fEncoding;

     //
     // Constructors
     //

     /**
      * Constructs a UCS reader from the specified input stream
      * using the default buffer size.  The Endian-ness and whether this is
      * UCS-2 or UCS-4 needs also to be known in advance.
      *
      * @param inputStream The input stream.
      * @param encoding One of UCS2LE, UCS2BE, UCS4LE or UCS4BE.
      */
     public UCSReader(InputStream inputStream, short encoding) {
         this(inputStream, DEFAULT_BUFFER_SIZE, encoding);
     } // <init>(InputStream, short)

     /**
      * Constructs a UCS reader from the specified input stream
      * and buffer size.  The Endian-ness and whether this is
      * UCS-2 or UCS-4 needs also to be known in advance.
      *
      * @param inputStream The input stream.
      * @param size        The initial buffer size in bytes; sizes smaller
      *                    than 4 are raised to 4 so that the buffer can
      *                    always hold one complete character.
      * @param encoding One of UCS2LE, UCS2BE, UCS4LE or UCS4BE.
      */
     public UCSReader(InputStream inputStream, int size, short encoding) {
         fInputStream = inputStream;
         fBuffer = new byte[Math.max(size, 4)];
         fEncoding = encoding;
     } // <init>(InputStream,int,short)

     //
     // Reader methods
     //

     /**
      * Read a single character.  This method will block until a character is
      * available, an I/O error occurs, or the end of the stream is reached.
      *
      * <p>Two bytes (UCS-2) or four bytes (UCS-4) are consumed from the
      * underlying stream and combined according to the configured byte
      * order.  If the stream ends before a complete character has been
      * read, the partial character is discarded and -1 is returned.
      *
      * @return     The value assembled from the bytes read, or -1 if the end
      *             of the stream has been reached
      *
      * @exception  IOException  If an I/O error occurs
      */
     public int read() throws IOException {
         final int width = 1 << charShift();
         final boolean bigEndian = isBigEndian();
         int value = 0;
         for (int i = 0; i < width; i++) {
             int next = fInputStream.read();
             // Test for end of stream BEFORE any masking: a legitimate 0xFF
             // data byte must not be confused with the -1 EOF marker.
             if (next == -1) {
                 return -1;
             }
             if (bigEndian) {
                 value = (value << 8) | next;
             } else {
                 value |= next << (8 * i);
             }
         }
         return value;
     } // read():int

     /**
      * Read characters into a portion of an array.  This method will block
      * until some input is available, an I/O error occurs, or the end of the
      * stream is reached.
      *
      * <p>If the underlying stream delivers a byte count that is not a whole
      * number of characters, the missing bytes of the last character are read
      * one at a time; should the stream end first, the remainder of that
      * character is padded with zero bytes.
      *
      * @param      ch     Destination buffer
      * @param      offset Offset at which to start storing characters
      * @param      length Maximum number of characters to read
      *
      * @return     The number of characters read, or -1 if the end of the
      *             stream has been reached
      *
      * @exception  IOException  If an I/O error occurs
      */
     public int read(char ch[], int offset, int length) throws IOException {
         final int shift = charShift();
         final int width = 1 << shift;

         // Never request more than a whole number of characters that fits in
         // the byte buffer.  Limiting in character units (rather than
         // shifting length first) avoids int overflow for large lengths and
         // guarantees room for the completion bytes written below.
         int charsWanted = fBuffer.length >> shift;
         if (length < charsWanted) {
             charsWanted = length;
         }
         int count = fInputStream.read(fBuffer, 0, charsWanted << shift);
         if (count == -1) {
             return -1;
         }

         // Complete a trailing partial character, if any.
         int missing = (width - (count & (width - 1))) & (width - 1);
         boolean eof = false;
         for (; missing > 0; missing--) {
             int next = eof ? -1 : fInputStream.read();
             if (next == -1) {
                 // end of input; something likely went wrong!  Pad with nulls.
                 eof = true;
                 next = 0;
             }
             fBuffer[count++] = (byte) next;
         }

         // now count is a multiple of the right number of bytes
         final int numChars = count >> shift;
         final boolean bigEndian = isBigEndian();
         int pos = 0;
         for (int i = 0; i < numChars; i++) {
             int value = 0;
             for (int j = 0; j < width; j++) {
                 int b = fBuffer[pos++] & 0xff;
                 if (bigEndian) {
                     value = (value << 8) | b;
                 } else {
                     value |= b << (8 * j);
                 }
             }
             ch[offset + i] = (char) value;
         }
         return numChars;
     } // read(char[],int,int)

     /**
      * Skip characters.  This method will block until some characters are
      * available, an I/O error occurs, or the end of the stream is reached.
      *
      * <p>The request is translated into a byte count for the underlying
      * stream.  If that stream skips a number of bytes that is not a whole
      * number of characters, the result is rounded up.
      *
      * @param  n  The number of characters to skip
      *
      * @return    The number of characters actually skipped
      *
      * @exception  IOException  If an I/O error occurs
      */
     public long skip(long n) throws IOException {
         final int shift = charShift();
         // Clamp so that converting characters to bytes cannot overflow.
         final long maxChars = Long.MAX_VALUE >> shift;
         final long charsRequested = (n > maxChars) ? maxChars : n;
         final long bytesSkipped = fInputStream.skip(charsRequested << shift);
         final long partialMask = (1L << shift) - 1;
         final long wholeChars = bytesSkipped >> shift;
         return ((bytesSkipped & partialMask) == 0) ? wholeChars : wholeChars + 1;
     } // skip(long):long

     /**
      * Tell whether this stream is ready to be read.
      *
      * @return True if the next read() is guaranteed not to block for input,
      * false otherwise.  Note that returning false does not guarantee that the
      * next read will block.  This implementation always returns false.
      *
      * @exception  IOException  If an I/O error occurs
      */
     public boolean ready() throws IOException {
         return false;
     } // ready()

     /**
      * Tell whether this stream supports the mark() operation; the answer
      * is that of the underlying input stream.
      */
     public boolean markSupported() {
         return fInputStream.markSupported();
     } // markSupported()

     /**
      * Mark the present position in the stream.  Subsequent calls to reset()
      * will attempt to reposition the stream to this point.  Not all
      * character-input streams support the mark() operation.
      *
      * @param  readAheadLimit  Limit on the number of characters that may be
      *                         read while still preserving the mark.  After
      *                         reading this many characters, attempting to
      *                         reset the stream may fail.
      *
      * @exception  IOException  If the stream does not support mark(),
      *                          or if some other I/O error occurs
      */
     public void mark(int readAheadLimit) throws IOException {
         // The caller's limit counts characters while the underlying stream
         // counts bytes, so scale it (saturating instead of overflowing).
         final int shift = charShift();
         int byteLimit;
         if (readAheadLimit <= 0) {
             byteLimit = readAheadLimit;
         } else if (readAheadLimit > (Integer.MAX_VALUE >> shift)) {
             byteLimit = Integer.MAX_VALUE;
         } else {
             byteLimit = readAheadLimit << shift;
         }
         fInputStream.mark(byteLimit);
     } // mark(int)

     /**
      * Reset the stream.  If the stream has been marked, then attempt to
      * reposition it at the mark.  If the stream has not been marked, then
      * attempt to reset it in some way appropriate to the particular stream,
      * for example by repositioning it to its starting point.  Not all
      * character-input streams support the reset() operation, and some support
      * reset() without supporting mark().
      *
      * @exception  IOException  If the stream has not been marked,
      *                          or if the mark has been invalidated,
      *                          or if the stream does not support reset(),
      *                          or if some other I/O error occurs
      */
     public void reset() throws IOException {
         fInputStream.reset();
     } // reset()

     /**
      * Close the stream by closing the underlying input stream.
      *
      * @exception  IOException  If an I/O error occurs
      */
     public void close() throws IOException {
         fInputStream.close();
     } // close()

     //
     // Private helpers
     //

     /** Returns log2 of the bytes per character: 2 for UCS-4, 1 for UCS-2. */
     private int charShift() {
         return (fEncoding >= UCS4LE) ? 2 : 1;
     } // charShift():int

     /** Returns true when the configured encoding is one of the big-endian forms. */
     private boolean isBigEndian() {
         return fEncoding == UCS2BE || fEncoding == UCS4BE;
     } // isBigEndian():boolean

 } // class UCSReader
	/*
	* The Apache Software License, Version 1.1
	*
	*
	* Copyright (c) 2000 The Apache Software Foundation. All rights
	* reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	*
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	*
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in
	* the documentation and/or other materials provided with the
	* distribution.
	*
	* 3. The end-user documentation included with the redistribution,
	* if any, must include the following acknowledgment:
	* "This product includes software developed by the
	* Apache Software Foundation (http://www.apache.org/)."
	* Alternately, this acknowledgment may appear in the software itself,
	* if and wherever such third-party acknowledgments normally appear.
	*
	* 4. The names "Xerces" and "Apache Software Foundation" must
	* not be used to endorse or promote products derived from this
	* software without prior written permission. For written
	* permission, please contact apache@apache.org.
	*
	* 5. Products derived from this software may not be called "Apache",
	* nor may "Apache" appear in their name, without prior written
	* permission of the Apache Software Foundation.
	*
	* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
	* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
	* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
	* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	* SUCH DAMAGE.
	* ====================================================================
	*
	* This software consists of voluntary contributions made by many
	* individuals on behalf of the Apache Software Foundation and was
	* originally based on software copyright (c) 1999, International
	* Business Machines, Inc., http://www.apache.org. For more
	* information on the Apache Software Foundation, please see
	* <http://www.apache.org/>.
	*/

	package org.apache.xerces.impl.io;

	import java.io.InputStream;
	import java.io.IOException;
	import java.io.Reader;

	/**
	 * Reader for UCS-2 and UCS-4 encodings.
	 * (i.e., encodings from ISO-10646-UCS-(2|4)).
	 *
	 * <p>Bytes are pulled from an underlying {@link InputStream} and assembled
	 * into characters according to the byte order and width selected at
	 * construction time.
	 *
	 * <p>NOTE: the bulk read stores each decoded value with a cast to
	 * <code>char</code>, so UCS-4 code points above U+FFFF are truncated to
	 * their low 16 bits rather than expanded into surrogate pairs.
	 *
	 * @author Neil Graham, IBM
	 *
	 * @version $Id$
	 */
	public class UCSReader extends Reader {

	    //
	    // Constants
	    //

	    /** Default byte buffer size (8192, larger than that of ASCIIReader
	     * since it's reasonable to surmise that the average UCS-4-encoded
	     * file should be 4 times as large as the average ASCII-encoded file).
	     */
	    public static final int DEFAULT_BUFFER_SIZE = 8192;

	    /** UCS-2, little-endian byte order. */
	    public static final short UCS2LE = 1;
	    /** UCS-2, big-endian byte order. */
	    public static final short UCS2BE = 2;
	    /** UCS-4, little-endian byte order. */
	    public static final short UCS4LE = 4;
	    /** UCS-4, big-endian byte order. */
	    public static final short UCS4BE = 8;

	    //
	    // Data
	    //

	    /** Input stream. */
	    protected InputStream fInputStream;

	    /** Byte buffer. */
	    protected byte[] fBuffer;

	    /** What kind of data we're dealing with (one of the UCS* constants). */
	    protected short fEncoding;

	    //
	    // Constructors
	    //

	    /**
	     * Constructs a UCS reader from the specified input stream
	     * using the default buffer size. The Endian-ness and whether this is
	     * UCS-2 or UCS-4 needs also to be known in advance.
	     *
	     * @param inputStream The input stream.
	     * @param encoding One of UCS2LE, UCS2BE, UCS4LE or UCS4BE.
	     */
	    public UCSReader(InputStream inputStream, short encoding) {
	        this(inputStream, DEFAULT_BUFFER_SIZE, encoding);
	    } // <init>(InputStream, short)

	    /**
	     * Constructs a UCS reader from the specified input stream
	     * and buffer size. The Endian-ness and whether this is
	     * UCS-2 or UCS-4 needs also to be known in advance.
	     *
	     * @param inputStream The input stream.
	     * @param size The initial buffer size in bytes; sizes smaller
	     *             than 4 are raised to 4 so that the buffer can
	     *             always hold one complete character.
	     * @param encoding One of UCS2LE, UCS2BE, UCS4LE or UCS4BE.
	     */
	    public UCSReader(InputStream inputStream, int size, short encoding) {
	        fInputStream = inputStream;
	        fBuffer = new byte[Math.max(size, 4)];
	        fEncoding = encoding;
	    } // <init>(InputStream,int,short)

	    //
	    // Reader methods
	    //

	    /**
	     * Read a single character. This method will block until a character is
	     * available, an I/O error occurs, or the end of the stream is reached.
	     *
	     * <p>Two bytes (UCS-2) or four bytes (UCS-4) are consumed from the
	     * underlying stream and combined according to the configured byte
	     * order. If the stream ends before a complete character has been
	     * read, the partial character is discarded and -1 is returned.
	     *
	     * @return The value assembled from the bytes read, or -1 if the end
	     *         of the stream has been reached
	     *
	     * @exception IOException If an I/O error occurs
	     */
	    public int read() throws IOException {
	        final int width = 1 << charShift();
	        final boolean bigEndian = isBigEndian();
	        int value = 0;
	        for (int i = 0; i < width; i++) {
	            int next = fInputStream.read();
	            // Test for end of stream BEFORE any masking: a legitimate 0xFF
	            // data byte must not be confused with the -1 EOF marker.
	            if (next == -1) {
	                return -1;
	            }
	            if (bigEndian) {
	                value = (value << 8) | next;
	            } else {
	                value |= next << (8 * i);
	            }
	        }
	        return value;
	    } // read():int

	    /**
	     * Read characters into a portion of an array. This method will block
	     * until some input is available, an I/O error occurs, or the end of the
	     * stream is reached.
	     *
	     * <p>If the underlying stream delivers a byte count that is not a whole
	     * number of characters, the missing bytes of the last character are read
	     * one at a time; should the stream end first, the remainder of that
	     * character is padded with zero bytes.
	     *
	     * @param ch Destination buffer
	     * @param offset Offset at which to start storing characters
	     * @param length Maximum number of characters to read
	     *
	     * @return The number of characters read, or -1 if the end of the
	     *         stream has been reached
	     *
	     * @exception IOException If an I/O error occurs
	     */
	    public int read(char ch[], int offset, int length) throws IOException {
	        final int shift = charShift();
	        final int width = 1 << shift;

	        // Never request more than a whole number of characters that fits in
	        // the byte buffer. Limiting in character units (rather than
	        // shifting length first) avoids int overflow for large lengths and
	        // guarantees room for the completion bytes written below.
	        int charsWanted = fBuffer.length >> shift;
	        if (length < charsWanted) {
	            charsWanted = length;
	        }
	        int count = fInputStream.read(fBuffer, 0, charsWanted << shift);
	        if (count == -1) {
	            return -1;
	        }

	        // Complete a trailing partial character, if any.
	        int missing = (width - (count & (width - 1))) & (width - 1);
	        boolean eof = false;
	        for (; missing > 0; missing--) {
	            int next = eof ? -1 : fInputStream.read();
	            if (next == -1) {
	                // end of input; something likely went wrong! Pad with nulls.
	                eof = true;
	                next = 0;
	            }
	            fBuffer[count++] = (byte) next;
	        }

	        // now count is a multiple of the right number of bytes
	        final int numChars = count >> shift;
	        final boolean bigEndian = isBigEndian();
	        int pos = 0;
	        for (int i = 0; i < numChars; i++) {
	            int value = 0;
	            for (int j = 0; j < width; j++) {
	                int b = fBuffer[pos++] & 0xff;
	                if (bigEndian) {
	                    value = (value << 8) | b;
	                } else {
	                    value |= b << (8 * j);
	                }
	            }
	            ch[offset + i] = (char) value;
	        }
	        return numChars;
	    } // read(char[],int,int)

	    /**
	     * Skip characters. This method will block until some characters are
	     * available, an I/O error occurs, or the end of the stream is reached.
	     *
	     * <p>The request is translated into a byte count for the underlying
	     * stream. If that stream skips a number of bytes that is not a whole
	     * number of characters, the result is rounded up.
	     *
	     * @param n The number of characters to skip
	     *
	     * @return The number of characters actually skipped
	     *
	     * @exception IOException If an I/O error occurs
	     */
	    public long skip(long n) throws IOException {
	        final int shift = charShift();
	        // Clamp so that converting characters to bytes cannot overflow.
	        final long maxChars = Long.MAX_VALUE >> shift;
	        final long charsRequested = (n > maxChars) ? maxChars : n;
	        final long bytesSkipped = fInputStream.skip(charsRequested << shift);
	        final long partialMask = (1L << shift) - 1;
	        final long wholeChars = bytesSkipped >> shift;
	        return ((bytesSkipped & partialMask) == 0) ? wholeChars : wholeChars + 1;
	    } // skip(long):long

	    /**
	     * Tell whether this stream is ready to be read.
	     *
	     * @return True if the next read() is guaranteed not to block for input,
	     * false otherwise. Note that returning false does not guarantee that the
	     * next read will block. This implementation always returns false.
	     *
	     * @exception IOException If an I/O error occurs
	     */
	    public boolean ready() throws IOException {
	        return false;
	    } // ready()

	    /**
	     * Tell whether this stream supports the mark() operation; the answer
	     * is that of the underlying input stream.
	     */
	    public boolean markSupported() {
	        return fInputStream.markSupported();
	    } // markSupported()

	    /**
	     * Mark the present position in the stream. Subsequent calls to reset()
	     * will attempt to reposition the stream to this point. Not all
	     * character-input streams support the mark() operation.
	     *
	     * @param readAheadLimit Limit on the number of characters that may be
	     *                       read while still preserving the mark. After
	     *                       reading this many characters, attempting to
	     *                       reset the stream may fail.
	     *
	     * @exception IOException If the stream does not support mark(),
	     *                        or if some other I/O error occurs
	     */
	    public void mark(int readAheadLimit) throws IOException {
	        // The caller's limit counts characters while the underlying stream
	        // counts bytes, so scale it (saturating instead of overflowing).
	        final int shift = charShift();
	        int byteLimit;
	        if (readAheadLimit <= 0) {
	            byteLimit = readAheadLimit;
	        } else if (readAheadLimit > (Integer.MAX_VALUE >> shift)) {
	            byteLimit = Integer.MAX_VALUE;
	        } else {
	            byteLimit = readAheadLimit << shift;
	        }
	        fInputStream.mark(byteLimit);
	    } // mark(int)

	    /**
	     * Reset the stream. If the stream has been marked, then attempt to
	     * reposition it at the mark. If the stream has not been marked, then
	     * attempt to reset it in some way appropriate to the particular stream,
	     * for example by repositioning it to its starting point. Not all
	     * character-input streams support the reset() operation, and some support
	     * reset() without supporting mark().
	     *
	     * @exception IOException If the stream has not been marked,
	     *                        or if the mark has been invalidated,
	     *                        or if the stream does not support reset(),
	     *                        or if some other I/O error occurs
	     */
	    public void reset() throws IOException {
	        fInputStream.reset();
	    } // reset()

	    /**
	     * Close the stream by closing the underlying input stream.
	     *
	     * @exception IOException If an I/O error occurs
	     */
	    public void close() throws IOException {
	        fInputStream.close();
	    } // close()

	    //
	    // Private helpers
	    //

	    /** Returns log2 of the bytes per character: 2 for UCS-4, 1 for UCS-2. */
	    private int charShift() {
	        return (fEncoding >= UCS4LE) ? 2 : 1;
	    } // charShift():int

	    /** Returns true when the configured encoding is one of the big-endian forms. */
	    private boolean isBigEndian() {
	        return fEncoding == UCS2BE || fEncoding == UCS4BE;
	    } // isBigEndian():boolean

	} // class UCSReader