blob: 6289824d68e8b243da43177759b13521b50dafa2 [file] [log] [blame]
/*
* The Apache Software License, Version 1.1
*
*
* Copyright (c) 1999 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Xerces" and "Apache Software Foundation" must
* not be used to endorse or promote products derived from this
* software without prior written permission. For written
* permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* nor may "Apache" appear in their name, without prior written
* permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation and was
* originally based on software copyright (c) 1999, International
* Business Machines, Inc., http://www.apache.org. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
package org.apache.xerces.readers;
import org.apache.xerces.framework.XMLErrorReporter;
import org.apache.xerces.utils.CharDataChunk;
import org.apache.xerces.utils.StringPool;
import org.apache.xerces.utils.ImplementationMessages;
import java.io.InputStream;
/**
* Simple character-based version of a UTF8 reader.
*
* This class is not commonly used, but is provided as a much simplified
* example of the UTF8Reader class that uses the AbstractCharReader to
* perform all of the reader functions except for filling each buffer
* of the character data when needed (fillCurrentChunk). We read the
* input data from an InputStream and perform end-of-line normalization
* as we process that data.
*
* @version
*/
final class UTF8CharReader extends AbstractCharReader {
//
//
//
/**
 * Creates a reader over a UTF-8 byte stream and immediately loads the
 * first chunk of character data by calling {@link #fillCurrentChunk()}.
 *
 * @param entityHandler           passed through to the superclass constructor
 * @param errorReporter           passed through to the superclass constructor
 * @param sendCharDataAsCharArray passed through to the superclass constructor
 * @param dataStream              the stream the UTF-8 bytes are read from
 * @param stringPool              passed through to the superclass constructor
 * @throws Exception if the superclass constructor or the initial fill fails
 */
UTF8CharReader(XMLEntityHandler entityHandler,
               XMLErrorReporter errorReporter,
               boolean sendCharDataAsCharArray,
               InputStream dataStream,
               StringPool stringPool) throws Exception {
    super(entityHandler, errorReporter, sendCharDataAsCharArray, stringPool);
    this.fInputStream = dataStream;
    fillCurrentChunk();
}
//
//
//
// Source of the UTF-8 bytes. Assigned in the constructor; fillCurrentChunk()
// closes it and sets it to null once read() reports end of stream.
private InputStream fInputStream = null;
//
// When we fill a chunk there may be data that was read from the
// input stream that has not been "processed". We need to save
// that data, and any in-progress state, between the calls to
// fillCurrentChunk() in these instance variables.
//
// True when the next fillCurrentChunk() call must first deal with state
// left behind in fOverflow before reading any further input.
private boolean fCheckOverflow = false;
// Byte buffer that input is read into (allocated with CHUNK_SIZE bytes);
// set to null once the final chunk has been produced.
private byte[] fOverflow = null;
// Index in fOverflow of the next byte that has not been converted yet
// (written by exitNormalize).
private int fOverflowOffset = 0;
// Number of valid bytes in fOverflow; copyNormalize() uses it as the
// exclusive end index of the input.
private int fOverflowEnd = 0;
// Index in the output char array where the next char will be written
// (written by exitNormalize).
private int fOutputOffset = 0;
// Set when a carriage return was the last byte of an input buffer, so that
// a linefeed at the start of the next buffer is skipped.
private boolean fSkipLinefeed = false;
// Number of leading bytes of an incomplete multi-byte sequence saved in
// fPartialMultiByteChar (1 to 3), or 0 when nothing is pending.
// NOTE(review): handlePartialMultiByteChar() also tests for the value 4
// (pending low surrogate), but no code in this class assigns 4.
private int fPartialMultiByteIn = 0;
// The saved leading bytes of an incomplete multi-byte sequence, lead byte
// at index 0.
private byte[] fPartialMultiByteChar = new byte[3];
// Low surrogate of a supplementary character whose high surrogate filled
// the last slot of the output buffer.
private int fPartialSurrogatePair = 0;
// Value copyNormalize() returns when one of the multi-byte helpers
// returns false.
private boolean fPartialMultiByteResult = false;
//
//
//
/**
 * Fills the current chunk with the next run of decoded, end-of-line
 * normalized characters and returns the first character of that chunk.
 *
 * Bytes are read from the input stream into fOverflow in blocks of up to
 * CHUNK_SIZE and converted by copyNormalize() into fMostRecentData. Input
 * that did not fit into the output buffer is kept in fOverflow and
 * processed first on the next call (fCheckOverflow). After the last data a
 * nul char is stored as the end-of-input marker; if the final chunk has no
 * room for it, the marker is delivered in a chunk of its own on the next
 * call.
 *
 * @return the first character of the new chunk (also stored in fMostRecentChar)
 * @throws Exception if closing the stream or decoding the data fails
 */
protected int fillCurrentChunk() throws Exception {
    //
    // See if we can find a way to reuse the buffer that may have been returned
    // with a recycled data chunk.
    //
    char[] recycledData = fCurrentChunk.toCharArray();
    //
    // If we have overflow from the last call, normalize from where
    // we left off, copying into the front of the output buffer.
    //
    fOutputOffset = 0;
    if (fCheckOverflow) {
        //
        // The fOverflowEnd should always be equal to CHUNK_SIZE, unless we hit
        // EOF during the previous call. Copy the remaining data to the front
        // of the buffer and return it as the final chunk.
        //
        fMostRecentData = recycledData;
        if (fOverflowEnd < CharDataChunk.CHUNK_SIZE) {
            recycledData = null;
            if (fOverflowEnd > 0) {
                if (fMostRecentData == null || fMostRecentData.length < 1 + fOverflowEnd - fOverflowOffset) {
                    fMostRecentData = new char[1 + fOverflowEnd - fOverflowOffset];
                }
                copyNormalize(fOverflow, fOverflowOffset, fMostRecentData, fOutputOffset);
            } else {
                // Nothing left but the end-of-input marker itself.
                if (fMostRecentData == null) {
                    fMostRecentData = new char[1];
                }
            }
            fMostRecentData[fOutputOffset] = 0;
            //
            // Update our instance variables
            //
            fOverflow = null;
            fLength += fOutputOffset;
            fCurrentIndex = 0;
            fCurrentChunk.setCharArray(fMostRecentData);
            return (fMostRecentChar = fMostRecentData[0]);
        }
        if (fMostRecentData == null || fMostRecentData.length < CharDataChunk.CHUNK_SIZE) {
            fMostRecentData = new char[CharDataChunk.CHUNK_SIZE];
        } else {
            recycledData = null;
        }
        copyNormalize(fOverflow, fOverflowOffset, fMostRecentData, fOutputOffset);
        fCheckOverflow = false;
    } else {
        if (fOverflow == null) {
            fOverflow = new byte[CharDataChunk.CHUNK_SIZE];
        }
        fMostRecentData = null;
    }
    while (true) {
        fOverflowOffset = 0;
        fOverflowEnd = 0;
        int capacity = CharDataChunk.CHUNK_SIZE;
        int result = 0;
        //
        // Keep reading until the input buffer is full or the stream ends.
        //
        do {
            try {
                result = fInputStream.read(fOverflow, fOverflowEnd, capacity);
            } catch (java.io.IOException ex) {
                // A read failure is deliberately treated like end of stream.
                result = -1;
            }
            if (result == -1) {
                //
                // We have reached the end of the stream.
                //
                fInputStream.close();
                fInputStream = null;
                if (fMostRecentData == null) {
                    //
                    // There is no previous output data, so we know that all of the
                    // new input data will fit.
                    //
                    fMostRecentData = recycledData;
                    if (fMostRecentData == null || fMostRecentData.length < 1 + fOverflowEnd) {
                        fMostRecentData = new char[1 + fOverflowEnd];
                    } else {
                        recycledData = null;
                    }
                    copyNormalize(fOverflow, fOverflowOffset, fMostRecentData, fOutputOffset);
                    fOverflow = null;
                    fMostRecentData[fOutputOffset] = 0;
                } else {
                    //
                    // Copy the input data to the end of the output buffer.
                    //
                    boolean alldone = copyNormalize(fOverflow, fOverflowOffset, fMostRecentData, fOutputOffset);
                    if (alldone) {
                        if (fOutputOffset >= fMostRecentData.length) {
                            //
                            // Special case - everything fit into the output buffer,
                            // except that there is no room for the nul char we use to
                            // indicate EOF. Set the overflow buffer length to zero.
                            // On the next call to this method we will detect this
                            // case and return the nul char in a chunk of its own.
                            //
                            // The test must look at the output buffer: at this point
                            // fOverflowEnd is always less than CHUNK_SIZE, because the
                            // read loop only gets here while capacity > 0.
                            //
                            fCheckOverflow = true;
                            fOverflowOffset = 0;
                            fOverflowEnd = 0;
                        } else {
                            //
                            // It all fit into the output buffer.
                            //
                            fOverflow = null;
                            fMostRecentData[fOutputOffset] = 0;
                        }
                    } else {
                        //
                        // There is still input data left over, save the remaining data as
                        // the overflow buffer for the next call.
                        //
                        fCheckOverflow = true;
                    }
                }
                break;
            }
            if (result > 0) {
                fOverflowEnd += result;
                capacity -= result;
            }
        } while (capacity > 0);
        //
        // End of stream was fully handled inside the read loop.
        //
        if (result == -1) {
            break;
        }
        if (fMostRecentData != null) {
            boolean alldone = copyNormalize(fOverflow, fOverflowOffset, fMostRecentData, fOutputOffset);
            if (fOutputOffset == CharDataChunk.CHUNK_SIZE) {
                //
                // We filled the output buffer.
                //
                if (!alldone) {
                    //
                    // The input buffer will become the next overflow buffer.
                    //
                    fCheckOverflow = true;
                }
                break;
            }
        } else {
            //
            // Now normalize the end-of-line characters and see if we need to read more
            // bytes to fill up the buffer.
            //
            fMostRecentData = recycledData;
            if (fMostRecentData == null || fMostRecentData.length < CharDataChunk.CHUNK_SIZE) {
                fMostRecentData = new char[CharDataChunk.CHUNK_SIZE];
            } else {
                recycledData = null;
            }
            copyNormalize(fOverflow, fOverflowOffset, fMostRecentData, fOutputOffset);
            if (fOutputOffset == CharDataChunk.CHUNK_SIZE) {
                //
                // The output buffer is full. We can return now.
                //
                break;
            }
        }
        //
        // We will need to get another input buffer to be able to fill the
        // output buffer completely.
        //
    }
    //
    // Update our instance variables
    //
    fLength += fOutputOffset;
    fCurrentIndex = 0;
    fCurrentChunk.setCharArray(fMostRecentData);
    return (fMostRecentChar = fMostRecentData[0]);
}
//
// Copy and normalize bytes from the overflow buffer into chars in our data buffer.
//
// Converts the bytes in[inOffset .. fOverflowEnd) into chars written to out
// starting at outOffset, stopping when the input is used up or out is full.
// A carriage return, and a carriage return followed by a linefeed, are each
// written as a single linefeed (0x0A). Bytes with the high bit set (b < 0)
// are handed to handleMultiByteChar(); all other bytes are copied as chars.
//
// On every exit except the empty-input shortcut, the final input and output
// positions are stored in fOverflowOffset and fOutputOffset by exitNormalize().
//
// Returns true when all of the input was consumed; when a multi-byte helper
// returns false, the value it left in fPartialMultiByteResult is returned.
//
private boolean copyNormalize(byte[] in, int inOffset, char[] out, int outOffset) throws Exception {
//
// Handle all edge cases before dropping into the inner loop.
//
int inEnd = fOverflowEnd;
int outEnd = out.length;
// Nothing to convert; note that the stored offsets are left untouched here.
if (inOffset == inEnd)
return true;
byte b = in[inOffset];
if (fSkipLinefeed) {
// The previous buffer ended with a carriage return that was already
// written as a linefeed; drop a linefeed that immediately follows it.
fSkipLinefeed = false;
if (b == 0x0A) {
if (++inOffset == inEnd)
return exitNormalize(inOffset, outOffset, true);
b = in[inOffset];
}
} else if (fPartialMultiByteIn > 0) {
// The previous buffer ended in the middle of a multi-byte sequence;
// finish it first. The helper reports the new positions through
// fOverflowOffset and fOutputOffset.
if (!handlePartialMultiByteChar(b, in, inOffset, inEnd, out, outOffset, outEnd))
return fPartialMultiByteResult;
inOffset = fOverflowOffset;
outOffset = fOutputOffset;
b = in[inOffset];
}
while (outOffset < outEnd) {
//
// Find the longest run that we can guarantee will not exceed the
// bounds of the outer loop.
//
int inCount = inEnd - inOffset;
int outCount = outEnd - outOffset;
if (inCount > outCount)
inCount = outCount;
// From here on inOffset points one past the byte held in b.
inOffset++;
while (true) {
// Handle every byte that needs special treatment before returning to
// the plain copy loop below.
while (b == 0x0D || b < 0) {
if (b == 0x0D) {
out[outOffset++] = 0x0A;
if (inOffset == inEnd) {
// Cannot see whether a linefeed follows; remember to skip it.
fSkipLinefeed = true;
return exitNormalize(inOffset, outOffset, true);
}
b = in[inOffset];
if (b == 0x0A) {
if (++inOffset == inEnd)
return exitNormalize(inOffset, outOffset, true);
b = in[inOffset];
}
if (outOffset == outEnd)
return exitNormalize(inOffset, outOffset, false);
} else {
if (!handleMultiByteChar(b, in, inOffset, inEnd, out, outOffset, outEnd))
return fPartialMultiByteResult;
// Relies on the helper returning true only when fOverflowOffset is
// still inside the input, since the byte at that index is read next.
inOffset = fOverflowOffset;
outOffset = fOutputOffset;
b = in[inOffset];
}
// Recompute the safe run length after the positions changed.
inCount = inEnd - inOffset;
outCount = outEnd - outOffset;
if (inCount > outCount)
inCount = outCount;
inOffset++;
}
// Plain copy loop: stops when the run is used up (inCount == 0) or
// when the next byte needs special treatment.
while (true) {
out[outOffset++] = (char)b;
if (--inCount == 0)
break;
b = in[inOffset++];
if (b == 0x0D || b < 0)
break;
}
if (inCount == 0)
break;
}
if (inOffset == inEnd)
break;
}
return exitNormalize(inOffset, outOffset, inOffset == inEnd);
}
//
//
//
//
// Records where conversion stopped (next unread input byte and next free
// output slot) in fOverflowOffset and fOutputOffset, then hands back the
// given result unchanged so callers can write "return exitNormalize(...)".
//
private boolean exitNormalize(int inOffset, int outOffset, boolean result) {
    this.fOverflowOffset = inOffset;
    this.fOutputOffset = outOffset;
    return result;
}
//
//
//
//
// Marks inCount bytes of an incomplete sequence as pending and stores the
// last three of them: bz at index inCount-1, by at inCount-2, bx at inCount-3.
//
private void savePartialMultiByte(int inCount, byte bz, byte by, byte bx) {
    fPartialMultiByteIn = inCount;
    fPartialMultiByteChar[inCount - 1] = bz;
    fPartialMultiByteChar[inCount - 2] = by;
    fPartialMultiByteChar[inCount - 3] = bx;
}
//
// Marks inCount bytes of an incomplete sequence as pending and stores the
// last two of them: bz at index inCount-1 and by at inCount-2. Any earlier
// slots of fPartialMultiByteChar keep their previous contents.
//
private void savePartialMultiByte(int inCount, byte bz, byte by) {
    fPartialMultiByteIn = inCount;
    fPartialMultiByteChar[inCount - 1] = bz;
    fPartialMultiByteChar[inCount - 2] = by;
}
//
// Marks inCount bytes of an incomplete sequence as pending and stores the
// last of them, bz, at index inCount-1. Any earlier slots of
// fPartialMultiByteChar keep their previous contents.
//
private void savePartialMultiByte(int inCount, byte bz) {
    fPartialMultiByteIn = inCount;
    fPartialMultiByteChar[inCount - 1] = bz;
}
//
// Decodes one multi-byte UTF-8 sequence whose lead byte b has already been
// read; inOffset is the index of the byte that follows b.
//
// The decoded char (or surrogate pair) is written to out at outOffset. If the
// input ends in the middle of the sequence, the bytes seen so far are saved
// with savePartialMultiByte() so the sequence can be completed from the next
// input buffer. A malformed sequence is reported through deferException()
// (declared outside this file) and a nul char is written in its place.
//
// Returns true when the caller may keep converting: the new positions are in
// fOverflowOffset / fOutputOffset, and both are still inside their buffers.
// Returns false when the caller must stop and return fPartialMultiByteResult.
//
private boolean handleMultiByteChar(byte b, byte[] in, int inOffset, int inEnd, char[] out, int outOffset, int outEnd) throws Exception {
    if (inOffset == inEnd) {
        savePartialMultiByte(1, b);
        fPartialMultiByteResult = exitNormalize(inOffset, outOffset, true);
        return false;
    }
    byte b1 = in[inOffset++];
    if ((b1 & 0xc0) != 0x80) {
        Object[] args = {
            Integer.toHexString(b & 0xff),
            Integer.toHexString(b1 & 0xff)
        };
        deferException(ImplementationMessages.ENC5, args, outOffset);
        out[outOffset++] = 0;
        return finishMultiByteChar(inOffset, inEnd, outOffset, outEnd);
    }
    if ((b & 0xe0) == 0xc0) { // 110yyyyy 10xxxxxx
        int ch = ((0x1f & b)<<6) + (0x3f & b1);
        out[outOffset++] = (char)ch;
        return finishMultiByteChar(inOffset, inEnd, outOffset, outEnd);
    }
    if (inOffset == inEnd) {
        savePartialMultiByte(2, b1, b);
        fPartialMultiByteResult = exitNormalize(inOffset, outOffset, true);
        return false;
    }
    byte b2 = in[inOffset++];
    if ((b2 & 0xc0) != 0x80) {
        Object[] args = {
            Integer.toHexString(b & 0xff),
            Integer.toHexString(b1 & 0xff),
            Integer.toHexString(b2 & 0xff)
        };
        deferException(ImplementationMessages.ENC6, args, outOffset);
        out[outOffset++] = 0;
        return finishMultiByteChar(inOffset, inEnd, outOffset, outEnd);
    }
    if ((b & 0xf0) == 0xe0) { // 1110zzzz 10yyyyyy 10xxxxxx
        int ch = ((0x0f & b)<<12) + ((0x3f & b1)<<6) + (0x3f & b2);
        out[outOffset++] = (char)ch;
        return finishMultiByteChar(inOffset, inEnd, outOffset, outEnd);
    }
    if ((b & 0xf8) != 0xf0) {
        // Not a valid lead byte for a two, three or four byte sequence.
        Object[] args = { Integer.toHexString(b & 0xff) };
        deferException(ImplementationMessages.ENC4, args, outOffset);
        out[outOffset++] = 0;
        return finishMultiByteChar(inOffset, inEnd, outOffset, outEnd);
    }
    if (inOffset == inEnd) {
        savePartialMultiByte(3, b2, b1, b);
        fPartialMultiByteResult = exitNormalize(inOffset, outOffset, true);
        return false;
    }
    byte b3 = in[inOffset++];
    if ((b3 & 0xc0) != 0x80) {
        Object[] args = {
            Integer.toHexString(b & 0xff),
            Integer.toHexString(b1 & 0xff),
            Integer.toHexString(b2 & 0xff),
            Integer.toHexString(b3 & 0xff)
        };
        deferException(ImplementationMessages.ENC7, args, outOffset);
        out[outOffset++] = 0;
        return finishMultiByteChar(inOffset, inEnd, outOffset, outEnd);
    }
    // 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
    int ch = ((0x0f & b)<<18) + ((0x3f & b1)<<12) + ((0x3f & b2)<<6) + (0x3f & b3);
    if (ch >= 0x10000) {
        // Supplementary character: write the high surrogate now and the low
        // surrogate next.
        out[outOffset++] = (char)(((ch-0x00010000)>>10)+0xd800);
        ch = (((ch-0x00010000)&0x3ff)+0xdc00);
        if (outOffset == outEnd) {
            // NOTE(review): the low surrogate is parked in fPartialSurrogatePair,
            // but fPartialMultiByteIn is not set to 4 here, which is the value
            // handlePartialMultiByteChar() tests before writing it out. Confirm
            // whether the pending low surrogate is ever emitted.
            fPartialSurrogatePair = ch;
            fPartialMultiByteResult = exitNormalize(inOffset, outOffset, inOffset == inEnd);
            return false;
        }
    }
    out[outOffset++] = (char)ch;
    return finishMultiByteChar(inOffset, inEnd, outOffset, outEnd);
}
//
// Common exit for handleMultiByteChar() once a char (or an error marker) has
// been written. If the input is used up or the output is full, the caller
// cannot safely read another byte or write another char, so the positions and
// the result for copyNormalize() are stored and false is returned. Otherwise
// the positions are stored and true is returned.
//
private boolean finishMultiByteChar(int inOffset, int inEnd, int outOffset, int outEnd) {
    if (inOffset == inEnd || outOffset == outEnd) {
        fPartialMultiByteResult = exitNormalize(inOffset, outOffset, inOffset == inEnd);
        return false;
    }
    return exitNormalize(inOffset, outOffset, true);
}
//
// Completes a multi-byte UTF-8 sequence whose first fPartialMultiByteIn bytes
// were saved at the end of the previous input buffer. b is the byte at
// in[inOffset], the first byte of the current buffer that has not been
// converted yet.
//
// Returns true when the caller may keep converting from the positions stored
// in fOverflowOffset / fOutputOffset, and false when the caller must stop and
// return fPartialMultiByteResult. Whenever a byte of the current buffer has
// been used to build the emitted char, inOffset is advanced past it before
// the positions are stored, so that the byte is not decoded a second time.
//
private boolean handlePartialMultiByteChar(byte b, byte[] in, int inOffset, int inEnd, char[] out, int outOffset, int outEnd) throws Exception {
    if (outOffset == outEnd) {
        fPartialMultiByteResult = exitNormalize(inOffset, outOffset, inOffset == inEnd);
        return false;
    }
    if (fPartialMultiByteIn == 4) {
        // Pending low surrogate.
        // NOTE(review): no code in this class assigns 4 to fPartialMultiByteIn,
        // and this branch does not clear it; confirm the intended protocol.
        out[outOffset++] = (char)fPartialSurrogatePair;
        if (outOffset == outEnd) {
            fPartialMultiByteResult = exitNormalize(inOffset, outOffset, false);
            return false;
        }
        fOutputOffset = outOffset;
        return true;
    }
    int byteIn = fPartialMultiByteIn;
    fPartialMultiByteIn = 0;
    byte b1 = 0;
    byte b2 = 0;
    byte b3 = 0;
    // The current byte continues the sequence at the position that follows
    // the saved bytes.
    switch (byteIn) {
    case 1: b1 = b; break;
    case 2: b2 = b; break;
    case 3: b3 = b; break;
    }
    // Reload the saved bytes; b becomes the lead byte (deliberate fall through).
    int i = byteIn;
    switch (byteIn) {
    case 3:
        b2 = fPartialMultiByteChar[--i];
    case 2:
        b1 = fPartialMultiByteChar[--i];
    case 1:
        b = fPartialMultiByteChar[--i];
    }
    switch (byteIn) {
    case 1:
        if ((b1 & 0xc0) != 0x80) {
            Object[] args = {
                Integer.toHexString(b & 0xff),
                Integer.toHexString(b1 & 0xff)
            };
            deferException(ImplementationMessages.ENC5, args, outOffset);
            out[outOffset++] = 0;
            break;
        }
        // fall through
    case 2:
        if ((b & 0xe0) == 0xc0) { // 110yyyyy 10xxxxxx
            int ch = ((0x1f & b)<<6) + (0x3f & b1);
            out[outOffset++] = (char)ch;
            if (byteIn < 2) {
                // b1 came from the current buffer and has been used up.
                inOffset++;
            }
            if (outOffset == outEnd) {
                fPartialMultiByteResult = exitNormalize(inOffset, outOffset, false);
                return false;
            }
            if (inOffset == inEnd) {
                fPartialMultiByteResult = exitNormalize(inOffset, outOffset, true);
                return false;
            }
            break;
        }
        if (byteIn < 2) {
            if (++inOffset == inEnd) {
                // Still incomplete: the lead byte stays at index 0, add b1.
                savePartialMultiByte(2, b1);
                fPartialMultiByteResult = exitNormalize(inOffset, outOffset, true);
                return false;
            }
            b2 = in[inOffset];
        }
        if ((b2 & 0xc0) != 0x80) {
            Object[] args = {
                Integer.toHexString(b & 0xff),
                Integer.toHexString(b1 & 0xff),
                Integer.toHexString(b2 & 0xff)
            };
            deferException(ImplementationMessages.ENC6, args, outOffset);
            out[outOffset++] = 0;
            break;
        }
        // fall through
    case 3:
        if ((b & 0xf0) == 0xe0) { // 1110zzzz 10yyyyyy 10xxxxxx
            int ch = ((0x0f & b)<<12) + ((0x3f & b1)<<6) + (0x3f & b2);
            out[outOffset++] = (char)ch;
            if (byteIn < 3) {
                // b2 came from the current buffer and has been used up.
                inOffset++;
            }
            if (outOffset == outEnd) {
                fPartialMultiByteResult = exitNormalize(inOffset, outOffset, false);
                return false;
            }
            if (inOffset == inEnd) {
                fPartialMultiByteResult = exitNormalize(inOffset, outOffset, true);
                return false;
            }
            break;
        }
        if (byteIn < 3) {
            if ((b & 0xf8) != 0xf0) {
                Object[] args = { Integer.toHexString(b & 0xff) };
                deferException(ImplementationMessages.ENC4, args, outOffset);
                out[outOffset++] = 0;
                break;
            }
            if (++inOffset == inEnd) {
                // Still incomplete: the lead byte stays at index 0, add b1 and b2.
                savePartialMultiByte(3, b2, b1);
                fPartialMultiByteResult = exitNormalize(inOffset, outOffset, true);
                return false;
            }
            b3 = in[inOffset];
        }
        if ((b3 & 0xc0) != 0x80) {
            Object[] args = {
                Integer.toHexString(b & 0xff),
                Integer.toHexString(b1 & 0xff),
                Integer.toHexString(b2 & 0xff),
                Integer.toHexString(b3 & 0xff)
            };
            deferException(ImplementationMessages.ENC7, args, outOffset);
            out[outOffset++] = 0;
            break;
        }
        // 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
        int ch = ((0x0f & b)<<18) + ((0x3f & b1)<<12) + ((0x3f & b2)<<6) + (0x3f & b3);
        // b3 is always the byte at in[inOffset]; it has been used up.
        inOffset++;
        if (ch >= 0x10000) {
            out[outOffset++] = (char)(((ch-0x00010000)>>10)+0xd800);
            ch = (((ch-0x00010000)&0x3ff)+0xdc00);
            if (outOffset == outEnd) {
                // NOTE(review): fPartialMultiByteIn is not set to 4 here, so the
                // parked low surrogate is not picked up on the next call.
                fPartialSurrogatePair = ch;
                fPartialMultiByteResult = exitNormalize(inOffset, outOffset, false);
                return false;
            }
        }
        out[outOffset++] = (char)ch;
        if (outOffset == outEnd) {
            fPartialMultiByteResult = exitNormalize(inOffset, outOffset, false);
            return false;
        }
        if (inOffset == inEnd) {
            fPartialMultiByteResult = exitNormalize(inOffset, outOffset, true);
            return false;
        }
        break;
    }
    return exitNormalize(inOffset, outOffset, true);
}
}