| /* |
| * The Apache Software License, Version 1.1 |
| * |
| * Copyright (c) 1999-2000 The Apache Software Foundation. All rights |
| * reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in |
| * the documentation and/or other materials provided with the |
| * distribution. |
| * |
| * 3. The end-user documentation included with the redistribution, |
| * if any, must include the following acknowledgment: |
| * "This product includes software developed by the |
| * Apache Software Foundation (http://www.apache.org/)." |
| * Alternately, this acknowledgment may appear in the software itself, |
| * if and wherever such third-party acknowledgments normally appear. |
| * |
| * 4. The names "Xerces" and "Apache Software Foundation" must |
| * not be used to endorse or promote products derived from this |
| * software without prior written permission. For written |
| * permission, please contact apache\@apache.org. |
| * |
| * 5. Products derived from this software may not be called "Apache", |
| * nor may "Apache" appear in their name, without prior written |
| * permission of the Apache Software Foundation. |
| * |
| * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED |
| * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
| * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR |
| * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF |
| * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
| * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT |
| * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| * SUCH DAMAGE. |
| * ==================================================================== |
| * |
| * This software consists of voluntary contributions made by many |
| * individuals on behalf of the Apache Software Foundation, and was |
| * originally based on software copyright (c) 1999, International |
| * Business Machines, Inc., http://www.ibm.com . For more information |
| * on the Apache Software Foundation, please see |
| * <http://www.apache.org/>. |
| */ |
| |
| /** |
| * $Id$ |
| */ |
| |
| // --------------------------------------------------------------------------- |
| // Includes |
| // --------------------------------------------------------------------------- |
| #include <xercesc/util/TranscodingException.hpp> |
| #include <xercesc/util/XMLString.hpp> |
| #include <xercesc/util/XMLUniDefs.hpp> |
| #include <xercesc/util/XMLUTF8Transcoder.hpp> |
| #include <xercesc/util/UTFDataFormatException.hpp> |
| |
| |
| // --------------------------------------------------------------------------- |
| // Local static data |
| // |
| // gUTFBytes |
| // A list of counts of trailing bytes for each initial byte in the input. |
| // |
| // gUTFOffsets |
| // A list of values to offset each result char type, according to how |
| // many source bytes went into making it. |
| // |
| // gFirstByteMark |
| // A list of values to mask onto the first byte of an encoded sequence, |
| // indexed by the number of bytes used to create the sequence. |
| // --------------------------------------------------------------------------- |
| static const XMLByte gUTFBytes[256] = |
| { |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 |
| , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 |
| , 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 |
| , 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 |
| }; |
| |
| static const XMLUInt32 gUTFOffsets[6] = |
| { |
| 0, 0x3080, 0xE2080, 0x3C82080, 0xFA082080, 0x82082080 |
| }; |
| |
| static const XMLByte gFirstByteMark[7] = |
| { |
| 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC |
| }; |
| |
| |
| |
| // --------------------------------------------------------------------------- |
| // XMLUTF8Transcoder: Constructors and Destructor |
| // --------------------------------------------------------------------------- |
| XMLUTF8Transcoder::XMLUTF8Transcoder(const XMLCh* const encodingName |
| , const unsigned int blockSize) : |
| |
| XMLTranscoder(encodingName, blockSize) |
| { |
| } |
| |
| XMLUTF8Transcoder::~XMLUTF8Transcoder() |
| { |
| } |
| |
| |
| // --------------------------------------------------------------------------- |
| // XMLUTF8Transcoder: Implementation of the transcoder API |
| // --------------------------------------------------------------------------- |
| unsigned int |
| XMLUTF8Transcoder::transcodeFrom(const XMLByte* const srcData |
| , const unsigned int srcCount |
| , XMLCh* const toFill |
| , const unsigned int maxChars |
| , unsigned int& bytesEaten |
| , unsigned char* const charSizes) |
| { |
| // Watch for pathological scenario. Shouldn't happen, but... |
| if (!srcCount || !maxChars) |
| return 0; |
| |
| // If debugging, make sure that the block size is legal |
| #if defined(XERCES_DEBUG) |
| checkBlockSize(maxChars); |
| #endif |
| |
| // |
| // Get pointers to our start and end points of the input and output |
| // buffers. |
| // |
| const XMLByte* srcPtr = srcData; |
| const XMLByte* srcEnd = srcPtr + srcCount; |
| XMLCh* outPtr = toFill; |
| XMLCh* outEnd = outPtr + maxChars; |
| unsigned char* sizePtr = charSizes; |
| |
| |
| |
| // |
| // We now loop until we either run out of input data, or room to store |
| // output chars. |
| // |
| while ((srcPtr < srcEnd) && (outPtr < outEnd)) |
| { |
| // Get the next leading byte out |
| const XMLByte firstByte = *srcPtr; |
| |
| // Special-case ASCII, which is a leading byte value of <= 127 |
| if (firstByte <= 127) |
| { |
| *outPtr++ = XMLCh(firstByte); |
| srcPtr++; |
| *sizePtr++ = 1; |
| continue; |
| } |
| |
| // See how many trailing src bytes this sequence is going to require |
| const unsigned int trailingBytes = gUTFBytes[firstByte]; |
| |
| // |
| // If there are not enough source bytes to do this one, then we |
| // are done. Note that we done >= here because we are implicitly |
| // counting the 1 byte we get no matter what. |
| // |
| // If we break out here, then there is nothing to undo since we |
| // haven't updated any pointers yet. |
| // |
| if (srcPtr + trailingBytes >= srcEnd) |
| break; |
| |
| // Looks ok, so lets build up the value |
| XMLUInt32 tmpVal = 0; |
| switch(trailingBytes) |
| { |
| case 5 : tmpVal += *srcPtr++; tmpVal <<= 6; |
| case 4 : tmpVal += *srcPtr++; tmpVal <<= 6; |
| case 3 : tmpVal += *srcPtr++; tmpVal <<= 6; |
| case 2 : tmpVal += *srcPtr++; tmpVal <<= 6; |
| case 1 : tmpVal += *srcPtr++; tmpVal <<= 6; |
| case 0 : tmpVal += *srcPtr++; |
| break; |
| |
| default : |
| ThrowXML(TranscodingException, XMLExcepts::Trans_BadSrcSeq); |
| } |
| tmpVal -= gUTFOffsets[trailingBytes]; |
| |
| // |
| // If it will fit into a single char, then put it in. Otherwise |
| // encode it as a surrogate pair. If its not valid, use the |
| // replacement char. |
| // |
| if (!(tmpVal & 0xFFFF0000)) |
| { |
| *sizePtr++ = trailingBytes + 1; |
| *outPtr++ = XMLCh(tmpVal); |
| } |
| else if (tmpVal > 0x10FFFF) |
| { |
| // |
| // If we've gotten more than 32 chars so far, then just break |
| // out for now and lets process those. When we come back in |
| // here again, we'll get no chars and throw an exception. This |
| // way, the error will have a line and col number closer to |
| // the real problem area. |
| // |
| if ((outPtr - toFill) > 32) |
| break; |
| |
| ThrowXML(TranscodingException, XMLExcepts::Trans_BadSrcSeq); |
| } |
| else |
| { |
| // |
| // If we have enough room to store the leading and trailing |
| // chars, then lets do it. Else, pretend this one never |
| // happened, and leave it for the next time. Since we don't |
| // update the bytes read until the bottom of the loop, by |
| // breaking out here its like it never happened. |
| // |
| if (outPtr + 1 >= outEnd) |
| break; |
| |
| // Store the leading surrogate char |
| tmpVal -= 0x10000; |
| *sizePtr++ = trailingBytes + 1; |
| *outPtr++ = XMLCh((tmpVal >> 10) + 0xD800); |
| |
| // |
| // And then the treailing char. This one accounts for no |
| // bytes eaten from the source, so set the char size for this |
| // one to be zero. |
| // |
| *sizePtr++ = 0; |
| *outPtr++ = XMLCh(tmpVal & 0x3FF) + 0xDC00; |
| } |
| } |
| |
| // Update the bytes eaten |
| bytesEaten = srcPtr - srcData; |
| |
| // Return the characters read |
| return outPtr - toFill; |
| } |
| |
| |
| unsigned int |
| XMLUTF8Transcoder::transcodeTo( const XMLCh* const srcData |
| , const unsigned int srcCount |
| , XMLByte* const toFill |
| , const unsigned int maxBytes |
| , unsigned int& charsEaten |
| , const UnRepOpts options) |
| { |
| // Watch for pathological scenario. Shouldn't happen, but... |
| if (!srcCount || !maxBytes) |
| return 0; |
| |
| // |
| // Get pointers to our start and end points of the input and output |
| // buffers. |
| // |
| const XMLCh* srcPtr = srcData; |
| const XMLCh* srcEnd = srcPtr + srcCount; |
| XMLByte* outPtr = toFill; |
| XMLByte* outEnd = toFill + maxBytes; |
| |
| while (srcPtr < srcEnd) |
| { |
| // |
| // Tentatively get the next char out. We have to get it into a |
| // 32 bit value, because it could be a surrogate pair. |
| // |
| XMLUInt32 curVal = *srcPtr; |
| |
| // |
| // If its a leading surrogate, then lets see if we have the trailing |
| // available. If not, then give up now and leave it for next time. |
| // |
| unsigned int srcUsed = 1; |
| if ((curVal >= 0xD800) && (curVal <= 0xDBFF)) |
| { |
| if (srcPtr + 1 >= srcEnd) |
| break; |
| |
| // Create the composite surrogate pair |
| curVal = ((curVal - 0xD800) << 10) |
| + ((*(srcPtr + 1) - 0xDC00) + 0x10000); |
| |
| // And indicate that we ate another one |
| srcUsed++; |
| } |
| |
| // Figure out how many bytes we need |
| unsigned int encodedBytes; |
| if (curVal < 0x80) |
| encodedBytes = 1; |
| else if (curVal < 0x800) |
| encodedBytes = 2; |
| else if (curVal < 0x10000) |
| encodedBytes = 3; |
| else if (curVal < 0x200000) |
| encodedBytes = 4; |
| else if (curVal < 0x4000000) |
| encodedBytes = 5; |
| else if (curVal <= 0x7FFFFFFF) |
| encodedBytes = 6; |
| else |
| { |
| // If the options say to throw, then throw |
| if (options == UnRep_Throw) |
| { |
| XMLCh tmpBuf[16]; |
| XMLString::binToText(curVal, tmpBuf, 16, 16); |
| ThrowXML2 |
| ( |
| TranscodingException |
| , XMLExcepts::Trans_Unrepresentable |
| , tmpBuf |
| , getEncodingName() |
| ); |
| } |
| |
| // Else, use the replacement character |
| *outPtr++ = chSpace; |
| srcPtr += srcUsed; |
| continue; |
| } |
| |
| // |
| // If we cannot fully get this char into the output buffer, |
| // then leave it for the next time. |
| // |
| if (outPtr + encodedBytes > outEnd) |
| break; |
| |
| // We can do it, so update the source index |
| srcPtr += srcUsed; |
| |
| // |
| // And spit out the bytes. We spit them out in reverse order |
| // here, so bump up the output pointer and work down as we go. |
| // |
| outPtr += encodedBytes; |
| switch(encodedBytes) |
| { |
| case 6 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL); |
| curVal >>= 6; |
| case 5 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL); |
| curVal >>= 6; |
| case 4 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL); |
| curVal >>= 6; |
| case 3 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL); |
| curVal >>= 6; |
| case 2 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL); |
| curVal >>= 6; |
| case 1 : *--outPtr = XMLByte |
| ( |
| curVal | gFirstByteMark[encodedBytes] |
| ); |
| } |
| |
| // Add the encoded bytes back in again to indicate we've eaten them |
| outPtr += encodedBytes; |
| } |
| |
| // Fill in the chars we ate |
| charsEaten = (srcPtr - srcData); |
| |
| // And return the bytes we filled in |
| return (outPtr - toFill); |
| } |
| |
| |
| bool XMLUTF8Transcoder::canTranscodeTo(const unsigned int toCheck) const |
| { |
| // We can represent anything in the Unicode (with surrogates) range |
| return (toCheck <= 0x10FFFF); |
| } |