blob: fc41daaecc1e83e9e52db36a42abad039fec4329 [file] [log] [blame]
/*
* The Apache Software License, Version 1.1
*
* Copyright (c) 1999 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Xerces" and "Apache Software Foundation" must
* not be used to endorse or promote products derived from this
* software without prior written permission. For written
* permission, please contact apache\@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* nor may "Apache" appear in their name, without prior written
* permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation, and was
* originally based on software copyright (c) 1999, International
* Business Machines, Inc., http://www.ibm.com . For more information
* on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
/**
* $Log$
* Revision 1.4 1999/12/07 23:08:41 roddey
* Add in code to test for some control characters and report them as whitespace.
* ICU is not doing this currently, so we need to do it until they get that fixed.
*
* Revision 1.3 1999/11/18 20:16:52 abagchi
* Now works with ICU 1.3.1
*
* Revision 1.2 1999/11/17 22:36:41 rahulj
* Code works with ICU transcoding service
*
* Revision 1.1.1.1 1999/11/09 01:06:07 twl
* Initial checkin
*
* Revision 1.3 1999/11/08 20:45:33 rahul
* Swat for adding in Product name and CVS comment log variable.
*
*/
// ---------------------------------------------------------------------------
// Includes
// ---------------------------------------------------------------------------
#include <util/TranscodingException.hpp>
#include "ICUTransService.hpp"
#include <string.h>
#include <uloc.h>
#include <unicode.h>
#include <ucnv.h>
#include <ustring.h>
// ---------------------------------------------------------------------------
// ICUTransService: Public, static methods
// ---------------------------------------------------------------------------
void ICUTransService::setICUPath(const char* const pathToSet)
{
uloc_setDataDirectory(pathToSet);
}
// ---------------------------------------------------------------------------
// ICUTransService: Constructors and Destructor
// ---------------------------------------------------------------------------
ICUTransService::ICUTransService()
{
}
ICUTransService::~ICUTransService()
{
}
// ---------------------------------------------------------------------------
// ICUTransService: The virtual transcoding service API
// ---------------------------------------------------------------------------
int ICUTransService::compareIString(const XMLCh* const comp1
, const XMLCh* const comp2)
{
const XMLCh* psz1 = comp1;
const XMLCh* psz2 = comp2;
unsigned int curCount = 0;
while (true)
{
// If an inequality, then return the difference
if (Unicode::toUpperCase(*psz1) != Unicode::toUpperCase(*psz2))
return int(*psz1) - int(*psz2);
// If either has ended, then they both ended, so equal
if (!*psz1 || !*psz2)
break;
// Move upwards for the next round
psz1++;
psz2++;
}
return 0;
}
int ICUTransService::compareNIString(const XMLCh* const comp1
, const XMLCh* const comp2
, const unsigned int maxChars)
{
const XMLCh* psz1 = comp1;
const XMLCh* psz2 = comp2;
unsigned int curCount = 0;
while (true)
{
// If an inequality, then return difference
if (Unicode::toUpperCase(*psz1) != Unicode::toUpperCase(*psz2))
return int(*psz1) - int(*psz2);
// If either ended, then both ended, so equal
if (!*psz1 || !*psz2)
break;
// Move upwards to next chars
psz1++;
psz2++;
//
// Bump the count of chars done. If it equals the count then we
// are equal for the requested count, so break out and return
// equal.
//
curCount++;
if (maxChars == curCount)
break;
}
return 0;
}
bool ICUTransService::isSpace(const XMLCh toCheck) const
{
//
// <TBD>
// For now, we short circuit some of the control chars because ICU
// is not correctly reporting them as space. Later, when they change
// this, we can get rid of this special case.
//
if ((toCheck == 0x09)
|| (toCheck == 0x0A)
|| (toCheck == 0x0D))
{
return true;
}
return (Unicode::isSpaceChar(toCheck) != 0);
}
XMLTranscoder* ICUTransService::makeNewDefTranscoder()
{
//
// Try to create a default converter. If it fails, return a null pointer
// which will basically cause the system to give up because we really can't
// do anything without one.
//
UErrorCode uerr = U_ZERO_ERROR;
UConverter* converter = ucnv_open(NULL, &uerr);
if (!converter)
return 0;
// That went ok, so create an ICU transcoder wrapper and return it
return new ICUTranscoder(converter, 0);
}
XMLTranscoder*
ICUTransService::makeNewTranscoderFor( const XMLCh* const encodingName
, XMLTransService::Codes& resValue
, const unsigned int blockSize)
{
UErrorCode uerr = U_ZERO_ERROR;
UConverter* converter = ucnv_openU(encodingName, &uerr);
if (!converter)
{
resValue = XMLTransService::UnsupportedEncoding;
return 0;
}
return new ICUTranscoder(converter, blockSize);
}
// ---------------------------------------------------------------------------
// ICUTranscoder: Constructors and Destructor
// ---------------------------------------------------------------------------
ICUTranscoder::ICUTranscoder( UConverter* const toAdopt
, const unsigned int blockSize) :
fCharOfsBuf(0)
, fConverter(toAdopt)
{
// There won't be a block size if this is for a default transcoder
if (blockSize)
fCharOfsBuf = new long[blockSize];
}
ICUTranscoder::~ICUTranscoder()
{
delete [] fCharOfsBuf;
// If there is a converter, ask ICU to clean it up
if (fConverter)
{
// <TBD> Does this actually delete the structure???
ucnv_close(fConverter);
fConverter = 0;
}
}
// ---------------------------------------------------------------------------
// ICUTranscoder: The virtual transcoder API
// ---------------------------------------------------------------------------
unsigned int ICUTranscoder::calcRequiredSize(const XMLCh* const srcText)
{
if (!srcText)
return 0;
XMLMutexLock lockConverter(&fMutex);
UErrorCode err = U_ZERO_ERROR;
const int32_t targetCap = ucnv_fromUChars
(
fConverter
, 0
, 0
, srcText
, &err
);
if (err != U_BUFFER_OVERFLOW_ERROR)
return 0;
return (unsigned int)targetCap;
}
unsigned int ICUTranscoder::calcRequiredSize(const char* const srcText)
{
if (!srcText)
return 0;
XMLMutexLock lockConverter(&fMutex);
UErrorCode err = U_ZERO_ERROR;
const int32_t targetCap = ucnv_toUChars
(
fConverter
, 0
, 0
, srcText
, strlen(srcText)
, &err
);
if (err != U_BUFFER_OVERFLOW_ERROR)
return 0;
// Subtract one since it includes the terminator space
return (unsigned int)(targetCap - 1);
}
XMLCh ICUTranscoder::transcodeOne( const char* const srcData
, const unsigned int srcBytes
, unsigned int& bytesEaten)
{
// Check for stupid stuff
if (!srcBytes)
return 0;
XMLMutexLock lockConverter(&fMutex);
UErrorCode err = U_ZERO_ERROR;
const char* startSrc = srcData;
const XMLCh chRet = ucnv_getNextUChar
(
fConverter
, &startSrc
, (srcData + srcBytes) - 1
, &err
);
// Bail out if an error
if (U_FAILURE(err))
return 0;
// Calculate the bytes eaten and return the char
bytesEaten = startSrc - srcData;
return chRet;
}
char* ICUTranscoder::transcode(const XMLCh* const toTranscode)
{
char* retBuf = 0;
// Check for a couple of special cases
if (!toTranscode)
return 0;
if (!*toTranscode)
{
retBuf = new char[1];
retBuf[0] = 0;
return retBuf;
}
XMLMutexLock lockConverter(&fMutex);
// Caculate a return buffer size not too big, but less likely to overflow
int32_t targetLen = (int32_t)(u_strlen(toTranscode) * 1.25);
// Allocate the return buffer
retBuf = new char[targetLen + 1];
//Convert the Unicode string to char* using Intl stuff
UErrorCode err = U_ZERO_ERROR;
int32_t targetCap = ucnv_fromUChars
(
fConverter
, retBuf
, targetLen + 1
, toTranscode
, &err
);
// If targetLen is not enough then buffer overflow might occur
if (err == U_BUFFER_OVERFLOW_ERROR)
{
// Reset the error, delete the old buffer, allocate a new one, and try again
err = U_ZERO_ERROR;
delete [] retBuf;
retBuf = new char[targetCap];
targetCap = ucnv_fromUChars(fConverter, retBuf, targetCap, toTranscode, &err);
}
if (U_FAILURE(err))
{
delete [] retBuf;
return 0;
}
// Cap it off and return
retBuf[targetCap] = 0;
return retBuf;
}
bool ICUTranscoder::transcode( const XMLCh* const toTranscode
, char* const toFill
, const unsigned int maxChars)
{
// Watch for a few psycho corner cases
if (!toTranscode || !maxChars)
{
toFill[0] = 0;
return true;
}
if (!*toTranscode)
{
toFill[0] = 0;
return true;
}
XMLMutexLock lockConverter(&fMutex);
UErrorCode err = U_ZERO_ERROR;
int32_t targetCap;
targetCap = ucnv_fromUChars(fConverter, toFill, maxChars + 1, toTranscode, &err);
if (U_FAILURE(err))
return false;
return true;
}
XMLCh* ICUTranscoder::transcode(const char* const toTranscode)
{
// Watch for a few pyscho corner cases
if (!toTranscode)
return 0;
XMLCh* retVal = 0;
if (!*toTranscode)
{
retVal = new XMLCh[1];
retVal[0] = 0;
return retVal;
}
XMLMutexLock lockConverter(&fMutex);
//
// Get the length of the string to transcode. The Unicode string will
// almost always be no more chars than were in the source, so this is
// the best guess as to the storage needed.
//
const int32_t srcLen = (int32_t)strlen(toTranscode);
//
// Here we don't know what the target length will be so use 0 and expect
// an U_BUFFER_OVERFLOW_ERROR in which case it'd get resolved by the
// correct capacity value.
//
UErrorCode err = U_ZERO_ERROR;
int32_t targetCap;
targetCap = ucnv_toUChars
(
fConverter
, 0
, 0
, toTranscode
, srcLen
, &err
);
if (err != U_BUFFER_OVERFLOW_ERROR)
return 0;
err = U_ZERO_ERROR;
retVal = new XMLCh[targetCap];
ucnv_toUChars
(
fConverter
, retVal
, targetCap
, toTranscode
, srcLen
, &err
);
if (U_FAILURE(err))
return 0;
return retVal;
}
bool ICUTranscoder::transcode( const char* const toTranscode
, XMLCh* const toFill
, const unsigned int maxChars)
{
// Check for a couple of psycho corner cases
if (!toTranscode || !maxChars)
{
toFill[0] = 0;
return true;
}
if (!*toTranscode)
{
toFill[0] = 0;
return true;
}
XMLMutexLock lockConverter(&fMutex);
UErrorCode err = U_ZERO_ERROR;
const int32_t srcLen = (int32_t)strlen(toTranscode);
ucnv_toUChars
(
fConverter
, toFill
, maxChars + 1
, toTranscode
, srcLen
, &err
);
if (U_FAILURE(err))
return false;
return true;
}
unsigned int
ICUTranscoder::transcodeXML(const char* const srcData
, const unsigned int srcCount
, XMLCh* const toFill
, const unsigned int maxChars
, unsigned int& bytesEaten)
{
//
// If the input encoding uses fixed size characters, we can use a
// simpler, faster approach to computing the character sizes to be
// returned in the charSizes array.
//
const int maxCharSize = ucnv_getMaxCharSize(fConverter);
const int minCharSize = ucnv_getMinCharSize(fConverter);
//
// Set up pointers to the source and destination buffers.
//
UChar* startTarget = toFill;
const char* startSrc = srcData;
const char* endSrc = srcData + srcCount;
//
// Transoode the buffer. Buffer overflow errors are normal, occuring
// when the raw input buffer holds more characters than will fit
// in the Unicode output buffer.
//
UErrorCode err = U_ZERO_ERROR;
ucnv_toUnicode
(
fConverter
, &startTarget
, toFill + maxChars
, &startSrc
, endSrc
, 0
, false
, &err
);
if ((err != U_ZERO_ERROR) && (err != U_INDEX_OUTOFBOUNDS_ERROR))
ThrowXML(TranscodingException, XML4CExcepts::Trans_CouldNotXCodeXMLData);
// Calculate the bytes eaten and store in caller's param
bytesEaten = startSrc - srcData;
// Return the chars we put into the target buffer
return startTarget - toFill;
}