blob: 9d587b35c10fb34b3638113bf76a59ec636197b8 [file] [log] [blame]
/**********************************************************************
// @@@ START COPYRIGHT @@@
//
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//
// @@@ END COPYRIGHT @@@
**********************************************************************/
/* -*-C++-*-
*****************************************************************************
*
* File: conversionLocale.h
* RCS: $Id:
* Description: The implementation of locale-related conversion routines
*
*
* Created: 7/8/98
* Modified: $ $Date: 1998/08/10 16:00:39 $ (GMT)
* Language: C++
* Status: $State: Exp $
*
*
*
*
*****************************************************************************
*/
#include "Platform.h"
#include "ComOperators.h"
#include "ComASSERT.h"
#include "NLSConversion.h"
#include "charinfo.h"
#include "csconvert.h"
// Translates a CharInfo character-set code into the corresponding
// cnv_charset enumerator. Any code that has no mapping listed below
// yields cnv_UnknownCharSet.
cnv_charset convertCharsetEnum (Int32 inset)
{
  cnv_charset mapped = cnv_UnknownCharSet;

  switch (inset)
  {
    case CharInfo::ISO88591: mapped = cnv_ISO88591; break;
    case CharInfo::SJIS:     mapped = cnv_SJIS;     break;
    case CharInfo::UNICODE:  mapped = cnv_UTF16;    break;
    case CharInfo::EUCJP:    mapped = cnv_EUCJP;    break;
    case CharInfo::BIG5:     mapped = cnv_BIG5;     break;
    case CharInfo::GB18030:  mapped = cnv_GB18030;  break;
    case CharInfo::UTF8:     mapped = cnv_UTF8;     break;
    case CharInfo::KSC5601:  mapped = cnv_KSC;      break;
    case CharInfo::GB2312:   mapped = cnv_GB2312;   break;
    case CharInfo::GBK:      mapped = cnv_GBK;      break;
    default:                 /* keep cnv_UnknownCharSet */ break;
  }

  return mapped;
}
// Unicode to UTF8 conversion.
//
// Takes a Unicode UCS2/UTF16 string and returns its UTF8 equivalent.
//
// Parameters:
//   unicodeString         - in: the UCS2/UTF16 source string.
//   heap                  - in: heap passed to unicodeTocset() and
//                           checkSpace(), and used to release the
//                           intermediate conversion buffer.
//   utf8String            - in/out: optional destination buffer for the
//                           UTF8 result; it is handed to checkSpace(),
//                           which decides which buffer is actually used.
//                           The original notes state that when it is NULL
//                           or not big enough, memory is allocated from
//                           'heap' (or from the C run-time heap when
//                           'heap' is NULL) -- that logic lives in
//                           checkSpace(), not in this function.
//   addNullAtEnd          - in: when TRUE, a '\0' byte is stored right
//                           after the converted data.
//   allowInvalidCodePoint - in: forwarded unchanged to unicodeTocset().
//
// Returns the buffer holding the UTF8 data (its string length is set to
// the number of converted bytes), or NULL when the translation fails or
// checkSpace() cannot provide a destination buffer.
//
charBuf* unicodeToUtf8(const NAWcharBuf& unicodeString, CollHeap *heap,
                       charBuf*& utf8String, NABoolean addNullAtEnd,
                       NABoolean allowInvalidCodePoint)
{
  charBuf* cbufPtr = NULL; // tell unicodeTocset() to allocate a new buffer
  Int32 errorcode = 0;

  charBuf* res = unicodeTocset ( unicodeString  // in  - const NAWcharBuf &
                               , heap           // in  - CollHeap *
                               , cbufPtr
                               , (Lng32) SQLCHARSETCODE_UTF8 // in - Lng32 targetCharSet
                               , errorcode      // out - Int32 &
                               , addNullAtEnd
                               , allowInvalidCodePoint
                               );

  if ( res == NULL || errorcode != 0 ) // translation failed
  {
    // A buffer may have been handed back even though an error code was
    // set; release it so the failure path does not leak it.
    if ( res != NULL )
    {
      NADELETE(res, charBuf, heap);
    }
    return NULL;
  }

  const Int32 finalLengthInBytes = res->getStrLen();

  charBuf* output = checkSpace(heap, finalLengthInBytes, utf8String, addNullAtEnd);
  if ( output == NULL )
  {
    NADELETE(res, charBuf, heap);
    return NULL;
  }

  // Copy the converted bytes out of the intermediate buffer and record
  // the resulting length on the destination.
  memmove(output->data(), res->data(), finalLengthInBytes);
  output->setStrLen(finalLengthInBytes);

  if ( addNullAtEnd == TRUE )
    output->data()[finalLengthInBytes] = '\0';

  NADELETE(res, charBuf, heap);
  return output;
}
// Converts a UCS2/UTF16 string into the requested character set, writing
// the result into the caller-supplied byte buffer.
//
//   charset               - target character set (a CharInfo code); it
//                           selects the conversion routine below.
//   wstr / wstrLen        - source string and its length in NAWchars.
//   buf / bufLen          - destination buffer and its size in bytes.
//   addNullAtEnd,
//   allowInvalidCodePoint - forwarded to the conversion routine.
//
// Returns the string length reported by the buffer the conversion routine
// hands back, or 0 when the routine returns NULL or the character set is
// not handled here.
Lng32 UnicodeStringToLocale(Lng32 charset, const NAWchar* wstr, Lng32 wstrLen,
                            char* buf, Lng32 bufLen, NABoolean addNullAtEnd,
                            NABoolean allowInvalidCodePoint)
{
  // Wrap the caller's storage so the conversion routines can use it.
  charBuf targetBuf((unsigned char*)buf, bufLen);
  charBuf* targetPtr = &targetBuf;
  const NAWcharBuf sourceBuf((NAWchar*)wstr, wstrLen);

  charBuf* converted = 0;
  Int32 convError = 0;

  switch (charset)
  {
#ifdef IS_MP /* :cnu -- As of 8/30/2011, not used in SQ SQL */
    case CharInfo::KANJI_MP:
      converted = unicodeToSjis(sourceBuf, 0, targetPtr, addNullAtEnd,
                                allowInvalidCodePoint);
      break;

    case CharInfo::KSC5601_MP:
      converted = unicodeToKsc5601(sourceBuf, 0, targetPtr, addNullAtEnd,
                                   allowInvalidCodePoint);
      break;
#endif

    // ISO88591 goes through its own dedicated routine rather than the
    // generic unicodeTocset() used for the character sets that follow.
    case CharInfo::ISO88591:
      converted = unicodeToISO88591(sourceBuf, 0, targetPtr, addNullAtEnd,
                                    allowInvalidCodePoint);
      break;

    case CharInfo::EUCJP:
    case CharInfo::GB18030:
    case CharInfo::GB2312:
    case CharInfo::GBK:
    case CharInfo::KSC5601:
    case CharInfo::BIG5:
    case CharInfo::UTF8:
    case CharInfo::SJIS:
      converted = unicodeTocset(sourceBuf, 0, targetPtr, charset, convError,
                                addNullAtEnd, allowInvalidCodePoint);
      break;

    default:
      break;
  }

  if (!converted)
    return 0;

  return converted->getStrLen();
}
// Converts a string in the given character set into UCS2/UTF16, writing
// into the caller-supplied NAWchar buffer.
//
//   charset              - source character set (a CharInfo code).
//   str / strLen         - source bytes and their count.
//   wstrBuf / wstrBufLen - destination buffer and its size in NAWchars.
//   addNullAtEnd         - request a terminating NAWchar NUL.
//
// Returns the output length in NAWchars on success, or 0 when the
// character set is unknown or the conversion reports an error (in which
// case the caller must not use the data in wstrBuf).
Lng32
LocaleStringToUnicode(Lng32 charset, const char* str, Lng32 strLen,
                      NAWchar* wstrBuf, Lng32 wstrBufLen, NABoolean addNullAtEnd)
{
  // ISO88591 keeps using the older ISO88591ToUnicode() routine instead of
  // LocaleToUTF16(). This retains the historical "pass through" behavior,
  // so ISO 8859-15 (a.k.a. Latin-9) characters stored in a
  // CHARACTER SET ISO88591 target column continue to work.
  if (charset == (Lng32) CharInfo::ISO88591)
  {
    NAWcharBuf target(wstrBuf, wstrBufLen);
    NAWcharBuf* targetPtr = &target;
    NAWcharBuf* converted =
      ISO88591ToUnicode(charBuf((unsigned char*)str, strLen), 0,
                        targetPtr, addNullAtEnd);
    if (!converted)
      return 0;
    return converted->getStrLen();
  }

  // Every other character set goes through LocaleToUTF16().
  const enum cnv_charset sourceCS = convertCharsetEnum(charset);
  if (sourceCS == cnv_UnknownCharSet)
    return 0; // nothing we can do; exit the routine

  char * firstUntranslated = NULL;
  UInt32 bytesWritten = 0;
  UInt32 charsTranslated = 0;
  const UInt32 capacityInBytes = wstrBufLen*sizeof(NAWchar);

  const Int32 status =
    LocaleToUTF16(cnv_version1,            // const enum cnv_version version
                  str,                     // const char *in_bufr
                  strLen,                  // const int in_len in # of bytes
                  (const char *)wstrBuf,   // const char *out_bufr
                  (const Int32)capacityInBytes,
                  sourceCS,                // enum cnv_charset charset -- charset of in_bufr
                  firstUntranslated,       // char * & first_untranslated_char
                  &bytesWritten,           // unsigned int *output_data_len_p
                  0,                       // const int cnv_flags (default is 0)
                  (const Int32)addNullAtEnd,
                  &charsTranslated);       // unsigned int *translated_char_cnt_p

  const UInt32 wideCharsWritten = bytesWritten/sizeof(NAWchar);

  if (status == 0) // success
    return wideCharsWritten; // includes the NULL terminator if (addNullAtEnd == TRUE)

  // On failure LocaleToUTF16 does not add the NULL terminator, so add one
  // here when the caller asked for it. If the reported output length does
  // not fit inside the buffer, the last slot is used instead (assuming
  // wstrBufLen includes room for the terminator when addNullAtEnd is TRUE).
  if (addNullAtEnd && wstrBuf && wstrBufLen > 0)
  {
    const UInt32 terminatorAt =
      (wideCharsWritten < (UInt32)wstrBufLen) ? wideCharsWritten
                                              : (UInt32)(wstrBufLen-1);
    wstrBuf[terminatorAt] = WIDE_('\0');
  }

  return 0; // tell the caller not to use data in wstrBuf
}
// ***************************************************************
// * Convert the string to UTF8
// ***************************************************************
#if 0 /* As of 8/30/2011, there are no callers in SQ SQL */
// localeConvertToUTF8 (compiled out):
//
// Converts the NUL-terminated string 'source', encoded in 'charset', to
// UTF8 and stores the NUL-terminated result in 'target'.
//
//   source       - in:  NUL-terminated input string.
//   sourceLen    - in:  only used when sizing the work buffer.
//   target       - out: destination buffer; it must be at least
//                       sourceLen + 8 * strlen(source) + 1 bytes long.
//   targetLen    - in:  size of 'target' in bytes.
//   charset      - in:  source character set (compared against and cast
//                       to cnv_charset).
//   heap         - in:  not used; the work buffer comes from new[].
//   rtnCharCount - out (optional): character count reported by the most
//                       recent LocaleToUTF8() call.
//   errorByteOff - out (optional): offset, relative to the start of
//                       'source', of the first untranslated byte
//                       reported by the most recent LocaleToUTF8() call.
//
// Returns 0 on success, otherwise one of the CNV_ERR_* codes.
//
// NOTE(review): strlen(buffer) is only reliable if LocaleToUTF8() leaves
// at least one zero byte after its output; the work buffer is cleared
// before every call to make that as likely as possible -- confirm
// against LocaleToUTF8() before re-enabling this routine.
Int32 localeConvertToUTF8(char* source,
                          Lng32 sourceLen,
                          char* target,
                          Lng32 targetLen,
                          Lng32 charset,
                          CollHeap * heap,
                          Int32 *rtnCharCount,
                          Int32 *errorByteOff)
{
  char * buffer = NULL;
  Lng32 bufferLen = 0;
  Int32 retCode = 1;
  char *OrigSource = source;

  // If the input string is invalid, simply return
  if (source == NULL ||
      strlen(source) == 0)
  {
    return CNV_ERR_NOINPUT;
  }

  // If the input charset is UTF8, simply return.
  if (charset == cnv_UTF8)
  {
    return CNV_ERR_NO_CONVERSION_NEEDED;
  }

  // The resulting string will be bigger by a given factor.
  Int32 multiplier = 8; // Includes future charset multiplier sizes
  bufferLen = sourceLen + ((Lng32)strlen(source) * multiplier) + 1;

  // Check that the target buffer was provided and its length
  // will be able to hold the converted characters
  if (target == NULL ||
      targetLen < bufferLen)
  {
    return CNV_ERR_TARGET_SIZE_INVALID;
  }

  target[0] = '\0';
  buffer = new char [bufferLen];

  char * pinstr = source;               // Pointer to the input buffer.
  char * p1stUnstranslatedChar = NULL;
  UInt32 utf8StrLenInBytes = 0;         // 64-bit
  UInt32 charCount = 0;                 // number of characters translated/converted
  Lng32 inStrLen = (Lng32)strlen(source);

  // Keep converting while the previous pass ended with a retryable status.
  // The unrecoverable errors are tested with && (the previous ||-chain was
  // always true and therefore never stopped the loop).
  for (Lng32 loopCounter = 0;
       retCode != 0 &&
       retCode != CNV_ERR_INVALID_CHAR &&
       retCode != CNV_ERR_INVALID_CS &&
       retCode != CNV_ERR_NOINPUT &&
       loopCounter < 16;                // avoid infinite loop
       loopCounter++)
  {
    // Clear the whole work buffer (sizeof(buffer) would only cover the
    // size of a pointer) so no stale bytes from an earlier pass remain.
    memset(buffer, '\0', bufferLen);

    retCode = LocaleToUTF8(cnv_version1,
                           pinstr,
                           inStrLen,
                           (const char*)buffer,
                           bufferLen,
                           (cnv_charset)charset,
                           (char* &)p1stUnstranslatedChar,
                           &utf8StrLenInBytes, // 64-bit
                           (const Int32)1,     // addNullAtEnd_flag == TRUE
                           &charCount          // 64-bit
                           );

    if (rtnCharCount) *rtnCharCount = (Int32)charCount;
    if (errorByteOff) *errorByteOff = p1stUnstranslatedChar - OrigSource;

    switch(retCode)
    {
      case 0:
        if (strlen(target) + strlen(buffer) >= (size_t)targetLen) // avoid overflow
        {
          retCode = CNV_ERR_TARGET_SIZE_INVALID;
          // no need to be fancy - this condition is not supposed to happen anyway
          // just chop off the extra bytes - last character in target may be chopped
          // right in the middle
          if ((size_t)targetLen > strlen(target))
          {
            buffer[targetLen - strlen(target) - 1] = '\0';
            strcat(target, buffer); // target is always NUL-terminated here
          }
        }
        else // have enough room
        {
          strcat(target, buffer);   // target is always NUL-terminated here
        }
        loopCounter = 8888; // exit loop
        break;

      case CNV_ERR_BUFFER_OVERRUN:
        // The extra "+ 1" reserves room for the '?' appended below.
        if (strlen(target) + strlen(buffer) + 1 < (size_t)targetLen) // have enough room
        {
          // strcat keeps target NUL-terminated; strncpy with a count of
          // strlen(buffer) would not write a terminator.
          strcat(target, buffer);
          strcat(target, "?");      // Force a question mark as output

          // We're going again: shrink the remaining input length first,
          // then advance the input pointer (doing it the other way round
          // always subtracts zero).
          inStrLen = (Lng32)(inStrLen - (p1stUnstranslatedChar - pinstr));
          pinstr = p1stUnstranslatedChar;
          p1stUnstranslatedChar = NULL;
          // intentionally keep the retCode == CNV_ERR_BUFFER_OVERRUN setting
          // just in case we exceed the loop count limit
          // go back to the beginning of the for loop
        }
        else // avoid overflow
        {
          retCode = CNV_ERR_TARGET_SIZE_INVALID;
          // no need to be fancy - this condition is not supposed to happen anyway
          // just chop off the extra bytes - last character in target may be chopped
          // right in the middle
          if ((size_t)targetLen > strlen(target))
          {
            buffer[targetLen - strlen(target) - 1] = '\0';
            strcat(target, buffer);
          }
          loopCounter = 8888; // exit loop
        }
        break;

      case CNV_ERR_INVALID_CHAR:
      case CNV_ERR_INVALID_CS:
      case CNV_ERR_NOINPUT:
        // Unrecoverable; retCode already holds the code to return and the
        // loop condition ends the loop.
        break;

      default:
        retCode = CNV_ERR_INVALID_CHAR; // Bad character set conversion
        break;
    }
  }

  delete[] buffer;
  buffer = NULL;
  return retCode;
}
#endif /* As of 8/30/2011, no callers in SQ SQL */
// ***************************************************************
// * Encode the string from a UTF8 multibyte string to the
// * designated charset
// ***************************************************************
#if 0 /* As of 8/30/2011, there are no callers in SQ SQL */
// UTF8ConvertToLocale (compiled out):
//
// Converts the NUL-terminated UTF8 string 'source' to 'charset' and stores
// the NUL-terminated result in 'target'.
//
//   source       - in:  NUL-terminated UTF8 input string.
//   sourceLen    - in:  not used; lengths are taken with strlen(source).
//   target       - out: destination buffer; it must be at least
//                       strlen(source) + 1 bytes long.
//   targetLen    - in:  size of 'target' in bytes.
//   charset      - in:  target character set (compared against and cast
//                       to cnv_charset).
//   heap         - in:  not used; the work buffer comes from new[].
//   charCount    - out (optional): character count reported by the most
//                       recent UTF8ToLocale() call.
//   errorByteOff - out (optional): offset, relative to the start of
//                       'source', of the first unconverted byte reported
//                       by the most recent UTF8ToLocale() call.
//
// Returns 0 on success, otherwise one of the CNV_ERR_* codes.
//
// NOTE(review): strlen(buffer) is only reliable if UTF8ToLocale() leaves
// at least one zero byte after its output; the work buffer is cleared
// before every call to make that as likely as possible -- confirm
// against UTF8ToLocale() before re-enabling this routine.
Int32 UTF8ConvertToLocale(char* source,
                          Lng32 sourceLen,
                          char* target,
                          Lng32 targetLen,
                          Lng32 charset,
                          CollHeap *heap,
                          Int32 *charCount,
                          Int32 *errorByteOff)
{
  Int32 retCode = 1;
  char * OrigSource = source;

  // If the input string is invalid, simply return
  if (source == NULL ||
      strlen(source) == 0)
  {
    return CNV_ERR_NOINPUT;
  }

  // If the ISO_MAPPING is UTF8, simply return the string since it is already UTF8.
  if (charset == cnv_UTF8)
  {
    return CNV_ERR_NO_CONVERSION_NEEDED;
  }

  Lng32 bufferLen = (Lng32)strlen(source) + 1;
  char * buffer = NULL;

  // Check that the target buffer was provided and its length
  // will be able to hold the converted characters
  if (target == NULL ||
      targetLen < bufferLen)
  {
    return CNV_ERR_TARGET_SIZE_INVALID;
  }

  target[0] = '\0';
  buffer = new char [bufferLen];

  char* punstr = 0;   // Pointer to the first unconverted character
                      // (either due to small buffer size or
                      // conversion error).
  UInt32 outLen = 0;  // output data length in number of bytes
  UInt32 numTran = 0; // number of characters translated

  // Keep converting while the previous pass ended with a retryable status.
  // The unrecoverable errors are tested with && (the previous ||-chain was
  // always true and therefore never stopped the loop).
  for (Lng32 loopCounter = 0;
       retCode != 0 &&
       retCode != CNV_ERR_INVALID_CHAR &&
       retCode != CNV_ERR_INVALID_CS &&
       retCode != CNV_ERR_NOINPUT &&
       loopCounter < 16;   // Avoid infinite loop
       loopCounter++)
  {
    // Clear the whole work buffer (sizeof(buffer) would only cover the
    // size of a pointer) so no stale bytes from an earlier pass remain.
    memset(buffer, '\0', bufferLen);

    retCode = UTF8ToLocale(cnv_version1,
                           (const char*)source,
                           (Lng32)strlen(source),
                           (const char*)buffer,
                           bufferLen,
                           (cnv_charset)charset,
                           (char* &)punstr,
                           &outLen,
                           1,        // addNullAtEnd_flag == TRUE
                           0,        // allow_invalids == FALSE
                           &numTran, // 64-bit
                           0);

    if (charCount) *charCount = (Int32)numTran;
    if (errorByteOff) *errorByteOff = punstr - OrigSource;

    switch(retCode)
    {
      case 0:
        if (strlen(target) + strlen(buffer) >= (size_t)targetLen) // avoid overflow
        {
          retCode = CNV_ERR_TARGET_SIZE_INVALID;
          // no need to be fancy - this condition is not supposed to happen anyway
          // just chop off the extra bytes - last character in target may be chopped
          // right in the middle
          if ((size_t)targetLen > strlen(target)) // keep the index in range
          {
            buffer[targetLen - strlen(target) - 1] = '\0';
            strcat(target, buffer); // target is always NUL-terminated here
          }
        }
        else // have enough room
        {
          strcat(target, buffer);   // target is always NUL-terminated here
        }
        loopCounter = 8888; // exit loop
        break;

      case CNV_ERR_BUFFER_OVERRUN:
        if (strlen(target) + strlen(buffer) < (size_t)targetLen)
        {
          // strcat keeps target NUL-terminated; strncpy with a count of
          // strlen(buffer) would not write a terminator.
          strcat(target, buffer);
          // NOTE(review): assumes UTF8ToLocale() sets punstr on a buffer
          // overrun; a NULL here would break strlen(source) on the next pass.
          source = punstr; // We're going again, adjust the pointer to the input buffer.
          // intentionally keep the retCode == CNV_ERR_BUFFER_OVERRUN setting
          // just in case we exceed the loop count limit
          // go back to the beginning of the for loop
        }
        else // avoid overflow
        {
          retCode = CNV_ERR_TARGET_SIZE_INVALID;
          // no need to be fancy - this condition is not supposed to happen anyway
          // just chop off the extra bytes - last character in target may be chopped
          // right in the middle
          if ((size_t)targetLen > strlen(target)) // keep the index in range
          {
            buffer[targetLen - strlen(target) - 1] = '\0';
            strcat(target, buffer);
          }
          loopCounter = 8888; // exit loop
        }
        break;

      case CNV_ERR_INVALID_CHAR:
      case CNV_ERR_INVALID_CS:
      case CNV_ERR_NOINPUT:
        // Unrecoverable; retCode already holds the code to return and the
        // loop condition ends the loop.
        break;

      default:
        retCode = CNV_ERR_INVALID_CHAR; // Bad character set conversion
        break;
    }
  }

  delete[] buffer;
  buffer = NULL;
  return retCode;
}
#endif /* As of 8/30/2011, no callers in SQ SQL */
// -----------------------------------------------------------------------
// ComputeWidthInBytesOfMbsForDisplay:
//
// Returns the display width (in bytes) that is the closest to the
// specified maximum display width (in bytes) without chopping the
// rightmost multi-byte characters into two parts so that we do not
// encounter the situation where the first part of the multi-byte
// character is in the current display line and the other part of
// the character is in the next display line.
//
// If an error is encountered, returns the error code (a negative number)
// defined in w:/common/csconvert.h.
//
//
// In parameter pv_eCharSet contains the character set attribute
// of the input string passed in via the parameter pp_cMultiByteStr.
//
// The out parameter pr_iNumOfTranslatedChars contains the number of
// the actual (i.e., UCS4) characters translated.
//
// The out parameter pr_iNumOfNAWchars contains the number of UCS2
// characters (NAwchar[acters]) instead of the number of the actual
// (i.e., UCS4) characters.
//
// Note that classes NAMemory and CollHeap are the same except for
// the names.
// -----------------------------------------------------------------------
// -----------------------------------------------------------------------
// ComputeStrLenInNAWchars:
//
// Returns the length of the input string (in the specified character set)
// in number of NAWchar(acters) - Note that a UTF16 character (i.e., a
// surrogate pair) will have a count of 2 NAWchar(acters).
//
// Returns 0 when pStr is NULL or strLenInBytes is 0. For CharInfo::UCS2
// input the length is simply strLenInBytes / BYTES_PER_NAWCHAR; any other
// character set is converted with LocaleToUTF16() into a temporary buffer
// allocated from workspaceHeap, and the converted size is reported.
//
// Returns an error code (a negative number) if an error is encountered.
// The error code values are defined in w:/common/csconvert.h.
// -----------------------------------------------------------------------
Int32 ComputeStrLenInNAWchars (const char * pStr,
                               const Int32 strLenInBytes,
                               const CharInfo::CharSet strCS,
                               NAMemory *workspaceHeap) // in - default is NULL (the C++ runtime heap)
{
  if (pStr == NULL || strLenInBytes == 0)
    return 0;

  if (strCS == CharInfo::UCS2)
    return strLenInBytes / BYTES_PER_NAWCHAR;

  char * pFirstByteOfUntranslatedChar = NULL;
  UInt32 outputDataLen = 0;
  cnv_charset cnvCharSet = convertCharsetEnum(strCS);

  // Compute the size of the to-be-allocated output buffer, include a UCS-2 NULL terminator, for the worst case.
  const Int32 bufSizeInBytes = (BYTES_PER_NAWCHAR+1) * strLenInBytes + BYTES_PER_NAWCHAR;

  // Named workBuf (not charBuf) so the local does not hide the charBuf
  // class used elsewhere in this file.
  char * workBuf = new (workspaceHeap) char [bufSizeInBytes];
  if (workBuf EQU NULL)
    return CNV_ERR_INVALID_HEAP;

  const Int32 rtnCode =
    LocaleToUTF16 ( cnv_version1                  // in  - const enum cnv_version
                  , pStr                          // in  - const char * in_buf
                  , strLenInBytes                 // in  - const int in_len
                  , workBuf                       // out - const char * out_buf - plenty of room
                  , bufSizeInBytes                // in  - const int out_len - buffer size in bytes
                  , cnvCharSet                    // in  - const int cnv_charset
                  , pFirstByteOfUntranslatedChar  // out - char * & ptr_to_first_untranslated_char
                  , & outputDataLen               // out - unsigned int * output_data_len_p = NULL
               // , 0                             // in  - const int cnv_flags = 0
               // , (Int32)FALSE                  // in  - const int addNullAtEnd_flag = FALSE
               // , & translatedCharCount         // out - unsigned int * translated_char_cnt_p = NULL
               // ,                               // in  - unsigned int max_chars_to_convert = 0xffffffff
                  );

  // Only the size of the converted data matters; the data itself is discarded.
  const Int32 lenInNAWchars = outputDataLen / BYTES_PER_NAWCHAR;
  NADELETEBASIC(workBuf, workspaceHeap);

  if (rtnCode != 0)
    return rtnCode;      // a negative integer value

  return lenInNAWchars;  // a positive integer value
} // ComputeStrLenInNAWchars()
// -----------------------------------------------------------------------
// ComputeStrLenInUCS4chars:
//
// Returns the actual (i.e., UCS4) character count of the input string
// (in the specified character set). Characters at the end of the string
// whose first byte is '\0' are not included in the count.
//
// For CharInfo::ISO88591 input, or when strLenInBytes is 0, the byte
// length is returned unchanged. A NULL pStr with any other character set
// yields 0, matching ComputeStrLenInNAWchars().
//
// Returns CNV_ERR_INVALID_CHAR (a negative number; the error code values
// are defined in w:/common/csconvert.h) when a character cannot be
// decoded. Note that this function does not need to use a workspace heap.
// -----------------------------------------------------------------------
Int32 ComputeStrLenInUCS4chars (const char * pStr,
                                const Int32 strLenInBytes,
                                const CharInfo::CharSet cs)
{
  if (cs == CharInfo::ISO88591 || strLenInBytes == 0)
    return strLenInBytes;

  // Nothing to scan; avoid dereferencing a NULL pointer below.
  if (pStr == NULL)
    return 0;

  Int32 numberOfUCS4chars = 0;
  Int32 num_trailing_zeros = 0;
  UInt32 /*ucs4_t*/ UCS4value = 0;
  const cnv_charset cnvCharSet = convertCharsetEnum(cs);

  const char *s = pStr;
  Int32 len = strLenInBytes;

  while (len > 0)
  {
    // Decode one character; the return value is its length in bytes.
    const Int32 firstCharLenInBuf = LocaleCharToUCS4 (s, len, &UCS4value, cnvCharSet);
    if (firstCharLenInBuf <= 0)
      return CNV_ERR_INVALID_CHAR;

    numberOfUCS4chars++;

    // Track the current run of characters starting with a zero byte; any
    // other character resets the run, so at loop exit only a run that
    // reaches the end of the string remains.
    // NOTE(review): this tests the first byte rather than UCS4value ==
    // 0 -- confirm that is intended for multi-byte-unit encodings.
    if ( *s == '\0' )
      num_trailing_zeros += 1;
    else
      num_trailing_zeros = 0;

    s += firstCharLenInBuf;
    len -= firstCharLenInBuf;
  }

  return numberOfUCS4chars - num_trailing_zeros ; //NOTE: Don't count trailing zeros !
} // ComputeStrLenInUCS4chars ()