// core/sql/common/conversionLocale.cpp - trafodion - Git at Google

 /**********************************************************************
 // @@@ START COPYRIGHT @@@
 //
 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.
 //
 // @@@ END COPYRIGHT @@@
 **********************************************************************/
 /* -*-C++-*-
  *****************************************************************************
  *
  * File:         conversionLocale.cpp
  * RCS:          $Id:
  * Description:  The implementation of locale related conversion routines
  *
  *
  * Created:      7/8/98
  * Modified:     $ $Date: 1998/08/10 16:00:39 $ (GMT)
  * Language:     C++
  * Status:       $State: Exp $
  *
  *
  *
  *
  *****************************************************************************
  */

 #include "Platform.h"
 #include "ComOperators.h"
 #include "ComASSERT.h"
 #include "NLSConversion.h"
 #include "charinfo.h"
 #include "csconvert.h"
 // Map a CharInfo character-set code onto the cnv_charset value that names
 // the same character set. Any code not listed below maps to
 // cnv_UnknownCharSet.
 cnv_charset convertCharsetEnum (Int32 inset)
 {
   cnv_charset mapped = cnv_UnknownCharSet;

   switch (inset)
   {
     case CharInfo::ISO88591: mapped = cnv_ISO88591; break;
     case CharInfo::SJIS:     mapped = cnv_SJIS;     break;
     case CharInfo::UNICODE:  mapped = cnv_UTF16;    break;
     case CharInfo::EUCJP:    mapped = cnv_EUCJP;    break;
     case CharInfo::BIG5:     mapped = cnv_BIG5;     break;
     case CharInfo::GB18030:  mapped = cnv_GB18030;  break;
     case CharInfo::UTF8:     mapped = cnv_UTF8;     break;
     case CharInfo::KSC5601:  mapped = cnv_KSC;      break;
     case CharInfo::GB2312:   mapped = cnv_GB2312;   break;
     case CharInfo::GBK:      mapped = cnv_GBK;      break;
     default:                 /* keep cnv_UnknownCharSet */ break;
   }

   return mapped;
 }


 // Unicode to UTF8 conversion.
 //
 // This function takes a Unicode UCS2/UTF16 string and returns its UTF8
 // equivalent.
 //
 //   unicodeString          in     - the UCS2/UTF16 text to convert
 //   heap                   in     - heap handed to unicodeTocset(), checkSpace()
 //                                   and NADELETE
 //   utf8String             in/out - optional buffer into which the UTF8 string
 //                                   will be stored; it is passed to checkSpace(),
 //                                   which supplies the buffer that is returned
 //   addNullAtEnd           in     - when TRUE, a '\0' is stored right after the
 //                                   converted bytes
 //   allowInvalidCodePoint  in     - forwarded unchanged to unicodeTocset()
 //
 // Returns the buffer produced by checkSpace() filled with the UTF8 bytes, or
 // NULL when the translation fails (unicodeTocset() returns NULL or reports a
 // non-zero error code) or when checkSpace() cannot supply a buffer.
 //
 charBuf* unicodeToUtf8(const NAWcharBuf& unicodeString, CollHeap *heap,
                        charBuf*& utf8String, NABoolean addNullAtEnd,
                        NABoolean allowInvalidCodePoint)
 {
   charBuf* cbufPtr = NULL; // tell unicodeTocset() to allocate a new buffer
   Int32 errorcode = 0;
   charBuf* res = unicodeTocset ( unicodeString // in - const NAWcharBuf &
                                , heap          // in - CollHeap *
                                , cbufPtr
                                , (Lng32) SQLCHARSETCODE_UTF8 // in - Lng32 targetCharSet
                                , errorcode     // out - Int32 &
                                , addNullAtEnd
                                , allowInvalidCodePoint
                                );

   if ( res == NULL ) // translation failed, nothing to release
     return NULL;

   if ( errorcode != 0 ) // translation failed but a buffer came back
   {
     // The temporary buffer belongs to this routine (it is released with
     // NADELETE on every other exit path), so do not leak it here.
     NADELETE(res, charBuf, heap);
     return NULL;
   }

   const Int32 finalLengthInBytes = res->getStrLen();

   charBuf* output = checkSpace(heap, finalLengthInBytes, utf8String, addNullAtEnd);

   if ( output == NULL )
   {
     NADELETE(res, charBuf, heap);
     return NULL;
   }

   // Copy the converted bytes out of the temporary buffer into the result.
   memmove(output->data(), res->data(), finalLengthInBytes);
   output->setStrLen(finalLengthInBytes);

   if ( addNullAtEnd == TRUE )
     output->data()[finalLengthInBytes] = '\0';

   NADELETE(res, charBuf, heap);
   return output;
 }

 // Convert the wstrLen NAWchars starting at wstr into the character set
 // identified by charset, handing the caller's buffer (buf, bufLen bytes)
 // to the converter as the target.
 //
 // Returns getStrLen() of the buffer returned by the converter, or 0 when
 // charset is not handled here or the converter returns a null pointer.
 Lng32 UnicodeStringToLocale(Lng32 charset, const NAWchar* wstr, Lng32 wstrLen,
                            char* buf, Lng32 bufLen, NABoolean addNullAtEnd,
                            NABoolean allowInvalidCodePoint)
 {
    charBuf target((unsigned char*)buf, bufLen);
    charBuf* targetPtr = &target;
    charBuf* converted = 0;
    Int32 cnvError = 0;   // filled in by unicodeTocset(); not inspected here

    switch (charset)
    {
 #ifdef IS_MP /* :cnu -- As of 8/30/2011, not used in SQ SQL */
      case CharInfo::KANJI_MP:
        converted = unicodeToSjis(NAWcharBuf((NAWchar*)wstr, wstrLen), 0,
                                  targetPtr, addNullAtEnd,
                                  allowInvalidCodePoint);
        break;

      case CharInfo::KSC5601_MP:
        converted = unicodeToKsc5601(NAWcharBuf((NAWchar*)wstr, wstrLen), 0,
                                     targetPtr, addNullAtEnd,
                                     allowInvalidCodePoint);
        break;
 #endif

      // ISO88591 keeps its own dedicated converter.
      case CharInfo::ISO88591:
        converted = unicodeToISO88591(NAWcharBuf((NAWchar*)wstr, wstrLen), 0,
                                      targetPtr, addNullAtEnd,
                                      allowInvalidCodePoint);
        break;

      // Every other supported character set goes through the generic routine.
      case CharInfo::SJIS:
      case CharInfo::UTF8:
      case CharInfo::BIG5:
      case CharInfo::KSC5601:
      case CharInfo::GBK:
      case CharInfo::GB2312:
      case CharInfo::GB18030:
      case CharInfo::EUCJP:
        converted = unicodeTocset(NAWcharBuf((NAWchar*)wstr, wstrLen), 0,
                                  targetPtr, charset, cnvError, addNullAtEnd,
                                  allowInvalidCodePoint);
        break;

      default:
        break;
    }

    if (converted)
      return converted->getStrLen();

    return 0;
 }


 // Convert the strLen bytes at str, encoded in the character set named by
 // charset, into NAWchars stored in wstrBuf (room for wstrBufLen NAWchars).
 //
 // Returns the number of NAWchars produced, or 0 when the character set is
 // unknown or the conversion fails; after a failure the caller must not use
 // the contents of wstrBuf.
 Lng32
 LocaleStringToUnicode(Lng32 charset, const char* str, Lng32 strLen,
                       NAWchar* wstrBuf, Lng32 wstrBufLen, NABoolean addNullAtEnd)
 {
    // ISO88591 still goes through the old ISO88591ToUnicode() routine so that
    // its "pass through" behavior is kept and ISO 8859-15 (a.k.a. Latin-9)
    // characters in a CHARACTER SET ISO88591 target column continue to work.
    if (charset == (Lng32) CharInfo::ISO88591)
    {
      NAWcharBuf wideTarget(wstrBuf, wstrBufLen);
      NAWcharBuf* wideTargetPtr = &wideTarget;
      NAWcharBuf* converted =
        ISO88591ToUnicode(charBuf((unsigned char*)str, strLen), 0,
                          wideTargetPtr, addNullAtEnd);
      if (converted)
        return converted->getStrLen();
      return 0;
    }

    // Every other character set is handled by LocaleToUTF16().
    enum cnv_charset sourceCS = convertCharsetEnum(charset);
    if (sourceCS == cnv_UnknownCharSet)
      return 0; // nothing we can do; exit the routine

    UInt32 capacityInBytes = wstrBufLen*sizeof(NAWchar);
    char * firstUntranslated = NULL;
    UInt32 producedBytes = 0;
    UInt32 translatedChars = 0;

    Int32 status =
      LocaleToUTF16(cnv_version1,            // conversion routine version
                    str,                     // input buffer
                    strLen,                  // input length in bytes
                    (const char *)wstrBuf,   // output buffer
                    (const Int32)capacityInBytes,
                    sourceCS,                // charset derived from the charset argument
                    firstUntranslated,       // out: first untranslated char
                    &producedBytes,          // out: output data length in bytes
                    0,                       // cnv_flags
                    (const Int32)addNullAtEnd,
                    &translatedChars);       // out: translated character count

    UInt32 producedNAWchars = producedBytes/sizeof(NAWchar);

    if (status != 0)
    {
      // If the status is non-zero, LocaleToUTF16 will not add the NULL
      // terminator, so add it here when the caller asked for one.
      if (addNullAtEnd && wstrBuf && wstrBufLen > 0)
      {
        // Assume the specified wstrBufLen includes room for the NULL
        // terminator; fall back to the last slot when the output filled
        // the whole buffer.
        UInt32 terminatorPos = (producedNAWchars < (UInt32)wstrBufLen)
                                 ? producedNAWchars
                                 : (UInt32)(wstrBufLen-1);
        wstrBuf[terminatorPos] = WIDE_('\0');
      }
      return 0; // tell the caller not to use data in wstrBuf
    }

    // success: the count includes the NULL terminator if (addNullAtEnd == TRUE)
    return producedNAWchars;
 }

   // ***************************************************************
   // * Convert the string to UTF8
   // ***************************************************************
 #if 0 /* As of 8/30/2011, there are no callers in SQ SQL */
   // Convert the NUL-terminated string in source, encoded in charset, to UTF8
   // and store the NUL-terminated result in target.
   //
   //   source        in  - NUL-terminated input; its length is taken with strlen()
   //   sourceLen     in  - only used when sizing the scratch buffer
   //   target        out - receives the UTF8 text; must hold at least
   //                       sourceLen + 8 * strlen(source) + 1 bytes
   //   targetLen     in  - size of target in bytes
   //   charset       in  - cnv_charset value describing source
   //   heap          in  - not used by this routine
   //   rtnCharCount  out - optional; character count reported by the most
   //                       recent LocaleToUTF8() call
   //   errorByteOff  out - optional; distance from the start of source to the
   //                       first untranslated byte reported by the most recent
   //                       LocaleToUTF8() call
   //
   // Returns 0 on success, CNV_ERR_NOINPUT for a NULL or empty source,
   // CNV_ERR_NO_CONVERSION_NEEDED when charset is already cnv_UTF8,
   // CNV_ERR_TARGET_SIZE_INVALID when target is missing or too small,
   // CNV_ERR_BUFFER_OVERRUN when the retry limit is reached, and
   // CNV_ERR_INVALID_CHAR / CNV_ERR_INVALID_CS / CNV_ERR_NOINPUT as reported by
   // LocaleToUTF8() (any other status is mapped to CNV_ERR_INVALID_CHAR).
   Int32 localeConvertToUTF8(char* source,
                     Lng32 sourceLen,
                     char* target,
                     Lng32 targetLen,
                     Lng32 charset,
                     CollHeap * heap,
                     Int32  *rtnCharCount,
                     Int32  *errorByteOff)
   {
      char * buffer = NULL;
      Lng32 bufferLen = 0;
      Int32 retCode = 1;   // non-zero so that the conversion loop is entered
      char *OrigSource = source;

      // If the input string is invalid, simply return
      if (source == NULL ||
          strlen(source) == 0)
      {
        return CNV_ERR_NOINPUT;
      }

      // If the input charset is UTF8, simply return.
      if (charset == cnv_UTF8)
      {
         return CNV_ERR_NO_CONVERSION_NEEDED;
      }

      // The resulting string will be bigger by a given factor.
      Int32 multiplier = 8;  // Includes future charset multiplier sizes
      bufferLen = sourceLen + ((Lng32)strlen(source) * multiplier) + 1;

      // Check that the target buffer was provided and its length
      // will be able to hold the converted characters
      if (target == NULL ||
          targetLen < bufferLen)
      {
        return CNV_ERR_TARGET_SIZE_INVALID;
      }

      target[0] = '\0';

      // One byte more than the converter is told about: that byte always
      // stays '\0', so the strlen()/strcat() calls below stay inside the
      // allocation even if a failed conversion leaves no terminator.
      buffer = new char [bufferLen + 1];

      char * pinstr = source; // Pointer to the input buffer.
      char * p1stUnstranslatedChar = NULL;
      UInt32  utf8StrLenInBytes = 0;
      UInt32  charCount = 0;  // number of characters translated/converted
      Lng32 inStrLen = (Lng32)strlen(source);
      bool done = false;

      // Keep converting while the converter reports a recoverable condition.
      // The error codes must be combined with && here: joined with || the
      // test is always true and fatal errors were retried until the counter
      // ran out.
      for (Lng32 loopCounter = 0;
           !done &&
           retCode != 0 &&
           retCode != CNV_ERR_INVALID_CHAR &&
           retCode != CNV_ERR_INVALID_CS &&
           retCode != CNV_ERR_NOINPUT &&
           loopCounter < 16;  // avoid infinite loop
           loopCounter++)
      {
        // Clear the whole scratch area (not just sizeof(char*) bytes) so that
        // no bytes from a previous pass are mistaken for new output.
        memset(buffer, '\0', bufferLen + 1);

        retCode = LocaleToUTF8(cnv_version1,
                               pinstr,
                               inStrLen,
                               (const char*)buffer,
                               bufferLen,
                               (cnv_charset)charset,
                               (char* &)p1stUnstranslatedChar,
                               &utf8StrLenInBytes,
                               (const Int32)1, // addNullAtEnd_flag == TRUE
                               &charCount
                               );
        if (rtnCharCount) *rtnCharCount = (Int32)charCount;
        if (errorByteOff) *errorByteOff = (Int32)(p1stUnstranslatedChar - OrigSource);

        switch(retCode)
        {
          case 0:
            if (strlen(target) + strlen(buffer) >= (size_t)targetLen) // avoid overflow
            {
              retCode = CNV_ERR_TARGET_SIZE_INVALID;
              // no need to be fancy - this condition is not supposed to happen anyway
              // just chop off the extra bytes - last character in target may be chopped
              // right in the middle
              size_t used = strlen(target);
              if ((size_t)targetLen > used)
              {
                buffer[targetLen - used - 1] = '\0';
                strcat(target, buffer);
              }
            }
            else // have enough room
            {
              strcat(target, buffer);
            }
            done = true;
            break;

          case CNV_ERR_BUFFER_OVERRUN:
            // Need room for the converted bytes, the '?' and the terminator.
            if (strlen(target) + strlen(buffer) + 1 < (size_t)targetLen)
            {
              // target is always NUL-terminated, so strcat() is safe here;
              // strncpy() with the source length would leave it unterminated.
              strcat(target, buffer);
              strcat(target, "?");      // Force a question mark as output

              if (p1stUnstranslatedChar == NULL)
              {
                // No restart position was reported; nothing more can be done.
                done = true;
              }
              else
              {
                // We're going again. The remaining length has to be computed
                // before the input pointer is moved, otherwise the difference
                // below is always zero.
                inStrLen = (Lng32)(inStrLen - (p1stUnstranslatedChar - pinstr));
                pinstr = p1stUnstranslatedChar;
                p1stUnstranslatedChar = NULL;
                // intentionally keep the retCode == CNV_ERR_BUFFER_OVERRUN setting
                // just in case we exceed the loop count limit
              }
            }
            else // avoid overflow
            {
              retCode = CNV_ERR_TARGET_SIZE_INVALID;
              // just chop off the extra bytes - last character in target may be
              // chopped right in the middle
              size_t used = strlen(target);
              if ((size_t)targetLen > used)
              {
                buffer[targetLen - used - 1] = '\0';
                strcat(target, buffer);
              }
              done = true;
            }
            break;

          case CNV_ERR_INVALID_CHAR:
          case CNV_ERR_INVALID_CS:
          case CNV_ERR_NOINPUT:
            // Fatal for this routine; retCode is returned as is.
            break;

          default:
            retCode = CNV_ERR_INVALID_CHAR;  // Bad character set conversion
            break;
        }
      }

      delete[] buffer;
      buffer = NULL;

      return retCode;
   }
 #endif /* As of 8/30/2011, no callers in SQ SQL */

   // ***************************************************************
   // * Encode the string from a UTF8 multibyte string to the
   // * designated charset
   // ***************************************************************

 #if 0 /* As of 8/30/2011, there are no callers in SQ SQL */
   // Encode the NUL-terminated UTF8 string in source into the character set
   // given by charset and store the NUL-terminated result in target.
   //
   //   source        in  - NUL-terminated UTF8 input; its length is taken with
   //                       strlen()
   //   sourceLen     in  - not used by this routine
   //   target        out - receives the converted text; must hold at least
   //                       strlen(source) + 1 bytes
   //   targetLen     in  - size of target in bytes
   //   charset       in  - cnv_charset value of the wanted output
   //   heap          in  - not used by this routine
   //   charCount     out - optional; character count reported by the most
   //                       recent UTF8ToLocale() call
   //   errorByteOff  out - optional; distance from the start of source to the
   //                       first unconverted byte reported by the most recent
   //                       UTF8ToLocale() call
   //
   // Returns 0 on success, CNV_ERR_NOINPUT for a NULL or empty source,
   // CNV_ERR_NO_CONVERSION_NEEDED when charset is cnv_UTF8,
   // CNV_ERR_TARGET_SIZE_INVALID when target is missing or too small,
   // CNV_ERR_BUFFER_OVERRUN when the retry limit is reached, and
   // CNV_ERR_INVALID_CHAR / CNV_ERR_INVALID_CS / CNV_ERR_NOINPUT as reported by
   // UTF8ToLocale() (any other status is mapped to CNV_ERR_INVALID_CHAR).
   Int32 UTF8ConvertToLocale(char* source,
                           Lng32 sourceLen,
                           char* target,
                           Lng32 targetLen,
                           Lng32 charset,
                           CollHeap *heap,
                           Int32  *charCount,
                           Int32  *errorByteOff)
   {
     Int32 retCode = 1;   // non-zero so that the conversion loop is entered
     char * OrigSource = source;

     // If the input string is invalid, simply return
     if (source == NULL ||
         strlen(source) == 0)
       {
          return CNV_ERR_NOINPUT;
       }

     // If the ISO_MAPPING is UTF8, simply return the string since it is already UTF8.
     if (charset == cnv_UTF8)
       {
         return CNV_ERR_NO_CONVERSION_NEEDED;
       }

     Lng32 bufferLen = (Lng32)strlen(source) + 1;
     char * buffer = NULL;

     // Check that the target buffer was provided and its length
     // will be able to hold the converted characters
     if (target == NULL ||
         targetLen < bufferLen)
       {
         return CNV_ERR_TARGET_SIZE_INVALID;
       }

     target[0] = '\0';

     // One byte more than the converter is told about: that byte always
     // stays '\0', so the strlen()/strcat() calls below stay inside the
     // allocation even if a failed conversion leaves no terminator.
     buffer = new char [bufferLen + 1];

     char* punstr = 0;           // Pointer to the first unconverted character
                                 // (either due to small buffer size or
                                 // conversion error).

     UInt32 outLen = 0;    // output data length in number of bytes
     UInt32 numTran = 0;   // number of characters translated
     bool done = false;

     // Keep converting while the converter reports a recoverable condition.
     // The error codes must be combined with && here: joined with || the
     // test is always true and fatal errors were retried until the counter
     // ran out.
     for (Lng32 loopCounter = 0;
          !done &&
          retCode != 0 &&
          retCode != CNV_ERR_INVALID_CHAR &&
          retCode != CNV_ERR_INVALID_CS &&
          retCode != CNV_ERR_NOINPUT &&
          loopCounter < 16;      // Avoid infinite loop
          loopCounter++)
       {
         // Clear the whole scratch area (not just sizeof(char*) bytes) so that
         // no bytes from a previous pass are mistaken for new output.
         memset(buffer, '\0', bufferLen + 1);

         retCode = UTF8ToLocale(cnv_version1,
                                (const char*)source,
                                (Lng32)strlen(source),
                                (const char*)buffer,
                                bufferLen,
                                (cnv_charset)charset,
                                (char* &)punstr,
                                &outLen,
                                1, // addNullAtEnd_flag == TRUE
                                0, // allow_invalids == FALSE
                                &numTran,
                                0);
         if (charCount) *charCount = (Int32)numTran;
         if (errorByteOff) *errorByteOff = (Int32)(punstr - OrigSource);

         switch(retCode)
         {
           case 0:
             if (strlen(target) + strlen(buffer) >= (size_t)targetLen) // avoid overflow
             {
               retCode = CNV_ERR_TARGET_SIZE_INVALID;
               // no need to be fancy - this condition is not supposed to happen anyway
               // just chop off the extra bytes - last character in target may be chopped
               // right in the middle
               size_t used = strlen(target);
               if ((size_t)targetLen > used)
               {
                 buffer[targetLen - used - 1] = '\0';
                 strcat(target, buffer);
               }
             }
             else
             {
               strcat(target, buffer);
             }
             done = true;
             break;

           case CNV_ERR_BUFFER_OVERRUN:
             if (strlen(target) + strlen(buffer) < (size_t)targetLen)
             {
               // target is always NUL-terminated, so strcat() is safe here;
               // strncpy() with the source length would leave it unterminated.
               strcat(target, buffer);

               if (punstr == NULL)
               {
                 // No restart position was reported; nothing more can be done.
                 done = true;
               }
               else
               {
                 source = punstr; // We're going again, adjust the pointer to the input buffer.
                 // intentionally keep the retCode == CNV_ERR_BUFFER_OVERRUN setting
                 // just in case we exceed the loop count limit
               }
             }
             else // avoid overflow
             {
               retCode = CNV_ERR_TARGET_SIZE_INVALID;
               // just chop off the extra bytes - last character in target may be
               // chopped right in the middle
               size_t used = strlen(target);
               if ((size_t)targetLen > used)
               {
                 buffer[targetLen - used - 1] = '\0';
                 strcat(target, buffer);
               }
               done = true;
             }
             break;

           case CNV_ERR_INVALID_CHAR:
           case CNV_ERR_INVALID_CS:
           case CNV_ERR_NOINPUT:
             // Fatal for this routine; retCode is returned as is.
             break;

           default:
             retCode = CNV_ERR_INVALID_CHAR;  // Bad character set conversion
             break;
         }
       }

     delete[] buffer;
     buffer = NULL;

     return retCode;
   }
 #endif /* As of 8/30/2011, no callers in SQ SQL */

 // -----------------------------------------------------------------------
 // ComputeWidthInBytesOfMbsForDisplay:
 //
 // Returns the display width (in bytes) that is the closest to the
 // specified maximum display width (in bytes) without chopping the
 // rightmost multi-byte characters into two parts so that we do not
 // encounter the situation where the first part of the multi-byte
 // character is in the current display line and the other part of
 // the character is in the next display line.
 //
 // If an error is encountered, returns the error code (a negative number)
 // defined in w:/common/csconvert.h.
 //
 //
 // In parameter pv_eCharSet contains the character set attribute
 // of the input string passed in via the parameter pp_cMultiByteStr.
 //
 // The out parameter pr_iNumOfTranslatedChars contains the number of
 // the actual (i.e., UCS4) characters translated.
 //
 // The out parameter pr_iNumOfNAWchars contains the number of UCS2
 // characters (NAwchar[acters]) instead of the number of the actual
 // (i.e., UCS4) characters.
 //
 // Note that classes NAMemory and CollHeap are the same except for
 // the names.
 // -----------------------------------------------------------------------

 // -----------------------------------------------------------------------
 // ComputeStrLenInNAWchars:
 //
 // Returns the length of the input string (in the specified character set)
 // in number of NAWchar(acters) - note that a UTF16 character that is a
 // surrogate pair counts as 2 NAWchar(acters).
 //
 // A NULL or empty input yields 0. UCS2 input is not converted; its byte
 // length is simply divided by BYTES_PER_NAWCHAR. Every other character set
 // is converted with LocaleToUTF16() into a scratch buffer allocated from
 // workspaceHeap and the produced byte count is divided by BYTES_PER_NAWCHAR.
 //
 // If the conversion fails, the non-zero status of LocaleToUTF16() is
 // returned; CNV_ERR_INVALID_HEAP is returned when the scratch buffer cannot
 // be allocated. The error code values are defined in w:/common/csconvert.h.
 // -----------------------------------------------------------------------
 Int32 ComputeStrLenInNAWchars (const char * pStr,
                                const Int32 strLenInBytes,
                                const CharInfo::CharSet strCS,
                                NAMemory *workspaceHeap) // in - default is NULL (the C++ runtime heap)
 {
   // Nothing to measure.
   if (pStr == NULL || strLenInBytes == 0)
     return 0;

   // Already made of NAWchars: no conversion required.
   if (strCS == CharInfo::UCS2)
     return strLenInBytes / BYTES_PER_NAWCHAR;

   const cnv_charset sourceCS = convertCharsetEnum(strCS);

   // Worst-case size of the scratch output buffer, including room for a
   // UCS-2 NULL terminator.
   const Int32 scratchSizeInBytes =
     (BYTES_PER_NAWCHAR+1) * strLenInBytes + BYTES_PER_NAWCHAR;
   char * scratch = new (workspaceHeap) char [scratchSizeInBytes];

   if (scratch EQU NULL)
     return CNV_ERR_INVALID_HEAP;

   char * firstUntranslated = NULL;
   UInt32 producedBytes = 0;

   // The trailing optional arguments of LocaleToUTF16() (cnv_flags,
   // addNullAtEnd_flag, translated_char_cnt_p, max_chars_to_convert) are
   // left at their defaults.
   const Int32 status =
     LocaleToUTF16 ( cnv_version1        // in  - conversion routine version
                   , pStr                // in  - input buffer
                   , strLenInBytes       // in  - input length in bytes
                   , scratch             // out - output buffer, plenty of room
                   , scratchSizeInBytes  // in  - output buffer size in bytes
                   , sourceCS            // in  - charset of the input
                   , firstUntranslated   // out - first untranslated char
                   , & producedBytes     // out - output data length in bytes
                   );

   const Int32 producedNAWchars = producedBytes / BYTES_PER_NAWCHAR;
   NADELETEBASIC(scratch, workspaceHeap);

   if (rtnCode_isFailure_placeholder_unused_guard_never_defined_false_positive_avoidance_0 == 0) {}
   return (status == 0) ? producedNAWchars  // a positive integer value
                        : status;           // a negative integer value
 } // ComputeStrLenInNAWchars()

 // -----------------------------------------------------------------------
 // ComputeStrLenInUCS4chars:
 //
 // Returns the number of actual (i.e., UCS4) characters in the input string,
 // which is encoded in the specified character set. Characters whose first
 // byte is '\0' and that sit at the very end of the string are not counted.
 //
 // For ISO88591 input, or when strLenInBytes is 0, strLenInBytes itself is
 // returned without looking at the string. A NULL pStr yields 0, as in
 // ComputeStrLenInNAWchars().
 //
 // Returns CNV_ERR_INVALID_CHAR if LocaleCharToUCS4() cannot decode a
 // character. The error code values are defined in w:/common/csconvert.h.
 // Note that this function does not need to use a workspace heap.
 // -----------------------------------------------------------------------
 Int32 ComputeStrLenInUCS4chars (const char * pStr,
                                 const Int32 strLenInBytes,
                                 const CharInfo::CharSet cs)
 {
   if (cs == CharInfo::ISO88591 || strLenInBytes == 0)
     return strLenInBytes;

   // The loop below dereferences the string, so a missing buffer has to be
   // rejected first.
   if (pStr == NULL)
     return 0;

   Int32 numberOfUCS4chars = 0;
   Int32 firstCharLenInBuf = 0;
   UInt32 /*ucs4_t*/ UCS4value;
   cnv_charset cnvCharSet = convertCharsetEnum(cs);
   const char *s = pStr;
   Int32 num_trailing_zeros = 0;   // length of the current run of '\0' characters
   Int32 len = (Int32)strLenInBytes;

   while (len > 0)
   {
     // Decode one character; the return value is its length in bytes.
     firstCharLenInBuf = LocaleCharToUCS4 (s, len, &UCS4value, cnvCharSet);

     if (firstCharLenInBuf <= 0)
       return CNV_ERR_INVALID_CHAR;

     numberOfUCS4chars++;

     // Track how many consecutive '\0' characters end the part scanned so
     // far; any other character resets the run.
     if ( *s == '\0' )
        num_trailing_zeros += 1;
     else
        num_trailing_zeros = 0;

     s += firstCharLenInBuf;
     len -= firstCharLenInBuf;
   }

   return numberOfUCS4chars - num_trailing_zeros ; //NOTE: Don't count trailing zeros !

 } // ComputeStrLenInUCS4chars ()
	/**********************************************************************
	// @@@ START COPYRIGHT @@@
	//
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.
	//
	// @@@ END COPYRIGHT @@@
	**********************************************************************/
	/* --C++--
	*****************************************************************************
	*
	* File: conversionLocale.h
	* RCS: $Id:
	* Description: The implementation of locale related conversion routins
	*
	*
	* Created: 7/8/98
	* Modified: $ $Date: 1998/08/10 16:00:39 $ (GMT)
	* Language: C++
	* Status: $State: Exp $
	*
	*
	*
	*
	*****************************************************************************
	*/

	#include "Platform.h"
	#include "ComOperators.h"
	#include "ComASSERT.h"
	#include "NLSConversion.h"
	#include "charinfo.h"
	#include "csconvert.h"
	// Map a CharInfo character-set code onto the cnv_charset value that names
	// the same character set. Any code not listed below maps to
	// cnv_UnknownCharSet.
	cnv_charset convertCharsetEnum (Int32 inset)
	{
	  cnv_charset mapped = cnv_UnknownCharSet;

	  switch (inset)
	  {
	    case CharInfo::ISO88591: mapped = cnv_ISO88591; break;
	    case CharInfo::SJIS:     mapped = cnv_SJIS;     break;
	    case CharInfo::UNICODE:  mapped = cnv_UTF16;    break;
	    case CharInfo::EUCJP:    mapped = cnv_EUCJP;    break;
	    case CharInfo::BIG5:     mapped = cnv_BIG5;     break;
	    case CharInfo::GB18030:  mapped = cnv_GB18030;  break;
	    case CharInfo::UTF8:     mapped = cnv_UTF8;     break;
	    case CharInfo::KSC5601:  mapped = cnv_KSC;      break;
	    case CharInfo::GB2312:   mapped = cnv_GB2312;   break;
	    case CharInfo::GBK:      mapped = cnv_GBK;      break;
	    default:                 /* keep cnv_UnknownCharSet */ break;
	  }

	  return mapped;
	}


	// Unicode to UTF8 conversion.
	//
	// This function takes a Unicode UCS2/UTF16 string and returns its UTF8
	// equivalent.
	//
	//   unicodeString          in     - the UCS2/UTF16 text to convert
	//   heap                   in     - heap handed to unicodeTocset(), checkSpace()
	//                                   and NADELETE
	//   utf8String             in/out - optional buffer into which the UTF8 string
	//                                   will be stored; it is passed to checkSpace(),
	//                                   which supplies the buffer that is returned
	//   addNullAtEnd           in     - when TRUE, a '\0' is stored right after the
	//                                   converted bytes
	//   allowInvalidCodePoint  in     - forwarded unchanged to unicodeTocset()
	//
	// Returns the buffer produced by checkSpace() filled with the UTF8 bytes, or
	// NULL when the translation fails (unicodeTocset() returns NULL or reports a
	// non-zero error code) or when checkSpace() cannot supply a buffer.
	//
	charBuf* unicodeToUtf8(const NAWcharBuf& unicodeString, CollHeap *heap,
	                       charBuf*& utf8String, NABoolean addNullAtEnd,
	                       NABoolean allowInvalidCodePoint)
	{
	  charBuf* cbufPtr = NULL; // tell unicodeTocset() to allocate a new buffer
	  Int32 errorcode = 0;
	  charBuf* res = unicodeTocset ( unicodeString // in - const NAWcharBuf &
	                               , heap          // in - CollHeap *
	                               , cbufPtr
	                               , (Lng32) SQLCHARSETCODE_UTF8 // in - Lng32 targetCharSet
	                               , errorcode     // out - Int32 &
	                               , addNullAtEnd
	                               , allowInvalidCodePoint
	                               );

	  if ( res == NULL ) // translation failed, nothing to release
	    return NULL;

	  if ( errorcode != 0 ) // translation failed but a buffer came back
	  {
	    // The temporary buffer belongs to this routine (it is released with
	    // NADELETE on every other exit path), so do not leak it here.
	    NADELETE(res, charBuf, heap);
	    return NULL;
	  }

	  const Int32 finalLengthInBytes = res->getStrLen();

	  charBuf* output = checkSpace(heap, finalLengthInBytes, utf8String, addNullAtEnd);

	  if ( output == NULL )
	  {
	    NADELETE(res, charBuf, heap);
	    return NULL;
	  }

	  // Copy the converted bytes out of the temporary buffer into the result.
	  memmove(output->data(), res->data(), finalLengthInBytes);
	  output->setStrLen(finalLengthInBytes);

	  if ( addNullAtEnd == TRUE )
	    output->data()[finalLengthInBytes] = '\0';

	  NADELETE(res, charBuf, heap);
	  return output;
	}

	// Convert the wstrLen NAWchars starting at wstr into the character set
	// identified by charset, handing the caller's buffer (buf, bufLen bytes)
	// to the converter as the target.
	//
	// Returns getStrLen() of the buffer returned by the converter, or 0 when
	// charset is not handled here or the converter returns a null pointer.
	Lng32 UnicodeStringToLocale(Lng32 charset, const NAWchar* wstr, Lng32 wstrLen,
	                            char* buf, Lng32 bufLen, NABoolean addNullAtEnd,
	                            NABoolean allowInvalidCodePoint)
	{
	  charBuf target((unsigned char*)buf, bufLen);
	  charBuf* targetPtr = &target;
	  charBuf* converted = 0;
	  Int32 cnvError = 0;   // filled in by unicodeTocset(); not inspected here

	  switch (charset)
	  {
	#ifdef IS_MP /* :cnu -- As of 8/30/2011, not used in SQ SQL */
	    case CharInfo::KANJI_MP:
	      converted = unicodeToSjis(NAWcharBuf((NAWchar*)wstr, wstrLen), 0,
	                                targetPtr, addNullAtEnd,
	                                allowInvalidCodePoint);
	      break;

	    case CharInfo::KSC5601_MP:
	      converted = unicodeToKsc5601(NAWcharBuf((NAWchar*)wstr, wstrLen), 0,
	                                   targetPtr, addNullAtEnd,
	                                   allowInvalidCodePoint);
	      break;
	#endif

	    // ISO88591 keeps its own dedicated converter.
	    case CharInfo::ISO88591:
	      converted = unicodeToISO88591(NAWcharBuf((NAWchar*)wstr, wstrLen), 0,
	                                    targetPtr, addNullAtEnd,
	                                    allowInvalidCodePoint);
	      break;

	    // Every other supported character set goes through the generic routine.
	    case CharInfo::SJIS:
	    case CharInfo::UTF8:
	    case CharInfo::BIG5:
	    case CharInfo::KSC5601:
	    case CharInfo::GBK:
	    case CharInfo::GB2312:
	    case CharInfo::GB18030:
	    case CharInfo::EUCJP:
	      converted = unicodeTocset(NAWcharBuf((NAWchar*)wstr, wstrLen), 0,
	                                targetPtr, charset, cnvError, addNullAtEnd,
	                                allowInvalidCodePoint);
	      break;

	    default:
	      break;
	  }

	  if (converted)
	    return converted->getStrLen();

	  return 0;
	}


	// Convert the strLen bytes at str, encoded in the character set named by
	// charset, into NAWchars stored in wstrBuf (room for wstrBufLen NAWchars).
	//
	// Returns the number of NAWchars produced, or 0 when the character set is
	// unknown or the conversion fails; after a failure the caller must not use
	// the contents of wstrBuf.
	Lng32
	LocaleStringToUnicode(Lng32 charset, const char* str, Lng32 strLen,
	                      NAWchar* wstrBuf, Lng32 wstrBufLen, NABoolean addNullAtEnd)
	{
	  // Changed the algorithm to call the new LocaleToUTF16() but keep
	  // the old call to old ISO88591ToUnicode() when the character set is
	  // ISO88591. We want to keep the old "pass through" behavior so that
	  // use of ISO 8859-15 characters (a.k.a., Latin-9) in a
	  // CHARACTER SET ISO88591 target column continues to work.
	  if (charset == (Lng32) CharInfo::ISO88591)
	  {
	    NAWcharBuf wcbuf(wstrBuf, wstrBufLen);
	    NAWcharBuf* wcbufPtr = &wcbuf;
	    NAWcharBuf* res = ISO88591ToUnicode(
	                        charBuf((unsigned char*)str, strLen), 0,
	                        wcbufPtr, addNullAtEnd
	                      );
	    return (res) ? res->getStrLen() : 0;
	  }

	  //
	  // else (charset != (Lng32) CharInfo::ISO88591)
	  //

	  enum cnv_charset convCS = convertCharsetEnum(charset);
	  if (convCS == cnv_UnknownCharSet)
	    return 0; // nothing we can do; exit the routine

	  UInt32 outBufSizeInBytes = wstrBufLen*sizeof(NAWchar);
	  char * pFirstUntranslatedChar = NULL;
	  UInt32 outputDataLenInBytes = 0;
	  UInt32 translatedtCharCount = 0;
	  Int32 convStatus =
	    LocaleToUTF16(cnv_version1,           // const enum cnv_version version
	                  str,                    // const char *in_bufr
	                  strLen,                 // const int in_len in # of bytes
	                  (const char *)wstrBuf,  // const char *out_bufr (pointer cast, not a char cast)
	                  (const Int32)outBufSizeInBytes,
	                  convCS,                 // enum cnv_charset charset
	                  pFirstUntranslatedChar, // char * & first_untranslated_char
	                  &outputDataLenInBytes,  // unsigned int *output_data_len_p
	                  0,                      // const int cnv_flags (default is 0)
	                  (const Int32)addNullAtEnd,
	                  &translatedtCharCount); // unsigned int *translated_char_cnt_p

	  UInt32 outLenInW = outputDataLenInBytes/sizeof(NAWchar);
	  if (convStatus == 0) // success
	    return outLenInW;  // include the NULL terminator if (addNullAtEnd == TRUE)

	  // If convStatus != 0, LocaleToUTF16 will not add the NULL terminator
	  if (addNullAtEnd && wstrBuf && wstrBufLen > 0)
	  {
	    if (outLenInW < (UInt32)wstrBufLen)
	      wstrBuf[outLenInW] = WIDE_('\0');
	    else
	    {
	      // assume the specified wstrBufLen includes room for the NULL terminator
	      // when the passed-in addNullAtEnd parameter is set to TRUE
	      wstrBuf[wstrBufLen-1] = WIDE_('\0');
	    }
	  }
	  return 0; // tell the caller not to use data in wstrBuf
	}

	// ***************************************************************
	// * Convert the string to UTF8
	// ***************************************************************
	#if 0 /* As of 8/30/2011, there are no callers in SQ SQL */
	// Converts the NUL-terminated string source, encoded in charset, to UTF8
	// and leaves the NUL-terminated result in target.
	//
	//   source       - in  - NUL-terminated input string
	//   sourceLen    - in  - only used when sizing the work buffer
	//   target       - out - receives the converted string
	//   targetLen    - in  - size of target in bytes; must be at least the
	//                        work buffer size (sourceLen + 8*strlen(source) + 1)
	//   charset      - in  - cnv_charset value of the input string
	//   heap         - in  - not used by this routine
	//   rtnCharCount - out - (optional) character count reported by the last
	//                        LocaleToUTF8() call
	//   errorByteOff - out - (optional) offset, from the start of source, of
	//                        the first untranslated character reported by the
	//                        last LocaleToUTF8() call
	//
	// Returns 0 on success; otherwise CNV_ERR_NOINPUT,
	// CNV_ERR_NO_CONVERSION_NEEDED, CNV_ERR_TARGET_SIZE_INVALID,
	// CNV_ERR_INVALID_CHAR, CNV_ERR_INVALID_CS, or CNV_ERR_BUFFER_OVERRUN when
	// the retry limit is exhausted.
	Int32 localeConvertToUTF8(char* source,
	                          Lng32 sourceLen,
	                          char* target,
	                          Lng32 targetLen,
	                          Lng32 charset,
	                          CollHeap * heap,
	                          Int32 *rtnCharCount,
	                          Int32 *errorByteOff)
	{
	  Int32 retCode = 1;
	  char *OrigSource = source;

	  // If the input string is invalid, simply return
	  if (source == NULL || strlen(source) == 0)
	    return CNV_ERR_NOINPUT;

	  // If the input charset is UTF8, simply return.
	  if (charset == cnv_UTF8)
	    return CNV_ERR_NO_CONVERSION_NEEDED;

	  // The resulting string will be bigger by a given factor.
	  Int32 multiplier = 8; // Includes future charset multiplier sizes
	  Lng32 bufferLen = sourceLen + ((Lng32)strlen(source) * multiplier) + 1;

	  // Check that the target buffer was provided and its length
	  // will be able to hold the converted characters
	  if (target == NULL || targetLen < bufferLen)
	    return CNV_ERR_TARGET_SIZE_INVALID;

	  target[0] = '\0';

	  char * buffer = new char [bufferLen];
	  // Clear the whole work buffer; sizeof(buffer) would only be the size of
	  // the pointer.
	  memset(buffer, '\0', bufferLen);

	  char * pinstr = source;               // Pointer to the input buffer.
	  char * p1stUnstranslatedChar = NULL;
	  UInt32 utf8StrLenInBytes = 0;         // 64-bit
	  UInt32 charCount = 0;                 // number of characters translated/converted
	  Lng32 inStrLen = (Lng32)strlen(source);

	  // Keep converting while the last call reported a recoverable condition.
	  // The three fatal codes must each stop the loop, hence "&&".
	  for (Lng32 loopCounter = 0;
	       retCode != 0 &&
	       retCode != CNV_ERR_INVALID_CHAR &&
	       retCode != CNV_ERR_INVALID_CS &&
	       retCode != CNV_ERR_NOINPUT &&
	       loopCounter < 16; // avoid infinite loop
	       loopCounter++)
	  {
	    retCode = LocaleToUTF8(cnv_version1,
	                           pinstr,
	                           inStrLen,
	                           (const char*)buffer,
	                           bufferLen,
	                           (cnv_charset)charset,
	                           (char* &)p1stUnstranslatedChar,
	                           &utf8StrLenInBytes, // 64-bit
	                           (const Int32)1,     // addNullAtEnd_flag == TRUE
	                           &charCount          // 64-bit
	                           );
	    if (rtnCharCount) *rtnCharCount = (Int32)charCount;
	    if (errorByteOff) *errorByteOff = (Int32)(p1stUnstranslatedChar - OrigSource);

	    switch(retCode)
	    {
	      case 0:
	        if (strlen(target) + strlen(buffer) >= (size_t)targetLen) // avoid overflow
	        {
	          retCode = CNV_ERR_TARGET_SIZE_INVALID;
	          // no need to be fancy - this condition is not supposed to happen anyway
	          // just chop off the extra bytes - last character in target may be chopped
	          // right in the middle
	          if ((size_t)targetLen > strlen(target))
	          {
	            buffer[targetLen - strlen(target) - 1] = '\0';
	            strcat(target, buffer);
	          }
	        }
	        else // have enough room
	        {
	          // We're assuming that the work buffer was null terminated.
	          strcat(target, buffer);
	        }
	        loopCounter = 8888; // exit loop
	        break;

	      case CNV_ERR_BUFFER_OVERRUN:
	        // NOTE(review): this path relies on buffer being NUL-terminated after
	        // an overrun - confirm against LocaleToUTF8().
	        // Room is needed for the converted bytes, the forced '?' and the NUL.
	        if (strlen(target) + strlen(buffer) + 1 < (size_t)targetLen)
	        {
	          // target is always NUL-terminated here, so strcat covers both the
	          // empty and the non-empty case.
	          strcat(target, buffer);
	          strcat(target, "?"); // Force a question mark as output

	          // We're going again: shrink the remaining length BEFORE moving
	          // the input pointer, otherwise the difference is always zero.
	          inStrLen = (Lng32)(inStrLen - (p1stUnstranslatedChar - pinstr));
	          pinstr = p1stUnstranslatedChar;
	          p1stUnstranslatedChar = NULL;
	          // intentionally keep the retCode == CNV_ERR_BUFFER_OVERRUN setting
	          // just in case we exceed the loop count limit
	        }
	        else // avoid overflow
	        {
	          retCode = CNV_ERR_TARGET_SIZE_INVALID;
	          // no need to be fancy - this condition is not supposed to happen anyway
	          // just chop off the extra bytes - last character in target may be chopped
	          // right in the middle
	          if ((size_t)targetLen > strlen(target))
	          {
	            buffer[targetLen - strlen(target) - 1] = '\0';
	            strcat(target, buffer);
	          }
	          loopCounter = 8888; // exit loop
	        }
	        break;

	      case CNV_ERR_INVALID_CHAR:
	      case CNV_ERR_INVALID_CS:
	      case CNV_ERR_NOINPUT:
	        // retCode already holds the error; the loop condition ends the loop
	        break;

	      default:
	        retCode = CNV_ERR_INVALID_CHAR; // Bad character set conversion
	        break;
	    }
	  }

	  delete[] buffer;
	  buffer = NULL;

	  return retCode;
	}
	#endif /* As of 8/30/2011, no callers in SQ SQL */

	// ***************************************************************
	// * Encode the string from a UTF8 multibyte string to the
	// * designated charset
	// ***************************************************************

	#if 0 /* As of 8/30/2011, there are no callers in SQ SQL */
	// Converts the NUL-terminated UTF8 string source to the character set
	// charset and leaves the NUL-terminated result in target.
	//
	//   source       - in  - NUL-terminated UTF8 input string
	//   sourceLen    - in  - not used by this routine
	//   target       - out - receives the converted string
	//   targetLen    - in  - size of target in bytes; must be at least
	//                        strlen(source) + 1
	//   charset      - in  - cnv_charset value of the desired output
	//   heap         - in  - not used by this routine
	//   charCount    - out - (optional) character count reported by the last
	//                        UTF8ToLocale() call
	//   errorByteOff - out - (optional) offset, from the start of source, of
	//                        the first unconverted character reported by the
	//                        last UTF8ToLocale() call
	//
	// Returns 0 on success; otherwise CNV_ERR_NOINPUT,
	// CNV_ERR_NO_CONVERSION_NEEDED, CNV_ERR_TARGET_SIZE_INVALID,
	// CNV_ERR_INVALID_CHAR, CNV_ERR_INVALID_CS, or CNV_ERR_BUFFER_OVERRUN when
	// the retry limit is exhausted.
	Int32 UTF8ConvertToLocale(char* source,
	                          Lng32 sourceLen,
	                          char* target,
	                          Lng32 targetLen,
	                          Lng32 charset,
	                          CollHeap *heap,
	                          Int32 *charCount,
	                          Int32 *errorByteOff)
	{
	  Int32 retCode = 1;
	  char * OrigSource = source;

	  // If the input string is invalid, simply return
	  if (source == NULL || strlen(source) == 0)
	    return CNV_ERR_NOINPUT;

	  // If the ISO_MAPPING is UTF8, simply return the string since it is already UTF8.
	  if (charset == cnv_UTF8)
	    return CNV_ERR_NO_CONVERSION_NEEDED;

	  Lng32 bufferLen = (Lng32)strlen(source) + 1;

	  // Check that the target buffer was provided and its length
	  // will be able to hold the converted characters
	  if (target == NULL || targetLen < bufferLen)
	    return CNV_ERR_TARGET_SIZE_INVALID;

	  target[0] = '\0';

	  char * buffer = new char [bufferLen];
	  // Clear the whole work buffer; sizeof(buffer) would only be the size of
	  // the pointer.
	  memset(buffer, '\0', bufferLen);

	  char* punstr = 0;   // Pointer to the first unconverted character
	                      // (either due to small buffer size or
	                      // conversion error).

	  UInt32 outLen = 0;  // output data length in number of bytes
	  UInt32 numTran = 0; // number of characters translated

	  // Keep converting while the last call reported a recoverable condition.
	  // The three fatal codes must each stop the loop, hence "&&".
	  for (Lng32 loopCounter = 0;
	       retCode != 0 &&
	       retCode != CNV_ERR_INVALID_CHAR &&
	       retCode != CNV_ERR_INVALID_CS &&
	       retCode != CNV_ERR_NOINPUT &&
	       loopCounter < 16; // Avoid infinite loop
	       loopCounter++)
	  {
	    retCode = UTF8ToLocale(cnv_version1,
	                           (const char*)source,
	                           (Lng32)strlen(source),
	                           (const char*)buffer,
	                           bufferLen,
	                           (cnv_charset)charset,
	                           (char* &)punstr,
	                           &outLen,
	                           1,        // addNullAtEnd_flag == TRUE
	                           0,        // allow_invalids == FALSE
	                           &numTran, // 64-bit
	                           0);
	    if (charCount) *charCount = (Int32)numTran;
	    if (errorByteOff) *errorByteOff = (Int32)(punstr - OrigSource);

	    switch(retCode)
	    {
	      case 0:
	        if (strlen(target) + strlen(buffer) >= (size_t)targetLen) // avoid overflow
	        {
	          retCode = CNV_ERR_TARGET_SIZE_INVALID;
	          // no need to be fancy - this condition is not supposed to happen anyway
	          // just chop off the extra bytes - last character in target may be chopped
	          // right in the middle
	          if ((size_t)targetLen > strlen(target))
	          {
	            buffer[targetLen - strlen(target) - 1] = '\0';
	            strcat(target, buffer);
	          }
	        }
	        else
	        {
	          // We're assuming that the work buffer was null terminated.
	          strcat(target, buffer);
	        }
	        loopCounter = 8888; // exit loop
	        break;

	      case CNV_ERR_BUFFER_OVERRUN:
	        // NOTE(review): this path relies on buffer being NUL-terminated after
	        // an overrun - confirm against UTF8ToLocale().
	        if (strlen(target) + strlen(buffer) < (size_t)targetLen)
	        {
	          // strcat keeps target NUL-terminated (strncpy/strncat limited to
	          // strlen(buffer) would not guarantee that for the first chunk).
	          strcat(target, buffer);
	          source = punstr; // We're going again, adjust the pointer to the input buffer.
	          // intentionally keep the retCode == CNV_ERR_BUFFER_OVERRUN setting
	          // just in case we exceed the loop count limit
	        }
	        else // avoid overflow
	        {
	          retCode = CNV_ERR_TARGET_SIZE_INVALID;
	          // no need to be fancy - this condition is not supposed to happen anyway
	          // just chop off the extra bytes - last character in target may be chopped
	          // right in the middle
	          if ((size_t)targetLen > strlen(target))
	          {
	            buffer[targetLen - strlen(target) - 1] = '\0';
	            strcat(target, buffer);
	          }
	          loopCounter = 8888; // exit loop
	        }
	        break;

	      case CNV_ERR_INVALID_CHAR:
	      case CNV_ERR_INVALID_CS:
	      case CNV_ERR_NOINPUT:
	        // retCode already holds the error; the loop condition ends the loop
	        break;

	      default:
	        retCode = CNV_ERR_INVALID_CHAR; // Bad character set conversion
	        break;
	    }
	  }

	  delete[] buffer;
	  buffer = NULL;

	  return retCode;
	}
	#endif /* As of 8/30/2011, no callers in SQ SQL */

	// -----------------------------------------------------------------------
	// ComputeWidthInBytesOfMbsForDisplay:
	//
	// Returns the display width (in bytes) that is the closest to the
	// specified maximum display width (in bytes) without chopping the
	// rightmost multi-byte characters into two parts so that we do not
	// encounter the situation where the first part of the multi-byte
	// character is in the current display line and the other part of
	// the character is in the next display line.
	//
	// If an error is encountered, returns the error code (a negative number)
	// defined in w:/common/csconvert.h.
	//
	//
	// In parameter pv_eCharSet contains the character set attribute
	// of the input string passed in via the parameter pp_cMultiByteStr.
	//
	// The out parameter pr_iNumOfTranslatedChars contains the number of
	// the actual (i.e., UCS4) characters translated.
	//
	// The out parameter pr_iNumOfNAWchars contains the number of UCS2
	// characters (NAwchar[acters]) instead of the number of the actual
	// (i.e., UCS4) characters.
	//
	// Note that classes NAMemory and CollHeap are the same except for
	// the names.
	// -----------------------------------------------------------------------

	// -----------------------------------------------------------------------
	// ComputeStrLenInNAWchars:
	//
	// Returns the length of the input string (in the specified character set)
	// in number of NAWchar(acters) - Note that a UTF16 character (i.e., a
	// surrogate pair) will have a count of 2 NAWchar(acters).
	//
	// Returns an error code (a negative number) if an error is encountered.
	// The error code values are defined in w:/common/csconvert.h.
	// -----------------------------------------------------------------------
	Int32 ComputeStrLenInNAWchars (const char * pStr,
	                               const Int32 strLenInBytes,
	                               const CharInfo::CharSet strCS,
	                               NAMemory *workspaceHeap) // in - default is NULL (the C++ runtime heap)
	{
	  // Nothing to measure.
	  if (pStr == NULL || strLenInBytes == 0)
	    return 0;

	  // UCS2 data is already made of NAWchars; no conversion is needed.
	  if (strCS == CharInfo::UCS2)
	    return strLenInBytes / BYTES_PER_NAWCHAR;

	  char * pFirstByteOfUntranslatedChar = NULL;
	  UInt32 outputDataLen = 0;

	  cnv_charset cnvCharSet = convertCharsetEnum(strCS);

	  // Compute the size of the to-be-allocated output buffer, include a UCS-2 NULL terminator, for the worst case.
	  const Int32 bufSizeInBytes = (BYTES_PER_NAWCHAR+1) * strLenInBytes + BYTES_PER_NAWCHAR;

	  // Scratch buffer for the converted text; only the length of the result
	  // is of interest, the contents are thrown away.
	  char * utf16Buf = new (workspaceHeap) char [bufSizeInBytes];

	  if (utf16Buf == NULL)
	    return CNV_ERR_INVALID_HEAP;

	  const Int32 rtnCode =
	    LocaleToUTF16 ( cnv_version1                  // in  - const enum cnv_version
	                  , pStr                          // in  - const char * in_buf
	                  , strLenInBytes                 // in  - const int in_len
	                  , utf16Buf                      // out - const char * out_buf - plenty of room
	                  , bufSizeInBytes                // in  - const int out_len - buffer size in bytes
	                  , cnvCharSet                    // in  - const int cnv_charset
	                  , pFirstByteOfUntranslatedChar  // out - char * & ptr_to_first_untranslated_char
	                  , & outputDataLen               // out - unsigned int * output_data_len_p = NULL
	                  // remaining parameters (cnv_flags, addNullAtEnd_flag,
	                  // translated_char_cnt_p, max_chars_to_convert) keep their defaults
	                  );
	  const Int32 lenInNAWchars = outputDataLen / BYTES_PER_NAWCHAR;
	  NADELETEBASIC(utf16Buf, workspaceHeap);

	  if (rtnCode != 0)
	    return rtnCode;     // a negative integer value

	  return lenInNAWchars; // a positive integer value
	} // ComputeStrLenInNAWchars()

	// -----------------------------------------------------------------------
	// ComputeStrLenInUCS4chars:
	//
	// Returns the actual (i.e., UCS4) character count of the input string
	// (in the specified character set) in the actual (i.e., UCS4) characters.
	// Returns an error code (a negative number) if an error is encountered.
	// The error code values are defined in w:/common/csconvert.h. Note that
	// this function does not need to use a workspace heap.
	// -----------------------------------------------------------------------
	Int32 ComputeStrLenInUCS4chars (const char * pStr,
	                                const Int32 strLenInBytes,
	                                const CharInfo::CharSet cs)
	{
	  // For ISO88591 the byte length is returned as the character count.
	  if (cs == CharInfo::ISO88591 || strLenInBytes == 0)
	    return strLenInBytes;

	  // The loop below dereferences the string; treat a missing string as
	  // empty, as ComputeStrLenInNAWchars() does.
	  if (pStr == NULL)
	    return 0;

	  Int32 numberOfUCS4chars = 0;
	  Int32 firstCharLenInBuf = 0;
	  UInt32 /*ucs4_t*/ UCS4value;
	  cnv_charset cnvCharSet = convertCharsetEnum(cs);
	  const char *s = pStr;
	  Int32 num_trailing_zeros = 0;
	  Int32 len = (Int32)strLenInBytes;

	  // Walk the buffer one character at a time; LocaleCharToUCS4() returns
	  // the number of bytes taken by the character at the front of the buffer.
	  while (len > 0)
	  {
	    firstCharLenInBuf = LocaleCharToUCS4 (s, len, &UCS4value, cnvCharSet);

	    if (firstCharLenInBuf <= 0)
	      return CNV_ERR_INVALID_CHAR;

	    numberOfUCS4chars++;

	    // Track the length of the current run of zero characters so that a
	    // run reaching the end of the buffer can be excluded from the count.
	    // NOTE(review): only the first byte of the character is tested -
	    // confirm this is right for character sets whose characters may start
	    // with a zero byte.
	    if ( *s == '\0' )
	      num_trailing_zeros += 1;
	    else
	      num_trailing_zeros = 0;

	    s += firstCharLenInBuf;
	    len -= firstCharLenInBuf;
	  }

	  return numberOfUCS4chars - num_trailing_zeros ; //NOTE: Don't count trailing zeros !

	} // ComputeStrLenInUCS4chars ()