// core/sql/common/csconvert.cpp - trafodion - Git at Google

 /**********************************************************************
 // @@@ START COPYRIGHT @@@
 //
 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.
 //
 // @@@ END COPYRIGHT @@@
 **********************************************************************/

 //
 // This source file contains interface routines to the Open Source
 // code character set conversion routines that are coded in C.
 //
 // NOTE: These routines are coded very generically so that the source
 //       for them can be used in not only the SQL/MX compiler build,
 //       but also used by the ODBC build and maybe others.

 #include <limits.h>
 #include <iconv.h>
 #include <stdio.h>
 #include <stdlib.h>

 #include "multi-byte.h"
 #include "fcconv.h"
 #include "csconvert.h"

 #include "from_GB18030.c"
 #include "from_GB2312.c"
 #include "from_GBK.c"

 #define  USE_OUR_MB_WC_DATA_TABLES
 #include "UCS_jp_data.c"
 #include "UCS_zs_data.c"
 #include "UCS_zb_data.c"
 #include "UCS_ko_data.c"
 #include "mb_iconv.c"
 #include "iconv_gen.c"


 #include "mb_lconv.c"
 #undef   USE_OUR_MB_WC_DATA_TABLES

 #if 0 // Don't need these since we chose to support GBK (a superset of GB2312) instead
 #define CODESET gb2312
 #define OUR_CS_GB2312_specific
 #define OUR_CS_GBK_specific
 #include "mb_lconv.c"
 #undef  OUR_CS_GBK_specific
 #undef  OUR_CS_GB2312_specific
 #undef  CODESET
 #endif // Don't need these since we chose to support GBK (a superset of GB2312) instead

 #define CODESET gbk
 #define OUR_CS_GBK_specific
 #define OUR_CS_GB2312_specific
 #include "mb_lconv.c"
 #undef  OUR_CS_GBK_specific
 #undef  OUR_CS_GB2312_specific
 #undef  CODESET

 #define CODESET gb18030
 #define OUR_CS_GB18030_specific
 #include "mb_lconv.c"
 #undef  OUR_CS_GB18030_specific
 #undef  CODESET

 //
 // Argument-validation and bookkeeping macros shared by the conversion
 // routines below.  They expand in the caller's scope and rely on the
 // caller's parameter/local names (charset, in_bufr, in_len, out_bufr,
 // out_len, target, translated_char_cnt, translated_char_cnt_p,
 // output_data_len_p, first_untranslated_char).  The ENSURE_* macros
 // contain a 'return', so they can only be used inside functions that
 // return an int error code.
 //

 // Return CNV_ERR_INVALID_CS from the enclosing function when 'charset' is
 // cnv_UnknownCharSet or greater than cnv_Last_Valid_CS.
 #define ENSURE_VALID_CHARSET()                                               \
   {                                                                          \
      if ( (charset == cnv_UnknownCharSet) || (charset > cnv_Last_Valid_CS) ) \
         return( CNV_ERR_INVALID_CS );                                        \
   }

 // Return CNV_ERR_NOINPUT from the enclosing function when there is no
 // input buffer or its length is not positive.
 #define ENSURE_VALID_INPUT() \
   { if ( (in_bufr == NULL) || (in_len <= 0) ) return CNV_ERR_NOINPUT; }

 // Return CNV_ERR_BUFFER_OVERRUN from the enclosing function when there is
 // no output buffer or its length is not positive.
 #define ENSURE_VALID_OUTPUT() \
   { if ( (out_bufr == NULL) || (out_len <= 0) ) return CNV_ERR_BUFFER_OVERRUN; }

 // Run the three checks above, in the order: charset, input, output.
 #define CHECK_FOR_SERIOUS_ERRORS() \
   { ENSURE_VALID_CHARSET(); ENSURE_VALID_INPUT(); ENSURE_VALID_OUTPUT(); }

 // Publish the running character count through the caller's optional
 // (may be NULL) translated_char_cnt_p pointer.
 #define SET_TRANSLATED_CHAR_CNT()                         \
   {                                                       \
      if ( translated_char_cnt_p != NULL )                 \
          *translated_char_cnt_p = (translated_char_cnt) ; \
   }

 // Publish the number of bytes written so far (target - out_bufr) through
 // the caller's optional (may be NULL) output_data_len_p pointer.
 #define SET_OUTPUT_DATA_LEN()                                        \
   {                                                                  \
      if ( output_data_len_p != NULL )                                \
          *output_data_len_p = ( (char *)target - (char *)out_bufr ); \
   }

 // Common prologue: points first_untranslated_char at the start of the
 // input, DECLARES the locals translated_char_cnt, source and endSource in
 // the caller's scope, and reports a character count of zero.
 // NOTE: this macro is not wrapped in braces (it has to declare locals), and
 //       its last line ends with a continuation backslash, so the line that
 //       follows the macro must stay blank.
 #define INITIALIZE_VARIABLES()                              \
      first_untranslated_char = (char *) in_bufr;            \
      unsigned int translated_char_cnt = 0;                  \
      unsigned char * source    = (unsigned char *) in_bufr; \
      unsigned char * endSource = source + in_len ;          \
      SET_TRANSLATED_CHAR_CNT();                             \

 //
 // Function-pointer types for the per-character-set reader/writer routines
 // stored in the dispatch tables below.  The descriptions reflect how the
 // routines are called and how their results are interpreted in this file.
 //

 // Multi-byte reader: called with a place to store the code point, the
 // source bytes, the number of source bytes remaining and a NULL hdl.  The
 // return value is used as the byte length of the character read; callers
 // treat 0 as length 1 and any value >= 0x7FFFFFFF as an error.
 typedef size_t (*csc_mbtowc_funcPtr) ( WChar_t *pwc, const char *ts,
                                        size_t maxlen, _LC_charmap_t *hdl ) ;
 // UTF reader: called with the conversion record, the address of the source
 // cursor and the number of bytes remaining.  The return value is used as
 // the code point (negative means error); callers compute the character
 // length from how far the cursor moved.
 typedef int    (*csc_input_utfPtr)   ( _LC_fcconv_iconv_t *, uchar_t **, int ) ;
 // Multi-byte writer: called with a scratch buffer, the code point and a
 // NULL hdl.  The return value is used as the number of bytes produced
 // (negative means the character could not be converted).
 typedef int    (*csc_wctomb_funcPtr) ( char *s, WChar_t wc,
                                                 _LC_charmap_t *hdl ) ;
 // UTF writer: called with the conversion record, the output position, the
 // space available in bytes and the code point.  The return value is used
 // as the number of bytes produced (negative means error, e.g.
 // ERR_BUFFER_OVERRUN).
 typedef int    (*csc_output_utfPtr)  ( _LC_fcconv_iconv_t *, uchar_t *,
                                                              int, ucs4_t) ;

 //
 // Multi-byte reader for each character set, indexed by enum cnv_charset.
 // A NULL entry means the character set has no multi-byte reader: the
 // callers then either special-case it (cnv_ISO88591) or fall back to
 // csc_input_utf_ptrs[].
 // NOTE(review): the order of the entries must match the enumerator order
 //               of enum cnv_charset (declared elsewhere) - confirm whenever
 //               a character set is added.
 //
 static const csc_mbtowc_funcPtr  csc_mbtowc_ptrs[ cnv_Last_Valid_CS + 1] = {
    NULL,                   // cnv_UnknownCharset
    NULL,                   // cnv_UTF8
    NULL,                   // cnv_UTF16,
    NULL,                   // cnv_UTF32,
    NULL,                   // cnv_ISO88591
    Our_mbtowc_sjis_ucs4,   // See Our_mbtowc_sjis_ucs4()  in mb_lconv.c
    Our_mbtowc_eucjp_ucs4,  // See Our_mbtowc_eucjp_ucs4() in mb_lconv.c
    Our_mbtowc_cp949_ucs4,  // See Our_mbtowc_cp949_ucs4() in mb_lconv.c
    Our_mbtowc_big5_ucs4,   // See Our_mbtowc_big5_ucs4()  in mb_lconv.c
    __mbtowc_gbk_ucs4,      // See MBTOWC()  in mb_lconv.c
    __mbtowc_gb18030_ucs4,  // See MBTOWC()  in mb_lconv.c
    __mbtowc_gbk_ucs4       // See MBTOWC()  in mb_lconv.c (GBK reader reused)
 };

 //
 // UTF reader for each character set, indexed by enum cnv_charset.  Only
 // the three UTF encodings have an entry; every other slot is NULL.  The
 // callers consult this table only when csc_mbtowc_ptrs[] has no entry for
 // the character set.
 //
 static const csc_input_utfPtr  csc_input_utf_ptrs[ cnv_Last_Valid_CS + 1] = {
    NULL,                   // cnv_UnknownCharset
    __input_utf8,           // cnv_UTF8
    __input_ucs2,           // cnv_UTF16,
    __input_ucs4,           // cnv_UTF32,
    NULL,                   // cnv_ISO88591
    NULL,                   // (remaining entries: multi-byte character
    NULL,                   //  sets, which are read through
    NULL,                   //  csc_mbtowc_ptrs[] instead)
    NULL,
    NULL,
    NULL,
    NULL,
 };

 //
 // Multi-byte writer for each character set, indexed by enum cnv_charset.
 // A NULL entry means the character set has no multi-byte writer: the
 // callers then either special-case it (cnv_ISO88591) or fall back to
 // csc_output_utf_ptrs[].  The layout parallels csc_mbtowc_ptrs[].
 // NOTE(review): the writer routines are presumably defined next to their
 //               reader counterparts in mb_lconv.c - confirm.
 //
 static const csc_wctomb_funcPtr  csc_wctomb_ptrs[ cnv_Last_Valid_CS + 1] = {
    NULL,                   // cnv_UnknownCharset
    NULL,                   // cnv_UTF8
    NULL,                   // cnv_UTF16,
    NULL,                   // cnv_UTF32,
    NULL,                   // cnv_ISO88591
    Our_wctomb_sjis_ucs4,   // Counterpart of Our_mbtowc_sjis_ucs4()
    Our_wctomb_eucjp_ucs4,  // Counterpart of Our_mbtowc_eucjp_ucs4()
    Our_wctomb_cp949_ucs4,  // Counterpart of Our_mbtowc_cp949_ucs4()
    Our_wctomb_big5_ucs4,   // Counterpart of Our_mbtowc_big5_ucs4()
    __wctomb_gbk_ucs4,      // Counterpart of __mbtowc_gbk_ucs4()
    __wctomb_gb18030_ucs4,  // Counterpart of __mbtowc_gb18030_ucs4()
    __wctomb_gbk_ucs4       // Counterpart of __mbtowc_gbk_ucs4() (GBK reused)
 };

 //
 // UTF writer for each character set, indexed by enum cnv_charset.  Only
 // the three UTF encodings have an entry; every other slot is NULL.  The
 // callers consult this table only when csc_wctomb_ptrs[] has no entry for
 // the character set.
 //
 static const csc_output_utfPtr  csc_output_utf_ptrs[ cnv_Last_Valid_CS + 1] = {
    NULL,                   // cnv_UnknownCharset
    __output_utf8,          // cnv_UTF8
    __output_ucs2,          // cnv_UTF16,
    __output_ucs4,          // cnv_UTF32,
    NULL,                   // cnv_ISO88591
    NULL,                   // (remaining entries: multi-byte character
    NULL,                   //  sets, which are written through
    NULL,                   //  csc_wctomb_ptrs[] instead)
    NULL,
    NULL,
    NULL,
    NULL,
 };

 //
 //  LocaleToUTF16() - Convert a string of characters in the specified
 //                    character set to UTF-16.
 //
 //  Arguments:
 //    version    - must be cnv_version1, otherwise CNV_ERR_INVALID_VERS is
 //                 returned before any output argument is touched.
 //    in_bufr, in_len   - input bytes and their length in bytes.
 //    out_bufr, out_len - output buffer and its size in bytes; it is filled
 //                 with 16-bit (ucs2_t) units.
 //    charset    - character set of the input.
 //    first_untranslated_char - set to in_bufr on entry; on a conversion
 //                 error it points at the character that failed; on success
 //                 it points just past the last input byte consumed.
 //    output_data_len_p - optional (may be NULL); receives the number of
 //                 bytes written to out_bufr.
 //    cnv_flags  - when CNV_REVERSE_OUTBYTES is set, the two bytes of each
 //                 16-bit output unit are swapped.
 //    addNullAtEnd_flag - when equal to TRUE, a 16-bit zero is appended
 //                 after the converted data.
 //    translated_char_cnt_p - optional (may be NULL); receives the number
 //                 of characters converted.
 //    max_chars_to_convert - stop after this many characters; 0xFFFFFFFF
 //                 means "no limit" (it is replaced by in_len).
 //
 //  Return value: 0 on success, otherwise one of CNV_ERR_INVALID_VERS,
 //                CNV_ERR_INVALID_CS, CNV_ERR_NOINPUT,
 //                CNV_ERR_BUFFER_OVERRUN or CNV_ERR_INVALID_CHAR.
 //
 int  LocaleToUTF16( const enum cnv_version version ,
                     const char *in_bufr ,  const int in_len ,
                     const char *out_bufr , const int out_len ,
                     enum cnv_charset charset ,
                     char * & first_untranslated_char ,
                     unsigned int *output_data_len_p ,
                     const int cnv_flags ,
                     const int addNullAtEnd_flag ,
                     unsigned int * translated_char_cnt_p ,
                     unsigned int max_chars_to_convert )
 {
     if ( version != cnv_version1 )
         return CNV_ERR_INVALID_VERS;

     // Initialize some return values early ... in case we give error

     INITIALIZE_VARIABLES();

     ucs2_t * target  = (ucs2_t *) out_bufr;
     // Any odd trailing byte of the output buffer is never used.
     ucs2_t * endTarget = target + ( out_len / sizeof(ucs2_t) );

     SET_OUTPUT_DATA_LEN();

     // May return CNV_ERR_INVALID_CS, CNV_ERR_NOINPUT or
     // CNV_ERR_BUFFER_OVERRUN; the output arguments were already set above.
     CHECK_FOR_SERIOUS_ERRORS();

     // We initialize a  _LC_fcconv_iconv_rec  struct here.
     // NOTE: For our purposes, the ONLY thing that
     //       must be initialized is the flags word.
     //
     _LC_fcconv_iconv_rec  cd;

     int revBytes = cnv_flags & CNV_REVERSE_OUTBYTES;
     cd.flags = CONV_BOM_WRITTEN | CONV_INPUT_PROCESSED |
                (revBytes ? CONV_REVERSE_OUTBYTE : 0);

     // 0xFFFFFFFF means "no limit"; in_len is an upper bound on the number
     // of characters because every character occupies at least one byte.
     if ( max_chars_to_convert == 0xFFFFFFFF )
          max_chars_to_convert  = (unsigned int)in_len ;
     //
     // Fast path where charset is ISO88591 or a multi-byte charset.
     // An assumption made here is that non-ASCII chars will rarely be seen.
     // If one is found, we break out of this fast path and go down the
     // slow path.
     //
     int charsetIsWide = 0;
     if ( (charset == cnv_UTF16) || (charset == cnv_UTF32) )
        charsetIsWide = 1 ;

     if ( ! charsetIsWide )
     {
        unsigned int UCS4 = 0;

        // Each iteration consumes one input byte and produces one output
        // unit, so the loop is bounded by the output space, the input
        // length and the caller's character limit.
        int  maxLoopCnt = (int)( endTarget - target );
        if ( maxLoopCnt > in_len )
             maxLoopCnt = in_len ;
        if ( maxLoopCnt > (int) max_chars_to_convert )
             maxLoopCnt = (int) max_chars_to_convert ;

        // For ISO88591 every byte value maps directly to the same code
        // point; for the other charsets only ASCII bytes are handled here.
        unsigned int maxCharToHandle = (charset == cnv_ISO88591) ? 0x0FF : 0x7F;

        if ( revBytes )
        {
           while ( ( --maxLoopCnt >= 0 ) &&
                   ( (UCS4 = *source) <= maxCharToHandle ) )
           {
              source++;
              *target++ = UCS4 << 8;   // value fits in 8 bits: swap == shift
           }
        }
        else
        {
           while ( ( --maxLoopCnt >= 0 ) &&
                   ( (UCS4 = *source) <= maxCharToHandle ) )
           {
              source++;
              *target++ = UCS4;
           }
        }
        // One character per byte consumed in the fast path.
        translated_char_cnt = source - (unsigned char *)in_bufr ;
     }

     //
     // Slower path that handles all locales.
     //
     csc_mbtowc_funcPtr inputFuncPtr  ;
     csc_input_utfPtr   inputFuncPtr2 = NULL;

     // Pick the reader: multi-byte reader first, UTF reader second.
     // ISO88591 needs neither (it is decoded inline below).
     inputFuncPtr = csc_mbtowc_ptrs[ charset ];
     if ( ( inputFuncPtr == NULL ) && ( charset != cnv_ISO88591 ) )
     {
        inputFuncPtr2 = csc_input_utf_ptrs[ charset ];
        if ( inputFuncPtr2 == NULL )
           return( CNV_ERR_INVALID_CS ); // Shouldn't ever happen ...
     }

     while ( (source < endSource) &&
             (translated_char_cnt < max_chars_to_convert) ) {

       int UCS4 = 0;  // Init. in loop in case new char longer than prev one.

       if ( ( (UCS4 = *source) < 0x080 )  &&  // If ASCII and
            ( target < endTarget )        &&  // output buffer has space yet
            ( ! charsetIsWide    ) )
       {
          source++ ;
          if ( revBytes )
             UCS4 <<= 8;
          *target++ = UCS4;
          translated_char_cnt += 1;
       }
       else
       {
         size_t mblen ;
         int ct   = -1; // Init. - assume an error.
         unsigned char * tmpsrc = source;
         first_untranslated_char = (char *)source; //...in case char is bad

         // Step 1: read one character; UCS4 gets the code point and mblen
         //         the number of input bytes it occupies.
         if ( charset == cnv_ISO88591 ) {
            UCS4 = *source;
            mblen = 1;
         }
         else {
            if ( inputFuncPtr != NULL )
                 mblen = (*inputFuncPtr)( (WChar_t *) &UCS4,
                                     (const char *)source, endSource - source,
                                     NULL );
            else {
                 UCS4 = (*inputFuncPtr2)( &cd, &tmpsrc, endSource - source );
                 // A negative result is an error code; -1 stored in the
                 // unsigned mblen becomes a huge value that fails the range
                 // check below.
                 if ( UCS4 < 0 ) mblen = -1;
                 else {
                        mblen = tmpsrc - source;
                 }
            }
         }
         if ( mblen == 0 ) /* mblen==0 when the data starts with '\0' */
              mblen = 1;
         // Step 2: write the character as UTF-16; ct gets the number of
         //         bytes written or a negative error code.
         if ( (mblen > 0) && (mblen < 0x7FFFFFFF) ) {
             if ( UCS4 < 0x0000D800 ) {  // If simple UCS2, just store it!
                if ( target < endTarget ) {
                   if ( revBytes )
                      UCS4 = ( ( UCS4 & 0x00FF ) << 8 ) |  ( UCS4 >> 8 ) ;

                   *target = UCS4;
                   ct = 2;
                }
                else ct = ERR_BUFFER_OVERRUN;
             }
             else {  // Not simple UCS2, so call routine that can handle it
                ct = __output_ucs2( &cd, (uchar_t *)target,
                                (endTarget - target)*sizeof(ucs2_t) , UCS4);
             }
         }
         else
             ct = mblen; /* Put error code in ct */

         if ( ct < 0 ) {
            // About to issue an error, so update return values first.

            SET_TRANSLATED_CHAR_CNT();
            SET_OUTPUT_DATA_LEN();

            if ( ct == ERR_BUFFER_OVERRUN )
                return CNV_ERR_BUFFER_OVERRUN;
            else
                return CNV_ERR_INVALID_CHAR;
         }
         // ct is a byte count; target advances in 16-bit units.
         target    += ct/sizeof( ucs2_t ) ;
         translated_char_cnt += 1;
         source    += mblen;
       }
     }
     first_untranslated_char = (char *) source;
     SET_TRANSLATED_CHAR_CNT();

     int rtnVal = 0;
     if ( addNullAtEnd_flag == TRUE ) {
         if ( target < endTarget ) {
            *target++ = 0;  // Store a 16-bit NULL
         }
         else {
            // The data was converted but the terminator does not fit.
            rtnVal = CNV_ERR_BUFFER_OVERRUN;
         }
     }
     SET_OUTPUT_DATA_LEN();
     return rtnVal;
 }

 //
 //  LocaleToUTF8() - Convert a string of characters in the specified
 //                    character set to UTF8.
 //
 //  Arguments:
 //    version    - must be cnv_version1, otherwise CNV_ERR_INVALID_VERS is
 //                 returned before any output argument is touched.
 //    in_bufr, in_len   - input bytes and their length in bytes.
 //    out_bufr, out_len - output buffer and its size in bytes.
 //    charset    - character set of the input.
 //    first_untranslated_char - set to in_bufr on entry; on a conversion
 //                 error it points at the character that failed; on success
 //                 it points just past the last input byte consumed.
 //    output_data_len_p - optional (may be NULL); receives the number of
 //                 bytes written to out_bufr.
 //    addNullAtEnd_flag - when equal to TRUE, a zero byte is appended after
 //                 the converted data.
 //    translated_char_cnt_p - optional (may be NULL); receives the number
 //                 of characters converted.
 //
 //  Return value: 0 on success, otherwise one of CNV_ERR_INVALID_VERS,
 //                CNV_ERR_INVALID_CS, CNV_ERR_NOINPUT,
 //                CNV_ERR_BUFFER_OVERRUN or CNV_ERR_INVALID_CHAR.
 //
 int  LocaleToUTF8( const enum cnv_version version ,
                     const char *in_bufr ,  const int in_len ,
                     const char *out_bufr , const int out_len ,
                     enum cnv_charset charset ,
                     char * & first_untranslated_char ,
                     unsigned int *output_data_len_p ,
                     const int addNullAtEnd_flag ,
                     unsigned int * translated_char_cnt_p )
 {
     if ( version != cnv_version1 )
         return CNV_ERR_INVALID_VERS;

     INITIALIZE_VARIABLES();

     unsigned char * target    = (unsigned char *) out_bufr;
     unsigned char * endTarget = target + out_len ;

     SET_OUTPUT_DATA_LEN();

     // May return CNV_ERR_INVALID_CS, CNV_ERR_NOINPUT or
     // CNV_ERR_BUFFER_OVERRUN; the output arguments were already set above.
     CHECK_FOR_SERIOUS_ERRORS();

     // We initialize a  _LC_fcconv_iconv_rec  struct here.
     // NOTE: For our purposes, the ONLY thing that
     //       must be initialized is the flags word.
     //
     _LC_fcconv_iconv_rec  cd;

     cd.flags = CONV_BOM_WRITTEN | CONV_INPUT_PROCESSED ;

     //
     // Fast path where charset is ISO88591 or a multi-byte charset.
     // An assumption made here is that non-ASCII chars will rarely be seen.
     // If one is found, we break out of this fast path and go down the
     // slow path.
     //
     int charsetIsWide = 0;
     if ( (charset == cnv_UTF16) || (charset == cnv_UTF32) )
        charsetIsWide = 1 ;

     if ( ! charsetIsWide )
     {
        unsigned int UCS4 = 0;

        // Each iteration copies one ASCII byte, so the loop is bounded by
        // both the output space and the input length.
        int  maxLoopCnt = endTarget - target ;
        if ( maxLoopCnt > in_len )
             maxLoopCnt = in_len ;

        // Unlike LocaleToUTF16(), ISO88591 gets no wider range here: only
        // bytes below 0x80 are copied as-is.
        while ( ( --maxLoopCnt >= 0) && ( (UCS4 = *source) < 0x080 ) )
        {
           source++ ;
           *target++ = UCS4;
        }
        // One character per byte consumed in the fast path.
        translated_char_cnt = source - (unsigned char *)in_bufr ;
     }

     //
     // Slower path that handles all locales.
     //
     csc_mbtowc_funcPtr inputFuncPtr  ;
     csc_input_utfPtr   inputFuncPtr2 = NULL;

     // Pick the reader: multi-byte reader first, UTF reader second.
     // ISO88591 needs neither (it is decoded inline below).
     inputFuncPtr = csc_mbtowc_ptrs[ charset ];
     if ( ( inputFuncPtr == NULL ) && ( charset != cnv_ISO88591 ) )
     {
        inputFuncPtr2 = csc_input_utf_ptrs[ charset ];
        if ( inputFuncPtr2 == NULL )
           return( CNV_ERR_INVALID_CS ); // Shouldn't ever happen ...
     }

     while ( source < endSource ) {

       int UCS4 = 0;  // Init. in loop in case new char longer than prev one.

       if ( ( (UCS4 = *source) < 0x080 )  &&   // If ASCII and
            ( target < endTarget )        &&   // output buffer has space yet
            ( ! charsetIsWide ) )
       {
          source++ ;
          *target++ = UCS4;
          translated_char_cnt += 1;
       }
       else
       {
         size_t mblen ;
         int ct   = -1; // Init. - assume an error.
         unsigned char * tmpsrc = source;
         first_untranslated_char = (char *)source; //...in case char is bad

         // Step 1: read one character; UCS4 gets the code point and mblen
         //         the number of input bytes it occupies.
         if ( charset == cnv_ISO88591 ) {
            UCS4 = *source;
            mblen = 1;
         }
         else if ( inputFuncPtr != NULL )
            mblen = (*inputFuncPtr)( (WChar_t *) &UCS4,
                                (const char *)source, endSource - source,
                                NULL );
         else {
            UCS4 = (*inputFuncPtr2)( &cd, &tmpsrc, endSource - source );
            // A negative result is an error code; -1 stored in the unsigned
            // mblen becomes a huge value that fails the range check below.
            if ( UCS4 < 0 ) mblen = -1;
            else            mblen = tmpsrc - source;
         }

         if ( mblen == 0 ) /* mblen==0 when the data starts with '\0' */
              mblen = 1;
         // Step 2: write the character as UTF-8; ct gets the number of bytes
         //         written or a negative error code.
         if ( (mblen > 0) && (mblen < 0x7FFFFFFF) )
             ct = __output_utf8( &cd, target, endTarget - target , UCS4);
         else
             ct = mblen; /* Put error code in ct */

         if ( ct < 0 ) {
            // About to give an error, so update return values first.

            SET_TRANSLATED_CHAR_CNT();
            SET_OUTPUT_DATA_LEN();

            if ( ct == ERR_BUFFER_OVERRUN )
                return CNV_ERR_BUFFER_OVERRUN;
            else
                return CNV_ERR_INVALID_CHAR;
         }
         source    += mblen;
         target    += ct;
         translated_char_cnt += 1;
       }
     }
     first_untranslated_char = (char *) source;
     SET_TRANSLATED_CHAR_CNT();

     int rtnVal = 0;
     if ( addNullAtEnd_flag == TRUE ) {
         if ( target < endTarget ) {
            *target++ = 0;  // Store an 8-bit NULL
         }
         else {
            // The data was converted but the terminator does not fit.
            rtnVal = CNV_ERR_BUFFER_OVERRUN;
         }
     }
     SET_OUTPUT_DATA_LEN();
     return rtnVal;
 }

 //
 // LocaleCharToUCS4() converts the FIRST char in the input string to its
 // UCS4 value.  Returns the UCS4 value at location specified AND the
 // length of the input character in bytes as the return value.
 //
 int  LocaleCharToUCS4( const char *in_bufr,       //Ptr to Input string
                        const int in_len,          //Len of Input string (bytes)
                        unsigned int *UCS4ptr ,    //Ptr to output location
                        enum cnv_charset charset )  //Locale Character Set
 {
         ENSURE_VALID_CHARSET();
         ENSURE_VALID_INPUT();

         unsigned char * tmpsrc = (unsigned char *) in_bufr;
         size_t mblen ;
         int UCS4 = 0;

         // We initialize a  _LC_fcconv_iconv_rec  struct here.
         // NOTE: For our purposes, the ONLY thing that
         //       must be initialized is the flags word.
         //
         _LC_fcconv_iconv_rec  cd;

         cd.flags = CONV_BOM_WRITTEN | CONV_INPUT_PROCESSED ;

         if ( charset == cnv_ISO88591 ) {
            UCS4 = *(unsigned char *)in_bufr;
            mblen = 1;
         }
         else {
            csc_mbtowc_funcPtr inputFuncPtr  ;
            csc_input_utfPtr   inputFuncPtr2 ;

            inputFuncPtr = csc_mbtowc_ptrs[ charset ];
            if ( inputFuncPtr != NULL )
                 mblen = (*inputFuncPtr)( (WChar_t *) &UCS4,
                                     (const char *)in_bufr, in_len, NULL );
            else {
                 inputFuncPtr2 = csc_input_utf_ptrs[ charset ];
                 if ( inputFuncPtr2 == NULL )
                    return( CNV_ERR_INVALID_CS ); // Shouldn't ever happen ...

                 UCS4 = (*inputFuncPtr2)( &cd, &tmpsrc, in_len );
                 if ( UCS4 < 0 ) mblen = -1;
                 else            mblen = tmpsrc - (unsigned char *)in_bufr;
            }
         }

         if ( mblen == 0 ) /* mblen==0 when data is starts with '\0' */
              mblen = 1;
         if ( (mblen > 0) && (mblen < 0x7FFFFFFF) ) {
             if ( UCS4ptr != NULL )  *UCS4ptr = UCS4; // Return the UCS4 value
             return (mblen);
         }
         return (CNV_ERR_INVALID_CHAR);
 }

 //
 // UCS4ToLocaleChar() converts the UCS4 value to the specified character set
 // and stores the character in the output buffer specified.
 //
 // Arguments:  UCS4ptr  - pointer to the UCS4 value to convert
 //             out_bufr - output buffer; may be NULL, in which case nothing
 //                        is stored and only the length is returned
 //             out_len  - size of the output buffer in bytes
 //             charset  - an "enum cnv_charset" value indicating the
 //                        target character set
 //
 // Return value: Length of the output character in bytes, or a negative
 //               error code: CNV_ERR_INVALID_CS, CNV_ERR_NOINPUT (UCS4ptr
 //               is NULL), CNV_ERR_INVALID_CHAR (the value cannot be
 //               represented in the target character set) or
 //               CNV_ERR_BUFFER_OVERRUN (the character is longer than
 //               out_len).
 //
 int  UCS4ToLocaleChar( const unsigned int *UCS4ptr , //Ptr to input char
                        const char *out_bufr,         //Ptr to output bufr
                        const int out_len,            //Len of output bufr
                        enum cnv_charset charset )    //Locale Character Set
 {
         char tmpspace[8]; /* big enough to ensure no buffer overflow */

         ENSURE_VALID_CHARSET();

         // The NULL check must come BEFORE the pointer is dereferenced.
         if ( UCS4ptr == NULL )
             return CNV_ERR_NOINPUT;

         char * target = (char *) out_bufr;
         int UCS4 = *UCS4ptr;
         int ct = -1;    // Assume an error until a writer says otherwise

         // We initialize a  _LC_fcconv_iconv_rec  struct here.
         // NOTE: For our purposes, the ONLY thing that
         //       must be initialized is the flags word.
         //
         _LC_fcconv_iconv_rec  cd;

         cd.flags = CONV_BOM_WRITTEN | CONV_INPUT_PROCESSED ;

         // The character is first built in tmpspace so that its length is
         // known before anything is stored in the caller's buffer.
         if ( charset == cnv_ISO88591 ) {
            if ( UCS4 <= 0x0FF ) { /* If valid ISO88591 char */
                 tmpspace[0] = UCS4;
                 ct = 1;
            }
         }
         else {
            csc_wctomb_funcPtr  outputFuncPtr  ;
            csc_output_utfPtr   outputFuncPtr2 ;

            // Pick the writer: multi-byte writer first, UTF writer second.
            outputFuncPtr = csc_wctomb_ptrs[ charset ];
            if ( outputFuncPtr != NULL )
                 ct = (int) (*outputFuncPtr)( tmpspace, (WChar_t) UCS4,
                                              (_LC_charmap_t *)NULL );
            else {
                 outputFuncPtr2 = csc_output_utf_ptrs[ charset ];
                 if ( outputFuncPtr2 == NULL )
                      return( CNV_ERR_INVALID_CS ); // Shouldn't ever happen ...

                 ct = (*outputFuncPtr2)( &cd, (unsigned char *)(&tmpspace),
                                   sizeof(tmpspace), UCS4 );
            }
         }

         if ( ct < 0 )                // If Bad character or conversion error
                 return (CNV_ERR_INVALID_CHAR);

         if ( ct <= out_len ) {
             // With a NULL out_bufr only the length is reported.
             if ( target != NULL ) {
                  char * tmpPtr = &tmpspace[0];
                  int iii = ct;
                  while (iii-- > 0 )
                       *target++ = *tmpPtr++;
             }
             return ( ct );
         }
         return CNV_ERR_BUFFER_OVERRUN;
 }

 //
 // csc_get_subst_char() -- Determine which substitution character to use
 //                         and how long it is
 //
 // Arguments:  substitution_char - the character requested by the user, or
 //                                 NULL to get the default '?'
 //             tmpspace - caller's scratch area that receives the character
 //             charset - an "enum cnv_charset" value indicating the
 //                       target character set
 //
 // Return value: Length of substitution char in bytes
 //
 static int csc_get_subst_char( const char * substitution_char,
                                char * tmpspace,
                                enum cnv_charset charset )
 {
     // Start with the default: a 1-byte '?'.
     tmpspace[0] = '?';

     if ( substitution_char == NULL )
        return 1;

     // For the wide character sets the user's character is taken as is and
     // has a fixed width.
     int fixedWidth = 0;
     if ( charset == cnv_UTF16 )
        fixedWidth = 2;
     else if ( charset == cnv_UTF32 )
        fixedWidth = 4;

     if ( fixedWidth != 0 ) {
        for ( int idx = 0; idx < fixedWidth; idx++ )
           tmpspace[idx] = substitution_char[idx] ;
        return fixedWidth;
     }

     //
     // All other character sets: the user's character is a string that is
     // honored only when it is exactly 1 or 2 bytes long.  An empty string
     // or a string of 3 or more bytes leaves the default '?' in place.
     //
     if ( substitution_char[0] == 0 )
        return 1;

     const char secondByte = substitution_char[1];
     if ( ( secondByte != 0 ) && ( substitution_char[2] != 0 ) )
        return 1;

     tmpspace[0] = substitution_char[0] ;
     tmpspace[1] = secondByte ;
     return ( secondByte != 0 ) ? 2 : 1 ;
 }

 //
 // addVariableLengthNull() -- Append a NULL terminator of the given width
 //
 // Arguments:  target - output position; advanced past the bytes written
 //             endTarget - end of the output buffer
 //             len_of_NULL - width of the terminator in bytes
 //
 // Return value: 0 if the terminator was stored, CNV_ERR_BUFFER_OVERRUN if
 //               fewer than len_of_NULL bytes remain (nothing is written).
 //
 int addVariableLengthNull( unsigned char * & target,
                            unsigned char * endTarget,
                            int len_of_NULL )
 {
    if ( (endTarget - target) < len_of_NULL )
       return CNV_ERR_BUFFER_OVERRUN;

    // A width of 4 stores four zero bytes, any other width of 2 or more
    // stores two, and anything smaller stores one.
    int zeroBytes = 1;
    if ( len_of_NULL >= 2 )
       zeroBytes = ( len_of_NULL == 4 ) ? 4 : 2 ;

    for ( int written = 0; written < zeroBytes; written++ )
       *target++ = 0;

    return 0;
 }

 //
 //  UTF16ToLocale() - Convert a string of UTF-16 characters
 //                    to the specified character set.
 //
 int  UTF16ToLocale( const enum cnv_version version ,
                     const char *in_bufr ,  const int in_len ,
                     const char *out_bufr , const int out_len ,
                     enum cnv_charset charset ,
                     char * & first_untranslated_char ,
                     unsigned int *output_data_len_p ,
                     const int cnv_flags ,
                     const int addNullAtEnd_flag ,
                     const int allow_invalids ,
                     unsigned int * translated_char_cnt_p ,
                     const char *substitution_char )
 {
     if ( version != cnv_version1 )
         return CNV_ERR_INVALID_VERS;

     INITIALIZE_VARIABLES();

     unsigned char * target    = (unsigned char *)out_bufr;
     unsigned char * endTarget = target + out_len ;

     SET_OUTPUT_DATA_LEN();

     CHECK_FOR_SERIOUS_ERRORS();

     int len_of_NULL = 1;
     int ct = 0;

     // We initialize a  _LC_fcconv_iconv_rec  struct here.
     // NOTE: For our purposes, the ONLY thing that
     //       must be initialized is the flags word.
     //
     _LC_fcconv_iconv_rec  cd;

     cd.flags = CONV_INPUT_PROCESSED | CONV_BOM_WRITTEN |
                ((cnv_flags && CNV_REVERSE_INBYTES) ? CONV_REVERSE_INBYTE : 0);

     //
     // Fast path where charset is ISO88591 or a multi-byte charset.
     // An assumption made here is that non-valid chars will rarely be seen.
     // If one is found, we break out of this fast path and go down the
     // slow path.
     //
     int charsetIsWide = 0;
     if ( (charset == cnv_UTF16) || (charset == cnv_UTF32) )
        charsetIsWide = 1 ;

     if ( ( ! charsetIsWide ) &&
          ( (cnv_flags & CNV_REVERSE_INBYTES) == 0 ) )
     {
        unsigned int UCS4    = 0;

        int  maxLoopCnt = endTarget - target ;
        if ( maxLoopCnt > (int) ( in_len / sizeof(ucs2_t) ) )
             maxLoopCnt = (int) ( in_len / sizeof(ucs2_t) ) ;

        unsigned int maxCharToHandle = (charset == cnv_ISO88591) ? 0x0FF : 0x7F;

        UCS4 = *( (ucs2_t *)source );
        if ( cnv_flags & CNV_REVERSE_INBYTES )
        {
           while ( --maxLoopCnt >= 0 )      // While more to do
           {
              UCS4 = *( (ucs2_t *)source ) ;
              UCS4 = ( ( UCS4 & 0x00FF ) << 8 ) |  ( UCS4 >> 8 ) ;
              if ( UCS4 <= maxCharToHandle )
              {
                 source   += sizeof(ucs2_t);
                 *target++ = UCS4;
              }
              else break;
           }
        }
        else while ( ( --maxLoopCnt >= 0 )   &&   // While more to do and
                     ( ( UCS4 = *( (ucs2_t *)source ) ) <= maxCharToHandle ) )
        {
           source   += sizeof(ucs2_t);
           *target++ = UCS4;
        }

        translated_char_cnt = target - (unsigned char *)out_bufr ;
     }

     //
     // Slower path that handles all locales.
     //
     csc_wctomb_funcPtr  outputFuncPtr  ;
     csc_output_utfPtr   outputFuncPtr2 = NULL;

     outputFuncPtr = csc_wctomb_ptrs[ charset ];
     if ( ( outputFuncPtr == NULL ) && ( charset != cnv_ISO88591 ) )
     {
        outputFuncPtr2 = csc_output_utf_ptrs[ charset ];
        if ( outputFuncPtr2 == NULL )
           return( CNV_ERR_INVALID_CS ); // Shouldn't ever happen ...
     }

     while ( source < endSource ) {

       unsigned int UCS4 = *((ucs2_t *)source);

       if ( cnv_flags & CNV_REVERSE_INBYTES )
          UCS4 = ( ( UCS4 & 0x00FF ) << 8 ) |  ( UCS4 >> 8 ) ;

       if ( ( UCS4 < 0x080 )       &&  // If ASCII and
            ( target < endTarget ) &&  // there is space yet and
            ( ! charsetIsWide ) )      // output is not wide characters
       {
          source   += sizeof(ucs2_t);
          *target++ = UCS4;
          translated_char_cnt += 1 ;
       }
       else
       {
         char tmpspace[8]; /* big enough to ensure no buffer overflow */

         first_untranslated_char = (char *) source; //...in case char is bad

         if ( UCS4 < 0xD800 )  // If simple UCS2, use it as already retrieved
            source += sizeof(ucs2_t);
         else
            UCS4 = __input_ucs2( &cd, &source, endSource - source );

         ct = -1;
         if ( (UCS4 != ERR_INPUT_INCOMPLETE) && (UCS4 != ERR_INVALID_CHAR) ) {
            if ( charset == cnv_ISO88591 ) {
                 if ( UCS4 <= 0x0FF ) {      // If valid ISO88591 char
                      tmpspace[0] = UCS4;
                      ct = 1;
                 }
            }
            else {
               if ( outputFuncPtr != NULL )
                  ct = (*outputFuncPtr)( tmpspace, (WChar_t) UCS4,
                                   (_LC_charmap_t *)NULL);
               else {
                  ct = (*outputFuncPtr2)( &cd, (unsigned char *)(tmpspace),
                                    sizeof(tmpspace), UCS4 );

                  if ( charset == cnv_UTF16 )
                     len_of_NULL = 2;
                  else if ( charset == cnv_UTF32 )
                     len_of_NULL = 4;
               }
            }
         }

         if ( ct < 0 ) {  // If Bad character or conversion error
             if ( allow_invalids == FALSE ) {
                 SET_TRANSLATED_CHAR_CNT();
                 SET_OUTPUT_DATA_LEN();
                 return (CNV_ERR_INVALID_CHAR);
             }

             ct = csc_get_subst_char( substitution_char, tmpspace , charset );

             if ( (UCS4 == ERR_INPUT_INCOMPLETE) || (UCS4 == ERR_INVALID_CHAR) )
                 source += 2 ;   // Skip bad character
             //else source was already incremented by __input_ucs2()
         }
         if ( ct <= (endTarget - target) ) {
             char * tmpPtr = &tmpspace[0];
             while (ct-- > 0 )
                  *target++ = *tmpPtr++;
             translated_char_cnt += 1;
         }
         else {
             SET_TRANSLATED_CHAR_CNT();
             SET_OUTPUT_DATA_LEN();
             return CNV_ERR_BUFFER_OVERRUN;
         }
       }
     }
     first_untranslated_char = (char *) source;
     SET_TRANSLATED_CHAR_CNT();

     int rtnVal = 0;
     if ( addNullAtEnd_flag == TRUE ) {
        rtnVal = addVariableLengthNull( target, endTarget, len_of_NULL );
     }
     SET_OUTPUT_DATA_LEN();
     return rtnVal;
 }
 //
 // TWO_BYTE_UTF8(firstByte, src, nxtB) - True when firstByte has the
 // 110xxxxx lead-byte pattern and the byte at src[1] has the 10xxxxxx
 // continuation-byte pattern.
 // Side effect: when the lead-byte test passes, src[1] is read and stored
 // in nxtB (the && short-circuits otherwise), so the caller must make sure
 // src[1] is readable before using this macro.
 //
 #define TWO_BYTE_UTF8(firstByte, src, nxtB) ( (((firstByte) & 0xE0) == 0xC0) && \
                                         (( (nxtB=(*((src)+1))) & 0xC0) == 0x80) )
 //
 //  UTF8ToLocale() - Convert a string of UTF8 characters
 //                    to the specified character set.
 //
 int  UTF8ToLocale( const enum cnv_version version ,
                     const char *in_bufr ,  const int in_len ,
                     const char *out_bufr , const int out_len ,
                     enum cnv_charset charset ,
                     char * & first_untranslated_char ,
                     unsigned int *output_data_len_p ,
                     const int addNullAtEnd_flag ,
                     const int allow_invalids ,
                     unsigned int * translated_char_cnt_p ,
                     const char *substitution_char )
 {
     if ( version != cnv_version1 )
         return CNV_ERR_INVALID_VERS;

     INITIALIZE_VARIABLES();

     unsigned char * target    = (unsigned char *)out_bufr;
     unsigned char * endTarget = target + out_len ;

     SET_OUTPUT_DATA_LEN();

     CHECK_FOR_SERIOUS_ERRORS();

     int len_of_NULL = 1;
     int ct = 0;

     // We initialize a  _LC_fcconv_iconv_rec  struct here.
     // NOTE: For our purposes, the ONLY thing that
     //       must be initialized is the flags word.
     //
     _LC_fcconv_iconv_rec  cd;

     cd.flags = CONV_BOM_WRITTEN | CONV_INPUT_PROCESSED ;

     //
     // Fast path where charset is ISO88591 or a multi-byte charset.
     // An assumption made here is that invalid chars will rarely be seen.
     // If one is found, we break out of this fast path and go down the
     // slow path.
     //
     int charsetIsWide = 0;
     if ( (charset == cnv_UTF16) || (charset == cnv_UTF32) )
        charsetIsWide = 1 ;

     if ( ! charsetIsWide )
     {
       unsigned int UCS4    = 0;

       int  maxLoopCnt = endTarget - target ;
       if ( maxLoopCnt > in_len )
            maxLoopCnt = in_len ;

       while ( --maxLoopCnt >= 0 )
       {
          // If character is valid ASCII
          if ( (UCS4 = *source) < 0x080 ) {
             source++;
             *target++ = UCS4;
          }
          else
          {
             if (charset != cnv_ISO88591)
                // Let slower path handle the rest of the buffer.
                break;

             int nxtByte = 0;
             if ( ( maxLoopCnt > 0 )  && TWO_BYTE_UTF8( UCS4, source, nxtByte ) )
             {
                // Convert from UTF8 to UCS4.
                UCS4 = (UCS4 & 0x1F) << 6 | ( nxtByte & 0x3F );
                if ( UCS4 > 0x0FF )
                   break; // Non-ISO88591.  Let slower path handle the rest.
                source   += 2 ;
                *target++ = UCS4;
                if ( maxLoopCnt > (int) ( endSource - source ) )
                     maxLoopCnt-- ;  // Ensure we don't overrun input buffer
             }
             else break; // Let slower path handle the rest.
          }
       }
       translated_char_cnt = target - (unsigned char *)out_bufr ;
     }
     else if ( charset == cnv_UTF16 )
     {
        unsigned int UCS4    = 0;
        while ( ( source < endSource )         &&  // more input and
                ( ( UCS4 = *source ) < 0x080 ) &&  // it is ASCII and
                ( (endTarget - target) >= 2  ) )   // there is space left
        {
           *((ucs2_t *)target) = UCS4;
           source++;
           target += sizeof(ucs2_t);
        }
        translated_char_cnt = source - (unsigned char *)in_bufr;
        len_of_NULL = 2;
     }

     //
     // Slower path that handles all locales.
     //
     csc_wctomb_funcPtr  outputFuncPtr  ;
     csc_output_utfPtr   outputFuncPtr2 = NULL;

     outputFuncPtr = csc_wctomb_ptrs[ charset ];
     if ( ( outputFuncPtr == NULL ) && ( charset != cnv_ISO88591 ) )
     {
        outputFuncPtr2 = csc_output_utf_ptrs[ charset ];
        if ( outputFuncPtr2 == NULL )
           return( CNV_ERR_INVALID_CS ); // Shouldn't ever happen ...
     }

     while ( source < endSource ) {

       unsigned int UCS4 = *source;
       if ( ( UCS4 < 0x080 )       &&  // If ASCII and
            ( target < endTarget ) &&  // there is space yet and
            ( ! charsetIsWide ) )      // output is not wide characters
       {
          *target++ = UCS4;
          source++;
          translated_char_cnt += 1;
       }
       else
       {
         char tmpspace[8]; /* big enough to ensure no buffer overflow */

         first_untranslated_char = (char *) source; //...in case char is bad

         int UCS4 = __input_utf8( &cd, &source, endSource - source);

         ct = -1;
         if ( (UCS4 != ERR_INPUT_INCOMPLETE) && (UCS4 != ERR_INVALID_CHAR) ) {
            if ( charset == cnv_ISO88591 ) {
               if ( UCS4 <= 0x0FF )     {     // If valid ISO88591 char
                    tmpspace[0] = UCS4;
                    ct = 1;
               }
            }
            else {
               if ( outputFuncPtr != NULL )
                    ct = (*outputFuncPtr)( tmpspace, (WChar_t) UCS4, NULL );
               else {
                    ct = (*outputFuncPtr2)( &cd, (unsigned char *)(tmpspace),
                                            sizeof(tmpspace), UCS4);

                    if ( charset == cnv_UTF16 )
                       len_of_NULL = 2;
                    else if ( charset == cnv_UTF32 )
                       len_of_NULL = 4;
               }
            }
         }

         if ( ct < 0 ) {  // If Bad character or conversion error
             if ( allow_invalids == FALSE ) {
                 SET_TRANSLATED_CHAR_CNT();
                 SET_OUTPUT_DATA_LEN();
                 return (CNV_ERR_INVALID_CHAR);
             }

             ct = csc_get_subst_char( substitution_char, tmpspace , charset );

             if ( (UCS4 == ERR_INPUT_INCOMPLETE) || (UCS4 == ERR_INVALID_CHAR) )
                 source += 1 ;   // Skip bad character
             //else source was already incremented by __input_utf8()
         }
         if ( ct <= (endTarget - target) ) {
             char * tmpPtr = &tmpspace[0];
             translated_char_cnt += 1;
             while (ct-- > 0 )
                  *target++ = *tmpPtr++;
         }
         else {
            SET_TRANSLATED_CHAR_CNT();
            SET_OUTPUT_DATA_LEN();
            return CNV_ERR_BUFFER_OVERRUN;
         }
       }
     }
     first_untranslated_char = (char *) source;
     SET_TRANSLATED_CHAR_CNT();

     int rtnVal = 0;
     if ( addNullAtEnd_flag == TRUE ) {
        rtnVal = addVariableLengthNull( target, endTarget, len_of_NULL );
     }
     SET_OUTPUT_DATA_LEN();
     return rtnVal;
 }


 /*
  * lightValidateUTF8Str() - Check the lead/continuation byte structure of a
  * UTF8 string and enforce an optional limit on the number of characters.
  *
  *   bufr                   - bytes to check (need not be NUL terminated)
  *   in_len                 - number of bytes in bufr
  *   max_chars              - maximum number of characters allowed;
  *                            0 means "no limit other than in_len"
  *   ignore_trailing_blanks - when non-zero, characters beyond max_chars are
  *                            tolerated if every one of them is a blank
  *
  * Returns -1 if in_len or max_chars is negative or if a byte is found that
  * cannot appear at its position.  Otherwise returns the length in bytes of
  * the acceptable prefix: in_len when the whole string is acceptable, or the
  * offset at which the string has to be truncated (too many characters, or
  * an incomplete character at the end of the buffer).
  *
  * Only the byte structure is checked; overlong forms and code point ranges
  * are not.  NOTE(review): lead bytes 0xF8..0xFB are accepted as the start of
  * a 4-byte sequence although RFC 3629 does not allow them.
  */
 int lightValidateUTF8Str(const char *bufr,
                          int in_len,
                          int max_chars,
                          int ignore_trailing_blanks)
 {
   unsigned char c;
   int pos  = 0;
   int numc = 0;                                   // complete characters seen
   int maxc = ( max_chars ? max_chars : in_len );
   int byte = 1;                                   // bytes still expected for the
                                                   // current character (1 == none)

   if ( (in_len < 0) || (max_chars < 0) ) // Defensive programming: Ensure no memory access exceptions.
     return -1;                           // Shouldn't ever happen, of course.

   // Scan until the input is exhausted or ONE character beyond the limit has
   // been read.  Reading that extra character is what makes it possible to
   // tell "exactly maxc characters" apart from "too many characters" below.
   while (pos < in_len && numc <= maxc)
     {
       c = bufr[pos];

       if (c < 0x80 && byte == 1) // ascii
         numc++;
       else if (c >= 0x80 && c < 0xc0 && byte > 1) // second, third, or fourth byte of a multi-byte sequence
         {
           if (--byte == 1)
             numc++;
         }
       else if (c >= 0xc0 && c < 0xe0 && byte == 1) // start of 2-byte sequence
         byte = 2;
       else if (c >= 0xe0 && c < 0xf0 && byte == 1) // start of 3-byte sequence
         byte = 3;
       else if (c >= 0xf0 && c < 0xfc && byte == 1) // start of 4-byte sequence
         byte = 4;
       else
         return -1; // invalid byte sequence

       pos++;
     }

   if (byte == 1 && numc <= maxc)
     return pos; // string is valid and has valid char count, pos == in_len

   // We encountered too many characters or a partial character. The string
   // bufr[0..pos-1] contains numc entire characters and maybe one partial character.

   // check whether the extra characters are all blanks and it's safe to ignore them
   if (ignore_trailing_blanks && byte == 1)
     {
       int blankPos = pos-1; // the previous character is already past the char. limit

       while (blankPos < in_len && bufr[blankPos] == ' ')
         blankPos++;

       if (blankPos >= in_len)
         return in_len; // extra chars were all blanks
     }

   // back up until the end of the valid characters

   while (byte > 1 || numc > maxc)
     {
       pos--;
       c = bufr[pos];

       if (c < 0x80 || c >= 0xc0)
         {
           // this is the first byte of a character
           if (byte > 1)
             byte = 1;   // dropped the incomplete character
           else
             numc--;     // dropped one complete character beyond the limit
         }
     }

   return pos; // string needs to be truncated at position "pos" (to length "pos")
 }

 #if 0 /* Not currently called anywhere.*/
 /*
  * Run lightValidateUTF8Str() on the buffer and, when it reports that the
  * string has to be truncated, overwrite everything from the truncation
  * point up to in_len with blanks.  The validator's result is returned
  * unchanged; a negative result leaves the buffer untouched.
  */
 int lightValidateUTF8StrAndPad(char *bufr,
                                int in_len,
                                int max_chars,
                                int ignore_trailing_blanks)
 {
   const int keepLen = lightValidateUTF8Str(bufr, in_len, max_chars, ignore_trailing_blanks);

   if (keepLen >= 0)
     {
       // nothing is written when keepLen already equals in_len
       int padPos = keepLen;

       while (padPos < in_len)
         bufr[padPos++] = ' ';
     }

   return keepLen;
 }
 #endif /* Not currently called anywhere.*/

 /*
  * fillWithMinUTF8Chars() - Fill a buffer with the lowest possible UTF8
  * string: NUL characters up to the character limit, then blanks.
  *
  *   bufr      - buffer to fill; must hold at least in_len bytes
  *   in_len    - number of bytes to fill
  *   max_chars - character limit; a value <= 0, or one larger than in_len,
  *               means that the whole buffer is filled with NULs
  *
  * Returns the number of leading bytes that were set to NUL (0 when in_len
  * is not positive, in which case nothing is written).
  */
 int fillWithMinUTF8Chars(char *bufr,
                          int in_len,
                          int max_chars)
 {
   if (in_len <= 0)
     return 0;   // nothing to fill; also keeps a negative size away from memset

   // A NUL takes one byte, so no more than in_len of them fit.  Clamping
   // here keeps the first memset inside the caller's buffer.
   if (max_chars <= 0 || max_chars > in_len)
     max_chars = in_len;

   // fill with minimum characters (NUL), up to the
   // limit of characters
   memset(bufr, 0, max_chars);

   // fill up the remainder with blanks, which is the
   // convention for UTF-8 strings with a character limit
   if (in_len > max_chars)
     memset(&bufr[max_chars], ' ', in_len-max_chars);

   return max_chars;
 }

 /*
  * fillWithMaxUTF8Chars() - Fill a buffer with the highest possible UTF8
  * string that fits in in_len bytes and max_chars characters.
  *
  * As many 4-byte U+10FFFF characters as allowed are stored first.  If the
  * character limit is not reached and 1 to 3 bytes remain, they receive the
  * largest character of exactly that size.  Bytes left over once the
  * character limit is reached are set to blanks.  A max_chars value <= 0
  * means the limit is in_len.
  *
  * Returns the number of bytes occupied by the maximum characters.
  */
 int fillWithMaxUTF8Chars(char *bufr,
                          int in_len,
                          int max_chars)
 {
   // Largest code point that can be encoded in N bytes, indexed by N.
   // Unicode RFC 3629 limits Unicode to values up to U+10FFFF.
   // See http://en.wikipedia.org/wiki/UTF-8
   static const char * const biggest[5] =
     {
       "",                   // (unused)
       "\x7F",               // U+7F
       "\xDF\xBF",           // U+07FF
       "\xEF\xBF\xBF",       // U+FFFF
       "\xF4\x8F\xBF\xBF"    // U+10FFFF
     };

   const int charLimit = (max_chars <= 0) ? in_len : max_chars;
   int written = 0;   // bytes stored so far
   int nChars  = 0;   // 4-byte characters stored so far

   // as many 4-byte characters as the buffer and the limit permit
   while (nChars < in_len/4 && nChars < charLimit)
     {
       int k = 0;

       while (k < 4)
         {
           bufr[written + k] = biggest[4][k];
           k++;
         }
       written += 4;
       nChars++;
     }

   // one shorter character for the remaining bytes, unless the limit was hit
   if (written < in_len && nChars < charLimit)
     {
       const int tail = in_len - written;

       if (tail >= 1 && tail <= 3)
         {
           for (int k = 0; k < tail; k++)
             bufr[written + k] = biggest[tail][k];
         }
       written = in_len;
     }

   // blanks beyond the character limit
   for (int pad = written; pad < in_len; pad++)
     bufr[pad] = ' ';

   return written;
 }

 /* Find the first byte of the ASCII or UTF8 character that contains the
    byte someByteInChar points at (typically the character at the end of
    a buffer).  The search moves backwards over UTF8 continuation bytes
    and never goes below startOfBuffer.
 */
 char * findStartOfChar( char *someByteInChar, char *startOfBuffer )
 {
   char * cursor = someByteInChar ;

   for ( ; cursor > startOfBuffer ; cursor-- )
   {
      // Bytes of the form 10xxxxxx continue a character; anything else
      // (ASCII or a lead byte) starts one.
      if ( ( *cursor & 0xC0 ) != 0x80 )
         break ;
   }
   return cursor ;
 }

 /* A method to do character set conversion , using Glibc iconv.

    Converts the inputlen bytes at inputbuf from srcCharset to targetCharset
    and stores at most outlen bytes at outbuf.

    Returns the number of bytes stored at outbuf, or -1 if the conversion
    descriptor cannot be opened, if iconv() reports an error (for example an
    invalid input sequence or an output buffer that is too small), or if the
    result length does not fit in an int.
 */
 static int charsetConvert(const char *srcCharset,const char *targetCharset,char *inputbuf, size_t inputlen, char *outbuf,size_t outlen)
 {
   // iconv() counts outlen DOWN to the space still unused, so the original
   // capacity is needed to work out how much was actually written.
   const size_t outCapacity = outlen;

   iconv_t cd = iconv_open(targetCharset,srcCharset);

   // iconv_open() signals failure with (iconv_t)-1, not with 0
   if (cd == (iconv_t)-1)
     return -1;

   size_t rc = iconv(cd, &inputbuf, &inputlen, &outbuf, &outlen);

   iconv_close(cd);

   if (rc == (size_t)-1)
     return -1;  //error occurs

   size_t written = outCapacity - outlen;
   if (written > (size_t)INT_MAX)
     return -1;  // cannot be reported through the int return value

   return (int)written;
 }

 /* convert gbk string into UTF8

    gbkString / gbklen - GBK input and its length in bytes
    result / outlen    - output buffer and its size in bytes
    addNullAtEnd       - when true, a terminating NUL byte is stored after
                         the converted text (it is not counted in the result)

    Returns the number of UTF8 bytes stored in result, or -1 if the
    conversion fails or there is no room left for the requested NUL.
 */
 int gbkToUtf8(char* gbkString, size_t gbklen,
               char* result ,size_t outlen, bool addNullAtEnd)
 {
    int finalLength = charsetConvert( "gbk","utf-8", gbkString, gbklen,  result, outlen);

    if (finalLength < 0 )
      return -1;

    if ( addNullAtEnd )
    {
      // the terminator needs one byte beyond the converted text
      if ( (size_t)finalLength >= outlen )
        return -1;

      result[finalLength] = 0;
    }

    return finalLength;
 }