blob: 35af9d5edfdf0ea93e1d02080d5ed4a403339bc7 [file] [log] [blame]
/**********************************************************************
// @@@ START COPYRIGHT @@@
//
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//
// @@@ END COPYRIGHT @@@
**********************************************************************/
//
// This source file contains interface routines to the Open Source
// code character set conversion routines that are coded in C.
//
// NOTE: These routines are coded very generically so that the source
// for them can be used in not only the SQL/MX compiler build,
// but also used by the ODBC build and maybe others.
#include <limits.h>
#include <iconv.h>
#include <stdio.h>
#include <stdlib.h>
#include "multi-byte.h"
#include "fcconv.h"
#include "csconvert.h"
#include "from_GB18030.c"
#include "from_GB2312.c"
#include "from_GBK.c"
#define USE_OUR_MB_WC_DATA_TABLES
#include "UCS_jp_data.c"
#include "UCS_zs_data.c"
#include "UCS_zb_data.c"
#include "UCS_ko_data.c"
#include "mb_iconv.c"
#include "iconv_gen.c"
#include "mb_lconv.c"
#undef USE_OUR_MB_WC_DATA_TABLES
#if 0 // Don't need these since we chose to support GBK (a superset of GB2312) instead
#define CODESET gb2312
#define OUR_CS_GB2312_specific
#define OUR_CS_GBK_specific
#include "mb_lconv.c"
#undef OUR_CS_GBK_specific
#undef OUR_CS_GB2312_specific
#undef CODESET
#endif // Don't need these since we chose to support GBK (a superset of GB2312) instead
#define CODESET gbk
#define OUR_CS_GBK_specific
#define OUR_CS_GB2312_specific
#include "mb_lconv.c"
#undef OUR_CS_GBK_specific
#undef OUR_CS_GB2312_specific
#undef CODESET
#define CODESET gb18030
#define OUR_CS_GB18030_specific
#include "mb_lconv.c"
#undef OUR_CS_GB18030_specific
#undef CODESET
// Leave the enclosing function with CNV_ERR_INVALID_CS unless 'charset'
// is a real selector: not the "unknown" placeholder and not past the last
// table slot.  Expects a variable named 'charset' in scope.
#define ENSURE_VALID_CHARSET() \
{ \
  if ( ! ( (charset != cnv_UnknownCharSet) && \
           (charset <= cnv_Last_Valid_CS) ) ) \
    return( CNV_ERR_INVALID_CS ); \
}
// Leave the enclosing function with CNV_ERR_NOINPUT unless there is an
// input buffer ('in_bufr') holding at least one byte ('in_len').
#define ENSURE_VALID_INPUT() \
{ \
  if ( ! ( (in_bufr != NULL) && (in_len > 0) ) ) \
    return CNV_ERR_NOINPUT; \
}
// Leave the enclosing function with CNV_ERR_BUFFER_OVERRUN unless there is
// an output buffer ('out_bufr') with room for at least one byte ('out_len').
#define ENSURE_VALID_OUTPUT() \
{ \
  if ( ! ( (out_bufr != NULL) && (out_len > 0) ) ) \
    return CNV_ERR_BUFFER_OVERRUN; \
}
// Run the three argument checks in a fixed order: charset first, then the
// input buffer, then the output buffer.  The first failing check returns
// from the enclosing function.
#define CHECK_FOR_SERIOUS_ERRORS() \
{ \
  ENSURE_VALID_CHARSET(); \
  ENSURE_VALID_INPUT(); \
  ENSURE_VALID_OUTPUT(); \
}
// Publish the running character count ('translated_char_cnt') through the
// optional out-parameter 'translated_char_cnt_p'; a NULL pointer is skipped.
#define SET_TRANSLATED_CHAR_CNT() \
{ \
  if ( NULL != translated_char_cnt_p ) \
    *translated_char_cnt_p = (translated_char_cnt) ; \
}
// Publish the number of output bytes produced so far (distance from
// 'out_bufr' to the write cursor 'target') through the optional
// out-parameter 'output_data_len_p'; a NULL pointer is skipped.
#define SET_OUTPUT_DATA_LEN() \
{ \
  if ( NULL != output_data_len_p ) \
    *output_data_len_p = ( (char *)target - (char *)out_bufr ); \
}
// Common prologue of the string conversion routines.  It
//   - points 'first_untranslated_char' at the start of the input,
//   - declares the locals 'translated_char_cnt', 'source' and 'endSource'
//     that the rest of the routine (and the SET_* macros) refer to by name,
//   - reports a character count of zero to the caller right away.
// Because it declares variables it cannot be wrapped in braces.
// The final line deliberately carries NO line-continuation backslash, so
// the macro ends here and cannot swallow whatever line follows it.
#define INITIALIZE_VARIABLES() \
  first_untranslated_char = (char *) in_bufr; \
  unsigned int translated_char_cnt = 0; \
  unsigned char * source = (unsigned char *) in_bufr; \
  unsigned char * endSource = source + in_len ; \
  SET_TRANSLATED_CHAR_CNT();
// Function-pointer types stored in the dispatch tables below.
//
// csc_mbtowc_funcPtr: decodes the multi-byte character at 'ts' (at most
//   'maxlen' bytes) into *pwc.  Callers in this file use the result as the
//   number of input bytes consumed; 0 is treated as 1 and values of
//   0x7FFFFFFF or more are treated as error codes.
typedef size_t (*csc_mbtowc_funcPtr) ( WChar_t *pwc, const char *ts,
size_t maxlen, _LC_charmap_t *hdl ) ;
// csc_input_utfPtr: decodes one character from a UTF input stream, advancing
//   the caller's source pointer; callers treat a negative result as an error.
typedef int (*csc_input_utfPtr) ( _LC_fcconv_iconv_t *, uchar_t **, int ) ;
// csc_wctomb_funcPtr: encodes 'wc' into the buffer 's'; callers use the
//   result as the encoded length and treat a negative result as failure.
typedef int (*csc_wctomb_funcPtr) ( char *s, WChar_t wc,
_LC_charmap_t *hdl ) ;
// csc_output_utfPtr: encodes one UCS4 value into a UTF output buffer of the
//   given size; callers use the result as the byte count, negative on error.
typedef int (*csc_output_utfPtr) ( _LC_fcconv_iconv_t *, uchar_t *,
int, ucs4_t) ;
// Decoder table for the non-UTF multi-byte character sets, indexed by the
// enum cnv_charset value.  A NULL slot means "no mbtowc-style decoder":
// the conversion routines then either special-case ISO88591 or fall back
// to csc_input_utf_ptrs[].  Entry order must stay in step with the enum.
static const csc_mbtowc_funcPtr csc_mbtowc_ptrs[ cnv_Last_Valid_CS + 1] = {
NULL, // cnv_UnknownCharset
NULL, // cnv_UTF8
NULL, // cnv_UTF16,
NULL, // cnv_UTF32,
NULL, // cnv_ISO88591
Our_mbtowc_sjis_ucs4, // See Our_mbtowc_sjis_ucs4() in mb_lconv.c
Our_mbtowc_eucjp_ucs4, // See Our_mbtowc_eucjp_ucs4() in mb_lconv.c
Our_mbtowc_cp949_ucs4, // See Our_mbtowc_cp949_ucs4() in mb_lconv.c
Our_mbtowc_big5_ucs4, // See Our_mbtowc_big5_ucs4() in mb_lconv.c
__mbtowc_gbk_ucs4, // See MBTOWC() in mb_lconv.c
__mbtowc_gb18030_ucs4, // See MBTOWC() in mb_lconv.c
__mbtowc_gbk_ucs4 // See MBTOWC() in mb_lconv.c; last slot reuses the GBK decoder
};
// Decoder table for the UTF encodings, indexed by the enum cnv_charset
// value.  Only the three UTF slots are populated; every other charset is
// decoded through csc_mbtowc_ptrs[] (or inline, for ISO88591).
static const csc_input_utfPtr csc_input_utf_ptrs[ cnv_Last_Valid_CS + 1] = {
NULL, // cnv_UnknownCharset
__input_utf8, // cnv_UTF8
__input_ucs2, // cnv_UTF16,
__input_ucs4, // cnv_UTF32,
NULL, // cnv_ISO88591
// The remaining slots line up with the multi-byte decoders in
// csc_mbtowc_ptrs[] and therefore have no UTF reader.
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
};
// Encoder table for the non-UTF multi-byte character sets, indexed by the
// enum cnv_charset value.  A NULL slot means "no wctomb-style encoder":
// the conversion routines then either special-case ISO88591 or fall back
// to csc_output_utf_ptrs[].  Slot order mirrors csc_mbtowc_ptrs[].
static const csc_wctomb_funcPtr csc_wctomb_ptrs[ cnv_Last_Valid_CS + 1] = {
NULL, // cnv_UnknownCharset
NULL, // cnv_UTF8
NULL, // cnv_UTF16,
NULL, // cnv_UTF32,
NULL, // cnv_ISO88591
Our_wctomb_sjis_ucs4, // Encoder counterpart of Our_mbtowc_sjis_ucs4()
Our_wctomb_eucjp_ucs4, // Encoder counterpart of Our_mbtowc_eucjp_ucs4()
Our_wctomb_cp949_ucs4, // Encoder counterpart of Our_mbtowc_cp949_ucs4()
Our_wctomb_big5_ucs4, // Encoder counterpart of Our_mbtowc_big5_ucs4()
__wctomb_gbk_ucs4, // Encoder counterpart of __mbtowc_gbk_ucs4()
__wctomb_gb18030_ucs4, // Encoder counterpart of __mbtowc_gb18030_ucs4()
__wctomb_gbk_ucs4 // Last slot reuses the GBK encoder
};
// Encoder table for the UTF encodings, indexed by the enum cnv_charset
// value.  Only the three UTF slots are populated; every other charset is
// encoded through csc_wctomb_ptrs[] (or inline, for ISO88591).
static const csc_output_utfPtr csc_output_utf_ptrs[ cnv_Last_Valid_CS + 1] = {
NULL, // cnv_UnknownCharset
__output_utf8, // cnv_UTF8
__output_ucs2, // cnv_UTF16,
__output_ucs4, // cnv_UTF32,
NULL, // cnv_ISO88591
// The remaining slots line up with the multi-byte encoders in
// csc_wctomb_ptrs[] and therefore have no UTF writer.
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
};
//
// LocaleToUTF16() - Convert a string of characters in the specified
// character set to UTF-16.
//
// Parameters:
//   version - must be cnv_version1, otherwise CNV_ERR_INVALID_VERS.
//   in_bufr, in_len - input bytes; NULL or a length <= 0 gives CNV_ERR_NOINPUT.
//   out_bufr, out_len - output area, length in bytes; NULL or a length <= 0
//       gives CNV_ERR_BUFFER_OVERRUN.
//   charset - character set of the input.
//   first_untranslated_char - (out) starts at in_bufr; on an error return from
//       the slow path it points at the character being processed; on normal
//       completion it points just past the last character consumed.
//   output_data_len_p - (out, optional) number of bytes stored in out_bufr.
//   cnv_flags - CNV_REVERSE_OUTBYTES requests byte-swapped 16-bit output.
//   addNullAtEnd_flag - when TRUE a 16-bit zero is appended after the data.
//   translated_char_cnt_p - (out, optional) number of characters converted.
//   max_chars_to_convert - upper bound on characters converted; the value
//       0xFFFFFFFF is replaced by in_len.
//
// Returns 0 on success, or CNV_ERR_INVALID_VERS, CNV_ERR_INVALID_CS,
// CNV_ERR_NOINPUT, CNV_ERR_BUFFER_OVERRUN or CNV_ERR_INVALID_CHAR.
//
int LocaleToUTF16( const enum cnv_version version ,
const char *in_bufr , const int in_len ,
const char *out_bufr , const int out_len ,
enum cnv_charset charset ,
char * & first_untranslated_char ,
unsigned int *output_data_len_p ,
const int cnv_flags ,
const int addNullAtEnd_flag ,
unsigned int * translated_char_cnt_p ,
unsigned int max_chars_to_convert )
{
if ( version != cnv_version1 )
return CNV_ERR_INVALID_VERS;
// Initialize some return values early ... in case we give error
INITIALIZE_VARIABLES();
ucs2_t * target = (ucs2_t *) out_bufr;
// endTarget is computed in whole 16-bit units; an odd trailing byte of
// out_len is never written.
ucs2_t * endTarget = target + ( out_len / sizeof(ucs2_t) );
SET_OUTPUT_DATA_LEN();
CHECK_FOR_SERIOUS_ERRORS();
// We initialize a _LC_fcconv_iconv_rec struct here.
// NOTE: For our purposes, the ONLY thing that
// must be initialized is the flags word.
//
_LC_fcconv_iconv_rec cd;
int revBytes = cnv_flags & CNV_REVERSE_OUTBYTES;
cd.flags = CONV_BOM_WRITTEN | CONV_INPUT_PROCESSED |
(revBytes ? CONV_REVERSE_OUTBYTE : 0);
// 0xFFFFFFFF means "no explicit limit": one character needs at least one
// input byte, so in_len is a safe upper bound.
if ( max_chars_to_convert == 0xFFFFFFFF )
max_chars_to_convert = (unsigned int)in_len ;
//
// Fast path where charset is ISO88591 or a multi-byte charset.
// An assumption made here is that non-ASCII chars will rarely be seen.
// If one is found, we break out of this fast path and go down the
// slow path.
//
int charsetIsWide = 0;
if ( (charset == cnv_UTF16) || (charset == cnv_UTF32) )
charsetIsWide = 1 ;
if ( ! charsetIsWide )
{
unsigned int UCS4 = 0;
// The loop count is the smallest of: free output units, input bytes,
// and the caller's character limit.
int maxLoopCnt = (int)( endTarget - target );
if ( maxLoopCnt > in_len )
maxLoopCnt = in_len ;
if ( maxLoopCnt > (int) max_chars_to_convert )
maxLoopCnt = (int) max_chars_to_convert ;
// Every ISO88591 byte maps directly to the same code point; for the
// other charsets only 7-bit ASCII can be copied without decoding.
unsigned int maxCharToHandle = (charset == cnv_ISO88591) ? 0x0FF : 0x7F;
if ( revBytes )
{
while ( ( --maxLoopCnt >= 0 ) &&
( (UCS4 = *source) <= maxCharToHandle ) )
{
source++;
// Value fits in one byte, so a shift is a complete byte swap.
*target++ = UCS4 << 8;
}
}
else
{
while ( ( --maxLoopCnt >= 0 ) &&
( (UCS4 = *source) <= maxCharToHandle ) )
{
source++;
*target++ = UCS4;
}
}
// One input byte per character on this path.
translated_char_cnt = source - (unsigned char *)in_bufr ;
}
//
// Slower path that handles all locales.
//
// Pick the decoder: a multi-byte decoder if the charset has one, otherwise
// (unless ISO88591, which is handled inline) a UTF reader.
csc_mbtowc_funcPtr inputFuncPtr ;
csc_input_utfPtr inputFuncPtr2 = NULL;
inputFuncPtr = csc_mbtowc_ptrs[ charset ];
if ( ( inputFuncPtr == NULL ) && ( charset != cnv_ISO88591 ) )
{
inputFuncPtr2 = csc_input_utf_ptrs[ charset ];
if ( inputFuncPtr2 == NULL )
return( CNV_ERR_INVALID_CS ); // Shouldn't ever happen ...
}
while ( (source < endSource) &&
(translated_char_cnt < max_chars_to_convert) ) {
int UCS4 = 0; // Init. in loop in case new char longer than prev one.
if ( ( (UCS4 = *source) < 0x080 ) && // If ASCII and
( target < endTarget ) && // output buffer has space yet
( ! charsetIsWide ) )
{
source++ ;
if ( revBytes )
UCS4 <<= 8;
*target++ = UCS4;
translated_char_cnt += 1;
}
else
{
size_t mblen ;
int ct = -1; // Init. - assume an error.
unsigned char * tmpsrc = source;
first_untranslated_char = (char *)source; //...in case char is bad
if ( charset == cnv_ISO88591 ) {
UCS4 = *source;
mblen = 1;
}
else {
if ( inputFuncPtr != NULL )
mblen = (*inputFuncPtr)( (WChar_t *) &UCS4,
(const char *)source, endSource - source,
NULL );
else {
// UTF reader advances tmpsrc; a negative result is an error code.
UCS4 = (*inputFuncPtr2)( &cd, &tmpsrc, endSource - source );
if ( UCS4 < 0 ) mblen = -1;
else {
mblen = tmpsrc - source;
}
}
}
if ( mblen == 0 ) /* mblen==0 when the data starts with '\0' */
mblen = 1;
// mblen is unsigned, so error codes stored in it show up as huge values.
if ( (mblen > 0) && (mblen < 0x7FFFFFFF) ) {
if ( UCS4 < 0x0000D800 ) { // If simple UCS2, just store it!
if ( target < endTarget ) {
if ( revBytes )
UCS4 = ( ( UCS4 & 0x00FF ) << 8 ) | ( UCS4 >> 8 ) ;
*target = UCS4;
ct = 2;
}
else ct = ERR_BUFFER_OVERRUN;
}
else { // Not simple UCS2, so call routine that can handle it
ct = __output_ucs2( &cd, (uchar_t *)target,
(endTarget - target)*sizeof(ucs2_t) , UCS4);
}
}
else
ct = mblen; /* Put error code in ct */
if ( ct < 0 ) {
// About to issue an error, so update return values first.
SET_TRANSLATED_CHAR_CNT();
SET_OUTPUT_DATA_LEN();
if ( ct == ERR_BUFFER_OVERRUN )
return CNV_ERR_BUFFER_OVERRUN;
else
return CNV_ERR_INVALID_CHAR;
}
// ct is a byte count; target advances in 16-bit units.
target += ct/sizeof( ucs2_t ) ;
translated_char_cnt += 1;
source += mblen;
}
}
first_untranslated_char = (char *) source;
SET_TRANSLATED_CHAR_CNT();
int rtnVal = 0;
if ( addNullAtEnd_flag == TRUE ) {
if ( target < endTarget ) {
*target++ = 0; // Store a 16-bit NULL
}
else {
// Data was converted but the terminator does not fit.
rtnVal = CNV_ERR_BUFFER_OVERRUN;
}
}
SET_OUTPUT_DATA_LEN();
return rtnVal;
}
//
// LocaleToUTF8() - Convert a string of characters in the specified
// character set to UTF8.
//
// Parameters:
//   version - must be cnv_version1, otherwise CNV_ERR_INVALID_VERS.
//   in_bufr, in_len - input bytes; NULL or a length <= 0 gives CNV_ERR_NOINPUT.
//   out_bufr, out_len - output area in bytes; NULL or a length <= 0 gives
//       CNV_ERR_BUFFER_OVERRUN.
//   charset - character set of the input.
//   first_untranslated_char - (out) starts at in_bufr; on an error return from
//       the slow path it points at the character being processed; on normal
//       completion it points just past the last character consumed.
//   output_data_len_p - (out, optional) number of bytes stored in out_bufr.
//   addNullAtEnd_flag - when TRUE a single zero byte is appended.
//   translated_char_cnt_p - (out, optional) number of characters converted.
//
// Returns 0 on success, or CNV_ERR_INVALID_VERS, CNV_ERR_INVALID_CS,
// CNV_ERR_NOINPUT, CNV_ERR_BUFFER_OVERRUN or CNV_ERR_INVALID_CHAR.
//
int LocaleToUTF8( const enum cnv_version version ,
const char *in_bufr , const int in_len ,
const char *out_bufr , const int out_len ,
enum cnv_charset charset ,
char * & first_untranslated_char ,
unsigned int *output_data_len_p ,
const int addNullAtEnd_flag ,
unsigned int * translated_char_cnt_p )
{
if ( version != cnv_version1 )
return CNV_ERR_INVALID_VERS;
// Initialize some return values early, in case an error is returned.
INITIALIZE_VARIABLES();
unsigned char * target = (unsigned char *) out_bufr;
unsigned char * endTarget = target + out_len ;
SET_OUTPUT_DATA_LEN();
CHECK_FOR_SERIOUS_ERRORS();
// We initialize a _LC_fcconv_iconv_rec struct here.
// NOTE: For our purposes, the ONLY thing that
// must be initialized is the flags word.
//
_LC_fcconv_iconv_rec cd;
cd.flags = CONV_BOM_WRITTEN | CONV_INPUT_PROCESSED ;
//
// Fast path where charset is ISO88591 or a multi-byte charset.
// An assumption made here is that non-ASCII chars will rarely be seen.
// If one is found, we break out of this fast path and go down the
// slow path.
//
int charsetIsWide = 0;
if ( (charset == cnv_UTF16) || (charset == cnv_UTF32) )
charsetIsWide = 1 ;
if ( ! charsetIsWide )
{
unsigned int UCS4 = 0;
// Bounded by both the free output bytes and the input length.
int maxLoopCnt = endTarget - target ;
if ( maxLoopCnt > in_len )
maxLoopCnt = in_len ;
// Only 7-bit ASCII is byte-identical in UTF-8, even for ISO88591 input.
while ( ( --maxLoopCnt >= 0) && ( (UCS4 = *source) < 0x080 ) )
{
source++ ;
*target++ = UCS4;
}
// One input byte per character on this path.
translated_char_cnt = source - (unsigned char *)in_bufr ;
}
//
// Slower path that handles all locales.
//
// Pick the decoder: a multi-byte decoder if the charset has one, otherwise
// (unless ISO88591, which is handled inline) a UTF reader.
csc_mbtowc_funcPtr inputFuncPtr ;
csc_input_utfPtr inputFuncPtr2 = NULL;
inputFuncPtr = csc_mbtowc_ptrs[ charset ];
if ( ( inputFuncPtr == NULL ) && ( charset != cnv_ISO88591 ) )
{
inputFuncPtr2 = csc_input_utf_ptrs[ charset ];
if ( inputFuncPtr2 == NULL )
return( CNV_ERR_INVALID_CS ); // Shouldn't ever happen ...
}
while ( source < endSource ) {
int UCS4 = 0; // Init. in loop in case new char longer than prev one.
if ( ( (UCS4 = *source) < 0x080 ) && // If ASCII and
( target < endTarget ) && // output buffer has space yet
( ! charsetIsWide ) )
{
source++ ;
*target++ = UCS4;
translated_char_cnt += 1;
}
else
{
size_t mblen ;
int ct = -1; // Init. - assume an error.
unsigned char * tmpsrc = source;
first_untranslated_char = (char *)source; //...in case char is bad
if ( charset == cnv_ISO88591 ) {
UCS4 = *source;
mblen = 1;
}
else if ( inputFuncPtr != NULL )
mblen = (*inputFuncPtr)( (WChar_t *) &UCS4,
(const char *)source, endSource - source,
NULL );
else {
// UTF reader advances tmpsrc; a negative result is an error code.
UCS4 = (*inputFuncPtr2)( &cd, &tmpsrc, endSource - source );
if ( UCS4 < 0 ) mblen = -1;
else mblen = tmpsrc - source;
}
if ( mblen == 0 ) /* mblen==0 when the data starts with '\0' */
mblen = 1;
// mblen is unsigned, so error codes stored in it show up as huge values.
if ( (mblen > 0) && (mblen < 0x7FFFFFFF) )
ct = __output_utf8( &cd, target, endTarget - target , UCS4);
else
ct = mblen; /* Put error code in ct */
if ( ct < 0 ) {
// About to give an error, so update return values first.
SET_TRANSLATED_CHAR_CNT();
SET_OUTPUT_DATA_LEN();
if ( ct == ERR_BUFFER_OVERRUN )
return CNV_ERR_BUFFER_OVERRUN;
else
return CNV_ERR_INVALID_CHAR;
}
source += mblen;
target += ct;
translated_char_cnt += 1;
}
}
first_untranslated_char = (char *) source;
SET_TRANSLATED_CHAR_CNT();
int rtnVal = 0;
if ( addNullAtEnd_flag == TRUE ) {
if ( target < endTarget ) {
*target++ = 0; // Store an 8-bit NULL
}
else {
// Data was converted but the terminator does not fit.
rtnVal = CNV_ERR_BUFFER_OVERRUN;
}
}
SET_OUTPUT_DATA_LEN();
return rtnVal;
}
//
// LocaleCharToUCS4() converts the FIRST char in the input string to its
// UCS4 value. Returns the UCS4 value at location specified AND the
// length of the input character in bytes as the return value.
//
int LocaleCharToUCS4( const char *in_bufr, //Ptr to Input string
const int in_len, //Len of Input string (bytes)
unsigned int *UCS4ptr , //Ptr to output location
enum cnv_charset charset ) //Locale Character Set
{
ENSURE_VALID_CHARSET();
ENSURE_VALID_INPUT();
unsigned char * tmpsrc = (unsigned char *) in_bufr;
size_t mblen ;
int UCS4 = 0;
// We initialize a _LC_fcconv_iconv_rec struct here.
// NOTE: For our purposes, the ONLY thing that
// must be initialized is the flags word.
//
_LC_fcconv_iconv_rec cd;
cd.flags = CONV_BOM_WRITTEN | CONV_INPUT_PROCESSED ;
if ( charset == cnv_ISO88591 ) {
UCS4 = *(unsigned char *)in_bufr;
mblen = 1;
}
else {
csc_mbtowc_funcPtr inputFuncPtr ;
csc_input_utfPtr inputFuncPtr2 ;
inputFuncPtr = csc_mbtowc_ptrs[ charset ];
if ( inputFuncPtr != NULL )
mblen = (*inputFuncPtr)( (WChar_t *) &UCS4,
(const char *)in_bufr, in_len, NULL );
else {
inputFuncPtr2 = csc_input_utf_ptrs[ charset ];
if ( inputFuncPtr2 == NULL )
return( CNV_ERR_INVALID_CS ); // Shouldn't ever happen ...
UCS4 = (*inputFuncPtr2)( &cd, &tmpsrc, in_len );
if ( UCS4 < 0 ) mblen = -1;
else mblen = tmpsrc - (unsigned char *)in_bufr;
}
}
if ( mblen == 0 ) /* mblen==0 when data is starts with '\0' */
mblen = 1;
if ( (mblen > 0) && (mblen < 0x7FFFFFFF) ) {
if ( UCS4ptr != NULL ) *UCS4ptr = UCS4; // Return the UCS4 value
return (mblen);
}
return (CNV_ERR_INVALID_CHAR);
}
//
// UCS4ToLocaleChar() - Encode one UCS4 value in the specified character set
// and store the result in the output buffer.
//
// Parameters:
//   UCS4ptr  - pointer to the code point to encode; NULL gives CNV_ERR_NOINPUT.
//   out_bufr - destination; when NULL nothing is stored but the encoded
//              length is still returned.
//   out_len  - size of out_bufr in bytes.
//   charset  - target character set.
//
// Returns the length of the encoded character in bytes, or
// CNV_ERR_INVALID_CS, CNV_ERR_NOINPUT, CNV_ERR_INVALID_CHAR (the value has no
// representation in the charset) or CNV_ERR_BUFFER_OVERRUN (it does not fit).
//
int UCS4ToLocaleChar( const unsigned int *UCS4ptr , //Ptr to input char
                      const char *out_bufr, //Ptr to output bufr
                      const int out_len, //Len of output bufr
                      enum cnv_charset charset ) //Locale Character Set
{
  char tmpspace[8]; /* big enough to ensure no buffer overflow */

  ENSURE_VALID_CHARSET();

  // Validate the pointer BEFORE reading through it.
  if ( UCS4ptr == NULL )
    return CNV_ERR_NOINPUT;

  char * target = (char *) out_bufr;
  int UCS4 = *UCS4ptr;
  int ct = -1; // assume failure until an encoder reports a length

  // We initialize a _LC_fcconv_iconv_rec struct here.
  // NOTE: For our purposes, the ONLY thing that
  //       must be initialized is the flags word.
  //
  _LC_fcconv_iconv_rec cd;
  cd.flags = CONV_BOM_WRITTEN | CONV_INPUT_PROCESSED ;

  if ( charset == cnv_ISO88591 ) {
    // Compare the unsigned input so that values of 0x80000000 and above
    // are rejected instead of passing as negative ints.
    if ( *UCS4ptr <= 0x0FF ) { /* If valid ISO88591 char */
      tmpspace[0] = UCS4;
      ct = 1;
    }
  }
  else {
    csc_wctomb_funcPtr outputFuncPtr = csc_wctomb_ptrs[ charset ];
    if ( outputFuncPtr != NULL )
      ct = (int) (*outputFuncPtr)( tmpspace, (WChar_t) UCS4,
                                   (_LC_charmap_t *)NULL );
    else {
      csc_output_utfPtr outputFuncPtr2 = csc_output_utf_ptrs[ charset ];
      if ( outputFuncPtr2 == NULL )
        return( CNV_ERR_INVALID_CS ); // Shouldn't ever happen ...
      ct = (*outputFuncPtr2)( &cd, (unsigned char *)(&tmpspace),
                              sizeof(tmpspace), UCS4 );
    }
  }

  if ( ct < 0 ) // If Bad character or conversion error
    return (CNV_ERR_INVALID_CHAR);

  if ( ct > out_len )
    return CNV_ERR_BUFFER_OVERRUN;

  // Encoded into scratch space first so a too-small caller buffer is
  // never written past its end.
  if ( target != NULL ) {
    for ( int idx = 0; idx < ct; idx++ )
      target[idx] = tmpspace[idx];
  }
  return ( ct );
}
//
// csc_get_subst_char() -- Produce the substitution character to emit in
//                         place of an unconvertible one.
//
// Arguments: substitution_char - caller-supplied character, or NULL
//            tmpspace          - where the chosen character is stored
//            charset           - target character set
//
// Rules:
//   - NULL substitution_char: a one-byte '?'.
//   - cnv_UTF16 / cnv_UTF32: the first 2 / 4 bytes of substitution_char,
//     copied verbatim.
//   - any other charset: substitution_char is used when it is a non-empty
//     zero-terminated string of one or two bytes; otherwise '?'.
//
// Return value: Length of the substitution char in bytes
//
static int csc_get_subst_char( const char * substitution_char,
                               char * tmpspace,
                               enum cnv_charset charset )
{
  tmpspace[0] = '?'; // default, overwritten below when appropriate

  if ( substitution_char == NULL )
    return 1;

  if ( ( charset == cnv_UTF16 ) || ( charset == cnv_UTF32 ) ) {
    const int wideLen = ( charset == cnv_UTF16 ) ? 2 : 4;
    for ( int k = 0; k < wideLen; k++ )
      tmpspace[k] = substitution_char[k];
    return wideLen;
  }

  // Empty string: keep '?'.
  if ( substitution_char[0] == 0 )
    return 1;

  // Longer than two bytes: keep '?'.
  if ( ( substitution_char[1] != 0 ) && ( substitution_char[2] != 0 ) )
    return 1;

  tmpspace[0] = substitution_char[0] ;
  tmpspace[1] = substitution_char[1] ;
  return ( tmpspace[1] != 0 ) ? 2 : 1;
}
//
// addVariableLengthNull() -- Append a zero terminator at 'target'.
//
// The terminator needs len_of_NULL bytes of room; if fewer remain before
// endTarget nothing is written and CNV_ERR_BUFFER_OVERRUN is returned.
// A length of 4 stores four zero bytes, any other length of 2 or more
// stores two, and anything smaller stores one.  'target' is advanced past
// the bytes stored.  Returns 0 on success.
//
int addVariableLengthNull( unsigned char * & target,
                           unsigned char * endTarget,
                           int len_of_NULL )
{
  if ( len_of_NULL > (endTarget - target) )
    return CNV_ERR_BUFFER_OVERRUN;

  int zeroBytes = 1;
  if ( len_of_NULL >= 2 )
    zeroBytes = ( len_of_NULL == 4 ) ? 4 : 2;

  while ( zeroBytes-- > 0 )
    *target++ = 0;

  return 0;
}
//
// UTF16ToLocale() - Convert a string of UTF-16 characters
// to the specified character set.
//
int UTF16ToLocale( const enum cnv_version version ,
const char *in_bufr , const int in_len ,
const char *out_bufr , const int out_len ,
enum cnv_charset charset ,
char * & first_untranslated_char ,
unsigned int *output_data_len_p ,
const int cnv_flags ,
const int addNullAtEnd_flag ,
const int allow_invalids ,
unsigned int * translated_char_cnt_p ,
const char *substitution_char )
{
if ( version != cnv_version1 )
return CNV_ERR_INVALID_VERS;
INITIALIZE_VARIABLES();
unsigned char * target = (unsigned char *)out_bufr;
unsigned char * endTarget = target + out_len ;
SET_OUTPUT_DATA_LEN();
CHECK_FOR_SERIOUS_ERRORS();
int len_of_NULL = 1;
int ct = 0;
// We initialize a _LC_fcconv_iconv_rec struct here.
// NOTE: For our purposes, the ONLY thing that
// must be initialized is the flags word.
//
_LC_fcconv_iconv_rec cd;
cd.flags = CONV_INPUT_PROCESSED | CONV_BOM_WRITTEN |
((cnv_flags && CNV_REVERSE_INBYTES) ? CONV_REVERSE_INBYTE : 0);
//
// Fast path where charset is ISO88591 or a multi-byte charset.
// An assumption made here is that non-valid chars will rarely be seen.
// If one is found, we break out of this fast path and go down the
// slow path.
//
int charsetIsWide = 0;
if ( (charset == cnv_UTF16) || (charset == cnv_UTF32) )
charsetIsWide = 1 ;
if ( ( ! charsetIsWide ) &&
( (cnv_flags & CNV_REVERSE_INBYTES) == 0 ) )
{
unsigned int UCS4 = 0;
int maxLoopCnt = endTarget - target ;
if ( maxLoopCnt > (int) ( in_len / sizeof(ucs2_t) ) )
maxLoopCnt = (int) ( in_len / sizeof(ucs2_t) ) ;
unsigned int maxCharToHandle = (charset == cnv_ISO88591) ? 0x0FF : 0x7F;
UCS4 = *( (ucs2_t *)source );
if ( cnv_flags & CNV_REVERSE_INBYTES )
{
while ( --maxLoopCnt >= 0 ) // While more to do
{
UCS4 = *( (ucs2_t *)source ) ;
UCS4 = ( ( UCS4 & 0x00FF ) << 8 ) | ( UCS4 >> 8 ) ;
if ( UCS4 <= maxCharToHandle )
{
source += sizeof(ucs2_t);
*target++ = UCS4;
}
else break;
}
}
else while ( ( --maxLoopCnt >= 0 ) && // While more to do and
( ( UCS4 = *( (ucs2_t *)source ) ) <= maxCharToHandle ) )
{
source += sizeof(ucs2_t);
*target++ = UCS4;
}
translated_char_cnt = target - (unsigned char *)out_bufr ;
}
//
// Slower path that handles all locales.
//
csc_wctomb_funcPtr outputFuncPtr ;
csc_output_utfPtr outputFuncPtr2 = NULL;
outputFuncPtr = csc_wctomb_ptrs[ charset ];
if ( ( outputFuncPtr == NULL ) && ( charset != cnv_ISO88591 ) )
{
outputFuncPtr2 = csc_output_utf_ptrs[ charset ];
if ( outputFuncPtr2 == NULL )
return( CNV_ERR_INVALID_CS ); // Shouldn't ever happen ...
}
while ( source < endSource ) {
unsigned int UCS4 = *((ucs2_t *)source);
if ( cnv_flags & CNV_REVERSE_INBYTES )
UCS4 = ( ( UCS4 & 0x00FF ) << 8 ) | ( UCS4 >> 8 ) ;
if ( ( UCS4 < 0x080 ) && // If ASCII and
( target < endTarget ) && // there is space yet and
( ! charsetIsWide ) ) // output is not wide characters
{
source += sizeof(ucs2_t);
*target++ = UCS4;
translated_char_cnt += 1 ;
}
else
{
char tmpspace[8]; /* big enough to ensure no buffer overflow */
first_untranslated_char = (char *) source; //...in case char is bad
if ( UCS4 < 0xD800 ) // If simple UCS2, use it as already retrieved
source += sizeof(ucs2_t);
else
UCS4 = __input_ucs2( &cd, &source, endSource - source );
ct = -1;
if ( (UCS4 != ERR_INPUT_INCOMPLETE) && (UCS4 != ERR_INVALID_CHAR) ) {
if ( charset == cnv_ISO88591 ) {
if ( UCS4 <= 0x0FF ) { // If valid ISO88591 char
tmpspace[0] = UCS4;
ct = 1;
}
}
else {
if ( outputFuncPtr != NULL )
ct = (*outputFuncPtr)( tmpspace, (WChar_t) UCS4,
(_LC_charmap_t *)NULL);
else {
ct = (*outputFuncPtr2)( &cd, (unsigned char *)(tmpspace),
sizeof(tmpspace), UCS4 );
if ( charset == cnv_UTF16 )
len_of_NULL = 2;
else if ( charset == cnv_UTF32 )
len_of_NULL = 4;
}
}
}
if ( ct < 0 ) { // If Bad character or conversion error
if ( allow_invalids == FALSE ) {
SET_TRANSLATED_CHAR_CNT();
SET_OUTPUT_DATA_LEN();
return (CNV_ERR_INVALID_CHAR);
}
ct = csc_get_subst_char( substitution_char, tmpspace , charset );
if ( (UCS4 == ERR_INPUT_INCOMPLETE) || (UCS4 == ERR_INVALID_CHAR) )
source += 2 ; // Skip bad character
//else source was already incremented by __input_ucs2()
}
if ( ct <= (endTarget - target) ) {
char * tmpPtr = &tmpspace[0];
while (ct-- > 0 )
*target++ = *tmpPtr++;
translated_char_cnt += 1;
}
else {
SET_TRANSLATED_CHAR_CNT();
SET_OUTPUT_DATA_LEN();
return CNV_ERR_BUFFER_OVERRUN;
}
}
}
first_untranslated_char = (char *) source;
SET_TRANSLATED_CHAR_CNT();
int rtnVal = 0;
if ( addNullAtEnd_flag == TRUE ) {
rtnVal = addVariableLengthNull( target, endTarget, len_of_NULL );
}
SET_OUTPUT_DATA_LEN();
return rtnVal;
}
// True when 'firstByte' is the lead byte of a two-byte UTF-8 sequence
// (bit pattern 110xxxxx) and the byte after 'src' is a continuation byte
// (10xxxxxx).  As a side effect the following byte is stored in 'nxtB',
// but only when the lead-byte test has already succeeded.
#define TWO_BYTE_UTF8(firstByte, src, nxtB) \
  ( ( 0xC0 == ( (firstByte) & 0xE0 ) ) && \
    ( 0x80 == ( ( (nxtB) = *( (src) + 1 ) ) & 0xC0 ) ) )
//
// UTF8ToLocale() - Convert a string of UTF8 characters
//                  to the specified character set.
//
// Parameters:
//   version - must be cnv_version1, otherwise CNV_ERR_INVALID_VERS.
//   in_bufr, in_len - UTF-8 input; NULL or a length <= 0 gives CNV_ERR_NOINPUT.
//   out_bufr, out_len - output area in bytes; NULL or a length <= 0 gives
//       CNV_ERR_BUFFER_OVERRUN.
//   charset - target character set.
//   first_untranslated_char - (out) starts at in_bufr; on an error return from
//       the slow path it points at the character being processed; on normal
//       completion it points just past the last character consumed.
//   output_data_len_p - (out, optional) number of bytes stored in out_bufr.
//   addNullAtEnd_flag - when TRUE a terminator is appended whose width
//       matches the target charset (2 for UTF16, 4 for UTF32, else 1).
//   allow_invalids - when FALSE an unconvertible character ends the
//       conversion with CNV_ERR_INVALID_CHAR; otherwise it is replaced by
//       the substitution character.
//   translated_char_cnt_p - (out, optional) number of characters converted.
//   substitution_char - replacement character, see csc_get_subst_char().
//
// Returns 0 on success, or CNV_ERR_INVALID_VERS, CNV_ERR_INVALID_CS,
// CNV_ERR_NOINPUT, CNV_ERR_BUFFER_OVERRUN or CNV_ERR_INVALID_CHAR.
//
int UTF8ToLocale( const enum cnv_version version ,
                  const char *in_bufr , const int in_len ,
                  const char *out_bufr , const int out_len ,
                  enum cnv_charset charset ,
                  char * & first_untranslated_char ,
                  unsigned int *output_data_len_p ,
                  const int addNullAtEnd_flag ,
                  const int allow_invalids ,
                  unsigned int * translated_char_cnt_p ,
                  const char *substitution_char )
{
  if ( version != cnv_version1 )
    return CNV_ERR_INVALID_VERS;

  INITIALIZE_VARIABLES();
  unsigned char * target = (unsigned char *)out_bufr;
  unsigned char * endTarget = target + out_len ;
  SET_OUTPUT_DATA_LEN();

  CHECK_FOR_SERIOUS_ERRORS();

  // The terminator width depends only on the target charset, so decide it
  // up front rather than after the first successful conversion.
  int charsetIsWide = 0;
  int len_of_NULL = 1;
  if ( charset == cnv_UTF16 ) {
    charsetIsWide = 1 ;
    len_of_NULL = 2;
  }
  else if ( charset == cnv_UTF32 ) {
    charsetIsWide = 1 ;
    len_of_NULL = 4;
  }

  int ct = 0;

  // We initialize a _LC_fcconv_iconv_rec struct here.
  // NOTE: For our purposes, the ONLY thing that
  //       must be initialized is the flags word.
  //
  _LC_fcconv_iconv_rec cd;
  cd.flags = CONV_BOM_WRITTEN | CONV_INPUT_PROCESSED ;

  //
  // Fast path where charset is ISO88591 or a multi-byte charset.
  // An assumption made here is that invalid chars will rarely be seen.
  // If one is found, we break out of this fast path and go down the
  // slow path.
  //
  if ( ! charsetIsWide )
  {
    unsigned int UCS4 = 0;
    // maxLoopCnt never exceeds the free output bytes (one byte is written
    // per iteration) nor the input bytes still unread.
    int maxLoopCnt = endTarget - target ;
    if ( maxLoopCnt > in_len )
      maxLoopCnt = in_len ;
    while ( --maxLoopCnt >= 0 )
    {
      // If character is valid ASCII
      if ( (UCS4 = *source) < 0x080 ) {
        source++;
        *target++ = UCS4;
      }
      else
      {
        if (charset != cnv_ISO88591)
          // Let slower path handle the rest of the buffer.
          break;
        int nxtByte = 0;
        // maxLoopCnt > 0 guarantees a second input byte exists.
        if ( ( maxLoopCnt > 0 ) && TWO_BYTE_UTF8( UCS4, source, nxtByte ) )
        {
          // Convert from UTF8 to UCS4.
          UCS4 = (UCS4 & 0x1F) << 6 | ( nxtByte & 0x3F );
          if ( UCS4 > 0x0FF )
            break; // Non-ISO88591. Let slower path handle the rest.
          source += 2 ;
          *target++ = UCS4;
          // Two input bytes were used in one iteration.
          if ( maxLoopCnt > (int) ( endSource - source ) )
            maxLoopCnt-- ; // Ensure we don't overrun input buffer
        }
        else break; // Let slower path handle the rest.
      }
    }
    // One output byte per character on this path.
    translated_char_cnt = target - (unsigned char *)out_bufr ;
  }
  else if ( charset == cnv_UTF16 )
  {
    // ASCII widens to UTF-16 by storing the byte value as a 16-bit unit.
    unsigned int UCS4 = 0;
    while ( ( source < endSource ) &&           // more input and
            ( ( UCS4 = *source ) < 0x080 ) &&   // it is ASCII and
            ( (endTarget - target) >= 2 ) )     // there is space left
    {
      *((ucs2_t *)target) = UCS4;
      source++;
      target += sizeof(ucs2_t);
    }
    // One input byte per character on this path.
    translated_char_cnt = source - (unsigned char *)in_bufr;
  }

  //
  // Slower path that handles all locales.
  //
  csc_wctomb_funcPtr outputFuncPtr ;
  csc_output_utfPtr outputFuncPtr2 = NULL;
  outputFuncPtr = csc_wctomb_ptrs[ charset ];
  if ( ( outputFuncPtr == NULL ) && ( charset != cnv_ISO88591 ) )
  {
    outputFuncPtr2 = csc_output_utf_ptrs[ charset ];
    if ( outputFuncPtr2 == NULL )
      return( CNV_ERR_INVALID_CS ); // Shouldn't ever happen ...
  }

  while ( source < endSource ) {
    unsigned int firstByte = *source;
    if ( ( firstByte < 0x080 ) &&  // If ASCII and
         ( target < endTarget ) && // there is space yet and
         ( ! charsetIsWide ) )     // output is not wide characters
    {
      *target++ = firstByte;
      source++;
      translated_char_cnt += 1;
    }
    else
    {
      char tmpspace[8]; /* big enough to ensure no buffer overflow */
      first_untranslated_char = (char *) source; //...in case char is bad

      int UCS4 = __input_utf8( &cd, &source, endSource - source);
      ct = -1;
      if ( (UCS4 != ERR_INPUT_INCOMPLETE) && (UCS4 != ERR_INVALID_CHAR) ) {
        if ( charset == cnv_ISO88591 ) {
          if ( UCS4 <= 0x0FF ) { // If valid ISO88591 char
            tmpspace[0] = UCS4;
            ct = 1;
          }
        }
        else if ( outputFuncPtr != NULL )
          ct = (*outputFuncPtr)( tmpspace, (WChar_t) UCS4, NULL );
        else
          ct = (*outputFuncPtr2)( &cd, (unsigned char *)(tmpspace),
                                  sizeof(tmpspace), UCS4);
      }

      if ( ct < 0 ) { // If Bad character or conversion error
        if ( allow_invalids == FALSE ) {
          SET_TRANSLATED_CHAR_CNT();
          SET_OUTPUT_DATA_LEN();
          return (CNV_ERR_INVALID_CHAR);
        }
        ct = csc_get_subst_char( substitution_char, tmpspace , charset );
        if ( (UCS4 == ERR_INPUT_INCOMPLETE) || (UCS4 == ERR_INVALID_CHAR) )
          source += 1 ; // Skip bad character
        //else source was already incremented by __input_utf8()
      }

      if ( ct <= (endTarget - target) ) {
        char * tmpPtr = &tmpspace[0];
        translated_char_cnt += 1;
        while (ct-- > 0 )
          *target++ = *tmpPtr++;
      }
      else {
        SET_TRANSLATED_CHAR_CNT();
        SET_OUTPUT_DATA_LEN();
        return CNV_ERR_BUFFER_OVERRUN;
      }
    }
  }

  first_untranslated_char = (char *) source;
  SET_TRANSLATED_CHAR_CNT();

  int rtnVal = 0;
  if ( addNullAtEnd_flag == TRUE ) {
    rtnVal = addVariableLengthNull( target, endTarget, len_of_NULL );
  }
  SET_OUTPUT_DATA_LEN();
  return rtnVal;
}
//
// lightValidateUTF8Str() - Quick structural check of a UTF-8 string with an
// optional limit on the number of characters.
//
// Only the lead-byte / continuation-byte structure is checked; code point
// ranges and overlong forms are not.
//
// Parameters:
//   bufr      - bytes to examine
//   in_len    - number of bytes in bufr
//   max_chars - maximum number of characters allowed; 0 means no limit
//   ignore_trailing_blanks - when non-zero, characters beyond max_chars are
//               tolerated as long as every remaining byte is a blank
//
// Returns -1 for a negative argument or a malformed byte sequence.
// Otherwise returns the number of leading bytes to keep: in_len when the
// whole string is acceptable, or a smaller offset at which the string must
// be truncated (character limit reached, or a character cut off at the end).
//
int lightValidateUTF8Str(const char *bufr,
                         int in_len,
                         int max_chars,
                         int ignore_trailing_blanks)
{
  unsigned char c;
  int pos = 0;
  int numc = 0;
  int maxc = ( max_chars ? max_chars : in_len );
  int byte = 1; // 1 = at a character boundary; n > 1 = n-1 continuation bytes still expected

  if ( (in_len < 0) || (max_chars < 0) ) // Defensive programming: Ensure no memory access exceptions.
    return -1;                           // Shouldn't ever happen, of course.

  while (pos < in_len && numc < maxc)
    {
      c = bufr[pos];

      if (c < 0x80 && byte == 1) // ascii
        numc++;
      else if (c >= 0x80 && c < 0xc0 && byte > 1) // second, third, or fourth byte of a multi-byte sequence
        {
          if (--byte == 1)
            numc++;
        }
      else if (c >= 0xc0 && c < 0xe0 && byte == 1) // start of 2-byte sequence
        byte = 2;
      else if (c >= 0xe0 && c < 0xf0 && byte == 1) // start of 3-byte sequence
        byte = 3;
      else if (c >= 0xf0 && c < 0xfc && byte == 1) // start of 4-byte sequence
        byte = 4;
      else
        return -1; // invalid byte sequence

      pos++;
    }

  if (byte == 1)
    {
      // bufr[0..pos-1] holds only complete characters.  If input is left
      // over, the character limit was reached at 'pos'.
      if (pos < in_len && ignore_trailing_blanks)
        {
          // The surplus is harmless if it consists of blanks only.
          int blankPos = pos;

          while (blankPos < in_len && bufr[blankPos] == ' ')
            blankPos++;
          if (blankPos >= in_len)
            return in_len; // extra chars were all blanks
        }
      return pos; // whole string (pos == in_len) or truncation point
    }

  // The input ended in the middle of a multi-byte character: back up to
  // the lead byte of that partial character.
  while (byte > 1)
    {
      pos--;
      c = bufr[pos];
      if (c < 0x80 || c >= 0xc0)
        byte = 1; // this is the first byte of the partial character
    }

  return pos; // string needs to be truncated at position "pos" (to length "pos")
}
#if 0 /* Not currently called anywhere.*/
// Disabled helper: validates bufr with lightValidateUTF8Str() and, when the
// result is a non-negative truncation point short of in_len, overwrites
// the bytes from that point to the end of the buffer with blanks.
// Returns the value obtained from lightValidateUTF8Str().
int lightValidateUTF8StrAndPad(char *bufr,
int in_len,
int max_chars,
int ignore_trailing_blanks)
{
int trunc = lightValidateUTF8Str(bufr, in_len, max_chars, ignore_trailing_blanks);
if (trunc < in_len && trunc >= 0)
{
for (int i=trunc; i<in_len; i++)
bufr[i] = ' ';
}
return trunc;
}
#endif /* Not currently called anywhere.*/
//
// fillWithMinUTF8Chars() - Fill a buffer with the lowest possible UTF-8
// string of at most max_chars characters.
//
// The lowest character is NUL, which takes one byte, so the first
// max_chars bytes are set to 0 and the rest of the buffer is padded with
// blanks.
//
// Parameters:
//   bufr      - buffer to fill
//   in_len    - size of bufr in bytes
//   max_chars - character limit; <= 0 means "no limit" (use in_len)
//
// Returns the number of NUL bytes stored.  The count is never more than
// in_len, and is 0 when in_len is not positive.
//
int fillWithMinUTF8Chars(char *bufr,
                         int in_len,
                         int max_chars)
{
  if (in_len <= 0)
    return 0; // nothing to fill

  // A character limit larger than the buffer must not make us write
  // past the end of the buffer.
  if (max_chars <= 0 || max_chars > in_len)
    max_chars = in_len;

  // fill with minimum characters (NUL), up to the
  // limit of characters
  memset(bufr, 0, max_chars);

  // fill up the remainder with blanks, which is the
  // convention for UTF-8 strings with a character limit
  if (in_len > max_chars)
    memset(&bufr[max_chars], ' ', in_len-max_chars);

  return max_chars;
}
//
// fillWithMaxUTF8Chars() - Fill a buffer with the highest possible UTF-8
// string of at most max_chars characters.
//
// As many 4-byte U+10FFFF sequences as fit are stored first.  If the
// character limit still allows one more character, the 1 to 3 bytes left
// over receive the highest character of exactly that length.  Bytes that
// cannot be used because the character limit was reached are blanked.
//
// Parameters:
//   bufr      - buffer to fill
//   in_len    - size of bufr in bytes
//   max_chars - character limit; <= 0 means "no limit" (use in_len)
//
// Returns the number of bytes occupied by the characters stored.
//
int fillWithMaxUTF8Chars(char *bufr,
                         int in_len,
                         int max_chars)
{
  // Highest value for each encoded length.  RFC 3629 caps Unicode at
  // U+10FFFF.  See http://en.wikipedia.org/wiki/UTF-8
  const char *top4 = "\xF4\x8F\xBF\xBF"; // U+10FFFF
  const char *top3 = "\xEF\xBF\xBF";     // U+FFFF
  const char *top2 = "\xDF\xBF";         // U+07FF
  const char *top1 = "\x7F";             // U+7F

  const int charLimit = (max_chars <= 0) ? in_len : max_chars;
  int fullChars = 0;

  // Whole 4-byte characters first.
  while (fullChars < in_len/4 && fullChars < charLimit)
    {
      char *slot = bufr + 4*fullChars;

      slot[0] = top4[0];
      slot[1] = top4[1];
      slot[2] = top4[2];
      slot[3] = top4[3];
      fullChars++;
    }

  int filled = 4 * fullChars;

  // One shorter character for the left-over bytes, if still allowed.
  if (filled < in_len && fullChars < charLimit)
    {
      const int tail = in_len - filled;
      const char *seq = NULL;

      if (tail == 3)
        seq = top3;
      else if (tail == 2)
        seq = top2;
      else if (tail == 1)
        seq = top1;

      if (seq != NULL)
        for (int k = 0; k < tail; k++)
          bufr[filled + k] = seq[k];

      filled = in_len;
    }

  // Blank out whatever lies beyond the character limit.
  for (int pad = filled; pad < in_len; pad++)
    bufr[pad] = ' ';

  return filled;
}
/* Find the first byte of the ASCII or UTF-8 character that contains the
   byte at someByteInChar.  Walks backwards over UTF-8 continuation bytes
   (bit pattern 10xxxxxx) and stops at the first byte that is not one, or
   at startOfBuffer, whichever comes first.
 */
char * findStartOfChar( char *someByteInChar, char *startOfBuffer )
{
  char * cursor = someByteInChar ;

  for ( ; cursor > startOfBuffer ; cursor-- )
    {
      if ( ( *cursor & 0xC0 ) != 0x80 )
        break ; // lead byte or ASCII: this is where the character starts
    }
  return cursor ;
}
/* Convert a buffer from one character set to another using iconv.
 *
 *   srcCharset, targetCharset - iconv names of the two encodings
 *   inputbuf, inputlen        - bytes to convert
 *   outbuf, outlen            - destination and its capacity in bytes
 *
 * Returns the number of bytes stored in outbuf, or -1 when the conversion
 * descriptor cannot be opened or iconv() reports an error (invalid or
 * incomplete input, or an output buffer that is too small).
 */
static int charsetConvert(const char *srcCharset,const char *targetCharset,char *inputbuf, size_t inputlen, char *outbuf,size_t outlen)
{
  size_t inLeft = inputlen;
  size_t outLeft = outlen;   // iconv() counts this DOWN as it writes

  iconv_t cd = iconv_open(targetCharset,srcCharset);
  if (cd == (iconv_t) -1)    // iconv_open() signals failure with -1, not 0
    return -1;

  size_t rc = iconv(cd, &inputbuf, &inLeft, &outbuf, &outLeft);
  iconv_close(cd);

  if (rc == (size_t) -1)
    return -1;

  // Bytes produced = capacity minus what iconv() left unused.
  return (int) (outlen - outLeft);
}
/* Convert a GBK string to UTF-8.
 *
 *   gbkString, gbklen - GBK input and its length in bytes
 *   result, outlen    - destination and its capacity in bytes
 *   addNullAtEnd      - when true, a zero byte is stored at the offset
 *                       given by the length obtained from charsetConvert()
 *
 * Returns the length obtained from charsetConvert(), or -1 when the
 * conversion fails or when a terminator was requested and that length is
 * not smaller than the capacity.
 */
int gbkToUtf8(char* gbkString, size_t gbklen,
              char* result ,size_t outlen, bool addNullAtEnd)
{
  const int capacity = outlen;
  const int convertedLen = charsetConvert( "gbk","utf-8", gbkString, gbklen, result, outlen);

  if ( convertedLen == -1 )
    return -1;

  if ( ! addNullAtEnd )
    return convertedLen;

  // The terminator needs one byte beyond the converted data.
  if ( capacity <= convertedLen )
    return -1;

  result[convertedLen] = 0;
  return convertedLen;
}