blob: 3e9417cad72de95b24ca592a764b1a3a595e4741 [file] [log] [blame]
/**********************************************************************
// @@@ START COPYRIGHT @@@
//
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//
// @@@ END COPYRIGHT @@@
**********************************************************************/
#ifndef CSCONVERT_H
#define CSCONVERT_H
//
// This source file and csconvert.cpp contain interface routines to
// the character set conversion routines that are coded in C.
//
// NOTE: These routines are coded very generically so that the source
// for them can be used in not only the SQL/MX compiler build,
// but also used by the ODBC build and maybe others.
/* Interface version passed as the first argument of the conversion
 * routines.  Only version 1 is defined; the enum exists to allow for
 * future expansion. */
enum cnv_version {
  cnv_version1 = 1
};
#if !defined(cnv_charset_DEFINED)
#define cnv_charset_DEFINED
/* Character sets understood by the conversion routines.  The numeric
 * values are spelled out explicitly; cnv_Last_Valid_CS carries the same
 * value as the highest-numbered entry (11). */
enum cnv_charset {
  cnv_UnknownCharSet = 0,
  cnv_UTF8           = 1,
  cnv_UTF16          = 2,
  cnv_UTF32          = 3,
  cnv_ISO88591       = 4,
  cnv_SJIS           = 5,
  cnv_EUCJP          = 6,
  cnv_KSC            = 7,
  cnv_BIG5           = 8,
  cnv_GB2312         = 9,
  cnv_GB18030        = 10,
  cnv_GBK            = 11,
  cnv_Last_Valid_CS  = cnv_GBK
};
#endif /* cnv_charset_DEFINED */
/* Fallback truth constants, supplied only when nothing included earlier
 * has already defined them. */
#if !defined(TRUE)
#  define TRUE 1
#endif
#if !defined(FALSE)
#  define FALSE 0
#endif
//NOTE: The following definitions assume that FALSE = 0 and TRUE <> 0
/*
 * LocaleToUTF8() -- judging by its name, converts text in the given
 * locale character set to UTF-8.
 *
 *   version                  interface version; see enum cnv_version
 *   in_bufr, in_len          input buffer and its length in bytes
 *   out_bufr, out_len        caller-allocated output buffer and its length
 *                            in bytes
 *                            NOTE(review): out_bufr is declared const char *
 *                            although it is the output buffer -- confirm how
 *                            csconvert.cpp writes through it.
 *   charset                  locale character set involved in the conversion
 *   first_untranslated_char  set to the first input position that was not
 *                            processed, or to the end of the input buffer
 *                            when the input was exhausted
 *   output_data_len_p        optional; presumably receives the number of
 *                            bytes placed in out_bufr -- TODO confirm
 *   addNullAtEnd_flag        if TRUE, a terminating NULL is added after the
 *                            valid output data when there is room for it
 *   translated_char_cnt_p    optional; receives the count of successfully
 *                            translated characters (characters, not bytes)
 *
 * Returns 0 for success, otherwise one of the CNV_ERR_* codes.
 *
 * NOTE(review): the NULL defaults rely on the including file having already
 * pulled in a header that defines NULL; no #include is visible here.
 */
int LocaleToUTF8( const enum cnv_version version,
const char *in_bufr, const int in_len,
const char *out_bufr, const int out_len,
enum cnv_charset charset,
char * & first_untranslated_char,
unsigned int *output_data_len_p = NULL ,
const int addNullAtEnd_flag = FALSE,
unsigned int *translated_char_cnt_p = NULL );
/*
 * UTF8ToLocale() -- judging by its name, converts UTF-8 text to the given
 * locale character set.
 *
 *   version                  interface version; see enum cnv_version
 *   in_bufr, in_len          input buffer and its length in bytes
 *   out_bufr, out_len        caller-allocated output buffer and its length
 *                            in bytes
 *                            NOTE(review): out_bufr is declared const char *
 *                            although it is the output buffer -- confirm.
 *   charset                  locale character set involved in the conversion
 *   first_untranslated_char  set to the first input position that was not
 *                            processed, or to the end of the input buffer
 *                            when the input was exhausted
 *   output_data_len_p        optional; presumably receives the number of
 *                            bytes placed in out_bufr -- TODO confirm
 *   addNullAtEnd_flag        if TRUE, a terminating NULL is added after the
 *                            valid output data when there is room for it
 *   allow_invalids           if non-zero, an untranslatable character is
 *                            replaced by the substitution character and
 *                            conversion continues; the whole character is
 *                            skipped if it was valid UTF8, otherwise one
 *                            input byte is skipped
 *   translated_char_cnt_p    optional; receives the count of successfully
 *                            translated characters (characters, not bytes)
 *   substitution_char        optional; points to a 1-byte or 2-byte
 *                            substitution character followed by a zero byte.
 *                            When NULL and allow_invalids is non-zero, '?'
 *                            is used.
 *
 * Returns 0 for success, otherwise one of the CNV_ERR_* codes.
 */
int UTF8ToLocale( const enum cnv_version version,
const char *in_bufr, const int in_len,
const char *out_bufr, const int out_len,
enum cnv_charset charset,
char * & first_untranslated_char,
unsigned int *output_data_len_p = NULL ,
const int addNullAtEnd_flag = FALSE ,
const int allow_invalids = FALSE ,
unsigned int * translated_char_cnt_p = NULL ,
const char *substitution_char = NULL );
/*
 * LocaleToUTF16() -- judging by its name, converts text in the given
 * locale character set to UTF-16.
 *
 *   version                  interface version; see enum cnv_version
 *   in_bufr, in_len          input buffer and its length in bytes
 *   out_bufr, out_len        caller-allocated output buffer and its length
 *                            in bytes
 *                            NOTE(review): out_bufr is declared const char *
 *                            although it is the output buffer -- confirm.
 *   charset                  locale character set involved in the conversion
 *   first_untranslated_char  set to the first input position that was not
 *                            processed (bad character, output buffer full,
 *                            or max_chars_to_convert reached), or to the end
 *                            of the input buffer when the input was exhausted
 *   output_data_len_p        optional; presumably receives the number of
 *                            bytes placed in out_bufr -- TODO confirm
 *   cnv_flags                see CNV_REVERSE_OUTBYTES
 *   addNullAtEnd_flag        if TRUE, a terminating NULL is added after the
 *                            valid output data when there is room for it
 *   translated_char_cnt_p    optional; receives the count of successfully
 *                            translated characters (characters, not bytes)
 *   max_chars_to_convert     caller-specified limit on the conversion;
 *                            defaults to 0xffffffff
 *
 * Returns 0 for success, otherwise one of the CNV_ERR_* codes.
 */
int LocaleToUTF16( const enum cnv_version version,
const char *in_bufr, const int in_len,
const char *out_bufr, const int out_len,
enum cnv_charset charset,
char * & first_untranslated_char,
unsigned int *output_data_len_p = NULL ,
const int cnv_flags = 0 ,
const int addNullAtEnd_flag = FALSE,
unsigned int *translated_char_cnt_p = NULL ,
unsigned int max_chars_to_convert = 0xffffffff);
/*
 * UTF16ToLocale() -- judging by its name, converts UTF-16 text to the
 * given locale character set.
 *
 *   version                  interface version; see enum cnv_version
 *   in_bufr, in_len          input buffer and its length in bytes
 *   out_bufr, out_len        caller-allocated output buffer and its length
 *                            in bytes
 *                            NOTE(review): out_bufr is declared const char *
 *                            although it is the output buffer -- confirm.
 *   charset                  locale character set involved in the conversion
 *   first_untranslated_char  set to the first input position that was not
 *                            processed, or to the end of the input buffer
 *                            when the input was exhausted
 *   output_data_len_p        optional; presumably receives the number of
 *                            bytes placed in out_bufr -- TODO confirm
 *   cnv_flags                see CNV_REVERSE_INBYTES
 *   addNullAtEnd_flag        if TRUE, a terminating NULL is added after the
 *                            valid output data when there is room for it
 *   allow_invalids           if non-zero, an untranslatable character is
 *                            replaced by the substitution character and
 *                            conversion continues; the whole character is
 *                            skipped if it was valid UTF16, otherwise two
 *                            input bytes are skipped
 *   translated_char_cnt_p    optional; receives the count of successfully
 *                            translated characters (characters, not bytes)
 *   substitution_char        optional; points to a 1-byte or 2-byte
 *                            substitution character followed by a zero byte.
 *                            When NULL and allow_invalids is non-zero, '?'
 *                            is used.
 *
 * Returns 0 for success, otherwise one of the CNV_ERR_* codes.
 */
int UTF16ToLocale( const enum cnv_version version,
const char *in_bufr, const int in_len,
const char *out_bufr, const int out_len,
enum cnv_charset charset,
char * & first_untranslated_char,
unsigned int *output_data_len_p = NULL ,
const int cnv_flags = 0 ,
const int addNullAtEnd_flag = FALSE ,
const int allow_invalids = FALSE ,
unsigned int * translated_char_cnt_p = NULL ,
const char *substitution_char = NULL );
/*
 * gbkToUtf8() -- judging by its name, converts a GBK string to UTF-8.
 * This header does not describe its contract; the notes below are
 * assumptions to be confirmed against the implementation.
 *
 *   gbkString, gbklen   presumably the GBK input and its length in bytes
 *   result, outlen      presumably the output buffer and its size in bytes
 *   addNullAtEnd        presumably requests a terminating zero byte after
 *                       the output; note it is a bool defaulted with the
 *                       FALSE macro, unlike the int addNullAtEnd_flag of
 *                       the other routines
 *
 * NOTE(review): the meaning of the int return value is not stated here --
 * confirm whether it is a length or a CNV_ERR_* style code.
 * NOTE(review): size_t is used but no header declaring it is included in
 * this file; the includer must provide it.
 */
int gbkToUtf8(char* gbkString, size_t gbklen,
char* result ,size_t outlen, bool addNullAtEnd=FALSE);
/*
 * LocaleCharToUCS4() converts the FIRST character in the input string to
 * its UCS4 value.  The UCS4 value is stored at the location given by
 * UCS4ptr, and the length in bytes of that input character is the
 * function's return value.
 *
 * NOTE(review): how failures are reported (e.g. a negative CNV_ERR_* value
 * instead of a length) is not stated in this header -- confirm.
 */
int LocaleCharToUCS4( const char *in_bufr, //Ptr to Input string
const int in_len, //Len of Input string (bytes)
unsigned int *UCS4ptr , //Ptr to output location
enum cnv_charset charset ); //Locale Character Set
/*
 * UCS4ToLocaleChar() converts the UCS4 value pointed to by UCS4ptr to the
 * specified character set and stores the resulting character in the
 * output buffer.  The return value is the length in bytes of the output
 * character.
 *
 * NOTE(review): out_bufr is declared const char * although the character
 * is stored into it -- confirm how the implementation writes through it.
 * NOTE(review): how failures are reported is not stated in this header --
 * confirm.
 */
int UCS4ToLocaleChar( const unsigned int *UCS4ptr , //Ptr to input char
const char *out_bufr, //Ptr to output bufr
const int out_len, //Len of output bufr
enum cnv_charset charset ); //Locale Character Set
/*
* For each routine, the return value is 0 for success.
* Otherwise, it is one of the following error codes.
*/
// Error codes returned by the conversion routines.  Each negative value is
// parenthesized so the macro always expands to a single primary expression,
// whatever the surrounding context.
#define CNV_ERR_INVALID_CHAR (-1) // Character in input cannot be converted
#define CNV_ERR_BUFFER_OVERRUN (-2) // No output buffer or not big enough
#define CNV_ERR_NOINPUT (-3) // No input buffer or input cnt <= 0
#define CNV_ERR_INVALID_CS (-4) // Invalid Character Set specified
#define CNV_ERR_INVALID_VERS (-5) // Invalid version specified
#define CNV_ERR_NO_CONVERSION_NEEDED (-6) // Source and target Character Sets are the same
#define CNV_ERR_TARGET_SIZE_INVALID (-7) // Provided target buffer is not large enough
// to handle the conversion
#define CNV_ERR_INVALID_HEAP (-8) // A valid HEAP pointer was not provided
/*
 * For the cnv_flags argument to LocaleToUTF16(), the following flag bit
 * is defined:
 */
#define CNV_REVERSE_OUTBYTES 0x1 // Set this bit when output data must be Big-Endian
// and running on a Little-Endian machine
// or vice versa.
/*
 * For the cnv_flags argument to UTF16ToLocale(), the following flag bit
 * is defined:
 */
#define CNV_REVERSE_INBYTES 0x2 // Set this bit when input data is Big-Endian
// and running on a Little-Endian machine
// or vice versa.
/* NOTES:
*
* All buffer lengths are in BYTES.
*
* The caller is responsible for allocating the output buffer
* and ensuring it is big enough (or dealing with a non-zero
* return value--looping or something--if the output buffer
* isn't big enough.)
*
* The first_untranslated_char pointer will be set to point
* within the input buffer to the first character position
* that was not processed (either because it was a bad
* character OR because the output buffer was full OR because
* the caller-specified maximum (max_chars_to_convert) limit
* was reached). If the caller's input buffer is exhausted
* without returning early, the first_untranslated_char pointer
* will be set to point to the end of the input buffer.
*
* For the cnv_flags arg, see the related #defines above.
*
* All 4 of these interface routines assume that the caller
* will deal with any BOM (Byte-Order-Mark) at the start of
* any file that the input data might be coming from...and
* that the caller will prepend any BOM needed before any
* output data from these routines is put into a file.
* It is anticipated that, if there are any such files, they
* will be in Big-Endian format, although that is up to the
* callers.
*
* The addNullAtEnd_flag, if TRUE, specifies to add a NULL
* (1 or 2 bytes of binary 0) at the end of the valid data in
* the output buffer (provided, of course, that there is
* sufficient room in the output buffer to do so.)
*
* The optional translated_char_cnt pointer argument, if
* supplied, is where the routine returns the count of
* successfully translated characters (whether an error is
* encountered or not.) If not supplied or if NULL is
* supplied, the count is not returned. This is a character
* count, not a byte count.
*
* The allow_invalids flag, if true, results in a substitution
* character (see next paragraph) being put in the output buffer
* whenever UTF16ToLocale() or UTF8ToLocale() encounters a Unicode character
* that it cannot translate to the specified character set.
* For UTF16ToLocale(), after putting the substitution char in
* the output buffer, the routine will keep going after skipping
* the "bad" character in the input buffer. If the "bad"
* character was a valid UTF16 character but just couldn't
* be translated to the specified output locale character
* set, the entire character will be skipped. Otherwise
* two bytes of the input buffer will be skipped.
* For UTF8ToLocale(), after putting the substitution char in
* the output buffer, the routine will keep going after skipping
* the "bad" character in the input buffer. If the "bad"
* character was a valid UTF8 character but just couldn't
* be translated to the specified output locale character
* set, the entire character will be skipped. Otherwise
* one byte of the input buffer will be skipped.
*
* The substitution_char pointer, if not NULL, should point to
* a 1-byte or 2-byte substitution character followed immediately
* by a byte containing a binary 0. See description of
* the allow_invalids flag above. If NULL is specified and the
* allow_invalids flag is non-zero, then the default substitution
* character, namely a ? (question mark), is used.
*/
/*
* Two methods to validate a UTF8 string to ensure that it has
* no partial characters and that it has no more than a given
* number of characters.
*
* Return value:
* negative value:
* The string contains invalid
* UTF-8 or parameter error. Note
* that this light validation routine
* does NOT recognize invalid code points,
* overlong encodings, or UTF-16 surrogate
* pairs encoded as two UTF-8 chars, and
* possibly other problems.
* non-negative value:
* The valid length of the string,
* after removing partial characters
* and extraneous characters
*
* The second method also pads any bytes that are
* truncated with blanks
*/
/*
 * First of the two light UTF-8 validation methods: takes the buffer as
 * const char *.  A negative return value reports invalid UTF-8 or a
 * parameter error; a non-negative return value is the valid length of the
 * string after removing partial and extraneous characters.
 */
int lightValidateUTF8Str(const char *bufr, // ptr to buffer to validate
int in_len, // len in bytes of buffer
int max_chars = 0, // max chars allowed in buffer or 0 for unlimited
int ignore_trailing_blanks = 1); // don't count trailing blanks as chars
/*
 * Second of the two light UTF-8 validation methods: same return value
 * convention as lightValidateUTF8Str(), but the buffer is writable and any
 * bytes that are truncated are padded with blanks.
 */
int lightValidateUTF8StrAndPad(char *bufr, // ptr to buffer to validate and pad
int in_len, // len in bytes of buffer
int max_chars = 0, // presumably max chars allowed or 0 for unlimited, as in lightValidateUTF8Str -- confirm
int ignore_trailing_blanks = 1); // presumably: don't count trailing blanks as chars -- confirm
/* Methods to create the minimum/maximum valid UTF-8 character
string that fits into a given buffer. Used to form low/high keys.
If max_chars is > 0, generates at most max_chars characters and pads
the remaining bytes with blanks. Returns the space occupied
by actual characters, not padding (same as in_len if max_chars == 0).
*/
int fillWithMinUTF8Chars(char *bufr, // buffer to fill (minimum-valued characters)
int in_len, // in bytes
int max_chars); // character limit; see comment above
/* Counterpart of fillWithMinUTF8Chars() that produces the maximum valid
UTF-8 character string fitting into the buffer (used for high keys). */
int fillWithMaxUTF8Chars(char *bufr, // buffer to fill (maximum-valued characters)
int in_len, // in bytes
int max_chars); // character limit, as for fillWithMinUTF8Chars()
/* Selects between the two fill routines: a non-zero is_max forwards to
   fillWithMaxUTF8Chars(), zero forwards to fillWithMinUTF8Chars().  The
   buffer, byte length and character limit are passed through unchanged
   and the callee's result is returned as-is. */
inline int fillWithMinMaxUTF8Chars(char *bufr,
                                   int in_len, // in bytes
                                   int max_chars,
                                   int is_max)
{
  return is_max ? fillWithMaxUTF8Chars(bufr, in_len, max_chars)
                : fillWithMinUTF8Chars(bufr, in_len, max_chars);
}
/* A method to find the beginning of a UTF8 char that is at the end of
a buffer.
   someByteInChar - presumably points at some byte of the character whose
                    first byte is wanted -- confirm against implementation
   startOfBuffer  - presumably the lower bound for the backwards search
                    -- confirm against implementation
   NOTE(review): the result when no start byte is found is not described
   in this header.
*/
char * findStartOfChar( char *someByteInChar, char *startOfBuffer );
#endif /* CSCONVERT_H */