| /**********************************************************************
|
| // @@@ START COPYRIGHT @@@ |
| // |
| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| // |
| // @@@ END COPYRIGHT @@@ |
| **********************************************************************/
|
| #ifndef CSCONVERT_H
|
| #define CSCONVERT_H
|
|
|
| //
|
| // This source file and csconvert.cpp contain interface routines to
|
| // the character set conversion routines that are coded in C.
|
| //
|
| // NOTE: These routines are coded very generically so that their source
|
| // can be used not only in the SQL/MX compiler build but also in
|
| // the ODBC build and possibly others.
|
|
|
| #ifndef NA_EIDPROC // The including build did not supply NA_EIDPROC: fall back to empty definitions.
| // NA_EIDPROC prefixes every prototype in this header; defined empty, it has no effect on them.
| #define NA_EIDPROC
| // _resident is likewise defined empty; it is not referenced anywhere else in this header.
| #define _resident
|
| #endif
|
|
|
| enum cnv_version { cnv_version1 = 1 }; /* Type of the 'version' argument of LocaleToUTF8(), UTF8ToLocale(), LocaleToUTF16() and UTF16ToLocale(); only one value exists, kept for future expansion */
|
|
|
| #ifndef cnv_charset_DEFINED
| // The cnv_charset_DEFINED guard keeps the enum from being declared twice (presumably another header may declare the same enum -- TODO confirm).
| #define cnv_charset_DEFINED
| // Character set identifiers; this is the type of the 'charset' argument of the routines declared in this header.
| enum cnv_charset { cnv_UnknownCharSet = 0, cnv_UTF8 = 1,
| // 0 denotes an unknown character set; the named character sets are numbered consecutively from 1.
| cnv_UTF16 = 2, cnv_UTF32 = 3,
|
| cnv_ISO88591 = 4, cnv_SJIS = 5,
|
| cnv_EUCJP = 6, cnv_KSC = 7,
|
| cnv_BIG5 = 8, cnv_GB2312 = 9,
|
| cnv_GB18030 = 10, cnv_GBK = 11,
| // Same value as the highest enumerator above (cnv_GBK); presumably an upper bound for validity checks, so keep it in step when adding a character set.
| cnv_Last_Valid_CS = 11
|
| };
|
| #endif
|
|
|
| #ifndef TRUE // Supply TRUE only when the including build has not already defined it.
|
| #define TRUE 1
|
| #endif
|
|
|
| #ifndef FALSE // Likewise for FALSE, which is the default value of the flag arguments declared below.
|
| #define FALSE 0
|
| #endif
|
|
|
| //NOTE: The following definitions assume that FALSE = 0 and TRUE <> 0
| // LocaleToUTF8(): by its name, converts data in the given locale character set to UTF-8. Argument details are in the NOTES comment later in this header.
| NA_EIDPROC
| // Returns 0 on success, otherwise one of the CNV_ERR_* codes defined later in this header.
| int LocaleToUTF8( const enum cnv_version version, // interface version; the only value defined is cnv_version1
| // Input buffer and its length in bytes.
| const char *in_bufr, const int in_len,
| // Caller-allocated output buffer and its size in bytes. NOTE(review): declared const char * although it is the output side.
| const char *out_bufr, const int out_len,
| // Locale character set involved in the conversion (presumably that of in_bufr -- TODO confirm).
| enum cnv_charset charset,
| // Set to the first input position that was not processed, or to the end of the input buffer when all of it was consumed.
| char * & first_untranslated_char,
| // Optional; presumably receives the byte length of the converted output (not described in this header -- TODO confirm). NOTE(review): NULL is used here but this header includes nothing that defines it.
| unsigned int *output_data_len_p = NULL ,
| // If TRUE, a terminating binary 0 is added after the valid output data, provided the output buffer has room for it.
| const int addNullAtEnd_flag = FALSE,
| // Optional; if non-NULL, receives the count of successfully translated characters (characters, not bytes), whether or not an error occurred.
| unsigned int *translated_char_cnt_p = NULL );
|
|
|
| NA_EIDPROC // UTF8ToLocale(): by its name, converts UTF-8 data to the given locale character set.
| // Returns 0 on success, otherwise one of the CNV_ERR_* codes defined later in this header.
| int UTF8ToLocale( const enum cnv_version version, // interface version; the only value defined is cnv_version1
| // Input buffer and its length in bytes.
| const char *in_bufr, const int in_len,
| // Caller-allocated output buffer and its size in bytes. NOTE(review): declared const char * although it is the output side.
| const char *out_bufr, const int out_len,
| // Locale character set involved in the conversion (presumably that of the output -- TODO confirm).
| enum cnv_charset charset,
| // Set to the first input position that was not processed, or to the end of the input buffer when all of it was consumed.
| char * & first_untranslated_char,
| // Optional; presumably receives the byte length of the converted output (not described in this header -- TODO confirm).
| unsigned int *output_data_len_p = NULL ,
| // If TRUE, a terminating binary 0 is added after the valid output data, provided the output buffer has room for it.
| const int addNullAtEnd_flag = FALSE ,
| // If non-zero, an untranslatable character is replaced by the substitution character and conversion continues: a valid UTF-8 character is skipped whole, otherwise one input byte is skipped.
| const int allow_invalids = FALSE ,
| // Optional; if non-NULL, receives the count of successfully translated characters (characters, not bytes), whether or not an error occurred.
| unsigned int * translated_char_cnt_p = NULL ,
| // Optional; points to a 1-byte or 2-byte substitution character followed by a binary 0 byte. With NULL and allow_invalids non-zero, '?' is used.
| const char *substitution_char = NULL );
|
|
|
| NA_EIDPROC // LocaleToUTF16(): by its name, converts data in the given locale character set to UTF-16.
| // Returns 0 on success, otherwise one of the CNV_ERR_* codes defined later in this header.
| int LocaleToUTF16( const enum cnv_version version, // interface version; the only value defined is cnv_version1
| // Input buffer and its length in bytes.
| const char *in_bufr, const int in_len,
| // Caller-allocated output buffer and its size in bytes (not UTF-16 units). NOTE(review): declared const char * although it is the output side.
| const char *out_bufr, const int out_len,
| // Locale character set involved in the conversion (presumably that of in_bufr -- TODO confirm).
| enum cnv_charset charset,
| // Set to the first input position that was not processed (bad character, full output buffer, or max_chars_to_convert reached), or to the end of the input buffer.
| char * & first_untranslated_char,
| // Optional; presumably receives the byte length of the converted output (not described in this header -- TODO confirm).
| unsigned int *output_data_len_p = NULL ,
| // Flag bits; CNV_REVERSE_OUTBYTES is the flag this header defines for this routine.
| const int cnv_flags = 0 ,
| // If TRUE, a terminating binary 0 is added after the valid output data, provided the output buffer has room for it.
| const int addNullAtEnd_flag = FALSE,
| // Optional; if non-NULL, receives the count of successfully translated characters (characters, not bytes), whether or not an error occurred.
| unsigned int *translated_char_cnt_p = NULL ,
| // Caller-specified limit on the number of characters converted; the default 0xffffffff presumably means "no limit" -- TODO confirm.
| unsigned int max_chars_to_convert = 0xffffffff);
|
|
|
| NA_EIDPROC // UTF16ToLocale(): by its name, converts UTF-16 data to the given locale character set.
| // Returns 0 on success, otherwise one of the CNV_ERR_* codes defined later in this header.
| int UTF16ToLocale( const enum cnv_version version, // interface version; the only value defined is cnv_version1
| // Input buffer and its length in bytes (not UTF-16 units).
| const char *in_bufr, const int in_len,
| // Caller-allocated output buffer and its size in bytes. NOTE(review): declared const char * although it is the output side.
| const char *out_bufr, const int out_len,
| // Locale character set involved in the conversion (presumably that of the output -- TODO confirm).
| enum cnv_charset charset,
| // Set to the first input position that was not processed, or to the end of the input buffer when all of it was consumed.
| char * & first_untranslated_char,
| // Optional; presumably receives the byte length of the converted output (not described in this header -- TODO confirm).
| unsigned int *output_data_len_p = NULL ,
| // Flag bits; CNV_REVERSE_INBYTES is the flag this header defines for this routine.
| const int cnv_flags = 0 ,
| // If TRUE, a terminating binary 0 is added after the valid output data, provided the output buffer has room for it.
| const int addNullAtEnd_flag = FALSE ,
| // If non-zero, an untranslatable character is replaced by the substitution character and conversion continues: a valid UTF-16 character is skipped whole, otherwise two input bytes are skipped.
| const int allow_invalids = FALSE ,
| // Optional; if non-NULL, receives the count of successfully translated characters (characters, not bytes), whether or not an error occurred.
| unsigned int * translated_char_cnt_p = NULL ,
| // Optional; points to a 1-byte or 2-byte substitution character followed by a binary 0 byte. With NULL and allow_invalids non-zero, '?' is used.
| const char *substitution_char = NULL );
|
|
|
| /*
| * LocaleCharToUCS4() converts the FIRST character of the input string to
| * its UCS4 value.
| *
| * The UCS4 value is stored at the location given by UCS4ptr, and the
| * length in bytes of the input character that was converted is the
| * function's return value.
| * Failure behaviour is not described here (presumably a negative CNV_ERR_* value -- TODO confirm).
| */
|
| NA_EIDPROC
|
| int LocaleCharToUCS4( const char *in_bufr, // Pointer to the input string; only its first character is converted
|
| const int in_len, // Length of the input string, in bytes
|
| unsigned int *UCS4ptr , // Pointer to the location that receives the UCS4 value
|
| enum cnv_charset charset ); // Locale character set of the input string
|
| /*
| * UCS4ToLocaleChar() converts one UCS4 value to the specified character
| * set and stores the resulting character in the caller's output buffer.
| *
| * Returns the length of the output character in bytes.
| * Failure behaviour is not described here (presumably a negative CNV_ERR_* value -- TODO confirm).
| * NOTE(review): out_bufr is declared const char * although the character is stored through it.
| *
| */
|
| NA_EIDPROC
|
| int UCS4ToLocaleChar( const unsigned int *UCS4ptr , // Pointer to the UCS4 value to convert
|
| const char *out_bufr, // Pointer to the output buffer that receives the character
|
| const int out_len, // Length of the output buffer (bytes, per the NOTES comment in this header)
|
| enum cnv_charset charset ); // Locale character set to convert to
|
|
|
| /*
| * Error codes returned by the routines declared in this header.
| * For each routine, the return value is 0 for success.
| * Otherwise, it is one of the following error codes.
| * Exceptions documented in this header: LocaleCharToUCS4() and UCS4ToLocaleChar() return a byte length, and the lightValidateUTF8Str*() routines return a non-negative valid length on success.
| * NOTE(review): the values are bare negative literals; wrapping each in parentheses would be the usual macro hygiene.
| */
|
| #define CNV_ERR_INVALID_CHAR -1 // Character in input cannot be converted
|
| #define CNV_ERR_BUFFER_OVERRUN -2 // No output buffer or not big enough
|
| #define CNV_ERR_NOINPUT -3 // No input buffer or input cnt <= 0
|
| #define CNV_ERR_INVALID_CS -4 // Invalid Character Set specified
|
| #define CNV_ERR_INVALID_VERS -5 // Invalid version specified
|
| #define CNV_ERR_NO_CONVERSION_NEEDED -6 // Source and target Character Sets are the same
| // NOTE(review): how the next code differs from CNV_ERR_BUFFER_OVERRUN is not explained in this header.
| #define CNV_ERR_TARGET_SIZE_INVALID -7 // Provided target buffer is not large enough
|
| // to handle the conversion
|
| #define CNV_ERR_INVALID_HEAP -8 // A valid HEAP pointer was not provided
|
|
|
| /*
| * Flag bit for the cnv_flags argument of LocaleToUTF16().
| * For the cnv_flags argument to LocaleToUTF16(), the following is defined:
| * (cnv_flags defaults to 0, i.e. flag not set.)
| */
|
| #define CNV_REVERSE_OUTBYTES 0x1 // Set when the output data must be Big-Endian
|
| // and the code is running on a Little-Endian machine,
|
| // or vice versa.
|
|
|
| /*
| * Flag bit for the cnv_flags argument of UTF16ToLocale().
| * For the cnv_flags argument to UTF16ToLocale(), the following is defined:
| * (cnv_flags defaults to 0, i.e. flag not set.)
| */
|
| #define CNV_REVERSE_INBYTES 0x2 // Set when the input data is Big-Endian
|
| // and the code is running on a Little-Endian machine,
|
| // or vice versa.
|
|
|
| /* NOTES:
|
| *
|
| * All buffer lengths are in BYTES.
|
| *
|
| * The caller is responsible for allocating the output buffer
|
| * and ensuring it is big enough (or dealing with a non-zero
|
| * return value--looping or something--if the output buffer
|
| * isn't big enough.)
|
| *
|
| * The first_untranslated_char pointer will be set to point
|
| * within the input buffer to the first character position
|
| * that was not processed (either because it was a bad
|
| * character OR because the output buffer was full OR because
|
| * the caller-specified maximum (max_chars_to_convert) limit
|
| * was reached). If the caller's input buffer is exhausted
|
| * without returning early, the first_untranslated_char pointer
|
| * will be set to point to the end of the input buffer.
|
| *
|
| * For the cnv_flags arg, see the related #defines above.
|
| *
|
| * The 4 buffer conversion routines (LocaleToUTF8, UTF8ToLocale, LocaleToUTF16 and UTF16ToLocale) assume that the caller
|
| * will deal with any BOM (Byte-Order-Mark) at the start of
|
| * any file that the input data might be coming from...and
|
| * that the caller will prepend any BOM needed before any
|
| * output data from these routines is put into a file.
|
| * It is anticipated that, if there are any such files, they
|
| * will be in Big-Endian format, although that is up to the
|
| * callers.
|
| *
|
| * The addNullAtEnd_flag, if TRUE, specifies to add a NULL
|
| * (1 or 2 bytes of binary 0) at the end of the valid data in
|
| * the output buffer (provided, of course, that there is
|
| * sufficient room in the output buffer to do so.)
|
| *
|
| * The optional translated_char_cnt pointer argument, if
|
| * supplied, is where the routine returns the count of
|
| * successfully translated characters (whether an error is
|
| * encountered or not.) If not supplied or if NULL is
|
| * supplied, the count is not returned. This is a character
|
| * count, not a byte count.
|
| *
|
| * The allow_invalids flag, if true, results in a substitution
|
| * character (see next paragraph) being put in the output buffer
|
| * whenever UTF16ToLocale() or UTF8ToLocale() encounters a Unicode character
|
| * that it cannot translate to the specified character set.
|
| * For UTF16ToLocale(), after putting the substitution char in
|
| * the output buffer, the routine will keep going after skipping
|
| * the "bad" character in the input buffer. If the "bad"
|
| * character was a valid UTF16 character but just couldn't
|
| * be translated to the specified output locale character
|
| * set, the entire character will be skipped. Otherwise
|
| * two bytes of the input buffer will be skipped.
|
| * For UTF8ToLocale(), after putting the substitution char in
|
| * the output buffer, the routine will keep going after skipping
|
| * the "bad" character in the input buffer. If the "bad"
|
| * character was a valid UTF8 character but just couldn't
|
| * be translated to the specified output locale character
|
| * set, the entire character will be skipped. Otherwise
|
| * one byte of the input buffer will be skipped.
|
| *
|
| * The substitution_char pointer, if not NULL, should point to
|
| * a 1-byte or 2-byte substitution character followed immediately
|
| * by a byte containing a binary 0. See description of
|
| * the allow_invalids flag above. If NULL is specified and the
|
| * allow_invalids flag is non-zero, then the default substitution
|
| * character, namely a ? (question mark), is used.
|
| */
|
|
|
| /*
|
| * Two methods to validate a UTF8 string to ensure that it has
|
| * no partial characters and that it has no more than a given
|
| * number of characters.
|
| *
|
| * Return value:
|
| * negative value:
|
| * The string contains invalid
|
| * UTF-8 or parameter error. Note
|
| * that this light validation routine
|
| * does NOT recognize invalid code points,
|
| * overlong encodings, or UTF-16 surrogate
|
| * pairs encoded as two UTF-8 chars, and
|
| * possibly other problems.
|
| * non-negative value:
|
| * The valid length of the string,
|
| * after removing partial characters
|
| * and extraneous characters
|
| *
|
| * The second method, lightValidateUTF8StrAndPad(), also overwrites
|
| * any bytes that are truncated with blanks
|
| */
|
|
|
| NA_EIDPROC
| // Light UTF-8 validation: returns a negative value for invalid UTF-8 or a parameter error, otherwise the valid length after removing partial and extraneous characters (see the comment block above).
| int lightValidateUTF8Str(const char *bufr, // ptr to buffer to validate (not modified: declared const)
|
| int in_len, // len in bytes of buffer
|
| int max_chars = 0, // max chars allowed in buffer or 0 for unlimited
|
| int ignore_trailing_blanks = 1); // don't count trailing blanks as chars (enabled by default)
|
|
|
| NA_EIDPROC
| // Same validation and return value as lightValidateUTF8Str(), but additionally pads any truncated bytes with blanks (per the comment block above).
| int lightValidateUTF8StrAndPad(char *bufr, // ptr to buffer to validate; non-const, presumably because the padding is written into it
|
| int in_len, // len in bytes of buffer
|
| int max_chars = 0, // max chars allowed in buffer or 0 for unlimited
|
| int ignore_trailing_blanks = 1); // don't count trailing blanks as chars (enabled by default)
|
|
|
| #endif /* CSCONVERT_H */
|