| /* |
| * This file contains public functions for conversion between |
| * client encoding and server (database) encoding. |
| * |
| * Tatsuo Ishii |
| * |
| * $PostgreSQL: pgsql/src/backend/utils/mb/mbutils.c,v 1.59 2006/10/04 00:30:02 momjian Exp $ |
| */ |
| #include "postgres.h" |
| |
| #include "access/xact.h" |
| #include "catalog/catquery.h" |
| #include "catalog/namespace.h" |
| #include "mb/pg_wchar.h" |
| #include "utils/builtins.h" |
| #include "utils/memutils.h" |
| #include "utils/pg_locale.h" |
| #include "utils/syscache.h" |
| |
| /* |
| * When converting strings between different encodings, we assume that space |
| * for converted result is 4-to-1 growth in the worst case. The rate for |
| * currently supported encoding pairs are within 3 (SJIS JIS X0201 half width |
| * kanna -> UTF8 is the worst case). So "4" should be enough for the moment. |
| * |
| * Note that this is not the same as the maximum character width in any |
| * particular encoding. |
| */ |
| #define MAX_CONVERSION_GROWTH 4 |
| |
| /* |
| * We handle for actual FE and BE encoding setting encoding-identificator |
| * and encoding-name too. It prevent searching and conversion from encoding |
| * to encoding name in getdatabaseencoding() and other routines. |
| */ |
| static pg_enc2name *ClientEncoding = &pg_enc2name_tbl[PG_SQL_ASCII]; |
| static pg_enc2name *DatabaseEncoding = &pg_enc2name_tbl[PG_SQL_ASCII]; |
| |
| /* |
| * Caches for conversion function info. These values are allocated in |
| * MbProcContext. That context is a child of TopMemoryContext, |
| * which allows these values to survive across transactions. See |
| * SetClientEncoding() for more details. |
| */ |
| static MemoryContext MbProcContext = NULL; |
| static FmgrInfo *ToServerConvProc = NULL; |
| static FmgrInfo *ToClientConvProc = NULL; |
| |
| /* |
| * During backend startup we can't set client encoding because we (a) |
| * can't look up the conversion functions, and (b) may not know the database |
| * encoding yet either. So SetClientEncoding() just accepts anything and |
| * remembers it for InitializeClientEncoding() to apply later. |
| */ |
| static bool backend_startup_complete = false; |
| static int pending_client_encoding = PG_SQL_ASCII; |
| |
| |
| /* Internal functions */ |
| static char * |
| perform_default_encoding_conversion(const char *src, int len, bool is_client_to_server, |
| int custom_client_encoding, |
| FmgrInfo *custom_encoding_proc); |
| static int cliplen(const char *str, int len, int limit); |
| |
| |
| /* |
| * Set the client encoding and save fmgrinfo for the conversion |
| * function if necessary. Returns 0 if okay, -1 if not (bad encoding |
| * or can't support conversion) |
| */ |
| int |
| SetClientEncoding(int encoding, bool doit) |
| { |
| int current_server_encoding; |
| Oid to_server_proc, |
| to_client_proc; |
| FmgrInfo *to_server; |
| FmgrInfo *to_client; |
| MemoryContext oldcontext; |
| |
| if (!PG_VALID_FE_ENCODING(encoding)) |
| return -1; |
| |
| /* Can't do anything during startup, per notes above */ |
| if (!backend_startup_complete) |
| { |
| if (doit) |
| pending_client_encoding = encoding; |
| return 0; |
| } |
| |
| current_server_encoding = GetDatabaseEncoding(); |
| |
| /* |
| * Check for cases that require no conversion function. |
| */ |
| if (current_server_encoding == encoding || |
| current_server_encoding == PG_SQL_ASCII || |
| encoding == PG_SQL_ASCII) |
| { |
| if (doit) |
| { |
| ClientEncoding = &pg_enc2name_tbl[encoding]; |
| ToServerConvProc = NULL; |
| ToClientConvProc = NULL; |
| if (MbProcContext) |
| MemoryContextReset(MbProcContext); |
| } |
| return 0; |
| } |
| |
| /* |
| * If we're not inside a transaction then we can't do catalog lookups, so |
| * fail. After backend startup, this could only happen if we are |
| * re-reading postgresql.conf due to SIGHUP --- so basically this just |
| * constrains the ability to change client_encoding on the fly from |
| * postgresql.conf. Which would probably be a stupid thing to do anyway. |
| */ |
| if (!IsTransactionState()) |
| return -1; |
| |
| /* |
| * Look up the conversion functions. |
| */ |
| to_server_proc = FindDefaultConversionProc(encoding, |
| current_server_encoding); |
| if (!OidIsValid(to_server_proc)) |
| return -1; |
| to_client_proc = FindDefaultConversionProc(current_server_encoding, |
| encoding); |
| if (!OidIsValid(to_client_proc)) |
| return -1; |
| |
| /* |
| * Done if not wanting to actually apply setting. |
| */ |
| if (!doit) |
| return 0; |
| |
| /* Before loading the new fmgr info, remove the old info, if any */ |
| ToServerConvProc = NULL; |
| ToClientConvProc = NULL; |
| if (MbProcContext != NULL) |
| { |
| MemoryContextReset(MbProcContext); |
| } |
| else |
| { |
| /* |
| * This is the first time through, so create the context. Make it a |
| * child of TopMemoryContext so that these values survive across |
| * transactions. |
| */ |
| MbProcContext = AllocSetContextCreate(TopMemoryContext, |
| "MbProcContext", |
| ALLOCSET_SMALL_MINSIZE, |
| ALLOCSET_SMALL_INITSIZE, |
| ALLOCSET_SMALL_MAXSIZE); |
| } |
| |
| /* Load the fmgr info into MbProcContext */ |
| oldcontext = MemoryContextSwitchTo(MbProcContext); |
| to_server = palloc(sizeof(FmgrInfo)); |
| to_client = palloc(sizeof(FmgrInfo)); |
| fmgr_info(to_server_proc, to_server); |
| fmgr_info(to_client_proc, to_client); |
| MemoryContextSwitchTo(oldcontext); |
| |
| ClientEncoding = &pg_enc2name_tbl[encoding]; |
| ToServerConvProc = to_server; |
| ToClientConvProc = to_client; |
| |
| return 0; |
| } |
| |
| /* |
| * Initialize client encoding if necessary. |
| * called from InitPostgres() once during backend startup. |
| */ |
| void |
| InitializeClientEncoding(void) |
| { |
| Assert(!backend_startup_complete); |
| backend_startup_complete = true; |
| |
| if (SetClientEncoding(pending_client_encoding, true) < 0) |
| { |
| /* |
| * Oops, the requested conversion is not available. We couldn't fail |
| * before, but we can now. |
| */ |
| ereport(FATAL, |
| (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
| errmsg("conversion between %s and %s is not supported", |
| pg_enc2name_tbl[pending_client_encoding].name, |
| GetDatabaseEncodingName()))); |
| } |
| } |
| |
| /* |
| * returns the current client encoding |
| */ |
| int |
| pg_get_client_encoding(void) |
| { |
| Assert(ClientEncoding); |
| return ClientEncoding->encoding; |
| } |
| |
| /* |
| * returns the current client encoding name |
| */ |
| const char * |
| pg_get_client_encoding_name(void) |
| { |
| Assert(ClientEncoding); |
| return ClientEncoding->name; |
| } |
| |
| /* |
| * Apply encoding conversion on src and return it. The encoding |
| * conversion function is chosen from the pg_conversion system catalog |
| * marked as "default". If it is not found in the schema search path, |
| * it's taken from pg_catalog schema. If it even is not in the schema, |
| * warn and return src. |
| * |
| * If conversion occurs, a palloc'd null-terminated string is returned. |
| * In the case of no conversion, src is returned. |
| * |
| * CAUTION: although the presence of a length argument means that callers |
| * can pass non-null-terminated strings, care is required because the same |
| * string will be passed back if no conversion occurs. Such callers *must* |
| * check whether result == src and handle that case differently. |
| * |
| * Note: we try to avoid raising error, since that could get us into |
| * infinite recursion when this function is invoked during error message |
| * sending. It should be OK to raise error for overlength strings though, |
| * since the recursion will come with a shorter message. |
| */ |
| unsigned char * |
| pg_do_encoding_conversion(unsigned char *src, int len, |
| int src_encoding, int dest_encoding) |
| { |
| unsigned char *result; |
| Oid proc; |
| |
| if (!IsTransactionState()) |
| return src; |
| |
| if (src_encoding == dest_encoding) |
| return src; |
| |
| if (src_encoding == PG_SQL_ASCII || dest_encoding == PG_SQL_ASCII) |
| return src; |
| |
| if (len <= 0) |
| return src; |
| |
| proc = FindDefaultConversionProc(src_encoding, dest_encoding); |
| if (!OidIsValid(proc)) |
| { |
| ereport(LOG, |
| (errcode(ERRCODE_UNDEFINED_FUNCTION), |
| errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist", |
| pg_encoding_to_char(src_encoding), |
| pg_encoding_to_char(dest_encoding)))); |
| return src; |
| } |
| |
| /* |
| * XXX we should avoid throwing errors in OidFunctionCall. Otherwise we |
| * are going into infinite loop! So we have to make sure that the |
| * function exists before calling OidFunctionCall. |
| */ |
| /* XXX: would have been function_exists() */ |
| if (!(caql_getcount( |
| NULL, |
| cql("SELECT COUNT(*) FROM pg_proc " |
| " WHERE oid = :1 ", |
| ObjectIdGetDatum(proc))) > 0)) |
| { |
| elog(LOG, "cache lookup failed for function %u", proc); |
| return src; |
| } |
| |
| /* |
| * Allocate space for conversion result, being wary of integer overflow |
| */ |
| if ((Size) len >= (MaxAllocSize / (Size) MAX_CONVERSION_GROWTH)) |
| ereport(ERROR, |
| (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
| errmsg("out of memory"), |
| errdetail("String of %d bytes is too long for encoding conversion.", |
| len))); |
| |
| result = palloc(len * MAX_CONVERSION_GROWTH + 1); |
| |
| OidFunctionCall5(proc, |
| Int32GetDatum(src_encoding), |
| Int32GetDatum(dest_encoding), |
| CStringGetDatum((char *)src), |
| CStringGetDatum((char *)result), |
| Int32GetDatum(len)); |
| return result; |
| } |
| |
| /* |
| * Convert string using encoding_name. The source |
| * encoding is the DB encoding. |
| * |
| * BYTEA convert_to(TEXT string, NAME encoding_name) */ |
| Datum |
| pg_convert_to(PG_FUNCTION_ARGS) |
| { |
| Datum string = PG_GETARG_DATUM(0); |
| Datum dest_encoding_name = PG_GETARG_DATUM(1); |
| Datum src_encoding_name = DirectFunctionCall1(namein, |
| CStringGetDatum(DatabaseEncoding->name)); |
| Datum result; |
| |
| /* |
| * pg_convert expects a bytea as its first argument. We're passing it a |
| * text argument here, relying on the fact that they are both in fact |
| * varlena types, and thus structurally identical. |
| */ |
| result = DirectFunctionCall3(pg_convert, string, |
| src_encoding_name, dest_encoding_name); |
| |
| PG_RETURN_DATUM(result); |
| } |
| |
| /* |
| * Convert string using encoding_name. The destination |
| * encoding is the DB encoding. |
| * |
| * TEXT convert_from(BYTEA string, NAME encoding_name) */ |
| Datum |
| pg_convert_from(PG_FUNCTION_ARGS) |
| { |
| Datum string = PG_GETARG_DATUM(0); |
| Datum src_encoding_name = PG_GETARG_DATUM(1); |
| Datum dest_encoding_name = DirectFunctionCall1(namein, |
| CStringGetDatum(DatabaseEncoding->name)); |
| Datum result; |
| |
| result = DirectFunctionCall3(pg_convert, string, |
| src_encoding_name, dest_encoding_name); |
| |
| /* |
| * pg_convert returns a bytea, which we in turn return as text, relying on |
| * the fact that they are both in fact varlena types, and thus |
| * structurally identical. Although not all bytea values are valid text, |
| * in this case it will be because we've told pg_convert to return one |
| * that is valid as text in the current database encoding. |
| */ |
| PG_RETURN_DATUM(result); |
| } |
| |
| /* |
| * Convert string using encoding_names. |
| * |
| * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name) |
| */ |
| Datum |
| pg_convert(PG_FUNCTION_ARGS) |
| { |
| bytea *string = PG_GETARG_BYTEA_P(0); |
| char *src_encoding_name = NameStr(*PG_GETARG_NAME(1)); |
| int src_encoding = pg_char_to_encoding(src_encoding_name); |
| char *dest_encoding_name = NameStr(*PG_GETARG_NAME(2)); |
| int dest_encoding = pg_char_to_encoding(dest_encoding_name); |
| unsigned char *result; |
| bytea *retval; |
| unsigned char *str; |
| int len; |
| |
| if (src_encoding < 0) |
| ereport(ERROR, |
| (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
| errmsg("invalid source encoding name \"%s\"", |
| src_encoding_name))); |
| if (dest_encoding < 0) |
| ereport(ERROR, |
| (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
| errmsg("invalid destination encoding name \"%s\"", |
| dest_encoding_name))); |
| |
| /* make sure that source string is valid and null terminated */ |
| len = VARSIZE(string) - VARHDRSZ; |
| pg_verify_mbstr(src_encoding, VARDATA(string), len, false); |
| str = palloc(len + 1); |
| memcpy(str, VARDATA(string), len); |
| *(str + len) = '\0'; |
| |
| result = pg_do_encoding_conversion(str, len, src_encoding, dest_encoding); |
| |
| /* |
| * build bytea data type structure. |
| */ |
| len = strlen((char *) result) + VARHDRSZ; |
| retval = palloc(len); |
| SET_VARSIZE(retval, len); |
| memcpy(VARDATA(retval), result, len - VARHDRSZ); |
| |
| if (result != str) |
| pfree(result); |
| pfree(str); |
| |
| /* free memory if allocated by the toaster */ |
| PG_FREE_IF_COPY(string, 0); |
| |
| PG_RETURN_BYTEA_P(retval); |
| } |
| |
| /* |
| * get the length of the string considered as text in the specified |
| * encoding. Raises an error if the data is not valid in that |
| * encoding. |
| * |
| * INT4 length (BYTEA string, NAME src_encoding_name) |
| */ |
| Datum |
| length_in_encoding(PG_FUNCTION_ARGS) |
| { |
| bytea *string = PG_GETARG_BYTEA_P(0); |
| char *src_encoding_name = NameStr(*PG_GETARG_NAME(1)); |
| int src_encoding = pg_char_to_encoding(src_encoding_name); |
| int len = VARSIZE(string) - VARHDRSZ; |
| int retval; |
| |
| if (src_encoding < 0) |
| ereport(ERROR, |
| (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
| errmsg("invalid encoding name \"%s\"", |
| src_encoding_name))); |
| |
| retval = pg_verify_mbstr_len(src_encoding, VARDATA(string), len, false); |
| PG_RETURN_INT32(retval); |
| |
| } |
| |
| /* |
| * Convert string using encoding_name. |
| * |
| * TEXT convert2(TEXT string, NAME src_encoding_name, NAME dest_encoding_name) |
| */ |
| Datum |
| pg_convert2(PG_FUNCTION_ARGS) |
| { |
| text *string = PG_GETARG_TEXT_P(0); |
| char *src_encoding_name = NameStr(*PG_GETARG_NAME(1)); |
| int src_encoding = pg_char_to_encoding(src_encoding_name); |
| char *dest_encoding_name = NameStr(*PG_GETARG_NAME(2)); |
| int dest_encoding = pg_char_to_encoding(dest_encoding_name); |
| unsigned char *result; |
| text *retval; |
| unsigned char *str; |
| int len; |
| |
| if (src_encoding < 0) |
| ereport(ERROR, |
| (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
| errmsg("invalid source encoding name \"%s\"", |
| src_encoding_name))); |
| if (dest_encoding < 0) |
| ereport(ERROR, |
| (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
| errmsg("invalid destination encoding name \"%s\"", |
| dest_encoding_name))); |
| |
| /* make sure that source string is null terminated */ |
| len = VARSIZE(string) - VARHDRSZ; |
| str = palloc(len + 1); |
| memcpy(str, VARDATA(string), len); |
| *(str + len) = '\0'; |
| |
| result = pg_do_encoding_conversion(str, len, src_encoding, dest_encoding); |
| if (result == NULL) |
| elog(ERROR, "encoding conversion failed"); |
| |
| /* |
| * build text data type structure. we cannot use textin() here, since |
| * textin assumes that input string encoding is same as database encoding. |
| */ |
| len = strlen((char *) result) + VARHDRSZ; |
| retval = palloc(len); |
| SET_VARSIZE(retval, len); |
| memcpy(VARDATA(retval), result, len - VARHDRSZ); |
| |
| if (result != str) |
| pfree(result); |
| pfree(str); |
| |
| /* free memory if allocated by the toaster */ |
| PG_FREE_IF_COPY(string, 0); |
| |
| PG_RETURN_TEXT_P(retval); |
| } |
| |
| /* |
| * convert client encoding to server encoding. |
| */ |
| char * |
| pg_client_to_server(const char *s, int len) |
| { |
| Assert(DatabaseEncoding); |
| Assert(ClientEncoding); |
| |
| if (len <= 0) |
| return (char *) s; |
| |
| if (ClientEncoding->encoding == DatabaseEncoding->encoding || |
| ClientEncoding->encoding == PG_SQL_ASCII) |
| { |
| /* |
| * No conversion is needed, but we must still validate the data. |
| */ |
| (void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false); |
| return (char *) s; |
| } |
| |
| if (DatabaseEncoding->encoding == PG_SQL_ASCII) |
| { |
| /* |
| * No conversion is possible, but we must still validate the data, |
| * because the client-side code might have done string escaping using |
| * the selected client_encoding. If the client encoding is ASCII-safe |
| * then we just do a straight validation under that encoding. For an |
| * ASCII-unsafe encoding we have a problem: we dare not pass such data |
| * to the parser but we have no way to convert it. We compromise by |
| * rejecting the data if it contains any non-ASCII characters. |
| */ |
| if (PG_VALID_BE_ENCODING(ClientEncoding->encoding)) |
| (void) pg_verify_mbstr(ClientEncoding->encoding, s, len, false); |
| else |
| { |
| int i; |
| |
| for (i = 0; i < len; i++) |
| { |
| if (s[i] == '\0' || IS_HIGHBIT_SET(s[i])) |
| ereport(ERROR, |
| (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), |
| errmsg("invalid byte value for encoding \"%s\": 0x%02x", |
| pg_enc2name_tbl[PG_SQL_ASCII].name, |
| (unsigned char) s[i]))); |
| } |
| } |
| return (char *) s; |
| } |
| |
| return perform_default_encoding_conversion(s, len, true, -1, NULL); |
| } |
| |
| /* |
| * convert server encoding to client encoding. |
| */ |
| char * |
| pg_server_to_client(const char *s, int len) |
| { |
| Assert(DatabaseEncoding); |
| Assert(ClientEncoding); |
| |
| if (len <= 0) |
| return (char *) s; |
| |
| if (ClientEncoding->encoding == DatabaseEncoding->encoding || |
| ClientEncoding->encoding == PG_SQL_ASCII || |
| DatabaseEncoding->encoding == PG_SQL_ASCII) |
| return (char *) s; /* assume data is valid */ |
| |
| return perform_default_encoding_conversion(s, len, false, -1, NULL); |
| } |
| |
| /* |
| * Perform default encoding conversion using cached FmgrInfo. Since |
| * this function does not access database at all, it is safe to call |
| * outside transactions. Explicit setting client encoding required |
| * before calling this function. Otherwise no conversion is |
| * performed. |
| * |
| * NOTE: this function was slightly updated to allow passing in a source |
| * encoding that is not necessarily ClientEncoding->encoding for client-to- |
| * server conversion. Default value is -1, which means: use ClientEncoding. |
| * See pg_custom_client_to_server for information. |
| */ |
| static char * |
| perform_default_encoding_conversion(const char *src, int len, bool is_client_to_server, |
| int custom_client_encoding, |
| FmgrInfo *custom_encoding_proc) |
| { |
| char *result; |
| int src_encoding, |
| dest_encoding; |
| FmgrInfo *flinfo; |
| |
| if (is_client_to_server) |
| { |
| if(custom_client_encoding == -1) |
| { |
| /* this is the normal path of execution */ |
| src_encoding = ClientEncoding->encoding; |
| dest_encoding = DatabaseEncoding->encoding; |
| flinfo = ToServerConvProc; |
| } |
| else |
| { |
| /* this is the custom path of execution, for external tbl encodings */ |
| src_encoding = custom_client_encoding; |
| dest_encoding = DatabaseEncoding->encoding; |
| flinfo = custom_encoding_proc; |
| } |
| } |
| else |
| { |
| if(custom_client_encoding == -1) |
| { |
| /* this is the normal path of execution */ |
| src_encoding = DatabaseEncoding->encoding; |
| dest_encoding = ClientEncoding->encoding; |
| flinfo = ToClientConvProc; |
| } |
| else |
| { |
| /* this is the custom path of execution, for external tbl encodings */ |
| src_encoding = DatabaseEncoding->encoding; |
| dest_encoding = custom_client_encoding; |
| flinfo = custom_encoding_proc; |
| } |
| } |
| |
| if (flinfo == NULL) |
| return (char *) src; |
| |
| /* |
| * Allocate space for conversion result, being wary of integer overflow |
| */ |
| if ((Size) len >= (MaxAllocSize / (Size) MAX_CONVERSION_GROWTH)) |
| ereport(ERROR, |
| (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
| errmsg("out of memory"), |
| errdetail("String of %d bytes is too long for encoding conversion.", |
| len))); |
| |
| result = palloc(len * MAX_CONVERSION_GROWTH + 1); |
| |
| FunctionCall5(flinfo, |
| Int32GetDatum(src_encoding), |
| Int32GetDatum(dest_encoding), |
| CStringGetDatum((char *) src), |
| CStringGetDatum(result), |
| Int32GetDatum(len)); |
| |
| return result; |
| } |
| |
| |
| |
| #ifdef USE_WIDE_UPPER_LOWER |
| |
| /* |
| * wchar2char --- convert wide characters to multibyte format |
| * |
| * This has the same API as the standard wcstombs() function; in particular, |
| * tolen is the maximum number of bytes to store at *to, and *from must be |
| * zero-terminated. The output will be zero-terminated iff there is room. |
| */ |
| size_t |
| wchar2char(char *to, const wchar_t *from, size_t tolen) |
| { |
| size_t result; |
| |
| if (tolen == 0) |
| return 0; |
| |
| #ifdef WIN32 |
| |
| /* |
| * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and |
| * for some reason mbstowcs and wcstombs won't do this for us, so we use |
| * MultiByteToWideChar(). |
| */ |
| if (GetDatabaseEncoding() == PG_UTF8) |
| { |
| result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen, |
| NULL, NULL); |
| /* A zero return is failure */ |
| if (result <= 0) |
| result = -1; |
| else |
| { |
| Assert(result <= tolen); |
| /* Microsoft counts the zero terminator in the result */ |
| result--; |
| } |
| } |
| else |
| #endif /* WIN32 */ |
| { |
| Assert(!lc_ctype_is_c()); |
| result = wcstombs(to, from, tolen); |
| } |
| return result; |
| } |
| |
| /* |
| * char2wchar --- convert multibyte characters to wide characters |
| * |
| * This has almost the API of mbstowcs(), except that *from need not be |
| * null-terminated; instead, the number of input bytes is specified as |
| * fromlen. Also, we ereport() rather than returning -1 for invalid |
| * input encoding. tolen is the maximum number of wchar_t's to store at *to. |
| * The output will be zero-terminated iff there is room. |
| */ |
| size_t |
| char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen) |
| { |
| size_t result; |
| |
| if (tolen == 0) |
| return 0; |
| |
| #ifdef WIN32 |
| /* See WIN32 "Unicode" comment above */ |
| if (GetDatabaseEncoding() == PG_UTF8) |
| { |
| /* Win32 API does not work for zero-length input */ |
| if (fromlen == 0) |
| result = 0; |
| else |
| { |
| result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1); |
| /* A zero return is failure */ |
| if (result == 0) |
| result = -1; |
| } |
| |
| if (result != -1) |
| { |
| Assert(result < tolen); |
| /* Append trailing null wchar (MultiByteToWideChar() does not) */ |
| to[result] = 0; |
| } |
| } |
| else |
| #endif /* WIN32 */ |
| { |
| /* mbstowcs requires ending '\0' */ |
| char *str = pnstrdup(from, fromlen); |
| |
| Assert(!lc_ctype_is_c()); |
| result = mbstowcs(to, str, tolen); |
| pfree(str); |
| } |
| |
| if (result == -1) |
| { |
| /* |
| * Invalid multibyte character encountered. We try to give a useful |
| * error message by letting pg_verifymbstr check the string. But it's |
| * possible that the string is OK to us, and not OK to mbstowcs --- |
| * this suggests that the LC_CTYPE locale is different from the |
| * database encoding. Give a generic error message if verifymbstr |
| * can't find anything wrong. |
| */ |
| pg_verifymbstr(from, fromlen, false); /* might not return */ |
| /* but if it does ... */ |
| ereport(ERROR, |
| (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), |
| errmsg("invalid multibyte character for locale"), |
| errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding."))); |
| } |
| |
| return result; |
| } |
| #endif |
| |
| /* |
| * pg_custom_client_to_server |
| * |
| * convert client encoding to server encoding, but use the passed in encodings |
| * instead of the global client and server encoding variables. |
| * |
| * This routine is basically a slightly modified version of pg_client_to_server. |
| * Instead of creating this routine a better way may have been to just call |
| * pg_do_encoding_conversion(), which takes in the necessary arguments, however |
| * it does not do several necessary checks that pg_client_to_server() does, and |
| * altering it to have those check may break other parts of the system. Therefore |
| * until there's a better idea we resort to duplicating some code. |
| * |
| * The reason for creating this routine is to let external tables do data |
| * conversion reliably. Since each external table has an encoding attached to |
| * it we'd like to just convert from that encoding to the server encoding without |
| * altering the global client_encoding variable for this local database. |
| */ |
| char * |
| pg_custom_to_server(const char *s, int len, int src_encoding, void *cep) |
| { |
| FmgrInfo *custom_encoding_proc = (FmgrInfo *)cep; |
| |
| Assert(DatabaseEncoding); |
| Assert(ClientEncoding); |
| |
| if (len <= 0) |
| return (char *) s; |
| |
| if (src_encoding == DatabaseEncoding->encoding || |
| src_encoding == PG_SQL_ASCII) |
| { |
| /* |
| * No conversion is needed, but we must still validate the data. |
| */ |
| (void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false); |
| return (char *) s; |
| } |
| |
| if (DatabaseEncoding->encoding == PG_SQL_ASCII) |
| { |
| /* |
| * No conversion is possible, but we must still validate the data, |
| * because the client-side code might have done string escaping using |
| * the selected client_encoding. If the client encoding is ASCII-safe |
| * then we just do a straight validation under that encoding. For an |
| * ASCII-unsafe encoding we have a problem: we dare not pass such data |
| * to the parser but we have no way to convert it. We compromise by |
| * rejecting the data if it contains any non-ASCII characters. |
| */ |
| if (PG_VALID_BE_ENCODING(src_encoding)) |
| (void) pg_verify_mbstr(src_encoding, s, len, false); |
| else |
| { |
| int i; |
| |
| for (i = 0; i < len; i++) |
| { |
| if (s[i] == '\0' || IS_HIGHBIT_SET(s[i])) |
| ereport(ERROR, |
| (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), |
| errmsg("invalid byte value for encoding \"%s\": 0x%02x", |
| pg_enc2name_tbl[PG_SQL_ASCII].name, |
| (unsigned char) s[i]))); |
| } |
| } |
| return (char *) s; |
| } |
| |
| return perform_default_encoding_conversion(s, len, true, src_encoding, custom_encoding_proc); |
| } |
| |
| /* |
| * pg_server_to_custom |
| * |
| * convert server encoding to custom encoding. the reverse of pg_custom_to_server. |
| * see pg_custom_to_server, and perform_default_encoding_conversion headers for |
| * more information. |
| */ |
| char * |
| pg_server_to_custom(const char *s, int len, int dest_encoding, void *cep) |
| { |
| FmgrInfo *custom_encoding_proc = (FmgrInfo *)cep; |
| |
| Assert(DatabaseEncoding); |
| |
| if (len <= 0) |
| return (char *) s; |
| |
| if (dest_encoding == DatabaseEncoding->encoding || |
| dest_encoding == PG_SQL_ASCII || |
| DatabaseEncoding->encoding == PG_SQL_ASCII) |
| return (char *) s; /* assume data is valid */ |
| |
| return perform_default_encoding_conversion(s, len, false, dest_encoding, custom_encoding_proc); |
| } |
| |
| /* convert a multibyte string to a wchar */ |
| int |
| pg_mb2wchar(const char *from, pg_wchar *to) |
| { |
| return (*pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len) ((const unsigned char *) from, to, strlen(from)); |
| } |
| |
| /* convert a multibyte string to a wchar with a limited length */ |
| int |
| pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len) |
| { |
| return (*pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len) ((const unsigned char *) from, to, len); |
| } |
| |
| /* same, with any encoding */ |
| int |
| pg_encoding_mb2wchar_with_len(int encoding, |
| const char *from, pg_wchar *to, int len) |
| { |
| return (*pg_wchar_table[encoding].mb2wchar_with_len) ((const unsigned char *) from, to, len); |
| } |
| |
| /* returns the byte length of a multibyte character */ |
| int |
| pg_mblen(const char *mbstr) |
| { |
| return ((*pg_wchar_table[DatabaseEncoding->encoding].mblen) ((const unsigned char *) mbstr)); |
| } |
| |
| /* returns the display length of a multibyte character */ |
| int |
| pg_dsplen(const char *mbstr) |
| { |
| return ((*pg_wchar_table[DatabaseEncoding->encoding].dsplen) ((const unsigned char *) mbstr)); |
| } |
| |
| /* returns the length (counted in wchars) of a multibyte string */ |
| int |
| pg_mbstrlen(const char *mbstr) |
| { |
| int len = 0; |
| |
| /* optimization for single byte encoding */ |
| if (pg_database_encoding_max_length() == 1) |
| return strlen(mbstr); |
| |
| while (*mbstr) |
| { |
| mbstr += pg_mblen(mbstr); |
| len++; |
| } |
| return len; |
| } |
| |
| /* returns the length (counted in wchars) of a multibyte string |
| * (not necessarily NULL terminated) |
| */ |
| int |
| pg_mbstrlen_with_len(const char *mbstr, int limit) |
| { |
| int len = 0; |
| |
| /* optimization for single byte encoding */ |
| if (pg_database_encoding_max_length() == 1) |
| return limit; |
| |
| while (limit > 0 && *mbstr) |
| { |
| int l = pg_mblen(mbstr); |
| |
| limit -= l; |
| mbstr += l; |
| len++; |
| } |
| return len; |
| } |
| |
| /* |
| * returns the byte length of a multibyte string |
| * (not necessarily NULL terminated) |
| * that is no longer than limit. |
| * this function does not break multibyte character boundary. |
| */ |
| int |
| pg_mbcliplen(const char *mbstr, int len, int limit) |
| { |
| return pg_encoding_mbcliplen(DatabaseEncoding->encoding, mbstr, |
| len, limit); |
| } |
| |
| /* |
| * pg_mbcliplen with specified encoding |
| */ |
| int |
| pg_encoding_mbcliplen(int encoding, const char *mbstr, |
| int len, int limit) |
| { |
| mblen_converter mblen_fn; |
| int clen = 0; |
| int l; |
| |
| /* optimization for single byte encoding */ |
| if (pg_encoding_max_length(encoding) == 1) |
| return cliplen(mbstr, len, limit); |
| |
| mblen_fn = pg_wchar_table[encoding].mblen; |
| |
| while (len > 0 && *mbstr) |
| { |
| l = (*mblen_fn) ((const unsigned char *) mbstr); |
| if ((clen + l) > limit) |
| break; |
| clen += l; |
| if (clen == limit) |
| break; |
| len -= l; |
| mbstr += l; |
| } |
| return clen; |
| } |
| |
| /* |
| * Similar to pg_mbcliplen except the limit parameter specifies the |
| * character length, not the byte length. |
| */ |
| int |
| pg_mbcharcliplen(const char *mbstr, int len, int limit) |
| { |
| int clen = 0; |
| int nch = 0; |
| int l; |
| |
| /* optimization for single byte encoding */ |
| if (pg_database_encoding_max_length() == 1) |
| return cliplen(mbstr, len, limit); |
| |
| while (len > 0 && *mbstr) |
| { |
| l = pg_mblen(mbstr); |
| nch++; |
| if (nch > limit) |
| break; |
| clen += l; |
| len -= l; |
| mbstr += l; |
| } |
| return clen; |
| } |
| |
| /* mbcliplen for any single-byte encoding */ |
| static int |
| cliplen(const char *str, int len, int limit) |
| { |
| int l = 0; |
| |
| len = Min(len, limit); |
| while (l < len && str[l]) |
| l++; |
| return l; |
| } |
| |
| #if defined(ENABLE_NLS) && defined(WIN32) |
| static const struct codeset_map { |
| int encoding; |
| const char *codeset; |
| } codeset_map_array[] = { |
| {PG_UTF8, "UTF-8"}, |
| {PG_LATIN1, "LATIN1"}, |
| {PG_LATIN2, "LATIN2"}, |
| {PG_LATIN3, "LATIN3"}, |
| {PG_LATIN4, "LATIN4"}, |
| {PG_ISO_8859_5, "ISO-8859-5"}, |
| {PG_ISO_8859_6, "ISO_8859-6"}, |
| {PG_ISO_8859_7, "ISO-8859-7"}, |
| {PG_ISO_8859_8, "ISO-8859-8"}, |
| {PG_LATIN5, "LATIN5"}, |
| {PG_LATIN6, "LATIN6"}, |
| {PG_LATIN7, "LATIN7"}, |
| {PG_LATIN8, "LATIN8"}, |
| {PG_LATIN9, "LATIN-9"}, |
| {PG_LATIN10, "LATIN10"}, |
| {PG_KOI8R, "KOI8-R"}, |
| {PG_WIN1250, "CP1250"}, |
| {PG_WIN1251, "CP1251"}, |
| {PG_WIN1252, "CP1252"}, |
| {PG_WIN1253, "CP1253"}, |
| {PG_WIN1254, "CP1254"}, |
| {PG_WIN1255, "CP1255"}, |
| {PG_WIN1256, "CP1256"}, |
| {PG_WIN1257, "CP1257"}, |
| {PG_WIN1258, "CP1258"}, |
| {PG_WIN866, "CP866"}, |
| {PG_WIN874, "CP874"}, |
| {PG_EUC_CN, "EUC-CN"}, |
| {PG_EUC_JP, "EUC-JP"}, |
| {PG_EUC_KR, "EUC-KR"}, |
| {PG_EUC_TW, "EUC-TW"}, |
| {PG_EUC_JIS_2004, "EUC-JP"} |
| }; |
| #endif /* WIN32 */ |
| |
| void |
| SetDatabaseEncoding(int encoding) |
| { |
| if (!PG_VALID_BE_ENCODING(encoding)) |
| elog(ERROR, "invalid database encoding: %d", encoding); |
| |
| DatabaseEncoding = &pg_enc2name_tbl[encoding]; |
| Assert(DatabaseEncoding->encoding == encoding); |
| } |
| |
| /* |
| * Bind gettext to the codeset equivalent with the database encoding. |
| */ |
| void |
| pg_bind_textdomain_codeset(const char *domainname) |
| { |
| #if defined(ENABLE_NLS) |
| int encoding = GetDatabaseEncoding(); |
| int i; |
| |
| /* |
| * gettext() uses the codeset specified by LC_CTYPE by default, so if that |
| * matches the database encoding we don't need to do anything. In CREATE |
| * DATABASE, we enforce or trust that the locale's codeset matches |
| * database encoding, except for the C locale. In C locale, we bind |
| * gettext() explicitly to the right codeset. |
| * |
| * On Windows, though, gettext() tends to get confused so we always bind |
| * it. |
| */ |
| #ifndef WIN32 |
| const char *ctype = setlocale(LC_CTYPE, NULL); |
| |
| if (pg_strcasecmp(ctype, "C") != 0 && pg_strcasecmp(ctype, "POSIX") != 0) |
| return; |
| #endif |
| |
| for (i = 0; pg_enc2gettext_tbl[i].name != NULL; i++) |
| { |
| if (pg_enc2gettext_tbl[i].encoding == encoding) |
| { |
| if (bind_textdomain_codeset(domainname, |
| pg_enc2gettext_tbl[i].name) == NULL) |
| elog(LOG, "bind_textdomain_codeset failed"); |
| break; |
| } |
| } |
| #endif |
| } |
| |
| void |
| SetDefaultClientEncoding(void) |
| { |
| ClientEncoding = &pg_enc2name_tbl[GetDatabaseEncoding()]; |
| } |
| |
| int |
| GetDatabaseEncoding(void) |
| { |
| Assert(DatabaseEncoding); |
| return DatabaseEncoding->encoding; |
| } |
| |
| const char * |
| GetDatabaseEncodingName(void) |
| { |
| Assert(DatabaseEncoding); |
| return DatabaseEncoding->name; |
| } |
| |
| Datum |
| getdatabaseencoding(PG_FUNCTION_ARGS) |
| { |
| Assert(DatabaseEncoding); |
| return DirectFunctionCall1(namein, CStringGetDatum(DatabaseEncoding->name)); |
| } |
| |
| Datum |
| pg_client_encoding(PG_FUNCTION_ARGS) |
| { |
| Assert(ClientEncoding); |
| return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name)); |
| } |