src/backend/utils/mb/mbutils.c - hawq - Git at Google

 /*
  * This file contains public functions for conversion between
  * client encoding and server (database) encoding.
  *
  * Tatsuo Ishii
  *
  * $PostgreSQL: pgsql/src/backend/utils/mb/mbutils.c,v 1.59 2006/10/04 00:30:02 momjian Exp $
  */
 #include "postgres.h"

 #include "access/xact.h"
 #include "catalog/catquery.h"
 #include "catalog/namespace.h"
 #include "mb/pg_wchar.h"
 #include "utils/builtins.h"
 #include "utils/memutils.h"
 #include "utils/pg_locale.h"
 #include "utils/syscache.h"

 /*
  * When converting strings between different encodings, we assume that the
  * converted result grows by at most a factor of 4 in the worst case.  The
  * growth for the currently supported encoding pairs stays within 3 (SJIS
  * JIS X0201 half-width kana -> UTF8 is the worst case), so "4" should be
  * enough for the moment.
  *
  * Note that this is not the same as the maximum character width in any
  * particular encoding.
  */
 #define MAX_CONVERSION_GROWTH  4

 /*
  * For the current frontend (client) and backend (database) encodings we keep
  * a pointer to the matching pg_enc2name_tbl entry, which carries both the
  * encoding identifier and the encoding name.  This avoids having to search
  * for the name from the identifier in getdatabaseencoding() and other
  * routines.  Both pointers initially refer to the PG_SQL_ASCII entry.
  */
 static pg_enc2name *ClientEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
 static pg_enc2name *DatabaseEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];

 /*
  * Caches for conversion function info.  These values are allocated in
  * MbProcContext, which is created as a child of TopMemoryContext so that
  * the cached values survive across transactions.  See SetClientEncoding()
  * for more details.
  *
  * ToServerConvProc and ToClientConvProc are NULL whenever no conversion
  * function is currently cached.
  */
 static MemoryContext MbProcContext = NULL;
 static FmgrInfo *ToServerConvProc = NULL;
 static FmgrInfo *ToClientConvProc = NULL;

 /*
  * During backend startup we can't set the client encoding because we (a)
  * can't look up the conversion functions, and (b) may not know the database
  * encoding yet either.  So SetClientEncoding() just accepts any valid
  * frontend encoding and remembers it in pending_client_encoding for
  * InitializeClientEncoding() to apply later.
  */
 static bool backend_startup_complete = false;
 static int	pending_client_encoding = PG_SQL_ASCII;


 /*
  * Internal functions.
  *
  * perform_default_encoding_conversion() runs a conversion through an
  * already looked-up FmgrInfo; cliplen() is the byte-clipping helper used
  * for single-byte encodings.
  */
 static char *
 perform_default_encoding_conversion(const char *src, int len, bool is_client_to_server,
 									int custom_client_encoding,
 									FmgrInfo *custom_encoding_proc);
 static int	cliplen(const char *str, int len, int limit);


 /*
  * SetClientEncoding
  *
  * Check whether "encoding" is usable as the client encoding and, when
  * "doit" is true, install it.  If a real conversion is required, the
  * FmgrInfo of both conversion functions is cached in MbProcContext.
  *
  * Returns 0 if okay, -1 if not (invalid frontend encoding, or the
  * conversion functions cannot be looked up).
  */
 int
 SetClientEncoding(int encoding, bool doit)
 {
 	int			server_encoding;
 	Oid			server_conv_oid;
 	Oid			client_conv_oid;
 	FmgrInfo   *server_finfo;
 	FmgrInfo   *client_finfo;
 	MemoryContext saved_context;
 	bool		conversion_needed;

 	if (!PG_VALID_FE_ENCODING(encoding))
 		return -1;

 	if (!backend_startup_complete)
 	{
 		/* Too early to do any lookups: just remember the request */
 		if (doit)
 			pending_client_encoding = encoding;
 		return 0;
 	}

 	server_encoding = GetDatabaseEncoding();

 	/*
 	 * Identical encodings, or SQL_ASCII on either side, need no conversion
 	 * function at all.
 	 */
 	conversion_needed = !(server_encoding == encoding ||
 						  server_encoding == PG_SQL_ASCII ||
 						  encoding == PG_SQL_ASCII);

 	if (!conversion_needed)
 	{
 		if (!doit)
 			return 0;

 		ClientEncoding = &pg_enc2name_tbl[encoding];
 		ToServerConvProc = NULL;
 		ToClientConvProc = NULL;
 		if (MbProcContext)
 			MemoryContextReset(MbProcContext);
 		return 0;
 	}

 	/*
 	 * Catalog lookups are impossible outside a transaction, so fail.  After
 	 * backend startup this can only happen while re-reading postgresql.conf
 	 * due to SIGHUP, so it merely limits changing client_encoding on the
 	 * fly from postgresql.conf.
 	 */
 	if (!IsTransactionState())
 		return -1;

 	/* Both directions must have a default conversion function */
 	server_conv_oid = FindDefaultConversionProc(encoding, server_encoding);
 	if (!OidIsValid(server_conv_oid))
 		return -1;

 	client_conv_oid = FindDefaultConversionProc(server_encoding, encoding);
 	if (!OidIsValid(client_conv_oid))
 		return -1;

 	/* Caller only wanted to know whether the setting is acceptable */
 	if (!doit)
 		return 0;

 	/* Drop any previously cached fmgr info before loading the new one */
 	ToServerConvProc = NULL;
 	ToClientConvProc = NULL;
 	if (MbProcContext == NULL)
 	{
 		/*
 		 * First time through: create the context as a child of
 		 * TopMemoryContext so the cached values survive across
 		 * transactions.
 		 */
 		MbProcContext = AllocSetContextCreate(TopMemoryContext,
 											  "MbProcContext",
 											  ALLOCSET_SMALL_MINSIZE,
 											  ALLOCSET_SMALL_INITSIZE,
 											  ALLOCSET_SMALL_MAXSIZE);
 	}
 	else
 		MemoryContextReset(MbProcContext);

 	/* Build the fmgr info inside MbProcContext */
 	saved_context = MemoryContextSwitchTo(MbProcContext);
 	server_finfo = palloc(sizeof(FmgrInfo));
 	client_finfo = palloc(sizeof(FmgrInfo));
 	fmgr_info(server_conv_oid, server_finfo);
 	fmgr_info(client_conv_oid, client_finfo);
 	MemoryContextSwitchTo(saved_context);

 	/* Everything succeeded, so publish the new state */
 	ClientEncoding = &pg_enc2name_tbl[encoding];
 	ToServerConvProc = server_finfo;
 	ToClientConvProc = client_finfo;

 	return 0;
 }

 /*
  * InitializeClientEncoding
  *
  * Mark backend startup as complete and apply the client encoding that
  * SetClientEncoding() remembered earlier.  Called from InitPostgres() once
  * during backend startup.
  */
 void
 InitializeClientEncoding(void)
 {
 	int			status;

 	Assert(!backend_startup_complete);
 	backend_startup_complete = true;

 	status = SetClientEncoding(pending_client_encoding, true);
 	if (status >= 0)
 		return;

 	/*
 	 * The requested conversion is not available.  We could not fail before,
 	 * but we can now.
 	 */
 	ereport(FATAL,
 			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 			 errmsg("conversion between %s and %s is not supported",
 					pg_enc2name_tbl[pending_client_encoding].name,
 					GetDatabaseEncodingName())));
 }

 /*
  * pg_get_client_encoding
  *
  * Return the identifier of the current client encoding.
  */
 int
 pg_get_client_encoding(void)
 {
 	int			current;

 	Assert(ClientEncoding);
 	current = ClientEncoding->encoding;

 	return current;
 }

 /*
  * pg_get_client_encoding_name
  *
  * Return the name of the current client encoding.
  */
 const char *
 pg_get_client_encoding_name(void)
 {
 	const char *current_name;

 	Assert(ClientEncoding);
 	current_name = ClientEncoding->name;

 	return current_name;
 }

 /*
  * pg_do_encoding_conversion
  *
  * Convert "len" bytes at "src" from src_encoding to dest_encoding using the
  * default conversion function found by FindDefaultConversionProc().
  *
  * If a conversion is performed, a palloc'd buffer holding the result is
  * returned.  In every other case (outside a transaction, identical
  * encodings, SQL_ASCII on either side, len <= 0, or no usable conversion
  * function) "src" itself is returned.
  *
  * CAUTION: although the length argument lets callers pass strings that are
  * not null-terminated, the very same pointer comes back when no conversion
  * occurs.  Such callers *must* check whether result == src and handle that
  * case differently.
  *
  * Note: we try to avoid raising errors here, since that could lead to
  * infinite recursion when this function is invoked while sending an error
  * message.  Raising an error for overlength strings should be OK, because
  * the recursion will come with a shorter message.
  */
 unsigned char *
 pg_do_encoding_conversion(unsigned char *src, int len,
 						  int src_encoding, int dest_encoding)
 {
 	Oid			proc;
 	int			nprocs;
 	unsigned char *result;

 	if (!IsTransactionState())
 		return src;

 	/* Cases in which there is nothing to convert */
 	if (src_encoding == dest_encoding ||
 		src_encoding == PG_SQL_ASCII ||
 		dest_encoding == PG_SQL_ASCII ||
 		len <= 0)
 		return src;

 	proc = FindDefaultConversionProc(src_encoding, dest_encoding);
 	if (!OidIsValid(proc))
 	{
 		ereport(LOG,
 				(errcode(ERRCODE_UNDEFINED_FUNCTION),
 				 errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist",
 						pg_encoding_to_char(src_encoding),
 						pg_encoding_to_char(dest_encoding))));
 		return src;
 	}

 	/*
 	 * XXX we should avoid throwing errors in OidFunctionCall, otherwise we
 	 * would loop forever.  So verify that the function exists in pg_proc
 	 * before calling it.
 	 */
 	nprocs = caql_getcount(
 						   NULL,
 						   cql("SELECT COUNT(*) FROM pg_proc "
 							   " WHERE oid = :1 ",
 							   ObjectIdGetDatum(proc)));
 	if (nprocs <= 0)
 	{
 		elog(LOG, "cache lookup failed for function %u", proc);
 		return src;
 	}

 	/* Refuse inputs whose worst-case result could not be allocated */
 	if ((Size) len >= (MaxAllocSize / (Size) MAX_CONVERSION_GROWTH))
 		ereport(ERROR,
 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 				 errmsg("out of memory"),
 				 errdetail("String of %d bytes is too long for encoding conversion.",
 						   len)));

 	/* Worst-case growth plus room for one trailing byte */
 	result = palloc(len * MAX_CONVERSION_GROWTH + 1);

 	OidFunctionCall5(proc,
 					 Int32GetDatum(src_encoding),
 					 Int32GetDatum(dest_encoding),
 					 CStringGetDatum((char *) src),
 					 CStringGetDatum((char *) result),
 					 Int32GetDatum(len));

 	return result;
 }

 /*
  * Convert a string from the database encoding to the named encoding.
  *
  * BYTEA convert_to(TEXT string, NAME encoding_name)
  */
 Datum
 pg_convert_to(PG_FUNCTION_ARGS)
 {
 	Datum		input = PG_GETARG_DATUM(0);
 	Datum		target_name = PG_GETARG_DATUM(1);
 	Datum		db_name;
 	Datum		converted;

 	/* The source encoding is always the database encoding */
 	db_name = DirectFunctionCall1(namein,
 								  CStringGetDatum(DatabaseEncoding->name));

 	/*
 	 * pg_convert expects a bytea as its first argument, yet we hand it a
 	 * text value.  That relies on both being varlena types and therefore
 	 * structurally identical.
 	 */
 	converted = DirectFunctionCall3(pg_convert, input, db_name, target_name);

 	PG_RETURN_DATUM(converted);
 }

 /*
  * Convert string using encoding_name. The destination
  * encoding is the DB encoding.
  *
  * TEXT convert_from(BYTEA string, NAME encoding_name) */
 Datum
 pg_convert_from(PG_FUNCTION_ARGS)
 {
 	Datum		string = PG_GETARG_DATUM(0);
 	Datum		src_encoding_name = PG_GETARG_DATUM(1);
 	Datum		dest_encoding_name = DirectFunctionCall1(namein,
 									CStringGetDatum(DatabaseEncoding->name));
 	Datum		result;

 	result = DirectFunctionCall3(pg_convert, string,
 								 src_encoding_name, dest_encoding_name);

 	/*
 	 * pg_convert returns a bytea, which we in turn return as text, relying on
 	 * the fact that they are both in fact varlena types, and thus
 	 * structurally identical. Although not all bytea values are valid text,
 	 * in this case it will be because we've told pg_convert to return one
 	 * that is valid as text in the current database encoding.
 	 */
 	PG_RETURN_DATUM(result);
 }

 /*
  * Convert a string between two named encodings.
  *
  * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
  *
  * Raises an error if either encoding name is unknown.
  */
 Datum
 pg_convert(PG_FUNCTION_ARGS)
 {
 	bytea	   *string = PG_GETARG_BYTEA_P(0);
 	char	   *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
 	char	   *dest_encoding_name = NameStr(*PG_GETARG_NAME(2));
 	int			src_encoding = pg_char_to_encoding(src_encoding_name);
 	int			dest_encoding = pg_char_to_encoding(dest_encoding_name);
 	unsigned char *input_copy;
 	unsigned char *converted;
 	bytea	   *retval;
 	int			input_len;
 	int			total_len;

 	if (src_encoding < 0)
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 				 errmsg("invalid source encoding name \"%s\"",
 						src_encoding_name)));
 	if (dest_encoding < 0)
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 				 errmsg("invalid destination encoding name \"%s\"",
 						dest_encoding_name)));

 	/* Validate the input, then make a null-terminated working copy */
 	input_len = VARSIZE(string) - VARHDRSZ;
 	pg_verify_mbstr(src_encoding, VARDATA(string), input_len, false);
 	input_copy = palloc(input_len + 1);
 	memcpy(input_copy, VARDATA(string), input_len);
 	input_copy[input_len] = '\0';

 	converted = pg_do_encoding_conversion(input_copy, input_len,
 										  src_encoding, dest_encoding);

 	/* Wrap the converted bytes in a bytea */
 	total_len = strlen((char *) converted) + VARHDRSZ;
 	retval = palloc(total_len);
 	SET_VARSIZE(retval, total_len);
 	memcpy(VARDATA(retval), converted, total_len - VARHDRSZ);

 	/* The converter returns its input pointer when nothing was converted */
 	if (converted != input_copy)
 		pfree(converted);
 	pfree(input_copy);

 	/* free memory if allocated by the toaster */
 	PG_FREE_IF_COPY(string, 0);

 	PG_RETURN_BYTEA_P(retval);
 }

 /*
  * Return the length of the given data when it is interpreted as text in
  * the named encoding; the count comes from pg_verify_mbstr_len().
  *
  * INT4 length (BYTEA string, NAME src_encoding_name)
  *
  * Raises an error if the encoding name is unknown.
  */
 Datum
 length_in_encoding(PG_FUNCTION_ARGS)
 {
 	bytea	   *string = PG_GETARG_BYTEA_P(0);
 	char	   *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
 	int			src_encoding = pg_char_to_encoding(src_encoding_name);
 	int			nbytes;
 	int			nchars;

 	if (src_encoding < 0)
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 				 errmsg("invalid encoding name \"%s\"",
 						src_encoding_name)));

 	nbytes = VARSIZE(string) - VARHDRSZ;
 	nchars = pg_verify_mbstr_len(src_encoding, VARDATA(string), nbytes, false);

 	PG_RETURN_INT32(nchars);
 }

 /*
  * Convert a text value between two named encodings.
  *
  * TEXT convert2(TEXT string, NAME src_encoding_name, NAME dest_encoding_name)
  *
  * Raises an error if either encoding name is unknown.
  */
 Datum
 pg_convert2(PG_FUNCTION_ARGS)
 {
 	text	   *string = PG_GETARG_TEXT_P(0);
 	char	   *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
 	char	   *dest_encoding_name = NameStr(*PG_GETARG_NAME(2));
 	int			src_encoding = pg_char_to_encoding(src_encoding_name);
 	int			dest_encoding = pg_char_to_encoding(dest_encoding_name);
 	unsigned char *input_copy;
 	unsigned char *converted;
 	text	   *retval;
 	int			input_len;
 	int			total_len;

 	if (src_encoding < 0)
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 				 errmsg("invalid source encoding name \"%s\"",
 						src_encoding_name)));
 	if (dest_encoding < 0)
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 				 errmsg("invalid destination encoding name \"%s\"",
 						dest_encoding_name)));

 	/* Make a null-terminated working copy of the input */
 	input_len = VARSIZE(string) - VARHDRSZ;
 	input_copy = palloc(input_len + 1);
 	memcpy(input_copy, VARDATA(string), input_len);
 	input_copy[input_len] = '\0';

 	converted = pg_do_encoding_conversion(input_copy, input_len,
 										  src_encoding, dest_encoding);
 	if (converted == NULL)
 		elog(ERROR, "encoding conversion failed");

 	/*
 	 * Build the text value by hand.  textin() cannot be used here, because
 	 * it assumes its input is in the database encoding.
 	 */
 	total_len = strlen((char *) converted) + VARHDRSZ;
 	retval = palloc(total_len);
 	SET_VARSIZE(retval, total_len);
 	memcpy(VARDATA(retval), converted, total_len - VARHDRSZ);

 	/* The converter returns its input pointer when nothing was converted */
 	if (converted != input_copy)
 		pfree(converted);
 	pfree(input_copy);

 	/* free memory if allocated by the toaster */
 	PG_FREE_IF_COPY(string, 0);

 	PG_RETURN_TEXT_P(retval);
 }

 /*
  * pg_client_to_server
  *
  * Convert "len" bytes at "s" from the client encoding to the server
  * (database) encoding.  Returns "s" itself when no conversion takes place;
  * the data is still validated in that case.
  */
 char *
 pg_client_to_server(const char *s, int len)
 {
 	int			client_enc;
 	int			server_enc;

 	Assert(DatabaseEncoding);
 	Assert(ClientEncoding);

 	if (len <= 0)
 		return (char *) s;

 	client_enc = ClientEncoding->encoding;
 	server_enc = DatabaseEncoding->encoding;

 	if (client_enc == server_enc || client_enc == PG_SQL_ASCII)
 	{
 		/* No conversion is needed, but the data must still be validated */
 		(void) pg_verify_mbstr(server_enc, s, len, false);
 		return (char *) s;
 	}

 	/* The ordinary case: run the cached default conversion */
 	if (server_enc != PG_SQL_ASCII)
 		return perform_default_encoding_conversion(s, len, true, -1, NULL);

 	/*
 	 * The database is SQL_ASCII, so no conversion is possible.  We must
 	 * still validate the data, because the client-side code might have done
 	 * string escaping using the selected client_encoding.  If the client
 	 * encoding is ASCII-safe we simply validate under that encoding.  For an
 	 * ASCII-unsafe encoding we dare not pass such data to the parser and
 	 * have no way to convert it, so we compromise by rejecting the data if
 	 * it contains any non-ASCII characters.
 	 */
 	if (PG_VALID_BE_ENCODING(client_enc))
 		(void) pg_verify_mbstr(client_enc, s, len, false);
 	else
 	{
 		const char *end = s + len;
 		const char *p;

 		for (p = s; p < end; p++)
 		{
 			if (*p == '\0' || IS_HIGHBIT_SET(*p))
 				ereport(ERROR,
 						(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
 						 errmsg("invalid byte value for encoding \"%s\": 0x%02x",
 								pg_enc2name_tbl[PG_SQL_ASCII].name,
 								(unsigned char) *p)));
 		}
 	}

 	return (char *) s;
 }

 /*
  * pg_server_to_client
  *
  * Convert "len" bytes at "s" from the server (database) encoding to the
  * client encoding.  Returns "s" itself when no conversion takes place; the
  * data is assumed to be valid in that case.
  */
 char *
 pg_server_to_client(const char *s, int len)
 {
 	Assert(DatabaseEncoding);
 	Assert(ClientEncoding);

 	if (len <= 0)
 		return (char *) s;

 	/* Convert only if the encodings differ and neither is SQL_ASCII */
 	if (ClientEncoding->encoding != DatabaseEncoding->encoding &&
 		ClientEncoding->encoding != PG_SQL_ASCII &&
 		DatabaseEncoding->encoding != PG_SQL_ASCII)
 		return perform_default_encoding_conversion(s, len, false, -1, NULL);

 	return (char *) s;
 }

 /*
  * perform_default_encoding_conversion
  *
  * Run an encoding conversion through an already looked-up FmgrInfo.  Since
  * this function does not access the database at all, it is safe to call
  * outside transactions.
  *
  * With custom_client_encoding == -1 (the normal case) the conversion is
  * between ClientEncoding and DatabaseEncoding, using the FmgrInfo cached by
  * SetClientEncoding(); the client encoding must have been set explicitly
  * beforehand, otherwise no conversion is performed.  With any other value,
  * custom_client_encoding takes the place of the client encoding and
  * custom_encoding_proc supplies the conversion function; see
  * pg_custom_to_server() and pg_server_to_custom().
  *
  * Returns "src" itself if no conversion function is available, otherwise a
  * palloc'd buffer holding the converted data.
  */
 static char *
 perform_default_encoding_conversion(const char *src, int len, bool is_client_to_server,
 									int custom_client_encoding,
 									FmgrInfo *custom_encoding_proc)
 {
 	bool		use_custom = (custom_client_encoding != -1);
 	int			server_enc = DatabaseEncoding->encoding;
 	int			peer_enc;
 	int			src_encoding;
 	int			dest_encoding;
 	FmgrInfo   *flinfo;
 	char	   *result;

 	/* The non-server side is either the custom or the client encoding */
 	peer_enc = use_custom ? custom_client_encoding : ClientEncoding->encoding;

 	if (is_client_to_server)
 	{
 		src_encoding = peer_enc;
 		dest_encoding = server_enc;
 		flinfo = use_custom ? custom_encoding_proc : ToServerConvProc;
 	}
 	else
 	{
 		src_encoding = server_enc;
 		dest_encoding = peer_enc;
 		flinfo = use_custom ? custom_encoding_proc : ToClientConvProc;
 	}

 	/* Without a conversion function there is nothing we can do */
 	if (flinfo == NULL)
 		return (char *) src;

 	/* Refuse inputs whose worst-case result could not be allocated */
 	if ((Size) len >= (MaxAllocSize / (Size) MAX_CONVERSION_GROWTH))
 		ereport(ERROR,
 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 				 errmsg("out of memory"),
 				 errdetail("String of %d bytes is too long for encoding conversion.",
 						   len)));

 	/* Worst-case growth plus room for one trailing byte */
 	result = palloc(len * MAX_CONVERSION_GROWTH + 1);

 	FunctionCall5(flinfo,
 				  Int32GetDatum(src_encoding),
 				  Int32GetDatum(dest_encoding),
 				  CStringGetDatum((char *) src),
 				  CStringGetDatum(result),
 				  Int32GetDatum(len));

 	return result;
 }


 #ifdef USE_WIDE_UPPER_LOWER

 /*
  * wchar2char --- convert wide characters to multibyte format
  *
  * This has the same API as the standard wcstombs() function; in particular,
  * tolen is the maximum number of bytes to store at *to, and *from must be
  * zero-terminated.  The output will be zero-terminated iff there is room.
  */
 size_t
 wchar2char(char *to, const wchar_t *from, size_t tolen)
 {
 	size_t		result;

 	if (tolen == 0)
 		return 0;

 #ifdef WIN32

 	/*
 	 * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and
 	 * for some reason mbstowcs and wcstombs won't do this for us, so we use
 	 * WideCharToMultiByte() when the database encoding is UTF8.
 	 */
 	if (GetDatabaseEncoding() == PG_UTF8)
 	{
 		result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
 									 NULL, NULL);

 		/* A zero return is failure */
 		if (result == 0)
 			return -1;

 		Assert(result <= tolen);
 		/* Microsoft counts the zero terminator in the result */
 		result--;
 		return result;
 	}
 #endif   /* WIN32 */

 	Assert(!lc_ctype_is_c());
 	result = wcstombs(to, from, tolen);

 	return result;
 }

 /*
  * char2wchar --- convert multibyte characters to wide characters
  *
  * This has almost the API of mbstowcs(), except that *from need not be
  * null-terminated; instead, the number of input bytes is given as fromlen.
  * Also, we ereport() rather than returning -1 for invalid input encoding.
  * tolen is the maximum number of wchar_t's to store at *to.  The output
  * will be zero-terminated iff there is room.
  */
 size_t
 char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen)
 {
 	const size_t conv_failed = (size_t) -1;
 	size_t		result;

 	if (tolen == 0)
 		return 0;

 #ifdef WIN32
 	/* With a UTF8 database on Windows, use the Win32 conversion API */
 	if (GetDatabaseEncoding() == PG_UTF8)
 	{
 		if (fromlen != 0)
 		{
 			result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1);
 			/* A zero return is failure */
 			if (result == 0)
 				result = conv_failed;
 		}
 		else
 		{
 			/* Win32 API does not work for zero-length input */
 			result = 0;
 		}

 		if (result != conv_failed)
 		{
 			Assert(result < tolen);
 			/* Append trailing null wchar (MultiByteToWideChar() does not) */
 			to[result] = 0;
 		}
 	}
 	else
 #endif   /* WIN32 */
 	{
 		/* mbstowcs requires ending '\0', so work on a terminated copy */
 		char	   *terminated = pnstrdup(from, fromlen);

 		Assert(!lc_ctype_is_c());
 		result = mbstowcs(to, terminated, tolen);
 		pfree(terminated);
 	}

 	if (result != conv_failed)
 		return result;

 	/*
 	 * Invalid multibyte character encountered.  We try to give a useful
 	 * error message by letting pg_verifymbstr check the string.  But it's
 	 * possible that the string is OK to us, and not OK to mbstowcs --- this
 	 * suggests that the LC_CTYPE locale is different from the database
 	 * encoding.  Give a generic error message if verifymbstr can't find
 	 * anything wrong.
 	 */
 	pg_verifymbstr(from, fromlen, false);	/* might not return */
 	/* but if it does ... */
 	ereport(ERROR,
 			(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
 			 errmsg("invalid multibyte character for locale"),
 			 errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));

 	return result;
 }
 #endif

 /*
  * pg_custom_to_server
  *
  * Convert "len" bytes at "s" to the server (database) encoding, treating
  * src_encoding as the source encoding instead of the global client
  * encoding.  "cep" is the caller-supplied FmgrInfo of the conversion
  * function, passed as void *.
  *
  * This routine is basically a slightly modified copy of
  * pg_client_to_server().  Calling pg_do_encoding_conversion() might have
  * been a nicer approach, but it lacks several checks that
  * pg_client_to_server() performs, and adding them there may break other
  * parts of the system.  So until there is a better idea we resort to
  * duplicating some code.
  *
  * The reason for this routine is to let external tables do data conversion
  * reliably.  Each external table has an encoding attached to it, and we
  * want to convert from that encoding to the server encoding without
  * altering the global client_encoding of the local database.
  */
 char *
 pg_custom_to_server(const char *s, int len, int src_encoding, void *cep)
 {
 	FmgrInfo   *custom_encoding_proc = (FmgrInfo *) cep;
 	int			server_enc;

 	Assert(DatabaseEncoding);
 	Assert(ClientEncoding);

 	if (len <= 0)
 		return (char *) s;

 	server_enc = DatabaseEncoding->encoding;

 	if (src_encoding == server_enc || src_encoding == PG_SQL_ASCII)
 	{
 		/* No conversion is needed, but the data must still be validated */
 		(void) pg_verify_mbstr(server_enc, s, len, false);
 		return (char *) s;
 	}

 	/* The ordinary case: run the supplied conversion function */
 	if (server_enc != PG_SQL_ASCII)
 		return perform_default_encoding_conversion(s, len, true,
 												   src_encoding,
 												   custom_encoding_proc);

 	/*
 	 * The database is SQL_ASCII, so no conversion is possible, but the data
 	 * must still be validated.  If the source encoding is ASCII-safe we
 	 * simply validate under that encoding.  For an ASCII-unsafe encoding we
 	 * dare not pass such data on and have no way to convert it, so we
 	 * compromise by rejecting the data if it contains any non-ASCII
 	 * characters.
 	 */
 	if (PG_VALID_BE_ENCODING(src_encoding))
 		(void) pg_verify_mbstr(src_encoding, s, len, false);
 	else
 	{
 		const char *end = s + len;
 		const char *p;

 		for (p = s; p < end; p++)
 		{
 			if (*p == '\0' || IS_HIGHBIT_SET(*p))
 				ereport(ERROR,
 						(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
 						 errmsg("invalid byte value for encoding \"%s\": 0x%02x",
 								pg_enc2name_tbl[PG_SQL_ASCII].name,
 								(unsigned char) *p)));
 		}
 	}

 	return (char *) s;
 }

 /*
  * pg_server_to_custom
  *
  * Convert "len" bytes at "s" from the server (database) encoding to
  * dest_encoding; the reverse of pg_custom_to_server().  "cep" is the
  * caller-supplied FmgrInfo of the conversion function, passed as void *.
  * Returns "s" itself when no conversion takes place; the data is assumed
  * to be valid in that case.
  */
 char *
 pg_server_to_custom(const char *s, int len, int dest_encoding, void *cep)
 {
 	FmgrInfo   *custom_encoding_proc = (FmgrInfo *) cep;

 	Assert(DatabaseEncoding);

 	if (len <= 0)
 		return (char *) s;

 	/* Convert only if the encodings differ and neither is SQL_ASCII */
 	if (dest_encoding != DatabaseEncoding->encoding &&
 		dest_encoding != PG_SQL_ASCII &&
 		DatabaseEncoding->encoding != PG_SQL_ASCII)
 		return perform_default_encoding_conversion(s, len, false,
 												   dest_encoding,
 												   custom_encoding_proc);

 	return (char *) s;
 }

 /*
  * Convert a null-terminated multibyte string in the database encoding to
  * pg_wchar form, returning whatever the encoding's converter returns.
  */
 int
 pg_mb2wchar(const char *from, pg_wchar *to)
 {
 	const unsigned char *bytes = (const unsigned char *) from;
 	int			db_enc = DatabaseEncoding->encoding;

 	return (*pg_wchar_table[db_enc].mb2wchar_with_len) (bytes, to, strlen(from));
 }

 /*
  * Convert "len" bytes of a multibyte string in the database encoding to
  * pg_wchar form, returning whatever the encoding's converter returns.
  */
 int
 pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
 {
 	const unsigned char *bytes = (const unsigned char *) from;
 	int			db_enc = DatabaseEncoding->encoding;

 	return (*pg_wchar_table[db_enc].mb2wchar_with_len) (bytes, to, len);
 }

 /*
  * Convert "len" bytes of a multibyte string to pg_wchar form using the
  * converter of the given encoding instead of the database encoding.
  */
 int
 pg_encoding_mb2wchar_with_len(int encoding,
 							  const char *from, pg_wchar *to, int len)
 {
 	const unsigned char *bytes = (const unsigned char *) from;

 	return (*pg_wchar_table[encoding].mb2wchar_with_len) (bytes, to, len);
 }

 /*
  * Return the byte length of the multibyte character at mbstr, according to
  * the database encoding.
  */
 int
 pg_mblen(const char *mbstr)
 {
 	const unsigned char *bytes = (const unsigned char *) mbstr;
 	int			db_enc = DatabaseEncoding->encoding;

 	return (*pg_wchar_table[db_enc].mblen) (bytes);
 }

 /*
  * Return the display length of the multibyte character at mbstr, according
  * to the database encoding.
  */
 int
 pg_dsplen(const char *mbstr)
 {
 	const unsigned char *bytes = (const unsigned char *) mbstr;
 	int			db_enc = DatabaseEncoding->encoding;

 	return (*pg_wchar_table[db_enc].dsplen) (bytes);
 }

 /*
  * Return the length, counted in characters, of a null-terminated multibyte
  * string in the database encoding.
  */
 int
 pg_mbstrlen(const char *mbstr)
 {
 	const char *p;
 	int			nchars;

 	/* In a single-byte encoding every byte is one character */
 	if (pg_database_encoding_max_length() == 1)
 		return strlen(mbstr);

 	nchars = 0;
 	for (p = mbstr; *p; p += pg_mblen(p))
 		nchars++;

 	return nchars;
 }

 /*
  * Return the length, counted in characters, of a multibyte string in the
  * database encoding that is not necessarily null-terminated; at most
  * "limit" bytes are considered.
  */
 int
 pg_mbstrlen_with_len(const char *mbstr, int limit)
 {
 	int			nchars;

 	/* In a single-byte encoding the byte count is the character count */
 	if (pg_database_encoding_max_length() == 1)
 		return limit;

 	for (nchars = 0; limit > 0 && *mbstr; nchars++)
 	{
 		int			width = pg_mblen(mbstr);

 		mbstr += width;
 		limit -= width;
 	}

 	return nchars;
 }

 /*
  * Return the byte length of the longest prefix of a multibyte string (not
  * necessarily null-terminated, "len" bytes long) that is no longer than
  * "limit" bytes and does not split a multibyte character.  The string is
  * interpreted in the database encoding.
  */
 int
 pg_mbcliplen(const char *mbstr, int len, int limit)
 {
 	int			db_enc = DatabaseEncoding->encoding;

 	return pg_encoding_mbcliplen(db_enc, mbstr, len, limit);
 }

 /*
  * Same as pg_mbcliplen(), but for an explicitly specified encoding.
  */
 int
 pg_encoding_mbcliplen(int encoding, const char *mbstr,
 					  int len, int limit)
 {
 	mblen_converter mblen_fn;
 	const char *p = mbstr;
 	int			remaining = len;
 	int			clipped = 0;

 	/* Single-byte encodings can be clipped at any byte */
 	if (pg_encoding_max_length(encoding) == 1)
 		return cliplen(mbstr, len, limit);

 	mblen_fn = pg_wchar_table[encoding].mblen;

 	while (remaining > 0 && *p)
 	{
 		int			width = (*mblen_fn) ((const unsigned char *) p);

 		/* Stop before a character that would exceed the limit */
 		if (clipped + width > limit)
 			break;

 		clipped += width;
 		if (clipped == limit)
 			break;

 		remaining -= width;
 		p += width;
 	}

 	return clipped;
 }

 /*
  * Like pg_mbcliplen(), except that "limit" is given in characters rather
  * than bytes.  The return value is still a byte length.
  */
 int
 pg_mbcharcliplen(const char *mbstr, int len, int limit)
 {
 	int			nbytes = 0;
 	int			nchars = 0;

 	/* In a single-byte encoding characters and bytes coincide */
 	if (pg_database_encoding_max_length() == 1)
 		return cliplen(mbstr, len, limit);

 	while (len > 0 && *mbstr)
 	{
 		int			width = pg_mblen(mbstr);

 		/* Stop once this character would exceed the character limit */
 		if (++nchars > limit)
 			break;

 		nbytes += width;
 		mbstr += width;
 		len -= width;
 	}

 	return nbytes;
 }

 /*
  * mbcliplen for any single-byte encoding: count bytes up to the first null
  * byte, but never more than the smaller of "len" and "limit".
  */
 static int
 cliplen(const char *str, int len, int limit)
 {
 	int			bound = Min(len, limit);
 	int			n;

 	for (n = 0; n < bound; n++)
 	{
 		if (str[n] == '\0')
 			break;
 	}

 	return n;
 }

 /*
  * Table pairing encoding identifiers with codeset name strings.  It is only
  * compiled when both ENABLE_NLS and WIN32 are defined.
  *
  * NOTE(review): nothing in this file appears to reference
  * codeset_map_array; pg_bind_textdomain_codeset() walks pg_enc2gettext_tbl
  * instead.  Confirm whether this table is still needed.
  */
 #if defined(ENABLE_NLS) && defined(WIN32)
 static const struct codeset_map {
 	int	encoding;			/* encoding identifier (PG_xxx) */
 	const char *codeset;	/* codeset name paired with that encoding */
 } codeset_map_array[] = {
 	{PG_UTF8, "UTF-8"},
 	{PG_LATIN1, "LATIN1"},
 	{PG_LATIN2, "LATIN2"},
 	{PG_LATIN3, "LATIN3"},
 	{PG_LATIN4, "LATIN4"},
 	{PG_ISO_8859_5, "ISO-8859-5"},
 	{PG_ISO_8859_6, "ISO_8859-6"},
 	{PG_ISO_8859_7, "ISO-8859-7"},
 	{PG_ISO_8859_8, "ISO-8859-8"},
 	{PG_LATIN5, "LATIN5"},
 	{PG_LATIN6, "LATIN6"},
 	{PG_LATIN7, "LATIN7"},
 	{PG_LATIN8, "LATIN8"},
 	{PG_LATIN9, "LATIN-9"},
 	{PG_LATIN10, "LATIN10"},
 	{PG_KOI8R, "KOI8-R"},
 	{PG_WIN1250, "CP1250"},
 	{PG_WIN1251, "CP1251"},
 	{PG_WIN1252, "CP1252"},
 	{PG_WIN1253, "CP1253"},
 	{PG_WIN1254, "CP1254"},
 	{PG_WIN1255, "CP1255"},
 	{PG_WIN1256, "CP1256"},
 	{PG_WIN1257, "CP1257"},
 	{PG_WIN1258, "CP1258"},
 	{PG_WIN866, "CP866"},
 	{PG_WIN874, "CP874"},
 	{PG_EUC_CN, "EUC-CN"},
 	{PG_EUC_JP, "EUC-JP"},
 	{PG_EUC_KR, "EUC-KR"},
 	{PG_EUC_TW, "EUC-TW"},
 	/* EUC_JIS_2004 is paired with the same codeset name as EUC_JP */
 	{PG_EUC_JIS_2004, "EUC-JP"}
 };
 #endif /* ENABLE_NLS && WIN32 */

 /*
  * SetDatabaseEncoding
  *
  * Make "encoding" the database encoding.  Raises an error if it is not a
  * valid backend encoding.
  */
 void
 SetDatabaseEncoding(int encoding)
 {
 	if (PG_VALID_BE_ENCODING(encoding))
 	{
 		DatabaseEncoding = &pg_enc2name_tbl[encoding];
 		Assert(DatabaseEncoding->encoding == encoding);
 		return;
 	}

 	elog(ERROR, "invalid database encoding: %d", encoding);
 }

 /*
  * pg_bind_textdomain_codeset
  *
  * Bind gettext for the given message domain to the codeset that
  * corresponds to the database encoding.  Does nothing unless ENABLE_NLS is
  * defined.
  */
 void
 pg_bind_textdomain_codeset(const char *domainname)
 {
 #if defined(ENABLE_NLS)
 	int			db_encoding = GetDatabaseEncoding();
 	int			idx;

 	/*
 	 * gettext() uses the codeset specified by LC_CTYPE by default, so if
 	 * that matches the database encoding we don't need to do anything.  In
 	 * CREATE DATABASE, we enforce or trust that the locale's codeset
 	 * matches the database encoding, except for the C locale.  In the C
 	 * locale, we bind gettext() explicitly to the right codeset.
 	 *
 	 * On Windows, though, gettext() tends to get confused so we always bind
 	 * it.
 	 */
 #ifndef WIN32
 	const char *ctype = setlocale(LC_CTYPE, NULL);

 	/* Any locale other than C/POSIX is left alone */
 	if (!(pg_strcasecmp(ctype, "C") == 0 || pg_strcasecmp(ctype, "POSIX") == 0))
 		return;
 #endif

 	for (idx = 0; pg_enc2gettext_tbl[idx].name != NULL; idx++)
 	{
 		if (pg_enc2gettext_tbl[idx].encoding != db_encoding)
 			continue;

 		/* Found the entry for our encoding: bind it and stop looking */
 		if (bind_textdomain_codeset(domainname,
 									pg_enc2gettext_tbl[idx].name) == NULL)
 			elog(LOG, "bind_textdomain_codeset failed");
 		break;
 	}
 #endif
 }

 /*
  * SetDefaultClientEncoding
  *
  * Make the client encoding the same as the current database encoding.  The
  * cached conversion functions are not touched here.
  */
 void
 SetDefaultClientEncoding(void)
 {
 	int			db_encoding = GetDatabaseEncoding();

 	ClientEncoding = &pg_enc2name_tbl[db_encoding];
 }

 /*
  * GetDatabaseEncoding
  *
  * Return the identifier of the database encoding.
  */
 int
 GetDatabaseEncoding(void)
 {
 	int			current;

 	Assert(DatabaseEncoding);
 	current = DatabaseEncoding->encoding;

 	return current;
 }

 /*
  * GetDatabaseEncodingName
  *
  * Return the name of the database encoding.
  */
 const char *
 GetDatabaseEncodingName(void)
 {
 	const char *current_name;

 	Assert(DatabaseEncoding);
 	current_name = DatabaseEncoding->name;

 	return current_name;
 }

 /*
  * getdatabaseencoding
  *
  * fmgr-callable function returning the database encoding name, built by
  * passing the name string through namein.
  */
 Datum
 getdatabaseencoding(PG_FUNCTION_ARGS)
 {
 	Datum		name_datum;

 	Assert(DatabaseEncoding);
 	name_datum = DirectFunctionCall1(namein,
 									 CStringGetDatum(DatabaseEncoding->name));

 	return name_datum;
 }

 /*
  * pg_client_encoding
  *
  * fmgr-callable function returning the client encoding name, built by
  * passing the name string through namein.
  */
 Datum
 pg_client_encoding(PG_FUNCTION_ARGS)
 {
 	Datum		name_datum;

 	Assert(ClientEncoding);
 	name_datum = DirectFunctionCall1(namein,
 									 CStringGetDatum(ClientEncoding->name));

 	return name_datum;
 }