src/backend/utils/mb/conv.c - cloudberry - Git at Google

 /*-------------------------------------------------------------------------
  *
  *	  Utility functions for conversion procs.
  *
  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
  *	  src/backend/utils/mb/conv.c
  *
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
 #include "mb/pg_wchar.h"


 /*
  * local2local: a generic single byte charset encoding
  * conversion between two ASCII-superset encodings.
  *
  * l points to the source string of length len
  * p is the output area (must be large enough!)
  * src_encoding is the PG identifier for the source encoding
  * dest_encoding is the PG identifier for the target encoding
  * tab holds conversion entries for the source charset
  * starting from 128 (0x80). each entry in the table holds the corresponding
  * code point for the target charset, or 0 if there is no equivalent code.
  *
  * Returns the number of input bytes consumed.  If noError is true, this can
  * be less than 'len'.
  */
 int
 local2local(const unsigned char *l,
 			unsigned char *p,
 			int len,
 			int src_encoding,
 			int dest_encoding,
 			const unsigned char *tab,
 			bool noError)
 {
 	const unsigned char *const begin = l;

 	/* every input byte produces at most one output byte */
 	for (; len > 0; l++, len--)
 	{
 		const unsigned char src = *l;

 		if (src == 0)
 		{
 			/* an embedded NUL byte is treated as invalid input */
 			if (noError)
 				break;
 			report_invalid_encoding(src_encoding, (const char *) l, len);
 		}

 		if (IS_HIGHBIT_SET(src))
 		{
 			/* high-bit bytes go through the table; 0 means "no mapping" */
 			const unsigned char mapped = tab[src - HIGHBIT];

 			if (mapped == 0)
 			{
 				if (noError)
 					break;
 				report_untranslatable_char(src_encoding, dest_encoding,
 										   (const char *) l, len);
 			}
 			else
 				*p++ = mapped;
 		}
 		else
 			*p++ = src;			/* ASCII is copied unchanged */
 	}
 	*p = '\0';

 	return l - begin;
 }

 /*
  * LATINn ---> MIC when the charset's local codes map directly to MIC
  *
  * l points to the source string of length len
  * p is the output area (must be large enough!)
  * lc is the mule character set id for the local encoding
  * encoding is the PG identifier for the local encoding
  *
  * Returns the number of input bytes consumed.  If noError is true, this can
  * be less than 'len'.
  */
 int
 latin2mic(const unsigned char *l, unsigned char *p, int len,
 		  int lc, int encoding, bool noError)
 {
 	const unsigned char *const begin = l;

 	for (; len > 0; l++, len--)
 	{
 		int			ch = *l;

 		if (ch == 0)
 		{
 			/* an embedded NUL byte is treated as invalid input */
 			if (noError)
 				break;
 			report_invalid_encoding(encoding, (const char *) l, len);
 		}

 		/* high-bit bytes are prefixed with the charset id byte */
 		if (IS_HIGHBIT_SET(ch))
 			*p++ = lc;
 		*p++ = ch;
 	}
 	*p = '\0';

 	return l - begin;
 }

 /*
  * MIC ---> LATINn when the charset's local codes map directly to MIC
  *
  * mic points to the source string of length len
  * p is the output area (must be large enough!)
  * lc is the mule character set id for the local encoding
  * encoding is the PG identifier for the local encoding
  *
  * Returns the number of input bytes consumed.  If noError is true, this can
  * be less than 'len'.
  */
 int
 mic2latin(const unsigned char *mic, unsigned char *p, int len,
 		  int lc, int encoding, bool noError)
 {
 	const unsigned char *const begin = mic;

 	while (len > 0)
 	{
 		int			lead = *mic;
 		int			mblen;

 		if (lead == 0)
 		{
 			/* an embedded NUL byte is treated as invalid input */
 			if (noError)
 				break;
 			report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
 		}

 		if (!IS_HIGHBIT_SET(lead))
 		{
 			/* plain ASCII: copy one byte and move on */
 			*p++ = lead;
 			mic++;
 			len--;
 			continue;
 		}

 		/* multibyte character: the whole of it must be within the input */
 		mblen = pg_mule_mblen(mic);
 		if (len < mblen)
 		{
 			if (noError)
 				break;
 			report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
 									len);
 		}

 		/* only two-byte characters led by 'lc' can be converted */
 		if (!(mblen == 2 && lead == lc && IS_HIGHBIT_SET(mic[1])))
 		{
 			if (noError)
 				break;
 			report_untranslatable_char(PG_MULE_INTERNAL, encoding,
 									   (const char *) mic, len);
 		}

 		/* drop the leading byte, keep the second byte as the local code */
 		*p++ = mic[1];
 		mic += 2;
 		len -= 2;
 	}
 	*p = '\0';

 	return mic - begin;
 }


 /*
  * latin2mic_with_table: a generic single byte charset encoding
  * conversion from a local charset to the mule internal code.
  *
  * l points to the source string of length len
  * p is the output area (must be large enough!)
  * lc is the mule character set id for the local encoding
  * encoding is the PG identifier for the local encoding
  * tab holds conversion entries for the local charset
  * starting from 128 (0x80). each entry in the table holds the corresponding
  * code point for the mule encoding, or 0 if there is no equivalent code.
  *
  * Returns the number of input bytes consumed.  If noError is true, this can
  * be less than 'len'.
  */
 int
 latin2mic_with_table(const unsigned char *l,
 					 unsigned char *p,
 					 int len,
 					 int lc,
 					 int encoding,
 					 const unsigned char *tab,
 					 bool noError)
 {
 	const unsigned char *const begin = l;

 	for (; len > 0; l++, len--)
 	{
 		const unsigned char src = *l;

 		if (src == 0)
 		{
 			/* an embedded NUL byte is treated as invalid input */
 			if (noError)
 				break;
 			report_invalid_encoding(encoding, (const char *) l, len);
 		}

 		if (IS_HIGHBIT_SET(src))
 		{
 			/* look up the mule code; 0 means "no mapping" */
 			const unsigned char mapped = tab[src - HIGHBIT];

 			if (mapped == 0)
 			{
 				if (noError)
 					break;
 				report_untranslatable_char(encoding, PG_MULE_INTERNAL,
 										   (const char *) l, len);
 			}
 			else
 			{
 				/* emit charset id byte followed by the mapped code */
 				*p++ = lc;
 				*p++ = mapped;
 			}
 		}
 		else
 			*p++ = src;			/* ASCII is copied unchanged */
 	}
 	*p = '\0';

 	return l - begin;
 }

 /*
  * mic2latin_with_table: a generic single byte charset encoding
  * conversion from the mule internal code to a local charset.
  *
  * mic points to the source string of length len
  * p is the output area (must be large enough!)
  * lc is the mule character set id for the local encoding
  * encoding is the PG identifier for the local encoding
  * tab holds conversion entries for the mule internal code's second byte,
  * starting from 128 (0x80). each entry in the table holds the corresponding
  * code point for the local charset, or 0 if there is no equivalent code.
  *
  * Returns the number of input bytes consumed.  If noError is true, this can
  * be less than 'len'.
  */
 int
 mic2latin_with_table(const unsigned char *mic,
 					 unsigned char *p,
 					 int len,
 					 int lc,
 					 int encoding,
 					 const unsigned char *tab,
 					 bool noError)
 {
 	const unsigned char *const begin = mic;

 	while (len > 0)
 	{
 		const unsigned char lead = *mic;
 		unsigned char mapped;
 		int			mblen;

 		if (lead == 0)
 		{
 			/* an embedded NUL byte is treated as invalid input */
 			if (noError)
 				break;
 			report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
 		}

 		if (!IS_HIGHBIT_SET(lead))
 		{
 			/* plain ASCII: copy one byte and move on */
 			*p++ = lead;
 			mic++;
 			len--;
 			continue;
 		}

 		/* multibyte character: the whole of it must be within the input */
 		mblen = pg_mule_mblen(mic);
 		if (len < mblen)
 		{
 			if (noError)
 				break;
 			report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
 									len);
 		}

 		/*
 		 * Only two-byte characters led by 'lc' are candidates; the table is
 		 * consulted for those, and a zero entry means "no mapping".
 		 */
 		mapped = 0;
 		if (mblen == 2 && lead == lc && IS_HIGHBIT_SET(mic[1]))
 			mapped = tab[mic[1] - HIGHBIT];

 		if (mapped == 0)
 		{
 			if (noError)
 				break;
 			report_untranslatable_char(PG_MULE_INTERNAL, encoding,
 									   (const char *) mic, len);
 			break;				/* keep compiler quiet */
 		}

 		*p++ = mapped;
 		mic += 2;
 		len -= 2;
 	}
 	*p = '\0';

 	return mic - begin;
 }

 /*
  * comparison routine for bsearch()
  * this routine is intended for combined UTF8 -> local code
  */
 static int
 compare3(const void *p1, const void *p2)
 {
 	uint32		s1,
 				s2,
 				d1,
 				d2;

 	s1 = *(const uint32 *) p1;
 	s2 = *((const uint32 *) p1 + 1);
 	d1 = ((const pg_utf_to_local_combined *) p2)->utf1;
 	d2 = ((const pg_utf_to_local_combined *) p2)->utf2;
 	return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
 }

 /*
  * comparison routine for bsearch()
  * this routine is intended for local code -> combined UTF8
  */
 static int
 compare4(const void *p1, const void *p2)
 {
 	/* p1 is the search key (a single uint32 code), p2 a map entry */
 	const uint32 key = *(const uint32 *) p1;
 	const uint32 code = ((const pg_local_to_utf_combined *) p2)->code;

 	if (key > code)
 		return 1;
 	if (key < code)
 		return -1;
 	return 0;
 }

 /*
  * store 32bit character representation into multibyte stream
  */
 static inline unsigned char *
 store_coded_char(unsigned char *dest, uint32 code)
 {
 	int			shift;

 	/* walk from the most significant byte down; zero bytes are not stored */
 	for (shift = 24; shift >= 0; shift -= 8)
 	{
 		const unsigned char byte = (unsigned char) (code >> shift);

 		if (byte != 0)
 			*dest++ = byte;
 	}

 	/* points just past the last byte written */
 	return dest;
 }

 /*
  * Convert a character using a conversion radix tree.
  *
  * 'l' is the length of the input character in bytes, and b1-b4 are
  * the input character's bytes.
  */
 static inline uint32
 pg_mb_radix_conv(const pg_mb_radix_tree *rt,
 				 int l,
 				 unsigned char b1,
 				 unsigned char b2,
 				 unsigned char b3,
 				 unsigned char b4)
 {
 	/*
 	 * Shorter characters are right-aligned, so the last byte is always b4.
 	 * For each length the bytes are first range-checked against the bounds
 	 * stored in the tree (returning 0 when out of range); then the table is
 	 * walked one byte at a time, each lookup giving the offset used for the
 	 * next one and the last lookup giving the result.  Either the 32-bit or
 	 * the 16-bit table is used, depending on which one the tree provides.
 	 */
 	switch (l)
 	{
 		case 4:
 			if (b1 < rt->b4_1_lower || b1 > rt->b4_1_upper ||
 				b2 < rt->b4_2_lower || b2 > rt->b4_2_upper ||
 				b3 < rt->b4_3_lower || b3 > rt->b4_3_upper ||
 				b4 < rt->b4_4_lower || b4 > rt->b4_4_upper)
 				return 0;

 			if (rt->chars32)
 			{
 				uint32		node = rt->b4root;

 				node = rt->chars32[b1 + node - rt->b4_1_lower];
 				node = rt->chars32[b2 + node - rt->b4_2_lower];
 				node = rt->chars32[b3 + node - rt->b4_3_lower];
 				return rt->chars32[b4 + node - rt->b4_4_lower];
 			}
 			else
 			{
 				uint16		node = rt->b4root;

 				node = rt->chars16[b1 + node - rt->b4_1_lower];
 				node = rt->chars16[b2 + node - rt->b4_2_lower];
 				node = rt->chars16[b3 + node - rt->b4_3_lower];
 				return rt->chars16[b4 + node - rt->b4_4_lower];
 			}

 		case 3:
 			if (b2 < rt->b3_1_lower || b2 > rt->b3_1_upper ||
 				b3 < rt->b3_2_lower || b3 > rt->b3_2_upper ||
 				b4 < rt->b3_3_lower || b4 > rt->b3_3_upper)
 				return 0;

 			if (rt->chars32)
 			{
 				uint32		node = rt->b3root;

 				node = rt->chars32[b2 + node - rt->b3_1_lower];
 				node = rt->chars32[b3 + node - rt->b3_2_lower];
 				return rt->chars32[b4 + node - rt->b3_3_lower];
 			}
 			else
 			{
 				uint16		node = rt->b3root;

 				node = rt->chars16[b2 + node - rt->b3_1_lower];
 				node = rt->chars16[b3 + node - rt->b3_2_lower];
 				return rt->chars16[b4 + node - rt->b3_3_lower];
 			}

 		case 2:
 			if (b3 < rt->b2_1_lower || b3 > rt->b2_1_upper ||
 				b4 < rt->b2_2_lower || b4 > rt->b2_2_upper)
 				return 0;

 			if (rt->chars32)
 			{
 				uint32		node = rt->b2root;

 				node = rt->chars32[b3 + node - rt->b2_1_lower];
 				return rt->chars32[b4 + node - rt->b2_2_lower];
 			}
 			else
 			{
 				uint16		node = rt->b2root;

 				node = rt->chars16[b3 + node - rt->b2_1_lower];
 				return rt->chars16[b4 + node - rt->b2_2_lower];
 			}

 		case 1:
 			if (b4 < rt->b1_lower || b4 > rt->b1_upper)
 				return 0;

 			if (rt->chars32)
 				return rt->chars32[b4 + rt->b1root - rt->b1_lower];
 			else
 				return rt->chars16[b4 + rt->b1root - rt->b1_lower];

 		default:
 			break;
 	}

 	return 0;					/* shouldn't happen */
 }

 /*
  * UTF8 ---> local code
  *
  * utf: input string in UTF8 encoding (need not be null-terminated)
  * len: length of input string (in bytes)
  * iso: pointer to the output area (must be large enough!)
  *		  (output string will be null-terminated)
  * map: conversion map for single characters
  * cmap: conversion map for combined characters
  *		  (optional, pass NULL if none)
  * cmapsize: number of entries in the conversion map for combined characters
  *		  (optional, pass 0 if none)
  * conv_func: algorithmic encoding conversion function
  *		  (optional, pass NULL if none)
  * encoding: PG identifier for the local encoding
  * noError: if true, stop converting at the first invalid or untranslatable
  *		  character instead of reporting it (an invalid 'encoding' number
  *		  is still reported)
  *
  * For each character, the cmap (if provided) is consulted first; if no match,
  * the map is consulted next; if still no match, the conv_func (if provided)
  * is applied.  An error is raised if no match is found.
  *
  * See pg_wchar.h for more details about the data structures used here.
  *
  * Returns the number of input bytes consumed.  If noError is true, this can
  * be less than 'len'.
  */
 int
 UtfToLocal(const unsigned char *utf, int len,
 		   unsigned char *iso,
 		   const pg_mb_radix_tree *map,
 		   const pg_utf_to_local_combined *cmap, int cmapsize,
 		   utf_local_conversion_func conv_func,
 		   int encoding, bool noError)
 {
 	uint32		iutf;
 	int			l;
 	const pg_utf_to_local_combined *cp;
 	const unsigned char *start = utf;

 	if (!PG_VALID_ENCODING(encoding))
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 				 errmsg("invalid encoding number: %d", encoding)));

 	/* every "continue" below relies on the loop step subtracting l from len */
 	for (; len > 0; len -= l)
 	{
 		/* bytes of the current character, right-aligned (last byte in b4) */
 		unsigned char b1 = 0;
 		unsigned char b2 = 0;
 		unsigned char b3 = 0;
 		unsigned char b4 = 0;

 		/* "break" cases all represent errors */
 		if (*utf == '\0')
 			break;

 		l = pg_utf_mblen(utf);
 		if (len < l)
 			break;

 		if (!pg_utf8_islegal(utf, l))
 			break;

 		if (l == 1)
 		{
 			/* ASCII case is easy, assume it's one-to-one conversion */
 			*iso++ = *utf++;
 			continue;
 		}

 		/* collect coded char of length l */
 		if (l == 2)
 		{
 			b3 = *utf++;
 			b4 = *utf++;
 		}
 		else if (l == 3)
 		{
 			b2 = *utf++;
 			b3 = *utf++;
 			b4 = *utf++;
 		}
 		else if (l == 4)
 		{
 			b1 = *utf++;
 			b2 = *utf++;
 			b3 = *utf++;
 			b4 = *utf++;
 		}
 		else
 		{
 			elog(ERROR, "unsupported character length %d", l);
 			iutf = 0;			/* keep compiler quiet */
 		}

 		/*
 		 * Widen to uint32 before shifting: shifting an int-promoted byte
 		 * >= 0x80 left by 24 would overflow a signed int.
 		 */
 		iutf = ((uint32) b1 << 24 | (uint32) b2 << 16 |
 				(uint32) b3 << 8 | (uint32) b4);

 		/* First, try with combined map if possible */
 		if (cmap && len > l)
 		{
 			/* remember the state after the first character for backtracking */
 			const unsigned char *utf_save = utf;
 			int			len_save = len;
 			int			l_save = l;

 			/* collect next character, same as above */
 			len -= l;

 			l = pg_utf_mblen(utf);
 			if (len < l)
 			{
 				/* need more data to decide if this is a combined char */
 				utf -= l_save;
 				break;
 			}

 			if (!pg_utf8_islegal(utf, l))
 			{
 				if (!noError)
 					report_invalid_encoding(PG_UTF8, (const char *) utf, len);
 				/* back up so the first character counts as unconsumed */
 				utf -= l_save;
 				break;
 			}

 			/* We assume ASCII character cannot be in combined map */
 			if (l > 1)
 			{
 				uint32		iutf2;
 				uint32		cutf[2];

 				/* same widening as above to avoid signed-shift overflow */
 				if (l == 2)
 				{
 					iutf2 = (uint32) *utf++ << 8;
 					iutf2 |= (uint32) *utf++;
 				}
 				else if (l == 3)
 				{
 					iutf2 = (uint32) *utf++ << 16;
 					iutf2 |= (uint32) *utf++ << 8;
 					iutf2 |= (uint32) *utf++;
 				}
 				else if (l == 4)
 				{
 					iutf2 = (uint32) *utf++ << 24;
 					iutf2 |= (uint32) *utf++ << 16;
 					iutf2 |= (uint32) *utf++ << 8;
 					iutf2 |= (uint32) *utf++;
 				}
 				else
 				{
 					elog(ERROR, "unsupported character length %d", l);
 					iutf2 = 0;	/* keep compiler quiet */
 				}

 				/* search key: the pair (first char, second char) */
 				cutf[0] = iutf;
 				cutf[1] = iutf2;

 				cp = bsearch(cutf, cmap, cmapsize,
 							 sizeof(pg_utf_to_local_combined), compare3);

 				if (cp)
 				{
 					/*
 					 * Both characters are consumed: len was already reduced
 					 * by the first one, the loop step handles the second.
 					 */
 					iso = store_coded_char(iso, cp->code);
 					continue;
 				}
 			}

 			/* fail, so back up to reprocess second character next time */
 			utf = utf_save;
 			len = len_save;
 			l = l_save;
 		}

 		/* Now check ordinary map */
 		if (map)
 		{
 			uint32		converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);

 			if (converted)
 			{
 				iso = store_coded_char(iso, converted);
 				continue;
 			}
 		}

 		/* if there's a conversion function, try that */
 		if (conv_func)
 		{
 			uint32		converted = (*conv_func) (iutf);

 			if (converted)
 			{
 				iso = store_coded_char(iso, converted);
 				continue;
 			}
 		}

 		/* failed to translate this character */
 		utf -= l;
 		if (noError)
 			break;
 		report_untranslatable_char(PG_UTF8, encoding,
 								   (const char *) utf, len);
 	}

 	/* if we broke out of loop early, must be invalid input */
 	if (len > 0 && !noError)
 		report_invalid_encoding(PG_UTF8, (const char *) utf, len);

 	*iso = '\0';

 	return utf - start;
 }

 /*
  * local code ---> UTF8
  *
  * iso: input string in local encoding (need not be null-terminated)
  * len: length of input string (in bytes)
  * utf: pointer to the output area (must be large enough!)
  *		  (output string will be null-terminated)
  * map: conversion map for single characters
  * cmap: conversion map for combined characters
  *		  (optional, pass NULL if none)
  * cmapsize: number of entries in the conversion map for combined characters
  *		  (optional, pass 0 if none)
  * conv_func: algorithmic encoding conversion function
  *		  (optional, pass NULL if none)
  * encoding: PG identifier for the local encoding
  * noError: if true, stop converting at the first invalid or untranslatable
  *		  character instead of reporting it (an invalid 'encoding' number
  *		  is still reported)
  *
  * For each character, the map is consulted first; if no match, the cmap
  * (if provided) is consulted next; if still no match, the conv_func
  * (if provided) is applied.  An error is raised if no match is found.
  * Note that the cmap is only consulted when a map is provided.
  *
  * See pg_wchar.h for more details about the data structures used here.
  *
  * Returns the number of input bytes consumed.  If noError is true, this can
  * be less than 'len'.
  */
 int
 LocalToUtf(const unsigned char *iso, int len,
 		   unsigned char *utf,
 		   const pg_mb_radix_tree *map,
 		   const pg_local_to_utf_combined *cmap, int cmapsize,
 		   utf_local_conversion_func conv_func,
 		   int encoding,
 		   bool noError)
 {
 	uint32		iiso;
 	int			l;
 	const pg_local_to_utf_combined *cp;
 	const unsigned char *start = iso;

 	if (!PG_VALID_ENCODING(encoding))
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 				 errmsg("invalid encoding number: %d", encoding)));

 	/* every "continue" below relies on the loop step subtracting l from len */
 	for (; len > 0; len -= l)
 	{
 		/* bytes of the current character, right-aligned (last byte in b4) */
 		unsigned char b1 = 0;
 		unsigned char b2 = 0;
 		unsigned char b3 = 0;
 		unsigned char b4 = 0;

 		/* "break" cases all represent errors */
 		if (*iso == '\0')
 			break;

 		if (!IS_HIGHBIT_SET(*iso))
 		{
 			/* ASCII case is easy, assume it's one-to-one conversion */
 			*utf++ = *iso++;
 			l = 1;
 			continue;
 		}

 		/* a negative result means the character failed verification */
 		l = pg_encoding_verifymbchar(encoding, (const char *) iso, len);
 		if (l < 0)
 			break;

 		/* collect coded char of length l */
 		if (l == 1)
 			b4 = *iso++;
 		else if (l == 2)
 		{
 			b3 = *iso++;
 			b4 = *iso++;
 		}
 		else if (l == 3)
 		{
 			b2 = *iso++;
 			b3 = *iso++;
 			b4 = *iso++;
 		}
 		else if (l == 4)
 		{
 			b1 = *iso++;
 			b2 = *iso++;
 			b3 = *iso++;
 			b4 = *iso++;
 		}
 		else
 		{
 			elog(ERROR, "unsupported character length %d", l);
 			iiso = 0;			/* keep compiler quiet */
 		}

 		/*
 		 * Widen to uint32 before shifting: shifting an int-promoted byte
 		 * >= 0x80 left by 24 would overflow a signed int.
 		 */
 		iiso = ((uint32) b1 << 24 | (uint32) b2 << 16 |
 				(uint32) b3 << 8 | (uint32) b4);

 		if (map)
 		{
 			uint32		converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);

 			if (converted)
 			{
 				utf = store_coded_char(utf, converted);
 				continue;
 			}

 			/* If there's a combined character map, try that */
 			if (cmap)
 			{
 				cp = bsearch(&iiso, cmap, cmapsize,
 							 sizeof(pg_local_to_utf_combined), compare4);

 				if (cp)
 				{
 					/* one local code expands to two UTF8 characters */
 					utf = store_coded_char(utf, cp->utf1);
 					utf = store_coded_char(utf, cp->utf2);
 					continue;
 				}
 			}
 		}

 		/* if there's a conversion function, try that */
 		if (conv_func)
 		{
 			uint32		converted = (*conv_func) (iiso);

 			if (converted)
 			{
 				utf = store_coded_char(utf, converted);
 				continue;
 			}
 		}

 		/* failed to translate this character */
 		iso -= l;
 		if (noError)
 			break;
 		report_untranslatable_char(encoding, PG_UTF8,
 								   (const char *) iso, len);
 	}

 	/* if we broke out of loop early, must be invalid input */
 	if (len > 0 && !noError)
 		report_invalid_encoding(encoding, (const char *) iso, len);

 	*utf = '\0';

 	return iso - start;
 }
	/*-------------------------------------------------------------------------
	*
	* Utility functions for conversion procs.
	*
	* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
	* Portions Copyright (c) 1994, Regents of the University of California
	*
	* IDENTIFICATION
	* src/backend/utils/mb/conv.c
	*
	*-------------------------------------------------------------------------
	*/
	#include "postgres.h"
	#include "mb/pg_wchar.h"


	/*
	* local2local: a generic single byte charset encoding
	* conversion between two ASCII-superset encodings.
	*
	* l points to the source string of length len
	* p is the output area (must be large enough!)
	* src_encoding is the PG identifier for the source encoding
	* dest_encoding is the PG identifier for the target encoding
	* tab holds conversion entries for the source charset
	* starting from 128 (0x80). each entry in the table holds the corresponding
	* code point for the target charset, or 0 if there is no equivalent code.
	*
	* Returns the number of input bytes consumed. If noError is true, this can
	* be less than 'len'.
	*/
	int
	local2local(const unsigned char *l,
				unsigned char *p,
				int len,
				int src_encoding,
				int dest_encoding,
				const unsigned char *tab,
				bool noError)
	{
		const unsigned char *const begin = l;

		/* every input byte produces at most one output byte */
		for (; len > 0; l++, len--)
		{
			const unsigned char src = *l;

			if (src == 0)
			{
				/* an embedded NUL byte is treated as invalid input */
				if (noError)
					break;
				report_invalid_encoding(src_encoding, (const char *) l, len);
			}

			if (IS_HIGHBIT_SET(src))
			{
				/* high-bit bytes go through the table; 0 means "no mapping" */
				const unsigned char mapped = tab[src - HIGHBIT];

				if (mapped == 0)
				{
					if (noError)
						break;
					report_untranslatable_char(src_encoding, dest_encoding,
											   (const char *) l, len);
				}
				else
					*p++ = mapped;
			}
			else
				*p++ = src;			/* ASCII is copied unchanged */
		}
		*p = '\0';

		return l - begin;
	}

	/*
	* LATINn ---> MIC when the charset's local codes map directly to MIC
	*
	* l points to the source string of length len
	* p is the output area (must be large enough!)
	* lc is the mule character set id for the local encoding
	* encoding is the PG identifier for the local encoding
	*
	* Returns the number of input bytes consumed. If noError is true, this can
	* be less than 'len'.
	*/
	int
	latin2mic(const unsigned char *l, unsigned char *p, int len,
			  int lc, int encoding, bool noError)
	{
		/* l and p are pointers: the body dereferences and advances both */
		const unsigned char *start = l;
		int			c1;

		while (len > 0)
		{
			c1 = *l;
			if (c1 == 0)
			{
				/* an embedded NUL byte is treated as invalid input */
				if (noError)
					break;
				report_invalid_encoding(encoding, (const char *) l, len);
			}
			/* high-bit bytes are prefixed with the charset id byte */
			if (IS_HIGHBIT_SET(c1))
				*p++ = lc;
			*p++ = c1;
			l++;
			len--;
		}
		*p = '\0';

		return l - start;
	}

	/*
	* MIC ---> LATINn when the charset's local codes map directly to MIC
	*
	* mic points to the source string of length len
	* p is the output area (must be large enough!)
	* lc is the mule character set id for the local encoding
	* encoding is the PG identifier for the local encoding
	*
	* Returns the number of input bytes consumed. If noError is true, this can
	* be less than 'len'.
	*/
	int
	mic2latin(const unsigned char *mic, unsigned char *p, int len,
			  int lc, int encoding, bool noError)
	{
		/* mic and p are pointers: the body dereferences and advances both */
		const unsigned char *start = mic;
		int			c1;

		while (len > 0)
		{
			c1 = *mic;
			if (c1 == 0)
			{
				/* an embedded NUL byte is treated as invalid input */
				if (noError)
					break;
				report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
			}
			if (!IS_HIGHBIT_SET(c1))
			{
				/* easy for ASCII */
				*p++ = c1;
				mic++;
				len--;
			}
			else
			{
				int			l = pg_mule_mblen(mic);

				/* the whole multibyte character must be within the input */
				if (len < l)
				{
					if (noError)
						break;
					report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
											len);
				}
				/* only two-byte characters led by 'lc' can be converted */
				if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]))
				{
					if (noError)
						break;
					report_untranslatable_char(PG_MULE_INTERNAL, encoding,
											   (const char *) mic, len);
				}
				/* drop the leading byte, keep the second byte */
				*p++ = mic[1];
				mic += 2;
				len -= 2;
			}
		}
		*p = '\0';

		return mic - start;
	}


	/*
	* latin2mic_with_table: a generic single byte charset encoding
	* conversion from a local charset to the mule internal code.
	*
	* l points to the source string of length len
	* p is the output area (must be large enough!)
	* lc is the mule character set id for the local encoding
	* encoding is the PG identifier for the local encoding
	* tab holds conversion entries for the local charset
	* starting from 128 (0x80). each entry in the table holds the corresponding
	* code point for the mule encoding, or 0 if there is no equivalent code.
	*
	* Returns the number of input bytes consumed. If noError is true, this can
	* be less than 'len'.
	*/
	int
	latin2mic_with_table(const unsigned char *l,
						 unsigned char *p,
						 int len,
						 int lc,
						 int encoding,
						 const unsigned char *tab,
						 bool noError)
	{
		const unsigned char *const begin = l;

		for (; len > 0; l++, len--)
		{
			const unsigned char src = *l;

			if (src == 0)
			{
				/* an embedded NUL byte is treated as invalid input */
				if (noError)
					break;
				report_invalid_encoding(encoding, (const char *) l, len);
			}

			if (IS_HIGHBIT_SET(src))
			{
				/* look up the mule code; 0 means "no mapping" */
				const unsigned char mapped = tab[src - HIGHBIT];

				if (mapped == 0)
				{
					if (noError)
						break;
					report_untranslatable_char(encoding, PG_MULE_INTERNAL,
											   (const char *) l, len);
				}
				else
				{
					/* emit charset id byte followed by the mapped code */
					*p++ = lc;
					*p++ = mapped;
				}
			}
			else
				*p++ = src;			/* ASCII is copied unchanged */
		}
		*p = '\0';

		return l - begin;
	}

	/*
	* mic2latin_with_table: a generic single byte charset encoding
	* conversion from the mule internal code to a local charset.
	*
	* mic points to the source string of length len
	* p is the output area (must be large enough!)
	* lc is the mule character set id for the local encoding
	* encoding is the PG identifier for the local encoding
	* tab holds conversion entries for the mule internal code's second byte,
	* starting from 128 (0x80). each entry in the table holds the corresponding
	* code point for the local charset, or 0 if there is no equivalent code.
	*
	* Returns the number of input bytes consumed. If noError is true, this can
	* be less than 'len'.
	*/
	int
	mic2latin_with_table(const unsigned char *mic,
	unsigned char *p,
	int len,
	int lc,
	int encoding,
	const unsigned char *tab,
	bool noError)
	{
	const unsigned char *start = mic;
	unsigned char c1,
	c2;

	while (len > 0)
	{
	c1 = *mic;
	if (c1 == 0)
	{
	if (noError)
	break;
	report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
	}
	if (!IS_HIGHBIT_SET(c1))
	{
	/* easy for ASCII */
	*p++ = c1;
	mic++;
	len--;
	}
	else
	{
	int l = pg_mule_mblen(mic);

	if (len < l)
	{
	if (noError)
	break;
	report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
	len);
	}
	if (l != 2 \|\| c1 != lc \|\| !IS_HIGHBIT_SET(mic[1]) \|\|
	(c2 = tab[mic[1] - HIGHBIT]) == 0)
	{
	if (noError)
	break;
	report_untranslatable_char(PG_MULE_INTERNAL, encoding,
	(const char *) mic, len);
	break; /* keep compiler quiet */
	}
	*p++ = c2;
	mic += 2;
	len -= 2;
	}
	}
	*p = '\0';

	return mic - start;
	}

	/*
	* comparison routine for bsearch()
	* this routine is intended for combined UTF8 -> local code
	*/
	static int
	compare3(const void *p1, const void *p2)
	{
		uint32		s1,
					s2,
					d1,
					d2;

		/* p1 is the search key: two consecutive uint32 codes */
		s1 = *(const uint32 *) p1;
		s2 = *((const uint32 *) p1 + 1);
		/* p2 is a combined-map entry */
		d1 = ((const pg_utf_to_local_combined *) p2)->utf1;
		d2 = ((const pg_utf_to_local_combined *) p2)->utf2;

		/* order by the first code, then by the second */
		return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
	}

	/*
	* comparison routine for bsearch()
	* this routine is intended for local code -> combined UTF8
	*/
	static int
	compare4(const void *p1, const void *p2)
	{
		uint32		v1,
					v2;

		/* p1 is the search key (a single uint32 code), p2 a map entry */
		v1 = *(const uint32 *) p1;
		v2 = ((const pg_local_to_utf_combined *) p2)->code;
		return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
	}

	/*
	* store 32bit character representation into multibyte stream
	*/
	static inline unsigned char *
	store_coded_char(unsigned char *dest, uint32 code)
	{
		int			shift;

		/* walk from the most significant byte down; zero bytes are not stored */
		for (shift = 24; shift >= 0; shift -= 8)
		{
			const unsigned char byte = (unsigned char) (code >> shift);

			if (byte != 0)
				*dest++ = byte;
		}

		/* points just past the last byte written */
		return dest;
	}

	/*
	* Convert a character using a conversion radix tree.
	*
	* 'l' is the length of the input character in bytes, and b1-b4 are
	* the input character's bytes.
	*/
	static inline uint32
	pg_mb_radix_conv(const pg_mb_radix_tree *rt,
	int l,
	unsigned char b1,
	unsigned char b2,
	unsigned char b3,
	unsigned char b4)
	{
	if (l == 4)
	{
	/* 4-byte code */

	/* check code validity */
	if (b1 < rt->b4_1_lower \|\| b1 > rt->b4_1_upper \|\|
	b2 < rt->b4_2_lower \|\| b2 > rt->b4_2_upper \|\|
	b3 < rt->b4_3_lower \|\| b3 > rt->b4_3_upper \|\|
	b4 < rt->b4_4_lower \|\| b4 > rt->b4_4_upper)
	return 0;

	/* perform lookup */
	if (rt->chars32)
	{
	uint32 idx = rt->b4root;

	idx = rt->chars32[b1 + idx - rt->b4_1_lower];
	idx = rt->chars32[b2 + idx - rt->b4_2_lower];
	idx = rt->chars32[b3 + idx - rt->b4_3_lower];
	return rt->chars32[b4 + idx - rt->b4_4_lower];
	}
	else
	{
	uint16 idx = rt->b4root;

	idx = rt->chars16[b1 + idx - rt->b4_1_lower];
	idx = rt->chars16[b2 + idx - rt->b4_2_lower];
	idx = rt->chars16[b3 + idx - rt->b4_3_lower];
	return rt->chars16[b4 + idx - rt->b4_4_lower];
	}
	}
	else if (l == 3)
	{
	/* 3-byte code */

	/* check code validity */
	if (b2 < rt->b3_1_lower \|\| b2 > rt->b3_1_upper \|\|
	b3 < rt->b3_2_lower \|\| b3 > rt->b3_2_upper \|\|
	b4 < rt->b3_3_lower \|\| b4 > rt->b3_3_upper)
	return 0;

	/* perform lookup */
	if (rt->chars32)
	{
	uint32 idx = rt->b3root;

	idx = rt->chars32[b2 + idx - rt->b3_1_lower];
	idx = rt->chars32[b3 + idx - rt->b3_2_lower];
	return rt->chars32[b4 + idx - rt->b3_3_lower];
	}
	else
	{
	uint16 idx = rt->b3root;

	idx = rt->chars16[b2 + idx - rt->b3_1_lower];
	idx = rt->chars16[b3 + idx - rt->b3_2_lower];
	return rt->chars16[b4 + idx - rt->b3_3_lower];
	}
	}
	else if (l == 2)
	{
	/* 2-byte code */

	/* check code validity - first byte */
	if (b3 < rt->b2_1_lower \|\| b3 > rt->b2_1_upper \|\|
	b4 < rt->b2_2_lower \|\| b4 > rt->b2_2_upper)
	return 0;

	/* perform lookup */
	if (rt->chars32)
	{
	uint32 idx = rt->b2root;

	idx = rt->chars32[b3 + idx - rt->b2_1_lower];
	return rt->chars32[b4 + idx - rt->b2_2_lower];
	}
	else
	{
	uint16 idx = rt->b2root;

	idx = rt->chars16[b3 + idx - rt->b2_1_lower];
	return rt->chars16[b4 + idx - rt->b2_2_lower];
	}
	}
	else if (l == 1)
	{
	/* 1-byte code */

	/* check code validity - first byte */
	if (b4 < rt->b1_lower \|\| b4 > rt->b1_upper)
	return 0;

	/* perform lookup */
	if (rt->chars32)
	return rt->chars32[b4 + rt->b1root - rt->b1_lower];
	else
	return rt->chars16[b4 + rt->b1root - rt->b1_lower];
	}
	return 0; /* shouldn't happen */
	}

	/*
	 * UTF8 ---> local code
	 *
	 * utf: input string in UTF8 encoding (need not be null-terminated)
	 * len: length of input string (in bytes)
	 * iso: pointer to the output area (must be large enough!)
	 *		  (output string will be null-terminated)
	 * map: conversion map for single characters
	 * cmap: conversion map for combined characters
	 *		  (optional, pass NULL if none)
	 * cmapsize: number of entries in the conversion map for combined characters
	 *		  (optional, pass 0 if none)
	 * conv_func: algorithmic encoding conversion function
	 *		  (optional, pass NULL if none)
	 * encoding: PG identifier for the local encoding
	 * noError: if true, stop converting at the first invalid or untranslatable
	 *		  character instead of reporting it (an invalid 'encoding' number
	 *		  is still reported)
	 *
	 * For each character, the cmap (if provided) is consulted first; if no match,
	 * the map is consulted next; if still no match, the conv_func (if provided)
	 * is applied.  An error is raised if no match is found.
	 *
	 * See pg_wchar.h for more details about the data structures used here.
	 *
	 * Returns the number of input bytes consumed.  If noError is true, this can
	 * be less than 'len'.
	 */
	int
	UtfToLocal(const unsigned char *utf, int len,
			   unsigned char *iso,
			   const pg_mb_radix_tree *map,
			   const pg_utf_to_local_combined *cmap, int cmapsize,
			   utf_local_conversion_func conv_func,
			   int encoding, bool noError)
	{
		uint32		iutf;
		int			l;
		const pg_utf_to_local_combined *cp;
		const unsigned char *start = utf;

		if (!PG_VALID_ENCODING(encoding))
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("invalid encoding number: %d", encoding)));

		/* every "continue" below relies on the loop step subtracting l from len */
		for (; len > 0; len -= l)
		{
			/* bytes of the current character, right-aligned (last byte in b4) */
			unsigned char b1 = 0;
			unsigned char b2 = 0;
			unsigned char b3 = 0;
			unsigned char b4 = 0;

			/* "break" cases all represent errors */
			if (*utf == '\0')
				break;

			l = pg_utf_mblen(utf);
			if (len < l)
				break;

			if (!pg_utf8_islegal(utf, l))
				break;

			if (l == 1)
			{
				/* ASCII case is easy, assume it's one-to-one conversion */
				*iso++ = *utf++;
				continue;
			}

			/* collect coded char of length l */
			if (l == 2)
			{
				b3 = *utf++;
				b4 = *utf++;
			}
			else if (l == 3)
			{
				b2 = *utf++;
				b3 = *utf++;
				b4 = *utf++;
			}
			else if (l == 4)
			{
				b1 = *utf++;
				b2 = *utf++;
				b3 = *utf++;
				b4 = *utf++;
			}
			else
			{
				elog(ERROR, "unsupported character length %d", l);
				iutf = 0;			/* keep compiler quiet */
			}

			/*
			 * Widen to uint32 before shifting: shifting an int-promoted byte
			 * >= 0x80 left by 24 would overflow a signed int.
			 */
			iutf = ((uint32) b1 << 24 | (uint32) b2 << 16 |
					(uint32) b3 << 8 | (uint32) b4);

			/* First, try with combined map if possible */
			if (cmap && len > l)
			{
				/* remember the state after the first character for backtracking */
				const unsigned char *utf_save = utf;
				int			len_save = len;
				int			l_save = l;

				/* collect next character, same as above */
				len -= l;

				l = pg_utf_mblen(utf);
				if (len < l)
				{
					/* need more data to decide if this is a combined char */
					utf -= l_save;
					break;
				}

				if (!pg_utf8_islegal(utf, l))
				{
					if (!noError)
						report_invalid_encoding(PG_UTF8, (const char *) utf, len);
					/* back up so the first character counts as unconsumed */
					utf -= l_save;
					break;
				}

				/* We assume ASCII character cannot be in combined map */
				if (l > 1)
				{
					uint32		iutf2;
					uint32		cutf[2];

					/* same widening as above to avoid signed-shift overflow */
					if (l == 2)
					{
						iutf2 = (uint32) *utf++ << 8;
						iutf2 |= (uint32) *utf++;
					}
					else if (l == 3)
					{
						iutf2 = (uint32) *utf++ << 16;
						iutf2 |= (uint32) *utf++ << 8;
						iutf2 |= (uint32) *utf++;
					}
					else if (l == 4)
					{
						iutf2 = (uint32) *utf++ << 24;
						iutf2 |= (uint32) *utf++ << 16;
						iutf2 |= (uint32) *utf++ << 8;
						iutf2 |= (uint32) *utf++;
					}
					else
					{
						elog(ERROR, "unsupported character length %d", l);
						iutf2 = 0;	/* keep compiler quiet */
					}

					/* search key: the pair (first char, second char) */
					cutf[0] = iutf;
					cutf[1] = iutf2;

					cp = bsearch(cutf, cmap, cmapsize,
								 sizeof(pg_utf_to_local_combined), compare3);

					if (cp)
					{
						/*
						 * Both characters are consumed: len was already reduced
						 * by the first one, the loop step handles the second.
						 */
						iso = store_coded_char(iso, cp->code);
						continue;
					}
				}

				/* fail, so back up to reprocess second character next time */
				utf = utf_save;
				len = len_save;
				l = l_save;
			}

			/* Now check ordinary map */
			if (map)
			{
				uint32		converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);

				if (converted)
				{
					iso = store_coded_char(iso, converted);
					continue;
				}
			}

			/* if there's a conversion function, try that */
			if (conv_func)
			{
				uint32		converted = (*conv_func) (iutf);

				if (converted)
				{
					iso = store_coded_char(iso, converted);
					continue;
				}
			}

			/* failed to translate this character */
			utf -= l;
			if (noError)
				break;
			report_untranslatable_char(PG_UTF8, encoding,
									   (const char *) utf, len);
		}

		/* if we broke out of loop early, must be invalid input */
		if (len > 0 && !noError)
			report_invalid_encoding(PG_UTF8, (const char *) utf, len);

		*iso = '\0';

		return utf - start;
	}

	/*
	 * local code ---> UTF8
	 *
	 * iso: input string in local encoding (need not be null-terminated)
	 * len: length of input string (in bytes)
	 * utf: pointer to the output area (must be large enough!)
	 *		(output string will be null-terminated)
	 * map: conversion map for single characters (skipped if NULL)
	 * cmap: conversion map for combined characters
	 *		(optional, pass NULL if none)
	 * cmapsize: number of entries in the conversion map for combined characters
	 *		(optional, pass 0 if none)
	 * conv_func: algorithmic encoding conversion function
	 *		(optional, pass NULL if none)
	 * encoding: PG identifier for the local encoding
	 * noError: if true, stop at the first invalid or untranslatable character
	 *		instead of reporting it
	 *
	 * For each character, the map is consulted first; if no match, the cmap
	 * (if provided) is consulted next; if still no match, the conv_func
	 * (if provided) is applied.  An error is raised if no match is found.
	 * Note that the cmap is only consulted when a map is also supplied.
	 *
	 * Bytes without the high bit set are copied through unchanged.  A zero
	 * byte in the input is treated as invalid input.
	 *
	 * See pg_wchar.h for more details about the data structures used here.
	 *
	 * Returns the number of input bytes consumed.  If noError is true, this can
	 * be less than 'len'.
	 */
	int
	LocalToUtf(const unsigned char *iso, int len,
			   unsigned char *utf,
			   const pg_mb_radix_tree *map,
			   const pg_local_to_utf_combined *cmap, int cmapsize,
			   utf_local_conversion_func conv_func,
			   int encoding,
			   bool noError)
	{
		uint32		iiso;
		int			l;
		const pg_local_to_utf_combined *cp;
		const unsigned char *start = iso;

		if (!PG_VALID_ENCODING(encoding))
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("invalid encoding number: %d", encoding)));

		/*
		 * Each iteration handles one input character of l bytes; the loop
		 * increment then deducts those bytes from len.
		 */
		for (; len > 0; len -= l)
		{
			/*
			 * Bytes of the current character, right-aligned: a 1-byte
			 * character fills b4, a 2-byte one b3/b4, and so on.  Unused
			 * leading bytes stay zero.
			 */
			unsigned char b1 = 0;
			unsigned char b2 = 0;
			unsigned char b3 = 0;
			unsigned char b4 = 0;

			/* "break" cases all represent errors */
			if (*iso == '\0')
				break;

			if (!IS_HIGHBIT_SET(*iso))
			{
				/* ASCII case is easy, assume it's one-to-one conversion */
				*utf++ = *iso++;
				l = 1;			/* consumed by the loop increment */
				continue;
			}

			/* a negative result marks the character as invalid */
			l = pg_encoding_verifymbchar(encoding, (const char *) iso, len);
			if (l < 0)
				break;

			/* collect coded char of length l */
			if (l == 1)
				b4 = *iso++;
			else if (l == 2)
			{
				b3 = *iso++;
				b4 = *iso++;
			}
			else if (l == 3)
			{
				b2 = *iso++;
				b3 = *iso++;
				b4 = *iso++;
			}
			else if (l == 4)
			{
				b1 = *iso++;
				b2 = *iso++;
				b3 = *iso++;
				b4 = *iso++;
			}
			else
			{
				elog(ERROR, "unsupported character length %d", l);
				iiso = 0;		/* keep compiler quiet */
			}

			/*
			 * Widen to uint32 before shifting: an unsigned char promotes to
			 * (signed) int, and shifting a byte >= 0x80 left by 24 would
			 * overflow it, which is undefined behavior.
			 */
			iiso = ((uint32) b1 << 24) | ((uint32) b2 << 16) |
				((uint32) b3 << 8) | (uint32) b4;

			if (map)
			{
				uint32		converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);

				/* a zero result is treated as "no mapping" */
				if (converted)
				{
					utf = store_coded_char(utf, converted);
					continue;
				}

				/* If there's a combined character map, try that */
				if (cmap)
				{
					cp = bsearch(&iiso, cmap, cmapsize,
								 sizeof(pg_local_to_utf_combined), compare4);

					if (cp)
					{
						/* one local character expands to two UTF8 characters */
						utf = store_coded_char(utf, cp->utf1);
						utf = store_coded_char(utf, cp->utf2);
						continue;
					}
				}
			}

			/* if there's a conversion function, try that */
			if (conv_func)
			{
				uint32		converted = (*conv_func) (iiso);

				if (converted)
				{
					utf = store_coded_char(utf, converted);
					continue;
				}
			}

			/* failed to translate this character; point back at its start */
			iso -= l;
			if (noError)
				break;
			report_untranslatable_char(encoding, PG_UTF8,
									   (const char *) iso, len);
		}

		/* if we broke out of loop early, must be invalid input */
		if (len > 0 && !noError)
			report_invalid_encoding(encoding, (const char *) iso, len);

		*utf = '\0';

		/* number of input bytes consumed */
		return iso - start;
	}