/* src/bin/psql/mbprint.c - hawq - Git at Google */

 /*
  * psql - the PostgreSQL interactive terminal
  *
  * Copyright (c) 2000-2010, PostgreSQL Global Development Group
  *
  * $PostgreSQL: pgsql/src/bin/psql/mbprint.c,v 1.38.6.1 2010/08/16 00:06:24 tgl Exp $
  *
  * XXX this file does not really belong in psql/.  Perhaps move to libpq?
  * It also seems that the mbvalidate function is redundant with existing
  * functionality.
  */

 #include "postgres_fe.h"
 #include "mbprint.h"
 #include "libpq-fe.h"
 #ifndef PGSCRIPTS
 #include "settings.h"
 #endif

 /*
  * To avoid version-skew problems, this file must not use declarations
  * from pg_wchar.h: the encoding IDs we are dealing with are determined
  * by the libpq.so we are linked with, and that might not match the
  * numbers we see at compile time.	(If this file were inside libpq,
  * the problem would go away...)
  *
  * Hence, we have our own definition of pg_wchar, and we get the values
  * of any needed encoding IDs on-the-fly.
  */

 typedef unsigned int pg_wchar;

 /*
  * Return the encoding ID that pg_char_to_encoding() reports for "utf8".
  *
  * The result is cached in a function-local static; the lookup is repeated
  * only while the cached value is still negative.
  */
 static int
 pg_get_utf8_id(void)
 {
 	static int	cached_id = -1;

 	if (cached_id >= 0)
 		return cached_id;

 	cached_id = pg_char_to_encoding("utf8");
 	return cached_id;
 }

 #define PG_UTF8		pg_get_utf8_id()


 static pg_wchar
 utf2ucs(const unsigned char *c)
 {
 	/*
 	 * one char version of pg_utf2wchar_with_len. no control here, c must
 	 * point to a large enough string
 	 */
 	if ((*c & 0x80) == 0)
 		return (pg_wchar) c[0];
 	else if ((*c & 0xe0) == 0xc0)
 		return (pg_wchar) (((c[0] & 0x1f) << 6) |
 						   (c[1] & 0x3f));
 	else if ((*c & 0xf0) == 0xe0)
 		return (pg_wchar) (((c[0] & 0x0f) << 12) |
 						   ((c[1] & 0x3f) << 6) |
 						   (c[2] & 0x3f));
 	else if ((*c & 0xf8) == 0xf0)
 		return (pg_wchar) (((c[0] & 0x07) << 18) |
 						   ((c[1] & 0x3f) << 12) |
 						   ((c[2] & 0x3f) << 6) |
 						   (c[3] & 0x3f));
 	else
 		/* that is an invalid code on purpose */
 		return 0xffffffff;
 }


 /*
  * Check the UTF-8 sequence starting at c.  Returns its length in bytes
  * (1 to 4) when it is acceptable, or -1 when it is not.
  *
  * Besides malformed input (stray continuation bytes, lead bytes 0xf8..0xff,
  * missing continuation bytes, overlong encodings) the following code points
  * are rejected:
  *		U+D800..U+DFFF		surrogates
  *		U+FDD0..U+FDEF		noncharacters
  *		U+xxFFFE, U+xxFFFF	noncharacters, in every plane
  *		above U+10FFFF
  *
  * c must point into a NUL-terminated string.  Continuation bytes are
  * examined left to right and the scan stops at the first byte that is not
  * of the form 10xxxxxx, so the terminator is never stepped over.
  */
 static int
 utf_charcheck(const unsigned char *c)
 {
 	unsigned int ucs;

 	if ((c[0] & 0x80) == 0)
 		return 1;				/* plain ASCII */

 	if ((c[0] & 0xe0) == 0xc0)
 	{
 		/* two-byte char; lead bytes 0xc0 and 0xc1 would be overlong */
 		if ((c[1] & 0xc0) != 0x80 || (c[0] & 0x1f) <= 0x01)
 			return -1;
 		return 2;
 	}

 	if ((c[0] & 0xf0) == 0xe0)
 	{
 		/* three-byte char */
 		if ((c[1] & 0xc0) != 0x80 || (c[2] & 0xc0) != 0x80)
 			return -1;

 		ucs = ((unsigned int) (c[0] & 0x0f) << 12) |
 			((unsigned int) (c[1] & 0x3f) << 6) |
 			(unsigned int) (c[2] & 0x3f);

 		if (ucs < 0x0800)
 			return -1;			/* overlong encoding */
 		if (ucs >= 0xd800 && ucs <= 0xdfff)
 			return -1;			/* surrogate */
 		if (ucs >= 0xfdd0 && ucs <= 0xfdef)
 			return -1;			/* noncharacter block */
 		if ((ucs & 0xfffe) == 0xfffe)
 			return -1;			/* U+FFFE or U+FFFF */
 		return 3;
 	}

 	if ((c[0] & 0xf8) == 0xf0)
 	{
 		/* four-byte char */
 		if ((c[1] & 0xc0) != 0x80 ||
 			(c[2] & 0xc0) != 0x80 ||
 			(c[3] & 0xc0) != 0x80)
 			return -1;

 		ucs = ((unsigned int) (c[0] & 0x07) << 18) |
 			((unsigned int) (c[1] & 0x3f) << 12) |
 			((unsigned int) (c[2] & 0x3f) << 6) |
 			(unsigned int) (c[3] & 0x3f);

 		if (ucs < 0x10000 || ucs > 0x10ffff)
 			return -1;			/* overlong, or beyond the Unicode range */
 		if ((ucs & 0xfffe) == 0xfffe)
 			return -1;			/* U+xxFFFE or U+xxFFFF */
 		return 4;
 	}

 	/* stray continuation byte, or lead byte 0xf8..0xff */
 	return -1;
 }


 /*
  * Remove, in place, every byte of the NUL-terminated string pwcs that does
  * not start a sequence accepted by utf_charcheck().  Accepted sequences are
  * shifted down over the gaps.  The string is re-terminated only if at least
  * one byte was dropped, so a fully valid string is never written to.
  */
 static void
 mb_utf_validate(unsigned char *pwcs)
 {
 	unsigned char *src = pwcs;	/* next byte to examine */
 	unsigned char *dst = pwcs;	/* where the next kept byte goes */

 	while (*src != '\0')
 	{
 		int			n = utf_charcheck(src);

 		if (n <= 0)
 		{
 			/* we skip the char */
 			src++;
 			continue;
 		}

 		if (dst == src)
 		{
 			/* nothing dropped so far: the bytes are already in place */
 			src += n;
 			dst += n;
 			continue;
 		}

 		while (n-- > 0)
 			*dst++ = *src++;
 	}

 	if (dst != src)
 		*dst = '\0';
 }

 /*
  * public functions : pg_wcswidth, pg_wcssize, pg_wcsformat and mbvalidate
  */

 /*
  * pg_wcswidth is the dumb width function. It assumes that everything will
  * only appear on one line. OTOH it is easier to use if this applies to you.
  *
  * Returns the sum of the display widths, as reported by PQdsplen(), of the
  * characters in the first len bytes of pwcs.  Characters whose reported
  * width is not positive add nothing.  Scanning stops early if the next
  * character would extend past the remaining len bytes.
  */
 int
 pg_wcswidth(const unsigned char *pwcs, size_t len, int encoding)
 {
 	int			width = 0;

 	while (len > 0)
 	{
 		int			chlen = PQmblen((const char *) pwcs, encoding);
 		int			chwidth;

 		if (len < (size_t) chlen)
 			break;				/* Invalid string */

 		chwidth = PQdsplen((const char *) pwcs, encoding);
 		if (chwidth > 0)
 			width += chwidth;

 		/*
 		 * Consume the character.  len must shrink along with the pointer
 		 * advancing, otherwise the loop would never see the end of the
 		 * input and would keep reading beyond it.
 		 */
 		pwcs += chlen;
 		len -= chlen;
 	}
 	return width;
 }

 /*
  * pg_wcssize takes the given string in the given encoding and returns three
  * values:
  *	  result_width: Width in display characters of the longest line in string
  *	  result_height: Number of lines in display output
  *	  result_format_size: Number of bytes required to store formatted
  *		representation of string
  *
  * Any of the three result pointers may be NULL, in which case that value is
  * not reported.  Scanning stops at the first NUL byte, once len bytes have
  * been consumed, or when the next character would extend past len bytes.
  *
  * This MUST be kept in sync with pg_wcsformat!
  */
 void
 pg_wcssize(unsigned char *pwcs, size_t len, int encoding,
 		   int *result_width, int *result_height, int *result_format_size)
 {
 	int			max_width = 0;	/* widest line seen so far */
 	int			cur_width = 0;	/* width of the line being scanned */
 	int			nlines = 1;
 	int			nbytes = 0;

 	while (*pwcs != '\0' && len > 0)
 	{
 		int			chlen = PQmblen((char *) pwcs, encoding);
 		int			dsplen;

 		if (len < (size_t) chlen)
 			break;
 		dsplen = PQdsplen((char *) pwcs, encoding);

 		if (chlen != 1)
 		{
 			if (dsplen < 0)
 			{
 				/* non-ASCII control char, takes six columns: \u0000 */
 				cur_width += 6;
 				nbytes += 6;
 			}
 			else
 			{
 				/* any other multibyte char is stored unchanged */
 				cur_width += dsplen;
 				nbytes += chlen;
 			}
 		}
 		else if (*pwcs == '\n')
 		{
 			/* newline closes the line; its terminating NUL costs a byte */
 			if (cur_width > max_width)
 				max_width = cur_width;
 			cur_width = 0;
 			nlines++;
 			nbytes++;
 		}
 		else if (*pwcs == '\r')
 		{
 			/* carriage return takes two columns */
 			cur_width += 2;
 			nbytes += 2;
 		}
 		else if (*pwcs == '\t')
 		{
 			/* tab: at least one column, up to the next multiple of 8 */
 			do
 			{
 				cur_width++;
 				nbytes++;
 			} while (cur_width % 8 != 0);
 		}
 		else if (dsplen < 0)
 		{
 			/* other single-byte control char takes four columns */
 			cur_width += 4;
 			nbytes += 4;
 		}
 		else
 		{
 			/* ordinary single-byte char, stored unchanged */
 			cur_width += dsplen;
 			nbytes++;
 		}

 		pwcs += chlen;
 		len -= chlen;
 	}

 	/* the last line has no newline to close it */
 	if (cur_width > max_width)
 		max_width = cur_width;
 	nbytes++;					/* For NUL char */

 	/* Set results */
 	if (result_width)
 		*result_width = max_width;
 	if (result_height)
 		*result_height = nlines;
 	if (result_format_size)
 		*result_format_size = nbytes;
 }

 /*
  *	Format a string into one or more "struct lineptr" lines.
  *	lines[i].ptr == NULL indicates the end of the array.
  *
  * On entry lines[0].ptr must point at the output buffer; every following
  * line is carved out of the same buffer.  Each newline in the input starts
  * a new entry, and the process exits with status 1 if that would need more
  * than count entries.  The NULL end marker is stored one entry past the
  * last line produced, so the array needs room for that extra entry.
  *
  * Control characters are written as escapes: \r, \xXX for other
  * single-byte ones and \uXXXX for multibyte ones.  Tabs are expanded to
  * blanks up to the next multiple of 8 columns.
  *
  * This MUST be kept in sync with pg_wcssize!
  */
 void
 pg_wcsformat(unsigned char *pwcs, size_t len, int encoding,
 			 struct lineptr * lines, int count)
 {
 	int			linewidth = 0;
 	unsigned char *ptr = lines->ptr;	/* Pointer to data area */

 	while (*pwcs != '\0' && len > 0)
 	{
 		int			chlen = PQmblen((char *) pwcs, encoding);
 		int			w;

 		if (len < (size_t) chlen)
 			break;
 		w = PQdsplen((char *) pwcs, encoding);

 		if (chlen != 1)
 		{
 			if (w < 0)			/* Non-ascii control char */
 			{
 				if (encoding == PG_UTF8)
 				{
 					/*
 					 * Exactly six bytes are accounted for this escape, but
 					 * utf2ucs() can return a value needing more than four
 					 * hex digits.  Bound the write to the six bytes plus
 					 * the NUL so that nothing beyond them is touched.
 					 */
 					snprintf((char *) ptr, 6 + 1, "\\u%04X", utf2ucs(pwcs));
 				}
 				else
 				{
 					/*
 					 * This case cannot happen in the current code because
 					 * only UTF-8 signals multibyte control characters. But
 					 * we may need to support it at some stage
 					 */
 					sprintf((char *) ptr, "\\u????");
 				}
 				ptr += 6;
 				linewidth += 6;
 			}
 			else				/* All other chars */
 			{
 				int			i;

 				for (i = 0; i < chlen; i++)
 					*ptr++ = pwcs[i];
 				linewidth += w;
 			}
 		}
 		else if (*pwcs == '\n')	/* Newline */
 		{
 			*ptr++ = '\0';
 			lines->width = linewidth;
 			linewidth = 0;
 			lines++;
 			count--;
 			if (count <= 0)
 				exit(1);		/* Screwup */

 			/* make next line point to remaining memory */
 			lines->ptr = ptr;
 		}
 		else if (*pwcs == '\r') /* Carriage return */
 		{
 			strcpy((char *) ptr, "\\r");
 			linewidth += 2;
 			ptr += 2;
 		}
 		else if (*pwcs == '\t') /* Tab */
 		{
 			do
 			{
 				*ptr++ = ' ';
 				linewidth++;
 			} while (linewidth % 8 != 0);
 		}
 		else if (w < 0)			/* Other control char */
 		{
 			sprintf((char *) ptr, "\\x%02X", *pwcs);
 			linewidth += 4;
 			ptr += 4;
 		}
 		else					/* Output it as-is */
 		{
 			linewidth += w;
 			*ptr++ = *pwcs;
 		}

 		pwcs += chlen;
 		len -= chlen;
 	}
 	lines->width = linewidth;
 	*ptr = '\0';				/* Terminate formatted string */

 	if (count <= 0)
 		exit(1);				/* Screwup */

 	(lines + 1)->ptr = NULL;	/* terminate line array */
 }

 /*
  * Validate pwcs in place according to encoding and return pwcs.
  *
  * Only UTF-8 has a validation routine so far; strings in any other
  * encoding are returned untouched.  Other encodings needing validation
  * should add their own routines here.
  */
 unsigned char *
 mbvalidate(unsigned char *pwcs, int encoding)
 {
 	if (encoding != PG_UTF8)
 		return pwcs;

 	mb_utf_validate(pwcs);
 	return pwcs;
 }
	/*
	* psql - the PostgreSQL interactive terminal
	*
	* Copyright (c) 2000-2010, PostgreSQL Global Development Group
	*
	* $PostgreSQL: pgsql/src/bin/psql/mbprint.c,v 1.38.6.1 2010/08/16 00:06:24 tgl Exp $
	*
	* XXX this file does not really belong in psql/. Perhaps move to libpq?
	* It also seems that the mbvalidate function is redundant with existing
	* functionality.
	*/

	#include "postgres_fe.h"
	#include "mbprint.h"
	#include "libpq-fe.h"
	#ifndef PGSCRIPTS
	#include "settings.h"
	#endif

	/*
	* To avoid version-skew problems, this file must not use declarations
	* from pg_wchar.h: the encoding IDs we are dealing with are determined
	* by the libpq.so we are linked with, and that might not match the
	* numbers we see at compile time. (If this file were inside libpq,
	* the problem would go away...)
	*
	* Hence, we have our own definition of pg_wchar, and we get the values
	* of any needed encoding IDs on-the-fly.
	*/

	typedef unsigned int pg_wchar;

	/*
	 * Return the encoding ID that pg_char_to_encoding() reports for "utf8".
	 *
	 * The result is cached in a function-local static; the lookup is repeated
	 * only while the cached value is still negative.
	 */
	static int
	pg_get_utf8_id(void)
	{
		static int	cached_id = -1;

		if (cached_id >= 0)
			return cached_id;

		cached_id = pg_char_to_encoding("utf8");
		return cached_id;
	}

	#define PG_UTF8 pg_get_utf8_id()


	static pg_wchar
	utf2ucs(const unsigned char *c)
	{
	/*
	* one char version of pg_utf2wchar_with_len. no control here, c must
	* point to a large enough string
	*/
	if ((*c & 0x80) == 0)
	return (pg_wchar) c[0];
	else if ((*c & 0xe0) == 0xc0)
	return (pg_wchar) (((c[0] & 0x1f) << 6) \|
	(c[1] & 0x3f));
	else if ((*c & 0xf0) == 0xe0)
	return (pg_wchar) (((c[0] & 0x0f) << 12) \|
	((c[1] & 0x3f) << 6) \|
	(c[2] & 0x3f));
	else if ((*c & 0xf8) == 0xf0)
	return (pg_wchar) (((c[0] & 0x07) << 18) \|
	((c[1] & 0x3f) << 12) \|
	((c[2] & 0x3f) << 6) \|
	(c[3] & 0x3f));
	else
	/* that is an invalid code on purpose */
	return 0xffffffff;
	}


	/*
	 * Check the UTF-8 sequence starting at c.  Returns its length in bytes
	 * (1 to 4) when it is acceptable, or -1 when it is not.
	 *
	 * Besides malformed input (stray continuation bytes, lead bytes 0xf8..0xff,
	 * missing continuation bytes, overlong encodings) the following code points
	 * are rejected:
	 *		U+D800..U+DFFF		surrogates
	 *		U+FDD0..U+FDEF		noncharacters
	 *		U+xxFFFE, U+xxFFFF	noncharacters, in every plane
	 *		above U+10FFFF
	 *
	 * c must point into a NUL-terminated string.  Continuation bytes are
	 * examined left to right and the scan stops at the first byte that is not
	 * of the form 10xxxxxx, so the terminator is never stepped over.
	 */
	static int
	utf_charcheck(const unsigned char *c)
	{
		unsigned int ucs;

		if ((c[0] & 0x80) == 0)
			return 1;				/* plain ASCII */

		if ((c[0] & 0xe0) == 0xc0)
		{
			/* two-byte char; lead bytes 0xc0 and 0xc1 would be overlong */
			if ((c[1] & 0xc0) != 0x80 || (c[0] & 0x1f) <= 0x01)
				return -1;
			return 2;
		}

		if ((c[0] & 0xf0) == 0xe0)
		{
			/* three-byte char */
			if ((c[1] & 0xc0) != 0x80 || (c[2] & 0xc0) != 0x80)
				return -1;

			ucs = ((unsigned int) (c[0] & 0x0f) << 12) |
				((unsigned int) (c[1] & 0x3f) << 6) |
				(unsigned int) (c[2] & 0x3f);

			if (ucs < 0x0800)
				return -1;			/* overlong encoding */
			if (ucs >= 0xd800 && ucs <= 0xdfff)
				return -1;			/* surrogate */
			if (ucs >= 0xfdd0 && ucs <= 0xfdef)
				return -1;			/* noncharacter block */
			if ((ucs & 0xfffe) == 0xfffe)
				return -1;			/* U+FFFE or U+FFFF */
			return 3;
		}

		if ((c[0] & 0xf8) == 0xf0)
		{
			/* four-byte char */
			if ((c[1] & 0xc0) != 0x80 ||
				(c[2] & 0xc0) != 0x80 ||
				(c[3] & 0xc0) != 0x80)
				return -1;

			ucs = ((unsigned int) (c[0] & 0x07) << 18) |
				((unsigned int) (c[1] & 0x3f) << 12) |
				((unsigned int) (c[2] & 0x3f) << 6) |
				(unsigned int) (c[3] & 0x3f);

			if (ucs < 0x10000 || ucs > 0x10ffff)
				return -1;			/* overlong, or beyond the Unicode range */
			if ((ucs & 0xfffe) == 0xfffe)
				return -1;			/* U+xxFFFE or U+xxFFFF */
			return 4;
		}

		/* stray continuation byte, or lead byte 0xf8..0xff */
		return -1;
	}


	/*
	 * Remove, in place, every byte of the NUL-terminated string pwcs that does
	 * not start a sequence accepted by utf_charcheck().  Accepted sequences are
	 * shifted down over the gaps.  The string is re-terminated only if at least
	 * one byte was dropped, so a fully valid string is never written to.
	 */
	static void
	mb_utf_validate(unsigned char *pwcs)
	{
		unsigned char *src = pwcs;	/* next byte to examine */
		unsigned char *dst = pwcs;	/* where the next kept byte goes */

		while (*src != '\0')
		{
			int			n = utf_charcheck(src);

			if (n <= 0)
			{
				/* we skip the char */
				src++;
				continue;
			}

			if (dst == src)
			{
				/* nothing dropped so far: the bytes are already in place */
				src += n;
				dst += n;
				continue;
			}

			/* copy the bytes themselves, not the pointers */
			while (n-- > 0)
				*dst++ = *src++;
		}

		if (dst != src)
			*dst = '\0';
	}

	/*
	 * public functions : pg_wcswidth, pg_wcssize, pg_wcsformat and mbvalidate
	 */

	/*
	 * pg_wcswidth is the dumb width function. It assumes that everything will
	 * only appear on one line. OTOH it is easier to use if this applies to you.
	 *
	 * Returns the sum of the display widths, as reported by PQdsplen(), of the
	 * characters in the first len bytes of pwcs.  Characters whose reported
	 * width is not positive add nothing.  Scanning stops early if the next
	 * character would extend past the remaining len bytes.
	 */
	int
	pg_wcswidth(const unsigned char *pwcs, size_t len, int encoding)
	{
		int			width = 0;

		while (len > 0)
		{
			int			chlen = PQmblen((const char *) pwcs, encoding);
			int			chwidth;

			if (len < (size_t) chlen)
				break;				/* Invalid string */

			chwidth = PQdsplen((const char *) pwcs, encoding);
			if (chwidth > 0)
				width += chwidth;

			/*
			 * Consume the character.  len must shrink along with the pointer
			 * advancing, otherwise the loop would never see the end of the
			 * input and would keep reading beyond it.
			 */
			pwcs += chlen;
			len -= chlen;
		}
		return width;
	}

	/*
	 * pg_wcssize takes the given string in the given encoding and returns three
	 * values:
	 *	  result_width: Width in display characters of the longest line in string
	 *	  result_height: Number of lines in display output
	 *	  result_format_size: Number of bytes required to store formatted
	 *		representation of string
	 *
	 * Any of the three result pointers may be NULL, in which case that value is
	 * not reported.  Scanning stops at the first NUL byte, once len bytes have
	 * been consumed, or when the next character would extend past len bytes.
	 *
	 * This MUST be kept in sync with pg_wcsformat!
	 */
	void
	pg_wcssize(unsigned char *pwcs, size_t len, int encoding,
			   int *result_width, int *result_height, int *result_format_size)
	{
		int			max_width = 0;	/* widest line seen so far */
		int			cur_width = 0;	/* width of the line being scanned */
		int			nlines = 1;
		int			nbytes = 0;

		while (*pwcs != '\0' && len > 0)
		{
			int			chlen = PQmblen((char *) pwcs, encoding);
			int			dsplen;

			if (len < (size_t) chlen)
				break;
			dsplen = PQdsplen((char *) pwcs, encoding);

			if (chlen != 1)
			{
				if (dsplen < 0)
				{
					/* non-ASCII control char, takes six columns: \u0000 */
					cur_width += 6;
					nbytes += 6;
				}
				else
				{
					/* any other multibyte char is stored unchanged */
					cur_width += dsplen;
					nbytes += chlen;
				}
			}
			else if (*pwcs == '\n')
			{
				/* newline closes the line; its terminating NUL costs a byte */
				if (cur_width > max_width)
					max_width = cur_width;
				cur_width = 0;
				nlines++;
				nbytes++;
			}
			else if (*pwcs == '\r')
			{
				/* carriage return takes two columns */
				cur_width += 2;
				nbytes += 2;
			}
			else if (*pwcs == '\t')
			{
				/* tab: at least one column, up to the next multiple of 8 */
				do
				{
					cur_width++;
					nbytes++;
				} while (cur_width % 8 != 0);
			}
			else if (dsplen < 0)
			{
				/* other single-byte control char takes four columns */
				cur_width += 4;
				nbytes += 4;
			}
			else
			{
				/* ordinary single-byte char, stored unchanged */
				cur_width += dsplen;
				nbytes++;
			}

			pwcs += chlen;
			len -= chlen;
		}

		/* the last line has no newline to close it */
		if (cur_width > max_width)
			max_width = cur_width;
		nbytes++;					/* For NUL char */

		/* Set results */
		if (result_width)
			*result_width = max_width;
		if (result_height)
			*result_height = nlines;
		if (result_format_size)
			*result_format_size = nbytes;
	}

	/*
	* Format a string into one or more "struct lineptr" lines.
	* lines[i].ptr == NULL indicates the end of the array.
	*
	* This MUST be kept in sync with pg_wcssize!
	*/
	void
	pg_wcsformat(unsigned char *pwcs, size_t len, int encoding,
	struct lineptr * lines, int count)
	{
	int w,
	chlen = 0;
	int linewidth = 0;
	unsigned char ptr = lines->ptr; / Pointer to data area */

	for (; *pwcs && len > 0; pwcs += chlen)
	{
	chlen = PQmblen((char *) pwcs, encoding);
	if (len < (size_t) chlen)
	break;
	w = PQdsplen((char *) pwcs, encoding);

	if (chlen == 1) /* single-byte char */
	{
	if (pwcs == '\n') / Newline */
	{
	*ptr++ = '\0';
	lines->width = linewidth;
	linewidth = 0;
	lines++;
	count--;
	if (count <= 0)
	exit(1); /* Screwup */

	/* make next line point to remaining memory */
	lines->ptr = ptr;
	}
	else if (pwcs == '\r') / Linefeed */
	{
	strcpy((char *) ptr, "\\r");
	linewidth += 2;
	ptr += 2;
	}
	else if (pwcs == '\t') / Tab */
	{
	do
	{
	*ptr++ = ' ';
	linewidth++;
	} while (linewidth % 8 != 0);
	}
	else if (w < 0) /* Other control char */
	{
	sprintf((char ) ptr, "\\x%02X", pwcs);
	linewidth += 4;
	ptr += 4;
	}
	else /* Output it as-is */
	{
	linewidth += w;
	ptr++ = pwcs;
	}
	}
	else if (w < 0) /* Non-ascii control char */
	{
	if (encoding == PG_UTF8)
	sprintf((char *) ptr, "\\u%04X", utf2ucs(pwcs));
	else
	{
	/*
	* This case cannot happen in the current code because only
	* UTF-8 signals multibyte control characters. But we may need
	* to support it at some stage
	*/
	sprintf((char *) ptr, "\\u????");
	}
	ptr += 6;
	linewidth += 6;
	}
	else /* All other chars */
	{
	int i;

	for (i = 0; i < chlen; i++)
	*ptr++ = pwcs[i];
	linewidth += w;
	}
	len -= chlen;
	}
	lines->width = linewidth;
	ptr++ = '\0'; / Terminate formatted string */

	if (count <= 0)
	exit(1); /* Screwup */

	(lines + 1)->ptr = NULL; /* terminate line array */
	}

	/*
	 * Validate pwcs in place according to encoding and return pwcs.
	 *
	 * Only UTF-8 has a validation routine so far; strings in any other
	 * encoding are returned untouched.  Other encodings needing validation
	 * should add their own routines here.
	 */
	unsigned char *
	mbvalidate(unsigned char *pwcs, int encoding)
	{
		if (encoding != PG_UTF8)
			return pwcs;

		mb_utf_validate(pwcs);
		return pwcs;
	}