src/backend/parser/parser.c - cloudberry - Git at Google

 /*-------------------------------------------------------------------------
  *
  * parser.c
  *		Main entry point/driver for PostgreSQL grammar
  *
  * Note that the grammar is not allowed to perform any table access
  * (since we need to be able to do basic parsing even while inside an
  * aborted transaction).  Therefore, the data structures returned by
  * the grammar are "raw" parsetrees that still need to be analyzed by
  * analyze.c and related files.
  *
  *
  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
  *	  src/backend/parser/parser.c
  *
  *-------------------------------------------------------------------------
  */

 #include "postgres.h"

 #include "mb/pg_wchar.h"
 #include "parser/gramparse.h"
 #include "parser/parser.h"
 #include "parser/scansup.h"

 /*
  * Forward declarations of file-local helpers that are used by base_yylex()
  * before their definitions appear further down in this file.
  */
 static bool check_uescapechar(unsigned char escape);
 static char *str_udeescape(const char *str, char escape,
 						   int position, core_yyscan_t yyscanner);

 #include "cdb/cdbvars.h"

 /*
  * raw_parser
  *		Run lexical and grammatical analysis over a query string.
  *
  * str: the query text to parse
  * mode: chooses the mode token (if any) that is handed to the grammar ahead
  *		of the first real token; RAW_PARSE_DEFAULT injects nothing
  *
  * Returns the list of raw (un-analyzed) parse trees built by the grammar, in
  * the form required by "mode", or NIL when base_yyparse() reports failure.
  */
 List *
 raw_parser(const char *str, RawParseMode mode)
 {
 	/* mode token injected as initial lookahead; indexed by RawParseMode enum */
 	static const int mode_token[] = {
 		0,						/* RAW_PARSE_DEFAULT */
 		MODE_TYPE_NAME,			/* RAW_PARSE_TYPE_NAME */
 		MODE_PLPGSQL_EXPR,		/* RAW_PARSE_PLPGSQL_EXPR */
 		MODE_PLPGSQL_ASSIGN1,	/* RAW_PARSE_PLPGSQL_ASSIGN1 */
 		MODE_PLPGSQL_ASSIGN2,	/* RAW_PARSE_PLPGSQL_ASSIGN2 */
 		MODE_PLPGSQL_ASSIGN3	/* RAW_PARSE_PLPGSQL_ASSIGN3 */
 	};

 	/*
 	 * In GPDB, escape_string_warning is switched off while the scanner is
 	 * being set up on a QE node.  For a PL/pgSQL function body (e.g. in a
 	 * CREATE FUNCTION command) the QD node should already have produced the
 	 * same warning.  The warning is not disabled on QEs altogether, because
 	 * it can still catch escaping bugs in internal queries dispatched from
 	 * the QD to the QEs.
 	 */
 	const bool	on_executor = (Gp_role == GP_ROLE_EXECUTE);
 	bool		saved_warning = escape_string_warning;
 	core_yyscan_t yyscanner;
 	base_yy_extra_type yyextra;
 	int			yyresult;

 	PG_TRY();
 	{
 		if (on_executor)
 			escape_string_warning = false;

 		/* set up the flex scanner over the query text */
 		yyscanner = scanner_init(str, &yyextra.core_yy_extra,
 								 &ScanKeywords, ScanKeywordTokens);

 		if (on_executor)
 			escape_string_warning = saved_warning;
 	}
 	PG_CATCH();
 	{
 		/* put the setting back before propagating the error */
 		if (on_executor)
 			escape_string_warning = saved_warning;
 		PG_RE_THROW();
 	}
 	PG_END_TRY();

 	yyextra.tail_partition_magic = false;

 	/* base_yylex() only needs the lookahead token initialized, if there is one */
 	yyextra.have_lookahead = (mode != RAW_PARSE_DEFAULT);
 	if (yyextra.have_lookahead)
 	{
 		yyextra.lookahead_token = mode_token[mode];
 		yyextra.lookahead_yylloc = 0;
 		yyextra.lookahead_end = NULL;
 	}

 	/* set up the bison parser, then run it */
 	parser_init(&yyextra);
 	yyresult = base_yyparse(yyscanner);

 	/* release the scanner's memory */
 	scanner_finish(yyscanner);

 	/* a nonzero result from base_yyparse() means the parse failed */
 	return (yyresult != 0) ? NIL : yyextra.parsetree;
 }


 /*
  * Intermediate filter between parser and core lexer (core_yylex in scan.l).
  *
  * This filter is needed because in some cases the standard SQL grammar
  * requires more than one token lookahead.  We reduce these cases to one-token
  * lookahead by replacing tokens here, in order to keep the grammar LALR(1).
  *
  * Using a filter is simpler than trying to recognize multiword tokens
  * directly in scan.l, because we'd have to allow for comments between the
  * words.  Furthermore it's not clear how to do that without re-introducing
  * scanner backtrack, which would cost more performance than this filter
  * layer does.
  *
  * We also use this filter to convert UIDENT and USCONST sequences into
  * plain IDENT and SCONST tokens.  While that could be handled by additional
  * productions in the main grammar, it's more efficient to do it like this.
  *
  * The filter also provides a convenient place to translate between
  * the core_YYSTYPE and YYSTYPE representations (which are really the
  * same thing anyway, but notationally they're different).
  *
  * lvalp: receives the semantic value of the token being returned
  * llocp: receives the location of the token being returned; it is also used
  *		here as an offset into the scan buffer
  * yyscanner: scanner handle; its extra data (base_yy_extra_type) carries the
  *		one-token lookahead buffer and the tail_partition_magic flag
  *
  * Returns the token code to hand to the grammar, possibly replaced by a
  * *_LA variant, PARTITION_TAIL, IDENT or SCONST as described above.
  */
 int
 base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
 {
 	base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner);
 	int			cur_token;
 	int			next_token;
 	int			cur_token_length;
 	YYLTYPE		cur_yylloc;

 	/* Get next token --- we might already have it */
 	if (yyextra->have_lookahead)
 	{
 		cur_token = yyextra->lookahead_token;
 		lvalp->core_yystype = yyextra->lookahead_yylval;
 		*llocp = yyextra->lookahead_yylloc;
 		/* Undo the '\0' we stored after the previous token, if we did so */
 		if (yyextra->lookahead_end)
 			*(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
 		yyextra->have_lookahead = false;
 	}
 	else
 		cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);

 	/*
 	 * Check for special handling of PARTITION keyword. (see
 	 * OptFirstPartitionSpec rule in the grammar)
 	 *
 	 * The flag is cleared as soon as one PARTITION has been rewritten, so
 	 * only the first PARTITION seen while it is set becomes PARTITION_TAIL.
 	 */
 	if (yyextra->tail_partition_magic)
 	{
 		if (cur_token == PARTITION)
 		{
 			yyextra->tail_partition_magic = false;
 			return PARTITION_TAIL;
 		}
 	}

 	/*
 	 * If this token isn't one that requires lookahead, just return it.  If it
 	 * does, determine the token length.  (We could get that via strlen(), but
 	 * since we have such a small set of possibilities, hardwiring seems
 	 * feasible and more efficient --- at least for the fixed-length cases.)
 	 */
 	switch (cur_token)
 	{
 		case NOT:
 			cur_token_length = 3;
 			break;
 		case NULLS_P:
 			cur_token_length = 5;
 			break;
 		case WITH:
 			cur_token_length = 4;
 			break;
 		case UIDENT:
 		case USCONST:
 			/* variable-length token: measure it up to the stored '\0' */
 			cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp);
 			break;
 		default:
 			return cur_token;
 	}

 	/*
 	 * Identify end+1 of current token.  core_yylex() has temporarily stored a
 	 * '\0' here, and will undo that when we call it again.  We need to redo
 	 * it to fully revert the lookahead call for error reporting purposes.
 	 */
 	yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf +
 		*llocp + cur_token_length;
 	Assert(*(yyextra->lookahead_end) == '\0');

 	/*
 	 * Save and restore *llocp around the call.  It might look like we could
 	 * avoid this by just passing &lookahead_yylloc to core_yylex(), but that
 	 * does not work because flex actually holds onto the last-passed pointer
 	 * internally, and will use that for error reporting.  We need any error
 	 * reports to point to the current token, not the next one.
 	 */
 	cur_yylloc = *llocp;

 	/* Get next token, saving outputs into lookahead variables */
 	next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner);
 	yyextra->lookahead_token = next_token;
 	yyextra->lookahead_yylloc = *llocp;

 	*llocp = cur_yylloc;

 	/* Now revert the un-truncation of the current token */
 	yyextra->lookahead_hold_char = *(yyextra->lookahead_end);
 	*(yyextra->lookahead_end) = '\0';

 	/* The next call will hand back the buffered token instead of lexing */
 	yyextra->have_lookahead = true;

 	/* Replace cur_token if needed, based on lookahead */
 	switch (cur_token)
 	{
 		case NOT:
 			/* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
 			switch (next_token)
 			{
 				case BETWEEN:
 				case IN_P:
 				case LIKE:
 				case ILIKE:
 				case SIMILAR:
 					cur_token = NOT_LA;
 					break;
 			}
 			break;

 		case NULLS_P:
 			/* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
 			switch (next_token)
 			{
 				case FIRST_P:
 				case LAST_P:
 					cur_token = NULLS_LA;
 					break;
 			}
 			break;

 		case WITH:
 			/* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
 			switch (next_token)
 			{
 				case TIME:
 				case ORDINALITY:
 					cur_token = WITH_LA;
 					break;
 			}
 			break;

 		case UIDENT:
 		case USCONST:
 			/* Look ahead for UESCAPE */
 			if (next_token == UESCAPE)
 			{
 				/* Yup, so get third token, which had better be SCONST */
 				const char *escstr;

 				/* Again save and restore *llocp */
 				cur_yylloc = *llocp;

 				/* Un-truncate current token so errors point to third token */
 				*(yyextra->lookahead_end) = yyextra->lookahead_hold_char;

 				/* Get third token */
 				next_token = core_yylex(&(yyextra->lookahead_yylval),
 										llocp, yyscanner);

 				/* If we throw error here, it will point to third token */
 				if (next_token != SCONST)
 					scanner_yyerror("UESCAPE must be followed by a simple string literal",
 									yyscanner);

 				/* The escape must be exactly one acceptable character */
 				escstr = yyextra->lookahead_yylval.str;
 				if (strlen(escstr) != 1 || !check_uescapechar(escstr[0]))
 					scanner_yyerror("invalid Unicode escape character",
 									yyscanner);

 				/* Now restore *llocp; errors will point to first token */
 				*llocp = cur_yylloc;

 				/* Apply Unicode conversion */
 				lvalp->core_yystype.str =
 					str_udeescape(lvalp->core_yystype.str,
 								  escstr[0],
 								  *llocp,
 								  yyscanner);

 				/*
 				 * We don't need to revert the un-truncation of UESCAPE.  What
 				 * we do want to do is clear have_lookahead, thereby consuming
 				 * all three tokens.
 				 */
 				yyextra->have_lookahead = false;
 			}
 			else
 			{
 				/* No UESCAPE, so convert using default escape character */
 				lvalp->core_yystype.str =
 					str_udeescape(lvalp->core_yystype.str,
 								  '\\',
 								  *llocp,
 								  yyscanner);
 			}

 			/* The grammar only ever sees the plain IDENT / SCONST forms */
 			if (cur_token == UIDENT)
 			{
 				/* It's an identifier, so truncate as appropriate */
 				truncate_identifier(lvalp->core_yystype.str,
 									strlen(lvalp->core_yystype.str),
 									true);
 				cur_token = IDENT;
 			}
 			else if (cur_token == USCONST)
 			{
 				cur_token = SCONST;
 			}
 			break;
 	}

 	return cur_token;
 }

 /*
  * Map one hexadecimal digit character to its numeric value (0..15).
  *
  * Callers are expected to have verified the character with isxdigit()
  * already; anything else is reported through elog(ERROR).
  */
 static unsigned int
 hexval(unsigned char c)
 {
 	unsigned int digit;

 	if ('0' <= c && c <= '9')
 		digit = c - '0';
 	else if ('a' <= c && c <= 'f')
 		digit = c - 'a' + 0xA;
 	else if ('A' <= c && c <= 'F')
 		digit = c - 'A' + 0xA;
 	else
 	{
 		elog(ERROR, "invalid hexadecimal digit");
 		digit = 0;				/* not reached */
 	}

 	return digit;
 }

 /*
  * Verify that 'c' is an acceptable Unicode code point.
  *
  * Returns normally when is_valid_unicode_codepoint() accepts the value;
  * otherwise reports a syntax error through ereport(ERROR).
  */
 static void
 check_unicode_value(pg_wchar c)
 {
 	if (is_valid_unicode_codepoint(c))
 		return;

 	ereport(ERROR,
 			(errcode(ERRCODE_SYNTAX_ERROR),
 			 errmsg("invalid Unicode escape value")));
 }

 /*
  * Decide whether 'escape' may be used as the Unicode escape character
  * (UESCAPE syntax).
  *
  * Returns false for hexadecimal digits, '+', single quote, double quote and
  * anything scanner_isspace() treats as whitespace; true for everything else.
  */
 static bool
 check_uescapechar(unsigned char escape)
 {
 	switch (escape)
 	{
 		case '+':
 		case '\'':
 		case '"':
 			return false;
 		default:
 			break;
 	}

 	return !(isxdigit(escape) || scanner_isspace(escape));
 }

 /*
  * Process Unicode escapes in "str", producing a palloc'd plain string
  *
  * escape: the escape character to use
  * position: start position of U&'' or U&"" string token
  * yyscanner: context information needed for error reports
  */
 static char *
 str_udeescape(const char *str, char escape,
 			  int position, core_yyscan_t yyscanner)
 {
 	const char *in;
 	char	   *new,
 			   *out;
 	size_t		new_len;
 	pg_wchar	pair_first = 0;
 	ScannerCallbackState scbstate;

 	/*
 	 * Guesstimate that result will be no longer than input, but allow enough
 	 * padding for Unicode conversion.
 	 */
 	new_len = strlen(str) + MAX_UNICODE_EQUIVALENT_STRING + 1;
 	new = palloc(new_len);

 	in = str;
 	out = new;
 	while (*in)
 	{
 		/* Enlarge string if needed */
 		size_t		out_dist = out - new;

 		if (out_dist > new_len - (MAX_UNICODE_EQUIVALENT_STRING + 1))
 		{
 			new_len *= 2;
 			new = repalloc(new, new_len);
 			out = new + out_dist;
 		}

 		if (in[0] == escape)
 		{
 			/*
 			 * Any errors reported while processing this escape sequence will
 			 * have an error cursor pointing at the escape.
 			 */
 			setup_scanner_errposition_callback(&scbstate, yyscanner,
 											   in - str + position + 3);	/* 3 for U&" */
 			if (in[1] == escape)
 			{
 				if (pair_first)
 					goto invalid_pair;
 				*out++ = escape;
 				in += 2;
 			}
 			else if (isxdigit((unsigned char) in[1]) &&
 					 isxdigit((unsigned char) in[2]) &&
 					 isxdigit((unsigned char) in[3]) &&
 					 isxdigit((unsigned char) in[4]))
 			{
 				pg_wchar	unicode;

 				unicode = (hexval(in[1]) << 12) +
 					(hexval(in[2]) << 8) +
 					(hexval(in[3]) << 4) +
 					hexval(in[4]);
 				check_unicode_value(unicode);
 				if (pair_first)
 				{
 					if (is_utf16_surrogate_second(unicode))
 					{
 						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
 						pair_first = 0;
 					}
 					else
 						goto invalid_pair;
 				}
 				else if (is_utf16_surrogate_second(unicode))
 					goto invalid_pair;

 				if (is_utf16_surrogate_first(unicode))
 					pair_first = unicode;
 				else
 				{
 					pg_unicode_to_server(unicode, (unsigned char *) out);
 					out += strlen(out);
 				}
 				in += 5;
 			}
 			else if (in[1] == '+' &&
 					 isxdigit((unsigned char) in[2]) &&
 					 isxdigit((unsigned char) in[3]) &&
 					 isxdigit((unsigned char) in[4]) &&
 					 isxdigit((unsigned char) in[5]) &&
 					 isxdigit((unsigned char) in[6]) &&
 					 isxdigit((unsigned char) in[7]))
 			{
 				pg_wchar	unicode;

 				unicode = (hexval(in[2]) << 20) +
 					(hexval(in[3]) << 16) +
 					(hexval(in[4]) << 12) +
 					(hexval(in[5]) << 8) +
 					(hexval(in[6]) << 4) +
 					hexval(in[7]);
 				check_unicode_value(unicode);
 				if (pair_first)
 				{
 					if (is_utf16_surrogate_second(unicode))
 					{
 						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
 						pair_first = 0;
 					}
 					else
 						goto invalid_pair;
 				}
 				else if (is_utf16_surrogate_second(unicode))
 					goto invalid_pair;

 				if (is_utf16_surrogate_first(unicode))
 					pair_first = unicode;
 				else
 				{
 					pg_unicode_to_server(unicode, (unsigned char *) out);
 					out += strlen(out);
 				}
 				in += 8;
 			}
 			else
 				ereport(ERROR,
 						(errcode(ERRCODE_SYNTAX_ERROR),
 						 errmsg("invalid Unicode escape"),
 						 errhint("Unicode escapes must be \\XXXX or \\+XXXXXX.")));

 			cancel_scanner_errposition_callback(&scbstate);
 		}
 		else
 		{
 			if (pair_first)
 				goto invalid_pair;

 			*out++ = *in++;
 		}
 	}

 	/* unfinished surrogate pair? */
 	if (pair_first)
 		goto invalid_pair;

 	*out = '\0';
 	return new;

 	/*
 	 * We might get here with the error callback active, or not.  Call
 	 * scanner_errposition to make sure an error cursor appears; if the
 	 * callback is active, this is duplicative but harmless.
 	 */
 invalid_pair:
 	ereport(ERROR,
 			(errcode(ERRCODE_SYNTAX_ERROR),
 			 errmsg("invalid Unicode surrogate pair"),
 			 scanner_errposition(in - str + position + 3,	/* 3 for U&" */
 								 yyscanner)));
 	return NULL;				/* keep compiler quiet */
 }
	/*-------------------------------------------------------------------------
	*
	* parser.c
	* Main entry point/driver for PostgreSQL grammar
	*
	* Note that the grammar is not allowed to perform any table access
	* (since we need to be able to do basic parsing even while inside an
	* aborted transaction). Therefore, the data structures returned by
	* the grammar are "raw" parsetrees that still need to be analyzed by
	* analyze.c and related files.
	*
	*
	* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
	* Portions Copyright (c) 1994, Regents of the University of California
	*
	* IDENTIFICATION
	* src/backend/parser/parser.c
	*
	*-------------------------------------------------------------------------
	*/

	#include "postgres.h"

	#include "mb/pg_wchar.h"
	#include "parser/gramparse.h"
	#include "parser/parser.h"
	#include "parser/scansup.h"

	/*
	 * Forward declarations of file-local helpers.  str_udeescape() takes and
	 * returns pointers to strings, matching its definition below.
	 */
	static bool check_uescapechar(unsigned char escape);
	static char *str_udeescape(const char *str, char escape,
							   int position, core_yyscan_t yyscanner);

	#include "cdb/cdbvars.h"

	/*
	 * raw_parser
	 *		Run lexical and grammatical analysis over a query string.
	 *
	 * str: the query text to parse
	 * mode: chooses the mode token (if any) that is handed to the grammar
	 *		ahead of the first real token; RAW_PARSE_DEFAULT injects nothing
	 *
	 * Returns the list of raw (un-analyzed) parse trees built by the grammar,
	 * in the form required by "mode", or NIL when base_yyparse() reports
	 * failure.
	 */
	List *
	raw_parser(const char *str, RawParseMode mode)
	{
		/* mode token injected as initial lookahead; indexed by RawParseMode enum */
		static const int mode_token[] = {
			0,						/* RAW_PARSE_DEFAULT */
			MODE_TYPE_NAME,			/* RAW_PARSE_TYPE_NAME */
			MODE_PLPGSQL_EXPR,		/* RAW_PARSE_PLPGSQL_EXPR */
			MODE_PLPGSQL_ASSIGN1,	/* RAW_PARSE_PLPGSQL_ASSIGN1 */
			MODE_PLPGSQL_ASSIGN2,	/* RAW_PARSE_PLPGSQL_ASSIGN2 */
			MODE_PLPGSQL_ASSIGN3	/* RAW_PARSE_PLPGSQL_ASSIGN3 */
		};

		/*
		 * In GPDB, escape_string_warning is switched off while the scanner is
		 * being set up on a QE node.  For a PL/pgSQL function body (e.g. in a
		 * CREATE FUNCTION command) the QD node should already have produced
		 * the same warning.  The warning is not disabled on QEs altogether,
		 * because it can still catch escaping bugs in internal queries
		 * dispatched from the QD to the QEs.
		 */
		const bool	on_executor = (Gp_role == GP_ROLE_EXECUTE);
		bool		saved_warning = escape_string_warning;
		core_yyscan_t yyscanner;
		base_yy_extra_type yyextra;
		int			yyresult;

		PG_TRY();
		{
			if (on_executor)
				escape_string_warning = false;

			/* set up the flex scanner over the query text */
			yyscanner = scanner_init(str, &yyextra.core_yy_extra,
									 &ScanKeywords, ScanKeywordTokens);

			if (on_executor)
				escape_string_warning = saved_warning;
		}
		PG_CATCH();
		{
			/* put the setting back before propagating the error */
			if (on_executor)
				escape_string_warning = saved_warning;
			PG_RE_THROW();
		}
		PG_END_TRY();

		yyextra.tail_partition_magic = false;

		/* base_yylex() only needs the lookahead token initialized, if there is one */
		yyextra.have_lookahead = (mode != RAW_PARSE_DEFAULT);
		if (yyextra.have_lookahead)
		{
			yyextra.lookahead_token = mode_token[mode];
			yyextra.lookahead_yylloc = 0;
			yyextra.lookahead_end = NULL;
		}

		/* set up the bison parser, then run it */
		parser_init(&yyextra);
		yyresult = base_yyparse(yyscanner);

		/* release the scanner's memory */
		scanner_finish(yyscanner);

		/* a nonzero result from base_yyparse() means the parse failed */
		return (yyresult != 0) ? NIL : yyextra.parsetree;
	}


	/*
	 * Intermediate filter between parser and core lexer (core_yylex in scan.l).
	 *
	 * This filter is needed because in some cases the standard SQL grammar
	 * requires more than one token lookahead.  We reduce these cases to
	 * one-token lookahead by replacing tokens here, in order to keep the
	 * grammar LALR(1).
	 *
	 * Using a filter is simpler than trying to recognize multiword tokens
	 * directly in scan.l, because we'd have to allow for comments between the
	 * words.  Furthermore it's not clear how to do that without re-introducing
	 * scanner backtrack, which would cost more performance than this filter
	 * layer does.
	 *
	 * We also use this filter to convert UIDENT and USCONST sequences into
	 * plain IDENT and SCONST tokens.  While that could be handled by additional
	 * productions in the main grammar, it's more efficient to do it like this.
	 *
	 * The filter also provides a convenient place to translate between
	 * the core_YYSTYPE and YYSTYPE representations (which are really the
	 * same thing anyway, but notationally they're different).
	 *
	 * lvalp: receives the semantic value of the token being returned
	 * llocp: receives the location of the token being returned; it is also
	 *		used here as an offset into the scan buffer
	 * yyscanner: scanner handle; its extra data (base_yy_extra_type) carries
	 *		the one-token lookahead buffer and the tail_partition_magic flag
	 *
	 * Returns the token code to hand to the grammar, possibly replaced by a
	 * *_LA variant, PARTITION_TAIL, IDENT or SCONST as described above.
	 */
	int
	base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
	{
		base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner);
		int			cur_token;
		int			next_token;
		int			cur_token_length;
		YYLTYPE		cur_yylloc;

		/* Get next token --- we might already have it */
		if (yyextra->have_lookahead)
		{
			cur_token = yyextra->lookahead_token;
			lvalp->core_yystype = yyextra->lookahead_yylval;
			*llocp = yyextra->lookahead_yylloc;
			/* Undo the '\0' we stored after the previous token, if we did so */
			if (yyextra->lookahead_end)
				*(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
			yyextra->have_lookahead = false;
		}
		else
			cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);

		/*
		 * Check for special handling of PARTITION keyword. (see
		 * OptFirstPartitionSpec rule in the grammar)
		 *
		 * The flag is cleared as soon as one PARTITION has been rewritten, so
		 * only the first PARTITION seen while it is set becomes
		 * PARTITION_TAIL.
		 */
		if (yyextra->tail_partition_magic)
		{
			if (cur_token == PARTITION)
			{
				yyextra->tail_partition_magic = false;
				return PARTITION_TAIL;
			}
		}

		/*
		 * If this token isn't one that requires lookahead, just return it.  If
		 * it does, determine the token length.  (We could get that via
		 * strlen(), but since we have such a small set of possibilities,
		 * hardwiring seems feasible and more efficient --- at least for the
		 * fixed-length cases.)
		 */
		switch (cur_token)
		{
			case NOT:
				cur_token_length = 3;
				break;
			case NULLS_P:
				cur_token_length = 5;
				break;
			case WITH:
				cur_token_length = 4;
				break;
			case UIDENT:
			case USCONST:
				/* variable-length token: measure it up to the stored '\0' */
				cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp);
				break;
			default:
				return cur_token;
		}

		/*
		 * Identify end+1 of current token.  core_yylex() has temporarily
		 * stored a '\0' here, and will undo that when we call it again.  We
		 * need to redo it to fully revert the lookahead call for error
		 * reporting purposes.
		 */
		yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf +
			*llocp + cur_token_length;
		Assert(*(yyextra->lookahead_end) == '\0');

		/*
		 * Save and restore *llocp around the call.  It might look like we
		 * could avoid this by just passing &lookahead_yylloc to core_yylex(),
		 * but that does not work because flex actually holds onto the
		 * last-passed pointer internally, and will use that for error
		 * reporting.  We need any error reports to point to the current token,
		 * not the next one.
		 */
		cur_yylloc = *llocp;

		/* Get next token, saving outputs into lookahead variables */
		next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner);
		yyextra->lookahead_token = next_token;
		yyextra->lookahead_yylloc = *llocp;

		*llocp = cur_yylloc;

		/* Now revert the un-truncation of the current token */
		yyextra->lookahead_hold_char = *(yyextra->lookahead_end);
		*(yyextra->lookahead_end) = '\0';

		/* The next call will hand back the buffered token instead of lexing */
		yyextra->have_lookahead = true;

		/* Replace cur_token if needed, based on lookahead */
		switch (cur_token)
		{
			case NOT:
				/* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
				switch (next_token)
				{
					case BETWEEN:
					case IN_P:
					case LIKE:
					case ILIKE:
					case SIMILAR:
						cur_token = NOT_LA;
						break;
				}
				break;

			case NULLS_P:
				/* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
				switch (next_token)
				{
					case FIRST_P:
					case LAST_P:
						cur_token = NULLS_LA;
						break;
				}
				break;

			case WITH:
				/* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
				switch (next_token)
				{
					case TIME:
					case ORDINALITY:
						cur_token = WITH_LA;
						break;
				}
				break;

			case UIDENT:
			case USCONST:
				/* Look ahead for UESCAPE */
				if (next_token == UESCAPE)
				{
					/* Yup, so get third token, which had better be SCONST */
					const char *escstr;

					/* Again save and restore *llocp */
					cur_yylloc = *llocp;

					/* Un-truncate current token so errors point to third token */
					*(yyextra->lookahead_end) = yyextra->lookahead_hold_char;

					/* Get third token */
					next_token = core_yylex(&(yyextra->lookahead_yylval),
											llocp, yyscanner);

					/* If we throw error here, it will point to third token */
					if (next_token != SCONST)
						scanner_yyerror("UESCAPE must be followed by a simple string literal",
										yyscanner);

					/* The escape must be exactly one acceptable character */
					escstr = yyextra->lookahead_yylval.str;
					if (strlen(escstr) != 1 || !check_uescapechar(escstr[0]))
						scanner_yyerror("invalid Unicode escape character",
										yyscanner);

					/* Now restore *llocp; errors will point to first token */
					*llocp = cur_yylloc;

					/* Apply Unicode conversion */
					lvalp->core_yystype.str =
						str_udeescape(lvalp->core_yystype.str,
									  escstr[0],
									  *llocp,
									  yyscanner);

					/*
					 * We don't need to revert the un-truncation of UESCAPE.
					 * What we do want to do is clear have_lookahead, thereby
					 * consuming all three tokens.
					 */
					yyextra->have_lookahead = false;
				}
				else
				{
					/* No UESCAPE, so convert using default escape character */
					lvalp->core_yystype.str =
						str_udeescape(lvalp->core_yystype.str,
									  '\\',
									  *llocp,
									  yyscanner);
				}

				/* The grammar only ever sees the plain IDENT / SCONST forms */
				if (cur_token == UIDENT)
				{
					/* It's an identifier, so truncate as appropriate */
					truncate_identifier(lvalp->core_yystype.str,
										strlen(lvalp->core_yystype.str),
										true);
					cur_token = IDENT;
				}
				else if (cur_token == USCONST)
				{
					cur_token = SCONST;
				}
				break;
		}

		return cur_token;
	}

	/*
	 * Map one hexadecimal digit character to its numeric value (0..15).
	 *
	 * Callers are expected to have verified the character with isxdigit()
	 * already; anything else is reported through elog(ERROR).
	 */
	static unsigned int
	hexval(unsigned char c)
	{
		unsigned int digit;

		if ('0' <= c && c <= '9')
			digit = c - '0';
		else if ('a' <= c && c <= 'f')
			digit = c - 'a' + 0xA;
		else if ('A' <= c && c <= 'F')
			digit = c - 'A' + 0xA;
		else
		{
			elog(ERROR, "invalid hexadecimal digit");
			digit = 0;				/* not reached */
		}

		return digit;
	}

	/*
	 * Verify that 'c' is an acceptable Unicode code point.
	 *
	 * Returns normally when is_valid_unicode_codepoint() accepts the value;
	 * otherwise reports a syntax error through ereport(ERROR).
	 */
	static void
	check_unicode_value(pg_wchar c)
	{
		if (is_valid_unicode_codepoint(c))
			return;

		ereport(ERROR,
				(errcode(ERRCODE_SYNTAX_ERROR),
				 errmsg("invalid Unicode escape value")));
	}

	/*
	 * Is 'escape' acceptable as Unicode escape character (UESCAPE syntax)?
	 *
	 * Returns false for hexadecimal digits, '+', single quote, double quote
	 * and anything scanner_isspace() treats as whitespace; true otherwise.
	 */
	static bool
	check_uescapechar(unsigned char escape)
	{
		if (isxdigit(escape)
			|| escape == '+'
			|| escape == '\''
			|| escape == '"'
			|| scanner_isspace(escape))
			return false;
		else
			return true;
	}

	/*
	 * Process Unicode escapes in "str", producing a palloc'd plain string
	 *
	 * str: the text of the U&'' or U&"" token, escapes not yet processed
	 * escape: the escape character to use
	 * position: start position of U&'' or U&"" string token
	 * yyscanner: context information needed for error reports
	 *
	 * Recognized sequences are a doubled escape character (yielding one
	 * literal escape character), the escape followed by four hex digits, and
	 * the escape followed by '+' and six hex digits.  Surrogate pairs written
	 * as two consecutive escapes are combined into a single code point.
	 *
	 * Returns the converted, NUL-terminated string.  A malformed escape, an
	 * invalid code point or an invalid surrogate pair is reported with
	 * ereport(ERROR), in which case this function does not return.
	 */
	static char *
	str_udeescape(const char *str, char escape,
				  int position, core_yyscan_t yyscanner)
	{
		const char *in;
		char	   *new,
				   *out;
		size_t		new_len;
		pg_wchar	pair_first = 0;
		ScannerCallbackState scbstate;

		/*
		 * Guesstimate that result will be no longer than input, but allow
		 * enough padding for Unicode conversion.
		 */
		new_len = strlen(str) + MAX_UNICODE_EQUIVALENT_STRING + 1;
		new = palloc(new_len);

		in = str;
		out = new;
		while (*in)
		{
			/* Enlarge string if needed */
			size_t		out_dist = out - new;

			if (out_dist > new_len - (MAX_UNICODE_EQUIVALENT_STRING + 1))
			{
				new_len *= 2;
				new = repalloc(new, new_len);
				out = new + out_dist;
			}

			if (in[0] == escape)
			{
				/*
				 * Any errors reported while processing this escape sequence
				 * will have an error cursor pointing at the escape.
				 */
				setup_scanner_errposition_callback(&scbstate, yyscanner,
												   in - str + position + 3);	/* 3 for U&" */
				if (in[1] == escape)
				{
					/* doubled escape character stands for itself */
					if (pair_first)
						goto invalid_pair;
					*out++ = escape;
					in += 2;
				}
				else if (isxdigit((unsigned char) in[1]) &&
						 isxdigit((unsigned char) in[2]) &&
						 isxdigit((unsigned char) in[3]) &&
						 isxdigit((unsigned char) in[4]))
				{
					/* escape + XXXX */
					pg_wchar	unicode;

					unicode = (hexval(in[1]) << 12) +
						(hexval(in[2]) << 8) +
						(hexval(in[3]) << 4) +
						hexval(in[4]);
					check_unicode_value(unicode);
					if (pair_first)
					{
						/* a first surrogate must be followed by a second one */
						if (is_utf16_surrogate_second(unicode))
						{
							unicode = surrogate_pair_to_codepoint(pair_first, unicode);
							pair_first = 0;
						}
						else
							goto invalid_pair;
					}
					else if (is_utf16_surrogate_second(unicode))
						goto invalid_pair;

					if (is_utf16_surrogate_first(unicode))
						pair_first = unicode;	/* hold until its partner arrives */
					else
					{
						pg_unicode_to_server(unicode, (unsigned char *) out);
						out += strlen(out);
					}
					in += 5;
				}
				else if (in[1] == '+' &&
						 isxdigit((unsigned char) in[2]) &&
						 isxdigit((unsigned char) in[3]) &&
						 isxdigit((unsigned char) in[4]) &&
						 isxdigit((unsigned char) in[5]) &&
						 isxdigit((unsigned char) in[6]) &&
						 isxdigit((unsigned char) in[7]))
				{
					/* escape + '+' + XXXXXX */
					pg_wchar	unicode;

					unicode = (hexval(in[2]) << 20) +
						(hexval(in[3]) << 16) +
						(hexval(in[4]) << 12) +
						(hexval(in[5]) << 8) +
						(hexval(in[6]) << 4) +
						hexval(in[7]);
					check_unicode_value(unicode);
					if (pair_first)
					{
						if (is_utf16_surrogate_second(unicode))
						{
							unicode = surrogate_pair_to_codepoint(pair_first, unicode);
							pair_first = 0;
						}
						else
							goto invalid_pair;
					}
					else if (is_utf16_surrogate_second(unicode))
						goto invalid_pair;

					if (is_utf16_surrogate_first(unicode))
						pair_first = unicode;
					else
					{
						pg_unicode_to_server(unicode, (unsigned char *) out);
						out += strlen(out);
					}
					in += 8;
				}
				else
					ereport(ERROR,
							(errcode(ERRCODE_SYNTAX_ERROR),
							 errmsg("invalid Unicode escape"),
							 errhint("Unicode escapes must be \\XXXX or \\+XXXXXX.")));

				cancel_scanner_errposition_callback(&scbstate);
			}
			else
			{
				/* an ordinary character may not follow a lone first surrogate */
				if (pair_first)
					goto invalid_pair;

				/* copy the character through unchanged */
				*out++ = *in++;
			}
		}

		/* unfinished surrogate pair? */
		if (pair_first)
			goto invalid_pair;

		*out = '\0';
		return new;

		/*
		 * We might get here with the error callback active, or not.  Call
		 * scanner_errposition to make sure an error cursor appears; if the
		 * callback is active, this is duplicative but harmless.
		 */
	invalid_pair:
		ereport(ERROR,
				(errcode(ERRCODE_SYNTAX_ERROR),
				 errmsg("invalid Unicode surrogate pair"),
				 scanner_errposition(in - str + position + 3,	/* 3 for U&" */
									 yyscanner)));
		return NULL;				/* keep compiler quiet */
	}