src/backend/parser/scan.l - cloudberry - Git at Google

 %top{
 /*-------------------------------------------------------------------------
  *
  * scan.l
  *	  lexical scanner for PostgreSQL
  *
  * NOTE NOTE NOTE:
  *
  * The rules in this file must be kept in sync with src/fe_utils/psqlscan.l
  * and src/interfaces/ecpg/preproc/pgc.l!
  *
  * The rules are designed so that the scanner never has to backtrack,
  * in the sense that there is always a rule that can match the input
  * consumed so far (the rule action may internally throw back some input
  * with yyless(), however).  As explained in the flex manual, this makes
  * for a useful speed increase --- several percent faster when measuring
  * raw parsing (Flex + Bison).  The extra complexity is mostly in the rules
  * for handling float numbers and continued string literals.  If you change
  * the lexical rules, verify that you haven't broken the no-backtrack
  * property by running flex with the "-b" option and checking that the
  * resulting "lex.backup" file says that no backing up is needed.  (As of
  * Postgres 9.2, this check is made automatically by the Makefile.)
  *
  *
  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
  *	  src/backend/parser/scan.l
  *
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"

 #include <ctype.h>
 #include <unistd.h>

 #include "common/string.h"
 #include "gramparse.h"
 #include "nodes/miscnodes.h"
 #include "parser/parser.h"		/* only needed for GUC variables */
 #include "parser/scansup.h"
 #include "port/pg_bitutils.h"
 #include "mb/pg_wchar.h"
 #include "utils/builtins.h"
 }

 %{

 /* LCOV_EXCL_START */

 /*
  * Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error).
  *
  * From this point on, any three-argument fprintf(file, fmt, msg) call in
  * this file is rewritten into fprintf_to_ereport(fmt, msg); the "file"
  * argument is simply dropped.
  */
 #undef fprintf
 #define fprintf(file, fmt, msg)  fprintf_to_ereport(fmt, msg)

 /*
  * fprintf_to_ereport
  *		Report "msg" through ereport() at ERROR level.
  *
  * "fmt" is accepted only so that the macro above has somewhere to pass it;
  * it is not used, and "msg" is always reported with a plain "%s" format.
  */
 static void
 fprintf_to_ereport(const char *fmt, const char *msg)
 {
 	ereport(ERROR, (errmsg_internal("%s", msg)));
 }

 /*
  * GUC variables.  This is a DIRECT violation of the warning given at the
  * head of gram.y, i.e., flex/bison code must not depend on any GUC
  * variables; as such, changing their values can induce very unintuitive
  * behavior.  But we shall have to live with it until we can remove these
  * variables.
  */
 int			backslash_quote = BACKSLASH_QUOTE_SAFE_ENCODING;
 bool		escape_string_warning = true;
 bool		standard_conforming_strings = true;

 /*
  * Constant data exported from this file.  This array maps from the
  * zero-based keyword numbers returned by ScanKeywordLookup to the
  * Bison token numbers needed by gram.y.  This is exported because
  * callers need to pass it to scanner_init, if they are using the
  * standard keyword list ScanKeywords.
  */
 #define PG_KEYWORD(kwname, value, category, collabel) value,

 const uint16 ScanKeywordTokens[] = {
 #include "parser/kwlist.h"
 };

 #undef PG_KEYWORD

 /*
  * Set the type of YYSTYPE.
  */
 #define YYSTYPE core_YYSTYPE

 /*
  * Set the type of yyextra.  All state variables used by the scanner should
  * be in yyextra, *not* statically allocated.
  */
 #define YY_EXTRA_TYPE core_yy_extra_type *

 /*
  * Each call to yylex must set yylloc to the location of the found token
  * (expressed as a byte offset from the start of the input text).
  * When we parse a token that requires multiple lexer rules to process,
  * this should be done in the first such rule, else yylloc will point
  * into the middle of the token.
  */
 #define SET_YYLLOC()  (*(yylloc) = yytext - yyextra->scanbuf)

 /*
  * Advance yylloc by the given number of bytes.
  */
 #define ADVANCE_YYLLOC(delta)  ( *(yylloc) += (delta) )

 /*
  * Sometimes, we do want yylloc to point into the middle of a token; this is
  * useful for instance to throw an error about an escape sequence within a
  * string literal.  But if we find no error there, we want to revert yylloc
  * to the token start, so that that's the location reported to the parser.
  * Use PUSH_YYLLOC/POP_YYLLOC to save/restore yylloc around such code.
  * (Currently the implied "stack" is just one location, but someday we might
  * need to nest these.)
  */
 #define PUSH_YYLLOC()	(yyextra->save_yylloc = *(yylloc))
 #define POP_YYLLOC()	(*(yylloc) = yyextra->save_yylloc)

 #define startlit()	( yyextra->literallen = 0 )
 static void addlit(char *ytext, int yleng, core_yyscan_t yyscanner);
 static void addlitchar(unsigned char ychar, core_yyscan_t yyscanner);
 static char *litbufdup(core_yyscan_t yyscanner);
 static unsigned char unescape_single_char(unsigned char c, core_yyscan_t yyscanner);
 static int	process_integer_literal(const char *token, YYSTYPE *lval, int base);
 static void addunicode(pg_wchar c, yyscan_t yyscanner);

 #define yyerror(msg)  scanner_yyerror(msg, yyscanner)

 #define lexer_errposition()  scanner_errposition(*(yylloc), yyscanner)

 static void check_string_escape_warning(unsigned char ychar, core_yyscan_t yyscanner);
 static void check_escape_warning(core_yyscan_t yyscanner);

 /*
  * Work around a bug in flex 2.5.35: it emits a couple of functions that
  * it forgets to emit declarations for.  Since we use -Wmissing-prototypes,
  * this would cause warnings.  Providing our own declarations should be
  * harmless even when the bug gets fixed.
  */
 extern int	core_yyget_column(yyscan_t yyscanner);
 extern void core_yyset_column(int column_no, yyscan_t yyscanner);

 %}

 %option reentrant
 %option bison-bridge
 %option bison-locations
 %option 8bit
 %option never-interactive
 %option nodefault
 %option noinput
 %option nounput
 %option noyywrap
 %option noyyalloc
 %option noyyrealloc
 %option noyyfree
 %option warn
 %option prefix="core_yy"

 /*
  * OK, here is a short description of lex/flex rules behavior.
  * The longest pattern which matches an input string is always chosen.
  * For equal-length patterns, the first occurring in the rules list is chosen.
  * INITIAL is the starting state, to which all non-conditional rules apply.
  * Exclusive states change parsing rules while the state is active.  When in
  * an exclusive state, only those rules defined for that state apply.
  *
  * We use exclusive states for quoted strings, extended comments,
  * and to eliminate parsing troubles for numeric strings.
  * Exclusive states:
  *  <xb> bit string literal
  *  <xc> extended C-style comments
  *  <xd> delimited identifiers (double-quoted identifiers)
  *  <xh> hexadecimal byte string
  *  <xq> standard quoted strings
  *  <xqs> quote stop (detect continued strings)
  *  <xe> extended quoted strings (support backslash escape sequences)
  *  <xdolq> $foo$ quoted strings
  *  <xui> quoted identifier with Unicode escapes
  *  <xus> quoted string with Unicode escapes
  *  <xeu> Unicode surrogate pair in extended quoted string
  *
  * Remember to add an <<EOF>> case whenever you add a new exclusive state!
  * The default one is probably not the right thing.
  */

 %x xb
 %x xc
 %x xd
 %x xh
 %x xq
 %x xqs
 %x xe
 %x xdolq
 %x xui
 %x xus
 %x xeu

 /*
  * In order to make the world safe for Windows and Mac clients as well as
  * Unix ones, we accept either \n or \r as a newline.  A DOS-style \r\n
  * sequence will be seen as two successive newlines, but that doesn't cause
  * any problems.  Comments that start with -- and extend to the next
  * newline are treated as equivalent to a single whitespace character.
  *
  * NOTE a fine point: if there is no newline following --, we will absorb
  * everything to the end of the input as a comment.  This is correct.  Older
  * versions of Postgres failed to recognize -- as a comment if the input
  * did not end with a newline.
  *
  * XXX perhaps \f (formfeed) should be treated as a newline as well?
  *
  * XXX if you change the set of whitespace characters, fix scanner_isspace()
  * to agree.
  */

 space			[ \t\n\r\f]
 horiz_space		[ \t\f]
 newline			[\n\r]
 non_newline		[^\n\r]

 comment			("--"{non_newline}*)

 whitespace		({space}+|{comment})

 /*
  * SQL requires at least one newline in the whitespace separating
  * string literals that are to be concatenated.  Silly, but who are we
  * to argue?  Note that {whitespace_with_newline} should not have * after
  * it, whereas {whitespace} should generally have a * after it...
  */

 special_whitespace		({space}+|{comment}{newline})
 horiz_whitespace		({horiz_space}|{comment})
 whitespace_with_newline	({horiz_whitespace}*{newline}{special_whitespace}*)

 quote			'
 /* If we see {quote} then {quotecontinue}, the quoted string continues */
 quotecontinue	{whitespace_with_newline}{quote}

 /*
  * {quotecontinuefail} is needed to avoid lexer backup when we fail to match
  * {quotecontinue}.  It might seem that this could just be {whitespace}*,
  * but if there's a dash after {whitespace_with_newline}, it must be consumed
  * to see if there's another dash --- which would start a {comment} and thus
  * allow continuation of the {quotecontinue} token.
  */
 quotecontinuefail	{whitespace}*"-"?

 /* Bit string
  * It is tempting to scan the string for only those characters
  * which are allowed. However, this leads to silently swallowed
  * characters if illegal characters are included in the string.
  * For example, if xbinside is [01] then B'ABCD' is interpreted
  * as a zero-length string, and the ABCD' is lost!
  * Better to pass the string forward and let the input routines
  * validate the contents.
  */
 xbstart			[bB]{quote}
 xbinside		[^']*

 /* Hexadecimal byte string */
 xhstart			[xX]{quote}
 xhinside		[^']*

 /* National character */
 xnstart			[nN]{quote}

 /* Quoted string that allows backslash escapes */
 xestart			[eE]{quote}
 xeinside		[^\\']+
 xeescape		[\\][^0-7]
 xeoctesc		[\\][0-7]{1,3}
 xehexesc		[\\]x[0-9A-Fa-f]{1,2}
 xeunicode		[\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
 xeunicodefail	[\\](u[0-9A-Fa-f]{0,3}|U[0-9A-Fa-f]{0,7})

 /* Extended quote
  * xqdouble implements embedded quote, ''''
  */
 xqstart			{quote}
 xqdouble		{quote}{quote}
 xqinside		[^']+

 /* $foo$ style quotes ("dollar quoting")
  * The quoted string starts with $foo$ where "foo" is an optional string
  * in the form of an identifier, except that it may not contain "$",
  * and extends to the first occurrence of an identical string.
  * There is *no* processing of the quoted text.
  *
  * {dolqfailed} is an error rule to avoid scanner backup when {dolqdelim}
  * fails to match its trailing "$".
  */
 dolq_start		[A-Za-z\200-\377_]
 dolq_cont		[A-Za-z\200-\377_0-9]
 dolqdelim		\$({dolq_start}{dolq_cont}*)?\$
 dolqfailed		\${dolq_start}{dolq_cont}*
 dolqinside		[^$]+

 /* Double quote
  * Allows embedded spaces and other special characters into identifiers.
  */
 dquote			\"
 xdstart			{dquote}
 xdstop			{dquote}
 xddouble		{dquote}{dquote}
 xdinside		[^"]+

 /* Quoted identifier with Unicode escapes */
 xuistart		[uU]&{dquote}

 /* Quoted string with Unicode escapes */
 xusstart		[uU]&{quote}

 /* error rule to avoid backup */
 xufailed		[uU]&


 /* C-style comments
  *
  * The "extended comment" syntax closely resembles allowable operator syntax.
  * The tricky part here is to get lex to recognize a string starting with
  * slash-star as a comment, when interpreting it as an operator would produce
  * a longer match --- remember lex will prefer a longer match!  Also, if we
  * have something like plus-slash-star, lex will think this is a 3-character
  * operator whereas we want to see it as a + operator and a comment start.
  * The solution is two-fold:
  * 1. append {op_chars}* to xcstart so that it matches as much text as
  *    {operator} would. Then the tie-breaker (first matching rule of same
  *    length) ensures xcstart wins.  We put back the extra stuff with yyless()
  *    in case it contains a star-slash that should terminate the comment.
  * 2. In the operator rule, check for slash-star within the operator, and
  *    if found throw it back with yyless().  This handles the plus-slash-star
  *    problem.
  * Dash-dash comments have similar interactions with the operator rule.
  */
 xcstart			\/\*{op_chars}*
 xcstop			\*+\/
 xcinside		[^*/]+

 ident_start		[A-Za-z\200-\377_]
 ident_cont		[A-Za-z\200-\377_0-9\$]

 identifier		{ident_start}{ident_cont}*

 /* Assorted special-case operators and operator-like tokens */
 typecast		"::"
 dot_dot			\.\.
 colon_equals	":="

 /*
  * These operator-like tokens (unlike the above ones) also match the {operator}
  * rule, which means that they might be overridden by a longer match if they
  * are followed by a comment start or a + or - character. Accordingly, if you
  * add to this list, you must also add corresponding code to the {operator}
  * block to return the correct token in such cases. (This is not needed in
  * psqlscan.l since the token value is ignored there.)
  */
 equals_greater	"=>"
 less_equals		"<="
 greater_equals	">="
 less_greater	"<>"
 not_equals		"!="

 /*
  * "self" is the set of chars that should be returned as single-character
  * tokens.  "op_chars" is the set of chars that can make up "Op" tokens,
  * which can be one or more characters long (but if a single-char token
  * appears in the "self" set, it is not to be returned as an Op).  Note
  * that the sets overlap, but each has some chars that are not in the other.
  *
  * If you change either set, adjust the character lists appearing in the
  * rule for "operator"!
  */
 self			[,()\[\].;\:\+\-\*\/\%\^\<\>\=]
 op_chars		[\~\!\@\#\^\&\|\`\?\+\-\*\/\%\<\>\=]
 operator		{op_chars}+

 /*
  * Numbers
  *
  * Unary minus is not part of a number here.  Instead we pass it separately to
  * the parser, and there it gets coerced via doNegate().
  *
  * {numericfail} is used because we would like "1..10" to lex as 1, dot_dot, 10.
  *
  * {realfail} is added to prevent the need for scanner
  * backup when the {real} rule fails to match completely.
  */
 decdigit		[0-9]
 hexdigit		[0-9A-Fa-f]
 octdigit		[0-7]
 bindigit		[0-1]

 decinteger		{decdigit}(_?{decdigit})*
 hexinteger		0[xX](_?{hexdigit})+
 octinteger		0[oO](_?{octdigit})+
 bininteger		0[bB](_?{bindigit})+

 hexfail			0[xX]_?
 octfail			0[oO]_?
 binfail			0[bB]_?

 numeric			(({decinteger}\.{decinteger}?)|(\.{decinteger}))
 numericfail		{decinteger}\.\.

 real			({decinteger}|{numeric})[Ee][-+]?{decinteger}
 realfail		({decinteger}|{numeric})[Ee][-+]

 /* Positional parameters don't accept underscores. */
 param			\${decdigit}+

 /*
  * An identifier immediately following an integer literal is disallowed because
  * in some cases it's ambiguous what is meant: for example, 0x1234 could be
  * either a hexinteger or a decinteger "0" and an identifier "x1234".  We can
  * detect such problems by seeing if integer_junk matches a longer substring
  * than any of the XXXinteger patterns (decinteger, hexinteger, octinteger,
  * bininteger).  One "junk" pattern is sufficient because
  * {decinteger}{identifier} will match all the same strings we'd match with
  * {hexinteger}{identifier} etc.
  *
  * Note that the rule for integer_junk must appear after the ones for
  * XXXinteger to make this work correctly: 0x1234 will match both hexinteger
  * and integer_junk, and we need hexinteger to be chosen in that case.
  *
  * Also disallow strings matched by numeric_junk, real_junk and param_junk
  * for consistency.
  */
 integer_junk	{decinteger}{identifier}
 numeric_junk	{numeric}{identifier}
 real_junk		{real}{identifier}
 param_junk		\${decdigit}+{identifier}

 other			.

 /*
  * Dollar quoted strings are totally opaque, and no escaping is done on them.
  * Other quoted strings must allow some special characters such as single-quote
  *  and newline.
  * Embedded single-quotes are implemented both in the SQL standard
  *  style of two adjacent single quotes "''" and in the Postgres/Java style
  *  of escaped-quote "\'".
  * Other embedded escaped characters are matched explicitly and the leading
  *  backslash is dropped from the string.
  * Note that xcstart must appear before operator, as explained above!
  *  Also whitespace (comment) must appear before operator.
  */

 %%

 {whitespace}	{
 					/* ignore */
 				}

 {xcstart}		{
 					/*
 					 * Start of an outermost comment.  Record its location
 					 * so that a syntax error inside the comment can point
 					 * at it.
 					 */
 					SET_YYLLOC();
 					BEGIN(xc);
 					yyextra->xcdepth = 0;
 					/* Keep only the slash-star; rescan whatever followed */
 					yyless(2);
 				}

 <xc>{
 {xcstart}		{
 					/* A nested comment start: go one level deeper */
 					yyextra->xcdepth += 1;
 					/* Keep only the slash-star; rescan whatever followed */
 					yyless(2);
 				}

 {xcstop}		{
 					/* Close one nesting level, or leave the comment */
 					if (yyextra->xcdepth > 0)
 						yyextra->xcdepth -= 1;
 					else
 						BEGIN(INITIAL);
 				}

 {xcinside}		{
 					/* comment text is discarded */
 				}

 {op_chars}		{
 					/* comment text is discarded */
 				}

 \*+				{
 					/* comment text is discarded */
 				}

 <<EOF>>			{
 					yyerror("unterminated /* comment");
 				}
 } /* <xc> */

 {xbstart}		{
 					/* Binary bit type.
 					 * At some point we should simply pass the string
 					 * forward to the parser and label it there.
 					 * In the meantime, place a leading "b" on the string
 					 * to mark it for the input routine as a binary string.
 					 */
 					SET_YYLLOC();
 					BEGIN(xb);
 					startlit();
 					addlitchar('b', yyscanner);
 				}
 <xh>{xhinside}	|
 <xb>{xbinside}	{
 					addlit(yytext, yyleng, yyscanner);
 				}
 <xb><<EOF>>		{ yyerror("unterminated bit string literal"); }

 {xhstart}		{
 					/* Hexadecimal bit type.
 					 * At some point we should simply pass the string
 					 * forward to the parser and label it there.
 					 * In the meantime, place a leading "x" on the string
 					 * to mark it for the input routine as a hex string.
 					 */
 					SET_YYLLOC();
 					BEGIN(xh);
 					startlit();
 					addlitchar('x', yyscanner);
 				}
 <xh><<EOF>>		{ yyerror("unterminated hexadecimal string literal"); }

 {xnstart}		{
 					/*
 					 * National character literal.  Only the leading 'n' is
 					 * consumed here; it is reported as an internally
 					 * generated NCHAR keyword, and the quoted part is then
 					 * scanned as a normal character string.
 					 */
 					int			nchar_kw;

 					SET_YYLLOC();
 					yyless(1);	/* keep just the 'n' this time */

 					nchar_kw = ScanKeywordLookup("nchar",
 												 yyextra->keywordlist);
 					if (nchar_kw < 0)
 					{
 						/* NCHAR isn't a keyword here, so just return "n" */
 						yylval->str = pstrdup("n");
 						return IDENT;
 					}

 					yylval->keyword = GetScanKeyword(nchar_kw,
 													 yyextra->keywordlist);
 					return yyextra->keyword_tokens[nchar_kw];
 				}

 {xqstart}		{
 					/*
 					 * Plain quoted string.  It is scanned as a standard
 					 * string (state xq) when standard_conforming_strings is
 					 * set, and as an extended string (state xe) otherwise.
 					 */
 					SET_YYLLOC();
 					startlit();
 					yyextra->saw_non_ascii = false;
 					yyextra->warn_on_first_escape = true;
 					BEGIN(yyextra->standard_conforming_strings ? xq : xe);
 				}
 {xestart}		{
 					/*
 					 * Explicit extended string; warn_on_first_escape is
 					 * cleared for this syntax.
 					 */
 					SET_YYLLOC();
 					startlit();
 					yyextra->saw_non_ascii = false;
 					yyextra->warn_on_first_escape = false;
 					BEGIN(xe);
 				}
 {xusstart}		{
 					/*
 					 * String with Unicode escapes; rejected outright unless
 					 * standard_conforming_strings is on.
 					 */
 					SET_YYLLOC();
 					if (!yyextra->standard_conforming_strings)
 						ereport(ERROR,
 								(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 								 errmsg("unsafe use of string constant with Unicode escapes"),
 								 errdetail("String constants with Unicode escapes cannot be used when standard_conforming_strings is off."),
 								 lexer_errposition()));
 					startlit();
 					BEGIN(xus);
 				}

 <xb,xh,xq,xe,xus>{quote} {
 					/*
 					 * When we are scanning a quoted string and see an end
 					 * quote, we must look ahead for a possible continuation.
 					 * If we don't see one, we know the end quote was in fact
 					 * the end of the string.  To reduce the lexer table size,
 					 * we use a single "xqs" state to do the lookahead for all
 					 * types of strings.
 					 */
 					yyextra->state_before_str_stop = YYSTATE;
 					BEGIN(xqs);
 				}
 <xqs>{quotecontinue} {
 					/*
 					 * Found a quote continuation, so return to the in-quote
 					 * state and continue scanning the literal.  Nothing is
 					 * added to the literal's contents.
 					 */
 					BEGIN(yyextra->state_before_str_stop);
 				}
 <xqs>{quotecontinuefail} |
 <xqs>{other} |
 <xqs><<EOF>>	{
 					/*
 					 * Failed to see a quote continuation.  Throw back
 					 * everything after the end quote, and handle the string
 					 * according to the state we were in previously.
 					 */
 					yyless(0);
 					BEGIN(INITIAL);

 					switch (yyextra->state_before_str_stop)
 					{
 						case xb:
 							yylval->str = litbufdup(yyscanner);
 							return BCONST;
 						case xh:
 							yylval->str = litbufdup(yyscanner);
 							return XCONST;
 						case xq:
 						case xe:
 							/*
 							 * Check that the data remains valid, if it might
 							 * have been made invalid by unescaping any chars.
 							 */
 							if (yyextra->saw_non_ascii)
 								pg_verifymbstr(yyextra->literalbuf,
 											   yyextra->literallen,
 											   false);
 							yylval->str = litbufdup(yyscanner);
 							return SCONST;
 						case xus:
 							yylval->str = litbufdup(yyscanner);
 							return USCONST;
 						default:
 							yyerror("unhandled previous state in xqs");
 					}
 				}

 <xq,xe,xus>{xqdouble} {
 					addlitchar('\'', yyscanner);
 				}
 <xq,xus>{xqinside}  {
 					addlit(yytext, yyleng, yyscanner);
 				}
 <xe>{xeinside}  {
 					addlit(yytext, yyleng, yyscanner);
 				}
 <xe>{xeunicode} {
 					/*
 					 * Unicode escape inside an extended string.  yytext + 2
 					 * skips the backslash and the u/U, leaving the 4 or 8
 					 * hex digits matched by the pattern.
 					 */
 					pg_wchar	c = strtoul(yytext + 2, NULL, 16);

 					/*
 					 * For consistency with other productions, issue any
 					 * escape warning with cursor pointing to start of string.
 					 * We might want to change that, someday.
 					 */
 					check_escape_warning(yyscanner);

 					/* Remember start of overall string token ... */
 					PUSH_YYLLOC();
 					/* ... and set the error cursor to point at this esc seq */
 					SET_YYLLOC();

 					if (is_utf16_surrogate_first(c))
 					{
 						/*
 						 * First half of a surrogate pair: stash it and
 						 * switch to xeu, where the second half is required.
 						 */
 						yyextra->utf16_first_part = c;
 						BEGIN(xeu);
 					}
 					else if (is_utf16_surrogate_second(c))
 						/* a second half with no first half before it */
 						yyerror("invalid Unicode surrogate pair");
 					else
 						addunicode(c, yyscanner);

 					/* Restore yylloc to be start of string token */
 					POP_YYLLOC();
 				}
 <xeu>{xeunicode} {
 					/*
 					 * The escape following a first surrogate half; it must
 					 * be the matching second half.
 					 */
 					pg_wchar	c = strtoul(yytext + 2, NULL, 16);

 					/* Remember start of overall string token ... */
 					PUSH_YYLLOC();
 					/* ... and set the error cursor to point at this esc seq */
 					SET_YYLLOC();

 					/*
 					 * NOTE(review): the code below relies on yyerror() not
 					 * returning -- confirm against scanner_yyerror().
 					 */
 					if (!is_utf16_surrogate_second(c))
 						yyerror("invalid Unicode surrogate pair");

 					/* Combine with the half saved by the <xe> rule */
 					c = surrogate_pair_to_codepoint(yyextra->utf16_first_part, c);

 					addunicode(c, yyscanner);

 					/* Restore yylloc to be start of string token */
 					POP_YYLLOC();

 					/* Pair complete: resume normal extended-string scanning */
 					BEGIN(xe);
 				}
 <xeu>. |
 <xeu>\n |
 <xeu><<EOF>>	{
 					/* Set the error cursor to point at missing esc seq */
 					SET_YYLLOC();
 					yyerror("invalid Unicode surrogate pair");
 				}
 <xe,xeu>{xeunicodefail}	{
 					/* Set the error cursor to point at malformed esc seq */
 					SET_YYLLOC();
 					ereport(ERROR,
 							(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
 							 errmsg("invalid Unicode escape"),
 							 errhint("Unicode escapes must be \\uXXXX or \\UXXXXXXXX."),
 							 lexer_errposition()));
 				}
 <xe>{xeescape}  {
 					/* The character that follows the backslash */
 					unsigned char escaped = yytext[1];

 					/*
 					 * A backslash-quote is refused when backslash_quote is
 					 * off, or when it is "safe_encoding" and the client
 					 * encoding is a client-only one.
 					 */
 					if (escaped == '\'' &&
 						(yyextra->backslash_quote == BACKSLASH_QUOTE_OFF ||
 						 (yyextra->backslash_quote == BACKSLASH_QUOTE_SAFE_ENCODING &&
 						  PG_ENCODING_IS_CLIENT_ONLY(pg_get_client_encoding()))))
 						ereport(ERROR,
 								(errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
 								 errmsg("unsafe use of \\' in a string literal"),
 								 errhint("Use '' to write quotes in strings. \\' is insecure in client-only encodings."),
 								 lexer_errposition()));

 					check_string_escape_warning(escaped, yyscanner);
 					addlitchar(unescape_single_char(escaped, yyscanner),
 							   yyscanner);
 				}
 <xe>{xeoctesc}  {
 					/* Octal escape: the digits start right after the backslash */
 					unsigned char octal_byte = strtoul(yytext + 1, NULL, 8);

 					check_escape_warning(yyscanner);
 					addlitchar(octal_byte, yyscanner);
 					/* Note bytes that call for re-validation of the literal */
 					if (IS_HIGHBIT_SET(octal_byte) || octal_byte == '\0')
 						yyextra->saw_non_ascii = true;
 				}
 <xe>{xehexesc}  {
 					/* Hex escape: the digits start after the backslash and x */
 					unsigned char hex_byte = strtoul(yytext + 2, NULL, 16);

 					check_escape_warning(yyscanner);
 					addlitchar(hex_byte, yyscanner);
 					/* Note bytes that call for re-validation of the literal */
 					if (IS_HIGHBIT_SET(hex_byte) || hex_byte == '\0')
 						yyextra->saw_non_ascii = true;
 				}
 <xe>.			{
 					/* This is only needed for \ just before EOF */
 					addlitchar(yytext[0], yyscanner);
 				}
 <xq,xe,xus><<EOF>>		{ yyerror("unterminated quoted string"); }

 {dolqdelim}		{
 					SET_YYLLOC();
 					yyextra->dolqstart = pstrdup(yytext);
 					BEGIN(xdolq);
 					startlit();
 				}
 {dolqfailed}	{
 					SET_YYLLOC();
 					/* throw back all but the initial "$" */
 					yyless(1);
 					/* and treat it as {other} */
 					return yytext[0];
 				}
 <xdolq>{dolqdelim} {
 					if (strcmp(yytext, yyextra->dolqstart) != 0)
 					{
 						/*
 						 * Not the delimiter that opened this string.  Copy
 						 * the $... part into the literal, but put back the
 						 * final $ so that it gets rescanned.  Consider
 						 * $delim$...$junk$delim$
 						 */
 						addlit(yytext, yyleng - 1, yyscanner);
 						yyless(yyleng - 1);
 					}
 					else
 					{
 						/* Matching delimiter: the literal is complete */
 						pfree(yyextra->dolqstart);
 						yyextra->dolqstart = NULL;
 						BEGIN(INITIAL);
 						yylval->str = litbufdup(yyscanner);
 						return SCONST;
 					}
 				}
 <xdolq>{dolqinside} {
 					addlit(yytext, yyleng, yyscanner);
 				}
 <xdolq>{dolqfailed} {
 					addlit(yytext, yyleng, yyscanner);
 				}
 <xdolq>.		{
 					/* This is only needed for $ inside the quoted text */
 					addlitchar(yytext[0], yyscanner);
 				}
 <xdolq><<EOF>>	{ yyerror("unterminated dollar-quoted string"); }

 {xdstart}		{
 					SET_YYLLOC();
 					BEGIN(xd);
 					startlit();
 				}
 {xuistart}		{
 					SET_YYLLOC();
 					BEGIN(xui);
 					startlit();
 				}
 <xd>{xdstop}	{
 					/* Closing quote of a delimited identifier */
 					BEGIN(INITIAL);
 					if (yyextra->literallen == 0)
 						yyerror("zero-length delimited identifier");
 					yylval->str = litbufdup(yyscanner);
 					/* Overlong names are truncated in the copy just made */
 					if (yyextra->literallen >= NAMEDATALEN)
 						truncate_identifier(yylval->str,
 											yyextra->literallen,
 											true);
 					return IDENT;
 				}
 <xui>{dquote}	{
 					/* Closing quote of an identifier with Unicode escapes */
 					BEGIN(INITIAL);
 					if (yyextra->literallen == 0)
 						yyerror("zero-length delimited identifier");
 					/* truncation has to wait until the ident is de-escaped */
 					yylval->str = litbufdup(yyscanner);
 					return UIDENT;
 				}
 <xd,xui>{xddouble}	{
 					addlitchar('"', yyscanner);
 				}
 <xd,xui>{xdinside}	{
 					addlit(yytext, yyleng, yyscanner);
 				}
 <xd,xui><<EOF>>		{ yyerror("unterminated quoted identifier"); }

 {xufailed}	{
 					SET_YYLLOC();
 					/* keep only the leading u/U; the '&' gets rescanned */
 					yyless(1);
 					/* what is left is handled just like {identifier} */
 					yylval->str = downcase_truncate_identifier(yytext,
 															   yyleng,
 															   true);
 					return IDENT;
 				}

 {typecast}		{
 					SET_YYLLOC();
 					return TYPECAST;
 				}

 {dot_dot}		{
 					SET_YYLLOC();
 					return DOT_DOT;
 				}

 {colon_equals}	{
 					SET_YYLLOC();
 					return COLON_EQUALS;
 				}

 {equals_greater} {
 					SET_YYLLOC();
 					return EQUALS_GREATER;
 				}

 {less_equals}	{
 					SET_YYLLOC();
 					return LESS_EQUALS;
 				}

 {greater_equals} {
 					SET_YYLLOC();
 					return GREATER_EQUALS;
 				}

 {less_greater}	{
 					/* We accept both "<>" and "!=" as meaning NOT_EQUALS */
 					SET_YYLLOC();
 					return NOT_EQUALS;
 				}

 {not_equals}	{
 					/* We accept both "<>" and "!=" as meaning NOT_EQUALS */
 					SET_YYLLOC();
 					return NOT_EQUALS;
 				}

 {self}			{
 					SET_YYLLOC();
 					return yytext[0];
 				}

 {operator}		{
 					/*
 					 * An embedded slash-star or dash-dash starts a comment,
 					 * so the operator has to stop just before it.  (When
 					 * either one sits at the very first character, a prior
 					 * rule matches instead of this one.)
 					 */
 					int			nchars = yyleng;
 					const char *comment_at = strstr(yytext, "/*");
 					const char *dashes_at = strstr(yytext, "--");

 					/* of the two possible comment starts, use the earlier */
 					if (dashes_at != NULL &&
 						(comment_at == NULL || dashes_at < comment_at))
 						comment_at = dashes_at;
 					if (comment_at != NULL)
 						nchars = comment_at - yytext;

 					/*
 					 * SQL compatibility: a multi-char operator may end in
 					 * '+' or '-' only when it also contains a character
 					 * that does not occur in SQL operators.  Thus '=-' is
 					 * lexed as two operators, while a name like '?-', which
 					 * cannot be a sequence of SQL operators, is kept whole.
 					 */
 					if (nchars > 1 &&
 						(yytext[nchars - 1] == '+' ||
 						 yytext[nchars - 1] == '-'))
 					{
 						bool		has_nonsql_char = false;
 						int			pos;

 						/* look at everything before the final character */
 						for (pos = 0; pos < nchars - 1; pos++)
 						{
 							if (strchr("~!@#^&|`?%", yytext[pos]) != NULL)
 							{
 								has_nonsql_char = true;
 								break;
 							}
 						}

 						if (!has_nonsql_char)
 						{
 							/* no qualifying character: drop all trailing [+-] */
 							do
 							{
 								nchars--;
 							} while (nchars > 1 &&
 									 (yytext[nchars - 1] == '+' ||
 									  yytext[nchars - 1] == '-'));
 						}
 					}

 					SET_YYLLOC();

 					if (nchars < yyleng)
 					{
 						/* Give back everything past the operator proper */
 						yyless(nchars);

 						/*
 						 * A lone character that belongs to "self" has to be
 						 * returned as a character token, exactly as the
 						 * "self" rule would have done.
 						 */
 						if (nchars == 1 &&
 							strchr(",()[].;:+-*/%^<>=", yytext[0]))
 							return yytext[0];

 						/*
 						 * Likewise, a two-character remainder equal to
 						 * ">=", "<=", "=>", "<>" or "!=" must come back as
 						 * its dedicated token instead of the generic Op.
 						 */
 						if (nchars == 2)
 						{
 							if (yytext[1] == '>')
 							{
 								if (yytext[0] == '=')
 									return EQUALS_GREATER;
 								if (yytext[0] == '<')
 									return NOT_EQUALS;
 							}
 							else if (yytext[1] == '=')
 							{
 								if (yytext[0] == '>')
 									return GREATER_EQUALS;
 								if (yytext[0] == '<')
 									return LESS_EQUALS;
 								if (yytext[0] == '!')
 									return NOT_EQUALS;
 							}
 						}
 					}

 					/*
 					 * An overlong operator is an error, not a notice plus
 					 * truncation as for identifiers: the odds are that it
 					 * is a syntactic mistake anyway.
 					 */
 					if (nchars >= NAMEDATALEN)
 						yyerror("operator too long");

 					yylval->str = pstrdup(yytext);
 					return Op;
 				}

 {param}			{
 					/*
 					 * Positional parameter ($n).  The digits are converted
 					 * with overflow detection.  The previous atol() call
 					 * has undefined behavior for out-of-range input, and
 					 * its long result was stored into yylval->ival without
 					 * any range check, so an oversized number could end up
 					 * referring to some other parameter instead of being
 					 * rejected.
 					 */
 					ErrorSaveContext escontext = {T_ErrorSaveContext};
 					int32		val;

 					SET_YYLLOC();
 					/* yytext + 1 skips the leading '$' */
 					val = pg_strtoint32_safe(yytext + 1, (Node *) &escontext);
 					if (escontext.error_occurred)
 						yyerror("parameter number too large");
 					yylval->ival = val;
 					return PARAM;
 				}
 {param_junk}	{
 					SET_YYLLOC();
 					yyerror("trailing junk after parameter");
 				}

 {decinteger}	{
					/*
					 * Integer literals in all four notations are handed to
					 * process_integer_literal(), which returns either
					 * ICONST or FCONST and fills in yylval accordingly.
					 */
					SET_YYLLOC();
					return process_integer_literal(yytext, yylval, 10);
				}
 {hexinteger}	{
					SET_YYLLOC();
					return process_integer_literal(yytext, yylval, 16);
				}
 {octinteger}	{
					SET_YYLLOC();
					return process_integer_literal(yytext, yylval, 8);
				}
 {bininteger}	{
					SET_YYLLOC();
					return process_integer_literal(yytext, yylval, 2);
				}
 {hexfail}		{
					/* The three "fail" rules for non-decimal integers always raise an error */
					SET_YYLLOC();
					yyerror("invalid hexadecimal integer");
				}
 {octfail}		{
					SET_YYLLOC();
					yyerror("invalid octal integer");
				}
 {binfail}		{
					SET_YYLLOC();
					yyerror("invalid binary integer");
				}
 {numeric}		{
					/* Hand the literal's text to the parser unchanged, as a float constant */
					SET_YYLLOC();
					yylval->str = pstrdup(yytext);
					return FCONST;
				}
 {numericfail}	{
					/*
					 * throw back the .., and treat as integer: yyless()
					 * keeps only the first yyleng - 2 characters of the
					 * match and returns the rest to the input stream.
					 */
					yyless(yyleng - 2);
					SET_YYLLOC();
					return process_integer_literal(yytext, yylval, 10);
				}
 {real}			{
					/* Same treatment as {numeric}: a palloc'd copy of the text, tagged FCONST */
					SET_YYLLOC();
					yylval->str = pstrdup(yytext);
					return FCONST;
				}
 {realfail}		{
					/*
					 * This rule and the three *_junk rules below all report
					 * the same error message.
					 */
					SET_YYLLOC();
					yyerror("trailing junk after numeric literal");
				}
 {integer_junk}	{
					SET_YYLLOC();
					yyerror("trailing junk after numeric literal");
				}
 {numeric_junk}	{
					SET_YYLLOC();
					yyerror("trailing junk after numeric literal");
				}
 {real_junk}		{
					SET_YYLLOC();
					yyerror("trailing junk after numeric literal");
				}


 {identifier}	{
					int			kw;

					SET_YYLLOC();

					/* Look the word up in the scanner's keyword list first */
					kw = ScanKeywordLookup(yytext, yyextra->keywordlist);
					if (kw < 0)
					{
						/*
						 * Not a keyword, so it is an ordinary identifier:
						 * fold it to lower case, truncating if necessary.
						 */
						yylval->str = downcase_truncate_identifier(yytext,
																   yyleng,
																   true);
						return IDENT;
					}

					/* Keyword: return its name and its own token code */
					yylval->keyword = GetScanKeyword(kw, yyextra->keywordlist);
					return yyextra->keyword_tokens[kw];
				}

 {other}			{
					/* Return the first character of the match itself as the token code */
					SET_YYLLOC();
					return yytext[0];
				}

 <<EOF>>			{
					/* Record the end-of-input location, then end the scan */
					SET_YYLLOC();
					yyterminate();
				}

 %%

 /* LCOV_EXCL_STOP */

 /*
  * Arrange access to yyextra for subroutines of the main yylex() function.
  * We expect each subroutine to have a yyscanner parameter.  Rather than
  * use the yyget_xxx functions, which might or might not get inlined by the
  * compiler, we cheat just a bit and cast yyscanner to the right type.
  *
  * Because the replacement macros expand to an expression that mentions
  * "yyscanner" literally, they only compile inside functions that have a
  * variable of exactly that name in scope.
  */
 #undef yyextra
 #define yyextra  (((struct yyguts_t *) yyscanner)->yyextra_r)

 /*
  * Likewise for a couple of other things we need: yylloc and yyleng are
  * redirected to the yylloc_r and yyleng_r fields of the same struct.
  */
 #undef yylloc
 #define yylloc	(((struct yyguts_t *) yyscanner)->yylloc_r)
 #undef yyleng
 #define yyleng	(((struct yyguts_t *) yyscanner)->yyleng_r)


 /*
  * scanner_errposition
  *		Attach a cursor position to a lexer or grammar error, if possible.
  *
  * "location" is a byte offset into the scan buffer; a negative value means
  * the location is unknown, in which case nothing is reported.
  *
  * Intended to be called inside an ereport() call, or from an error context
  * callback such as the one installed by
  * setup_scanner_errposition_callback().  The return value is a dummy
  * (always 0, in fact).
  *
  * This is usable only for messages emitted during raw parsing (essentially
  * scan.l, parser.c, and gram.y), because the yyscanner struct must still be
  * available.
  */
 int
 scanner_errposition(int location, core_yyscan_t yyscanner)
 {
 	/* Unknown location: nothing to report */
 	if (location < 0)
 		return 0;

 	/*
 	 * Count the characters contained in the first "location" bytes of the
 	 * scan buffer, add one, and hand the result to the ereport mechanism.
 	 */
 	return errposition(pg_mbstrlen_with_len(yyextra->scanbuf, location) + 1);
 }

 /*
  * Error context callback that adds the scanner's error location to a report.
  *
  * While installed, this runs for *every* error, whatever its origin.  A
  * query cancel is deliberately left untagged, since a cursor position would
  * be irrelevant there --- are there any other important cases?
  *
  * "arg" is the ScannerCallbackState registered by
  * setup_scanner_errposition_callback().
  */
 static void
 scb_error_callback(void *arg)
 {
 	ScannerCallbackState *state = (ScannerCallbackState *) arg;

 	/* Don't decorate query-cancel errors */
 	if (geterrcode() == ERRCODE_QUERY_CANCELED)
 		return;

 	(void) scanner_errposition(state->location, state->yyscanner);
 }

 /*
  * setup_scanner_errposition_callback
  *		Make errors raised outside the scanner carry an error position
  *
  * The scanner sometimes calls functions that are not part of the scanner
  * subsystem and cannot reasonably be handed the yyscanner pointer, yet any
  * error they throw should still be tagged with a location.  This pushes an
  * error context stack entry that takes care of that.  Usage pattern:
  *
  *		declare a local variable "ScannerCallbackState scbstate"
  *		...
  *		setup_scanner_errposition_callback(&scbstate, yyscanner, location);
  *		call function that might throw error;
  *		cancel_scanner_errposition_callback(&scbstate);
  */
 void
 setup_scanner_errposition_callback(ScannerCallbackState *scbstate,
 								   core_yyscan_t yyscanner,
 								   int location)
 {
 	/* Remember what scb_error_callback() will need later */
 	scbstate->location = location;
 	scbstate->yyscanner = yyscanner;

 	/* Fill in the callback entry, chaining to the current stack top ... */
 	scbstate->errcallback.previous = error_context_stack;
 	scbstate->errcallback.arg = (void *) scbstate;
 	scbstate->errcallback.callback = scb_error_callback;

 	/* ... and make it the new top of the error context stack */
 	error_context_stack = &scbstate->errcallback;
 }

 /*
  * Cancel a previously-set-up errposition callback.
  *
  * "scbstate" must be the state that was passed to
  * setup_scanner_errposition_callback().  The stack top is reset to the
  * entry that was saved at setup time, so setup/cancel pairs are expected to
  * nest properly.
  */
 void
 cancel_scanner_errposition_callback(ScannerCallbackState *scbstate)
 {
 	/* Pop the error context stack */
 	error_context_stack = scbstate->errcallback.previous;
 }

 /*
  * scanner_yyerror
  *		Report a lexer or grammar error.
  *
  * The cursor position attached to the message is whatever YYLLOC was last
  * set to: the start of the current token when called within yylex(), or the
  * most recently lexed token when called from the grammar.  That suits syntax
  * errors coming from the Bison parser, because Bison parsers report an error
  * as soon as the first unparsable token is reached.  Be wary of using
  * yyerror for other purposes, as the cursor position might be misleading!
  *
  * "message" is passed through _() before being inserted in the report.
  */
 void
 scanner_yyerror(const char *message, core_yyscan_t yyscanner)
 {
 	const char *errtext = yyextra->scanbuf + *yylloc;

 	if (*errtext != YY_END_OF_BUFFER_CHAR)
 	{
 		/* Input remains at the error location: show the text found there */
 		ereport(ERROR,
 				(errcode(ERRCODE_SYNTAX_ERROR),
 		/* translator: first %s is typically the translation of "syntax error" */
 				 errmsg("%s at or near \"%s\"", _(message), errtext),
 				 lexer_errposition()));
 	}
 	else
 	{
 		/* The error location is the end-of-buffer marker */
 		ereport(ERROR,
 				(errcode(ERRCODE_SYNTAX_ERROR),
 		/* translator: %s is typically the translation of "syntax error" */
 				 errmsg("%s at end of input", _(message)),
 				 lexer_errposition()));
 	}
 }


 /*
  * scanner_init
  *		Set up a scanner for the string "str"; call before any parsing.
  *
  * "yyext" is caller-provided scanner state that is filled in here;
  * "keywordlist" and "keyword_tokens" are the keyword table and the matching
  * token codes that the {identifier} rule consults.  The input string is
  * copied, so the scanner works on its own buffer.
  *
  * Returns the new scanner handle.
  */
 core_yyscan_t
 scanner_init(const char *str,
 			 core_yy_extra_type *yyext,
 			 const ScanKeywordList *keywordlist,
 			 const uint16 *keyword_tokens)
 {
 	Size		slen = strlen(str);
 	yyscan_t	scanner;
 	char	   *buf;

 	if (yylex_init(&scanner) != 0)
 		elog(ERROR, "yylex_init() failed: %m");

 	core_yyset_extra(yyext, scanner);

 	/* keyword lookup tables */
 	yyext->keyword_tokens = keyword_tokens;
 	yyext->keywordlist = keywordlist;

 	/* take a snapshot of the string-syntax settings */
 	yyext->standard_conforming_strings = standard_conforming_strings;
 	yyext->escape_string_warning = escape_string_warning;
 	yyext->backslash_quote = backslash_quote;

 	/*
 	 * Copy the input into a buffer that ends with the two terminator bytes
 	 * flex needs, and point flex at it.
 	 */
 	buf = (char *) palloc(slen + 2);
 	memcpy(buf, str, slen);
 	buf[slen] = YY_END_OF_BUFFER_CHAR;
 	buf[slen + 1] = YY_END_OF_BUFFER_CHAR;
 	yyext->scanbuf = buf;
 	yyext->scanbuflen = slen;
 	yy_scan_buffer(buf, slen + 2, scanner);

 	/* the literal buffer starts at a reasonable size and grows on demand */
 	yyext->literalalloc = 1024;
 	yyext->literalbuf = (char *) palloc(yyext->literalalloc);
 	yyext->literallen = 0;

 	return scanner;
 }


 /*
  * scanner_finish
  *		Clean up after scanner_init() once parsing is done.
  */
 void
 scanner_finish(core_yyscan_t yyscanner)
 {
 	/* Buffers below this size are simply left for context cleanup */
 	const int	free_threshold = 8192;

 	/*
 	 * yylex_destroy() is deliberately not called: all it would do is pfree
 	 * a small amount of control storage, and it is cheaper to leak that
 	 * until the parsing context is destroyed.  The space involved is
 	 * usually negligible compared to the output parse tree anyway.
 	 *
 	 * The scan buffer and literal buffer are released explicitly, but only
 	 * when they are of nontrivial size.  The 8K cutoff is arbitrary.
 	 */
 	if (yyextra->scanbuflen >= free_threshold)
 		pfree(yyextra->scanbuf);
 	if (yyextra->literalalloc >= free_threshold)
 		pfree(yyextra->literalbuf);
 }


 /*
  * Append "yleng" bytes starting at "ytext" to the literal buffer, enlarging
  * the buffer first if the result would not leave at least one spare byte.
  */
 static void
 addlit(char *ytext, int yleng, core_yyscan_t yyscanner)
 {
 	int			newlen = yyextra->literallen + yleng;

 	if (newlen >= yyextra->literalalloc)
 	{
 		/* grow to the next power of 2 that holds the data plus one byte */
 		yyextra->literalalloc = pg_nextpower2_32(newlen + 1);
 		yyextra->literalbuf = (char *) repalloc(yyextra->literalbuf,
 												yyextra->literalalloc);
 	}

 	/* copy the new bytes in after the existing contents */
 	memcpy(yyextra->literalbuf + yyextra->literallen, ytext, yleng);
 	yyextra->literallen = newlen;
 }


 /*
  * Append the single byte "ychar" to the literal buffer, doubling the
  * buffer's size first if no spare byte would remain afterwards.
  */
 static void
 addlitchar(unsigned char ychar, core_yyscan_t yyscanner)
 {
 	if (yyextra->literallen + 1 >= yyextra->literalalloc)
 	{
 		/* out of room: double the allocation */
 		yyextra->literalalloc = yyextra->literalalloc * 2;
 		yyextra->literalbuf = (char *) repalloc(yyextra->literalbuf,
 												yyextra->literalalloc);
 	}

 	/* store the byte and advance the length */
 	yyextra->literalbuf[yyextra->literallen++] = ychar;
 }


 /*
  * Return a freshly palloc'd, null-terminated copy of the current contents
  * of the literal buffer.
  */
 static char *
 litbufdup(core_yyscan_t yyscanner)
 {
 	int			len = yyextra->literallen;
 	char	   *copy = palloc(len + 1);

 	memcpy(copy, yyextra->literalbuf, len);
 	copy[len] = '\0';

 	return copy;
 }

 /*
  * Process {decinteger}, {hexinteger}, etc.  Note this will also do the right
  * thing with {numeric}, ie digits and a decimal point.
  *
  * If "token" converts to an int32, the value is stored in lval->ival and
  * ICONST is returned.  Otherwise a copy of the token text is stored in
  * lval->str and FCONST is returned.
  *
  * The "base" argument is not consulted here; the conversion routine is
  * handed the complete token text.
  */
 static int
 process_integer_literal(const char *token, YYSTYPE *lval, int base)
 {
 	ErrorSaveContext escontext = {T_ErrorSaveContext};
 	int32		val = pg_strtoint32_safe(token, (Node *) &escontext);

 	if (!escontext.error_occurred)
 	{
 		lval->ival = val;
 		return ICONST;
 	}

 	/* integer too large (or contains decimal pt), treat it as a float */
 	lval->str = pstrdup(token);
 	return FCONST;
 }

 /*
  * Append the server-encoding equivalent of Unicode code point "c" to the
  * literal buffer.  An invalid code point is reported as an error.
  */
 static void
 addunicode(pg_wchar c, core_yyscan_t yyscanner)
 {
 	char		converted[MAX_UNICODE_EQUIVALENT_STRING + 1];
 	ScannerCallbackState cbstate;

 	if (!is_valid_unicode_codepoint(c))
 		yyerror("invalid Unicode escape value");

 	/*
 	 * pg_unicode_to_server() is expected to complain about any code point
 	 * it cannot convert, so there is no need to set saw_non_ascii here.  The
 	 * callback makes such a complaint carry the current token's location.
 	 */
 	setup_scanner_errposition_callback(&cbstate, yyscanner, *(yylloc));
 	pg_unicode_to_server(c, (unsigned char *) converted);
 	cancel_scanner_errposition_callback(&cbstate);

 	addlit(converted, strlen(converted), yyscanner);
 }

 /*
  * Translate the character following a backslash into the byte it stands
  * for.  The letters b, f, n, r and t map to the corresponding control
  * characters; any other character is returned unchanged.
  */
 static unsigned char
 unescape_single_char(unsigned char c, core_yyscan_t yyscanner)
 {
 	switch (c)
 	{
 		case 'b':
 			return '\b';
 		case 'f':
 			return '\f';
 		case 'n':
 			return '\n';
 		case 'r':
 			return '\r';
 		case 't':
 			return '\t';
 		default:
 			break;
 	}

 	/* check for backslash followed by non-7-bit-ASCII */
 	if (c == '\0' || IS_HIGHBIT_SET(c))
 		yyextra->saw_non_ascii = true;

 	return c;
 }

 /*
  * Possibly warn about a backslash escape inside a string literal.
  *
  * "ychar" is the escaped character.  \' and \\ each get a dedicated
  * message; every other escape is handled by check_escape_warning().  At most
  * one warning is issued per string, because warn_on_first_escape is cleared
  * afterwards.
  */
 static void
 check_string_escape_warning(unsigned char ychar, core_yyscan_t yyscanner)
 {
 	/* Anything other than \' or \\ takes the generic path */
 	if (ychar != '\'' && ychar != '\\')
 	{
 		check_escape_warning(yyscanner);
 		return;
 	}

 	if (yyextra->warn_on_first_escape && yyextra->escape_string_warning)
 	{
 		if (ychar == '\'')
 		{
 			ereport(WARNING,
 					(errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
 					 errmsg("nonstandard use of \\' in a string literal"),
 					 errhint("Use '' to write quotes in strings, or use the escape string syntax (E'...')."),
 					 lexer_errposition()));
 		}
 		else
 		{
 			ereport(WARNING,
 					(errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
 					 errmsg("nonstandard use of \\\\ in a string literal"),
 					 errhint("Use the escape string syntax for backslashes, e.g., E'\\\\'."),
 					 lexer_errposition()));
 		}
 	}

 	yyextra->warn_on_first_escape = false;	/* warn only once per string */
 }

 /*
  * Issue the generic "nonstandard use of escape" warning, at most once per
  * string literal and only when escape_string_warning is enabled.
  */
 static void
 check_escape_warning(core_yyscan_t yyscanner)
 {
 	/* Warning already issued (or suppressed) for this string: nothing to do */
 	if (!yyextra->warn_on_first_escape)
 		return;

 	if (yyextra->escape_string_warning)
 	{
 		ereport(WARNING,
 				(errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
 				 errmsg("nonstandard use of escape in a string literal"),
 				 errhint("Use the escape string syntax for escapes, e.g., E'\\r\\n'."),
 				 lexer_errposition()));
 	}

 	yyextra->warn_on_first_escape = false;		/* warn only once per string */
 }

 /*
  * Interface functions to make flex use palloc() instead of malloc().
  * It'd be better to make these static, but flex insists otherwise.
  *
  * Each function takes a yyscanner argument as part of the signature, but
  * none of them uses it.
  */

 /* Allocate "bytes" bytes for flex by way of palloc(). */
 void *
 core_yyalloc(yy_size_t bytes, core_yyscan_t yyscanner)
 {
 	return palloc(bytes);
 }

 /*
  * Resize a flex allocation to "bytes" bytes.  A null "ptr" is treated as a
  * request for a fresh allocation rather than being passed to repalloc().
  */
 void *
 core_yyrealloc(void *ptr, yy_size_t bytes, core_yyscan_t yyscanner)
 {
 	if (!ptr)
 		return palloc(bytes);

 	return repalloc(ptr, bytes);
 }

 /*
  * Release a flex allocation.  A null "ptr" is silently ignored; pfree() is
  * called only for a non-null pointer.
  */
 void
 core_yyfree(void *ptr, core_yyscan_t yyscanner)
 {
 	if (!ptr)
 		return;

 	pfree(ptr);
 }