| /*------------------------------------------------------------------------- |
| * |
| * parser.c |
| * Main entry point/driver for PostgreSQL grammar |
| * |
| * Note that the grammar is not allowed to perform any table access |
| * (since we need to be able to do basic parsing even while inside an |
| * aborted transaction). Therefore, the data structures returned by |
| * the grammar are "raw" parsetrees that still need to be analyzed by |
| * analyze.c and related files. |
| * |
| * |
| * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * IDENTIFICATION |
| * src/backend/parser/parser.c |
| * |
| *------------------------------------------------------------------------- |
| */ |
| |
| #include "postgres.h" |
| |
| #include "mb/pg_wchar.h" |
| #include "parser/gramparse.h" |
| #include "parser/parser.h" |
| #include "parser/scansup.h" |
| |
| static bool check_uescapechar(unsigned char escape); |
| static char *str_udeescape(const char *str, char escape, |
| int position, core_yyscan_t yyscanner); |
| |
| #include "cdb/cdbvars.h" |
| |
| /* |
| * raw_parser |
| * Given a query in string form, do lexical and grammatical analysis. |
| * |
| * Returns a list of raw (un-analyzed) parse trees. The contents of the |
| * list have the form required by the specified RawParseMode. |
| */ |
| List * |
| raw_parser(const char *str, RawParseMode mode) |
| { |
| core_yyscan_t yyscanner; |
| base_yy_extra_type yyextra; |
| int yyresult; |
| |
| /* |
| * In GPDB, temporarily disable escape_string_warning, if we're in a QE |
| * node. When we're parsing a PL/pgSQL function, e.g. in a CREATE FUNCTION |
| * command, you should've gotten the same warning from the QD node already. |
| * We could probably disable the warning in QE nodes altogether, not just |
| * in PL/pgSQL, but it can be useful for catching escaping bugs, when |
| * internal queries are dispatched from QD to QEs. |
| */ |
| bool save_escape_string_warning = escape_string_warning; |
| PG_TRY(); |
| { |
| if (Gp_role == GP_ROLE_EXECUTE) |
| escape_string_warning = false; |
| |
| /* initialize the flex scanner */ |
| yyscanner = scanner_init(str, &yyextra.core_yy_extra, |
| &ScanKeywords, ScanKeywordTokens); |
| |
| if (Gp_role == GP_ROLE_EXECUTE) |
| escape_string_warning = save_escape_string_warning; |
| } |
| PG_CATCH(); |
| { |
| if (Gp_role == GP_ROLE_EXECUTE) |
| escape_string_warning = save_escape_string_warning; |
| PG_RE_THROW(); |
| } |
| PG_END_TRY(); |
| |
| yyextra.tail_partition_magic = false; |
| /* base_yylex() only needs us to initialize the lookahead token, if any */ |
| if (mode == RAW_PARSE_DEFAULT) |
| yyextra.have_lookahead = false; |
| else |
| { |
| /* this array is indexed by RawParseMode enum */ |
| static const int mode_token[] = { |
| 0, /* RAW_PARSE_DEFAULT */ |
| MODE_TYPE_NAME, /* RAW_PARSE_TYPE_NAME */ |
| MODE_PLPGSQL_EXPR, /* RAW_PARSE_PLPGSQL_EXPR */ |
| MODE_PLPGSQL_ASSIGN1, /* RAW_PARSE_PLPGSQL_ASSIGN1 */ |
| MODE_PLPGSQL_ASSIGN2, /* RAW_PARSE_PLPGSQL_ASSIGN2 */ |
| MODE_PLPGSQL_ASSIGN3 /* RAW_PARSE_PLPGSQL_ASSIGN3 */ |
| }; |
| |
| yyextra.have_lookahead = true; |
| yyextra.lookahead_token = mode_token[mode]; |
| yyextra.lookahead_yylloc = 0; |
| yyextra.lookahead_end = NULL; |
| } |
| /* initialize the bison parser */ |
| parser_init(&yyextra); |
| |
| /* Parse! */ |
| yyresult = base_yyparse(yyscanner); |
| |
| /* Clean up (release memory) */ |
| scanner_finish(yyscanner); |
| |
| if (yyresult) /* error */ |
| return NIL; |
| |
| return yyextra.parsetree; |
| } |
| |
| |
| /* |
| * Intermediate filter between parser and core lexer (core_yylex in scan.l). |
| * |
| * This filter is needed because in some cases the standard SQL grammar |
| * requires more than one token lookahead. We reduce these cases to one-token |
| * lookahead by replacing tokens here, in order to keep the grammar LALR(1). |
| * |
| * Using a filter is simpler than trying to recognize multiword tokens |
| * directly in scan.l, because we'd have to allow for comments between the |
| * words. Furthermore it's not clear how to do that without re-introducing |
| * scanner backtrack, which would cost more performance than this filter |
| * layer does. |
| * |
| * We also use this filter to convert UIDENT and USCONST sequences into |
| * plain IDENT and SCONST tokens. While that could be handled by additional |
| * productions in the main grammar, it's more efficient to do it like this. |
| * |
| * The filter also provides a convenient place to translate between |
| * the core_YYSTYPE and YYSTYPE representations (which are really the |
| * same thing anyway, but notationally they're different). |
| */ |
| int |
| base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner) |
| { |
| base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner); |
| int cur_token; |
| int next_token; |
| int cur_token_length; |
| YYLTYPE cur_yylloc; |
| |
| /* Get next token --- we might already have it */ |
| if (yyextra->have_lookahead) |
| { |
| cur_token = yyextra->lookahead_token; |
| lvalp->core_yystype = yyextra->lookahead_yylval; |
| *llocp = yyextra->lookahead_yylloc; |
| if (yyextra->lookahead_end) |
| *(yyextra->lookahead_end) = yyextra->lookahead_hold_char; |
| yyextra->have_lookahead = false; |
| } |
| else |
| cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner); |
| |
| /* |
| * Check for special handling of PARTITION keyword. (see |
| * OptFirstPartitionSpec rule in the grammar) |
| */ |
| if (yyextra->tail_partition_magic) |
| { |
| if (cur_token == PARTITION) |
| { |
| yyextra->tail_partition_magic = false; |
| return PARTITION_TAIL; |
| } |
| } |
| |
| /* |
| * If this token isn't one that requires lookahead, just return it. If it |
| * does, determine the token length. (We could get that via strlen(), but |
| * since we have such a small set of possibilities, hardwiring seems |
| * feasible and more efficient --- at least for the fixed-length cases.) |
| */ |
| switch (cur_token) |
| { |
| case NOT: |
| cur_token_length = 3; |
| break; |
| case NULLS_P: |
| cur_token_length = 5; |
| break; |
| case WITH: |
| cur_token_length = 4; |
| break; |
| case UIDENT: |
| case USCONST: |
| cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp); |
| break; |
| default: |
| return cur_token; |
| } |
| |
| /* |
| * Identify end+1 of current token. core_yylex() has temporarily stored a |
| * '\0' here, and will undo that when we call it again. We need to redo |
| * it to fully revert the lookahead call for error reporting purposes. |
| */ |
| yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf + |
| *llocp + cur_token_length; |
| Assert(*(yyextra->lookahead_end) == '\0'); |
| |
| /* |
| * Save and restore *llocp around the call. It might look like we could |
| * avoid this by just passing &lookahead_yylloc to core_yylex(), but that |
| * does not work because flex actually holds onto the last-passed pointer |
| * internally, and will use that for error reporting. We need any error |
| * reports to point to the current token, not the next one. |
| */ |
| cur_yylloc = *llocp; |
| |
| /* Get next token, saving outputs into lookahead variables */ |
| next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner); |
| yyextra->lookahead_token = next_token; |
| yyextra->lookahead_yylloc = *llocp; |
| |
| *llocp = cur_yylloc; |
| |
| /* Now revert the un-truncation of the current token */ |
| yyextra->lookahead_hold_char = *(yyextra->lookahead_end); |
| *(yyextra->lookahead_end) = '\0'; |
| |
| yyextra->have_lookahead = true; |
| |
| /* Replace cur_token if needed, based on lookahead */ |
| switch (cur_token) |
| { |
| case NOT: |
| /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */ |
| switch (next_token) |
| { |
| case BETWEEN: |
| case IN_P: |
| case LIKE: |
| case ILIKE: |
| case SIMILAR: |
| cur_token = NOT_LA; |
| break; |
| } |
| break; |
| |
| case NULLS_P: |
| /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */ |
| switch (next_token) |
| { |
| case FIRST_P: |
| case LAST_P: |
| cur_token = NULLS_LA; |
| break; |
| } |
| break; |
| |
| case WITH: |
| /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */ |
| switch (next_token) |
| { |
| case TIME: |
| case ORDINALITY: |
| cur_token = WITH_LA; |
| break; |
| } |
| break; |
| |
| case UIDENT: |
| case USCONST: |
| /* Look ahead for UESCAPE */ |
| if (next_token == UESCAPE) |
| { |
| /* Yup, so get third token, which had better be SCONST */ |
| const char *escstr; |
| |
| /* Again save and restore *llocp */ |
| cur_yylloc = *llocp; |
| |
| /* Un-truncate current token so errors point to third token */ |
| *(yyextra->lookahead_end) = yyextra->lookahead_hold_char; |
| |
| /* Get third token */ |
| next_token = core_yylex(&(yyextra->lookahead_yylval), |
| llocp, yyscanner); |
| |
| /* If we throw error here, it will point to third token */ |
| if (next_token != SCONST) |
| scanner_yyerror("UESCAPE must be followed by a simple string literal", |
| yyscanner); |
| |
| escstr = yyextra->lookahead_yylval.str; |
| if (strlen(escstr) != 1 || !check_uescapechar(escstr[0])) |
| scanner_yyerror("invalid Unicode escape character", |
| yyscanner); |
| |
| /* Now restore *llocp; errors will point to first token */ |
| *llocp = cur_yylloc; |
| |
| /* Apply Unicode conversion */ |
| lvalp->core_yystype.str = |
| str_udeescape(lvalp->core_yystype.str, |
| escstr[0], |
| *llocp, |
| yyscanner); |
| |
| /* |
| * We don't need to revert the un-truncation of UESCAPE. What |
| * we do want to do is clear have_lookahead, thereby consuming |
| * all three tokens. |
| */ |
| yyextra->have_lookahead = false; |
| } |
| else |
| { |
| /* No UESCAPE, so convert using default escape character */ |
| lvalp->core_yystype.str = |
| str_udeescape(lvalp->core_yystype.str, |
| '\\', |
| *llocp, |
| yyscanner); |
| } |
| |
| if (cur_token == UIDENT) |
| { |
| /* It's an identifier, so truncate as appropriate */ |
| truncate_identifier(lvalp->core_yystype.str, |
| strlen(lvalp->core_yystype.str), |
| true); |
| cur_token = IDENT; |
| } |
| else if (cur_token == USCONST) |
| { |
| cur_token = SCONST; |
| } |
| break; |
| } |
| |
| return cur_token; |
| } |
| |
| /* convert hex digit (caller should have verified that) to value */ |
| static unsigned int |
| hexval(unsigned char c) |
| { |
| if (c >= '0' && c <= '9') |
| return c - '0'; |
| if (c >= 'a' && c <= 'f') |
| return c - 'a' + 0xA; |
| if (c >= 'A' && c <= 'F') |
| return c - 'A' + 0xA; |
| elog(ERROR, "invalid hexadecimal digit"); |
| return 0; /* not reached */ |
| } |
| |
| /* is Unicode code point acceptable? */ |
| static void |
| check_unicode_value(pg_wchar c) |
| { |
| if (!is_valid_unicode_codepoint(c)) |
| ereport(ERROR, |
| (errcode(ERRCODE_SYNTAX_ERROR), |
| errmsg("invalid Unicode escape value"))); |
| } |
| |
/*
 * Decide whether "escape" may serve as the Unicode escape character in
 * UESCAPE syntax.
 *
 * Hexadecimal digits, '+', single and double quotes, and anything
 * scanner_isspace() treats as whitespace are rejected; every other
 * character is accepted.
 */
static bool
check_uescapechar(unsigned char escape)
{
	return !isxdigit(escape)
		&& escape != '+'
		&& escape != '\''
		&& escape != '"'
		&& !scanner_isspace(escape);
}
| |
| /* |
| * Process Unicode escapes in "str", producing a palloc'd plain string |
| * |
| * escape: the escape character to use |
| * position: start position of U&'' or U&"" string token |
| * yyscanner: context information needed for error reports |
| */ |
| static char * |
| str_udeescape(const char *str, char escape, |
| int position, core_yyscan_t yyscanner) |
| { |
| const char *in; |
| char *new, |
| *out; |
| size_t new_len; |
| pg_wchar pair_first = 0; |
| ScannerCallbackState scbstate; |
| |
| /* |
| * Guesstimate that result will be no longer than input, but allow enough |
| * padding for Unicode conversion. |
| */ |
| new_len = strlen(str) + MAX_UNICODE_EQUIVALENT_STRING + 1; |
| new = palloc(new_len); |
| |
| in = str; |
| out = new; |
| while (*in) |
| { |
| /* Enlarge string if needed */ |
| size_t out_dist = out - new; |
| |
| if (out_dist > new_len - (MAX_UNICODE_EQUIVALENT_STRING + 1)) |
| { |
| new_len *= 2; |
| new = repalloc(new, new_len); |
| out = new + out_dist; |
| } |
| |
| if (in[0] == escape) |
| { |
| /* |
| * Any errors reported while processing this escape sequence will |
| * have an error cursor pointing at the escape. |
| */ |
| setup_scanner_errposition_callback(&scbstate, yyscanner, |
| in - str + position + 3); /* 3 for U&" */ |
| if (in[1] == escape) |
| { |
| if (pair_first) |
| goto invalid_pair; |
| *out++ = escape; |
| in += 2; |
| } |
| else if (isxdigit((unsigned char) in[1]) && |
| isxdigit((unsigned char) in[2]) && |
| isxdigit((unsigned char) in[3]) && |
| isxdigit((unsigned char) in[4])) |
| { |
| pg_wchar unicode; |
| |
| unicode = (hexval(in[1]) << 12) + |
| (hexval(in[2]) << 8) + |
| (hexval(in[3]) << 4) + |
| hexval(in[4]); |
| check_unicode_value(unicode); |
| if (pair_first) |
| { |
| if (is_utf16_surrogate_second(unicode)) |
| { |
| unicode = surrogate_pair_to_codepoint(pair_first, unicode); |
| pair_first = 0; |
| } |
| else |
| goto invalid_pair; |
| } |
| else if (is_utf16_surrogate_second(unicode)) |
| goto invalid_pair; |
| |
| if (is_utf16_surrogate_first(unicode)) |
| pair_first = unicode; |
| else |
| { |
| pg_unicode_to_server(unicode, (unsigned char *) out); |
| out += strlen(out); |
| } |
| in += 5; |
| } |
| else if (in[1] == '+' && |
| isxdigit((unsigned char) in[2]) && |
| isxdigit((unsigned char) in[3]) && |
| isxdigit((unsigned char) in[4]) && |
| isxdigit((unsigned char) in[5]) && |
| isxdigit((unsigned char) in[6]) && |
| isxdigit((unsigned char) in[7])) |
| { |
| pg_wchar unicode; |
| |
| unicode = (hexval(in[2]) << 20) + |
| (hexval(in[3]) << 16) + |
| (hexval(in[4]) << 12) + |
| (hexval(in[5]) << 8) + |
| (hexval(in[6]) << 4) + |
| hexval(in[7]); |
| check_unicode_value(unicode); |
| if (pair_first) |
| { |
| if (is_utf16_surrogate_second(unicode)) |
| { |
| unicode = surrogate_pair_to_codepoint(pair_first, unicode); |
| pair_first = 0; |
| } |
| else |
| goto invalid_pair; |
| } |
| else if (is_utf16_surrogate_second(unicode)) |
| goto invalid_pair; |
| |
| if (is_utf16_surrogate_first(unicode)) |
| pair_first = unicode; |
| else |
| { |
| pg_unicode_to_server(unicode, (unsigned char *) out); |
| out += strlen(out); |
| } |
| in += 8; |
| } |
| else |
| ereport(ERROR, |
| (errcode(ERRCODE_SYNTAX_ERROR), |
| errmsg("invalid Unicode escape"), |
| errhint("Unicode escapes must be \\XXXX or \\+XXXXXX."))); |
| |
| cancel_scanner_errposition_callback(&scbstate); |
| } |
| else |
| { |
| if (pair_first) |
| goto invalid_pair; |
| |
| *out++ = *in++; |
| } |
| } |
| |
| /* unfinished surrogate pair? */ |
| if (pair_first) |
| goto invalid_pair; |
| |
| *out = '\0'; |
| return new; |
| |
| /* |
| * We might get here with the error callback active, or not. Call |
| * scanner_errposition to make sure an error cursor appears; if the |
| * callback is active, this is duplicative but harmless. |
| */ |
| invalid_pair: |
| ereport(ERROR, |
| (errcode(ERRCODE_SYNTAX_ERROR), |
| errmsg("invalid Unicode surrogate pair"), |
| scanner_errposition(in - str + position + 3, /* 3 for U&" */ |
| yyscanner))); |
| return NULL; /* keep compiler quiet */ |
| } |