| %top{ |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| /* |
| * The rules in this scanner implementation are based on the following. |
| * |
| * - openCypher |
| * - Cypher Query Language Reference (Version 9) |
| * - Grammar Specification (M13) |
| * - ANTLR Grammar (M13) |
| * - JSON (RFC 8259) |
| */ |
| |
| #include "postgres.h" |
| |
| #include "common/string.h" |
| #include "mb/pg_wchar.h" |
| |
| #include "parser/ag_scanner.h" |
| } |
| |
/* Bytes \x80-\xFF are matched explicitly (see idstart and idcont). */
%option 8bit
/* The input is always a complete in-memory buffer (see ag_scanner_create()). */
%option never-interactive
%option noyywrap
/*
 * All scanner state lives in a yyscan_t handle, and the handle carries an
 * ag_yy_extra value that the actions reach through yyextra.
 */
%option reentrant
%option extra-type="ag_yy_extra"
/* Generated symbols are named ag_yy* (e.g. ag_yylex_init()). */
%option prefix="ag_yy"
%option nounistd
%option fast noread
%option backup
/*
 * NOTE(review): perf-report appears twice on this line -- confirm that the
 * more detailed report is what is wanted.
 */
%option perf-report perf-report
/*
 * The default rule is suppressed; the {other} rule handles input that no
 * other rule matches.
 */
%option nodefault
%option warn

/* to override the default memory management */
%option noyyalloc noyyrealloc noyyfree

/* remove warnings */
%option noinput nounput
/* remove unneeded routines */
%option noyy_scan_bytes noyy_scan_string
%option noyyget_leng noyyget_text
%option noyyget_lineno noyyset_lineno
%option noyyget_in noyyset_in noyyget_out noyyset_out
%option noyyget_lval noyyset_lval noyyget_lloc noyyset_lloc
%option noyyget_debug noyyset_debug
| |
| /* |
| * whitespace rule in Cypher handles twenty-four characters out of the |
| * twenty-five characters defined as whitespace characters, four extra control |
| * characters (FS, GS, RS, and US), and Mongolian vowel separator in Unicode. |
| * |
| * Only six of them below have been considered as whitespace characters here. |
| * This character set is a superset of whitespace characters in JSON. |
| * |
| * [\t\n\v\f\r ] |
| * U+0009 CHARACTER TABULATION (HT, Horizontal Tab) |
| * U+000A LINE FEED (LF) |
| * U+000B LINE TABULATION (VT, Vertical Tab) |
| * U+000C FORM FEED (FF) |
| * U+000D CARRIAGE RETURN (CR) |
| * U+0020 SPACE |
| * |
| * The other characters are listed below for future reference. To handle them, |
| * you may use the patterns that match UTF-8 encoded code points of them. |
| * |
| * \xC2[\x85\xA0] |
| * U+0085 NEXT LINE (NEL) -- not in Cypher |
| * U+00A0 NO-BREAK SPACE |
| * \xE1\x9A\x80 |
| * U+1680 OGHAM SPACE MARK |
| * \xE2\x80[\x80-\x8A\xA8\xA9\xAF] |
| * U+2000 EN QUAD |
| * U+2001 EM QUAD |
| * U+2002 EN SPACE |
| * U+2003 EM SPACE |
| * U+2004 THREE-PER-EM SPACE |
| * U+2005 FOUR-PER-EM SPACE |
| * U+2006 SIX-PER-EM SPACE |
| * U+2007 FIGURE SPACE |
| * U+2008 PUNCTUATION SPACE |
| * U+2009 THIN SPACE |
| * U+200A HAIR SPACE |
| * U+2028 LINE SEPARATOR |
| * U+2029 PARAGRAPH SEPARATOR |
| * U+202F NARROW NO-BREAK SPACE |
| * \xE2\x81\x9F |
| * U+205F MEDIUM MATHEMATICAL SPACE |
| * \xE3\x80\x80 |
| * U+3000 IDEOGRAPHIC SPACE |
| * |
| * [\x1C-\x1F] |
| * U+001C INFORMATION SEPARATOR FOUR (FS, File Separator) |
| * U+001D INFORMATION SEPARATOR THREE (GS, Group Separator) |
| * U+001E INFORMATION SEPARATOR TWO (RS, Record Separator) |
| * U+001F INFORMATION SEPARATOR ONE (US, Unit Separator) |
| * |
| * \xE1\xA0\x8E |
| * U+180E MONGOLIAN VOWEL SEPARATOR -- not a whitespace anymore |
| */ |
/* One or more consecutive characters out of the six considered above. */
whitespace [\t\n\v\f\r ]+
| |
| /* |
| * Comment rule for multi-line comment in Cypher does not match comments that |
| * end with an odd number of "*"s before the closing sequence. |
| * Therefore, the rule has been modified so that it can match such comments. |
| */ |
/* mlcomment is active between the opening and the closing sequence. */
%x mlcomment
mlcstart "/*"
/* comment body: a run of characters other than "*", or a run of "*"s */
mlcchars [^*]+|\*+
/* one or more "*"s followed by "/" */
mlcstop \*+\/
/* "//" up to, but not including, the end of the line */
slcomment "//"[^\n\r]*
| |
| /* |
| * For numbers, unary plus and minus are handled as operators later in Cypher |
| * grammar although JSON numbers may be prefixed with an optional minus sign. |
| * |
| * JSON does not support octal and hexadecimal integer literals. |
| */ |
| |
/* building blocks for the numeric literal and Unicode escape patterns below */
digit [0-9]
hexdigit [0-9A-Fa-f]
| |
| /* |
| * digitseq pattern covers DecimalInteger and OctalInteger rules in Cypher. |
| * Integer in JSON is represented in "0|[1-9][0-9]*" pattern that is covered by |
| * digitseq pattern. |
| */ |
/* one or more decimal digits; leading zeros are accepted by the pattern */
digitseq {digit}+
| |
| /* |
| * hexint pattern covers HexInteger rule in Cypher and also accepts "0X" prefix |
| * for convenience. |
| */ |
hexint 0[Xx]{hexdigit}+
/* a prefix without any hexadecimal digit after it; its rule raises an error */
hexintfail 0[Xx]
| |
| /* |
| * decimal pattern covers RegularDecimalReal rule in Cypher and also accepts |
| * "{digitseq}\." pattern (e.g. "1.") which RegularDecimalReal rule doesn't. |
| * Decimal in JSON is represented in "(0|[1-9][0-9]*)\.[0-9]+" pattern that is |
| * covered by decimal pattern. |
| * |
| * decimalfail pattern is for ranges (e.g. "0..1"). The action for the pattern |
| * consumes digitseq and returns dot_dot back to the input stream so that |
| * dot_dot can be matched next. |
| */ |
/* digits with a fractional point: "1.5", "1." and ".5" all match */
decimal {digitseq}\.{digit}*|\.{digitseq}
/* digits immediately followed by two dots, i.e. the lower bound of a range */
decimalfail {digitseq}\.\.
| |
| /* |
| * decimalsci pattern covers ExponentDecimalReal rule in Cypher. It also |
| * accepts coefficients in "{digitseq}\." pattern and explicit positive |
| * exponents ("+") which ExponentDecimalReal rule doesn't. |
| * Scientific notation in JSON is represented in |
| * "(0|[1-9][0-9]*)(\.[0-9]+)?[Ee][+-]?[0-9]+" pattern that is covered by |
| * decimalsci pattern. |
| */ |
decimalsci ({digitseq}|{decimal})[Ee][+-]?{digitseq}
/* incomplete exponents (e.g. "1e" and "1e+"); their rule raises an error */
decimalscifail1 ({digitseq}|{decimal})[Ee]
decimalscifail2 ({digitseq}|{decimal})[Ee][+-]
| |
| /* |
| * These patterns cover StringLiteral rule in Cypher and JSON strings. |
| * The escape sequence "\/" has been added for JSON strings. |
| * |
| * esasciifail and esunicodefail patterns handle escape sequences that are not |
| * accepted by esascii and esunicode patterns respectively. |
| * |
| * Since esasciifail pattern can match anything that esascii pattern can, |
| * esascii must appear first before esasciifail in the rules section. |
| * |
| * qstru start condition is for Unicode low surrogates. |
| */ |
/* dqstr and sqstr are active inside double- and single-quoted strings. */
%x dqstr sqstr qstru
dquote \"
/* a run of characters that neither ends the string nor starts an escape */
dqchars [^"\\]+
squote '
sqchars [^'\\]+
/* backslash followed by one of the supported single-character escapes */
esascii \\["'/\\bfnrt]
/* backslash followed by at most one character other than "U" and "u" */
esasciifail \\[^Uu]?
/* backslash, then "U" with 8 or "u" with 4 hexadecimal digits */
esunicode \\(U{hexdigit}{8}|u{hexdigit}{4})
/* the same prefixes with too few hexadecimal digits */
esunicodefail \\(U{hexdigit}{0,7}|u{hexdigit}{0,3})
/* any single character, including a newline */
any (?s:.)
| |
| /* id pattern is for UnescapedSymbolicName rule in Cypher. */ |
id {idstart}{idcont}*
/*
 * Bytes \x80-\xFF are accepted, presumably so that multi-byte encoded
 * characters can be part of an identifier -- TODO confirm.
 */
idstart [A-Z_a-z\x80-\xFF]
/* idcont additionally accepts "$" and decimal digits. */
idcont [$0-9A-Z_a-z\x80-\xFF]
| |
| /* These are for EscapedSymbolicName rule in Cypher. */ |
/* bqid is active inside a backquoted identifier. */
%x bqid
bquote `
bqchars [^`]+
/* two consecutive backquotes stand for one backquote in the identifier */
esbquote {bquote}{bquote}
| |
| /* |
| * Parameter rule in Cypher is "$" followed by SymbolicName or DecimalInteger |
| * rule. However, according to "Cypher Query Language Reference", |
| * |
| * Parameters may consist of letters and numbers, and any combination of |
| * these, but cannot start with a number or a currency symbol. |
| * |
| * So, a modified version of Parameter rule that follows the above explanation |
| * has been used. |
| */ |
/* "$" immediately followed by id; the action leaves "$" out of the value */
param \${id}
| |
| /* |
| * These are tokens that are used as operators and language constructs in |
| * Cypher, and some of them are structural characters in JSON. |
| */ |
/* multi-character operators; each one is returned with its own token type */
left_contains "<@"
right_contains "@>"
any_exists "?|"
all_exists "?&"
concat "||"
access_path "#>"
lt_gt "<>"
lt_eq "<="
gt_eq ">="
dot_dot ".."
plus_eq "+="
eq_tilde "=~"
typecast "::"
/* single-character tokens; returned as AG_TOKEN_CHAR with the character */
self [?%()*+,\-./:;<=>[\]^{|}]

/* any other single character except a newline; its rule raises an error */
other .
| |
| %{ |
/*
 * Growable byte buffer used to assemble the text of a token.
 *
 * The content is not NUL-terminated while it is being built;
 * strbuf_get_str() writes the terminator when the string is requested.
 */
typedef struct strbuf
{
    char *buffer; // storage obtained from palloc()/repalloc()
    int capacity; // allocated size of "buffer" in bytes
    int length; // number of bytes of "buffer" that are in use
} strbuf;
| |
// lifetime
static void strbuf_init(strbuf *sb, int capacity);
static void strbuf_cleanup(strbuf *sb);
// appending; the buffer grows as needed
static void strbuf_append_buf(strbuf *sb, const char *b, const int len);
static void strbuf_append_char(strbuf *sb, const char c);
static void strbuf_append_codepoint(strbuf *sb, const pg_wchar c);
static void strbuf_ensure_capacity(strbuf *sb, int len);
// access to the content, and emptying it without releasing the storage
static const char *strbuf_get_str(strbuf *sb);
static void strbuf_reset(strbuf *sb);
| |
/*
 * Per-scanner state that is attached to the scanner handle (see
 * %option extra-type) and reached through yyextra in the actions.
 */
typedef struct ag_yy_extra
{
    /*
     * accumulate matched strings to build a complete literal if multiple rules
     * are needed to scan it, or keep a decimal integer literal that is
     * converted from a hexadecimal or an octal integer literal if it is too
     * large to fit in "int" type
     */
    strbuf literal_buf;

    // for Unicode surrogate pair: the pending high surrogate and the start
    // condition (dqstr or sqstr) to go back to after the low surrogate
    pg_wchar high_surrogate;
    int start_cond;

    // for the location of the current token and the actual position of it:
    // "last_loc" is a byte offset from "scan_buf"
    const char *scan_buf;
    int last_loc;
} ag_yy_extra;
| |
/*
 * Fill "token" from the integer literal "s": AG_TOKEN_INTEGER if the value
 * fits in "int", AG_TOKEN_DECIMAL carrying decimal digits otherwise.
 */
static void integer_literal_to_token(const char *s, ag_token *token,
                                     ag_yy_extra *extra);
// Both macros append the decimal form of "numstr" (digits only) to "sb".
#define hexadecimal_to_decimal(numstr, sb) _numstr_to_decimal(numstr, 16, sb)
#define octal_to_decimal(numstr, sb) _numstr_to_decimal(numstr, 8, sb)
static void _numstr_to_decimal(const char *numstr, const int base, strbuf *sb);
static uint32 hexdigit_value(const char c);
static uint32 octdigit_value(const char c);

static bool is_high_surrogate(const pg_wchar c);
static bool is_low_surrogate(const pg_wchar c);

// Remember the byte offset of the current match within the scan buffer.
#define update_location() \
    do \
    { \
        yyextra.last_loc = yytext - yyextra.scan_buf; \
    } while (0)
// Byte offset recorded by the most recent update_location().
#define get_location() (yyextra.last_loc)

// Auxiliary functions for ereport(); both use the remembered location.
#define scan_errmsg(msg) _scan_errmsg(msg, &yyextra)
static int _scan_errmsg(const char *msg, const ag_yy_extra *extra);
#define scan_errposition() _scan_errposition(yyextra.last_loc, &yyextra)
static int _scan_errposition(const int location, const ag_yy_extra *extra);
| |
/*
 * Avoid exit() on fatal scanner errors by raising a PostgreSQL error instead.
 * Call yy_fatal_error() just to keep compiler quiet.
 */
#define YY_FATAL_ERROR(msg) \
    do \
    { \
        ereport(ERROR, (errmsg_internal("%s", msg))); \
        yy_fatal_error(NULL, NULL); \
    } while (0)

/*
 * "yyscanner" must be used for the name of the parameter because it is
 * referenced internally. "yyscan_t" is OK because it is actually "void *"
 * and is the same as "ag_scanner_t".
 */
#define YY_DECL ag_token ag_scanner_next_token(yyscan_t yyscanner)
// Size of the digit buffer used for each remainder in _numstr_to_decimal();
// the remainders are taken modulo 10^9, so each has at most 9 decimal digits.
#define NDIGITS_PER_REMAINDER 9
| %} |
| |
| %% |
| |
%{
// This is used in the actions below: each action that produces a token fills
// it in and returns it by value.
ag_token token;
%}
| |
{whitespace} {
    // ignore: no token is returned for whitespace
}

{mlcstart} {
    // update location in case of unterminated comment
    update_location();
    BEGIN(mlcomment);
}

<mlcomment>{mlcchars} {
    // ignore: the text of the comment is discarded
}

<mlcomment>{mlcstop} {
    // the comment is closed; resume normal scanning
    BEGIN(INITIAL);
}

<mlcomment><<EOF>> {
    // the location is still the one recorded by the {mlcstart} rule
    ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR),
                    scan_errmsg("unterminated /* comment"),
                    scan_errposition()));
}

{slcomment} {
    // ignore: single-line comments are discarded as well
}
| |
{digitseq} |
{hexint} {
    update_location();
    // the token is AG_TOKEN_INTEGER, or AG_TOKEN_DECIMAL if the value is too
    // large to be an "int" value
    integer_literal_to_token(yytext, &token, &yyextra);
    token.location = get_location();
    return token;
}

{hexintfail} {
    update_location();
    ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR),
                    scan_errmsg("invalid hexadecimal integer literal"),
                    scan_errposition()));
}

{decimal} |
{decimalsci} {
    update_location();
    token.type = AG_TOKEN_DECIMAL;
    // the value is the matched text itself (yytext), not a copy of it
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{decimalfail} {
    // return dot_dot back to the input stream
    yyless(yyleng - 2);

    update_location();

    // consume digitseq
    integer_literal_to_token(yytext, &token, &yyextra);
    token.location = get_location();
    return token;
}

{decimalscifail1} |
{decimalscifail2} {
    // the exponent part has no digits
    update_location();
    ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR),
                    scan_errmsg("invalid scientific notation literal"),
                    scan_errposition()));
}
| |
{dquote} {
    // remember where the string starts and begin collecting its content
    update_location();
    strbuf_reset(&yyextra.literal_buf);
    BEGIN(dqstr);
}

{squote} {
    // same as above, for single-quoted strings
    update_location();
    strbuf_reset(&yyextra.literal_buf);
    BEGIN(sqstr);
}

<dqstr>{dqchars} |
<sqstr>{sqchars} {
    // ordinary characters are copied as they are
    strbuf_append_buf(&yyextra.literal_buf, yytext, yyleng);
}

<dqstr,sqstr>{esascii} {
    char c;

    // yytext[0] is the backslash and yytext[1] selects the escape
    switch (yytext[1])
    {
    case 'b':
        c = '\b';
        break;
    case 'f':
        c = '\f';
        break;
    case 'n':
        c = '\n';
        break;
    case 'r':
        c = '\r';
        break;
    case 't':
        c = '\t';
        break;
    default:
        // '"', '\'', '/', and '\\'
        c = yytext[1];
        break;
    }

    strbuf_append_char(&yyextra.literal_buf, c);
}

<dqstr,sqstr>{esasciifail} {
    if (yyleng == 1)
    {
        /*
         * This happens when the scanner meets "\"<<EOF>>. Just consume "\"
         * so that <dqstr,sqstr,qstru><<EOF>> rule can do the rest.
         */
        strbuf_append_char(&yyextra.literal_buf, '\\');
    }
    else
    {
        // point the error at the offending escape sequence
        update_location();
        ereport(ERROR,
                (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
                 scan_errmsg("invalid escape sequence"),
                 errdetail("Valid escape sequences are \\\", \\', \\/, \\\\, \\b, \\f, \\n, \\r, \\t, \\uXXXX, and \\UXXXXXXXX."),
                 scan_errposition()));
    }
}
| |
<dqstr,sqstr>{esunicode} {
    pg_wchar c;

    // It is unnecessary to check endptr and errno here.
    // yytext + 2 skips the backslash and the "U"/"u" that follows it.
    c = strtoul(yytext + 2, NULL, 16);
    if (c > 0x10FFFF)
    {
        // c is greater than the maximum value of a Unicode code point.
        update_location();
        ereport(ERROR,
                (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
                 scan_errmsg("invalid Unicode escape value"),
                 errdetail("Unicode escape values cannot be greater than 10FFFF, which is the maximum value of a code point."),
                 scan_errposition()));
    }
    else if (c > 0x7F)
    {
        // c is not an ASCII character.
        if (GetDatabaseEncoding() != PG_UTF8)
        {
            update_location();
            ereport(ERROR,
                    (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
                     scan_errmsg("unsupported Unicode escape value"),
                     errdetail("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8."),
                     scan_errposition()));
        }

        if (is_high_surrogate(c))
        {
            // keep the high surrogate and require a low surrogate next;
            // start_cond is where <qstru>{esunicode} goes back to
            yyextra.high_surrogate = c;
            yyextra.start_cond = YY_START;
            BEGIN(qstru);
        }
        else if (is_low_surrogate(c))
        {
            // a low surrogate without a high surrogate before it
            update_location();
            ereport(ERROR,
                    (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
                     scan_errmsg("invalid Unicode surrogate pair"),
                     errdetail("A low surrogate must follow a high surrogate."),
                     scan_errposition()));
        }
        else
        {
            strbuf_append_codepoint(&yyextra.literal_buf, c);
        }
    }
    else if (c > 0)
    {
        // c is an ASCII character.
        strbuf_append_char(&yyextra.literal_buf, (char)c);
    }
    else
    {
        /*
         * U+0000 NUL is the minimum value of a Unicode code point.
         * However, it is invalid in quoted strings as well as query strings.
         */
        update_location();
        ereport(ERROR,
                (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
                 scan_errmsg("unsupported Unicode escape value"),
                 errdetail("Unicode code point value 0000 is not allowed in quoted strings."),
                 scan_errposition()));
    }
}
| |
<qstru>{esunicode} {
    pg_wchar c;

    // the escape right after a high surrogate must be a low surrogate
    c = strtoul(yytext + 2, NULL, 16);
    if (is_low_surrogate(c))
    {
        c = surrogate_pair_to_codepoint(yyextra.high_surrogate, c);
        // 0x010000 <= c <= 0x10FFFF always holds for surrogate pairs.
        strbuf_append_codepoint(&yyextra.literal_buf, c);
        // go back to the start condition the high surrogate was found in
        BEGIN(yyextra.start_cond);
    }
    else
    {
        update_location();
        ereport(ERROR,
                (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
                 scan_errmsg("invalid Unicode surrogate pair"),
                 errdetail("A low surrogate must follow a high surrogate."),
                 scan_errposition()));
    }
}

<dqstr,sqstr,qstru>{esunicodefail} {
    // "\U" or "\u" with too few hexadecimal digits after it
    update_location();
    ereport(ERROR,
            (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
             scan_errmsg("invalid Unicode escape sequence"),
             errhint("Unicode escape sequences must be \\uXXXX or \\UXXXXXXXX."),
             scan_errposition()));
}

<qstru>{any} {
    // anything but a Unicode escape sequence after a high surrogate
    update_location();
    ereport(ERROR, (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
                    scan_errmsg("invalid Unicode surrogate pair"),
                    errdetail("A low surrogate must follow a high surrogate."),
                    scan_errposition()));
}
| |
<dqstr>{dquote} |
<sqstr>{squote} {
    BEGIN(INITIAL);

    /*
     * In quoted strings, only Unicode escape sequences need to be verified,
     * and the actions for <dqstr,sqstr>{esunicode} and <qstru>{esunicode}
     * rules verify the code point values. So, quoted strings are always valid.
     */

    token.type = AG_TOKEN_STRING;
    // the value points into literal_buf, which later tokens reuse
    token.value.s = strbuf_get_str(&yyextra.literal_buf);
    // the location is the one recorded at the opening quote
    token.location = get_location();
    return token;
}

<dqstr,sqstr,qstru><<EOF>> {
    // the location is still the one recorded at the opening quote unless an
    // escape sequence rule has updated it
    ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR),
                    scan_errmsg("unterminated quoted string"),
                    scan_errposition()));
}
| |
{id} {
    update_location();
    token.type = AG_TOKEN_IDENTIFIER;
    // the value is the matched text itself (yytext), not a copy of it
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{bquote} {
    // remember where the identifier starts and begin collecting its content
    update_location();
    strbuf_reset(&yyextra.literal_buf);
    BEGIN(bqid);
}

<bqid>{bqchars} {
    strbuf_append_buf(&yyextra.literal_buf, yytext, yyleng);
}

<bqid>{esbquote} {
    // a doubled backquote stands for a single one
    strbuf_append_char(&yyextra.literal_buf, '`');
}

<bqid>{bquote} {
    BEGIN(INITIAL);

    // nothing was collected between the backquotes
    if (yyextra.literal_buf.length == 0)
    {
        ereport(ERROR, (errcode(ERRCODE_INVALID_NAME),
                        scan_errmsg("zero-length quoted identifier"),
                        scan_errposition()));
    }

    token.type = AG_TOKEN_IDENTIFIER;
    token.value.s = strbuf_get_str(&yyextra.literal_buf);
    // the location is the one recorded at the opening backquote
    token.location = get_location();
    return token;
}

<bqid><<EOF>> {
    ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR),
                    scan_errmsg("unterminated quoted identifier"),
                    scan_errposition()));
}

{param} {
    update_location();
    token.type = AG_TOKEN_PARAMETER;
    // skip the leading "$"
    token.value.s = yytext + 1;
    token.location = get_location();
    return token;
}
| |
{concat} {
    // every operator rule below has the same shape: record the location,
    // set the token type, and hand out the matched text as the value
    update_location();
    token.type = AG_TOKEN_CONCAT;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{access_path} {
    update_location();
    token.type = AG_TOKEN_ACCESS_PATH;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{any_exists} {
    update_location();
    token.type = AG_TOKEN_ANY_EXISTS;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{left_contains} {
    update_location();
    token.type = AG_TOKEN_LEFT_CONTAINS;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{right_contains} {
    update_location();
    token.type = AG_TOKEN_RIGHT_CONTAINS;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{all_exists} {
    update_location();
    token.type = AG_TOKEN_ALL_EXISTS;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{lt_gt} {
    update_location();
    token.type = AG_TOKEN_LT_GT;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{lt_eq} {
    update_location();
    token.type = AG_TOKEN_LT_EQ;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{gt_eq} {
    update_location();
    token.type = AG_TOKEN_GT_EQ;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{dot_dot} {
    update_location();
    token.type = AG_TOKEN_DOT_DOT;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{plus_eq} {
    update_location();
    token.type = AG_TOKEN_PLUS_EQ;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{eq_tilde} {
    update_location();
    token.type = AG_TOKEN_EQ_TILDE;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{typecast} {
    update_location();
    token.type = AG_TOKEN_TYPECAST;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{self} {
    update_location();
    token.type = AG_TOKEN_CHAR;
    // the value is the character itself rather than a string
    token.value.c = yytext[0];
    token.location = get_location();
    return token;
}

{other} {
    // a character that none of the rules above accepts
    update_location();
    ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR),
                    scan_errmsg("unexpected character"),
                    scan_errposition()));
}

<<EOF>> {
    // the end of the input is reported to the caller as AG_TOKEN_NULL
    update_location();
    token.type = AG_TOKEN_NULL;
    token.value.c = '\0';
    token.location = get_location();
    return token;
}
| |
| %% |
| |
| /* |
| * Override the default memory management to make flex use palloc() instead of |
| * malloc(). |
| */ |
| |
| void *ag_yyalloc(yy_size_t size, yyscan_t yyscanner) |
| { |
| return palloc(size); |
| } |
| |
| void *ag_yyrealloc(void *ptr, yy_size_t size, yyscan_t yyscanner) |
| { |
| // see realloc(3) |
| if (ptr) |
| { |
| if (size == 0) |
| { |
| pfree(ptr); |
| return NULL; |
| } |
| else |
| { |
| return repalloc(ptr, size); |
| } |
| } |
| else |
| { |
| return palloc(size); |
| } |
| } |
| |
| void ag_yyfree(void *ptr, yyscan_t yyscanner) |
| { |
| if (ptr) |
| pfree(ptr); |
| } |
| |
| static void strbuf_init(strbuf *sb, int capacity) |
| { |
| sb->buffer = palloc(capacity); |
| sb->capacity = capacity; |
| sb->length = 0; |
| } |
| |
| static void strbuf_cleanup(strbuf *sb) |
| { |
| if (sb->buffer) |
| pfree(sb->buffer); |
| } |
| |
| static void strbuf_append_buf(strbuf *sb, const char *b, const int len) |
| { |
| strbuf_ensure_capacity(sb, sb->length + len); |
| memcpy(sb->buffer + sb->length, b, len); |
| sb->length += len; |
| } |
| |
| static void strbuf_append_char(strbuf *sb, const char c) |
| { |
| strbuf_ensure_capacity(sb, sb->length + 1); |
| sb->buffer[sb->length] = c; |
| sb->length += 1; |
| } |
| |
| static void strbuf_append_codepoint(strbuf *sb, const pg_wchar c) |
| { |
| unsigned char buf[6]; |
| |
| unicode_to_utf8(c, buf); |
| strbuf_append_buf(sb, (char *)buf, pg_utf_mblen(buf)); |
| } |
| |
| /* |
| * len cannot be greater than MaxAllocSize because ReadCommand() reads |
| * a message and places the message body in StringInfo. |
| */ |
| static void strbuf_ensure_capacity(strbuf *sb, int len) |
| { |
| // consider additional 1 byte for the last '\0' character |
| if (len < sb->capacity) |
| return; |
| |
| do |
| { |
| sb->capacity *= 2; |
| } while (sb->capacity <= len); |
| |
| sb->buffer = repalloc(sb->buffer, sb->capacity); |
| } |
| |
| static const char *strbuf_get_str(strbuf *sb) |
| { |
| sb->buffer[sb->length] = '\0'; |
| return sb->buffer; |
| } |
| |
| static void strbuf_reset(strbuf *sb) |
| { |
| sb->length = 0; |
| } |
| |
| static void integer_literal_to_token(const char *s, ag_token *token, |
| ag_yy_extra *extra) |
| { |
| char *endptr; |
| int i; |
| |
| errno = 0; |
| i = strtoint(s, &endptr, 0); |
| |
| /* |
| * This is only needed for invalid octal integer literals. (e.g. "08") |
| * Other cases cannot happen because of digitseq and hexint rules. |
| */ |
| if (*endptr != '\0') |
| { |
| ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), |
| _scan_errmsg("invalid octal integer literal", extra), |
| _scan_errposition(extra->last_loc, extra))); |
| } |
| |
| // Treat it as a decimal if it is too large to be an "int" value. |
| if (errno == ERANGE) |
| { |
| /* |
| * Accessing s[0] and s[1] is safe because ERANGE is returned only if |
| * there are 10 or more characters in s. In this case, the shortest |
| * integer literals for decimal, hexadecimal, and octal integers are |
| * "2147483648", "0x80000000", and "020000000000" respectively. |
| */ |
| if (s[0] == '0') |
| { |
| strbuf_reset(&extra->literal_buf); |
| |
| /* |
| * No matter how many characters s has, if all digits in s are |
| * zeros, strtoint() returns 0 without an error. |
| * So, _numstr_to_decimal() assumes that there is at least one |
| * non-zero digit in s. |
| */ |
| if (s[1] == 'X' || s[1] == 'x') |
| hexadecimal_to_decimal(s + 2, &extra->literal_buf); |
| else |
| octal_to_decimal(s + 1, &extra->literal_buf); |
| |
| s = strbuf_get_str(&extra->literal_buf); |
| } |
| token->type = AG_TOKEN_DECIMAL; |
| token->value.s = s; |
| return; |
| } |
| |
| token->type = AG_TOKEN_INTEGER; |
| token->value.i = i; |
| } |
| |
/*
 * convert a string of a hexadecimal or an octal integer to a string of the
 * corresponding decimal integer
 *
 * numstr: NUL-terminated digits without the "0x"/"0" prefix; there must be at
 *         least one non-zero digit
 * base:   16 or 8
 * sb:     the decimal digits are appended to it; it is not reset here
 *
 * The digits are packed into an array of binary words first, the most
 * significant word first. The array is then divided by 10^9 repeatedly until
 * nothing is left; each division gives the next 9 decimal digits as its
 * remainder, starting from the least significant ones.
 */
static void _numstr_to_decimal(const char *numstr, const int base, strbuf *sb)
{
    // constants for each base
    int ndigits_per_word;
    int nbits_per_digit;
    uint32 (*digit_value)(const char);

    /*
     * constants for the conversion
     *
     * "divisor" is 10^9.
     *
     * At most 3 divisions are needed to eliminate 1 word.
     * hex: 4294967295999999999 -> 4294967295 -> 4 -> 0
     * oct: 1073741823999999999 -> 1073741823 -> 1 -> 0
     */
    const uint64 divisor = 1000000000;
    const int ndivisions = 3;

    int ndigits;
    int nwords;
    uint32 *words;
    const char *digitp;
    int word_i;
    int ndigits_word0;
    uint32 word;
    uint32 *remainders;
    int nremainders;
    int i;

    // set constants for each base
    switch (base)
    {
    case 16:
        /*
         * Hexadecimal
         *
         * Maximum value for each word
         *     0xFFFFFFFF = 4294967295
         * Divisor
         *     0x3B9ACA00 = 1000000000
         * Maximum remainder
         *     0x3B9AC9FF = 999999999
         *
         * Maximum dividend
         *     0x3B9AC9FFFFFFFFFF = 4294967295999999999
         * Quotient of the maximum dividend and the divisor
         *     0xFFFFFFFF = 4294967295
         * Remainder of the above division
         *     0x3B9AC9FF = 999999999
         */
        ndigits_per_word = 8;
        nbits_per_digit = 4;
        digit_value = hexdigit_value;
        break;
    case 8:
        /*
         * Octal
         *
         * Maximum value for each word
         *     07777777777 = 1073741823
         * Divisor
         *     07346545000 = 1000000000
         * Maximum remainder
         *     07346544777 = 999999999
         *
         * Maximum dividend
         *     073465447777777777777 = 1073741823999999999
         * Quotient of the maximum dividend and the divisor
         *     07777777777 = 1073741823
         * Remainder of the above division
         *     07346544777 = 999999999
         */
        ndigits_per_word = 10;
        nbits_per_digit = 3;
        digit_value = octdigit_value;
        break;
    default:
        Assert(!"invalid base");
        return;
    }

    // skip leading zeros
    while (*numstr == '0')
        numstr++;

    // number of digits in "numstr"
    ndigits = strlen(numstr);
    Assert(ndigits > 0);

    // prepare "words" to hold the binary value of "numstr"; words[0] is the
    // most significant word
    nwords = (ndigits + (ndigits_per_word - 1)) / ndigits_per_word;
    words = palloc(sizeof(*words) * nwords);

    digitp = numstr;
    word_i = 0;

    // number of digits for the first word; only the first word may hold
    // fewer than "ndigits_per_word" digits
    ndigits_word0 = ndigits % ndigits_per_word;
    if (ndigits_word0 == 0)
        ndigits_word0 = ndigits_per_word;

    // fill the first word
    word = digit_value(*digitp++);
    for (i = 1; i < ndigits_word0; i++)
    {
        word <<= nbits_per_digit;
        word |= digit_value(*digitp++);
    }
    words[word_i++] = word;

    // fill the rest of "words"
    while (word_i < nwords)
    {
        word = digit_value(*digitp++);
        for (i = 1; i < ndigits_per_word; i++)
        {
            word <<= nbits_per_digit;
            word |= digit_value(*digitp++);
        }
        words[word_i++] = word;
    }

    // At most "ndivisions" divisions are needed to eliminate 1 word.
    remainders = palloc(sizeof(*remainders) * (ndivisions * nwords));

    nremainders = 0;
    word_i = 0;
    // repeat dividing "words" by "divisor" until the quotient becomes 0
    while (word_i < nwords)
    {
        uint64 r;

        r = 0;
        // divide "words" by "divisor"; this is a long division in which the
        // remainder of a word is carried over to the next word
        for (i = word_i; i < nwords; i++)
        {
            uint64 d;
            uint64 q;

            // the carried remainder goes above the bits of the current word
            d = (uint64)words[i];
            d |= r << (nbits_per_digit * ndigits_per_word);

            q = d / divisor;
            r = d % divisor;

            // "words" is overwritten with the quotient
            words[i] = (uint32)q;
        }

        // collect the remainder to build the result
        remainders[nremainders++] = (uint32)r;

        /*
         * Divisions over the first effective word is done
         * and "words" is getting closer to 0.
         */
        if (words[word_i] == 0)
            word_i++;
    }

    // convert the collected remainders to a string, starting from the last one
    for (i = nremainders - 1; i >= 0; i--)
    {
        char buf[NDIGITS_PER_REMAINDER];
        int buf_i;
        uint32 tmp;

        // "buf" is filled from its end towards its beginning
        buf_i = NDIGITS_PER_REMAINDER;

        for (tmp = remainders[i]; tmp > 0; tmp /= 10)
            buf[--buf_i] = '0' + (char)(tmp % 10);

        // leading zeros for intermediate digits; the first group written
        // (the last remainder collected) is not padded
        if (i < nremainders - 1)
        {
            while (buf_i > 0)
                buf[--buf_i] = '0';
        }

        strbuf_append_buf(sb, &buf[buf_i], NDIGITS_PER_REMAINDER - buf_i);
    }

    pfree(remainders);
    pfree(words);
}
| |
| static uint32 hexdigit_value(const char c) |
| { |
| if (c >= '0' && c <= '9') |
| return c - '0'; |
| |
| if (c >= 'A' && c <= 'F') |
| return 0xA + (c - 'A'); |
| |
| Assert(c >= 'a' && c <= 'f'); |
| return 0xA + (c - 'a'); |
| } |
| |
| static uint32 octdigit_value(const char c) |
| { |
| Assert(c >= '0' && c <= '7'); |
| return c - '0'; |
| } |
| |
| static bool is_high_surrogate(const pg_wchar c) |
| { |
| return (c >= 0xD800 && c <= 0xDBFF); |
| } |
| |
| static bool is_low_surrogate(const pg_wchar c) |
| { |
| return (c >= 0xDC00 && c <= 0xDFFF); |
| } |
| |
| static int _scan_errmsg(const char *msg, const ag_yy_extra *extra) |
| { |
| const char *t = extra->scan_buf + extra->last_loc; |
| |
| if (t[0] == YY_END_OF_BUFFER_CHAR) |
| return errmsg("%s at end of input", msg); |
| else |
| return errmsg("%s at or near \"%s\"", msg, t); |
| } |
| |
| static int _scan_errposition(const int location, const ag_yy_extra *extra) |
| { |
| int pos; |
| |
| // no-op if location is unknown |
| if (location < 0) |
| return 0; |
| |
| // convert byte offset to number of characters |
| pos = pg_mbstrlen_with_len(extra->scan_buf, location) + 1; |
| |
| return errposition(pos); |
| } |
| |
| ag_scanner_t ag_scanner_create(const char *s) |
| { |
| Size len; |
| char *buf; |
| yyscan_t yyscanner; |
| ag_yy_extra extra; |
| int ret; |
| |
| // The last two YY_END_OF_BUFFER_CHAR are required by flex. |
| len = strlen(s); |
| buf = palloc(len + 2); |
| memcpy(buf, s, len); |
| buf[len] = YY_END_OF_BUFFER_CHAR; |
| buf[len + 1] = YY_END_OF_BUFFER_CHAR; |
| |
| ret = ag_yylex_init(&yyscanner); |
| if (ret) |
| elog(ERROR, "ag_yylex_init() failed: %m"); |
| |
| strbuf_init(&extra.literal_buf, 1024); |
| extra.high_surrogate = 0; |
| extra.start_cond = INITIAL; |
| extra.scan_buf = buf; |
| extra.last_loc = 0; |
| ag_yyset_extra(extra, yyscanner); |
| |
| ag_yy_scan_buffer(buf, len + 2, yyscanner); |
| |
| return yyscanner; |
| } |
| |
| void ag_scanner_destroy(ag_scanner_t scanner) |
| { |
| ag_yy_extra extra; |
| |
| extra = ag_yyget_extra(scanner); |
| strbuf_cleanup(&extra.literal_buf); |
| |
| ag_yylex_destroy(scanner); |
| } |
| |
| int ag_scanner_errmsg(const char *msg, ag_scanner_t *scanner) |
| { |
| ag_yy_extra extra; |
| |
| extra = ag_yyget_extra(scanner); |
| |
| return _scan_errmsg(msg, &extra); |
| } |
| |
| int ag_scanner_errposition(const int location, ag_scanner_t *scanner) |
| { |
| ag_yy_extra extra; |
| |
| extra = ag_yyget_extra(scanner); |
| |
| return _scan_errposition(location, &extra); |
| } |