src/backend/parser/ag_scanner.l - age - Git at Google

 %top{
 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 /*
  * The rules in this scanner implementation are based on the following.
  *
  *     - openCypher
  *           - Cypher Query Language Reference (Version 9)
  *           - Grammar Specification (M13)
  *           - ANTLR Grammar (M13)
  *     - JSON (RFC 8259)
  */

 #include "postgres.h"

 #include "common/string.h"
 #include "mb/pg_wchar.h"

 #include "parser/ag_scanner.h"
 }

 /*
  * Scanner generation options.
  *
  * The generated scanner is reentrant and keeps its per-scanner state in a
  * value of type "ag_yy_extra" (accessed as "yyextra" in the actions below).
  * Generated symbols use the "ag_yy" prefix (e.g. ag_yylex_init()).
  *
  * Because of "nodefault", no default rule is generated; the {other} rule in
  * the rules section reports any character that no other rule matches.
  *
  * NOTE(review): "perf-report" is listed twice on one line. The flex manual
  * says that giving the flag twice also reports minor performance issues --
  * confirm that this is the intent.
  */
 %option 8bit
 %option never-interactive
 %option noyywrap
 %option reentrant
 %option extra-type="ag_yy_extra"
 %option prefix="ag_yy"
 %option nounistd
 %option fast noread
 %option backup
 %option perf-report perf-report
 %option nodefault
 %option warn

 /*
  * to override the default memory management
  * (ag_yyalloc(), ag_yyrealloc(), and ag_yyfree() are defined in the user code
  * section of this file)
  */
 %option noyyalloc noyyrealloc noyyfree

 /* remove warnings */
 %option noinput nounput
 /* remove unneeded routines */
 %option noyy_scan_bytes noyy_scan_string
 %option noyyget_leng noyyget_text
 %option noyyget_lineno noyyset_lineno
 %option noyyget_in noyyset_in noyyget_out noyyset_out
 %option noyyget_lval noyyset_lval noyyget_lloc noyyset_lloc
 %option noyyget_debug noyyset_debug

 /*
  * whitespace rule in Cypher handles twenty-four characters out of the
  * twenty-five characters defined as whitespace characters, four extra control
  * characters (FS, GS, RS, and US), and Mongolian vowel separator in Unicode.
  *
  * Only six of them below have been considered as whitespace characters here.
  * This character set is a superset of whitespace characters in JSON.
  *
  *     [\t\n\v\f\r ]
  *         U+0009 CHARACTER TABULATION (HT, Horizontal Tab)
  *         U+000A LINE FEED (LF)
  *         U+000B LINE TABULATION (VT, Vertical Tab)
  *         U+000C FORM FEED (FF)
  *         U+000D CARRIAGE RETURN (CR)
  *         U+0020 SPACE
  *
  * The other characters are listed below for future reference. To handle them,
  * you may use the patterns that match UTF-8 encoded code points of them.
  *
  *     \xC2[\x85\xA0]
  *         U+0085 NEXT LINE (NEL) -- not in Cypher
  *         U+00A0 NO-BREAK SPACE
  *     \xE1\x9A\x80
  *         U+1680 OGHAM SPACE MARK
  *     \xE2\x80[\x80-\x8A\xA8\xA9\xAF]
  *         U+2000 EN QUAD
  *         U+2001 EM QUAD
  *         U+2002 EN SPACE
  *         U+2003 EM SPACE
  *         U+2004 THREE-PER-EM SPACE
  *         U+2005 FOUR-PER-EM SPACE
  *         U+2006 SIX-PER-EM SPACE
  *         U+2007 FIGURE SPACE
  *         U+2008 PUNCTUATION SPACE
  *         U+2009 THIN SPACE
  *         U+200A HAIR SPACE
  *         U+2028 LINE SEPARATOR
  *         U+2029 PARAGRAPH SEPARATOR
  *         U+202F NARROW NO-BREAK SPACE
  *     \xE2\x81\x9F
  *         U+205F MEDIUM MATHEMATICAL SPACE
  *     \xE3\x80\x80
  *         U+3000 IDEOGRAPHIC SPACE
  *
  *     [\x1C-\x1F]
  *         U+001C INFORMATION SEPARATOR FOUR (FS, File Separator)
  *         U+001D INFORMATION SEPARATOR THREE (GS, Group Separator)
  *         U+001E INFORMATION SEPARATOR TWO (RS, Record Separator)
  *         U+001F INFORMATION SEPARATOR ONE (US, Unit Separator)
  *
  *     \xE1\xA0\x8E
  *         U+180E MONGOLIAN VOWEL SEPARATOR -- not a whitespace anymore
  */
 whitespace [\t\n\v\f\r ]+

 /*
  * Comment rule for multi-line comment in Cypher does not match comments that
  * end with an odd number of "*"s before the closing sequence.
  * Therefore, the rule has been modified so that it can match such comments.
  */
 /* start condition that is active while scanning inside a multi-line comment */
 %x mlcomment
 mlcstart  "/*"
 /* body of a comment: a run of non-"*" characters, or a run of "*"s */
 mlcchars  [^*]+|\*+
 /* closing sequence: one or more "*"s immediately followed by "/" */
 mlcstop    \*+\/
 /* single-line comment: "//" up to, but not including, the next LF or CR */
 slcomment "//"[^\n\r]*

 /*
  * For numbers, unary plus and minus are handled as operators later in Cypher
  * grammar although JSON numbers may be prefixed with an optional minus sign.
  *
  * JSON does not support octal and hexadecimal integer literals.
  */

 digit    [0-9]
 hexdigit [0-9A-Fa-f]

 /*
  * digitseq pattern covers DecimalInteger and OctalInteger rules in Cypher.
  * Integer in JSON is represented in "0|[1-9][0-9]*" pattern that is covered by
  * digitseq pattern.
  */
 digitseq {digit}+

 /*
  * hexint pattern covers HexInteger rule in Cypher and also accepts "0X" prefix
  * for convenience.
  */
 hexint     0[Xx]{hexdigit}+
 hexintfail 0[Xx]

 /*
  * decimal pattern covers RegularDecimalReal rule in Cypher and also accepts
  * "{digitseq}\." pattern (e.g. "1.") which RegularDecimalReal rule doesn't.
  * Decimal in JSON is represented in "(0|[1-9][0-9]*)\.[0-9]+" pattern that is
  * covered by decimal pattern.
  *
  * decimalfail pattern is for ranges (e.g. "0..1"). The action for the pattern
  * consumes digitseq and returns dot_dot back to the input stream so that
  * dot_dot can be matched next.
  */
 decimal     {digitseq}\.{digit}*|\.{digitseq}
 decimalfail {digitseq}\.\.

 /*
  * decimalsci pattern covers ExponentDecimalReal rule in Cypher. It also
  * accepts coefficients in "{digitseq}\." pattern and explicit positive
  * exponents ("+") which ExponentDecimalReal rule doesn't.
  * Scientific notation in JSON is represented in
  * "(0|[1-9][0-9]*)(\.[0-9]+)?[Ee][+-]?[0-9]+" pattern that is covered by
  * decimalsci pattern.
  */
 decimalsci      ({digitseq}|{decimal})[Ee][+-]?{digitseq}
 decimalscifail1 ({digitseq}|{decimal})[Ee]
 decimalscifail2 ({digitseq}|{decimal})[Ee][+-]

 /*
  * These patterns cover StringLiteral rule in Cypher and JSON strings.
  * The escape sequence "\/" has been added for JSON strings.
  *
  * esasciifail and esunicodefail patterns handle escape sequences that are not
  * accepted by esascii and esunicode patterns respectively.
  *
  * Since esasciifail pattern can match anything that esascii pattern can,
  * esascii must appear first before esasciifail in the rules section.
  *
  * qstru start condition is for Unicode low surrogates.
  */
 /*
  * dqstr and sqstr are active inside double-quoted and single-quoted strings
  * respectively. qstru is entered from either of them right after a high
  * surrogate escape has been scanned.
  */
 %x dqstr sqstr qstru
 dquote        \"
 dqchars       [^"\\]+
 squote        '
 sqchars       [^'\\]+
 esascii       \\["'/\\bfnrt]
 /*
  * The character after the backslash is optional so that a lone backslash
  * (yyleng == 1) is matched too; the action handles that case separately.
  */
 esasciifail   \\[^Uu]?
 esunicode     \\(U{hexdigit}{8}|u{hexdigit}{4})
 /* a Unicode escape with too few hexadecimal digits */
 esunicodefail \\(U{hexdigit}{0,7}|u{hexdigit}{0,3})
 /* any single character, including newline */
 any           (?s:.)

 /*
  * id pattern is for UnescapedSymbolicName rule in Cypher.
  *
  * An identifier starts with an ASCII letter, "_", or any byte in the range
  * \x80-\xFF, and continues with the same set plus digits and "$".
  * Accepting bytes \x80-\xFF lets every non-ASCII byte be part of an
  * identifier without decoding it.
  */
 id      {idstart}{idcont}*
 idstart [A-Z_a-z\x80-\xFF]
 idcont  [$0-9A-Z_a-z\x80-\xFF]

 /* These are for EscapedSymbolicName rule in Cypher. */
 %x bqid
 bquote   `
 bqchars  [^`]+
 esbquote {bquote}{bquote}

 /*
  * Parameter rule in Cypher is "$" followed by SymbolicName or DecimalInteger
  * rule. However, according to "Cypher Query Language Reference",
  *
  *     Parameters may consist of letters and numbers, and any combination of
  *     these, but cannot start with a number or a currency symbol.
  *
  * So, a modified version of Parameter rule that follows the above explanation
  * has been used.
  */
 param \${id}

 /*
  * These are tokens that are used as operators and language constructs in
  * Cypher, and some of them are structural characters in JSON.
  */
 /* multi-character operators; each one has its own token type */
 any_exists  "?|"
 all_exists  "?&"
 concat      "||"
 access_path "#>"
 lt_gt       "<>"
 lt_eq       "<="
 gt_eq       ">="
 dot_dot     ".."
 plus_eq     "+="
 eq_tilde    "=~"
 typecast    "::"
 /*
  * single-character tokens; the rule returns AG_TOKEN_CHAR with the character
  * itself as the value
  */
 self        [?%()*+,\-./:;<=>[\]^{|}]

 /*
  * any other single character (except newline); the rule for this pattern
  * reports "unexpected character"
  */
 other .

 %{
 /*
  * A growable byte buffer used to accumulate literals.
  *
  * strbuf_ensure_capacity() keeps "length" strictly less than "capacity" so
  * that strbuf_get_str() can always write the terminating '\0' at
  * buffer[length].
  */
 typedef struct strbuf
 {
     char *buffer; // storage obtained from palloc()/repalloc()
     int capacity; // allocated size of "buffer" in bytes
     int length; // number of bytes currently stored (no terminating '\0')
 } strbuf;

 static void strbuf_init(strbuf *sb, int capacity);
 static void strbuf_cleanup(strbuf *sb);
 static void strbuf_append_buf(strbuf *sb, const char *b, const int len);
 static void strbuf_append_char(strbuf *sb, const char c);
 static void strbuf_append_codepoint(strbuf *sb, const pg_wchar c);
 static void strbuf_ensure_capacity(strbuf *sb, int len);
 static const char *strbuf_get_str(strbuf *sb);
 static void strbuf_reset(strbuf *sb);

 /*
  * Per-scanner state. The scanner stores it by value (see %option extra-type)
  * and the actions access it through "yyextra".
  */
 typedef struct ag_yy_extra
 {
     /*
      * accumulate matched strings to build a complete literal if multiple rules
      * are needed to scan it, or keep a decimal integer literal that is
      * converted from a hexadecimal or an octal integer literal if it is too
      * large to fit in "int" type
      */
     strbuf literal_buf;

     // for Unicode surrogate pair
     // high_surrogate: the high surrogate scanned just before entering qstru
     // start_cond: the start condition (dqstr or sqstr) to go back to after
     //             the low surrogate has been scanned
     pg_wchar high_surrogate;
     int start_cond;

     // for the location of the current token and the actual position of it
     // scan_buf: beginning of the buffer that is being scanned
     // last_loc: byte offset from scan_buf, set by update_location()
     const char *scan_buf;
     int last_loc;
 } ag_yy_extra;

 static void integer_literal_to_token(const char *s, ag_token *token,
                                      ag_yy_extra *extra);
 #define hexadecimal_to_decimal(numstr, sb) _numstr_to_decimal(numstr, 16, sb)
 #define octal_to_decimal(numstr, sb) _numstr_to_decimal(numstr, 8, sb)
 static void _numstr_to_decimal(const char *numstr, const int base, strbuf *sb);
 static uint32 hexdigit_value(const char c);
 static uint32 octdigit_value(const char c);

 static bool is_high_surrogate(const pg_wchar c);
 static bool is_low_surrogate(const pg_wchar c);

 /*
  * Record the byte offset of the current match (yytext) from the beginning of
  * the scan buffer as the location of the current token.
  */
 #define update_location() \
     do \
     { \
         yyextra.last_loc = yytext - yyextra.scan_buf; \
     } while (0)
 // the location recorded by the most recent update_location()
 #define get_location() (yyextra.last_loc)

 // build the primary error message for the location recorded in yyextra
 #define scan_errmsg(msg) _scan_errmsg(msg, &yyextra)
 static int _scan_errmsg(const char *msg, const ag_yy_extra *extra);
 // report the error cursor position for the location recorded in yyextra
 #define scan_errposition() _scan_errposition(yyextra.last_loc, &yyextra)
 static int _scan_errposition(const int location, const ag_yy_extra *extra);

 /*
  * Avoid exit() on fatal scanner errors.
  * Call yy_fatal_error() just to keep compiler quiet.
  */
 #define YY_FATAL_ERROR(msg) \
     do \
     { \
         ereport(ERROR, (errmsg_internal("%s", msg))); \
         yy_fatal_error(NULL, NULL); \
     } while (0)

 /*
  * "yyscanner" must be used for the name of the parameter because it is
  * referenced internally. "yyscan_t" is OK because it is actually "void *"
  * and is the same as "ag_scanner_t".
  */
 #define YY_DECL ag_token ag_scanner_next_token(yyscan_t yyscanner)
 /*
  * Number of decimal digits that each remainder collected in
  * _numstr_to_decimal() stands for. It matches the divisor (10^9) used there.
  */
 #define NDIGITS_PER_REMAINDER 9
 %}

 %%

 %{
 // This is used in the actions below.
 ag_token token;
 %}

 {whitespace} {
     // ignore
 }

 {mlcstart} {
     // update location in case of unterminated comment
     update_location();
     BEGIN(mlcomment);
 }

 <mlcomment>{mlcchars} {
     // ignore the body of the comment
 }

 <mlcomment>{mlcstop} {
     // the closing sequence has been found; go back to normal scanning
     BEGIN(INITIAL);
 }

 <mlcomment><<EOF>> {
     // the reported position is the one recorded by the {mlcstart} action
     ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR),
                     scan_errmsg("unterminated /* comment"),
                     scan_errposition()));
 }

 {slcomment} {
     // ignore
 }

 {digitseq} |
 {hexint} {
     /*
      * The token is AG_TOKEN_INTEGER if the value fits in "int". Otherwise,
      * integer_literal_to_token() makes it AG_TOKEN_DECIMAL.
      */
     update_location();
     integer_literal_to_token(yytext, &token, &yyextra);
     token.location = get_location();
     return token;
 }

 {hexintfail} {
     // "0x" or "0X" that is not followed by any hexadecimal digit
     update_location();
     ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR),
                     scan_errmsg("invalid hexadecimal integer literal"),
                     scan_errposition()));
 }

 {decimal} |
 {decimalsci} {
     // The value points to yytext as is; no conversion is done here.
     update_location();
     token.type = AG_TOKEN_DECIMAL;
     token.value.s = yytext;
     token.location = get_location();
     return token;
 }

 {decimalfail} {
     // return dot_dot back to the input stream
     yyless(yyleng - 2);

     update_location();

     // consume digitseq
     integer_literal_to_token(yytext, &token, &yyextra);
     token.location = get_location();
     return token;
 }

 {decimalscifail1} |
 {decimalscifail2} {
     // an exponent marker (and an optional sign) without exponent digits
     update_location();
     ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR),
                     scan_errmsg("invalid scientific notation literal"),
                     scan_errposition()));
 }

 {dquote} {
     // remember where the string starts and start a new literal
     update_location();
     strbuf_reset(&yyextra.literal_buf);
     BEGIN(dqstr);
 }

 {squote} {
     // remember where the string starts and start a new literal
     update_location();
     strbuf_reset(&yyextra.literal_buf);
     BEGIN(sqstr);
 }

 <dqstr>{dqchars} |
 <sqstr>{sqchars} {
     // ordinary characters are copied to the literal as they are
     strbuf_append_buf(&yyextra.literal_buf, yytext, yyleng);
 }

 <dqstr,sqstr>{esascii} {
     // yytext is a backslash followed by exactly one character
     const char escaped = yytext[1];
     char decoded;

     if (escaped == 'b')
         decoded = '\b';
     else if (escaped == 'f')
         decoded = '\f';
     else if (escaped == 'n')
         decoded = '\n';
     else if (escaped == 'r')
         decoded = '\r';
     else if (escaped == 't')
         decoded = '\t';
     else
         decoded = escaped; // '"', '\'', '/', and '\\' stand for themselves

     strbuf_append_char(&yyextra.literal_buf, decoded);
 }

 <dqstr,sqstr>{esasciifail} {
     if (yyleng != 1)
     {
         // a backslash followed by a character that is not a valid escape
         update_location();
         ereport(ERROR,
                 (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
                  scan_errmsg("invalid escape sequence"),
                  errdetail("Valid escape sequences are \\\", \\', \\/, \\\\, \\b, \\f, \\n, \\r, \\t, \\uXXXX, and \\UXXXXXXXX."),
                  scan_errposition()));
     }
     else
     {
         /*
          * Only the backslash has been matched, which means that the input
          * ends right after it. Keep the backslash and let the
          * <dqstr,sqstr,qstru><<EOF>> rule report the unterminated string.
          */
         strbuf_append_char(&yyextra.literal_buf, '\\');
     }
 }

 <dqstr,sqstr>{esunicode} {
     /*
      * yytext is a backslash and "u" followed by 4 hexadecimal digits, or a
      * backslash and "U" followed by 8 hexadecimal digits. So, the conversion
      * needs neither endptr nor errno checks.
      */
     const pg_wchar cp = strtoul(yytext + 2, NULL, 16);

     if (cp == 0)
     {
         /*
          * U+0000 NUL is the smallest code point, but it is not allowed in
          * quoted strings as well as query strings.
          */
         update_location();
         ereport(ERROR,
                 (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
                  scan_errmsg("unsupported Unicode escape value"),
                  errdetail("Unicode code point value 0000 is not allowed in quoted strings."),
                  scan_errposition()));
     }
     else if (cp <= 0x7F)
     {
         // an ASCII character is stored as a single byte
         strbuf_append_char(&yyextra.literal_buf, (char)cp);
     }
     else if (cp > 0x10FFFF)
     {
         // beyond the largest Unicode code point
         update_location();
         ereport(ERROR,
                 (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
                  scan_errmsg("invalid Unicode escape value"),
                  errdetail("Unicode escape values cannot be greater than 10FFFF, which is the maximum value of a code point."),
                  scan_errposition()));
     }
     else
     {
         // non-ASCII code points are supported only with UTF8
         if (GetDatabaseEncoding() != PG_UTF8)
         {
             update_location();
             ereport(ERROR,
                     (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
                      scan_errmsg("unsupported Unicode escape value"),
                      errdetail("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8."),
                      scan_errposition()));
         }

         if (is_low_surrogate(cp))
         {
             // a low surrogate cannot appear without a preceding high one
             update_location();
             ereport(ERROR,
                     (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
                      scan_errmsg("invalid Unicode surrogate pair"),
                      errdetail("A low surrogate must follow a high surrogate."),
                      scan_errposition()));
         }
         else if (is_high_surrogate(cp))
         {
             // keep it and expect the low surrogate in qstru
             yyextra.high_surrogate = cp;
             yyextra.start_cond = YY_START;
             BEGIN(qstru);
         }
         else
         {
             strbuf_append_codepoint(&yyextra.literal_buf, cp);
         }
     }
 }

 <qstru>{esunicode} {
     // the escape that follows a high surrogate must be a low surrogate
     const pg_wchar low = strtoul(yytext + 2, NULL, 16);

     if (!is_low_surrogate(low))
     {
         update_location();
         ereport(ERROR,
                 (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
                  scan_errmsg("invalid Unicode surrogate pair"),
                  errdetail("A low surrogate must follow a high surrogate."),
                  scan_errposition()));
     }
     else
     {
         pg_wchar cp;

         // A surrogate pair always gives 0x010000 <= cp <= 0x10FFFF.
         cp = surrogate_pair_to_codepoint(yyextra.high_surrogate, low);
         strbuf_append_codepoint(&yyextra.literal_buf, cp);

         // go back to the string that the high surrogate was found in
         BEGIN(yyextra.start_cond);
     }
 }

 <dqstr,sqstr,qstru>{esunicodefail} {
     // a Unicode escape that has too few hexadecimal digits
     update_location();
     ereport(ERROR,
             (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
              scan_errmsg("invalid Unicode escape sequence"),
              errhint("Unicode escape sequences must be \\uXXXX or \\UXXXXXXXX."),
              scan_errposition()));
 }

 <qstru>{any} {
     // anything but a Unicode escape right after a high surrogate is an error
     update_location();
     ereport(ERROR, (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
                     scan_errmsg("invalid Unicode surrogate pair"),
                     errdetail("A low surrogate must follow a high surrogate."),
                     scan_errposition()));
 }

 <dqstr>{dquote} |
 <sqstr>{squote} {
     BEGIN(INITIAL);

     /*
      * In quoted strings, only Unicode escape sequences need to be verified,
      * and the actions for <dqstr,sqstr>{esunicode} and <qstru>{esunicode}
      * rules verify the code point values. So, quoted strings are always valid.
      */

     // The value points to literal_buf, which the next literal overwrites.
     token.type = AG_TOKEN_STRING;
     token.value.s = strbuf_get_str(&yyextra.literal_buf);
     // the location is the one recorded when the opening quote was matched
     token.location = get_location();
     return token;
 }

 <dqstr,sqstr,qstru><<EOF>> {
     // the input ended before the closing quote was found
     ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR),
                     scan_errmsg("unterminated quoted string"),
                     scan_errposition()));
 }

 {id} {
     // The value points to yytext as is.
     update_location();
     token.type = AG_TOKEN_IDENTIFIER;
     token.value.s = yytext;
     token.location = get_location();
     return token;
 }

 {bquote} {
     // remember where the quoted identifier starts and start a new literal
     update_location();
     strbuf_reset(&yyextra.literal_buf);
     BEGIN(bqid);
 }

 <bqid>{bqchars} {
     strbuf_append_buf(&yyextra.literal_buf, yytext, yyleng);
 }

 <bqid>{esbquote} {
     // two consecutive backquotes stand for one backquote
     strbuf_append_char(&yyextra.literal_buf, '`');
 }

 <bqid>{bquote} {
     BEGIN(INITIAL);

     // nothing between the opening and closing backquotes
     if (yyextra.literal_buf.length == 0)
     {
         ereport(ERROR, (errcode(ERRCODE_INVALID_NAME),
                         scan_errmsg("zero-length quoted identifier"),
                         scan_errposition()));
     }

     // the location is the one recorded when the opening backquote was matched
     token.type = AG_TOKEN_IDENTIFIER;
     token.value.s = strbuf_get_str(&yyextra.literal_buf);
     token.location = get_location();
     return token;
 }

 <bqid><<EOF>> {
     // the input ended before the closing backquote was found
     ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR),
                     scan_errmsg("unterminated quoted identifier"),
                     scan_errposition()));
 }

 {param} {
     update_location();
     token.type = AG_TOKEN_PARAMETER;
     // skip the leading "$" so that the value is the name only
     token.value.s = yytext + 1;
     token.location = get_location();
     return token;
 }

 {concat} {
     // Every operator rule records the location first and then fills the token.
     update_location();
     token.location = get_location();
     token.value.s = yytext;
     token.type = AG_TOKEN_CONCAT;
     return token;
 }

 {access_path} {
     update_location();
     token.location = get_location();
     token.value.s = yytext;
     token.type = AG_TOKEN_ACCESS_PATH;
     return token;
 }

 {any_exists} {
     update_location();
     token.location = get_location();
     token.value.s = yytext;
     token.type = AG_TOKEN_ANY_EXISTS;
     return token;
 }

 {all_exists} {
     update_location();
     token.location = get_location();
     token.value.s = yytext;
     token.type = AG_TOKEN_ALL_EXISTS;
     return token;
 }

 {lt_gt} {
     update_location();
     token.location = get_location();
     token.value.s = yytext;
     token.type = AG_TOKEN_LT_GT;
     return token;
 }

 {lt_eq} {
     update_location();
     token.location = get_location();
     token.value.s = yytext;
     token.type = AG_TOKEN_LT_EQ;
     return token;
 }

 {gt_eq} {
     update_location();
     token.location = get_location();
     token.value.s = yytext;
     token.type = AG_TOKEN_GT_EQ;
     return token;
 }

 {dot_dot} {
     update_location();
     token.location = get_location();
     token.value.s = yytext;
     token.type = AG_TOKEN_DOT_DOT;
     return token;
 }

 {plus_eq} {
     update_location();
     token.location = get_location();
     token.value.s = yytext;
     token.type = AG_TOKEN_PLUS_EQ;
     return token;
 }

 {eq_tilde} {
     update_location();
     token.location = get_location();
     token.value.s = yytext;
     token.type = AG_TOKEN_EQ_TILDE;
     return token;
 }

 {typecast} {
     update_location();
     token.location = get_location();
     token.value.s = yytext;
     token.type = AG_TOKEN_TYPECAST;
     return token;
 }

 {self} {
     // a single-character token carries the character itself
     update_location();
     token.location = get_location();
     token.value.c = yytext[0];
     token.type = AG_TOKEN_CHAR;
     return token;
 }

 {other} {
     // a character that no other rule accepts
     update_location();
     ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR),
                     scan_errmsg("unexpected character"),
                     scan_errposition()));
 }

 <<EOF>> {
     // the end of the input is reported with AG_TOKEN_NULL
     update_location();
     token.type = AG_TOKEN_NULL;
     token.value.c = '\0';
     token.location = get_location();
     return token;
 }

 %%

 /*
  * Override the default memory management to make flex use palloc() instead of
  * malloc().
  */

 /* Allocation hook for flex: get "size" bytes from palloc(). */
 void *ag_yyalloc(yy_size_t size, yyscan_t yyscanner)
 {
     void *chunk = palloc(size);

     return chunk;
 }

 /*
  * Reallocation hook for flex. It follows realloc(3): a NULL pointer means a
  * new allocation, and a non-NULL pointer with size 0 means a release.
  */
 void *ag_yyrealloc(void *ptr, yy_size_t size, yyscan_t yyscanner)
 {
     // nothing to resize; allocate a new chunk
     if (ptr == NULL)
         return palloc(size);

     // resizing to 0 releases the chunk
     if (size == 0)
     {
         pfree(ptr);
         return NULL;
     }

     return repalloc(ptr, size);
 }

 /* Release hook for flex: pfree() the chunk unless it is NULL. */
 void ag_yyfree(void *ptr, yyscan_t yyscanner)
 {
     if (ptr == NULL)
         return;

     pfree(ptr);
 }

 /* Set up "sb" as an empty buffer with "capacity" bytes of storage. */
 static void strbuf_init(strbuf *sb, int capacity)
 {
     char *storage = palloc(capacity);

     sb->buffer = storage;
     sb->capacity = capacity;
     sb->length = 0;
 }

 /* Release the storage of "sb" if it has one. */
 static void strbuf_cleanup(strbuf *sb)
 {
     if (sb->buffer == NULL)
         return;

     pfree(sb->buffer);
 }

 /* Append "len" bytes starting at "b" to "sb", growing it if needed. */
 static void strbuf_append_buf(strbuf *sb, const char *b, const int len)
 {
     char *dest;

     strbuf_ensure_capacity(sb, sb->length + len);

     // compute the destination after growing because the storage may move
     dest = sb->buffer + sb->length;
     memcpy(dest, b, len);
     sb->length += len;
 }

 /* Append the single byte "c" to "sb", growing it if needed. */
 static void strbuf_append_char(strbuf *sb, const char c)
 {
     strbuf_ensure_capacity(sb, sb->length + 1);
     sb->buffer[sb->length++] = c;
 }

 /* Append the UTF-8 encoding of the code point "c" to "sb". */
 static void strbuf_append_codepoint(strbuf *sb, const pg_wchar c)
 {
     unsigned char utf8[6];
     int nbytes;

     unicode_to_utf8(c, utf8);
     // the length of the sequence is taken from the encoded bytes
     nbytes = pg_utf_mblen(utf8);
     strbuf_append_buf(sb, (char *)utf8, nbytes);
 }

 /*
  * Make sure that "sb" can hold "len" bytes plus the terminating '\0'.
  * The capacity is doubled until it becomes greater than "len".
  *
  * len cannot be greater than MaxAllocSize because ReadCommand() reads
  * a message and places the message body in StringInfo.
  */
 static void strbuf_ensure_capacity(strbuf *sb, int len)
 {
     int new_capacity;

     // "len < capacity" leaves 1 byte for the last '\0' character
     if (sb->capacity > len)
         return;

     new_capacity = sb->capacity;
     do
     {
         new_capacity *= 2;
     } while (new_capacity <= len);

     sb->capacity = new_capacity;
     sb->buffer = repalloc(sb->buffer, new_capacity);
 }

 /*
  * Terminate the content of "sb" with '\0' and return it. The returned pointer
  * is the storage of "sb" itself, not a copy.
  */
 static const char *strbuf_get_str(strbuf *sb)
 {
     char *str = sb->buffer;

     str[sb->length] = '\0';
     return str;
 }

 /* Make "sb" empty. The storage and its capacity are kept for reuse. */
 static void strbuf_reset(strbuf *sb)
 {
     sb->length = 0;
 }

 /*
  * Convert the integer literal "s" (decimal, octal, or hexadecimal) to a token.
  *
  * If the value fits in "int", the token is AG_TOKEN_INTEGER. Otherwise, the
  * token is AG_TOKEN_DECIMAL and its value is a string of decimal digits. For
  * octal and hexadecimal literals, the string is built in extra->literal_buf.
  *
  * An error is raised if "s" is not entirely consumed by the conversion.
  */
 static void integer_literal_to_token(const char *s, ag_token *token,
                                      ag_yy_extra *extra)
 {
     char *rest;
     int value;

     errno = 0;
     value = strtoint(s, &rest, 0);

     /*
      * Because of digitseq and hexint rules, leftover characters are possible
      * only for invalid octal integer literals. (e.g. "08")
      */
     if (*rest != '\0')
     {
         ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR),
                         _scan_errmsg("invalid octal integer literal", extra),
                         _scan_errposition(extra->last_loc, extra)));
     }

     // the common case: the value fits in "int"
     if (errno != ERANGE)
     {
         token->type = AG_TOKEN_INTEGER;
         token->value.i = value;
         return;
     }

     /*
      * The value is too large to be an "int" value. Treat it as a decimal.
      *
      * Reading s[0] and s[1] is safe here. ERANGE means that s has at least
      * 10 characters; the shortest out-of-range literals are "2147483648",
      * "0x80000000", and "020000000000" for decimal, hexadecimal, and octal
      * integers respectively.
      */
     if (s[0] == '0')
     {
         strbuf *out = &extra->literal_buf;

         strbuf_reset(out);

         /*
          * strtoint() returns 0 without an error if every digit in s is zero,
          * no matter how long s is. So, s has at least one non-zero digit
          * here, which is what _numstr_to_decimal() assumes.
          */
         if (s[1] == 'X' || s[1] == 'x')
             hexadecimal_to_decimal(s + 2, out);
         else
             octal_to_decimal(s + 1, out);

         s = strbuf_get_str(out);
     }

     token->type = AG_TOKEN_DECIMAL;
     token->value.s = s;
 }

 /*
  * convert a string of a hexadecimal or an octal integer to a string of the
  * corresponding decimal integer
  *
  * numstr: digits only (no "0x" or "0" prefix is expected by the digit
  *         functions); it must have at least one non-zero digit
  * base:   16 or 8
  * sb:     the decimal digits are appended to it; it is not reset here
  *
  * The digits are packed into an array of binary words first. Then, the array
  * is divided by 10^9 repeatedly, and each remainder gives 9 decimal digits of
  * the result, from the least significant ones.
  */
 static void _numstr_to_decimal(const char *numstr, const int base, strbuf *sb)
 {
     // constants for each base
     int ndigits_per_word;
     int nbits_per_digit;
     uint32 (*digit_value)(const char);

     /*
      * constants for the conversion
      *
      *     "divisor" is 10^9.
      *
      *     At most 3 divisions are needed to eliminate 1 word.
      *         hex: 4294967295999999999 -> 4294967295 -> 4 -> 0
      *         oct: 1073741823999999999 -> 1073741823 -> 1 -> 0
      */
     const uint64 divisor = 1000000000;
     const int ndivisions = 3;

     int ndigits;
     int nwords;
     uint32 *words;
     const char *digitp;
     int word_i;
     int ndigits_word0;
     uint32 word;
     uint32 *remainders;
     int nremainders;
     int i;

     // set constants for each base
     switch (base)
     {
     case 16:
         /*
          * Hexadecimal
          *
          * Maximum value for each word
          *     0xFFFFFFFF = 4294967295
          * Divisor
          *     0x3B9ACA00 = 1000000000
          * Maximum remainder
          *     0x3B9AC9FF = 999999999
          *
          * Maximum dividend
          *     0x3B9AC9FFFFFFFFFF = 4294967295999999999
          * Quotient of the maximum dividend and the divisor
          *     0xFFFFFFFF = 4294967295
          * Remainder of the above division
          *     0x3B9AC9FF = 999999999
          */
         ndigits_per_word = 8;
         nbits_per_digit = 4;
         digit_value = hexdigit_value;
         break;
     case 8:
         /*
          * Octal
          *
          * Maximum value for each word
          *     07777777777 = 1073741823
          * Divisor
          *     07346545000 = 1000000000
          * Maximum remainder
          *     07346544777 = 999999999
          *
          * Maximum dividend
          *     073465447777777777777 = 1073741823999999999
          * Quotient of the maximum dividend and the divisor
          *     07777777777 = 1073741823
          * Remainder of the above division
          *     07346544777 = 999999999
          */
         ndigits_per_word = 10;
         nbits_per_digit = 3;
         digit_value = octdigit_value;
         break;
     default:
         Assert(!"invalid base");
         return;
     }

     // skip leading zeros
     while (*numstr == '0')
         numstr++;

     // number of digits in "numstr"
     ndigits = strlen(numstr);
     Assert(ndigits > 0);

     /*
      * prepare "words" to store "numstr" as an array of binary words,
      * the most significant word first (32 bits per word for hexadecimal and
      * 30 bits per word for octal)
      */
     nwords = (ndigits + (ndigits_per_word - 1)) / ndigits_per_word;
     words = palloc(sizeof(*words) * nwords);

     digitp = numstr;
     word_i = 0;

     // number of digits for the first word
     // (the first word takes the leftover so that the other words are full)
     ndigits_word0 = ndigits % ndigits_per_word;
     if (ndigits_word0 == 0)
         ndigits_word0 = ndigits_per_word;

     // fill the first word
     word = digit_value(*digitp++);
     for (i = 1; i < ndigits_word0; i++)
     {
         word <<= nbits_per_digit;
         word |= digit_value(*digitp++);
     }
     words[word_i++] = word;

     // fill the rest of "words"
     while (word_i < nwords)
     {
         word = digit_value(*digitp++);
         for (i = 1; i < ndigits_per_word; i++)
         {
             word <<= nbits_per_digit;
             word |= digit_value(*digitp++);
         }
         words[word_i++] = word;
     }

     // At most "ndivisions" divisions are needed to eliminate 1 word.
     remainders = palloc(sizeof(*remainders) * (ndivisions * nwords));

     nremainders = 0;
     word_i = 0;
     // repeat dividing "words" by "divisor" until the quotient becomes 0
     while (word_i < nwords)
     {
         uint64 r;

         r = 0;
         // divide "words" by "divisor"
         // (long division; "r" carries the remainder to the next word)
         for (i = word_i; i < nwords; i++)
         {
             uint64 d;
             uint64 q;

             // dividend = carried remainder in the upper bits + current word
             d = (uint64)words[i];
             d |= r << (nbits_per_digit * ndigits_per_word);

             q = d / divisor;
             r = d % divisor;

             // the quotient replaces the word in place
             words[i] = (uint32)q;
         }

         // collect the remainder to build the result
         remainders[nremainders++] = (uint32)r;

         /*
          * Divisions over the first effective word is done
          * and "words" is getting closer to 0.
          */
         if (words[word_i] == 0)
             word_i++;
     }

     // convert the collected remainders to a string, starting from the last one
     // (the last remainder has the most significant digits)
     for (i = nremainders - 1; i >= 0; i--)
     {
         char buf[NDIGITS_PER_REMAINDER];
         int buf_i;
         uint32 tmp;

         // "buf" is filled backward, from the least significant digit
         buf_i = NDIGITS_PER_REMAINDER;

         for (tmp = remainders[i]; tmp > 0; tmp /= 10)
             buf[--buf_i] = '0' + (char)(tmp % 10);

         // leading zeros for intermediate digits
         // (every remainder but the most significant one takes all 9 digits)
         if (i < nremainders - 1)
         {
             while (buf_i > 0)
                 buf[--buf_i] = '0';
         }

         strbuf_append_buf(sb, &buf[buf_i], NDIGITS_PER_REMAINDER - buf_i);
     }

     pfree(remainders);
     pfree(words);
 }

 /*
  * Return the value (0-15) of the hexadecimal digit "c".
  * "c" must be a hexadecimal digit; this is checked with Assert() only.
  */
 static uint32 hexdigit_value(const char c)
 {
     if (c >= 'A' && c <= 'F')
         return 0xA + (c - 'A');

     if (c >= '0' && c <= '9')
         return c - '0';

     // the only remaining possibility for valid input is a lowercase letter
     Assert(c >= 'a' && c <= 'f');
     return 0xA + (c - 'a');
 }

 /*
  * Return the value (0-7) of the octal digit "c".
  * "c" must be an octal digit; this is checked with Assert() only.
  */
 static uint32 octdigit_value(const char c)
 {
     const uint32 value = c - '0';

     Assert(c >= '0' && c <= '7');
     return value;
 }

 /* Is "c" in the range of high surrogates, U+D800 to U+DBFF? */
 static bool is_high_surrogate(const pg_wchar c)
 {
     return !(c < 0xD800 || c > 0xDBFF);
 }

 /* Is "c" in the range of low surrogates, U+DC00 to U+DFFF? */
 static bool is_low_surrogate(const pg_wchar c)
 {
     return !(c < 0xDC00 || c > 0xDFFF);
 }

 /*
  * Build the primary error message from "msg" and the text at the location
  * recorded in "extra". If the location is at the end of the buffer, the
  * message says so instead of quoting the text.
  */
 static int _scan_errmsg(const char *msg, const ag_yy_extra *extra)
 {
     const char *where = extra->scan_buf + extra->last_loc;

     if (*where != YY_END_OF_BUFFER_CHAR)
         return errmsg("%s at or near \"%s\"", msg, where);

     return errmsg("%s at end of input", msg);
 }

 /*
  * Report the error cursor position for "location", which is a byte offset
  * from the beginning of the scan buffer. The position passed to
  * errposition() is 1-based and counted in characters.
  */
 static int _scan_errposition(const int location, const ag_yy_extra *extra)
 {
     if (location >= 0)
     {
         // number of characters that precede the location
         int nchars = pg_mbstrlen_with_len(extra->scan_buf, location);

         return errposition(nchars + 1);
     }

     // a negative location means that it is unknown; do nothing
     return 0;
 }

 ag_scanner_t ag_scanner_create(const char *s)
 {
     Size len;
     char *buf;
     yyscan_t yyscanner;
     ag_yy_extra extra;
     int ret;

     // The last two YY_END_OF_BUFFER_CHAR are required by flex.
     len = strlen(s);
     buf = palloc(len + 2);
     memcpy(buf, s, len);
     buf[len] = YY_END_OF_BUFFER_CHAR;
     buf[len + 1] = YY_END_OF_BUFFER_CHAR;

     ret = ag_yylex_init(&yyscanner);
     if (ret)
         elog(ERROR, "ag_yylex_init() failed: %m");

     strbuf_init(&extra.literal_buf, 1024);
     extra.high_surrogate = 0;
     extra.start_cond = INITIAL;
     extra.scan_buf = buf;
     extra.last_loc = 0;
     ag_yyset_extra(extra, yyscanner);

     ag_yy_scan_buffer(buf, len + 2, yyscanner);

     return yyscanner;
 }

 /*
  * Destroy the scanner created by ag_scanner_create() and release everything
  * that it owns: the literal buffer, the scanner itself, and the scan buffer.
  *
  * Token values that point to the literal buffer or the scan buffer become
  * invalid after this call.
  */
 void ag_scanner_destroy(ag_scanner_t scanner)
 {
     ag_yy_extra extra;

     // get a copy of the state before the scanner goes away
     extra = ag_yyget_extra(scanner);
     strbuf_cleanup(&extra.literal_buf);

     ag_yylex_destroy(scanner);

     /*
      * The scan buffer was allocated by ag_scanner_create() and handed to flex
      * through ag_yy_scan_buffer(). flex does not own such a buffer, so
      * ag_yylex_destroy() does not release it; do it here.
      */
     if (extra.scan_buf)
         pfree((void *)extra.scan_buf);
 }

 /*
  * Build the primary error message from "msg" and the text at the location
  * that "scanner" recorded most recently. See _scan_errmsg().
  */
 int ag_scanner_errmsg(const char *msg, ag_scanner_t *scanner)
 {
     // the state is stored by value, so work on a copy of it
     ag_yy_extra extra = ag_yyget_extra(scanner);

     return _scan_errmsg(msg, &extra);
 }

 /*
  * Report the error cursor position for "location", a byte offset in the
  * buffer that "scanner" scans. See _scan_errposition().
  */
 int ag_scanner_errposition(const int location, ag_scanner_t *scanner)
 {
     // the state is stored by value, so work on a copy of it
     ag_yy_extra extra = ag_yyget_extra(scanner);

     return _scan_errposition(location, &extra);
 }