| /* |
| * Lexer for source files, ToNumber() string conversions, RegExp expressions, |
| * and JSON. |
| * |
| * Provides a stream of Ecmascript tokens from a UTF-8/CESU-8 buffer. The |
| * caller can also rewind the token stream into a certain position which is |
| * needed by the compiler part for multi-pass scanning. Tokens are |
| * represented as duk_token structures, and contain line number information. |
| * Token types are identified with DUK_TOK_* defines. |
| * |
| * Characters are decoded into a fixed size lookup window consisting of |
| * decoded Unicode code points, with window positions past the end of the |
| * input filled with an invalid codepoint (-1). The tokenizer can thus |
| * perform multiple character lookups efficiently and with few sanity |
| * checks (such as access outside the end of the input), which keeps the |
| * tokenization code small at the cost of performance. |
| * |
| * Character data in tokens, such as identifier names and string literals, |
| * is encoded into CESU-8 format on-the-fly while parsing the token in |
| * question. The string data is made reachable to garbage collection by |
| * placing the token-related values in value stack entries allocated for |
| * this purpose by the caller. The characters exist in Unicode code point |
| * form only in the fixed size lookup window, which keeps character data |
| * expansion (of especially ASCII data) low. |
| * |
| * Token parsing supports the full range of Unicode characters as described |
| * in the E5 specification. Parsing has been optimized for ASCII characters |
| * because ordinary Ecmascript code consists almost entirely of ASCII |
| * characters. Matching of complex Unicode codepoint sets (such as in the |
| * IdentifierStart and IdentifierPart productions) is optimized for size, |
| * and is done using a linear scan of a bit-packed list of ranges. This is |
| * very slow, but should never be entered unless the source code actually |
| * contains Unicode characters. |
| * |
| * Ecmascript tokenization is partially context sensitive. First, |
| * additional future reserved words are recognized in strict mode (see E5 |
| * Section 7.6.1.2). Second, a forward slash character ('/') can be |
| * recognized either as starting a RegExp literal or as a division operator, |
| * depending on context. The caller must provide necessary context flags |
| * when requesting a new token. |
| * |
| * Future work: |
| * |
| * * Make line number tracking optional, as it consumes space. |
| * |
| * * Add a feature flag for disabling UTF-8 decoding of input, as most |
| * source code is ASCII. Because of Unicode escapes written in ASCII, |
| * this does not allow Unicode support to be removed from e.g. |
| * duk_unicode_is_identifier_start() nor does it allow removal of CESU-8 |
| * encoding of e.g. string literals. |
| * |
| * * Add a feature flag for disabling Unicode compliance of e.g. identifier |
| * names. This allows for a build more than a kilobyte smaller, because |
| * Unicode ranges needed by duk_unicode_is_identifier_start() and |
| * duk_unicode_is_identifier_part() can be dropped. String literals |
| * should still be allowed to contain escaped Unicode, so this still does |
| * not allow removal of CESU-8 encoding of e.g. string literals. |
| * |
| * * Character lookup tables for codepoints above BMP could be stripped. |
| * |
| * * Strictly speaking, E5 specification requires that source code consists |
| * of 16-bit code units, and if not, must be conceptually converted to |
| * that format first. The current lexer processes Unicode code points |
| * and allows characters outside the BMP. These should be converted to |
| * surrogate pairs while reading the source characters into the window, |
| * not after tokens have been formed (as is done now). However, the fix |
| * is not trivial because two characters are decoded from one codepoint. |
| * |
| * * Optimize for speed as well as size. Large if-else ladders are (at |
| * least potentially) slow. |
| */ |
| |
| #include "duk_internal.h" |
| |
| /* |
| * Various defines and file specific helper macros |
| */ |
| |
| #define DUK__MAX_RE_DECESC_DIGITS 9 |
| #define DUK__MAX_RE_QUANT_DIGITS 9 /* Does not allow e.g. 2**31-1, but one more would allow overflows of u32. */ |
| |
| /* Character classification helpers; whether to use macros or a helper |
| * function depends on call count. The macro forms evaluate 'x' twice, |
| * so the argument must not have side effects. DUK__ISHEXDIGIT() is |
| * routed to the duk__is_hex_digit() helper function instead. |
| */ |
| #define DUK__ISDIGIT(x) ((x) >= DUK_ASC_0 && (x) <= DUK_ASC_9) |
| #define DUK__ISHEXDIGIT(x) duk__is_hex_digit((x)) |
| #define DUK__ISOCTDIGIT(x) ((x) >= DUK_ASC_0 && (x) <= DUK_ASC_7) |
| #define DUK__ISDIGIT03(x) ((x) >= DUK_ASC_0 && (x) <= DUK_ASC_3) |
| #define DUK__ISDIGIT47(x) ((x) >= DUK_ASC_4 && (x) <= DUK_ASC_7) |
| |
| /* Lexer character window helpers. DUK__LOOKUP() reads the codepoint at |
| * a window index. DUK__ADVANCECHARS() takes a count in window entries and |
| * scales it by sizeof(duk_lexer_codepoint) before calling |
| * duk__advance_bytes(); DUK__ADVANCEBYTES() passes a byte count through |
| * unchanged. |
| */ |
| #define DUK__LOOKUP(lex_ctx,index) ((lex_ctx)->window[(index)].codepoint) |
| #define DUK__ADVANCECHARS(lex_ctx,count) duk__advance_bytes((lex_ctx), (count) * sizeof(duk_lexer_codepoint)) |
| #define DUK__ADVANCEBYTES(lex_ctx,count) duk__advance_bytes((lex_ctx), (count)) |
| #define DUK__INITBUFFER(lex_ctx) duk__initbuffer((lex_ctx)) |
| #define DUK__APPENDBUFFER(lex_ctx,x) duk__appendbuffer((lex_ctx), (duk_codepoint_t) (x)) |
| |
| /* lookup shorthands for window positions 0-5 (note: assume context variable is named 'lex_ctx') */ |
| #define DUK__L0() DUK__LOOKUP(lex_ctx, 0) |
| #define DUK__L1() DUK__LOOKUP(lex_ctx, 1) |
| #define DUK__L2() DUK__LOOKUP(lex_ctx, 2) |
| #define DUK__L3() DUK__LOOKUP(lex_ctx, 3) |
| #define DUK__L4() DUK__LOOKUP(lex_ctx, 4) |
| #define DUK__L5() DUK__LOOKUP(lex_ctx, 5) |
| |
| /* Packed advance/token number macro used by multiple functions. Despite |
| * its name, 'advbytes' is given in window entries: it is multiplied by |
| * sizeof(duk_lexer_codepoint) here, and the resulting byte count is |
| * shifted left by 8 bits and added to the token number 'tok'. |
| */ |
| #define DUK__ADVTOK(advbytes,tok) ((((advbytes) * sizeof(duk_lexer_codepoint)) << 8) + (tok)) |
| |
| /* |
| * Advance lookup window by N characters, filling in new characters as |
| * necessary. After returning caller is guaranteed a character window of |
| * at least DUK_LEXER_WINDOW_SIZE characters. |
| * |
| * The main function duk__advance_bytes() is called at least once per every |
| * token so it has a major lexer/compiler performance impact. There are two |
| * variants for the main duk__advance_bytes() algorithm: a sliding window |
| * approach which is slightly faster at the cost of larger code footprint, |
| * and a simple copying one. |
| * |
| * Decoding directly from the source string would be another lexing option. |
| * But the lookup window based approach has the advantage of hiding the |
| * source string and its encoding effectively which gives more flexibility |
| * going forward to e.g. support chunked streaming of source from flash. |
| * |
| * Decodes UTF-8/CESU-8 leniently with support for code points from U+0000 to |
| * U+10FFFF, causing an error if the input is unparseable. Leniency means: |
| * |
| * * Unicode code point validation is intentionally not performed, |
| * except to check that the codepoint does not exceed 0x10ffff. |
| * |
| * * In particular, surrogate pairs are allowed and not combined, which |
| * allows source files to represent all SourceCharacters with CESU-8. |
| * Broken surrogate pairs are allowed, as Ecmascript does not mandate |
| * their validation. |
| * |
| * * Allow non-shortest UTF-8 encodings. |
| * |
| * Leniency here causes few security concerns because all character data is |
| * decoded into Unicode codepoints before lexer processing, and is then |
| * re-encoded into CESU-8. The source can be parsed as strict UTF-8 with |
| * a compiler option. However, Ecmascript source characters include -all- |
| * 16-bit unsigned integer codepoints, so leniency seems to be appropriate. |
| * |
| * Note that codepoints above the BMP are not strictly SourceCharacters, |
| * but the lexer still accepts them as such. Before ending up in a string |
| * or an identifier name, codepoints above BMP are converted into surrogate |
| * pairs and then CESU-8 encoded, resulting in 16-bit Unicode data as |
| * expected by Ecmascript. |
| * |
| * An alternative approach to dealing with invalid or partial sequences |
| * would be to skip them and replace them with e.g. the Unicode replacement |
| * character U+FFFD. This has limited utility because a replacement character |
| * will most likely cause a parse error, unless it occurs inside a string. |
| * Further, Ecmascript source is typically pure ASCII. |
| * |
| * See: |
| * |
| * http://en.wikipedia.org/wiki/UTF-8 |
| * http://en.wikipedia.org/wiki/CESU-8 |
| * http://tools.ietf.org/html/rfc3629 |
| * http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences |
| * |
| * Future work: |
| * |
| * * Reject other invalid Unicode sequences (see Wikipedia entry for examples) |
| * in strict UTF-8 mode. |
| * |
| * * Size optimize. An attempt to use a 16-byte lookup table for the first |
| * byte resulted in a code increase though. |
| * |
| * * Is checking against maximum 0x10ffff really useful? 4-byte encoding |
| * imposes a certain limit anyway. |
| * |
| * * Support chunked streaming of source code. Can be implemented either |
| * by streaming chunks of bytes or chunks of codepoints. |
| */ |
| |
| #if defined(DUK_USE_LEXER_SLIDING_WINDOW) |
| DUK_LOCAL void duk__fill_lexer_buffer(duk_lexer_ctx *lex_ctx, duk_small_uint_t start_offset_bytes) { |
| duk_lexer_codepoint *cp, *cp_end; |
| duk_ucodepoint_t x; |
| duk_small_uint_t contlen; |
| const duk_uint8_t *p, *p_end; |
| #if defined(DUK_USE_STRICT_UTF8_SOURCE) |
| duk_ucodepoint_t mincp; |
| #endif |
| duk_int_t input_line; |
| |
| /* Use temporaries and update lex_ctx only when finished. */ |
| input_line = lex_ctx->input_line; |
| p = lex_ctx->input + lex_ctx->input_offset; |
| p_end = lex_ctx->input + lex_ctx->input_length; |
| |
| cp = (duk_lexer_codepoint *) (void *) ((duk_uint8_t *) lex_ctx->buffer + start_offset_bytes); |
| cp_end = lex_ctx->buffer + DUK_LEXER_BUFFER_SIZE; |
| |
| for (; cp != cp_end; cp++) { |
| cp->offset = (duk_size_t) (p - lex_ctx->input); |
| cp->line = input_line; |
| |
| /* XXX: potential issue with signed pointers, p_end < p. */ |
| if (DUK_UNLIKELY(p >= p_end)) { |
| /* If input_offset were assigned a negative value, it would |
| * result in a large positive value. Most likely it would be |
| * larger than input_length and be caught here. In any case |
| * no memory unsafe behavior would happen. |
| */ |
| cp->codepoint = -1; |
| continue; |
| } |
| |
| x = (duk_ucodepoint_t) (*p++); |
| |
| /* Fast path. */ |
| |
| if (DUK_LIKELY(x < 0x80UL)) { |
| DUK_ASSERT(x != 0x2028UL && x != 0x2029UL); /* not LS/PS */ |
| if (DUK_UNLIKELY(x <= 0x000dUL)) { |
| if ((x == 0x000aUL) || |
| ((x == 0x000dUL) && (p >= p_end || *p != 0x000aUL))) { |
| /* lookup for 0x000a above assumes shortest encoding now */ |
| |
| /* E5 Section 7.3, treat the following as newlines: |
| * LF |
| * CR [not followed by LF] |
| * LS |
| * PS |
| * |
| * For CR LF, CR is ignored if it is followed by LF, and the LF will bump |
| * the line number. |
| */ |
| input_line++; |
| } |
| } |
| |
| cp->codepoint = (duk_codepoint_t) x; |
| continue; |
| } |
| |
| /* Slow path. */ |
| |
| if (x < 0xc0UL) { |
| /* 10xx xxxx -> invalid */ |
| goto error_encoding; |
| } else if (x < 0xe0UL) { |
| /* 110x xxxx 10xx xxxx */ |
| contlen = 1; |
| #if defined(DUK_USE_STRICT_UTF8_SOURCE) |
| mincp = 0x80UL; |
| #endif |
| x = x & 0x1fUL; |
| } else if (x < 0xf0UL) { |
| /* 1110 xxxx 10xx xxxx 10xx xxxx */ |
| contlen = 2; |
| #if defined(DUK_USE_STRICT_UTF8_SOURCE) |
| mincp = 0x800UL; |
| #endif |
| x = x & 0x0fUL; |
| } else if (x < 0xf8UL) { |
| /* 1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx */ |
| contlen = 3; |
| #if defined(DUK_USE_STRICT_UTF8_SOURCE) |
| mincp = 0x10000UL; |
| #endif |
| x = x & 0x07UL; |
| } else { |
| /* no point in supporting encodings of 5 or more bytes */ |
| goto error_encoding; |
| } |
| |
| DUK_ASSERT(p_end >= p); |
| if ((duk_size_t) contlen > (duk_size_t) (p_end - p)) { |
| goto error_clipped; |
| } |
| |
| while (contlen > 0) { |
| duk_small_uint_t y; |
| y = *p++; |
| if ((y & 0xc0U) != 0x80U) { |
| /* check that byte has the form 10xx xxxx */ |
| goto error_encoding; |
| } |
| x = x << 6; |
| x += y & 0x3fUL; |
| contlen--; |
| } |
| |
| /* check final character validity */ |
| |
| if (x > 0x10ffffUL) { |
| goto error_encoding; |
| } |
| #if defined(DUK_USE_STRICT_UTF8_SOURCE) |
| if (x < mincp || (x >= 0xd800UL && x <= 0xdfffUL) || x == 0xfffeUL) { |
| goto error_encoding; |
| } |
| #endif |
| |
| DUK_ASSERT(x != 0x000aUL && x != 0x000dUL); |
| if ((x == 0x2028UL) || (x == 0x2029UL)) { |
| input_line++; |
| } |
| |
| cp->codepoint = (duk_codepoint_t) x; |
| } |
| |
| lex_ctx->input_offset = (duk_size_t) (p - lex_ctx->input); |
| lex_ctx->input_line = input_line; |
| return; |
| |
| error_clipped: /* clipped codepoint */ |
| error_encoding: /* invalid codepoint encoding or codepoint */ |
| lex_ctx->input_offset = (duk_size_t) (p - lex_ctx->input); |
| lex_ctx->input_line = input_line; |
| |
| DUK_ERROR_SYNTAX(lex_ctx->thr, "utf-8 decode failed"); |
| } |
| |
| DUK_LOCAL void duk__advance_bytes(duk_lexer_ctx *lex_ctx, duk_small_uint_t count_bytes) { |
| duk_small_uint_t used_bytes, avail_bytes; |
| |
| DUK_ASSERT_DISABLE(count_bytes >= 0); /* unsigned */ |
| DUK_ASSERT(count_bytes <= (duk_small_uint_t) (DUK_LEXER_WINDOW_SIZE * sizeof(duk_lexer_codepoint))); |
| DUK_ASSERT(lex_ctx->window >= lex_ctx->buffer); |
| DUK_ASSERT(lex_ctx->window < lex_ctx->buffer + DUK_LEXER_BUFFER_SIZE); |
| DUK_ASSERT((duk_uint8_t *) lex_ctx->window + count_bytes <= (duk_uint8_t *) lex_ctx->buffer + DUK_LEXER_BUFFER_SIZE * sizeof(duk_lexer_codepoint)); |
| |
| /* Zero 'count' is also allowed to make call sites easier. |
| * Arithmetic in bytes generates better code in GCC. |
| */ |
| |
| lex_ctx->window = (duk_lexer_codepoint *) (void *) ((duk_uint8_t *) lex_ctx->window + count_bytes); /* avoid multiply */ |
| used_bytes = (duk_small_uint_t) ((duk_uint8_t *) lex_ctx->window - (duk_uint8_t *) lex_ctx->buffer); |
| avail_bytes = DUK_LEXER_BUFFER_SIZE * sizeof(duk_lexer_codepoint) - used_bytes; |
| if (avail_bytes < (duk_small_uint_t) (DUK_LEXER_WINDOW_SIZE * sizeof(duk_lexer_codepoint))) { |
| /* Not enough data to provide a full window, so "scroll" window to |
| * start of buffer and fill up the rest. |
| */ |
| DUK_MEMMOVE((void *) lex_ctx->buffer, |
| (const void *) lex_ctx->window, |
| (size_t) avail_bytes); |
| lex_ctx->window = lex_ctx->buffer; |
| duk__fill_lexer_buffer(lex_ctx, avail_bytes); |
| } |
| } |
| |
| DUK_LOCAL void duk__init_lexer_window(duk_lexer_ctx *lex_ctx) { |
| lex_ctx->window = lex_ctx->buffer; |
| duk__fill_lexer_buffer(lex_ctx, 0); |
| } |
| #else /* DUK_USE_LEXER_SLIDING_WINDOW */ |
| DUK_LOCAL duk_codepoint_t duk__read_char(duk_lexer_ctx *lex_ctx) { |
| duk_ucodepoint_t x; |
| duk_small_uint_t len; |
| duk_small_uint_t i; |
| const duk_uint8_t *p; |
| #if defined(DUK_USE_STRICT_UTF8_SOURCE) |
| duk_ucodepoint_t mincp; |
| #endif |
| duk_size_t input_offset; |
| |
| input_offset = lex_ctx->input_offset; |
| if (DUK_UNLIKELY(input_offset >= lex_ctx->input_length)) { |
| /* If input_offset were assigned a negative value, it would |
| * result in a large positive value. Most likely it would be |
| * larger than input_length and be caught here. In any case |
| * no memory unsafe behavior would happen. |
| */ |
| return -1; |
| } |
| |
| p = lex_ctx->input + input_offset; |
| x = (duk_ucodepoint_t) (*p); |
| |
| if (DUK_LIKELY(x < 0x80UL)) { |
| /* 0xxx xxxx -> fast path */ |
| |
| /* input offset tracking */ |
| lex_ctx->input_offset++; |
| |
| DUK_ASSERT(x != 0x2028UL && x != 0x2029UL); /* not LS/PS */ |
| if (DUK_UNLIKELY(x <= 0x000dUL)) { |
| if ((x == 0x000aUL) || |
| ((x == 0x000dUL) && (lex_ctx->input_offset >= lex_ctx->input_length || |
| lex_ctx->input[lex_ctx->input_offset] != 0x000aUL))) { |
| /* lookup for 0x000a above assumes shortest encoding now */ |
| |
| /* E5 Section 7.3, treat the following as newlines: |
| * LF |
| * CR [not followed by LF] |
| * LS |
| * PS |
| * |
| * For CR LF, CR is ignored if it is followed by LF, and the LF will bump |
| * the line number. |
| */ |
| lex_ctx->input_line++; |
| } |
| } |
| |
| return (duk_codepoint_t) x; |
| } |
| |
| /* Slow path. */ |
| |
| if (x < 0xc0UL) { |
| /* 10xx xxxx -> invalid */ |
| goto error_encoding; |
| } else if (x < 0xe0UL) { |
| /* 110x xxxx 10xx xxxx */ |
| len = 2; |
| #if defined(DUK_USE_STRICT_UTF8_SOURCE) |
| mincp = 0x80UL; |
| #endif |
| x = x & 0x1fUL; |
| } else if (x < 0xf0UL) { |
| /* 1110 xxxx 10xx xxxx 10xx xxxx */ |
| len = 3; |
| #if defined(DUK_USE_STRICT_UTF8_SOURCE) |
| mincp = 0x800UL; |
| #endif |
| x = x & 0x0fUL; |
| } else if (x < 0xf8UL) { |
| /* 1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx */ |
| len = 4; |
| #if defined(DUK_USE_STRICT_UTF8_SOURCE) |
| mincp = 0x10000UL; |
| #endif |
| x = x & 0x07UL; |
| } else { |
| /* no point in supporting encodings of 5 or more bytes */ |
| goto error_encoding; |
| } |
| |
| DUK_ASSERT(lex_ctx->input_length >= lex_ctx->input_offset); |
| if ((duk_size_t) len > (duk_size_t) (lex_ctx->input_length - lex_ctx->input_offset)) { |
| goto error_clipped; |
| } |
| |
| p++; |
| for (i = 1; i < len; i++) { |
| duk_small_uint_t y; |
| y = *p++; |
| if ((y & 0xc0U) != 0x80U) { |
| /* check that byte has the form 10xx xxxx */ |
| goto error_encoding; |
| } |
| x = x << 6; |
| x += y & 0x3fUL; |
| } |
| |
| /* check final character validity */ |
| |
| if (x > 0x10ffffUL) { |
| goto error_encoding; |
| } |
| #if defined(DUK_USE_STRICT_UTF8_SOURCE) |
| if (x < mincp || (x >= 0xd800UL && x <= 0xdfffUL) || x == 0xfffeUL) { |
| goto error_encoding; |
| } |
| #endif |
| |
| /* input offset tracking */ |
| lex_ctx->input_offset += len; |
| |
| /* line tracking */ |
| DUK_ASSERT(x != 0x000aUL && x != 0x000dUL); |
| if ((x == 0x2028UL) || (x == 0x2029UL)) { |
| lex_ctx->input_line++; |
| } |
| |
| return (duk_codepoint_t) x; |
| |
| error_clipped: /* clipped codepoint */ |
| error_encoding: /* invalid codepoint encoding or codepoint */ |
| DUK_ERROR_SYNTAX(lex_ctx->thr, "utf-8 decode failed"); |
| return 0; |
| } |
| |
| DUK_LOCAL void duk__advance_bytes(duk_lexer_ctx *lex_ctx, duk_small_uint_t count_bytes) { |
| duk_small_uint_t keep_bytes; |
| duk_lexer_codepoint *cp, *cp_end; |
| |
| DUK_ASSERT_DISABLE(count_bytes >= 0); /* unsigned */ |
| DUK_ASSERT(count_bytes <= (duk_small_uint_t) (DUK_LEXER_WINDOW_SIZE * sizeof(duk_lexer_codepoint))); |
| |
| /* Zero 'count' is also allowed to make call sites easier. */ |
| |
| keep_bytes = DUK_LEXER_WINDOW_SIZE * sizeof(duk_lexer_codepoint) - count_bytes; |
| DUK_MEMMOVE((void *) lex_ctx->window, |
| (const void *) ((duk_uint8_t *) lex_ctx->window + count_bytes), |
| (size_t) keep_bytes); |
| |
| cp = (duk_lexer_codepoint *) ((duk_uint8_t *) lex_ctx->window + keep_bytes); |
| cp_end = lex_ctx->window + DUK_LEXER_WINDOW_SIZE; |
| for (; cp != cp_end; cp++) { |
| cp->offset = lex_ctx->input_offset; |
| cp->line = lex_ctx->input_line; |
| cp->codepoint = duk__read_char(lex_ctx); |
| } |
| } |
| |
| DUK_LOCAL void duk__init_lexer_window(duk_lexer_ctx *lex_ctx) { |
| /* Call with count == DUK_LEXER_WINDOW_SIZE to fill buffer initially. */ |
| duk__advance_bytes(lex_ctx, DUK_LEXER_WINDOW_SIZE * sizeof(duk_lexer_codepoint)); /* fill window */ |
| } |
| #endif /* DUK_USE_LEXER_SLIDING_WINDOW */ |
| |
| /* |
| * (Re)initialize the temporary byte buffer. May be called extra times |
| * with little impact. |
| */ |
| |
| DUK_LOCAL void duk__initbuffer(duk_lexer_ctx *lex_ctx) { |
| /* Reuse buffer as is unless buffer has grown large. */ |
| if (DUK_HBUFFER_DYNAMIC_GET_SIZE(lex_ctx->buf) < DUK_LEXER_TEMP_BUF_LIMIT) { |
| /* Keep current size */ |
| } else { |
| duk_hbuffer_resize(lex_ctx->thr, lex_ctx->buf, DUK_LEXER_TEMP_BUF_LIMIT); |
| } |
| |
| DUK_BW_INIT_WITHBUF(lex_ctx->thr, &lex_ctx->bw, lex_ctx->buf); |
| } |
| |
| /* |
| * Append a Unicode codepoint to the temporary byte buffer. Performs |
| * CESU-8 surrogate pair encoding for codepoints above the BMP. |
| * Existing surrogate pairs are allowed and also encoded into CESU-8. |
| */ |
| |
| DUK_LOCAL void duk__appendbuffer(duk_lexer_ctx *lex_ctx, duk_codepoint_t x) { |
| /* |
| * Since character data is only generated by decoding the source or by |
| * the compiler itself, we rely on the input codepoints being correct |
| * and avoid a check here. |
| * |
| * Character data can also come here through decoding of Unicode |
| * escapes ("\udead\ubeef") so all 16-but unsigned values can be |
| * present, even when the source file itself is strict UTF-8. |
| */ |
| |
| DUK_ASSERT(x >= 0 && x <= 0x10ffff); |
| |
| DUK_BW_WRITE_ENSURE_CESU8(lex_ctx->thr, &lex_ctx->bw, (duk_ucodepoint_t) x); |
| } |
| |
| /* |
| * Intern the temporary byte buffer into a valstack slot |
| * (in practice, slot1 or slot2). |
| */ |
| |
| DUK_LOCAL void duk__internbuffer(duk_lexer_ctx *lex_ctx, duk_idx_t valstack_idx) { |
| duk_context *ctx = (duk_context *) lex_ctx->thr; |
| |
| DUK_ASSERT(valstack_idx == lex_ctx->slot1_idx || valstack_idx == lex_ctx->slot2_idx); |
| |
| DUK_BW_PUSH_AS_STRING(lex_ctx->thr, &lex_ctx->bw); |
| duk_replace(ctx, valstack_idx); |
| } |
| |
| /* |
| * Init lexer context |
| */ |
| |
| DUK_INTERNAL void duk_lexer_initctx(duk_lexer_ctx *lex_ctx) { |
| DUK_ASSERT(lex_ctx != NULL); |
| |
| DUK_MEMZERO(lex_ctx, sizeof(*lex_ctx)); |
| #if defined(DUK_USE_EXPLICIT_NULL_INIT) |
| #if defined(DUK_USE_LEXER_SLIDING_WINDOW) |
| lex_ctx->window = NULL; |
| #endif |
| lex_ctx->thr = NULL; |
| lex_ctx->input = NULL; |
| lex_ctx->buf = NULL; |
| #endif |
| } |
| |
| /* |
| * Set lexer input position and reinitialize lookup window. |
| */ |
| |
| /* NB: duk_lexer_getpoint() is a macro only */ |
| |
| DUK_INTERNAL void duk_lexer_setpoint(duk_lexer_ctx *lex_ctx, duk_lexer_point *pt) { |
| DUK_ASSERT_DISABLE(pt->offset >= 0); /* unsigned */ |
| DUK_ASSERT(pt->line >= 1); |
| lex_ctx->input_offset = pt->offset; |
| lex_ctx->input_line = pt->line; |
| duk__init_lexer_window(lex_ctx); |
| } |
| |
| /* |
| * Lexing helpers |
| */ |
| |
| /* numeric value of a hex digit (also covers octal and decimal digits) */ |
| DUK_LOCAL duk_codepoint_t duk__hexval(duk_lexer_ctx *lex_ctx, duk_codepoint_t x) { |
| duk_small_int_t t; |
| |
| /* Here 'x' is a Unicode codepoint */ |
| if (DUK_LIKELY(x >= 0 && x <= 0xff)) { |
| t = duk_hex_dectab[x]; |
| if (DUK_LIKELY(t >= 0)) { |
| return t; |
| } |
| } |
| |
| /* Throwing an error this deep makes the error rather vague, but |
| * saves hundreds of bytes of code. |
| */ |
| DUK_ERROR_SYNTAX(lex_ctx->thr, "decode error"); |
| return 0; |
| } |
| |
| /* having this as a separate function provided a size benefit */ |
| DUK_LOCAL duk_bool_t duk__is_hex_digit(duk_codepoint_t x) { |
| if (DUK_LIKELY(x >= 0 && x <= 0xff)) { |
| return (duk_hex_dectab[x] >= 0); |
| } |
| return 0; |
| } |
| |
| DUK_LOCAL duk_codepoint_t duk__decode_hexesc_from_window(duk_lexer_ctx *lex_ctx, duk_small_int_t lookup_offset) { |
| /* validation performed by duk__hexval */ |
| return (duk__hexval(lex_ctx, lex_ctx->window[lookup_offset].codepoint) << 4) | |
| (duk__hexval(lex_ctx, lex_ctx->window[lookup_offset + 1].codepoint)); |
| } |
| |
| DUK_LOCAL duk_codepoint_t duk__decode_uniesc_from_window(duk_lexer_ctx *lex_ctx, duk_small_int_t lookup_offset) { |
| /* validation performed by duk__hexval */ |
| return (duk__hexval(lex_ctx, lex_ctx->window[lookup_offset].codepoint) << 12) | |
| (duk__hexval(lex_ctx, lex_ctx->window[lookup_offset + 1].codepoint) << 8) | |
| (duk__hexval(lex_ctx, lex_ctx->window[lookup_offset + 2].codepoint) << 4) | |
| (duk__hexval(lex_ctx, lex_ctx->window[lookup_offset + 3].codepoint)); |
| } |
| |
| /* |
| * Parse Ecmascript source InputElementDiv or InputElementRegExp |
| * (E5 Section 7), skipping whitespace, comments, and line terminators. |
| * |
| * Possible results are: |
| * (1) a token |
| * (2) a line terminator (skipped) |
| * (3) a comment (skipped) |
| * (4) EOF |
| * |
| * White space is automatically skipped from the current position (but |
| * not after the input element). If input has already ended, returns |
| * DUK_TOK_EOF indefinitely. If a parse error occurs, uses a DUK_ERROR() |
| * macro call (and hence a longjmp through current heap longjmp context). |
| * Comments and line terminator tokens are automatically skipped. |
| * |
| * The input element being matched is determined by regexp_mode; if set, |
| * parses an InputElementRegExp, otherwise an InputElementDiv. The |
| * difference between these is the handling of productions starting with a |
| * forward slash. |
| * |
| * If strict_mode is set, recognizes additional future reserved words |
| * specific to strict mode, and refuses to parse octal literals. |
| * |
| * The matching strategy below is to (currently) use a six character |
| * lookup window to quickly determine which production is the -longest- |
| * matching one, and then parse that. The top-level if-else clauses |
| * match the first character, and the code blocks for each clause |
| * handle -all- alternatives for that first character. Ecmascript |
| * specification uses the "longest match wins" semantics, so the order |
| * of the if-clauses matters. |
| * |
| * Misc notes: |
| * |
| * * Ecmascript numeric literals do not accept a sign character. |
| * Consequently e.g. "-1.0" is parsed as two tokens: a negative |
| * sign and a positive numeric literal. The compiler performs |
| * the negation during compilation, so this has no adverse impact. |
| * |
| * * There is no token for "undefined": it is just a value available |
| * from the global object (or simply established by doing a reference |
| * to an undefined value). |
| * |
| * * Some contexts want Identifier tokens, which are IdentifierNames |
| * excluding reserved words, while some contexts want IdentifierNames |
| * directly. In the latter case e.g. "while" is interpreted as an |
| * identifier name, not a DUK_TOK_WHILE token. The solution here is |
| * to provide both token types: DUK_TOK_WHILE goes to 't' while |
| * DUK_TOK_IDENTIFIER goes to 't_nores', and 'slot1' always contains |
| * the identifier / keyword name. |
| * |
| * * Directive prologue needs to identify string literals such as |
| * "use strict" and 'use strict', which are sensitive to line |
| * continuations and escape sequences. For instance, "use\u0020strict" |
| * is a valid directive but is distinct from "use strict". The solution |
| * here is to decode escapes while tokenizing, but to keep track of the |
| * number of escapes. Directive detection can then check that the |
| * number of escapes is zero. |
| * |
| * * Multi-line comments with one or more internal LineTerminator are |
| * treated like a line terminator to comply with automatic semicolon |
| * insertion. |
| */ |
| |
| DUK_INTERNAL |
| void duk_lexer_parse_js_input_element(duk_lexer_ctx *lex_ctx, |
| duk_token *out_token, |
| duk_bool_t strict_mode, |
| duk_bool_t regexp_mode) { |
| duk_codepoint_t x; /* temporary, must be signed and 32-bit to hold Unicode code points */ |
| duk_small_uint_t advtok = 0; /* (advance << 8) + token_type, updated at function end, |
| * init is unnecessary but suppresses "may be used uninitialized" warnings. |
| */ |
| duk_bool_t got_lineterm = 0; /* got lineterm preceding non-whitespace, non-lineterm token */ |
| |
| if (++lex_ctx->token_count >= lex_ctx->token_limit) { |
| DUK_ERROR_RANGE(lex_ctx->thr, "token limit"); |
| return; /* unreachable */ |
| } |
| |
| out_token->t = DUK_TOK_EOF; |
| out_token->t_nores = -1; /* marker: copy t if not changed */ |
| #if 0 /* not necessary to init, disabled for faster parsing */ |
| out_token->num = DUK_DOUBLE_NAN; |
| out_token->str1 = NULL; |
| out_token->str2 = NULL; |
| #endif |
| out_token->num_escapes = 0; |
| /* out_token->lineterm set by caller */ |
| |
| /* This would be nice, but parsing is faster without resetting the |
| * value slots. The only side effect is that references to temporary |
| * string values may linger until lexing is finished; they're then |
| * freed normally. |
| */ |
| #if 0 |
| duk_to_undefined((duk_context *) lex_ctx->thr, lex_ctx->slot1_idx); |
| duk_to_undefined((duk_context *) lex_ctx->thr, lex_ctx->slot2_idx); |
| #endif |
| |
| /* 'advtok' indicates how much to advance and which token id to assign |
| * at the end. This shared functionality minimizes code size. All |
| * code paths are required to set 'advtok' to some value, so no default |
| * init value is used. Code paths calling DUK_ERROR() never return so |
| * they don't need to set advtok. |
| */ |
| |
| /* |
| * Matching order: |
| * |
| * Punctuator first chars, also covers comments, regexps |
| * LineTerminator |
| * Identifier or reserved word, also covers null/true/false literals |
| * NumericLiteral |
| * StringLiteral |
| * EOF |
| * |
| * The order does not matter as long as the longest match is |
| * always correctly identified. There are order dependencies |
| * in the clauses, so it's not trivial to convert to a switch. |
| */ |
| |
| restart_lineupdate: |
| out_token->start_line = lex_ctx->window[0].line; |
| |
| restart: |
| out_token->start_offset = lex_ctx->window[0].offset; |
| |
| x = DUK__L0(); |
| |
| switch (x) { |
| case DUK_ASC_SPACE: |
| case DUK_ASC_HT: /* fast paths for space and tab */ |
| DUK__ADVANCECHARS(lex_ctx, 1); |
| goto restart; |
| case DUK_ASC_LF: /* LF line terminator; CR LF and Unicode lineterms are handled in slow path */ |
| DUK__ADVANCECHARS(lex_ctx, 1); |
| got_lineterm = 1; |
| goto restart_lineupdate; |
| case DUK_ASC_SLASH: /* '/' */ |
| if (DUK__L1() == '/') { |
| /* |
| * E5 Section 7.4, allow SourceCharacter (which is any 16-bit |
| * code point). |
| */ |
| |
| /* DUK__ADVANCECHARS(lex_ctx, 2) would be correct here, but it is unnecessary */ |
| for (;;) { |
| x = DUK__L0(); |
| if (x < 0 || duk_unicode_is_line_terminator(x)) { |
| break; |
| } |
| DUK__ADVANCECHARS(lex_ctx, 1); |
| } |
| goto restart; /* line terminator will be handled on next round */ |
| } else if (DUK__L1() == '*') { |
| /* |
| * E5 Section 7.4. If the multi-line comment contains a newline, |
| * it is treated like a single line terminator for automatic |
| * semicolon insertion. |
| */ |
| |
| duk_bool_t last_asterisk = 0; |
| DUK__ADVANCECHARS(lex_ctx, 2); |
| for (;;) { |
| x = DUK__L0(); |
| if (x < 0) { |
| DUK_ERROR_SYNTAX(lex_ctx->thr, "eof in multiline comment"); |
| } |
| DUK__ADVANCECHARS(lex_ctx, 1); |
| if (last_asterisk && x == '/') { |
| break; |
| } |
| if (duk_unicode_is_line_terminator(x)) { |
| got_lineterm = 1; |
| } |
| last_asterisk = (x == '*'); |
| } |
| goto restart_lineupdate; |
| } else if (regexp_mode) { |
| #if defined(DUK_USE_REGEXP_SUPPORT) |
| /* |
| * "/" followed by something in regexp mode. See E5 Section 7.8.5. |
| * |
| * RegExp parsing is a bit complex. First, the regexp body is delimited |
| * by forward slashes, but the body may also contain forward slashes as |
| * part of an escape sequence or inside a character class (delimited by |
| * square brackets). A mini state machine is used to implement these. |
| * |
| * Further, an early (parse time) error must be thrown if the regexp |
| * would cause a run-time error when used in the expression new RegExp(...). |
| * Parsing here simply extracts the (candidate) regexp, and also accepts |
| * invalid regular expressions (which are delimited properly). The caller |
| * (compiler) must perform final validation and regexp compilation. |
| * |
| * RegExp first char may not be '/' (single line comment) or '*' (multi- |
| * line comment). These have already been checked above, so there is no |
| * need below for special handling of the first regexp character as in |
| * the E5 productions. |
| * |
| * About unicode escapes within regexp literals: |
| * |
| * E5 Section 7.8.5 grammar does NOT accept \uHHHH escapes. |
| * However, Section 6 states that regexps accept the escapes, |
| * see paragraph starting with "In string literals...". |
| * The regexp grammar, which sees the decoded regexp literal |
| * (after lexical parsing) DOES have a \uHHHH unicode escape. |
| * So, for instance: |
| * |
| * /\u1234/ |
| * |
| * should first be parsed by the lexical grammar as: |
| * |
| * '\' 'u' RegularExpressionBackslashSequence |
| * '1' RegularExpressionNonTerminator |
| * '2' RegularExpressionNonTerminator |
| * '3' RegularExpressionNonTerminator |
| * '4' RegularExpressionNonTerminator |
| * |
| * and the escape itself is then parsed by the regexp engine. |
| * This is the current implementation. |
| * |
| * Minor spec inconsistency: |
| * |
| * E5 Section 7.8.5 RegularExpressionBackslashSequence is: |
| * |
| * \ RegularExpressionNonTerminator |
| * |
| * while Section A.1 RegularExpressionBackslashSequence is: |
| * |
| * \ NonTerminator |
| * |
| * The latter is not normative and a typo. |
| * |
| */ |
| |
| /* first, parse regexp body roughly */ |
| |
| duk_small_int_t state = 0; /* 0=base, 1=esc, 2=class, 3=class+esc */ |
| |
| DUK__INITBUFFER(lex_ctx); |
| for (;;) { |
| DUK__ADVANCECHARS(lex_ctx, 1); /* skip opening slash on first loop */ |
| x = DUK__L0(); |
| if (x < 0 || duk_unicode_is_line_terminator(x)) { |
| DUK_ERROR_SYNTAX(lex_ctx->thr, "eof or line terminator in regexp"); |
| } |
| x = DUK__L0(); /* re-read to avoid spill / fetch */ |
| if (state == 0) { |
| if (x == '/') { |
| DUK__ADVANCECHARS(lex_ctx, 1); /* eat closing slash */ |
| break; |
| } else if (x == '\\') { |
| state = 1; |
| } else if (x == '[') { |
| state = 2; |
| } |
| } else if (state == 1) { |
| state = 0; |
| } else if (state == 2) { |
| if (x == ']') { |
| state = 0; |
| } else if (x == '\\') { |
| state = 3; |
| } |
| } else { /* state == 3 */ |
| state = 2; |
| } |
| DUK__APPENDBUFFER(lex_ctx, x); |
| } |
| duk__internbuffer(lex_ctx, lex_ctx->slot1_idx); |
| out_token->str1 = duk_get_hstring((duk_context *) lex_ctx->thr, lex_ctx->slot1_idx); |
| |
| /* second, parse flags */ |
| |
| DUK__INITBUFFER(lex_ctx); |
| for (;;) { |
| x = DUK__L0(); |
| if (!duk_unicode_is_identifier_part(x)) { |
| break; |
| } |
| x = DUK__L0(); /* re-read to avoid spill / fetch */ |
| DUK__APPENDBUFFER(lex_ctx, x); |
| DUK__ADVANCECHARS(lex_ctx, 1); |
| } |
| duk__internbuffer(lex_ctx, lex_ctx->slot2_idx); |
| out_token->str2 = duk_get_hstring((duk_context *) lex_ctx->thr, lex_ctx->slot2_idx); |
| |
| DUK__INITBUFFER(lex_ctx); /* free some memory */ |
| |
| /* validation of the regexp is caller's responsibility */ |
| |
| advtok = DUK__ADVTOK(0, DUK_TOK_REGEXP); |
| #else |
| DUK_ERROR_SYNTAX(lex_ctx->thr, "regexp support disabled"); |
| #endif |
| } else if (DUK__L1() == '=') { |
| /* "/=" and not in regexp mode */ |
| advtok = DUK__ADVTOK(2, DUK_TOK_DIV_EQ); |
| } else { |
| /* "/" and not in regexp mode */ |
| advtok = DUK__ADVTOK(1, DUK_TOK_DIV); |
| } |
| break; |
| case DUK_ASC_LCURLY: /* '{' */ |
| advtok = DUK__ADVTOK(1, DUK_TOK_LCURLY); |
| break; |
| case DUK_ASC_RCURLY: /* '}' */ |
| advtok = DUK__ADVTOK(1, DUK_TOK_RCURLY); |
| break; |
| case DUK_ASC_LPAREN: /* '(' */ |
| advtok = DUK__ADVTOK(1, DUK_TOK_LPAREN); |
| break; |
| case DUK_ASC_RPAREN: /* ')' */ |
| advtok = DUK__ADVTOK(1, DUK_TOK_RPAREN); |
| break; |
| case DUK_ASC_LBRACKET: /* '[' */ |
| advtok = DUK__ADVTOK(1, DUK_TOK_LBRACKET); |
| break; |
| case DUK_ASC_RBRACKET: /* ']' */ |
| advtok = DUK__ADVTOK(1, DUK_TOK_RBRACKET); |
| break; |
| case DUK_ASC_PERIOD: /* '.' */ |
| if (DUK__ISDIGIT(DUK__L1())) { |
| /* Period followed by a digit can only start DecimalLiteral |
| * (handled in slow path). We could jump straight into the |
| * DecimalLiteral handling but should avoid goto to inside |
| * a block. |
| */ |
| goto slow_path; |
| } |
| advtok = DUK__ADVTOK(1, DUK_TOK_PERIOD); |
| break; |
| case DUK_ASC_SEMICOLON: /* ';' */ |
| advtok = DUK__ADVTOK(1, DUK_TOK_SEMICOLON); |
| break; |
| case DUK_ASC_COMMA: /* ',' */ |
| advtok = DUK__ADVTOK(1, DUK_TOK_COMMA); |
| break; |
| case DUK_ASC_LANGLE: /* '<' */ |
| if (DUK__L1() == '<' && DUK__L2() == '=') { |
| advtok = DUK__ADVTOK(3, DUK_TOK_ALSHIFT_EQ); |
| } else if (DUK__L1() == '=') { |
| advtok = DUK__ADVTOK(2, DUK_TOK_LE); |
| } else if (DUK__L1() == '<') { |
| advtok = DUK__ADVTOK(2, DUK_TOK_ALSHIFT); |
| } else { |
| advtok = DUK__ADVTOK(1, DUK_TOK_LT); |
| } |
| break; |
| case DUK_ASC_RANGLE: /* '>' */ |
| if (DUK__L1() == '>' && DUK__L2() == '>' && DUK__L3() == '=') { |
| advtok = DUK__ADVTOK(4, DUK_TOK_RSHIFT_EQ); |
| } else if (DUK__L1() == '>' && DUK__L2() == '>') { |
| advtok = DUK__ADVTOK(3, DUK_TOK_RSHIFT); |
| } else if (DUK__L1() == '>' && DUK__L2() == '=') { |
| advtok = DUK__ADVTOK(3, DUK_TOK_ARSHIFT_EQ); |
| } else if (DUK__L1() == '=') { |
| advtok = DUK__ADVTOK(2, DUK_TOK_GE); |
| } else if (DUK__L1() == '>') { |
| advtok = DUK__ADVTOK(2, DUK_TOK_ARSHIFT); |
| } else { |
| advtok = DUK__ADVTOK(1, DUK_TOK_GT); |
| } |
| break; |
| case DUK_ASC_EQUALS: /* '=' */ |
| if (DUK__L1() == '=' && DUK__L2() == '=') { |
| advtok = DUK__ADVTOK(3, DUK_TOK_SEQ); |
| } else if (DUK__L1() == '=') { |
| advtok = DUK__ADVTOK(2, DUK_TOK_EQ); |
| } else { |
| advtok = DUK__ADVTOK(1, DUK_TOK_EQUALSIGN); |
| } |
| break; |
| case DUK_ASC_EXCLAMATION: /* '!' */ |
| if (DUK__L1() == '=' && DUK__L2() == '=') { |
| advtok = DUK__ADVTOK(3, DUK_TOK_SNEQ); |
| } else if (DUK__L1() == '=') { |
| advtok = DUK__ADVTOK(2, DUK_TOK_NEQ); |
| } else { |
| advtok = DUK__ADVTOK(1, DUK_TOK_LNOT); |
| } |
| break; |
| case DUK_ASC_PLUS: /* '+' */ |
| if (DUK__L1() == '+') { |
| advtok = DUK__ADVTOK(2, DUK_TOK_INCREMENT); |
| } else if (DUK__L1() == '=') { |
| advtok = DUK__ADVTOK(2, DUK_TOK_ADD_EQ); |
| } else { |
| advtok = DUK__ADVTOK(1, DUK_TOK_ADD); |
| } |
| break; |
| case DUK_ASC_MINUS: /* '-' */ |
| if (DUK__L1() == '-') { |
| advtok = DUK__ADVTOK(2, DUK_TOK_DECREMENT); |
| } else if (DUK__L1() == '=') { |
| advtok = DUK__ADVTOK(2, DUK_TOK_SUB_EQ); |
| } else { |
| advtok = DUK__ADVTOK(1, DUK_TOK_SUB); |
| } |
| break; |
| case DUK_ASC_STAR: /* '*' */ |
| if (DUK__L1() == '=') { |
| advtok = DUK__ADVTOK(2, DUK_TOK_MUL_EQ); |
| } else { |
| advtok = DUK__ADVTOK(1, DUK_TOK_MUL); |
| } |
| break; |
| case DUK_ASC_PERCENT: /* '%' */ |
| if (DUK__L1() == '=') { |
| advtok = DUK__ADVTOK(2, DUK_TOK_MOD_EQ); |
| } else { |
| advtok = DUK__ADVTOK(1, DUK_TOK_MOD); |
| } |
| break; |
| case DUK_ASC_AMP: /* '&' */ |
| if (DUK__L1() == '&') { |
| advtok = DUK__ADVTOK(2, DUK_TOK_LAND); |
| } else if (DUK__L1() == '=') { |
| advtok = DUK__ADVTOK(2, DUK_TOK_BAND_EQ); |
| } else { |
| advtok = DUK__ADVTOK(1, DUK_TOK_BAND); |
| } |
| break; |
| case DUK_ASC_PIPE: /* '|' */ |
| if (DUK__L1() == '|') { |
| advtok = DUK__ADVTOK(2, DUK_TOK_LOR); |
| } else if (DUK__L1() == '=') { |
| advtok = DUK__ADVTOK(2, DUK_TOK_BOR_EQ); |
| } else { |
| advtok = DUK__ADVTOK(1, DUK_TOK_BOR); |
| } |
| break; |
| case DUK_ASC_CARET: /* '^' */ |
| if (DUK__L1() == '=') { |
| advtok = DUK__ADVTOK(2, DUK_TOK_BXOR_EQ); |
| } else { |
| advtok = DUK__ADVTOK(1, DUK_TOK_BXOR); |
| } |
| break; |
| case DUK_ASC_TILDE: /* '~' */ |
| advtok = DUK__ADVTOK(1, DUK_TOK_BNOT); |
| break; |
| case DUK_ASC_QUESTION: /* '?' */ |
| advtok = DUK__ADVTOK(1, DUK_TOK_QUESTION); |
| break; |
| case DUK_ASC_COLON: /* ':' */ |
| advtok = DUK__ADVTOK(1, DUK_TOK_COLON); |
| break; |
| case DUK_ASC_DOUBLEQUOTE: /* '"' */ |
| case DUK_ASC_SINGLEQUOTE: { /* '\'' */ |
| duk_small_int_t quote = x; /* Note: duk_uint8_t type yields larger code */ |
| duk_small_int_t adv; |
| |
| DUK__INITBUFFER(lex_ctx); |
| for (;;) { |
| DUK__ADVANCECHARS(lex_ctx, 1); /* eat opening quote on first loop */ |
| x = DUK__L0(); |
| if (x < 0 || duk_unicode_is_line_terminator(x)) { |
| DUK_ERROR_SYNTAX(lex_ctx->thr, "eof or line terminator in string literal"); |
| } |
| if (x == quote) { |
| DUK__ADVANCECHARS(lex_ctx, 1); /* eat closing quote */ |
| break; |
| } |
| if (x == '\\') { |
| /* DUK__L0 -> '\' char |
| * DUK__L1 ... DUK__L5 -> more lookup |
| */ |
| |
| x = DUK__L1(); |
| |
| /* How much to advance before next loop; note that next loop |
| * will advance by 1 anyway, so -1 from the total escape |
| * length (e.g. len('\uXXXX') - 1 = 6 - 1). As a default, |
| * 1 is good. |
| */ |
| adv = 2 - 1; /* note: long live range */ |
| |
| if (x < 0) { |
| DUK_ERROR_SYNTAX(lex_ctx->thr, "eof or line terminator in string literal"); |
| } |
| if (duk_unicode_is_line_terminator(x)) { |
| /* line continuation */ |
| if (x == 0x000d && DUK__L2() == 0x000a) { |
| /* CR LF again a special case */ |
| adv = 3 - 1; |
| } |
| } else if (x == '\'') { |
| DUK__APPENDBUFFER(lex_ctx, 0x0027); |
| } else if (x == '"') { |
| DUK__APPENDBUFFER(lex_ctx, 0x0022); |
| } else if (x == '\\') { |
| DUK__APPENDBUFFER(lex_ctx, 0x005c); |
| } else if (x == 'b') { |
| DUK__APPENDBUFFER(lex_ctx, 0x0008); |
| } else if (x == 'f') { |
| DUK__APPENDBUFFER(lex_ctx, 0x000c); |
| } else if (x == 'n') { |
| DUK__APPENDBUFFER(lex_ctx, 0x000a); |
| } else if (x == 'r') { |
| DUK__APPENDBUFFER(lex_ctx, 0x000d); |
| } else if (x == 't') { |
| DUK__APPENDBUFFER(lex_ctx, 0x0009); |
| } else if (x == 'v') { |
| DUK__APPENDBUFFER(lex_ctx, 0x000b); |
| } else if (x == 'x') { |
| adv = 4 - 1; |
| DUK__APPENDBUFFER(lex_ctx, duk__decode_hexesc_from_window(lex_ctx, 2)); |
| } else if (x == 'u') { |
| adv = 6 - 1; |
| DUK__APPENDBUFFER(lex_ctx, duk__decode_uniesc_from_window(lex_ctx, 2)); |
| } else if (DUK__ISDIGIT(x)) { |
| duk_codepoint_t ch = 0; /* initialized to avoid warnings of unused var */ |
| |
| /* |
| * Octal escape or zero escape: |
| * \0 (lookahead not DecimalDigit) |
| * \1 ... \7 (lookahead not DecimalDigit) |
| * \ZeroToThree OctalDigit (lookahead not DecimalDigit) |
| * \FourToSeven OctalDigit (no lookahead restrictions) |
| * \ZeroToThree OctalDigit OctalDigit (no lookahead restrictions) |
| * |
| * Zero escape is part of the standard syntax. Octal escapes are |
| * defined in E5 Section B.1.2, and are only allowed in non-strict mode. |
| * Any other productions starting with a decimal digit are invalid. |
| */ |
| |
| if (x == '0' && !DUK__ISDIGIT(DUK__L2())) { |
| /* Zero escape (also allowed in non-strict mode) */ |
| ch = 0; |
| /* adv = 2 - 1 default OK */ |
| #if defined(DUK_USE_OCTAL_SUPPORT) |
| } else if (strict_mode) { |
| /* No other escape beginning with a digit in strict mode */ |
| DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid escape in string literal"); |
| } else if (DUK__ISDIGIT03(x) && DUK__ISOCTDIGIT(DUK__L2()) && DUK__ISOCTDIGIT(DUK__L3())) { |
| /* Three digit octal escape, digits validated. */ |
| adv = 4 - 1; |
| ch = (duk__hexval(lex_ctx, x) << 6) + |
| (duk__hexval(lex_ctx, DUK__L2()) << 3) + |
| duk__hexval(lex_ctx, DUK__L3()); |
| } else if (((DUK__ISDIGIT03(x) && !DUK__ISDIGIT(DUK__L3())) || DUK__ISDIGIT47(x)) && |
| DUK__ISOCTDIGIT(DUK__L2())) { |
| /* Two digit octal escape, digits validated. |
| * |
| * The if-condition is a bit tricky. We could catch e.g. |
| * '\039' in the three-digit escape and fail it there (by |
| * validating the digits), but we want to avoid extra |
| * additional validation code. |
| */ |
| adv = 3 - 1; |
| ch = (duk__hexval(lex_ctx, x) << 3) + |
| duk__hexval(lex_ctx, DUK__L2()); |
| } else if (DUK__ISDIGIT(x) && !DUK__ISDIGIT(DUK__L2())) { |
| /* One digit octal escape, digit validated. */ |
| /* adv = 2 default OK */ |
| ch = duk__hexval(lex_ctx, x); |
| #else |
| /* fall through to error */ |
| #endif |
| } else { |
| DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid escape in string literal"); |
| } |
| |
| DUK__APPENDBUFFER(lex_ctx, ch); |
| } else { |
| /* escaped NonEscapeCharacter */ |
| DUK__APPENDBUFFER(lex_ctx, x); |
| } |
| DUK__ADVANCECHARS(lex_ctx, adv); |
| |
| /* Track number of escapes; count not really needed but directive |
| * prologues need to detect whether there were any escapes or line |
| * continuations or not. |
| */ |
| out_token->num_escapes++; |
| } else { |
| /* part of string */ |
| DUK__APPENDBUFFER(lex_ctx, x); |
| } |
| } |
| |
| duk__internbuffer(lex_ctx, lex_ctx->slot1_idx); |
| out_token->str1 = duk_get_hstring((duk_context *) lex_ctx->thr, lex_ctx->slot1_idx); |
| |
| DUK__INITBUFFER(lex_ctx); /* free some memory */ |
| |
| advtok = DUK__ADVTOK(0, DUK_TOK_STRING); |
| break; |
| } |
| default: |
| goto slow_path; |
| } /* switch */ |
| |
| goto skip_slow_path; |
| |
| slow_path: |
| if (duk_unicode_is_line_terminator(x)) { |
| if (x == 0x000d && DUK__L1() == 0x000a) { |
| /* |
| * E5 Section 7.3: CR LF is detected as a single line terminator for |
| * line numbers. Here we also detect it as a single line terminator |
| * token. |
| */ |
| DUK__ADVANCECHARS(lex_ctx, 2); |
| } else { |
| DUK__ADVANCECHARS(lex_ctx, 1); |
| } |
| got_lineterm = 1; |
| goto restart_lineupdate; |
| } else if (duk_unicode_is_identifier_start(x) || x == '\\') { |
| /* |
| * Parse an identifier and then check whether it is: |
| * - reserved word (keyword or other reserved word) |
| * - "null" (NullLiteral) |
| * - "true" (BooleanLiteral) |
| * - "false" (BooleanLiteral) |
| * - anything else => identifier |
| * |
| * This does not follow the E5 productions cleanly, but is |
| * useful and compact. |
| * |
| * Note that identifiers may contain Unicode escapes, |
| * see E5 Sections 6 and 7.6. They must be decoded first, |
| * and the result checked against allowed characters. |
| * The above if-clause accepts an identifier start and an |
| * '\' character -- no other token can begin with a '\'. |
| * |
| * Note that "get" and "set" are not reserved words in E5 |
| * specification so they are recognized as plain identifiers |
| * (the tokens DUK_TOK_GET and DUK_TOK_SET are actually not |
| * used now). The compiler needs to work around this. |
| * |
| * Strictly speaking, following Ecmascript longest match |
| * specification, an invalid escape for the first character |
| * should cause a syntax error. However, an invalid escape |
| * for IdentifierParts should just terminate the identifier |
| * early (longest match), and let the next tokenization |
| * fail. For instance Rhino croaks with 'foo\z' when |
| * parsing the identifier. This has little practical impact. |
| */ |
| |
| duk_small_int_t i, i_end; |
| duk_bool_t first = 1; |
| duk_hstring *str; |
| |
| DUK__INITBUFFER(lex_ctx); |
| for (;;) { |
| /* re-lookup first char on first loop */ |
| if (DUK__L0() == '\\') { |
| duk_codepoint_t ch; |
| if (DUK__L1() != 'u') { |
| DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid unicode escape in identifier"); |
| } |
| |
| ch = duk__decode_uniesc_from_window(lex_ctx, 2); |
| |
| /* IdentifierStart is stricter than IdentifierPart, so if the first |
| * character is escaped, must have a stricter check here. |
| */ |
| if (!(first ? duk_unicode_is_identifier_start(ch) : duk_unicode_is_identifier_part(ch))) { |
| DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid unicode escape in identifier"); |
| } |
| DUK__APPENDBUFFER(lex_ctx, ch); |
| DUK__ADVANCECHARS(lex_ctx, 6); |
| |
| /* Track number of escapes: necessary for proper keyword |
| * detection. |
| */ |
| out_token->num_escapes++; |
| } else { |
| /* Note: first character is checked against this. But because |
| * IdentifierPart includes all IdentifierStart characters, and |
| * the first character (if unescaped) has already been checked |
| * in the if condition, this is OK. |
| */ |
| if (!duk_unicode_is_identifier_part(DUK__L0())) { |
| break; |
| } |
| DUK__APPENDBUFFER(lex_ctx, DUK__L0()); |
| DUK__ADVANCECHARS(lex_ctx, 1); |
| } |
| first = 0; |
| } |
| |
| duk__internbuffer(lex_ctx, lex_ctx->slot1_idx); |
| out_token->str1 = duk_get_hstring((duk_context *) lex_ctx->thr, lex_ctx->slot1_idx); |
| str = out_token->str1; |
| DUK_ASSERT(str != NULL); |
| out_token->t_nores = DUK_TOK_IDENTIFIER; |
| |
| DUK__INITBUFFER(lex_ctx); /* free some memory */ |
| |
| /* |
| * Interned identifier is compared against reserved words, which are |
| * currently interned into the heap context. See genbuiltins.py. |
| * |
| * Note that an escape in the identifier disables recognition of |
| * keywords; e.g. "\u0069f = 1;" is a valid statement (assigns to |
| * identifier named "if"). This is not necessarily compliant, |
| * see test-dec-escaped-char-in-keyword.js. |
| * |
| * Note: "get" and "set" are awkward. They are not officially |
| * ReservedWords (and indeed e.g. "var set = 1;" is valid), and |
| * must come out as DUK_TOK_IDENTIFIER. The compiler needs to |
| * work around this a bit. |
| */ |
| |
| /* XXX: optimize by adding the token numbers directly into the |
| * always interned duk_hstring objects (there should be enough |
| * flag bits free for that)? |
| */ |
| |
| i_end = (strict_mode ? DUK_STRIDX_END_RESERVED : DUK_STRIDX_START_STRICT_RESERVED); |
| |
| advtok = DUK__ADVTOK(0, DUK_TOK_IDENTIFIER); |
| if (out_token->num_escapes == 0) { |
| for (i = DUK_STRIDX_START_RESERVED; i < i_end; i++) { |
| DUK_ASSERT(i >= 0 && i < DUK_HEAP_NUM_STRINGS); |
| if (DUK_HTHREAD_GET_STRING(lex_ctx->thr, i) == str) { |
| advtok = DUK__ADVTOK(0, DUK_STRIDX_TO_TOK(i)); |
| break; |
| } |
| } |
| } |
| } else if (DUK__ISDIGIT(x) || (x == '.')) { |
| /* Note: decimal number may start with a period, but must be followed by a digit */ |
| |
| /* |
| * DecimalLiteral, HexIntegerLiteral, OctalIntegerLiteral |
| * "pre-parsing", followed by an actual, accurate parser step. |
| * |
| * Note: the leading sign character ('+' or '-') is -not- part of |
| * the production in E5 grammar, and a DecimalLiteral starting |
| * with a '0' must be followed by a non-digit. Leading |
| * zeroes are syntax errors and must be checked for. |
| * |
| * XXX: the two step parsing process is quite awkward, it would |
| * be more straightforward to allow numconv to parse the longest |
| * valid prefix (it already does that, it only needs to indicate |
| * where the input ended). However, the lexer decodes characters |
| * using a lookup window, so this is not a trivial change. |
| */ |
| |
| /* XXX: because of the final check below (that the literal is not |
| * followed by a digit), this could maybe be simplified, if we bail |
| * out early from a leading zero (and if there are no periods etc). |
| * Maybe too complex. |
| */ |
| |
| duk_double_t val; |
| duk_bool_t int_only = 0; |
| duk_bool_t allow_hex = 0; |
| duk_small_int_t state; /* 0=before period/exp, |
| * 1=after period, before exp |
| * 2=after exp, allow '+' or '-' |
| * 3=after exp and exp sign |
| */ |
| duk_small_uint_t s2n_flags; |
| duk_codepoint_t y; |
| |
| DUK__INITBUFFER(lex_ctx); |
| y = DUK__L1(); |
| if (x == '0' && (y == 'x' || y == 'X')) { |
| DUK__APPENDBUFFER(lex_ctx, x); |
| DUK__APPENDBUFFER(lex_ctx, y); |
| DUK__ADVANCECHARS(lex_ctx, 2); |
| int_only = 1; |
| allow_hex = 1; |
| #if defined(DUK_USE_OCTAL_SUPPORT) |
| } else if (!strict_mode && x == '0' && DUK__ISDIGIT(y)) { |
| /* Note: if DecimalLiteral starts with a '0', it can only be |
| * followed by a period or an exponent indicator which starts |
| * with 'e' or 'E'. Hence the if-check above ensures that |
| * OctalIntegerLiteral is the only valid NumericLiteral |
| * alternative at this point (even if y is, say, '9'). |
| */ |
| |
| DUK__APPENDBUFFER(lex_ctx, x); |
| DUK__ADVANCECHARS(lex_ctx, 1); |
| int_only = 1; |
| #endif |
| } |
| |
| state = 0; |
| for (;;) { |
| x = DUK__L0(); /* re-lookup curr char on first round */ |
| if (DUK__ISDIGIT(x)) { |
| /* Note: intentionally allow leading zeroes here, as the |
| * actual parser will check for them. |
| */ |
| if (state == 2) { |
| state = 3; |
| } |
| } else if (allow_hex && DUK__ISHEXDIGIT(x)) { |
| /* Note: 'e' and 'E' are also accepted here. */ |
| ; |
| } else if (x == '.') { |
| if (state >= 1 || int_only) { |
| break; |
| } else { |
| state = 1; |
| } |
| } else if (x == 'e' || x == 'E') { |
| if (state >= 2 || int_only) { |
| break; |
| } else { |
| state = 2; |
| } |
| } else if (x == '-' || x == '+') { |
| if (state != 2) { |
| break; |
| } else { |
| state = 3; |
| } |
| } else { |
| break; |
| } |
| DUK__APPENDBUFFER(lex_ctx, x); |
| DUK__ADVANCECHARS(lex_ctx, 1); |
| } |
| |
| /* XXX: better coercion */ |
| duk__internbuffer(lex_ctx, lex_ctx->slot1_idx); |
| |
| s2n_flags = DUK_S2N_FLAG_ALLOW_EXP | |
| DUK_S2N_FLAG_ALLOW_FRAC | |
| DUK_S2N_FLAG_ALLOW_NAKED_FRAC | |
| DUK_S2N_FLAG_ALLOW_EMPTY_FRAC | |
| #if defined(DUK_USE_OCTAL_SUPPORT) |
| (strict_mode ? 0 : DUK_S2N_FLAG_ALLOW_AUTO_OCT_INT) | |
| #endif |
| DUK_S2N_FLAG_ALLOW_AUTO_HEX_INT; |
| |
| duk_dup((duk_context *) lex_ctx->thr, lex_ctx->slot1_idx); |
| duk_numconv_parse((duk_context *) lex_ctx->thr, 10 /*radix*/, s2n_flags); |
| val = duk_to_number((duk_context *) lex_ctx->thr, -1); |
| if (DUK_ISNAN(val)) { |
| DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid numeric literal"); |
| } |
| duk_replace((duk_context *) lex_ctx->thr, lex_ctx->slot1_idx); /* could also just pop? */ |
| |
| DUK__INITBUFFER(lex_ctx); /* free some memory */ |
| |
| /* Section 7.8.3 (note): NumericLiteral must be followed by something other than |
| * IdentifierStart or DecimalDigit. |
| */ |
| |
| if (DUK__ISDIGIT(DUK__L0()) || duk_unicode_is_identifier_start(DUK__L0())) { |
| DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid numeric literal"); |
| } |
| |
| out_token->num = val; |
| advtok = DUK__ADVTOK(0, DUK_TOK_NUMBER); |
| } else if (duk_unicode_is_whitespace(DUK__LOOKUP(lex_ctx, 0))) { |
| DUK__ADVANCECHARS(lex_ctx, 1); |
| goto restart; |
| } else if (x < 0) { |
| advtok = DUK__ADVTOK(0, DUK_TOK_EOF); |
| } else { |
| DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid token"); |
| } |
| skip_slow_path: |
| |
| /* |
| * Shared exit path |
| */ |
| |
| DUK__ADVANCEBYTES(lex_ctx, advtok >> 8); |
| out_token->t = advtok & 0xff; |
| if (out_token->t_nores < 0) { |
| out_token->t_nores = out_token->t; |
| } |
| out_token->lineterm = got_lineterm; |
| |
| /* Automatic semicolon insertion is allowed if a token is preceded |
| * by line terminator(s), or terminates a statement list (right curly |
| * or EOF). |
| */ |
| if (got_lineterm || out_token->t == DUK_TOK_RCURLY || out_token->t == DUK_TOK_EOF) { |
| out_token->allow_auto_semi = 1; |
| } else { |
| out_token->allow_auto_semi = 0; |
| } |
| } |
| |
| #if defined(DUK_USE_REGEXP_SUPPORT) |
| |
| /* |
| * Parse a RegExp token. The grammar is described in E5 Section 15.10. |
| * Terminal constructions (such as quantifiers) are parsed directly here. |
| * |
| * 0xffffffffU is used as a marker for "infinity" in quantifiers. Further, |
| * DUK__MAX_RE_QUANT_DIGITS limits the maximum number of digits that |
| * will be accepted for a quantifier. |
| * |
| * Arguments: |
| * |
| * lex_ctx: lexer state; the token is read from its lookup window and |
| * the input position is advanced past the token. |
| * out_token: zeroed first, then filled in: 't' always; 'num', 'qmin', |
| * 'qmax' and 'greedy' depending on the token type. |
| * |
| * Errors: |
| * |
| * - DUK_ERROR_RANGE ("token limit") when the lexer token limit is hit. |
| * - DUK_ERROR_SYNTAX for an invalid escape, an invalid quantifier (unless |
| * DUK_USE_ES6_REGEXP_BRACES is defined), an invalid '(?' group prefix, |
| * and for a stray ']' (and '}' unless DUK_USE_ES6_REGEXP_BRACES). |
| * |
| * For character classes only the opening '[' or '[^' is tokenized here. |
| */ |
| |
| DUK_INTERNAL void duk_lexer_parse_re_token(duk_lexer_ctx *lex_ctx, duk_re_token *out_token) { |
| duk_small_int_t advtok = 0; /* init is unnecessary but suppresses "may be used uninitialized" warnings */ |
| duk_codepoint_t x, y; |
| |
| if (++lex_ctx->token_count >= lex_ctx->token_limit) { |
| DUK_ERROR_RANGE(lex_ctx->thr, "token limit"); |
| return; /* unreachable */ |
| } |
| |
| DUK_MEMZERO(out_token, sizeof(*out_token)); |
| |
| /* x = current character, y = one character of lookahead */ |
| x = DUK__L0(); |
| y = DUK__L1(); |
| |
| DUK_DDD(DUK_DDDPRINT("parsing regexp token, L0=%ld, L1=%ld", (long) x, (long) y)); |
| |
| switch (x) { |
| case '|': { |
| advtok = DUK__ADVTOK(1, DUK_RETOK_DISJUNCTION); |
| break; |
| } |
| case '^': { |
| advtok = DUK__ADVTOK(1, DUK_RETOK_ASSERT_START); |
| break; |
| } |
| case '$': { |
| advtok = DUK__ADVTOK(1, DUK_RETOK_ASSERT_END); |
| break; |
| } |
| case '?': { |
| /* '?' is {0,1}; a trailing '?' makes the quantifier non-greedy */ |
| out_token->qmin = 0; |
| out_token->qmax = 1; |
| if (y == '?') { |
| advtok = DUK__ADVTOK(2, DUK_RETOK_QUANTIFIER); |
| out_token->greedy = 0; |
| } else { |
| advtok = DUK__ADVTOK(1, DUK_RETOK_QUANTIFIER); |
| out_token->greedy = 1; |
| } |
| break; |
| } |
| case '*': { |
| /* '*' is {0,infinity} */ |
| out_token->qmin = 0; |
| out_token->qmax = DUK_RE_QUANTIFIER_INFINITE; |
| if (y == '?') { |
| advtok = DUK__ADVTOK(2, DUK_RETOK_QUANTIFIER); |
| out_token->greedy = 0; |
| } else { |
| advtok = DUK__ADVTOK(1, DUK_RETOK_QUANTIFIER); |
| out_token->greedy = 1; |
| } |
| break; |
| } |
| case '+': { |
| /* '+' is {1,infinity} */ |
| out_token->qmin = 1; |
| out_token->qmax = DUK_RE_QUANTIFIER_INFINITE; |
| if (y == '?') { |
| advtok = DUK__ADVTOK(2, DUK_RETOK_QUANTIFIER); |
| out_token->greedy = 0; |
| } else { |
| advtok = DUK__ADVTOK(1, DUK_RETOK_QUANTIFIER); |
| out_token->greedy = 1; |
| } |
| break; |
| } |
| case '{': { |
| /* Production allows 'DecimalDigits', including leading zeroes */ |
| /* val1 accumulates the number currently being read; val2 holds the |
| * first number once a ',' has been seen and otherwise keeps the |
| * "infinity" marker, which doubles as the "no comma yet" flag. |
| */ |
| duk_uint_fast32_t val1 = 0; |
| duk_uint_fast32_t val2 = DUK_RE_QUANTIFIER_INFINITE; |
| duk_small_int_t digits = 0; |
| #if defined(DUK_USE_ES6_REGEXP_BRACES) |
| duk_lexer_point lex_pt; |
| #endif |
| |
| #if defined(DUK_USE_ES6_REGEXP_BRACES) |
| /* Store lexer position, restoring if quantifier is invalid. */ |
| DUK_LEXER_GETPOINT(lex_ctx, &lex_pt); |
| #endif |
| |
| /* NOTE(review): a quantifier with no digits before the comma, e.g. |
| * '{,5}', passes through the ',' branch with digits == 0 and ends up |
| * as qmin=0, qmax=5. The E5 grammar requires DecimalDigits before |
| * the comma -- confirm whether this leniency is intentional. |
| */ |
| for (;;) { |
| DUK__ADVANCECHARS(lex_ctx, 1); /* eat '{' on entry */ |
| x = DUK__L0(); |
| if (DUK__ISDIGIT(x)) { |
| /* the digit count is validated only when ',' or '}' is reached */ |
| digits++; |
| val1 = val1 * 10 + (duk_uint_fast32_t) duk__hexval(lex_ctx, x); |
| } else if (x == ',') { |
| if (digits > DUK__MAX_RE_QUANT_DIGITS) { |
| goto invalid_quantifier; |
| } |
| if (val2 != DUK_RE_QUANTIFIER_INFINITE) { |
| /* second comma */ |
| goto invalid_quantifier; |
| } |
| if (DUK__L1() == '}') { |
| /* form: { DecimalDigits , }, val1 = min count */ |
| if (digits == 0) { |
| goto invalid_quantifier; |
| } |
| out_token->qmin = val1; |
| out_token->qmax = DUK_RE_QUANTIFIER_INFINITE; |
| DUK__ADVANCECHARS(lex_ctx, 2); |
| break; |
| } |
| val2 = val1; |
| val1 = 0; |
| digits = 0; /* not strictly necessary because of lookahead '}' above */ |
| } else if (x == '}') { |
| if (digits > DUK__MAX_RE_QUANT_DIGITS) { |
| goto invalid_quantifier; |
| } |
| if (digits == 0) { |
| goto invalid_quantifier; |
| } |
| if (val2 != DUK_RE_QUANTIFIER_INFINITE) { |
| /* val2 = min count, val1 = max count */ |
| out_token->qmin = val2; |
| out_token->qmax = val1; |
| } else { |
| /* val1 = count */ |
| out_token->qmin = val1; |
| out_token->qmax = val1; |
| } |
| DUK__ADVANCECHARS(lex_ctx, 1); |
| break; |
| } else { |
| /* any other character, including end of input (x < 0) */ |
| goto invalid_quantifier; |
| } |
| } |
| if (DUK__L0() == '?') { |
| out_token->greedy = 0; |
| DUK__ADVANCECHARS(lex_ctx, 1); |
| } else { |
| out_token->greedy = 1; |
| } |
| /* characters were already consumed above, so advance by zero here */ |
| advtok = DUK__ADVTOK(0, DUK_RETOK_QUANTIFIER); |
| break; |
| invalid_quantifier: |
| #if defined(DUK_USE_ES6_REGEXP_BRACES) |
| /* Failed to match the quantifier, restore lexer and parse |
| * opening brace as a literal. |
| */ |
| DUK_LEXER_SETPOINT(lex_ctx, &lex_pt); |
| advtok = DUK__ADVTOK(1, DUK_RETOK_ATOM_CHAR); |
| out_token->num = '{'; |
| #else |
| DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid regexp quantifier"); |
| #endif |
| break; |
| } |
| case '.': { |
| advtok = DUK__ADVTOK(1, DUK_RETOK_ATOM_PERIOD); |
| break; |
| } |
| case '\\': { |
| /* The E5.1 specification does not seem to allow IdentifierPart characters |
| * to be used as identity escapes. Unfortunately this includes '$', which |
| * cannot be escaped as '\$'; it needs to be escaped e.g. as '\u0024'. |
| * Many other implementations (including V8 and Rhino, for instance) do |
| * accept '\$' as a valid identity escape, which is quite pragmatic. |
| * See: test-regexp-identity-escape-dollar.js. |
| */ |
| |
| advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_CHAR); /* default: char escape (two chars) */ |
| if (y == 'b') { |
| advtok = DUK__ADVTOK(2, DUK_RETOK_ASSERT_WORD_BOUNDARY); |
| } else if (y == 'B') { |
| advtok = DUK__ADVTOK(2, DUK_RETOK_ASSERT_NOT_WORD_BOUNDARY); |
| } else if (y == 'f') { |
| out_token->num = 0x000c; |
| } else if (y == 'n') { |
| out_token->num = 0x000a; |
| } else if (y == 't') { |
| out_token->num = 0x0009; |
| } else if (y == 'r') { |
| out_token->num = 0x000d; |
| } else if (y == 'v') { |
| out_token->num = 0x000b; |
| } else if (y == 'c') { |
| /* control escape: \c followed by an ASCII letter */ |
| x = DUK__L2(); |
| if ((x >= 'a' && x <= 'z') || |
| (x >= 'A' && x <= 'Z')) { |
| out_token->num = (x % 32); |
| advtok = DUK__ADVTOK(3, DUK_RETOK_ATOM_CHAR); |
| } else { |
| DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid regexp escape"); |
| } |
| } else if (y == 'x') { |
| out_token->num = duk__decode_hexesc_from_window(lex_ctx, 2); |
| advtok = DUK__ADVTOK(4, DUK_RETOK_ATOM_CHAR); |
| } else if (y == 'u') { |
| out_token->num = duk__decode_uniesc_from_window(lex_ctx, 2); |
| advtok = DUK__ADVTOK(6, DUK_RETOK_ATOM_CHAR); |
| } else if (y == 'd') { |
| advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_DIGIT); |
| } else if (y == 'D') { |
| advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_NOT_DIGIT); |
| } else if (y == 's') { |
| advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_WHITE); |
| } else if (y == 'S') { |
| advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_NOT_WHITE); |
| } else if (y == 'w') { |
| advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_WORD_CHAR); |
| } else if (y == 'W') { |
| advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_NOT_WORD_CHAR); |
| } else if (DUK__ISDIGIT(y)) { |
| /* E5 Section 15.10.2.11 */ |
| if (y == '0') { |
| /* '\0' is a NUL character escape and may not be followed by a digit */ |
| if (DUK__ISDIGIT(DUK__L2())) { |
| DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid regexp escape"); |
| } |
| out_token->num = 0x0000; |
| advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_CHAR); |
| } else { |
| /* XXX: shared parsing? */ |
| /* decimal escape with a non-zero first digit: backreference */ |
| duk_uint_fast32_t val = 0; |
| duk_small_int_t i; |
| for (i = 0; ; i++) { |
| if (i >= DUK__MAX_RE_DECESC_DIGITS) { |
| DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid regexp escape"); |
| } |
| DUK__ADVANCECHARS(lex_ctx, 1); /* eat backslash on entry */ |
| x = DUK__L0(); |
| if (!DUK__ISDIGIT(x)) { |
| break; |
| } |
| val = val * 10 + (duk_uint_fast32_t) duk__hexval(lex_ctx, x); |
| } |
| /* DUK__L0() cannot be a digit, because the loop doesn't terminate if it is */ |
| advtok = DUK__ADVTOK(0, DUK_RETOK_ATOM_BACKREFERENCE); |
| out_token->num = val; |
| } |
| } else if ((y >= 0 && !duk_unicode_is_identifier_part(y)) || |
| #if defined(DUK_USE_NONSTD_REGEXP_DOLLAR_ESCAPE) |
| y == '$' || |
| #endif |
| y == DUK_UNICODE_CP_ZWNJ || |
| y == DUK_UNICODE_CP_ZWJ) { |
| /* IdentityEscape, with dollar added as a valid additional |
| * non-standard escape (see test-regexp-identity-escape-dollar.js). |
| * Careful not to match end-of-buffer (<0) here. |
| */ |
| out_token->num = y; |
| } else { |
| DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid regexp escape"); |
| } |
| break; |
| } |
| case '(': { |
| /* XXX: naming is inconsistent: ATOM_END_GROUP ends an ASSERT_START_LOOKAHEAD */ |
| |
| if (y == '?') { |
| if (DUK__L2() == '=') { |
| /* (?= */ |
| advtok = DUK__ADVTOK(3, DUK_RETOK_ASSERT_START_POS_LOOKAHEAD); |
| } else if (DUK__L2() == '!') { |
| /* (?! */ |
| advtok = DUK__ADVTOK(3, DUK_RETOK_ASSERT_START_NEG_LOOKAHEAD); |
| } else if (DUK__L2() == ':') { |
| /* (?: */ |
| advtok = DUK__ADVTOK(3, DUK_RETOK_ATOM_START_NONCAPTURE_GROUP); |
| } else { |
| /* '(?' followed by anything else (including end of input) |
| * is not a valid group prefix. Without this branch advtok |
| * would keep its initial value 0, and the shared exit path |
| * would return token type 0 with a zero-length advance in |
| * the middle of the pattern instead of rejecting it. |
| */ |
| DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid regexp group"); |
| } |
| } else { |
| /* ( */ |
| advtok = DUK__ADVTOK(1, DUK_RETOK_ATOM_START_CAPTURE_GROUP); |
| } |
| break; |
| } |
| case ')': { |
| advtok = DUK__ADVTOK(1, DUK_RETOK_ATOM_END_GROUP); |
| break; |
| } |
| case '[': { |
| /* |
| * To avoid creating a heavy intermediate value for the list of ranges, |
| * only the start token ('[' or '[^') is parsed here. The regexp |
| * compiler parses the ranges itself. |
| */ |
| advtok = DUK__ADVTOK(1, DUK_RETOK_ATOM_START_CHARCLASS); |
| if (y == '^') { |
| advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_START_CHARCLASS_INVERTED); |
| } |
| break; |
| } |
| #if !defined(DUK_USE_ES6_REGEXP_BRACES) |
| case '}': |
| #endif |
| case ']': { |
| /* Although these could be parsed as PatternCharacters unambiguously (here), |
| * E5 Section 15.10.1 grammar explicitly forbids these as PatternCharacters. |
| */ |
| DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid regexp character"); |
| break; |
| } |
| case -1: { |
| /* EOF */ |
| advtok = DUK__ADVTOK(0, DUK_TOK_EOF); |
| break; |
| } |
| default: { |
| /* PatternCharacter, all excluded characters are matched by cases above */ |
| advtok = DUK__ADVTOK(1, DUK_RETOK_ATOM_CHAR); |
| out_token->num = x; |
| break; |
| } |
| } |
| |
| /* |
| * Shared exit path |
| * |
| * advtok packs two values: the bits above the low byte give the amount |
| * passed to DUK__ADVANCEBYTES(), the low byte is the token type. |
| */ |
| |
| DUK__ADVANCEBYTES(lex_ctx, advtok >> 8); |
| out_token->t = advtok & 0xff; |
| } |
| |
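| /* |
| * Special parser for character classes; calls the callback once for |
| * every range parsed. Note that duk_lexer_parse_re_ranges() is declared |
| * void, so no range count is returned from it; any counting of ranges |
| * has to be done through the callback. |
| */ |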
| |
| /* XXX: this duplicates functionality in duk_regexp.c where a similar loop is |
| * required anyway. We could use that BUT we need to update the regexp compiler |
| * 'nranges' too. Work this out a bit more cleanly to save space. |
| */ |
| |
| /* XXX: the handling of character range detection is a bit convoluted. |
| * Try to simplify and make smaller. |
| */ |
| |
| /* XXX: logic for handling character ranges is now incorrect: it accepts |
| * e.g. [\d-z], which should arguably be rejected. SMJS accepts this too, though. |
| * |
| * Needs a read through and a lot of additional tests. |
| */ |
| |
| DUK_LOCAL |
| void duk__emit_u16_direct_ranges(duk_lexer_ctx *lex_ctx, |
| duk_re_range_callback gen_range, |
| void *userdata, |
| const duk_uint16_t *ranges, |
| duk_small_int_t num) { |
| const duk_uint16_t *ranges_end; |
| |
| DUK_UNREF(lex_ctx); |
| |
| ranges_end = ranges + num; |
| while (ranges < ranges_end) { |
| /* mark range 'direct', bypass canonicalization (see Wiki) */ |
| gen_range(userdata, (duk_codepoint_t) ranges[0], (duk_codepoint_t) ranges[1], 1); |
| ranges += 2; |
| } |
| } |
| |
| DUK_INTERNAL void duk_lexer_parse_re_ranges(duk_lexer_ctx *lex_ctx, duk_re_range_callback gen_range, void *userdata) { |
| duk_codepoint_t start = -1; |
| duk_codepoint_t ch; |
| duk_codepoint_t x; |
| duk_bool_t dash = 0; |
| |
| DUK_DD(DUK_DDPRINT("parsing regexp ranges")); |
| |
| for (;;) { |
| x = DUK__L0(); |
| DUK__ADVANCECHARS(lex_ctx, 1); |
| |
| ch = -1; /* not strictly necessary, but avoids "uninitialized variable" warnings */ |
| DUK_UNREF(ch); |
| |
| if (x < 0) { |
| DUK_ERROR_SYNTAX(lex_ctx->thr, "eof in character class"); |
| } else if (x == ']') { |
| DUK_ASSERT(!dash); /* lookup should prevent this */ |
| if (start >= 0) { |
| gen_range(userdata, start, start, 0); |
| } |
| break; |
| } else if (x == '-') { |
| if (start >= 0 && !dash && DUK__L0() != ']') { |
| /* '-' as a range indicator */ |
| dash = 1; |
| continue; |
| } else { |
| /* '-' verbatim */ |
| ch = x; |
| } |
| } else if (x == '\\') { |
| /* |
| * The escapes are same as outside a character class, except that \b has a |
| * different meaning, and \B and backreferences are prohibited (see E5 |
| * Section 15.10.2.19). However, it's difficult to share code because we |
| * handle e.g. "\n" very differently: here we generate a single character |
| * range for it. |
| */ |
| |
| x = DUK__L0(); |
| DUK__ADVANCECHARS(lex_ctx, 1); |
| |
| if (x == 'b') { |
| /* Note: '\b' in char class is different than outside (assertion), |
| * '\B' is not allowed and is caught by the duk_unicode_is_identifier_part() |
| * check below. |
| */ |
| ch = 0x0008; |
| } else if (x == 'f') { |
| ch = 0x000c; |
| } else if (x == 'n') { |
| ch = 0x000a; |
| } else if (x == 't') { |
| ch = 0x0009; |
| } else if (x == 'r') { |
| ch = 0x000d; |
| } else if (x == 'v') { |
| ch = 0x000b; |
| } else if (x == 'c') { |
| x = DUK__L0(); |
| DUK__ADVANCECHARS(lex_ctx, 1); |
| if ((x >= 'a' && x <= 'z') || |
| (x >= 'A' && x <= 'Z')) { |
| ch = (x % 32); |
| } else { |
| DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid regexp escape"); |
| return; /* never reached, but avoids warnings of |
| * potentially unused variables. |
| */ |
| } |
| } else if (x == 'x') { |
| ch = duk__decode_hexesc_from_window(lex_ctx, 0); |
| DUK__ADVANCECHARS(lex_ctx, 2); |
| } else if (x == 'u') { |
| ch = duk__decode_uniesc_from_window(lex_ctx, 0); |
| DUK__ADVANCECHARS(lex_ctx, 4); |
| } else if (x == 'd') { |
| duk__emit_u16_direct_ranges(lex_ctx, |
| gen_range, |
| userdata, |
| duk_unicode_re_ranges_digit, |
| sizeof(duk_unicode_re_ranges_digit) / sizeof(duk_uint16_t)); |
| ch = -1; |
| } else if (x == 'D') { |
| duk__emit_u16_direct_ranges(lex_ctx, |
| gen_range, |
| userdata, |
| duk_unicode_re_ranges_not_digit, |
| sizeof(duk_unicode_re_ranges_not_digit) / sizeof(duk_uint16_t)); |
| ch = -1; |
| } else if (x == 's') { |
| duk__emit_u16_direct_ranges(lex_ctx, |
| gen_range, |
| userdata, |
| duk_unicode_re_ranges_white, |
| sizeof(duk_unicode_re_ranges_white) / sizeof(duk_uint16_t)); |
| ch = -1; |
| } else if (x == 'S') { |
| duk__emit_u16_direct_ranges(lex_ctx, |
| gen_range, |
| userdata, |
| duk_unicode_re_ranges_not_white, |
| sizeof(duk_unicode_re_ranges_not_white) / sizeof(duk_uint16_t)); |
| ch = -1; |
| } else if (x == 'w') { |
| duk__emit_u16_direct_ranges(lex_ctx, |
| gen_range, |
| userdata, |
| duk_unicode_re_ranges_wordchar, |
| sizeof(duk_unicode_re_ranges_wordchar) / sizeof(duk_uint16_t)); |
| ch = -1; |
| } else if (x == 'W') { |
| duk__emit_u16_direct_ranges(lex_ctx, |
| gen_range, |
| userdata, |
| duk_unicode_re_ranges_not_wordchar, |
| sizeof(duk_unicode_re_ranges_not_wordchar) / sizeof(duk_uint16_t)); |
| ch = -1; |
| } else if (DUK__ISDIGIT(x)) { |
| /* DecimalEscape, only \0 is allowed, no leading zeroes are allowed */ |
| if (x == '0' && !DUK__ISDIGIT(DUK__L0())) { |
| ch = 0x0000; |
| } else { |
| DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid regexp escape"); |
| } |
| } else if (!duk_unicode_is_identifier_part(x) |
| #if defined(DUK_USE_NONSTD_REGEXP_DOLLAR_ESCAPE) |
| || x == '$' |
| #endif |
| ) { |
| /* IdentityEscape */ |
| ch = x; |
| } else { |
| DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid regexp escape"); |
| } |
| } else { |
| /* character represents itself */ |
| ch = x; |
| } |
| |
| /* ch is a literal character here or -1 if parsed entity was |
| * an escape such as "\s". |
| */ |
| |
| if (ch < 0) { |
| /* multi-character sets not allowed as part of ranges, see |
| * E5 Section 15.10.2.15, abstract operation CharacterRange. |
| */ |
| if (start >= 0) { |
| if (dash) { |
| DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid range"); |
| } else { |
| gen_range(userdata, start, start, 0); |
| start = -1; |
| /* dash is already 0 */ |
| } |
| } |
| } else { |
| if (start >= 0) { |
| if (dash) { |
| if (start > ch) { |
| DUK_ERROR_SYNTAX(lex_ctx->thr, "invalid range"); |
| } |
| gen_range(userdata, start, ch, 0); |
| start = -1; |
| dash = 0; |
| } else { |
| gen_range(userdata, start, start, 0); |
| start = ch; |
| /* dash is already 0 */ |
| } |
| } else { |
| start = ch; |
| } |
| } |
| } |
| |
| return; |
| } |
| |
| #endif /* DUK_USE_REGEXP_SUPPORT */ |