| /* |
| * Regular Expression Engine |
| * |
| * Copyright (c) 2017-2018 Fabrice Bellard |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a copy |
| * of this software and associated documentation files (the "Software"), to deal |
| * in the Software without restriction, including without limitation the rights |
| * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| * copies of the Software, and to permit persons to whom the Software is |
| * furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice shall be included in |
| * all copies or substantial portions of the Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
| * THE SOFTWARE. |
| */ |
| #include <stdlib.h> |
| #include <stdio.h> |
| #include <stdarg.h> |
| #include <inttypes.h> |
| #include <string.h> |
| #include <assert.h> |
| |
| #include "cutils.h" |
| #include "libregexp.h" |
| #include "libunicode.h" |
| |
| /* |
| TODO: |
| |
| - Add a lock step execution mode (=linear time execution guaranteed) |
| when the regular expression is "simple" i.e. no backreference nor |
| complicated lookahead. The opcodes are designed for this execution |
| model. |
| */ |
| |
| #if defined(TEST) |
| #define DUMP_REOP |
| #endif |
| |
| typedef enum { |
| #define DEF(id, size) REOP_ ## id, |
| #include "libregexp-opcode.h" |
| #undef DEF |
| REOP_COUNT, |
| } REOPCodeEnum; |
| |
| #define CAPTURE_COUNT_MAX 255 |
| #define STACK_SIZE_MAX 255 |
| /* must be large enough to have a negligible runtime cost and small |
| enough to call the interrupt callback often. */ |
| #define INTERRUPT_COUNTER_INIT 10000 |
| |
| /* unicode code points */ |
| #define CP_LS 0x2028 |
| #define CP_PS 0x2029 |
| |
| #define TMP_BUF_SIZE 128 |
| |
| typedef struct { |
| DynBuf byte_code; |
| const uint8_t *buf_ptr; |
| const uint8_t *buf_end; |
| const uint8_t *buf_start; |
| int re_flags; |
| BOOL is_unicode; |
| BOOL unicode_sets; /* if set, is_unicode is also set */ |
| BOOL ignore_case; |
| BOOL multi_line; |
| BOOL dotall; |
| int capture_count; |
| int total_capture_count; /* -1 = not computed yet */ |
| int has_named_captures; /* -1 = don't know, 0 = no, 1 = yes */ |
| void *opaque; |
| DynBuf group_names; |
| union { |
| char error_msg[TMP_BUF_SIZE]; |
| char tmp_buf[TMP_BUF_SIZE]; |
| } u; |
| } REParseState; |
| |
| typedef struct { |
| #ifdef DUMP_REOP |
| const char *name; |
| #endif |
| uint8_t size; |
| } REOpCode; |
| |
| static const REOpCode reopcode_info[REOP_COUNT] = { |
| #ifdef DUMP_REOP |
| #define DEF(id, size) { #id, size }, |
| #else |
| #define DEF(id, size) { size }, |
| #endif |
| #include "libregexp-opcode.h" |
| #undef DEF |
| }; |
| |
| #define RE_HEADER_FLAGS 0 |
| #define RE_HEADER_CAPTURE_COUNT 2 |
| #define RE_HEADER_STACK_SIZE 3 |
| #define RE_HEADER_BYTECODE_LEN 4 |
| |
| #define RE_HEADER_LEN 8 |
| |
| static inline int is_digit(int c) { |
| return c >= '0' && c <= '9'; |
| } |
| |
| /* insert 'len' bytes at position 'pos'. Return < 0 if error. */ |
| static int dbuf_insert(DynBuf *s, int pos, int len) |
| { |
| if (dbuf_realloc(s, s->size + len)) |
| return -1; |
| memmove(s->buf + pos + len, s->buf + pos, s->size - pos); |
| s->size += len; |
| return 0; |
| } |
| |
| typedef struct REString { |
| struct REString *next; |
| uint32_t hash; |
| uint32_t len; |
| uint32_t buf[]; |
| } REString; |
| |
| typedef struct { |
| /* the string list is the union of 'char_range' and of the strings |
| in hash_table[]. The strings in hash_table[] have a length != |
| 1. */ |
| CharRange cr; |
| uint32_t n_strings; |
| uint32_t hash_size; |
| int hash_bits; |
| REString **hash_table; |
| } REStringList; |
| |
| static uint32_t re_string_hash(int len, const uint32_t *buf) |
| { |
| int i; |
| uint32_t h; |
| h = 1; |
| for(i = 0; i < len; i++) |
| h = h * 263 + buf[i]; |
| return h * 0x61C88647; |
| } |
| |
| static void re_string_list_init(REParseState *s1, REStringList *s) |
| { |
| cr_init(&s->cr, s1->opaque, lre_realloc); |
| s->n_strings = 0; |
| s->hash_size = 0; |
| s->hash_bits = 0; |
| s->hash_table = NULL; |
| } |
| |
| static void re_string_list_free(REStringList *s) |
| { |
| REString *p, *p_next; |
| int i; |
| for(i = 0; i < s->hash_size; i++) { |
| for(p = s->hash_table[i]; p != NULL; p = p_next) { |
| p_next = p->next; |
| lre_realloc(s->cr.mem_opaque, p, 0); |
| } |
| } |
| lre_realloc(s->cr.mem_opaque, s->hash_table, 0); |
| |
| cr_free(&s->cr); |
| } |
| |
| static void lre_print_char(int c, BOOL is_range) |
| { |
| if (c == '\'' || c == '\\' || |
| (is_range && (c == '-' || c == ']'))) { |
| printf("\\%c", c); |
| } else if (c >= ' ' && c <= 126) { |
| printf("%c", c); |
| } else { |
| printf("\\u{%04x}", c); |
| } |
| } |
| |
| static __maybe_unused void re_string_list_dump(const char *str, const REStringList *s) |
| { |
| REString *p; |
| const CharRange *cr; |
| int i, j, k; |
| |
| printf("%s:\n", str); |
| printf(" ranges: ["); |
| cr = &s->cr; |
| for(i = 0; i < cr->len; i += 2) { |
| lre_print_char(cr->points[i], TRUE); |
| if (cr->points[i] != cr->points[i + 1] - 1) { |
| printf("-"); |
| lre_print_char(cr->points[i + 1] - 1, TRUE); |
| } |
| } |
| printf("]\n"); |
| |
| j = 0; |
| for(i = 0; i < s->hash_size; i++) { |
| for(p = s->hash_table[i]; p != NULL; p = p->next) { |
| printf(" %d/%d: '", j, s->n_strings); |
| for(k = 0; k < p->len; k++) { |
| lre_print_char(p->buf[k], FALSE); |
| } |
| printf("'\n"); |
| j++; |
| } |
| } |
| } |
| |
| static int re_string_find2(REStringList *s, int len, const uint32_t *buf, |
| uint32_t h0, BOOL add_flag) |
| { |
| uint32_t h = 0; /* avoid warning */ |
| REString *p; |
| if (s->n_strings != 0) { |
| h = h0 >> (32 - s->hash_bits); |
| for(p = s->hash_table[h]; p != NULL; p = p->next) { |
| if (p->hash == h0 && p->len == len && |
| !memcmp(p->buf, buf, len * sizeof(buf[0]))) { |
| return 1; |
| } |
| } |
| } |
| /* not found */ |
| if (!add_flag) |
| return 0; |
| /* increase the size of the hash table if needed */ |
| if (unlikely((s->n_strings + 1) > s->hash_size)) { |
| REString **new_hash_table, *p_next; |
| int new_hash_bits, i; |
| uint32_t new_hash_size; |
| new_hash_bits = max_int(s->hash_bits + 1, 4); |
| new_hash_size = 1 << new_hash_bits; |
| new_hash_table = lre_realloc(s->cr.mem_opaque, NULL, |
| sizeof(new_hash_table[0]) * new_hash_size); |
| if (!new_hash_table) |
| return -1; |
| memset(new_hash_table, 0, sizeof(new_hash_table[0]) * new_hash_size); |
| for(i = 0; i < s->hash_size; i++) { |
| for(p = s->hash_table[i]; p != NULL; p = p_next) { |
| p_next = p->next; |
| h = p->hash >> (32 - new_hash_bits); |
| p->next = new_hash_table[h]; |
| new_hash_table[h] = p; |
| } |
| } |
| lre_realloc(s->cr.mem_opaque, s->hash_table, 0); |
| s->hash_bits = new_hash_bits; |
| s->hash_size = new_hash_size; |
| s->hash_table = new_hash_table; |
| h = h0 >> (32 - s->hash_bits); |
| } |
| |
| p = lre_realloc(s->cr.mem_opaque, NULL, sizeof(REString) + len * sizeof(buf[0])); |
| if (!p) |
| return -1; |
| p->next = s->hash_table[h]; |
| s->hash_table[h] = p; |
| s->n_strings++; |
| p->hash = h0; |
| p->len = len; |
| memcpy(p->buf, buf, sizeof(buf[0]) * len); |
| return 1; |
| } |
| |
| static int re_string_find(REStringList *s, int len, const uint32_t *buf, |
| BOOL add_flag) |
| { |
| uint32_t h0; |
| h0 = re_string_hash(len, buf); |
| return re_string_find2(s, len, buf, h0, add_flag); |
| } |
| |
| /* return -1 if memory error, 0 if OK */ |
| static int re_string_add(REStringList *s, int len, const uint32_t *buf) |
| { |
| if (len == 1) { |
| return cr_union_interval(&s->cr, buf[0], buf[0]); |
| } |
| if (re_string_find(s, len, buf, TRUE) < 0) |
| return -1; |
| return 0; |
| } |
| |
| /* a = a op b */ |
| static int re_string_list_op(REStringList *a, REStringList *b, int op) |
| { |
| int i, ret; |
| REString *p, **pp; |
| |
| if (cr_op1(&a->cr, b->cr.points, b->cr.len, op)) |
| return -1; |
| |
| switch(op) { |
| case CR_OP_UNION: |
| if (b->n_strings != 0) { |
| for(i = 0; i < b->hash_size; i++) { |
| for(p = b->hash_table[i]; p != NULL; p = p->next) { |
| if (re_string_find2(a, p->len, p->buf, p->hash, TRUE) < 0) |
| return -1; |
| } |
| } |
| } |
| break; |
| case CR_OP_INTER: |
| case CR_OP_SUB: |
| for(i = 0; i < a->hash_size; i++) { |
| pp = &a->hash_table[i]; |
| for(;;) { |
| p = *pp; |
| if (p == NULL) |
| break; |
| ret = re_string_find2(b, p->len, p->buf, p->hash, FALSE); |
| if (op == CR_OP_SUB) |
| ret = !ret; |
| if (!ret) { |
| /* remove it */ |
| *pp = p->next; |
| a->n_strings--; |
| lre_realloc(a->cr.mem_opaque, p, 0); |
| } else { |
| /* keep it */ |
| pp = &p->next; |
| } |
| } |
| } |
| break; |
| default: |
| abort(); |
| } |
| return 0; |
| } |
| |
| static int re_string_list_canonicalize(REParseState *s1, |
| REStringList *s, BOOL is_unicode) |
| { |
| if (cr_regexp_canonicalize(&s->cr, is_unicode)) |
| return -1; |
| if (s->n_strings != 0) { |
| REStringList a_s, *a = &a_s; |
| int i, j; |
| REString *p; |
| |
| /* XXX: simplify */ |
| re_string_list_init(s1, a); |
| |
| a->n_strings = s->n_strings; |
| a->hash_size = s->hash_size; |
| a->hash_bits = s->hash_bits; |
| a->hash_table = s->hash_table; |
| |
| s->n_strings = 0; |
| s->hash_size = 0; |
| s->hash_bits = 0; |
| s->hash_table = NULL; |
| |
| for(i = 0; i < a->hash_size; i++) { |
| for(p = a->hash_table[i]; p != NULL; p = p->next) { |
| for(j = 0; j < p->len; j++) { |
| p->buf[j] = lre_canonicalize(p->buf[j], is_unicode); |
| } |
| if (re_string_add(s, p->len, p->buf)) { |
| re_string_list_free(a); |
| return -1; |
| } |
| } |
| } |
| re_string_list_free(a); |
| } |
| return 0; |
| } |
| |
| static const uint16_t char_range_d[] = { |
| 1, |
| 0x0030, 0x0039 + 1, |
| }; |
| |
| /* code point ranges for Zs,Zl or Zp property */ |
| static const uint16_t char_range_s[] = { |
| 10, |
| 0x0009, 0x000D + 1, |
| 0x0020, 0x0020 + 1, |
| 0x00A0, 0x00A0 + 1, |
| 0x1680, 0x1680 + 1, |
| 0x2000, 0x200A + 1, |
| /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */ |
| /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */ |
| 0x2028, 0x2029 + 1, |
| 0x202F, 0x202F + 1, |
| 0x205F, 0x205F + 1, |
| 0x3000, 0x3000 + 1, |
| /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */ |
| 0xFEFF, 0xFEFF + 1, |
| }; |
| |
| static const uint16_t char_range_w[] = { |
| 4, |
| 0x0030, 0x0039 + 1, |
| 0x0041, 0x005A + 1, |
| 0x005F, 0x005F + 1, |
| 0x0061, 0x007A + 1, |
| }; |
| |
| #define CLASS_RANGE_BASE 0x40000000 |
| |
| typedef enum { |
| CHAR_RANGE_d, |
| CHAR_RANGE_D, |
| CHAR_RANGE_s, |
| CHAR_RANGE_S, |
| CHAR_RANGE_w, |
| CHAR_RANGE_W, |
| } CharRangeEnum; |
| |
| static const uint16_t * const char_range_table[] = { |
| char_range_d, |
| char_range_s, |
| char_range_w, |
| }; |
| |
| static int cr_init_char_range(REParseState *s, REStringList *cr, uint32_t c) |
| { |
| BOOL invert; |
| const uint16_t *c_pt; |
| int len, i; |
| |
| invert = c & 1; |
| c_pt = char_range_table[c >> 1]; |
| len = *c_pt++; |
| re_string_list_init(s, cr); |
| for(i = 0; i < len * 2; i++) { |
| if (cr_add_point(&cr->cr, c_pt[i])) |
| goto fail; |
| } |
| if (invert) { |
| if (cr_invert(&cr->cr)) |
| goto fail; |
| } |
| return 0; |
| fail: |
| re_string_list_free(cr); |
| return -1; |
| } |
| |
| #ifdef DUMP_REOP |
| static __maybe_unused void lre_dump_bytecode(const uint8_t *buf, |
| int buf_len) |
| { |
| int pos, len, opcode, bc_len, re_flags, i; |
| uint32_t val; |
| |
| assert(buf_len >= RE_HEADER_LEN); |
| |
| re_flags = lre_get_flags(buf); |
| bc_len = get_u32(buf + RE_HEADER_BYTECODE_LEN); |
| assert(bc_len + RE_HEADER_LEN <= buf_len); |
| printf("flags: 0x%x capture_count=%d stack_size=%d\n", |
| re_flags, buf[RE_HEADER_CAPTURE_COUNT], buf[RE_HEADER_STACK_SIZE]); |
| if (re_flags & LRE_FLAG_NAMED_GROUPS) { |
| const char *p; |
| p = (char *)buf + RE_HEADER_LEN + bc_len; |
| printf("named groups: "); |
| for(i = 1; i < buf[RE_HEADER_CAPTURE_COUNT]; i++) { |
| if (i != 1) |
| printf(","); |
| printf("<%s>", p); |
| p += strlen(p) + 1; |
| } |
| printf("\n"); |
| assert(p == (char *)(buf + buf_len)); |
| } |
| printf("bytecode_len=%d\n", bc_len); |
| |
| buf += RE_HEADER_LEN; |
| pos = 0; |
| while (pos < bc_len) { |
| printf("%5u: ", pos); |
| opcode = buf[pos]; |
| len = reopcode_info[opcode].size; |
| if (opcode >= REOP_COUNT) { |
| printf(" invalid opcode=0x%02x\n", opcode); |
| break; |
| } |
| if ((pos + len) > bc_len) { |
| printf(" buffer overflow (opcode=0x%02x)\n", opcode); |
| break; |
| } |
| printf("%s", reopcode_info[opcode].name); |
| switch(opcode) { |
| case REOP_char: |
| case REOP_char_i: |
| val = get_u16(buf + pos + 1); |
| if (val >= ' ' && val <= 126) |
| printf(" '%c'", val); |
| else |
| printf(" 0x%04x", val); |
| break; |
| case REOP_char32: |
| case REOP_char32_i: |
| val = get_u32(buf + pos + 1); |
| if (val >= ' ' && val <= 126) |
| printf(" '%c'", val); |
| else |
| printf(" 0x%08x", val); |
| break; |
| case REOP_goto: |
| case REOP_split_goto_first: |
| case REOP_split_next_first: |
| case REOP_loop: |
| case REOP_lookahead: |
| case REOP_negative_lookahead: |
| val = get_u32(buf + pos + 1); |
| val += (pos + 5); |
| printf(" %u", val); |
| break; |
| case REOP_simple_greedy_quant: |
| printf(" %u %u %u %u", |
| get_u32(buf + pos + 1) + (pos + 17), |
| get_u32(buf + pos + 1 + 4), |
| get_u32(buf + pos + 1 + 8), |
| get_u32(buf + pos + 1 + 12)); |
| break; |
| case REOP_save_start: |
| case REOP_save_end: |
| case REOP_back_reference: |
| case REOP_back_reference_i: |
| case REOP_backward_back_reference: |
| case REOP_backward_back_reference_i: |
| printf(" %u", buf[pos + 1]); |
| break; |
| case REOP_save_reset: |
| printf(" %u %u", buf[pos + 1], buf[pos + 2]); |
| break; |
| case REOP_push_i32: |
| val = get_u32(buf + pos + 1); |
| printf(" %d", val); |
| break; |
| case REOP_range: |
| case REOP_range_i: |
| { |
| int n, i; |
| n = get_u16(buf + pos + 1); |
| len += n * 4; |
| for(i = 0; i < n * 2; i++) { |
| val = get_u16(buf + pos + 3 + i * 2); |
| printf(" 0x%04x", val); |
| } |
| } |
| break; |
| case REOP_range32: |
| case REOP_range32_i: |
| { |
| int n, i; |
| n = get_u16(buf + pos + 1); |
| len += n * 8; |
| for(i = 0; i < n * 2; i++) { |
| val = get_u32(buf + pos + 3 + i * 4); |
| printf(" 0x%08x", val); |
| } |
| } |
| break; |
| default: |
| break; |
| } |
| printf("\n"); |
| pos += len; |
| } |
| } |
| #endif |
| |
| static void re_emit_op(REParseState *s, int op) |
| { |
| dbuf_putc(&s->byte_code, op); |
| } |
| |
| /* return the offset of the u32 value */ |
| static int re_emit_op_u32(REParseState *s, int op, uint32_t val) |
| { |
| int pos; |
| dbuf_putc(&s->byte_code, op); |
| pos = s->byte_code.size; |
| dbuf_put_u32(&s->byte_code, val); |
| return pos; |
| } |
| |
| static int re_emit_goto(REParseState *s, int op, uint32_t val) |
| { |
| int pos; |
| dbuf_putc(&s->byte_code, op); |
| pos = s->byte_code.size; |
| dbuf_put_u32(&s->byte_code, val - (pos + 4)); |
| return pos; |
| } |
| |
| static void re_emit_op_u8(REParseState *s, int op, uint32_t val) |
| { |
| dbuf_putc(&s->byte_code, op); |
| dbuf_putc(&s->byte_code, val); |
| } |
| |
| static void re_emit_op_u16(REParseState *s, int op, uint32_t val) |
| { |
| dbuf_putc(&s->byte_code, op); |
| dbuf_put_u16(&s->byte_code, val); |
| } |
| |
| static int __attribute__((format(printf, 2, 3))) re_parse_error(REParseState *s, const char *fmt, ...) |
| { |
| va_list ap; |
| va_start(ap, fmt); |
| vsnprintf(s->u.error_msg, sizeof(s->u.error_msg), fmt, ap); |
| va_end(ap); |
| return -1; |
| } |
| |
| static int re_parse_out_of_memory(REParseState *s) |
| { |
| return re_parse_error(s, "out of memory"); |
| } |
| |
| /* If allow_overflow is false, return -1 in case of |
| overflow. Otherwise return INT32_MAX. */ |
| static int parse_digits(const uint8_t **pp, BOOL allow_overflow) |
| { |
| const uint8_t *p; |
| uint64_t v; |
| int c; |
| |
| p = *pp; |
| v = 0; |
| for(;;) { |
| c = *p; |
| if (c < '0' || c > '9') |
| break; |
| v = v * 10 + c - '0'; |
| if (v >= INT32_MAX) { |
| if (allow_overflow) |
| v = INT32_MAX; |
| else |
| return -1; |
| } |
| p++; |
| } |
| *pp = p; |
| return v; |
| } |
| |
| static int re_parse_expect(REParseState *s, const uint8_t **pp, int c) |
| { |
| const uint8_t *p; |
| p = *pp; |
| if (*p != c) |
| return re_parse_error(s, "expecting '%c'", c); |
| p++; |
| *pp = p; |
| return 0; |
| } |
| |
| /* Parse an escape sequence, *pp points after the '\': |
| allow_utf16 value: |
| 0 : no UTF-16 escapes allowed |
| 1 : UTF-16 escapes allowed |
| 2 : UTF-16 escapes allowed and escapes of surrogate pairs are |
| converted to a unicode character (unicode regexp case). |
| |
| Return the unicode char and update *pp if recognized, |
| return -1 if malformed escape, |
| return -2 otherwise. */ |
| int lre_parse_escape(const uint8_t **pp, int allow_utf16) |
| { |
| const uint8_t *p; |
| uint32_t c; |
| |
| p = *pp; |
| c = *p++; |
| switch(c) { |
| case 'b': |
| c = '\b'; |
| break; |
| case 'f': |
| c = '\f'; |
| break; |
| case 'n': |
| c = '\n'; |
| break; |
| case 'r': |
| c = '\r'; |
| break; |
| case 't': |
| c = '\t'; |
| break; |
| case 'v': |
| c = '\v'; |
| break; |
| case 'x': |
| case 'u': |
| { |
| int h, n, i; |
| uint32_t c1; |
| |
| if (*p == '{' && allow_utf16) { |
| p++; |
| c = 0; |
| for(;;) { |
| h = from_hex(*p++); |
| if (h < 0) |
| return -1; |
| c = (c << 4) | h; |
| if (c > 0x10FFFF) |
| return -1; |
| if (*p == '}') |
| break; |
| } |
| p++; |
| } else { |
| if (c == 'x') { |
| n = 2; |
| } else { |
| n = 4; |
| } |
| |
| c = 0; |
| for(i = 0; i < n; i++) { |
| h = from_hex(*p++); |
| if (h < 0) { |
| return -1; |
| } |
| c = (c << 4) | h; |
| } |
| if (is_hi_surrogate(c) && |
| allow_utf16 == 2 && p[0] == '\\' && p[1] == 'u') { |
| /* convert an escaped surrogate pair into a |
| unicode char */ |
| c1 = 0; |
| for(i = 0; i < 4; i++) { |
| h = from_hex(p[2 + i]); |
| if (h < 0) |
| break; |
| c1 = (c1 << 4) | h; |
| } |
| if (i == 4 && is_lo_surrogate(c1)) { |
| p += 6; |
| c = from_surrogate(c, c1); |
| } |
| } |
| } |
| } |
| break; |
| case '0': case '1': case '2': case '3': |
| case '4': case '5': case '6': case '7': |
| c -= '0'; |
| if (allow_utf16 == 2) { |
| /* only accept \0 not followed by digit */ |
| if (c != 0 || is_digit(*p)) |
| return -1; |
| } else { |
| /* parse a legacy octal sequence */ |
| uint32_t v; |
| v = *p - '0'; |
| if (v > 7) |
| break; |
| c = (c << 3) | v; |
| p++; |
| if (c >= 32) |
| break; |
| v = *p - '0'; |
| if (v > 7) |
| break; |
| c = (c << 3) | v; |
| p++; |
| } |
| break; |
| default: |
| return -2; |
| } |
| *pp = p; |
| return c; |
| } |
| |
| #ifdef CONFIG_ALL_UNICODE |
| /* XXX: we use the same chars for name and value */ |
| static BOOL is_unicode_char(int c) |
| { |
| return ((c >= '0' && c <= '9') || |
| (c >= 'A' && c <= 'Z') || |
| (c >= 'a' && c <= 'z') || |
| (c == '_')); |
| } |
| |
| /* XXX: memory error test */ |
| static void seq_prop_cb(void *opaque, const uint32_t *seq, int seq_len) |
| { |
| REStringList *sl = opaque; |
| re_string_add(sl, seq_len, seq); |
| } |
| |
| static int parse_unicode_property(REParseState *s, REStringList *cr, |
| const uint8_t **pp, BOOL is_inv, |
| BOOL allow_sequence_prop) |
| { |
| const uint8_t *p; |
| char name[64], value[64]; |
| char *q; |
| BOOL script_ext; |
| int ret; |
| |
| p = *pp; |
| if (*p != '{') |
| return re_parse_error(s, "expecting '{' after \\p"); |
| p++; |
| q = name; |
| while (is_unicode_char(*p)) { |
| if ((q - name) >= sizeof(name) - 1) |
| goto unknown_property_name; |
| *q++ = *p++; |
| } |
| *q = '\0'; |
| q = value; |
| if (*p == '=') { |
| p++; |
| while (is_unicode_char(*p)) { |
| if ((q - value) >= sizeof(value) - 1) |
| return re_parse_error(s, "unknown unicode property value"); |
| *q++ = *p++; |
| } |
| } |
| *q = '\0'; |
| if (*p != '}') |
| return re_parse_error(s, "expecting '}'"); |
| p++; |
| // printf("name=%s value=%s\n", name, value); |
| |
| if (!strcmp(name, "Script") || !strcmp(name, "sc")) { |
| script_ext = FALSE; |
| goto do_script; |
| } else if (!strcmp(name, "Script_Extensions") || !strcmp(name, "scx")) { |
| script_ext = TRUE; |
| do_script: |
| re_string_list_init(s, cr); |
| ret = unicode_script(&cr->cr, value, script_ext); |
| if (ret) { |
| re_string_list_free(cr); |
| if (ret == -2) |
| return re_parse_error(s, "unknown unicode script"); |
| else |
| goto out_of_memory; |
| } |
| } else if (!strcmp(name, "General_Category") || !strcmp(name, "gc")) { |
| re_string_list_init(s, cr); |
| ret = unicode_general_category(&cr->cr, value); |
| if (ret) { |
| re_string_list_free(cr); |
| if (ret == -2) |
| return re_parse_error(s, "unknown unicode general category"); |
| else |
| goto out_of_memory; |
| } |
| } else if (value[0] == '\0') { |
| re_string_list_init(s, cr); |
| ret = unicode_general_category(&cr->cr, name); |
| if (ret == -1) { |
| re_string_list_free(cr); |
| goto out_of_memory; |
| } |
| if (ret < 0) { |
| ret = unicode_prop(&cr->cr, name); |
| if (ret == -1) { |
| re_string_list_free(cr); |
| goto out_of_memory; |
| } |
| } |
| if (ret < 0 && !is_inv && allow_sequence_prop) { |
| CharRange cr_tmp; |
| cr_init(&cr_tmp, s->opaque, lre_realloc); |
| ret = unicode_sequence_prop(name, seq_prop_cb, cr, &cr_tmp); |
| cr_free(&cr_tmp); |
| if (ret == -1) { |
| re_string_list_free(cr); |
| goto out_of_memory; |
| } |
| } |
| if (ret < 0) |
| goto unknown_property_name; |
| } else { |
| unknown_property_name: |
| return re_parse_error(s, "unknown unicode property name"); |
| } |
| |
| /* the ordering of case folding and inversion differs with |
| unicode_sets. 'unicode_sets' ordering is more consistent */ |
| /* XXX: the spec seems incorrect, we do it as the other engines |
| seem to do it. */ |
| if (s->ignore_case && s->unicode_sets) { |
| if (re_string_list_canonicalize(s, cr, s->is_unicode)) { |
| re_string_list_free(cr); |
| goto out_of_memory; |
| } |
| } |
| if (is_inv) { |
| if (cr_invert(&cr->cr)) { |
| re_string_list_free(cr); |
| goto out_of_memory; |
| } |
| } |
| if (s->ignore_case && !s->unicode_sets) { |
| if (re_string_list_canonicalize(s, cr, s->is_unicode)) { |
| re_string_list_free(cr); |
| goto out_of_memory; |
| } |
| } |
| *pp = p; |
| return 0; |
| out_of_memory: |
| return re_parse_out_of_memory(s); |
| } |
| #endif /* CONFIG_ALL_UNICODE */ |
| |
| static int get_class_atom(REParseState *s, REStringList *cr, |
| const uint8_t **pp, BOOL inclass); |
| |
| static int parse_class_string_disjunction(REParseState *s, REStringList *cr, |
| const uint8_t **pp) |
| { |
| const uint8_t *p; |
| DynBuf str; |
| int c; |
| |
| p = *pp; |
| if (*p != '{') |
| return re_parse_error(s, "expecting '{' after \\q"); |
| |
| dbuf_init2(&str, s->opaque, lre_realloc); |
| re_string_list_init(s, cr); |
| |
| p++; |
| for(;;) { |
| str.size = 0; |
| while (*p != '}' && *p != '|') { |
| c = get_class_atom(s, NULL, &p, FALSE); |
| if (c < 0) |
| goto fail; |
| if (dbuf_put_u32(&str, c)) { |
| re_parse_out_of_memory(s); |
| goto fail; |
| } |
| } |
| if (re_string_add(cr, str.size / 4, (uint32_t *)str.buf)) { |
| re_parse_out_of_memory(s); |
| goto fail; |
| } |
| if (*p == '}') |
| break; |
| p++; |
| } |
| if (s->ignore_case) { |
| if (re_string_list_canonicalize(s, cr, TRUE)) |
| goto fail; |
| } |
| p++; /* skip the '}' */ |
| dbuf_free(&str); |
| *pp = p; |
| return 0; |
| fail: |
| dbuf_free(&str); |
| re_string_list_free(cr); |
| return -1; |
| } |
| |
| /* return -1 if error otherwise the character or a class range |
| (CLASS_RANGE_BASE) if cr != NULL. In case of class range, 'cr' is |
| initialized. Otherwise, it is ignored. */ |
| static int get_class_atom(REParseState *s, REStringList *cr, |
| const uint8_t **pp, BOOL inclass) |
| { |
| const uint8_t *p; |
| uint32_t c; |
| int ret; |
| |
| p = *pp; |
| |
| c = *p; |
| switch(c) { |
| case '\\': |
| p++; |
| if (p >= s->buf_end) |
| goto unexpected_end; |
| c = *p++; |
| switch(c) { |
| case 'd': |
| c = CHAR_RANGE_d; |
| goto class_range; |
| case 'D': |
| c = CHAR_RANGE_D; |
| goto class_range; |
| case 's': |
| c = CHAR_RANGE_s; |
| goto class_range; |
| case 'S': |
| c = CHAR_RANGE_S; |
| goto class_range; |
| case 'w': |
| c = CHAR_RANGE_w; |
| goto class_range; |
| case 'W': |
| c = CHAR_RANGE_W; |
| class_range: |
| if (!cr) |
| goto default_escape; |
| if (cr_init_char_range(s, cr, c)) |
| return -1; |
| c = CLASS_RANGE_BASE; |
| break; |
| case 'c': |
| c = *p; |
| if ((c >= 'a' && c <= 'z') || |
| (c >= 'A' && c <= 'Z') || |
| (((c >= '0' && c <= '9') || c == '_') && |
| inclass && !s->is_unicode)) { /* Annex B.1.4 */ |
| c &= 0x1f; |
| p++; |
| } else if (s->is_unicode) { |
| goto invalid_escape; |
| } else { |
| /* otherwise return '\' and 'c' */ |
| p--; |
| c = '\\'; |
| } |
| break; |
| case '-': |
| if (!inclass && s->is_unicode) |
| goto invalid_escape; |
| break; |
| case '^': |
| case '$': |
| case '\\': |
| case '.': |
| case '*': |
| case '+': |
| case '?': |
| case '(': |
| case ')': |
| case '[': |
| case ']': |
| case '{': |
| case '}': |
| case '|': |
| case '/': |
| /* always valid to escape these characters */ |
| break; |
| #ifdef CONFIG_ALL_UNICODE |
| case 'p': |
| case 'P': |
| if (s->is_unicode && cr) { |
| if (parse_unicode_property(s, cr, &p, (c == 'P'), s->unicode_sets)) |
| return -1; |
| c = CLASS_RANGE_BASE; |
| break; |
| } |
| goto default_escape; |
| #endif |
| case 'q': |
| if (s->unicode_sets && cr && inclass) { |
| if (parse_class_string_disjunction(s, cr, &p)) |
| return -1; |
| c = CLASS_RANGE_BASE; |
| break; |
| } |
| goto default_escape; |
| default: |
| default_escape: |
| p--; |
| ret = lre_parse_escape(&p, s->is_unicode * 2); |
| if (ret >= 0) { |
| c = ret; |
| } else { |
| if (s->is_unicode) { |
| invalid_escape: |
| return re_parse_error(s, "invalid escape sequence in regular expression"); |
| } else { |
| /* just ignore the '\' */ |
| goto normal_char; |
| } |
| } |
| break; |
| } |
| break; |
| case '\0': |
| if (p >= s->buf_end) { |
| unexpected_end: |
| return re_parse_error(s, "unexpected end"); |
| } |
| /* fall thru */ |
| goto normal_char; |
| |
| case '&': |
| case '!': |
| case '#': |
| case '$': |
| case '%': |
| case '*': |
| case '+': |
| case ',': |
| case '.': |
| case ':': |
| case ';': |
| case '<': |
| case '=': |
| case '>': |
| case '?': |
| case '@': |
| case '^': |
| case '`': |
| case '~': |
| if (s->unicode_sets && p[1] == c) { |
| /* forbidden double characters */ |
| return re_parse_error(s, "invalid class set operation in regular expression"); |
| } |
| goto normal_char; |
| |
| case '(': |
| case ')': |
| case '[': |
| case ']': |
| case '{': |
| case '}': |
| case '/': |
| case '-': |
| case '|': |
| if (s->unicode_sets) { |
| /* invalid characters in unicode sets */ |
| return re_parse_error(s, "invalid character in class in regular expression"); |
| } |
| goto normal_char; |
| |
| default: |
| normal_char: |
| /* normal char */ |
| if (c >= 128) { |
| c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p); |
| if ((unsigned)c > 0xffff && !s->is_unicode) { |
| /* XXX: should handle non BMP-1 code points */ |
| return re_parse_error(s, "malformed unicode char"); |
| } |
| } else { |
| p++; |
| } |
| break; |
| } |
| *pp = p; |
| return c; |
| } |
| |
| static int re_emit_range(REParseState *s, const CharRange *cr) |
| { |
| int len, i; |
| uint32_t high; |
| |
| len = (unsigned)cr->len / 2; |
| if (len >= 65535) |
| return re_parse_error(s, "too many ranges"); |
| if (len == 0) { |
| re_emit_op_u32(s, REOP_char32, -1); |
| } else { |
| high = cr->points[cr->len - 1]; |
| if (high == UINT32_MAX) |
| high = cr->points[cr->len - 2]; |
| if (high <= 0xffff) { |
| /* can use 16 bit ranges with the conversion that 0xffff = |
| infinity */ |
| re_emit_op_u16(s, s->ignore_case ? REOP_range_i : REOP_range, len); |
| for(i = 0; i < cr->len; i += 2) { |
| dbuf_put_u16(&s->byte_code, cr->points[i]); |
| high = cr->points[i + 1] - 1; |
| if (high == UINT32_MAX - 1) |
| high = 0xffff; |
| dbuf_put_u16(&s->byte_code, high); |
| } |
| } else { |
| re_emit_op_u16(s, s->ignore_case ? REOP_range32_i : REOP_range32, len); |
| for(i = 0; i < cr->len; i += 2) { |
| dbuf_put_u32(&s->byte_code, cr->points[i]); |
| dbuf_put_u32(&s->byte_code, cr->points[i + 1] - 1); |
| } |
| } |
| } |
| return 0; |
| } |
| |
| static int re_string_cmp_len(const void *a, const void *b, void *arg) |
| { |
| REString *p1 = *(REString **)a; |
| REString *p2 = *(REString **)b; |
| return (p1->len < p2->len) - (p1->len > p2->len); |
| } |
| |
| static void re_emit_char(REParseState *s, int c) |
| { |
| if (c <= 0xffff) |
| re_emit_op_u16(s, s->ignore_case ? REOP_char_i : REOP_char, c); |
| else |
| re_emit_op_u32(s, s->ignore_case ? REOP_char32_i : REOP_char32, c); |
| } |
| |
| static int re_emit_string_list(REParseState *s, const REStringList *sl) |
| { |
| REString **tab, *p; |
| int i, j, split_pos, last_match_pos, n; |
| BOOL has_empty_string, is_last; |
| |
| // re_string_list_dump("sl", sl); |
| if (sl->n_strings == 0) { |
| /* simple case: only characters */ |
| if (re_emit_range(s, &sl->cr)) |
| return -1; |
| } else { |
| /* at least one string list is present : match the longest ones first */ |
| /* XXX: add a new op_switch opcode to compile as a trie */ |
| tab = lre_realloc(s->opaque, NULL, sizeof(tab[0]) * sl->n_strings); |
| if (!tab) { |
| re_parse_out_of_memory(s); |
| return -1; |
| } |
| has_empty_string = FALSE; |
| n = 0; |
| for(i = 0; i < sl->hash_size; i++) { |
| for(p = sl->hash_table[i]; p != NULL; p = p->next) { |
| if (p->len == 0) { |
| has_empty_string = TRUE; |
| } else { |
| tab[n++] = p; |
| } |
| } |
| } |
| assert(n <= sl->n_strings); |
| |
| rqsort(tab, n, sizeof(tab[0]), re_string_cmp_len, NULL); |
| |
| last_match_pos = -1; |
| for(i = 0; i < n; i++) { |
| p = tab[i]; |
| is_last = !has_empty_string && sl->cr.len == 0 && i == (n - 1); |
| if (!is_last) |
| split_pos = re_emit_op_u32(s, REOP_split_next_first, 0); |
| else |
| split_pos = 0; |
| for(j = 0; j < p->len; j++) { |
| re_emit_char(s, p->buf[j]); |
| } |
| if (!is_last) { |
| last_match_pos = re_emit_op_u32(s, REOP_goto, last_match_pos); |
| put_u32(s->byte_code.buf + split_pos, s->byte_code.size - (split_pos + 4)); |
| } |
| } |
| |
| if (sl->cr.len != 0) { |
| /* char range */ |
| is_last = !has_empty_string; |
| if (!is_last) |
| split_pos = re_emit_op_u32(s, REOP_split_next_first, 0); |
| else |
| split_pos = 0; /* not used */ |
| if (re_emit_range(s, &sl->cr)) { |
| lre_realloc(s->opaque, tab, 0); |
| return -1; |
| } |
| if (!is_last) |
| put_u32(s->byte_code.buf + split_pos, s->byte_code.size - (split_pos + 4)); |
| } |
| |
| /* patch the 'goto match' */ |
| while (last_match_pos != -1) { |
| int next_pos = get_u32(s->byte_code.buf + last_match_pos); |
| put_u32(s->byte_code.buf + last_match_pos, s->byte_code.size - (last_match_pos + 4)); |
| last_match_pos = next_pos; |
| } |
| |
| lre_realloc(s->opaque, tab, 0); |
| } |
| return 0; |
| } |
| |
| static int re_parse_nested_class(REParseState *s, REStringList *cr, const uint8_t **pp); |
| |
| static int re_parse_class_set_operand(REParseState *s, REStringList *cr, const uint8_t **pp) |
| { |
| int c1; |
| const uint8_t *p = *pp; |
| |
| if (*p == '[') { |
| if (re_parse_nested_class(s, cr, pp)) |
| return -1; |
| } else { |
| c1 = get_class_atom(s, cr, pp, TRUE); |
| if (c1 < 0) |
| return -1; |
| if (c1 < CLASS_RANGE_BASE) { |
| /* create a range with a single character */ |
| re_string_list_init(s, cr); |
| if (s->ignore_case) |
| c1 = lre_canonicalize(c1, s->is_unicode); |
| if (cr_union_interval(&cr->cr, c1, c1)) { |
| re_string_list_free(cr); |
| return -1; |
| } |
| } |
| } |
| return 0; |
| } |
| |
| static int re_parse_nested_class(REParseState *s, REStringList *cr, const uint8_t **pp) |
| { |
| const uint8_t *p; |
| uint32_t c1, c2; |
| int ret; |
| REStringList cr1_s, *cr1 = &cr1_s; |
| BOOL invert, is_first; |
| |
| if (lre_check_stack_overflow(s->opaque, 0)) |
| return re_parse_error(s, "stack overflow"); |
| |
| re_string_list_init(s, cr); |
| p = *pp; |
| p++; /* skip '[' */ |
| |
| invert = FALSE; |
| if (*p == '^') { |
| p++; |
| invert = TRUE; |
| } |
| |
| /* handle unions */ |
| is_first = TRUE; |
| for(;;) { |
| if (*p == ']') |
| break; |
| if (*p == '[' && s->unicode_sets) { |
| if (re_parse_nested_class(s, cr1, &p)) |
| goto fail; |
| goto class_union; |
| } else { |
| c1 = get_class_atom(s, cr1, &p, TRUE); |
| if ((int)c1 < 0) |
| goto fail; |
| if (*p == '-' && p[1] != ']') { |
| const uint8_t *p0 = p + 1; |
| if (p[1] == '-' && s->unicode_sets && is_first) |
| goto class_atom; /* first character class followed by '--' */ |
| if (c1 >= CLASS_RANGE_BASE) { |
| if (s->is_unicode) { |
| re_string_list_free(cr1); |
| goto invalid_class_range; |
| } |
| /* Annex B: match '-' character */ |
| goto class_atom; |
| } |
| c2 = get_class_atom(s, cr1, &p0, TRUE); |
| if ((int)c2 < 0) |
| goto fail; |
| if (c2 >= CLASS_RANGE_BASE) { |
| re_string_list_free(cr1); |
| if (s->is_unicode) { |
| goto invalid_class_range; |
| } |
| /* Annex B: match '-' character */ |
| goto class_atom; |
| } |
| p = p0; |
| if (c2 < c1) { |
| invalid_class_range: |
| re_parse_error(s, "invalid class range"); |
| goto fail; |
| } |
| if (s->ignore_case) { |
| CharRange cr2_s, *cr2 = &cr2_s; |
| cr_init(cr2, s->opaque, lre_realloc); |
| if (cr_add_interval(cr2, c1, c2 + 1) || |
| cr_regexp_canonicalize(cr2, s->is_unicode) || |
| cr_op1(&cr->cr, cr2->points, cr2->len, CR_OP_UNION)) { |
| cr_free(cr2); |
| goto memory_error; |
| } |
| cr_free(cr2); |
| } else { |
| if (cr_union_interval(&cr->cr, c1, c2)) |
| goto memory_error; |
| } |
| is_first = FALSE; /* union operation */ |
| } else { |
| class_atom: |
| if (c1 >= CLASS_RANGE_BASE) { |
| class_union: |
| ret = re_string_list_op(cr, cr1, CR_OP_UNION); |
| re_string_list_free(cr1); |
| if (ret) |
| goto memory_error; |
| } else { |
| if (s->ignore_case) |
| c1 = lre_canonicalize(c1, s->is_unicode); |
| if (cr_union_interval(&cr->cr, c1, c1)) |
| goto memory_error; |
| } |
| } |
| } |
| if (s->unicode_sets && is_first) { |
| if (*p == '&' && p[1] == '&' && p[2] != '&') { |
| /* handle '&&' */ |
| for(;;) { |
| if (*p == ']') { |
| break; |
| } else if (*p == '&' && p[1] == '&' && p[2] != '&') { |
| p += 2; |
| } else { |
| goto invalid_operation; |
| } |
| if (re_parse_class_set_operand(s, cr1, &p)) |
| goto fail; |
| ret = re_string_list_op(cr, cr1, CR_OP_INTER); |
| re_string_list_free(cr1); |
| if (ret) |
| goto memory_error; |
| } |
| } else if (*p == '-' && p[1] == '-') { |
| /* handle '--' */ |
| for(;;) { |
| if (*p == ']') { |
| break; |
| } else if (*p == '-' && p[1] == '-') { |
| p += 2; |
| } else { |
| invalid_operation: |
| re_parse_error(s, "invalid operation in regular expression"); |
| goto fail; |
| } |
| if (re_parse_class_set_operand(s, cr1, &p)) |
| goto fail; |
| ret = re_string_list_op(cr, cr1, CR_OP_SUB); |
| re_string_list_free(cr1); |
| if (ret) |
| goto memory_error; |
| } |
| } |
| } |
| is_first = FALSE; |
| } |
| |
| p++; /* skip ']' */ |
| *pp = p; |
| if (invert) { |
| /* XXX: add may_contain_string syntax check to be fully |
| compliant. The test here accepts more input than the |
| spec. */ |
| if (cr->n_strings != 0) { |
| re_parse_error(s, "negated character class with strings in regular expression debugger eval code"); |
| goto fail; |
| } |
| if (cr_invert(&cr->cr)) |
| goto memory_error; |
| } |
| return 0; |
| memory_error: |
| re_parse_out_of_memory(s); |
| fail: |
| re_string_list_free(cr); |
| return -1; |
| } |
| |
| static int re_parse_char_class(REParseState *s, const uint8_t **pp) |
| { |
| REStringList cr_s, *cr = &cr_s; |
| |
| if (re_parse_nested_class(s, cr, pp)) |
| return -1; |
| if (re_emit_string_list(s, cr)) |
| goto fail; |
| re_string_list_free(cr); |
| return 0; |
| fail: |
| re_string_list_free(cr); |
| return -1; |
| } |
| |
| /* Return: |
| - true if the opcodes may not advance the char pointer |
| - false if the opcodes always advance the char pointer |
| */ |
| static BOOL re_need_check_advance(const uint8_t *bc_buf, int bc_buf_len) |
| { |
| int pos, opcode, len; |
| uint32_t val; |
| BOOL ret; |
| |
| ret = TRUE; |
| pos = 0; |
| while (pos < bc_buf_len) { |
| opcode = bc_buf[pos]; |
| len = reopcode_info[opcode].size; |
| switch(opcode) { |
| case REOP_range: |
| case REOP_range_i: |
| val = get_u16(bc_buf + pos + 1); |
| len += val * 4; |
| goto simple_char; |
| case REOP_range32: |
| case REOP_range32_i: |
| val = get_u16(bc_buf + pos + 1); |
| len += val * 8; |
| goto simple_char; |
| case REOP_char: |
| case REOP_char_i: |
| case REOP_char32: |
| case REOP_char32_i: |
| case REOP_dot: |
| case REOP_any: |
| simple_char: |
| ret = FALSE; |
| break; |
| case REOP_line_start: |
| case REOP_line_start_m: |
| case REOP_line_end: |
| case REOP_line_end_m: |
| case REOP_push_i32: |
| case REOP_push_char_pos: |
| case REOP_drop: |
| case REOP_word_boundary: |
| case REOP_word_boundary_i: |
| case REOP_not_word_boundary: |
| case REOP_not_word_boundary_i: |
| case REOP_prev: |
| /* no effect */ |
| break; |
| case REOP_save_start: |
| case REOP_save_end: |
| case REOP_save_reset: |
| case REOP_back_reference: |
| case REOP_back_reference_i: |
| case REOP_backward_back_reference: |
| case REOP_backward_back_reference_i: |
| break; |
| default: |
| /* safe behavior: we cannot predict the outcome */ |
| return TRUE; |
| } |
| pos += len; |
| } |
| return ret; |
| } |
| |
| /* return -1 if a simple quantifier cannot be used. Otherwise return |
| the number of characters in the atom. */ |
| static int re_is_simple_quantifier(const uint8_t *bc_buf, int bc_buf_len) |
| { |
| int pos, opcode, len, count; |
| uint32_t val; |
| |
| count = 0; |
| pos = 0; |
| while (pos < bc_buf_len) { |
| opcode = bc_buf[pos]; |
| len = reopcode_info[opcode].size; |
| switch(opcode) { |
| case REOP_range: |
| case REOP_range_i: |
| val = get_u16(bc_buf + pos + 1); |
| len += val * 4; |
| goto simple_char; |
| case REOP_range32: |
| case REOP_range32_i: |
| val = get_u16(bc_buf + pos + 1); |
| len += val * 8; |
| goto simple_char; |
| case REOP_char: |
| case REOP_char_i: |
| case REOP_char32: |
| case REOP_char32_i: |
| case REOP_dot: |
| case REOP_any: |
| simple_char: |
| count++; |
| break; |
| case REOP_line_start: |
| case REOP_line_start_m: |
| case REOP_line_end: |
| case REOP_line_end_m: |
| case REOP_word_boundary: |
| case REOP_word_boundary_i: |
| case REOP_not_word_boundary: |
| case REOP_not_word_boundary_i: |
| break; |
| default: |
| return -1; |
| } |
| pos += len; |
| } |
| return count; |
| } |
| |
| /* '*pp' is the first char after '<' */ |
| static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp) |
| { |
| const uint8_t *p, *p1; |
| uint32_t c, d; |
| char *q; |
| |
| p = *pp; |
| q = buf; |
| for(;;) { |
| c = *p; |
| if (c == '\\') { |
| p++; |
| if (*p != 'u') |
| return -1; |
| c = lre_parse_escape(&p, 2); // accept surrogate pairs |
| } else if (c == '>') { |
| break; |
| } else if (c >= 128) { |
| c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p); |
| if (is_hi_surrogate(c)) { |
| d = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1); |
| if (is_lo_surrogate(d)) { |
| c = from_surrogate(c, d); |
| p = p1; |
| } |
| } |
| } else { |
| p++; |
| } |
| if (c > 0x10FFFF) |
| return -1; |
| if (q == buf) { |
| if (!lre_js_is_ident_first(c)) |
| return -1; |
| } else { |
| if (!lre_js_is_ident_next(c)) |
| return -1; |
| } |
| if ((q - buf + UTF8_CHAR_LEN_MAX + 1) > buf_size) |
| return -1; |
| if (c < 128) { |
| *q++ = c; |
| } else { |
| q += unicode_to_utf8((uint8_t*)q, c); |
| } |
| } |
| if (q == buf) |
| return -1; |
| *q = '\0'; |
| p++; |
| *pp = p; |
| return 0; |
| } |
| |
| /* if capture_name = NULL: return the number of captures + 1. |
| Otherwise, return the capture index corresponding to capture_name |
| or -1 if none */ |
| static int re_parse_captures(REParseState *s, int *phas_named_captures, |
| const char *capture_name) |
| { |
| const uint8_t *p; |
| int capture_index; |
| char name[TMP_BUF_SIZE]; |
| |
| capture_index = 1; |
| *phas_named_captures = 0; |
| for (p = s->buf_start; p < s->buf_end; p++) { |
| switch (*p) { |
| case '(': |
| if (p[1] == '?') { |
| if (p[2] == '<' && p[3] != '=' && p[3] != '!') { |
| *phas_named_captures = 1; |
| /* potential named capture */ |
| if (capture_name) { |
| p += 3; |
| if (re_parse_group_name(name, sizeof(name), &p) == 0) { |
| if (!strcmp(name, capture_name)) |
| return capture_index; |
| } |
| } |
| capture_index++; |
| if (capture_index >= CAPTURE_COUNT_MAX) |
| goto done; |
| } |
| } else { |
| capture_index++; |
| if (capture_index >= CAPTURE_COUNT_MAX) |
| goto done; |
| } |
| break; |
| case '\\': |
| p++; |
| break; |
| case '[': |
| for (p += 1 + (*p == ']'); p < s->buf_end && *p != ']'; p++) { |
| if (*p == '\\') |
| p++; |
| } |
| break; |
| } |
| } |
| done: |
| if (capture_name) |
| return -1; |
| else |
| return capture_index; |
| } |
| |
| static int re_count_captures(REParseState *s) |
| { |
| if (s->total_capture_count < 0) { |
| s->total_capture_count = re_parse_captures(s, &s->has_named_captures, |
| NULL); |
| } |
| return s->total_capture_count; |
| } |
| |
| static BOOL re_has_named_captures(REParseState *s) |
| { |
| if (s->has_named_captures < 0) |
| re_count_captures(s); |
| return s->has_named_captures; |
| } |
| |
| static int find_group_name(REParseState *s, const char *name) |
| { |
| const char *p, *buf_end; |
| size_t len, name_len; |
| int capture_index; |
| |
| p = (char *)s->group_names.buf; |
| if (!p) return -1; |
| buf_end = (char *)s->group_names.buf + s->group_names.size; |
| name_len = strlen(name); |
| capture_index = 1; |
| while (p < buf_end) { |
| len = strlen(p); |
| if (len == name_len && memcmp(name, p, name_len) == 0) |
| return capture_index; |
| p += len + 1; |
| capture_index++; |
| } |
| return -1; |
| } |
| |
| static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir); |
| |
| static int re_parse_modifiers(REParseState *s, const uint8_t **pp) |
| { |
| const uint8_t *p = *pp; |
| int mask = 0; |
| int val; |
| |
| for(;;) { |
| if (*p == 'i') { |
| val = LRE_FLAG_IGNORECASE; |
| } else if (*p == 'm') { |
| val = LRE_FLAG_MULTILINE; |
| } else if (*p == 's') { |
| val = LRE_FLAG_DOTALL; |
| } else { |
| break; |
| } |
| if (mask & val) |
| return re_parse_error(s, "duplicate modifier: '%c'", *p); |
| mask |= val; |
| p++; |
| } |
| *pp = p; |
| return mask; |
| } |
| |
| static BOOL update_modifier(BOOL val, int add_mask, int remove_mask, |
| int mask) |
| { |
| if (add_mask & mask) |
| val = TRUE; |
| if (remove_mask & mask) |
| val = FALSE; |
| return val; |
| } |
| |
| static int re_parse_term(REParseState *s, BOOL is_backward_dir) |
| { |
| const uint8_t *p; |
| int c, last_atom_start, quant_min, quant_max, last_capture_count; |
| BOOL greedy, add_zero_advance_check, is_neg, is_backward_lookahead; |
| REStringList cr_s, *cr = &cr_s; |
| |
| last_atom_start = -1; |
| last_capture_count = 0; |
| p = s->buf_ptr; |
| c = *p; |
| switch(c) { |
| case '^': |
| p++; |
| re_emit_op(s, s->multi_line ? REOP_line_start_m : REOP_line_start); |
| break; |
| case '$': |
| p++; |
| re_emit_op(s, s->multi_line ? REOP_line_end_m : REOP_line_end); |
| break; |
| case '.': |
| p++; |
| last_atom_start = s->byte_code.size; |
| last_capture_count = s->capture_count; |
| if (is_backward_dir) |
| re_emit_op(s, REOP_prev); |
| re_emit_op(s, s->dotall ? REOP_any : REOP_dot); |
| if (is_backward_dir) |
| re_emit_op(s, REOP_prev); |
| break; |
| case '{': |
| if (s->is_unicode) { |
| return re_parse_error(s, "syntax error"); |
| } else if (!is_digit(p[1])) { |
| /* Annex B: we accept '{' not followed by digits as a |
| normal atom */ |
| goto parse_class_atom; |
| } else { |
| const uint8_t *p1 = p + 1; |
| /* Annex B: error if it is like a repetition count */ |
| parse_digits(&p1, TRUE); |
| if (*p1 == ',') { |
| p1++; |
| if (is_digit(*p1)) { |
| parse_digits(&p1, TRUE); |
| } |
| } |
| if (*p1 != '}') { |
| goto parse_class_atom; |
| } |
| } |
| /* fall thru */ |
| case '*': |
| case '+': |
| case '?': |
| return re_parse_error(s, "nothing to repeat"); |
| case '(': |
| if (p[1] == '?') { |
| if (p[2] == ':') { |
| p += 3; |
| last_atom_start = s->byte_code.size; |
| last_capture_count = s->capture_count; |
| s->buf_ptr = p; |
| if (re_parse_disjunction(s, is_backward_dir)) |
| return -1; |
| p = s->buf_ptr; |
| if (re_parse_expect(s, &p, ')')) |
| return -1; |
| } else if (p[2] == 'i' || p[2] == 'm' || p[2] == 's' || p[2] == '-') { |
| BOOL saved_ignore_case, saved_multi_line, saved_dotall; |
| int add_mask, remove_mask; |
| p += 2; |
| remove_mask = 0; |
| add_mask = re_parse_modifiers(s, &p); |
| if (add_mask < 0) |
| return -1; |
| if (*p == '-') { |
| p++; |
| remove_mask = re_parse_modifiers(s, &p); |
| if (remove_mask < 0) |
| return -1; |
| } |
| if ((add_mask == 0 && remove_mask == 0) || |
| (add_mask & remove_mask) != 0) { |
| return re_parse_error(s, "invalid modifiers"); |
| } |
| if (re_parse_expect(s, &p, ':')) |
| return -1; |
| saved_ignore_case = s->ignore_case; |
| saved_multi_line = s->multi_line; |
| saved_dotall = s->dotall; |
| s->ignore_case = update_modifier(s->ignore_case, add_mask, remove_mask, LRE_FLAG_IGNORECASE); |
| s->multi_line = update_modifier(s->multi_line, add_mask, remove_mask, LRE_FLAG_MULTILINE); |
| s->dotall = update_modifier(s->dotall, add_mask, remove_mask, LRE_FLAG_DOTALL); |
| |
| last_atom_start = s->byte_code.size; |
| last_capture_count = s->capture_count; |
| s->buf_ptr = p; |
| if (re_parse_disjunction(s, is_backward_dir)) |
| return -1; |
| p = s->buf_ptr; |
| if (re_parse_expect(s, &p, ')')) |
| return -1; |
| s->ignore_case = saved_ignore_case; |
| s->multi_line = saved_multi_line; |
| s->dotall = saved_dotall; |
| } else if ((p[2] == '=' || p[2] == '!')) { |
| is_neg = (p[2] == '!'); |
| is_backward_lookahead = FALSE; |
| p += 3; |
| goto lookahead; |
| } else if (p[2] == '<' && |
| (p[3] == '=' || p[3] == '!')) { |
| int pos; |
| is_neg = (p[3] == '!'); |
| is_backward_lookahead = TRUE; |
| p += 4; |
| /* lookahead */ |
| lookahead: |
| /* Annex B allows lookahead to be used as an atom for |
| the quantifiers */ |
| if (!s->is_unicode && !is_backward_lookahead) { |
| last_atom_start = s->byte_code.size; |
| last_capture_count = s->capture_count; |
| } |
| pos = re_emit_op_u32(s, REOP_lookahead + is_neg, 0); |
| s->buf_ptr = p; |
| if (re_parse_disjunction(s, is_backward_lookahead)) |
| return -1; |
| p = s->buf_ptr; |
| if (re_parse_expect(s, &p, ')')) |
| return -1; |
| re_emit_op(s, REOP_match); |
| /* jump after the 'match' after the lookahead is successful */ |
| if (dbuf_error(&s->byte_code)) |
| return -1; |
| put_u32(s->byte_code.buf + pos, s->byte_code.size - (pos + 4)); |
| } else if (p[2] == '<') { |
| p += 3; |
| if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf), |
| &p)) { |
| return re_parse_error(s, "invalid group name"); |
| } |
| if (find_group_name(s, s->u.tmp_buf) > 0) { |
| return re_parse_error(s, "duplicate group name"); |
| } |
| /* group name with a trailing zero */ |
| dbuf_put(&s->group_names, (uint8_t *)s->u.tmp_buf, |
| strlen(s->u.tmp_buf) + 1); |
| s->has_named_captures = 1; |
| goto parse_capture; |
| } else { |
| return re_parse_error(s, "invalid group"); |
| } |
| } else { |
| int capture_index; |
| p++; |
| /* capture without group name */ |
| dbuf_putc(&s->group_names, 0); |
| parse_capture: |
| if (s->capture_count >= CAPTURE_COUNT_MAX) |
| return re_parse_error(s, "too many captures"); |
| last_atom_start = s->byte_code.size; |
| last_capture_count = s->capture_count; |
| capture_index = s->capture_count++; |
| re_emit_op_u8(s, REOP_save_start + is_backward_dir, |
| capture_index); |
| |
| s->buf_ptr = p; |
| if (re_parse_disjunction(s, is_backward_dir)) |
| return -1; |
| p = s->buf_ptr; |
| |
| re_emit_op_u8(s, REOP_save_start + 1 - is_backward_dir, |
| capture_index); |
| |
| if (re_parse_expect(s, &p, ')')) |
| return -1; |
| } |
| break; |
| case '\\': |
| switch(p[1]) { |
| case 'b': |
| case 'B': |
| if (p[1] != 'b') { |
| re_emit_op(s, s->ignore_case ? REOP_not_word_boundary_i : REOP_not_word_boundary); |
| } else { |
| re_emit_op(s, s->ignore_case ? REOP_word_boundary_i : REOP_word_boundary); |
| } |
| p += 2; |
| break; |
| case 'k': |
| { |
| const uint8_t *p1; |
| int dummy_res; |
| |
| p1 = p; |
| if (p1[2] != '<') { |
| /* annex B: we tolerate invalid group names in non |
| unicode mode if there is no named capture |
| definition */ |
| if (s->is_unicode || re_has_named_captures(s)) |
| return re_parse_error(s, "expecting group name"); |
| else |
| goto parse_class_atom; |
| } |
| p1 += 3; |
| if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf), |
| &p1)) { |
| if (s->is_unicode || re_has_named_captures(s)) |
| return re_parse_error(s, "invalid group name"); |
| else |
| goto parse_class_atom; |
| } |
| c = find_group_name(s, s->u.tmp_buf); |
| if (c < 0) { |
| /* no capture name parsed before, try to look |
| after (inefficient, but hopefully not common */ |
| c = re_parse_captures(s, &dummy_res, s->u.tmp_buf); |
| if (c < 0) { |
| if (s->is_unicode || re_has_named_captures(s)) |
| return re_parse_error(s, "group name not defined"); |
| else |
| goto parse_class_atom; |
| } |
| } |
| p = p1; |
| } |
| goto emit_back_reference; |
| case '0': |
| p += 2; |
| c = 0; |
| if (s->is_unicode) { |
| if (is_digit(*p)) { |
| return re_parse_error(s, "invalid decimal escape in regular expression"); |
| } |
| } else { |
| /* Annex B.1.4: accept legacy octal */ |
| if (*p >= '0' && *p <= '7') { |
| c = *p++ - '0'; |
| if (*p >= '0' && *p <= '7') { |
| c = (c << 3) + *p++ - '0'; |
| } |
| } |
| } |
| goto normal_char; |
| case '1': case '2': case '3': case '4': |
| case '5': case '6': case '7': case '8': |
| case '9': |
| { |
| const uint8_t *q = ++p; |
| |
| c = parse_digits(&p, FALSE); |
| if (c < 0 || (c >= s->capture_count && c >= re_count_captures(s))) { |
| if (!s->is_unicode) { |
| /* Annex B.1.4: accept legacy octal */ |
| p = q; |
| if (*p <= '7') { |
| c = 0; |
| if (*p <= '3') |
| c = *p++ - '0'; |
| if (*p >= '0' && *p <= '7') { |
| c = (c << 3) + *p++ - '0'; |
| if (*p >= '0' && *p <= '7') { |
| c = (c << 3) + *p++ - '0'; |
| } |
| } |
| } else { |
| c = *p++; |
| } |
| goto normal_char; |
| } |
| return re_parse_error(s, "back reference out of range in regular expression"); |
| } |
| emit_back_reference: |
| last_atom_start = s->byte_code.size; |
| last_capture_count = s->capture_count; |
| |
| re_emit_op_u8(s, REOP_back_reference + 2 * is_backward_dir + s->ignore_case, c); |
| } |
| break; |
| default: |
| goto parse_class_atom; |
| } |
| break; |
| case '[': |
| last_atom_start = s->byte_code.size; |
| last_capture_count = s->capture_count; |
| if (is_backward_dir) |
| re_emit_op(s, REOP_prev); |
| if (re_parse_char_class(s, &p)) |
| return -1; |
| if (is_backward_dir) |
| re_emit_op(s, REOP_prev); |
| break; |
| case ']': |
| case '}': |
| if (s->is_unicode) |
| return re_parse_error(s, "syntax error"); |
| goto parse_class_atom; |
| default: |
| parse_class_atom: |
| c = get_class_atom(s, cr, &p, FALSE); |
| if ((int)c < 0) |
| return -1; |
| normal_char: |
| last_atom_start = s->byte_code.size; |
| last_capture_count = s->capture_count; |
| if (is_backward_dir) |
| re_emit_op(s, REOP_prev); |
| if (c >= CLASS_RANGE_BASE) { |
| int ret; |
| ret = re_emit_string_list(s, cr); |
| re_string_list_free(cr); |
| if (ret) |
| return -1; |
| } else { |
| if (s->ignore_case) |
| c = lre_canonicalize(c, s->is_unicode); |
| re_emit_char(s, c); |
| } |
| if (is_backward_dir) |
| re_emit_op(s, REOP_prev); |
| break; |
| } |
| |
| /* quantifier */ |
| if (last_atom_start >= 0) { |
| c = *p; |
| switch(c) { |
| case '*': |
| p++; |
| quant_min = 0; |
| quant_max = INT32_MAX; |
| goto quantifier; |
| case '+': |
| p++; |
| quant_min = 1; |
| quant_max = INT32_MAX; |
| goto quantifier; |
| case '?': |
| p++; |
| quant_min = 0; |
| quant_max = 1; |
| goto quantifier; |
| case '{': |
| { |
| const uint8_t *p1 = p; |
| /* As an extension (see ES6 annex B), we accept '{' not |
| followed by digits as a normal atom */ |
| if (!is_digit(p[1])) { |
| if (s->is_unicode) |
| goto invalid_quant_count; |
| break; |
| } |
| p++; |
| quant_min = parse_digits(&p, TRUE); |
| quant_max = quant_min; |
| if (*p == ',') { |
| p++; |
| if (is_digit(*p)) { |
| quant_max = parse_digits(&p, TRUE); |
| if (quant_max < quant_min) { |
| invalid_quant_count: |
| return re_parse_error(s, "invalid repetition count"); |
| } |
| } else { |
| quant_max = INT32_MAX; /* infinity */ |
| } |
| } |
| if (*p != '}' && !s->is_unicode) { |
| /* Annex B: normal atom if invalid '{' syntax */ |
| p = p1; |
| break; |
| } |
| if (re_parse_expect(s, &p, '}')) |
| return -1; |
| } |
| quantifier: |
| greedy = TRUE; |
| if (*p == '?') { |
| p++; |
| greedy = FALSE; |
| } |
| if (last_atom_start < 0) { |
| return re_parse_error(s, "nothing to repeat"); |
| } |
| if (greedy) { |
| int len, pos; |
| |
| if (quant_max > 0) { |
| /* specific optimization for simple quantifiers */ |
| if (dbuf_error(&s->byte_code)) |
| goto out_of_memory; |
| len = re_is_simple_quantifier(s->byte_code.buf + last_atom_start, |
| s->byte_code.size - last_atom_start); |
| if (len > 0) { |
| re_emit_op(s, REOP_match); |
| |
| if (dbuf_insert(&s->byte_code, last_atom_start, 17)) |
| goto out_of_memory; |
| pos = last_atom_start; |
| s->byte_code.buf[pos++] = REOP_simple_greedy_quant; |
| put_u32(&s->byte_code.buf[pos], |
| s->byte_code.size - last_atom_start - 17); |
| pos += 4; |
| put_u32(&s->byte_code.buf[pos], quant_min); |
| pos += 4; |
| put_u32(&s->byte_code.buf[pos], quant_max); |
| pos += 4; |
| put_u32(&s->byte_code.buf[pos], len); |
| pos += 4; |
| goto done; |
| } |
| } |
| |
| if (dbuf_error(&s->byte_code)) |
| goto out_of_memory; |
| } |
| /* the spec tells that if there is no advance when |
| running the atom after the first quant_min times, |
| then there is no match. We remove this test when we |
| are sure the atom always advances the position. */ |
| add_zero_advance_check = re_need_check_advance(s->byte_code.buf + last_atom_start, |
| s->byte_code.size - last_atom_start); |
| |
| { |
| int len, pos; |
| len = s->byte_code.size - last_atom_start; |
| if (quant_min == 0) { |
| /* need to reset the capture in case the atom is |
| not executed */ |
| if (last_capture_count != s->capture_count) { |
| if (dbuf_insert(&s->byte_code, last_atom_start, 3)) |
| goto out_of_memory; |
| s->byte_code.buf[last_atom_start++] = REOP_save_reset; |
| s->byte_code.buf[last_atom_start++] = last_capture_count; |
| s->byte_code.buf[last_atom_start++] = s->capture_count - 1; |
| } |
| if (quant_max == 0) { |
| s->byte_code.size = last_atom_start; |
| } else if (quant_max == 1 || quant_max == INT32_MAX) { |
| BOOL has_goto = (quant_max == INT32_MAX); |
| if (dbuf_insert(&s->byte_code, last_atom_start, 5 + add_zero_advance_check)) |
| goto out_of_memory; |
| s->byte_code.buf[last_atom_start] = REOP_split_goto_first + |
| greedy; |
| put_u32(s->byte_code.buf + last_atom_start + 1, |
| len + 5 * has_goto + add_zero_advance_check * 2); |
| if (add_zero_advance_check) { |
| s->byte_code.buf[last_atom_start + 1 + 4] = REOP_push_char_pos; |
| re_emit_op(s, REOP_check_advance); |
| } |
| if (has_goto) |
| re_emit_goto(s, REOP_goto, last_atom_start); |
| } else { |
| if (dbuf_insert(&s->byte_code, last_atom_start, 10 + add_zero_advance_check)) |
| goto out_of_memory; |
| pos = last_atom_start; |
| s->byte_code.buf[pos++] = REOP_push_i32; |
| put_u32(s->byte_code.buf + pos, quant_max); |
| pos += 4; |
| s->byte_code.buf[pos++] = REOP_split_goto_first + greedy; |
| put_u32(s->byte_code.buf + pos, len + 5 + add_zero_advance_check * 2); |
| pos += 4; |
| if (add_zero_advance_check) { |
| s->byte_code.buf[pos++] = REOP_push_char_pos; |
| re_emit_op(s, REOP_check_advance); |
| } |
| re_emit_goto(s, REOP_loop, last_atom_start + 5); |
| re_emit_op(s, REOP_drop); |
| } |
| } else if (quant_min == 1 && quant_max == INT32_MAX && |
| !add_zero_advance_check) { |
| re_emit_goto(s, REOP_split_next_first - greedy, |
| last_atom_start); |
| } else { |
| if (quant_min == 1) { |
| /* nothing to add */ |
| } else { |
| if (dbuf_insert(&s->byte_code, last_atom_start, 5)) |
| goto out_of_memory; |
| s->byte_code.buf[last_atom_start] = REOP_push_i32; |
| put_u32(s->byte_code.buf + last_atom_start + 1, |
| quant_min); |
| last_atom_start += 5; |
| re_emit_goto(s, REOP_loop, last_atom_start); |
| re_emit_op(s, REOP_drop); |
| } |
| if (quant_max == INT32_MAX) { |
| pos = s->byte_code.size; |
| re_emit_op_u32(s, REOP_split_goto_first + greedy, |
| len + 5 + add_zero_advance_check * 2); |
| if (add_zero_advance_check) |
| re_emit_op(s, REOP_push_char_pos); |
| /* copy the atom */ |
| dbuf_put_self(&s->byte_code, last_atom_start, len); |
| if (add_zero_advance_check) |
| re_emit_op(s, REOP_check_advance); |
| re_emit_goto(s, REOP_goto, pos); |
| } else if (quant_max > quant_min) { |
| re_emit_op_u32(s, REOP_push_i32, quant_max - quant_min); |
| pos = s->byte_code.size; |
| re_emit_op_u32(s, REOP_split_goto_first + greedy, |
| len + 5 + add_zero_advance_check * 2); |
| if (add_zero_advance_check) |
| re_emit_op(s, REOP_push_char_pos); |
| /* copy the atom */ |
| dbuf_put_self(&s->byte_code, last_atom_start, len); |
| if (add_zero_advance_check) |
| re_emit_op(s, REOP_check_advance); |
| re_emit_goto(s, REOP_loop, pos); |
| re_emit_op(s, REOP_drop); |
| } |
| } |
| last_atom_start = -1; |
| } |
| break; |
| default: |
| break; |
| } |
| } |
| done: |
| s->buf_ptr = p; |
| return 0; |
| out_of_memory: |
| return re_parse_out_of_memory(s); |
| } |
| |
| static int re_parse_alternative(REParseState *s, BOOL is_backward_dir) |
| { |
| const uint8_t *p; |
| int ret; |
| size_t start, term_start, end, term_size; |
| |
| start = s->byte_code.size; |
| for(;;) { |
| p = s->buf_ptr; |
| if (p >= s->buf_end) |
| break; |
| if (*p == '|' || *p == ')') |
| break; |
| term_start = s->byte_code.size; |
| ret = re_parse_term(s, is_backward_dir); |
| if (ret) |
| return ret; |
| if (is_backward_dir) { |
| /* reverse the order of the terms (XXX: inefficient, but |
| speed is not really critical here) */ |
| end = s->byte_code.size; |
| term_size = end - term_start; |
| if (dbuf_realloc(&s->byte_code, end + term_size)) |
| return -1; |
| memmove(s->byte_code.buf + start + term_size, |
| s->byte_code.buf + start, |
| end - start); |
| memcpy(s->byte_code.buf + start, s->byte_code.buf + end, |
| term_size); |
| } |
| } |
| return 0; |
| } |
| |
| static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir) |
| { |
| int start, len, pos; |
| |
| if (lre_check_stack_overflow(s->opaque, 0)) |
| return re_parse_error(s, "stack overflow"); |
| |
| start = s->byte_code.size; |
| if (re_parse_alternative(s, is_backward_dir)) |
| return -1; |
| while (*s->buf_ptr == '|') { |
| s->buf_ptr++; |
| |
| len = s->byte_code.size - start; |
| |
| /* insert a split before the first alternative */ |
| if (dbuf_insert(&s->byte_code, start, 5)) { |
| return re_parse_out_of_memory(s); |
| } |
| s->byte_code.buf[start] = REOP_split_next_first; |
| put_u32(s->byte_code.buf + start + 1, len + 5); |
| |
| pos = re_emit_op_u32(s, REOP_goto, 0); |
| |
| if (re_parse_alternative(s, is_backward_dir)) |
| return -1; |
| |
| /* patch the goto */ |
| len = s->byte_code.size - (pos + 4); |
| put_u32(s->byte_code.buf + pos, len); |
| } |
| return 0; |
| } |
| |
| /* the control flow is recursive so the analysis can be linear */ |
| static int compute_stack_size(const uint8_t *bc_buf, int bc_buf_len) |
| { |
| int stack_size, stack_size_max, pos, opcode, len; |
| uint32_t val; |
| |
| stack_size = 0; |
| stack_size_max = 0; |
| bc_buf += RE_HEADER_LEN; |
| bc_buf_len -= RE_HEADER_LEN; |
| pos = 0; |
| while (pos < bc_buf_len) { |
| opcode = bc_buf[pos]; |
| len = reopcode_info[opcode].size; |
| assert(opcode < REOP_COUNT); |
| assert((pos + len) <= bc_buf_len); |
| switch(opcode) { |
| case REOP_push_i32: |
| case REOP_push_char_pos: |
| stack_size++; |
| if (stack_size > stack_size_max) { |
| if (stack_size > STACK_SIZE_MAX) |
| return -1; |
| stack_size_max = stack_size; |
| } |
| break; |
| case REOP_drop: |
| case REOP_check_advance: |
| assert(stack_size > 0); |
| stack_size--; |
| break; |
| case REOP_range: |
| case REOP_range_i: |
| val = get_u16(bc_buf + pos + 1); |
| len += val * 4; |
| break; |
| case REOP_range32: |
| case REOP_range32_i: |
| val = get_u16(bc_buf + pos + 1); |
| len += val * 8; |
| break; |
| } |
| pos += len; |
| } |
| return stack_size_max; |
| } |
| |
| static void *lre_bytecode_realloc(void *opaque, void *ptr, size_t size) |
| { |
| if (size > (INT32_MAX / 2)) { |
| /* the bytecode cannot be larger than 2G. Leave some slack to |
| avoid some overflows. */ |
| return NULL; |
| } else { |
| return lre_realloc(opaque, ptr, size); |
| } |
| } |
| |
| /* 'buf' must be a zero terminated UTF-8 string of length buf_len. |
| Return NULL if error and allocate an error message in *perror_msg, |
| otherwise the compiled bytecode and its length in plen. |
| */ |
| uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size, |
| const char *buf, size_t buf_len, int re_flags, |
| void *opaque) |
| { |
| REParseState s_s, *s = &s_s; |
| int stack_size; |
| BOOL is_sticky; |
| |
| memset(s, 0, sizeof(*s)); |
| s->opaque = opaque; |
| s->buf_ptr = (const uint8_t *)buf; |
| s->buf_end = s->buf_ptr + buf_len; |
| s->buf_start = s->buf_ptr; |
| s->re_flags = re_flags; |
| s->is_unicode = ((re_flags & (LRE_FLAG_UNICODE | LRE_FLAG_UNICODE_SETS)) != 0); |
| is_sticky = ((re_flags & LRE_FLAG_STICKY) != 0); |
| s->ignore_case = ((re_flags & LRE_FLAG_IGNORECASE) != 0); |
| s->multi_line = ((re_flags & LRE_FLAG_MULTILINE) != 0); |
| s->dotall = ((re_flags & LRE_FLAG_DOTALL) != 0); |
| s->unicode_sets = ((re_flags & LRE_FLAG_UNICODE_SETS) != 0); |
| s->capture_count = 1; |
| s->total_capture_count = -1; |
| s->has_named_captures = -1; |
| |
| dbuf_init2(&s->byte_code, opaque, lre_bytecode_realloc); |
| dbuf_init2(&s->group_names, opaque, lre_realloc); |
| |
| dbuf_put_u16(&s->byte_code, re_flags); /* first element is the flags */ |
| dbuf_putc(&s->byte_code, 0); /* second element is the number of captures */ |
| dbuf_putc(&s->byte_code, 0); /* stack size */ |
| dbuf_put_u32(&s->byte_code, 0); /* bytecode length */ |
| |
| if (!is_sticky) { |
| /* iterate thru all positions (about the same as .*?( ... ) ) |
| . We do it without an explicit loop so that lock step |
| thread execution will be possible in an optimized |
| implementation */ |
| re_emit_op_u32(s, REOP_split_goto_first, 1 + 5); |
| re_emit_op(s, REOP_any); |
| re_emit_op_u32(s, REOP_goto, -(5 + 1 + 5)); |
| } |
| re_emit_op_u8(s, REOP_save_start, 0); |
| |
| if (re_parse_disjunction(s, FALSE)) { |
| error: |
| dbuf_free(&s->byte_code); |
| dbuf_free(&s->group_names); |
| pstrcpy(error_msg, error_msg_size, s->u.error_msg); |
| *plen = 0; |
| return NULL; |
| } |
| |
| re_emit_op_u8(s, REOP_save_end, 0); |
| |
| re_emit_op(s, REOP_match); |
| |
| if (*s->buf_ptr != '\0') { |
| re_parse_error(s, "extraneous characters at the end"); |
| goto error; |
| } |
| |
| if (dbuf_error(&s->byte_code)) { |
| re_parse_out_of_memory(s); |
| goto error; |
| } |
| |
| stack_size = compute_stack_size(s->byte_code.buf, s->byte_code.size); |
| if (stack_size < 0) { |
| re_parse_error(s, "too many imbricated quantifiers"); |
| goto error; |
| } |
| |
| s->byte_code.buf[RE_HEADER_CAPTURE_COUNT] = s->capture_count; |
| s->byte_code.buf[RE_HEADER_STACK_SIZE] = stack_size; |
| put_u32(s->byte_code.buf + RE_HEADER_BYTECODE_LEN, |
| s->byte_code.size - RE_HEADER_LEN); |
| |
| /* add the named groups if needed */ |
| if (s->group_names.size > (s->capture_count - 1)) { |
| dbuf_put(&s->byte_code, s->group_names.buf, s->group_names.size); |
| put_u16(s->byte_code.buf + RE_HEADER_FLAGS, |
| lre_get_flags(s->byte_code.buf) | LRE_FLAG_NAMED_GROUPS); |
| } |
| dbuf_free(&s->group_names); |
| |
| #ifdef DUMP_REOP |
| lre_dump_bytecode(s->byte_code.buf, s->byte_code.size); |
| #endif |
| |
| error_msg[0] = '\0'; |
| *plen = s->byte_code.size; |
| return s->byte_code.buf; |
| } |
| |
| static BOOL is_line_terminator(uint32_t c) |
| { |
| return (c == '\n' || c == '\r' || c == CP_LS || c == CP_PS); |
| } |
| |
| static BOOL is_word_char(uint32_t c) |
| { |
| return ((c >= '0' && c <= '9') || |
| (c >= 'a' && c <= 'z') || |
| (c >= 'A' && c <= 'Z') || |
| (c == '_')); |
| } |
| |
| #define GET_CHAR(c, cptr, cbuf_end, cbuf_type) \ |
| do { \ |
| if (cbuf_type == 0) { \ |
| c = *cptr++; \ |
| } else { \ |
| const uint16_t *_p = (const uint16_t *)cptr; \ |
| const uint16_t *_end = (const uint16_t *)cbuf_end; \ |
| c = *_p++; \ |
| if (is_hi_surrogate(c) && cbuf_type == 2) { \ |
| if (_p < _end && is_lo_surrogate(*_p)) { \ |
| c = from_surrogate(c, *_p++); \ |
| } \ |
| } \ |
| cptr = (const void *)_p; \ |
| } \ |
| } while (0) |
| |
| #define PEEK_CHAR(c, cptr, cbuf_end, cbuf_type) \ |
| do { \ |
| if (cbuf_type == 0) { \ |
| c = cptr[0]; \ |
| } else { \ |
| const uint16_t *_p = (const uint16_t *)cptr; \ |
| const uint16_t *_end = (const uint16_t *)cbuf_end; \ |
| c = *_p++; \ |
| if (is_hi_surrogate(c) && cbuf_type == 2) { \ |
| if (_p < _end && is_lo_surrogate(*_p)) { \ |
| c = from_surrogate(c, *_p); \ |
| } \ |
| } \ |
| } \ |
| } while (0) |
| |
| #define PEEK_PREV_CHAR(c, cptr, cbuf_start, cbuf_type) \ |
| do { \ |
| if (cbuf_type == 0) { \ |
| c = cptr[-1]; \ |
| } else { \ |
| const uint16_t *_p = (const uint16_t *)cptr - 1; \ |
| const uint16_t *_start = (const uint16_t *)cbuf_start; \ |
| c = *_p; \ |
| if (is_lo_surrogate(c) && cbuf_type == 2) { \ |
| if (_p > _start && is_hi_surrogate(_p[-1])) { \ |
| c = from_surrogate(*--_p, c); \ |
| } \ |
| } \ |
| } \ |
| } while (0) |
| |
| #define GET_PREV_CHAR(c, cptr, cbuf_start, cbuf_type) \ |
| do { \ |
| if (cbuf_type == 0) { \ |
| cptr--; \ |
| c = cptr[0]; \ |
| } else { \ |
| const uint16_t *_p = (const uint16_t *)cptr - 1; \ |
| const uint16_t *_start = (const uint16_t *)cbuf_start; \ |
| c = *_p; \ |
| if (is_lo_surrogate(c) && cbuf_type == 2) { \ |
| if (_p > _start && is_hi_surrogate(_p[-1])) { \ |
| c = from_surrogate(*--_p, c); \ |
| } \ |
| } \ |
| cptr = (const void *)_p; \ |
| } \ |
| } while (0) |
| |
| #define PREV_CHAR(cptr, cbuf_start, cbuf_type) \ |
| do { \ |
| if (cbuf_type == 0) { \ |
| cptr--; \ |
| } else { \ |
| const uint16_t *_p = (const uint16_t *)cptr - 1; \ |
| const uint16_t *_start = (const uint16_t *)cbuf_start; \ |
| if (is_lo_surrogate(*_p) && cbuf_type == 2) { \ |
| if (_p > _start && is_hi_surrogate(_p[-1])) { \ |
| --_p; \ |
| } \ |
| } \ |
| cptr = (const void *)_p; \ |
| } \ |
| } while (0) |
| |
| typedef uintptr_t StackInt; |
| |
| typedef enum { |
| RE_EXEC_STATE_SPLIT, |
| RE_EXEC_STATE_LOOKAHEAD, |
| RE_EXEC_STATE_NEGATIVE_LOOKAHEAD, |
| RE_EXEC_STATE_GREEDY_QUANT, |
| } REExecStateEnum; |
| |
| typedef struct REExecState { |
| REExecStateEnum type : 8; |
| uint8_t stack_len; |
| size_t count; /* only used for RE_EXEC_STATE_GREEDY_QUANT */ |
| const uint8_t *cptr; |
| const uint8_t *pc; |
| void *buf[0]; |
| } REExecState; |
| |
| typedef struct { |
| const uint8_t *cbuf; |
| const uint8_t *cbuf_end; |
| /* 0 = 8 bit chars, 1 = 16 bit chars, 2 = 16 bit chars, UTF-16 */ |
| int cbuf_type; |
| int capture_count; |
| int stack_size_max; |
| BOOL is_unicode; |
| int interrupt_counter; |
| void *opaque; /* used for stack overflow check */ |
| |
| size_t state_size; |
| uint8_t *state_stack; |
| size_t state_stack_size; |
| size_t state_stack_len; |
| } REExecContext; |
| |
| static int push_state(REExecContext *s, |
| uint8_t **capture, |
| StackInt *stack, size_t stack_len, |
| const uint8_t *pc, const uint8_t *cptr, |
| REExecStateEnum type, size_t count) |
| { |
| REExecState *rs; |
| uint8_t *new_stack; |
| size_t new_size, i, n; |
| StackInt *stack_buf; |
| |
| if (unlikely((s->state_stack_len + 1) > s->state_stack_size)) { |
| /* reallocate the stack */ |
| new_size = s->state_stack_size * 3 / 2; |
| if (new_size < 8) |
| new_size = 8; |
| new_stack = lre_realloc(s->opaque, s->state_stack, new_size * s->state_size); |
| if (!new_stack) |
| return -1; |
| s->state_stack_size = new_size; |
| s->state_stack = new_stack; |
| } |
| rs = (REExecState *)(s->state_stack + s->state_stack_len * s->state_size); |
| s->state_stack_len++; |
| rs->type = type; |
| rs->count = count; |
| rs->stack_len = stack_len; |
| rs->cptr = cptr; |
| rs->pc = pc; |
| n = 2 * s->capture_count; |
| for(i = 0; i < n; i++) |
| rs->buf[i] = capture[i]; |
| stack_buf = (StackInt *)(rs->buf + n); |
| for(i = 0; i < stack_len; i++) |
| stack_buf[i] = stack[i]; |
| return 0; |
| } |
| |
| static int lre_poll_timeout(REExecContext *s) |
| { |
| if (unlikely(--s->interrupt_counter <= 0)) { |
| s->interrupt_counter = INTERRUPT_COUNTER_INIT; |
| if (lre_check_timeout(s->opaque)) |
| return LRE_RET_TIMEOUT; |
| } |
| return 0; |
| } |
| |
| /* return 1 if match, 0 if not match or < 0 if error. */ |
| static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, |
| StackInt *stack, int stack_len, |
| const uint8_t *pc, const uint8_t *cptr, |
| BOOL no_recurse) |
| { |
| int opcode, ret; |
| int cbuf_type; |
| uint32_t val, c; |
| const uint8_t *cbuf_end; |
| |
| cbuf_type = s->cbuf_type; |
| cbuf_end = s->cbuf_end; |
| |
| for(;;) { |
| // printf("top=%p: pc=%d\n", th_list.top, (int)(pc - (bc_buf + RE_HEADER_LEN))); |
| opcode = *pc++; |
| switch(opcode) { |
| case REOP_match: |
| { |
| REExecState *rs; |
| if (no_recurse) |
| return (intptr_t)cptr; |
| ret = 1; |
| goto recurse; |
| no_match: |
| if (no_recurse) |
| return 0; |
| ret = 0; |
| recurse: |
| for(;;) { |
| if (lre_poll_timeout(s)) |
| return LRE_RET_TIMEOUT; |
| if (s->state_stack_len == 0) |
| return ret; |
| rs = (REExecState *)(s->state_stack + |
| (s->state_stack_len - 1) * s->state_size); |
| if (rs->type == RE_EXEC_STATE_SPLIT) { |
| if (!ret) { |
| pop_state: |
| memcpy(capture, rs->buf, |
| sizeof(capture[0]) * 2 * s->capture_count); |
| pop_state1: |
| pc = rs->pc; |
| cptr = rs->cptr; |
| stack_len = rs->stack_len; |
| memcpy(stack, rs->buf + 2 * s->capture_count, |
| stack_len * sizeof(stack[0])); |
| s->state_stack_len--; |
| break; |
| } |
| } else if (rs->type == RE_EXEC_STATE_GREEDY_QUANT) { |
| if (!ret) { |
| uint32_t char_count, i; |
| memcpy(capture, rs->buf, |
| sizeof(capture[0]) * 2 * s->capture_count); |
| stack_len = rs->stack_len; |
| memcpy(stack, rs->buf + 2 * s->capture_count, |
| stack_len * sizeof(stack[0])); |
| pc = rs->pc; |
| cptr = rs->cptr; |
| /* go backward */ |
| char_count = get_u32(pc + 12); |
| for(i = 0; i < char_count; i++) { |
| PREV_CHAR(cptr, s->cbuf, cbuf_type); |
| } |
| pc = (pc + 16) + (int)get_u32(pc); |
| rs->cptr = cptr; |
| rs->count--; |
| if (rs->count == 0) { |
| s->state_stack_len--; |
| } |
| break; |
| } |
| } else { |
| ret = ((rs->type == RE_EXEC_STATE_LOOKAHEAD && ret) || |
| (rs->type == RE_EXEC_STATE_NEGATIVE_LOOKAHEAD && !ret)); |
| if (ret) { |
| /* keep the capture in case of positive lookahead */ |
| if (rs->type == RE_EXEC_STATE_LOOKAHEAD) |
| goto pop_state1; |
| else |
| goto pop_state; |
| } |
| } |
| s->state_stack_len--; |
| } |
| } |
| break; |
| case REOP_char32: |
| case REOP_char32_i: |
| val = get_u32(pc); |
| pc += 4; |
| goto test_char; |
| case REOP_char: |
| case REOP_char_i: |
| val = get_u16(pc); |
| pc += 2; |
| test_char: |
| if (cptr >= cbuf_end) |
| goto no_match; |
| GET_CHAR(c, cptr, cbuf_end, cbuf_type); |
| if (opcode == REOP_char_i || opcode == REOP_char32_i) { |
| c = lre_canonicalize(c, s->is_unicode); |
| } |
| if (val != c) |
| goto no_match; |
| break; |
| case REOP_split_goto_first: |
| case REOP_split_next_first: |
| { |
| const uint8_t *pc1; |
| |
| val = get_u32(pc); |
| pc += 4; |
| if (opcode == REOP_split_next_first) { |
| pc1 = pc + (int)val; |
| } else { |
| pc1 = pc; |
| pc = pc + (int)val; |
| } |
| ret = push_state(s, capture, stack, stack_len, |
| pc1, cptr, RE_EXEC_STATE_SPLIT, 0); |
| if (ret < 0) |
| return LRE_RET_MEMORY_ERROR; |
| break; |
| } |
| case REOP_lookahead: |
| case REOP_negative_lookahead: |
| val = get_u32(pc); |
| pc += 4; |
| ret = push_state(s, capture, stack, stack_len, |
| pc + (int)val, cptr, |
| RE_EXEC_STATE_LOOKAHEAD + opcode - REOP_lookahead, |
| 0); |
| if (ret < 0) |
| return LRE_RET_MEMORY_ERROR; |
| break; |
| |
| case REOP_goto: |
| val = get_u32(pc); |
| pc += 4 + (int)val; |
| if (lre_poll_timeout(s)) |
| return LRE_RET_TIMEOUT; |
| break; |
| case REOP_line_start: |
| case REOP_line_start_m: |
| if (cptr == s->cbuf) |
| break; |
| if (opcode == REOP_line_start) |
| goto no_match; |
| PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type); |
| if (!is_line_terminator(c)) |
| goto no_match; |
| break; |
| case REOP_line_end: |
| case REOP_line_end_m: |
| if (cptr == cbuf_end) |
| break; |
| if (opcode == REOP_line_end) |
| goto no_match; |
| PEEK_CHAR(c, cptr, cbuf_end, cbuf_type); |
| if (!is_line_terminator(c)) |
| goto no_match; |
| break; |
| case REOP_dot: |
| if (cptr == cbuf_end) |
| goto no_match; |
| GET_CHAR(c, cptr, cbuf_end, cbuf_type); |
| if (is_line_terminator(c)) |
| goto no_match; |
| break; |
| case REOP_any: |
| if (cptr == cbuf_end) |
| goto no_match; |
| GET_CHAR(c, cptr, cbuf_end, cbuf_type); |
| break; |
| case REOP_save_start: |
| case REOP_save_end: |
| val = *pc++; |
| assert(val < s->capture_count); |
| capture[2 * val + opcode - REOP_save_start] = (uint8_t *)cptr; |
| break; |
| case REOP_save_reset: |
| { |
| uint32_t val2; |
| val = pc[0]; |
| val2 = pc[1]; |
| pc += 2; |
| assert(val2 < s->capture_count); |
| while (val <= val2) { |
| capture[2 * val] = NULL; |
| capture[2 * val + 1] = NULL; |
| val++; |
| } |
| } |
| break; |
| case REOP_push_i32: |
| val = get_u32(pc); |
| pc += 4; |
| stack[stack_len++] = val; |
| break; |
| case REOP_drop: |
| stack_len--; |
| break; |
| case REOP_loop: |
| val = get_u32(pc); |
| pc += 4; |
| if (--stack[stack_len - 1] != 0) { |
| pc += (int)val; |
| if (lre_poll_timeout(s)) |
| return LRE_RET_TIMEOUT; |
| } |
| break; |
| case REOP_push_char_pos: |
| stack[stack_len++] = (uintptr_t)cptr; |
| break; |
| case REOP_check_advance: |
| if (stack[--stack_len] == (uintptr_t)cptr) |
| goto no_match; |
| break; |
| case REOP_word_boundary: |
| case REOP_word_boundary_i: |
| case REOP_not_word_boundary: |
| case REOP_not_word_boundary_i: |
| { |
| BOOL v1, v2; |
| int ignore_case = (opcode == REOP_word_boundary_i || opcode == REOP_not_word_boundary_i); |
| BOOL is_boundary = (opcode == REOP_word_boundary || opcode == REOP_word_boundary_i); |
| /* char before */ |
| if (cptr == s->cbuf) { |
| v1 = FALSE; |
| } else { |
| PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type); |
| if (ignore_case) |
| c = lre_canonicalize(c, s->is_unicode); |
| v1 = is_word_char(c); |
| } |
| /* current char */ |
| if (cptr >= cbuf_end) { |
| v2 = FALSE; |
| } else { |
| PEEK_CHAR(c, cptr, cbuf_end, cbuf_type); |
| if (ignore_case) |
| c = lre_canonicalize(c, s->is_unicode); |
| v2 = is_word_char(c); |
| } |
| if (v1 ^ v2 ^ is_boundary) |
| goto no_match; |
| } |
| break; |
| case REOP_back_reference: |
| case REOP_back_reference_i: |
| case REOP_backward_back_reference: |
| case REOP_backward_back_reference_i: |
| { |
| const uint8_t *cptr1, *cptr1_end, *cptr1_start; |
| uint32_t c1, c2; |
| |
| val = *pc++; |
| if (val >= s->capture_count) |
| goto no_match; |
| cptr1_start = capture[2 * val]; |
| cptr1_end = capture[2 * val + 1]; |
| if (!cptr1_start || !cptr1_end) |
| break; |
| if (opcode == REOP_back_reference || |
| opcode == REOP_back_reference_i) { |
| cptr1 = cptr1_start; |
| while (cptr1 < cptr1_end) { |
| if (cptr >= cbuf_end) |
| goto no_match; |
| GET_CHAR(c1, cptr1, cptr1_end, cbuf_type); |
| GET_CHAR(c2, cptr, cbuf_end, cbuf_type); |
| if (opcode == REOP_back_reference_i) { |
| c1 = lre_canonicalize(c1, s->is_unicode); |
| c2 = lre_canonicalize(c2, s->is_unicode); |
| } |
| if (c1 != c2) |
| goto no_match; |
| } |
| } else { |
| cptr1 = cptr1_end; |
| while (cptr1 > cptr1_start) { |
| if (cptr == s->cbuf) |
| goto no_match; |
| GET_PREV_CHAR(c1, cptr1, cptr1_start, cbuf_type); |
| GET_PREV_CHAR(c2, cptr, s->cbuf, cbuf_type); |
| if (opcode == REOP_backward_back_reference_i) { |
| c1 = lre_canonicalize(c1, s->is_unicode); |
| c2 = lre_canonicalize(c2, s->is_unicode); |
| } |
| if (c1 != c2) |
| goto no_match; |
| } |
| } |
| } |
| break; |
| case REOP_range: |
| case REOP_range_i: |
| { |
| int n; |
| uint32_t low, high, idx_min, idx_max, idx; |
| |
| n = get_u16(pc); /* n must be >= 1 */ |
| pc += 2; |
| if (cptr >= cbuf_end) |
| goto no_match; |
| GET_CHAR(c, cptr, cbuf_end, cbuf_type); |
| if (opcode == REOP_range_i) { |
| c = lre_canonicalize(c, s->is_unicode); |
| } |
| idx_min = 0; |
| low = get_u16(pc + 0 * 4); |
| if (c < low) |
| goto no_match; |
| idx_max = n - 1; |
| high = get_u16(pc + idx_max * 4 + 2); |
| /* 0xffff in for last value means +infinity */ |
| if (unlikely(c >= 0xffff) && high == 0xffff) |
| goto range_match; |
| if (c > high) |
| goto no_match; |
| while (idx_min <= idx_max) { |
| idx = (idx_min + idx_max) / 2; |
| low = get_u16(pc + idx * 4); |
| high = get_u16(pc + idx * 4 + 2); |
| if (c < low) |
| idx_max = idx - 1; |
| else if (c > high) |
| idx_min = idx + 1; |
| else |
| goto range_match; |
| } |
| goto no_match; |
| range_match: |
| pc += 4 * n; |
| } |
| break; |
| case REOP_range32: |
| case REOP_range32_i: |
| { |
| int n; |
| uint32_t low, high, idx_min, idx_max, idx; |
| |
| n = get_u16(pc); /* n must be >= 1 */ |
| pc += 2; |
| if (cptr >= cbuf_end) |
| goto no_match; |
| GET_CHAR(c, cptr, cbuf_end, cbuf_type); |
| if (opcode == REOP_range32_i) { |
| c = lre_canonicalize(c, s->is_unicode); |
| } |
| idx_min = 0; |
| low = get_u32(pc + 0 * 8); |
| if (c < low) |
| goto no_match; |
| idx_max = n - 1; |
| high = get_u32(pc + idx_max * 8 + 4); |
| if (c > high) |
| goto no_match; |
| while (idx_min <= idx_max) { |
| idx = (idx_min + idx_max) / 2; |
| low = get_u32(pc + idx * 8); |
| high = get_u32(pc + idx * 8 + 4); |
| if (c < low) |
| idx_max = idx - 1; |
| else if (c > high) |
| idx_min = idx + 1; |
| else |
| goto range32_match; |
| } |
| goto no_match; |
| range32_match: |
| pc += 8 * n; |
| } |
| break; |
| case REOP_prev: |
| /* go to the previous char */ |
| if (cptr == s->cbuf) |
| goto no_match; |
| PREV_CHAR(cptr, s->cbuf, cbuf_type); |
| break; |
| case REOP_simple_greedy_quant: |
| { |
| uint32_t next_pos, quant_min, quant_max; |
| size_t q; |
| intptr_t res; |
| const uint8_t *pc1; |
| |
| next_pos = get_u32(pc); |
| quant_min = get_u32(pc + 4); |
| quant_max = get_u32(pc + 8); |
| pc += 16; |
| pc1 = pc; |
| pc += (int)next_pos; |
| |
| q = 0; |
| for(;;) { |
| if (lre_poll_timeout(s)) |
| return LRE_RET_TIMEOUT; |
| res = lre_exec_backtrack(s, capture, stack, stack_len, |
| pc1, cptr, TRUE); |
| if (res == LRE_RET_MEMORY_ERROR || |
| res == LRE_RET_TIMEOUT) |
| return res; |
| if (!res) |
| break; |
| cptr = (uint8_t *)res; |
| q++; |
| if (q >= quant_max && quant_max != INT32_MAX) |
| break; |
| } |
| if (q < quant_min) |
| goto no_match; |
| if (q > quant_min) { |
| /* will examine all matches down to quant_min */ |
| ret = push_state(s, capture, stack, stack_len, |
| pc1 - 16, cptr, |
| RE_EXEC_STATE_GREEDY_QUANT, |
| q - quant_min); |
| if (ret < 0) |
| return LRE_RET_MEMORY_ERROR; |
| } |
| } |
| break; |
| default: |
| abort(); |
| } |
| } |
| } |
| |
| /* Return 1 if match, 0 if not match or < 0 if error (see LRE_RET_x). cindex is the |
| starting position of the match and must be such as 0 <= cindex <= |
| clen. */ |
| int lre_exec(uint8_t **capture, |
| const uint8_t *bc_buf, const uint8_t *cbuf, int cindex, int clen, |
| int cbuf_type, void *opaque) |
| { |
| REExecContext s_s, *s = &s_s; |
| int re_flags, i, alloca_size, ret; |
| StackInt *stack_buf; |
| const uint8_t *cptr; |
| |
| re_flags = lre_get_flags(bc_buf); |
| s->is_unicode = (re_flags & (LRE_FLAG_UNICODE | LRE_FLAG_UNICODE_SETS)) != 0; |
| s->capture_count = bc_buf[RE_HEADER_CAPTURE_COUNT]; |
| s->stack_size_max = bc_buf[RE_HEADER_STACK_SIZE]; |
| s->cbuf = cbuf; |
| s->cbuf_end = cbuf + (clen << cbuf_type); |
| s->cbuf_type = cbuf_type; |
| if (s->cbuf_type == 1 && s->is_unicode) |
| s->cbuf_type = 2; |
| s->interrupt_counter = INTERRUPT_COUNTER_INIT; |
| s->opaque = opaque; |
| |
| s->state_size = sizeof(REExecState) + |
| s->capture_count * sizeof(capture[0]) * 2 + |
| s->stack_size_max * sizeof(stack_buf[0]); |
| s->state_stack = NULL; |
| s->state_stack_len = 0; |
| s->state_stack_size = 0; |
| |
| for(i = 0; i < s->capture_count * 2; i++) |
| capture[i] = NULL; |
| alloca_size = s->stack_size_max * sizeof(stack_buf[0]); |
| stack_buf = alloca(alloca_size); |
| |
| cptr = cbuf + (cindex << cbuf_type); |
| if (0 < cindex && cindex < clen && s->cbuf_type == 2) { |
| const uint16_t *p = (const uint16_t *)cptr; |
| if (is_lo_surrogate(*p) && is_hi_surrogate(p[-1])) { |
| cptr = (const uint8_t *)(p - 1); |
| } |
| } |
| |
| ret = lre_exec_backtrack(s, capture, stack_buf, 0, bc_buf + RE_HEADER_LEN, |
| cptr, FALSE); |
| lre_realloc(s->opaque, s->state_stack, 0); |
| return ret; |
| } |
| |
| int lre_get_capture_count(const uint8_t *bc_buf) |
| { |
| return bc_buf[RE_HEADER_CAPTURE_COUNT]; |
| } |
| |
| int lre_get_flags(const uint8_t *bc_buf) |
| { |
| return get_u16(bc_buf + RE_HEADER_FLAGS); |
| } |
| |
| /* Return NULL if no group names. Otherwise, return a pointer to |
| 'capture_count - 1' zero terminated UTF-8 strings. */ |
| const char *lre_get_groupnames(const uint8_t *bc_buf) |
| { |
| uint32_t re_bytecode_len; |
| if ((lre_get_flags(bc_buf) & LRE_FLAG_NAMED_GROUPS) == 0) |
| return NULL; |
| re_bytecode_len = get_u32(bc_buf + RE_HEADER_BYTECODE_LEN); |
| return (const char *)(bc_buf + RE_HEADER_LEN + re_bytecode_len); |
| } |
| |
| #ifdef TEST |
| |
| BOOL lre_check_stack_overflow(void *opaque, size_t alloca_size) |
| { |
| return FALSE; |
| } |
| |
| void *lre_realloc(void *opaque, void *ptr, size_t size) |
| { |
| return realloc(ptr, size); |
| } |
| |
| int main(int argc, char **argv) |
| { |
| int len, flags, ret, i; |
| uint8_t *bc; |
| char error_msg[64]; |
| uint8_t *capture[CAPTURE_COUNT_MAX * 2]; |
| const char *input; |
| int input_len, capture_count; |
| |
| if (argc < 4) { |
| printf("usage: %s regexp flags input\n", argv[0]); |
| return 1; |
| } |
| flags = atoi(argv[2]); |
| bc = lre_compile(&len, error_msg, sizeof(error_msg), argv[1], |
| strlen(argv[1]), flags, NULL); |
| if (!bc) { |
| fprintf(stderr, "error: %s\n", error_msg); |
| exit(1); |
| } |
| |
| input = argv[3]; |
| input_len = strlen(input); |
| |
| ret = lre_exec(capture, bc, (uint8_t *)input, 0, input_len, 0, NULL); |
| printf("ret=%d\n", ret); |
| if (ret == 1) { |
| capture_count = lre_get_capture_count(bc); |
| for(i = 0; i < 2 * capture_count; i++) { |
| uint8_t *ptr; |
| ptr = capture[i]; |
| printf("%d: ", i); |
| if (!ptr) |
| printf("<nil>"); |
| else |
| printf("%u", (int)(ptr - (uint8_t *)input)); |
| printf("\n"); |
| } |
| } |
| return 0; |
| } |
| #endif |