| /*------------------------------------------------------------------------- |
| * |
| * regprefix.c |
| * Extract a common prefix, if any, from a compiled regex. |
| * |
| * |
| * Portions Copyright (c) 2012-2023, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1998, 1999 Henry Spencer |
| * |
| * IDENTIFICATION |
| * src/backend/regex/regprefix.c |
| * |
| *------------------------------------------------------------------------- |
| */ |
| |
| #include "regex/regguts.h" |
| |
| |
| /* |
| * forward declarations |
| */ |
| static int findprefix(struct cnfa *cnfa, struct colormap *cm, |
| chr *string, size_t *slength); |
| |
| |
| /* |
| * pg_regprefix - get common prefix for regular expression |
| * |
| * Returns one of: |
| * REG_NOMATCH: there is no common prefix of strings matching the regex |
| * REG_PREFIX: there is a common prefix of strings matching the regex |
| * REG_EXACT: all strings satisfying the regex must match the same string |
| * or a REG_XXX error code |
| * |
| * In the non-failure cases, *string is set to a palloc'd string containing |
| * the common prefix or exact value, of length *slength (measured in chrs |
| * not bytes!). |
| * |
| * This function does not analyze all complex cases (such as lookaround |
| * constraints) exactly. Therefore it is possible that some strings matching |
| * the reported prefix or exact-match string do not satisfy the regex. But |
| * it should never be the case that a string satisfying the regex does not |
| * match the reported prefix or exact-match string. |
| */ |
| int |
| pg_regprefix(regex_t *re, |
| chr **string, |
| size_t *slength) |
| { |
| struct guts *g; |
| struct cnfa *cnfa; |
| int st; |
| |
| /* sanity checks */ |
| if (string == NULL || slength == NULL) |
| return REG_INVARG; |
| *string = NULL; /* initialize for failure cases */ |
| *slength = 0; |
| if (re == NULL || re->re_magic != REMAGIC) |
| return REG_INVARG; |
| if (re->re_csize != sizeof(chr)) |
| return REG_MIXED; |
| |
| /* Initialize locale-dependent support */ |
| pg_set_regex_collation(re->re_collation); |
| |
| /* setup */ |
| g = (struct guts *) re->re_guts; |
| if (g->info & REG_UIMPOSSIBLE) |
| return REG_NOMATCH; |
| |
| /* |
| * This implementation considers only the search NFA for the topmost regex |
| * tree node. Therefore, constraints such as backrefs are not fully |
| * applied, which is allowed per the function's API spec. |
| */ |
| assert(g->tree != NULL); |
| cnfa = &g->tree->cnfa; |
| |
| /* matchall NFAs never have a fixed prefix */ |
| if (cnfa->flags & MATCHALL) |
| return REG_NOMATCH; |
| |
| /* |
| * Since a correct NFA should never contain any exit-free loops, it should |
| * not be possible for our traversal to return to a previously visited NFA |
| * state. Hence we need at most nstates chrs in the output string. |
| */ |
| *string = (chr *) MALLOC(cnfa->nstates * sizeof(chr)); |
| if (*string == NULL) |
| return REG_ESPACE; |
| |
| /* do it */ |
| st = findprefix(cnfa, &g->cmap, *string, slength); |
| |
| assert(*slength <= cnfa->nstates); |
| |
| /* clean up */ |
| if (st != REG_PREFIX && st != REG_EXACT) |
| { |
| FREE(*string); |
| *string = NULL; |
| *slength = 0; |
| } |
| |
| return st; |
| } |
| |
| /* |
| * findprefix - extract common prefix from cNFA |
| * |
| * Results are returned into the preallocated chr array string[], with |
| * *slength (which must be preset to zero) incremented for each chr. |
| */ |
| static int /* regprefix return code */ |
| findprefix(struct cnfa *cnfa, |
| struct colormap *cm, |
| chr *string, |
| size_t *slength) |
| { |
| int st; |
| int nextst; |
| color thiscolor; |
| chr c; |
| struct carc *ca; |
| |
| /* |
| * The "pre" state must have only BOS/BOL outarcs, else pattern isn't |
| * anchored left. If we have both BOS and BOL, they must go to the same |
| * next state. |
| */ |
| st = cnfa->pre; |
| nextst = -1; |
| for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) |
| { |
| if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1]) |
| { |
| if (nextst == -1) |
| nextst = ca->to; |
| else if (nextst != ca->to) |
| return REG_NOMATCH; |
| } |
| else |
| return REG_NOMATCH; |
| } |
| if (nextst == -1) |
| return REG_NOMATCH; |
| |
| /* |
| * Scan through successive states, stopping as soon as we find one with |
| * more than one acceptable transition character (either multiple colors |
| * on out-arcs, or a color with more than one member chr). |
| * |
| * We could find a state with multiple out-arcs that are all labeled with |
| * the same singleton color; this comes from patterns like "^ab(cde|cxy)". |
| * In that case we add the chr "c" to the output string but then exit the |
| * loop with nextst == -1. This leaves a little bit on the table: if the |
| * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added |
| * to the prefix. But chasing multiple parallel state chains doesn't seem |
| * worth the trouble. |
| */ |
| do |
| { |
| st = nextst; |
| nextst = -1; |
| thiscolor = COLORLESS; |
| for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) |
| { |
| /* We can ignore BOS/BOL arcs */ |
| if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1]) |
| continue; |
| |
| /* |
| * ... but EOS/EOL arcs terminate the search, as do RAINBOW arcs |
| * and LACONs |
| */ |
| if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1] || |
| ca->co == RAINBOW || ca->co >= cnfa->ncolors) |
| { |
| thiscolor = COLORLESS; |
| break; |
| } |
| if (thiscolor == COLORLESS) |
| { |
| /* First plain outarc */ |
| thiscolor = ca->co; |
| nextst = ca->to; |
| } |
| else if (thiscolor == ca->co) |
| { |
| /* Another plain outarc for same color */ |
| nextst = -1; |
| } |
| else |
| { |
| /* More than one plain outarc color terminates the search */ |
| thiscolor = COLORLESS; |
| break; |
| } |
| } |
| /* Done if we didn't find exactly one color on plain outarcs */ |
| if (thiscolor == COLORLESS) |
| break; |
| /* The color must be a singleton */ |
| if (cm->cd[thiscolor].nschrs != 1) |
| break; |
| /* Must not have any high-color-map entries */ |
| if (cm->cd[thiscolor].nuchrs != 0) |
| break; |
| |
| /* |
| * Identify the color's sole member chr and add it to the prefix |
| * string. In general the colormap data structure doesn't provide a |
| * way to find color member chrs, except by trying GETCOLOR() on each |
| * possible chr value, which won't do at all. However, for the cases |
| * we care about it should be sufficient to test the "firstchr" value, |
| * that is the first chr ever added to the color. There are cases |
| * where this might no longer be a member of the color (so we do need |
| * to test), but none of them are likely to arise for a character that |
| * is a member of a common prefix. If we do hit such a corner case, |
| * we just fall out without adding anything to the prefix string. |
| */ |
| c = cm->cd[thiscolor].firstchr; |
| if (GETCOLOR(cm, c) != thiscolor) |
| break; |
| |
| string[(*slength)++] = c; |
| |
| /* Advance to next state, but only if we have a unique next state */ |
| } while (nextst != -1); |
| |
| /* |
| * If we ended at a state that only has EOS/EOL outarcs leading to the |
| * "post" state, then we have an exact-match string. Note this is true |
| * even if the string is of zero length. |
| */ |
| nextst = -1; |
| for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) |
| { |
| if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1]) |
| { |
| if (nextst == -1) |
| nextst = ca->to; |
| else if (nextst != ca->to) |
| { |
| nextst = -1; |
| break; |
| } |
| } |
| else |
| { |
| nextst = -1; |
| break; |
| } |
| } |
| if (nextst == cnfa->post) |
| return REG_EXACT; |
| |
| /* |
| * Otherwise, if we were unable to identify any prefix characters, say |
| * NOMATCH --- the pattern is anchored left, but doesn't specify any |
| * particular first character. |
| */ |
| if (*slength > 0) |
| return REG_PREFIX; |
| |
| return REG_NOMATCH; |
| } |