| /*------------------------------------------------------------------------- |
| * |
| * trgm_regexp.c |
| * Regular expression matching using trigrams. |
| * |
| * The general idea of trigram index support for a regular expression (regex) |
| * search is to transform the regex into a logical expression on trigrams. |
| * For example: |
| * |
| * (ab|cd)efg => ((abe & bef) | (cde & def)) & efg |
| * |
| * If a string matches the regex, then it must match the logical expression on |
| * trigrams. The opposite is not necessarily true, however: a string that |
| * matches the logical expression might not match the original regex. Such |
| * false positives are removed via recheck, by running the regular regex match |
| * operator on the retrieved heap tuple. |
| * |
| * Since the trigram expression involves both AND and OR operators, we can't |
| * expect the core index machinery to evaluate it completely. Instead, the |
| * result of regex analysis is a list of trigrams to be sought in the index, |
| * plus a simplified graph that is used by trigramsMatchGraph() to determine |
| * whether a particular indexed value matches the expression. |
| * |
| * Converting a regex to a trigram expression is based on analysis of an |
| * automaton corresponding to the regex. The algorithm consists of four |
| * stages: |
| * |
| * 1) Compile the regexp to NFA form. This is handled by the PostgreSQL |
| * regexp library, which provides accessors for its opaque regex_t struct |
| * to expose the NFA state graph and the "colors" (sets of equivalent |
| * characters) used as state transition labels. |
| * |
| * 2) Transform the original NFA into an expanded graph, where arcs |
| * are labeled with trigrams that must be present in order to move from |
| * one state to another via the arcs. The trigrams used in this stage |
| * consist of colors, not characters, as in the original NFA. |
| * |
| * 3) Expand the color trigrams into regular trigrams consisting of |
| * characters. If too many distinct trigrams are produced, trigrams are |
| * eliminated and the graph is simplified until it's simple enough. |
| * |
| * 4) Finally, the resulting graph is packed into a TrgmPackedGraph struct, |
| * and returned to the caller. |
| * |
| * 1) Compile the regexp to NFA form |
| * --------------------------------- |
| * The automaton returned by the regexp compiler is a graph where vertices |
| * are "states" and arcs are labeled with colors. Each color represents |
| * a set of characters, so that all characters assigned to the same color |
| * are interchangeable, so far as matching the regexp is concerned. There |
| * are two special states: "initial" and "final". A state can have multiple |
| * outgoing arcs labeled with the same color, which makes the automaton |
| * non-deterministic, because it can be in many states simultaneously. |
| * |
| * Note that this NFA is already lossy compared to the original regexp, |
| * since it ignores some regex features such as lookahead constraints and |
| * backref matching. This is OK for our purposes since it's still the case |
| * that only strings matching the NFA can possibly satisfy the regexp. |
| * |
| * 2) Transform the original NFA into an expanded graph |
| * ---------------------------------------------------- |
| * In the 2nd stage, the automaton is transformed into a graph based on the |
| * original NFA. Each state in the expanded graph represents a state from |
| * the original NFA, plus a prefix identifying the last two characters |
| * (colors, to be precise) seen before entering the state. There can be |
| * multiple states in the expanded graph for each state in the original NFA, |
| * depending on what characters can precede it. A prefix position can be |
| * "unknown" if it's uncertain what the preceding character was, or "blank" |
| * if the character was a non-word character (we don't need to distinguish |
| * which non-word character it was, so just think of all of them as blanks). |
| * |
| * For convenience in description, call an expanded-state identifier |
| * (two prefix colors plus a state number from the original NFA) an |
| * "enter key". |
| * |
| * Each arc of the expanded graph is labeled with a trigram that must be |
| * present in the string to match. We can construct this from an out-arc of |
| * the underlying NFA state by combining the expanded state's prefix with the |
| * color label of the underlying out-arc, if neither prefix position is |
| * "unknown". But note that some of the colors in the trigram might be |
| * "blank". This is OK since we want to generate word-boundary trigrams as |
| * the regular trigram machinery would, if we know that some word characters |
| * must be adjacent to a word boundary in all strings matching the NFA. |
| * |
| * The expanded graph can also have fewer states than the original NFA, |
| * because we don't bother to make a separate state entry unless the state |
| * is reachable by a valid arc. When an enter key is reachable from a state |
| * of the expanded graph, but we do not know a complete trigram associated |
| * with that transition, we cannot make a valid arc; instead we insert the |
| * enter key into the enterKeys list of the source state. This effectively |
| * means that the two expanded states are not reliably distinguishable based |
| * on examining trigrams. |
| * |
| * So the expanded graph resembles the original NFA, but the arcs are |
| * labeled with trigrams instead of individual characters, and there may be |
| * more or fewer states. It is a lossy representation of the original NFA: |
| * any string that matches the original regexp must match the expanded graph, |
| * but the reverse is not true. |
| * |
| * We build the expanded graph through a breadth-first traversal of states |
| * reachable from the initial state. At each reachable state, we identify the |
| * states reachable from it without traversing a predictable trigram, and add |
| * those states' enter keys to the current state. Then we generate all |
| * out-arcs leading out of this collection of states that have predictable |
| * trigrams, adding their target states to the queue of states to examine. |
| * |
| * When building the graph, if the number of states or arcs exceed pre-defined |
| * limits, we give up and simply mark any states not yet processed as final |
| * states. Roughly speaking, that means that we make use of some portion from |
| * the beginning of the regexp. Also, any colors that have too many member |
| * characters are treated as "unknown", so that we can't derive trigrams |
| * from them. |
| * |
| * 3) Expand the color trigrams into regular trigrams |
| * -------------------------------------------------- |
| * The trigrams in the expanded graph are "color trigrams", consisting |
| * of three consecutive colors that must be present in the string. But for |
| * search, we need regular trigrams consisting of characters. In the 3rd |
| * stage, the color trigrams are expanded into regular trigrams. Since each |
| * color can represent many characters, the total number of regular trigrams |
| * after expansion could be very large. Because searching the index for |
| * thousands of trigrams would be slow, and would likely produce so many |
| * false positives that we would have to traverse a large fraction of the |
| * index, the graph is simplified further in a lossy fashion by removing |
| * color trigrams. When a color trigram is removed, the states connected by |
| * any arcs labeled with that trigram are merged. |
| * |
| * Trigrams do not all have equivalent value for searching: some of them are |
| * more frequent and some of them are less frequent. Ideally, we would like |
| * to know the distribution of trigrams, but we don't. But because of padding |
| * we know for sure that the empty character is more frequent than others, |
| * so we can penalize trigrams according to presence of whitespace. The |
| * penalty assigned to each color trigram is the number of simple trigrams |
| * it would produce, times the penalties[] multiplier associated with its |
| * whitespace content. (The penalties[] constants were calculated by analysis |
| * of some real-life text.) We eliminate color trigrams starting with the |
| * highest-penalty one, until we get to a total penalty of no more than |
| * WISH_TRGM_PENALTY. However, we cannot remove a color trigram if that would |
| * lead to merging the initial and final states, so we may not be able to |
| * reach WISH_TRGM_PENALTY. It's still okay so long as we have no more than |
| * MAX_TRGM_COUNT simple trigrams in total, otherwise we fail. |
| * |
| * 4) Pack the graph into a compact representation |
| * ----------------------------------------------- |
| * The 2nd and 3rd stages might have eliminated or merged many of the states |
| * and trigrams created earlier, so in this final stage, the graph is |
| * compacted and packed into a simpler struct that contains only the |
| * information needed to evaluate it. |
| * |
| * ALGORITHM EXAMPLE: |
| * |
| * Consider the example regex "ab[cd]". This regex is transformed into the |
| * following NFA (for simplicity we show colors as their single members): |
| * |
| * 4# |
| * c/ |
| * a b / |
| * 1* --- 2 ---- 3 |
| * \ |
| * d\ |
| * 5# |
| * |
| * We use * to mark initial state and # to mark final state. It's not depicted, |
| * but states 1, 4, 5 have self-referencing arcs for all possible characters, |
| * because this pattern can match to any part of a string. |
| * |
| * As the result of stage 2 we will have the following graph: |
| * |
| * abc abd |
| * 2# <---- 1* ----> 3# |
| * |
| * The process for generating this graph is: |
| * 1) Create state 1 with enter key (UNKNOWN, UNKNOWN, 1). |
| * 2) Add key (UNKNOWN, "a", 2) to state 1. |
| * 3) Add key ("a", "b", 3) to state 1. |
| * 4) Create new state 2 with enter key ("b", "c", 4). Add an arc |
| * from state 1 to state 2 with label trigram "abc". |
| * 5) Mark state 2 final because state 4 of source NFA is marked as final. |
| * 6) Create new state 3 with enter key ("b", "d", 5). Add an arc |
| * from state 1 to state 3 with label trigram "abd". |
| * 7) Mark state 3 final because state 5 of source NFA is marked as final. |
| * |
| * |
| * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * IDENTIFICATION |
| * contrib/pg_trgm/trgm_regexp.c |
| * |
| *------------------------------------------------------------------------- |
| */ |
| #include "postgres.h" |
| |
| #include "regex/regexport.h" |
| #include "trgm.h" |
| #include "tsearch/ts_locale.h" |
| #include "utils/hsearch.h" |
| #include "utils/memutils.h" |
| #include "varatt.h" |
| |
| /* |
| * Uncomment (or use -DTRGM_REGEXP_DEBUG) to print debug info, |
| * for exploring and debugging the algorithm implementation. |
| * This produces three graph files in /tmp, in Graphviz .gv format. |
| * Some progress information is also printed to postmaster stderr. |
| */ |
| /* #define TRGM_REGEXP_DEBUG */ |
| |
| /* |
| * These parameters are used to limit the amount of work done. |
| * Otherwise regex processing could be too slow and memory-consuming. |
| * |
| * MAX_EXPANDED_STATES - How many states we allow in expanded graph |
| * MAX_EXPANDED_ARCS - How many arcs we allow in expanded graph |
| * MAX_TRGM_COUNT - How many simple trigrams we allow to be extracted |
| * WISH_TRGM_PENALTY - Maximum desired sum of color trigram penalties |
| * COLOR_COUNT_LIMIT - Maximum number of characters per color |
| */ |
| #define MAX_EXPANDED_STATES 128 |
| #define MAX_EXPANDED_ARCS 1024 |
| #define MAX_TRGM_COUNT 256 |
| #define WISH_TRGM_PENALTY 16 |
| #define COLOR_COUNT_LIMIT 256 |
| |
| /* |
| * Penalty multipliers for trigram counts depending on whitespace contents. |
| * Numbers based on analysis of real-life texts. |
| */ |
| static const float4 penalties[8] = { |
| 1.0f, /* "aaa" */ |
| 3.5f, /* "aa " */ |
| 0.0f, /* "a a" (impossible) */ |
| 0.0f, /* "a " (impossible) */ |
| 4.2f, /* " aa" */ |
| 2.1f, /* " a " */ |
| 25.0f, /* " a" */ |
| 0.0f /* " " (impossible) */ |
| }; |
| |
| /* Struct representing a single pg_wchar, converted back to multibyte form */ |
| typedef struct |
| { |
| char bytes[MAX_MULTIBYTE_CHAR_LEN]; |
| } trgm_mb_char; |
| |
| /* |
| * Attributes of NFA colors: |
| * |
| * expandable - we know the character expansion of this color |
| * containsNonWord - color contains non-word characters |
| * (which will not be extracted into trigrams) |
| * wordCharsCount - count of word characters in color |
| * wordChars - array of this color's word characters |
| * (which can be extracted into trigrams) |
| * |
| * When expandable is false, the other attributes don't matter; we just |
| * assume this color represents unknown character(s). |
| */ |
| typedef struct |
| { |
| bool expandable; |
| bool containsNonWord; |
| int wordCharsCount; |
| trgm_mb_char *wordChars; |
| } TrgmColorInfo; |
| |
| /* |
| * A "prefix" is information about the colors of the last two characters read |
| * before reaching a specific NFA state. These colors can have special values |
| * COLOR_UNKNOWN and COLOR_BLANK. COLOR_UNKNOWN means that we have no |
| * information, for example because we read some character of an unexpandable |
| * color. COLOR_BLANK means that we read a non-word character. |
| * |
| * We call a prefix ambiguous if at least one of its colors is unknown. It's |
| * fully ambiguous if both are unknown, partially ambiguous if only the first |
| * is unknown. (The case of first color known, second unknown is not valid.) |
| * |
| * Wholly- or partly-blank prefixes are mostly handled the same as regular |
| * color prefixes. This allows us to generate appropriate partly-blank |
| * trigrams when the NFA requires word character(s) to appear adjacent to |
| * non-word character(s). |
| */ |
| typedef int TrgmColor; |
| |
| /* We assume that colors returned by the regexp engine cannot be these: */ |
| #define COLOR_UNKNOWN (-3) |
| #define COLOR_BLANK (-4) |
| |
| typedef struct |
| { |
| TrgmColor colors[2]; |
| } TrgmPrefix; |
| |
| /* |
| * Color-trigram data type. Note that some elements of the trigram can be |
| * COLOR_BLANK, but we don't allow COLOR_UNKNOWN. |
| */ |
| typedef struct |
| { |
| TrgmColor colors[3]; |
| } ColorTrgm; |
| |
| /* |
| * Key identifying a state of our expanded graph: color prefix, and number |
| * of the corresponding state in the underlying regex NFA. The color prefix |
| * shows how we reached the regex state (to the extent that we know it). |
| */ |
| typedef struct |
| { |
| TrgmPrefix prefix; |
| int nstate; |
| } TrgmStateKey; |
| |
| /* |
| * One state of the expanded graph. |
| * |
| * stateKey - ID of this state |
| * arcs - outgoing arcs of this state (List of TrgmArc) |
| * enterKeys - enter keys reachable from this state without reading any |
| * predictable trigram (List of TrgmStateKey) |
| * flags - flag bits |
| * snumber - number of this state (initially assigned as -1, -2, etc, |
| * for debugging purposes only; then at the packaging stage, |
| * surviving states are renumbered with positive numbers) |
| * parent - parent state, if this state has been merged into another |
| * tentFlags - flags this state would acquire via planned merges |
| * tentParent - planned parent state, if considering a merge |
| */ |
| #define TSTATE_INIT 0x01 /* flag indicating this state is initial */ |
| #define TSTATE_FIN 0x02 /* flag indicating this state is final */ |
| |
| typedef struct TrgmState |
| { |
| TrgmStateKey stateKey; /* hashtable key: must be first field */ |
| List *arcs; |
| List *enterKeys; |
| int flags; |
| int snumber; |
| struct TrgmState *parent; |
| int tentFlags; |
| struct TrgmState *tentParent; |
| } TrgmState; |
| |
| /* |
| * One arc in the expanded graph. |
| */ |
| typedef struct |
| { |
| ColorTrgm ctrgm; /* trigram needed to traverse arc */ |
| TrgmState *target; /* next state */ |
| } TrgmArc; |
| |
| /* |
| * Information about arc of specific color trigram (used in stage 3) |
| * |
| * Contains pointers to the source and target states. |
| */ |
| typedef struct |
| { |
| TrgmState *source; |
| TrgmState *target; |
| } TrgmArcInfo; |
| |
| /* |
| * Information about color trigram (used in stage 3) |
| * |
| * ctrgm - trigram itself |
| * cnumber - number of this trigram (used in the packaging stage) |
| * count - number of simple trigrams created from this color trigram |
| * expanded - indicates this color trigram is expanded into simple trigrams |
| * arcs - list of all arcs labeled with this color trigram. |
| */ |
| typedef struct |
| { |
| ColorTrgm ctrgm; |
| int cnumber; |
| int count; |
| float4 penalty; |
| bool expanded; |
| List *arcs; |
| } ColorTrgmInfo; |
| |
| /* |
| * Data structure representing all the data we need during regex processing. |
| * |
| * regex - compiled regex |
| * colorInfo - extracted information about regex's colors |
| * ncolors - number of colors in colorInfo[] |
| * states - hashtable of TrgmStates (states of expanded graph) |
| * initState - pointer to initial state of expanded graph |
| * queue - queue of to-be-processed TrgmStates |
| * keysQueue - queue of to-be-processed TrgmStateKeys |
| * arcsCount - total number of arcs of expanded graph (for resource |
| * limiting) |
| * overflowed - we have exceeded resource limit for transformation |
| * colorTrgms - array of all color trigrams present in graph |
| * colorTrgmsCount - count of those color trigrams |
| * totalTrgmCount - total count of extracted simple trigrams |
| */ |
| typedef struct |
| { |
| /* Source regexp, and color information extracted from it (stage 1) */ |
| regex_t *regex; |
| TrgmColorInfo *colorInfo; |
| int ncolors; |
| |
| /* Expanded graph (stage 2) */ |
| HTAB *states; |
| TrgmState *initState; |
| int nstates; |
| |
| /* Workspace for stage 2 */ |
| List *queue; |
| List *keysQueue; |
| int arcsCount; |
| bool overflowed; |
| |
| /* Information about distinct color trigrams in the graph (stage 3) */ |
| ColorTrgmInfo *colorTrgms; |
| int colorTrgmsCount; |
| int totalTrgmCount; |
| } TrgmNFA; |
| |
| /* |
| * Final, compact representation of expanded graph. |
| */ |
| typedef struct |
| { |
| int targetState; /* index of target state (zero-based) */ |
| int colorTrgm; /* index of color trigram for transition */ |
| } TrgmPackedArc; |
| |
| typedef struct |
| { |
| int arcsCount; /* number of out-arcs for this state */ |
| TrgmPackedArc *arcs; /* array of arcsCount packed arcs */ |
| } TrgmPackedState; |
| |
| /* "typedef struct TrgmPackedGraph TrgmPackedGraph" appears in trgm.h */ |
| struct TrgmPackedGraph |
| { |
| /* |
| * colorTrigramsCount and colorTrigramGroups contain information about how |
| * trigrams are grouped into color trigrams. "colorTrigramsCount" is the |
| * count of color trigrams and "colorTrigramGroups" contains number of |
| * simple trigrams for each color trigram. The array of simple trigrams |
| * (stored separately from this struct) is ordered so that the simple |
| * trigrams for each color trigram are consecutive, and they're in order |
| * by color trigram number. |
| */ |
| int colorTrigramsCount; |
| int *colorTrigramGroups; /* array of size colorTrigramsCount */ |
| |
| /* |
| * The states of the simplified NFA. State number 0 is always initial |
| * state and state number 1 is always final state. |
| */ |
| int statesCount; |
| TrgmPackedState *states; /* array of size statesCount */ |
| |
| /* Temporary work space for trigramsMatchGraph() */ |
| bool *colorTrigramsActive; /* array of size colorTrigramsCount */ |
| bool *statesActive; /* array of size statesCount */ |
| int *statesQueue; /* array of size statesCount */ |
| }; |
| |
| /* |
| * Temporary structure for representing an arc during packaging. |
| */ |
| typedef struct |
| { |
| int sourceState; |
| int targetState; |
| int colorTrgm; |
| } TrgmPackArcInfo; |
| |
| |
| /* prototypes for private functions */ |
| static TRGM *createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph, |
| MemoryContext rcontext); |
| static void RE_compile(regex_t *regex, text *text_re, |
| int cflags, Oid collation); |
| static void getColorInfo(regex_t *regex, TrgmNFA *trgmNFA); |
| static bool convertPgWchar(pg_wchar c, trgm_mb_char *result); |
| static void transformGraph(TrgmNFA *trgmNFA); |
| static void processState(TrgmNFA *trgmNFA, TrgmState *state); |
| static void addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key); |
| static void addKeyToQueue(TrgmNFA *trgmNFA, TrgmStateKey *key); |
| static void addArcs(TrgmNFA *trgmNFA, TrgmState *state); |
| static void addArc(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key, |
| TrgmColor co, TrgmStateKey *destKey); |
| static bool validArcLabel(TrgmStateKey *key, TrgmColor co); |
| static TrgmState *getState(TrgmNFA *trgmNFA, TrgmStateKey *key); |
| static bool prefixContains(TrgmPrefix *prefix1, TrgmPrefix *prefix2); |
| static bool selectColorTrigrams(TrgmNFA *trgmNFA); |
| static TRGM *expandColorTrigrams(TrgmNFA *trgmNFA, MemoryContext rcontext); |
| static void fillTrgm(trgm *ptrgm, trgm_mb_char s[3]); |
| static void mergeStates(TrgmState *state1, TrgmState *state2); |
| static int colorTrgmInfoCmp(const void *p1, const void *p2); |
| static int colorTrgmInfoPenaltyCmp(const void *p1, const void *p2); |
| static TrgmPackedGraph *packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext); |
| static int packArcInfoCmp(const void *a1, const void *a2); |
| |
| #ifdef TRGM_REGEXP_DEBUG |
| static void printSourceNFA(regex_t *regex, TrgmColorInfo *colors, int ncolors); |
| static void printTrgmNFA(TrgmNFA *trgmNFA); |
| static void printTrgmColor(StringInfo buf, TrgmColor co); |
| static void printTrgmPackedGraph(TrgmPackedGraph *packedGraph, TRGM *trigrams); |
| #endif |
| |
| |
| /* |
| * Main entry point to process a regular expression. |
| * |
| * Returns an array of trigrams required by the regular expression, or NULL if |
| * the regular expression was too complex to analyze. In addition, a packed |
| * graph representation of the regex is returned into *graph. The results |
| * must be allocated in rcontext (which might or might not be the current |
| * context). |
| */ |
| TRGM * |
| createTrgmNFA(text *text_re, Oid collation, |
| TrgmPackedGraph **graph, MemoryContext rcontext) |
| { |
| TRGM *trg; |
| regex_t regex; |
| MemoryContext tmpcontext; |
| MemoryContext oldcontext; |
| |
| /* |
| * This processing generates a great deal of cruft, which we'd like to |
| * clean up before returning (since this function may be called in a |
| * query-lifespan memory context). Make a temp context we can work in so |
| * that cleanup is easy. |
| */ |
| tmpcontext = AllocSetContextCreate(CurrentMemoryContext, |
| "createTrgmNFA temporary context", |
| ALLOCSET_DEFAULT_SIZES); |
| oldcontext = MemoryContextSwitchTo(tmpcontext); |
| |
| /* |
| * Stage 1: Compile the regexp into a NFA, using the regexp library. |
| */ |
| #ifdef IGNORECASE |
| RE_compile(®ex, text_re, |
| REG_ADVANCED | REG_NOSUB | REG_ICASE, collation); |
| #else |
| RE_compile(®ex, text_re, |
| REG_ADVANCED | REG_NOSUB, collation); |
| #endif |
| |
| trg = createTrgmNFAInternal(®ex, graph, rcontext); |
| |
| /* Clean up all the cruft we created (including regex) */ |
| MemoryContextSwitchTo(oldcontext); |
| MemoryContextDelete(tmpcontext); |
| |
| return trg; |
| } |
| |
| /* |
| * Body of createTrgmNFA, exclusive of regex compilation/freeing. |
| */ |
| static TRGM * |
| createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph, |
| MemoryContext rcontext) |
| { |
| TRGM *trg; |
| TrgmNFA trgmNFA; |
| |
| trgmNFA.regex = regex; |
| |
| /* Collect color information from the regex */ |
| getColorInfo(regex, &trgmNFA); |
| |
| #ifdef TRGM_REGEXP_DEBUG |
| printSourceNFA(regex, trgmNFA.colorInfo, trgmNFA.ncolors); |
| #endif |
| |
| /* |
| * Stage 2: Create an expanded graph from the source NFA. |
| */ |
| transformGraph(&trgmNFA); |
| |
| #ifdef TRGM_REGEXP_DEBUG |
| printTrgmNFA(&trgmNFA); |
| #endif |
| |
| /* |
| * Fail if we were unable to make a nontrivial graph, ie it is possible to |
| * get from the initial state to the final state without reading any |
| * predictable trigram. |
| */ |
| if (trgmNFA.initState->flags & TSTATE_FIN) |
| return NULL; |
| |
| /* |
| * Stage 3: Select color trigrams to expand. Fail if too many trigrams. |
| */ |
| if (!selectColorTrigrams(&trgmNFA)) |
| return NULL; |
| |
| /* |
| * Stage 4: Expand color trigrams and pack graph into final |
| * representation. |
| */ |
| trg = expandColorTrigrams(&trgmNFA, rcontext); |
| |
| *graph = packGraph(&trgmNFA, rcontext); |
| |
| #ifdef TRGM_REGEXP_DEBUG |
| printTrgmPackedGraph(*graph, trg); |
| #endif |
| |
| return trg; |
| } |
| |
| /* |
| * Main entry point for evaluating a graph during index scanning. |
| * |
| * The check[] array is indexed by trigram number (in the array of simple |
| * trigrams returned by createTrgmNFA), and holds true for those trigrams |
| * that are present in the index entry being checked. |
| */ |
| bool |
| trigramsMatchGraph(TrgmPackedGraph *graph, bool *check) |
| { |
| int i, |
| j, |
| k, |
| queueIn, |
| queueOut; |
| |
| /* |
| * Reset temporary working areas. |
| */ |
| memset(graph->colorTrigramsActive, 0, |
| sizeof(bool) * graph->colorTrigramsCount); |
| memset(graph->statesActive, 0, sizeof(bool) * graph->statesCount); |
| |
| /* |
| * Check which color trigrams were matched. A match for any simple |
| * trigram associated with a color trigram counts as a match of the color |
| * trigram. |
| */ |
| j = 0; |
| for (i = 0; i < graph->colorTrigramsCount; i++) |
| { |
| int cnt = graph->colorTrigramGroups[i]; |
| |
| for (k = j; k < j + cnt; k++) |
| { |
| if (check[k]) |
| { |
| /* |
| * Found one matched trigram in the group. Can skip the rest |
| * of them and go to the next group. |
| */ |
| graph->colorTrigramsActive[i] = true; |
| break; |
| } |
| } |
| j = j + cnt; |
| } |
| |
| /* |
| * Initialize the statesQueue to hold just the initial state. Note: |
| * statesQueue has room for statesCount entries, which is certainly enough |
| * since no state will be put in the queue more than once. The |
| * statesActive array marks which states have been queued. |
| */ |
| graph->statesActive[0] = true; |
| graph->statesQueue[0] = 0; |
| queueIn = 0; |
| queueOut = 1; |
| |
| /* Process queued states as long as there are any. */ |
| while (queueIn < queueOut) |
| { |
| int stateno = graph->statesQueue[queueIn++]; |
| TrgmPackedState *state = &graph->states[stateno]; |
| int cnt = state->arcsCount; |
| |
| /* Loop over state's out-arcs */ |
| for (i = 0; i < cnt; i++) |
| { |
| TrgmPackedArc *arc = &state->arcs[i]; |
| |
| /* |
| * If corresponding color trigram is present then activate the |
| * corresponding state. We're done if that's the final state, |
| * otherwise queue the state if it's not been queued already. |
| */ |
| if (graph->colorTrigramsActive[arc->colorTrgm]) |
| { |
| int nextstate = arc->targetState; |
| |
| if (nextstate == 1) |
| return true; /* success: final state is reachable */ |
| |
| if (!graph->statesActive[nextstate]) |
| { |
| graph->statesActive[nextstate] = true; |
| graph->statesQueue[queueOut++] = nextstate; |
| } |
| } |
| } |
| } |
| |
| /* Queue is empty, so match fails. */ |
| return false; |
| } |
| |
| /* |
| * Compile regex string into struct at *regex. |
| * NB: pg_regfree must be applied to regex if this completes successfully. |
| */ |
| static void |
| RE_compile(regex_t *regex, text *text_re, int cflags, Oid collation) |
| { |
| int text_re_len = VARSIZE_ANY_EXHDR(text_re); |
| char *text_re_val = VARDATA_ANY(text_re); |
| pg_wchar *pattern; |
| int pattern_len; |
| int regcomp_result; |
| char errMsg[100]; |
| |
| /* Convert pattern string to wide characters */ |
| pattern = (pg_wchar *) palloc((text_re_len + 1) * sizeof(pg_wchar)); |
| pattern_len = pg_mb2wchar_with_len(text_re_val, |
| pattern, |
| text_re_len); |
| |
| /* Compile regex */ |
| regcomp_result = pg_regcomp(regex, |
| pattern, |
| pattern_len, |
| cflags, |
| collation); |
| |
| pfree(pattern); |
| |
| if (regcomp_result != REG_OKAY) |
| { |
| /* re didn't compile (no need for pg_regfree, if so) */ |
| pg_regerror(regcomp_result, regex, errMsg, sizeof(errMsg)); |
| ereport(ERROR, |
| (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION), |
| errmsg("invalid regular expression: %s", errMsg))); |
| } |
| } |
| |
| |
| /*--------------------- |
| * Subroutines for pre-processing the color map (stage 1). |
| *--------------------- |
| */ |
| |
| /* |
| * Fill TrgmColorInfo structure for each color using regex export functions. |
| */ |
| static void |
| getColorInfo(regex_t *regex, TrgmNFA *trgmNFA) |
| { |
| int colorsCount = pg_reg_getnumcolors(regex); |
| int i; |
| |
| trgmNFA->ncolors = colorsCount; |
| trgmNFA->colorInfo = (TrgmColorInfo *) |
| palloc0(colorsCount * sizeof(TrgmColorInfo)); |
| |
| /* |
| * Loop over colors, filling TrgmColorInfo about each. Note we include |
| * WHITE (0) even though we know it'll be reported as non-expandable. |
| */ |
| for (i = 0; i < colorsCount; i++) |
| { |
| TrgmColorInfo *colorInfo = &trgmNFA->colorInfo[i]; |
| int charsCount = pg_reg_getnumcharacters(regex, i); |
| pg_wchar *chars; |
| int j; |
| |
| if (charsCount < 0 || charsCount > COLOR_COUNT_LIMIT) |
| { |
| /* Non expandable, or too large to work with */ |
| colorInfo->expandable = false; |
| continue; |
| } |
| |
| colorInfo->expandable = true; |
| colorInfo->containsNonWord = false; |
| colorInfo->wordChars = (trgm_mb_char *) |
| palloc(sizeof(trgm_mb_char) * charsCount); |
| colorInfo->wordCharsCount = 0; |
| |
| /* Extract all the chars in this color */ |
| chars = (pg_wchar *) palloc(sizeof(pg_wchar) * charsCount); |
| pg_reg_getcharacters(regex, i, chars, charsCount); |
| |
| /* |
| * Convert characters back to multibyte form, and save only those that |
| * are word characters. Set "containsNonWord" if any non-word |
| * character. (Note: it'd probably be nicer to keep the chars in |
| * pg_wchar format for now, but ISWORDCHR wants to see multibyte.) |
| */ |
| for (j = 0; j < charsCount; j++) |
| { |
| trgm_mb_char c; |
| |
| if (!convertPgWchar(chars[j], &c)) |
| continue; /* ok to ignore it altogether */ |
| if (ISWORDCHR(c.bytes)) |
| colorInfo->wordChars[colorInfo->wordCharsCount++] = c; |
| else |
| colorInfo->containsNonWord = true; |
| } |
| |
| pfree(chars); |
| } |
| } |
| |
| /* |
| * Convert pg_wchar to multibyte format. |
| * Returns false if the character should be ignored completely. |
| */ |
| static bool |
| convertPgWchar(pg_wchar c, trgm_mb_char *result) |
| { |
| /* "s" has enough space for a multibyte character and a trailing NUL */ |
| char s[MAX_MULTIBYTE_CHAR_LEN + 1]; |
| |
| /* |
| * We can ignore the NUL character, since it can never appear in a PG text |
| * string. This avoids the need for various special cases when |
| * reconstructing trigrams. |
| */ |
| if (c == 0) |
| return false; |
| |
| /* Do the conversion, making sure the result is NUL-terminated */ |
| memset(s, 0, sizeof(s)); |
| pg_wchar2mb_with_len(&c, s, 1); |
| |
| /* |
| * In IGNORECASE mode, we can ignore uppercase characters. We assume that |
| * the regex engine generated both uppercase and lowercase equivalents |
| * within each color, since we used the REG_ICASE option; so there's no |
| * need to process the uppercase version. |
| * |
| * XXX this code is dependent on the assumption that lowerstr() works the |
| * same as the regex engine's internal case folding machinery. Might be |
| * wiser to expose pg_wc_tolower and test whether c == pg_wc_tolower(c). |
| * On the other hand, the trigrams in the index were created using |
| * lowerstr(), so we're probably screwed if there's any incompatibility |
| * anyway. |
| */ |
| #ifdef IGNORECASE |
| { |
| char *lowerCased = lowerstr(s); |
| |
| if (strcmp(lowerCased, s) != 0) |
| { |
| pfree(lowerCased); |
| return false; |
| } |
| pfree(lowerCased); |
| } |
| #endif |
| |
| /* Fill result with exactly MAX_MULTIBYTE_CHAR_LEN bytes */ |
| memcpy(result->bytes, s, MAX_MULTIBYTE_CHAR_LEN); |
| return true; |
| } |
| |
| |
| /*--------------------- |
| * Subroutines for expanding original NFA graph into a trigram graph (stage 2). |
| *--------------------- |
| */ |
| |
| /* |
| * Transform the graph, given a regex and extracted color information. |
| * |
| * We create and process a queue of expanded-graph states until all the states |
| * are processed. |
| * |
| * This algorithm may be stopped due to resource limitation. In this case we |
| * force every unprocessed branch to immediately finish with matching (this |
| * can give us false positives but no false negatives) by marking all |
| * unprocessed states as final. |
| */ |
| static void |
| transformGraph(TrgmNFA *trgmNFA) |
| { |
| HASHCTL hashCtl; |
| TrgmStateKey initkey; |
| TrgmState *initstate; |
| ListCell *lc; |
| |
| /* Initialize this stage's workspace in trgmNFA struct */ |
| trgmNFA->queue = NIL; |
| trgmNFA->keysQueue = NIL; |
| trgmNFA->arcsCount = 0; |
| trgmNFA->overflowed = false; |
| |
| /* Create hashtable for states */ |
| hashCtl.keysize = sizeof(TrgmStateKey); |
| hashCtl.entrysize = sizeof(TrgmState); |
| hashCtl.hcxt = CurrentMemoryContext; |
| trgmNFA->states = hash_create("Trigram NFA", |
| 1024, |
| &hashCtl, |
| HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); |
| trgmNFA->nstates = 0; |
| |
| /* Create initial state: ambiguous prefix, NFA's initial state */ |
| MemSet(&initkey, 0, sizeof(initkey)); |
| initkey.prefix.colors[0] = COLOR_UNKNOWN; |
| initkey.prefix.colors[1] = COLOR_UNKNOWN; |
| initkey.nstate = pg_reg_getinitialstate(trgmNFA->regex); |
| |
| initstate = getState(trgmNFA, &initkey); |
| initstate->flags |= TSTATE_INIT; |
| trgmNFA->initState = initstate; |
| |
| /* |
| * Recursively build the expanded graph by processing queue of states |
| * (breadth-first search). getState already put initstate in the queue. |
| * Note that getState will append new states to the queue within the loop, |
| * too; this works as long as we don't do repeat fetches using the "lc" |
| * pointer. |
| */ |
| foreach(lc, trgmNFA->queue) |
| { |
| TrgmState *state = (TrgmState *) lfirst(lc); |
| |
| /* |
| * If we overflowed then just mark state as final. Otherwise do |
| * actual processing. |
| */ |
| if (trgmNFA->overflowed) |
| state->flags |= TSTATE_FIN; |
| else |
| processState(trgmNFA, state); |
| |
| /* Did we overflow? */ |
| if (trgmNFA->arcsCount > MAX_EXPANDED_ARCS || |
| hash_get_num_entries(trgmNFA->states) > MAX_EXPANDED_STATES) |
| trgmNFA->overflowed = true; |
| } |
| } |
| |
| /* |
| * Process one state: add enter keys and then add outgoing arcs. |
| */ |
| static void |
| processState(TrgmNFA *trgmNFA, TrgmState *state) |
| { |
| ListCell *lc; |
| |
| /* keysQueue should be NIL already, but make sure */ |
| trgmNFA->keysQueue = NIL; |
| |
| /* |
| * Add state's own key, and then process all keys added to keysQueue until |
| * queue is finished. But we can quit if the state gets marked final. |
| */ |
| addKey(trgmNFA, state, &state->stateKey); |
| foreach(lc, trgmNFA->keysQueue) |
| { |
| TrgmStateKey *key = (TrgmStateKey *) lfirst(lc); |
| |
| if (state->flags & TSTATE_FIN) |
| break; |
| addKey(trgmNFA, state, key); |
| } |
| |
| /* Release keysQueue to clean up for next cycle */ |
| list_free(trgmNFA->keysQueue); |
| trgmNFA->keysQueue = NIL; |
| |
| /* |
| * Add outgoing arcs only if state isn't final (we have no interest in |
| * outgoing arcs if we already match) |
| */ |
| if (!(state->flags & TSTATE_FIN)) |
| addArcs(trgmNFA, state); |
| } |
| |
| /* |
| * Add the given enter key into the state's enterKeys list, and determine |
| * whether this should result in any further enter keys being added. |
| * If so, add those keys to keysQueue so that processState will handle them. |
| * |
| * If the enter key is for the NFA's final state, mark state as TSTATE_FIN. |
| * This situation means that we can reach the final state from this expanded |
| * state without reading any predictable trigram, so we must consider this |
| * state as an accepting one. |
| * |
| * The given key could be a duplicate of one already in enterKeys, or be |
| * redundant with some enterKeys. So we check that before doing anything. |
| * |
| * Note that we don't generate any actual arcs here. addArcs will do that |
| * later, after we have identified all the enter keys for this state. |
| */ |
| static void |
| addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key) |
| { |
| regex_arc_t *arcs; |
| TrgmStateKey destKey; |
| ListCell *cell; |
| int i, |
| arcsCount; |
| |
| /* |
| * Ensure any pad bytes in destKey are zero, since it may get used as a |
| * hashtable key by getState. |
| */ |
| MemSet(&destKey, 0, sizeof(destKey)); |
| |
| /* |
| * Compare key to each existing enter key of the state to check for |
| * redundancy. We can drop either old key(s) or the new key if we find |
| * redundancy. |
| */ |
| foreach(cell, state->enterKeys) |
| { |
| TrgmStateKey *existingKey = (TrgmStateKey *) lfirst(cell); |
| |
| if (existingKey->nstate == key->nstate) |
| { |
| if (prefixContains(&existingKey->prefix, &key->prefix)) |
| { |
| /* This old key already covers the new key. Nothing to do */ |
| return; |
| } |
| if (prefixContains(&key->prefix, &existingKey->prefix)) |
| { |
| /* |
| * The new key covers this old key. Remove the old key, it's |
| * no longer needed once we add this key to the list. |
| */ |
| state->enterKeys = foreach_delete_current(state->enterKeys, |
| cell); |
| } |
| } |
| } |
| |
| /* No redundancy, so add this key to the state's list */ |
| state->enterKeys = lappend(state->enterKeys, key); |
| |
| /* If state is now known final, mark it and we're done */ |
| if (key->nstate == pg_reg_getfinalstate(trgmNFA->regex)) |
| { |
| state->flags |= TSTATE_FIN; |
| return; |
| } |
| |
| /* |
| * Loop through all outgoing arcs of the corresponding state in the |
| * original NFA. |
| */ |
| arcsCount = pg_reg_getnumoutarcs(trgmNFA->regex, key->nstate); |
| arcs = (regex_arc_t *) palloc(sizeof(regex_arc_t) * arcsCount); |
| pg_reg_getoutarcs(trgmNFA->regex, key->nstate, arcs, arcsCount); |
| |
| for (i = 0; i < arcsCount; i++) |
| { |
| regex_arc_t *arc = &arcs[i]; |
| |
| if (pg_reg_colorisbegin(trgmNFA->regex, arc->co)) |
| { |
| /* |
| * Start of line/string (^). Trigram extraction treats start of |
| * line same as start of word: double space prefix is added. |
| * Hence, make an enter key showing we can reach the arc |
| * destination with all-blank prefix. |
| */ |
| destKey.prefix.colors[0] = COLOR_BLANK; |
| destKey.prefix.colors[1] = COLOR_BLANK; |
| destKey.nstate = arc->to; |
| |
| /* Add enter key to this state */ |
| addKeyToQueue(trgmNFA, &destKey); |
| } |
| else if (pg_reg_colorisend(trgmNFA->regex, arc->co)) |
| { |
| /* |
| * End of line/string ($). We must consider this arc as a |
| * transition that doesn't read anything. The reason for adding |
| * this enter key to the state is that if the arc leads to the |
| * NFA's final state, we must mark this expanded state as final. |
| */ |
| destKey.prefix.colors[0] = COLOR_UNKNOWN; |
| destKey.prefix.colors[1] = COLOR_UNKNOWN; |
| destKey.nstate = arc->to; |
| |
| /* Add enter key to this state */ |
| addKeyToQueue(trgmNFA, &destKey); |
| } |
| else if (arc->co >= 0) |
| { |
| /* Regular color (including WHITE) */ |
| TrgmColorInfo *colorInfo = &trgmNFA->colorInfo[arc->co]; |
| |
| if (colorInfo->expandable) |
| { |
| if (colorInfo->containsNonWord && |
| !validArcLabel(key, COLOR_BLANK)) |
| { |
| /* |
| * We can reach the arc destination after reading a |
| * non-word character, but the prefix is not something |
| * that addArc will accept with COLOR_BLANK, so no trigram |
| * arc can get made for this transition. We must make an |
| * enter key to show that the arc destination is |
| * reachable. Set it up with an all-blank prefix, since |
| * that corresponds to what the trigram extraction code |
| * will do at a word starting boundary. |
| */ |
| destKey.prefix.colors[0] = COLOR_BLANK; |
| destKey.prefix.colors[1] = COLOR_BLANK; |
| destKey.nstate = arc->to; |
| addKeyToQueue(trgmNFA, &destKey); |
| } |
| |
| if (colorInfo->wordCharsCount > 0 && |
| !validArcLabel(key, arc->co)) |
| { |
| /* |
| * We can reach the arc destination after reading a word |
| * character, but the prefix is not something that addArc |
| * will accept, so no trigram arc can get made for this |
| * transition. We must make an enter key to show that the |
| * arc destination is reachable. The prefix for the enter |
| * key should reflect the info we have for this arc. |
| */ |
| destKey.prefix.colors[0] = key->prefix.colors[1]; |
| destKey.prefix.colors[1] = arc->co; |
| destKey.nstate = arc->to; |
| addKeyToQueue(trgmNFA, &destKey); |
| } |
| } |
| else |
| { |
| /* |
| * Unexpandable color. Add enter key with ambiguous prefix, |
| * showing we can reach the destination from this state, but |
| * the preceding colors will be uncertain. (We do not set the |
| * first prefix color to key->prefix.colors[1], because a |
| * prefix of known followed by unknown is invalid.) |
| */ |
| destKey.prefix.colors[0] = COLOR_UNKNOWN; |
| destKey.prefix.colors[1] = COLOR_UNKNOWN; |
| destKey.nstate = arc->to; |
| addKeyToQueue(trgmNFA, &destKey); |
| } |
| } |
| else |
| { |
| /* RAINBOW: treat as unexpandable color */ |
| destKey.prefix.colors[0] = COLOR_UNKNOWN; |
| destKey.prefix.colors[1] = COLOR_UNKNOWN; |
| destKey.nstate = arc->to; |
| addKeyToQueue(trgmNFA, &destKey); |
| } |
| } |
| |
| pfree(arcs); |
| } |
| |
| /* |
| * Add copy of given key to keysQueue for later processing. |
| */ |
| static void |
| addKeyToQueue(TrgmNFA *trgmNFA, TrgmStateKey *key) |
| { |
| TrgmStateKey *keyCopy = (TrgmStateKey *) palloc(sizeof(TrgmStateKey)); |
| |
| memcpy(keyCopy, key, sizeof(TrgmStateKey)); |
| trgmNFA->keysQueue = lappend(trgmNFA->keysQueue, keyCopy); |
| } |
| |
| /* |
| * Add outgoing arcs from given state, whose enter keys are all now known. |
| */ |
| static void |
| addArcs(TrgmNFA *trgmNFA, TrgmState *state) |
| { |
| TrgmStateKey destKey; |
| ListCell *cell; |
| regex_arc_t *arcs; |
| int arcsCount, |
| i; |
| |
| /* |
| * Ensure any pad bytes in destKey are zero, since it may get used as a |
| * hashtable key by getState. |
| */ |
| MemSet(&destKey, 0, sizeof(destKey)); |
| |
| /* |
| * Iterate over enter keys associated with this expanded-graph state. This |
| * includes both the state's own stateKey, and any enter keys we added to |
| * it during addKey (which represent expanded-graph states that are not |
| * distinguishable from this one by means of trigrams). For each such |
| * enter key, examine all the out-arcs of the key's underlying NFA state, |
| * and try to make a trigram arc leading to where the out-arc leads. |
| * (addArc will deal with whether the arc is valid or not.) |
| */ |
| foreach(cell, state->enterKeys) |
| { |
| TrgmStateKey *key = (TrgmStateKey *) lfirst(cell); |
| |
| arcsCount = pg_reg_getnumoutarcs(trgmNFA->regex, key->nstate); |
| arcs = (regex_arc_t *) palloc(sizeof(regex_arc_t) * arcsCount); |
| pg_reg_getoutarcs(trgmNFA->regex, key->nstate, arcs, arcsCount); |
| |
| for (i = 0; i < arcsCount; i++) |
| { |
| regex_arc_t *arc = &arcs[i]; |
| TrgmColorInfo *colorInfo; |
| |
| /* |
| * Ignore non-expandable colors; addKey already handled the case. |
| * |
| * We need no special check for WHITE or begin/end pseudocolors |
| * here. We don't need to do any processing for them, and they |
| * will be marked non-expandable since the regex engine will have |
| * reported them that way. We do have to watch out for RAINBOW, |
| * which has a negative color number. |
| */ |
| if (arc->co < 0) |
| continue; |
| Assert(arc->co < trgmNFA->ncolors); |
| |
| colorInfo = &trgmNFA->colorInfo[arc->co]; |
| if (!colorInfo->expandable) |
| continue; |
| |
| if (colorInfo->containsNonWord) |
| { |
| /* |
| * Color includes non-word character(s). |
| * |
| * Generate an arc, treating this transition as occurring on |
| * BLANK. This allows word-ending trigrams to be manufactured |
| * if possible. |
| */ |
| destKey.prefix.colors[0] = key->prefix.colors[1]; |
| destKey.prefix.colors[1] = COLOR_BLANK; |
| destKey.nstate = arc->to; |
| |
| addArc(trgmNFA, state, key, COLOR_BLANK, &destKey); |
| } |
| |
| if (colorInfo->wordCharsCount > 0) |
| { |
| /* |
| * Color includes word character(s). |
| * |
| * Generate an arc. Color is pushed into prefix of target |
| * state. |
| */ |
| destKey.prefix.colors[0] = key->prefix.colors[1]; |
| destKey.prefix.colors[1] = arc->co; |
| destKey.nstate = arc->to; |
| |
| addArc(trgmNFA, state, key, arc->co, &destKey); |
| } |
| } |
| |
| pfree(arcs); |
| } |
| } |
| |
| /* |
| * Generate an out-arc of the expanded graph, if it's valid and not redundant. |
| * |
| * state: expanded-graph state we want to add an out-arc to |
| * key: provides prefix colors (key->nstate is not used) |
| * co: transition color |
| * destKey: identifier for destination state of expanded graph |
| */ |
| static void |
| addArc(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key, |
| TrgmColor co, TrgmStateKey *destKey) |
| { |
| TrgmArc *arc; |
| ListCell *cell; |
| |
| /* Do nothing if this wouldn't be a valid arc label trigram */ |
| if (!validArcLabel(key, co)) |
| return; |
| |
| /* |
| * Check if we are going to reach key which is covered by a key which is |
| * already listed in this state. If so arc is useless: the NFA can bypass |
| * it through a path that doesn't require any predictable trigram, so |
| * whether the arc's trigram is present or not doesn't really matter. |
| */ |
| foreach(cell, state->enterKeys) |
| { |
| TrgmStateKey *existingKey = (TrgmStateKey *) lfirst(cell); |
| |
| if (existingKey->nstate == destKey->nstate && |
| prefixContains(&existingKey->prefix, &destKey->prefix)) |
| return; |
| } |
| |
| /* Checks were successful, add new arc */ |
| arc = (TrgmArc *) palloc(sizeof(TrgmArc)); |
| arc->target = getState(trgmNFA, destKey); |
| arc->ctrgm.colors[0] = key->prefix.colors[0]; |
| arc->ctrgm.colors[1] = key->prefix.colors[1]; |
| arc->ctrgm.colors[2] = co; |
| |
| state->arcs = lappend(state->arcs, arc); |
| trgmNFA->arcsCount++; |
| } |
| |
| /* |
| * Can we make a valid trigram arc label from the given prefix and arc color? |
| * |
| * This is split out so that tests in addKey and addArc will stay in sync. |
| */ |
| static bool |
| validArcLabel(TrgmStateKey *key, TrgmColor co) |
| { |
| /* |
| * We have to know full trigram in order to add outgoing arc. So we can't |
| * do it if prefix is ambiguous. |
| */ |
| if (key->prefix.colors[0] == COLOR_UNKNOWN) |
| return false; |
| |
| /* If key->prefix.colors[0] isn't unknown, its second color isn't either */ |
| Assert(key->prefix.colors[1] != COLOR_UNKNOWN); |
| /* And we should not be called with an unknown arc color anytime */ |
| Assert(co != COLOR_UNKNOWN); |
| |
| /* |
| * We don't bother with making arcs representing three non-word |
| * characters, since that's useless for trigram extraction. |
| */ |
| if (key->prefix.colors[0] == COLOR_BLANK && |
| key->prefix.colors[1] == COLOR_BLANK && |
| co == COLOR_BLANK) |
| return false; |
| |
| /* |
| * We also reject nonblank-blank-anything. The nonblank-blank-nonblank |
| * case doesn't correspond to any trigram the trigram extraction code |
| * would make. The nonblank-blank-blank case is also not possible with |
| * RPADDING = 1. (Note that in many cases we'd fail to generate such a |
| * trigram even if it were valid, for example processing "foo bar" will |
| * not result in considering the trigram "o ". So if you want to support |
| * RPADDING = 2, there's more to do than just twiddle this test.) |
| */ |
| if (key->prefix.colors[0] != COLOR_BLANK && |
| key->prefix.colors[1] == COLOR_BLANK) |
| return false; |
| |
| /* |
| * Other combinations involving blank are valid, in particular we assume |
| * blank-blank-nonblank is valid, which presumes that LPADDING is 2. |
| * |
| * Note: Using again the example "foo bar", we will not consider the |
| * trigram " b", though this trigram would be found by the trigram |
| * extraction code. Since we will find " ba", it doesn't seem worth |
| * trying to hack the algorithm to generate the additional trigram. |
| */ |
| |
| /* arc label is valid */ |
| return true; |
| } |
| |
| /* |
| * Get state of expanded graph for given state key, |
| * and queue the state for processing if it didn't already exist. |
| */ |
| static TrgmState * |
| getState(TrgmNFA *trgmNFA, TrgmStateKey *key) |
| { |
| TrgmState *state; |
| bool found; |
| |
| state = (TrgmState *) hash_search(trgmNFA->states, key, HASH_ENTER, |
| &found); |
| if (!found) |
| { |
| /* New state: initialize and queue it */ |
| state->arcs = NIL; |
| state->enterKeys = NIL; |
| state->flags = 0; |
| /* states are initially given negative numbers */ |
| state->snumber = -(++trgmNFA->nstates); |
| state->parent = NULL; |
| state->tentFlags = 0; |
| state->tentParent = NULL; |
| |
| trgmNFA->queue = lappend(trgmNFA->queue, state); |
| } |
| return state; |
| } |
| |
| /* |
| * Check if prefix1 "contains" prefix2. |
| * |
| * "contains" means that any exact prefix (with no ambiguity) that satisfies |
| * prefix2 also satisfies prefix1. |
| */ |
| static bool |
| prefixContains(TrgmPrefix *prefix1, TrgmPrefix *prefix2) |
| { |
| if (prefix1->colors[1] == COLOR_UNKNOWN) |
| { |
| /* Fully ambiguous prefix contains everything */ |
| return true; |
| } |
| else if (prefix1->colors[0] == COLOR_UNKNOWN) |
| { |
| /* |
| * Prefix with only first unknown color contains every prefix with |
| * same second color. |
| */ |
| if (prefix1->colors[1] == prefix2->colors[1]) |
| return true; |
| else |
| return false; |
| } |
| else |
| { |
| /* Exact prefix contains only the exact same prefix */ |
| if (prefix1->colors[0] == prefix2->colors[0] && |
| prefix1->colors[1] == prefix2->colors[1]) |
| return true; |
| else |
| return false; |
| } |
| } |
| |
| |
| /*--------------------- |
| * Subroutines for expanding color trigrams into regular trigrams (stage 3). |
| *--------------------- |
| */ |
| |
| /* |
| * Get vector of all color trigrams in graph and select which of them |
| * to expand into simple trigrams. |
| * |
| * Returns true if OK, false if exhausted resource limits. |
| */ |
| static bool |
| selectColorTrigrams(TrgmNFA *trgmNFA) |
| { |
| HASH_SEQ_STATUS scan_status; |
| int arcsCount = trgmNFA->arcsCount, |
| i; |
| TrgmState *state; |
| ColorTrgmInfo *colorTrgms; |
| int64 totalTrgmCount; |
| float4 totalTrgmPenalty; |
| int cnumber; |
| |
| /* Collect color trigrams from all arcs */ |
| colorTrgms = (ColorTrgmInfo *) palloc0(sizeof(ColorTrgmInfo) * arcsCount); |
| trgmNFA->colorTrgms = colorTrgms; |
| |
| i = 0; |
| hash_seq_init(&scan_status, trgmNFA->states); |
| while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL) |
| { |
| ListCell *cell; |
| |
| foreach(cell, state->arcs) |
| { |
| TrgmArc *arc = (TrgmArc *) lfirst(cell); |
| TrgmArcInfo *arcInfo = (TrgmArcInfo *) palloc(sizeof(TrgmArcInfo)); |
| ColorTrgmInfo *trgmInfo = &colorTrgms[i]; |
| |
| arcInfo->source = state; |
| arcInfo->target = arc->target; |
| trgmInfo->ctrgm = arc->ctrgm; |
| trgmInfo->cnumber = -1; |
| /* count and penalty will be set below */ |
| trgmInfo->expanded = true; |
| trgmInfo->arcs = list_make1(arcInfo); |
| i++; |
| } |
| } |
| Assert(i == arcsCount); |
| |
| /* Remove duplicates, merging their arcs lists */ |
| if (arcsCount >= 2) |
| { |
| ColorTrgmInfo *p1, |
| *p2; |
| |
| /* Sort trigrams to ease duplicate detection */ |
| qsort(colorTrgms, arcsCount, sizeof(ColorTrgmInfo), colorTrgmInfoCmp); |
| |
| /* p1 is probe point, p2 is last known non-duplicate. */ |
| p2 = colorTrgms; |
| for (p1 = colorTrgms + 1; p1 < colorTrgms + arcsCount; p1++) |
| { |
| if (colorTrgmInfoCmp(p1, p2) > 0) |
| { |
| p2++; |
| *p2 = *p1; |
| } |
| else |
| { |
| p2->arcs = list_concat(p2->arcs, p1->arcs); |
| } |
| } |
| trgmNFA->colorTrgmsCount = (p2 - colorTrgms) + 1; |
| } |
| else |
| { |
| trgmNFA->colorTrgmsCount = arcsCount; |
| } |
| |
| /* |
| * Count number of simple trigrams generated by each color trigram, and |
| * also compute a penalty value, which is the number of simple trigrams |
| * times a multiplier that depends on its whitespace content. |
| * |
| * Note: per-color-trigram counts cannot overflow an int so long as |
| * COLOR_COUNT_LIMIT is not more than the cube root of INT_MAX, ie about |
| * 1290. However, the grand total totalTrgmCount might conceivably |
| * overflow an int, so we use int64 for that within this routine. Also, |
| * penalties are calculated in float4 arithmetic to avoid any overflow |
| * worries. |
| */ |
| totalTrgmCount = 0; |
| totalTrgmPenalty = 0.0f; |
| for (i = 0; i < trgmNFA->colorTrgmsCount; i++) |
| { |
| ColorTrgmInfo *trgmInfo = &colorTrgms[i]; |
| int j, |
| count = 1, |
| typeIndex = 0; |
| |
| for (j = 0; j < 3; j++) |
| { |
| TrgmColor c = trgmInfo->ctrgm.colors[j]; |
| |
| typeIndex *= 2; |
| if (c == COLOR_BLANK) |
| typeIndex++; |
| else |
| count *= trgmNFA->colorInfo[c].wordCharsCount; |
| } |
| trgmInfo->count = count; |
| totalTrgmCount += count; |
| trgmInfo->penalty = penalties[typeIndex] * (float4) count; |
| totalTrgmPenalty += trgmInfo->penalty; |
| } |
| |
| /* Sort color trigrams in descending order of their penalties */ |
| qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo), |
| colorTrgmInfoPenaltyCmp); |
| |
| /* |
| * Remove color trigrams from the graph so long as total penalty of color |
| * trigrams exceeds WISH_TRGM_PENALTY. (If we fail to get down to |
| * WISH_TRGM_PENALTY, it's OK so long as total count is no more than |
| * MAX_TRGM_COUNT.) We prefer to remove color trigrams with higher |
| * penalty, since those are the most promising for reducing the total |
| * penalty. When removing a color trigram we have to merge states |
| * connected by arcs labeled with that trigram. It's necessary to not |
| * merge initial and final states, because our graph becomes useless if |
| * that happens; so we cannot always remove the trigram we'd prefer to. |
| */ |
| for (i = 0; i < trgmNFA->colorTrgmsCount; i++) |
| { |
| ColorTrgmInfo *trgmInfo = &colorTrgms[i]; |
| bool canRemove = true; |
| ListCell *cell; |
| |
| /* Done if we've reached the target */ |
| if (totalTrgmPenalty <= WISH_TRGM_PENALTY) |
| break; |
| |
| #ifdef TRGM_REGEXP_DEBUG |
| fprintf(stderr, "considering ctrgm %d %d %d, penalty %f, %d arcs\n", |
| trgmInfo->ctrgm.colors[0], |
| trgmInfo->ctrgm.colors[1], |
| trgmInfo->ctrgm.colors[2], |
| trgmInfo->penalty, |
| list_length(trgmInfo->arcs)); |
| #endif |
| |
| /* |
| * Does any arc of this color trigram connect initial and final |
| * states? If so we can't remove it. |
| */ |
| foreach(cell, trgmInfo->arcs) |
| { |
| TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell); |
| TrgmState *source = arcInfo->source, |
| *target = arcInfo->target; |
| int source_flags, |
| target_flags; |
| |
| #ifdef TRGM_REGEXP_DEBUG |
| fprintf(stderr, "examining arc to s%d (%x) from s%d (%x)\n", |
| -target->snumber, target->flags, |
| -source->snumber, source->flags); |
| #endif |
| |
| /* examine parent states, if any merging has already happened */ |
| while (source->parent) |
| source = source->parent; |
| while (target->parent) |
| target = target->parent; |
| |
| #ifdef TRGM_REGEXP_DEBUG |
| fprintf(stderr, " ... after completed merges: to s%d (%x) from s%d (%x)\n", |
| -target->snumber, target->flags, |
| -source->snumber, source->flags); |
| #endif |
| |
| /* we must also consider merges we are planning right now */ |
| source_flags = source->flags | source->tentFlags; |
| while (source->tentParent) |
| { |
| source = source->tentParent; |
| source_flags |= source->flags | source->tentFlags; |
| } |
| target_flags = target->flags | target->tentFlags; |
| while (target->tentParent) |
| { |
| target = target->tentParent; |
| target_flags |= target->flags | target->tentFlags; |
| } |
| |
| #ifdef TRGM_REGEXP_DEBUG |
| fprintf(stderr, " ... after tentative merges: to s%d (%x) from s%d (%x)\n", |
| -target->snumber, target_flags, |
| -source->snumber, source_flags); |
| #endif |
| |
| /* would fully-merged state have both INIT and FIN set? */ |
| if (((source_flags | target_flags) & (TSTATE_INIT | TSTATE_FIN)) == |
| (TSTATE_INIT | TSTATE_FIN)) |
| { |
| canRemove = false; |
| break; |
| } |
| |
| /* ok so far, so remember planned merge */ |
| if (source != target) |
| { |
| #ifdef TRGM_REGEXP_DEBUG |
| fprintf(stderr, " ... tentatively merging s%d into s%d\n", |
| -target->snumber, -source->snumber); |
| #endif |
| target->tentParent = source; |
| source->tentFlags |= target_flags; |
| } |
| } |
| |
| /* |
| * We must reset all the tentFlags/tentParent fields before |
| * continuing. tentFlags could only have become set in states that |
| * are the source or parent or tentative parent of one of the current |
| * arcs; likewise tentParent could only have become set in states that |
| * are the target or parent or tentative parent of one of the current |
| * arcs. There might be some overlap between those sets, but if we |
| * clear tentFlags in target states as well as source states, we |
| * should be okay even if we visit a state as target before visiting |
| * it as a source. |
| */ |
| foreach(cell, trgmInfo->arcs) |
| { |
| TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell); |
| TrgmState *source = arcInfo->source, |
| *target = arcInfo->target; |
| TrgmState *ttarget; |
| |
| /* no need to touch previously-merged states */ |
| while (source->parent) |
| source = source->parent; |
| while (target->parent) |
| target = target->parent; |
| |
| while (source) |
| { |
| source->tentFlags = 0; |
| source = source->tentParent; |
| } |
| |
| while ((ttarget = target->tentParent) != NULL) |
| { |
| target->tentParent = NULL; |
| target->tentFlags = 0; /* in case it was also a source */ |
| target = ttarget; |
| } |
| } |
| |
| /* Now, move on if we can't drop this trigram */ |
| if (!canRemove) |
| { |
| #ifdef TRGM_REGEXP_DEBUG |
| fprintf(stderr, " ... not ok to merge\n"); |
| #endif |
| continue; |
| } |
| |
| /* OK, merge states linked by each arc labeled by the trigram */ |
| foreach(cell, trgmInfo->arcs) |
| { |
| TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell); |
| TrgmState *source = arcInfo->source, |
| *target = arcInfo->target; |
| |
| while (source->parent) |
| source = source->parent; |
| while (target->parent) |
| target = target->parent; |
| if (source != target) |
| { |
| #ifdef TRGM_REGEXP_DEBUG |
| fprintf(stderr, "merging s%d into s%d\n", |
| -target->snumber, -source->snumber); |
| #endif |
| mergeStates(source, target); |
| /* Assert we didn't merge initial and final states */ |
| Assert((source->flags & (TSTATE_INIT | TSTATE_FIN)) != |
| (TSTATE_INIT | TSTATE_FIN)); |
| } |
| } |
| |
| /* Mark trigram unexpanded, and update totals */ |
| trgmInfo->expanded = false; |
| totalTrgmCount -= trgmInfo->count; |
| totalTrgmPenalty -= trgmInfo->penalty; |
| } |
| |
| /* Did we succeed in fitting into MAX_TRGM_COUNT? */ |
| if (totalTrgmCount > MAX_TRGM_COUNT) |
| return false; |
| |
| trgmNFA->totalTrgmCount = (int) totalTrgmCount; |
| |
| /* |
| * Sort color trigrams by colors (will be useful for bsearch in packGraph) |
| * and enumerate the color trigrams that are expanded. |
| */ |
| cnumber = 0; |
| qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo), |
| colorTrgmInfoCmp); |
| for (i = 0; i < trgmNFA->colorTrgmsCount; i++) |
| { |
| if (colorTrgms[i].expanded) |
| { |
| colorTrgms[i].cnumber = cnumber; |
| cnumber++; |
| } |
| } |
| |
| return true; |
| } |
| |
| /* |
| * Expand selected color trigrams into regular trigrams. |
| * |
| * Returns the TRGM array to be passed to the index machinery. |
| * The array must be allocated in rcontext. |
| */ |
| static TRGM * |
| expandColorTrigrams(TrgmNFA *trgmNFA, MemoryContext rcontext) |
| { |
| TRGM *trg; |
| trgm *p; |
| int i; |
| TrgmColorInfo blankColor; |
| trgm_mb_char blankChar; |
| |
| /* Set up "blank" color structure containing a single zero character */ |
| memset(blankChar.bytes, 0, sizeof(blankChar.bytes)); |
| blankColor.wordCharsCount = 1; |
| blankColor.wordChars = &blankChar; |
| |
| /* Construct the trgm array */ |
| trg = (TRGM *) |
| MemoryContextAllocZero(rcontext, |
| TRGMHDRSIZE + |
| trgmNFA->totalTrgmCount * sizeof(trgm)); |
| trg->flag = ARRKEY; |
| SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, trgmNFA->totalTrgmCount)); |
| p = GETARR(trg); |
| for (i = 0; i < trgmNFA->colorTrgmsCount; i++) |
| { |
| ColorTrgmInfo *colorTrgm = &trgmNFA->colorTrgms[i]; |
| TrgmColorInfo *c[3]; |
| trgm_mb_char s[3]; |
| int j, |
| i1, |
| i2, |
| i3; |
| |
| /* Ignore any unexpanded trigrams ... */ |
| if (!colorTrgm->expanded) |
| continue; |
| |
| /* Get colors, substituting the dummy struct for COLOR_BLANK */ |
| for (j = 0; j < 3; j++) |
| { |
| if (colorTrgm->ctrgm.colors[j] != COLOR_BLANK) |
| c[j] = &trgmNFA->colorInfo[colorTrgm->ctrgm.colors[j]]; |
| else |
| c[j] = &blankColor; |
| } |
| |
| /* Iterate over all possible combinations of colors' characters */ |
| for (i1 = 0; i1 < c[0]->wordCharsCount; i1++) |
| { |
| s[0] = c[0]->wordChars[i1]; |
| for (i2 = 0; i2 < c[1]->wordCharsCount; i2++) |
| { |
| s[1] = c[1]->wordChars[i2]; |
| for (i3 = 0; i3 < c[2]->wordCharsCount; i3++) |
| { |
| s[2] = c[2]->wordChars[i3]; |
| fillTrgm(p, s); |
| p++; |
| } |
| } |
| } |
| } |
| |
| return trg; |
| } |
| |
| /* |
| * Convert trigram into trgm datatype. |
| */ |
| static void |
| fillTrgm(trgm *ptrgm, trgm_mb_char s[3]) |
| { |
| char str[3 * MAX_MULTIBYTE_CHAR_LEN], |
| *p; |
| int i, |
| j; |
| |
| /* Write multibyte string into "str" (we don't need null termination) */ |
| p = str; |
| |
| for (i = 0; i < 3; i++) |
| { |
| if (s[i].bytes[0] != 0) |
| { |
| for (j = 0; j < MAX_MULTIBYTE_CHAR_LEN && s[i].bytes[j]; j++) |
| *p++ = s[i].bytes[j]; |
| } |
| else |
| { |
| /* Emit a space in place of COLOR_BLANK */ |
| *p++ = ' '; |
| } |
| } |
| |
| /* Convert "str" to a standard trigram (possibly hashing it) */ |
| compact_trigram(ptrgm, str, p - str); |
| } |
| |
| /* |
| * Merge two states of graph. |
| */ |
| static void |
| mergeStates(TrgmState *state1, TrgmState *state2) |
| { |
| Assert(state1 != state2); |
| Assert(!state1->parent); |
| Assert(!state2->parent); |
| |
| /* state1 absorbs state2's flags */ |
| state1->flags |= state2->flags; |
| |
| /* state2, and indirectly all its children, become children of state1 */ |
| state2->parent = state1; |
| } |
| |
| /* |
| * Compare function for sorting of color trigrams by their colors. |
| */ |
| static int |
| colorTrgmInfoCmp(const void *p1, const void *p2) |
| { |
| const ColorTrgmInfo *c1 = (const ColorTrgmInfo *) p1; |
| const ColorTrgmInfo *c2 = (const ColorTrgmInfo *) p2; |
| |
| return memcmp(&c1->ctrgm, &c2->ctrgm, sizeof(ColorTrgm)); |
| } |
| |
| /* |
| * Compare function for sorting color trigrams in descending order of |
| * their penalty fields. |
| */ |
| static int |
| colorTrgmInfoPenaltyCmp(const void *p1, const void *p2) |
| { |
| float4 penalty1 = ((const ColorTrgmInfo *) p1)->penalty; |
| float4 penalty2 = ((const ColorTrgmInfo *) p2)->penalty; |
| |
| if (penalty1 < penalty2) |
| return 1; |
| else if (penalty1 == penalty2) |
| return 0; |
| else |
| return -1; |
| } |
| |
| |
| /*--------------------- |
| * Subroutines for packing the graph into final representation (stage 4). |
| *--------------------- |
| */ |
| |
| /* |
| * Pack expanded graph into final representation. |
| * |
| * The result data must be allocated in rcontext. |
| */ |
| static TrgmPackedGraph * |
| packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext) |
| { |
| int snumber = 2, |
| arcIndex, |
| arcsCount; |
| HASH_SEQ_STATUS scan_status; |
| TrgmState *state; |
| TrgmPackArcInfo *arcs; |
| TrgmPackedArc *packedArcs; |
| TrgmPackedGraph *result; |
| int i, |
| j; |
| |
| /* Enumerate surviving states, giving init and fin reserved numbers */ |
| hash_seq_init(&scan_status, trgmNFA->states); |
| while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL) |
| { |
| while (state->parent) |
| state = state->parent; |
| |
| if (state->snumber < 0) |
| { |
| if (state->flags & TSTATE_INIT) |
| state->snumber = 0; |
| else if (state->flags & TSTATE_FIN) |
| state->snumber = 1; |
| else |
| { |
| state->snumber = snumber; |
| snumber++; |
| } |
| } |
| } |
| |
| /* Collect array of all arcs */ |
| arcs = (TrgmPackArcInfo *) |
| palloc(sizeof(TrgmPackArcInfo) * trgmNFA->arcsCount); |
| arcIndex = 0; |
| hash_seq_init(&scan_status, trgmNFA->states); |
| while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL) |
| { |
| TrgmState *source = state; |
| ListCell *cell; |
| |
| while (source->parent) |
| source = source->parent; |
| |
| foreach(cell, state->arcs) |
| { |
| TrgmArc *arc = (TrgmArc *) lfirst(cell); |
| TrgmState *target = arc->target; |
| |
| while (target->parent) |
| target = target->parent; |
| |
| if (source->snumber != target->snumber) |
| { |
| ColorTrgmInfo *ctrgm; |
| |
| ctrgm = (ColorTrgmInfo *) bsearch(&arc->ctrgm, |
| trgmNFA->colorTrgms, |
| trgmNFA->colorTrgmsCount, |
| sizeof(ColorTrgmInfo), |
| colorTrgmInfoCmp); |
| Assert(ctrgm != NULL); |
| Assert(ctrgm->expanded); |
| |
| arcs[arcIndex].sourceState = source->snumber; |
| arcs[arcIndex].targetState = target->snumber; |
| arcs[arcIndex].colorTrgm = ctrgm->cnumber; |
| arcIndex++; |
| } |
| } |
| } |
| |
| /* Sort arcs to ease duplicate detection */ |
| qsort(arcs, arcIndex, sizeof(TrgmPackArcInfo), packArcInfoCmp); |
| |
| /* We could have duplicates because states were merged. Remove them. */ |
| if (arcIndex > 1) |
| { |
| /* p1 is probe point, p2 is last known non-duplicate. */ |
| TrgmPackArcInfo *p1, |
| *p2; |
| |
| p2 = arcs; |
| for (p1 = arcs + 1; p1 < arcs + arcIndex; p1++) |
| { |
| if (packArcInfoCmp(p1, p2) > 0) |
| { |
| p2++; |
| *p2 = *p1; |
| } |
| } |
| arcsCount = (p2 - arcs) + 1; |
| } |
| else |
| arcsCount = arcIndex; |
| |
| /* Create packed representation */ |
| result = (TrgmPackedGraph *) |
| MemoryContextAlloc(rcontext, sizeof(TrgmPackedGraph)); |
| |
| /* Pack color trigrams information */ |
| result->colorTrigramsCount = 0; |
| for (i = 0; i < trgmNFA->colorTrgmsCount; i++) |
| { |
| if (trgmNFA->colorTrgms[i].expanded) |
| result->colorTrigramsCount++; |
| } |
| result->colorTrigramGroups = (int *) |
| MemoryContextAlloc(rcontext, sizeof(int) * result->colorTrigramsCount); |
| j = 0; |
| for (i = 0; i < trgmNFA->colorTrgmsCount; i++) |
| { |
| if (trgmNFA->colorTrgms[i].expanded) |
| { |
| result->colorTrigramGroups[j] = trgmNFA->colorTrgms[i].count; |
| j++; |
| } |
| } |
| |
| /* Pack states and arcs information */ |
| result->statesCount = snumber; |
| result->states = (TrgmPackedState *) |
| MemoryContextAlloc(rcontext, snumber * sizeof(TrgmPackedState)); |
| packedArcs = (TrgmPackedArc *) |
| MemoryContextAlloc(rcontext, arcsCount * sizeof(TrgmPackedArc)); |
| j = 0; |
| for (i = 0; i < snumber; i++) |
| { |
| int cnt = 0; |
| |
| result->states[i].arcs = &packedArcs[j]; |
| while (j < arcsCount && arcs[j].sourceState == i) |
| { |
| packedArcs[j].targetState = arcs[j].targetState; |
| packedArcs[j].colorTrgm = arcs[j].colorTrgm; |
| cnt++; |
| j++; |
| } |
| result->states[i].arcsCount = cnt; |
| } |
| |
| /* Allocate working memory for trigramsMatchGraph() */ |
| result->colorTrigramsActive = (bool *) |
| MemoryContextAlloc(rcontext, sizeof(bool) * result->colorTrigramsCount); |
| result->statesActive = (bool *) |
| MemoryContextAlloc(rcontext, sizeof(bool) * result->statesCount); |
| result->statesQueue = (int *) |
| MemoryContextAlloc(rcontext, sizeof(int) * result->statesCount); |
| |
| return result; |
| } |
| |
| /* |
| * Comparison function for sorting TrgmPackArcInfos. |
| * |
| * Compares arcs in following order: sourceState, colorTrgm, targetState. |
| */ |
| static int |
| packArcInfoCmp(const void *a1, const void *a2) |
| { |
| const TrgmPackArcInfo *p1 = (const TrgmPackArcInfo *) a1; |
| const TrgmPackArcInfo *p2 = (const TrgmPackArcInfo *) a2; |
| |
| if (p1->sourceState < p2->sourceState) |
| return -1; |
| if (p1->sourceState > p2->sourceState) |
| return 1; |
| if (p1->colorTrgm < p2->colorTrgm) |
| return -1; |
| if (p1->colorTrgm > p2->colorTrgm) |
| return 1; |
| if (p1->targetState < p2->targetState) |
| return -1; |
| if (p1->targetState > p2->targetState) |
| return 1; |
| return 0; |
| } |
| |
| |
| /*--------------------- |
| * Debugging functions |
| * |
| * These are designed to emit GraphViz files. |
| *--------------------- |
| */ |
| |
| #ifdef TRGM_REGEXP_DEBUG |
| |
| /* |
| * Print initial NFA, in regexp library's representation |
| */ |
| static void |
| printSourceNFA(regex_t *regex, TrgmColorInfo *colors, int ncolors) |
| { |
| StringInfoData buf; |
| int nstates = pg_reg_getnumstates(regex); |
| int state; |
| int i; |
| |
| initStringInfo(&buf); |
| |
| appendStringInfoString(&buf, "\ndigraph sourceNFA {\n"); |
| |
| for (state = 0; state < nstates; state++) |
| { |
| regex_arc_t *arcs; |
| int i, |
| arcsCount; |
| |
| appendStringInfo(&buf, "s%d", state); |
| if (pg_reg_getfinalstate(regex) == state) |
| appendStringInfoString(&buf, " [shape = doublecircle]"); |
| appendStringInfoString(&buf, ";\n"); |
| |
| arcsCount = pg_reg_getnumoutarcs(regex, state); |
| arcs = (regex_arc_t *) palloc(sizeof(regex_arc_t) * arcsCount); |
| pg_reg_getoutarcs(regex, state, arcs, arcsCount); |
| |
| for (i = 0; i < arcsCount; i++) |
| { |
| appendStringInfo(&buf, " s%d -> s%d [label = \"%d\"];\n", |
| state, arcs[i].to, arcs[i].co); |
| } |
| |
| pfree(arcs); |
| } |
| |
| appendStringInfoString(&buf, " node [shape = point ]; initial;\n"); |
| appendStringInfo(&buf, " initial -> s%d;\n", |
| pg_reg_getinitialstate(regex)); |
| |
| /* Print colors */ |
| appendStringInfoString(&buf, " { rank = sink;\n"); |
| appendStringInfoString(&buf, " Colors [shape = none, margin=0, label=<\n"); |
| |
| for (i = 0; i < ncolors; i++) |
| { |
| TrgmColorInfo *color = &colors[i]; |
| int j; |
| |
| appendStringInfo(&buf, "<br/>Color %d: ", i); |
| if (color->expandable) |
| { |
| for (j = 0; j < color->wordCharsCount; j++) |
| { |
| char s[MAX_MULTIBYTE_CHAR_LEN + 1]; |
| |
| memcpy(s, color->wordChars[j].bytes, MAX_MULTIBYTE_CHAR_LEN); |
| s[MAX_MULTIBYTE_CHAR_LEN] = '\0'; |
| appendStringInfoString(&buf, s); |
| } |
| } |
| else |
| appendStringInfoString(&buf, "not expandable"); |
| appendStringInfoChar(&buf, '\n'); |
| } |
| |
| appendStringInfoString(&buf, " >];\n"); |
| appendStringInfoString(&buf, " }\n"); |
| appendStringInfoString(&buf, "}\n"); |
| |
| { |
| /* dot -Tpng -o /tmp/source.png < /tmp/source.gv */ |
| FILE *fp = fopen("/tmp/source.gv", "w"); |
| |
| fprintf(fp, "%s", buf.data); |
| fclose(fp); |
| } |
| |
| pfree(buf.data); |
| } |
| |
| /* |
| * Print expanded graph. |
| */ |
| static void |
| printTrgmNFA(TrgmNFA *trgmNFA) |
| { |
| StringInfoData buf; |
| HASH_SEQ_STATUS scan_status; |
| TrgmState *state; |
| TrgmState *initstate = NULL; |
| |
| initStringInfo(&buf); |
| |
| appendStringInfoString(&buf, "\ndigraph transformedNFA {\n"); |
| |
| hash_seq_init(&scan_status, trgmNFA->states); |
| while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL) |
| { |
| ListCell *cell; |
| |
| appendStringInfo(&buf, "s%d", -state->snumber); |
| if (state->flags & TSTATE_FIN) |
| appendStringInfoString(&buf, " [shape = doublecircle]"); |
| if (state->flags & TSTATE_INIT) |
| initstate = state; |
| appendStringInfo(&buf, " [label = \"%d\"]", state->stateKey.nstate); |
| appendStringInfoString(&buf, ";\n"); |
| |
| foreach(cell, state->arcs) |
| { |
| TrgmArc *arc = (TrgmArc *) lfirst(cell); |
| |
| appendStringInfo(&buf, " s%d -> s%d [label = \"", |
| -state->snumber, -arc->target->snumber); |
| printTrgmColor(&buf, arc->ctrgm.colors[0]); |
| appendStringInfoChar(&buf, ' '); |
| printTrgmColor(&buf, arc->ctrgm.colors[1]); |
| appendStringInfoChar(&buf, ' '); |
| printTrgmColor(&buf, arc->ctrgm.colors[2]); |
| appendStringInfoString(&buf, "\"];\n"); |
| } |
| } |
| |
| if (initstate) |
| { |
| appendStringInfoString(&buf, " node [shape = point ]; initial;\n"); |
| appendStringInfo(&buf, " initial -> s%d;\n", -initstate->snumber); |
| } |
| |
| appendStringInfoString(&buf, "}\n"); |
| |
| { |
| /* dot -Tpng -o /tmp/transformed.png < /tmp/transformed.gv */ |
| FILE *fp = fopen("/tmp/transformed.gv", "w"); |
| |
| fprintf(fp, "%s", buf.data); |
| fclose(fp); |
| } |
| |
| pfree(buf.data); |
| } |
| |
| /* |
| * Print a TrgmColor readably. |
| */ |
| static void |
| printTrgmColor(StringInfo buf, TrgmColor co) |
| { |
| if (co == COLOR_UNKNOWN) |
| appendStringInfoChar(buf, 'u'); |
| else if (co == COLOR_BLANK) |
| appendStringInfoChar(buf, 'b'); |
| else |
| appendStringInfo(buf, "%d", (int) co); |
| } |
| |
| /* |
| * Print final packed representation of trigram-based expanded graph. |
| */ |
| static void |
| printTrgmPackedGraph(TrgmPackedGraph *packedGraph, TRGM *trigrams) |
| { |
| StringInfoData buf; |
| trgm *p; |
| int i; |
| |
| initStringInfo(&buf); |
| |
| appendStringInfoString(&buf, "\ndigraph packedGraph {\n"); |
| |
| for (i = 0; i < packedGraph->statesCount; i++) |
| { |
| TrgmPackedState *state = &packedGraph->states[i]; |
| int j; |
| |
| appendStringInfo(&buf, " s%d", i); |
| if (i == 1) |
| appendStringInfoString(&buf, " [shape = doublecircle]"); |
| |
| appendStringInfo(&buf, " [label = <s%d>];\n", i); |
| |
| for (j = 0; j < state->arcsCount; j++) |
| { |
| TrgmPackedArc *arc = &state->arcs[j]; |
| |
| appendStringInfo(&buf, " s%d -> s%d [label = \"trigram %d\"];\n", |
| i, arc->targetState, arc->colorTrgm); |
| } |
| } |
| |
| appendStringInfoString(&buf, " node [shape = point ]; initial;\n"); |
| appendStringInfo(&buf, " initial -> s%d;\n", 0); |
| |
| /* Print trigrams */ |
| appendStringInfoString(&buf, " { rank = sink;\n"); |
| appendStringInfoString(&buf, " Trigrams [shape = none, margin=0, label=<\n"); |
| |
| p = GETARR(trigrams); |
| for (i = 0; i < packedGraph->colorTrigramsCount; i++) |
| { |
| int count = packedGraph->colorTrigramGroups[i]; |
| int j; |
| |
| appendStringInfo(&buf, "<br/>Trigram %d: ", i); |
| |
| for (j = 0; j < count; j++) |
| { |
| if (j > 0) |
| appendStringInfoString(&buf, ", "); |
| |
| /* |
| * XXX This representation is nice only for all-ASCII trigrams. |
| */ |
| appendStringInfo(&buf, "\"%c%c%c\"", (*p)[0], (*p)[1], (*p)[2]); |
| p++; |
| } |
| } |
| |
| appendStringInfoString(&buf, " >];\n"); |
| appendStringInfoString(&buf, " }\n"); |
| appendStringInfoString(&buf, "}\n"); |
| |
| { |
| /* dot -Tpng -o /tmp/packed.png < /tmp/packed.gv */ |
| FILE *fp = fopen("/tmp/packed.gv", "w"); |
| |
| fprintf(fp, "%s", buf.data); |
| fclose(fp); |
| } |
| |
| pfree(buf.data); |
| } |
| |
| #endif /* TRGM_REGEXP_DEBUG */ |