| /*------------------------------------------------------------------------- |
| * |
| * tsvector_op.c |
| * operations over tsvector |
| * |
| * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group |
| * |
| * |
| * IDENTIFICATION |
| * src/backend/utils/adt/tsvector_op.c |
| * |
| *------------------------------------------------------------------------- |
| */ |
| #include "postgres.h" |
| |
| #include <limits.h> |
| |
| #include "access/htup_details.h" |
| #include "catalog/namespace.h" |
| #include "catalog/pg_type.h" |
| #include "commands/trigger.h" |
| #include "executor/spi.h" |
| #include "funcapi.h" |
| #include "lib/qunique.h" |
| #include "mb/pg_wchar.h" |
| #include "miscadmin.h" |
| #include "parser/parse_coerce.h" |
| #include "tsearch/ts_utils.h" |
| #include "utils/array.h" |
| #include "utils/builtins.h" |
| #include "utils/lsyscache.h" |
| #include "utils/regproc.h" |
| #include "utils/rel.h" |
| |
| |
| typedef struct |
| { |
| WordEntry *arrb; |
| WordEntry *arre; |
| char *values; |
| char *operand; |
| } CHKVAL; |
| |
| |
| typedef struct StatEntry |
| { |
| uint32 ndoc; /* zero indicates that we were already here |
| * while walking through the tree */ |
| uint32 nentry; |
| struct StatEntry *left; |
| struct StatEntry *right; |
| uint32 lenlexeme; |
| char lexeme[FLEXIBLE_ARRAY_MEMBER]; |
| } StatEntry; |
| |
| #define STATENTRYHDRSZ (offsetof(StatEntry, lexeme)) |
| |
| typedef struct |
| { |
| int32 weight; |
| |
| uint32 maxdepth; |
| |
| StatEntry **stack; |
| uint32 stackpos; |
| |
| StatEntry *root; |
| } TSVectorStat; |
| |
| |
| static TSTernaryValue TS_execute_recurse(QueryItem *curitem, void *arg, |
| uint32 flags, |
| TSExecuteCallback chkcond); |
| static bool TS_execute_locations_recurse(QueryItem *curitem, |
| void *arg, |
| TSExecuteCallback chkcond, |
| List **locations); |
| static int tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len); |
| static Datum tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column); |
| |
| |
| /* |
| * Order: haspos, len, word, for all positions (pos, weight) |
| */ |
| static int |
| silly_cmp_tsvector(const TSVector a, const TSVector b) |
| { |
| if (VARSIZE(a) < VARSIZE(b)) |
| return -1; |
| else if (VARSIZE(a) > VARSIZE(b)) |
| return 1; |
| else if (a->size < b->size) |
| return -1; |
| else if (a->size > b->size) |
| return 1; |
| else |
| { |
| WordEntry *aptr = ARRPTR(a); |
| WordEntry *bptr = ARRPTR(b); |
| int i = 0; |
| int res; |
| |
| |
| for (i = 0; i < a->size; i++) |
| { |
| if (aptr->haspos != bptr->haspos) |
| { |
| return (aptr->haspos > bptr->haspos) ? -1 : 1; |
| } |
| else if ((res = tsCompareString(STRPTR(a) + aptr->pos, aptr->len, STRPTR(b) + bptr->pos, bptr->len, false)) != 0) |
| { |
| return res; |
| } |
| else if (aptr->haspos) |
| { |
| WordEntryPos *ap = POSDATAPTR(a, aptr); |
| WordEntryPos *bp = POSDATAPTR(b, bptr); |
| int j; |
| |
| if (POSDATALEN(a, aptr) != POSDATALEN(b, bptr)) |
| return (POSDATALEN(a, aptr) > POSDATALEN(b, bptr)) ? -1 : 1; |
| |
| for (j = 0; j < POSDATALEN(a, aptr); j++) |
| { |
| if (WEP_GETPOS(*ap) != WEP_GETPOS(*bp)) |
| { |
| return (WEP_GETPOS(*ap) > WEP_GETPOS(*bp)) ? -1 : 1; |
| } |
| else if (WEP_GETWEIGHT(*ap) != WEP_GETWEIGHT(*bp)) |
| { |
| return (WEP_GETWEIGHT(*ap) > WEP_GETWEIGHT(*bp)) ? -1 : 1; |
| } |
| ap++, bp++; |
| } |
| } |
| |
| aptr++; |
| bptr++; |
| } |
| } |
| |
| return 0; |
| } |
| |
| #define TSVECTORCMPFUNC( type, action, ret ) \ |
| Datum \ |
| tsvector_##type(PG_FUNCTION_ARGS) \ |
| { \ |
| TSVector a = PG_GETARG_TSVECTOR(0); \ |
| TSVector b = PG_GETARG_TSVECTOR(1); \ |
| int res = silly_cmp_tsvector(a, b); \ |
| PG_FREE_IF_COPY(a,0); \ |
| PG_FREE_IF_COPY(b,1); \ |
| PG_RETURN_##ret( res action 0 ); \ |
| } \ |
| /* keep compiler quiet - no extra ; */ \ |
| extern int no_such_variable |
| |
| TSVECTORCMPFUNC(lt, <, BOOL); |
| TSVECTORCMPFUNC(le, <=, BOOL); |
| TSVECTORCMPFUNC(eq, ==, BOOL); |
| TSVECTORCMPFUNC(ge, >=, BOOL); |
| TSVECTORCMPFUNC(gt, >, BOOL); |
| TSVECTORCMPFUNC(ne, !=, BOOL); |
| TSVECTORCMPFUNC(cmp, +, INT32); |
| |
| Datum |
| tsvector_strip(PG_FUNCTION_ARGS) |
| { |
| TSVector in = PG_GETARG_TSVECTOR(0); |
| TSVector out; |
| int i, |
| len = 0; |
| WordEntry *arrin = ARRPTR(in), |
| *arrout; |
| char *cur; |
| |
| for (i = 0; i < in->size; i++) |
| len += arrin[i].len; |
| |
| len = CALCDATASIZE(in->size, len); |
| out = (TSVector) palloc0(len); |
| SET_VARSIZE(out, len); |
| out->size = in->size; |
| arrout = ARRPTR(out); |
| cur = STRPTR(out); |
| for (i = 0; i < in->size; i++) |
| { |
| memcpy(cur, STRPTR(in) + arrin[i].pos, arrin[i].len); |
| arrout[i].haspos = 0; |
| arrout[i].len = arrin[i].len; |
| arrout[i].pos = cur - STRPTR(out); |
| cur += arrout[i].len; |
| } |
| |
| PG_FREE_IF_COPY(in, 0); |
| PG_RETURN_POINTER(out); |
| } |
| |
| Datum |
| tsvector_length(PG_FUNCTION_ARGS) |
| { |
| TSVector in = PG_GETARG_TSVECTOR(0); |
| int32 ret = in->size; |
| |
| PG_FREE_IF_COPY(in, 0); |
| PG_RETURN_INT32(ret); |
| } |
| |
| Datum |
| tsvector_setweight(PG_FUNCTION_ARGS) |
| { |
| TSVector in = PG_GETARG_TSVECTOR(0); |
| char cw = PG_GETARG_CHAR(1); |
| TSVector out; |
| int i, |
| j; |
| WordEntry *entry; |
| WordEntryPos *p; |
| int w = 0; |
| |
| switch (cw) |
| { |
| case 'A': |
| case 'a': |
| w = 3; |
| break; |
| case 'B': |
| case 'b': |
| w = 2; |
| break; |
| case 'C': |
| case 'c': |
| w = 1; |
| break; |
| case 'D': |
| case 'd': |
| w = 0; |
| break; |
| default: |
| /* internal error */ |
| elog(ERROR, "unrecognized weight: %d", cw); |
| } |
| |
| out = (TSVector) palloc(VARSIZE(in)); |
| memcpy(out, in, VARSIZE(in)); |
| entry = ARRPTR(out); |
| i = out->size; |
| while (i--) |
| { |
| if ((j = POSDATALEN(out, entry)) != 0) |
| { |
| p = POSDATAPTR(out, entry); |
| while (j--) |
| { |
| WEP_SETWEIGHT(*p, w); |
| p++; |
| } |
| } |
| entry++; |
| } |
| |
| PG_FREE_IF_COPY(in, 0); |
| PG_RETURN_POINTER(out); |
| } |
| |
| /* |
| * setweight(tsin tsvector, char_weight "char", lexemes "text"[]) |
| * |
| * Assign weight w to elements of tsin that are listed in lexemes. |
| */ |
| Datum |
| tsvector_setweight_by_filter(PG_FUNCTION_ARGS) |
| { |
| TSVector tsin = PG_GETARG_TSVECTOR(0); |
| char char_weight = PG_GETARG_CHAR(1); |
| ArrayType *lexemes = PG_GETARG_ARRAYTYPE_P(2); |
| |
| TSVector tsout; |
| int i, |
| j, |
| nlexemes, |
| weight; |
| WordEntry *entry; |
| Datum *dlexemes; |
| bool *nulls; |
| |
| switch (char_weight) |
| { |
| case 'A': |
| case 'a': |
| weight = 3; |
| break; |
| case 'B': |
| case 'b': |
| weight = 2; |
| break; |
| case 'C': |
| case 'c': |
| weight = 1; |
| break; |
| case 'D': |
| case 'd': |
| weight = 0; |
| break; |
| default: |
| /* internal error */ |
| elog(ERROR, "unrecognized weight: %c", char_weight); |
| } |
| |
| tsout = (TSVector) palloc(VARSIZE(tsin)); |
| memcpy(tsout, tsin, VARSIZE(tsin)); |
| entry = ARRPTR(tsout); |
| |
| deconstruct_array_builtin(lexemes, TEXTOID, &dlexemes, &nulls, &nlexemes); |
| |
| /* |
| * Assuming that lexemes array is significantly shorter than tsvector we |
| * can iterate through lexemes performing binary search of each lexeme |
| * from lexemes in tsvector. |
| */ |
| for (i = 0; i < nlexemes; i++) |
| { |
| char *lex; |
| int lex_len, |
| lex_pos; |
| |
| /* Ignore null array elements, they surely don't match */ |
| if (nulls[i]) |
| continue; |
| |
| lex = VARDATA(dlexemes[i]); |
| lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ; |
| lex_pos = tsvector_bsearch(tsout, lex, lex_len); |
| |
| if (lex_pos >= 0 && (j = POSDATALEN(tsout, entry + lex_pos)) != 0) |
| { |
| WordEntryPos *p = POSDATAPTR(tsout, entry + lex_pos); |
| |
| while (j--) |
| { |
| WEP_SETWEIGHT(*p, weight); |
| p++; |
| } |
| } |
| } |
| |
| PG_FREE_IF_COPY(tsin, 0); |
| PG_FREE_IF_COPY(lexemes, 2); |
| |
| PG_RETURN_POINTER(tsout); |
| } |
| |
| #define compareEntry(pa, a, pb, b) \ |
| tsCompareString((pa) + (a)->pos, (a)->len, \ |
| (pb) + (b)->pos, (b)->len, \ |
| false) |
| |
| /* |
| * Add positions from src to dest after offsetting them by maxpos. |
| * Return the number added (might be less than expected due to overflow) |
| */ |
| static int32 |
| add_pos(TSVector src, WordEntry *srcptr, |
| TSVector dest, WordEntry *destptr, |
| int32 maxpos) |
| { |
| uint16 *clen = &_POSVECPTR(dest, destptr)->npos; |
| int i; |
| uint16 slen = POSDATALEN(src, srcptr), |
| startlen; |
| WordEntryPos *spos = POSDATAPTR(src, srcptr), |
| *dpos = POSDATAPTR(dest, destptr); |
| |
| if (!destptr->haspos) |
| *clen = 0; |
| |
| startlen = *clen; |
| for (i = 0; |
| i < slen && *clen < MAXNUMPOS && |
| (*clen == 0 || WEP_GETPOS(dpos[*clen - 1]) != MAXENTRYPOS - 1); |
| i++) |
| { |
| WEP_SETWEIGHT(dpos[*clen], WEP_GETWEIGHT(spos[i])); |
| WEP_SETPOS(dpos[*clen], LIMITPOS(WEP_GETPOS(spos[i]) + maxpos)); |
| (*clen)++; |
| } |
| |
| if (*clen != startlen) |
| destptr->haspos = 1; |
| return *clen - startlen; |
| } |
| |
| /* |
| * Perform binary search of given lexeme in TSVector. |
| * Returns lexeme position in TSVector's entry array or -1 if lexeme wasn't |
| * found. |
| */ |
| static int |
| tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len) |
| { |
| WordEntry *arrin = ARRPTR(tsv); |
| int StopLow = 0, |
| StopHigh = tsv->size, |
| StopMiddle, |
| cmp; |
| |
| while (StopLow < StopHigh) |
| { |
| StopMiddle = (StopLow + StopHigh) / 2; |
| |
| cmp = tsCompareString(lexeme, lexeme_len, |
| STRPTR(tsv) + arrin[StopMiddle].pos, |
| arrin[StopMiddle].len, |
| false); |
| |
| if (cmp < 0) |
| StopHigh = StopMiddle; |
| else if (cmp > 0) |
| StopLow = StopMiddle + 1; |
| else /* found it */ |
| return StopMiddle; |
| } |
| |
| return -1; |
| } |
| |
| /* |
| * qsort comparator functions |
| */ |
| |
| static int |
| compare_int(const void *va, const void *vb) |
| { |
| int a = *((const int *) va); |
| int b = *((const int *) vb); |
| |
| if (a == b) |
| return 0; |
| return (a > b) ? 1 : -1; |
| } |
| |
| static int |
| compare_text_lexemes(const void *va, const void *vb) |
| { |
| Datum a = *((const Datum *) va); |
| Datum b = *((const Datum *) vb); |
| char *alex = VARDATA_ANY(a); |
| int alex_len = VARSIZE_ANY_EXHDR(a); |
| char *blex = VARDATA_ANY(b); |
| int blex_len = VARSIZE_ANY_EXHDR(b); |
| |
| return tsCompareString(alex, alex_len, blex, blex_len, false); |
| } |
| |
| /* |
| * Internal routine to delete lexemes from TSVector by array of offsets. |
| * |
| * int *indices_to_delete -- array of lexeme offsets to delete (modified here!) |
| * int indices_count -- size of that array |
| * |
| * Returns new TSVector without given lexemes along with their positions |
| * and weights. |
| */ |
| static TSVector |
| tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete, |
| int indices_count) |
| { |
| TSVector tsout; |
| WordEntry *arrin = ARRPTR(tsv), |
| *arrout; |
| char *data = STRPTR(tsv), |
| *dataout; |
| int i, /* index in arrin */ |
| j, /* index in arrout */ |
| k, /* index in indices_to_delete */ |
| curoff; /* index in dataout area */ |
| |
| /* |
| * Sort the filter array to simplify membership checks below. Also, get |
| * rid of any duplicate entries, so that we can assume that indices_count |
| * is exactly equal to the number of lexemes that will be removed. |
| */ |
| if (indices_count > 1) |
| { |
| qsort(indices_to_delete, indices_count, sizeof(int), compare_int); |
| indices_count = qunique(indices_to_delete, indices_count, sizeof(int), |
| compare_int); |
| } |
| |
| /* |
| * Here we overestimate tsout size, since we don't know how much space is |
| * used by the deleted lexeme(s). We will set exact size below. |
| */ |
| tsout = (TSVector) palloc0(VARSIZE(tsv)); |
| |
| /* This count must be correct because STRPTR(tsout) relies on it. */ |
| tsout->size = tsv->size - indices_count; |
| |
| /* |
| * Copy tsv to tsout, skipping lexemes listed in indices_to_delete. |
| */ |
| arrout = ARRPTR(tsout); |
| dataout = STRPTR(tsout); |
| curoff = 0; |
| for (i = j = k = 0; i < tsv->size; i++) |
| { |
| /* |
| * If current i is present in indices_to_delete, skip this lexeme. |
| * Since indices_to_delete is already sorted, we only need to check |
| * the current (k'th) entry. |
| */ |
| if (k < indices_count && i == indices_to_delete[k]) |
| { |
| k++; |
| continue; |
| } |
| |
| /* Copy lexeme and its positions and weights */ |
| memcpy(dataout + curoff, data + arrin[i].pos, arrin[i].len); |
| arrout[j].haspos = arrin[i].haspos; |
| arrout[j].len = arrin[i].len; |
| arrout[j].pos = curoff; |
| curoff += arrin[i].len; |
| if (arrin[i].haspos) |
| { |
| int len = POSDATALEN(tsv, arrin + i) * sizeof(WordEntryPos) |
| + sizeof(uint16); |
| |
| curoff = SHORTALIGN(curoff); |
| memcpy(dataout + curoff, |
| STRPTR(tsv) + SHORTALIGN(arrin[i].pos + arrin[i].len), |
| len); |
| curoff += len; |
| } |
| |
| j++; |
| } |
| |
| /* |
| * k should now be exactly equal to indices_count. If it isn't then the |
| * caller provided us with indices outside of [0, tsv->size) range and |
| * estimation of tsout's size is wrong. |
| */ |
| Assert(k == indices_count); |
| |
| SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, curoff)); |
| return tsout; |
| } |
| |
| /* |
| * Delete given lexeme from tsvector. |
| * Implementation of user-level ts_delete(tsvector, text). |
| */ |
| Datum |
| tsvector_delete_str(PG_FUNCTION_ARGS) |
| { |
| TSVector tsin = PG_GETARG_TSVECTOR(0), |
| tsout; |
| text *tlexeme = PG_GETARG_TEXT_PP(1); |
| char *lexeme = VARDATA_ANY(tlexeme); |
| int lexeme_len = VARSIZE_ANY_EXHDR(tlexeme), |
| skip_index; |
| |
| if ((skip_index = tsvector_bsearch(tsin, lexeme, lexeme_len)) == -1) |
| PG_RETURN_POINTER(tsin); |
| |
| tsout = tsvector_delete_by_indices(tsin, &skip_index, 1); |
| |
| PG_FREE_IF_COPY(tsin, 0); |
| PG_FREE_IF_COPY(tlexeme, 1); |
| PG_RETURN_POINTER(tsout); |
| } |
| |
| /* |
| * Delete given array of lexemes from tsvector. |
| * Implementation of user-level ts_delete(tsvector, text[]). |
| */ |
| Datum |
| tsvector_delete_arr(PG_FUNCTION_ARGS) |
| { |
| TSVector tsin = PG_GETARG_TSVECTOR(0), |
| tsout; |
| ArrayType *lexemes = PG_GETARG_ARRAYTYPE_P(1); |
| int i, |
| nlex, |
| skip_count, |
| *skip_indices; |
| Datum *dlexemes; |
| bool *nulls; |
| |
| deconstruct_array_builtin(lexemes, TEXTOID, &dlexemes, &nulls, &nlex); |
| |
| /* |
| * In typical use case array of lexemes to delete is relatively small. So |
| * here we optimize things for that scenario: iterate through lexarr |
| * performing binary search of each lexeme from lexarr in tsvector. |
| */ |
| skip_indices = palloc0(nlex * sizeof(int)); |
| for (i = skip_count = 0; i < nlex; i++) |
| { |
| char *lex; |
| int lex_len, |
| lex_pos; |
| |
| /* Ignore null array elements, they surely don't match */ |
| if (nulls[i]) |
| continue; |
| |
| lex = VARDATA(dlexemes[i]); |
| lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ; |
| lex_pos = tsvector_bsearch(tsin, lex, lex_len); |
| |
| if (lex_pos >= 0) |
| skip_indices[skip_count++] = lex_pos; |
| } |
| |
| tsout = tsvector_delete_by_indices(tsin, skip_indices, skip_count); |
| |
| pfree(skip_indices); |
| PG_FREE_IF_COPY(tsin, 0); |
| PG_FREE_IF_COPY(lexemes, 1); |
| |
| PG_RETURN_POINTER(tsout); |
| } |
| |
| /* |
| * Expand tsvector as table with following columns: |
| * lexeme: lexeme text |
| * positions: integer array of lexeme positions |
| * weights: char array of weights corresponding to positions |
| */ |
| Datum |
| tsvector_unnest(PG_FUNCTION_ARGS) |
| { |
| FuncCallContext *funcctx; |
| TSVector tsin; |
| |
| if (SRF_IS_FIRSTCALL()) |
| { |
| MemoryContext oldcontext; |
| TupleDesc tupdesc; |
| |
| funcctx = SRF_FIRSTCALL_INIT(); |
| oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); |
| |
| tupdesc = CreateTemplateTupleDesc(3); |
| TupleDescInitEntry(tupdesc, (AttrNumber) 1, "lexeme", |
| TEXTOID, -1, 0); |
| TupleDescInitEntry(tupdesc, (AttrNumber) 2, "positions", |
| INT2ARRAYOID, -1, 0); |
| TupleDescInitEntry(tupdesc, (AttrNumber) 3, "weights", |
| TEXTARRAYOID, -1, 0); |
| if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) |
| elog(ERROR, "return type must be a row type"); |
| funcctx->tuple_desc = tupdesc; |
| |
| funcctx->user_fctx = PG_GETARG_TSVECTOR_COPY(0); |
| |
| MemoryContextSwitchTo(oldcontext); |
| } |
| |
| funcctx = SRF_PERCALL_SETUP(); |
| tsin = (TSVector) funcctx->user_fctx; |
| |
| if (funcctx->call_cntr < tsin->size) |
| { |
| WordEntry *arrin = ARRPTR(tsin); |
| char *data = STRPTR(tsin); |
| HeapTuple tuple; |
| int j, |
| i = funcctx->call_cntr; |
| bool nulls[] = {false, false, false}; |
| Datum values[3]; |
| |
| values[0] = PointerGetDatum(cstring_to_text_with_len(data + arrin[i].pos, arrin[i].len)); |
| |
| if (arrin[i].haspos) |
| { |
| WordEntryPosVector *posv; |
| Datum *positions; |
| Datum *weights; |
| char weight; |
| |
| /* |
| * Internally tsvector stores position and weight in the same |
| * uint16 (2 bits for weight, 14 for position). Here we extract |
| * that in two separate arrays. |
| */ |
| posv = _POSVECPTR(tsin, arrin + i); |
| positions = palloc(posv->npos * sizeof(Datum)); |
| weights = palloc(posv->npos * sizeof(Datum)); |
| for (j = 0; j < posv->npos; j++) |
| { |
| positions[j] = Int16GetDatum(WEP_GETPOS(posv->pos[j])); |
| weight = 'D' - WEP_GETWEIGHT(posv->pos[j]); |
| weights[j] = PointerGetDatum(cstring_to_text_with_len(&weight, |
| 1)); |
| } |
| |
| values[1] = PointerGetDatum(construct_array_builtin(positions, posv->npos, INT2OID)); |
| values[2] = PointerGetDatum(construct_array_builtin(weights, posv->npos, TEXTOID)); |
| } |
| else |
| { |
| nulls[1] = nulls[2] = true; |
| } |
| |
| tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); |
| SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple)); |
| } |
| else |
| { |
| SRF_RETURN_DONE(funcctx); |
| } |
| } |
| |
| /* |
| * Convert tsvector to array of lexemes. |
| */ |
| Datum |
| tsvector_to_array(PG_FUNCTION_ARGS) |
| { |
| TSVector tsin = PG_GETARG_TSVECTOR(0); |
| WordEntry *arrin = ARRPTR(tsin); |
| Datum *elements; |
| int i; |
| ArrayType *array; |
| |
| elements = palloc(tsin->size * sizeof(Datum)); |
| |
| for (i = 0; i < tsin->size; i++) |
| { |
| elements[i] = PointerGetDatum(cstring_to_text_with_len(STRPTR(tsin) + arrin[i].pos, |
| arrin[i].len)); |
| } |
| |
| array = construct_array_builtin(elements, tsin->size, TEXTOID); |
| |
| pfree(elements); |
| PG_FREE_IF_COPY(tsin, 0); |
| PG_RETURN_POINTER(array); |
| } |
| |
| /* |
| * Build tsvector from array of lexemes. |
| */ |
| Datum |
| array_to_tsvector(PG_FUNCTION_ARGS) |
| { |
| ArrayType *v = PG_GETARG_ARRAYTYPE_P(0); |
| TSVector tsout; |
| Datum *dlexemes; |
| WordEntry *arrout; |
| bool *nulls; |
| int nitems, |
| i, |
| tslen, |
| datalen = 0; |
| char *cur; |
| |
| deconstruct_array_builtin(v, TEXTOID, &dlexemes, &nulls, &nitems); |
| |
| /* |
| * Reject nulls and zero length strings (maybe we should just ignore them, |
| * instead?) |
| */ |
| for (i = 0; i < nitems; i++) |
| { |
| if (nulls[i]) |
| ereport(ERROR, |
| (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), |
| errmsg("lexeme array may not contain nulls"))); |
| |
| if (VARSIZE(dlexemes[i]) - VARHDRSZ == 0) |
| ereport(ERROR, |
| (errcode(ERRCODE_ZERO_LENGTH_CHARACTER_STRING), |
| errmsg("lexeme array may not contain empty strings"))); |
| } |
| |
| /* Sort and de-dup, because this is required for a valid tsvector. */ |
| if (nitems > 1) |
| { |
| qsort(dlexemes, nitems, sizeof(Datum), compare_text_lexemes); |
| nitems = qunique(dlexemes, nitems, sizeof(Datum), |
| compare_text_lexemes); |
| } |
| |
| /* Calculate space needed for surviving lexemes. */ |
| for (i = 0; i < nitems; i++) |
| datalen += VARSIZE(dlexemes[i]) - VARHDRSZ; |
| tslen = CALCDATASIZE(nitems, datalen); |
| |
| /* Allocate and fill tsvector. */ |
| tsout = (TSVector) palloc0(tslen); |
| SET_VARSIZE(tsout, tslen); |
| tsout->size = nitems; |
| |
| arrout = ARRPTR(tsout); |
| cur = STRPTR(tsout); |
| for (i = 0; i < nitems; i++) |
| { |
| char *lex = VARDATA(dlexemes[i]); |
| int lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ; |
| |
| memcpy(cur, lex, lex_len); |
| arrout[i].haspos = 0; |
| arrout[i].len = lex_len; |
| arrout[i].pos = cur - STRPTR(tsout); |
| cur += lex_len; |
| } |
| |
| PG_FREE_IF_COPY(v, 0); |
| PG_RETURN_POINTER(tsout); |
| } |
| |
| /* |
| * ts_filter(): keep only lexemes with given weights in tsvector. |
| */ |
| Datum |
| tsvector_filter(PG_FUNCTION_ARGS) |
| { |
| TSVector tsin = PG_GETARG_TSVECTOR(0), |
| tsout; |
| ArrayType *weights = PG_GETARG_ARRAYTYPE_P(1); |
| WordEntry *arrin = ARRPTR(tsin), |
| *arrout; |
| char *datain = STRPTR(tsin), |
| *dataout; |
| Datum *dweights; |
| bool *nulls; |
| int nweights; |
| int i, |
| j; |
| int cur_pos = 0; |
| char mask = 0; |
| |
| deconstruct_array_builtin(weights, CHAROID, &dweights, &nulls, &nweights); |
| |
| for (i = 0; i < nweights; i++) |
| { |
| char char_weight; |
| |
| if (nulls[i]) |
| ereport(ERROR, |
| (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), |
| errmsg("weight array may not contain nulls"))); |
| |
| char_weight = DatumGetChar(dweights[i]); |
| switch (char_weight) |
| { |
| case 'A': |
| case 'a': |
| mask = mask | 8; |
| break; |
| case 'B': |
| case 'b': |
| mask = mask | 4; |
| break; |
| case 'C': |
| case 'c': |
| mask = mask | 2; |
| break; |
| case 'D': |
| case 'd': |
| mask = mask | 1; |
| break; |
| default: |
| ereport(ERROR, |
| (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
| errmsg("unrecognized weight: \"%c\"", char_weight))); |
| } |
| } |
| |
| tsout = (TSVector) palloc0(VARSIZE(tsin)); |
| tsout->size = tsin->size; |
| arrout = ARRPTR(tsout); |
| dataout = STRPTR(tsout); |
| |
| for (i = j = 0; i < tsin->size; i++) |
| { |
| WordEntryPosVector *posvin, |
| *posvout; |
| int npos = 0; |
| int k; |
| |
| if (!arrin[i].haspos) |
| continue; |
| |
| posvin = _POSVECPTR(tsin, arrin + i); |
| posvout = (WordEntryPosVector *) |
| (dataout + SHORTALIGN(cur_pos + arrin[i].len)); |
| |
| for (k = 0; k < posvin->npos; k++) |
| { |
| if (mask & (1 << WEP_GETWEIGHT(posvin->pos[k]))) |
| posvout->pos[npos++] = posvin->pos[k]; |
| } |
| |
| /* if no satisfactory positions found, skip lexeme */ |
| if (!npos) |
| continue; |
| |
| arrout[j].haspos = true; |
| arrout[j].len = arrin[i].len; |
| arrout[j].pos = cur_pos; |
| |
| memcpy(dataout + cur_pos, datain + arrin[i].pos, arrin[i].len); |
| posvout->npos = npos; |
| cur_pos += SHORTALIGN(arrin[i].len); |
| cur_pos += POSDATALEN(tsout, arrout + j) * sizeof(WordEntryPos) + |
| sizeof(uint16); |
| j++; |
| } |
| |
| tsout->size = j; |
| if (dataout != STRPTR(tsout)) |
| memmove(STRPTR(tsout), dataout, cur_pos); |
| |
| SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, cur_pos)); |
| |
| PG_FREE_IF_COPY(tsin, 0); |
| PG_RETURN_POINTER(tsout); |
| } |
| |
| Datum |
| tsvector_concat(PG_FUNCTION_ARGS) |
| { |
| TSVector in1 = PG_GETARG_TSVECTOR(0); |
| TSVector in2 = PG_GETARG_TSVECTOR(1); |
| TSVector out; |
| WordEntry *ptr; |
| WordEntry *ptr1, |
| *ptr2; |
| WordEntryPos *p; |
| int maxpos = 0, |
| i, |
| j, |
| i1, |
| i2, |
| dataoff, |
| output_bytes, |
| output_size; |
| char *data, |
| *data1, |
| *data2; |
| |
| /* Get max position in in1; we'll need this to offset in2's positions */ |
| ptr = ARRPTR(in1); |
| i = in1->size; |
| while (i--) |
| { |
| if ((j = POSDATALEN(in1, ptr)) != 0) |
| { |
| p = POSDATAPTR(in1, ptr); |
| while (j--) |
| { |
| if (WEP_GETPOS(*p) > maxpos) |
| maxpos = WEP_GETPOS(*p); |
| p++; |
| } |
| } |
| ptr++; |
| } |
| |
| ptr1 = ARRPTR(in1); |
| ptr2 = ARRPTR(in2); |
| data1 = STRPTR(in1); |
| data2 = STRPTR(in2); |
| i1 = in1->size; |
| i2 = in2->size; |
| |
| /* |
| * Conservative estimate of space needed. We might need all the data in |
| * both inputs, and conceivably add a pad byte before position data for |
| * each item where there was none before. |
| */ |
| output_bytes = VARSIZE(in1) + VARSIZE(in2) + i1 + i2; |
| |
| out = (TSVector) palloc0(output_bytes); |
| SET_VARSIZE(out, output_bytes); |
| |
| /* |
| * We must make out->size valid so that STRPTR(out) is sensible. We'll |
| * collapse out any unused space at the end. |
| */ |
| out->size = in1->size + in2->size; |
| |
| ptr = ARRPTR(out); |
| data = STRPTR(out); |
| dataoff = 0; |
| while (i1 && i2) |
| { |
| int cmp = compareEntry(data1, ptr1, data2, ptr2); |
| |
| if (cmp < 0) |
| { /* in1 first */ |
| ptr->haspos = ptr1->haspos; |
| ptr->len = ptr1->len; |
| memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len); |
| ptr->pos = dataoff; |
| dataoff += ptr1->len; |
| if (ptr->haspos) |
| { |
| dataoff = SHORTALIGN(dataoff); |
| memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); |
| dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16); |
| } |
| |
| ptr++; |
| ptr1++; |
| i1--; |
| } |
| else if (cmp > 0) |
| { /* in2 first */ |
| ptr->haspos = ptr2->haspos; |
| ptr->len = ptr2->len; |
| memcpy(data + dataoff, data2 + ptr2->pos, ptr2->len); |
| ptr->pos = dataoff; |
| dataoff += ptr2->len; |
| if (ptr->haspos) |
| { |
| int addlen = add_pos(in2, ptr2, out, ptr, maxpos); |
| |
| if (addlen == 0) |
| ptr->haspos = 0; |
| else |
| { |
| dataoff = SHORTALIGN(dataoff); |
| dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16); |
| } |
| } |
| |
| ptr++; |
| ptr2++; |
| i2--; |
| } |
| else |
| { |
| ptr->haspos = ptr1->haspos | ptr2->haspos; |
| ptr->len = ptr1->len; |
| memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len); |
| ptr->pos = dataoff; |
| dataoff += ptr1->len; |
| if (ptr->haspos) |
| { |
| if (ptr1->haspos) |
| { |
| dataoff = SHORTALIGN(dataoff); |
| memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); |
| dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16); |
| if (ptr2->haspos) |
| dataoff += add_pos(in2, ptr2, out, ptr, maxpos) * sizeof(WordEntryPos); |
| } |
| else /* must have ptr2->haspos */ |
| { |
| int addlen = add_pos(in2, ptr2, out, ptr, maxpos); |
| |
| if (addlen == 0) |
| ptr->haspos = 0; |
| else |
| { |
| dataoff = SHORTALIGN(dataoff); |
| dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16); |
| } |
| } |
| } |
| |
| ptr++; |
| ptr1++; |
| ptr2++; |
| i1--; |
| i2--; |
| } |
| } |
| |
| while (i1) |
| { |
| ptr->haspos = ptr1->haspos; |
| ptr->len = ptr1->len; |
| memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len); |
| ptr->pos = dataoff; |
| dataoff += ptr1->len; |
| if (ptr->haspos) |
| { |
| dataoff = SHORTALIGN(dataoff); |
| memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); |
| dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16); |
| } |
| |
| ptr++; |
| ptr1++; |
| i1--; |
| } |
| |
| while (i2) |
| { |
| ptr->haspos = ptr2->haspos; |
| ptr->len = ptr2->len; |
| memcpy(data + dataoff, data2 + ptr2->pos, ptr2->len); |
| ptr->pos = dataoff; |
| dataoff += ptr2->len; |
| if (ptr->haspos) |
| { |
| int addlen = add_pos(in2, ptr2, out, ptr, maxpos); |
| |
| if (addlen == 0) |
| ptr->haspos = 0; |
| else |
| { |
| dataoff = SHORTALIGN(dataoff); |
| dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16); |
| } |
| } |
| |
| ptr++; |
| ptr2++; |
| i2--; |
| } |
| |
| /* |
| * Instead of checking each offset individually, we check for overflow of |
| * pos fields once at the end. |
| */ |
| if (dataoff > MAXSTRPOS) |
| ereport(ERROR, |
| (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
| errmsg("string is too long for tsvector (%d bytes, max %d bytes)", dataoff, MAXSTRPOS))); |
| |
| /* |
| * Adjust sizes (asserting that we didn't overrun the original estimates) |
| * and collapse out any unused array entries. |
| */ |
| output_size = ptr - ARRPTR(out); |
| Assert(output_size <= out->size); |
| out->size = output_size; |
| if (data != STRPTR(out)) |
| memmove(STRPTR(out), data, dataoff); |
| output_bytes = CALCDATASIZE(out->size, dataoff); |
| Assert(output_bytes <= VARSIZE(out)); |
| SET_VARSIZE(out, output_bytes); |
| |
| PG_FREE_IF_COPY(in1, 0); |
| PG_FREE_IF_COPY(in2, 1); |
| PG_RETURN_POINTER(out); |
| } |
| |
| /* |
| * Compare two strings by tsvector rules. |
| * |
| * if prefix = true then it returns zero value iff b has prefix a |
| */ |
| int32 |
| tsCompareString(char *a, int _lena, char *b, int _lenb, bool prefix) |
| { |
| int cmp; |
| |
| Assert(_lena >= 0 && _lenb >= 0); |
| size_t lena = (size_t)_lena; |
| size_t lenb = (size_t)_lenb; |
| if (lena == 0) |
| { |
| if (prefix) |
| cmp = 0; /* empty string is prefix of anything */ |
| else |
| cmp = (lenb > 0) ? -1 : 0; |
| } |
| else if (lenb == 0) |
| { |
| cmp = (lena > 0) ? 1 : 0; |
| } |
| else |
| { |
| |
| #pragma GCC diagnostic push |
| #pragma GCC diagnostic ignored "-Wstringop-overflow" |
| |
| cmp = memcmp(a, b, Min((unsigned int) lena, (unsigned int) lenb)); |
| |
| #pragma GCC diagnostic pop |
| |
| if (prefix) |
| { |
| if (cmp == 0 && lena > lenb) |
| cmp = 1; /* a is longer, so not a prefix of b */ |
| } |
| else if (cmp == 0 && lena != lenb) |
| { |
| cmp = (lena < lenb) ? -1 : 1; |
| } |
| } |
| |
| return cmp; |
| } |
| |
| /* |
| * Check weight info or/and fill 'data' with the required positions |
| */ |
| static TSTernaryValue |
| checkclass_str(CHKVAL *chkval, WordEntry *entry, QueryOperand *val, |
| ExecPhraseData *data) |
| { |
| TSTernaryValue result = TS_NO; |
| |
| Assert(data == NULL || data->npos == 0); |
| |
| if (entry->haspos) |
| { |
| WordEntryPosVector *posvec; |
| |
| /* |
| * We can't use the _POSVECPTR macro here because the pointer to the |
| * tsvector's lexeme storage is already contained in chkval->values. |
| */ |
| posvec = (WordEntryPosVector *) |
| (chkval->values + SHORTALIGN(entry->pos + entry->len)); |
| |
| if (val->weight && data) |
| { |
| WordEntryPos *posvec_iter = posvec->pos; |
| WordEntryPos *dptr; |
| |
| /* |
| * Filter position information by weights |
| */ |
| dptr = data->pos = palloc(sizeof(WordEntryPos) * posvec->npos); |
| data->allocated = true; |
| |
| /* Is there a position with a matching weight? */ |
| while (posvec_iter < posvec->pos + posvec->npos) |
| { |
| /* If true, append this position to the data->pos */ |
| if (val->weight & (1 << WEP_GETWEIGHT(*posvec_iter))) |
| { |
| *dptr = WEP_GETPOS(*posvec_iter); |
| dptr++; |
| } |
| |
| posvec_iter++; |
| } |
| |
| data->npos = dptr - data->pos; |
| |
| if (data->npos > 0) |
| result = TS_YES; |
| else |
| { |
| pfree(data->pos); |
| data->pos = NULL; |
| data->allocated = false; |
| } |
| } |
| else if (val->weight) |
| { |
| WordEntryPos *posvec_iter = posvec->pos; |
| |
| /* Is there a position with a matching weight? */ |
| while (posvec_iter < posvec->pos + posvec->npos) |
| { |
| if (val->weight & (1 << WEP_GETWEIGHT(*posvec_iter))) |
| { |
| result = TS_YES; |
| break; /* no need to go further */ |
| } |
| |
| posvec_iter++; |
| } |
| } |
| else if (data) |
| { |
| data->npos = posvec->npos; |
| data->pos = posvec->pos; |
| data->allocated = false; |
| result = TS_YES; |
| } |
| else |
| { |
| /* simplest case: no weight check, positions not needed */ |
| result = TS_YES; |
| } |
| } |
| else |
| { |
| /* |
| * Position info is lacking, so if the caller requires it, we can only |
| * say that maybe there is a match. |
| * |
| * Notice, however, that we *don't* check val->weight here. |
| * Historically, stripped tsvectors are considered to match queries |
| * whether or not the query has a weight restriction; that's a little |
| * dubious but we'll preserve the behavior. |
| */ |
| if (data) |
| result = TS_MAYBE; |
| else |
| result = TS_YES; |
| } |
| |
| return result; |
| } |
| |
| /* |
| * TS_execute callback for matching a tsquery operand to plain tsvector data |
| */ |
| static TSTernaryValue |
| checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data) |
| { |
| CHKVAL *chkval = (CHKVAL *) checkval; |
| WordEntry *StopLow = chkval->arrb; |
| WordEntry *StopHigh = chkval->arre; |
| WordEntry *StopMiddle = StopHigh; |
| TSTernaryValue res = TS_NO; |
| |
| /* Loop invariant: StopLow <= val < StopHigh */ |
| while (StopLow < StopHigh) |
| { |
| int difference; |
| |
| StopMiddle = StopLow + (StopHigh - StopLow) / 2; |
| difference = tsCompareString(chkval->operand + val->distance, |
| val->length, |
| chkval->values + StopMiddle->pos, |
| StopMiddle->len, |
| false); |
| |
| if (difference == 0) |
| { |
| /* Check weight info & fill 'data' with positions */ |
| res = checkclass_str(chkval, StopMiddle, val, data); |
| break; |
| } |
| else if (difference > 0) |
| StopLow = StopMiddle + 1; |
| else |
| StopHigh = StopMiddle; |
| } |
| |
| /* |
| * If it's a prefix search, we should also consider lexemes that the |
| * search term is a prefix of (which will necessarily immediately follow |
| * the place we found in the above loop). But we can skip them if there |
| * was a definite match on the exact term AND the caller doesn't need |
| * position info. |
| */ |
| if (val->prefix && (res != TS_YES || data)) |
| { |
| WordEntryPos *allpos = NULL; |
| int npos = 0, |
| totalpos = 0; |
| |
| /* adjust start position for corner case */ |
| if (StopLow >= StopHigh) |
| StopMiddle = StopHigh; |
| |
| /* we don't try to re-use any data from the initial match */ |
| if (data) |
| { |
| if (data->allocated) |
| pfree(data->pos); |
| data->pos = NULL; |
| data->allocated = false; |
| data->npos = 0; |
| } |
| res = TS_NO; |
| |
| while ((res != TS_YES || data) && |
| StopMiddle < chkval->arre && |
| tsCompareString(chkval->operand + val->distance, |
| val->length, |
| chkval->values + StopMiddle->pos, |
| StopMiddle->len, |
| true) == 0) |
| { |
| TSTernaryValue subres; |
| |
| subres = checkclass_str(chkval, StopMiddle, val, data); |
| |
| if (subres != TS_NO) |
| { |
| if (data) |
| { |
| /* |
| * We need to join position information |
| */ |
| if (subres == TS_MAYBE) |
| { |
| /* |
| * No position info for this match, so we must report |
| * MAYBE overall. |
| */ |
| res = TS_MAYBE; |
| /* forget any previous positions */ |
| npos = 0; |
| /* don't leak storage */ |
| if (allpos) |
| pfree(allpos); |
| break; |
| } |
| |
| while (npos + data->npos > totalpos) |
| { |
| if (totalpos == 0) |
| { |
| totalpos = 256; |
| allpos = palloc(sizeof(WordEntryPos) * totalpos); |
| } |
| else |
| { |
| totalpos *= 2; |
| allpos = repalloc(allpos, sizeof(WordEntryPos) * totalpos); |
| } |
| } |
| |
| memcpy(allpos + npos, data->pos, sizeof(WordEntryPos) * data->npos); |
| npos += data->npos; |
| |
| /* don't leak storage from individual matches */ |
| if (data->allocated) |
| pfree(data->pos); |
| data->pos = NULL; |
| data->allocated = false; |
| /* it's important to reset data->npos before next loop */ |
| data->npos = 0; |
| } |
| else |
| { |
| /* Don't need positions, just handle YES/MAYBE */ |
| if (subres == TS_YES || res == TS_NO) |
| res = subres; |
| } |
| } |
| |
| StopMiddle++; |
| } |
| |
| if (data && npos > 0) |
| { |
| /* Sort and make unique array of found positions */ |
| data->pos = allpos; |
| qsort(data->pos, npos, sizeof(WordEntryPos), compareWordEntryPos); |
| data->npos = qunique(data->pos, npos, sizeof(WordEntryPos), |
| compareWordEntryPos); |
| data->allocated = true; |
| res = TS_YES; |
| } |
| } |
| |
| return res; |
| } |
| |
| /* |
| * Compute output position list for a tsquery operator in phrase mode. |
| * |
| * Merge the position lists in Ldata and Rdata as specified by "emit", |
| * returning the result list into *data. The input position lists must be |
| * sorted and unique, and the output will be as well. |
| * |
| * data: pointer to initially-all-zeroes output struct, or NULL |
| * Ldata, Rdata: input position lists |
| * emit: bitmask of TSPO_XXX flags |
| * Loffset: offset to be added to Ldata positions before comparing/outputting |
| * Roffset: offset to be added to Rdata positions before comparing/outputting |
| * max_npos: maximum possible required size of output position array |
| * |
| * Loffset and Roffset should not be negative, else we risk trying to output |
| * negative positions, which won't fit into WordEntryPos. |
| * |
| * The result is boolean (TS_YES or TS_NO), but for the caller's convenience |
| * we return it as TSTernaryValue. |
| * |
| * Returns TS_YES if any positions were emitted to *data; or if data is NULL, |
| * returns TS_YES if any positions would have been emitted. |
| */ |
| #define TSPO_L_ONLY 0x01 /* emit positions appearing only in L */ |
| #define TSPO_R_ONLY 0x02 /* emit positions appearing only in R */ |
| #define TSPO_BOTH 0x04 /* emit positions appearing in both L&R */ |
| |
| static TSTernaryValue |
| TS_phrase_output(ExecPhraseData *data, |
| ExecPhraseData *Ldata, |
| ExecPhraseData *Rdata, |
| int emit, |
| int Loffset, |
| int Roffset, |
| int max_npos) |
| { |
| int Lindex, |
| Rindex; |
| |
| /* Loop until both inputs are exhausted */ |
| Lindex = Rindex = 0; |
| while (Lindex < Ldata->npos || Rindex < Rdata->npos) |
| { |
| int Lpos, |
| Rpos; |
| int output_pos = 0; |
| |
| /* |
| * Fetch current values to compare. WEP_GETPOS() is needed because |
| * ExecPhraseData->data can point to a tsvector's WordEntryPosVector. |
| */ |
| if (Lindex < Ldata->npos) |
| Lpos = WEP_GETPOS(Ldata->pos[Lindex]) + Loffset; |
| else |
| { |
| /* L array exhausted, so we're done if R_ONLY isn't set */ |
| if (!(emit & TSPO_R_ONLY)) |
| break; |
| Lpos = INT_MAX; |
| } |
| if (Rindex < Rdata->npos) |
| Rpos = WEP_GETPOS(Rdata->pos[Rindex]) + Roffset; |
| else |
| { |
| /* R array exhausted, so we're done if L_ONLY isn't set */ |
| if (!(emit & TSPO_L_ONLY)) |
| break; |
| Rpos = INT_MAX; |
| } |
| |
| /* Merge-join the two input lists */ |
| if (Lpos < Rpos) |
| { |
| /* Lpos is not matched in Rdata, should we output it? */ |
| if (emit & TSPO_L_ONLY) |
| output_pos = Lpos; |
| Lindex++; |
| } |
| else if (Lpos == Rpos) |
| { |
| /* Lpos and Rpos match ... should we output it? */ |
| if (emit & TSPO_BOTH) |
| output_pos = Rpos; |
| Lindex++; |
| Rindex++; |
| } |
| else /* Lpos > Rpos */ |
| { |
| /* Rpos is not matched in Ldata, should we output it? */ |
| if (emit & TSPO_R_ONLY) |
| output_pos = Rpos; |
| Rindex++; |
| } |
| |
| if (output_pos > 0) |
| { |
| if (data) |
| { |
| /* Store position, first allocating output array if needed */ |
| if (data->pos == NULL) |
| { |
| data->pos = (WordEntryPos *) |
| palloc(max_npos * sizeof(WordEntryPos)); |
| data->allocated = true; |
| } |
| data->pos[data->npos++] = output_pos; |
| } |
| else |
| { |
| /* |
| * Exact positions not needed, so return TS_YES as soon as we |
| * know there is at least one. |
| */ |
| return TS_YES; |
| } |
| } |
| } |
| |
| if (data && data->npos > 0) |
| { |
| /* Let's assert we didn't overrun the array */ |
| Assert(data->npos <= max_npos); |
| return TS_YES; |
| } |
| return TS_NO; |
| } |
| |
| /* |
| * Execute tsquery at or below an OP_PHRASE operator. |
| * |
| * This handles tsquery execution at recursion levels where we need to care |
| * about match locations. |
| * |
| * In addition to the same arguments used for TS_execute, the caller may pass |
| * a preinitialized-to-zeroes ExecPhraseData struct, to be filled with lexeme |
| * match position info on success. data == NULL if no position data need be |
| * returned. |
| * Note: the function assumes data != NULL for operators other than OP_PHRASE. |
| * This is OK because an outside call always starts from an OP_PHRASE node, |
| * and all internal recursion cases pass data != NULL. |
| * |
| * The detailed semantics of the match data, given that the function returned |
| * TS_YES (successful match), are: |
| * |
| * npos > 0, negate = false: |
| * query is matched at specified position(s) (and only those positions) |
| * npos > 0, negate = true: |
| * query is matched at all positions *except* specified position(s) |
| * npos = 0, negate = true: |
| * query is matched at all positions |
| * npos = 0, negate = false: |
| * disallowed (this should result in TS_NO or TS_MAYBE, as appropriate) |
| * |
| * Successful matches also return a "width" value which is the match width in |
| * lexemes, less one. Hence, "width" is zero for simple one-lexeme matches, |
| * and is the sum of the phrase operator distances for phrase matches. Note |
| * that when width > 0, the listed positions represent the ends of matches not |
| * the starts. (This unintuitive rule is needed to avoid possibly generating |
| * negative positions, which wouldn't fit into the WordEntryPos arrays.) |
| * |
| * If the TSExecuteCallback function reports that an operand is present |
| * but fails to provide position(s) for it, we will return TS_MAYBE when |
| * it is possible but not certain that the query is matched. |
| * |
| * When the function returns TS_NO or TS_MAYBE, it must return npos = 0, |
| * negate = false (which is the state initialized by the caller); but the |
| * "width" output in such cases is undefined. |
| */ |
| static TSTernaryValue |
| TS_phrase_execute(QueryItem *curitem, void *arg, uint32 flags, |
| TSExecuteCallback chkcond, |
| ExecPhraseData *data) |
| { |
| ExecPhraseData Ldata, |
| Rdata; |
| TSTernaryValue lmatch, |
| rmatch; |
| int Loffset, |
| Roffset, |
| maxwidth; |
| |
| /* since this function recurses, it could be driven to stack overflow */ |
| check_stack_depth(); |
| |
| /* ... and let's check for query cancel while we're at it */ |
| CHECK_FOR_INTERRUPTS(); |
| |
| if (curitem->type == QI_VAL) |
| return chkcond(arg, (QueryOperand *) curitem, data); |
| |
| switch (curitem->qoperator.oper) |
| { |
| case OP_NOT: |
| |
| /* |
| * We need not touch data->width, since a NOT operation does not |
| * change the match width. |
| */ |
| if (flags & TS_EXEC_SKIP_NOT) |
| { |
| /* with SKIP_NOT, report NOT as "match everywhere" */ |
| Assert(data->npos == 0 && !data->negate); |
| data->negate = true; |
| return TS_YES; |
| } |
| switch (TS_phrase_execute(curitem + 1, arg, flags, chkcond, data)) |
| { |
| case TS_NO: |
| /* change "match nowhere" to "match everywhere" */ |
| Assert(data->npos == 0 && !data->negate); |
| data->negate = true; |
| return TS_YES; |
| case TS_YES: |
| if (data->npos > 0) |
| { |
| /* we have some positions, invert negate flag */ |
| data->negate = !data->negate; |
| return TS_YES; |
| } |
| else if (data->negate) |
| { |
| /* change "match everywhere" to "match nowhere" */ |
| data->negate = false; |
| return TS_NO; |
| } |
| /* Should not get here if result was TS_YES */ |
| Assert(false); |
| break; |
| case TS_MAYBE: |
| /* match positions are, and remain, uncertain */ |
| return TS_MAYBE; |
| } |
| break; |
| |
| case OP_PHRASE: |
| case OP_AND: |
| memset(&Ldata, 0, sizeof(Ldata)); |
| memset(&Rdata, 0, sizeof(Rdata)); |
| |
| lmatch = TS_phrase_execute(curitem + curitem->qoperator.left, |
| arg, flags, chkcond, &Ldata); |
| if (lmatch == TS_NO) |
| return TS_NO; |
| |
| rmatch = TS_phrase_execute(curitem + 1, |
| arg, flags, chkcond, &Rdata); |
| if (rmatch == TS_NO) |
| return TS_NO; |
| |
| /* |
| * If either operand has no position information, then we can't |
| * return reliable position data, only a MAYBE result. |
| */ |
| if (lmatch == TS_MAYBE || rmatch == TS_MAYBE) |
| return TS_MAYBE; |
| |
| if (curitem->qoperator.oper == OP_PHRASE) |
| { |
| /* |
| * Compute Loffset and Roffset suitable for phrase match, and |
| * compute overall width of whole phrase match. |
| */ |
| Loffset = curitem->qoperator.distance + Rdata.width; |
| Roffset = 0; |
| if (data) |
| data->width = curitem->qoperator.distance + |
| Ldata.width + Rdata.width; |
| } |
| else |
| { |
| /* |
| * For OP_AND, set output width and alignment like OP_OR (see |
| * comment below) |
| */ |
| maxwidth = Max(Ldata.width, Rdata.width); |
| Loffset = maxwidth - Ldata.width; |
| Roffset = maxwidth - Rdata.width; |
| if (data) |
| data->width = maxwidth; |
| } |
| |
| if (Ldata.negate && Rdata.negate) |
| { |
| /* !L & !R: treat as !(L | R) */ |
| (void) TS_phrase_output(data, &Ldata, &Rdata, |
| TSPO_BOTH | TSPO_L_ONLY | TSPO_R_ONLY, |
| Loffset, Roffset, |
| Ldata.npos + Rdata.npos); |
| if (data) |
| data->negate = true; |
| return TS_YES; |
| } |
| else if (Ldata.negate) |
| { |
| /* !L & R */ |
| return TS_phrase_output(data, &Ldata, &Rdata, |
| TSPO_R_ONLY, |
| Loffset, Roffset, |
| Rdata.npos); |
| } |
| else if (Rdata.negate) |
| { |
| /* L & !R */ |
| return TS_phrase_output(data, &Ldata, &Rdata, |
| TSPO_L_ONLY, |
| Loffset, Roffset, |
| Ldata.npos); |
| } |
| else |
| { |
| /* straight AND */ |
| return TS_phrase_output(data, &Ldata, &Rdata, |
| TSPO_BOTH, |
| Loffset, Roffset, |
| Min(Ldata.npos, Rdata.npos)); |
| } |
| |
| case OP_OR: |
| memset(&Ldata, 0, sizeof(Ldata)); |
| memset(&Rdata, 0, sizeof(Rdata)); |
| |
| lmatch = TS_phrase_execute(curitem + curitem->qoperator.left, |
| arg, flags, chkcond, &Ldata); |
| rmatch = TS_phrase_execute(curitem + 1, |
| arg, flags, chkcond, &Rdata); |
| |
| if (lmatch == TS_NO && rmatch == TS_NO) |
| return TS_NO; |
| |
| /* |
| * If either operand has no position information, then we can't |
| * return reliable position data, only a MAYBE result. |
| */ |
| if (lmatch == TS_MAYBE || rmatch == TS_MAYBE) |
| return TS_MAYBE; |
| |
| /* |
| * Cope with undefined output width from failed submatch. (This |
| * takes less code than trying to ensure that all failure returns |
| * set data->width to zero.) |
| */ |
| if (lmatch == TS_NO) |
| Ldata.width = 0; |
| if (rmatch == TS_NO) |
| Rdata.width = 0; |
| |
| /* |
| * For OP_AND and OP_OR, report the width of the wider of the two |
| * inputs, and align the narrower input's positions to the right |
| * end of that width. This rule deals at least somewhat |
| * reasonably with cases like "x <-> (y | z <-> q)". |
| */ |
| maxwidth = Max(Ldata.width, Rdata.width); |
| Loffset = maxwidth - Ldata.width; |
| Roffset = maxwidth - Rdata.width; |
| data->width = maxwidth; |
| |
| if (Ldata.negate && Rdata.negate) |
| { |
| /* !L | !R: treat as !(L & R) */ |
| (void) TS_phrase_output(data, &Ldata, &Rdata, |
| TSPO_BOTH, |
| Loffset, Roffset, |
| Min(Ldata.npos, Rdata.npos)); |
| data->negate = true; |
| return TS_YES; |
| } |
| else if (Ldata.negate) |
| { |
| /* !L | R: treat as !(L & !R) */ |
| (void) TS_phrase_output(data, &Ldata, &Rdata, |
| TSPO_L_ONLY, |
| Loffset, Roffset, |
| Ldata.npos); |
| data->negate = true; |
| return TS_YES; |
| } |
| else if (Rdata.negate) |
| { |
| /* L | !R: treat as !(!L & R) */ |
| (void) TS_phrase_output(data, &Ldata, &Rdata, |
| TSPO_R_ONLY, |
| Loffset, Roffset, |
| Rdata.npos); |
| data->negate = true; |
| return TS_YES; |
| } |
| else |
| { |
| /* straight OR */ |
| return TS_phrase_output(data, &Ldata, &Rdata, |
| TSPO_BOTH | TSPO_L_ONLY | TSPO_R_ONLY, |
| Loffset, Roffset, |
| Ldata.npos + Rdata.npos); |
| } |
| |
| default: |
| elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper); |
| } |
| |
| /* not reachable, but keep compiler quiet */ |
| return TS_NO; |
| } |
| |
| |
| /* |
| * Evaluate tsquery boolean expression. |
| * |
| * curitem: current tsquery item (initially, the first one) |
| * arg: opaque value to pass through to callback function |
| * flags: bitmask of flag bits shown in ts_utils.h |
| * chkcond: callback function to check whether a primitive value is present |
| */ |
| bool |
| TS_execute(QueryItem *curitem, void *arg, uint32 flags, |
| TSExecuteCallback chkcond) |
| { |
| /* |
| * If we get TS_MAYBE from the recursion, return true. We could only see |
| * that result if the caller passed TS_EXEC_PHRASE_NO_POS, so there's no |
| * need to check again. |
| */ |
| return TS_execute_recurse(curitem, arg, flags, chkcond) != TS_NO; |
| } |
| |
| /* |
| * Evaluate tsquery boolean expression. |
| * |
| * This is the same as TS_execute except that TS_MAYBE is returned as-is. |
| */ |
| TSTernaryValue |
| TS_execute_ternary(QueryItem *curitem, void *arg, uint32 flags, |
| TSExecuteCallback chkcond) |
| { |
| return TS_execute_recurse(curitem, arg, flags, chkcond); |
| } |
| |
| /* |
| * TS_execute recursion for operators above any phrase operator. Here we do |
| * not need to worry about lexeme positions. As soon as we hit an OP_PHRASE |
| * operator, we pass it off to TS_phrase_execute which does worry. |
| */ |
| static TSTernaryValue |
| TS_execute_recurse(QueryItem *curitem, void *arg, uint32 flags, |
| TSExecuteCallback chkcond) |
| { |
| TSTernaryValue lmatch; |
| |
| /* since this function recurses, it could be driven to stack overflow */ |
| check_stack_depth(); |
| |
| /* ... and let's check for query cancel while we're at it */ |
| CHECK_FOR_INTERRUPTS(); |
| |
| if (curitem->type == QI_VAL) |
| return chkcond(arg, (QueryOperand *) curitem, |
| NULL /* don't need position info */ ); |
| |
| switch (curitem->qoperator.oper) |
| { |
| case OP_NOT: |
| if (flags & TS_EXEC_SKIP_NOT) |
| return TS_YES; |
| switch (TS_execute_recurse(curitem + 1, arg, flags, chkcond)) |
| { |
| case TS_NO: |
| return TS_YES; |
| case TS_YES: |
| return TS_NO; |
| case TS_MAYBE: |
| return TS_MAYBE; |
| } |
| break; |
| |
| case OP_AND: |
| lmatch = TS_execute_recurse(curitem + curitem->qoperator.left, arg, |
| flags, chkcond); |
| if (lmatch == TS_NO) |
| return TS_NO; |
| switch (TS_execute_recurse(curitem + 1, arg, flags, chkcond)) |
| { |
| case TS_NO: |
| return TS_NO; |
| case TS_YES: |
| return lmatch; |
| case TS_MAYBE: |
| return TS_MAYBE; |
| } |
| break; |
| |
| case OP_OR: |
| lmatch = TS_execute_recurse(curitem + curitem->qoperator.left, arg, |
| flags, chkcond); |
| if (lmatch == TS_YES) |
| return TS_YES; |
| switch (TS_execute_recurse(curitem + 1, arg, flags, chkcond)) |
| { |
| case TS_NO: |
| return lmatch; |
| case TS_YES: |
| return TS_YES; |
| case TS_MAYBE: |
| return TS_MAYBE; |
| } |
| break; |
| |
| case OP_PHRASE: |
| |
| /* |
| * If we get a MAYBE result, and the caller doesn't want that, |
| * convert it to NO. It would be more consistent, perhaps, to |
| * return the result of TS_phrase_execute() verbatim and then |
| * convert MAYBE results at the top of the recursion. But |
| * converting at the topmost phrase operator gives results that |
| * are bug-compatible with the old implementation, so do it like |
| * this for now. |
| */ |
| switch (TS_phrase_execute(curitem, arg, flags, chkcond, NULL)) |
| { |
| case TS_NO: |
| return TS_NO; |
| case TS_YES: |
| return TS_YES; |
| case TS_MAYBE: |
| return (flags & TS_EXEC_PHRASE_NO_POS) ? TS_MAYBE : TS_NO; |
| } |
| break; |
| |
| default: |
| elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper); |
| } |
| |
| /* not reachable, but keep compiler quiet */ |
| return TS_NO; |
| } |
| |
| /* |
| * Evaluate tsquery and report locations of matching terms. |
| * |
| * This is like TS_execute except that it returns match locations not just |
| * success/failure status. The callback function is required to provide |
| * position data (we report failure if it doesn't). |
| * |
| * On successful match, the result is a List of ExecPhraseData structs, one |
| * for each AND'ed term or phrase operator in the query. Each struct includes |
| * a sorted array of lexeme positions matching that term. (Recall that for |
| * phrase operators, the match includes width+1 lexemes, and the recorded |
| * position is that of the rightmost lexeme.) |
| * |
| * OR subexpressions are handled by union'ing their match locations into a |
| * single List element, which is valid since any of those locations contains |
| * a match. However, when some of the OR'ed terms are phrase operators, we |
| * report the maximum width of any of the OR'ed terms, making such cases |
| * slightly imprecise in the conservative direction. (For example, if the |
| * tsquery is "(A <-> B) | C", an occurrence of C in the data would be |
| * reported as though it includes the lexeme to the left of C.) |
| * |
| * Locations of NOT subexpressions are not reported. (Obviously, there can |
| * be no successful NOT matches at top level, or the match would have failed. |
| * So this amounts to ignoring NOTs underneath ORs.) |
| * |
| * The result is NIL if no match, or if position data was not returned. |
| * |
| * Arguments are the same as for TS_execute, although flags is currently |
| * vestigial since none of the defined bits are sensible here. |
| */ |
| List * |
| TS_execute_locations(QueryItem *curitem, void *arg, |
| uint32 flags, |
| TSExecuteCallback chkcond) |
| { |
| List *result; |
| |
| /* No flags supported, as yet */ |
| Assert(flags == TS_EXEC_EMPTY); |
| if (TS_execute_locations_recurse(curitem, arg, chkcond, &result)) |
| return result; |
| return NIL; |
| } |
| |
| /* |
| * TS_execute_locations recursion for operators above any phrase operator. |
| * OP_PHRASE subexpressions can be passed off to TS_phrase_execute. |
| */ |
| static bool |
| TS_execute_locations_recurse(QueryItem *curitem, void *arg, |
| TSExecuteCallback chkcond, |
| List **locations) |
| { |
| bool lmatch, |
| rmatch; |
| List *llocations, |
| *rlocations; |
| ExecPhraseData *data; |
| |
| /* since this function recurses, it could be driven to stack overflow */ |
| check_stack_depth(); |
| |
| /* ... and let's check for query cancel while we're at it */ |
| CHECK_FOR_INTERRUPTS(); |
| |
| /* Default locations result is empty */ |
| *locations = NIL; |
| |
| if (curitem->type == QI_VAL) |
| { |
| data = palloc0_object(ExecPhraseData); |
| if (chkcond(arg, (QueryOperand *) curitem, data) == TS_YES) |
| { |
| *locations = list_make1(data); |
| return true; |
| } |
| pfree(data); |
| return false; |
| } |
| |
| switch (curitem->qoperator.oper) |
| { |
| case OP_NOT: |
| if (!TS_execute_locations_recurse(curitem + 1, arg, chkcond, |
| &llocations)) |
| return true; /* we don't pass back any locations */ |
| return false; |
| |
| case OP_AND: |
| if (!TS_execute_locations_recurse(curitem + curitem->qoperator.left, |
| arg, chkcond, |
| &llocations)) |
| return false; |
| if (!TS_execute_locations_recurse(curitem + 1, |
| arg, chkcond, |
| &rlocations)) |
| return false; |
| *locations = list_concat(llocations, rlocations); |
| return true; |
| |
| case OP_OR: |
| lmatch = TS_execute_locations_recurse(curitem + curitem->qoperator.left, |
| arg, chkcond, |
| &llocations); |
| rmatch = TS_execute_locations_recurse(curitem + 1, |
| arg, chkcond, |
| &rlocations); |
| if (lmatch || rmatch) |
| { |
| /* |
| * We generate an AND'able location struct from each |
| * combination of sub-matches, following the disjunctive law |
| * (A & B) | (C & D) = (A | C) & (A | D) & (B | C) & (B | D). |
| * |
| * However, if either input didn't produce locations (i.e., it |
| * failed or was a NOT), we must just return the other list. |
| */ |
| if (llocations == NIL) |
| *locations = rlocations; |
| else if (rlocations == NIL) |
| *locations = llocations; |
| else |
| { |
| ListCell *ll; |
| |
| foreach(ll, llocations) |
| { |
| ExecPhraseData *ldata = (ExecPhraseData *) lfirst(ll); |
| ListCell *lr; |
| |
| foreach(lr, rlocations) |
| { |
| ExecPhraseData *rdata = (ExecPhraseData *) lfirst(lr); |
| |
| data = palloc0_object(ExecPhraseData); |
| (void) TS_phrase_output(data, ldata, rdata, |
| TSPO_BOTH | TSPO_L_ONLY | TSPO_R_ONLY, |
| 0, 0, |
| ldata->npos + rdata->npos); |
| /* Report the larger width, as explained above. */ |
| data->width = Max(ldata->width, rdata->width); |
| *locations = lappend(*locations, data); |
| } |
| } |
| } |
| |
| return true; |
| } |
| return false; |
| |
| case OP_PHRASE: |
| /* We can hand this off to TS_phrase_execute */ |
| data = palloc0_object(ExecPhraseData); |
| if (TS_phrase_execute(curitem, arg, TS_EXEC_EMPTY, chkcond, |
| data) == TS_YES) |
| { |
| if (!data->negate) |
| *locations = list_make1(data); |
| return true; |
| } |
| pfree(data); |
| return false; |
| |
| default: |
| elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper); |
| } |
| |
| /* not reachable, but keep compiler quiet */ |
| return false; |
| } |
| |
| /* |
| * Detect whether a tsquery boolean expression requires any positive matches |
| * to values shown in the tsquery. |
| * |
| * This is needed to know whether a GIN index search requires full index scan. |
| * For example, 'x & !y' requires a match of x, so it's sufficient to scan |
| * entries for x; but 'x | !y' could match rows containing neither x nor y. |
| */ |
| bool |
| tsquery_requires_match(QueryItem *curitem) |
| { |
| /* since this function recurses, it could be driven to stack overflow */ |
| check_stack_depth(); |
| |
| if (curitem->type == QI_VAL) |
| return true; |
| |
| switch (curitem->qoperator.oper) |
| { |
| case OP_NOT: |
| |
| /* |
| * Assume there are no required matches underneath a NOT. For |
| * some cases with nested NOTs, we could prove there's a required |
| * match, but it seems unlikely to be worth the trouble. |
| */ |
| return false; |
| |
| case OP_PHRASE: |
| |
| /* |
| * Treat OP_PHRASE as OP_AND here |
| */ |
| case OP_AND: |
| /* If either side requires a match, we're good */ |
| if (tsquery_requires_match(curitem + curitem->qoperator.left)) |
| return true; |
| else |
| return tsquery_requires_match(curitem + 1); |
| |
| case OP_OR: |
| /* Both sides must require a match */ |
| if (tsquery_requires_match(curitem + curitem->qoperator.left)) |
| return tsquery_requires_match(curitem + 1); |
| else |
| return false; |
| |
| default: |
| elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper); |
| } |
| |
| /* not reachable, but keep compiler quiet */ |
| return false; |
| } |
| |
| /* |
| * boolean operations |
| */ |
| Datum |
| ts_match_qv(PG_FUNCTION_ARGS) |
| { |
| PG_RETURN_DATUM(DirectFunctionCall2(ts_match_vq, |
| PG_GETARG_DATUM(1), |
| PG_GETARG_DATUM(0))); |
| } |
| |
| Datum |
| ts_match_vq(PG_FUNCTION_ARGS) |
| { |
| TSVector val = PG_GETARG_TSVECTOR(0); |
| TSQuery query = PG_GETARG_TSQUERY(1); |
| CHKVAL chkval; |
| bool result; |
| |
| /* empty query matches nothing */ |
| if (!query->size) |
| { |
| PG_FREE_IF_COPY(val, 0); |
| PG_FREE_IF_COPY(query, 1); |
| PG_RETURN_BOOL(false); |
| } |
| |
| chkval.arrb = ARRPTR(val); |
| chkval.arre = chkval.arrb + val->size; |
| chkval.values = STRPTR(val); |
| chkval.operand = GETOPERAND(query); |
| result = TS_execute(GETQUERY(query), |
| &chkval, |
| TS_EXEC_EMPTY, |
| checkcondition_str); |
| |
| PG_FREE_IF_COPY(val, 0); |
| PG_FREE_IF_COPY(query, 1); |
| PG_RETURN_BOOL(result); |
| } |
| |
| Datum |
| ts_match_tt(PG_FUNCTION_ARGS) |
| { |
| TSVector vector; |
| TSQuery query; |
| bool res; |
| |
| vector = DatumGetTSVector(DirectFunctionCall1(to_tsvector, |
| PG_GETARG_DATUM(0))); |
| query = DatumGetTSQuery(DirectFunctionCall1(plainto_tsquery, |
| PG_GETARG_DATUM(1))); |
| |
| res = DatumGetBool(DirectFunctionCall2(ts_match_vq, |
| TSVectorGetDatum(vector), |
| TSQueryGetDatum(query))); |
| |
| pfree(vector); |
| pfree(query); |
| |
| PG_RETURN_BOOL(res); |
| } |
| |
| Datum |
| ts_match_tq(PG_FUNCTION_ARGS) |
| { |
| TSVector vector; |
| TSQuery query = PG_GETARG_TSQUERY(1); |
| bool res; |
| |
| vector = DatumGetTSVector(DirectFunctionCall1(to_tsvector, |
| PG_GETARG_DATUM(0))); |
| |
| res = DatumGetBool(DirectFunctionCall2(ts_match_vq, |
| TSVectorGetDatum(vector), |
| TSQueryGetDatum(query))); |
| |
| pfree(vector); |
| PG_FREE_IF_COPY(query, 1); |
| |
| PG_RETURN_BOOL(res); |
| } |
| |
| /* |
| * ts_stat statistic function support |
| */ |
| |
| |
| /* |
| * Returns the number of positions in value 'wptr' within tsvector 'txt', |
| * that have a weight equal to one of the weights in 'weight' bitmask. |
| */ |
| static int |
| check_weight(TSVector txt, WordEntry *wptr, int8 weight) |
| { |
| int len = POSDATALEN(txt, wptr); |
| int num = 0; |
| WordEntryPos *ptr = POSDATAPTR(txt, wptr); |
| |
| while (len--) |
| { |
| if (weight & (1 << WEP_GETWEIGHT(*ptr))) |
| num++; |
| ptr++; |
| } |
| return num; |
| } |
| |
| #define compareStatWord(a,e,t) \ |
| tsCompareString((a)->lexeme, (a)->lenlexeme, \ |
| STRPTR(t) + (e)->pos, (e)->len, \ |
| false) |
| |
| static void |
| insertStatEntry(MemoryContext persistentContext, TSVectorStat *stat, TSVector txt, uint32 off) |
| { |
| WordEntry *we = ARRPTR(txt) + off; |
| StatEntry *node = stat->root, |
| *pnode = NULL; |
| int n, |
| res = 0; |
| uint32 depth = 1; |
| |
| if (stat->weight == 0) |
| n = (we->haspos) ? POSDATALEN(txt, we) : 1; |
| else |
| n = (we->haspos) ? check_weight(txt, we, stat->weight) : 0; |
| |
| if (n == 0) |
| return; /* nothing to insert */ |
| |
| while (node) |
| { |
| res = compareStatWord(node, we, txt); |
| |
| if (res == 0) |
| { |
| break; |
| } |
| else |
| { |
| pnode = node; |
| node = (res < 0) ? node->left : node->right; |
| } |
| depth++; |
| } |
| |
| if (depth > stat->maxdepth) |
| stat->maxdepth = depth; |
| |
| if (node == NULL) |
| { |
| node = MemoryContextAlloc(persistentContext, STATENTRYHDRSZ + we->len); |
| node->left = node->right = NULL; |
| node->ndoc = 1; |
| node->nentry = n; |
| node->lenlexeme = we->len; |
| memcpy(node->lexeme, STRPTR(txt) + we->pos, node->lenlexeme); |
| |
| if (pnode == NULL) |
| { |
| stat->root = node; |
| } |
| else |
| { |
| if (res < 0) |
| pnode->left = node; |
| else |
| pnode->right = node; |
| } |
| } |
| else |
| { |
| node->ndoc++; |
| node->nentry += n; |
| } |
| } |
| |
| static void |
| chooseNextStatEntry(MemoryContext persistentContext, TSVectorStat *stat, TSVector txt, |
| uint32 low, uint32 high, uint32 offset) |
| { |
| uint32 pos; |
| uint32 middle = (low + high) >> 1; |
| |
| pos = (low + middle) >> 1; |
| if (low != middle && pos >= offset && pos - offset < txt->size) |
| insertStatEntry(persistentContext, stat, txt, pos - offset); |
| pos = (high + middle + 1) >> 1; |
| if (middle + 1 != high && pos >= offset && pos - offset < txt->size) |
| insertStatEntry(persistentContext, stat, txt, pos - offset); |
| |
| if (low != middle) |
| chooseNextStatEntry(persistentContext, stat, txt, low, middle, offset); |
| if (high != middle + 1) |
| chooseNextStatEntry(persistentContext, stat, txt, middle + 1, high, offset); |
| } |
| |
| /* |
| * This is written like a custom aggregate function, because the |
| * original plan was to do just that. Unfortunately, an aggregate function |
| * can't return a set, so that plan was abandoned. If that limitation is |
| * lifted in the future, ts_stat could be a real aggregate function so that |
| * you could use it like this: |
| * |
| * SELECT ts_stat(vector_column) FROM vector_table; |
| * |
| * where vector_column is a tsvector-type column in vector_table. |
| */ |
| |
| static TSVectorStat * |
| ts_accum(MemoryContext persistentContext, TSVectorStat *stat, Datum data) |
| { |
| TSVector txt = DatumGetTSVector(data); |
| uint32 i, |
| nbit = 0, |
| offset; |
| |
| if (stat == NULL) |
| { /* Init in first */ |
| stat = MemoryContextAllocZero(persistentContext, sizeof(TSVectorStat)); |
| stat->maxdepth = 1; |
| } |
| |
| /* simple check of correctness */ |
| if (txt == NULL || txt->size == 0) |
| { |
| if (txt && txt != (TSVector) DatumGetPointer(data)) |
| pfree(txt); |
| return stat; |
| } |
| |
| i = txt->size - 1; |
| for (; i > 0; i >>= 1) |
| nbit++; |
| |
| nbit = 1 << nbit; |
| offset = (nbit - txt->size) / 2; |
| |
| insertStatEntry(persistentContext, stat, txt, (nbit >> 1) - offset); |
| chooseNextStatEntry(persistentContext, stat, txt, 0, nbit, offset); |
| |
| return stat; |
| } |
| |
| static void |
| ts_setup_firstcall(FunctionCallInfo fcinfo, FuncCallContext *funcctx, |
| TSVectorStat *stat) |
| { |
| TupleDesc tupdesc; |
| MemoryContext oldcontext; |
| StatEntry *node; |
| |
| funcctx->user_fctx = (void *) stat; |
| |
| oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); |
| |
| stat->stack = palloc0(sizeof(StatEntry *) * (stat->maxdepth + 1)); |
| stat->stackpos = 0; |
| |
| node = stat->root; |
| /* find leftmost value */ |
| if (node == NULL) |
| stat->stack[stat->stackpos] = NULL; |
| else |
| for (;;) |
| { |
| stat->stack[stat->stackpos] = node; |
| if (node->left) |
| { |
| stat->stackpos++; |
| node = node->left; |
| } |
| else |
| break; |
| } |
| Assert(stat->stackpos <= stat->maxdepth); |
| |
| if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) |
| elog(ERROR, "return type must be a row type"); |
| funcctx->tuple_desc = tupdesc; |
| funcctx->attinmeta = TupleDescGetAttInMetadata(tupdesc); |
| |
| MemoryContextSwitchTo(oldcontext); |
| } |
| |
| static StatEntry * |
| walkStatEntryTree(TSVectorStat *stat) |
| { |
| StatEntry *node = stat->stack[stat->stackpos]; |
| |
| if (node == NULL) |
| return NULL; |
| |
| if (node->ndoc != 0) |
| { |
| /* return entry itself: we already was at left sublink */ |
| return node; |
| } |
| else if (node->right && node->right != stat->stack[stat->stackpos + 1]) |
| { |
| /* go on right sublink */ |
| stat->stackpos++; |
| node = node->right; |
| |
| /* find most-left value */ |
| for (;;) |
| { |
| stat->stack[stat->stackpos] = node; |
| if (node->left) |
| { |
| stat->stackpos++; |
| node = node->left; |
| } |
| else |
| break; |
| } |
| Assert(stat->stackpos <= stat->maxdepth); |
| } |
| else |
| { |
| /* we already return all left subtree, itself and right subtree */ |
| if (stat->stackpos == 0) |
| return NULL; |
| |
| stat->stackpos--; |
| return walkStatEntryTree(stat); |
| } |
| |
| return node; |
| } |
| |
| static Datum |
| ts_process_call(FuncCallContext *funcctx) |
| { |
| TSVectorStat *st; |
| StatEntry *entry; |
| |
| st = (TSVectorStat *) funcctx->user_fctx; |
| |
| entry = walkStatEntryTree(st); |
| |
| if (entry != NULL) |
| { |
| Datum result; |
| char *values[3]; |
| char ndoc[16]; |
| char nentry[16]; |
| HeapTuple tuple; |
| |
| values[0] = palloc(entry->lenlexeme + 1); |
| memcpy(values[0], entry->lexeme, entry->lenlexeme); |
| (values[0])[entry->lenlexeme] = '\0'; |
| sprintf(ndoc, "%d", entry->ndoc); |
| values[1] = ndoc; |
| sprintf(nentry, "%d", entry->nentry); |
| values[2] = nentry; |
| |
| tuple = BuildTupleFromCStrings(funcctx->attinmeta, values); |
| result = HeapTupleGetDatum(tuple); |
| |
| pfree(values[0]); |
| |
| /* mark entry as already visited */ |
| entry->ndoc = 0; |
| |
| return result; |
| } |
| |
| return (Datum) 0; |
| } |
| |
| static TSVectorStat * |
| ts_stat_sql(MemoryContext persistentContext, text *txt, text *ws) |
| { |
| char *query = text_to_cstring(txt); |
| TSVectorStat *stat; |
| bool isnull; |
| Portal portal; |
| SPIPlanPtr plan; |
| |
| if ((plan = SPI_prepare(query, 0, NULL)) == NULL) |
| /* internal error */ |
| elog(ERROR, "SPI_prepare(\"%s\") failed", query); |
| |
| if ((portal = SPI_cursor_open(NULL, plan, NULL, NULL, true)) == NULL) |
| /* internal error */ |
| elog(ERROR, "SPI_cursor_open(\"%s\") failed", query); |
| |
| SPI_cursor_fetch(portal, true, 100); |
| |
| if (SPI_tuptable == NULL || |
| SPI_tuptable->tupdesc->natts != 1 || |
| !IsBinaryCoercible(SPI_gettypeid(SPI_tuptable->tupdesc, 1), |
| TSVECTOROID)) |
| ereport(ERROR, |
| (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
| errmsg("ts_stat query must return one tsvector column"))); |
| |
| stat = MemoryContextAllocZero(persistentContext, sizeof(TSVectorStat)); |
| stat->maxdepth = 1; |
| |
| if (ws) |
| { |
| char *buf; |
| |
| buf = VARDATA_ANY(ws); |
| while (buf - VARDATA_ANY(ws) < VARSIZE_ANY_EXHDR(ws)) |
| { |
| if (pg_mblen(buf) == 1) |
| { |
| switch (*buf) |
| { |
| case 'A': |
| case 'a': |
| stat->weight |= 1 << 3; |
| break; |
| case 'B': |
| case 'b': |
| stat->weight |= 1 << 2; |
| break; |
| case 'C': |
| case 'c': |
| stat->weight |= 1 << 1; |
| break; |
| case 'D': |
| case 'd': |
| stat->weight |= 1; |
| break; |
| default: |
| stat->weight |= 0; |
| } |
| } |
| buf += pg_mblen(buf); |
| } |
| } |
| |
| while (SPI_processed > 0) |
| { |
| uint64 i; |
| |
| for (i = 0; i < SPI_processed; i++) |
| { |
| Datum data = SPI_getbinval(SPI_tuptable->vals[i], SPI_tuptable->tupdesc, 1, &isnull); |
| |
| if (!isnull) |
| stat = ts_accum(persistentContext, stat, data); |
| } |
| |
| SPI_freetuptable(SPI_tuptable); |
| SPI_cursor_fetch(portal, true, 100); |
| } |
| |
| SPI_freetuptable(SPI_tuptable); |
| SPI_cursor_close(portal); |
| SPI_freeplan(plan); |
| pfree(query); |
| |
| return stat; |
| } |
| |
| Datum |
| ts_stat1(PG_FUNCTION_ARGS) |
| { |
| FuncCallContext *funcctx; |
| Datum result; |
| |
| if (SRF_IS_FIRSTCALL()) |
| { |
| TSVectorStat *stat; |
| text *txt = PG_GETARG_TEXT_PP(0); |
| |
| funcctx = SRF_FIRSTCALL_INIT(); |
| SPI_connect(); |
| stat = ts_stat_sql(funcctx->multi_call_memory_ctx, txt, NULL); |
| PG_FREE_IF_COPY(txt, 0); |
| ts_setup_firstcall(fcinfo, funcctx, stat); |
| SPI_finish(); |
| } |
| |
| funcctx = SRF_PERCALL_SETUP(); |
| if ((result = ts_process_call(funcctx)) != (Datum) 0) |
| SRF_RETURN_NEXT(funcctx, result); |
| SRF_RETURN_DONE(funcctx); |
| } |
| |
| Datum |
| ts_stat2(PG_FUNCTION_ARGS) |
| { |
| FuncCallContext *funcctx; |
| Datum result; |
| |
| if (SRF_IS_FIRSTCALL()) |
| { |
| TSVectorStat *stat; |
| text *txt = PG_GETARG_TEXT_PP(0); |
| text *ws = PG_GETARG_TEXT_PP(1); |
| |
| funcctx = SRF_FIRSTCALL_INIT(); |
| SPI_connect(); |
| stat = ts_stat_sql(funcctx->multi_call_memory_ctx, txt, ws); |
| PG_FREE_IF_COPY(txt, 0); |
| PG_FREE_IF_COPY(ws, 1); |
| ts_setup_firstcall(fcinfo, funcctx, stat); |
| SPI_finish(); |
| } |
| |
| funcctx = SRF_PERCALL_SETUP(); |
| if ((result = ts_process_call(funcctx)) != (Datum) 0) |
| SRF_RETURN_NEXT(funcctx, result); |
| SRF_RETURN_DONE(funcctx); |
| } |
| |
| |
| /* |
| * Triggers for automatic update of a tsvector column from text column(s) |
| * |
| * Trigger arguments are either |
| * name of tsvector col, name of tsconfig to use, name(s) of text col(s) |
| * name of tsvector col, name of regconfig col, name(s) of text col(s) |
| * ie, tsconfig can either be specified by name, or indirectly as the |
| * contents of a regconfig field in the row. If the name is used, it must |
| * be explicitly schema-qualified. |
| */ |
| Datum |
| tsvector_update_trigger_byid(PG_FUNCTION_ARGS) |
| { |
| return tsvector_update_trigger(fcinfo, false); |
| } |
| |
| Datum |
| tsvector_update_trigger_bycolumn(PG_FUNCTION_ARGS) |
| { |
| return tsvector_update_trigger(fcinfo, true); |
| } |
| |
| static Datum |
| tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column) |
| { |
| TriggerData *trigdata; |
| Trigger *trigger; |
| Relation rel; |
| HeapTuple rettuple = NULL; |
| int tsvector_attr_num, |
| i; |
| ParsedText prs; |
| Datum datum; |
| bool isnull; |
| text *txt; |
| Oid cfgId; |
| bool update_needed; |
| |
| /* Check call context */ |
| if (!CALLED_AS_TRIGGER(fcinfo)) /* internal error */ |
| elog(ERROR, "tsvector_update_trigger: not fired by trigger manager"); |
| |
| trigdata = (TriggerData *) fcinfo->context; |
| if (!TRIGGER_FIRED_FOR_ROW(trigdata->tg_event)) |
| elog(ERROR, "tsvector_update_trigger: must be fired for row"); |
| if (!TRIGGER_FIRED_BEFORE(trigdata->tg_event)) |
| elog(ERROR, "tsvector_update_trigger: must be fired BEFORE event"); |
| |
| if (TRIGGER_FIRED_BY_INSERT(trigdata->tg_event)) |
| { |
| rettuple = trigdata->tg_trigtuple; |
| update_needed = true; |
| } |
| else if (TRIGGER_FIRED_BY_UPDATE(trigdata->tg_event)) |
| { |
| rettuple = trigdata->tg_newtuple; |
| update_needed = false; /* computed below */ |
| } |
| else |
| elog(ERROR, "tsvector_update_trigger: must be fired for INSERT or UPDATE"); |
| |
| trigger = trigdata->tg_trigger; |
| rel = trigdata->tg_relation; |
| |
| if (trigger->tgnargs < 3) |
| elog(ERROR, "tsvector_update_trigger: arguments must be tsvector_field, ts_config, text_field1, ...)"); |
| |
| /* Find the target tsvector column */ |
| tsvector_attr_num = SPI_fnumber(rel->rd_att, trigger->tgargs[0]); |
| if (tsvector_attr_num == SPI_ERROR_NOATTRIBUTE) |
| ereport(ERROR, |
| (errcode(ERRCODE_UNDEFINED_COLUMN), |
| errmsg("tsvector column \"%s\" does not exist", |
| trigger->tgargs[0]))); |
| /* This will effectively reject system columns, so no separate test: */ |
| if (!IsBinaryCoercible(SPI_gettypeid(rel->rd_att, tsvector_attr_num), |
| TSVECTOROID)) |
| ereport(ERROR, |
| (errcode(ERRCODE_DATATYPE_MISMATCH), |
| errmsg("column \"%s\" is not of tsvector type", |
| trigger->tgargs[0]))); |
| |
| /* Find the configuration to use */ |
| if (config_column) |
| { |
| int config_attr_num; |
| |
| config_attr_num = SPI_fnumber(rel->rd_att, trigger->tgargs[1]); |
| if (config_attr_num == SPI_ERROR_NOATTRIBUTE) |
| ereport(ERROR, |
| (errcode(ERRCODE_UNDEFINED_COLUMN), |
| errmsg("configuration column \"%s\" does not exist", |
| trigger->tgargs[1]))); |
| if (!IsBinaryCoercible(SPI_gettypeid(rel->rd_att, config_attr_num), |
| REGCONFIGOID)) |
| ereport(ERROR, |
| (errcode(ERRCODE_DATATYPE_MISMATCH), |
| errmsg("column \"%s\" is not of regconfig type", |
| trigger->tgargs[1]))); |
| |
| datum = SPI_getbinval(rettuple, rel->rd_att, config_attr_num, &isnull); |
| if (isnull) |
| ereport(ERROR, |
| (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), |
| errmsg("configuration column \"%s\" must not be null", |
| trigger->tgargs[1]))); |
| cfgId = DatumGetObjectId(datum); |
| } |
| else |
| { |
| List *names; |
| |
| names = stringToQualifiedNameList(trigger->tgargs[1], NULL); |
| /* require a schema so that results are not search path dependent */ |
| if (list_length(names) < 2) |
| ereport(ERROR, |
| (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
| errmsg("text search configuration name \"%s\" must be schema-qualified", |
| trigger->tgargs[1]))); |
| cfgId = get_ts_config_oid(names, false); |
| } |
| |
| /* initialize parse state */ |
| prs.lenwords = 32; |
| prs.curwords = 0; |
| prs.pos = 0; |
| prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords); |
| |
| /* find all words in indexable column(s) */ |
| for (i = 2; i < trigger->tgnargs; i++) |
| { |
| int numattr; |
| |
| numattr = SPI_fnumber(rel->rd_att, trigger->tgargs[i]); |
| if (numattr == SPI_ERROR_NOATTRIBUTE) |
| ereport(ERROR, |
| (errcode(ERRCODE_UNDEFINED_COLUMN), |
| errmsg("column \"%s\" does not exist", |
| trigger->tgargs[i]))); |
| if (!IsBinaryCoercible(SPI_gettypeid(rel->rd_att, numattr), TEXTOID)) |
| ereport(ERROR, |
| (errcode(ERRCODE_DATATYPE_MISMATCH), |
| errmsg("column \"%s\" is not of a character type", |
| trigger->tgargs[i]))); |
| |
| if (bms_is_member(numattr - FirstLowInvalidHeapAttributeNumber, trigdata->tg_updatedcols)) |
| update_needed = true; |
| |
| datum = SPI_getbinval(rettuple, rel->rd_att, numattr, &isnull); |
| if (isnull) |
| continue; |
| |
| txt = DatumGetTextPP(datum); |
| |
| parsetext(cfgId, &prs, VARDATA_ANY(txt), VARSIZE_ANY_EXHDR(txt)); |
| |
| if (txt != (text *) DatumGetPointer(datum)) |
| pfree(txt); |
| } |
| |
| if (update_needed) |
| { |
| /* make tsvector value */ |
| datum = TSVectorGetDatum(make_tsvector(&prs)); |
| isnull = false; |
| |
| /* and insert it into tuple */ |
| rettuple = heap_modify_tuple_by_cols(rettuple, rel->rd_att, |
| 1, &tsvector_attr_num, |
| &datum, &isnull); |
| |
| pfree(DatumGetPointer(datum)); |
| } |
| |
| return PointerGetDatum(rettuple); |
| } |