| /*------------------------------------------------------------------------- |
| * |
| * tsrank.c |
| * rank tsvector by tsquery |
| * |
| * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group |
| * |
| * |
| * IDENTIFICATION |
| * src/backend/utils/adt/tsrank.c |
| * |
| *------------------------------------------------------------------------- |
| */ |
| #include "postgres.h" |
| |
| #include <limits.h> |
| #include <math.h> |
| |
| #include "miscadmin.h" |
| #include "tsearch/ts_utils.h" |
| #include "utils/array.h" |
| #include "utils/builtins.h" |
| |
| static const float weights[] = {0.1f, 0.2f, 0.4f, 1.0f}; |
| |
| #define wpos(wep) ( w[ WEP_GETWEIGHT(wep) ] ) |
| |
| #define RANK_NO_NORM 0x00 |
| #define RANK_NORM_LOGLENGTH 0x01 |
| #define RANK_NORM_LENGTH 0x02 |
| #define RANK_NORM_EXTDIST 0x04 |
| #define RANK_NORM_UNIQ 0x08 |
| #define RANK_NORM_LOGUNIQ 0x10 |
| #define RANK_NORM_RDIVRPLUS1 0x20 |
| #define DEF_NORM_METHOD RANK_NO_NORM |
| |
| static float calc_rank_or(const float *w, TSVector t, TSQuery q); |
| static float calc_rank_and(const float *w, TSVector t, TSQuery q); |
| |
| /* |
| * Returns a weight of a word collocation |
| */ |
| static float4 |
| word_distance(int32 w) |
| { |
| if (w > 100) |
| return 1e-30f; |
| |
| return 1.0 / (1.005 + 0.05 * exp(((float4) w) / 1.5 - 2)); |
| } |
| |
| static int |
| cnt_length(TSVector t) |
| { |
| WordEntry *ptr = ARRPTR(t), |
| *end = (WordEntry *) STRPTR(t); |
| int len = 0; |
| |
| while (ptr < end) |
| { |
| int clen = POSDATALEN(t, ptr); |
| |
| if (clen == 0) |
| len += 1; |
| else |
| len += clen; |
| |
| ptr++; |
| } |
| |
| return len; |
| } |
| |
| |
| #define WordECompareQueryItem(e,q,p,i,m) \ |
| tsCompareString((q) + (i)->distance, (i)->length, \ |
| (e) + (p)->pos, (p)->len, (m)) |
| |
| |
| /* |
| * Returns a pointer to a WordEntry's array corresponding to 'item' from |
| * tsvector 't'. 'q' is the TSQuery containing 'item'. |
| * Returns NULL if not found. |
| */ |
| static WordEntry * |
| find_wordentry(TSVector t, TSQuery q, QueryOperand *item, int32 *nitem) |
| { |
| WordEntry *StopLow = ARRPTR(t); |
| WordEntry *StopHigh = (WordEntry *) STRPTR(t); |
| WordEntry *StopMiddle = StopHigh; |
| int difference; |
| |
| *nitem = 0; |
| |
| /* Loop invariant: StopLow <= item < StopHigh */ |
| while (StopLow < StopHigh) |
| { |
| StopMiddle = StopLow + (StopHigh - StopLow) / 2; |
| difference = WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item, false); |
| if (difference == 0) |
| { |
| StopHigh = StopMiddle; |
| *nitem = 1; |
| break; |
| } |
| else if (difference > 0) |
| StopLow = StopMiddle + 1; |
| else |
| StopHigh = StopMiddle; |
| } |
| |
| if (item->prefix) |
| { |
| if (StopLow >= StopHigh) |
| StopMiddle = StopHigh; |
| |
| *nitem = 0; |
| |
| while (StopMiddle < (WordEntry *) STRPTR(t) && |
| WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item, true) == 0) |
| { |
| (*nitem)++; |
| StopMiddle++; |
| } |
| } |
| |
| return (*nitem > 0) ? StopHigh : NULL; |
| } |
| |
| |
| /* |
| * sort QueryOperands by (length, word) |
| */ |
| static int |
| compareQueryOperand(const void *a, const void *b, void *arg) |
| { |
| char *operand = (char *) arg; |
| QueryOperand *qa = (*(QueryOperand *const *) a); |
| QueryOperand *qb = (*(QueryOperand *const *) b); |
| |
| return tsCompareString(operand + qa->distance, qa->length, |
| operand + qb->distance, qb->length, |
| false); |
| } |
| |
| /* |
| * Returns a sorted, de-duplicated array of QueryOperands in a query. |
| * The returned QueryOperands are pointers to the original QueryOperands |
| * in the query. |
| * |
| * Length of the returned array is stored in *size |
| */ |
| static QueryOperand ** |
| SortAndUniqItems(TSQuery q, int *size) |
| { |
| char *operand = GETOPERAND(q); |
| QueryItem *item = GETQUERY(q); |
| QueryOperand **res, |
| **ptr, |
| **prevptr; |
| |
| ptr = res = (QueryOperand **) palloc(sizeof(QueryOperand *) * *size); |
| |
| /* Collect all operands from the tree to res */ |
| while ((*size)--) |
| { |
| if (item->type == QI_VAL) |
| { |
| *ptr = (QueryOperand *) item; |
| ptr++; |
| } |
| item++; |
| } |
| |
| *size = ptr - res; |
| if (*size < 2) |
| return res; |
| |
| qsort_arg(res, *size, sizeof(QueryOperand *), compareQueryOperand, operand); |
| |
| ptr = res + 1; |
| prevptr = res; |
| |
| /* remove duplicates */ |
| while (ptr - res < *size) |
| { |
| if (compareQueryOperand((void *) ptr, (void *) prevptr, (void *) operand) != 0) |
| { |
| prevptr++; |
| *prevptr = *ptr; |
| } |
| ptr++; |
| } |
| |
| *size = prevptr + 1 - res; |
| return res; |
| } |
| |
| static float |
| calc_rank_and(const float *w, TSVector t, TSQuery q) |
| { |
| WordEntryPosVector **pos; |
| WordEntryPosVector1 posnull; |
| WordEntryPosVector *POSNULL; |
| int i, |
| k, |
| l, |
| p; |
| WordEntry *entry, |
| *firstentry; |
| WordEntryPos *post, |
| *ct; |
| int32 dimt, |
| lenct, |
| dist, |
| nitem; |
| float res = -1.0; |
| QueryOperand **item; |
| int size = q->size; |
| |
| item = SortAndUniqItems(q, &size); |
| if (size < 2) |
| { |
| pfree(item); |
| return calc_rank_or(w, t, q); |
| } |
| pos = (WordEntryPosVector **) palloc0(sizeof(WordEntryPosVector *) * q->size); |
| |
| /* A dummy WordEntryPos array to use when haspos is false */ |
| posnull.npos = 1; |
| posnull.pos[0] = 0; |
| WEP_SETPOS(posnull.pos[0], MAXENTRYPOS - 1); |
| POSNULL = (WordEntryPosVector *) &posnull; |
| |
| for (i = 0; i < size; i++) |
| { |
| firstentry = entry = find_wordentry(t, q, item[i], &nitem); |
| if (!entry) |
| continue; |
| |
| while (entry - firstentry < nitem) |
| { |
| if (entry->haspos) |
| pos[i] = _POSVECPTR(t, entry); |
| else |
| pos[i] = POSNULL; |
| |
| dimt = pos[i]->npos; |
| post = pos[i]->pos; |
| for (k = 0; k < i; k++) |
| { |
| if (!pos[k]) |
| continue; |
| lenct = pos[k]->npos; |
| ct = pos[k]->pos; |
| for (l = 0; l < dimt; l++) |
| { |
| for (p = 0; p < lenct; p++) |
| { |
| dist = abs((int) WEP_GETPOS(post[l]) - (int) WEP_GETPOS(ct[p])); |
| if (dist || (dist == 0 && (pos[i] == POSNULL || pos[k] == POSNULL))) |
| { |
| float curw; |
| |
| if (!dist) |
| dist = MAXENTRYPOS; |
| curw = sqrt(wpos(post[l]) * wpos(ct[p]) * word_distance(dist)); |
| res = (res < 0) ? curw : 1.0 - (1.0 - res) * (1.0 - curw); |
| } |
| } |
| } |
| } |
| |
| entry++; |
| } |
| } |
| pfree(pos); |
| pfree(item); |
| return res; |
| } |
| |
| static float |
| calc_rank_or(const float *w, TSVector t, TSQuery q) |
| { |
| WordEntry *entry, |
| *firstentry; |
| WordEntryPosVector1 posnull; |
| WordEntryPos *post; |
| int32 dimt, |
| j, |
| i, |
| nitem; |
| float res = 0.0; |
| QueryOperand **item; |
| int size = q->size; |
| |
| /* A dummy WordEntryPos array to use when haspos is false */ |
| posnull.npos = 1; |
| posnull.pos[0] = 0; |
| |
| item = SortAndUniqItems(q, &size); |
| |
| for (i = 0; i < size; i++) |
| { |
| float resj, |
| wjm; |
| int32 jm; |
| |
| firstentry = entry = find_wordentry(t, q, item[i], &nitem); |
| if (!entry) |
| continue; |
| |
| while (entry - firstentry < nitem) |
| { |
| if (entry->haspos) |
| { |
| dimt = POSDATALEN(t, entry); |
| post = POSDATAPTR(t, entry); |
| } |
| else |
| { |
| dimt = posnull.npos; |
| post = posnull.pos; |
| } |
| |
| resj = 0.0; |
| wjm = -1.0; |
| jm = 0; |
| for (j = 0; j < dimt; j++) |
| { |
| resj = resj + wpos(post[j]) / ((j + 1) * (j + 1)); |
| if (wpos(post[j]) > wjm) |
| { |
| wjm = wpos(post[j]); |
| jm = j; |
| } |
| } |
| /* |
| limit (sum(1/i^2),i=1,inf) = pi^2/6 |
| resj = sum(wi/i^2),i=1,noccurrence, |
| wi - should be sorted desc, |
| don't sort for now, just choose maximum weight. This should be corrected |
| Oleg Bartunov |
| */ |
| res = res + (wjm + resj - wjm / ((jm + 1) * (jm + 1))) / 1.64493406685; |
| |
| entry++; |
| } |
| } |
| if (size > 0) |
| res = res / size; |
| pfree(item); |
| return res; |
| } |
| |
| static float |
| calc_rank(const float *w, TSVector t, TSQuery q, int32 method) |
| { |
| QueryItem *item = GETQUERY(q); |
| float res = 0.0; |
| int len; |
| |
| if (!t->size || !q->size) |
| return 0.0; |
| |
| /* XXX: What about NOT? */ |
| res = (item->type == QI_OPR && (item->qoperator.oper == OP_AND || |
| item->qoperator.oper == OP_PHRASE)) ? |
| calc_rank_and(w, t, q) : |
| calc_rank_or(w, t, q); |
| |
| if (res < 0) |
| res = 1e-20f; |
| |
| if ((method & RANK_NORM_LOGLENGTH) && t->size > 0) |
| res /= log((double) (cnt_length(t) + 1)) / log(2.0); |
| |
| if (method & RANK_NORM_LENGTH) |
| { |
| len = cnt_length(t); |
| if (len > 0) |
| res /= (float) len; |
| } |
| |
| /* RANK_NORM_EXTDIST not applicable */ |
| |
| if ((method & RANK_NORM_UNIQ) && t->size > 0) |
| res /= (float) (t->size); |
| |
| if ((method & RANK_NORM_LOGUNIQ) && t->size > 0) |
| res /= log((double) (t->size + 1)) / log(2.0); |
| |
| if (method & RANK_NORM_RDIVRPLUS1) |
| res /= (res + 1); |
| |
| return res; |
| } |
| |
| static const float * |
| getWeights(ArrayType *win) |
| { |
| static float ws[lengthof(weights)]; |
| int i; |
| float4 *arrdata; |
| |
| if (win == NULL) |
| return weights; |
| |
| if (ARR_NDIM(win) != 1) |
| ereport(ERROR, |
| (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), |
| errmsg("array of weight must be one-dimensional"))); |
| |
| if (ArrayGetNItems(ARR_NDIM(win), ARR_DIMS(win)) < lengthof(weights)) |
| ereport(ERROR, |
| (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), |
| errmsg("array of weight is too short"))); |
| |
| if (array_contains_nulls(win)) |
| ereport(ERROR, |
| (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), |
| errmsg("array of weight must not contain nulls"))); |
| |
| arrdata = (float4 *) ARR_DATA_PTR(win); |
| for (i = 0; i < lengthof(weights); i++) |
| { |
| ws[i] = (arrdata[i] >= 0) ? arrdata[i] : weights[i]; |
| if (ws[i] > 1.0) |
| ereport(ERROR, |
| (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
| errmsg("weight out of range"))); |
| } |
| |
| return ws; |
| } |
| |
| Datum |
| ts_rank_wttf(PG_FUNCTION_ARGS) |
| { |
| ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0)); |
| TSVector txt = PG_GETARG_TSVECTOR(1); |
| TSQuery query = PG_GETARG_TSQUERY(2); |
| int method = PG_GETARG_INT32(3); |
| float res; |
| |
| res = calc_rank(getWeights(win), txt, query, method); |
| |
| PG_FREE_IF_COPY(win, 0); |
| PG_FREE_IF_COPY(txt, 1); |
| PG_FREE_IF_COPY(query, 2); |
| PG_RETURN_FLOAT4(res); |
| } |
| |
| Datum |
| ts_rank_wtt(PG_FUNCTION_ARGS) |
| { |
| ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0)); |
| TSVector txt = PG_GETARG_TSVECTOR(1); |
| TSQuery query = PG_GETARG_TSQUERY(2); |
| float res; |
| |
| res = calc_rank(getWeights(win), txt, query, DEF_NORM_METHOD); |
| |
| PG_FREE_IF_COPY(win, 0); |
| PG_FREE_IF_COPY(txt, 1); |
| PG_FREE_IF_COPY(query, 2); |
| PG_RETURN_FLOAT4(res); |
| } |
| |
| Datum |
| ts_rank_ttf(PG_FUNCTION_ARGS) |
| { |
| TSVector txt = PG_GETARG_TSVECTOR(0); |
| TSQuery query = PG_GETARG_TSQUERY(1); |
| int method = PG_GETARG_INT32(2); |
| float res; |
| |
| res = calc_rank(getWeights(NULL), txt, query, method); |
| |
| PG_FREE_IF_COPY(txt, 0); |
| PG_FREE_IF_COPY(query, 1); |
| PG_RETURN_FLOAT4(res); |
| } |
| |
| Datum |
| ts_rank_tt(PG_FUNCTION_ARGS) |
| { |
| TSVector txt = PG_GETARG_TSVECTOR(0); |
| TSQuery query = PG_GETARG_TSQUERY(1); |
| float res; |
| |
| res = calc_rank(getWeights(NULL), txt, query, DEF_NORM_METHOD); |
| |
| PG_FREE_IF_COPY(txt, 0); |
| PG_FREE_IF_COPY(query, 1); |
| PG_RETURN_FLOAT4(res); |
| } |
| |
| typedef struct |
| { |
| union |
| { |
| struct |
| { /* compiled doc representation */ |
| QueryItem **items; |
| int16 nitem; |
| } query; |
| struct |
| { /* struct is used for preparing doc |
| * representation */ |
| QueryItem *item; |
| WordEntry *entry; |
| } map; |
| } data; |
| WordEntryPos pos; |
| } DocRepresentation; |
| |
| static int |
| compareDocR(const void *va, const void *vb) |
| { |
| const DocRepresentation *a = (const DocRepresentation *) va; |
| const DocRepresentation *b = (const DocRepresentation *) vb; |
| |
| if (WEP_GETPOS(a->pos) == WEP_GETPOS(b->pos)) |
| { |
| if (WEP_GETWEIGHT(a->pos) == WEP_GETWEIGHT(b->pos)) |
| { |
| if (a->data.map.entry == b->data.map.entry) |
| return 0; |
| |
| return (a->data.map.entry > b->data.map.entry) ? 1 : -1; |
| } |
| |
| return (WEP_GETWEIGHT(a->pos) > WEP_GETWEIGHT(b->pos)) ? 1 : -1; |
| } |
| |
| return (WEP_GETPOS(a->pos) > WEP_GETPOS(b->pos)) ? 1 : -1; |
| } |
| |
| #define MAXQROPOS MAXENTRYPOS |
| typedef struct |
| { |
| bool operandexists; |
| bool reverseinsert; /* indicates insert order, true means |
| * descending order */ |
| uint32 npos; |
| WordEntryPos pos[MAXQROPOS]; |
| } QueryRepresentationOperand; |
| |
| typedef struct |
| { |
| TSQuery query; |
| QueryRepresentationOperand *operandData; |
| } QueryRepresentation; |
| |
| #define QR_GET_OPERAND_DATA(q, v) \ |
| ( (q)->operandData + (((QueryItem*)(v)) - GETQUERY((q)->query)) ) |
| |
| /* |
| * TS_execute callback for matching a tsquery operand to QueryRepresentation |
| */ |
| static TSTernaryValue |
| checkcondition_QueryOperand(void *checkval, QueryOperand *val, |
| ExecPhraseData *data) |
| { |
| QueryRepresentation *qr = (QueryRepresentation *) checkval; |
| QueryRepresentationOperand *opData = QR_GET_OPERAND_DATA(qr, val); |
| |
| if (!opData->operandexists) |
| return TS_NO; |
| |
| if (data) |
| { |
| data->npos = opData->npos; |
| data->pos = opData->pos; |
| if (opData->reverseinsert) |
| data->pos += MAXQROPOS - opData->npos; |
| } |
| |
| return TS_YES; |
| } |
| |
| typedef struct |
| { |
| int pos; |
| int p; |
| int q; |
| DocRepresentation *begin; |
| DocRepresentation *end; |
| } CoverExt; |
| |
| static void |
| resetQueryRepresentation(QueryRepresentation *qr, bool reverseinsert) |
| { |
| int i; |
| |
| for (i = 0; i < qr->query->size; i++) |
| { |
| qr->operandData[i].operandexists = false; |
| qr->operandData[i].reverseinsert = reverseinsert; |
| qr->operandData[i].npos = 0; |
| } |
| } |
| |
| static void |
| fillQueryRepresentationData(QueryRepresentation *qr, DocRepresentation *entry) |
| { |
| int i; |
| int lastPos; |
| QueryRepresentationOperand *opData; |
| |
| for (i = 0; i < entry->data.query.nitem; i++) |
| { |
| if (entry->data.query.items[i]->type != QI_VAL) |
| continue; |
| |
| opData = QR_GET_OPERAND_DATA(qr, entry->data.query.items[i]); |
| |
| opData->operandexists = true; |
| |
| if (opData->npos == 0) |
| { |
| lastPos = (opData->reverseinsert) ? (MAXQROPOS - 1) : 0; |
| opData->pos[lastPos] = entry->pos; |
| opData->npos++; |
| continue; |
| } |
| |
| lastPos = opData->reverseinsert ? |
| (MAXQROPOS - opData->npos) : |
| (opData->npos - 1); |
| |
| if (WEP_GETPOS(opData->pos[lastPos]) != WEP_GETPOS(entry->pos)) |
| { |
| lastPos = opData->reverseinsert ? |
| (MAXQROPOS - 1 - opData->npos) : |
| (opData->npos); |
| |
| opData->pos[lastPos] = entry->pos; |
| opData->npos++; |
| } |
| } |
| } |
| |
| static bool |
| Cover(DocRepresentation *doc, int len, QueryRepresentation *qr, CoverExt *ext) |
| { |
| DocRepresentation *ptr; |
| int lastpos = ext->pos; |
| bool found = false; |
| |
| /* |
| * since this function recurses, it could be driven to stack overflow. |
| * (though any decent compiler will optimize away the tail-recursion. |
| */ |
| check_stack_depth(); |
| |
| resetQueryRepresentation(qr, false); |
| |
| ext->p = INT_MAX; |
| ext->q = 0; |
| ptr = doc + ext->pos; |
| |
| /* find upper bound of cover from current position, move up */ |
| while (ptr - doc < len) |
| { |
| fillQueryRepresentationData(qr, ptr); |
| |
| if (TS_execute(GETQUERY(qr->query), (void *) qr, |
| TS_EXEC_EMPTY, checkcondition_QueryOperand)) |
| { |
| if (WEP_GETPOS(ptr->pos) > ext->q) |
| { |
| ext->q = WEP_GETPOS(ptr->pos); |
| ext->end = ptr; |
| lastpos = ptr - doc; |
| found = true; |
| } |
| break; |
| } |
| ptr++; |
| } |
| |
| if (!found) |
| return false; |
| |
| resetQueryRepresentation(qr, true); |
| |
| ptr = doc + lastpos; |
| |
| /* find lower bound of cover from found upper bound, move down */ |
| while (ptr >= doc + ext->pos) |
| { |
| /* |
| * we scan doc from right to left, so pos info in reverse order! |
| */ |
| fillQueryRepresentationData(qr, ptr); |
| |
| if (TS_execute(GETQUERY(qr->query), (void *) qr, |
| TS_EXEC_EMPTY, checkcondition_QueryOperand)) |
| { |
| if (WEP_GETPOS(ptr->pos) < ext->p) |
| { |
| ext->begin = ptr; |
| ext->p = WEP_GETPOS(ptr->pos); |
| } |
| break; |
| } |
| ptr--; |
| } |
| |
| if (ext->p <= ext->q) |
| { |
| /* |
| * set position for next try to next lexeme after beginning of found |
| * cover |
| */ |
| ext->pos = (ptr - doc) + 1; |
| return true; |
| } |
| |
| ext->pos++; |
| return Cover(doc, len, qr, ext); |
| } |
| |
| static DocRepresentation * |
| get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen) |
| { |
| QueryItem *item = GETQUERY(qr->query); |
| WordEntry *entry, |
| *firstentry; |
| WordEntryPos *post; |
| int32 dimt, /* number of 'post' items */ |
| j, |
| i, |
| nitem; |
| int len = qr->query->size * 4, |
| cur = 0; |
| DocRepresentation *doc; |
| |
| doc = (DocRepresentation *) palloc(sizeof(DocRepresentation) * len); |
| |
| /* |
| * Iterate through query to make DocRepresentation for words and it's |
| * entries satisfied by query |
| */ |
| for (i = 0; i < qr->query->size; i++) |
| { |
| QueryOperand *curoperand; |
| |
| if (item[i].type != QI_VAL) |
| continue; |
| |
| curoperand = &item[i].qoperand; |
| |
| firstentry = entry = find_wordentry(txt, qr->query, curoperand, &nitem); |
| if (!entry) |
| continue; |
| |
| /* iterations over entries in tsvector */ |
| while (entry - firstentry < nitem) |
| { |
| if (entry->haspos) |
| { |
| dimt = POSDATALEN(txt, entry); |
| post = POSDATAPTR(txt, entry); |
| } |
| else |
| { |
| /* ignore words without positions */ |
| entry++; |
| continue; |
| } |
| |
| while (cur + dimt >= len) |
| { |
| len *= 2; |
| doc = (DocRepresentation *) repalloc(doc, sizeof(DocRepresentation) * len); |
| } |
| |
| /* iterations over entry's positions */ |
| for (j = 0; j < dimt; j++) |
| { |
| if (curoperand->weight == 0 || |
| curoperand->weight & (1 << WEP_GETWEIGHT(post[j]))) |
| { |
| doc[cur].pos = post[j]; |
| doc[cur].data.map.entry = entry; |
| doc[cur].data.map.item = (QueryItem *) curoperand; |
| cur++; |
| } |
| } |
| |
| entry++; |
| } |
| } |
| |
| if (cur > 0) |
| { |
| DocRepresentation *rptr = doc + 1, |
| *wptr = doc, |
| storage; |
| |
| /* |
| * Sort representation in ascending order by pos and entry |
| */ |
| qsort(doc, cur, sizeof(DocRepresentation), compareDocR); |
| |
| /* |
| * Join QueryItem per WordEntry and it's position |
| */ |
| storage.pos = doc->pos; |
| storage.data.query.items = palloc(sizeof(QueryItem *) * qr->query->size); |
| storage.data.query.items[0] = doc->data.map.item; |
| storage.data.query.nitem = 1; |
| |
| while (rptr - doc < cur) |
| { |
| if (rptr->pos == (rptr - 1)->pos && |
| rptr->data.map.entry == (rptr - 1)->data.map.entry) |
| { |
| storage.data.query.items[storage.data.query.nitem] = rptr->data.map.item; |
| storage.data.query.nitem++; |
| } |
| else |
| { |
| *wptr = storage; |
| wptr++; |
| storage.pos = rptr->pos; |
| storage.data.query.items = palloc(sizeof(QueryItem *) * qr->query->size); |
| storage.data.query.items[0] = rptr->data.map.item; |
| storage.data.query.nitem = 1; |
| } |
| |
| rptr++; |
| } |
| |
| *wptr = storage; |
| wptr++; |
| |
| *doclen = wptr - doc; |
| return doc; |
| } |
| |
| pfree(doc); |
| return NULL; |
| } |
| |
| static float4 |
| calc_rank_cd(const float4 *arrdata, TSVector txt, TSQuery query, int method) |
| { |
| DocRepresentation *doc; |
| int len, |
| i, |
| doclen = 0; |
| CoverExt ext; |
| double Wdoc = 0.0; |
| double invws[lengthof(weights)]; |
| double SumDist = 0.0, |
| PrevExtPos = 0.0; |
| int NExtent = 0; |
| QueryRepresentation qr; |
| |
| |
| for (i = 0; i < lengthof(weights); i++) |
| { |
| invws[i] = ((double) ((arrdata[i] >= 0) ? arrdata[i] : weights[i])); |
| if (invws[i] > 1.0) |
| ereport(ERROR, |
| (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
| errmsg("weight out of range"))); |
| invws[i] = 1.0 / invws[i]; |
| } |
| |
| qr.query = query; |
| qr.operandData = (QueryRepresentationOperand *) |
| palloc0(sizeof(QueryRepresentationOperand) * query->size); |
| |
| doc = get_docrep(txt, &qr, &doclen); |
| if (!doc) |
| { |
| pfree(qr.operandData); |
| return 0.0; |
| } |
| |
| MemSet(&ext, 0, sizeof(CoverExt)); |
| while (Cover(doc, doclen, &qr, &ext)) |
| { |
| double Cpos = 0.0; |
| double InvSum = 0.0; |
| double CurExtPos; |
| int nNoise; |
| DocRepresentation *ptr = ext.begin; |
| |
| while (ptr <= ext.end) |
| { |
| InvSum += invws[WEP_GETWEIGHT(ptr->pos)]; |
| ptr++; |
| } |
| |
| Cpos = ((double) (ext.end - ext.begin + 1)) / InvSum; |
| |
| /* |
| * if doc are big enough then ext.q may be equal to ext.p due to limit |
| * of positional information. In this case we approximate number of |
| * noise word as half cover's length |
| */ |
| nNoise = (ext.q - ext.p) - (ext.end - ext.begin); |
| if (nNoise < 0) |
| nNoise = (ext.end - ext.begin) / 2; |
| Wdoc += Cpos / ((double) (1 + nNoise)); |
| |
| CurExtPos = ((double) (ext.q + ext.p)) / 2.0; |
| if (NExtent > 0 && CurExtPos > PrevExtPos /* prevent division by |
| * zero in a case of |
| * multiple lexize */ ) |
| SumDist += 1.0 / (CurExtPos - PrevExtPos); |
| |
| PrevExtPos = CurExtPos; |
| NExtent++; |
| } |
| |
| if ((method & RANK_NORM_LOGLENGTH) && txt->size > 0) |
| Wdoc /= log((double) (cnt_length(txt) + 1)); |
| |
| if (method & RANK_NORM_LENGTH) |
| { |
| len = cnt_length(txt); |
| if (len > 0) |
| Wdoc /= (double) len; |
| } |
| |
| if ((method & RANK_NORM_EXTDIST) && NExtent > 0 && SumDist > 0) |
| Wdoc /= ((double) NExtent) / SumDist; |
| |
| if ((method & RANK_NORM_UNIQ) && txt->size > 0) |
| Wdoc /= (double) (txt->size); |
| |
| if ((method & RANK_NORM_LOGUNIQ) && txt->size > 0) |
| Wdoc /= log((double) (txt->size + 1)) / log(2.0); |
| |
| if (method & RANK_NORM_RDIVRPLUS1) |
| Wdoc /= (Wdoc + 1); |
| |
| pfree(doc); |
| |
| pfree(qr.operandData); |
| |
| return (float4) Wdoc; |
| } |
| |
| Datum |
| ts_rankcd_wttf(PG_FUNCTION_ARGS) |
| { |
| ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0)); |
| TSVector txt = PG_GETARG_TSVECTOR(1); |
| TSQuery query = PG_GETARG_TSQUERY(2); |
| int method = PG_GETARG_INT32(3); |
| float res; |
| |
| res = calc_rank_cd(getWeights(win), txt, query, method); |
| |
| PG_FREE_IF_COPY(win, 0); |
| PG_FREE_IF_COPY(txt, 1); |
| PG_FREE_IF_COPY(query, 2); |
| PG_RETURN_FLOAT4(res); |
| } |
| |
| Datum |
| ts_rankcd_wtt(PG_FUNCTION_ARGS) |
| { |
| ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0)); |
| TSVector txt = PG_GETARG_TSVECTOR(1); |
| TSQuery query = PG_GETARG_TSQUERY(2); |
| float res; |
| |
| res = calc_rank_cd(getWeights(win), txt, query, DEF_NORM_METHOD); |
| |
| PG_FREE_IF_COPY(win, 0); |
| PG_FREE_IF_COPY(txt, 1); |
| PG_FREE_IF_COPY(query, 2); |
| PG_RETURN_FLOAT4(res); |
| } |
| |
| Datum |
| ts_rankcd_ttf(PG_FUNCTION_ARGS) |
| { |
| TSVector txt = PG_GETARG_TSVECTOR(0); |
| TSQuery query = PG_GETARG_TSQUERY(1); |
| int method = PG_GETARG_INT32(2); |
| float res; |
| |
| res = calc_rank_cd(getWeights(NULL), txt, query, method); |
| |
| PG_FREE_IF_COPY(txt, 0); |
| PG_FREE_IF_COPY(query, 1); |
| PG_RETURN_FLOAT4(res); |
| } |
| |
| Datum |
| ts_rankcd_tt(PG_FUNCTION_ARGS) |
| { |
| TSVector txt = PG_GETARG_TSVECTOR(0); |
| TSQuery query = PG_GETARG_TSQUERY(1); |
| float res; |
| |
| res = calc_rank_cd(getWeights(NULL), txt, query, DEF_NORM_METHOD); |
| |
| PG_FREE_IF_COPY(txt, 0); |
| PG_FREE_IF_COPY(query, 1); |
| PG_RETURN_FLOAT4(res); |
| } |