| /*------------------------------------------------------------------------- |
| * |
| * tsvector.c |
| * I/O functions for tsvector |
| * |
| * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group |
| * |
| * |
| * IDENTIFICATION |
| * src/backend/utils/adt/tsvector.c |
| * |
| *------------------------------------------------------------------------- |
| */ |
| |
| #include "postgres.h" |
| |
| #include "libpq/pqformat.h" |
| #include "nodes/miscnodes.h" |
| #include "tsearch/ts_locale.h" |
| #include "tsearch/ts_utils.h" |
| #include "utils/builtins.h" |
| #include "utils/memutils.h" |
| #include "varatt.h" |
| |
| typedef struct |
| { |
| WordEntry entry; /* must be first, see compareentry */ |
| WordEntryPos *pos; |
| int poslen; /* number of elements in pos */ |
| } WordEntryIN; |
| |
| |
| /* Compare two WordEntryPos values for qsort */ |
| int |
| compareWordEntryPos(const void *a, const void *b) |
| { |
| int apos = WEP_GETPOS(*(const WordEntryPos *) a); |
| int bpos = WEP_GETPOS(*(const WordEntryPos *) b); |
| |
| if (apos == bpos) |
| return 0; |
| return (apos > bpos) ? 1 : -1; |
| } |
| |
| /* |
| * Removes duplicate pos entries. If there's two entries with same pos but |
| * different weight, the higher weight is retained, so we can't use |
| * qunique here. |
| * |
| * Returns new length. |
| */ |
| static int |
| uniquePos(WordEntryPos *a, int l) |
| { |
| WordEntryPos *ptr, |
| *res; |
| |
| if (l <= 1) |
| return l; |
| |
| qsort(a, l, sizeof(WordEntryPos), compareWordEntryPos); |
| |
| res = a; |
| ptr = a + 1; |
| while (ptr - a < l) |
| { |
| if (WEP_GETPOS(*ptr) != WEP_GETPOS(*res)) |
| { |
| res++; |
| *res = *ptr; |
| if (res - a >= MAXNUMPOS - 1 || |
| WEP_GETPOS(*res) == MAXENTRYPOS - 1) |
| break; |
| } |
| else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res)) |
| WEP_SETWEIGHT(*res, WEP_GETWEIGHT(*ptr)); |
| ptr++; |
| } |
| |
| return res + 1 - a; |
| } |
| |
| /* |
| * Compare two WordEntry structs for qsort_arg. This can also be used on |
| * WordEntryIN structs, since those have WordEntry as their first field. |
| */ |
| static int |
| compareentry(const void *va, const void *vb, void *arg) |
| { |
| const WordEntry *a = (const WordEntry *) va; |
| const WordEntry *b = (const WordEntry *) vb; |
| char *BufferStr = (char *) arg; |
| |
| return tsCompareString(&BufferStr[a->pos], a->len, |
| &BufferStr[b->pos], b->len, |
| false); |
| } |
| |
| /* |
| * Sort an array of WordEntryIN, remove duplicates. |
| * *outbuflen receives the amount of space needed for strings and positions. |
| */ |
| static int |
| uniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen) |
| { |
| int buflen; |
| WordEntryIN *ptr, |
| *res; |
| |
| Assert(l >= 1); |
| |
| if (l > 1) |
| qsort_arg(a, l, sizeof(WordEntryIN), compareentry, buf); |
| |
| buflen = 0; |
| res = a; |
| ptr = a + 1; |
| while (ptr - a < l) |
| { |
| if (!(ptr->entry.len == res->entry.len && |
| strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos], |
| res->entry.len) == 0)) |
| { |
| /* done accumulating data into *res, count space needed */ |
| buflen += res->entry.len; |
| if (res->entry.haspos) |
| { |
| res->poslen = uniquePos(res->pos, res->poslen); |
| buflen = SHORTALIGN(buflen); |
| buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16); |
| } |
| res++; |
| if (res != ptr) |
| memcpy(res, ptr, sizeof(WordEntryIN)); |
| } |
| else if (ptr->entry.haspos) |
| { |
| if (res->entry.haspos) |
| { |
| /* append ptr's positions to res's positions */ |
| int newlen = ptr->poslen + res->poslen; |
| |
| res->pos = (WordEntryPos *) |
| repalloc(res->pos, newlen * sizeof(WordEntryPos)); |
| memcpy(&res->pos[res->poslen], ptr->pos, |
| ptr->poslen * sizeof(WordEntryPos)); |
| res->poslen = newlen; |
| pfree(ptr->pos); |
| } |
| else |
| { |
| /* just give ptr's positions to pos */ |
| res->entry.haspos = 1; |
| res->pos = ptr->pos; |
| res->poslen = ptr->poslen; |
| } |
| } |
| ptr++; |
| } |
| |
| /* count space needed for last item */ |
| buflen += res->entry.len; |
| if (res->entry.haspos) |
| { |
| res->poslen = uniquePos(res->pos, res->poslen); |
| buflen = SHORTALIGN(buflen); |
| buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16); |
| } |
| |
| *outbuflen = buflen; |
| return res + 1 - a; |
| } |
| |
| |
| Datum |
| tsvectorin(PG_FUNCTION_ARGS) |
| { |
| char *buf = PG_GETARG_CSTRING(0); |
| Node *escontext = fcinfo->context; |
| TSVectorParseState state; |
| WordEntryIN *arr; |
| int totallen; |
| int arrlen; /* allocated size of arr */ |
| WordEntry *inarr; |
| int len = 0; |
| TSVector in; |
| int i; |
| char *token; |
| int toklen; |
| WordEntryPos *pos; |
| int poslen; |
| char *strbuf; |
| int stroff; |
| |
| /* |
| * Tokens are appended to tmpbuf, cur is a pointer to the end of used |
| * space in tmpbuf. |
| */ |
| char *tmpbuf; |
| char *cur; |
| int buflen = 256; /* allocated size of tmpbuf */ |
| |
| state = init_tsvector_parser(buf, 0, escontext); |
| |
| arrlen = 64; |
| arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * arrlen); |
| cur = tmpbuf = (char *) palloc(buflen); |
| |
| while (gettoken_tsvector(state, &token, &toklen, &pos, &poslen, NULL)) |
| { |
| if (toklen >= MAXSTRLEN) |
| ereturn(escontext, (Datum) 0, |
| (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
| errmsg("word is too long (%ld bytes, max %ld bytes)", |
| (long) toklen, |
| (long) (MAXSTRLEN - 1)))); |
| |
| if (cur - tmpbuf > MAXSTRPOS) |
| ereturn(escontext, (Datum) 0, |
| (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
| errmsg("string is too long for tsvector (%ld bytes, max %ld bytes)", |
| (long) (cur - tmpbuf), (long) MAXSTRPOS))); |
| |
| /* |
| * Enlarge buffers if needed |
| */ |
| if (len >= arrlen) |
| { |
| arrlen *= 2; |
| arr = (WordEntryIN *) |
| repalloc(arr, sizeof(WordEntryIN) * arrlen); |
| } |
| while ((cur - tmpbuf) + toklen >= buflen) |
| { |
| int dist = cur - tmpbuf; |
| |
| buflen *= 2; |
| tmpbuf = (char *) repalloc(tmpbuf, buflen); |
| cur = tmpbuf + dist; |
| } |
| arr[len].entry.len = toklen; |
| arr[len].entry.pos = cur - tmpbuf; |
| memcpy(cur, token, toklen); |
| cur += toklen; |
| |
| if (poslen != 0) |
| { |
| arr[len].entry.haspos = 1; |
| arr[len].pos = pos; |
| arr[len].poslen = poslen; |
| } |
| else |
| { |
| arr[len].entry.haspos = 0; |
| arr[len].pos = NULL; |
| arr[len].poslen = 0; |
| } |
| len++; |
| } |
| |
| close_tsvector_parser(state); |
| |
| /* Did gettoken_tsvector fail? */ |
| if (SOFT_ERROR_OCCURRED(escontext)) |
| PG_RETURN_NULL(); |
| |
| if (len > 0) |
| len = uniqueentry(arr, len, tmpbuf, &buflen); |
| else |
| buflen = 0; |
| |
| if (buflen > MAXSTRPOS) |
| ereturn(escontext, (Datum) 0, |
| (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
| errmsg("string is too long for tsvector (%d bytes, max %d bytes)", buflen, MAXSTRPOS))); |
| |
| totallen = CALCDATASIZE(len, buflen); |
| in = (TSVector) palloc0(totallen); |
| SET_VARSIZE(in, totallen); |
| in->size = len; |
| inarr = ARRPTR(in); |
| strbuf = STRPTR(in); |
| stroff = 0; |
| for (i = 0; i < len; i++) |
| { |
| memcpy(strbuf + stroff, &tmpbuf[arr[i].entry.pos], arr[i].entry.len); |
| arr[i].entry.pos = stroff; |
| stroff += arr[i].entry.len; |
| if (arr[i].entry.haspos) |
| { |
| /* This should be unreachable because of MAXNUMPOS restrictions */ |
| if (arr[i].poslen > 0xFFFF) |
| elog(ERROR, "positions array too long"); |
| |
| /* Copy number of positions */ |
| stroff = SHORTALIGN(stroff); |
| *(uint16 *) (strbuf + stroff) = (uint16) arr[i].poslen; |
| stroff += sizeof(uint16); |
| |
| /* Copy positions */ |
| memcpy(strbuf + stroff, arr[i].pos, arr[i].poslen * sizeof(WordEntryPos)); |
| stroff += arr[i].poslen * sizeof(WordEntryPos); |
| |
| pfree(arr[i].pos); |
| } |
| inarr[i] = arr[i].entry; |
| } |
| |
| Assert((strbuf + stroff - (char *) in) == totallen); |
| |
| PG_RETURN_TSVECTOR(in); |
| } |
| |
| Datum |
| tsvectorout(PG_FUNCTION_ARGS) |
| { |
| TSVector out = PG_GETARG_TSVECTOR(0); |
| char *outbuf; |
| int32 i, |
| lenbuf = 0, |
| pp; |
| WordEntry *ptr = ARRPTR(out); |
| char *curbegin, |
| *curin, |
| *curout; |
| |
| lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ; |
| for (i = 0; i < out->size; i++) |
| { |
| lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ; |
| if (ptr[i].haspos) |
| lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out, &(ptr[i])); |
| } |
| |
| curout = outbuf = (char *) palloc(lenbuf); |
| for (i = 0; i < out->size; i++) |
| { |
| curbegin = curin = STRPTR(out) + ptr->pos; |
| if (i != 0) |
| *curout++ = ' '; |
| *curout++ = '\''; |
| while (curin - curbegin < ptr->len) |
| { |
| int len = pg_mblen(curin); |
| |
| if (t_iseq(curin, '\'')) |
| *curout++ = '\''; |
| else if (t_iseq(curin, '\\')) |
| *curout++ = '\\'; |
| |
| while (len--) |
| *curout++ = *curin++; |
| } |
| |
| *curout++ = '\''; |
| if ((pp = POSDATALEN(out, ptr)) != 0) |
| { |
| WordEntryPos *wptr; |
| |
| *curout++ = ':'; |
| wptr = POSDATAPTR(out, ptr); |
| while (pp) |
| { |
| curout += sprintf(curout, "%d", WEP_GETPOS(*wptr)); |
| switch (WEP_GETWEIGHT(*wptr)) |
| { |
| case 3: |
| *curout++ = 'A'; |
| break; |
| case 2: |
| *curout++ = 'B'; |
| break; |
| case 1: |
| *curout++ = 'C'; |
| break; |
| case 0: |
| default: |
| break; |
| } |
| |
| if (pp > 1) |
| *curout++ = ','; |
| pp--; |
| wptr++; |
| } |
| } |
| ptr++; |
| } |
| |
| *curout = '\0'; |
| PG_FREE_IF_COPY(out, 0); |
| PG_RETURN_CSTRING(outbuf); |
| } |
| |
| /* |
| * Binary Input / Output functions. The binary format is as follows: |
| * |
| * uint32 number of lexemes |
| * |
| * for each lexeme: |
| * lexeme text in client encoding, null-terminated |
| * uint16 number of positions |
| * for each position: |
| * uint16 WordEntryPos |
| */ |
| |
| Datum |
| tsvectorsend(PG_FUNCTION_ARGS) |
| { |
| TSVector vec = PG_GETARG_TSVECTOR(0); |
| StringInfoData buf; |
| int i, |
| j; |
| WordEntry *weptr = ARRPTR(vec); |
| |
| pq_begintypsend(&buf); |
| |
| pq_sendint32(&buf, vec->size); |
| for (i = 0; i < vec->size; i++) |
| { |
| uint16 npos; |
| |
| /* |
| * the strings in the TSVector array are not null-terminated, so we |
| * have to send the null-terminator separately |
| */ |
| pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len); |
| pq_sendbyte(&buf, '\0'); |
| |
| npos = POSDATALEN(vec, weptr); |
| pq_sendint16(&buf, npos); |
| |
| if (npos > 0) |
| { |
| WordEntryPos *wepptr = POSDATAPTR(vec, weptr); |
| |
| for (j = 0; j < npos; j++) |
| pq_sendint16(&buf, wepptr[j]); |
| } |
| weptr++; |
| } |
| |
| PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); |
| } |
| |
| Datum |
| tsvectorrecv(PG_FUNCTION_ARGS) |
| { |
| StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); |
| TSVector vec; |
| int i; |
| int32 nentries; |
| int datalen; /* number of bytes used in the variable size |
| * area after fixed size TSVector header and |
| * WordEntries */ |
| Size hdrlen; |
| Size len; /* allocated size of vec */ |
| bool needSort = false; |
| |
| nentries = pq_getmsgint(buf, sizeof(int32)); |
| if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry))) |
| elog(ERROR, "invalid size of tsvector"); |
| |
| hdrlen = DATAHDRSIZE + sizeof(WordEntry) * nentries; |
| |
| len = hdrlen * 2; /* times two to make room for lexemes */ |
| vec = (TSVector) palloc0(len); |
| vec->size = nentries; |
| |
| datalen = 0; |
| for (i = 0; i < nentries; i++) |
| { |
| const char *lexeme; |
| uint16 npos; |
| size_t lex_len; |
| |
| lexeme = pq_getmsgstring(buf); |
| npos = (uint16) pq_getmsgint(buf, sizeof(uint16)); |
| |
| /* sanity checks */ |
| |
| lex_len = strlen(lexeme); |
| if (lex_len > MAXSTRLEN) |
| elog(ERROR, "invalid tsvector: lexeme too long"); |
| |
| if (datalen > MAXSTRPOS) |
| elog(ERROR, "invalid tsvector: maximum total lexeme length exceeded"); |
| |
| if (npos > MAXNUMPOS) |
| elog(ERROR, "unexpected number of tsvector positions"); |
| |
| /* |
| * Looks valid. Fill the WordEntry struct, and copy lexeme. |
| * |
| * But make sure the buffer is large enough first. |
| */ |
| while (hdrlen + SHORTALIGN(datalen + lex_len) + |
| sizeof(uint16) + npos * sizeof(WordEntryPos) >= len) |
| { |
| len *= 2; |
| vec = (TSVector) repalloc(vec, len); |
| } |
| |
| vec->entries[i].haspos = (npos > 0) ? 1 : 0; |
| vec->entries[i].len = lex_len; |
| vec->entries[i].pos = datalen; |
| |
| memcpy(STRPTR(vec) + datalen, lexeme, lex_len); |
| |
| datalen += lex_len; |
| |
| if (i > 0 && compareentry(&vec->entries[i], |
| &vec->entries[i - 1], |
| STRPTR(vec)) <= 0) |
| needSort = true; |
| |
| /* Receive positions */ |
| if (npos > 0) |
| { |
| uint16 j; |
| WordEntryPos *wepptr; |
| |
| /* |
| * Pad to 2-byte alignment if necessary. Though we used palloc0 |
| * for the initial allocation, subsequent repalloc'd memory areas |
| * are not initialized to zero. |
| */ |
| if (datalen != SHORTALIGN(datalen)) |
| { |
| *(STRPTR(vec) + datalen) = '\0'; |
| datalen = SHORTALIGN(datalen); |
| } |
| |
| memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16)); |
| |
| wepptr = POSDATAPTR(vec, &vec->entries[i]); |
| for (j = 0; j < npos; j++) |
| { |
| wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos)); |
| if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1])) |
| elog(ERROR, "position information is misordered"); |
| } |
| |
| datalen += sizeof(uint16) + npos * sizeof(WordEntryPos); |
| } |
| } |
| |
| SET_VARSIZE(vec, hdrlen + datalen); |
| |
| if (needSort) |
| qsort_arg(ARRPTR(vec), vec->size, sizeof(WordEntry), |
| compareentry, STRPTR(vec)); |
| |
| PG_RETURN_TSVECTOR(vec); |
| } |