/*-------------------------------------------------------------------------
 *
 * ts_parse.c
 *      main parse functions for tsearch
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *      src/backend/tsearch/ts_parse.c
 *
 *-------------------------------------------------------------------------
 */
| |
| #include "postgres.h" |
| |
| #include "tsearch/ts_cache.h" |
| #include "tsearch/ts_utils.h" |
| |
/*
 * When IGNORE_LONGLEXEME is defined, a token whose length reaches MAXSTRLEN
 * is reported with a NOTICE and skipped; when it is not defined, the same
 * condition raises an ERROR instead.  Only the macro's presence is tested
 * (via #ifdef), never its value.
 */
#define IGNORE_LONGLEXEME 1
| |
/*
 * Lexize subsystem
 */
| |
/*
 * ParsedLex
 *      One token handed over by the parser, stored as a node of a singly
 *      linked list.
 */
typedef struct ParsedLex
{
    int         type;           /* token type as reported by the parser */
    char       *lemm;           /* token text; the pointer is stored as
                                 * given, the text is not copied */
    int         lenlemm;        /* length of the token text */
    struct ParsedLex *next;     /* following node, or NULL at the list end */
} ParsedLex;
| |
| typedef struct ListParsedLex |
| { |
| ParsedLex *head; |
| ParsedLex *tail; |
| } ListParsedLex; |
| |
| typedef struct |
| { |
| TSConfigCacheEntry *cfg; |
| Oid curDictId; |
| int posDict; |
| DictSubState dictState; |
| ParsedLex *curSub; |
| ListParsedLex towork; /* current list to work */ |
| ListParsedLex waste; /* list of lexemes that already lexized */ |
| |
| /* |
| * fields to store last variant to lexize (basically, thesaurus or similar |
| * to, which wants several lexemes |
| */ |
| |
| ParsedLex *lastRes; |
| TSLexeme *tmpRes; |
| } LexizeData; |
| |
| static void |
| LexizeInit(LexizeData *ld, TSConfigCacheEntry *cfg) |
| { |
| ld->cfg = cfg; |
| ld->curDictId = InvalidOid; |
| ld->posDict = 0; |
| ld->towork.head = ld->towork.tail = ld->curSub = NULL; |
| ld->waste.head = ld->waste.tail = NULL; |
| ld->lastRes = NULL; |
| ld->tmpRes = NULL; |
| } |
| |
| static void |
| LPLAddTail(ListParsedLex *list, ParsedLex *newpl) |
| { |
| if (list->tail) |
| { |
| list->tail->next = newpl; |
| list->tail = newpl; |
| } |
| else |
| list->head = list->tail = newpl; |
| newpl->next = NULL; |
| } |
| |
| static ParsedLex * |
| LPLRemoveHead(ListParsedLex *list) |
| { |
| ParsedLex *res = list->head; |
| |
| if (list->head) |
| list->head = list->head->next; |
| |
| if (list->head == NULL) |
| list->tail = NULL; |
| |
| return res; |
| } |
| |
| static void |
| LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm) |
| { |
| ParsedLex *newpl = (ParsedLex *) palloc(sizeof(ParsedLex)); |
| |
| newpl->type = type; |
| newpl->lemm = lemm; |
| newpl->lenlemm = lenlemm; |
| LPLAddTail(&ld->towork, newpl); |
| ld->curSub = ld->towork.tail; |
| } |
| |
| static void |
| RemoveHead(LexizeData *ld) |
| { |
| LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork)); |
| |
| ld->posDict = 0; |
| } |
| |
| static void |
| setCorrLex(LexizeData *ld, ParsedLex **correspondLexem) |
| { |
| if (correspondLexem) |
| { |
| *correspondLexem = ld->waste.head; |
| } |
| else |
| { |
| ParsedLex *tmp, |
| *ptr = ld->waste.head; |
| |
| while (ptr) |
| { |
| tmp = ptr->next; |
| pfree(ptr); |
| ptr = tmp; |
| } |
| } |
| ld->waste.head = ld->waste.tail = NULL; |
| } |
| |
| static void |
| moveToWaste(LexizeData *ld, ParsedLex *stop) |
| { |
| bool go = true; |
| |
| while (ld->towork.head && go) |
| { |
| if (ld->towork.head == stop) |
| { |
| ld->curSub = stop->next; |
| go = false; |
| } |
| RemoveHead(ld); |
| } |
| } |
| |
| static void |
| setNewTmpRes(LexizeData *ld, ParsedLex *lex, TSLexeme *res) |
| { |
| if (ld->tmpRes) |
| { |
| TSLexeme *ptr; |
| |
| for (ptr = ld->tmpRes; ptr->lexeme; ptr++) |
| pfree(ptr->lexeme); |
| pfree(ld->tmpRes); |
| } |
| ld->tmpRes = res; |
| ld->lastRes = lex; |
| } |
| |
| static TSLexeme * |
| LexizeExec(LexizeData *ld, ParsedLex **correspondLexem) |
| { |
| int i; |
| ListDictionary *map; |
| TSDictionaryCacheEntry *dict; |
| TSLexeme *res; |
| |
| if (ld->curDictId == InvalidOid) |
| { |
| /* |
| * usual mode: dictionary wants only one word, but we should keep in |
| * mind that we should go through all stack |
| */ |
| |
| while (ld->towork.head) |
| { |
| ParsedLex *curVal = ld->towork.head; |
| char *curValLemm = curVal->lemm; |
| int curValLenLemm = curVal->lenlemm; |
| |
| map = ld->cfg->map + curVal->type; |
| |
| if (curVal->type == 0 || curVal->type >= ld->cfg->lenmap || map->len == 0) |
| { |
| /* skip this type of lexeme */ |
| RemoveHead(ld); |
| continue; |
| } |
| |
| for (i = ld->posDict; i < map->len; i++) |
| { |
| dict = lookup_ts_dictionary_cache(map->dictIds[i]); |
| |
| ld->dictState.isend = ld->dictState.getnext = false; |
| ld->dictState.private_state = NULL; |
| res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize), |
| PointerGetDatum(dict->dictData), |
| PointerGetDatum(curValLemm), |
| Int32GetDatum(curValLenLemm), |
| PointerGetDatum(&ld->dictState))); |
| |
| if (ld->dictState.getnext) |
| { |
| /* |
| * dictionary wants next word, so setup and store current |
| * position and go to multiword mode |
| */ |
| |
| ld->curDictId = DatumGetObjectId(map->dictIds[i]); |
| ld->posDict = i + 1; |
| ld->curSub = curVal->next; |
| if (res) |
| setNewTmpRes(ld, curVal, res); |
| return LexizeExec(ld, correspondLexem); |
| } |
| |
| if (!res) /* dictionary doesn't know this lexeme */ |
| continue; |
| |
| if (res->flags & TSL_FILTER) |
| { |
| curValLemm = res->lexeme; |
| curValLenLemm = strlen(res->lexeme); |
| continue; |
| } |
| |
| RemoveHead(ld); |
| setCorrLex(ld, correspondLexem); |
| return res; |
| } |
| |
| RemoveHead(ld); |
| } |
| } |
| else |
| { /* curDictId is valid */ |
| dict = lookup_ts_dictionary_cache(ld->curDictId); |
| |
| /* |
| * Dictionary ld->curDictId asks us about following words |
| */ |
| |
| while (ld->curSub) |
| { |
| ParsedLex *curVal = ld->curSub; |
| |
| map = ld->cfg->map + curVal->type; |
| |
| if (curVal->type != 0) |
| { |
| bool dictExists = false; |
| |
| if (curVal->type >= ld->cfg->lenmap || map->len == 0) |
| { |
| /* skip this type of lexeme */ |
| ld->curSub = curVal->next; |
| continue; |
| } |
| |
| /* |
| * We should be sure that current type of lexeme is recognized |
| * by our dictionary: we just check is it exist in list of |
| * dictionaries ? |
| */ |
| for (i = 0; i < map->len && !dictExists; i++) |
| if (ld->curDictId == DatumGetObjectId(map->dictIds[i])) |
| dictExists = true; |
| |
| if (!dictExists) |
| { |
| /* |
| * Dictionary can't work with current type of lexeme, |
| * return to basic mode and redo all stored lexemes |
| */ |
| ld->curDictId = InvalidOid; |
| return LexizeExec(ld, correspondLexem); |
| } |
| } |
| |
| ld->dictState.isend = (curVal->type == 0) ? true : false; |
| ld->dictState.getnext = false; |
| |
| res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize), |
| PointerGetDatum(dict->dictData), |
| PointerGetDatum(curVal->lemm), |
| Int32GetDatum(curVal->lenlemm), |
| PointerGetDatum(&ld->dictState))); |
| |
| if (ld->dictState.getnext) |
| { |
| /* Dictionary wants one more */ |
| ld->curSub = curVal->next; |
| if (res) |
| setNewTmpRes(ld, curVal, res); |
| continue; |
| } |
| |
| if (res || ld->tmpRes) |
| { |
| /* |
| * Dictionary normalizes lexemes, so we remove from stack all |
| * used lexemes, return to basic mode and redo end of stack |
| * (if it exists) |
| */ |
| if (res) |
| { |
| moveToWaste(ld, ld->curSub); |
| } |
| else |
| { |
| res = ld->tmpRes; |
| moveToWaste(ld, ld->lastRes); |
| } |
| |
| /* reset to initial state */ |
| ld->curDictId = InvalidOid; |
| ld->posDict = 0; |
| ld->lastRes = NULL; |
| ld->tmpRes = NULL; |
| setCorrLex(ld, correspondLexem); |
| return res; |
| } |
| |
| /* |
| * Dict don't want next lexem and didn't recognize anything, redo |
| * from ld->towork.head |
| */ |
| ld->curDictId = InvalidOid; |
| return LexizeExec(ld, correspondLexem); |
| } |
| } |
| |
| setCorrLex(ld, correspondLexem); |
| return NULL; |
| } |
| |
| /* |
| * Parse string and lexize words. |
| * |
| * prs will be filled in. |
| */ |
| void |
| parsetext(Oid cfgId, ParsedText *prs, char *buf, int buflen) |
| { |
| int type, |
| lenlemm; |
| char *lemm = NULL; |
| LexizeData ldata; |
| TSLexeme *norms; |
| TSConfigCacheEntry *cfg; |
| TSParserCacheEntry *prsobj; |
| void *prsdata; |
| |
| cfg = lookup_ts_config_cache(cfgId); |
| prsobj = lookup_ts_parser_cache(cfg->prsId); |
| |
| prsdata = (void *) DatumGetPointer(FunctionCall2(&prsobj->prsstart, |
| PointerGetDatum(buf), |
| Int32GetDatum(buflen))); |
| |
| LexizeInit(&ldata, cfg); |
| |
| do |
| { |
| type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken), |
| PointerGetDatum(prsdata), |
| PointerGetDatum(&lemm), |
| PointerGetDatum(&lenlemm))); |
| |
| if (type > 0 && lenlemm >= MAXSTRLEN) |
| { |
| #ifdef IGNORE_LONGLEXEME |
| ereport(NOTICE, |
| (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
| errmsg("word is too long to be indexed"), |
| errdetail("Words longer than %d characters are ignored.", |
| MAXSTRLEN))); |
| continue; |
| #else |
| ereport(ERROR, |
| (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
| errmsg("word is too long to be indexed"), |
| errdetail("Words longer than %d characters are ignored.", |
| MAXSTRLEN))); |
| #endif |
| } |
| |
| LexizeAddLemm(&ldata, type, lemm, lenlemm); |
| |
| while ((norms = LexizeExec(&ldata, NULL)) != NULL) |
| { |
| TSLexeme *ptr = norms; |
| |
| prs->pos++; /* set pos */ |
| |
| while (ptr->lexeme) |
| { |
| if (prs->curwords == prs->lenwords) |
| { |
| prs->lenwords *= 2; |
| prs->words = (ParsedWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(ParsedWord)); |
| } |
| |
| if (ptr->flags & TSL_ADDPOS) |
| prs->pos++; |
| prs->words[prs->curwords].len = strlen(ptr->lexeme); |
| prs->words[prs->curwords].word = ptr->lexeme; |
| prs->words[prs->curwords].nvariant = ptr->nvariant; |
| prs->words[prs->curwords].flags = ptr->flags & TSL_PREFIX; |
| prs->words[prs->curwords].alen = 0; |
| prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos); |
| ptr++; |
| prs->curwords++; |
| } |
| pfree(norms); |
| } |
| } while (type > 0); |
| |
| FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata)); |
| } |
| |
/*
 * Headline framework
 */
| static void |
| hladdword(HeadlineParsedText *prs, char *buf, int buflen, int type) |
| { |
| while (prs->curwords >= prs->lenwords) |
| { |
| prs->lenwords *= 2; |
| prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry)); |
| } |
| memset(&(prs->words[prs->curwords]), 0, sizeof(HeadlineWordEntry)); |
| prs->words[prs->curwords].type = (uint8) type; |
| prs->words[prs->curwords].len = buflen; |
| prs->words[prs->curwords].word = palloc(buflen); |
| memcpy(prs->words[prs->curwords].word, buf, buflen); |
| prs->curwords++; |
| } |
| |
| static void |
| hlfinditem(HeadlineParsedText *prs, TSQuery query, int32 pos, char *buf, int buflen) |
| { |
| int i; |
| QueryItem *item = GETQUERY(query); |
| HeadlineWordEntry *word; |
| |
| while (prs->curwords + query->size >= prs->lenwords) |
| { |
| prs->lenwords *= 2; |
| prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry)); |
| } |
| |
| word = &(prs->words[prs->curwords - 1]); |
| word->pos = LIMITPOS(pos); |
| for (i = 0; i < query->size; i++) |
| { |
| if (item->type == QI_VAL && |
| tsCompareString(GETOPERAND(query) + item->qoperand.distance, item->qoperand.length, |
| buf, buflen, item->qoperand.prefix) == 0) |
| { |
| if (word->item) |
| { |
| memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWordEntry)); |
| prs->words[prs->curwords].item = &item->qoperand; |
| prs->words[prs->curwords].repeated = 1; |
| prs->curwords++; |
| } |
| else |
| word->item = &item->qoperand; |
| } |
| item++; |
| } |
| } |
| |
| static void |
| addHLParsedLex(HeadlineParsedText *prs, TSQuery query, ParsedLex *lexs, TSLexeme *norms) |
| { |
| ParsedLex *tmplexs; |
| TSLexeme *ptr; |
| int32 savedpos; |
| |
| while (lexs) |
| { |
| if (lexs->type > 0) |
| hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type); |
| |
| ptr = norms; |
| savedpos = prs->vectorpos; |
| while (ptr && ptr->lexeme) |
| { |
| if (ptr->flags & TSL_ADDPOS) |
| savedpos++; |
| hlfinditem(prs, query, savedpos, ptr->lexeme, strlen(ptr->lexeme)); |
| ptr++; |
| } |
| |
| tmplexs = lexs->next; |
| pfree(lexs); |
| lexs = tmplexs; |
| } |
| |
| if (norms) |
| { |
| ptr = norms; |
| while (ptr->lexeme) |
| { |
| if (ptr->flags & TSL_ADDPOS) |
| prs->vectorpos++; |
| pfree(ptr->lexeme); |
| ptr++; |
| } |
| pfree(norms); |
| } |
| } |
| |
| void |
| hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query, char *buf, int buflen) |
| { |
| int type, |
| lenlemm; |
| char *lemm = NULL; |
| LexizeData ldata; |
| TSLexeme *norms; |
| ParsedLex *lexs; |
| TSConfigCacheEntry *cfg; |
| TSParserCacheEntry *prsobj; |
| void *prsdata; |
| |
| cfg = lookup_ts_config_cache(cfgId); |
| prsobj = lookup_ts_parser_cache(cfg->prsId); |
| |
| prsdata = (void *) DatumGetPointer(FunctionCall2(&(prsobj->prsstart), |
| PointerGetDatum(buf), |
| Int32GetDatum(buflen))); |
| |
| LexizeInit(&ldata, cfg); |
| |
| do |
| { |
| type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken), |
| PointerGetDatum(prsdata), |
| PointerGetDatum(&lemm), |
| PointerGetDatum(&lenlemm))); |
| |
| if (type > 0 && lenlemm >= MAXSTRLEN) |
| { |
| #ifdef IGNORE_LONGLEXEME |
| ereport(NOTICE, |
| (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
| errmsg("word is too long to be indexed"), |
| errdetail("Words longer than %d characters are ignored.", |
| MAXSTRLEN))); |
| continue; |
| #else |
| ereport(ERROR, |
| (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
| errmsg("word is too long to be indexed"), |
| errdetail("Words longer than %d characters are ignored.", |
| MAXSTRLEN))); |
| #endif |
| } |
| |
| LexizeAddLemm(&ldata, type, lemm, lenlemm); |
| |
| do |
| { |
| if ((norms = LexizeExec(&ldata, &lexs)) != NULL) |
| { |
| prs->vectorpos++; |
| addHLParsedLex(prs, query, lexs, norms); |
| } |
| else |
| addHLParsedLex(prs, query, lexs, NULL); |
| } while (norms); |
| |
| } while (type > 0); |
| |
| FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata)); |
| } |
| |
| text * |
| generateHeadline(HeadlineParsedText *prs) |
| { |
| text *out; |
| char *ptr; |
| int len = 128; |
| int numfragments = 0; |
| int16 infrag = 0; |
| |
| HeadlineWordEntry *wrd = prs->words; |
| |
| out = (text *) palloc(len); |
| ptr = ((char *) out) + VARHDRSZ; |
| |
| while (wrd - prs->words < prs->curwords) |
| { |
| while (wrd->len + prs->stopsellen + prs->startsellen + prs->fragdelimlen + (ptr - ((char *) out)) >= len) |
| { |
| int dist = ptr - ((char *) out); |
| |
| len *= 2; |
| out = (text *) repalloc(out, len); |
| ptr = ((char *) out) + dist; |
| } |
| |
| if (wrd->in && !wrd->repeated) |
| { |
| if (!infrag) |
| { |
| |
| /* start of a new fragment */ |
| infrag = 1; |
| numfragments++; |
| /* add a fragment delimiter if this is after the first one */ |
| if (numfragments > 1) |
| { |
| memcpy(ptr, prs->fragdelim, prs->fragdelimlen); |
| ptr += prs->fragdelimlen; |
| } |
| |
| } |
| if (wrd->replace) |
| { |
| *ptr = ' '; |
| ptr++; |
| } |
| else if (!wrd->skip) |
| { |
| if (wrd->selected) |
| { |
| memcpy(ptr, prs->startsel, prs->startsellen); |
| ptr += prs->startsellen; |
| } |
| memcpy(ptr, wrd->word, wrd->len); |
| ptr += wrd->len; |
| if (wrd->selected) |
| { |
| memcpy(ptr, prs->stopsel, prs->stopsellen); |
| ptr += prs->stopsellen; |
| } |
| } |
| } |
| else if (!wrd->repeated) |
| { |
| if (infrag) |
| infrag = 0; |
| pfree(wrd->word); |
| } |
| |
| wrd++; |
| } |
| |
| SET_VARSIZE(out, ptr - ((char *) out)); |
| return out; |
| } |