// src/utils/smartQueries.ts - doris-website - Git at Google

 import lunr from 'lunr';
 import { SmartQuery, SmartTerm } from '../../shared/interfaces';
 import { smartTerms } from './smartTerms';
 import { language, removeDefaultStopWordFilter } from './proxiedGenerated';

 /** English stop words used to build `lunr.stopWordFilter`. */
 const EN_STOP_WORDS: string[] = [
     'a', 'able', 'about', 'across', 'after', 'all', 'almost', 'also', 'am', 'among',
     'an', 'and', 'any', 'are', 'as', 'at', 'be', 'because', 'been', 'but',
     'by', 'can', 'cannot', 'could', 'dear', 'did', 'do', 'does', 'either', 'else',
     'ever', 'every', 'for', 'from', 'get', 'got', 'had', 'has', 'have', 'he',
     'her', 'hers', 'him', 'his', 'how', 'however', 'i', 'if', 'in', 'into',
     'is', 'it', 'its', 'just', 'least', 'let', 'like', 'likely', 'may', 'me',
     'might', 'most', 'must', 'my', 'neither', 'no', 'nor', 'not', 'of', 'off',
     'often', 'on', 'only', 'or', 'other', 'our', 'own', 'rather', 'said', 'say',
     'says', 'she', 'should', 'since', 'so', 'some', 'than', 'that', 'the', 'their',
     'them', 'then', 'there', 'these', 'they', 'this', 'tis', 'to', 'too', 'twas',
     'us', 'wants', 'was', 'we', 'were', 'what', 'when', 'where', 'which', 'while',
     'who', 'whom', 'why', 'will', 'with', 'would', 'yet', 'you', 'your',
 ];

 /** Becomes `true` once `installStopWordFilters` has completed successfully. */
 let stopWordFiltersInstalled = false;

 /**
  * Install the stop word filter factory, the English stop word filter and the
  * Chinese (`zh`) language support on `lunr`.
  *
  * The work is done only once. `lunrLanguageZh` registers pipeline functions
  * under the fixed labels "trimmer-zh" and "stopWordFilter-zh", so repeating it
  * for every query would rebuild identical filters and register the same labels
  * again (lunr reports a warning when a label is overwritten - see the lunr
  * `Pipeline.registerFunction` documentation).
  */
 function installStopWordFilters(): void {
     if (stopWordFiltersInstalled) {
         return;
     }

     lunr.generateStopWordFilter = function (stopWords: string[]) {
         // Map every stop word to itself so the lookup below only matches real
         // entries and never inherited object properties.
         const words: Record<string, string> = {};
         for (const stopWord of stopWords) {
             words[stopWord] = stopWord;
         }

         // Returns the token when it is not a stop word, `undefined` otherwise.
         return function (token) {
             if (token && words[token.toString()] !== token.toString()) return token;
         };
     };
     lunr.stopWordFilter = lunr.generateStopWordFilter(EN_STOP_WORDS);
     // No custom tokenizer is used; the second argument is passed explicitly so
     // the call matches the two-parameter declaration of `lunrLanguageZh`.
     lunrLanguageZh(lunr, undefined);

     // Only mark as installed after everything above succeeded, so a failed
     // attempt is retried on the next call.
     stopWordFiltersInstalled = true;
 }

 /**
  * Get all possible queries for a list of tokens consisting of mixed English and
  * Chinese words, using a Chinese words dictionary.
  *
  * The result contains, in order: queries for the matched terms (plus variants
  * with stop words removed), then queries for terms of three or more tokens with
  * one token left out. Each group is followed by variants where the last token
  * gets a trailing wildcard, because it may still be being typed.
  *
  * @param tokens - Tokens consisting of English words or strings of consecutive Chinese words.
  * @param zhDictionary - A Chinese words dictionary.
  *
  * @returns A smart query list.
  */
 export function smartQueries(tokens: string[], zhDictionary: string[]): SmartQuery[] {
     const terms = smartTerms(tokens, zhDictionary);

     if (terms.length === 0) {
         // There are no matched terms.
         // All tokens are considered required and with wildcard.
         return [
             {
                 tokens,
                 term: tokens.map(value => ({
                     value,
                     presence: lunr.Query.presence.REQUIRED,
                     wildcard: lunr.Query.wildcard.LEADING | lunr.Query.wildcard.TRAILING,
                 })),
             },
         ];
     }

     // The last token of a term may be incomplete while the user is typing.
     for (const term of terms) {
         term[term.length - 1].maybeTyping = true;
     }

     installStopWordFilters();

     // Try to append terms without stop words,
     // since they are removed in the index.
     const stopWordPipelines: lunr.PipelineFunction[] = [];
     for (const lang of language) {
         if (lang === 'en') {
             if (!removeDefaultStopWordFilter) {
                 stopWordPipelines.unshift(lunr.stopWordFilter);
             }
         } else {
             // A configured language may have no plugin installed on `lunr`;
             // skip it instead of failing on `undefined.stopWordFilter`.
             const lunrLang = (lunr as any)[lang] as typeof lunr | undefined;
             if (lunrLang && lunrLang.stopWordFilter) {
                 stopWordPipelines.unshift(lunrLang.stopWordFilter);
             }
         }
     }

     let refinedTerms: SmartTerm[];

     if (stopWordPipelines.length > 0) {
         // Run a term through every stop word filter, keeping only the tokens
         // that each filter lets through. The filters are called with the plain
         // token string rather than a lunr token.
         const pipe = (term: SmartTerm) =>
             stopWordPipelines.reduce(
                 (current, p) =>
                     current.filter(item => (p as unknown as (str: string) => string | undefined)(item.value)),
                 term,
             );
         refinedTerms = [];
         const newTerms: SmartTerm[] = [];
         for (const term of terms) {
             const filteredTerm = pipe(term);
             refinedTerms.push(filteredTerm);
             // Add extra terms only if some stop words are removed,
             // and some non-stop-words exist too.
             if (filteredTerm.length < term.length && filteredTerm.length > 0) {
                 newTerms.push(filteredTerm);
             }
         }
         terms.push(...newTerms);
     } else {
         refinedTerms = terms.slice();
     }

     // Also add extra terms which miss one of the searched tokens,
     // when the term contains 3 or more tokens, so that results lacking a
     // single searched token can still be matched.
     const extraTerms: SmartTerm[] = [];
     for (const term of refinedTerms) {
         if (term.length > 2) {
             // Drop one token at a time, starting from the last one.
             for (let i = term.length - 1; i >= 0; i -= 1) {
                 extraTerms.push(term.slice(0, i).concat(term.slice(i + 1)));
             }
         }
     }

     return getQueriesMaybeTyping(terms).concat(getQueriesMaybeTyping(extraTerms));
 }

 /**
  * Build the queries for the given terms, followed by "still typing" variants.
  *
  * The variants are built only from terms whose last token is flagged
  * `maybeTyping` and does not already have a trailing wildcard; they are
  * converted with the `maybeTyping` option of `termsToQueries` enabled.
  *
  * @param terms - The terms to convert.
  *
  * @returns Queries for all terms, then the "still typing" variants.
  */
 function getQueriesMaybeTyping(terms: SmartTerm[]): SmartQuery[] {
     const stillTyping = terms.filter(term => {
         const lastToken = term[term.length - 1];
         if (lastToken.trailing) {
             // Already has a trailing wildcard: a variant would be a duplicate.
             return false;
         }
         return lastToken.maybeTyping;
     });

     return [...termsToQueries(terms), ...termsToQueries(stillTyping, true)];
 }

 /**
  * Convert terms into queries in which every token is required.
  *
  * @param terms - The terms to convert; each term becomes one query.
  * @param maybeTyping - When truthy, tokens flagged `maybeTyping` also get a
  * trailing wildcard, in addition to tokens flagged `trailing`.
  *
  * @returns One query per term, in the same order.
  */
 function termsToQueries(terms: SmartTerm[], maybeTyping?: boolean): SmartQuery[] {
     return terms.map(term => {
         const values = term.map(item => item.value);
         const clauses = term.map(item => {
             // The last token of a term may be incomplete while the user is
             // typing, so the `maybeTyping` pass adds a trailing wildcard to it.
             const wantsTrailingWildcard = maybeTyping ? item.trailing || item.maybeTyping : item.trailing;
             return {
                 value: item.value,
                 presence: lunr.Query.presence.REQUIRED,
                 wildcard: wantsTrailingWildcard ? lunr.Query.wildcard.TRAILING : lunr.Query.wildcard.NONE,
             };
         });
         return { tokens: values, term: clauses };
     });
 }

 /**
  * Create a pipeline function that strips, from both ends of a token, every
  * character that is not in the given character set.
  *
  * Note: `lunr-languages/lunr.stemmer.support` is required.
  *
  * @param wordCharacters - Character ranges written as the inside of a regular
  * expression character class; compiled with the `u` flag so `\u{...}` escapes work.
  *
  * @returns A function that calls `token.update` with the trimming callback and
  * returns whatever `update` returns.
  */
 function generateTrimmer(wordCharacters) {
   const nonWordRun = "[^" + wordCharacters + "]+";
   const leading = new RegExp("^" + nonWordRun, "u");
   const trailing = new RegExp(nonWordRun + "$", "u");

   return token => token.update(text => text.replace(leading, "").replace(trailing, ""));
 }


 /**
  * Install Chinese (`zh`) language support on the given lunr object.
  *
  * Replaces `lunr.trimmerSupport.generateTrimmer`, defines the `lunr.zh` plugin
  * function together with its `wordCharacters`, `trimmer` and `stopWordFilter`,
  * and registers the two pipeline functions under the labels "trimmer-zh" and
  * "stopWordFilter-zh".
  *
  * @param lunr - The lunr object to extend. It must already provide
  * `trimmerSupport`, `generateStopWordFilter` and `Pipeline.registerFunction`.
  * @param tokenizer - Optional tokenizer. When given, it is stored as
  * `lunr.zh.tokenizer` and assigned to `this.tokenizer` whenever the plugin runs.
  * It is optional so the function can be called with the lunr object alone.
  */
 function lunrLanguageZh(lunr, tokenizer?) {
   lunr.trimmerSupport.generateTrimmer = generateTrimmer;
   // Plugin function: resets the pipeline of the object it is invoked on and
   // installs the zh trimmer and stop word filter.
   lunr.zh = function () {
       this.pipeline.reset();
       this.pipeline.add(lunr.zh.trimmer, lunr.zh.stopWordFilter);
       if (tokenizer) {
           this.tokenizer = tokenizer;
       }
   };
   if (tokenizer) {
       lunr.zh.tokenizer = tokenizer;
   }
   // Character ranges treated as word characters by the zh trimmer.
   // https://zhuanlan.zhihu.com/p/33335629
   // https://mothereff.in/regexpu#input=const+regex+%3D+/%5Cp%7BUnified_Ideograph%7D/u%3B&unicodePropertyEscape=1
   lunr.zh.wordCharacters =
       "\\u3400-\\u4DBF\\u4E00-\\u9FFC\\uFA0E\\uFA0F\\uFA11\\uFA13\\uFA14\\uFA1F\\uFA21\\uFA23\\uFA24\\uFA27-\\uFA29\\u{20000}-\\u{2A6DD}\\u{2A700}-\\u{2B734}\\u{2B740}-\\u{2B81D}\\u{2B820}-\\u{2CEA1}\\u{2CEB0}-\\u{2EBE0}\\u{30000}-\\u{3134A}";
   lunr.zh.trimmer = lunr.trimmerSupport.generateTrimmer(lunr.zh.wordCharacters);
   lunr.Pipeline.registerFunction(lunr.zh.trimmer, "trimmer-zh");
   /* lunr stop word filter. see https://www.ranks.nl/stopwords/chinese-stopwords */
   // The list is a space-separated string; each entry is one stop word.
   lunr.zh.stopWordFilter = lunr.generateStopWordFilter("的 一 不 在 人 有 是 为 以 于 上 他 而 后 之 来 及 了 因 下 可 到 由 这 与 也 此 但 并 个 其 已 无 小 我 们 起 最 再 今 去 好 只 又 或 很 亦 某 把 那 你 乃 它 吧 被 比 别 趁 当 从 到 得 打 凡 儿 尔 该 各 给 跟 和 何 还 即 几 既 看 据 距 靠 啦 了 另 么 每 们 嘛 拿 哪 那 您 凭 且 却 让 仍 啥 如 若 使 谁 虽 随 同 所 她 哇 嗡 往 哪 些 向 沿 哟 用 于 咱 则 怎 曾 至 致 着 诸 自".split(" "));
   lunr.Pipeline.registerFunction(lunr.zh.stopWordFilter, "stopWordFilter-zh");
 }
	import lunr from 'lunr';
	import { SmartQuery, SmartTerm } from '../../shared/interfaces';
	import { smartTerms } from './smartTerms';
	import { language, removeDefaultStopWordFilter } from './proxiedGenerated';

	/**
	 * Get all possible queries for a list of tokens consisting of mixed English and
	 * Chinese words, using a Chinese words dictionary.
	 *
	 * The result contains, in order: queries for the matched terms (plus variants
	 * with stop words removed), then queries for terms of three or more tokens with
	 * one token left out. Each group is followed by variants where the last token
	 * gets a trailing wildcard, because it may still be being typed.
	 *
	 * @param tokens - Tokens consisting of English words or strings of consecutive Chinese words.
	 * @param zhDictionary - A Chinese words dictionary.
	 *
	 * @returns A smart query list.
	 */
	export function smartQueries(tokens: string[], zhDictionary: string[]): SmartQuery[] {
	    const terms = smartTerms(tokens, zhDictionary);

	    if (terms.length === 0) {
	        // There are no matched terms.
	        // All tokens are considered required and with wildcard.
	        return [
	            {
	                tokens,
	                term: tokens.map(value => ({
	                    value,
	                    presence: lunr.Query.presence.REQUIRED,
	                    wildcard: lunr.Query.wildcard.LEADING | lunr.Query.wildcard.TRAILING,
	                })),
	            },
	        ];
	    }

	    // The last token of a term may be incomplete while the user is typing.
	    for (const term of terms) {
	        term[term.length - 1].maybeTyping = true;
	    }

	    // Install the stop word filter factory, the English stop word filter and
	    // the zh language support on `lunr`. NOTE: this runs on every call.
	    lunr.generateStopWordFilter = function (stopWords: string[]) {
	        // Map every stop word to itself so the lookup below only matches real
	        // entries and never inherited object properties.
	        const words: Record<string, string> = {};
	        for (const stopWord of stopWords) {
	            words[stopWord] = stopWord;
	        }

	        // Returns the token when it is not a stop word, `undefined` otherwise.
	        return function (token) {
	            if (token && words[token.toString()] !== token.toString()) return token;
	        };
	    };
	    lunr.stopWordFilter = lunr.generateStopWordFilter([
	        'a', 'able', 'about', 'across', 'after', 'all', 'almost', 'also', 'am', 'among',
	        'an', 'and', 'any', 'are', 'as', 'at', 'be', 'because', 'been', 'but',
	        'by', 'can', 'cannot', 'could', 'dear', 'did', 'do', 'does', 'either', 'else',
	        'ever', 'every', 'for', 'from', 'get', 'got', 'had', 'has', 'have', 'he',
	        'her', 'hers', 'him', 'his', 'how', 'however', 'i', 'if', 'in', 'into',
	        'is', 'it', 'its', 'just', 'least', 'let', 'like', 'likely', 'may', 'me',
	        'might', 'most', 'must', 'my', 'neither', 'no', 'nor', 'not', 'of', 'off',
	        'often', 'on', 'only', 'or', 'other', 'our', 'own', 'rather', 'said', 'say',
	        'says', 'she', 'should', 'since', 'so', 'some', 'than', 'that', 'the', 'their',
	        'them', 'then', 'there', 'these', 'they', 'this', 'tis', 'to', 'too', 'twas',
	        'us', 'wants', 'was', 'we', 'were', 'what', 'when', 'where', 'which', 'while',
	        'who', 'whom', 'why', 'will', 'with', 'would', 'yet', 'you', 'your',
	    ]);
	    // No custom tokenizer is used; the second argument is passed explicitly so
	    // the call matches the two-parameter declaration of `lunrLanguageZh`.
	    lunrLanguageZh(lunr, undefined);

	    // Try to append terms without stop words,
	    // since they are removed in the index.
	    const stopWordPipelines: lunr.PipelineFunction[] = [];
	    for (const lang of language) {
	        if (lang === 'en') {
	            if (!removeDefaultStopWordFilter) {
	                stopWordPipelines.unshift(lunr.stopWordFilter);
	            }
	        } else {
	            // A configured language may have no plugin installed on `lunr`;
	            // skip it instead of failing on `undefined.stopWordFilter`.
	            const lunrLang = (lunr as any)[lang] as typeof lunr | undefined;
	            if (lunrLang && lunrLang.stopWordFilter) {
	                stopWordPipelines.unshift(lunrLang.stopWordFilter);
	            }
	        }
	    }

	    let refinedTerms: SmartTerm[];

	    if (stopWordPipelines.length > 0) {
	        // Run a term through every stop word filter, keeping only the tokens
	        // that each filter lets through. The filters are called with the plain
	        // token string rather than a lunr token.
	        const pipe = (term: SmartTerm) =>
	            stopWordPipelines.reduce(
	                (current, p) =>
	                    current.filter(item => (p as unknown as (str: string) => string | undefined)(item.value)),
	                term,
	            );
	        refinedTerms = [];
	        const newTerms: SmartTerm[] = [];
	        for (const term of terms) {
	            const filteredTerm = pipe(term);
	            refinedTerms.push(filteredTerm);
	            // Add extra terms only if some stop words are removed,
	            // and some non-stop-words exist too.
	            if (filteredTerm.length < term.length && filteredTerm.length > 0) {
	                newTerms.push(filteredTerm);
	            }
	        }
	        terms.push(...newTerms);
	    } else {
	        refinedTerms = terms.slice();
	    }

	    // Also add extra terms which miss one of the searched tokens,
	    // when the term contains 3 or more tokens, so that results lacking a
	    // single searched token can still be matched.
	    const extraTerms: SmartTerm[] = [];
	    for (const term of refinedTerms) {
	        if (term.length > 2) {
	            // Drop one token at a time, starting from the last one.
	            for (let i = term.length - 1; i >= 0; i -= 1) {
	                extraTerms.push(term.slice(0, i).concat(term.slice(i + 1)));
	            }
	        }
	    }

	    return getQueriesMaybeTyping(terms).concat(getQueriesMaybeTyping(extraTerms));
	}

	/**
	 * Build the queries for the given terms, followed by "still typing" variants.
	 *
	 * The variants are built only from terms whose last token is flagged
	 * `maybeTyping` and does not already have a trailing wildcard; they are
	 * converted with the `maybeTyping` option of `termsToQueries` enabled.
	 *
	 * @param terms - The terms to convert.
	 *
	 * @returns Queries for all terms, then the "still typing" variants.
	 */
	function getQueriesMaybeTyping(terms: SmartTerm[]): SmartQuery[] {
	    const stillTyping = terms.filter(term => {
	        const lastToken = term[term.length - 1];
	        if (lastToken.trailing) {
	            // Already has a trailing wildcard: a variant would be a duplicate.
	            return false;
	        }
	        return lastToken.maybeTyping;
	    });

	    return [...termsToQueries(terms), ...termsToQueries(stillTyping, true)];
	}

	/**
	 * Convert terms into queries in which every token is required.
	 *
	 * @param terms - The terms to convert; each term becomes one query.
	 * @param maybeTyping - When truthy, tokens flagged `maybeTyping` also get a
	 * trailing wildcard, in addition to tokens flagged `trailing`.
	 *
	 * @returns One query per term, in the same order.
	 */
	function termsToQueries(terms: SmartTerm[], maybeTyping?: boolean): SmartQuery[] {
	    return terms.map(term => ({
	        tokens: term.map(item => item.value),
	        term: term.map(item => ({
	            value: item.value,
	            presence: lunr.Query.presence.REQUIRED,
	            // The last token of a term may be incomplete while the user is typing,
	            // so the `maybeTyping` pass adds a trailing wildcard to it.
	            wildcard: (maybeTyping ? item.trailing || item.maybeTyping : item.trailing)
	                ? lunr.Query.wildcard.TRAILING
	                : lunr.Query.wildcard.NONE,
	        })),
	    }));
	}

	/**
	 * Create a pipeline function that strips, from both ends of a token, every
	 * character that is not in the given character set.
	 *
	 * Note: `lunr-languages/lunr.stemmer.support` is required.
	 *
	 * @param wordCharacters - Character ranges written as the inside of a regular
	 * expression character class; compiled with the `u` flag so `\u{...}` escapes work.
	 *
	 * @returns A function that calls `token.update` with the trimming callback and
	 * returns whatever `update` returns.
	 */
	function generateTrimmer(wordCharacters) {
	    const nonWordRun = "[^" + wordCharacters + "]+";
	    const leading = new RegExp("^" + nonWordRun, "u");
	    const trailing = new RegExp(nonWordRun + "$", "u");

	    return token => token.update(text => text.replace(leading, "").replace(trailing, ""));
	}


	/**
	 * Install Chinese (`zh`) language support on the given lunr object.
	 *
	 * Replaces `lunr.trimmerSupport.generateTrimmer`, defines the `lunr.zh` plugin
	 * function together with its `wordCharacters`, `trimmer` and `stopWordFilter`,
	 * and registers the two pipeline functions under the labels "trimmer-zh" and
	 * "stopWordFilter-zh".
	 *
	 * @param lunr - The lunr object to extend. It must already provide
	 * `trimmerSupport`, `generateStopWordFilter` and `Pipeline.registerFunction`.
	 * @param tokenizer - Optional tokenizer. When given, it is stored as
	 * `lunr.zh.tokenizer` and assigned to `this.tokenizer` whenever the plugin runs.
	 * It is optional so the function can be called with the lunr object alone.
	 */
	function lunrLanguageZh(lunr, tokenizer?) {
	    lunr.trimmerSupport.generateTrimmer = generateTrimmer;
	    // Plugin function: resets the pipeline of the object it is invoked on and
	    // installs the zh trimmer and stop word filter.
	    lunr.zh = function () {
	        this.pipeline.reset();
	        this.pipeline.add(lunr.zh.trimmer, lunr.zh.stopWordFilter);
	        if (tokenizer) {
	            this.tokenizer = tokenizer;
	        }
	    };
	    if (tokenizer) {
	        lunr.zh.tokenizer = tokenizer;
	    }
	    // Character ranges treated as word characters by the zh trimmer.
	    // https://zhuanlan.zhihu.com/p/33335629
	    // https://mothereff.in/regexpu#input=const+regex+%3D+/%5Cp%7BUnified_Ideograph%7D/u%3B&unicodePropertyEscape=1
	    lunr.zh.wordCharacters =
	        "\\u3400-\\u4DBF\\u4E00-\\u9FFC\\uFA0E\\uFA0F\\uFA11\\uFA13\\uFA14\\uFA1F\\uFA21\\uFA23\\uFA24\\uFA27-\\uFA29\\u{20000}-\\u{2A6DD}\\u{2A700}-\\u{2B734}\\u{2B740}-\\u{2B81D}\\u{2B820}-\\u{2CEA1}\\u{2CEB0}-\\u{2EBE0}\\u{30000}-\\u{3134A}";
	    lunr.zh.trimmer = lunr.trimmerSupport.generateTrimmer(lunr.zh.wordCharacters);
	    lunr.Pipeline.registerFunction(lunr.zh.trimmer, "trimmer-zh");
	    /* lunr stop word filter. see https://www.ranks.nl/stopwords/chinese-stopwords */
	    // The stop words must be separated by single spaces: the string is split
	    // on " " and each resulting entry is treated as one stop word.
	    lunr.zh.stopWordFilter = lunr.generateStopWordFilter("的 一 不 在 人 有 是 为 以 于 上 他 而 后 之 来 及 了 因 下 可 到 由 这 与 也 此 但 并 个 其 已 无 小 我 们 起 最 再 今 去 好 只 又 或 很 亦 某 把 那 你 乃 它 吧 被 比 别 趁 当 从 到 得 打 凡 儿 尔 该 各 给 跟 和 何 还 即 几 既 看 据 距 靠 啦 了 另 么 每 们 嘛 拿 哪 那 您 凭 且 却 让 仍 啥 如 若 使 谁 虽 随 同 所 她 哇 嗡 往 哪 些 向 沿 哟 用 于 咱 则 怎 曾 至 致 着 诸 自".split(" "));
	    lunr.Pipeline.registerFunction(lunr.zh.stopWordFilter, "stopWordFilter-zh");
	}