src/utils/smartQueries.ts - doris-website - Git at Google

 import lunr from "lunr";
 import { SmartQuery, SmartTerm } from "../../shared/interfaces";
 import { smartTerms } from "./smartTerms";
 import { language, removeDefaultStopWordFilter } from "./proxiedGenerated";

 /**
  * Get all possible queries for a list of tokens consists of words mixed English and Chinese,
  * by a Chinese words dictionary.
  *
  * @param tokens - Tokens consists of English words or strings of consecutive Chinese words.
  * @param zhDictionary - A Chinese words dictionary.
  *
  * @returns A smart query list.
  */
 export function smartQueries(
   tokens: string[],
   zhDictionary: string[]
 ): SmartQuery[] {
   const terms = smartTerms(tokens, zhDictionary);

   if (terms.length === 0) {
     // There are no matched terms.
     // All tokens are considered required and with wildcard.
     return [
       {
         tokens,
         term: tokens.map((value) => ({
           value,
           presence: lunr.Query.presence.REQUIRED,
           wildcard: lunr.Query.wildcard.LEADING | lunr.Query.wildcard.TRAILING,
         })),
       },
     ];
   }

   // The last token of a term maybe incomplete while user is typing.
   for (const term of terms) {
     term[term.length - 1].maybeTyping = true;
   }

   // Try to append terms without stop words,
   // since they are removed in the index.
   const stopWordPipelines: lunr.PipelineFunction[] = [];
   for (const lang of language) {
     if (lang === "en") {
       if (!removeDefaultStopWordFilter) {
         stopWordPipelines.unshift(lunr.stopWordFilter);
       }
     } else {
       const lunrLang = (lunr as any)[lang] as typeof lunr;
       if (lunrLang.stopWordFilter) {
         stopWordPipelines.unshift(lunrLang.stopWordFilter);
       }
     }
   }

   let refinedTerms: SmartTerm[];

   if (stopWordPipelines.length > 0) {
     const pipe = (term: SmartTerm) =>
       stopWordPipelines.reduce(
         (term, p) =>
           term.filter((item) =>
             (p as unknown as (str: string) => string | undefined)(item.value)
           ),
         term
       );
     refinedTerms = [];
     const newTerms: SmartTerm[] = [];
     for (const term of terms) {
       const filteredTerm = pipe(term);
       refinedTerms.push(filteredTerm);
       // Add extra terms only if some stop words are removed,
       // and some non-stop-words exist too.
       if (filteredTerm.length < term.length && filteredTerm.length > 0) {
         newTerms.push(filteredTerm);
       }
     }
     terms.push(...newTerms);
   } else {
     refinedTerms = terms.slice();
   }

   // Also try to add extra terms which miss one of the searched tokens,
   // when the term contains 3 or more tokens,
   // to improve the search precision.
   const extraTerms: SmartTerm[] = [];
   for (const term of refinedTerms) {
     if (term.length > 2) {
       for (let i = term.length - 1; i >= 0; i -= 1) {
         extraTerms.push(term.slice(0, i).concat(term.slice(i + 1)));
       }
     }
   }

   return getQueriesMaybeTyping(terms).concat(getQueriesMaybeTyping(extraTerms));
 }

 function getQueriesMaybeTyping(terms: SmartTerm[]): SmartQuery[] {
   return termsToQueries(terms).concat(
     termsToQueries(
       // Ignore terms whose last token already has a trailing wildcard,
       // or the last token is not `maybeTyping`.
       terms.filter((term) => {
         const token = term[term.length - 1];
         return !token.trailing && token.maybeTyping;
       }),
       true
     )
   );
 }

 function termsToQueries(
   terms: SmartTerm[],
   maybeTyping?: boolean
 ): SmartQuery[] {
   return terms.map((term) => ({
     tokens: term.map((item) => item.value),
     term: term.map((item) => ({
       value: item.value,
       presence: lunr.Query.presence.REQUIRED,
       // The last token of a term maybe incomplete while user is typing.
       // So append more queries with trailing wildcard added.
       wildcard: (
         maybeTyping ? item.trailing || item.maybeTyping : item.trailing
       )
         ? lunr.Query.wildcard.TRAILING
         : lunr.Query.wildcard.NONE,
     })),
   }));
 }
	import lunr from "lunr";
	import { SmartQuery, SmartTerm } from "../../shared/interfaces";
	import { smartTerms } from "./smartTerms";
	import { language, removeDefaultStopWordFilter } from "./proxiedGenerated";

	/**
	* Get all possible queries for a list of tokens consists of words mixed English and Chinese,
	* by a Chinese words dictionary.
	*
	* @param tokens - Tokens consists of English words or strings of consecutive Chinese words.
	* @param zhDictionary - A Chinese words dictionary.
	*
	* @returns A smart query list.
	*/
	export function smartQueries(
	tokens: string[],
	zhDictionary: string[]
	): SmartQuery[] {
	const terms = smartTerms(tokens, zhDictionary);

	if (terms.length === 0) {
	// There are no matched terms.
	// All tokens are considered required and with wildcard.
	return [
	{
	tokens,
	term: tokens.map((value) => ({
	value,
	presence: lunr.Query.presence.REQUIRED,
	wildcard: lunr.Query.wildcard.LEADING \| lunr.Query.wildcard.TRAILING,
	})),
	},
	];
	}

	// The last token of a term maybe incomplete while user is typing.
	for (const term of terms) {
	term[term.length - 1].maybeTyping = true;
	}

	// Try to append terms without stop words,
	// since they are removed in the index.
	const stopWordPipelines: lunr.PipelineFunction[] = [];
	for (const lang of language) {
	if (lang === "en") {
	if (!removeDefaultStopWordFilter) {
	stopWordPipelines.unshift(lunr.stopWordFilter);
	}
	} else {
	const lunrLang = (lunr as any)[lang] as typeof lunr;
	if (lunrLang.stopWordFilter) {
	stopWordPipelines.unshift(lunrLang.stopWordFilter);
	}
	}
	}

	let refinedTerms: SmartTerm[];

	if (stopWordPipelines.length > 0) {
	const pipe = (term: SmartTerm) =>
	stopWordPipelines.reduce(
	(term, p) =>
	term.filter((item) =>
	(p as unknown as (str: string) => string \| undefined)(item.value)
	),
	term
	);
	refinedTerms = [];
	const newTerms: SmartTerm[] = [];
	for (const term of terms) {
	const filteredTerm = pipe(term);
	refinedTerms.push(filteredTerm);
	// Add extra terms only if some stop words are removed,
	// and some non-stop-words exist too.
	if (filteredTerm.length < term.length && filteredTerm.length > 0) {
	newTerms.push(filteredTerm);
	}
	}
	terms.push(...newTerms);
	} else {
	refinedTerms = terms.slice();
	}

	// Also try to add extra terms which miss one of the searched tokens,
	// when the term contains 3 or more tokens,
	// to improve the search precision.
	const extraTerms: SmartTerm[] = [];
	for (const term of refinedTerms) {
	if (term.length > 2) {
	for (let i = term.length - 1; i >= 0; i -= 1) {
	extraTerms.push(term.slice(0, i).concat(term.slice(i + 1)));
	}
	}
	}

	return getQueriesMaybeTyping(terms).concat(getQueriesMaybeTyping(extraTerms));
	}

	function getQueriesMaybeTyping(terms: SmartTerm[]): SmartQuery[] {
	return termsToQueries(terms).concat(
	termsToQueries(
	// Ignore terms whose last token already has a trailing wildcard,
	// or the last token is not `maybeTyping`.
	terms.filter((term) => {
	const token = term[term.length - 1];
	return !token.trailing && token.maybeTyping;
	}),
	true
	)
	);
	}

	function termsToQueries(
	terms: SmartTerm[],
	maybeTyping?: boolean
	): SmartQuery[] {
	return terms.map((term) => ({
	tokens: term.map((item) => item.value),
	term: term.map((item) => ({
	value: item.value,
	presence: lunr.Query.presence.REQUIRED,
	// The last token of a term maybe incomplete while user is typing.
	// So append more queries with trailing wildcard added.
	wildcard: (
	maybeTyping ? item.trailing \|\| item.maybeTyping : item.trailing
	)
	? lunr.Query.wildcard.TRAILING
	: lunr.Query.wildcard.NONE,
	})),
	}));
	}