blob: 29fe8d7bcd4981f9dc25e682a5bbf9fff0a951ec [file] [log] [blame]
import lunr from "lunr";
import { SmartQuery, SmartTerm } from "../../shared/interfaces";
import { smartTerms } from "./smartTerms";
import { language, removeDefaultStopWordFilter } from "./proxiedGenerated";
/**
* Get all possible queries for a list of tokens consists of words mixed English and Chinese,
* by a Chinese words dictionary.
*
* @param tokens - Tokens consists of English words or strings of consecutive Chinese words.
* @param zhDictionary - A Chinese words dictionary.
*
* @returns A smart query list.
*/
export function smartQueries(
tokens: string[],
zhDictionary: string[]
): SmartQuery[] {
const terms = smartTerms(tokens, zhDictionary);
if (terms.length === 0) {
// There are no matched terms.
// All tokens are considered required and with wildcard.
return [
{
tokens,
term: tokens.map((value) => ({
value,
presence: lunr.Query.presence.REQUIRED,
wildcard: lunr.Query.wildcard.LEADING | lunr.Query.wildcard.TRAILING,
})),
},
];
}
// The last token of a term maybe incomplete while user is typing.
for (const term of terms) {
term[term.length - 1].maybeTyping = true;
}
// Try to append terms without stop words,
// since they are removed in the index.
const stopWordPipelines: lunr.PipelineFunction[] = [];
for (const lang of language) {
if (lang === "en") {
if (!removeDefaultStopWordFilter) {
stopWordPipelines.unshift(lunr.stopWordFilter);
}
} else {
const lunrLang = (lunr as any)[lang] as typeof lunr;
if (lunrLang.stopWordFilter) {
stopWordPipelines.unshift(lunrLang.stopWordFilter);
}
}
}
let refinedTerms: SmartTerm[];
if (stopWordPipelines.length > 0) {
const pipe = (term: SmartTerm) =>
stopWordPipelines.reduce(
(term, p) =>
term.filter((item) =>
(p as unknown as (str: string) => string | undefined)(item.value)
),
term
);
refinedTerms = [];
const newTerms: SmartTerm[] = [];
for (const term of terms) {
const filteredTerm = pipe(term);
refinedTerms.push(filteredTerm);
// Add extra terms only if some stop words are removed,
// and some non-stop-words exist too.
if (filteredTerm.length < term.length && filteredTerm.length > 0) {
newTerms.push(filteredTerm);
}
}
terms.push(...newTerms);
} else {
refinedTerms = terms.slice();
}
// Also try to add extra terms which miss one of the searched tokens,
// when the term contains 3 or more tokens,
// to improve the search precision.
const extraTerms: SmartTerm[] = [];
for (const term of refinedTerms) {
if (term.length > 2) {
for (let i = term.length - 1; i >= 0; i -= 1) {
extraTerms.push(term.slice(0, i).concat(term.slice(i + 1)));
}
}
}
return getQueriesMaybeTyping(terms).concat(getQueriesMaybeTyping(extraTerms));
}
function getQueriesMaybeTyping(terms: SmartTerm[]): SmartQuery[] {
return termsToQueries(terms).concat(
termsToQueries(
// Ignore terms whose last token already has a trailing wildcard,
// or the last token is not `maybeTyping`.
terms.filter((term) => {
const token = term[term.length - 1];
return !token.trailing && token.maybeTyping;
}),
true
)
);
}
function termsToQueries(
terms: SmartTerm[],
maybeTyping?: boolean
): SmartQuery[] {
return terms.map((term) => ({
tokens: term.map((item) => item.value),
term: term.map((item) => ({
value: item.value,
presence: lunr.Query.presence.REQUIRED,
// The last token of a term maybe incomplete while user is typing.
// So append more queries with trailing wildcard added.
wildcard: (
maybeTyping ? item.trailing || item.maybeTyping : item.trailing
)
? lunr.Query.wildcard.TRAILING
: lunr.Query.wildcard.NONE,
})),
}));
}