// Source blob: ec540304681bef72ca2fa7a608bbc460a722f457
import lunr from 'lunr';
import { SmartQuery, SmartTerm } from '../../shared/interfaces';
import { smartTerms } from './smartTerms';
import { language, removeDefaultStopWordFilter } from './proxiedGenerated';
/**
 * Whether `installLunrPatchesOnce` has already patched the shared `lunr` object.
 */
let lunrPatchesInstalled = false;

/**
 * Patches the shared `lunr` object with the stop word filter generator, the
 * English stop word filter and the Chinese (`zh`) language support that
 * `smartQueries` relies on.
 *
 * Re-applying the patches on every query would rebuild the filters and
 * re-register the `zh` pipeline functions each time (lunr's
 * `Pipeline.registerFunction` logs a warning whenever a label is overwritten),
 * so they are installed only on the first call. The flag is set last, so a
 * failed installation is retried on the next call.
 */
function installLunrPatchesOnce(): void {
  if (lunrPatchesInstalled) {
    return;
  }

  // `lunr` is patched dynamically here, which its static typings do not allow.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const patchableLunr = lunr as any;

  patchableLunr.generateStopWordFilter = function (stopWords: string[]) {
    const words = stopWords.reduce<Record<string, string>>((memo, stopWord) => {
      memo[stopWord] = stopWord;
      return memo;
    }, {});
    // Works for plain strings as well as token objects: only `toString()` is used.
    return function (token: { toString(): string } | null | undefined) {
      if (token && words[token.toString()] !== token.toString()) return token;
      return undefined;
    };
  };

  patchableLunr.stopWordFilter = patchableLunr.generateStopWordFilter([
    'a', 'able', 'about', 'across', 'after', 'all', 'almost', 'also', 'am', 'among',
    'an', 'and', 'any', 'are', 'as', 'at', 'be', 'because', 'been', 'but',
    'by', 'can', 'cannot', 'could', 'dear', 'did', 'do', 'does', 'either', 'else',
    'ever', 'every', 'for', 'from', 'get', 'got', 'had', 'has', 'have', 'he',
    'her', 'hers', 'him', 'his', 'how', 'however', 'i', 'if', 'in', 'into',
    'is', 'it', 'its', 'just', 'least', 'let', 'like', 'likely', 'may', 'me',
    'might', 'most', 'must', 'my', 'neither', 'no', 'nor', 'not', 'of', 'off',
    'often', 'on', 'only', 'or', 'other', 'our', 'own', 'rather', 'said', 'say',
    'says', 'she', 'should', 'since', 'so', 'some', 'than', 'that', 'the', 'their',
    'them', 'then', 'there', 'these', 'they', 'this', 'tis', 'to', 'too', 'twas',
    'us', 'wants', 'was', 'we', 'were', 'what', 'when', 'where', 'which', 'while',
    'who', 'whom', 'why', 'will', 'with', 'would', 'yet', 'you', 'your',
  ]);

  // No custom tokenizer is supplied; only the trimmer and stop word filter are needed.
  lunrLanguageZh(lunr, undefined);

  lunrPatchesInstalled = true;
}

/**
 * Get all possible queries for a list of tokens consisting of mixed English and
 * Chinese words, by a Chinese words dictionary.
 *
 * Besides the terms returned by `smartTerms`, extra terms are appended with
 * stop words removed, and with one token left out for terms of 3 or more tokens.
 *
 * @param tokens - Tokens consisting of English words or strings of consecutive Chinese words.
 * @param zhDictionary - A Chinese words dictionary.
 *
 * @returns A smart query list.
 */
export function smartQueries(tokens: string[], zhDictionary: string[]): SmartQuery[] {
  const terms = smartTerms(tokens, zhDictionary);

  if (terms.length === 0) {
    // There are no matched terms.
    // All tokens are considered required and with wildcard.
    return [
      {
        tokens,
        term: tokens.map(value => ({
          value,
          presence: lunr.Query.presence.REQUIRED,
          wildcard: lunr.Query.wildcard.LEADING | lunr.Query.wildcard.TRAILING,
        })),
      },
    ];
  }

  // The last token of a term may be incomplete while the user is typing.
  for (const term of terms) {
    term[term.length - 1].maybeTyping = true;
  }

  // Must run before the stop word filters are collected below.
  installLunrPatchesOnce();

  // Try to append terms without stop words,
  // since they are removed in the index.
  const stopWordPipelines: lunr.PipelineFunction[] = [];
  for (const lang of language) {
    if (lang === 'en') {
      if (!removeDefaultStopWordFilter) {
        stopWordPipelines.unshift(lunr.stopWordFilter);
      }
    } else {
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      const lunrLang = (lunr as any)[lang] as typeof lunr;
      if (lunrLang.stopWordFilter) {
        stopWordPipelines.unshift(lunrLang.stopWordFilter);
      }
    }
  }

  let refinedTerms: SmartTerm[];
  if (stopWordPipelines.length > 0) {
    // Runs a term through every stop word filter, keeping only the tokens
    // whose value is passed through (truthy result) by all of them.
    const pipe = (term: SmartTerm) =>
      stopWordPipelines.reduce(
        (remaining, filter) =>
          remaining.filter(item =>
            (filter as unknown as (str: string) => string | undefined)(item.value),
          ),
        term,
      );

    refinedTerms = [];
    const newTerms: SmartTerm[] = [];
    for (const term of terms) {
      const filteredTerm = pipe(term);
      refinedTerms.push(filteredTerm);
      // Add extra terms only if some stop words are removed,
      // and some non-stop-words exist too.
      if (filteredTerm.length < term.length && filteredTerm.length > 0) {
        newTerms.push(filteredTerm);
      }
    }
    terms.push(...newTerms);
  } else {
    refinedTerms = terms.slice();
  }

  // Also try to add extra terms which miss one of the searched tokens,
  // when the term contains 3 or more tokens,
  // to improve the search precision.
  const extraTerms: SmartTerm[] = [];
  for (const term of refinedTerms) {
    if (term.length > 2) {
      // Drop one token at a time, starting from the last one.
      for (let i = term.length - 1; i >= 0; i -= 1) {
        extraTerms.push(term.slice(0, i).concat(term.slice(i + 1)));
      }
    }
  }

  return getQueriesMaybeTyping(terms).concat(getQueriesMaybeTyping(extraTerms));
}
/**
 * Converts terms to queries, followed by extra "still typing" queries.
 *
 * The extra queries are built only from terms whose last token is flagged
 * `maybeTyping` and does not already carry a trailing wildcard.
 *
 * @param terms - The terms to convert.
 *
 * @returns The plain queries first, then the "still typing" variants.
 */
function getQueriesMaybeTyping(terms: SmartTerm[]): SmartQuery[] {
  const plainQueries = termsToQueries(terms);

  const typingCandidates: SmartTerm[] = [];
  for (const term of terms) {
    const lastToken = term[term.length - 1];
    // Skip terms whose last token already has a trailing wildcard,
    // or whose last token is not `maybeTyping`.
    if (lastToken.trailing || !lastToken.maybeTyping) {
      continue;
    }
    typingCandidates.push(term);
  }

  const typingQueries = termsToQueries(typingCandidates, true);
  return plainQueries.concat(typingQueries);
}
/**
 * Converts each term into a query in which every token is required.
 *
 * A token gets a trailing wildcard when it is flagged `trailing`, or, when
 * `maybeTyping` is set, when it is flagged `maybeTyping`; otherwise it gets
 * no wildcard.
 *
 * @param terms - The terms to convert.
 * @param maybeTyping - Whether tokens flagged `maybeTyping` also get a trailing wildcard.
 *
 * @returns One query per term, in the same order.
 */
function termsToQueries(terms: SmartTerm[], maybeTyping?: boolean): SmartQuery[] {
  return terms.map(term => {
    const tokens = term.map(({ value }) => value);

    const clauses = term.map(item => {
      // The last token of a term may be incomplete while the user is typing,
      // so more queries are appended with a trailing wildcard added.
      const wantsTrailingWildcard = item.trailing || (maybeTyping && item.maybeTyping);

      let wildcard = lunr.Query.wildcard.NONE;
      if (wantsTrailingWildcard) {
        wildcard = lunr.Query.wildcard.TRAILING;
      }

      return {
        value: item.value,
        presence: lunr.Query.presence.REQUIRED,
        wildcard,
      };
    });

    return { tokens, term: clauses };
  });
}
// `lunr-languages/lunr.stemmer.support` must be loaded first: it provides the `lunr.trimmerSupport` object that is patched below.
/**
 * Creates a trimmer that strips, from both ends of a token, every character
 * that is not listed in `wordCharacters`.
 *
 * The regular expressions use the `u` flag, so `wordCharacters` may contain
 * `\u{...}` code point escapes.
 *
 * @param wordCharacters - The body of a regular expression character class
 * listing the characters to keep.
 *
 * @returns A function which trims a token through its `update` method and
 * returns whatever `update` returns.
 */
function generateTrimmer(wordCharacters) {
  const leadingNonWord = new RegExp(`^[^${wordCharacters}]+`, "u");
  const trailingNonWord = new RegExp(`[^${wordCharacters}]+$`, "u");

  // Strip the leading run first, then the trailing run.
  const trim = str => str.replace(leadingNonWord, "").replace(trailingNonWord, "");

  return function (token) {
    return token.update(trim);
  };
}
/**
 * Installs Chinese (`zh`) language support on the given lunr object.
 *
 * It replaces `lunr.trimmerSupport.generateTrimmer` with the `generateTrimmer`
 * of this file, and defines `lunr.zh` together with its `wordCharacters`,
 * `trimmer` and `stopWordFilter`. The trimmer and the stop word filter are
 * registered as pipeline functions under the labels `trimmer-zh` and
 * `stopWordFilter-zh`.
 *
 * `lunr.zh` itself resets the pipeline of its `this` value and adds the `zh`
 * trimmer and stop word filter to it (presumably it is applied to a lunr
 * builder via `use` — confirm against the callers).
 *
 * @param lunr - The lunr object to patch; it must already have `trimmerSupport`.
 * @param tokenizer - Optional tokenizer. When given, it is stored as
 * `lunr.zh.tokenizer` and is assigned to `this.tokenizer` whenever `lunr.zh` runs.
 */
// `lunr` and `this` are patched dynamically, so they are deliberately typed as `any`.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
function lunrLanguageZh(lunr: any, tokenizer?: unknown): void {
  lunr.trimmerSupport.generateTrimmer = generateTrimmer;

  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  lunr.zh = function (this: any): void {
    this.pipeline.reset();
    this.pipeline.add(lunr.zh.trimmer, lunr.zh.stopWordFilter);
    if (tokenizer) {
      this.tokenizer = tokenizer;
    }
  };

  if (tokenizer) {
    lunr.zh.tokenizer = tokenizer;
  }

  // Character ranges taken from the `Unified_Ideograph` Unicode property, see:
  // https://zhuanlan.zhihu.com/p/33335629
  // https://mothereff.in/regexpu#input=const+regex+%3D+/%5Cp%7BUnified_Ideograph%7D/u%3B&unicodePropertyEscape=1
  lunr.zh.wordCharacters =
    "\\u3400-\\u4DBF\\u4E00-\\u9FFC\\uFA0E\\uFA0F\\uFA11\\uFA13\\uFA14\\uFA1F\\uFA21\\uFA23\\uFA24\\uFA27-\\uFA29\\u{20000}-\\u{2A6DD}\\u{2A700}-\\u{2B734}\\u{2B740}-\\u{2B81D}\\u{2B820}-\\u{2CEA1}\\u{2CEB0}-\\u{2EBE0}\\u{30000}-\\u{3134A}";
  lunr.zh.trimmer = lunr.trimmerSupport.generateTrimmer(lunr.zh.wordCharacters);
  lunr.Pipeline.registerFunction(lunr.zh.trimmer, "trimmer-zh");

  /* lunr stop word filter. see https://www.ranks.nl/stopwords/chinese-stopwords */
  lunr.zh.stopWordFilter = lunr.generateStopWordFilter("的 一 不 在 人 有 是 为 以 于 上 他 而 后 之 来 及 了 因 下 可 到 由 这 与 也 此 但 并 个 其 已 无 小 我 们 起 最 再 今 去 好 只 又 或 很 亦 某 把 那 你 乃 它 吧 被 比 别 趁 当 从 到 得 打 凡 儿 尔 该 各 给 跟 和 何 还 即 几 既 看 据 距 靠 啦 了 另 么 每 们 嘛 拿 哪 那 您 凭 且 却 让 仍 啥 如 若 使 谁 虽 随 同 所 她 哇 嗡 往 哪 些 向 沿 哟 用 于 咱 则 怎 曾 至 致 着 诸 自".split(" "));
  lunr.Pipeline.registerFunction(lunr.zh.stopWordFilter, "stopWordFilter-zh");
}