| """English search language: includes the JS porter stemmer.""" |
| |
| from __future__ import annotations |
| |
| from typing import TYPE_CHECKING, Dict |
| |
| import snowballstemmer |
| |
| from sphinx.search import SearchLanguage |
| |
| english_stopwords = set(""" |
| a and are as at |
| be but by |
| for |
| if in into is it |
| near no not |
| of on or |
| such |
| that the their then there these they this to |
| was will with |
| """.split()) |
| |
| js_porter_stemmer = """ |
| /** |
| * Porter Stemmer |
| */ |
| var Stemmer = function() { |
| |
| var step2list = { |
| ational: 'ate', |
| tional: 'tion', |
| enci: 'ence', |
| anci: 'ance', |
| izer: 'ize', |
| bli: 'ble', |
| alli: 'al', |
| entli: 'ent', |
| eli: 'e', |
| ousli: 'ous', |
| ization: 'ize', |
| ation: 'ate', |
| ator: 'ate', |
| alism: 'al', |
| iveness: 'ive', |
| fulness: 'ful', |
| ousness: 'ous', |
| aliti: 'al', |
| iviti: 'ive', |
| biliti: 'ble', |
| logi: 'log' |
| }; |
| |
| var step3list = { |
| icate: 'ic', |
| ative: '', |
| alize: 'al', |
| iciti: 'ic', |
| ical: 'ic', |
| ful: '', |
| ness: '' |
| }; |
| |
| var c = "[^aeiou]"; // consonant |
| var v = "[aeiouy]"; // vowel |
| var C = c + "[^aeiouy]*"; // consonant sequence |
| var V = v + "[aeiou]*"; // vowel sequence |
| |
| var mgr0 = "^(" + C + ")?" + V + C; // [C]VC... is m>0 |
| var meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$"; // [C]VC[V] is m=1 |
| var mgr1 = "^(" + C + ")?" + V + C + V + C; // [C]VCVC... is m>1 |
| var s_v = "^(" + C + ")?" + v; // vowel in stem |
| |
| this.stemWord = function (w) { |
| var stem; |
| var suffix; |
| var firstch; |
| var origword = w; |
| |
| if (w.length < 3) |
| return w; |
| |
| var re; |
| var re2; |
| var re3; |
| var re4; |
| |
| firstch = w.substr(0,1); |
| if (firstch == "y") |
| w = firstch.toUpperCase() + w.substr(1); |
| |
| // Step 1a |
| re = /^(.+?)(ss|i)es$/; |
| re2 = /^(.+?)([^s])s$/; |
| |
| if (re.test(w)) |
| w = w.replace(re,"$1$2"); |
| else if (re2.test(w)) |
| w = w.replace(re2,"$1$2"); |
| |
| // Step 1b |
| re = /^(.+?)eed$/; |
| re2 = /^(.+?)(ed|ing)$/; |
| if (re.test(w)) { |
| var fp = re.exec(w); |
| re = new RegExp(mgr0); |
| if (re.test(fp[1])) { |
| re = /.$/; |
| w = w.replace(re,""); |
| } |
| } |
| else if (re2.test(w)) { |
| var fp = re2.exec(w); |
| stem = fp[1]; |
| re2 = new RegExp(s_v); |
| if (re2.test(stem)) { |
| w = stem; |
| re2 = /(at|bl|iz)$/; |
| re3 = new RegExp("([^aeiouylsz])\\\\1$"); |
| re4 = new RegExp("^" + C + v + "[^aeiouwxy]$"); |
| if (re2.test(w)) |
| w = w + "e"; |
| else if (re3.test(w)) { |
| re = /.$/; |
| w = w.replace(re,""); |
| } |
| else if (re4.test(w)) |
| w = w + "e"; |
| } |
| } |
| |
| // Step 1c |
| re = /^(.+?)y$/; |
| if (re.test(w)) { |
| var fp = re.exec(w); |
| stem = fp[1]; |
| re = new RegExp(s_v); |
| if (re.test(stem)) |
| w = stem + "i"; |
| } |
| |
| // Step 2 |
| re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|\ |
| ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/; |
| if (re.test(w)) { |
| var fp = re.exec(w); |
| stem = fp[1]; |
| suffix = fp[2]; |
| re = new RegExp(mgr0); |
| if (re.test(stem)) |
| w = stem + step2list[suffix]; |
| } |
| |
| // Step 3 |
| re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/; |
| if (re.test(w)) { |
| var fp = re.exec(w); |
| stem = fp[1]; |
| suffix = fp[2]; |
| re = new RegExp(mgr0); |
| if (re.test(stem)) |
| w = stem + step3list[suffix]; |
| } |
| |
| // Step 4 |
| re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|\ |
| iti|ous|ive|ize)$/; |
| re2 = /^(.+?)(s|t)(ion)$/; |
| if (re.test(w)) { |
| var fp = re.exec(w); |
| stem = fp[1]; |
| re = new RegExp(mgr1); |
| if (re.test(stem)) |
| w = stem; |
| } |
| else if (re2.test(w)) { |
| var fp = re2.exec(w); |
| stem = fp[1] + fp[2]; |
| re2 = new RegExp(mgr1); |
| if (re2.test(stem)) |
| w = stem; |
| } |
| |
| // Step 5 |
| re = /^(.+?)e$/; |
| if (re.test(w)) { |
| var fp = re.exec(w); |
| stem = fp[1]; |
| re = new RegExp(mgr1); |
| re2 = new RegExp(meq1); |
| re3 = new RegExp("^" + C + v + "[^aeiouwxy]$"); |
| if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) |
| w = stem; |
| } |
| re = /ll$/; |
| re2 = new RegExp(mgr1); |
| if (re.test(w) && re2.test(w)) { |
| re = /.$/; |
| w = w.replace(re,""); |
| } |
| |
| // and turn initial Y back to y |
| if (firstch == "y") |
| w = firstch.toLowerCase() + w.substr(1); |
| return w; |
| } |
| } |
| """ |
| |
| |
| class SearchEnglish(SearchLanguage): |
| lang = 'en' |
| language_name = 'English' |
| js_stemmer_code = js_porter_stemmer |
| stopwords = english_stopwords |
| |
| def init(self, options: dict) -> None: |
| self.stemmer = snowballstemmer.stemmer('porter') |
| |
| def stem(self, word: str) -> str: |
| return self.stemmer.stemWord(word.lower()) |