| // Porter stemmer in Javascript. Few comments, but it's easy to follow against the rules in the original |
| // paper, in |
| // |
| // Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, |
| // no. 3, pp 130-137, |
| // |
| // see also http://www.tartarus.org/~martin/PorterStemmer |
| |
| // Release 1 |
| // Derived from (http://tartarus.org/~martin/PorterStemmer/js.txt) - cjm (iizuu) Aug 24, 2009 |
| |
/**
 * Stem a single English word using the Porter suffix-stripping steps
 * (1a, 1b, 1c, 2, 3, 4, 5) followed by a small set of hand-maintained
 * overrides for exceptional and over-stemmed words.
 *
 * The input is used as given: no lower-casing or trimming is performed, and
 * the suffix rules only recognise lowercase ASCII letters.
 *
 * @param {string} w - The word to stem.
 * @returns {string} The stem. Words shorter than three characters are
 *     returned unchanged.
 */
var stemmer = (function () {
  // Borrowed so that table lookups only ever see the tables' own keys
  // (a bare `table[word]` would also match inherited names such as
  // "constructor").
  var hasOwn = Object.prototype.hasOwnProperty;

  // Step 2: suffix -> replacement, applied when the remaining stem has m > 0.
  var step2list = {
    "ational" : "ate",
    "tional" : "tion",
    "enci" : "ence",
    "anci" : "ance",
    "izer" : "ize",
    "bli" : "ble",
    "alli" : "al",
    "entli" : "ent",
    "eli" : "e",
    "ousli" : "ous",
    "ization" : "ize",
    "ation" : "ate",
    "ator" : "ate",
    "alism" : "al",
    "iveness" : "ive",
    "fulness" : "ful",
    "ousness" : "ous",
    "aliti" : "al",
    "iviti" : "ive",
    "biliti" : "ble",
    "logi" : "log"
  };

  // Step 3: suffix -> replacement, applied when the remaining stem has m > 0.
  var step3list = {
    "icate" : "ic",
    "ative" : "",
    "alize" : "al",
    "iciti" : "ic",
    "ical" : "ic",
    "ful" : "",
    "ness" : ""
  };

  // Whole words whose stem is fixed outright.
  // See http://snowball.tartarus.org/algorithms/english/stemmer.html
  // "Exceptional forms in general"
  var specialWords = {
    "skis" : "ski",
    "skies" : "sky",
    "dying" : "die",
    "lying" : "lie",
    "tying" : "tie",
    "idly" : "idl",
    "gently" : "gentl",
    "ugly" : "ugli",
    "early": "earli",
    "only": "onli",
    "singly": "singl"
  };

  // Whole words that are returned exactly as they were passed in.
  var unchangedWords = {
    "sky" : true,
    "news" : true,
    "howe" : true,
    "atlas" : true,
    "cosmos" : true,
    "bias" : true,
    "andes" : true,
    "inning" : true,
    "outing" : true,
    "canning" : true,
    "herring" : true,
    "earring" : true,
    "proceed" : true,
    "exceed" : true,
    "succeed" : true
  };

  var c = "[^aeiou]";          // consonant
  var v = "[aeiouy]";          // vowel
  var C = c + "[^aeiouy]*";    // consonant sequence
  var V = v + "[aeiou]*";      // vowel sequence

  // Measure / shape tests. None of these use the g or y flag, so test() and
  // exec() keep no state between calls and the objects can be shared.
  var mgr0 = new RegExp("^(" + C + ")?" + V + C);                     // [C]VC... is m>0
  var meq1 = new RegExp("^(" + C + ")?" + V + C + "(" + V + ")?$");   // [C]VC[V] is m=1
  var mgr1 = new RegExp("^(" + C + ")?" + V + C + V + C);             // [C]VCVC... is m>1
  var s_v = new RegExp("^(" + C + ")?" + v);                          // vowel in stem
  var doubleConsonant = new RegExp("([^aeiouylsz])\\1$");             // doubled letter other than l, s, z
  var cvcEnding = new RegExp("^" + C + v + "[^aeiouwxy]$");           // C v c, final c not w, x or y
  var consonantThenY = new RegExp("^(.+" + c + ")y$");                // final y after a consonant

  return function (w) {
    var origword = w;
    var firstch;
    var fp;
    var stem;
    var suffix;

    if (w.length < 3) { return w; }

    // An initial "y" is upper-cased for the duration of the steps so that the
    // vowel class above (lowercase only) does not match it.
    firstch = w.substr(0, 1);
    if (firstch === "y") {
      w = firstch.toUpperCase() + w.substr(1);
    }

    // Step 1a
    if (/^(.+?)(ss|i)es$/.test(w)) {
      w = w.replace(/^(.+?)(ss|i)es$/, "$1$2");
    } else if (/^(.+?)([^s])s$/.test(w)) {
      w = w.replace(/^(.+?)([^s])s$/, "$1$2");
    }

    // Step 1b
    fp = /^(.+?)eed$/.exec(w);
    if (fp) {
      // "eed" -> "ee" when the stem has m > 0.
      if (mgr0.test(fp[1])) {
        w = w.slice(0, -1);
      }
    } else {
      fp = /^(.+?)(ed|ing)$/.exec(w);
      if (fp && s_v.test(fp[1])) {
        w = fp[1];
        if (/(at|bl|iz)$/.test(w)) {
          w = w + "e";
        } else if (doubleConsonant.test(w)) {
          w = w.slice(0, -1);
        } else if (cvcEnding.test(w)) {
          w = w + "e";
        }
      }
    }

    // Step 1c
    fp = consonantThenY.exec(w);
    if (fp) {
      w = fp[1] + "i";
    }

    // Step 2
    fp = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/.exec(w);
    if (fp) {
      stem = fp[1];
      suffix = fp[2];
      if (mgr0.test(stem)) {
        w = stem + step2list[suffix];
      }
    }

    // Step 3
    fp = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/.exec(w);
    if (fp) {
      stem = fp[1];
      suffix = fp[2];
      if (mgr0.test(stem)) {
        w = stem + step3list[suffix];
      }
    }

    // Step 4
    // The "(s|t)ion" rule is only consulted when no suffix from the first
    // list is present at all, not when one is present but fails the m > 1 test.
    fp = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/.exec(w);
    if (fp) {
      stem = fp[1];
      if (mgr1.test(stem)) {
        w = stem;
      }
    } else {
      fp = /^(.+?)(s|t)(ion)$/.exec(w);
      if (fp) {
        stem = fp[1] + fp[2];
        if (mgr1.test(stem)) {
          w = stem;
        }
      }
    }

    // Step 5
    fp = /^(.+?)e$/.exec(w);
    if (fp) {
      stem = fp[1];
      if (mgr1.test(stem) || (meq1.test(stem) && !cvcEnding.test(stem))) {
        w = stem;
      }
    }

    if (/ll$/.test(w) && mgr1.test(w)) {
      w = w.slice(0, -1);
    }

    // and turn initial Y back to y
    if (firstch === "y") {
      w = firstch.toLowerCase() + w.substr(1);
    }

    // Whole-word overrides. Both tables are matched against the complete
    // original word, using own keys only.
    if (hasOwn.call(specialWords, origword)) {
      w = specialWords[origword];
    }
    if (hasOwn.call(unchangedWords, origword)) {
      w = origword;
    }

    // Address words overstemmed as gener-
    if (/.*generate?s?d?(ing)?$/.test(origword)) {
      w = w + 'at';
    }
    if (/.*general(ly)?$/.test(origword)) {
      w = w + 'al';
    }
    if (/.*generic(ally)?$/.test(origword)) {
      w = w + 'ic';
    }
    if (/.*generous(ly)?$/.test(origword)) {
      w = w + 'ous';
    }

    // Address words overstemmed as commun-
    // Anchored at the end like the gener- patterns above, so that longer
    // words that merely contain "communit" are not given an "iti" suffix.
    if (/.*communit(ies)?y?$/.test(origword)) {
      w = w + 'iti';
    }

    return w;
  };
})();