| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.lv; |
| |
| import static org.apache.lucene.analysis.util.StemmerUtil.*; |
| |
| /** |
| * Light stemmer for Latvian. |
| * |
| * <p>This is a light version of the algorithm in Karlis Kreslin's PhD thesis <i>A stemming |
| * algorithm for Latvian</i> with the following modifications: |
| * |
| * <ul> |
| * <li>Only explicitly stems noun and adjective morphology |
| * <li>Stricter length/vowel checks for the resulting stems (verb etc suffix stripping is removed) |
| * <li>Removes only the primary inflectional suffixes: case and number for nouns ; case, number, |
| * gender, and definitiveness for adjectives. |
| * <li>Palatalization is only handled when a declension II,V,VI noun suffix is removed. |
| * </ul> |
| */ |
| public class LatvianStemmer { |
| /** Stem a latvian word. returns the new adjusted length. */ |
| public int stem(char s[], int len) { |
| int numVowels = numVowels(s, len); |
| |
| for (int i = 0; i < affixes.length; i++) { |
| Affix affix = affixes[i]; |
| if (numVowels > affix.vc && len >= affix.affix.length + 3 && endsWith(s, len, affix.affix)) { |
| len -= affix.affix.length; |
| return affix.palatalizes ? unpalatalize(s, len) : len; |
| } |
| } |
| |
| return len; |
| } |
| |
| static final Affix affixes[] = { |
| new Affix("ajiem", 3, false), new Affix("ajai", 3, false), |
| new Affix("ajam", 2, false), new Affix("ajām", 2, false), |
| new Affix("ajos", 2, false), new Affix("ajās", 2, false), |
| new Affix("iem", 2, true), new Affix("ajā", 2, false), |
| new Affix("ais", 2, false), new Affix("ai", 2, false), |
| new Affix("ei", 2, false), new Affix("ām", 1, false), |
| new Affix("am", 1, false), new Affix("ēm", 1, false), |
| new Affix("īm", 1, false), new Affix("im", 1, false), |
| new Affix("um", 1, false), new Affix("us", 1, true), |
| new Affix("as", 1, false), new Affix("ās", 1, false), |
| new Affix("es", 1, false), new Affix("os", 1, true), |
| new Affix("ij", 1, false), new Affix("īs", 1, false), |
| new Affix("ēs", 1, false), new Affix("is", 1, false), |
| new Affix("ie", 1, false), new Affix("u", 1, true), |
| new Affix("a", 1, true), new Affix("i", 1, true), |
| new Affix("e", 1, false), new Affix("ā", 1, false), |
| new Affix("ē", 1, false), new Affix("ī", 1, false), |
| new Affix("ū", 1, false), new Affix("o", 1, false), |
| new Affix("s", 0, false), new Affix("š", 0, false), |
| }; |
| |
| static class Affix { |
| char affix[]; // suffix |
| int vc; // vowel count of the suffix |
| boolean palatalizes; // true if we should fire palatalization rules. |
| |
| Affix(String affix, int vc, boolean palatalizes) { |
| this.affix = affix.toCharArray(); |
| this.vc = vc; |
| this.palatalizes = palatalizes; |
| } |
| } |
| |
| /** |
| * Most cases are handled except for the ambiguous ones: |
| * |
| * <ul> |
| * <li>s -> š |
| * <li>t -> š |
| * <li>d -> ž |
| * <li>z -> ž |
| * </ul> |
| */ |
| private int unpalatalize(char s[], int len) { |
| // we check the character removed: if it's -u then |
| // it's 2,5, or 6 gen pl., and these two can only apply then. |
| if (s[len] == 'u') { |
| // kš -> kst |
| if (endsWith(s, len, "kš")) { |
| len++; |
| s[len - 2] = 's'; |
| s[len - 1] = 't'; |
| return len; |
| } |
| // ņņ -> nn |
| if (endsWith(s, len, "ņņ")) { |
| s[len - 2] = 'n'; |
| s[len - 1] = 'n'; |
| return len; |
| } |
| } |
| |
| // otherwise all other rules |
| if (endsWith(s, len, "pj") |
| || endsWith(s, len, "bj") |
| || endsWith(s, len, "mj") |
| || endsWith(s, len, "vj")) { |
| // labial consonant |
| return len - 1; |
| } else if (endsWith(s, len, "šņ")) { |
| s[len - 2] = 's'; |
| s[len - 1] = 'n'; |
| return len; |
| } else if (endsWith(s, len, "žņ")) { |
| s[len - 2] = 'z'; |
| s[len - 1] = 'n'; |
| return len; |
| } else if (endsWith(s, len, "šļ")) { |
| s[len - 2] = 's'; |
| s[len - 1] = 'l'; |
| return len; |
| } else if (endsWith(s, len, "žļ")) { |
| s[len - 2] = 'z'; |
| s[len - 1] = 'l'; |
| return len; |
| } else if (endsWith(s, len, "ļņ")) { |
| s[len - 2] = 'l'; |
| s[len - 1] = 'n'; |
| return len; |
| } else if (endsWith(s, len, "ļļ")) { |
| s[len - 2] = 'l'; |
| s[len - 1] = 'l'; |
| return len; |
| } else if (s[len - 1] == 'č') { |
| s[len - 1] = 'c'; |
| return len; |
| } else if (s[len - 1] == 'ļ') { |
| s[len - 1] = 'l'; |
| return len; |
| } else if (s[len - 1] == 'ņ') { |
| s[len - 1] = 'n'; |
| return len; |
| } |
| |
| return len; |
| } |
| |
| /** |
| * Count the vowels in the string, we always require at least one in the remaining stem to accept |
| * it. |
| */ |
| private int numVowels(char s[], int len) { |
| int n = 0; |
| for (int i = 0; i < len; i++) { |
| switch (s[i]) { |
| case 'a': |
| case 'e': |
| case 'i': |
| case 'o': |
| case 'u': |
| case 'ā': |
| case 'ī': |
| case 'ē': |
| case 'ū': |
| n++; |
| } |
| } |
| return n; |
| } |
| } |