| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| /* |
| This file was partially derived from the |
| original CIIR University of Massachusetts Amherst version of KStemmer.java (license for |
| the original shown below) |
| */ |
| |
| /* |
| Copyright © 2003, |
| Center for Intelligent Information Retrieval, |
| University of Massachusetts, Amherst. |
| All rights reserved. |
| |
| Redistribution and use in source and binary forms, with or without modification, |
| are permitted provided that the following conditions are met: |
| |
| 1. Redistributions of source code must retain the above copyright notice, this |
| list of conditions and the following disclaimer. |
| |
| 2. Redistributions in binary form must reproduce the above copyright notice, |
| this list of conditions and the following disclaimer in the documentation |
| and/or other materials provided with the distribution. |
| |
| 3. The names "Center for Intelligent Information Retrieval" and |
| "University of Massachusetts" must not be used to endorse or promote products |
| derived from this software without prior written permission. To obtain |
| permission, contact info@ciir.cs.umass.edu. |
| |
| THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF MASSACHUSETTS AND OTHER CONTRIBUTORS |
| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, |
| THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE |
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE |
| GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
| HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
| OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| SUCH DAMAGE. |
| */ |
| package org.apache.lucene.analysis.en; |
| |
| import org.apache.lucene.analysis.CharArrayMap; |
| import org.apache.lucene.analysis.util.OpenStringBuilder; |
| |
| /** |
| * Title: Kstemmer |
| * |
| * <p>Description: This is a java version of Bob Krovetz' kstem stemmer |
| * |
| * <p>Copyright: Copyright 2008, Lucid Imagination, Inc. |
| * |
| * <p>Copyright: Copyright 2003, CIIR University of Massachusetts Amherst (http://ciir.cs.umass.edu) |
| */ |
| |
| /** This class implements the Kstem algorithm */ |
| public class KStemmer { |
/*
 * Presumably the longest word the stemmer is willing to process. NOTE(review): the code that
 * reads this constant is outside this part of the file -- confirm whether the bound is
 * inclusive before relying on it.
 */
private static final int MaxWordLen = 50;
| |
| private static final String[] exceptionWords = { |
| "aide", |
| "bathe", |
| "caste", |
| "cute", |
| "dame", |
| "dime", |
| "doge", |
| "done", |
| "dune", |
| "envelope", |
| "gage", |
| "grille", |
| "grippe", |
| "lobe", |
| "mane", |
| "mare", |
| "nape", |
| "node", |
| "pane", |
| "pate", |
| "plane", |
| "pope", |
| "programme", |
| "quite", |
| "ripe", |
| "rote", |
| "rune", |
| "sage", |
| "severe", |
| "shoppe", |
| "sine", |
| "slime", |
| "snipe", |
| "steppe", |
| "suite", |
| "swinge", |
| "tare", |
| "tine", |
| "tope", |
| "tripe", |
| "twine" |
| }; |
| |
| private static final String[][] directConflations = { |
| {"aging", "age"}, |
| {"going", "go"}, |
| {"goes", "go"}, |
| {"lying", "lie"}, |
| {"using", "use"}, |
| {"owing", "owe"}, |
| {"suing", "sue"}, |
| {"dying", "die"}, |
| {"tying", "tie"}, |
| {"vying", "vie"}, |
| {"aged", "age"}, |
| {"used", "use"}, |
| {"vied", "vie"}, |
| {"cued", "cue"}, |
| {"died", "die"}, |
| {"eyed", "eye"}, |
| {"hued", "hue"}, |
| {"iced", "ice"}, |
| {"lied", "lie"}, |
| {"owed", "owe"}, |
| {"sued", "sue"}, |
| {"toed", "toe"}, |
| {"tied", "tie"}, |
| {"does", "do"}, |
| {"doing", "do"}, |
| {"aeronautical", "aeronautics"}, |
| {"mathematical", "mathematics"}, |
| {"political", "politics"}, |
| {"metaphysical", "metaphysics"}, |
| {"cylindrical", "cylinder"}, |
| {"nazism", "nazi"}, |
| {"ambiguity", "ambiguous"}, |
| {"barbarity", "barbarous"}, |
| {"credulity", "credulous"}, |
| {"generosity", "generous"}, |
| {"spontaneity", "spontaneous"}, |
| {"unanimity", "unanimous"}, |
| {"voracity", "voracious"}, |
| {"fled", "flee"}, |
| {"miscarriage", "miscarry"} |
| }; |
| |
| private static final String[][] countryNationality = { |
| {"afghan", "afghanistan"}, |
| {"african", "africa"}, |
| {"albanian", "albania"}, |
| {"algerian", "algeria"}, |
| {"american", "america"}, |
| {"andorran", "andorra"}, |
| {"angolan", "angola"}, |
| {"arabian", "arabia"}, |
| {"argentine", "argentina"}, |
| {"armenian", "armenia"}, |
| {"asian", "asia"}, |
| {"australian", "australia"}, |
| {"austrian", "austria"}, |
| {"azerbaijani", "azerbaijan"}, |
| {"azeri", "azerbaijan"}, |
| {"bangladeshi", "bangladesh"}, |
| {"belgian", "belgium"}, |
| {"bermudan", "bermuda"}, |
| {"bolivian", "bolivia"}, |
| {"bosnian", "bosnia"}, |
| {"botswanan", "botswana"}, |
| {"brazilian", "brazil"}, |
| {"british", "britain"}, |
| {"bulgarian", "bulgaria"}, |
| {"burmese", "burma"}, |
| {"californian", "california"}, |
| {"cambodian", "cambodia"}, |
| {"canadian", "canada"}, |
| {"chadian", "chad"}, |
| {"chilean", "chile"}, |
| {"chinese", "china"}, |
| {"colombian", "colombia"}, |
| {"croat", "croatia"}, |
| {"croatian", "croatia"}, |
| {"cuban", "cuba"}, |
| {"cypriot", "cyprus"}, |
| {"czechoslovakian", "czechoslovakia"}, |
| {"danish", "denmark"}, |
| {"egyptian", "egypt"}, |
| {"equadorian", "equador"}, |
| {"eritrean", "eritrea"}, |
| {"estonian", "estonia"}, |
| {"ethiopian", "ethiopia"}, |
| {"european", "europe"}, |
| {"fijian", "fiji"}, |
| {"filipino", "philippines"}, |
| {"finnish", "finland"}, |
| {"french", "france"}, |
| {"gambian", "gambia"}, |
| {"georgian", "georgia"}, |
| {"german", "germany"}, |
| {"ghanian", "ghana"}, |
| {"greek", "greece"}, |
| {"grenadan", "grenada"}, |
| {"guamian", "guam"}, |
| {"guatemalan", "guatemala"}, |
| {"guinean", "guinea"}, |
| {"guyanan", "guyana"}, |
| {"haitian", "haiti"}, |
| {"hawaiian", "hawaii"}, |
| {"holland", "dutch"}, |
| {"honduran", "honduras"}, |
| {"hungarian", "hungary"}, |
| {"icelandic", "iceland"}, |
| {"indonesian", "indonesia"}, |
| {"iranian", "iran"}, |
| {"iraqi", "iraq"}, |
| {"iraqui", "iraq"}, |
| {"irish", "ireland"}, |
| {"israeli", "israel"}, |
| {"italian", "italy"}, |
| {"jamaican", "jamaica"}, |
| {"japanese", "japan"}, |
| {"jordanian", "jordan"}, |
| {"kampuchean", "cambodia"}, |
| {"kenyan", "kenya"}, |
| {"korean", "korea"}, |
| {"kuwaiti", "kuwait"}, |
| {"lankan", "lanka"}, |
| {"laotian", "laos"}, |
| {"latvian", "latvia"}, |
| {"lebanese", "lebanon"}, |
| {"liberian", "liberia"}, |
| {"libyan", "libya"}, |
| {"lithuanian", "lithuania"}, |
| {"macedonian", "macedonia"}, |
| {"madagascan", "madagascar"}, |
| {"malaysian", "malaysia"}, |
| {"maltese", "malta"}, |
| {"mauritanian", "mauritania"}, |
| {"mexican", "mexico"}, |
| {"micronesian", "micronesia"}, |
| {"moldovan", "moldova"}, |
| {"monacan", "monaco"}, |
| {"mongolian", "mongolia"}, |
| {"montenegran", "montenegro"}, |
| {"moroccan", "morocco"}, |
| {"myanmar", "burma"}, |
| {"namibian", "namibia"}, |
| {"nepalese", "nepal"}, |
| // {"netherlands", "dutch"}, |
| {"nicaraguan", "nicaragua"}, |
| {"nigerian", "nigeria"}, |
| {"norwegian", "norway"}, |
| {"omani", "oman"}, |
| {"pakistani", "pakistan"}, |
| {"panamanian", "panama"}, |
| {"papuan", "papua"}, |
| {"paraguayan", "paraguay"}, |
| {"peruvian", "peru"}, |
| {"portuguese", "portugal"}, |
| {"romanian", "romania"}, |
| {"rumania", "romania"}, |
| {"rumanian", "romania"}, |
| {"russian", "russia"}, |
| {"rwandan", "rwanda"}, |
| {"samoan", "samoa"}, |
| {"scottish", "scotland"}, |
| {"serb", "serbia"}, |
| {"serbian", "serbia"}, |
| {"siam", "thailand"}, |
| {"siamese", "thailand"}, |
| {"slovakia", "slovak"}, |
| {"slovakian", "slovak"}, |
| {"slovenian", "slovenia"}, |
| {"somali", "somalia"}, |
| {"somalian", "somalia"}, |
| {"spanish", "spain"}, |
| {"swedish", "sweden"}, |
| {"swiss", "switzerland"}, |
| {"syrian", "syria"}, |
| {"taiwanese", "taiwan"}, |
| {"tanzanian", "tanzania"}, |
| {"texan", "texas"}, |
| {"thai", "thailand"}, |
| {"tunisian", "tunisia"}, |
| {"turkish", "turkey"}, |
| {"ugandan", "uganda"}, |
| {"ukrainian", "ukraine"}, |
| {"uruguayan", "uruguay"}, |
| {"uzbek", "uzbekistan"}, |
| {"venezuelan", "venezuela"}, |
| {"vietnamese", "viet"}, |
| {"virginian", "virginia"}, |
| {"yemeni", "yemen"}, |
| {"yugoslav", "yugoslavia"}, |
| {"yugoslavian", "yugoslavia"}, |
| {"zambian", "zambia"}, |
| {"zealander", "zealand"}, |
| {"zimbabwean", "zimbabwe"} |
| }; |
| |
| private static final String[] supplementDict = { |
| "aids", |
| "applicator", |
| "capacitor", |
| "digitize", |
| "electromagnet", |
| "ellipsoid", |
| "exosphere", |
| "extensible", |
| "ferromagnet", |
| "graphics", |
| "hydromagnet", |
| "polygraph", |
| "toroid", |
| "superconduct", |
| "backscatter", |
| "connectionism" |
| }; |
| |
| private static final String[] properNouns = { |
| "abrams", |
| "achilles", |
| "acropolis", |
| "adams", |
| "agnes", |
| "aires", |
| "alexander", |
| "alexis", |
| "alfred", |
| "algiers", |
| "alps", |
| "amadeus", |
| "ames", |
| "amos", |
| "andes", |
| "angeles", |
| "annapolis", |
| "antilles", |
| "aquarius", |
| "archimedes", |
| "arkansas", |
| "asher", |
| "ashly", |
| "athens", |
| "atkins", |
| "atlantis", |
| "avis", |
| "bahamas", |
| "bangor", |
| "barbados", |
| "barger", |
| "bering", |
| "brahms", |
| "brandeis", |
| "brussels", |
| "bruxelles", |
| "cairns", |
| "camoros", |
| "camus", |
| "carlos", |
| "celts", |
| "chalker", |
| "charles", |
| "cheops", |
| "ching", |
| "christmas", |
| "cocos", |
| "collins", |
| "columbus", |
| "confucius", |
| "conners", |
| "connolly", |
| "copernicus", |
| "cramer", |
| "cyclops", |
| "cygnus", |
| "cyprus", |
| "dallas", |
| "damascus", |
| "daniels", |
| "davies", |
| "davis", |
| "decker", |
| "denning", |
| "dennis", |
| "descartes", |
| "dickens", |
| "doris", |
| "douglas", |
| "downs", |
| "dreyfus", |
| "dukakis", |
| "dulles", |
| "dumfries", |
| "ecclesiastes", |
| "edwards", |
| "emily", |
| "erasmus", |
| "euphrates", |
| "evans", |
| "everglades", |
| "fairbanks", |
| "federales", |
| "fisher", |
| "fitzsimmons", |
| "fleming", |
| "forbes", |
| "fowler", |
| "france", |
| "francis", |
| "goering", |
| "goodling", |
| "goths", |
| "grenadines", |
| "guiness", |
| "hades", |
| "harding", |
| "harris", |
| "hastings", |
| "hawkes", |
| "hawking", |
| "hayes", |
| "heights", |
| "hercules", |
| "himalayas", |
| "hippocrates", |
| "hobbs", |
| "holmes", |
| "honduras", |
| "hopkins", |
| "hughes", |
| "humphreys", |
| "illinois", |
| "indianapolis", |
| "inverness", |
| "iris", |
| "iroquois", |
| "irving", |
| "isaacs", |
| "italy", |
| "james", |
| "jarvis", |
| "jeffreys", |
| "jesus", |
| "jones", |
| "josephus", |
| "judas", |
| "julius", |
| "kansas", |
| "keynes", |
| "kipling", |
| "kiwanis", |
| "lansing", |
| "laos", |
| "leeds", |
| "levis", |
| "leviticus", |
| "lewis", |
| "louis", |
| "maccabees", |
| "madras", |
| "maimonides", |
| "maldive", |
| "massachusetts", |
| "matthews", |
| "mauritius", |
| "memphis", |
| "mercedes", |
| "midas", |
| "mingus", |
| "minneapolis", |
| "mohammed", |
| "moines", |
| "morris", |
| "moses", |
| "myers", |
| "myknos", |
| "nablus", |
| "nanjing", |
| "nantes", |
| "naples", |
| "neal", |
| "netherlands", |
| "nevis", |
| "nostradamus", |
| "oedipus", |
| "olympus", |
| "orleans", |
| "orly", |
| "papas", |
| "paris", |
| "parker", |
| "pauling", |
| "peking", |
| "pershing", |
| "peter", |
| "peters", |
| "philippines", |
| "phineas", |
| "pisces", |
| "pryor", |
| "pythagoras", |
| "queens", |
| "rabelais", |
| "ramses", |
| "reynolds", |
| "rhesus", |
| "rhodes", |
| "richards", |
| "robins", |
| "rodgers", |
| "rogers", |
| "rubens", |
| "sagittarius", |
| "seychelles", |
| "socrates", |
| "texas", |
| "thames", |
| "thomas", |
| "tiberias", |
| "tunis", |
| "venus", |
| "vilnius", |
| "wales", |
| "warner", |
| "wilkins", |
| "williams", |
| "wyoming", |
| "xmas", |
| "yonkers", |
| "zeus", |
| "frances", |
| "aarhus", |
| "adonis", |
| "andrews", |
| "angus", |
| "antares", |
| "aquinas", |
| "arcturus", |
| "ares", |
| "artemis", |
| "augustus", |
| "ayers", |
| "barnabas", |
| "barnes", |
| "becker", |
| "bejing", |
| "biggs", |
| "billings", |
| "boeing", |
| "boris", |
| "borroughs", |
| "briggs", |
| "buenos", |
| "calais", |
| "caracas", |
| "cassius", |
| "cerberus", |
| "ceres", |
| "cervantes", |
| "chantilly", |
| "chartres", |
| "chester", |
| "connally", |
| "conner", |
| "coors", |
| "cummings", |
| "curtis", |
| "daedalus", |
| "dionysus", |
| "dobbs", |
| "dolores", |
| "edmonds" |
| }; |
| |
| static class DictEntry { |
| boolean exception; |
| String root; |
| |
| DictEntry(String root, boolean isException) { |
| this.root = root; |
| this.exception = isException; |
| } |
| } |
| |
/* Dictionary shared by all instances; built once, at class initialization, by initializeDictHash(). */
private static final CharArrayMap<DictEntry> dict_ht = initializeDictHash();

/***
 * Stem caching is switched off. The declarations it used are kept here for reference:
 *
 *   private int maxCacheSize;
 *   private CharArrayMap<String> cache = null;
 *   private static final String SAME = "SAME"; // use if stemmed form is the same
 ***/

/* Buffer holding the word being stemmed; the suffix routines below edit it in place. */
private final OpenStringBuilder word = new OpenStringBuilder();

private int j; /* index of final letter in stem (within word); set by the endsIn(...) methods */
private int k; /*
                * INDEX of final letter in word. You must add 1 to k to get
                * the current length of word. When you want the length of
                * word, use the method wordLength, which returns (k+1).
                * NOTE(review): no wordLength method is visible in this part
                * of the file -- confirm it still exists.
                */
| |
| /* |
| * private void initializeStemHash() { if (maxCacheSize > 0) cache = new |
| * CharArrayMap<String>(maxCacheSize,false); } |
| ***/ |
| |
| private char finalChar() { |
| return word.charAt(k); |
| } |
| |
| private char penultChar() { |
| return word.charAt(k - 1); |
| } |
| |
| private boolean isVowel(int index) { |
| return !isCons(index); |
| } |
| |
| private boolean isCons(int index) { |
| char ch; |
| |
| ch = word.charAt(index); |
| |
| if ((ch == 'a') || (ch == 'e') || (ch == 'i') || (ch == 'o') || (ch == 'u')) return false; |
| if ((ch != 'y') || (index == 0)) return true; |
| else return (!isCons(index - 1)); |
| } |
| |
| private static CharArrayMap<DictEntry> initializeDictHash() { |
| DictEntry defaultEntry; |
| DictEntry entry; |
| |
| CharArrayMap<DictEntry> d = new CharArrayMap<>(1000, false); |
| for (int i = 0; i < exceptionWords.length; i++) { |
| if (!d.containsKey(exceptionWords[i])) { |
| entry = new DictEntry(exceptionWords[i], true); |
| d.put(exceptionWords[i], entry); |
| } else { |
| throw new RuntimeException( |
| "Warning: Entry [" + exceptionWords[i] + "] already in dictionary 1"); |
| } |
| } |
| |
| for (int i = 0; i < directConflations.length; i++) { |
| if (!d.containsKey(directConflations[i][0])) { |
| entry = new DictEntry(directConflations[i][1], false); |
| d.put(directConflations[i][0], entry); |
| } else { |
| throw new RuntimeException( |
| "Warning: Entry [" + directConflations[i][0] + "] already in dictionary 2"); |
| } |
| } |
| |
| for (int i = 0; i < countryNationality.length; i++) { |
| if (!d.containsKey(countryNationality[i][0])) { |
| entry = new DictEntry(countryNationality[i][1], false); |
| d.put(countryNationality[i][0], entry); |
| } else { |
| throw new RuntimeException( |
| "Warning: Entry [" + countryNationality[i][0] + "] already in dictionary 3"); |
| } |
| } |
| |
| defaultEntry = new DictEntry(null, false); |
| |
| String[] array; |
| array = KStemData1.data; |
| |
| for (int i = 0; i < array.length; i++) { |
| if (!d.containsKey(array[i])) { |
| d.put(array[i], defaultEntry); |
| } else { |
| throw new RuntimeException("Warning: Entry [" + array[i] + "] already in dictionary 4"); |
| } |
| } |
| |
| array = KStemData2.data; |
| for (int i = 0; i < array.length; i++) { |
| if (!d.containsKey(array[i])) { |
| d.put(array[i], defaultEntry); |
| } else { |
| throw new RuntimeException("Warning: Entry [" + array[i] + "] already in dictionary 4"); |
| } |
| } |
| |
| array = KStemData3.data; |
| for (int i = 0; i < array.length; i++) { |
| if (!d.containsKey(array[i])) { |
| d.put(array[i], defaultEntry); |
| } else { |
| throw new RuntimeException("Warning: Entry [" + array[i] + "] already in dictionary 4"); |
| } |
| } |
| |
| array = KStemData4.data; |
| for (int i = 0; i < array.length; i++) { |
| if (!d.containsKey(array[i])) { |
| d.put(array[i], defaultEntry); |
| } else { |
| throw new RuntimeException("Warning: Entry [" + array[i] + "] already in dictionary 4"); |
| } |
| } |
| |
| array = KStemData5.data; |
| for (int i = 0; i < array.length; i++) { |
| if (!d.containsKey(array[i])) { |
| d.put(array[i], defaultEntry); |
| } else { |
| throw new RuntimeException("Warning: Entry [" + array[i] + "] already in dictionary 4"); |
| } |
| } |
| |
| array = KStemData6.data; |
| for (int i = 0; i < array.length; i++) { |
| if (!d.containsKey(array[i])) { |
| d.put(array[i], defaultEntry); |
| } else { |
| throw new RuntimeException("Warning: Entry [" + array[i] + "] already in dictionary 4"); |
| } |
| } |
| |
| array = KStemData7.data; |
| for (int i = 0; i < array.length; i++) { |
| if (!d.containsKey(array[i])) { |
| d.put(array[i], defaultEntry); |
| } else { |
| throw new RuntimeException("Warning: Entry [" + array[i] + "] already in dictionary 4"); |
| } |
| } |
| |
| for (int i = 0; i < KStemData8.data.length; i++) { |
| if (!d.containsKey(KStemData8.data[i])) { |
| d.put(KStemData8.data[i], defaultEntry); |
| } else { |
| throw new RuntimeException( |
| "Warning: Entry [" + KStemData8.data[i] + "] already in dictionary 4"); |
| } |
| } |
| |
| for (int i = 0; i < supplementDict.length; i++) { |
| if (!d.containsKey(supplementDict[i])) { |
| d.put(supplementDict[i], defaultEntry); |
| } else { |
| throw new RuntimeException( |
| "Warning: Entry [" + supplementDict[i] + "] already in dictionary 5"); |
| } |
| } |
| |
| for (int i = 0; i < properNouns.length; i++) { |
| if (!d.containsKey(properNouns[i])) { |
| d.put(properNouns[i], defaultEntry); |
| } else { |
| throw new RuntimeException( |
| "Warning: Entry [" + properNouns[i] + "] already in dictionary 6"); |
| } |
| } |
| |
| return d; |
| } |
| |
| private boolean isAlpha(char ch) { |
| return ch >= 'a' && ch <= 'z'; // terms must be lowercased already |
| } |
| |
| /* length of stem within word */ |
| private int stemLength() { |
| return j + 1; |
| } |
| ; |
| |
| private boolean endsIn(char[] s) { |
| if (s.length > k) return false; |
| |
| int r = word.length() - s.length; /* length of word before this suffix */ |
| j = k; |
| for (int r1 = r, i = 0; i < s.length; i++, r1++) { |
| if (s[i] != word.charAt(r1)) return false; |
| } |
| j = r - 1; /* index of the character BEFORE the posfix */ |
| return true; |
| } |
| |
| private boolean endsIn(char a, char b) { |
| if (2 > k) return false; |
| // check left to right since the endings have often already matched |
| if (word.charAt(k - 1) == a && word.charAt(k) == b) { |
| j = k - 2; |
| return true; |
| } |
| return false; |
| } |
| |
| private boolean endsIn(char a, char b, char c) { |
| if (3 > k) return false; |
| if (word.charAt(k - 2) == a && word.charAt(k - 1) == b && word.charAt(k) == c) { |
| j = k - 3; |
| return true; |
| } |
| return false; |
| } |
| |
| private boolean endsIn(char a, char b, char c, char d) { |
| if (4 > k) return false; |
| if (word.charAt(k - 3) == a |
| && word.charAt(k - 2) == b |
| && word.charAt(k - 1) == c |
| && word.charAt(k) == d) { |
| j = k - 4; |
| return true; |
| } |
| return false; |
| } |
| |
| private DictEntry wordInDict() { |
| /*** |
| * if (matchedEntry != null) { if (dict_ht.get(word.getArray(), 0, |
| * word.size()) != matchedEntry) { |
| * System.out.println("Uh oh... cached entry doesn't match"); } return |
| * matchedEntry; } |
| ***/ |
| if (matchedEntry != null) return matchedEntry; |
| DictEntry e = dict_ht.get(word.getArray(), 0, word.length()); |
| if (e != null && !e.exception) { |
| matchedEntry = e; // only cache if it's not an exception. |
| } |
| // lookups.add(word.toString()); |
| return e; |
| } |
| |
| /* Convert plurals to singular form, and '-ies' to 'y' */ |
| private void plural() { |
| if (word.charAt(k) == 's') { |
| if (endsIn('i', 'e', 's')) { |
| word.setLength(j + 3); |
| k--; |
| if (lookup()) /* ensure calories -> calorie */ return; |
| k++; |
| word.unsafeWrite('s'); |
| setSuffix("y"); |
| lookup(); |
| } else if (endsIn('e', 's')) { |
| /* try just removing the "s" */ |
| word.setLength(j + 2); |
| k--; |
| |
| /* |
| * note: don't check for exceptions here. So, `aides' -> `aide', but |
| * `aided' -> `aid'. The exception for double s is used to prevent |
| * crosses -> crosse. This is actually correct if crosses is a plural |
| * noun (a type of racket used in lacrosse), but the verb is much more |
| * common |
| */ |
| |
| /**** |
| * YCS: this was the one place where lookup was not followed by return. |
| * So restructure it. if ((j>0)&&(lookup(word.toString())) && |
| * !((word.charAt(j) == 's') && (word.charAt(j-1) == 's'))) return; |
| *****/ |
| boolean tryE = j > 0 && !((word.charAt(j) == 's') && (word.charAt(j - 1) == 's')); |
| if (tryE && lookup()) return; |
| |
| /* try removing the "es" */ |
| |
| word.setLength(j + 1); |
| k--; |
| if (lookup()) return; |
| |
| /* the default is to retain the "e" */ |
| word.unsafeWrite('e'); |
| k++; |
| |
| if (!tryE) lookup(); // if we didn't try the "e" ending before |
| return; |
| } else { |
| if (word.length() > 3 && penultChar() != 's' && !endsIn('o', 'u', 's')) { |
| /* unless the word ends in "ous" or a double "s", remove the final "s" */ |
| |
| word.setLength(k); |
| k--; |
| lookup(); |
| } |
| } |
| } |
| } |
| |
| private void setSuffix(String s) { |
| setSuff(s, s.length()); |
| } |
| |
| /* replace old suffix with s */ |
| private void setSuff(String s, int len) { |
| word.setLength(j + 1); |
| for (int l = 0; l < len; l++) { |
| word.unsafeWrite(s.charAt(l)); |
| } |
| k = j + len; |
| } |
| |
/*
 * Result of the most recent lookup(): the dictionary entry that was found, or null when the
 * lookup missed. (lookup() itself returns true if the word is found in the dictionary.)
 */
// almost all uses of lookup() return immediately and are
// followed by another lookup in the dict. Store the match
// to avoid this double lookup. wordInDict() returns this value
// directly whenever it is non-null.
DictEntry matchedEntry = null;
| |
| private boolean lookup() { |
| /****** |
| * debugging code String thisLookup = word.toString(); boolean added = |
| * lookups.add(thisLookup); if (!added) { |
| * System.out.println("######extra lookup:" + thisLookup); // occaasional |
| * extra lookups aren't necessarily errors... could happen by diff |
| * manipulations // throw new RuntimeException("######extra lookup:" + |
| * thisLookup); } else { // System.out.println("new lookup:" + thisLookup); |
| * } |
| ******/ |
| |
| matchedEntry = dict_ht.get(word.getArray(), 0, word.size()); |
| return matchedEntry != null; |
| } |
| |
| // Set<String> lookups = new HashSet<>(); |
| |
| /* convert past tense (-ed) to present, and `-ied' to `y' */ |
| private void pastTense() { |
| /* |
| * Handle words less than 5 letters with a direct mapping This prevents |
| * (fled -> fl). |
| */ |
| if (word.length() <= 4) return; |
| |
| if (endsIn('i', 'e', 'd')) { |
| word.setLength(j + 3); |
| k--; |
| if (lookup()) /* we almost always want to convert -ied to -y, but */ |
| return; /* this isn't true for short words (died->die) */ |
| k++; /* I don't know any long words that this applies to, */ |
| word.unsafeWrite('d'); /* but just in case... */ |
| setSuffix("y"); |
| lookup(); |
| return; |
| } |
| |
| /* the vowelInStem() is necessary so we don't stem acronyms */ |
| if (endsIn('e', 'd') && vowelInStem()) { |
| /* see if the root ends in `e' */ |
| word.setLength(j + 2); |
| k = j + 1; |
| |
| DictEntry entry = wordInDict(); |
| if (entry != null) if (!entry.exception) /* |
| * if it's in the dictionary and |
| * not an exception |
| */ return; |
| |
| /* try removing the "ed" */ |
| word.setLength(j + 1); |
| k = j; |
| if (lookup()) return; |
| |
| /* |
| * try removing a doubled consonant. if the root isn't found in the |
| * dictionary, the default is to leave it doubled. This will correctly |
| * capture `backfilled' -> `backfill' instead of `backfill' -> |
| * `backfille', and seems correct most of the time |
| */ |
| |
| if (doubleC(k)) { |
| word.setLength(k); |
| k--; |
| if (lookup()) return; |
| word.unsafeWrite(word.charAt(k)); |
| k++; |
| lookup(); |
| return; |
| } |
| |
| /* if we have a `un-' prefix, then leave the word alone */ |
| /* (this will sometimes screw up with `under-', but we */ |
| /* will take care of that later) */ |
| |
| if ((word.charAt(0) == 'u') && (word.charAt(1) == 'n')) { |
| word.unsafeWrite('e'); |
| word.unsafeWrite('d'); |
| k = k + 2; |
| // nolookup() |
| return; |
| } |
| |
| /* |
| * it wasn't found by just removing the `d' or the `ed', so prefer to end |
| * with an `e' (e.g., `microcoded' -> `microcode'). |
| */ |
| |
| word.setLength(j + 1); |
| word.unsafeWrite('e'); |
| k = j + 1; |
| // nolookup() - we already tried the "e" ending |
| return; |
| } |
| } |
| |
| /* return TRUE if word ends with a double consonant */ |
| private boolean doubleC(int i) { |
| if (i < 1) return false; |
| |
| if (word.charAt(i) != word.charAt(i - 1)) return false; |
| return (isCons(i)); |
| } |
| |
| private boolean vowelInStem() { |
| for (int i = 0; i < stemLength(); i++) { |
| if (isVowel(i)) return true; |
| } |
| return false; |
| } |
| |
| /* handle `-ing' endings */ |
| private void aspect() { |
| /* |
| * handle short words (aging -> age) via a direct mapping. This prevents |
| * (thing -> the) in the version of this routine that ignores inflectional |
| * variants that are mentioned in the dictionary (when the root is also |
| * present) |
| */ |
| |
| if (word.length() <= 5) return; |
| |
| /* the vowelinstem() is necessary so we don't stem acronyms */ |
| if (endsIn('i', 'n', 'g') && vowelInStem()) { |
| |
| /* try adding an `e' to the stem and check against the dictionary */ |
| word.setCharAt(j + 1, 'e'); |
| word.setLength(j + 2); |
| k = j + 1; |
| |
| DictEntry entry = wordInDict(); |
| if (entry != null) { |
| if (!entry.exception) /* if it's in the dictionary and not an exception */ return; |
| } |
| |
| /* adding on the `e' didn't work, so remove it */ |
| word.setLength(k); |
| k--; /* note that `ing' has also been removed */ |
| |
| if (lookup()) return; |
| |
| /* if I can remove a doubled consonant and get a word, then do so */ |
| if (doubleC(k)) { |
| k--; |
| word.setLength(k + 1); |
| if (lookup()) return; |
| word.unsafeWrite(word.charAt(k)); /* restore the doubled consonant */ |
| |
| /* the default is to leave the consonant doubled */ |
| /* (e.g.,`fingerspelling' -> `fingerspell'). Unfortunately */ |
| /* `bookselling' -> `booksell' and `mislabelling' -> `mislabell'). */ |
| /* Without making the algorithm significantly more complicated, this */ |
| /* is the best I can do */ |
| k++; |
| lookup(); |
| return; |
| } |
| |
| /* |
| * the word wasn't in the dictionary after removing the stem, and then |
| * checking with and without a final `e'. The default is to add an `e' |
| * unless the word ends in two consonants, so `microcoding' -> |
| * `microcode'. The two consonants restriction wouldn't normally be |
| * necessary, but is needed because we don't try to deal with prefixes and |
| * compounds, and most of the time it is correct (e.g., footstamping -> |
| * footstamp, not footstampe; however, decoupled -> decoupl). We can |
| * prevent almost all of the incorrect stems if we try to do some prefix |
| * analysis first |
| */ |
| |
| if ((j > 0) && isCons(j) && isCons(j - 1)) { |
| k = j; |
| word.setLength(k + 1); |
| // nolookup() because we already did according to the comment |
| return; |
| } |
| |
| word.setLength(j + 1); |
| word.unsafeWrite('e'); |
| k = j + 1; |
| // nolookup(); we already tried an 'e' ending |
| return; |
| } |
| } |
| |
| /* |
| * this routine deals with -ity endings. It accepts -ability, -ibility, and |
| * -ality, even without checking the dictionary because they are so |
| * productive. The first two are mapped to -ble, and the -ity is remove for |
| * the latter |
| */ |
| private void ityEndings() { |
| int old_k = k; |
| |
| if (endsIn('i', 't', 'y')) { |
| word.setLength(j + 1); /* try just removing -ity */ |
| k = j; |
| if (lookup()) return; |
| word.unsafeWrite('e'); /* try removing -ity and adding -e */ |
| k = j + 1; |
| if (lookup()) return; |
| word.setCharAt(j + 1, 'i'); |
| word.append("ty"); |
| k = old_k; |
| /* |
| * the -ability and -ibility endings are highly productive, so just accept |
| * them |
| */ |
| if ((j > 0) && (word.charAt(j - 1) == 'i') && (word.charAt(j) == 'l')) { |
| word.setLength(j - 1); |
| word.append("le"); /* convert to -ble */ |
| k = j; |
| lookup(); |
| return; |
| } |
| |
| /* ditto for -ivity */ |
| if ((j > 0) && (word.charAt(j - 1) == 'i') && (word.charAt(j) == 'v')) { |
| word.setLength(j + 1); |
| word.unsafeWrite('e'); /* convert to -ive */ |
| k = j + 1; |
| lookup(); |
| return; |
| } |
| /* ditto for -ality */ |
| if ((j > 0) && (word.charAt(j - 1) == 'a') && (word.charAt(j) == 'l')) { |
| word.setLength(j + 1); |
| k = j; |
| lookup(); |
| return; |
| } |
| |
| /* |
| * if the root isn't in the dictionary, and the variant *is* there, then |
| * use the variant. This allows `immunity'->`immune', but prevents |
| * `capacity'->`capac'. If neither the variant nor the root form are in |
| * the dictionary, then remove the ending as a default |
| */ |
| |
| if (lookup()) return; |
| |
| /* the default is to remove -ity altogether */ |
| word.setLength(j + 1); |
| k = j; |
| // nolookup(), we already did it. |
| return; |
| } |
| } |
| |
| /* handle -ence and -ance */ |
| private void nceEndings() { |
| int old_k = k; |
| char word_char; |
| |
| if (endsIn('n', 'c', 'e')) { |
| word_char = word.charAt(j); |
| if (!((word_char == 'e') || (word_char == 'a'))) return; |
| word.setLength(j); |
| word.unsafeWrite('e'); /* try converting -e/ance to -e (adherance/adhere) */ |
| k = j; |
| if (lookup()) return; |
| word.setLength(j); /* |
| * try removing -e/ance altogether |
| * (disappearance/disappear) |
| */ |
| k = j - 1; |
| if (lookup()) return; |
| word.unsafeWrite(word_char); /* restore the original ending */ |
| word.append("nce"); |
| k = old_k; |
| // nolookup() because we restored the original ending |
| } |
| return; |
| } |
| |
| /* handle -ness */ |
| private void nessEndings() { |
| if (endsIn('n', 'e', 's', 's')) { |
| /* |
| * this is a very productive endings, so |
| * just accept it |
| */ |
| word.setLength(j + 1); |
| k = j; |
| if (word.charAt(j) == 'i') word.setCharAt(j, 'y'); |
| lookup(); |
| } |
| return; |
| } |
| |
| /* handle -ism */ |
| private void ismEndings() { |
| if (endsIn('i', 's', 'm')) { |
| /* |
| * this is a very productive ending, so just |
| * accept it |
| */ |
| word.setLength(j + 1); |
| k = j; |
| lookup(); |
| } |
| return; |
| } |
| |
| /* this routine deals with -ment endings. */ |
| private void mentEndings() { |
| int old_k = k; |
| |
| if (endsIn('m', 'e', 'n', 't')) { |
| word.setLength(j + 1); |
| k = j; |
| if (lookup()) return; |
| word.append("ment"); |
| k = old_k; |
| // nolookup |
| } |
| return; |
| } |
| |
| /* this routine deals with -ize endings. */ |
| private void izeEndings() { |
| int old_k = k; |
| |
| if (endsIn('i', 'z', 'e')) { |
| word.setLength(j + 1); /* try removing -ize entirely */ |
| k = j; |
| if (lookup()) return; |
| word.unsafeWrite('i'); |
| |
| if (doubleC(j)) { |
| /* allow for a doubled consonant */ |
| word.setLength(j); |
| k = j - 1; |
| if (lookup()) return; |
| word.unsafeWrite(word.charAt(j - 1)); |
| } |
| |
| word.setLength(j + 1); |
| word.unsafeWrite('e'); /* try removing -ize and adding -e */ |
| k = j + 1; |
| if (lookup()) return; |
| word.setLength(j + 1); |
| word.append("ize"); |
| k = old_k; |
| // nolookup() |
| } |
| return; |
| } |
| |
| /* handle -ency and -ancy */ |
| private void ncyEndings() { |
| if (endsIn('n', 'c', 'y')) { |
| if (!((word.charAt(j) == 'e') || (word.charAt(j) == 'a'))) return; |
| word.setCharAt(j + 2, 't'); /* try converting -ncy to -nt */ |
| word.setLength(j + 3); |
| k = j + 2; |
| |
| if (lookup()) return; |
| |
| word.setCharAt(j + 2, 'c'); /* the default is to convert it to -nce */ |
| word.unsafeWrite('e'); |
| k = j + 3; |
| lookup(); |
| } |
| return; |
| } |
| |
| /* handle -able and -ible */ |
| private void bleEndings() { |
| int old_k = k; |
| char word_char; |
| |
| if (endsIn('b', 'l', 'e')) { |
| if (!((word.charAt(j) == 'a') || (word.charAt(j) == 'i'))) return; |
| word_char = word.charAt(j); |
| word.setLength(j); /* try just removing the ending */ |
| k = j - 1; |
| if (lookup()) return; |
| if (doubleC(k)) { |
| /* allow for a doubled consonant */ |
| word.setLength(k); |
| k--; |
| if (lookup()) return; |
| k++; |
| word.unsafeWrite(word.charAt(k - 1)); |
| } |
| word.setLength(j); |
| word.unsafeWrite('e'); /* try removing -a/ible and adding -e */ |
| k = j; |
| if (lookup()) return; |
| word.setLength(j); |
| word.append("ate"); /* try removing -able and adding -ate */ |
| /* (e.g., compensable/compensate) */ |
| k = j + 2; |
| if (lookup()) return; |
| word.setLength(j); |
| word.unsafeWrite(word_char); /* restore the original values */ |
| word.append("ble"); |
| k = old_k; |
| // nolookup() |
| } |
| return; |
| } |
| |
| /* |
| * handle -ic endings. This is fairly straightforward, but this is also the |
| * only place we try *expanding* an ending, -ic -> -ical. This is to handle |
| * cases like `canonic' -> `canonical' |
| */ |
| private void icEndings() { |
| if (endsIn('i', 'c')) { |
| word.setLength(j + 3); |
| word.append("al"); /* try converting -ic to -ical */ |
| k = j + 4; |
| if (lookup()) return; |
| |
| word.setCharAt(j + 1, 'y'); /* try converting -ic to -y */ |
| word.setLength(j + 2); |
| k = j + 1; |
| if (lookup()) return; |
| |
| word.setCharAt(j + 1, 'e'); /* try converting -ic to -e */ |
| if (lookup()) return; |
| |
| word.setLength(j + 1); /* try removing -ic altogether */ |
| k = j; |
| if (lookup()) return; |
| word.append("ic"); /* restore the original ending */ |
| k = j + 2; |
| // nolookup() |
| } |
| return; |
| } |
| |
| private static char[] ization = "ization".toCharArray(); |
| private static char[] ition = "ition".toCharArray(); |
| private static char[] ation = "ation".toCharArray(); |
| private static char[] ication = "ication".toCharArray(); |
| |
| /* handle some derivational endings */ |
| /* |
| * this routine deals with -ion, -ition, -ation, -ization, and -ication. The |
| * -ization ending is always converted to -ize |
| */ |
| private void ionEndings() { |
| int old_k = k; |
| if (!endsIn('i', 'o', 'n')) { |
| return; |
| } |
| |
| if (endsIn(ization)) { |
| /* |
| * the -ize ending is very productive, so simply |
| * accept it as the root |
| */ |
| word.setLength(j + 3); |
| word.unsafeWrite('e'); |
| k = j + 3; |
| lookup(); |
| return; |
| } |
| |
| if (endsIn(ition)) { |
| word.setLength(j + 1); |
| word.unsafeWrite('e'); |
| k = j + 1; |
| if (lookup()) /* |
| * remove -ition and add `e', and check against the |
| * dictionary |
| */ |
| return; /* (e.g., definition->define, opposition->oppose) */ |
| |
| /* restore original values */ |
| word.setLength(j + 1); |
| word.append("ition"); |
| k = old_k; |
| // nolookup() |
| } else if (endsIn(ation)) { |
| word.setLength(j + 3); |
| word.unsafeWrite('e'); |
| k = j + 3; |
| if (lookup()) /* remove -ion and add `e', and check against the dictionary */ |
| return; /* (elmination -> eliminate) */ |
| |
| word.setLength(j + 1); |
| word.unsafeWrite('e'); /* |
| * remove -ation and add `e', and check against the |
| * dictionary |
| */ |
| k = j + 1; |
| if (lookup()) return; |
| |
| word.setLength(j + 1); /* |
| * just remove -ation (resignation->resign) and |
| * check dictionary |
| */ |
| k = j; |
| if (lookup()) return; |
| |
| /* restore original values */ |
| word.setLength(j + 1); |
| word.append("ation"); |
| k = old_k; |
| // nolookup() |
| |
| } |
| |
| /* |
| * test -ication after -ation is attempted (e.g., `complication->complicate' |
| * rather than `complication->comply') |
| */ |
| |
| if (endsIn(ication)) { |
| word.setLength(j + 1); |
| word.unsafeWrite('y'); |
| k = j + 1; |
| if (lookup()) /* |
| * remove -ication and add `y', and check against the |
| * dictionary |
| */ return; /* (e.g., amplification -> amplify) */ |
| |
| /* restore original values */ |
| word.setLength(j + 1); |
| word.append("ication"); |
| k = old_k; |
| // nolookup() |
| } |
| |
| // if (endsIn(ion)) { |
| if (true) { // we checked for this earlier... just need to set "j" |
| j = k - 3; // YCS |
| |
| word.setLength(j + 1); |
| word.unsafeWrite('e'); |
| k = j + 1; |
| if (lookup()) /* remove -ion and add `e', and check against the dictionary */ return; |
| |
| word.setLength(j + 1); |
| k = j; |
| if (lookup()) /* remove -ion, and if it's found, treat that as the root */ return; |
| |
| /* restore original values */ |
| word.setLength(j + 1); |
| word.append("ion"); |
| k = old_k; |
| // nolookup() |
| } |
| |
| // nolookup(); all of the other paths restored original values |
| return; |
| } |
| |
| /* |
| * this routine deals with -er, -or, -ier, and -eer. The -izer ending is |
| * always converted to -ize |
| */ |
| private void erAndOrEndings() { |
| int old_k = k; |
| |
| if (word.charAt(k) != 'r') return; // YCS |
| |
| char word_char; /* so we can remember if it was -er or -or */ |
| |
| if (endsIn('i', 'z', 'e', 'r')) { |
| /* |
| * -ize is very productive, so accept it |
| * as the root |
| */ |
| word.setLength(j + 4); |
| k = j + 3; |
| lookup(); |
| return; |
| } |
| |
| if (endsIn('e', 'r') || endsIn('o', 'r')) { |
| word_char = word.charAt(j + 1); |
| if (doubleC(j)) { |
| word.setLength(j); |
| k = j - 1; |
| if (lookup()) return; |
| word.unsafeWrite(word.charAt(j - 1)); /* restore the doubled consonant */ |
| } |
| |
| if (word.charAt(j) == 'i') { |
| /* do we have a -ier ending? */ |
| word.setCharAt(j, 'y'); |
| word.setLength(j + 1); |
| k = j; |
| if (lookup()) /* yes, so check against the dictionary */ return; |
| word.setCharAt(j, 'i'); /* restore the endings */ |
| word.unsafeWrite('e'); |
| } |
| |
| if (word.charAt(j) == 'e') { |
| /* handle -eer */ |
| word.setLength(j); |
| k = j - 1; |
| if (lookup()) return; |
| word.unsafeWrite('e'); |
| } |
| |
| word.setLength(j + 2); /* remove the -r ending */ |
| k = j + 1; |
| if (lookup()) return; |
| word.setLength(j + 1); /* try removing -er/-or */ |
| k = j; |
| if (lookup()) return; |
| word.unsafeWrite('e'); /* try removing -or and adding -e */ |
| k = j + 1; |
| if (lookup()) return; |
| word.setLength(j + 1); |
| word.unsafeWrite(word_char); |
| word.unsafeWrite('r'); /* restore the word to the way it was */ |
| k = old_k; |
| // nolookup() |
| } |
| } |
| |
| /* |
| * this routine deals with -ly endings. The -ally ending is always converted |
| * to -al Sometimes this will temporarily leave us with a non-word (e.g., |
| * heuristically maps to heuristical), but then the -al is removed in the next |
| * step. |
| */ |
| private void lyEndings() { |
| int old_k = k; |
| |
| if (endsIn('l', 'y')) { |
| |
| word.setCharAt(j + 2, 'e'); /* try converting -ly to -le */ |
| |
| if (lookup()) return; |
| word.setCharAt(j + 2, 'y'); |
| |
| word.setLength(j + 1); /* try just removing the -ly */ |
| k = j; |
| |
| if (lookup()) return; |
| |
| if ((j > 0) && (word.charAt(j - 1) == 'a') && (word.charAt(j) == 'l')) /* |
| * always |
| * convert |
| * - |
| * ally |
| * to |
| * - |
| * al |
| */ return; |
| word.append("ly"); |
| k = old_k; |
| |
| if ((j > 0) && (word.charAt(j - 1) == 'a') && (word.charAt(j) == 'b')) { |
| /* |
| * always |
| * convert |
| * - |
| * ably |
| * to |
| * - |
| * able |
| */ |
| word.setCharAt(j + 2, 'e'); |
| k = j + 2; |
| return; |
| } |
| |
| if (word.charAt(j) == 'i') { |
| /* e.g., militarily -> military */ |
| word.setLength(j); |
| word.unsafeWrite('y'); |
| k = j; |
| if (lookup()) return; |
| word.setLength(j); |
| word.append("ily"); |
| k = old_k; |
| } |
| |
| word.setLength(j + 1); /* the default is to remove -ly */ |
| |
| k = j; |
| // nolookup()... we already tried removing the "ly" variant |
| } |
| return; |
| } |
| |
| /* |
| * this routine deals with -al endings. Some of the endings from the previous |
| * routine are finished up here. |
| */ |
| private void alEndings() { |
| int old_k = k; |
| |
| if (word.length() < 4) return; |
| if (endsIn('a', 'l')) { |
| word.setLength(j + 1); |
| k = j; |
| if (lookup()) /* try just removing the -al */ return; |
| |
| if (doubleC(j)) { |
| /* allow for a doubled consonant */ |
| word.setLength(j); |
| k = j - 1; |
| if (lookup()) return; |
| word.unsafeWrite(word.charAt(j - 1)); |
| } |
| |
| word.setLength(j + 1); |
| word.unsafeWrite('e'); /* try removing the -al and adding -e */ |
| k = j + 1; |
| if (lookup()) return; |
| |
| word.setLength(j + 1); |
| word.append("um"); /* try converting -al to -um */ |
| /* (e.g., optimal - > optimum ) */ |
| k = j + 2; |
| if (lookup()) return; |
| |
| word.setLength(j + 1); |
| word.append("al"); /* restore the ending to the way it was */ |
| k = old_k; |
| |
| if ((j > 0) && (word.charAt(j - 1) == 'i') && (word.charAt(j) == 'c')) { |
| word.setLength(j - 1); /* try removing -ical */ |
| k = j - 2; |
| if (lookup()) return; |
| |
| word.setLength(j - 1); |
| word.unsafeWrite('y'); /* try turning -ical to -y (e.g., bibliographical) */ |
| k = j - 1; |
| if (lookup()) return; |
| |
| word.setLength(j - 1); |
| word.append("ic"); /* the default is to convert -ical to -ic */ |
| k = j; |
| // nolookup() ... converting ical to ic means removing "al" which we |
| // already tried |
| // ERROR |
| lookup(); |
| return; |
| } |
| |
| if (word.charAt(j) == 'i') { |
| /* sometimes -ial endings should be removed */ |
| word.setLength(j); /* (sometimes it gets turned into -y, but we */ |
| k = j - 1; /* aren't dealing with that case for now) */ |
| if (lookup()) return; |
| word.append("ial"); |
| k = old_k; |
| lookup(); |
| } |
| } |
| return; |
| } |
| |
| /* |
| * this routine deals with -ive endings. It normalizes some of the -ative |
| * endings directly, and also maps some -ive endings to -ion. |
| */ |
| private void iveEndings() { |
| int old_k = k; |
| |
| if (endsIn('i', 'v', 'e')) { |
| word.setLength(j + 1); /* try removing -ive entirely */ |
| k = j; |
| if (lookup()) return; |
| |
| word.unsafeWrite('e'); /* try removing -ive and adding -e */ |
| k = j + 1; |
| if (lookup()) return; |
| word.setLength(j + 1); |
| word.append("ive"); |
| if ((j > 0) && (word.charAt(j - 1) == 'a') && (word.charAt(j) == 't')) { |
| word.setCharAt(j - 1, 'e'); /* try removing -ative and adding -e */ |
| word.setLength(j); /* (e.g., determinative -> determine) */ |
| k = j - 1; |
| if (lookup()) return; |
| word.setLength(j - 1); /* try just removing -ative */ |
| if (lookup()) return; |
| |
| word.append("ative"); |
| k = old_k; |
| } |
| |
| /* try mapping -ive to -ion (e.g., injunctive/injunction) */ |
| word.setCharAt(j + 2, 'o'); |
| word.setCharAt(j + 3, 'n'); |
| if (lookup()) return; |
| |
| word.setCharAt(j + 2, 'v'); /* restore the original values */ |
| word.setCharAt(j + 3, 'e'); |
| k = old_k; |
| // nolookup() |
| } |
| return; |
| } |
| |
/** Creates a stemmer; no constructor arguments are needed. */
KStemmer() {
  super();
}
| |
| String stem(String term) { |
| boolean changed = stem(term.toCharArray(), term.length()); |
| if (!changed) return term; |
| return asString(); |
| } |
| |
| /** Returns the result of the stem (assuming the word was changed) as a String. */ |
| String asString() { |
| String s = getString(); |
| if (s != null) return s; |
| return word.toString(); |
| } |
| |
| CharSequence asCharSequence() { |
| return result != null ? result : word; |
| } |
| |
| String getString() { |
| return result; |
| } |
| |
| char[] getChars() { |
| return word.getArray(); |
| } |
| |
| int getLength() { |
| return word.length(); |
| } |
| |
| String result; |
| |
| private boolean matched() { |
| /*** |
| * if (!lookups.contains(word.toString())) { throw new |
| * RuntimeException("didn't look up "+word.toString()+" prev="+prevLookup); |
| * } |
| ***/ |
| // lookup(); |
| return matchedEntry != null; |
| } |
| |
| /** Stems the text in the token. Returns true if changed. */ |
| boolean stem(char[] term, int len) { |
| |
| result = null; |
| |
| k = len - 1; |
| if ((k <= 1) || (k >= MaxWordLen - 1)) { |
| return false; // don't stem |
| } |
| |
| // first check the stemmer dictionaries, and avoid using the |
| // cache if it's in there. |
| DictEntry entry = dict_ht.get(term, 0, len); |
| if (entry != null) { |
| if (entry.root != null) { |
| result = entry.root; |
| return true; |
| } |
| return false; |
| } |
| |
| /*** |
| * caching off is normally faster if (cache == null) initializeStemHash(); |
| * |
| * // now check the cache, before we copy chars to "word" if (cache != null) |
| * { String val = cache.get(term, 0, len); if (val != null) { if (val != |
| * SAME) { result = val; return true; } return false; } } |
| ***/ |
| |
| word.reset(); |
| // allocate enough space so that an expansion is never needed |
| word.reserve(len + 10); |
| for (int i = 0; i < len; i++) { |
| char ch = term[i]; |
| if (!isAlpha(ch)) return false; // don't stem |
| // don't lowercase... it's a requirement that lowercase filter be |
| // used before this stemmer. |
| word.unsafeWrite(ch); |
| } |
| |
| matchedEntry = null; |
| /*** |
| * lookups.clear(); lookups.add(word.toString()); |
| ***/ |
| |
| /* |
| * This while loop will never be executed more than one time; it is here |
| * only to allow the break statement to be used to escape as soon as a word |
| * is recognized |
| */ |
| while (true) { |
| // YCS: extra lookup()s were inserted so we don't need to |
| // do an extra wordInDict() here. |
| plural(); |
| if (matched()) break; |
| pastTense(); |
| if (matched()) break; |
| aspect(); |
| if (matched()) break; |
| ityEndings(); |
| if (matched()) break; |
| nessEndings(); |
| if (matched()) break; |
| ionEndings(); |
| if (matched()) break; |
| erAndOrEndings(); |
| if (matched()) break; |
| lyEndings(); |
| if (matched()) break; |
| alEndings(); |
| if (matched()) break; |
| entry = wordInDict(); |
| iveEndings(); |
| if (matched()) break; |
| izeEndings(); |
| if (matched()) break; |
| mentEndings(); |
| if (matched()) break; |
| bleEndings(); |
| if (matched()) break; |
| ismEndings(); |
| if (matched()) break; |
| icEndings(); |
| if (matched()) break; |
| ncyEndings(); |
| if (matched()) break; |
| nceEndings(); |
| matched(); |
| break; |
| } |
| |
| /* |
| * try for a direct mapping (allows for cases like `Italian'->`Italy' and |
| * `Italians'->`Italy') |
| */ |
| entry = matchedEntry; |
| if (entry != null) { |
| result = entry.root; // may be null, which means that "word" is the stem |
| } |
| |
| /*** |
| * caching off is normally faster if (cache != null && cache.size() < |
| * maxCacheSize) { char[] key = new char[len]; System.arraycopy(term, 0, |
| * key, 0, len); if (result != null) { cache.put(key, result); } else { |
| * cache.put(key, word.toString()); } } |
| ***/ |
| |
| /*** |
| * if (entry == null) { if (!word.toString().equals(new String(term,0,len))) |
| * { System.out.println("CASE:" + word.toString() + "," + new |
| * String(term,0,len)); |
| * |
| * } } |
| ***/ |
| |
| // no entry matched means result is "word" |
| return true; |
| } |
| } |