| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.hunspell; |
| |
| import java.util.ArrayList; |
| import java.util.List; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.IntsRef; |
| |
| /** |
| * A spell checker based on Hunspell dictionaries. The objects of this class are not thread-safe |
| * (but a single underlying Dictionary can be shared by multiple spell-checkers in different |
| * threads). Not all Hunspell features are supported yet. |
| */ |
| public class SpellChecker { |
| private final Dictionary dictionary; |
| private final BytesRef scratch = new BytesRef(); |
| private final Stemmer stemmer; |
| |
| public SpellChecker(Dictionary dictionary) { |
| this.dictionary = dictionary; |
| stemmer = new Stemmer(dictionary); |
| } |
| |
| /** @return whether the given word's spelling is considered correct according to Hunspell rules */ |
| public boolean spell(String word) { |
| if (word.isEmpty()) return true; |
| |
| if (dictionary.needsInputCleaning) { |
| word = dictionary.cleanInput(word, new StringBuilder()).toString(); |
| } |
| |
| if (isNumber(word)) { |
| return true; |
| } |
| |
| char[] wordChars = word.toCharArray(); |
| if (checkWord(wordChars, wordChars.length, false)) { |
| return true; |
| } |
| |
| WordCase wc = stemmer.caseOf(wordChars, wordChars.length); |
| if ((wc == WordCase.UPPER || wc == WordCase.TITLE) && checkCaseVariants(wordChars, wc)) { |
| return true; |
| } |
| |
| if (dictionary.breaks.isNotEmpty() |
| && !hasTooManyBreakOccurrences(word) |
| && !dictionary.isForbiddenWord(wordChars, word.length(), scratch)) { |
| return tryBreaks(word); |
| } |
| |
| return false; |
| } |
| |
| private boolean checkCaseVariants(char[] wordChars, WordCase wordCase) { |
| char[] caseVariant = wordChars; |
| if (wordCase == WordCase.UPPER) { |
| caseVariant = stemmer.caseFoldTitle(caseVariant, wordChars.length); |
| if (checkWord(caseVariant, wordChars.length, true)) { |
| return true; |
| } |
| char[] aposCase = Stemmer.capitalizeAfterApostrophe(caseVariant, wordChars.length); |
| if (aposCase != null && checkWord(aposCase, aposCase.length, true)) { |
| return true; |
| } |
| } |
| return checkWord(stemmer.caseFoldLower(caseVariant, wordChars.length), wordChars.length, true); |
| } |
| |
| private boolean checkWord(char[] wordChars, int length, boolean caseVariant) { |
| if (dictionary.isForbiddenWord(wordChars, length, scratch)) { |
| return false; |
| } |
| |
| if (!stemmer.doStem(wordChars, length, caseVariant).isEmpty()) { |
| return true; |
| } |
| |
| if (dictionary.hasCompounding()) { |
| return checkCompounds(wordChars, 0, length, new ArrayList<>()); |
| } |
| |
| return false; |
| } |
| |
| private boolean checkCompounds(char[] wordChars, int offset, int length, List<IntsRef> words) { |
| if (words.size() >= 100) return false; |
| |
| int limit = length - dictionary.compoundMin + 1; |
| for (int breakPos = dictionary.compoundMin; breakPos < limit; breakPos++) { |
| IntsRef forms = dictionary.lookupWord(wordChars, offset, breakPos); |
| if (forms != null) { |
| words.add(forms); |
| |
| if (dictionary.compoundRules != null |
| && dictionary.compoundRules.stream().anyMatch(r -> r.mayMatch(words, scratch))) { |
| if (checkLastCompoundPart(wordChars, offset + breakPos, length - breakPos, words)) { |
| return true; |
| } |
| |
| if (checkCompounds(wordChars, offset + breakPos, length - breakPos, words)) { |
| return true; |
| } |
| } |
| |
| words.remove(words.size() - 1); |
| } |
| } |
| |
| return false; |
| } |
| |
| private boolean checkLastCompoundPart( |
| char[] wordChars, int start, int length, List<IntsRef> words) { |
| IntsRef forms = dictionary.lookupWord(wordChars, start, length); |
| if (forms == null) return false; |
| |
| words.add(forms); |
| boolean result = |
| dictionary.compoundRules != null |
| && dictionary.compoundRules.stream().anyMatch(r -> r.fullyMatches(words, scratch)); |
| words.remove(words.size() - 1); |
| return result; |
| } |
| |
| private static boolean isNumber(String s) { |
| int i = 0; |
| while (i < s.length()) { |
| char c = s.charAt(i); |
| if (isDigit(c)) { |
| i++; |
| } else if (c == '.' || c == ',' || c == '-') { |
| if (i == 0 || i >= s.length() - 1 || !isDigit(s.charAt(i + 1))) { |
| return false; |
| } |
| i += 2; |
| } else { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| private static boolean isDigit(char c) { |
| return c >= '0' && c <= '9'; |
| } |
| |
| private boolean tryBreaks(String word) { |
| for (String br : dictionary.breaks.starting) { |
| if (word.length() > br.length() && word.startsWith(br)) { |
| if (spell(word.substring(br.length()))) { |
| return true; |
| } |
| } |
| } |
| |
| for (String br : dictionary.breaks.ending) { |
| if (word.length() > br.length() && word.endsWith(br)) { |
| if (spell(word.substring(0, word.length() - br.length()))) { |
| return true; |
| } |
| } |
| } |
| |
| for (String br : dictionary.breaks.middle) { |
| int pos = word.indexOf(br); |
| if (canBeBrokenAt(word, br, pos)) { |
| return true; |
| } |
| |
| // try to break at the second occurrence |
| // to recognize dictionary words with a word break |
| if (pos > 0 && canBeBrokenAt(word, br, word.indexOf(br, pos + 1))) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| private boolean hasTooManyBreakOccurrences(String word) { |
| int occurrences = 0; |
| for (String br : dictionary.breaks.middle) { |
| int pos = 0; |
| while ((pos = word.indexOf(br, pos)) >= 0) { |
| if (++occurrences >= 10) return true; |
| pos += br.length(); |
| } |
| } |
| return false; |
| } |
| |
| private boolean canBeBrokenAt(String word, String breakStr, int breakPos) { |
| return breakPos > 0 |
| && breakPos < word.length() - breakStr.length() |
| && spell(word.substring(0, breakPos)) |
| && spell(word.substring(breakPos + breakStr.length())); |
| } |
| } |