| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.hunspell; |
| |
| import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET; |
| import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_BEGIN; |
| import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_END; |
| import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE; |
| import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD; |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.Collections; |
| import java.util.LinkedHashSet; |
| import java.util.List; |
| import java.util.Locale; |
| import java.util.Set; |
| import java.util.stream.Collectors; |
| import org.apache.lucene.util.CharsRef; |
| import org.apache.lucene.util.IntsRef; |
| |
| /** |
| * A spell checker based on Hunspell dictionaries. This class can be used in place of native |
| * Hunspell for many languages for spell-checking and suggesting purposes. Note that not all |
| * languages are supported yet. For example: |
| * |
| * <ul> |
| * <li>Hungarian (as it doesn't only rely on dictionaries, but has some logic directly in the |
| * source code |
| * <li>Languages with Unicode characters outside of the Basic Multilingual Plane |
| * <li>PHONE affix file option for suggestions |
| * </ul> |
| * |
| * <p>The objects of this class are not thread-safe (but a single underlying Dictionary can be |
| * shared by multiple spell-checkers in different threads). |
| */ |
| public class Hunspell { |
| final Dictionary dictionary; |
| final Stemmer stemmer; |
| |
| public Hunspell(Dictionary dictionary) { |
| this.dictionary = dictionary; |
| stemmer = new Stemmer(dictionary); |
| } |
| |
| /** @return whether the given word's spelling is considered correct according to Hunspell rules */ |
| public boolean spell(String word) { |
| if (word.isEmpty()) return true; |
| |
| if (dictionary.needsInputCleaning) { |
| word = dictionary.cleanInput(word, new StringBuilder()).toString(); |
| } |
| |
| if (word.endsWith(".")) { |
| return spellWithTrailingDots(word); |
| } |
| |
| return spellClean(word); |
| } |
| |
| private boolean spellClean(String word) { |
| if (isNumber(word)) { |
| return true; |
| } |
| |
| char[] wordChars = word.toCharArray(); |
| Boolean simpleResult = checkSimpleWord(wordChars, wordChars.length, null); |
| if (simpleResult != null) { |
| return simpleResult; |
| } |
| |
| if (checkCompounds(wordChars, wordChars.length, null)) { |
| return true; |
| } |
| |
| WordCase wc = stemmer.caseOf(wordChars, wordChars.length); |
| if ((wc == WordCase.UPPER || wc == WordCase.TITLE)) { |
| Stemmer.CaseVariationProcessor variationProcessor = |
| (variant, varLength, originalCase) -> !checkWord(variant, varLength, originalCase); |
| if (!stemmer.varyCase(wordChars, wordChars.length, wc, variationProcessor)) { |
| return true; |
| } |
| } |
| |
| if (dictionary.breaks.isNotEmpty() && !hasTooManyBreakOccurrences(word)) { |
| return tryBreaks(word); |
| } |
| |
| return false; |
| } |
| |
| private boolean spellWithTrailingDots(String word) { |
| int length = word.length() - 1; |
| while (length > 0 && word.charAt(length - 1) == '.') { |
| length--; |
| } |
| return spellClean(word.substring(0, length)) || spellClean(word.substring(0, length + 1)); |
| } |
| |
| boolean checkWord(String word) { |
| return checkWord(word.toCharArray(), word.length(), null); |
| } |
| |
| Boolean checkSimpleWord(char[] wordChars, int length, WordCase originalCase) { |
| Root<CharsRef> entry = findStem(wordChars, 0, length, originalCase, SIMPLE_WORD); |
| if (entry != null) { |
| return !dictionary.hasFlag(entry.entryId, dictionary.forbiddenword); |
| } |
| |
| return null; |
| } |
| |
| private boolean checkWord(char[] wordChars, int length, WordCase originalCase) { |
| Boolean simpleResult = checkSimpleWord(wordChars, length, originalCase); |
| if (simpleResult != null) { |
| return simpleResult; |
| } |
| |
| return checkCompounds(wordChars, length, originalCase); |
| } |
| |
| private boolean checkCompounds(char[] wordChars, int length, WordCase originalCase) { |
| if (dictionary.compoundRules != null |
| && checkCompoundRules(wordChars, 0, length, new ArrayList<>())) { |
| return true; |
| } |
| |
| if (dictionary.compoundBegin != FLAG_UNSET || dictionary.compoundFlag != FLAG_UNSET) { |
| return checkCompounds(new CharsRef(wordChars, 0, length), originalCase, null); |
| } |
| |
| return false; |
| } |
| |
| private Root<CharsRef> findStem( |
| char[] wordChars, int offset, int length, WordCase originalCase, WordContext context) { |
| @SuppressWarnings({"rawtypes", "unchecked"}) |
| Root<CharsRef>[] result = new Root[1]; |
| stemmer.doStem( |
| wordChars, |
| offset, |
| length, |
| originalCase, |
| context, |
| (stem, formID, morphDataId) -> { |
| if (acceptsStem(formID)) { |
| result[0] = new Root<>(stem, formID); |
| } |
| return false; |
| }); |
| return result[0]; |
| } |
| |
| boolean acceptsStem(int formID) { |
| return true; |
| } |
| |
| private boolean checkCompounds(CharsRef word, WordCase originalCase, CompoundPart prev) { |
| if (prev != null && prev.index > dictionary.compoundMax - 2) return false; |
| |
| int limit = word.length - dictionary.compoundMin + 1; |
| for (int breakPos = dictionary.compoundMin; breakPos < limit; breakPos++) { |
| WordContext context = prev == null ? COMPOUND_BEGIN : COMPOUND_MIDDLE; |
| int breakOffset = word.offset + breakPos; |
| if (mayBreakIntoCompounds(word.chars, word.offset, word.length, breakOffset)) { |
| Root<CharsRef> stem = findStem(word.chars, word.offset, breakPos, originalCase, context); |
| if (stem == null |
| && dictionary.simplifiedTriple |
| && word.chars[breakOffset - 1] == word.chars[breakOffset]) { |
| stem = findStem(word.chars, word.offset, breakPos + 1, originalCase, context); |
| } |
| if (stem != null |
| && !dictionary.hasFlag(stem.entryId, dictionary.forbiddenword) |
| && (prev == null || prev.mayCompound(stem, breakPos, originalCase))) { |
| CompoundPart part = new CompoundPart(prev, word, breakPos, stem, null); |
| if (checkCompoundsAfter(originalCase, part)) { |
| return true; |
| } |
| } |
| } |
| |
| if (checkCompoundPatternReplacements(word, breakPos, originalCase, prev)) { |
| return true; |
| } |
| } |
| |
| return false; |
| } |
| |
| private boolean checkCompoundPatternReplacements( |
| CharsRef word, int pos, WordCase originalCase, CompoundPart prev) { |
| for (CheckCompoundPattern pattern : dictionary.checkCompoundPatterns) { |
| CharsRef expanded = pattern.expandReplacement(word, pos); |
| if (expanded != null) { |
| WordContext context = prev == null ? COMPOUND_BEGIN : COMPOUND_MIDDLE; |
| int breakPos = pos + pattern.endLength(); |
| Root<CharsRef> stem = |
| findStem(expanded.chars, expanded.offset, breakPos, originalCase, context); |
| if (stem != null) { |
| CompoundPart part = new CompoundPart(prev, expanded, breakPos, stem, pattern); |
| if (checkCompoundsAfter(originalCase, part)) { |
| return true; |
| } |
| } |
| } |
| } |
| return false; |
| } |
| |
| private boolean checkCompoundsAfter(WordCase originalCase, CompoundPart prev) { |
| CharsRef word = prev.tail; |
| int breakPos = prev.length; |
| int remainingLength = word.length - breakPos; |
| int breakOffset = word.offset + breakPos; |
| Root<CharsRef> tailStem = |
| findStem(word.chars, breakOffset, remainingLength, originalCase, COMPOUND_END); |
| if (tailStem != null |
| && !dictionary.hasFlag(tailStem.entryId, dictionary.forbiddenword) |
| && !(dictionary.checkCompoundDup && equalsIgnoreCase(prev.stem, tailStem.word)) |
| && !hasForceUCaseProblem(word.chars, breakOffset, remainingLength, originalCase) |
| && prev.mayCompound(tailStem, remainingLength, originalCase)) { |
| return true; |
| } |
| |
| CharsRef tail = new CharsRef(word.chars, breakOffset, remainingLength); |
| return checkCompounds(tail, originalCase, prev); |
| } |
| |
| private boolean hasForceUCaseProblem( |
| char[] chars, int offset, int length, WordCase originalCase) { |
| if (dictionary.forceUCase == FLAG_UNSET) return false; |
| if (originalCase == WordCase.TITLE || originalCase == WordCase.UPPER) return false; |
| |
| IntsRef forms = dictionary.lookupWord(chars, offset, length); |
| return forms != null && dictionary.hasFlag(forms, dictionary.forceUCase); |
| } |
| |
| private boolean equalsIgnoreCase(CharSequence cr1, CharSequence cr2) { |
| return cr1.toString().equalsIgnoreCase(cr2.toString()); |
| } |
| |
| /** |
| * Find all roots that could result in the given word after case conversion and adding affixes. |
| * This corresponds to the original {@code hunspell -s} (stemming) functionality. |
| * |
| * <p>Some affix rules are relaxed in this stemming process: e.g. explicitly forbidden words are |
| * still returned. Some of the returned roots may be synthetic and not directly occur in the *.dic |
| * file (but differ from some existing entries in case). No roots are returned for compound words. |
| * |
| * <p>The returned roots may be used to retrieve morphological data via {@link |
| * Dictionary#lookupEntries}. |
| */ |
| public List<String> getRoots(String word) { |
| return stemmer.stem(word).stream() |
| .map(CharsRef::toString) |
| .distinct() |
| .collect(Collectors.toList()); |
| } |
| |
| private class CompoundPart { |
| final CompoundPart prev; |
| final int index, length; |
| final CharsRef tail, stem; |
| final CheckCompoundPattern enablingPattern; |
| |
| CompoundPart( |
| CompoundPart prev, |
| CharsRef tail, |
| int length, |
| Root<CharsRef> stem, |
| CheckCompoundPattern enabler) { |
| this.prev = prev; |
| this.tail = tail; |
| this.length = length; |
| this.stem = stem.word; |
| index = prev == null ? 1 : prev.index + 1; |
| enablingPattern = enabler; |
| } |
| |
| @Override |
| public String toString() { |
| return (prev == null ? "" : prev + "+") + tail.subSequence(0, length); |
| } |
| |
| boolean mayCompound(Root<CharsRef> nextStem, int nextPartLength, WordCase originalCase) { |
| boolean patternsOk = |
| enablingPattern != null |
| ? enablingPattern.prohibitsCompounding(tail, length, stem, nextStem.word) |
| : dictionary.checkCompoundPatterns.stream() |
| .noneMatch(p -> p.prohibitsCompounding(tail, length, stem, nextStem.word)); |
| if (!patternsOk) { |
| return false; |
| } |
| |
| if (dictionary.checkCompoundRep |
| && isMisspelledSimpleWord(length + nextPartLength, originalCase)) { |
| return false; |
| } |
| |
| String spaceSeparated = |
| new String(tail.chars, tail.offset, length) |
| + " " |
| + new String(tail.chars, tail.offset + length, nextPartLength); |
| return !checkWord(spaceSeparated); |
| } |
| |
| private boolean isMisspelledSimpleWord(int length, WordCase originalCase) { |
| String word = new String(tail.chars, tail.offset, length); |
| for (RepEntry entry : dictionary.repTable) { |
| if (entry.isMiddle()) { |
| for (String sug : entry.substitute(word)) { |
| if (findStem(sug.toCharArray(), 0, sug.length(), originalCase, SIMPLE_WORD) != null) { |
| return true; |
| } |
| } |
| } |
| } |
| return false; |
| } |
| } |
| |
| private boolean mayBreakIntoCompounds(char[] chars, int offset, int length, int breakPos) { |
| if (dictionary.checkCompoundCase) { |
| if (Character.isUpperCase(chars[breakPos - 1]) || Character.isUpperCase(chars[breakPos])) { |
| return false; |
| } |
| } |
| if (dictionary.checkCompoundTriple && chars[breakPos - 1] == chars[breakPos]) { |
| //noinspection RedundantIfStatement |
| if (breakPos > offset + 1 && chars[breakPos - 2] == chars[breakPos - 1] |
| || breakPos < length - 1 && chars[breakPos] == chars[breakPos + 1]) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| private boolean checkCompoundRules( |
| char[] wordChars, int offset, int length, List<IntsRef> words) { |
| if (words.size() >= 100) return false; |
| |
| int limit = length - dictionary.compoundMin + 1; |
| for (int breakPos = dictionary.compoundMin; breakPos < limit; breakPos++) { |
| IntsRef forms = dictionary.lookupWord(wordChars, offset, breakPos); |
| if (forms != null) { |
| words.add(forms); |
| |
| if (dictionary.compoundRules != null |
| && dictionary.compoundRules.stream().anyMatch(r -> r.mayMatch(words))) { |
| if (checkLastCompoundPart(wordChars, offset + breakPos, length - breakPos, words)) { |
| return true; |
| } |
| |
| if (checkCompoundRules(wordChars, offset + breakPos, length - breakPos, words)) { |
| return true; |
| } |
| } |
| |
| words.remove(words.size() - 1); |
| } |
| } |
| |
| return false; |
| } |
| |
| private boolean checkLastCompoundPart( |
| char[] wordChars, int start, int length, List<IntsRef> words) { |
| IntsRef forms = dictionary.lookupWord(wordChars, start, length); |
| if (forms == null) return false; |
| |
| words.add(forms); |
| boolean result = dictionary.compoundRules.stream().anyMatch(r -> r.fullyMatches(words)); |
| words.remove(words.size() - 1); |
| return result; |
| } |
| |
| private static boolean isNumber(String s) { |
| int i = 0; |
| while (i < s.length()) { |
| char c = s.charAt(i); |
| if (isDigit(c)) { |
| i++; |
| } else if (c == '.' || c == ',' || c == '-') { |
| if (i == 0 || i >= s.length() - 1 || !isDigit(s.charAt(i + 1))) { |
| return false; |
| } |
| i += 2; |
| } else { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| private static boolean isDigit(char c) { |
| return c >= '0' && c <= '9'; |
| } |
| |
| private boolean tryBreaks(String word) { |
| for (String br : dictionary.breaks.starting) { |
| if (word.length() > br.length() && word.startsWith(br)) { |
| if (spell(word.substring(br.length()))) { |
| return true; |
| } |
| } |
| } |
| |
| for (String br : dictionary.breaks.ending) { |
| if (word.length() > br.length() && word.endsWith(br)) { |
| if (spell(word.substring(0, word.length() - br.length()))) { |
| return true; |
| } |
| } |
| } |
| |
| for (String br : dictionary.breaks.middle) { |
| int pos = word.indexOf(br); |
| if (canBeBrokenAt(word, br, pos)) { |
| return true; |
| } |
| |
| // try to break at the second occurrence |
| // to recognize dictionary words with a word break |
| if (pos > 0 && canBeBrokenAt(word, br, word.indexOf(br, pos + 1))) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| private boolean hasTooManyBreakOccurrences(String word) { |
| int occurrences = 0; |
| for (String br : dictionary.breaks.middle) { |
| int pos = 0; |
| while ((pos = word.indexOf(br, pos)) >= 0) { |
| if (++occurrences >= 10) return true; |
| pos += br.length(); |
| } |
| } |
| return false; |
| } |
| |
| private boolean canBeBrokenAt(String word, String breakStr, int breakPos) { |
| return breakPos > 0 |
| && breakPos < word.length() - breakStr.length() |
| && spell(word.substring(0, breakPos)) |
| && spell(word.substring(breakPos + breakStr.length())); |
| } |
| |
| public List<String> suggest(String word) { |
| if (word.length() >= 100) return Collections.emptyList(); |
| |
| if (dictionary.needsInputCleaning) { |
| word = dictionary.cleanInput(word, new StringBuilder()).toString(); |
| } |
| |
| WordCase wordCase = WordCase.caseOf(word); |
| if (dictionary.forceUCase != FLAG_UNSET && wordCase == WordCase.LOWER) { |
| String title = dictionary.toTitleCase(word); |
| if (spell(title)) { |
| return Collections.singletonList(title); |
| } |
| } |
| |
| Hunspell suggestionSpeller = |
| new Hunspell(dictionary) { |
| @Override |
| boolean acceptsStem(int formID) { |
| return !dictionary.hasFlag(formID, dictionary.noSuggest) |
| && !dictionary.hasFlag(formID, dictionary.subStandard); |
| } |
| }; |
| ModifyingSuggester modifier = new ModifyingSuggester(suggestionSpeller); |
| Set<String> suggestions = modifier.suggest(word, wordCase); |
| |
| if (!modifier.hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) { |
| suggestions.addAll( |
| new GeneratingSuggester(suggestionSpeller) |
| .suggest(dictionary.toLowerCase(word), wordCase, suggestions)); |
| } |
| |
| if (word.contains("-") && suggestions.stream().noneMatch(s -> s.contains("-"))) { |
| suggestions.addAll(modifyChunksBetweenDashes(word)); |
| } |
| |
| Set<String> result = new LinkedHashSet<>(); |
| for (String candidate : suggestions) { |
| result.add(adjustSuggestionCase(candidate, wordCase, word)); |
| if (wordCase == WordCase.UPPER && dictionary.checkSharpS && candidate.contains("ß")) { |
| result.add(candidate); |
| } |
| } |
| return result.stream().map(this::cleanOutput).collect(Collectors.toList()); |
| } |
| |
| private String adjustSuggestionCase(String candidate, WordCase originalCase, String original) { |
| if (originalCase == WordCase.UPPER) { |
| String upper = candidate.toUpperCase(Locale.ROOT); |
| if (upper.contains(" ") || spell(upper)) { |
| return upper; |
| } |
| } |
| if (Character.isUpperCase(original.charAt(0))) { |
| String title = Character.toUpperCase(candidate.charAt(0)) + candidate.substring(1); |
| if (title.contains(" ") || spell(title)) { |
| return title; |
| } |
| } |
| return candidate; |
| } |
| |
| private List<String> modifyChunksBetweenDashes(String word) { |
| List<String> result = new ArrayList<>(); |
| int chunkStart = 0; |
| while (chunkStart < word.length()) { |
| int chunkEnd = word.indexOf('-', chunkStart); |
| if (chunkEnd < 0) { |
| chunkEnd = word.length(); |
| } |
| |
| if (chunkEnd > chunkStart) { |
| String chunk = word.substring(chunkStart, chunkEnd); |
| if (!spell(chunk)) { |
| for (String chunkSug : suggest(chunk)) { |
| String replaced = word.substring(0, chunkStart) + chunkSug + word.substring(chunkEnd); |
| if (spell(replaced)) { |
| result.add(replaced); |
| } |
| } |
| } |
| } |
| |
| chunkStart = chunkEnd + 1; |
| } |
| return result; |
| } |
| |
| private String cleanOutput(String s) { |
| if (!dictionary.needsOutputCleaning) return s; |
| |
| try { |
| StringBuilder sb = new StringBuilder(s); |
| Dictionary.applyMappings(dictionary.oconv, sb); |
| return sb.toString(); |
| } catch (IOException bogus) { |
| throw new RuntimeException(bogus); |
| } |
| } |
| } |