blob: 04bd55840ce81f3d428d3befb3987dd1635b0826 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_BEGIN;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_END;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE;
import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef;
/**
* A spell checker based on Hunspell dictionaries. This class can be used in place of native
* Hunspell for many languages for spell-checking and suggesting purposes. Note that not all
* languages are supported yet. For example:
*
* <ul>
* <li>Hungarian (as it doesn't only rely on dictionaries, but has some logic directly in the
* source code
* <li>Languages with Unicode characters outside of the Basic Multilingual Plane
* <li>PHONE affix file option for suggestions
* </ul>
*
* <p>The objects of this class are not thread-safe (but a single underlying Dictionary can be
* shared by multiple spell-checkers in different threads).
*/
public class Hunspell {
final Dictionary dictionary;
final Stemmer stemmer;
public Hunspell(Dictionary dictionary) {
this.dictionary = dictionary;
stemmer = new Stemmer(dictionary);
}
/** @return whether the given word's spelling is considered correct according to Hunspell rules */
public boolean spell(String word) {
if (word.isEmpty()) return true;
if (dictionary.needsInputCleaning) {
word = dictionary.cleanInput(word, new StringBuilder()).toString();
}
if (word.endsWith(".")) {
return spellWithTrailingDots(word);
}
return spellClean(word);
}
private boolean spellClean(String word) {
if (isNumber(word)) {
return true;
}
char[] wordChars = word.toCharArray();
Boolean simpleResult = checkSimpleWord(wordChars, wordChars.length, null);
if (simpleResult != null) {
return simpleResult;
}
if (checkCompounds(wordChars, wordChars.length, null)) {
return true;
}
WordCase wc = stemmer.caseOf(wordChars, wordChars.length);
if ((wc == WordCase.UPPER || wc == WordCase.TITLE)) {
Stemmer.CaseVariationProcessor variationProcessor =
(variant, varLength, originalCase) -> !checkWord(variant, varLength, originalCase);
if (!stemmer.varyCase(wordChars, wordChars.length, wc, variationProcessor)) {
return true;
}
}
if (dictionary.breaks.isNotEmpty() && !hasTooManyBreakOccurrences(word)) {
return tryBreaks(word);
}
return false;
}
private boolean spellWithTrailingDots(String word) {
int length = word.length() - 1;
while (length > 0 && word.charAt(length - 1) == '.') {
length--;
}
return spellClean(word.substring(0, length)) || spellClean(word.substring(0, length + 1));
}
boolean checkWord(String word) {
return checkWord(word.toCharArray(), word.length(), null);
}
Boolean checkSimpleWord(char[] wordChars, int length, WordCase originalCase) {
Root<CharsRef> entry = findStem(wordChars, 0, length, originalCase, SIMPLE_WORD);
if (entry != null) {
return !dictionary.hasFlag(entry.entryId, dictionary.forbiddenword);
}
return null;
}
private boolean checkWord(char[] wordChars, int length, WordCase originalCase) {
Boolean simpleResult = checkSimpleWord(wordChars, length, originalCase);
if (simpleResult != null) {
return simpleResult;
}
return checkCompounds(wordChars, length, originalCase);
}
private boolean checkCompounds(char[] wordChars, int length, WordCase originalCase) {
if (dictionary.compoundRules != null
&& checkCompoundRules(wordChars, 0, length, new ArrayList<>())) {
return true;
}
if (dictionary.compoundBegin != FLAG_UNSET || dictionary.compoundFlag != FLAG_UNSET) {
return checkCompounds(new CharsRef(wordChars, 0, length), originalCase, null);
}
return false;
}
private Root<CharsRef> findStem(
char[] wordChars, int offset, int length, WordCase originalCase, WordContext context) {
@SuppressWarnings({"rawtypes", "unchecked"})
Root<CharsRef>[] result = new Root[1];
stemmer.doStem(
wordChars,
offset,
length,
originalCase,
context,
(stem, formID, morphDataId) -> {
if (acceptsStem(formID)) {
result[0] = new Root<>(stem, formID);
}
return false;
});
return result[0];
}
boolean acceptsStem(int formID) {
return true;
}
private boolean checkCompounds(CharsRef word, WordCase originalCase, CompoundPart prev) {
if (prev != null && prev.index > dictionary.compoundMax - 2) return false;
int limit = word.length - dictionary.compoundMin + 1;
for (int breakPos = dictionary.compoundMin; breakPos < limit; breakPos++) {
WordContext context = prev == null ? COMPOUND_BEGIN : COMPOUND_MIDDLE;
int breakOffset = word.offset + breakPos;
if (mayBreakIntoCompounds(word.chars, word.offset, word.length, breakOffset)) {
Root<CharsRef> stem = findStem(word.chars, word.offset, breakPos, originalCase, context);
if (stem == null
&& dictionary.simplifiedTriple
&& word.chars[breakOffset - 1] == word.chars[breakOffset]) {
stem = findStem(word.chars, word.offset, breakPos + 1, originalCase, context);
}
if (stem != null
&& !dictionary.hasFlag(stem.entryId, dictionary.forbiddenword)
&& (prev == null || prev.mayCompound(stem, breakPos, originalCase))) {
CompoundPart part = new CompoundPart(prev, word, breakPos, stem, null);
if (checkCompoundsAfter(originalCase, part)) {
return true;
}
}
}
if (checkCompoundPatternReplacements(word, breakPos, originalCase, prev)) {
return true;
}
}
return false;
}
private boolean checkCompoundPatternReplacements(
CharsRef word, int pos, WordCase originalCase, CompoundPart prev) {
for (CheckCompoundPattern pattern : dictionary.checkCompoundPatterns) {
CharsRef expanded = pattern.expandReplacement(word, pos);
if (expanded != null) {
WordContext context = prev == null ? COMPOUND_BEGIN : COMPOUND_MIDDLE;
int breakPos = pos + pattern.endLength();
Root<CharsRef> stem =
findStem(expanded.chars, expanded.offset, breakPos, originalCase, context);
if (stem != null) {
CompoundPart part = new CompoundPart(prev, expanded, breakPos, stem, pattern);
if (checkCompoundsAfter(originalCase, part)) {
return true;
}
}
}
}
return false;
}
private boolean checkCompoundsAfter(WordCase originalCase, CompoundPart prev) {
CharsRef word = prev.tail;
int breakPos = prev.length;
int remainingLength = word.length - breakPos;
int breakOffset = word.offset + breakPos;
Root<CharsRef> tailStem =
findStem(word.chars, breakOffset, remainingLength, originalCase, COMPOUND_END);
if (tailStem != null
&& !dictionary.hasFlag(tailStem.entryId, dictionary.forbiddenword)
&& !(dictionary.checkCompoundDup && equalsIgnoreCase(prev.stem, tailStem.word))
&& !hasForceUCaseProblem(word.chars, breakOffset, remainingLength, originalCase)
&& prev.mayCompound(tailStem, remainingLength, originalCase)) {
return true;
}
CharsRef tail = new CharsRef(word.chars, breakOffset, remainingLength);
return checkCompounds(tail, originalCase, prev);
}
private boolean hasForceUCaseProblem(
char[] chars, int offset, int length, WordCase originalCase) {
if (dictionary.forceUCase == FLAG_UNSET) return false;
if (originalCase == WordCase.TITLE || originalCase == WordCase.UPPER) return false;
IntsRef forms = dictionary.lookupWord(chars, offset, length);
return forms != null && dictionary.hasFlag(forms, dictionary.forceUCase);
}
private boolean equalsIgnoreCase(CharSequence cr1, CharSequence cr2) {
return cr1.toString().equalsIgnoreCase(cr2.toString());
}
/**
* Find all roots that could result in the given word after case conversion and adding affixes.
* This corresponds to the original {@code hunspell -s} (stemming) functionality.
*
* <p>Some affix rules are relaxed in this stemming process: e.g. explicitly forbidden words are
* still returned. Some of the returned roots may be synthetic and not directly occur in the *.dic
* file (but differ from some existing entries in case). No roots are returned for compound words.
*
* <p>The returned roots may be used to retrieve morphological data via {@link
* Dictionary#lookupEntries}.
*/
public List<String> getRoots(String word) {
return stemmer.stem(word).stream()
.map(CharsRef::toString)
.distinct()
.collect(Collectors.toList());
}
private class CompoundPart {
final CompoundPart prev;
final int index, length;
final CharsRef tail, stem;
final CheckCompoundPattern enablingPattern;
CompoundPart(
CompoundPart prev,
CharsRef tail,
int length,
Root<CharsRef> stem,
CheckCompoundPattern enabler) {
this.prev = prev;
this.tail = tail;
this.length = length;
this.stem = stem.word;
index = prev == null ? 1 : prev.index + 1;
enablingPattern = enabler;
}
@Override
public String toString() {
return (prev == null ? "" : prev + "+") + tail.subSequence(0, length);
}
boolean mayCompound(Root<CharsRef> nextStem, int nextPartLength, WordCase originalCase) {
boolean patternsOk =
enablingPattern != null
? enablingPattern.prohibitsCompounding(tail, length, stem, nextStem.word)
: dictionary.checkCompoundPatterns.stream()
.noneMatch(p -> p.prohibitsCompounding(tail, length, stem, nextStem.word));
if (!patternsOk) {
return false;
}
if (dictionary.checkCompoundRep
&& isMisspelledSimpleWord(length + nextPartLength, originalCase)) {
return false;
}
String spaceSeparated =
new String(tail.chars, tail.offset, length)
+ " "
+ new String(tail.chars, tail.offset + length, nextPartLength);
return !checkWord(spaceSeparated);
}
private boolean isMisspelledSimpleWord(int length, WordCase originalCase) {
String word = new String(tail.chars, tail.offset, length);
for (RepEntry entry : dictionary.repTable) {
if (entry.isMiddle()) {
for (String sug : entry.substitute(word)) {
if (findStem(sug.toCharArray(), 0, sug.length(), originalCase, SIMPLE_WORD) != null) {
return true;
}
}
}
}
return false;
}
}
private boolean mayBreakIntoCompounds(char[] chars, int offset, int length, int breakPos) {
if (dictionary.checkCompoundCase) {
if (Character.isUpperCase(chars[breakPos - 1]) || Character.isUpperCase(chars[breakPos])) {
return false;
}
}
if (dictionary.checkCompoundTriple && chars[breakPos - 1] == chars[breakPos]) {
//noinspection RedundantIfStatement
if (breakPos > offset + 1 && chars[breakPos - 2] == chars[breakPos - 1]
|| breakPos < length - 1 && chars[breakPos] == chars[breakPos + 1]) {
return false;
}
}
return true;
}
private boolean checkCompoundRules(
char[] wordChars, int offset, int length, List<IntsRef> words) {
if (words.size() >= 100) return false;
int limit = length - dictionary.compoundMin + 1;
for (int breakPos = dictionary.compoundMin; breakPos < limit; breakPos++) {
IntsRef forms = dictionary.lookupWord(wordChars, offset, breakPos);
if (forms != null) {
words.add(forms);
if (dictionary.compoundRules != null
&& dictionary.compoundRules.stream().anyMatch(r -> r.mayMatch(words))) {
if (checkLastCompoundPart(wordChars, offset + breakPos, length - breakPos, words)) {
return true;
}
if (checkCompoundRules(wordChars, offset + breakPos, length - breakPos, words)) {
return true;
}
}
words.remove(words.size() - 1);
}
}
return false;
}
private boolean checkLastCompoundPart(
char[] wordChars, int start, int length, List<IntsRef> words) {
IntsRef forms = dictionary.lookupWord(wordChars, start, length);
if (forms == null) return false;
words.add(forms);
boolean result = dictionary.compoundRules.stream().anyMatch(r -> r.fullyMatches(words));
words.remove(words.size() - 1);
return result;
}
private static boolean isNumber(String s) {
int i = 0;
while (i < s.length()) {
char c = s.charAt(i);
if (isDigit(c)) {
i++;
} else if (c == '.' || c == ',' || c == '-') {
if (i == 0 || i >= s.length() - 1 || !isDigit(s.charAt(i + 1))) {
return false;
}
i += 2;
} else {
return false;
}
}
return true;
}
private static boolean isDigit(char c) {
return c >= '0' && c <= '9';
}
private boolean tryBreaks(String word) {
for (String br : dictionary.breaks.starting) {
if (word.length() > br.length() && word.startsWith(br)) {
if (spell(word.substring(br.length()))) {
return true;
}
}
}
for (String br : dictionary.breaks.ending) {
if (word.length() > br.length() && word.endsWith(br)) {
if (spell(word.substring(0, word.length() - br.length()))) {
return true;
}
}
}
for (String br : dictionary.breaks.middle) {
int pos = word.indexOf(br);
if (canBeBrokenAt(word, br, pos)) {
return true;
}
// try to break at the second occurrence
// to recognize dictionary words with a word break
if (pos > 0 && canBeBrokenAt(word, br, word.indexOf(br, pos + 1))) {
return true;
}
}
return false;
}
private boolean hasTooManyBreakOccurrences(String word) {
int occurrences = 0;
for (String br : dictionary.breaks.middle) {
int pos = 0;
while ((pos = word.indexOf(br, pos)) >= 0) {
if (++occurrences >= 10) return true;
pos += br.length();
}
}
return false;
}
private boolean canBeBrokenAt(String word, String breakStr, int breakPos) {
return breakPos > 0
&& breakPos < word.length() - breakStr.length()
&& spell(word.substring(0, breakPos))
&& spell(word.substring(breakPos + breakStr.length()));
}
public List<String> suggest(String word) {
if (word.length() >= 100) return Collections.emptyList();
if (dictionary.needsInputCleaning) {
word = dictionary.cleanInput(word, new StringBuilder()).toString();
}
WordCase wordCase = WordCase.caseOf(word);
if (dictionary.forceUCase != FLAG_UNSET && wordCase == WordCase.LOWER) {
String title = dictionary.toTitleCase(word);
if (spell(title)) {
return Collections.singletonList(title);
}
}
Hunspell suggestionSpeller =
new Hunspell(dictionary) {
@Override
boolean acceptsStem(int formID) {
return !dictionary.hasFlag(formID, dictionary.noSuggest)
&& !dictionary.hasFlag(formID, dictionary.subStandard);
}
};
ModifyingSuggester modifier = new ModifyingSuggester(suggestionSpeller);
Set<String> suggestions = modifier.suggest(word, wordCase);
if (!modifier.hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) {
suggestions.addAll(
new GeneratingSuggester(suggestionSpeller)
.suggest(dictionary.toLowerCase(word), wordCase, suggestions));
}
if (word.contains("-") && suggestions.stream().noneMatch(s -> s.contains("-"))) {
suggestions.addAll(modifyChunksBetweenDashes(word));
}
Set<String> result = new LinkedHashSet<>();
for (String candidate : suggestions) {
result.add(adjustSuggestionCase(candidate, wordCase, word));
if (wordCase == WordCase.UPPER && dictionary.checkSharpS && candidate.contains("ß")) {
result.add(candidate);
}
}
return result.stream().map(this::cleanOutput).collect(Collectors.toList());
}
private String adjustSuggestionCase(String candidate, WordCase originalCase, String original) {
if (originalCase == WordCase.UPPER) {
String upper = candidate.toUpperCase(Locale.ROOT);
if (upper.contains(" ") || spell(upper)) {
return upper;
}
}
if (Character.isUpperCase(original.charAt(0))) {
String title = Character.toUpperCase(candidate.charAt(0)) + candidate.substring(1);
if (title.contains(" ") || spell(title)) {
return title;
}
}
return candidate;
}
private List<String> modifyChunksBetweenDashes(String word) {
List<String> result = new ArrayList<>();
int chunkStart = 0;
while (chunkStart < word.length()) {
int chunkEnd = word.indexOf('-', chunkStart);
if (chunkEnd < 0) {
chunkEnd = word.length();
}
if (chunkEnd > chunkStart) {
String chunk = word.substring(chunkStart, chunkEnd);
if (!spell(chunk)) {
for (String chunkSug : suggest(chunk)) {
String replaced = word.substring(0, chunkStart) + chunkSug + word.substring(chunkEnd);
if (spell(replaced)) {
result.add(replaced);
}
}
}
}
chunkStart = chunkEnd + 1;
}
return result;
}
private String cleanOutput(String s) {
if (!dictionary.needsOutputCleaning) return s;
try {
StringBuilder sb = new StringBuilder(s);
Dictionary.applyMappings(dictionary.oconv, sb);
return sb.toString();
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
}
}