LUCENE-9808: Hunspell suggestions: consider space/dash-separated words for each case variation (#2425)
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
index d69fc39..4eadcd1 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
@@ -565,9 +565,9 @@
}
};
ModifyingSuggester modifier = new ModifyingSuggester(suggestionSpeller, suggestions);
- modifier.suggest(word, wordCase);
+ boolean hasGoodSuggestions = modifier.suggest(word, wordCase);
- if (!modifier.hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) {
+ if (!hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) {
suggestions.addAll(
new GeneratingSuggester(suggestionSpeller)
.suggest(dictionary.toLowerCase(word), wordCase, suggestions));
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
index 286d1ee..5e29274 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
@@ -21,7 +21,6 @@
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
-import java.util.stream.Collectors;
/** A class that modifies the given misspelled word in various ways to get correct suggestions */
class ModifyingSuggester {
@@ -29,7 +28,6 @@
private final LinkedHashSet<String> result;
private final char[] tryChars;
private final Hunspell speller;
- boolean hasGoodSuggestions;
ModifyingSuggester(Hunspell speller, LinkedHashSet<String> result) {
this.speller = speller;
@@ -37,19 +35,20 @@
this.result = result;
}
- void suggest(String word, WordCase wordCase) {
+ /** @return whether any of the added suggestions are considered "good" */
+ boolean suggest(String word, WordCase wordCase) {
String low = wordCase != WordCase.LOWER ? speller.dictionary.toLowerCase(word) : word;
if (wordCase == WordCase.UPPER || wordCase == WordCase.MIXED) {
trySuggestion(low);
}
- tryVariationsOf(word);
+ boolean hasGoodSuggestions = tryVariationsOf(word);
if (wordCase == WordCase.TITLE) {
- tryVariationsOf(low);
+ hasGoodSuggestions |= tryVariationsOf(low);
} else if (wordCase == WordCase.UPPER) {
- tryVariationsOf(low);
- tryVariationsOf(speller.dictionary.toTitleCase(word));
+ hasGoodSuggestions |= tryVariationsOf(low);
+ hasGoodSuggestions |= tryVariationsOf(speller.dictionary.toTitleCase(word));
} else if (wordCase == WordCase.MIXED) {
int dot = word.indexOf('.');
if (dot > 0
@@ -60,20 +59,26 @@
boolean capitalized = Character.isUpperCase(word.charAt(0));
if (capitalized) {
- tryVariationsOf(speller.dictionary.caseFold(word.charAt(0)) + word.substring(1));
+ hasGoodSuggestions |=
+ tryVariationsOf(speller.dictionary.caseFold(word.charAt(0)) + word.substring(1));
}
- tryVariationsOf(low);
+ hasGoodSuggestions |= tryVariationsOf(low);
if (capitalized) {
- tryVariationsOf(speller.dictionary.toTitleCase(low));
+ hasGoodSuggestions |= tryVariationsOf(speller.dictionary.toTitleCase(low));
}
- List<String> adjusted =
- result.stream().map(s -> capitalizeAfterSpace(word, s)).collect(Collectors.toList());
+ List<String> adjusted = new ArrayList<>();
+ for (String candidate : result) {
+ String s = capitalizeAfterSpace(word, candidate);
+ adjusted.add(s.equals(candidate) ? adjusted.size() : 0, s);
+ }
+
result.clear();
result.addAll(adjusted);
}
+ return hasGoodSuggestions;
}
// aNew -> "a New" (instead of "a new")
@@ -89,8 +94,8 @@
return candidate;
}
- private void tryVariationsOf(String word) {
- hasGoodSuggestions |= trySuggestion(word.toUpperCase(Locale.ROOT));
+ private boolean tryVariationsOf(String word) {
+ boolean hasGoodSuggestions = trySuggestion(word.toUpperCase(Locale.ROOT));
hasGoodSuggestions |= tryRep(word);
if (!speller.dictionary.mapTable.isEmpty()) {
@@ -120,6 +125,7 @@
if (!hasGoodSuggestions && speller.dictionary.enableSplitSuggestions) {
trySplitting(word);
}
+ return hasGoodSuggestions;
}
private boolean tryRep(String word) {
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.aff
index bb7c8803..ccd79f0 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.aff
@@ -23,3 +23,5 @@
WORDCHARS .-
FORBIDDENWORD ?
+REP 1
+REP s ti
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.dic
index 86311a9..7378051 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.dic
@@ -10,3 +10,6 @@
scot
free
scot-free
+Sm
+es
+times
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.sug b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.sug
index 65b7537..7032c06 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.sug
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.sug
@@ -1,3 +1,4 @@
a lot
in spite
scot-free
+Sm Es, Times, Sm-es
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.wrong
index 4cfc569..78a4d63 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.wrong
@@ -1,3 +1,4 @@
alot
inspite
scotfree
+SMEs
\ No newline at end of file