LUCENE-9808: Hunspell suggestions: consider space/dash-separated words for each case variation (#2425)

commit: e1ff4c13541f279799d03d65cf90a50ec5cea8ea [log] [tgz]
author: Peter Gromov <peter@jetbrains.com> Wed Feb 24 17:43:37 2021 +0100
committer: GitHub <noreply@github.com> Wed Feb 24 11:43:37 2021 -0500
tree: 17de65ca9985f7ef3f68f06b5c780df3125f056e
parent: 9d6fd98810aff7311e124be98f8c8f8cc66520be [diff]
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
index d69fc39..4eadcd1 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java

@@ -565,9 +565,9 @@
           }
         };
     ModifyingSuggester modifier = new ModifyingSuggester(suggestionSpeller, suggestions);
-    modifier.suggest(word, wordCase);
+    boolean hasGoodSuggestions = modifier.suggest(word, wordCase);
 
-    if (!modifier.hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) {
+    if (!hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) {
       suggestions.addAll(
           new GeneratingSuggester(suggestionSpeller)
               .suggest(dictionary.toLowerCase(word), wordCase, suggestions));

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
index 286d1ee..5e29274 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java

@@ -21,7 +21,6 @@
 import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Locale;
-import java.util.stream.Collectors;
 
 /** A class that modifies the given misspelled word in various ways to get correct suggestions */
 class ModifyingSuggester {
@@ -29,7 +28,6 @@
   private final LinkedHashSet<String> result;
   private final char[] tryChars;
   private final Hunspell speller;
-  boolean hasGoodSuggestions;
 
   ModifyingSuggester(Hunspell speller, LinkedHashSet<String> result) {
     this.speller = speller;
@@ -37,19 +35,20 @@
     this.result = result;
   }
 
-  void suggest(String word, WordCase wordCase) {
+  /** @return whether any of the added suggestions are considered "good" */
+  boolean suggest(String word, WordCase wordCase) {
     String low = wordCase != WordCase.LOWER ? speller.dictionary.toLowerCase(word) : word;
     if (wordCase == WordCase.UPPER || wordCase == WordCase.MIXED) {
       trySuggestion(low);
     }
 
-    tryVariationsOf(word);
+    boolean hasGoodSuggestions = tryVariationsOf(word);
 
     if (wordCase == WordCase.TITLE) {
-      tryVariationsOf(low);
+      hasGoodSuggestions |= tryVariationsOf(low);
     } else if (wordCase == WordCase.UPPER) {
-      tryVariationsOf(low);
-      tryVariationsOf(speller.dictionary.toTitleCase(word));
+      hasGoodSuggestions |= tryVariationsOf(low);
+      hasGoodSuggestions |= tryVariationsOf(speller.dictionary.toTitleCase(word));
     } else if (wordCase == WordCase.MIXED) {
       int dot = word.indexOf('.');
       if (dot > 0
@@ -60,20 +59,26 @@
 
       boolean capitalized = Character.isUpperCase(word.charAt(0));
       if (capitalized) {
-        tryVariationsOf(speller.dictionary.caseFold(word.charAt(0)) + word.substring(1));
+        hasGoodSuggestions |=
+            tryVariationsOf(speller.dictionary.caseFold(word.charAt(0)) + word.substring(1));
       }
 
-      tryVariationsOf(low);
+      hasGoodSuggestions |= tryVariationsOf(low);
 
       if (capitalized) {
-        tryVariationsOf(speller.dictionary.toTitleCase(low));
+        hasGoodSuggestions |= tryVariationsOf(speller.dictionary.toTitleCase(low));
       }
 
-      List<String> adjusted =
-          result.stream().map(s -> capitalizeAfterSpace(word, s)).collect(Collectors.toList());
+      List<String> adjusted = new ArrayList<>();
+      for (String candidate : result) {
+        String s = capitalizeAfterSpace(word, candidate);
+        adjusted.add(s.equals(candidate) ? adjusted.size() : 0, s);
+      }
+
       result.clear();
       result.addAll(adjusted);
     }
+    return hasGoodSuggestions;
   }
 
   // aNew -> "a New" (instead of "a new")
@@ -89,8 +94,8 @@
     return candidate;
   }
 
-  private void tryVariationsOf(String word) {
-    hasGoodSuggestions |= trySuggestion(word.toUpperCase(Locale.ROOT));
+  private boolean tryVariationsOf(String word) {
+    boolean hasGoodSuggestions = trySuggestion(word.toUpperCase(Locale.ROOT));
     hasGoodSuggestions |= tryRep(word);
 
     if (!speller.dictionary.mapTable.isEmpty()) {
@@ -120,6 +125,7 @@
     if (!hasGoodSuggestions && speller.dictionary.enableSplitSuggestions) {
       trySplitting(word);
     }
+    return hasGoodSuggestions;
   }
 
   private boolean tryRep(String word) {

diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.aff
index bb7c8803..ccd79f0 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.aff

@@ -23,3 +23,5 @@
 WORDCHARS .-
 FORBIDDENWORD ?
 
+REP 1
+REP s ti
\ No newline at end of file

diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.dic
index 86311a9..7378051 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.dic

@@ -10,3 +10,6 @@
 scot
 free
 scot-free
+Sm
+es
+times
\ No newline at end of file

diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.sug b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.sug
index 65b7537..7032c06 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.sug
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.sug

@@ -1,3 +1,4 @@
 a lot
 in spite
 scot-free
+Sm Es, Times, Sm-es
\ No newline at end of file

diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.wrong
index 4cfc569..78a4d63 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.wrong

@@ -1,3 +1,4 @@
 alot
 inspite
 scotfree
+SMEs
\ No newline at end of file
commit	e1ff4c13541f279799d03d65cf90a50ec5cea8ea	[log] [tgz]
author	Peter Gromov <peter@jetbrains.com>	Wed Feb 24 17:43:37 2021 +0100
committer	GitHub <noreply@github.com>	Wed Feb 24 11:43:37 2021 -0500
tree	17de65ca9985f7ef3f68f06b5c780df3125f056e
parent	9d6fd98810aff7311e124be98f8c8f8cc66520be [diff]