LUCENE-9810: Hunspell: when generating suggestions, skip too deep word FST subtrees (#2427)
* LUCENE-9810: Hunspell: when generating suggestions, skip too-deep word FST subtrees
We already skip roots longer than the misspelled word plus 4 characters, so there is no need to read their arcs at all.
* check more in TestPerformance.de_suggest
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
index 9030459..e85f0fd 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
@@ -30,11 +30,12 @@
import java.util.PriorityQueue;
import java.util.Set;
import java.util.TreeSet;
-import java.util.function.BiConsumer;
import java.util.stream.Collectors;
+import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.IntsRefFSTEnum;
+import org.apache.lucene.util.fst.IntsRefFSTEnum.InputOutput;
/**
* A class that traverses the entire dictionary and applies affix rules to check if those yield
@@ -44,6 +45,7 @@
private static final int MAX_ROOTS = 100;
private static final int MAX_WORDS = 100;
private static final int MAX_GUESSES = 200;
+ private static final int MAX_ROOT_LENGTH_DIFF = 4;
private final Dictionary dictionary;
private final Hunspell speller;
@@ -63,47 +65,57 @@
String word, WordCase originalCase) {
Comparator<Weighted<Root<String>>> natural = Comparator.naturalOrder();
PriorityQueue<Weighted<Root<String>>> roots = new PriorityQueue<>(natural.reversed());
- processFST(
- dictionary.words,
- (key, forms) -> {
- if (Math.abs(key.length - word.length()) > 4) return;
- String root = toString(key);
- List<Root<String>> entries = filterSuitableEntries(root, forms);
- if (entries.isEmpty()) return;
+ IntsRefFSTEnum<IntsRef> fstEnum = new IntsRefFSTEnum<>(dictionary.words);
+ InputOutput<IntsRef> mapping;
+ while ((mapping = nextKey(fstEnum, word.length() + 4)) != null) {
+ speller.checkCanceled.run();
- if (originalCase == WordCase.LOWER
- && WordCase.caseOf(root) == WordCase.TITLE
- && !dictionary.hasLanguage("de")) {
- return;
- }
+ IntsRef key = mapping.input;
+ if (Math.abs(key.length - word.length()) > MAX_ROOT_LENGTH_DIFF) {
+ assert key.length < word.length(); // nextKey takes care of longer keys
+ continue;
+ }
- String lower = dictionary.toLowerCase(root);
- int sc =
- ngram(3, word, lower, EnumSet.of(NGramOptions.LONGER_WORSE))
- + commonPrefix(word, root);
+ String root = toString(key);
+ List<Root<String>> entries = filterSuitableEntries(root, mapping.output);
+ if (entries.isEmpty()) continue;
- if (roots.size() == MAX_ROOTS && sc < roots.peek().score) {
- return;
- }
+ if (originalCase == WordCase.LOWER
+ && WordCase.caseOf(root) == WordCase.TITLE
+ && !dictionary.hasLanguage("de")) {
+ continue;
+ }
- entries.forEach(e -> roots.add(new Weighted<>(e, sc)));
- while (roots.size() > MAX_ROOTS) {
- roots.poll();
- }
- });
+ String lower = dictionary.toLowerCase(root);
+ int sc =
+ ngram(3, word, lower, EnumSet.of(NGramOptions.LONGER_WORSE)) + commonPrefix(word, root);
+
+ if (roots.size() == MAX_ROOTS && sc < roots.peek().score) {
+ continue;
+ }
+
+ entries.forEach(e -> roots.add(new Weighted<>(e, sc)));
+ while (roots.size() > MAX_ROOTS) {
+ roots.poll();
+ }
+ }
return roots.stream().sorted().collect(Collectors.toList());
}
- private void processFST(FST<IntsRef> fst, BiConsumer<IntsRef, IntsRef> keyValueConsumer) {
- if (fst == null) return;
+ private static InputOutput<IntsRef> nextKey(IntsRefFSTEnum<IntsRef> fstEnum, int maxLen) {
try {
- IntsRefFSTEnum<IntsRef> fstEnum = new IntsRefFSTEnum<>(fst);
- IntsRefFSTEnum.InputOutput<IntsRef> mapping;
- while ((mapping = fstEnum.next()) != null) {
- speller.checkCanceled.run();
- keyValueConsumer.accept(mapping.input, mapping.output);
+ InputOutput<IntsRef> next = fstEnum.next();
+ while (next != null && next.input.length > maxLen) {
+ int offset = next.input.offset;
+ int[] ints = ArrayUtil.copyOfSubArray(next.input.ints, offset, offset + maxLen);
+ if (ints[ints.length - 1] == Integer.MAX_VALUE) {
+ throw new AssertionError("Too large char");
+ }
+ ints[ints.length - 1]++;
+ next = fstEnum.seekCeil(new IntsRef(ints, 0, ints.length));
}
+ return next;
} catch (IOException e) {
throw new RuntimeException(e);
}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
index eda0f76..f746549 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
@@ -72,7 +72,7 @@
@Test
public void de_suggest() throws Exception {
- checkSuggestionPerformance("de", 30);
+ checkSuggestionPerformance("de", 55);
}
@Test