LUCENE-9800: Hunspell: put a time limit on suggestion calculation (#2414)
* LUCENE-9800: Hunspell: put a time limit on suggestion calculation
* fix review remarks
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
index 76db921..d69fc39 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
@@ -17,6 +17,7 @@
package org.apache.lucene.analysis.hunspell;
import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
+import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.*;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_BEGIN;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_END;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE;
@@ -24,11 +25,13 @@
import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;
import java.util.ArrayList;
+import java.util.Collection;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
+import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef;
@@ -49,20 +52,25 @@
* shared by multiple spell-checkers in different threads).
*/
public class Hunspell {
+ static final long SUGGEST_TIME_LIMIT = 250;
+
final Dictionary dictionary;
final Stemmer stemmer;
+ private final TimeoutPolicy policy;
final Runnable checkCanceled;
public Hunspell(Dictionary dictionary) {
- this(dictionary, () -> {});
+ this(dictionary, RETURN_PARTIAL_RESULT, () -> {});
}
/**
+ * @param policy a strategy determining what to do when API calls take too much time
* @param checkCanceled an object that's periodically called, allowing to interrupt spell-checking
* or suggestion generation by throwing an exception
*/
- public Hunspell(Dictionary dictionary, Runnable checkCanceled) {
+ public Hunspell(Dictionary dictionary, TimeoutPolicy policy, Runnable checkCanceled) {
this.dictionary = dictionary;
+ this.policy = policy;
this.checkCanceled = checkCanceled;
stemmer = new Stemmer(dictionary);
}
@@ -504,7 +512,16 @@
&& spell(word.substring(breakPos + breakStr.length()));
}
- public List<String> suggest(String word) {
+ /**
+ * @return suggestions for the given misspelled word
+ * @throws SuggestionTimeoutException if the computation takes too long and {@link
+ * TimeoutPolicy#THROW_EXCEPTION} was specified in the constructor
+ */
+ public List<String> suggest(String word) throws SuggestionTimeoutException {
+ return suggest(word, SUGGEST_TIME_LIMIT);
+ }
+
+ List<String> suggest(String word, long timeLimitMs) throws SuggestionTimeoutException {
checkCanceled.run();
if (word.length() >= 100) return Collections.emptyList();
@@ -520,16 +537,35 @@
}
}
+ LinkedHashSet<String> suggestions = new LinkedHashSet<>();
+ Runnable checkCanceled =
+ policy == NO_TIMEOUT
+ ? this.checkCanceled
+ : checkTimeLimit(word, wordCase, suggestions, timeLimitMs);
+ try {
+ doSuggest(word, wordCase, suggestions, checkCanceled);
+ } catch (SuggestionTimeoutException e) {
+ if (policy == RETURN_PARTIAL_RESULT) {
+ return postprocess(word, wordCase, suggestions);
+ }
+ throw e;
+ }
+
+ return postprocess(word, wordCase, suggestions);
+ }
+
+ private void doSuggest(
+ String word, WordCase wordCase, LinkedHashSet<String> suggestions, Runnable checkCanceled) {
Hunspell suggestionSpeller =
- new Hunspell(dictionary, checkCanceled) {
+ new Hunspell(dictionary, policy, checkCanceled) {
@Override
boolean acceptsStem(int formID) {
return !dictionary.hasFlag(formID, dictionary.noSuggest)
&& !dictionary.hasFlag(formID, dictionary.subStandard);
}
};
- ModifyingSuggester modifier = new ModifyingSuggester(suggestionSpeller);
- Set<String> suggestions = modifier.suggest(word, wordCase);
+ ModifyingSuggester modifier = new ModifyingSuggester(suggestionSpeller, suggestions);
+ modifier.suggest(word, wordCase);
if (!modifier.hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) {
suggestions.addAll(
@@ -540,7 +576,35 @@
if (word.contains("-") && suggestions.stream().noneMatch(s -> s.contains("-"))) {
suggestions.addAll(modifyChunksBetweenDashes(word));
}
+ }
+ private Runnable checkTimeLimit(
+ String word, WordCase wordCase, Set<String> suggestions, long timeLimitMs) {
+ return new Runnable() {
+ final long deadline = System.nanoTime() + TimeUnit.MILLISECONDS.toNanos(timeLimitMs);
+ int invocationCounter = 100;
+
+ @Override
+ public void run() {
+ checkCanceled.run();
+ if (--invocationCounter <= 0) {
+ if (System.nanoTime() - deadline > 0) {
+ stop();
+ }
+ invocationCounter = 100;
+ }
+ }
+
+ private void stop() {
+ List<String> partialResult =
+ policy == RETURN_PARTIAL_RESULT ? null : postprocess(word, wordCase, suggestions);
+ String message = "Time limit of " + timeLimitMs + "ms exceeded for " + word;
+ throw new SuggestionTimeoutException(message, partialResult);
+ }
+ };
+ }
+
+ private List<String> postprocess(String word, WordCase wordCase, Collection<String> suggestions) {
Set<String> result = new LinkedHashSet<>();
for (String candidate : suggestions) {
result.add(adjustSuggestionCase(candidate, wordCase, word));
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
index b689a69..86e34c7 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
@@ -26,17 +26,18 @@
/** A class that modifies the given misspelled word in various ways to get correct suggestions */
class ModifyingSuggester {
private static final int MAX_CHAR_DISTANCE = 4;
- private final LinkedHashSet<String> result = new LinkedHashSet<>();
+ private final LinkedHashSet<String> result;
private final char[] tryChars;
private final Hunspell speller;
boolean hasGoodSuggestions;
- ModifyingSuggester(Hunspell speller) {
+ ModifyingSuggester(Hunspell speller, LinkedHashSet<String> result) {
this.speller = speller;
tryChars = speller.dictionary.tryChars.toCharArray();
+ this.result = result;
}
- LinkedHashSet<String> suggest(String word, WordCase wordCase) {
+ void suggest(String word, WordCase wordCase) {
String low = wordCase != WordCase.LOWER ? speller.dictionary.toLowerCase(word) : word;
if (wordCase == WordCase.UPPER || wordCase == WordCase.MIXED) {
trySuggestion(low);
@@ -68,12 +69,11 @@
tryVariationsOf(speller.dictionary.toTitleCase(low));
}
- return result.stream()
- .map(s -> capitalizeAfterSpace(low, s))
- .collect(Collectors.toCollection(LinkedHashSet::new));
+ List<String> adjusted =
+ result.stream().map(s -> capitalizeAfterSpace(low, s)).collect(Collectors.toList());
+ result.clear();
+ result.addAll(adjusted);
}
-
- return result;
}
// aNew -> "a New" (instead of "a new")
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SuggestionTimeoutException.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SuggestionTimeoutException.java
new file mode 100644
index 0000000..6a831ad
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SuggestionTimeoutException.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * An exception thrown when {@link Hunspell#suggest} call takes too long, if {@link
+ * TimeoutPolicy#THROW_EXCEPTION} is used.
+ */
+public class SuggestionTimeoutException extends RuntimeException {
+ private final List<String> partialResult;
+
+ SuggestionTimeoutException(String message, List<String> partialResult) {
+ super(message);
+ this.partialResult = partialResult == null ? null : Collections.unmodifiableList(partialResult);
+ }
+
+ /**
+ * @return partial result calculated by {@link Hunspell#suggest} before the time limit was
+ * exceeded
+ */
+ public List<String> getPartialResult() {
+ return partialResult;
+ }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/TimeoutPolicy.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/TimeoutPolicy.java
new file mode 100644
index 0000000..ad26043
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/TimeoutPolicy.java
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+/** A strategy determining what to do when Hunspell API calls take too much time */
+public enum TimeoutPolicy {
+ /** Let the computation complete even if it takes ages */
+ NO_TIMEOUT,
+
+ /** Just stop the calculation and return whatever has been computed so far */
+ RETURN_PARTIAL_RESULT,
+
+ /**
+ * Throw an exception (e.g. {@link SuggestionTimeoutException}) to make it more clear to the
+ * caller that the timeout happened and the returned results might not be reliable or
+ * reproducible.
+ */
+ THROW_EXCEPTION
+}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellTest.java
deleted file mode 100644
index 893439e..0000000
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellTest.java
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.hunspell;
-
-import static org.apache.lucene.analysis.hunspell.StemmerTestBase.loadDictionary;
-
-import java.util.Collections;
-import java.util.concurrent.CancellationException;
-import java.util.concurrent.atomic.AtomicBoolean;
-import org.apache.lucene.util.LuceneTestCase;
-
-public class HunspellTest extends LuceneTestCase {
- public void testCheckCanceled() throws Exception {
- AtomicBoolean canceled = new AtomicBoolean();
- Runnable checkCanceled =
- () -> {
- if (canceled.get()) {
- throw new CancellationException();
- }
- };
- Hunspell hunspell =
- new Hunspell(loadDictionary(false, "simple.aff", "simple.dic"), checkCanceled);
-
- assertTrue(hunspell.spell("apache"));
- assertEquals(Collections.singletonList("apach"), hunspell.suggest("apac"));
-
- canceled.set(true);
- assertThrows(CancellationException.class, () -> hunspell.spell("apache"));
- assertThrows(CancellationException.class, () -> hunspell.suggest("apac"));
- }
-}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspell.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspell.java
new file mode 100644
index 0000000..598e74c
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspell.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import static org.apache.lucene.analysis.hunspell.StemmerTestBase.loadDictionary;
+import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.NO_TIMEOUT;
+import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.RETURN_PARTIAL_RESULT;
+import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.THROW_EXCEPTION;
+
+import java.io.IOException;
+import java.text.ParseException;
+import java.util.Collections;
+import java.util.concurrent.CancellationException;
+import java.util.concurrent.atomic.AtomicBoolean;
+import org.apache.lucene.util.LuceneTestCase;
+import org.junit.Test;
+
+public class TestHunspell extends LuceneTestCase {
+ public void testCheckCanceled() throws Exception {
+ AtomicBoolean canceled = new AtomicBoolean();
+ Runnable checkCanceled =
+ () -> {
+ if (canceled.get()) {
+ throw new CancellationException();
+ }
+ };
+ Dictionary dictionary = loadDictionary(false, "simple.aff", "simple.dic");
+ Hunspell hunspell = new Hunspell(dictionary, NO_TIMEOUT, checkCanceled);
+
+ assertTrue(hunspell.spell("apache"));
+ assertEquals(Collections.singletonList("apach"), hunspell.suggest("apac"));
+
+ canceled.set(true);
+ assertThrows(CancellationException.class, () -> hunspell.spell("apache"));
+ assertThrows(CancellationException.class, () -> hunspell.suggest("apac"));
+ }
+
+ public void testSuggestionTimeLimit() throws IOException, ParseException {
+ int timeLimitMs = 10;
+
+ Dictionary dictionary = loadDictionary(false, "timelimit.aff", "simple.dic");
+ String word = "qazwsxedcrfvtgbyhnujm";
+
+ Hunspell incomplete = new Hunspell(dictionary, RETURN_PARTIAL_RESULT, () -> {});
+ assertFalse(incomplete.spell(word));
+ assertEquals(Collections.emptyList(), incomplete.suggest(word, timeLimitMs));
+
+ Hunspell throwing = new Hunspell(dictionary, THROW_EXCEPTION, () -> {});
+ assertFalse(throwing.spell(word));
+
+ SuggestionTimeoutException exception =
+ assertThrows(SuggestionTimeoutException.class, () -> throwing.suggest(word, timeLimitMs));
+ assertEquals(Collections.emptyList(), exception.getPartialResult());
+
+ Hunspell noLimit = new Hunspell(dictionary, NO_TIMEOUT, () -> {});
+ assertEquals(Collections.emptyList(), noLimit.suggest(word.substring(0, 5), timeLimitMs));
+ }
+
+ @Test
+ public void testStemmingApi() throws Exception {
+ Dictionary dictionary = loadDictionary(false, "simple.aff", "simple.dic");
+ Hunspell hunspell = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
+ assertEquals(Collections.singletonList("apach"), hunspell.getRoots("apache"));
+ assertEquals(Collections.singletonList("foo"), hunspell.getRoots("foo"));
+ }
+}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
index bc9f71c..ca38ca8 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
@@ -28,6 +28,7 @@
import java.text.ParseException;
import java.util.ArrayList;
import java.util.List;
+import java.util.concurrent.TimeUnit;
import java.util.function.Consumer;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
@@ -81,7 +82,7 @@
@Test
public void fr_suggest() throws Exception {
- checkSuggestionPerformance("fr", 10);
+ checkSuggestionPerformance("fr", 100);
}
private Dictionary loadDictionary(String code) throws IOException, ParseException {
@@ -97,7 +98,7 @@
List<String> words = loadWords(code, wordCount, dictionary);
Stemmer stemmer = new Stemmer(dictionary);
- Hunspell speller = new Hunspell(dictionary);
+ Hunspell speller = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
measure(
"Stemming " + code,
blackHole -> {
@@ -117,10 +118,10 @@
private void checkSuggestionPerformance(String code, int wordCount) throws Exception {
Dictionary dictionary = loadDictionary(code);
- Hunspell speller = new Hunspell(dictionary);
+ Hunspell speller = new Hunspell(dictionary, TimeoutPolicy.THROW_EXCEPTION, () -> {});
List<String> words =
loadWords(code, wordCount, dictionary).stream()
- .filter(w -> !speller.spell(w))
+ .filter(w -> hasQuickSuggestions(speller, w))
.collect(Collectors.toList());
System.out.println("Checking " + words.size() + " misspelled words");
@@ -134,6 +135,25 @@
System.out.println();
}
+ private boolean hasQuickSuggestions(Hunspell speller, String word) {
+ if (speller.spell(word)) {
+ return false;
+ }
+
+ long start = System.nanoTime();
+ try {
+ speller.suggest(word);
+ } catch (SuggestionTimeoutException e) {
+ System.out.println("Timeout happened for " + word + ", skipping");
+ return false;
+ }
+ long elapsed = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start);
+ if (elapsed > Hunspell.SUGGEST_TIME_LIMIT * 4 / 5) {
+ System.out.println(elapsed + "ms for " + word + ", too close to time limit, skipping");
+ }
+ return true;
+ }
+
private Path findAffFile(String code) throws IOException {
return TestAllDictionaries.findAllAffixFiles()
.filter(
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java
index 113fad0..7737531 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java
@@ -25,8 +25,9 @@
import java.util.stream.Collectors;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.LuceneTestCase;
-public class TestSpellChecking extends StemmerTestBase {
+public class TestSpellChecking extends LuceneTestCase {
public void testBase() throws Exception {
doTest("base");
@@ -233,7 +234,7 @@
try {
Dictionary dictionary =
new Dictionary(new ByteBuffersDirectory(), "dictionary", affixStream, dictStream);
- speller = new Hunspell(dictionary);
+ speller = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
} finally {
IOUtils.closeWhileHandlingException(affixStream);
IOUtils.closeWhileHandlingException(dictStream);
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestStemmer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestStemmer.java
index 1aa0b14..44a62c3 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestStemmer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestStemmer.java
@@ -16,11 +16,7 @@
*/
package org.apache.lucene.analysis.hunspell;
-import java.io.IOException;
-import java.text.ParseException;
-import java.util.Collections;
import org.junit.BeforeClass;
-import org.junit.Test;
public class TestStemmer extends StemmerTestBase {
@@ -62,13 +58,6 @@
assertStemsTo("solr", "olr");
}
- @Test
- public void testHunspellStemmingApi() throws IOException, ParseException {
- Hunspell hunspell = new Hunspell(loadDictionary(false, "simple.aff", "simple.dic"));
- assertEquals(Collections.singletonList("apach"), hunspell.getRoots("apache"));
- assertEquals(Collections.singletonList("foo"), hunspell.getRoots("foo"));
- }
-
// some bogus stuff that should not stem (empty lists)!
public void testBogusStems() {
assertStemsTo("abs");
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/timelimit.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/timelimit.aff
new file mode 100644
index 0000000..8487288
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/timelimit.aff
@@ -0,0 +1,7 @@
+# A very expensive MAP: each character from each group has to be replaced with every other character from the same group,
+# so for long words the enumeration is very long
+
+MAP 3
+MAP qwertyuiop
+MAP asdfghjkl
+MAP zxcvbnm
\ No newline at end of file