blob: 05f07dc4636f9dbdcefc6546560a653c349c6dee [file] [log] [blame]
Index: CHANGES.txt
===================================================================
--- CHANGES.txt (revision 414178)
+++ CHANGES.txt (working copy)
@@ -37,6 +37,15 @@
(Chris Hostetter)
5. Fix to allow MatchAllDocsQuery to be used with RemoteSearcher (Yonik Seeley)
+
+ 6. LUCENE-504: FuzzyQuery can generate up to BooleanQuery.getMaxClauseCount()
+ boolean clauses. To find the most-similar terms up to this number, it
+ used to allocate a priority queue of this size. This caused errors when
+ this limit was set very high.
+ Added a setMaxClauseCount() to FuzzyQuery, to limit the number of terms
+ FuzzyQuery selects, regardless of BooleanQuery's setting. The default is
+ now 1024 (BooleanQuery's default maxClauseCount), for backward
+ compatibility, but a smaller default should also be considered.
Optimizations
Index: src/test/org/apache/lucene/search/TestFuzzyQuery.java
===================================================================
--- src/test/org/apache/lucene/search/TestFuzzyQuery.java (revision 414178)
+++ src/test/org/apache/lucene/search/TestFuzzyQuery.java (working copy)
@@ -249,6 +249,31 @@
directory.close();
}
+ public void testPriorityQueueSize() throws Exception {
+ // This tests for a regression of bug LUCENE-504. When FuzzyQuery is
+ // limited in the number of OR clauses it produces (this limit is
+ // BooleanQuery.getMaxClauseCount()), it needs to choose from all "near"
+ // words this number of nearest words. However, if the user chose a
+ // huge limit for BooleanQuery.getMaxClauseCount() - perhaps to avoid
+ // limiting range or wildcard queries - it doesn't mean we need to
+ // allocate a huge priority queue.
+ int oldMaxClauseCount=BooleanQuery.getMaxClauseCount();
+ try {
+ BooleanQuery.setMaxClauseCount(Integer.MAX_VALUE-1);
+ testFuzziness();
+ BooleanQuery.setMaxClauseCount(Integer.MAX_VALUE);
+ testFuzziness();
+ } catch(OutOfMemoryError e){
+ fail("FuzzyQuery tried to allocate too much memory");
+ } catch(NegativeArraySizeException e){
+ // PriorityQueue tries to allocate an array of size+1, which for
+ // Integer.MAX_VALUE overflows to Integer.MIN_VALUE, which is negative.
+ fail("FuzzyQuery tried to create priority queue sized Integer.MAX_VALUE");
+ } finally {
+ BooleanQuery.setMaxClauseCount(oldMaxClauseCount);
+ }
+ }
+
private void addDoc(String text, IndexWriter writer) throws IOException {
Document doc = new Document();
doc.add(new Field("field", text, Field.Store.YES, Field.Index.TOKENIZED));
Index: src/java/org/apache/lucene/search/FuzzyQuery.java
===================================================================
--- src/java/org/apache/lucene/search/FuzzyQuery.java (revision 414178)
+++ src/java/org/apache/lucene/search/FuzzyQuery.java (working copy)
@@ -30,9 +30,11 @@
public final static float defaultMinSimilarity = 0.5f;
public final static int defaultPrefixLength = 0;
+ public final static int defaultMaxClauseCount = 1024;
private float minimumSimilarity;
private int prefixLength;
+ private int maxClauseCount;
/**
* Create a new FuzzyQuery that will match terms with a similarity
@@ -62,6 +64,8 @@
this.minimumSimilarity = minimumSimilarity;
this.prefixLength = prefixLength;
+
+ this.maxClauseCount = defaultMaxClauseCount;
}
/**
@@ -94,14 +98,46 @@
public int getPrefixLength() {
return prefixLength;
}
-
+
+ /**
+ * Returns the maximum number of boolean clauses generated by FuzzyQuery,
+ * 1024 by default.
+ * @see #setMaxClauseCount(int)
+ */
+ public int getMaxClauseCount() {
+ return maxClauseCount;
+ }
+
+ /**
+ * Sets the maximum number of boolean clauses generated by FuzzyQuery,
+ * 1024 by default.
+ * <p>FuzzyQuery works by finding all terms similar to the given term, and
+ * generating a BooleanQuery with an "or" of all these similar terms.
+ * setMaxClauseCount limits the number of similar terms taken; if the
+ * number of similar terms exceeds this limit, those which are least similar
+ * are dropped.
+ * <p>Note that {@link BooleanQuery#setMaxClauseCount(int)} also limits
+ * the number of boolean clauses which can be generated. The smaller limit
+ * is used.
+ */
+ public void setMaxClauseCount(int maxClauseCount) {
+ this.maxClauseCount = maxClauseCount;
+ }
+
protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
return new FuzzyTermEnum(reader, getTerm(), minimumSimilarity, prefixLength);
}
public Query rewrite(IndexReader reader) throws IOException {
FilteredTermEnum enumerator = getEnum(reader);
- int maxClauseCount = BooleanQuery.getMaxClauseCount();
+ // To limit the generated clauses to BooleanQuery.getMaxClauseCount(),
+ // we create a priority queue to find the most similar terms, up to this
+ // number. But, when this limit is huge, there is no point in creating
+ // a huge priority queue (or in creating so many terms). So FuzzyQuery
+ // also has its own getMaxClauseCount(), and we do not generate more
+ // clauses even if BooleanQuery allows us to.
+ int maxClauseCount =
+ Math.min(getMaxClauseCount(), BooleanQuery.getMaxClauseCount());
ScoreTermQueue stQueue = new ScoreTermQueue(maxClauseCount);
try {