| Index: CHANGES.txt |
| =================================================================== |
| --- CHANGES.txt (revision 414178) |
| +++ CHANGES.txt (working copy) |
| @@ -37,6 +37,15 @@ |
| (Chris Hostetter) |
| |
| 5. Fix to allow MatchAllDocsQuery to be used with RemoteSearcher (Yonik Seeley) |
| + |
| + 6. LUCENE-504: FuzzyQuery can generate up to BooleanQuery.getMaxClauseCount() |
| + boolean clauses. To find the most similar terms up to this number, it |
| + used to allocate a priority queue of this size, which caused errors |
| + (OutOfMemoryError, NegativeArraySizeException) when the limit was very high. |
| + Added a setMaxClauseCount() to FuzzyQuery, to cap the number of terms |
| + FuzzyQuery selects even when BooleanQuery's limit is higher. The default |
| + is now 1024 (BooleanQuery's default maxClauseCount), for backward |
| + compatibility, but a smaller default should also be considered. |
| |
| Optimizations |
| |
| Index: src/test/org/apache/lucene/search/TestFuzzyQuery.java |
| =================================================================== |
| --- src/test/org/apache/lucene/search/TestFuzzyQuery.java (revision 414178) |
| +++ src/test/org/apache/lucene/search/TestFuzzyQuery.java (working copy) |
| @@ -249,6 +249,31 @@ |
| directory.close(); |
| } |
| |
| + public void testPriorityQueueSize() throws Exception { |
| + // Regression test for LUCENE-504. FuzzyQuery is limited in the number |
| + // of OR clauses it produces (at most BooleanQuery.getMaxClauseCount()), |
| + // so it must choose that many of the nearest words from all "near" |
| + // words. However, if the user chose a huge limit for |
| + // BooleanQuery.getMaxClauseCount() - perhaps to avoid limiting range or |
| + // wildcard queries - that does not mean we need to allocate a huge |
| + // priority queue. |
| + int oldMaxClauseCount=BooleanQuery.getMaxClauseCount(); |
| + try { |
| + BooleanQuery.setMaxClauseCount(Integer.MAX_VALUE-1); |
| + testFuzziness(); |
| + BooleanQuery.setMaxClauseCount(Integer.MAX_VALUE); |
| + testFuzziness(); |
| + } catch(OutOfMemoryError e){ |
| + fail("FuzzyQuery tried to allocate too much memory"); |
| + } catch(NegativeArraySizeException e){ |
| + // PriorityQueue tries to allocate an array of size+1, which for |
| + // Integer.MAX_VALUE overflows to Integer.MIN_VALUE, which is negative. |
| + fail("FuzzyQuery tried to create priority queue sized Integer.MAX_VALUE"); |
| + } finally { |
| + BooleanQuery.setMaxClauseCount(oldMaxClauseCount); |
| + } |
| + } |
| + |
| private void addDoc(String text, IndexWriter writer) throws IOException { |
| Document doc = new Document(); |
| doc.add(new Field("field", text, Field.Store.YES, Field.Index.TOKENIZED)); |
| Index: src/java/org/apache/lucene/search/FuzzyQuery.java |
| =================================================================== |
| --- src/java/org/apache/lucene/search/FuzzyQuery.java (revision 414178) |
| +++ src/java/org/apache/lucene/search/FuzzyQuery.java (working copy) |
| @@ -30,9 +30,11 @@ |
| |
| public final static float defaultMinSimilarity = 0.5f; |
| public final static int defaultPrefixLength = 0; |
| + public final static int defaultMaxClauseCount = 1024; |
| |
| private float minimumSimilarity; |
| private int prefixLength; |
| + private int maxClauseCount; |
| |
| /** |
| * Create a new FuzzyQuery that will match terms with a similarity |
| @@ -62,6 +64,8 @@ |
| |
| this.minimumSimilarity = minimumSimilarity; |
| this.prefixLength = prefixLength; |
| + |
| + this.maxClauseCount = defaultMaxClauseCount; |
| } |
| |
| /** |
| @@ -94,14 +98,46 @@ |
| public int getPrefixLength() { |
| return prefixLength; |
| } |
| - |
| + |
| + /** |
| + * Returns the maximum number of boolean clauses this FuzzyQuery generates, |
| + * {@link #defaultMaxClauseCount} (1024) unless changed by the setter. |
| + * @see #setMaxClauseCount(int) |
| + */ |
| + public int getMaxClauseCount() { |
| + return maxClauseCount; |
| + } |
| + |
| + /** |
| + * Sets the maximum number of boolean clauses generated by FuzzyQuery, |
| + * 1024 by default. |
| + * <p>FuzzyQuery works by finding all terms similar to the given term, and |
| + * generating a BooleanQuery with an "or" of all these similar terms. |
| + * setMaxClauseCount limits the number of similar terms taken; if the |
| + * number of similar terms exceeds this limit, those which are least similar |
| + * are dropped. |
| + * <p>Note that {@link BooleanQuery#setMaxClauseCount(int)} also limits |
| + * the number of boolean clauses which can be generated. The smaller limit |
| + * is used. The value is stored as given, without validation. |
| + */ |
| + public void setMaxClauseCount(int maxClauseCount) { |
| + this.maxClauseCount = maxClauseCount; |
| + } |
| + |
| protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { |
| return new FuzzyTermEnum(reader, getTerm(), minimumSimilarity, prefixLength); |
| } |
| |
| public Query rewrite(IndexReader reader) throws IOException { |
| FilteredTermEnum enumerator = getEnum(reader); |
| - int maxClauseCount = BooleanQuery.getMaxClauseCount(); |
| + // To limit the generated clauses to BooleanQuery.getMaxClauseCount(), |
| + // we create a priority queue to find the most similar terms, up to this |
| + // number. But, when this limit is huge, there is no point in creating |
| + // a huge priority queue (or in creating so many terms). So FuzzyQuery |
| + // also has its own getMaxClauseCount(), and we do not generate more |
| + // clauses even if BooleanQuery allows us to. |
| + int maxClauseCount = |
| + Math.min(getMaxClauseCount(), BooleanQuery.getMaxClauseCount()); |
| ScoreTermQueue stQueue = new ScoreTermQueue(maxClauseCount); |
| |
| try { |