docs/attachments/LUCENE-504/fuzzyquery.patch - lucene-jira-archive - Git at Google

 Index: CHANGES.txt
 ===================================================================
 --- CHANGES.txt	(revision 414178)
 +++ CHANGES.txt	(working copy)
 @@ -37,6 +37,15 @@
      (Chris Hostetter)

   5. Fix to allow MatchAllDocsQuery to be used with RemoteSearcher (Yonik Seeley)
 +
 + 6. LUCENE-504: FuzzyQuery can generate up to BooleanQuery.getMaxClauseCount()
 +    boolean clauses. To find the most-similar terms up to this number, it
 +    used to allocate a priority queue of this size. This caused errors when
 +    this limit was set very high.
 +    Added a setMaxClauseCount() to FuzzyQuery, to limit the number of terms
 +    FuzzyQuery selects, regardless of BooleanQuery's setting. The default is
 +    now 1024 (BooleanQuery's default maxClauseCount), for backward
 +    compatibility, but a smaller default should also be considered.

  Optimizations

 Index: src/test/org/apache/lucene/search/TestFuzzyQuery.java
 ===================================================================
 --- src/test/org/apache/lucene/search/TestFuzzyQuery.java	(revision 414178)
 +++ src/test/org/apache/lucene/search/TestFuzzyQuery.java	(working copy)
 @@ -249,6 +249,31 @@
      directory.close();
    }

 +  public void testPriorityQueueSize() throws Exception {
 +	  // This tests for a regression of bug LUCENE-504. When FuzzyQuery is
 +	  // limited in the number of OR clauses it produces (this limit is
 +	  // BooleanQuery.getMaxClauseCount()), it needs to choose from all "near"
 +	  // words this number of nearest words. However, if the user chose a
 +	  // huge limit for BooleanQuery.getMaxClauseCount() - perhaps for not
 +	  // limiting range or wildcard queries - it doesn't mean we need to
 +	  // allocate a huge priority queue.
 +	  int oldMaxClauseCount=BooleanQuery.getMaxClauseCount();
 +	  try {
 +		  BooleanQuery.setMaxClauseCount(Integer.MAX_VALUE-1);
 +		  testFuzziness();
 +		  BooleanQuery.setMaxClauseCount(Integer.MAX_VALUE);
 +		  testFuzziness();
 +	  } catch(OutOfMemoryError e){
 +		  fail("FuzzyQuery tried to allocate too much memory");
 +	  } catch(NegativeArraySizeException e){
 +		  // PriorityQueue tries to allocate an array of size+1, which for
 +		  // Integer.MAX_VALUE overflows to Integer.MIN_VALUE, a negative size.
 +		  fail("FuzzyQuery tried to create priority queue sized Integer.MAX_VALUE");
 +	  } finally {
 +		  BooleanQuery.setMaxClauseCount(oldMaxClauseCount);
 +	  }
 +  }
 +
    private void addDoc(String text, IndexWriter writer) throws IOException {
      Document doc = new Document();
      doc.add(new Field("field", text, Field.Store.YES, Field.Index.TOKENIZED));
 Index: src/java/org/apache/lucene/search/FuzzyQuery.java
 ===================================================================
 --- src/java/org/apache/lucene/search/FuzzyQuery.java	(revision 414178)
 +++ src/java/org/apache/lucene/search/FuzzyQuery.java	(working copy)
 @@ -30,9 +30,11 @@

    public final static float defaultMinSimilarity = 0.5f;
    public final static int defaultPrefixLength = 0;
 +  public final static int defaultMaxClauseCount = 1024;

    private float minimumSimilarity;
    private int prefixLength;
 +  private int maxClauseCount;

    /**
     * Create a new FuzzyQuery that will match terms with a similarity
 @@ -62,6 +64,8 @@

      this.minimumSimilarity = minimumSimilarity;
      this.prefixLength = prefixLength;
 +
 +    this.maxClauseCount = defaultMaxClauseCount;
    }

    /**
 @@ -94,14 +98,46 @@
    public int getPrefixLength() {
      return prefixLength;
    }
 -
 +
 +  /**
 +   * Returns the maximum number of boolean clauses generated by FuzzyQuery,
 +   * 1024 by default.
 +   * @see #setMaxClauseCount(int)
 +   */
 +  public int getMaxClauseCount() {
 +	  return maxClauseCount;
 +  }
 +
 +  /**
 +   * Sets the maximum number of boolean clauses generated by FuzzyQuery,
 +   * 1024 by default.
 +   * <p>FuzzyQuery works by finding all terms similar to the given term, and
 +   * generating a BooleanQuery with an "or" of all these similar terms.
 +   * setMaxClauseCount limits the number of the similar terms taken; if the
 +   * number of similar terms exceeds this limit, those which are least similar
 +   * are dropped.
 +   * <p>Note that {@link BooleanQuery#setMaxClauseCount(int)} also limits
 +   * the number of boolean clauses which can be generated. The smaller limit
 +   * is used.
 +   */
 +  public void setMaxClauseCount(int maxClauseCount) {
 +	  this.maxClauseCount = maxClauseCount;
 +  }
 +
    protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
      return new FuzzyTermEnum(reader, getTerm(), minimumSimilarity, prefixLength);
    }

    public Query rewrite(IndexReader reader) throws IOException {
      FilteredTermEnum enumerator = getEnum(reader);
 -    int maxClauseCount = BooleanQuery.getMaxClauseCount();
 +    // To limit the generated clauses to BooleanQuery.getMaxClauseCount(),
 +    // we create a priority queue to find the most similar terms, up to this
 +    // number. But, when this limit is huge, there is no point in creating
 +    // a huge priority queue (or in creating so many terms). So FuzzyQuery
 +    // also has its own getMaxClauseCount(), and we do not generate more
 +    // clauses even if BooleanQuery allows us to.
 +    int maxClauseCount =
 +    	Math.min(getMaxClauseCount(), BooleanQuery.getMaxClauseCount());
      ScoreTermQueue stQueue = new ScoreTermQueue(maxClauseCount);

      try {
	Index: CHANGES.txt
	===================================================================
	--- CHANGES.txt (revision 414178)
	+++ CHANGES.txt (working copy)
	@@ -37,6 +37,15 @@
	(Chris Hostetter)

	5. Fix to allow MatchAllDocsQuery to be used with RemoteSearcher (Yonik Seeley)
	+
	+ 6. LUCENE-504: FuzzyQuery can generate up to BooleanQuery.getMaxClauseCount()
	+ boolean clauses. To find the most-similar terms up to this number, it
	+ used to allocate a priority queue of this size. This caused errors when
	+ this limit was set very high.
	+ Added a setMaxClauseCount() to FuzzyQuery, to limit the number of terms
	+ FuzzyQuery selects, regardless of BooleanQuery's setting. The default is
	+ now 1024 (BooleanQuery's default maxClauseCount), for backward
	+ compatibility, but a smaller default should also be considered.

	Optimizations

	Index: src/test/org/apache/lucene/search/TestFuzzyQuery.java
	===================================================================
	--- src/test/org/apache/lucene/search/TestFuzzyQuery.java (revision 414178)
	+++ src/test/org/apache/lucene/search/TestFuzzyQuery.java (working copy)
	@@ -249,6 +249,31 @@
	directory.close();
	}

	+ public void testPriorityQueueSize() throws Exception {
	+ // This tests for a regression of bug LUCENE-504. When FuzzyQuery is
	+ // limited in the number of OR clauses it produces (this limit is
	+ // BooleanQuery.getMaxClauseCount()), it needs to choose from all "near"
	+ // words this number of nearest words. However, if the user chose a
	+ // huge limit for BooleanQuery.getMaxClauseCount() - perhaps for not
	+ // limiting range or wildcard queries - it doesn't mean we need to
	+ // allocate a huge priority queue.
	+ int oldMaxClauseCount=BooleanQuery.getMaxClauseCount();
	+ try {
	+ BooleanQuery.setMaxClauseCount(Integer.MAX_VALUE-1);
	+ testFuzziness();
	+ BooleanQuery.setMaxClauseCount(Integer.MAX_VALUE);
	+ testFuzziness();
	+ } catch(OutOfMemoryError e){
	+ fail("FuzzyQuery tried to allocate too much memory");
	+ } catch(NegativeArraySizeException e){
	+ // PriorityQueue tries to allocate an array of size+1, which for
	+ // Integer.MAX_VALUE overflows to Integer.MIN_VALUE, a negative size.
	+ fail("FuzzyQuery tried to create priority queue sized Integer.MAX_VALUE");
	+ } finally {
	+ BooleanQuery.setMaxClauseCount(oldMaxClauseCount);
	+ }
	+ }
	+
	private void addDoc(String text, IndexWriter writer) throws IOException {
	Document doc = new Document();
	doc.add(new Field("field", text, Field.Store.YES, Field.Index.TOKENIZED));
	Index: src/java/org/apache/lucene/search/FuzzyQuery.java
	===================================================================
	--- src/java/org/apache/lucene/search/FuzzyQuery.java (revision 414178)
	+++ src/java/org/apache/lucene/search/FuzzyQuery.java (working copy)
	@@ -30,9 +30,11 @@

	public final static float defaultMinSimilarity = 0.5f;
	public final static int defaultPrefixLength = 0;
	+ public final static int defaultMaxClauseCount = 1024;

	private float minimumSimilarity;
	private int prefixLength;
	+ private int maxClauseCount;

	/**
	* Create a new FuzzyQuery that will match terms with a similarity
	@@ -62,6 +64,8 @@

	this.minimumSimilarity = minimumSimilarity;
	this.prefixLength = prefixLength;
	+
	+ this.maxClauseCount = defaultMaxClauseCount;
	}

	/**
	@@ -94,14 +98,46 @@
	public int getPrefixLength() {
	return prefixLength;
	}
	-
	+
	+ /**
	+ * Returns the maximum number of boolean clauses generated by FuzzyQuery,
	+ * 1024 by default.
	+ * @see #setMaxClauseCount(int)
	+ */
	+ public int getMaxClauseCount() {
	+ return maxClauseCount;
	+ }
	+
	+ /**
	+ * Sets the maximum number of boolean clauses generated by FuzzyQuery,
	+ * 1024 by default.
	+ * <p>FuzzyQuery works by finding all terms similar to the given term, and
	+ * generating a BooleanQuery with an "or" of all these similar terms.
	+ * setMaxClauseCount limits the number of the similar terms taken; if the
	+ * number of similar terms exceeds this limit, those which are least similar
	+ * are dropped.
	+ * <p>Note that {@link BooleanQuery#setMaxClauseCount(int)} also limits
	+ * the number of boolean clauses which can be generated. The smaller limit
	+ * is used.
	+ */
	+ public void setMaxClauseCount(int maxClauseCount) {
	+ this.maxClauseCount = maxClauseCount;
	+ }
	+
	protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
	return new FuzzyTermEnum(reader, getTerm(), minimumSimilarity, prefixLength);
	}

	public Query rewrite(IndexReader reader) throws IOException {
	FilteredTermEnum enumerator = getEnum(reader);
	- int maxClauseCount = BooleanQuery.getMaxClauseCount();
	+ // To limit the generated clauses to BooleanQuery.getMaxClauseCount(),
	+ // we create a priority queue to find the most similar terms, up to this
	+ // number. But, when this limit is huge, there is no point in creating
	+ // a huge priority queue (or in creating so many terms). So FuzzyQuery
	+ // also has its own getMaxClauseCount(), and we do not generate more
	+ // clauses even if BooleanQuery allows us to.
	+ int maxClauseCount =
	+ Math.min(getMaxClauseCount(), BooleanQuery.getMaxClauseCount());
	ScoreTermQueue stQueue = new ScoreTermQueue(maxClauseCount);

	try {