docs/attachments/LUCENE-1626/LUCENE-1626-positionincrement.patch - lucene-jira-archive - Git at Google

 Index: CHANGES.txt
 ===================================================================
 --- CHANGES.txt	(revision 727243)
 +++ CHANGES.txt	(working copy)
 @@ -182,6 +182,11 @@

  24. LUCENE-1131: Added numDeletedDocs method to IndexReader (Otis Gospodnetic)

 +25. LUCENE-1494: Deprecated Analyzer.getPositionIncrementGap(String) in favour
 +    of getPositionIncrementGap(String, int), which is aware of the current
 +    position and can be used to 'line up' terms across variable-length fields.
 +    (Paul Cowan)
 +
  Bug fixes

   1. LUCENE-1134: Fixed BooleanQuery.rewrite to only optimize a single
 Index: src/java/org/apache/lucene/index/DocInverterPerField.java
 ===================================================================
 --- src/java/org/apache/lucene/index/DocInverterPerField.java	(revision 727243)
 +++ src/java/org/apache/lucene/index/DocInverterPerField.java	(working copy)
 @@ -74,7 +74,8 @@
        if (field.isIndexed() && doInvert) {

          if (fieldState.length > 0)
 -          fieldState.position += docState.analyzer.getPositionIncrementGap(fieldInfo.name);
 +          fieldState.position += docState.analyzer.getPositionIncrementGap(
 +              fieldInfo.name, fieldState.position);

          if (!field.isTokenized()) {		  // un-tokenized field
            String stringValue = field.stringValue();
 Index: src/java/org/apache/lucene/analysis/PerFieldAnalyzerWrapper.java
 ===================================================================
 --- src/java/org/apache/lucene/analysis/PerFieldAnalyzerWrapper.java	(revision 727243)
 +++ src/java/org/apache/lucene/analysis/PerFieldAnalyzerWrapper.java	(working copy)
 @@ -85,11 +85,11 @@
    }

    /** Return the positionIncrementGap from the analyzer assigned to fieldName */
 -  public int getPositionIncrementGap(String fieldName) {
 +  public int getPositionIncrementGap(String fieldName, int lastTokenPosition) {
      Analyzer analyzer = (Analyzer) analyzerMap.get(fieldName);
      if (analyzer == null)
        analyzer = defaultAnalyzer;
 -    return analyzer.getPositionIncrementGap(fieldName);
 +    return analyzer.getPositionIncrementGap(fieldName, lastTokenPosition);
    }

    public String toString() {
 Index: src/java/org/apache/lucene/analysis/Analyzer.java
 ===================================================================
 --- src/java/org/apache/lucene/analysis/Analyzer.java	(revision 727243)
 +++ src/java/org/apache/lucene/analysis/Analyzer.java	(working copy)
 @@ -62,6 +62,21 @@


    /**
 +   * Provides a constant gap between the position values of tokens
 +   * from different Fieldable instances which share the same field name.
 +   * Used by the default implementation of
 +   * {@link #getPositionIncrementGap(String, int)}.
 +   *
 +   * @param fieldName Fieldable name being indexed.
 +   * @return position increment gap, added to the next token emitted from {@link #tokenStream(String,Reader)}
 +   * @deprecated replaced with {@link #getPositionIncrementGap(String, int)}
 +   */
 +  protected int getPositionIncrementGap(String fieldName)
 +  {
 +    return 0;
 +  }
 +
 +  /**
     * Invoked before indexing a Fieldable instance if
     * terms have already been added to that field.  This allows custom
     * analyzers to place an automatic position increment gap between
 @@ -70,12 +85,18 @@
     * the typical default token position increment of 1, all terms in a field,
     * including across Fieldable instances, are in successive positions, allowing
     * exact PhraseQuery matches, for instance, across Fieldable instance boundaries.
 +   * The last token position is supplied to enable analyzers to 'line up' terms;
 +   * for example, for subsequent terms to start at positions that are multiples
 +   * of 100. Defaults to the constant gap supplied by
 +   * {@link #getPositionIncrementGap(String)}.
     *
     * @param fieldName Fieldable name being indexed.
 -   * @return position increment gap, added to the next token emitted from {@link #tokenStream(String,Reader)}
 +   * @param lastTokenPosition the last token position for this field
 +   * @return position increment gap, added to the next token emitted from
 +   *   {@link #tokenStream(String,Reader)}
     */
 -  public int getPositionIncrementGap(String fieldName)
 +  public int getPositionIncrementGap(String fieldName, int lastTokenPosition)
    {
 -    return 0;
 +    return getPositionIncrementGap(fieldName);
    }
  }
 Index: src/test/org/apache/lucene/index/TestDocumentWriter.java
 ===================================================================
 --- src/test/org/apache/lucene/index/TestDocumentWriter.java	(revision 727243)
 +++ src/test/org/apache/lucene/index/TestDocumentWriter.java	(working copy)
 @@ -102,7 +102,8 @@
      }
    }

 -  public void testPositionIncrementGap() throws IOException {
 +  public void testOldPositionIncrementGap() throws IOException {
 +    // Tests the use of the legacy getPositionIncrementGap(String)
      Analyzer analyzer = new Analyzer() {
        public TokenStream tokenStream(String fieldName, Reader reader) {
          return new WhitespaceTokenizer(reader);
 @@ -112,6 +113,26 @@
          return 500;
        }
      };
 +    assertExpectedPositionIncrementGap(analyzer, 2, 0, 502);
 +  }
 +
 +  public void testNewPositionIncrementGap() throws IOException {
 +    // Tests the use of the new getPositionIncrementGap(String, int)
 +    Analyzer analyzer = new Analyzer() {
 +      public TokenStream tokenStream(String fieldName, Reader reader) {
 +        return new WhitespaceTokenizer(reader);
 +      }
 +
 +      public int getPositionIncrementGap(String fieldName, int lastTokenPosition) {
 +        // Calculate the gap that advances the position to the next multiple of 500
 +        return (((lastTokenPosition / 500) + 1) * 500) - lastTokenPosition;
 +      }
 +    };
 +    assertExpectedPositionIncrementGap(analyzer, 2, 0, 500);
 +  }
 +
 +  private void assertExpectedPositionIncrementGap(Analyzer analyzer,
 +      int expectedFreq, int firstPosition, int secondPosition) throws IOException {

      IndexWriter writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);

 @@ -128,9 +149,9 @@
      TermPositions termPositions = reader.termPositions(new Term("repeated", "repeated"));
      assertTrue(termPositions.next());
      int freq = termPositions.freq();
 -    assertEquals(2, freq);
 -    assertEquals(0, termPositions.nextPosition());
 -    assertEquals(502, termPositions.nextPosition());
 +    assertEquals(expectedFreq, freq);
 +    assertEquals(firstPosition, termPositions.nextPosition());
 +    assertEquals(secondPosition, termPositions.nextPosition());
    }

    public void testTokenReuse() throws IOException {
	Index: CHANGES.txt
	===================================================================
	--- CHANGES.txt (revision 727243)
	+++ CHANGES.txt (working copy)
	@@ -182,6 +182,11 @@

	24. LUCENE-1131: Added numDeletedDocs method to IndexReader (Otis Gospodnetic)

	+25. LUCENE-1494: Deprecated Analyzer.getPositionIncrementGap(String) in favour
	+ of getPositionIncrementGap(String, int), which is aware of the current
	+ position and can be used to 'line up' terms across variable-length fields.
	+ (Paul Cowan)
	+
	Bug fixes

	1. LUCENE-1134: Fixed BooleanQuery.rewrite to only optimize a single
	Index: src/java/org/apache/lucene/index/DocInverterPerField.java
	===================================================================
	--- src/java/org/apache/lucene/index/DocInverterPerField.java (revision 727243)
	+++ src/java/org/apache/lucene/index/DocInverterPerField.java (working copy)
	@@ -74,7 +74,8 @@
	if (field.isIndexed() && doInvert) {

	if (fieldState.length > 0)
	- fieldState.position += docState.analyzer.getPositionIncrementGap(fieldInfo.name);
	+ fieldState.position += docState.analyzer.getPositionIncrementGap(
	+ fieldInfo.name, fieldState.position);

	if (!field.isTokenized()) { // un-tokenized field
	String stringValue = field.stringValue();
	Index: src/java/org/apache/lucene/analysis/PerFieldAnalyzerWrapper.java
	===================================================================
	--- src/java/org/apache/lucene/analysis/PerFieldAnalyzerWrapper.java (revision 727243)
	+++ src/java/org/apache/lucene/analysis/PerFieldAnalyzerWrapper.java (working copy)
	@@ -85,11 +85,11 @@
	}

	/** Return the positionIncrementGap from the analyzer assigned to fieldName */
	- public int getPositionIncrementGap(String fieldName) {
	+ public int getPositionIncrementGap(String fieldName, int lastTokenPosition) {
	Analyzer analyzer = (Analyzer) analyzerMap.get(fieldName);
	if (analyzer == null)
	analyzer = defaultAnalyzer;
	- return analyzer.getPositionIncrementGap(fieldName);
	+ return analyzer.getPositionIncrementGap(fieldName, lastTokenPosition);
	}

	public String toString() {
	Index: src/java/org/apache/lucene/analysis/Analyzer.java
	===================================================================
	--- src/java/org/apache/lucene/analysis/Analyzer.java (revision 727243)
	+++ src/java/org/apache/lucene/analysis/Analyzer.java (working copy)
	@@ -62,6 +62,21 @@


	/**
	+ * Provides a constant gap between the position values of tokens
	+ * from different Fieldable instances which share the same field name.
	+ * Used by the default implementation of
	+ * {@link #getPositionIncrementGap(String, int)}.
	+ *
	+ * @param fieldName Fieldable name being indexed.
	+ * @return position increment gap, added to the next token emitted from {@link #tokenStream(String,Reader)}
	+ * @deprecated replaced with {@link #getPositionIncrementGap(String, int)}
	+ */
	+ protected int getPositionIncrementGap(String fieldName)
	+ {
	+ return 0;
	+ }
	+
	+ /**
	* Invoked before indexing a Fieldable instance if
	* terms have already been added to that field. This allows custom
	* analyzers to place an automatic position increment gap between
	@@ -70,12 +85,18 @@
	* the typical default token position increment of 1, all terms in a field,
	* including across Fieldable instances, are in successive positions, allowing
	* exact PhraseQuery matches, for instance, across Fieldable instance boundaries.
	+ * The last token position is supplied to enable analyzers to 'line up' terms;
	+ * for example, for subsequent terms to start at positions that are multiples
	+ * of 100. Defaults to the constant gap supplied by
	+ * {@link #getPositionIncrementGap(String)}.
	*
	* @param fieldName Fieldable name being indexed.
	- * @return position increment gap, added to the next token emitted from {@link #tokenStream(String,Reader)}
	+ * @param lastTokenPosition the last token position for this field
	+ * @return position increment gap, added to the next token emitted from
	+ * {@link #tokenStream(String,Reader)}
	*/
	- public int getPositionIncrementGap(String fieldName)
	+ public int getPositionIncrementGap(String fieldName, int lastTokenPosition)
	{
	- return 0;
	+ return getPositionIncrementGap(fieldName);
	}
	}
	Index: src/test/org/apache/lucene/index/TestDocumentWriter.java
	===================================================================
	--- src/test/org/apache/lucene/index/TestDocumentWriter.java (revision 727243)
	+++ src/test/org/apache/lucene/index/TestDocumentWriter.java (working copy)
	@@ -102,7 +102,8 @@
	}
	}

	- public void testPositionIncrementGap() throws IOException {
	+ public void testOldPositionIncrementGap() throws IOException {
	+ // Tests the use of the legacy getPositionIncrementGap(String)
	Analyzer analyzer = new Analyzer() {
	public TokenStream tokenStream(String fieldName, Reader reader) {
	return new WhitespaceTokenizer(reader);
	@@ -112,6 +113,26 @@
	return 500;
	}
	};
	+ assertExpectedPositionIncrementGap(analyzer, 2, 0, 502);
	+ }
	+
	+ public void testNewPositionIncrementGap() throws IOException {
	+ // Tests the use of the new getPositionIncrementGap(String, int)
	+ Analyzer analyzer = new Analyzer() {
	+ public TokenStream tokenStream(String fieldName, Reader reader) {
	+ return new WhitespaceTokenizer(reader);
	+ }
	+
	+ public int getPositionIncrementGap(String fieldName, int lastTokenPosition) {
	+ // Calculate the gap that advances the position to the next multiple of 500
	+ return (((lastTokenPosition / 500) + 1) * 500) - lastTokenPosition;
	+ }
	+ };
	+ assertExpectedPositionIncrementGap(analyzer, 2, 0, 500);
	+ }
	+
	+ private void assertExpectedPositionIncrementGap(Analyzer analyzer,
	+ int expectedFreq, int firstPosition, int secondPosition) throws IOException {

	IndexWriter writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);

	@@ -128,9 +149,9 @@
	TermPositions termPositions = reader.termPositions(new Term("repeated", "repeated"));
	assertTrue(termPositions.next());
	int freq = termPositions.freq();
	- assertEquals(2, freq);
	- assertEquals(0, termPositions.nextPosition());
	- assertEquals(502, termPositions.nextPosition());
	+ assertEquals(expectedFreq, freq);
	+ assertEquals(firstPosition, termPositions.nextPosition());
	+ assertEquals(secondPosition, termPositions.nextPosition());
	}

	public void testTokenReuse() throws IOException {