docs/attachments/LUCENE-2864/LUCENE-2864.patch - lucene-jira-archive - Git at Google

 Index: lucene/src/test/org/apache/lucene/index/TestMaxTF.java
 ===================================================================
 --- lucene/src/test/org/apache/lucene/index/TestMaxTF.java	(revision 0)
 +++ lucene/src/test/org/apache/lucene/index/TestMaxTF.java	(revision 0)
 @@ -0,0 +1,116 @@
 +package org.apache.lucene.index;
 +
 +/**
 + * Licensed to the Apache Software Foundation (ASF) under one or more
 + * contributor license agreements.  See the NOTICE file distributed with
 + * this work for additional information regarding copyright ownership.
 + * The ASF licenses this file to You under the Apache License, Version 2.0
 + * (the "License"); you may not use this file except in compliance with
 + * the License.  You may obtain a copy of the License at
 + *
 + *     http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing, software
 + * distributed under the License is distributed on an "AS IS" BASIS,
 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 + * See the License for the specific language governing permissions and
 + * limitations under the License.
 + */
 +
 +import java.util.ArrayList;
 +import java.util.Arrays;
 +import java.util.Collections;
 +import java.util.List;
 +
 +import org.apache.lucene.analysis.MockAnalyzer;
 +import org.apache.lucene.analysis.MockTokenizer;
 +import org.apache.lucene.document.Document;
 +import org.apache.lucene.document.Field;
 +import org.apache.lucene.search.DefaultSimilarity;
 +import org.apache.lucene.store.Directory;
 +import org.apache.lucene.util.LuceneTestCase;
 +import org.apache.lucene.util._TestUtil;
 +
 +/**
 + * Tests the maxTF statistic in FieldInvertState
 + */
 +public class TestMaxTF extends LuceneTestCase {
 +  Directory dir;
 +  IndexReader reader;
 +  /* expected maxTF values for our documents */
 +  ArrayList<Integer> expected = new ArrayList<Integer>();
 +
 +  @Override
 +  public void setUp() throws Exception {
 +    super.setUp();
 +    dir = newDirectory();
 +    IndexWriterConfig config = newIndexWriterConfig(TEST_VERSION_CURRENT,
 +        new MockAnalyzer(MockTokenizer.SIMPLE, true));
 +    config.setSimilarity(new TestSimilarity());
 +    RandomIndexWriter writer = new RandomIndexWriter(random, dir, config);
 +    Document doc = new Document();
 +    Field foo = newField("foo", "", Field.Store.NO, Field.Index.ANALYZED);
 +    doc.add(foo);
 +    for (int i = 0; i < 100; i++) {
 +      foo.setValue(addValue());
 +      writer.addDocument(doc);
 +    }
 +    reader = writer.getReader();
 +    writer.close();
 +  }
 +
 +  @Override
 +  public void tearDown() throws Exception {
 +    reader.close();
 +    dir.close();
 +    super.tearDown();
 +  }
 +
 +  public void test() throws Exception {
 +    byte fooNorms[] = MultiNorms.norms(reader, "foo");
 +    for (int i = 0; i < reader.maxDoc(); i++)
 +      assertEquals(expected.get(i).intValue(), fooNorms[i] & 0xff);
 +  }
 +
 +  /**
 +   * Makes a bunch of single-char tokens (the max freq will be at most 255),
 +   * shuffles them around, and returns the whole list with Arrays.toString().
 +   * This works fine because the analyzer uses a letter tokenizer (MockTokenizer.SIMPLE).
 +   * Records the maximum term frequency in expected, to be checked against the norm.
 +   */
 +  private String addValue() {
 +    List<String> terms = new ArrayList<String>();
 +    int maxCeiling = _TestUtil.nextInt(random, 0, 255);
 +    int max = 0;
 +    for (char ch = 'a'; ch <= 'z'; ch++) {
 +      int num = _TestUtil.nextInt(random, 0, maxCeiling);
 +      for (int i = 0; i < num; i++)
 +        terms.add(Character.toString(ch));
 +      max = Math.max(max, num);
 +    }
 +    expected.add(max);
 +    Collections.shuffle(terms, random);
 +    return Arrays.toString(terms.toArray(new String[terms.size()]));
 +  }
 +
 +  /**
 +   * Simple similarity that encodes maxTF directly as a byte
 +   */
 +  class TestSimilarity extends DefaultSimilarity {
 +
 +    @Override
 +    public byte encodeNormValue(float f) {
 +      return (byte) f;
 +    }
 +
 +    @Override
 +    public float decodeNormValue(byte b) {
 +      return (float) b;
 +    }
 +
 +    @Override
 +    public float computeNorm(String field, FieldInvertState state) {
 +      return (float) state.maxTF;
 +    }
 +  }
 +}

 Property changes on: lucene\src\test\org\apache\lucene\index\TestMaxTF.java
 ___________________________________________________________________
 Added: svn:eol-style
    + native

 Index: lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
 ===================================================================
 --- lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java	(revision 1058413)
 +++ lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java	(working copy)
 @@ -125,6 +125,7 @@
        postings.docFreqs[termID] = 1;
        writeProx(termID, fieldState.position);
      }
 +    fieldState.maxTF = Math.max(1, fieldState.maxTF);
    }

    @Override
 @@ -158,11 +159,12 @@
            termsHashPerField.writeVInt(0, postings.docFreqs[termID]);
          }
          postings.docFreqs[termID] = 1;
 +        fieldState.maxTF = Math.max(1, fieldState.maxTF);
          postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1;
          postings.lastDocIDs[termID] = docState.docID;
          writeProx(termID, fieldState.position);
        } else {
 -        postings.docFreqs[termID]++;
 +        fieldState.maxTF = Math.max(fieldState.maxTF, ++postings.docFreqs[termID]);
          writeProx(termID, fieldState.position-postings.lastPositions[termID]);
        }
      }
 Index: lucene/src/java/org/apache/lucene/index/FieldInvertState.java
 ===================================================================
 --- lucene/src/java/org/apache/lucene/index/FieldInvertState.java	(revision 1058413)
 +++ lucene/src/java/org/apache/lucene/index/FieldInvertState.java	(working copy)
 @@ -30,6 +30,7 @@
    int length;
    int numOverlap;
    int offset;
 +  int maxTF;
    float boost;
    AttributeSource attributeSource;

 @@ -53,6 +54,7 @@
      length = 0;
      numOverlap = 0;
      offset = 0;
 +    maxTF = 0;
      boost = docBoost;
      attributeSource = null;
    }
 @@ -107,6 +109,13 @@
      return boost;
    }

 +  /**
 +   * Get the maximum term frequency encountered for any term in the field.
 +   */
 +  public int getMaxTF() {
 +    return maxTF;
 +  }
 +
    public void setBoost(float boost) {
      this.boost = boost;
    }
	Index: lucene/src/test/org/apache/lucene/index/TestMaxTF.java
	===================================================================
	--- lucene/src/test/org/apache/lucene/index/TestMaxTF.java (revision 0)
	+++ lucene/src/test/org/apache/lucene/index/TestMaxTF.java (revision 0)
	@@ -0,0 +1,116 @@
	+package org.apache.lucene.index;
	+
	+/**
	+ * Licensed to the Apache Software Foundation (ASF) under one or more
	+ * contributor license agreements. See the NOTICE file distributed with
	+ * this work for additional information regarding copyright ownership.
	+ * The ASF licenses this file to You under the Apache License, Version 2.0
	+ * (the "License"); you may not use this file except in compliance with
	+ * the License. You may obtain a copy of the License at
	+ *
	+ * http://www.apache.org/licenses/LICENSE-2.0
	+ *
	+ * Unless required by applicable law or agreed to in writing, software
	+ * distributed under the License is distributed on an "AS IS" BASIS,
	+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	+ * See the License for the specific language governing permissions and
	+ * limitations under the License.
	+ */
	+
	+import java.util.ArrayList;
	+import java.util.Arrays;
	+import java.util.Collections;
	+import java.util.List;
	+
	+import org.apache.lucene.analysis.MockAnalyzer;
	+import org.apache.lucene.analysis.MockTokenizer;
	+import org.apache.lucene.document.Document;
	+import org.apache.lucene.document.Field;
	+import org.apache.lucene.search.DefaultSimilarity;
	+import org.apache.lucene.store.Directory;
	+import org.apache.lucene.util.LuceneTestCase;
	+import org.apache.lucene.util._TestUtil;
	+
	+/**
	+ * Tests the maxTF statistic in FieldInvertState
	+ */
	+public class TestMaxTF extends LuceneTestCase {
	+ Directory dir;
	+ IndexReader reader;
	+ /* expected maxTF values for our documents */
	+ ArrayList<Integer> expected = new ArrayList<Integer>();
	+
	+ @Override
	+ public void setUp() throws Exception {
	+ super.setUp();
	+ dir = newDirectory();
	+ IndexWriterConfig config = newIndexWriterConfig(TEST_VERSION_CURRENT,
	+ new MockAnalyzer(MockTokenizer.SIMPLE, true));
	+ config.setSimilarity(new TestSimilarity());
	+ RandomIndexWriter writer = new RandomIndexWriter(random, dir, config);
	+ Document doc = new Document();
	+ Field foo = newField("foo", "", Field.Store.NO, Field.Index.ANALYZED);
	+ doc.add(foo);
	+ for (int i = 0; i < 100; i++) {
	+ foo.setValue(addValue());
	+ writer.addDocument(doc);
	+ }
	+ reader = writer.getReader();
	+ writer.close();
	+ }
	+
	+ @Override
	+ public void tearDown() throws Exception {
	+ reader.close();
	+ dir.close();
	+ super.tearDown();
	+ }
	+
	+ public void test() throws Exception {
	+ byte fooNorms[] = MultiNorms.norms(reader, "foo");
	+ for (int i = 0; i < reader.maxDoc(); i++)
	+ assertEquals(expected.get(i).intValue(), fooNorms[i] & 0xff);
	+ }
	+
	+ /**
	+ * Makes a bunch of single-char tokens (the max freq will be at most 255),
	+ * shuffles them around, and returns the whole list with Arrays.toString().
	+ * This works fine because the analyzer uses a letter tokenizer (MockTokenizer.SIMPLE).
	+ * Records the maximum term frequency in expected, to be checked against the norm.
	+ */
	+ private String addValue() {
	+ List<String> terms = new ArrayList<String>();
	+ int maxCeiling = _TestUtil.nextInt(random, 0, 255);
	+ int max = 0;
	+ for (char ch = 'a'; ch <= 'z'; ch++) {
	+ int num = _TestUtil.nextInt(random, 0, maxCeiling);
	+ for (int i = 0; i < num; i++)
	+ terms.add(Character.toString(ch));
	+ max = Math.max(max, num);
	+ }
	+ expected.add(max);
	+ Collections.shuffle(terms, random);
	+ return Arrays.toString(terms.toArray(new String[terms.size()]));
	+ }
	+
	+ /**
	+ * Simple similarity that encodes maxTF directly as a byte
	+ */
	+ class TestSimilarity extends DefaultSimilarity {
	+
	+ @Override
	+ public byte encodeNormValue(float f) {
	+ return (byte) f;
	+ }
	+
	+ @Override
	+ public float decodeNormValue(byte b) {
	+ return (float) b;
	+ }
	+
	+ @Override
	+ public float computeNorm(String field, FieldInvertState state) {
	+ return (float) state.maxTF;
	+ }
	+ }
	+}

	Property changes on: lucene\src\test\org\apache\lucene\index\TestMaxTF.java
	___________________________________________________________________
	Added: svn:eol-style
	+ native

	Index: lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
	===================================================================
	--- lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java (revision 1058413)
	+++ lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java (working copy)
	@@ -125,6 +125,7 @@
	postings.docFreqs[termID] = 1;
	writeProx(termID, fieldState.position);
	}
	+ fieldState.maxTF = Math.max(1, fieldState.maxTF);
	}

	@Override
	@@ -158,11 +159,12 @@
	termsHashPerField.writeVInt(0, postings.docFreqs[termID]);
	}
	postings.docFreqs[termID] = 1;
	+ fieldState.maxTF = Math.max(1, fieldState.maxTF);
	postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1;
	postings.lastDocIDs[termID] = docState.docID;
	writeProx(termID, fieldState.position);
	} else {
	- postings.docFreqs[termID]++;
	+ fieldState.maxTF = Math.max(fieldState.maxTF, ++postings.docFreqs[termID]);
	writeProx(termID, fieldState.position-postings.lastPositions[termID]);
	}
	}
	Index: lucene/src/java/org/apache/lucene/index/FieldInvertState.java
	===================================================================
	--- lucene/src/java/org/apache/lucene/index/FieldInvertState.java (revision 1058413)
	+++ lucene/src/java/org/apache/lucene/index/FieldInvertState.java (working copy)
	@@ -30,6 +30,7 @@
	int length;
	int numOverlap;
	int offset;
	+ int maxTF;
	float boost;
	AttributeSource attributeSource;

	@@ -53,6 +54,7 @@
	length = 0;
	numOverlap = 0;
	offset = 0;
	+ maxTF = 0;
	boost = docBoost;
	attributeSource = null;
	}
	@@ -107,6 +109,13 @@
	return boost;
	}

	+ /**
	+ * Get the maximum term frequency encountered for any term in the field.
	+ */
	+ public int getMaxTF() {
	+ return maxTF;
	+ }
	+
	public void setBoost(float boost) {
	this.boost = boost;
	}