blob: 71b8d941d01c5ac228cc259d3ebfee3db1febdc0 [file] [log] [blame]
Index: lucene/src/test/org/apache/lucene/index/TestMaxTF.java
===================================================================
--- lucene/src/test/org/apache/lucene/index/TestMaxTF.java (revision 0)
+++ lucene/src/test/org/apache/lucene/index/TestMaxTF.java (revision 0)
@@ -0,0 +1,116 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
+
+/**
+ * Tests the maxTF statistic in FieldInvertState
+ */
+public class TestMaxTF extends LuceneTestCase {
+ Directory dir;
+ IndexReader reader;
+ /* expected maxTF values for our documents */
+ ArrayList<Integer> expected = new ArrayList<Integer>();
+
+ @Override
+ public void setUp() throws Exception {
+ super.setUp();
+ dir = newDirectory();
+ IndexWriterConfig config = newIndexWriterConfig(TEST_VERSION_CURRENT,
+ new MockAnalyzer(MockTokenizer.SIMPLE, true));
+ config.setSimilarity(new TestSimilarity());
+ RandomIndexWriter writer = new RandomIndexWriter(random, dir, config);
+ Document doc = new Document();
+ Field foo = newField("foo", "", Field.Store.NO, Field.Index.ANALYZED);
+ doc.add(foo);
+ for (int i = 0; i < 100; i++) {
+ foo.setValue(addValue());
+ writer.addDocument(doc);
+ }
+ reader = writer.getReader();
+ writer.close();
+ }
+
+ @Override
+ public void tearDown() throws Exception {
+ reader.close();
+ dir.close();
+ super.tearDown();
+ }
+
+ public void test() throws Exception {
+ byte fooNorms[] = MultiNorms.norms(reader, "foo");
+ for (int i = 0; i < reader.maxDoc(); i++)
+ assertEquals(expected.get(i).intValue(), fooNorms[i] & 0xff);
+ }
+
+ /**
+ * Makes a bunch of single-char tokens (the max freq will at most be 255).
+ * shuffles them around, and returns the whole list with Arrays.toString().
+ * This works fine because we use lettertokenizer.
+ * puts the max-frequency term into expected, to be checked against the norm.
+ */
+ private String addValue() {
+ List<String> terms = new ArrayList<String>();
+ int maxCeiling = _TestUtil.nextInt(random, 0, 255);
+ int max = 0;
+ for (char ch = 'a'; ch <= 'z'; ch++) {
+ int num = _TestUtil.nextInt(random, 0, maxCeiling);
+ for (int i = 0; i < num; i++)
+ terms.add(Character.toString(ch));
+ max = Math.max(max, num);
+ }
+ expected.add(max);
+ Collections.shuffle(terms, random);
+ return Arrays.toString(terms.toArray(new String[terms.size()]));
+ }
+
+ /**
+ * Simple similarity that encodes maxTF directly as a byte
+ */
+ class TestSimilarity extends DefaultSimilarity {
+
+ @Override
+ public byte encodeNormValue(float f) {
+ return (byte) f;
+ }
+
+ @Override
+ public float decodeNormValue(byte b) {
+ return (float) b;
+ }
+
+ @Override
+ public float computeNorm(String field, FieldInvertState state) {
+ return (float) state.maxTF;
+ }
+ }
+}
Property changes on: lucene\src\test\org\apache\lucene\index\TestMaxTF.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java (revision 1058413)
+++ lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java (working copy)
@@ -125,6 +125,7 @@
postings.docFreqs[termID] = 1;
writeProx(termID, fieldState.position);
}
+ fieldState.maxTF = Math.max(1, fieldState.maxTF);
}
@Override
@@ -158,11 +159,12 @@
termsHashPerField.writeVInt(0, postings.docFreqs[termID]);
}
postings.docFreqs[termID] = 1;
+ fieldState.maxTF = Math.max(1, fieldState.maxTF);
postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1;
postings.lastDocIDs[termID] = docState.docID;
writeProx(termID, fieldState.position);
} else {
- postings.docFreqs[termID]++;
+ fieldState.maxTF = Math.max(fieldState.maxTF, ++postings.docFreqs[termID]);
writeProx(termID, fieldState.position-postings.lastPositions[termID]);
}
}
Index: lucene/src/java/org/apache/lucene/index/FieldInvertState.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/FieldInvertState.java (revision 1058413)
+++ lucene/src/java/org/apache/lucene/index/FieldInvertState.java (working copy)
@@ -30,6 +30,7 @@
int length;
int numOverlap;
int offset;
+ int maxTF;
float boost;
AttributeSource attributeSource;
@@ -53,6 +54,7 @@
length = 0;
numOverlap = 0;
offset = 0;
+ maxTF = 0;
boost = docBoost;
attributeSource = null;
}
@@ -107,6 +109,13 @@
return boost;
}
+ /**
+ * Get the maximum term-frequency encountered.
+ */
+ public int getMaxTF() {
+ return maxTF;
+ }
+
public void setBoost(float boost) {
this.boost = boost;
}