package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Collection;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MockRAMDirectory;
import org.apache.lucene.search.Explanation.IDFExplanation;
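/**
 * Tests for the omitTermFreqAndPositions field option: that the bit is
 * recorded in FieldInfos, that it survives merging (once any document
 * omits freqs/positions for a field, the merged field keeps the omit
 * bit), that no .prx (positions) file is written when every field omits
 * them, and that scoring sees a constant term frequency of 1 for
 * omitting fields.
 *
 * A typical application-side usage sketch (hypothetical field name):
 *
 *   Field id = new Field("id", value, Field.Store.NO, Field.Index.ANALYZED);
 *   id.setOmitTermFreqAndPositions(true); // freqs/positions not needed for pure boolean matching
 */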
public class TestOmitTf extends LuceneTestCase {
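  // A Similarity that neutralizes every scoring factor except raw term
  // frequency (tf(freq) = freq), so the scores asserted in testBasic are
  // predictable: 1.0 for fields that omit freqs, the actual freq otherwise.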
public static class SimpleSimilarity extends Similarity {
@Override public float lengthNorm(String field, int numTerms) { return 1.0f; }
@Override public float queryNorm(float sumOfSquaredWeights) { return 1.0f; }
@Override public float tf(float freq) { return freq; }
@Override public float sloppyFreq(int distance) { return 2.0f; }
@Override public float idf(int docFreq, int numDocs) { return 1.0f; }
@Override public float coord(int overlap, int maxOverlap) { return 1.0f; }
@Override public IDFExplanation idfExplain(Collection<Term> terms, Searcher searcher) throws IOException {
return new IDFExplanation() {
@Override
public float getIdf() {
return 1.0f;
}
@Override
public String explain() {
return "Inexplicable";
}
};
}
}
  // Tests whether the DocumentWriter correctly enables the
  // omitTermFreqAndPositions bit in the FieldInfo
public void testOmitTermFreqAndPositions() throws Exception {
Directory ram = new MockRAMDirectory();
Analyzer analyzer = new StandardAnalyzer(TEST_VERSION_CURRENT);
IndexWriter writer = new IndexWriter(ram, new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
Document d = new Document();
// this field will have Tf
Field f1 = new Field("f1", "This field has term freqs", Field.Store.NO, Field.Index.ANALYZED);
d.add(f1);
// this field will NOT have Tf
Field f2 = new Field("f2", "This field has NO Tf in all docs", Field.Store.NO, Field.Index.ANALYZED);
f2.setOmitTermFreqAndPositions(true);
d.add(f2);
writer.addDocument(d);
writer.optimize();
    // now we add another document which has term freqs for field f2 and
    // not for f1, and verify that the SegmentMerger keeps things consistent
    d = new Document();
    // Reverse the omit settings
f1.setOmitTermFreqAndPositions(true);
d.add(f1);
f2.setOmitTermFreqAndPositions(false);
d.add(f2);
writer.addDocument(d);
// force merge
writer.optimize();
// flush
writer.close();
_TestUtil.checkIndex(ram);
SegmentReader reader = SegmentReader.getOnlySegmentReader(ram);
FieldInfos fi = reader.fieldInfos();
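    // Omission is "sticky": because each field omitted freqs/positions in
    // at least one document, the merged FieldInfo keeps the bit set for both.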
assertTrue("OmitTermFreqAndPositions field bit should be set.", fi.fieldInfo("f1").omitTermFreqAndPositions);
assertTrue("OmitTermFreqAndPositions field bit should be set.", fi.fieldInfo("f2").omitTermFreqAndPositions);
reader.close();
ram.close();
}
// Tests whether merging of docs that have different
// omitTermFreqAndPositions for the same field works
public void testMixedMerge() throws Exception {
Directory ram = new MockRAMDirectory();
Analyzer analyzer = new StandardAnalyzer(TEST_VERSION_CURRENT);
IndexWriter writer = new IndexWriter(ram, new IndexWriterConfig(
TEST_VERSION_CURRENT, analyzer).setMaxBufferedDocs(3));
((LogMergePolicy) writer.getConfig().getMergePolicy()).setMergeFactor(2);
Document d = new Document();
// this field will have Tf
Field f1 = new Field("f1", "This field has term freqs", Field.Store.NO, Field.Index.ANALYZED);
d.add(f1);
// this field will NOT have Tf
Field f2 = new Field("f2", "This field has NO Tf in all docs", Field.Store.NO, Field.Index.ANALYZED);
f2.setOmitTermFreqAndPositions(true);
d.add(f2);
for(int i=0;i<30;i++)
writer.addDocument(d);
    // now we add another document which has term freqs for field f2 and
    // not for f1, and verify that the SegmentMerger keeps things consistent
    d = new Document();
    // Reverse the omit settings
f1.setOmitTermFreqAndPositions(true);
d.add(f1);
f2.setOmitTermFreqAndPositions(false);
d.add(f2);
for(int i=0;i<30;i++)
writer.addDocument(d);
// force merge
writer.optimize();
// flush
writer.close();
_TestUtil.checkIndex(ram);
SegmentReader reader = SegmentReader.getOnlySegmentReader(ram);
FieldInfos fi = reader.fieldInfos();
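    // As in testOmitTermFreqAndPositions: each field omitted freqs in some
    // documents, so after merging the omit bit is set for both fields.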
assertTrue("OmitTermFreqAndPositions field bit should be set.", fi.fieldInfo("f1").omitTermFreqAndPositions);
assertTrue("OmitTermFreqAndPositions field bit should be set.", fi.fieldInfo("f2").omitTermFreqAndPositions);
reader.close();
ram.close();
}
  // Make sure that first adding docs that do not omit term freqs and
  // positions for field X, then adding docs that do omit them for that
  // same field, leaves the omitTermFreqAndPositions bit set for X
public void testMixedRAM() throws Exception {
Directory ram = new MockRAMDirectory();
Analyzer analyzer = new StandardAnalyzer(TEST_VERSION_CURRENT);
IndexWriter writer = new IndexWriter(ram, new IndexWriterConfig(
TEST_VERSION_CURRENT, analyzer).setMaxBufferedDocs(10));
((LogMergePolicy) writer.getConfig().getMergePolicy()).setMergeFactor(2);
Document d = new Document();
// this field will have Tf
Field f1 = new Field("f1", "This field has term freqs", Field.Store.NO, Field.Index.ANALYZED);
d.add(f1);
// this field will NOT have Tf
Field f2 = new Field("f2", "This field has NO Tf in all docs", Field.Store.NO, Field.Index.ANALYZED);
d.add(f2);
for(int i=0;i<5;i++)
writer.addDocument(d);
f2.setOmitTermFreqAndPositions(true);
for(int i=0;i<20;i++)
writer.addDocument(d);
// force merge
writer.optimize();
// flush
writer.close();
_TestUtil.checkIndex(ram);
SegmentReader reader = SegmentReader.getOnlySegmentReader(ram);
FieldInfos fi = reader.fieldInfos();
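    // f1 never omitted freqs, so its bit stays clear; f2 omitted them for
    // the last 20 docs, which is enough to set (and keep) its bit.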
assertTrue("OmitTermFreqAndPositions field bit should not be set.", !fi.fieldInfo("f1").omitTermFreqAndPositions);
assertTrue("OmitTermFreqAndPositions field bit should be set.", fi.fieldInfo("f2").omitTermFreqAndPositions);
reader.close();
ram.close();
}
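  // The .prx extension holds the proximity (positions) data; if every
  // field omits positions, no such file should be written.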
private void assertNoPrx(Directory dir) throws Throwable {
final String[] files = dir.listAll();
for(int i=0;i<files.length;i++)
assertFalse(files[i].endsWith(".prx"));
}
// Verifies no *.prx exists when all fields omit term freq:
public void testNoPrxFile() throws Throwable {
Directory ram = new MockRAMDirectory();
Analyzer analyzer = new StandardAnalyzer(TEST_VERSION_CURRENT);
IndexWriter writer = new IndexWriter(ram, new IndexWriterConfig(
TEST_VERSION_CURRENT, analyzer).setMaxBufferedDocs(3));
LogMergePolicy lmp = (LogMergePolicy) writer.getConfig().getMergePolicy();
lmp.setMergeFactor(2);
lmp.setUseCompoundFile(false);
lmp.setUseCompoundDocStore(false);
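    // compound files are disabled so the per-extension files (.prx etc.)
    // remain visible to assertNoPrx below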
Document d = new Document();
Field f1 = new Field("f1", "This field has term freqs", Field.Store.NO, Field.Index.ANALYZED);
f1.setOmitTermFreqAndPositions(true);
d.add(f1);
for(int i=0;i<30;i++)
writer.addDocument(d);
writer.commit();
assertNoPrx(ram);
// force merge
writer.optimize();
// flush
writer.close();
assertNoPrx(ram);
_TestUtil.checkIndex(ram);
ram.close();
}
  // Tests scores with one field that has term freqs and one that omits them, otherwise with equal content
public void testBasic() throws Exception {
Directory dir = new MockRAMDirectory();
Analyzer analyzer = new StandardAnalyzer(TEST_VERSION_CURRENT);
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
TEST_VERSION_CURRENT, analyzer).setMaxBufferedDocs(2)
.setSimilarity(new SimpleSimilarity()));
((LogMergePolicy) writer.getConfig().getMergePolicy()).setMergeFactor(2);
StringBuilder sb = new StringBuilder(265);
String term = "term";
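    // doc i contains `term` (i+1) times; even docs additionally get a "tf"
    // token in the tf field, odd docs a "notf" token in the noTf field, so
    // each of those extra terms matches exactly 15 of the 30 docs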
for(int i = 0; i<30; i++){
Document d = new Document();
sb.append(term).append(" ");
String content = sb.toString();
Field noTf = new Field("noTf", content + (i%2==0 ? "" : " notf"), Field.Store.NO, Field.Index.ANALYZED);
noTf.setOmitTermFreqAndPositions(true);
d.add(noTf);
Field tf = new Field("tf", content + (i%2==0 ? " tf" : ""), Field.Store.NO, Field.Index.ANALYZED);
d.add(tf);
writer.addDocument(d);
//System.out.println(d);
}
writer.optimize();
// flush
writer.close();
_TestUtil.checkIndex(dir);
/*
* Verify the index
*/
Searcher searcher = new IndexSearcher(dir, true);
searcher.setSimilarity(new SimpleSimilarity());
Term a = new Term("noTf", term);
Term b = new Term("tf", term);
Term c = new Term("noTf", "notf");
Term d = new Term("tf", "tf");
TermQuery q1 = new TermQuery(a);
TermQuery q2 = new TermQuery(b);
TermQuery q3 = new TermQuery(c);
TermQuery q4 = new TermQuery(d);
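    // q1/q2 match all 30 docs; q3 matches only the odd docs and q4 only the
    // even docs. With SimpleSimilarity the score is exactly the term freq:
    // always 1.0 where freqs are omitted, (doc+1) where they are kept.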
searcher.search(q1,
new CountingHitCollector() {
private Scorer scorer;
@Override
public final void setScorer(Scorer scorer) {
this.scorer = scorer;
}
@Override
          public final void collect(int doc) throws IOException {
            float score = scorer.score();
            //System.out.println("Q1: Doc=" + doc + " score=" + score);
            assertTrue(score==1.0f);
super.collect(doc);
}
});
//System.out.println(CountingHitCollector.getCount());
searcher.search(q2,
new CountingHitCollector() {
private Scorer scorer;
@Override
public final void setScorer(Scorer scorer) {
this.scorer = scorer;
}
@Override
          public final void collect(int doc) throws IOException {
            float score = scorer.score();
            //System.out.println("Q2: Doc=" + doc + " score=" + score);
            assertTrue(score==1.0f+doc);
super.collect(doc);
}
});
//System.out.println(CountingHitCollector.getCount());
searcher.search(q3,
new CountingHitCollector() {
private Scorer scorer;
@Override
public final void setScorer(Scorer scorer) {
this.scorer = scorer;
}
@Override
          public final void collect(int doc) throws IOException {
            float score = scorer.score();
            //System.out.println("Q3: Doc=" + doc + " score=" + score);
            assertTrue(score==1.0f);
assertFalse(doc%2==0);
super.collect(doc);
}
});
//System.out.println(CountingHitCollector.getCount());
searcher.search(q4,
new CountingHitCollector() {
private Scorer scorer;
@Override
public final void setScorer(Scorer scorer) {
this.scorer = scorer;
}
@Override
public final void collect(int doc) throws IOException {
float score = scorer.score();
//System.out.println("Q1: Doc=" + doc + " score=" + score);
assertTrue(score==1.0f);
assertTrue(doc%2==0);
super.collect(doc);
}
});
//System.out.println(CountingHitCollector.getCount());
BooleanQuery bq = new BooleanQuery();
bq.add(q1,Occur.MUST);
bq.add(q4,Occur.MUST);
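    // the conjunction of q1 (all docs) and q4 (even docs only) should
    // match exactly the 15 even-numbered docs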
searcher.search(bq,
new CountingHitCollector() {
@Override
public final void collect(int doc) throws IOException {
//System.out.println("BQ: Doc=" + doc + " score=" + score);
super.collect(doc);
}
});
assertTrue(15 == CountingHitCollector.getCount());
searcher.close();
dir.close();
}
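  // Collector that counts hits into static fields (reset by the
  // constructor) so the totals can be read after search() returns.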
public static class CountingHitCollector extends Collector {
static int count=0;
static int sum=0;
private int docBase = -1;
CountingHitCollector(){count=0;sum=0;}
@Override
public void setScorer(Scorer scorer) throws IOException {}
@Override
public void collect(int doc) throws IOException {
count++;
sum += doc + docBase; // use it to avoid any possibility of being optimized away
}
public static int getCount() { return count; }
public static int getSum() { return sum; }
@Override
public void setNextReader(IndexReader reader, int docBase) {
this.docBase = docBase;
}
@Override
public boolean acceptsDocsOutOfOrder() {
return true;
}
}
}