| package org.apache.lucene.index; |
| |
| /** |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| import java.io.IOException; |
| import java.util.Collection; |
| |
| import org.apache.lucene.util.LuceneTestCase; |
| import org.apache.lucene.util._TestUtil; |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.standard.StandardAnalyzer; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.search.BooleanQuery; |
| import org.apache.lucene.search.IndexSearcher; |
| import org.apache.lucene.search.Collector; |
| import org.apache.lucene.search.Scorer; |
| import org.apache.lucene.search.Searcher; |
| import org.apache.lucene.search.Similarity; |
| import org.apache.lucene.search.TermQuery; |
| import org.apache.lucene.search.BooleanClause.Occur; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.store.MockRAMDirectory; |
| import org.apache.lucene.search.Explanation.IDFExplanation; |
| |
| |
| public class TestOmitTf extends LuceneTestCase { |
| |
| /** |
|  * Similarity stub for these tests: {@link #tf(float)} returns the raw frequency, |
|  * {@link #sloppyFreq(int)} returns 2, and every other factor is the constant 1. |
|  */ |
| public static class SimpleSimilarity extends Similarity { |
|   // Shared value for every factor that is fixed at 1. |
|   private static final float NEUTRAL = 1.0f; |
| |
|   @Override |
|   public float lengthNorm(String field, int numTerms) { |
|     return NEUTRAL; |
|   } |
| |
|   @Override |
|   public float queryNorm(float sumOfSquaredWeights) { |
|     return NEUTRAL; |
|   } |
| |
|   @Override |
|   public float tf(float freq) { |
|     // The raw frequency is passed through unchanged. |
|     return freq; |
|   } |
| |
|   @Override |
|   public float sloppyFreq(int distance) { |
|     return 2.0f; |
|   } |
| |
|   @Override |
|   public float idf(int docFreq, int numDocs) { |
|     return NEUTRAL; |
|   } |
| |
|   @Override |
|   public float coord(int overlap, int maxOverlap) { |
|     return NEUTRAL; |
|   } |
| |
|   @Override |
|   public IDFExplanation idfExplain(Collection<Term> terms, Searcher searcher) throws IOException { |
|     // A fresh explanation object per call, always reporting the same idf. |
|     final IDFExplanation constantIdf = new IDFExplanation() { |
|       @Override |
|       public float getIdf() { |
|         return NEUTRAL; |
|       } |
| |
|       @Override |
|       public String explain() { |
|         return "Inexplicable"; |
|       } |
|     }; |
|     return constantIdf; |
|   } |
| } |
| |
| // Tests whether the DocumentWriter correctly enables the |
| // omitTermFreqAndPositions bit in the FieldInfo |
| public void testOmitTermFreqAndPositions() throws Exception { |
|   final Directory dir = new MockRAMDirectory(); |
|   final Analyzer standard = new StandardAnalyzer(TEST_VERSION_CURRENT); |
|   final IndexWriterConfig config = new IndexWriterConfig(TEST_VERSION_CURRENT, standard); |
|   final IndexWriter indexWriter = new IndexWriter(dir, config); |
| |
|   // First document: f1 keeps its term freqs, f2 omits them. |
|   final Field firstField = new Field("f1", "This field has term freqs", Field.Store.NO, Field.Index.ANALYZED); |
|   final Field secondField = new Field("f2", "This field has NO Tf in all docs", Field.Store.NO, Field.Index.ANALYZED); |
|   secondField.setOmitTermFreqAndPositions(true); |
| |
|   final Document firstDoc = new Document(); |
|   firstDoc.add(firstField); |
|   firstDoc.add(secondField); |
|   indexWriter.addDocument(firstDoc); |
|   indexWriter.optimize(); |
| |
|   // Second document reuses the same Field instances with the flags reversed |
|   // (f1 omits, f2 does not), to verify that the SegmentMerger keeps things constant. |
|   firstField.setOmitTermFreqAndPositions(true); |
|   secondField.setOmitTermFreqAndPositions(false); |
| |
|   final Document secondDoc = new Document(); |
|   secondDoc.add(firstField); |
|   secondDoc.add(secondField); |
|   indexWriter.addDocument(secondDoc); |
| |
|   // Force a merge, then flush by closing the writer. |
|   indexWriter.optimize(); |
|   indexWriter.close(); |
|   _TestUtil.checkIndex(dir); |
| |
|   // Both fields are asserted to carry the omit bit in the single remaining segment. |
|   final SegmentReader segment = SegmentReader.getOnlySegmentReader(dir); |
|   final FieldInfos infos = segment.fieldInfos(); |
|   assertTrue("OmitTermFreqAndPositions field bit should be set.", infos.fieldInfo("f1").omitTermFreqAndPositions); |
|   assertTrue("OmitTermFreqAndPositions field bit should be set.", infos.fieldInfo("f2").omitTermFreqAndPositions); |
| |
|   segment.close(); |
|   dir.close(); |
| } |
| |
| // Tests whether merging of docs that have different |
| // omitTermFreqAndPositions for the same field works |
| public void testMixedMerge() throws Exception { |
|   final Directory dir = new MockRAMDirectory(); |
|   final Analyzer standard = new StandardAnalyzer(TEST_VERSION_CURRENT); |
|   final IndexWriterConfig config = |
|       new IndexWriterConfig(TEST_VERSION_CURRENT, standard).setMaxBufferedDocs(3); |
|   final IndexWriter indexWriter = new IndexWriter(dir, config); |
|   final LogMergePolicy mergePolicy = (LogMergePolicy) indexWriter.getConfig().getMergePolicy(); |
|   mergePolicy.setMergeFactor(2); |
| |
|   // First batch: f1 keeps its term freqs, f2 omits them. |
|   final Field firstField = new Field("f1", "This field has term freqs", Field.Store.NO, Field.Index.ANALYZED); |
|   final Field secondField = new Field("f2", "This field has NO Tf in all docs", Field.Store.NO, Field.Index.ANALYZED); |
|   secondField.setOmitTermFreqAndPositions(true); |
| |
|   final Document firstDoc = new Document(); |
|   firstDoc.add(firstField); |
|   firstDoc.add(secondField); |
|   for (int added = 0; added < 30; added++) { |
|     indexWriter.addDocument(firstDoc); |
|   } |
| |
|   // Second batch reuses the same Field instances with the flags reversed |
|   // (f1 omits, f2 does not), to verify that the SegmentMerger keeps things constant. |
|   firstField.setOmitTermFreqAndPositions(true); |
|   secondField.setOmitTermFreqAndPositions(false); |
| |
|   final Document secondDoc = new Document(); |
|   secondDoc.add(firstField); |
|   secondDoc.add(secondField); |
|   for (int added = 0; added < 30; added++) { |
|     indexWriter.addDocument(secondDoc); |
|   } |
| |
|   // Force a merge, then flush by closing the writer. |
|   indexWriter.optimize(); |
|   indexWriter.close(); |
| |
|   _TestUtil.checkIndex(dir); |
| |
|   // Both fields are asserted to carry the omit bit in the single remaining segment. |
|   final SegmentReader segment = SegmentReader.getOnlySegmentReader(dir); |
|   final FieldInfos infos = segment.fieldInfos(); |
|   assertTrue("OmitTermFreqAndPositions field bit should be set.", infos.fieldInfo("f1").omitTermFreqAndPositions); |
|   assertTrue("OmitTermFreqAndPositions field bit should be set.", infos.fieldInfo("f2").omitTermFreqAndPositions); |
| |
|   segment.close(); |
|   dir.close(); |
| } |
| |
| // Make sure that first adding docs that do not omitTermFreqAndPositions for |
| // field X, then adding docs that do omitTermFreqAndPositions for that same |
| // field, leaves the omitTermFreqAndPositions bit set for that field. |
| public void testMixedRAM() throws Exception { |
|   final Directory dir = new MockRAMDirectory(); |
|   final Analyzer standard = new StandardAnalyzer(TEST_VERSION_CURRENT); |
|   final IndexWriterConfig config = |
|       new IndexWriterConfig(TEST_VERSION_CURRENT, standard).setMaxBufferedDocs(10); |
|   final IndexWriter indexWriter = new IndexWriter(dir, config); |
|   final LogMergePolicy mergePolicy = (LogMergePolicy) indexWriter.getConfig().getMergePolicy(); |
|   mergePolicy.setMergeFactor(2); |
| |
|   // Both fields start out with term freqs enabled. |
|   final Field firstField = new Field("f1", "This field has term freqs", Field.Store.NO, Field.Index.ANALYZED); |
|   final Field secondField = new Field("f2", "This field has NO Tf in all docs", Field.Store.NO, Field.Index.ANALYZED); |
| |
|   final Document doc = new Document(); |
|   doc.add(firstField); |
|   doc.add(secondField); |
| |
|   for (int added = 0; added < 5; added++) { |
|     indexWriter.addDocument(doc); |
|   } |
| |
|   // From here on f2 omits term freqs; the same Document instance keeps being added. |
|   secondField.setOmitTermFreqAndPositions(true); |
| |
|   for (int added = 0; added < 20; added++) { |
|     indexWriter.addDocument(doc); |
|   } |
| |
|   // Force a merge, then flush by closing the writer. |
|   indexWriter.optimize(); |
|   indexWriter.close(); |
| |
|   _TestUtil.checkIndex(dir); |
| |
|   // f1 never omitted, so its bit must be clear; f2 must have the bit set. |
|   final SegmentReader segment = SegmentReader.getOnlySegmentReader(dir); |
|   final FieldInfos infos = segment.fieldInfos(); |
|   assertFalse("OmitTermFreqAndPositions field bit should not be set.", infos.fieldInfo("f1").omitTermFreqAndPositions); |
|   assertTrue("OmitTermFreqAndPositions field bit should be set.", infos.fieldInfo("f2").omitTermFreqAndPositions); |
| |
|   segment.close(); |
|   dir.close(); |
| } |
| |
| /** |
|  * Fails if any file name listed by {@code dir} ends with ".prx". |
|  * |
|  * @param dir directory whose file listing is inspected |
|  * @throws Throwable whatever {@code dir.listAll()} throws, or the assertion failure |
|  */ |
| private void assertNoPrx(Directory dir) throws Throwable { |
|   for (final String fileName : dir.listAll()) { |
|     assertFalse(fileName.endsWith(".prx")); |
|   } |
| } |
| |
| // Verifies no *.prx exists when all fields omit term freq: |
| public void testNoPrxFile() throws Throwable { |
| Directory ram = new MockRAMDirectory(); |
| Analyzer analyzer = new StandardAnalyzer(TEST_VERSION_CURRENT); |
| IndexWriter writer = new IndexWriter(ram, new IndexWriterConfig( |
| TEST_VERSION_CURRENT, analyzer).setMaxBufferedDocs(3)); |
| LogMergePolicy lmp = (LogMergePolicy) writer.getConfig().getMergePolicy(); |
| lmp.setMergeFactor(2); |
| lmp.setUseCompoundFile(false); |
| lmp.setUseCompoundDocStore(false); |
| Document d = new Document(); |
| |
| Field f1 = new Field("f1", "This field has term freqs", Field.Store.NO, Field.Index.ANALYZED); |
| f1.setOmitTermFreqAndPositions(true); |
| d.add(f1); |
| |
| for(int i=0;i<30;i++) |
| writer.addDocument(d); |
| |
| writer.commit(); |
| |
| assertNoPrx(ram); |
| |
| // force merge |
| writer.optimize(); |
| // flush |
| writer.close(); |
| |
| assertNoPrx(ram); |
| _TestUtil.checkIndex(ram); |
| ram.close(); |
| } |
| |
| // Test scores with one field with Term Freqs and one without, otherwise with equal content |
| public void testBasic() throws Exception { |
|   Directory dir = new MockRAMDirectory(); |
|   Analyzer analyzer = new StandardAnalyzer(TEST_VERSION_CURRENT); |
|   IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig( |
|       TEST_VERSION_CURRENT, analyzer).setMaxBufferedDocs(2) |
|       .setSimilarity(new SimpleSimilarity())); |
|   ((LogMergePolicy) writer.getConfig().getMergePolicy()).setMergeFactor(2); |
| |
|   final int numDocs = 30; |
|   final String term = "term"; |
| |
|   // Document i holds (i + 1) copies of "term" in both fields. Odd documents |
|   // additionally get "notf" in the noTf field, even documents get "tf" in the tf field. |
|   StringBuilder sb = new StringBuilder(265); |
|   for (int i = 0; i < numDocs; i++) { |
|     Document doc = new Document(); |
|     sb.append(term).append(" "); |
|     String content = sb.toString(); |
|     Field noTf = new Field("noTf", content + (i%2==0 ? "" : " notf"), Field.Store.NO, Field.Index.ANALYZED); |
|     noTf.setOmitTermFreqAndPositions(true); |
|     doc.add(noTf); |
| |
|     Field tf = new Field("tf", content + (i%2==0 ? " tf" : ""), Field.Store.NO, Field.Index.ANALYZED); |
|     doc.add(tf); |
| |
|     writer.addDocument(doc); |
|   } |
| |
|   writer.optimize(); |
|   // flush |
|   writer.close(); |
|   _TestUtil.checkIndex(dir); |
| |
|   /* |
|    * Verify the index |
|    */ |
|   Searcher searcher = new IndexSearcher(dir, true); |
|   searcher.setSimilarity(new SimpleSimilarity()); |
| |
|   TermQuery q1 = new TermQuery(new Term("noTf", term)); |
|   TermQuery q2 = new TermQuery(new Term("tf", term)); |
|   TermQuery q3 = new TermQuery(new Term("noTf", "notf")); |
|   TermQuery q4 = new TermQuery(new Term("tf", "tf")); |
| |
|   // The per-hit checks only run inside collect(), so each search is followed by a |
|   // hit-count assertion; otherwise a query matching nothing would pass silently. |
|   // Constructing a collector resets the static counter, so getCount() read right |
|   // after a search reflects that search only. |
| |
|   // "term" in the field that omits tf: every document matches with score 1. |
|   searcher.search(q1, |
|       new ScoreCheckingCollector() { |
|         @Override |
|         void checkHit(int doc, float score) { |
|           assertEquals("q1 score for doc " + doc, 1.0f, score, 0.0f); |
|         } |
|       }); |
|   assertEquals("q1 hit count", numDocs, CountingHitCollector.getCount()); |
| |
|   // "term" in the field that keeps tf: score is the frequency, i.e. doc + 1. |
|   searcher.search(q2, |
|       new ScoreCheckingCollector() { |
|         @Override |
|         void checkHit(int doc, float score) { |
|           assertEquals("q2 score for doc " + doc, 1.0f + doc, score, 0.0f); |
|         } |
|       }); |
|   assertEquals("q2 hit count", numDocs, CountingHitCollector.getCount()); |
| |
|   // "notf" was only added to odd documents. |
|   searcher.search(q3, |
|       new ScoreCheckingCollector() { |
|         @Override |
|         void checkHit(int doc, float score) { |
|           assertEquals("q3 score for doc " + doc, 1.0f, score, 0.0f); |
|           assertFalse("q3 must only match odd docs, got " + doc, doc%2==0); |
|         } |
|       }); |
|   assertEquals("q3 hit count", numDocs / 2, CountingHitCollector.getCount()); |
| |
|   // "tf" was only added to even documents. |
|   searcher.search(q4, |
|       new ScoreCheckingCollector() { |
|         @Override |
|         void checkHit(int doc, float score) { |
|           assertEquals("q4 score for doc " + doc, 1.0f, score, 0.0f); |
|           assertTrue("q4 must only match even docs, got " + doc, doc%2==0); |
|         } |
|       }); |
|   assertEquals("q4 hit count", numDocs / 2, CountingHitCollector.getCount()); |
| |
|   // Conjunction of an omit-tf clause and a regular clause: only the even documents qualify. |
|   BooleanQuery bq = new BooleanQuery(); |
|   bq.add(q1,Occur.MUST); |
|   bq.add(q4,Occur.MUST); |
| |
|   searcher.search(bq, new CountingHitCollector()); |
|   assertEquals("boolean query hit count", 15, CountingHitCollector.getCount()); |
| |
|   searcher.close(); |
|   dir.close(); |
| } |
| |
| /** |
|  * Counting collector that remembers the current scorer and hands every hit's |
|  * document id and score to {@link #checkHit(int, float)} before counting it. |
|  */ |
| private abstract static class ScoreCheckingCollector extends CountingHitCollector { |
|   private Scorer scorer; |
| |
|   @Override |
|   public final void setScorer(Scorer scorer) { |
|     this.scorer = scorer; |
|   } |
| |
|   @Override |
|   public final void collect(int doc) throws IOException { |
|     checkHit(doc, scorer.score()); |
|     super.collect(doc); |
|   } |
| |
|   /** Asserts whatever the calling query expects of this hit. */ |
|   abstract void checkHit(int doc, float score); |
| } |
| |
| /** |
|  * Collector that counts hits and sums their (docBase-adjusted) document ids. |
|  * The counters are static, i.e. shared by all instances, and every |
|  * construction resets them to zero. |
|  */ |
| public static class CountingHitCollector extends Collector { |
|   static int count = 0; |
|   static int sum = 0; |
|   private int docBase = -1; |
| |
|   CountingHitCollector() { |
|     // Start a fresh tally for the search this instance is created for. |
|     count = 0; |
|     sum = 0; |
|   } |
| |
|   @Override |
|   public void setScorer(Scorer scorer) throws IOException { |
|     // Scores are not needed for counting. |
|   } |
| |
|   @Override |
|   public void setNextReader(IndexReader reader, int docBase) { |
|     this.docBase = docBase; |
|   } |
| |
|   @Override |
|   public void collect(int doc) throws IOException { |
|     count += 1; |
|     // use it to avoid any possibility of being optimized away |
|     sum += docBase + doc; |
|   } |
| |
|   @Override |
|   public boolean acceptsDocsOutOfOrder() { |
|     return true; |
|   } |
| |
|   public static int getCount() { |
|     return count; |
|   } |
| |
|   public static int getSum() { |
|     return sum; |
|   } |
| } |
| } |