blob: 2fb3e0e8562ed713022acb24e21cc49ee2a77c6d [file] [log] [blame]
using Lucene.Net.Documents;
using Lucene.Net.Index.Extensions;
using NUnit.Framework;
using System;
using System.Text;
using Assert = Lucene.Net.TestFramework.Assert;
namespace Lucene.Net.Index
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using Analyzer = Lucene.Net.Analysis.Analyzer;
using BooleanQuery = Lucene.Net.Search.BooleanQuery;
using BytesRef = Lucene.Net.Util.BytesRef;
using CollectionStatistics = Lucene.Net.Search.CollectionStatistics;
using ICollector = Lucene.Net.Search.ICollector;
using Directory = Lucene.Net.Store.Directory;
using Document = Documents.Document;
using Explanation = Lucene.Net.Search.Explanation;
using Field = Field;
using FieldType = FieldType;
using IndexSearcher = Lucene.Net.Search.IndexSearcher;
using LuceneTestCase = Lucene.Net.Util.LuceneTestCase;
using MockAnalyzer = Lucene.Net.Analysis.MockAnalyzer;
using Occur = Lucene.Net.Search.Occur;
using PhraseQuery = Lucene.Net.Search.PhraseQuery;
using Scorer = Lucene.Net.Search.Scorer;
using TermQuery = Lucene.Net.Search.TermQuery;
using TermStatistics = Lucene.Net.Search.TermStatistics;
using TextField = TextField;
using TFIDFSimilarity = Lucene.Net.Search.Similarities.TFIDFSimilarity;
[TestFixture]
public class TestOmitTf : LuceneTestCase
{
/// <summary>
/// Deliberately trivial <c>TFIDFSimilarity</c>: every scoring factor is a
/// constant, except <c>Tf</c> (returns the raw frequency) and
/// <c>LengthNorm</c> (returns the boost of the field invert state).
/// </summary>
public class SimpleSimilarity : TFIDFSimilarity
{
    /// <summary>Returns the stored norm converted to a float.</summary>
    public override float DecodeNormValue(long norm) => norm;

    /// <summary>Returns the given value cast to a long.</summary>
    public override long EncodeNormValue(float f) => (long)f;

    /// <summary>Always 1, regardless of the sum of squared weights.</summary>
    public override float QueryNorm(float sumOfSquaredWeights) => 1.0f;

    /// <summary>Always 1, regardless of how many clauses overlap.</summary>
    public override float Coord(int overlap, int maxOverlap) => 1.0f;

    /// <summary>Returns the boost carried by <paramref name="state"/>.</summary>
    public override float LengthNorm(FieldInvertState state) => state.Boost;

    /// <summary>Returns the frequency unchanged.</summary>
    public override float Tf(float freq) => freq;

    /// <summary>Always 2, regardless of the distance.</summary>
    public override float SloppyFreq(int distance) => 2.0f;

    /// <summary>Always 1, regardless of the document statistics.</summary>
    public override float Idf(long docFreq, long numDocs) => 1.0f;

    /// <summary>Returns a fixed explanation with value 1.</summary>
    public override Explanation IdfExplain(CollectionStatistics collectionStats, TermStatistics[] termStats)
        => new Explanation(1.0f, "Inexplicable");

    /// <summary>Always 1; the payload is ignored.</summary>
    public override float ScorePayload(int doc, int start, int end, BytesRef payload) => 1.0f;
}
// Field type that indexes documents only (no term freqs, no positions).
private static readonly FieldType omitType;

// Unmodified copy of TextField.TYPE_NOT_STORED.
private static readonly FieldType normalType;

static TestOmitTf()
{
    omitType = new FieldType(TextField.TYPE_NOT_STORED)
    {
        IndexOptions = IndexOptions.DOCS_ONLY
    };
    normalType = new FieldType(TextField.TYPE_NOT_STORED);
}
// Indexes one document where f1 keeps term freqs and f2 omits them, then a
// second document with the two field types swapped, force-merging after each.
// Asserts that the resulting single segment reports DOCS_ONLY for both fields.
[Test]
public virtual void TestOmitTermFreqAndPositions()
{
    const string f1Text = "this field has term freqs";
    const string f2Text = "this field has NO Tf in all docs";
    const string bitShouldBeSet = "OmitTermFreqAndPositions field bit should be set.";

    Directory ram = NewDirectory();
    Analyzer analyzer = new MockAnalyzer(Random);
    var config = NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
    IndexWriter writer = new IndexWriter(ram, config);

    // First document: f1 has term freqs, f2 does not.
    Document firstDoc = new Document();
    firstDoc.Add(NewField("f1", f1Text, normalType));
    firstDoc.Add(NewField("f2", f2Text, omitType));
    writer.AddDocument(firstDoc);
    writer.ForceMerge(1);

    // Second document: the field types are reversed, so merging has to
    // reconcile the two settings for each field.
    Document secondDoc = new Document();
    secondDoc.Add(NewField("f1", f1Text, omitType));
    secondDoc.Add(NewField("f2", f2Text, normalType));
    writer.AddDocument(secondDoc);

    // Merge down to one segment, then dispose the writer to flush.
    writer.ForceMerge(1);
    writer.Dispose();

    SegmentReader reader = GetOnlySegmentReader(DirectoryReader.Open(ram));
    FieldInfos infos = reader.FieldInfos;
    Assert.AreEqual(IndexOptions.DOCS_ONLY, infos.FieldInfo("f1").IndexOptions, bitShouldBeSet);
    Assert.AreEqual(IndexOptions.DOCS_ONLY, infos.FieldInfo("f2").IndexOptions, bitShouldBeSet);

    reader.Dispose();
    ram.Dispose();
}
// Tests that merging documents which use different index options for the
// same field works: 30 documents with (f1 normal, f2 omit) followed by 30
// documents with the types swapped must merge into one segment whose
// FieldInfos report DOCS_ONLY for both fields.
[Test]
public virtual void TestMixedMerge()
{
    const int docsPerBatch = 30;
    const string f1Text = "this field has term freqs";
    const string f2Text = "this field has NO Tf in all docs";
    const string bitShouldBeSet = "OmitTermFreqAndPositions field bit should be set.";

    Directory ram = NewDirectory();
    Analyzer analyzer = new MockAnalyzer(Random);
    var config = NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer).SetMaxBufferedDocs(3).SetMergePolicy(NewLogMergePolicy(2));
    IndexWriter writer = new IndexWriter(ram, config);

    // First batch: f1 has term freqs, f2 does not.
    Document firstBatchDoc = new Document();
    firstBatchDoc.Add(NewField("f1", f1Text, normalType));
    firstBatchDoc.Add(NewField("f2", f2Text, omitType));
    for (int added = 0; added < docsPerBatch; added++)
    {
        writer.AddDocument(firstBatchDoc);
    }

    // Second batch: the field types are reversed.
    Document secondBatchDoc = new Document();
    secondBatchDoc.Add(NewField("f1", f1Text, omitType));
    secondBatchDoc.Add(NewField("f2", f2Text, normalType));
    for (int added = 0; added < docsPerBatch; added++)
    {
        writer.AddDocument(secondBatchDoc);
    }

    // Merge down to one segment, then dispose the writer to flush.
    writer.ForceMerge(1);
    writer.Dispose();

    SegmentReader reader = GetOnlySegmentReader(DirectoryReader.Open(ram));
    FieldInfos infos = reader.FieldInfos;
    Assert.AreEqual(IndexOptions.DOCS_ONLY, infos.FieldInfo("f1").IndexOptions, bitShouldBeSet);
    Assert.AreEqual(IndexOptions.DOCS_ONLY, infos.FieldInfo("f2").IndexOptions, bitShouldBeSet);

    reader.Dispose();
    ram.Dispose();
}
// Adds the same document (f1 normal, f2 omitting term freqs) in a batch of 5
// and a batch of 20 with a buffer of 10 documents, force-merges, and asserts
// that f1 keeps DOCS_AND_FREQS_AND_POSITIONS while f2 reports DOCS_ONLY.
// NOTE(review): the previous comment described adding docs that do not omit
// term freqs for a field and then docs that do, but both batches add the
// identical document - confirm whether that is intended.
[Test]
public virtual void TestMixedRAM()
{
    const int firstBatchSize = 5;
    const int secondBatchSize = 20;

    Directory ram = NewDirectory();
    Analyzer analyzer = new MockAnalyzer(Random);
    var config = NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer).SetMaxBufferedDocs(10).SetMergePolicy(NewLogMergePolicy(2));
    IndexWriter writer = new IndexWriter(ram, config);

    Document doc = new Document();
    // f1 has term freqs.
    doc.Add(NewField("f1", "this field has term freqs", normalType));
    // f2 does NOT have term freqs.
    doc.Add(NewField("f2", "this field has NO Tf in all docs", omitType));

    for (int added = 0; added < firstBatchSize; added++)
    {
        writer.AddDocument(doc);
    }
    for (int added = 0; added < secondBatchSize; added++)
    {
        writer.AddDocument(doc);
    }

    // Merge down to one segment, then dispose the writer to flush.
    writer.ForceMerge(1);
    writer.Dispose();

    SegmentReader reader = GetOnlySegmentReader(DirectoryReader.Open(ram));
    FieldInfos infos = reader.FieldInfos;
    Assert.AreEqual(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS, infos.FieldInfo("f1").IndexOptions, "OmitTermFreqAndPositions field bit should not be set.");
    Assert.AreEqual(IndexOptions.DOCS_ONLY, infos.FieldInfo("f2").IndexOptions, "OmitTermFreqAndPositions field bit should be set.");

    reader.Dispose();
    ram.Dispose();
}
// Asserts that no file listed by the directory ends with ".prx" or ".pos".
private void AssertNoPrx(Directory dir)
{
    foreach (string fileName in dir.ListAll())
    {
        bool isPrx = fileName.EndsWith(".prx", StringComparison.Ordinal);
        Assert.IsFalse(isPrx);

        bool isPos = fileName.EndsWith(".pos", StringComparison.Ordinal);
        Assert.IsFalse(isPos);
    }
}
// Verifies that no *.prx / *.pos file exists after committing documents whose
// only field omits term freqs, and that none exists either after documents
// indexing the same field with positions are added and the index is
// force-merged.
[Test]
public virtual void TestNoPrxFile()
{
    const int docsPerBatch = 30;

    Directory ram = NewDirectory();
    Analyzer analyzer = new MockAnalyzer(Random);
    var config = NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer).SetMaxBufferedDocs(3).SetMergePolicy(NewLogMergePolicy());
    IndexWriter writer = new IndexWriter(ram, config);

    var mergePolicy = (LogMergePolicy)writer.Config.MergePolicy;
    mergePolicy.MergeFactor = 2;
    // presumably keeps segments out of compound files so AssertNoPrx can see
    // individual file extensions - TODO confirm
    mergePolicy.NoCFSRatio = 0.0;

    Document omitDoc = new Document();
    omitDoc.Add(NewField("f1", "this field has term freqs", omitType));
    for (int added = 0; added < docsPerBatch; added++)
    {
        writer.AddDocument(omitDoc);
    }
    writer.Commit();
    AssertNoPrx(ram);

    // Now add some documents with positions, and check there is still no
    // prox file after a full merge.
    Document positionsDoc = new Document();
    positionsDoc.Add(NewTextField("f1", "this field has positions", Field.Store.NO));
    for (int added = 0; added < docsPerBatch; added++)
    {
        writer.AddDocument(positionsDoc);
    }

    // Merge down to one segment, then dispose the writer to flush.
    writer.ForceMerge(1);
    writer.Dispose();

    AssertNoPrx(ram);
    ram.Dispose();
}
// Tests scoring with one field that has term freqs ("tf") and one that omits
// them ("noTf"), otherwise with equal content. Also checks that a phrase
// query against the field without positions is rejected.
[Test]
public virtual void TestBasic()
{
    Directory dir = NewDirectory();
    Analyzer analyzer = new MockAnalyzer(Random);
    IndexWriter writer = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer).SetMaxBufferedDocs(2).SetSimilarity(new SimpleSimilarity()).SetMergePolicy(NewLogMergePolicy(2)));
    StringBuilder sb = new StringBuilder(265);
    string term = "term";
    for (int i = 0; i < 30; i++)
    {
        Document doc = new Document();
        // Document i contains "term" repeated i + 1 times in both fields.
        sb.Append(term).Append(" ");
        string content = sb.ToString();
        // Odd documents additionally get "notf" in the noTf field...
        Field noTf = NewField("noTf", content + (i % 2 == 0 ? "" : " notf"), omitType);
        doc.Add(noTf);
        // ...and even documents additionally get "tf" in the tf field.
        Field tf = NewField("tf", content + (i % 2 == 0 ? " tf" : ""), normalType);
        doc.Add(tf);
        writer.AddDocument(doc);
    }
    writer.ForceMerge(1);
    // flush
    writer.Dispose();

    /*
    * Verify the index
    */
    IndexReader reader = DirectoryReader.Open(dir);
    IndexSearcher searcher = NewSearcher(reader);
    searcher.Similarity = new SimpleSimilarity();
    Term a = new Term("noTf", term);
    Term b = new Term("tf", term);
    Term c = new Term("noTf", "notf");
    Term d = new Term("tf", "tf");
    TermQuery q1 = new TermQuery(a);
    TermQuery q2 = new TermQuery(b);
    TermQuery q3 = new TermQuery(c);
    TermQuery q4 = new TermQuery(d);
    PhraseQuery pq = new PhraseQuery();
    pq.Add(a);
    pq.Add(c);

    // The phrase query must fail because positions are not indexed for "noTf".
    // The "nothing was thrown" check is kept OUTSIDE the try block: an
    // Assert.Fail inside it would be caught by the catch-all below and
    // re-reported with a misleading message.
    Exception searchFailure = null;
    try
    {
        searcher.Search(pq, 10);
    }
    catch (Exception e)
    {
        searchFailure = e;
    }
    if (searchFailure == null)
    {
        Assert.Fail("did not hit expected exception");
    }
    else
    {
        Exception cause = searchFailure;
        // If the searcher uses an executor service, the exception is wrapped into other exceptions
        while (cause.InnerException != null)
        {
            cause = cause.InnerException;
        }
        if (!(cause is InvalidOperationException))
        {
            Assert.Fail("Expected an InvalidOperationException but got: " + searchFailure);
        } // else OK because positions are not indexed
    }

    searcher.Search(q1, new CountingHitCollectorAnonymousClass(this));
    searcher.Search(q2, new CountingHitCollectorAnonymousClass2(this));
    searcher.Search(q3, new CountingHitCollectorAnonymousClass3(this));
    searcher.Search(q4, new CountingHitCollectorAnonymousClass4(this));

    BooleanQuery bq = new BooleanQuery();
    bq.Add(q1, Occur.MUST);
    bq.Add(q4, Occur.MUST);
    searcher.Search(bq, new CountingHitCollectorAnonymousClass5(this));
    // "tf" only occurs in the 15 even documents, all of which contain "term".
    Assert.AreEqual(15, CountingHitCollector.Count);
    reader.Dispose();
    dir.Dispose();
}
// Counting collector that requires every collected hit to score exactly 1.0.
private class CountingHitCollectorAnonymousClass : CountingHitCollector
{
    private readonly TestOmitTf outerInstance;
    private Scorer scorer;

    public CountingHitCollectorAnonymousClass(TestOmitTf outerInstance)
    {
        this.outerInstance = outerInstance;
    }

    public override sealed void SetScorer(Scorer scorer) => this.scorer = scorer;

    public override sealed void Collect(int doc)
    {
        float hitScore = scorer.GetScore();
        bool scoreIsOne = hitScore == 1.0f;
        Assert.IsTrue(scoreIsOne, "got score=" + hitScore);
        base.Collect(doc);
    }
}
// Counting collector that requires each hit's score to equal 1 + doc
// (within a tolerance of 0.00001).
private class CountingHitCollectorAnonymousClass2 : CountingHitCollector
{
    private readonly TestOmitTf outerInstance;
    private Scorer scorer;

    public CountingHitCollectorAnonymousClass2(TestOmitTf outerInstance)
    {
        this.outerInstance = outerInstance;
    }

    public override sealed void SetScorer(Scorer scorer) => this.scorer = scorer;

    public override sealed void Collect(int doc)
    {
        float hitScore = scorer.GetScore();
        float expectedScore = 1.0f + doc;
        Assert.AreEqual(expectedScore, hitScore, 0.00001f);
        base.Collect(doc);
    }
}
// Counting collector that requires every hit to score exactly 1.0 and to
// have an odd document id.
private class CountingHitCollectorAnonymousClass3 : CountingHitCollector
{
    private readonly TestOmitTf outerInstance;
    private Scorer scorer;

    public CountingHitCollectorAnonymousClass3(TestOmitTf outerInstance)
    {
        this.outerInstance = outerInstance;
    }

    public override sealed void SetScorer(Scorer scorer) => this.scorer = scorer;

    public override sealed void Collect(int doc)
    {
        float hitScore = scorer.GetScore();
        bool scoreIsOne = hitScore == 1.0f;
        Assert.IsTrue(scoreIsOne);

        bool isEvenDoc = doc % 2 == 0;
        Assert.IsFalse(isEvenDoc);

        base.Collect(doc);
    }
}
// Counting collector that requires every hit to score exactly 1.0 and to
// have an even document id.
private class CountingHitCollectorAnonymousClass4 : CountingHitCollector
{
    private readonly TestOmitTf outerInstance;
    private Scorer scorer;

    public CountingHitCollectorAnonymousClass4(TestOmitTf outerInstance)
    {
        this.outerInstance = outerInstance;
    }

    public override sealed void SetScorer(Scorer scorer) => this.scorer = scorer;

    public override sealed void Collect(int doc)
    {
        float hitScore = scorer.GetScore();
        bool scoreIsOne = hitScore == 1.0f;
        Assert.IsTrue(scoreIsOne);

        bool isEvenDoc = doc % 2 == 0;
        Assert.IsTrue(isEvenDoc);

        base.Collect(doc);
    }
}
// Counting collector with no extra checks: each hit is simply forwarded to
// the base class.
private class CountingHitCollectorAnonymousClass5 : CountingHitCollector
{
    private readonly TestOmitTf outerInstance;

    public CountingHitCollectorAnonymousClass5(TestOmitTf outerInstance)
    {
        this.outerInstance = outerInstance;
    }

    public override sealed void Collect(int doc) => base.Collect(doc);
}
/// <summary>
/// Collector that tallies hits in static counters. The counters are shared by
/// all instances and are reset to zero each time a new instance is created.
/// </summary>
public class CountingHitCollector : ICollector
{
    internal static int count = 0;
    internal static int sum = 0;
    internal int docBase = -1;

    internal CountingHitCollector()
    {
        // Creating a collector starts a fresh tally.
        sum = 0;
        count = 0;
    }

    /// <summary>Number of hits collected since the last instance was created.</summary>
    public static int Count
    {
        get { return count; }
    }

    /// <summary>Running total of (doc + docBase) over the collected hits.</summary>
    public static int Sum
    {
        get { return sum; }
    }

    /// <summary>Accepts documents in any order.</summary>
    public virtual bool AcceptsDocsOutOfOrder
    {
        get { return true; }
    }

    /// <summary>Does nothing; subclasses override this when they need scores.</summary>
    public virtual void SetScorer(Scorer scorer)
    {
    }

    /// <summary>Records one hit.</summary>
    public virtual void Collect(int doc)
    {
        sum += doc + docBase; // use it to avoid any possibility of being merged away
        count++;
    }

    /// <summary>Remembers the doc base of the reader context being collected.</summary>
    public virtual void SetNextReader(AtomicReaderContext context)
    {
        docBase = context.DocBase;
    }
}
/// <summary>
/// Tests that when freqs are omitted, both totalTermFreq and sumTotalTermFreq
/// are reported as -1.
/// </summary>
[Test]
public virtual void TestStats()
{
    Directory dir = NewDirectory();
    var config = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));
    RandomIndexWriter indexWriter = new RandomIndexWriter(Random, dir, config);

    Document doc = new Document();
    // Docs-only field type, frozen before it is used.
    FieldType docsOnlyType = new FieldType(TextField.TYPE_NOT_STORED)
    {
        IndexOptions = IndexOptions.DOCS_ONLY
    };
    docsOnlyType.Freeze();
    doc.Add(NewField("foo", "bar", docsOnlyType));
    indexWriter.AddDocument(doc);

    IndexReader indexReader = indexWriter.GetReader();
    indexWriter.Dispose();

    Term fooBar = new Term("foo", new BytesRef("bar"));
    Assert.AreEqual(-1, indexReader.TotalTermFreq(fooBar));
    Assert.AreEqual(-1, indexReader.GetSumTotalTermFreq("foo"));

    indexReader.Dispose();
    dir.Dispose();
}
}
}