// blob: dc4647776b33036b3796dcddfb8d71e8e42cbd12 (repository-viewer metadata, not part of the source)
/*
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*/
using System;
using System.Collections.Generic;
using Lucene.Net.Analysis;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Queries;
using Lucene.Net.Search;
using Lucene.Net.Store;
using Lucene.Net.Util;
using NUnit.Framework;
using JCG = J2N.Collections.Generic;
namespace Lucene.Net.Tests.Queries
{
public class CommonTermsQueryTest : LuceneTestCase
{
[Test]
public void TestBasics()
{
    // Indexes four small documents and checks hit counts and ranking of
    // CommonTermsQuery for SHOULD/SHOULD and SHOULD/MUST occur combinations.
    Directory dir = NewDirectory();
    MockAnalyzer analyzer = new MockAnalyzer(Random);
    RandomIndexWriter w = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        this,
#endif
        Random, dir, analyzer);
    var docs = new string[]
    {
        "this is the end of the world right", "is this it or maybe not",
        "this is the end of the universe as we know it",
        "there is the famous restaurant at the end of the universe"
    };
    for (int i = 0; i < docs.Length; i++)
    {
        Document doc = new Document();
        doc.Add(NewStringField("id", "" + i, Field.Store.YES));
        doc.Add(NewTextField("field", docs[i], Field.Store.NO));
        w.AddDocument(doc);
    }
    IndexReader r = w.GetReader();
    IndexSearcher s = NewSearcher(r);
    // NOTE: assertEquals takes (expected, actual); the original calls below
    // had the TotalHits assertions swapped, which garbles failure messages.
    {
        CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.SHOULD, Random.NextBoolean() ? 2F : 0.5F);
        query.Add(new Term("field", "is"));
        query.Add(new Term("field", "this"));
        query.Add(new Term("field", "end"));
        query.Add(new Term("field", "world"));
        query.Add(new Term("field", "universe"));
        query.Add(new Term("field", "right"));
        TopDocs search = s.Search(query, 10);
        assertEquals(3, search.TotalHits);
        assertEquals("0", r.Document(search.ScoreDocs[0].Doc).Get("id"));
        assertEquals("2", r.Document(search.ScoreDocs[1].Doc).Get("id"));
        assertEquals("3", r.Document(search.ScoreDocs[2].Doc).Get("id"));
    }
    {
        CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.SHOULD, Random.NextBoolean() ? 2F : 0.5F);
        query.Add(new Term("field", "is"));
        query.Add(new Term("field", "this"));
        query.Add(new Term("field", "end"));
        TopDocs search = s.Search(query, 10);
        assertEquals(2, search.TotalHits);
        assertEquals("0", r.Document(search.ScoreDocs[0].Doc).Get("id"));
        assertEquals("2", r.Document(search.ScoreDocs[1].Doc).Get("id"));
    }
    {
        // Low-frequency terms are required (MUST) here, so only doc 0 matches.
        CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.MUST, Random.NextBoolean() ? 2F : 0.5F);
        query.Add(new Term("field", "is"));
        query.Add(new Term("field", "this"));
        query.Add(new Term("field", "end"));
        query.Add(new Term("field", "world"));
        TopDocs search = s.Search(query, 10);
        assertEquals(1, search.TotalHits);
        assertEquals("0", r.Document(search.ScoreDocs[0].Doc).Get("id"));
    }
    {
        CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.MUST, Random.NextBoolean() ? 2F : 0.5F);
        query.Add(new Term("field", "restaurant"));
        query.Add(new Term("field", "universe"));
        TopDocs search = s.Search(query, 10);
        assertEquals(1, search.TotalHits);
        assertEquals("3", r.Document(search.ScoreDocs[0].Doc).Get("id"));
    }
    r.Dispose();
    w.Dispose();
    dir.Dispose();
}
[Test]
// Equals/GetHashCode contract of CommonTermsQuery: a randomly-built query must
// be self-consistent (CheckHashEquals), differ from another randomly-built
// query (CheckUnequal), and two queries constructed by replaying the same
// seeded random sequence must compare equal (CheckEqual).
public void TestEqualsHashCode()
{
CommonTermsQuery query = new CommonTermsQuery(RandomOccur(Random), RandomOccur(Random), Random.NextSingle(), Random.NextBoolean());
int terms = AtLeast(2);
for (int i = 0; i < terms; i++)
{
query.Add(new Term(TestUtil.RandomRealisticUnicodeString(Random), TestUtil.RandomRealisticUnicodeString(Random)));
}
QueryUtils.CheckHashEquals(query);
QueryUtils.CheckUnequal(new CommonTermsQuery(RandomOccur(Random), RandomOccur(Random), Random.NextSingle(), Random.NextBoolean()), query);
{
// Build "left" from a Random seeded with a captured seed; every random
// decision (occurs, frequency, term count, terms, MSM values) comes
// from that single sequence.
long seed = Random.NextInt64();
Random r = new Random((int)seed);
CommonTermsQuery left = new CommonTermsQuery(RandomOccur(r), RandomOccur(r), r.NextSingle(), r.NextBoolean());
int leftTerms = AtLeast(r, 2);
for (int i = 0; i < leftTerms; i++)
{
left.Add(new Term(TestUtil.RandomRealisticUnicodeString(r), TestUtil.RandomRealisticUnicodeString(r)));
}
left.HighFreqMinimumNumberShouldMatch = r.nextInt(4);
left.LowFreqMinimumNumberShouldMatch = r.nextInt(4);
// Re-seed and replay the identical construction sequence for "right",
// so left and right must be equal.
r = new Random((int)seed);
CommonTermsQuery right = new CommonTermsQuery(RandomOccur(r), RandomOccur(r), r.NextSingle(), r.NextBoolean());
int rightTerms = AtLeast(r, 2);
for (int i = 0; i < rightTerms; i++)
{
right.Add(new Term(TestUtil.RandomRealisticUnicodeString(r), TestUtil.RandomRealisticUnicodeString(r)));
}
right.HighFreqMinimumNumberShouldMatch = r.nextInt(4);
right.LowFreqMinimumNumberShouldMatch = r.nextInt(4);
QueryUtils.CheckEqual(left, right);
}
}
// Picks one of the two occur values that CommonTermsQuery accepts
// (MUST_NOT is rejected by its constructor — see TestIllegalOccur).
private static Occur RandomOccur(Random random)
{
    if (random.NextBoolean())
    {
        return Occur.MUST;
    }
    return Occur.SHOULD;
}
[Test]
public void TestNullTerm()
{
    // Adding a null term must be rejected with an ArgumentException.
    Random random = Random;
    CommonTermsQuery query = new CommonTermsQuery(RandomOccur(random), RandomOccur(random), Random.NextSingle());
    // Idiomatic NUnit exception assertion replaces the original
    // try / Assert.Fail / empty-catch pattern (and its pragma suppressions).
    Assert.Throws<ArgumentException>(() => query.Add(null), "null values are not supported");
}
[Test]
public void TestMinShouldMatch()
{
    // Exercises LowFreqMinimumNumberShouldMatch / HighFreqMinimumNumberShouldMatch
    // of CommonTermsQuery against a fixed four-document index.
    Directory dir = NewDirectory();
    MockAnalyzer analyzer = new MockAnalyzer(Random);
    RandomIndexWriter w = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        this,
#endif
        Random, dir, analyzer);
    string[] docs = new string[]
    {
        "this is the end of the world right", "is this it or maybe not",
        "this is the end of the universe as we know it",
        "there is the famous restaurant at the end of the universe"
    };
    for (int i = 0; i < docs.Length; i++)
    {
        Document doc = new Document();
        doc.Add(NewStringField("id", "" + i, Field.Store.YES));
        doc.Add(NewTextField("field", docs[i], Field.Store.NO));
        w.AddDocument(doc);
    }
    IndexReader r = w.GetReader();
    IndexSearcher s = NewSearcher(r);
    // NOTE: assertEquals takes (expected, actual); the original TotalHits
    // assertions below had the arguments swapped, which garbles failure output.
    {
        // MSM 0.5 on the low-frequency terms: only doc 0 survives.
        CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.SHOULD, Random.NextBoolean() ? 2F : 0.5F);
        query.Add(new Term("field", "is"));
        query.Add(new Term("field", "this"));
        query.Add(new Term("field", "end"));
        query.Add(new Term("field", "world"));
        query.Add(new Term("field", "universe"));
        query.Add(new Term("field", "right"));
        query.LowFreqMinimumNumberShouldMatch = 0.5F;
        TopDocs search = s.Search(query, 10);
        assertEquals(1, search.TotalHits);
        assertEquals("0", r.Document(search.ScoreDocs[0].Doc).Get("id"));
    }
    {
        // MSM 2 (absolute count) gives the same single hit.
        CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.SHOULD, Random.NextBoolean() ? 2F : 0.5F);
        query.Add(new Term("field", "is"));
        query.Add(new Term("field", "this"));
        query.Add(new Term("field", "end"));
        query.Add(new Term("field", "world"));
        query.Add(new Term("field", "universe"));
        query.Add(new Term("field", "right"));
        query.LowFreqMinimumNumberShouldMatch = 2F;
        TopDocs search = s.Search(query, 10);
        assertEquals(1, search.TotalHits);
        assertEquals("0", r.Document(search.ScoreDocs[0].Doc).Get("id"));
    }
    {
        // Dropping the ratio just below 0.5 admits docs 2 and 3 as well.
        CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.SHOULD, Random.NextBoolean() ? 2F : 0.5F);
        query.Add(new Term("field", "is"));
        query.Add(new Term("field", "this"));
        query.Add(new Term("field", "end"));
        query.Add(new Term("field", "world"));
        query.Add(new Term("field", "universe"));
        query.Add(new Term("field", "right"));
        query.LowFreqMinimumNumberShouldMatch = 0.49F;
        TopDocs search = s.Search(query, 10);
        assertEquals(3, search.TotalHits);
        assertEquals("0", r.Document(search.ScoreDocs[0].Doc).Get("id"));
        assertEquals("2", r.Document(search.ScoreDocs[1].Doc).Get("id"));
        assertEquals("3", r.Document(search.ScoreDocs[2].Doc).Get("id"));
    }
    {
        CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.SHOULD, Random.NextBoolean() ? 2F : 0.5F);
        query.Add(new Term("field", "is"));
        query.Add(new Term("field", "this"));
        query.Add(new Term("field", "end"));
        query.Add(new Term("field", "world"));
        query.Add(new Term("field", "universe"));
        query.Add(new Term("field", "right"));
        query.LowFreqMinimumNumberShouldMatch = 1F;
        TopDocs search = s.Search(query, 10);
        assertEquals(3, search.TotalHits);
        assertEquals("0", r.Document(search.ScoreDocs[0].Doc).Get("id"));
        assertEquals("2", r.Document(search.ScoreDocs[1].Doc).Get("id"));
        assertEquals("3", r.Document(search.ScoreDocs[2].Doc).Get("id"));
        // Doc 2 matches more low-frequency terms than doc 3, so it scores higher.
        assertTrue(search.ScoreDocs[1].Score > search.ScoreDocs[2].Score);
    }
    {
        // Requiring 4 of the high-frequency terms equalizes the scores of docs 2 and 3.
        CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.SHOULD, Random.NextBoolean() ? 2F : 0.5F);
        query.Add(new Term("field", "is"));
        query.Add(new Term("field", "this"));
        query.Add(new Term("field", "end"));
        query.Add(new Term("field", "world"));
        query.Add(new Term("field", "universe"));
        query.Add(new Term("field", "right"));
        query.LowFreqMinimumNumberShouldMatch = 1F;
        query.HighFreqMinimumNumberShouldMatch = 4F;
        TopDocs search = s.Search(query, 10);
        assertEquals(3, search.TotalHits);
        assertEquals(search.ScoreDocs[1].Score, search.ScoreDocs[2].Score, 0F);
        assertEquals("0", r.Document(search.ScoreDocs[0].Doc).Get("id"));
        // Doc 2 and doc 3 tie, so compare them as an unordered set.
        assertEquals(new JCG.HashSet<string> { "2", "3" }, new JCG.HashSet<string> { r.Document(search.ScoreDocs[1].Doc).Get("id"), r.Document(search.ScoreDocs[2].Doc).Get("id") });
    }
    {
        // All-high-frequency query: every document matches.
        CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.SHOULD, Random.NextBoolean() ? 2F : 0.5F);
        query.Add(new Term("field", "is"));
        query.Add(new Term("field", "this"));
        query.Add(new Term("field", "the"));
        query.LowFreqMinimumNumberShouldMatch = 1F;
        query.HighFreqMinimumNumberShouldMatch = 2F;
        TopDocs search = s.Search(query, 10);
        assertEquals(4, search.TotalHits);
    }
    {
        // MUST on the high-frequency group narrows the result to docs 0 and 2.
        CommonTermsQuery query = new CommonTermsQuery(Occur.MUST, Occur.SHOULD, Random.NextBoolean() ? 2F : 0.5F);
        query.Add(new Term("field", "is"));
        query.Add(new Term("field", "this"));
        query.Add(new Term("field", "the"));
        query.LowFreqMinimumNumberShouldMatch = 1F;
        query.HighFreqMinimumNumberShouldMatch = 2F;
        TopDocs search = s.Search(query, 10);
        assertEquals(2, search.TotalHits);
        assertEquals(new JCG.HashSet<string> { "0", "2" }, new JCG.HashSet<string> { r.Document(search.ScoreDocs[0].Doc).Get("id"), r.Document(search.ScoreDocs[1].Doc).Get("id") });
    }
    r.Dispose();
    w.Dispose();
    dir.Dispose();
}
[Test]
public void TestIllegalOccur()
{
    // MUST_NOT is invalid for both the high- and the low-frequency occur
    // argument of the CommonTermsQuery constructor.
    Random random = Random;
    // Idiomatic NUnit exception assertions replace the original
    // try / Assert.Fail / empty-catch blocks; also fixes the "supproted" typo.
    Assert.Throws<ArgumentException>(
        () => new CommonTermsQuery(Occur.MUST_NOT, RandomOccur(random), Random.NextSingle()),
        "MUST_NOT is not supported");
    Assert.Throws<ArgumentException>(
        () => new CommonTermsQuery(RandomOccur(random), Occur.MUST_NOT, Random.NextSingle()),
        "MUST_NOT is not supported");
}
[Test]
public void TestExtend()
{
    // Same index and query terms as TestBasics; the second block swaps in
    // ExtendedCommonTermsQuery, whose NewTermQuery override boosts "universe"
    // and therefore reorders the three hits.
    Directory dir = NewDirectory();
    MockAnalyzer analyzer = new MockAnalyzer(Random);
    RandomIndexWriter w = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        this,
#endif
        Random, dir, analyzer);
    var docs = new string[]
    {
        "this is the end of the world right", "is this it or maybe not",
        "this is the end of the universe as we know it",
        "there is the famous restaurant at the end of the universe"
    };
    for (int i = 0; i < docs.Length; i++)
    {
        Document doc = new Document();
        doc.Add(NewStringField("id", "" + i, Field.Store.YES));
        doc.Add(NewTextField("field", docs[i], Field.Store.NO));
        w.AddDocument(doc);
    }
    IndexReader r = w.GetReader();
    IndexSearcher s = NewSearcher(r);
    // NOTE: assertEquals takes (expected, actual); the original TotalHits
    // assertions had the arguments swapped.
    {
        // Baseline ordering with the stock CommonTermsQuery.
        CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.SHOULD, Random.NextBoolean() ? 2F : 0.5F);
        query.Add(new Term("field", "is"));
        query.Add(new Term("field", "this"));
        query.Add(new Term("field", "end"));
        query.Add(new Term("field", "world"));
        query.Add(new Term("field", "universe"));
        query.Add(new Term("field", "right"));
        TopDocs search = s.Search(query, 10);
        assertEquals(3, search.TotalHits);
        assertEquals("0", r.Document(search.ScoreDocs[0].Doc).Get("id"));
        assertEquals("2", r.Document(search.ScoreDocs[1].Doc).Get("id"));
        assertEquals("3", r.Document(search.ScoreDocs[2].Doc).Get("id"));
    }
    {
        // Boosting "universe" promotes docs 2 and 3 above doc 0.
        CommonTermsQuery query = new ExtendedCommonTermsQuery(Occur.SHOULD, Occur.SHOULD, Random.NextBoolean() ? 2F : 0.5F);
        query.Add(new Term("field", "is"));
        query.Add(new Term("field", "this"));
        query.Add(new Term("field", "end"));
        query.Add(new Term("field", "world"));
        query.Add(new Term("field", "universe"));
        query.Add(new Term("field", "right"));
        TopDocs search = s.Search(query, 10);
        assertEquals(3, search.TotalHits);
        assertEquals("2", r.Document(search.ScoreDocs[0].Doc).Get("id"));
        assertEquals("3", r.Document(search.ScoreDocs[1].Doc).Get("id"));
        assertEquals("0", r.Document(search.ScoreDocs[2].Doc).Get("id"));
    }
    r.Dispose();
    w.Dispose();
    dir.Dispose();
}
[Test]
// Builds a random index from LineFileDocs, collects the 5 lowest- and 5
// highest-docFreq terms of the "body" field, and verifies that a
// CommonTermsQuery over those terms matches exactly the same documents as an
// equivalent BooleanQuery of plain TermQuery clauses.
public void TestRandomIndex()
{
Directory dir = NewDirectory();
MockAnalyzer analyzer = new MockAnalyzer(Random);
analyzer.MaxTokenLength = TestUtil.NextInt32(Random, 1, IndexWriter.MAX_TERM_LENGTH);
RandomIndexWriter w = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
this,
#endif
Random, dir, analyzer);
CreateRandomIndex(AtLeast(50), w, Random.NextInt64());
DirectoryReader reader = w.GetReader();
AtomicReader wrapper = SlowCompositeReaderWrapper.Wrap(reader);
string field = @"body";
Terms terms = wrapper.GetTerms(field);
// lowFreqQueue retains the lowest-frequency terms, highFreqQueue the
// highest (the two LessThan overrides order in opposite directions).
var lowFreqQueue = new AnonymousPriorityQueue(this, 5);
Util.PriorityQueue<TermAndFreq> highFreqQueue = new AnonymousPriorityQueue1(this, 5);
try
{
TermsEnum iterator = terms.GetIterator(null);
while (iterator.Next() != null)
{
if (highFreqQueue.Count < 5)
{
// Fill both queues until they hold 5 entries each.
highFreqQueue.Add(new TermAndFreq(BytesRef.DeepCopyOf(iterator.Term), iterator.DocFreq));
lowFreqQueue.Add(new TermAndFreq(BytesRef.DeepCopyOf(iterator.Term), iterator.DocFreq));
}
else
{
// Queues are full: overwrite the top entry in place when the
// current term is a better candidate, then re-heapify.
if (highFreqQueue.Top.freq < iterator.DocFreq)
{
highFreqQueue.Top.freq = iterator.DocFreq;
highFreqQueue.Top.term = BytesRef.DeepCopyOf(iterator.Term);
highFreqQueue.UpdateTop();
}
if (lowFreqQueue.Top.freq > iterator.DocFreq)
{
lowFreqQueue.Top.freq = iterator.DocFreq;
lowFreqQueue.Top.term = BytesRef.DeepCopyOf(iterator.Term);
lowFreqQueue.UpdateTop();
}
}
}
int lowFreq = lowFreqQueue.Top.freq;
int highFreq = highFreqQueue.Top.freq;
// Skip the test when there is no usable gap between the observed low
// and high document frequencies.
AssumeTrue(@"unlucky index", highFreq - 1 > lowFreq);
List<TermAndFreq> highTerms = QueueToList(highFreqQueue);
List<TermAndFreq> lowTerms = QueueToList(lowFreqQueue);
IndexSearcher searcher = NewSearcher(reader);
Occur lowFreqOccur = RandomOccur(Random);
BooleanQuery verifyQuery = new BooleanQuery();
// Frequency cutoff highFreq - 1 sits between the collected low- and
// high-frequency groups (guaranteed by the AssumeTrue above).
CommonTermsQuery cq = new CommonTermsQuery(RandomOccur(Random), lowFreqOccur, highFreq - 1, Random.NextBoolean());
foreach (TermAndFreq termAndFreq in lowTerms)
{
cq.Add(new Term(field, termAndFreq.term));
verifyQuery.Add(new BooleanClause(new TermQuery(new Term(field, termAndFreq.term)), lowFreqOccur));
}
foreach (TermAndFreq termAndFreq in highTerms)
{
cq.Add(new Term(field, termAndFreq.term));
}
TopDocs cqSearch = searcher.Search(cq, reader.MaxDoc);
TopDocs verifySearch = searcher.Search(verifyQuery, reader.MaxDoc);
assertEquals(verifySearch.TotalHits, cqSearch.TotalHits);
// Both queries must hit exactly the same set of documents.
var hits = new JCG.HashSet<int>();
foreach (ScoreDoc doc in verifySearch.ScoreDocs)
{
hits.Add(doc.Doc);
}
foreach (ScoreDoc doc in cqSearch.ScoreDocs)
{
assertTrue(hits.Remove(doc.Doc));
}
assertTrue(hits.Count == 0);
// Re-check the query against a force-merged (single segment) reader.
w.ForceMerge(1);
DirectoryReader reader2 = w.GetReader();
QueryUtils.Check(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
this,
#endif
Random, cq, NewSearcher(reader2));
reader2.Dispose();
}
finally
{
reader.Dispose();
wrapper.Dispose();
w.Dispose();
dir.Dispose();
}
}
// Priority queue with inverted ordering (larger freq counts as "less"), used
// by TestRandomIndex as the low-frequency queue: when full, the entry with
// the largest freq sits at the top and is the one replaced first.
private sealed class AnonymousPriorityQueue : Util.PriorityQueue<TermAndFreq>
{
    private readonly CommonTermsQueryTest parent;

    public AnonymousPriorityQueue(CommonTermsQueryTest parent, int maxSize)
        : base(maxSize)
    {
        this.parent = parent;
    }

    protected override bool LessThan(TermAndFreq a, TermAndFreq b)
    {
        return b.freq < a.freq;
    }
}
// Priority queue with natural freq ordering, used by TestRandomIndex as the
// high-frequency queue: when full, the entry with the smallest freq sits at
// the top and is the one replaced first.
private sealed class AnonymousPriorityQueue1 : Util.PriorityQueue<TermAndFreq>
{
    private readonly CommonTermsQueryTest parent;

    public AnonymousPriorityQueue1(CommonTermsQueryTest parent, int maxSize)
        : base(maxSize)
    {
        this.parent = parent;
    }

    protected override bool LessThan(TermAndFreq a, TermAndFreq b)
    {
        return b.freq > a.freq;
    }
}
// Drains the given queue into a list (pop order); empties the queue as a
// side effect.
private static List<TermAndFreq> QueueToList(Util.PriorityQueue<TermAndFreq> queue)
{
    var result = new List<TermAndFreq>();
    while (queue.Count > 0)
    {
        result.Add(queue.Pop());
    }
    return result;
}
// Mutable (term, docFreq) pair used by TestRandomIndex's priority queues.
private class TermAndFreq : IComparable<TermAndFreq>
{
    public BytesRef term; // term bytes
    public int freq;      // document frequency of the term

    public TermAndFreq(BytesRef term, int freq)
    {
        this.term = term;
        this.freq = freq;
    }

    /// <summary>
    /// Orders by term, breaking ties by frequency. The previous
    /// implementation summed the two comparison results, which is not a
    /// valid total order: opposite-signed results could cancel to 0 for
    /// unequal instances.
    /// </summary>
    public int CompareTo(TermAndFreq other)
    {
        int cmp = term.CompareTo(other.term);
        return cmp != 0 ? cmp : freq.CompareTo(other.freq);
    }
}
/// <summary>
/// Adds <paramref name="numdocs"/> realistic documents to
/// <paramref name="writer"/>, drawn from <see cref="LineFileDocs"/> seeded
/// with <paramref name="seed"/> for reproducibility.
/// </summary>
public static void CreateRandomIndex(int numdocs, RandomIndexWriter writer, long seed)
{
    Random random = new Random((int)seed);
    // primary source for our data is from linefiledocs, its realistic.
    // using-statement fixes a leak: the original never disposed LineFileDocs
    // if AddDocument threw.
    using (LineFileDocs lineFileDocs = new LineFileDocs(random, false)) // no docvalues in 4x
    {
        for (int i = 0; i < numdocs; i++)
        {
            writer.AddDocument(lineFileDocs.NextDoc());
        }
    }
}
// CommonTermsQuery subclass used by TestExtend: boosts the per-term query for
// "universe" to show that NewTermQuery can be customized by subclasses.
private sealed class ExtendedCommonTermsQuery : CommonTermsQuery
{
    public ExtendedCommonTermsQuery(Occur highFreqOccur, Occur lowFreqOccur, float maxTermFrequency)
        : base(highFreqOccur, lowFreqOccur, maxTermFrequency)
    {
    }

    protected override Query NewTermQuery(Term term, TermContext context)
    {
        Query result = base.NewTermQuery(term, context);
        if (string.Equals(term.Text(), @"universe", StringComparison.Ordinal))
        {
            result.Boost = 100F;
        }
        return result;
    }
}
}
}