// blob: 09f6c4cfbf0980ffb3c8c29f7b0749e5d3f007f9 [file] [log] [blame]
using System.Linq;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Support;
using NUnit.Framework;
using System.Collections.Generic;
using System.Diagnostics;
using JCG = J2N.Collections.Generic;
using J2N.Collections.Generic.Extensions;
namespace Lucene.Net.Search
{
using AtomicReader = Lucene.Net.Index.AtomicReader;
using BooleanWeight = Lucene.Net.Search.BooleanQuery.BooleanWeight;
using BytesRef = Lucene.Net.Util.BytesRef;
using DefaultSimilarity = Lucene.Net.Search.Similarities.DefaultSimilarity;
using Directory = Lucene.Net.Store.Directory;
using DirectoryReader = Lucene.Net.Index.DirectoryReader;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using Document = Documents.Document;
using Field = Field;
using LuceneTestCase = Lucene.Net.Util.LuceneTestCase;
using RandomIndexWriter = Lucene.Net.Index.RandomIndexWriter;
using SimScorer = Lucene.Net.Search.Similarities.Similarity.SimScorer;
using SimWeight = Lucene.Net.Search.Similarities.Similarity.SimWeight;
using SortedSetDocValues = Lucene.Net.Index.SortedSetDocValues;
using SortedSetDocValuesField = SortedSetDocValuesField;
using StringField = StringField;
using Term = Lucene.Net.Index.Term;
using TermContext = Lucene.Net.Index.TermContext;
using TestUtil = Lucene.Net.Util.TestUtil;
/// <summary>
/// tests BooleanScorer2's minShouldMatch </summary>
[SuppressCodecs("Appending", "Lucene3x", "Lucene40", "Lucene41")]
[TestFixture]
public class TestMinShouldMatch2 : LuceneTestCase
{
internal static Directory Dir;
internal static DirectoryReader r;
internal static AtomicReader atomicReader;
internal static IndexSearcher Searcher;
internal static readonly string[] AlwaysTerms = new string[] { "a" };
internal static readonly string[] CommonTerms = new string[] { "b", "c", "d" };
internal static readonly string[] MediumTerms = new string[] { "e", "f", "g" };
internal static readonly string[] RareTerms = new string[] { "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z" };
/// <summary>
/// LUCENENET specific
/// Is non-static because Similarity and TimeZone are not static.
/// </summary>
[OneTimeSetUp]
public override void BeforeClass()
{
base.BeforeClass();
Dir = NewDirectory();
RandomIndexWriter iw = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
this,
#endif
Random, Dir);
int numDocs = AtLeast(300);
for (int i = 0; i < numDocs; i++)
{
Document doc = new Document();
AddSome(doc, AlwaysTerms);
if (Random.Next(100) < 90)
{
AddSome(doc, CommonTerms);
}
if (Random.Next(100) < 50)
{
AddSome(doc, MediumTerms);
}
if (Random.Next(100) < 10)
{
AddSome(doc, RareTerms);
}
iw.AddDocument(doc);
}
iw.ForceMerge(1);
iw.Dispose();
r = DirectoryReader.Open(Dir);
atomicReader = GetOnlySegmentReader(r);
Searcher = new IndexSearcher(atomicReader);
Searcher.Similarity = new DefaultSimilarityAnonymousInnerClassHelper();
}
private class DefaultSimilarityAnonymousInnerClassHelper : DefaultSimilarity
{
public DefaultSimilarityAnonymousInnerClassHelper()
{
}
public override float QueryNorm(float sumOfSquaredWeights)
{
return 1; // we disable queryNorm, both for debugging and ease of impl
}
}
[OneTimeTearDown]
public override void AfterClass()
{
atomicReader.Dispose();
Dir.Dispose();
Searcher = null;
atomicReader = null;
r = null;
Dir = null;
base.AfterClass();
}
private static void AddSome(Document doc, string[] values)
{
IList<string> list = values.ToArray();
list.Shuffle();
int howMany = TestUtil.NextInt32(Random, 1, list.Count);
for (int i = 0; i < howMany; i++)
{
doc.Add(new StringField("field", list[i], Field.Store.NO));
doc.Add(new SortedSetDocValuesField("dv", new BytesRef(list[i])));
}
}
private Scorer Scorer(string[] values, int minShouldMatch, bool slow)
{
BooleanQuery bq = new BooleanQuery();
foreach (string value in values)
{
bq.Add(new TermQuery(new Term("field", value)), Occur.SHOULD);
}
bq.MinimumNumberShouldMatch = minShouldMatch;
BooleanWeight weight = (BooleanWeight)Searcher.CreateNormalizedWeight(bq);
if (slow)
{
return new SlowMinShouldMatchScorer(weight, atomicReader, Searcher);
}
else
{
return weight.GetScorer((AtomicReaderContext)atomicReader.Context, null);
}
}
private void AssertNext(Scorer expected, Scorer actual)
{
if (actual == null)
{
Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, expected.NextDoc());
return;
}
int doc;
while ((doc = expected.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS)
{
Assert.AreEqual(doc, actual.NextDoc());
Assert.AreEqual(expected.Freq, actual.Freq);
float expectedScore = expected.GetScore();
float actualScore = actual.GetScore();
Assert.AreEqual(expectedScore, actualScore, CheckHits.ExplainToleranceDelta(expectedScore, actualScore));
}
Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, actual.NextDoc());
}
private void AssertAdvance(Scorer expected, Scorer actual, int amount)
{
if (actual == null)
{
Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, expected.NextDoc());
return;
}
int prevDoc = 0;
int doc;
while ((doc = expected.Advance(prevDoc + amount)) != DocIdSetIterator.NO_MORE_DOCS)
{
Assert.AreEqual(doc, actual.Advance(prevDoc + amount));
Assert.AreEqual(expected.Freq, actual.Freq);
float expectedScore = expected.GetScore();
float actualScore = actual.GetScore();
Assert.AreEqual(expectedScore, actualScore, CheckHits.ExplainToleranceDelta(expectedScore, actualScore));
prevDoc = doc;
}
Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, actual.Advance(prevDoc + amount));
}
/// <summary>
/// simple test for next(): minShouldMatch=2 on 3 terms (one common, one medium, one rare) </summary>
[Test]
public virtual void TestNextCMR2()
{
for (int common = 0; common < CommonTerms.Length; common++)
{
for (int medium = 0; medium < MediumTerms.Length; medium++)
{
for (int rare = 0; rare < RareTerms.Length; rare++)
{
Scorer expected = Scorer(new string[] { CommonTerms[common], MediumTerms[medium], RareTerms[rare] }, 2, true);
Scorer actual = Scorer(new string[] { CommonTerms[common], MediumTerms[medium], RareTerms[rare] }, 2, false);
AssertNext(expected, actual);
}
}
}
}
/// <summary>
/// simple test for advance(): minShouldMatch=2 on 3 terms (one common, one medium, one rare) </summary>
[Test]
public virtual void TestAdvanceCMR2()
{
for (int amount = 25; amount < 200; amount += 25)
{
for (int common = 0; common < CommonTerms.Length; common++)
{
for (int medium = 0; medium < MediumTerms.Length; medium++)
{
for (int rare = 0; rare < RareTerms.Length; rare++)
{
Scorer expected = Scorer(new string[] { CommonTerms[common], MediumTerms[medium], RareTerms[rare] }, 2, true);
Scorer actual = Scorer(new string[] { CommonTerms[common], MediumTerms[medium], RareTerms[rare] }, 2, false);
AssertAdvance(expected, actual, amount);
}
}
}
}
}
/// <summary>
/// test next with giant bq of all terms with varying minShouldMatch </summary>
[Test]
public virtual void TestNextAllTerms()
{
IList<string> termsList = new List<string>(CommonTerms.Length + MediumTerms.Length + RareTerms.Length);
termsList.AddRange(CommonTerms);
termsList.AddRange(MediumTerms);
termsList.AddRange(RareTerms);
string[] terms = termsList.ToArray();
for (int minNrShouldMatch = 1; minNrShouldMatch <= terms.Length; minNrShouldMatch++)
{
Scorer expected = Scorer(terms, minNrShouldMatch, true);
Scorer actual = Scorer(terms, minNrShouldMatch, false);
AssertNext(expected, actual);
}
}
/// <summary>
/// test advance with giant bq of all terms with varying minShouldMatch </summary>
[Test]
public virtual void TestAdvanceAllTerms()
{
IList<string> termsList = new List<string>(CommonTerms.Length + MediumTerms.Length + RareTerms.Length);
termsList.AddRange(CommonTerms);
termsList.AddRange(MediumTerms);
termsList.AddRange(RareTerms);
string[] terms = termsList.ToArray();
for (int amount = 25; amount < 200; amount += 25)
{
for (int minNrShouldMatch = 1; minNrShouldMatch <= terms.Length; minNrShouldMatch++)
{
Scorer expected = Scorer(terms, minNrShouldMatch, true);
Scorer actual = Scorer(terms, minNrShouldMatch, false);
AssertAdvance(expected, actual, amount);
}
}
}
/// <summary>
/// test next with varying numbers of terms with varying minShouldMatch </summary>
[Test]
public virtual void TestNextVaryingNumberOfTerms()
{
IList<string> termsList = new List<string>(CommonTerms.Length + MediumTerms.Length + RareTerms.Length);
termsList.AddRange(CommonTerms);
termsList.AddRange(MediumTerms);
termsList.AddRange(RareTerms);
termsList.Shuffle();
for (int numTerms = 2; numTerms <= termsList.Count; numTerms++)
{
string[] terms = termsList.SubList(0, numTerms).ToArray(/*new string[0]*/);
for (int minNrShouldMatch = 1; minNrShouldMatch <= terms.Length; minNrShouldMatch++)
{
Scorer expected = Scorer(terms, minNrShouldMatch, true);
Scorer actual = Scorer(terms, minNrShouldMatch, false);
AssertNext(expected, actual);
}
}
}
/// <summary>
/// test advance with varying numbers of terms with varying minShouldMatch </summary>
[Test]
public virtual void TestAdvanceVaryingNumberOfTerms()
{
IList<string> termsList = new List<string>(CommonTerms.Length + MediumTerms.Length + RareTerms.Length);
termsList.AddRange(CommonTerms);
termsList.AddRange(MediumTerms);
termsList.AddRange(RareTerms);
termsList.Shuffle();
for (int amount = 25; amount < 200; amount += 25)
{
for (int numTerms = 2; numTerms <= termsList.Count; numTerms++)
{
string[] terms = termsList.SubList(0, numTerms).ToArray(/*new string[0]*/);
for (int minNrShouldMatch = 1; minNrShouldMatch <= terms.Length; minNrShouldMatch++)
{
Scorer expected = Scorer(terms, minNrShouldMatch, true);
Scorer actual = Scorer(terms, minNrShouldMatch, false);
AssertAdvance(expected, actual, amount);
}
}
}
}
// TODO: more tests
// a slow min-should match scorer that uses a docvalues field.
// later, we can make debugging easier as it can record the set of ords it currently matched
// and e.g. print out their values and so on for the document
internal class SlowMinShouldMatchScorer : Scorer
{
internal int CurrentDoc = -1; // current docid
internal int CurrentMatched = -1; // current number of terms matched
internal readonly SortedSetDocValues Dv;
internal readonly int MaxDoc;
internal readonly ISet<long?> Ords = new JCG.HashSet<long?>();
internal readonly SimScorer[] Sims;
internal readonly int MinNrShouldMatch;
internal double Score_Renamed = float.NaN;
internal SlowMinShouldMatchScorer(BooleanWeight weight, AtomicReader reader, IndexSearcher searcher)
: base(weight)
{
this.Dv = reader.GetSortedSetDocValues("dv");
this.MaxDoc = reader.MaxDoc;
BooleanQuery bq = (BooleanQuery)weight.Query;
this.MinNrShouldMatch = bq.MinimumNumberShouldMatch;
this.Sims = new SimScorer[(int)Dv.ValueCount];
foreach (BooleanClause clause in bq.GetClauses())
{
Debug.Assert(!clause.IsProhibited);
Debug.Assert(!clause.IsRequired);
Term term = ((TermQuery)clause.Query).Term;
long ord = Dv.LookupTerm(term.Bytes);
if (ord >= 0)
{
bool success = Ords.Add(ord);
Debug.Assert(success); // no dups
TermContext context = TermContext.Build(reader.Context, term);
SimWeight w = weight.Similarity.ComputeWeight(1f, searcher.CollectionStatistics("field"), searcher.TermStatistics(term, context));
var dummy = w.GetValueForNormalization(); // ignored
w.Normalize(1F, 1F);
Sims[(int)ord] = weight.Similarity.GetSimScorer(w, (AtomicReaderContext)reader.Context);
}
}
}
public override float GetScore()
{
Debug.Assert(Score_Renamed != 0, CurrentMatched.ToString());
return (float)Score_Renamed * ((BooleanWeight)m_weight).Coord(CurrentMatched, ((BooleanWeight)m_weight).MaxCoord);
}
public override int Freq
{
get { return CurrentMatched; }
}
public override int DocID
{
get { return CurrentDoc; }
}
public override int NextDoc()
{
Debug.Assert(CurrentDoc != NO_MORE_DOCS);
for (CurrentDoc = CurrentDoc + 1; CurrentDoc < MaxDoc; CurrentDoc++)
{
CurrentMatched = 0;
Score_Renamed = 0;
Dv.SetDocument(CurrentDoc);
long ord;
while ((ord = Dv.NextOrd()) != SortedSetDocValues.NO_MORE_ORDS)
{
if (Ords.Contains(ord))
{
CurrentMatched++;
Score_Renamed += Sims[(int)ord].Score(CurrentDoc, 1);
}
}
if (CurrentMatched >= MinNrShouldMatch)
{
return CurrentDoc;
}
}
return CurrentDoc = NO_MORE_DOCS;
}
public override int Advance(int target)
{
int doc;
while ((doc = NextDoc()) < target)
{
}
return doc;
}
public override long GetCost()
{
return MaxDoc;
}
}
}
}