// blob: 4f3f6e07475658352487f94dcddbabf5938bf7ba
// Lucene version compatibility level 4.8.1
using NUnit.Framework;
using System;
using System.Globalization;
using Assert = Lucene.Net.TestFramework.Assert;
namespace Lucene.Net.Facet
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using Directory = Lucene.Net.Store.Directory;
using DirectoryTaxonomyReader = Lucene.Net.Facet.Taxonomy.Directory.DirectoryTaxonomyReader;
using DirectoryTaxonomyWriter = Lucene.Net.Facet.Taxonomy.Directory.DirectoryTaxonomyWriter;
using Document = Lucene.Net.Documents.Document;
using FastTaxonomyFacetCounts = Lucene.Net.Facet.Taxonomy.FastTaxonomyFacetCounts;
using IndexSearcher = Lucene.Net.Search.IndexSearcher;
using IOUtils = Lucene.Net.Util.IOUtils;
using MatchingDocs = Lucene.Net.Facet.FacetsCollector.MatchingDocs;
using MultiCollector = Lucene.Net.Search.MultiCollector;
using RandomIndexWriter = Lucene.Net.Index.RandomIndexWriter;
using Store = Lucene.Net.Documents.Field.Store;
using StringField = Lucene.Net.Documents.StringField;
using Term = Lucene.Net.Index.Term;
using TermQuery = Lucene.Net.Search.TermQuery;
/// <summary>
/// Exercises <see cref="RandomSamplingFacetsCollector"/> against an index whose
/// documents carry an "EvenOdd" term and an "iMod10" facet, comparing sampled
/// facet counts with exact ones.
/// </summary>
public class TestRandomSamplingFacetsCollector : FacetTestCase
{
    /// <summary>
    /// Checks three things: a query without hits yields non-null, empty matching docs;
    /// a collector whose sample size equals the document count reproduces the exact
    /// facet counts; and a collector sampling a tenth of the document count yields
    /// amortized counts whose mean and standard deviation stay within fixed bounds.
    /// </summary>
    [Test]
    public virtual void TestRandomSampling()
    {
        Directory indexDir = NewDirectory();
        Directory taxonomyDir = NewDirectory();
        var taxonomyWriter = new DirectoryTaxonomyWriter(taxonomyDir);
        var indexWriter = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
            this,
#endif
            Random, indexDir);
        var facetsConfig = new FacetsConfig();
        int docCount = AtLeast(10000);
        AddParityDocuments(indexWriter, taxonomyWriter, facetsConfig, docCount);

        Random rng = Random;

        // Open the readers straight from the writers (NRT), then release the writers.
        IndexSearcher indexSearcher = NewSearcher(indexWriter.GetReader());
        var taxonomyReader = new DirectoryTaxonomyReader(taxonomyWriter);
        IOUtils.Dispose(indexWriter, taxonomyWriter);

        // --- Part 1: a query that matches nothing ---
        var emptyCollector = new RandomSamplingFacetsCollector(docCount / 10, rng.NextInt64());

        // Searching must not trigger a division by zero...
        indexSearcher.Search(new TermQuery(new Term("EvenOdd", "NeverMatches")), emptyCollector);

        // ...and neither must fetching the matching docs, which may not be null either.
        Assert.IsNotNull(emptyCollector.GetMatchingDocs());

        // Every returned entry has to report zero hits.
        foreach (MatchingDocs matching in emptyCollector.GetMatchingDocs())
        {
            Assert.AreEqual(0, matching.TotalHits);
        }

        // --- Part 2: a query that selects half of the documents ---
        var evenQuery = new TermQuery(new Term("EvenOdd", "even"));

        // Only even documents are hits, so only the five facet values 0, 2, 4, 6 and 8
        // can show up. Sampling could in principle miss one of them entirely; the
        // original estimate for that is 0.8 (chance not to take a value) ^ 2000 * 5
        // (any of the five can be missing) ~ 10^-193, i.e. it is not expected to happen.
        int maxNumChildren = 5;

        // Sample size == document count: no sampling takes place.
        var fullSampleCollector = new RandomSamplingFacetsCollector(docCount, rng.NextInt64());
        // Sample size == 10% of all documents, i.e. 20% of the hits.
        var tenthSampleCollector = new RandomSamplingFacetsCollector(docCount / 10, rng.NextInt64());
        var exactCollector = new FacetsCollector();

        indexSearcher.Search(evenQuery, MultiCollector.Wrap(exactCollector, fullSampleCollector, tenthSampleCollector));

        var tenthSampleCounts = new FastTaxonomyFacetCounts(taxonomyReader, facetsConfig, tenthSampleCollector);
        var fullSampleCounts = new FastTaxonomyFacetCounts(taxonomyReader, facetsConfig, fullSampleCollector);
        var exactCounts = new FastTaxonomyFacetCounts(taxonomyReader, facetsConfig, exactCollector);

        FacetResult tenthSampleResult = tenthSampleCollector.AmortizeFacetCounts(
            tenthSampleCounts.GetTopChildren(10, "iMod10"), facetsConfig, indexSearcher);
        FacetResult fullSampleResult = fullSampleCounts.GetTopChildren(10, "iMod10");
        FacetResult exactResult = exactCounts.GetTopChildren(10, "iMod10");

        Assert.AreEqual(fullSampleResult, exactResult);

        // Five children are expected; fewer are tolerated (see the estimate above)...
        Assert.IsTrue(maxNumChildren >= tenthSampleResult.ChildCount);
        // ...but there has to be at least one.
        Assert.IsTrue(1 <= tenthSampleResult.ChildCount);

        // The sampled counts vary from run to run, so judge them statistically:
        // mean and standard deviation over the (at most five) children.
        int valueSum = SumOfValues(tenthSampleResult);
        float mu = valueSum / (float)maxNumChildren;
        float sigma = StandardDeviation(tenthSampleResult, mu, maxNumChildren);

        // Half of the documents are hits, spread over 5 categories, so the expected
        // per-category count is the total divided by 5 * 2.
        float targetMu = docCount / (5.0f * 2.0f);

        // The deviation must stay small and the mean must lie within three sigma
        // of the expected value.
        Assert.IsTrue(sigma < 200);
        Assert.IsTrue(mu > targetMu - 3 * sigma && mu < targetMu + 3 * sigma);

        IOUtils.Dispose(indexSearcher.IndexReader, taxonomyReader, indexDir, taxonomyDir);
    }

    /// <summary>
    /// Adds <paramref name="count"/> documents, numbered from 0, through
    /// <paramref name="writer"/>. Document <c>i</c> gets the un-stored "EvenOdd"
    /// term "even" or "odd" and the "iMod10" facet value <c>i % 10</c>.
    /// </summary>
    private static void AddParityDocuments(RandomIndexWriter writer, DirectoryTaxonomyWriter taxoWriter, FacetsConfig config, int count)
    {
        int i = 0;
        while (i < count)
        {
            string parity = i % 2 != 0 ? "odd" : "even";
            string lastDigit = (i % 10).ToString(CultureInfo.InvariantCulture);

            var document = new Document();
            document.Add(new StringField("EvenOdd", parity, Store.NO));
            document.Add(new FacetField("iMod10", lastDigit));
            writer.AddDocument(config.Build(taxoWriter, document));
            i++;
        }
    }

    /// <summary>
    /// Sums the child values of <paramref name="result"/>, truncating each to an int.
    /// </summary>
    private static int SumOfValues(FacetResult result)
    {
        int total = 0;
        foreach (LabelAndValue child in result.LabelValues)
        {
            total += (int)child.Value;
        }
        return total;
    }

    /// <summary>
    /// Computes the square root of the summed squared distances between
    /// <paramref name="mean"/> and each (int-truncated) child value, with the sum
    /// divided by <paramref name="divisor"/> rather than by the actual child count.
    /// </summary>
    private static float StandardDeviation(FacetResult result, float mean, int divisor)
    {
        float squaredDistances = 0;
        foreach (LabelAndValue child in result.LabelValues)
        {
            squaredDistances += (float)Math.Pow(mean - (int)child.Value, 2);
        }
        float variance = squaredDistances / divisor;
        return (float)Math.Sqrt(variance);
    }
}
}