blob: 152a40dcc702996cf362d2ccacecadd7019b0b04 [file] [log] [blame]
package org.apache.lucene.facet.sampling;
import java.util.List;
import java.util.Random;
import org.apache.lucene.facet.params.FacetIndexingParams;
import org.apache.lucene.facet.params.FacetSearchParams;
import org.apache.lucene.facet.sampling.RandomSampler;
import org.apache.lucene.facet.sampling.RepeatableSampler;
import org.apache.lucene.facet.sampling.Sampler;
import org.apache.lucene.facet.sampling.SamplingParams;
import org.apache.lucene.facet.search.BaseTestTopK;
import org.apache.lucene.facet.search.FacetRequest;
import org.apache.lucene.facet.search.FacetResult;
import org.apache.lucene.facet.search.FacetsCollector;
import org.apache.lucene.facet.search.StandardFacetsAccumulator;
import org.apache.lucene.facet.search.FacetRequest.ResultMode;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Base class for top-K facet tests that compare facet results computed with
 * sampling against results computed without it. Subclasses supply the
 * sampling-capable accumulator through
 * {@link #getSamplingAccumulator(Sampler, TaxonomyReader, IndexReader, FacetSearchParams)}.
 */
public abstract class BaseSampleTestTopK extends BaseTestTopK {

  /** Number of top results requested from each facet request. */
  protected static final int K = 2;

  /**
   * Maximum number of sampling attempts. Correct code may still fail on an
   * unlucky sample, so the sampled comparison is retried up to this many times.
   */
  protected static final int RETRIES = 10;

  /**
   * Builds the search params through the superclass and then assigns a randomly
   * chosen {@link ResultMode} to every facet request.
   */
  @Override
  protected FacetSearchParams searchParamsWithRequests(int numResults, FacetIndexingParams fip) {
    FacetSearchParams params = super.searchParamsWithRequests(numResults, fip);
    for (FacetRequest request : params.facetRequests) {
      // randomize the way we aggregate results
      ResultMode mode = random().nextBoolean() ? ResultMode.GLOBAL_FLAT : ResultMode.PER_NODE_IN_TREE;
      request.setResultMode(mode);
    }
    return params;
  }

  /**
   * Creates the accumulator under test, wired to the given sampler, readers and
   * search params.
   */
  protected abstract StandardFacetsAccumulator getSamplingAccumulator(Sampler sampler, TaxonomyReader taxoReader,
      IndexReader indexReader, FacetSearchParams searchParams);

  /**
   * Runs faceted search with sampling enabled, once with complements disabled
   * and once with complements forced, and compares each outcome with the
   * non-sampled results of the same query. The whole scenario is repeated for
   * every configured partition size.
   */
  public void testCountUsingSampling() throws Exception {
    final boolean useRandomSampler = random().nextBoolean();

    for (int partitionSize : partitionSizes) {
      try {
        // complements return counts for all ordinals, so ALL_PARENTS indexing
        // is forced to keep the comparison simple
        FacetIndexingParams fip = getFacetIndexingParams(partitionSize, true);
        initIndex(fip);

        // control run: same query, no sampling
        Query q = new TermQuery(new Term(CONTENT_FIELD, BETA)); // 90% of the docs
        FacetSearchParams controlParams = searchParamsWithRequests(K, fip);
        FacetsCollector controlCollector = FacetsCollector.create(controlParams, indexReader, taxoReader);
        searcher.search(q, controlCollector);
        List<FacetResult> expectedResults = controlCollector.getFacetResults();

        FacetSearchParams samplingSearchParams = searchParamsWithRequests(K, fip);
        assertSamplingWithRetries(expectedResults, q, useRandomSampler, samplingSearchParams);
      } finally {
        closeAll();
      }
    }
  }

  /**
   * Attempts the sampled comparison up to {@link #RETRIES} times, because the
   * check can fail when the top K facets are not sufficiently common within
   * the sample set. Only the assertion failure of the last attempt propagates.
   */
  private void assertSamplingWithRetries(List<FacetResult> expected, Query q, boolean useRandomSampler,
      FacetSearchParams samplingSearchParams) throws Exception {
    for (int attempt = 0; attempt < RETRIES; attempt++) {
      try {
        // complement with sampling!
        final Sampler sampler = createSampler(attempt, useRandomSampler, samplingSearchParams);
        assertSampling(expected, q, sampler, samplingSearchParams, false);
        assertSampling(expected, q, sampler, samplingSearchParams, true);
        return; // succeeded
      } catch (AssertionError failure) {
        final boolean lastAttempt = attempt >= RETRIES - 1;
        if (lastAttempt) {
          throw failure; // no more retries allowed, must fail
        }
      }
    }
  }

  /**
   * Executes the query through a sampling collector and asserts that its facet
   * results are the same as the expected ones.
   */
  private void assertSampling(List<FacetResult> expected, Query q, Sampler sampler, FacetSearchParams params, boolean complement) throws Exception {
    FacetsCollector collector = samplingCollector(complement, sampler, params);
    searcher.search(q, collector);
    assertSameResults(expected, collector.getFacetResults());
  }

  /**
   * Creates a collector backed by the subclass-provided sampling accumulator,
   * with complements either forced or disabled.
   */
  private FacetsCollector samplingCollector(final boolean complement, final Sampler sampler,
      FacetSearchParams samplingSearchParams) {
    StandardFacetsAccumulator accumulator =
        getSamplingAccumulator(sampler, taxoReader, indexReader, samplingSearchParams);
    if (complement) {
      accumulator.setComplementThreshold(StandardFacetsAccumulator.FORCE_COMPLEMENT);
    } else {
      accumulator.setComplementThreshold(StandardFacetsAccumulator.DISABLE_COMPLEMENT);
    }
    return FacetsCollector.create(accumulator);
  }

  /**
   * Builds a sampler whose oversample factor, sample ratio and min/max sample
   * sizes are scaled by {@code 1.01^nTrial}, so each retry samples slightly
   * more than the previous one.
   */
  private Sampler createSampler(int nTrial, boolean useRandomSampler, FacetSearchParams sParams) {
    SamplingParams samplingParams = new SamplingParams();

    // Use exact fixing (TakmiSampleFixer): amortized results are not easy to
    // validate against the non-sampled ones.
    samplingParams.setSampleFixer(new TakmiSampleFixer(indexReader, taxoReader, sParams));

    final double growth = Math.pow(1.01, nTrial);
    samplingParams.setOversampleFactor(5.0 * growth); // Oversampling
    samplingParams.setSampleRatio(0.8 * growth);
    samplingParams.setMinSampleSize((int) (100 * growth));
    samplingParams.setMaxSampleSize((int) (10000 * growth));
    samplingParams.setSamplingThreshold(11000); // force sampling

    if (useRandomSampler) {
      return new RandomSampler(samplingParams, new Random(random().nextLong()));
    }
    return new RepeatableSampler(samplingParams);
  }
}