| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.facet; |
| |
| import java.util.List; |
| import java.util.Random; |
| |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field.Store; |
| import org.apache.lucene.document.StringField; |
| import org.apache.lucene.facet.FacetsCollector.MatchingDocs; |
| import org.apache.lucene.facet.taxonomy.FastTaxonomyFacetCounts; |
| import org.apache.lucene.facet.taxonomy.TaxonomyReader; |
| import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader; |
| import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter; |
| import org.apache.lucene.index.RandomIndexWriter; |
| import org.apache.lucene.index.Term; |
| import org.apache.lucene.search.DocIdSetIterator; |
| import org.apache.lucene.search.IndexSearcher; |
| import org.apache.lucene.search.MultiCollector; |
| import org.apache.lucene.search.TermQuery; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.util.IOUtils; |
| |
| public class TestRandomSamplingFacetsCollector extends FacetTestCase { |
| |
| // The first 50 chi-square value for p-value=0.05, taken from: |
| // http://en.wikibooks.org/wiki/Engineering_Tables/Chi-Squared_Distibution |
| private static final float[] CHI_SQUARE_VALUES = new float[] {0.0f, 3.841f, |
| 5.991f, 7.815f, 9.488f, 11.07f, 12.592f, 14.067f, 15.507f, 16.919f, |
| 18.307f, 19.675f, 21.026f, 22.362f, 23.685f, 24.996f, 26.296f, 27.587f, |
| 28.869f, 30.144f, 31.41f, 32.671f, 33.924f, 35.172f, 36.415f, 37.652f, |
| 38.885f, 40.113f, 41.337f, 42.557f, 43.773f, 44.985f, 46.194f, 47.4f, |
| 48.602f, 49.802f, 50.998f, 52.192f, 53.384f, 54.572f, 55.758f, 56.942f, |
| 58.124f, 59.304f, 60.481f, 61.656f, 62.83f, 64.001f, 65.171f, 66.339f, |
| 67.505f}; |
| |
| public void testRandomSampling() throws Exception { |
| Directory dir = newDirectory(); |
| Directory taxoDir = newDirectory(); |
| |
| Random random = random(); |
| DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir); |
| RandomIndexWriter writer = new RandomIndexWriter(random, dir); |
| |
| FacetsConfig config = new FacetsConfig(); |
| |
| final int numCategories = 10; |
| int numDocs = atLeast(10000); |
| for (int i = 0; i < numDocs; i++) { |
| Document doc = new Document(); |
| doc.add(new StringField("EvenOdd", (i % 2 == 0) ? "even" : "odd", Store.NO)); |
| doc.add(new FacetField("iMod10", Integer.toString(i % numCategories))); |
| writer.addDocument(config.build(taxoWriter, doc)); |
| } |
| writer.forceMerge(CHI_SQUARE_VALUES.length - 1); |
| |
| // NRT open |
| IndexSearcher searcher = newSearcher(writer.getReader()); |
| TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter); |
| IOUtils.close(writer, taxoWriter); |
| |
| // Test empty results |
| RandomSamplingFacetsCollector collectRandomZeroResults = new RandomSamplingFacetsCollector(numDocs / 10, random.nextLong()); |
| |
| // There should be no divisions by zero |
| searcher.search(new TermQuery(new Term("EvenOdd", "NeverMatches")), collectRandomZeroResults); |
| |
| // There should be no divisions by zero and no null result |
| assertNotNull(collectRandomZeroResults.getMatchingDocs()); |
| |
| // There should be no results at all |
| for (MatchingDocs doc : collectRandomZeroResults.getMatchingDocs()) { |
| assertEquals(0, doc.totalHits); |
| } |
| |
| // Now start searching and retrieve results. |
| |
| // Use a query to select half of the documents. |
| TermQuery query = new TermQuery(new Term("EvenOdd", "even")); |
| |
| RandomSamplingFacetsCollector random10Percent = new RandomSamplingFacetsCollector(numDocs / 10, random.nextLong()); // 10% of total docs, 20% of the hits |
| |
| FacetsCollector fc = new FacetsCollector(); |
| |
| searcher.search(query, MultiCollector.wrap(fc, random10Percent)); |
| |
| final List<MatchingDocs> matchingDocs = random10Percent.getMatchingDocs(); |
| |
| // count the total hits and sampled docs, also store the number of sampled |
| // docs per segment |
| int totalSampledDocs = 0, totalHits = 0; |
| int[] numSampledDocs = new int[matchingDocs.size()]; |
| // System.out.println("numSegments=" + numSampledDocs.length); |
| for (int i = 0; i < numSampledDocs.length; i++) { |
| MatchingDocs md = matchingDocs.get(i); |
| final DocIdSetIterator iter = md.bits.iterator(); |
| while (iter.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) ++numSampledDocs[i]; |
| totalSampledDocs += numSampledDocs[i]; |
| totalHits += md.totalHits; |
| } |
| |
| // compute the chi-square value for the sampled documents' distribution |
| float chi_square = 0; |
| for (int i = 0; i < numSampledDocs.length; i++) { |
| MatchingDocs md = matchingDocs.get(i); |
| float ei = (float) md.totalHits / totalHits; |
| if (ei > 0.0f) { |
| float oi = (float) numSampledDocs[i] / totalSampledDocs; |
| chi_square += (Math.pow(ei - oi, 2) / ei); |
| } |
| } |
| |
| // Verify that the chi-square value isn't too big. According to |
| // http://en.wikipedia.org/wiki/Chi-squared_distribution#Table_of_.CF.872_value_vs_p-value, |
| // we basically verify that there is a really small chance of hitting a very |
| // bad sample (p-value < 0.05), for n-degrees of freedom. The number 'n' depends |
| // on the number of segments. |
| assertTrue("chisquare not statistically significant enough: " + chi_square, chi_square < CHI_SQUARE_VALUES[numSampledDocs.length]); |
| |
| // Test amortized counts - should be 5X the sampled count, but maximum numDocs/10 |
| final FastTaxonomyFacetCounts random10FacetCounts = new FastTaxonomyFacetCounts(taxoReader, config, random10Percent); |
| final FacetResult random10Result = random10FacetCounts.getTopChildren(10, "iMod10"); |
| final FacetResult amortized10Result = random10Percent.amortizeFacetCounts(random10Result, config, searcher); |
| for (int i = 0; i < amortized10Result.labelValues.length; i++) { |
| LabelAndValue amortized = amortized10Result.labelValues[i]; |
| LabelAndValue sampled = random10Result.labelValues[i]; |
| // since numDocs may not divide by 10 exactly, allow for some slack in the amortized count |
| assertEquals(amortized.value.floatValue(), Math.min(5 * sampled.value.floatValue(), numDocs / 10.f), 1.0); |
| } |
| |
| IOUtils.close(searcher.getIndexReader(), taxoReader, dir, taxoDir); |
| } |
| |
| } |