lucene/facet/src/test/org/apache/lucene/facet/TestRandomSamplingFacetsCollector.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.facet;

 import java.util.List;
 import java.util.Random;

 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field.Store;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.facet.FacetsCollector.MatchingDocs;
 import org.apache.lucene.facet.taxonomy.FastTaxonomyFacetCounts;
 import org.apache.lucene.facet.taxonomy.TaxonomyReader;
 import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
 import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MultiCollector;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.IOUtils;

 /**
  * Tests {@link RandomSamplingFacetsCollector}: that a query with no hits yields
  * empty (but non-null) matching docs, that the sampled documents are spread
  * over the collected {@link MatchingDocs} in proportion to the hits (checked
  * with a chi-square test), and that amortized facet counts are scaled up from
  * the sampled counts.
  */
 public class TestRandomSamplingFacetsCollector extends FacetTestCase {

   // The first 50 chi-square values for p-value=0.05 (index = degrees of freedom;
   // index 0 is a placeholder), taken from:
   // http://en.wikibooks.org/wiki/Engineering_Tables/Chi-Squared_Distibution
   private static final float[] CHI_SQUARE_VALUES = new float[] {0.0f, 3.841f,
       5.991f, 7.815f, 9.488f, 11.07f, 12.592f, 14.067f, 15.507f, 16.919f,
       18.307f, 19.675f, 21.026f, 22.362f, 23.685f, 24.996f, 26.296f, 27.587f,
       28.869f, 30.144f, 31.41f, 32.671f, 33.924f, 35.172f, 36.415f, 37.652f,
       38.885f, 40.113f, 41.337f, 42.557f, 43.773f, 44.985f, 46.194f, 47.4f,
       48.602f, 49.802f, 50.998f, 52.192f, 53.384f, 54.572f, 55.758f, 56.942f,
       58.124f, 59.304f, 60.481f, 61.656f, 62.83f, 64.001f, 65.171f, 66.339f,
       67.505f};

   /**
    * Indexes at least 10000 documents tagged "even"/"odd" and with an "iMod10"
    * facet, then samples the "even" half through a
    * {@link RandomSamplingFacetsCollector} and verifies the sample.
    *
    * @throws Exception if indexing, searching or closing the resources fails
    */
   public void testRandomSampling() throws Exception {
     Directory dir = newDirectory();
     Directory taxoDir = newDirectory();

     Random random = random();
     DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
     RandomIndexWriter writer = new RandomIndexWriter(random, dir);

     FacetsConfig config = new FacetsConfig();

     final int numCategories = 10;
     int numDocs = atLeast(10000);
     for (int i = 0; i < numDocs; i++) {
       Document doc = new Document();
       doc.add(new StringField("EvenOdd", (i % 2 == 0) ? "even" : "odd", Store.NO));
       doc.add(new FacetField("iMod10", Integer.toString(i % numCategories)));
       writer.addDocument(config.build(taxoWriter, doc));
     }
     // Cap the segment count at the largest valid index into CHI_SQUARE_VALUES,
     // since the table is indexed by the number of matching-docs entries below.
     writer.forceMerge(CHI_SQUARE_VALUES.length - 1);

     // NRT open
     IndexSearcher searcher = newSearcher(writer.getReader());
     TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter);
     IOUtils.close(writer, taxoWriter);

     // Test empty results
     RandomSamplingFacetsCollector collectRandomZeroResults =
         new RandomSamplingFacetsCollector(numDocs / 10, random.nextLong());

     // There should be no divisions by zero
     searcher.search(new TermQuery(new Term("EvenOdd", "NeverMatches")), collectRandomZeroResults);

     // There should be no divisions by zero and no null result
     assertNotNull(collectRandomZeroResults.getMatchingDocs());

     // There should be no results at all
     for (MatchingDocs doc : collectRandomZeroResults.getMatchingDocs()) {
       assertEquals(0, doc.totalHits);
     }

     // Now start searching and retrieve results.

     // Use a query to select half of the documents.
     TermQuery query = new TermQuery(new Term("EvenOdd", "even"));

     // 10% of total docs, 20% of the hits
     RandomSamplingFacetsCollector random10Percent =
         new RandomSamplingFacetsCollector(numDocs / 10, random.nextLong());

     FacetsCollector fc = new FacetsCollector();

     searcher.search(query, MultiCollector.wrap(fc, random10Percent));

     final List<MatchingDocs> matchingDocs = random10Percent.getMatchingDocs();

     // count the total hits and sampled docs, also store the number of sampled
     // docs per segment
     int totalSampledDocs = 0;
     int totalHits = 0;
     int[] numSampledDocs = new int[matchingDocs.size()];
     for (int i = 0; i < numSampledDocs.length; i++) {
       MatchingDocs md = matchingDocs.get(i);
       final DocIdSetIterator iter = md.bits.iterator();
       while (iter.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
         ++numSampledDocs[i];
       }
       totalSampledDocs += numSampledDocs[i];
       totalHits += md.totalHits;
     }

     // compute the chi-square value for the sampled documents' distribution:
     // ei is the expected share of entry i (its share of all hits), oi is the
     // observed share (its share of all sampled docs)
     float chiSquare = 0;
     for (int i = 0; i < numSampledDocs.length; i++) {
       MatchingDocs md = matchingDocs.get(i);
       float ei = (float) md.totalHits / totalHits;
       if (ei > 0.0f) { // entries without hits would divide by zero
         float oi = (float) numSampledDocs[i] / totalSampledDocs;
         chiSquare += (Math.pow(ei - oi, 2) / ei);
       }
     }

     // Verify that the chi-square value isn't too big. According to
     // http://en.wikipedia.org/wiki/Chi-squared_distribution#Table_of_.CF.872_value_vs_p-value,
     // we basically verify that there is a really small chance of hitting a very
     // bad sample (p-value < 0.05), for n-degrees of freedom. The number 'n' depends
     // on the number of segments.
     assertTrue("chisquare not statistically significant enough: " + chiSquare,
         chiSquare < CHI_SQUARE_VALUES[numSampledDocs.length]);

     // Test amortized counts - should be 5X the sampled count, but maximum numDocs/10
     final FastTaxonomyFacetCounts random10FacetCounts =
         new FastTaxonomyFacetCounts(taxoReader, config, random10Percent);
     final FacetResult random10Result = random10FacetCounts.getTopChildren(10, "iMod10");
     final FacetResult amortized10Result =
         random10Percent.amortizeFacetCounts(random10Result, config, searcher);
     for (int i = 0; i < amortized10Result.labelValues.length; i++) {
       LabelAndValue amortized = amortized10Result.labelValues[i];
       LabelAndValue sampled = random10Result.labelValues[i];
       // since numDocs may not divide by 10 exactly, allow for some slack in the amortized count
       float expected = Math.min(5 * sampled.value.floatValue(), numDocs / 10.f);
       // expected value first, so a failure reports the values in the right roles
       assertEquals(expected, amortized.value.floatValue(), 1.0);
     }

     IOUtils.close(searcher.getIndexReader(), taxoReader, dir, taxoDir);
   }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.facet;

	import java.util.List;
	import java.util.Random;

	import org.apache.lucene.document.Document;
	import org.apache.lucene.document.Field.Store;
	import org.apache.lucene.document.StringField;
	import org.apache.lucene.facet.FacetsCollector.MatchingDocs;
	import org.apache.lucene.facet.taxonomy.FastTaxonomyFacetCounts;
	import org.apache.lucene.facet.taxonomy.TaxonomyReader;
	import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
	import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
	import org.apache.lucene.index.RandomIndexWriter;
	import org.apache.lucene.index.Term;
	import org.apache.lucene.search.DocIdSetIterator;
	import org.apache.lucene.search.IndexSearcher;
	import org.apache.lucene.search.MultiCollector;
	import org.apache.lucene.search.TermQuery;
	import org.apache.lucene.store.Directory;
	import org.apache.lucene.util.IOUtils;

	/**
	 * Tests {@link RandomSamplingFacetsCollector}: that a query with no hits yields
	 * empty (but non-null) matching docs, that the sampled documents are spread
	 * over the collected {@link MatchingDocs} in proportion to the hits (checked
	 * with a chi-square test), and that amortized facet counts are scaled up from
	 * the sampled counts.
	 */
	public class TestRandomSamplingFacetsCollector extends FacetTestCase {

	  // The first 50 chi-square values for p-value=0.05 (index = degrees of freedom;
	  // index 0 is a placeholder), taken from:
	  // http://en.wikibooks.org/wiki/Engineering_Tables/Chi-Squared_Distibution
	  private static final float[] CHI_SQUARE_VALUES = new float[] {0.0f, 3.841f,
	      5.991f, 7.815f, 9.488f, 11.07f, 12.592f, 14.067f, 15.507f, 16.919f,
	      18.307f, 19.675f, 21.026f, 22.362f, 23.685f, 24.996f, 26.296f, 27.587f,
	      28.869f, 30.144f, 31.41f, 32.671f, 33.924f, 35.172f, 36.415f, 37.652f,
	      38.885f, 40.113f, 41.337f, 42.557f, 43.773f, 44.985f, 46.194f, 47.4f,
	      48.602f, 49.802f, 50.998f, 52.192f, 53.384f, 54.572f, 55.758f, 56.942f,
	      58.124f, 59.304f, 60.481f, 61.656f, 62.83f, 64.001f, 65.171f, 66.339f,
	      67.505f};

	  /**
	   * Indexes at least 10000 documents tagged "even"/"odd" and with an "iMod10"
	   * facet, then samples the "even" half through a
	   * {@link RandomSamplingFacetsCollector} and verifies the sample.
	   *
	   * @throws Exception if indexing, searching or closing the resources fails
	   */
	  public void testRandomSampling() throws Exception {
	    Directory dir = newDirectory();
	    Directory taxoDir = newDirectory();

	    Random random = random();
	    DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
	    RandomIndexWriter writer = new RandomIndexWriter(random, dir);

	    FacetsConfig config = new FacetsConfig();

	    final int numCategories = 10;
	    int numDocs = atLeast(10000);
	    for (int i = 0; i < numDocs; i++) {
	      Document doc = new Document();
	      doc.add(new StringField("EvenOdd", (i % 2 == 0) ? "even" : "odd", Store.NO));
	      doc.add(new FacetField("iMod10", Integer.toString(i % numCategories)));
	      writer.addDocument(config.build(taxoWriter, doc));
	    }
	    // Cap the segment count at the largest valid index into CHI_SQUARE_VALUES,
	    // since the table is indexed by the number of matching-docs entries below.
	    writer.forceMerge(CHI_SQUARE_VALUES.length - 1);

	    // NRT open
	    IndexSearcher searcher = newSearcher(writer.getReader());
	    TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter);
	    IOUtils.close(writer, taxoWriter);

	    // Test empty results
	    RandomSamplingFacetsCollector collectRandomZeroResults =
	        new RandomSamplingFacetsCollector(numDocs / 10, random.nextLong());

	    // There should be no divisions by zero
	    searcher.search(new TermQuery(new Term("EvenOdd", "NeverMatches")), collectRandomZeroResults);

	    // There should be no divisions by zero and no null result
	    assertNotNull(collectRandomZeroResults.getMatchingDocs());

	    // There should be no results at all
	    for (MatchingDocs doc : collectRandomZeroResults.getMatchingDocs()) {
	      assertEquals(0, doc.totalHits);
	    }

	    // Now start searching and retrieve results.

	    // Use a query to select half of the documents.
	    TermQuery query = new TermQuery(new Term("EvenOdd", "even"));

	    // 10% of total docs, 20% of the hits
	    RandomSamplingFacetsCollector random10Percent =
	        new RandomSamplingFacetsCollector(numDocs / 10, random.nextLong());

	    FacetsCollector fc = new FacetsCollector();

	    searcher.search(query, MultiCollector.wrap(fc, random10Percent));

	    final List<MatchingDocs> matchingDocs = random10Percent.getMatchingDocs();

	    // count the total hits and sampled docs, also store the number of sampled
	    // docs per segment
	    int totalSampledDocs = 0;
	    int totalHits = 0;
	    int[] numSampledDocs = new int[matchingDocs.size()];
	    for (int i = 0; i < numSampledDocs.length; i++) {
	      MatchingDocs md = matchingDocs.get(i);
	      final DocIdSetIterator iter = md.bits.iterator();
	      while (iter.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
	        ++numSampledDocs[i];
	      }
	      totalSampledDocs += numSampledDocs[i];
	      totalHits += md.totalHits;
	    }

	    // compute the chi-square value for the sampled documents' distribution:
	    // ei is the expected share of entry i (its share of all hits), oi is the
	    // observed share (its share of all sampled docs)
	    float chiSquare = 0;
	    for (int i = 0; i < numSampledDocs.length; i++) {
	      MatchingDocs md = matchingDocs.get(i);
	      float ei = (float) md.totalHits / totalHits;
	      if (ei > 0.0f) { // entries without hits would divide by zero
	        float oi = (float) numSampledDocs[i] / totalSampledDocs;
	        chiSquare += (Math.pow(ei - oi, 2) / ei);
	      }
	    }

	    // Verify that the chi-square value isn't too big. According to
	    // http://en.wikipedia.org/wiki/Chi-squared_distribution#Table_of_.CF.872_value_vs_p-value,
	    // we basically verify that there is a really small chance of hitting a very
	    // bad sample (p-value < 0.05), for n-degrees of freedom. The number 'n' depends
	    // on the number of segments.
	    assertTrue("chisquare not statistically significant enough: " + chiSquare,
	        chiSquare < CHI_SQUARE_VALUES[numSampledDocs.length]);

	    // Test amortized counts - should be 5X the sampled count, but maximum numDocs/10
	    final FastTaxonomyFacetCounts random10FacetCounts =
	        new FastTaxonomyFacetCounts(taxoReader, config, random10Percent);
	    final FacetResult random10Result = random10FacetCounts.getTopChildren(10, "iMod10");
	    final FacetResult amortized10Result =
	        random10Percent.amortizeFacetCounts(random10Result, config, searcher);
	    for (int i = 0; i < amortized10Result.labelValues.length; i++) {
	      LabelAndValue amortized = amortized10Result.labelValues[i];
	      LabelAndValue sampled = random10Result.labelValues[i];
	      // since numDocs may not divide by 10 exactly, allow for some slack in the amortized count
	      float expected = Math.min(5 * sampled.value.floatValue(), numDocs / 10.f);
	      // expected value first, so a failure reports the values in the right roles
	      assertEquals(expected, amortized.value.floatValue(), 1.0);
	    }

	    IOUtils.close(searcher.getIndexReader(), taxoReader, dir, taxoDir);
	  }

	}