blob: ec39ef7a649e258d2c8ec0111d92f60106659fe1 [file] [log] [blame]
package org.apache.lucene.facet.sampling;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.facet.params.FacetSearchParams;
import org.apache.lucene.facet.search.Aggregator;
import org.apache.lucene.facet.search.FacetArrays;
import org.apache.lucene.facet.search.FacetRequest;
import org.apache.lucene.facet.search.FacetResult;
import org.apache.lucene.facet.search.FacetResultNode;
import org.apache.lucene.facet.search.ScoredDocIDs;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Sampling definition for facets accumulation
* <p>
* The Sampler uses TAKMI style counting to provide a 'best guess' top-K result
* set of the facets accumulated.
* <p>
* Note: Sampling accumulation (Accumulation over a sampled-set of the results),
* does not guarantee accurate values for
* {@link FacetResult#getNumValidDescendants()}.
*
* @lucene.experimental
*/
public abstract class Sampler {

  /** Sampling parameters controlling ratio, min/max sample size and over-sampling. */
  protected final SamplingParams samplingParams;

  /**
   * Construct with default {@link SamplingParams}.
   */
  public Sampler() {
    this(new SamplingParams());
  }

  /**
   * Construct with certain {@link SamplingParams}
   *
   * @param params sampling params in effect
   * @throws IllegalArgumentException if the provided SamplingParams are not valid
   */
  public Sampler(SamplingParams params) throws IllegalArgumentException {
    if (!params.validate()) {
      throw new IllegalArgumentException("The provided SamplingParams are not valid!!");
    }
    this.samplingParams = params;
  }

  /**
   * Check if this sampler would complement for the input docIds.
   * Sampling kicks in only when the result set exceeds the configured threshold.
   */
  public boolean shouldSample(ScoredDocIDs docIds) {
    return docIds.size() > samplingParams.getSamplingThreshold();
  }

  /**
   * Compute a sample set out of the input set, based on the {@link SamplingParams#getSampleRatio()}
   * in effect. Sub classes can override to alter how the sample set is
   * computed.
   * <p>
   * If the input set is of size smaller than {@link SamplingParams#getMinSampleSize()},
   * the input set is returned (no sampling takes place).
   * <p>
   * Other than that, the returned set size will not be larger than {@link SamplingParams#getMaxSampleSize()}
   * nor smaller than {@link SamplingParams#getMinSampleSize()}.
   * @param docids
   *          full set of matching documents out of which a sample is needed.
   */
  public SampleResult getSampleSet(ScoredDocIDs docids) throws IOException {
    if (!shouldSample(docids)) {
      // Below the sampling threshold: use the full set, with an exact ratio of 1.
      return new SampleResult(docids, 1d);
    }
    int actualSize = docids.size();
    // Desired sample size by ratio, then clamped into [minSampleSize, maxSampleSize].
    int sampleSetSize = (int) (actualSize * samplingParams.getSampleRatio());
    sampleSetSize = Math.max(sampleSetSize, samplingParams.getMinSampleSize());
    sampleSetSize = Math.min(sampleSetSize, samplingParams.getMaxSampleSize());
    return createSample(docids, actualSize, sampleSetSize);
  }

  /**
   * Create and return a sample of the input set
   * @param docids input set out of which a sample is to be created
   * @param actualSize original size of set, prior to sampling
   * @param sampleSetSize required size of sample set
   * @return sample of the input set in the required size
   */
  protected abstract SampleResult createSample(ScoredDocIDs docids, int actualSize, int sampleSetSize)
      throws IOException;

  /**
   * Result of sample computation: the sampled doc IDs together with the
   * ratio actually achieved (which may differ from the requested ratio
   * because of min/max clamping).
   */
  public final static class SampleResult {
    public final ScoredDocIDs docids;
    public final double actualSampleRatio;

    protected SampleResult(ScoredDocIDs docids, double actualSampleRatio) {
      this.docids = docids;
      this.actualSampleRatio = actualSampleRatio;
    }
  }

  /**
   * Return the sampling params in effect
   */
  public final SamplingParams getSamplingParams() {
    return samplingParams;
  }

  /**
   * Trim the input facet result.<br>
   * Note: It is only valid to call this method with result obtained for a
   * facet request created through {@link #overSampledSearchParams(FacetSearchParams)}.
   *
   * @throws IllegalArgumentException
   *           if called with results not obtained for requests created
   *           through {@link #overSampledSearchParams(FacetSearchParams)}
   */
  public FacetResult trimResult(FacetResult facetResult) throws IllegalArgumentException {
    double overSampleFactor = getSamplingParams().getOversampleFactor();
    if (overSampleFactor <= 1) { // no factoring done?
      return facetResult;
    }

    // An explicit instanceof check instead of catching ClassCastException:
    // same IllegalArgumentException to the caller, without using exceptions
    // for control flow. The message names the real method,
    // overSampledSearchParams() (was a typo: "overSamlpingSearchParams").
    FacetRequest req = facetResult.getFacetRequest();
    if (!(req instanceof OverSampledFacetRequest)) {
      throw new IllegalArgumentException(
          "It is only valid to call this method with result obtained for a " +
          "facet request created through sampler.overSampledSearchParams()");
    }
    OverSampledFacetRequest sampledFreq = (OverSampledFacetRequest) req;

    // Trim each level of the tree back to the originally requested numResults.
    FacetRequest origFrq = sampledFreq.orig;
    FacetResultNode trimmedRootNode = facetResult.getFacetResultNode();
    trimSubResults(trimmedRootNode, origFrq.numResults);

    return new FacetResult(origFrq, trimmedRootNode, facetResult.getNumValidDescendants());
  }

  /** Trim sub results to a given size, recursively applying the same limit at every level. */
  private void trimSubResults(FacetResultNode node, int size) {
    if (node.subResults == FacetResultNode.EMPTY_SUB_RESULTS || node.subResults.isEmpty()) {
      return;
    }

    ArrayList<FacetResultNode> trimmed = new ArrayList<FacetResultNode>(size);
    for (int i = 0; i < node.subResults.size() && i < size; i++) {
      FacetResultNode trimmedNode = node.subResults.get(i);
      trimSubResults(trimmedNode, size);
      trimmed.add(trimmedNode);
    }

    node.subResults = trimmed;
  }

  /**
   * Over-sampled search params, wrapping each request with an over-sampled one.
   * Over-sampling requests more results than asked for, to compensate for the
   * statistical error introduced by sampling; {@link #trimResult(FacetResult)}
   * later trims the results back to the originally requested size.
   */
  public FacetSearchParams overSampledSearchParams(FacetSearchParams original) {
    FacetSearchParams res = original;
    // So now we can sample -> altering the searchParams to accommodate for the statistical error for the sampling
    double overSampleFactor = getSamplingParams().getOversampleFactor();
    if (overSampleFactor > 1) { // any factoring to do?
      List<FacetRequest> facetRequests = new ArrayList<FacetRequest>();
      for (FacetRequest frq : original.facetRequests) {
        int overSampledNumResults = (int) Math.ceil(frq.numResults * overSampleFactor);
        facetRequests.add(new OverSampledFacetRequest(frq, overSampledNumResults));
      }
      res = new FacetSearchParams(original.indexingParams, facetRequests);
    }
    return res;
  }

  /**
   * Wrapping a facet request for over sampling.
   * Implementation detail: even if the original request is a count request, no
   * statistics will be computed for it as the wrapping is not a count request.
   * This is ok, as the sampling accumulator is later computing the statistics
   * over the original requests.
   */
  private static class OverSampledFacetRequest extends FacetRequest {
    /** The original (non-over-sampled) request, needed by {@link Sampler#trimResult}. */
    final FacetRequest orig;

    public OverSampledFacetRequest(FacetRequest orig, int num) {
      super(orig.categoryPath, num);
      this.orig = orig;
      setDepth(orig.getDepth());
      setNumLabel(0); // don't label anything as we're over-sampling
      setResultMode(orig.getResultMode());
      setSortOrder(orig.getSortOrder());
    }

    // All computation is delegated to the wrapped request; only numResults differs.
    @Override
    public Aggregator createAggregator(boolean useComplements, FacetArrays arrays, TaxonomyReader taxonomy)
        throws IOException {
      return orig.createAggregator(useComplements, arrays, taxonomy);
    }

    @Override
    public FacetArraysSource getFacetArraysSource() {
      return orig.getFacetArraysSource();
    }

    @Override
    public double getValueOf(FacetArrays arrays, int idx) {
      return orig.getValueOf(arrays, idx);
    }
  }
}