lucene/facet/src/java/org/apache/lucene/facet/RandomSamplingFacetsCollector.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.facet;

 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;

 import org.apache.lucene.facet.FacetsConfig.DimConfig;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.util.BitDocIdSet;
 import org.apache.lucene.util.FixedBitSet;

 /**
  * Collects hits for subsequent faceting, using sampling if needed. Once you've
  * run a search and collected hits into this, instantiate one of the
  * {@link Facets} subclasses to do the facet counting. Note that this collector
  * does not collect the scores of matching docs (i.e.
  * {@link FacetsCollector.MatchingDocs#scores} is {@code null}).
  * <p>
  * If you require the original set of hits, you can call
  * {@link #getOriginalMatchingDocs()}. Also, since the counts of the top-facets
  * are based on the sampled set, you can amortize the counts by calling
  * {@link #amortizeFacetCounts}.
  */
 public class RandomSamplingFacetsCollector extends FacetsCollector {

   /**
    * Faster alternative for java.util.Random, inspired by
    * http://dmurphy747.wordpress.com/2011/03/23/xorshift-vs-random-
    * performance-in-java/
    * <p>
    * Has a period of 2^64-1
    */
   private static class XORShift64Random {

     /** Current generator state, advanced by every call to {@link #randomLong()}. */
     private long x;

     /**
      * Creates a xorshift random generator using the provided seed. A seed of
      * {@code 0} is replaced by a fixed non-zero constant, because a zero state
      * would stay zero under the shift/xor steps of {@link #randomLong()}.
      */
     public XORShift64Random(long seed) {
       // NOTE: 0xdeadbeef is an int literal, so it is sign-extended when widened to long
       x = seed == 0 ? 0xdeadbeef : seed;
     }

     /** Get the next random long value */
     public long randomLong() {
       x ^= (x << 21);
       x ^= (x >>> 35);
       x ^= (x << 4);
       return x;
     }

     /** Get the next random int, between 0 (inclusive) and n (exclusive) */
     public int nextInt(int n) {
       // the remainder's magnitude is below |n|, so it fits an int and
       // negating it cannot overflow
       int res = (int) (randomLong() % n);
       return (res < 0) ? -res : res;
     }

   }

   /** Marker for "no value yet" in {@link #totalHits}, {@link #leftoverBin} and {@link #leftoverIndex}. */
   private static final int NOT_CALCULATED = -1;

   private final int sampleSize;
   private final XORShift64Random random;

   /** {@code sampleSize / totalHits}; only assigned once sampling actually takes place. */
   private double samplingRate;
   /** Cached sampled view of the matching docs, built on first demand. */
   private List<MatchingDocs> sampledDocs;
   /** Sum of {@code totalHits} over all segments, computed lazily in {@link #getMatchingDocs()}. */
   private int totalHits = NOT_CALCULATED;
   /** Number of hits still missing from the bin left unfinished by the previous segment. */
   private int leftoverBin = NOT_CALCULATED;
   /**
    * Position, relative to the start of the next segment's hits, of the hit to
    * sample for the unfinished bin; {@link #NOT_CALCULATED} if that bin was
    * already sampled.
    */
   private int leftoverIndex = NOT_CALCULATED;

   /**
    * Constructor with the given sample size and default seed.
    *
    * @see #RandomSamplingFacetsCollector(int, long)
    */
   public RandomSamplingFacetsCollector(int sampleSize) {
     this(sampleSize, 0);
   }

   /**
    * Constructor with the given sample size and seed.
    *
    * @param sampleSize
    *          The preferred sample size. If the number of hits is greater than
    *          the size, sampling will be done using a sample ratio of sampling
    *          size / totalN. For example: 1000 hits, sample size = 10 results in
    *          samplingRatio of 0.01. If the number of hits is lower, no sampling
    *          is done at all
    * @param seed
    *          The random seed. If {@code 0} then a fixed default seed is used.
    */
   public RandomSamplingFacetsCollector(int sampleSize, long seed) {
     super(false);
     this.sampleSize = sampleSize;
     this.random = new XORShift64Random(seed);
     this.sampledDocs = null;
   }

   /**
    * Returns the sampled list of the matching documents. Note that a
    * {@link FacetsCollector.MatchingDocs} instance is returned per segment, even
    * if no hits from that segment are included in the sampled set.
    * <p>
    * Note: One or more of the MatchingDocs might be empty (not containing any
    * hits) as result of sampling.
    * <p>
    * Note: {@code MatchingDocs.totalHits} is copied from the original
    * MatchingDocs, scores is set to {@code null}
    * <p>
    * The total hit count and the sampled list are computed on the first call
    * and reused by later calls. If the total number of hits does not exceed
    * the sample size, the original (unsampled) list is returned.
    */
   @Override
   public List<MatchingDocs> getMatchingDocs() {
     List<MatchingDocs> matchingDocs = super.getMatchingDocs();

     if (totalHits == NOT_CALCULATED) {
       totalHits = 0;
       for (MatchingDocs md : matchingDocs) {
         totalHits += md.totalHits;
       }
     }

     if (totalHits <= sampleSize) {
       return matchingDocs;
     }

     if (sampledDocs == null) {
       samplingRate = (1.0 * sampleSize) / totalHits;
       sampledDocs = createSampledDocs(matchingDocs);
     }
     return sampledDocs;
   }

   /** Returns the original matching documents. */
   public List<MatchingDocs> getOriginalMatchingDocs() {
     return super.getMatchingDocs();
   }

   /** Create a sampled copy of the matching documents list, one entry per input entry and in the same order. */
   private List<MatchingDocs> createSampledDocs(List<MatchingDocs> matchingDocsList) {
     List<MatchingDocs> sampledDocsList = new ArrayList<>(matchingDocsList.size());
     for (MatchingDocs docs : matchingDocsList) {
       sampledDocsList.add(createSample(docs));
     }
     return sampledDocsList;
   }

   /**
    * Create a sample of the given hits.
    * <p>
    * The hits are consumed in bins of {@code 1 / samplingRate} consecutive
    * hits, and one randomly chosen hit per bin is copied into the sample. A bin
    * that is still open when the segment's hits run out is carried over to the
    * next call through {@link #leftoverBin} and {@link #leftoverIndex}.
    *
    * @param docs the hits of a single segment
    * @return a new {@link MatchingDocs} for the same segment holding only the
    *         sampled hits, with {@code totalHits} copied from {@code docs} and
    *         {@code null} scores
    * @throws RuntimeException wrapping any {@link IOException} raised while
    *         iterating over the hits
    */
   private MatchingDocs createSample(MatchingDocs docs) {
     int maxdoc = docs.context.reader().maxDoc();

     // TODO: we could try the WAH8DocIdSet here as well, as the results will be sparse
     FixedBitSet sampleDocs = new FixedBitSet(maxdoc);

     int binSize = (int) (1.0 / samplingRate);

     try {
       int counter = 0;
       int limit, randomIndex;
       if (leftoverBin != NOT_CALCULATED) {
         limit = leftoverBin;
         // either NOT_CALCULATED, which means we already sampled from that bin,
         // or the next document to sample
         randomIndex = leftoverIndex;
       } else {
         limit = binSize;
         randomIndex = random.nextInt(binSize);
       }
       final DocIdSetIterator it = docs.bits.iterator();
       for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
         if (counter == randomIndex) {
           sampleDocs.set(doc);
         }
         counter++;
         if (counter >= limit) {
           // bin is complete: start a fresh, full-sized bin
           counter = 0;
           limit = binSize;
           randomIndex = random.nextInt(binSize);
         }
       }

       if (counter == 0) {
         // we either exhausted the bin and the iterator at the same time, or
         // this segment had no results. in the latter case we might want to
         // carry leftover to the next segment as is, but that complicates the
         // code and doesn't seem so important.
         leftoverBin = leftoverIndex = NOT_CALCULATED;
       } else {
         leftoverBin = limit - counter;
         if (randomIndex >= counter) {
           // the document to sample has not been reached yet, so it belongs to
           // the next segment. This includes randomIndex == counter, where it
           // is the very first hit of the next segment (offset 0); leaving
           // leftoverIndex untouched in that case would reuse a stale value.
           leftoverIndex = randomIndex - counter;
         } else {
           // we sampled a document from the bin, so just skip over remaining
           // documents in the bin in the next segment.
           leftoverIndex = NOT_CALCULATED;
         }
       }

       return new MatchingDocs(docs.context, new BitDocIdSet(sampleDocs), docs.totalHits, null);
     } catch (IOException e) {
       throw new RuntimeException(e);
     }
   }

   /**
    * Note: if you use a counting {@link Facets} implementation, you can amortize the
    * sampled counts by calling this method. Uses the {@link FacetsConfig} and
    * the {@link IndexSearcher} to determine the upper bound for each facet value.
    *
    * @param res the facet result computed over the sampled hits
    * @param config supplies the index field name of the result's dimension
    * @param searcher supplies the reader used to look up the upper bounds
    * @return {@code res} itself if it is {@code null} or the total number of
    *         hits does not exceed the sample size; otherwise a new
    *         {@link FacetResult} whose counts are divided by the sampling rate,
    *         each label capped by the document frequency of its term and the
    *         total capped by the reader's number of documents
    * @throws IOException if thrown while looking up document frequencies
    */
   public FacetResult amortizeFacetCounts(FacetResult res, FacetsConfig config, IndexSearcher searcher) throws IOException {
     if (res == null || totalHits <= sampleSize) {
       return res;
     }

     LabelAndValue[] fixedLabelValues = new LabelAndValue[res.labelValues.length];
     IndexReader reader = searcher.getIndexReader();
     DimConfig dimConfig = config.getDimConfig(res.dim);

     // +2 to prepend dimension, append child label
     String[] childPath = new String[res.path.length + 2];
     childPath[0] = res.dim;

     System.arraycopy(res.path, 0, childPath, 1, res.path.length); // reuse

     for (int i = 0; i < res.labelValues.length; i++) {
       // only the last slot changes between iterations
       childPath[res.path.length + 1] = res.labelValues[i].label;
       String fullPath = FacetsConfig.pathToString(childPath, childPath.length);
       int max = reader.docFreq(new Term(dimConfig.indexFieldName, fullPath));
       int correctedCount = (int) (res.labelValues[i].value.doubleValue() / samplingRate);
       correctedCount = Math.min(max, correctedCount);
       fixedLabelValues[i] = new LabelAndValue(res.labelValues[i].label, correctedCount);
     }

     // cap the total count on the total number of non-deleted documents in the reader
     int correctedTotalCount = res.value.intValue();
     if (correctedTotalCount > 0) {
       correctedTotalCount = Math.min(reader.numDocs(), (int) (res.value.doubleValue() / samplingRate));
     }

     return new FacetResult(res.dim, res.path, correctedTotalCount, fixedLabelValues, res.childCount);
   }

   /**
    * Returns the sampling rate that was used. The rate is only assigned when
    * {@link #getMatchingDocs()} actually samples; until then it holds the
    * field's default value of {@code 0.0}.
    */
   public double getSamplingRate() {
     return samplingRate;
   }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.facet;

	import java.io.IOException;
	import java.util.ArrayList;
	import java.util.List;

	import org.apache.lucene.facet.FacetsConfig.DimConfig;
	import org.apache.lucene.index.IndexReader;
	import org.apache.lucene.index.Term;
	import org.apache.lucene.search.DocIdSetIterator;
	import org.apache.lucene.search.IndexSearcher;
	import org.apache.lucene.util.BitDocIdSet;
	import org.apache.lucene.util.FixedBitSet;

	/**
	* Collects hits for subsequent faceting, using sampling if needed. Once you've
	* run a search and collected hits into this, instantiate one of the
	* {@link Facets} subclasses to do the facet counting. Note that this collector
	* does not collect the scores of matching docs (i.e.
	* {@link FacetsCollector.MatchingDocs#scores} is {@code null}).
	* <p>
	* If you require the original set of hits, you can call
	* {@link #getOriginalMatchingDocs()}. Also, since the counts of the top-facets
	* are based on the sampled set, you can amortize the counts by calling
	* {@link #amortizeFacetCounts}.
	*/
	public class RandomSamplingFacetsCollector extends FacetsCollector {

	/**
	* Faster alternative for java.util.Random, inspired by
	* http://dmurphy747.wordpress.com/2011/03/23/xorshift-vs-random-
	* performance-in-java/
	* <p>
	* Has a period of 2^64-1
	*/
	private static class XORShift64Random {

	private long x;

	/** Creates a xorshift random generator using the provided seed */
	public XORShift64Random(long seed) {
	x = seed == 0 ? 0xdeadbeef : seed;
	}

	/** Get the next random long value */
	public long randomLong() {
	x ^= (x << 21);
	x ^= (x >>> 35);
	x ^= (x << 4);
	return x;
	}

	/** Get the next random int, between 0 (inclusive) and n (exclusive) */
	public int nextInt(int n) {
	int res = (int) (randomLong() % n);
	return (res < 0) ? -res : res;
	}

	}

	private final static int NOT_CALCULATED = -1;

	private final int sampleSize;
	private final XORShift64Random random;

	private double samplingRate;
	private List<MatchingDocs> sampledDocs;
	private int totalHits = NOT_CALCULATED;
	private int leftoverBin = NOT_CALCULATED;
	private int leftoverIndex = NOT_CALCULATED;

	/**
	* Constructor with the given sample size and default seed.
	*
	* @see #RandomSamplingFacetsCollector(int, long)
	*/
	public RandomSamplingFacetsCollector(int sampleSize) {
	this(sampleSize, 0);
	}

	/**
	* Constructor with the given sample size and seed.
	*
	* @param sampleSize
	* The preferred sample size. If the number of hits is greater than
	* the size, sampling will be done using a sample ratio of sampling
	* size / totalN. For example: 1000 hits, sample size = 10 results in
	* samplingRatio of 0.01. If the number of hits is lower, no sampling
	* is done at all
	* @param seed
	* The random seed. If {@code 0} then a seed will be chosen for you.
	*/
	public RandomSamplingFacetsCollector(int sampleSize, long seed) {
	super(false);
	this.sampleSize = sampleSize;
	this.random = new XORShift64Random(seed);
	this.sampledDocs = null;
	}

	/**
	* Returns the sampled list of the matching documents. Note that a
	* {@link FacetsCollector.MatchingDocs} instance is returned per segment, even
	* if no hits from that segment are included in the sampled set.
	* <p>
	* Note: One or more of the MatchingDocs might be empty (not containing any
	* hits) as result of sampling.
	* <p>
	* Note: {@code MatchingDocs.totalHits} is copied from the original
	* MatchingDocs, scores is set to {@code null}
	*/
	@Override
	public List<MatchingDocs> getMatchingDocs() {
	List<MatchingDocs> matchingDocs = super.getMatchingDocs();

	if (totalHits == NOT_CALCULATED) {
	totalHits = 0;
	for (MatchingDocs md : matchingDocs) {
	totalHits += md.totalHits;
	}
	}

	if (totalHits <= sampleSize) {
	return matchingDocs;
	}

	if (sampledDocs == null) {
	samplingRate = (1.0 * sampleSize) / totalHits;
	sampledDocs = createSampledDocs(matchingDocs);
	}
	return sampledDocs;
	}

	/** Returns the original matching documents. */
	public List<MatchingDocs> getOriginalMatchingDocs() {
	return super.getMatchingDocs();
	}

	/** Create a sampled copy of the matching documents list. */
	private List<MatchingDocs> createSampledDocs(List<MatchingDocs> matchingDocsList) {
	List<MatchingDocs> sampledDocsList = new ArrayList<>(matchingDocsList.size());
	for (MatchingDocs docs : matchingDocsList) {
	sampledDocsList.add(createSample(docs));
	}
	return sampledDocsList;
	}

	/** Create a sampled of the given hits. */
	private MatchingDocs createSample(MatchingDocs docs) {
	int maxdoc = docs.context.reader().maxDoc();

	// TODO: we could try the WAH8DocIdSet here as well, as the results will be sparse
	FixedBitSet sampleDocs = new FixedBitSet(maxdoc);

	int binSize = (int) (1.0 / samplingRate);

	try {
	int counter = 0;
	int limit, randomIndex;
	if (leftoverBin != NOT_CALCULATED) {
	limit = leftoverBin;
	// either NOT_CALCULATED, which means we already sampled from that bin,
	// or the next document to sample
	randomIndex = leftoverIndex;
	} else {
	limit = binSize;
	randomIndex = random.nextInt(binSize);
	}
	final DocIdSetIterator it = docs.bits.iterator();
	for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
	if (counter == randomIndex) {
	sampleDocs.set(doc);
	}
	counter++;
	if (counter >= limit) {
	counter = 0;
	limit = binSize;
	randomIndex = random.nextInt(binSize);
	}
	}

	if (counter == 0) {
	// we either exhausted the bin and the iterator at the same time, or
	// this segment had no results. in the latter case we might want to
	// carry leftover to the next segment as is, but that complicates the
	// code and doesn't seem so important.
	leftoverBin = leftoverIndex = NOT_CALCULATED;
	} else {
	leftoverBin = limit - counter;
	if (randomIndex > counter) {
	// the document to sample is in the next bin
	leftoverIndex = randomIndex - counter;
	} else if (randomIndex < counter) {
	// we sampled a document from the bin, so just skip over remaining
	// documents in the bin in the next segment.
	leftoverIndex = NOT_CALCULATED;
	}
	}

	return new MatchingDocs(docs.context, new BitDocIdSet(sampleDocs), docs.totalHits, null);
	} catch (IOException e) {
	throw new RuntimeException(e);
	}
	}

	/**
	* Note: if you use a counting {@link Facets} implementation, you can amortize the
	* sampled counts by calling this method. Uses the {@link FacetsConfig} and
	* the {@link IndexSearcher} to determine the upper bound for each facet value.
	*/
	public FacetResult amortizeFacetCounts(FacetResult res, FacetsConfig config, IndexSearcher searcher) throws IOException {
	if (res == null \|\| totalHits <= sampleSize) {
	return res;
	}

	LabelAndValue[] fixedLabelValues = new LabelAndValue[res.labelValues.length];
	IndexReader reader = searcher.getIndexReader();
	DimConfig dimConfig = config.getDimConfig(res.dim);

	// +2 to prepend dimension, append child label
	String[] childPath = new String[res.path.length + 2];
	childPath[0] = res.dim;

	System.arraycopy(res.path, 0, childPath, 1, res.path.length); // reuse

	for (int i = 0; i < res.labelValues.length; i++) {
	childPath[res.path.length + 1] = res.labelValues[i].label;
	String fullPath = FacetsConfig.pathToString(childPath, childPath.length);
	int max = reader.docFreq(new Term(dimConfig.indexFieldName, fullPath));
	int correctedCount = (int) (res.labelValues[i].value.doubleValue() / samplingRate);
	correctedCount = Math.min(max, correctedCount);
	fixedLabelValues[i] = new LabelAndValue(res.labelValues[i].label, correctedCount);
	}

	// cap the total count on the total number of non-deleted documents in the reader
	int correctedTotalCount = res.value.intValue();
	if (correctedTotalCount > 0) {
	correctedTotalCount = Math.min(reader.numDocs(), (int) (res.value.doubleValue() / samplingRate));
	}

	return new FacetResult(res.dim, res.path, correctedTotalCount, fixedLabelValues, res.childCount);
	}

	/** Returns the sampling rate that was used. */
	public double getSamplingRate() {
	return samplingRate;
	}

	}