| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.facet; |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.List; |
| |
| import org.apache.lucene.facet.FacetsConfig.DimConfig; |
| import org.apache.lucene.index.IndexReader; |
| import org.apache.lucene.index.Term; |
| import org.apache.lucene.search.DocIdSetIterator; |
| import org.apache.lucene.search.IndexSearcher; |
| import org.apache.lucene.util.BitDocIdSet; |
| import org.apache.lucene.util.FixedBitSet; |
| |
| /** |
| * Collects hits for subsequent faceting, using sampling if needed. Once you've |
| * run a search and collect hits into this, instantiate one of the |
| * {@link Facets} subclasses to do the facet counting. Note that this collector |
| * does not collect the scores of matching docs (i.e. |
| * {@link FacetsCollector.MatchingDocs#scores}) is {@code null}. |
| * <p> |
| * If you require the original set of hits, you can call |
| * {@link #getOriginalMatchingDocs()}. Also, since the counts of the top-facets |
| * is based on the sampled set, you can amortize the counts by calling |
| * {@link #amortizeFacetCounts}. |
| */ |
| public class RandomSamplingFacetsCollector extends FacetsCollector { |
| |
| /** |
| * Faster alternative for java.util.Random, inspired by |
| * http://dmurphy747.wordpress.com/2011/03/23/xorshift-vs-random- |
| * performance-in-java/ |
| * <p> |
| * Has a period of 2^64-1 |
| */ |
| private static class XORShift64Random { |
| |
| private long x; |
| |
| /** Creates a xorshift random generator using the provided seed */ |
| public XORShift64Random(long seed) { |
| x = seed == 0 ? 0xdeadbeef : seed; |
| } |
| |
| /** Get the next random long value */ |
| public long randomLong() { |
| x ^= (x << 21); |
| x ^= (x >>> 35); |
| x ^= (x << 4); |
| return x; |
| } |
| |
| /** Get the next random int, between 0 (inclusive) and n (exclusive) */ |
| public int nextInt(int n) { |
| int res = (int) (randomLong() % n); |
| return (res < 0) ? -res : res; |
| } |
| |
| } |
| |
| private final static int NOT_CALCULATED = -1; |
| |
| private final int sampleSize; |
| private final XORShift64Random random; |
| |
| private double samplingRate; |
| private List<MatchingDocs> sampledDocs; |
| private int totalHits = NOT_CALCULATED; |
| private int leftoverBin = NOT_CALCULATED; |
| private int leftoverIndex = NOT_CALCULATED; |
| |
| /** |
| * Constructor with the given sample size and default seed. |
| * |
| * @see #RandomSamplingFacetsCollector(int, long) |
| */ |
| public RandomSamplingFacetsCollector(int sampleSize) { |
| this(sampleSize, 0); |
| } |
| |
| /** |
| * Constructor with the given sample size and seed. |
| * |
| * @param sampleSize |
| * The preferred sample size. If the number of hits is greater than |
| * the size, sampling will be done using a sample ratio of sampling |
| * size / totalN. For example: 1000 hits, sample size = 10 results in |
| * samplingRatio of 0.01. If the number of hits is lower, no sampling |
| * is done at all |
| * @param seed |
| * The random seed. If {@code 0} then a seed will be chosen for you. |
| */ |
| public RandomSamplingFacetsCollector(int sampleSize, long seed) { |
| super(false); |
| this.sampleSize = sampleSize; |
| this.random = new XORShift64Random(seed); |
| this.sampledDocs = null; |
| } |
| |
| /** |
| * Returns the sampled list of the matching documents. Note that a |
| * {@link FacetsCollector.MatchingDocs} instance is returned per segment, even |
| * if no hits from that segment are included in the sampled set. |
| * <p> |
| * Note: One or more of the MatchingDocs might be empty (not containing any |
| * hits) as result of sampling. |
| * <p> |
| * Note: {@code MatchingDocs.totalHits} is copied from the original |
| * MatchingDocs, scores is set to {@code null} |
| */ |
| @Override |
| public List<MatchingDocs> getMatchingDocs() { |
| List<MatchingDocs> matchingDocs = super.getMatchingDocs(); |
| |
| if (totalHits == NOT_CALCULATED) { |
| totalHits = 0; |
| for (MatchingDocs md : matchingDocs) { |
| totalHits += md.totalHits; |
| } |
| } |
| |
| if (totalHits <= sampleSize) { |
| return matchingDocs; |
| } |
| |
| if (sampledDocs == null) { |
| samplingRate = (1.0 * sampleSize) / totalHits; |
| sampledDocs = createSampledDocs(matchingDocs); |
| } |
| return sampledDocs; |
| } |
| |
| /** Returns the original matching documents. */ |
| public List<MatchingDocs> getOriginalMatchingDocs() { |
| return super.getMatchingDocs(); |
| } |
| |
| /** Create a sampled copy of the matching documents list. */ |
| private List<MatchingDocs> createSampledDocs(List<MatchingDocs> matchingDocsList) { |
| List<MatchingDocs> sampledDocsList = new ArrayList<>(matchingDocsList.size()); |
| for (MatchingDocs docs : matchingDocsList) { |
| sampledDocsList.add(createSample(docs)); |
| } |
| return sampledDocsList; |
| } |
| |
| /** Create a sampled of the given hits. */ |
| private MatchingDocs createSample(MatchingDocs docs) { |
| int maxdoc = docs.context.reader().maxDoc(); |
| |
| // TODO: we could try the WAH8DocIdSet here as well, as the results will be sparse |
| FixedBitSet sampleDocs = new FixedBitSet(maxdoc); |
| |
| int binSize = (int) (1.0 / samplingRate); |
| |
| try { |
| int counter = 0; |
| int limit, randomIndex; |
| if (leftoverBin != NOT_CALCULATED) { |
| limit = leftoverBin; |
| // either NOT_CALCULATED, which means we already sampled from that bin, |
| // or the next document to sample |
| randomIndex = leftoverIndex; |
| } else { |
| limit = binSize; |
| randomIndex = random.nextInt(binSize); |
| } |
| final DocIdSetIterator it = docs.bits.iterator(); |
| for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) { |
| if (counter == randomIndex) { |
| sampleDocs.set(doc); |
| } |
| counter++; |
| if (counter >= limit) { |
| counter = 0; |
| limit = binSize; |
| randomIndex = random.nextInt(binSize); |
| } |
| } |
| |
| if (counter == 0) { |
| // we either exhausted the bin and the iterator at the same time, or |
| // this segment had no results. in the latter case we might want to |
| // carry leftover to the next segment as is, but that complicates the |
| // code and doesn't seem so important. |
| leftoverBin = leftoverIndex = NOT_CALCULATED; |
| } else { |
| leftoverBin = limit - counter; |
| if (randomIndex > counter) { |
| // the document to sample is in the next bin |
| leftoverIndex = randomIndex - counter; |
| } else if (randomIndex < counter) { |
| // we sampled a document from the bin, so just skip over remaining |
| // documents in the bin in the next segment. |
| leftoverIndex = NOT_CALCULATED; |
| } |
| } |
| |
| return new MatchingDocs(docs.context, new BitDocIdSet(sampleDocs), docs.totalHits, null); |
| } catch (IOException e) { |
| throw new RuntimeException(e); |
| } |
| } |
| |
| /** |
| * Note: if you use a counting {@link Facets} implementation, you can amortize the |
| * sampled counts by calling this method. Uses the {@link FacetsConfig} and |
| * the {@link IndexSearcher} to determine the upper bound for each facet value. |
| */ |
| public FacetResult amortizeFacetCounts(FacetResult res, FacetsConfig config, IndexSearcher searcher) throws IOException { |
| if (res == null || totalHits <= sampleSize) { |
| return res; |
| } |
| |
| LabelAndValue[] fixedLabelValues = new LabelAndValue[res.labelValues.length]; |
| IndexReader reader = searcher.getIndexReader(); |
| DimConfig dimConfig = config.getDimConfig(res.dim); |
| |
| // +2 to prepend dimension, append child label |
| String[] childPath = new String[res.path.length + 2]; |
| childPath[0] = res.dim; |
| |
| System.arraycopy(res.path, 0, childPath, 1, res.path.length); // reuse |
| |
| for (int i = 0; i < res.labelValues.length; i++) { |
| childPath[res.path.length + 1] = res.labelValues[i].label; |
| String fullPath = FacetsConfig.pathToString(childPath, childPath.length); |
| int max = reader.docFreq(new Term(dimConfig.indexFieldName, fullPath)); |
| int correctedCount = (int) (res.labelValues[i].value.doubleValue() / samplingRate); |
| correctedCount = Math.min(max, correctedCount); |
| fixedLabelValues[i] = new LabelAndValue(res.labelValues[i].label, correctedCount); |
| } |
| |
| // cap the total count on the total number of non-deleted documents in the reader |
| int correctedTotalCount = res.value.intValue(); |
| if (correctedTotalCount > 0) { |
| correctedTotalCount = Math.min(reader.numDocs(), (int) (res.value.doubleValue() / samplingRate)); |
| } |
| |
| return new FacetResult(res.dim, res.path, correctedTotalCount, fixedLabelValues, res.childCount); |
| } |
| |
| /** Returns the sampling rate that was used. */ |
| public double getSamplingRate() { |
| return samplingRate; |
| } |
| |
| } |