| using System; |
| using System.Collections.Generic; |
| using System.IO; |
| |
| namespace Lucene.Net.Facet |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| using DimConfig = FacetsConfig.DimConfig; |
| using DocIdSetIterator = Lucene.Net.Search.DocIdSetIterator; |
| using FixedBitSet = Lucene.Net.Util.FixedBitSet; |
| using IndexReader = Lucene.Net.Index.IndexReader; |
| using IndexSearcher = Lucene.Net.Search.IndexSearcher; |
| using Term = Lucene.Net.Index.Term; |
| |
| /// <summary> |
| /// Collects hits for subsequent faceting, using sampling if needed. Once you've |
| /// run a search and collect hits into this, instantiate one of the |
| /// <see cref="Facets"/> subclasses to do the facet counting. Note that this collector |
| /// does not collect the scores of matching docs (i.e. |
| /// <see cref="FacetsCollector.MatchingDocs.Scores"/>) is <c>null</c>. |
| /// <para> |
| /// If you require the original set of hits, you can call |
| /// <see cref="GetOriginalMatchingDocs()"/>. Also, since the counts of the top-facets |
| /// is based on the sampled set, you can amortize the counts by calling |
| /// <see cref="AmortizeFacetCounts"/>. |
| /// </para> |
| /// </summary> |
| public class RandomSamplingFacetsCollector : FacetsCollector |
| { |
| /// <summary> |
| /// Faster alternative for java.util.Random, inspired by |
| /// http://dmurphy747.wordpress.com/2011/03/23/xorshift-vs-random-performance-in-java/ |
| /// <para> |
| /// Has a period of 2^64-1 |
| /// </para> |
| /// </summary> |
| private class XORShift64Random |
| { |
| internal long x; |
| |
| /// <summary> |
| /// Creates a xorshift random generator using the provided seed |
| /// </summary> |
| public XORShift64Random(long seed) |
| { |
| x = seed == 0 ? 0xdeadbeef : seed; |
| } |
| |
| /// <summary> |
| /// Get the next random long value |
| /// <para/> |
| /// NOTE: This was randomLong() in Lucene |
| /// </summary> |
| public virtual long RandomInt64() |
| { |
| x ^= (x << 21); |
| x ^= ((long)((ulong)x >> 35)); |
| x ^= (x << 4); |
| return x; |
| } |
| |
| /// <summary> |
| /// Get the next random int, between 0 (inclusive) and <paramref name="n"/> (exclusive) |
| /// <para/> |
| /// NOTE: This was nextInt() in Lucene |
| /// </summary> |
| public virtual int NextInt32(int n) |
| { |
| int res = (int)(RandomInt64() % n); |
| return (res < 0) ? -res : res; |
| } |
| } |
| |
| private const int NOT_CALCULATED = -1; |
| |
| private readonly int sampleSize; |
| private readonly XORShift64Random random; |
| |
| private double samplingRate; |
| private IList<MatchingDocs> sampledDocs; |
| private int totalHits = NOT_CALCULATED; |
| private int leftoverBin = NOT_CALCULATED; |
| private int leftoverIndex = NOT_CALCULATED; |
| |
| /// <summary> |
| /// Constructor with the given sample size and default seed. |
| /// </summary> |
| /// <seealso cref="RandomSamplingFacetsCollector(int, long)"/> |
| public RandomSamplingFacetsCollector(int sampleSize) |
| : this(sampleSize, 0) |
| { |
| } |
| |
| /// <summary> |
| /// Constructor with the given sample size and seed. |
| /// </summary> |
| /// <param name="sampleSize"> |
| /// The preferred sample size. If the number of hits is greater than |
| /// the size, sampling will be done using a sample ratio of sampling |
| /// size / totalN. For example: 1000 hits, sample size = 10 results in |
| /// samplingRatio of 0.01. If the number of hits is lower, no sampling |
| /// is done at all </param> |
| /// <param name="seed"> |
| /// The random seed. If <c>0</c> then a seed will be chosen for you. </param> |
| public RandomSamplingFacetsCollector(int sampleSize, long seed) |
| : base(false) |
| { |
| this.sampleSize = sampleSize; |
| this.random = new XORShift64Random(seed); |
| this.sampledDocs = null; |
| } |
| |
| /// <summary> |
| /// Returns the sampled list of the matching documents. Note that a |
| /// <see cref="FacetsCollector.MatchingDocs"/> instance is returned per segment, even |
| /// if no hits from that segment are included in the sampled set. |
| /// <para> |
| /// Note: One or more of the <see cref="FacetsCollector.MatchingDocs"/> might be empty (not containing any |
| /// hits) as result of sampling. |
| /// </para> |
| /// <para> |
| /// Note: <see cref="FacetsCollector.MatchingDocs.TotalHits"/> is copied from the original |
| /// <see cref="FacetsCollector.MatchingDocs"/>, scores is set to <c>null</c> |
| /// </para> |
| /// </summary> |
| public override IList<MatchingDocs> GetMatchingDocs() |
| { |
| IList<MatchingDocs> matchingDocs = base.GetMatchingDocs(); |
| |
| if (totalHits == NOT_CALCULATED) |
| { |
| totalHits = 0; |
| foreach (MatchingDocs md in matchingDocs) |
| { |
| totalHits += md.TotalHits; |
| } |
| } |
| |
| if (totalHits <= sampleSize) |
| { |
| return matchingDocs; |
| } |
| |
| if (sampledDocs == null) |
| { |
| samplingRate = (1.0 * sampleSize) / totalHits; |
| sampledDocs = CreateSampledDocs(matchingDocs); |
| } |
| return sampledDocs; |
| } |
| |
| /// <summary> |
| /// Returns the original matching documents. |
| /// </summary> |
| public virtual IList<MatchingDocs> GetOriginalMatchingDocs() |
| { |
| return base.GetMatchingDocs(); |
| } |
| |
| /// <summary> |
| /// Create a sampled copy of the matching documents list. |
| /// </summary> |
| private IList<MatchingDocs> CreateSampledDocs(ICollection<MatchingDocs> matchingDocsList) |
| { |
| List<MatchingDocs> sampledDocsList = new List<MatchingDocs>(matchingDocsList.Count); |
| foreach (MatchingDocs docs in matchingDocsList) |
| { |
| sampledDocsList.Add(CreateSample(docs)); |
| } |
| return sampledDocsList; |
| } |
| |
| /// <summary> |
| /// Create a sampled of the given hits. |
| /// </summary> |
| private MatchingDocs CreateSample(MatchingDocs docs) |
| { |
| int maxdoc = docs.Context.Reader.MaxDoc; |
| |
| // TODO: we could try the WAH8DocIdSet here as well, as the results will be sparse |
| FixedBitSet sampleDocs = new FixedBitSet(maxdoc); |
| |
| int binSize = (int)(1.0 / samplingRate); |
| |
| try |
| { |
| int counter = 0; |
| int limit, randomIndex; |
| if (leftoverBin != NOT_CALCULATED) |
| { |
| limit = leftoverBin; |
| // either NOT_CALCULATED, which means we already sampled from that bin, |
| // or the next document to sample |
| randomIndex = leftoverIndex; |
| } |
| else |
| { |
| limit = binSize; |
| randomIndex = random.NextInt32(binSize); |
| } |
| DocIdSetIterator it = docs.Bits.GetIterator(); |
| for (int doc = it.NextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.NextDoc()) |
| { |
| if (counter == randomIndex) |
| { |
| sampleDocs.Set(doc); |
| } |
| counter++; |
| if (counter >= limit) |
| { |
| counter = 0; |
| limit = binSize; |
| randomIndex = random.NextInt32(binSize); |
| } |
| } |
| |
| if (counter == 0) |
| { |
| // we either exhausted the bin and the iterator at the same time, or |
| // this segment had no results. in the latter case we might want to |
| // carry leftover to the next segment as is, but that complicates the |
| // code and doesn't seem so important. |
| leftoverBin = leftoverIndex = NOT_CALCULATED; |
| } |
| else |
| { |
| leftoverBin = limit - counter; |
| if (randomIndex > counter) |
| { |
| // the document to sample is in the next bin |
| leftoverIndex = randomIndex - counter; |
| } |
| else if (randomIndex < counter) |
| { |
| // we sampled a document from the bin, so just skip over remaining |
| // documents in the bin in the next segment. |
| leftoverIndex = NOT_CALCULATED; |
| } |
| } |
| |
| return new MatchingDocs(docs.Context, sampleDocs, docs.TotalHits, null); |
| } |
| catch (IOException) |
| { |
| throw new Exception(); |
| } |
| } |
| |
| /// <summary> |
| /// Note: if you use a counting <see cref="Facets"/> implementation, you can amortize the |
| /// sampled counts by calling this method. Uses the <see cref="FacetsConfig"/> and |
| /// the <see cref="IndexSearcher"/> to determine the upper bound for each facet value. |
| /// </summary> |
| public virtual FacetResult AmortizeFacetCounts(FacetResult res, FacetsConfig config, IndexSearcher searcher) |
| { |
| if (res == null || totalHits <= sampleSize) |
| { |
| return res; |
| } |
| |
| LabelAndValue[] fixedLabelValues = new LabelAndValue[res.LabelValues.Length]; |
| IndexReader reader = searcher.IndexReader; |
| DimConfig dimConfig = config.GetDimConfig(res.Dim); |
| |
| // +2 to prepend dimension, append child label |
| string[] childPath = new string[res.Path.Length + 2]; |
| childPath[0] = res.Dim; |
| |
| Array.Copy(res.Path, 0, childPath, 1, res.Path.Length); // reuse |
| |
| for (int i = 0; i < res.LabelValues.Length; i++) |
| { |
| childPath[res.Path.Length + 1] = res.LabelValues[i].Label; |
| string fullPath = FacetsConfig.PathToString(childPath, childPath.Length); |
| int max = reader.DocFreq(new Term(dimConfig.IndexFieldName, fullPath)); |
| int correctedCount = (int)((double)res.LabelValues[i].Value / samplingRate); |
| correctedCount = Math.Min(max, correctedCount); |
| fixedLabelValues[i] = new LabelAndValue(res.LabelValues[i].Label, correctedCount); |
| } |
| |
| // cap the total count on the total number of non-deleted documents in the reader |
| int correctedTotalCount = (int)res.Value; |
| if (correctedTotalCount > 0) |
| { |
| correctedTotalCount = Math.Min(reader.NumDocs, (int)((double)res.Value / samplingRate)); |
| } |
| |
| return new FacetResult(res.Dim, res.Path, correctedTotalCount, fixedLabelValues, res.ChildCount); |
| } |
| |
| /// <summary> |
| /// Returns the sampling rate that was used. |
| /// </summary> |
| public virtual double SamplingRate => samplingRate; |
| } |
| } |