| // Lucene version compatibility level 4.8.1 |
| using J2N.Text; |
| using Lucene.Net.Support; |
| using System; |
| using System.Collections.Generic; |
| |
| namespace Lucene.Net.Facet.SortedSet |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| using BytesRef = Lucene.Net.Util.BytesRef; |
| using DocIdSetIterator = Lucene.Net.Search.DocIdSetIterator; |
| using IndexReader = Lucene.Net.Index.IndexReader; |
| using MultiDocValues = Lucene.Net.Index.MultiDocValues; |
| using OrdRange = Lucene.Net.Facet.SortedSet.SortedSetDocValuesReaderState.OrdRange; |
| using ReaderUtil = Lucene.Net.Index.ReaderUtil; |
| using SortedSetDocValues = Lucene.Net.Index.SortedSetDocValues; |
| |
| /// <summary> |
| /// Compute facets counts from previously |
| /// indexed <see cref="SortedSetDocValuesFacetField"/>, |
| /// without require a separate taxonomy index. Faceting is |
| /// a bit slower (~25%), and there is added cost on every |
| /// <see cref="IndexReader"/> open to create a new |
| /// <see cref="SortedSetDocValuesReaderState"/>. Furthermore, this does |
| /// not support hierarchical facets; only flat (dimension + |
| /// label) facets, but it uses quite a bit less RAM to do |
| /// so. |
| /// |
| /// <para><b>NOTE</b>: this class should be instantiated and |
| /// then used from a single thread, because it holds a |
| /// thread-private instance of <see cref="SortedSetDocValues"/>. |
| /// |
| /// </para> |
| /// <para><b>NOTE:</b>: tie-break is by unicode sort order |
| /// |
| /// @lucene.experimental |
| /// </para> |
| /// </summary> |
| public class SortedSetDocValuesFacetCounts : Facets |
| { |
| internal readonly SortedSetDocValuesReaderState state; |
| internal readonly SortedSetDocValues dv; |
| internal readonly string field; |
| internal readonly int[] counts; |
| |
| /// <summary> |
| /// Sparse faceting: returns any dimension that had any |
| /// hits, topCount labels per dimension. |
| /// </summary> |
| public SortedSetDocValuesFacetCounts(SortedSetDocValuesReaderState state, FacetsCollector hits) |
| { |
| this.state = state; |
| this.field = state.Field; |
| dv = state.GetDocValues(); |
| counts = new int[state.Count]; |
| //System.out.println("field=" + field); |
| Count(hits.GetMatchingDocs()); |
| } |
| |
| public override FacetResult GetTopChildren(int topN, string dim, params string[] path) |
| { |
| if (topN <= 0) |
| { |
| throw new ArgumentException("topN must be > 0 (got: " + topN + ")"); |
| } |
| if (path.Length > 0) |
| { |
| throw new ArgumentException("path should be 0 length"); |
| } |
| OrdRange ordRange = state.GetOrdRange(dim); |
| if (ordRange == null) |
| { |
| throw new ArgumentException("dimension \"" + dim + "\" was not indexed"); |
| } |
| return GetDim(dim, ordRange, topN); |
| } |
| |
| private FacetResult GetDim(string dim, OrdRange ordRange, int topN) |
| { |
| TopOrdAndInt32Queue q = null; |
| |
| int bottomCount = 0; |
| |
| int dimCount = 0; |
| int childCount = 0; |
| |
| //System.out.println("getDim : " + ordRange.start + " - " + ordRange.end); |
| for (int ord = ordRange.Start; ord <= ordRange.End; ord++) |
| { |
| //System.out.println(" ord=" + ord + " count=" + counts[ord]); |
| if (counts[ord] > 0) |
| { |
| dimCount += counts[ord]; |
| childCount++; |
| if (counts[ord] > bottomCount) |
| { |
| if (q == null) |
| { |
| // Lazy init, so we don't create this for the |
| // sparse case unnecessarily |
| q = new TopOrdAndInt32Queue(topN); |
| } |
| // LUCENENET specific - use struct instead of reusing class instance for better performance |
| q.Insert(new OrdAndValue<int>(ord, counts[ord])); |
| if (q.Count == topN) |
| { |
| bottomCount = q.Top.Value; |
| } |
| } |
| } |
| } |
| |
| if (q == null) |
| { |
| return null; |
| } |
| |
| var scratch = new BytesRef(); |
| |
| LabelAndValue[] labelValues = new LabelAndValue[q.Count]; |
| for (int i = labelValues.Length - 1; i >= 0; i--) |
| { |
| var ordAndValue = q.Pop(); |
| dv.LookupOrd(ordAndValue.Ord, scratch); |
| string[] parts = FacetsConfig.StringToPath(scratch.Utf8ToString()); |
| labelValues[i] = new LabelAndValue(parts[1], ordAndValue.Value); |
| } |
| |
| return new FacetResult(dim, Arrays.Empty<string>(), dimCount, labelValues, childCount); |
| } |
| |
| /// <summary> |
| /// Does all the "real work" of tallying up the counts. </summary> |
| private void Count(IList<FacetsCollector.MatchingDocs> matchingDocs) |
| { |
| //System.out.println("ssdv count"); |
| |
| MultiDocValues.OrdinalMap ordinalMap; |
| |
| // TODO: is this right? really, we need a way to |
| // verify that this ordinalMap "matches" the leaves in |
| // matchingDocs... |
| if (dv is MultiDocValues.MultiSortedSetDocValues values && matchingDocs.Count > 1) |
| { |
| ordinalMap = values.Mapping; |
| } |
| else |
| { |
| ordinalMap = null; |
| } |
| |
| IndexReader origReader = state.OrigReader; |
| |
| foreach (FacetsCollector.MatchingDocs hits in matchingDocs) |
| { |
| var reader = hits.Context.AtomicReader; |
| //System.out.println(" reader=" + reader); |
| // LUCENE-5090: make sure the provided reader context "matches" |
| // the top-level reader passed to the |
| // SortedSetDocValuesReaderState, else cryptic |
| // AIOOBE can happen: |
| if (!Equals(ReaderUtil.GetTopLevelContext(hits.Context).Reader, origReader)) |
| { |
| throw new InvalidOperationException("the SortedSetDocValuesReaderState provided to this class does not match the reader being searched; you must create a new SortedSetDocValuesReaderState every time you open a new IndexReader"); |
| } |
| |
| SortedSetDocValues segValues = reader.GetSortedSetDocValues(field); |
| if (segValues == null) |
| { |
| continue; |
| } |
| |
| DocIdSetIterator docs = hits.Bits.GetIterator(); |
| |
| // TODO: yet another option is to count all segs |
| // first, only in seg-ord space, and then do a |
| // merge-sort-PQ in the end to only "resolve to |
| // global" those seg ords that can compete, if we know |
| // we just want top K? ie, this is the same algo |
| // that'd be used for merging facets across shards |
| // (distributed faceting). but this has much higher |
| // temp ram req'ts (sum of number of ords across all |
| // segs) |
| if (ordinalMap != null) |
| { |
| int segOrd = hits.Context.Ord; |
| |
| int numSegOrds = (int)segValues.ValueCount; |
| |
| if (hits.TotalHits < numSegOrds / 10) |
| { |
| //System.out.println(" remap as-we-go"); |
| // Remap every ord to global ord as we iterate: |
| int doc; |
| while ((doc = docs.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS) |
| { |
| //System.out.println(" doc=" + doc); |
| segValues.SetDocument(doc); |
| int term = (int)segValues.NextOrd(); |
| while (term != SortedSetDocValues.NO_MORE_ORDS) |
| { |
| //System.out.println(" segOrd=" + segOrd + " ord=" + term + " globalOrd=" + ordinalMap.getGlobalOrd(segOrd, term)); |
| counts[(int)ordinalMap.GetGlobalOrd(segOrd, term)]++; |
| term = (int)segValues.NextOrd(); |
| } |
| } |
| } |
| else |
| { |
| //System.out.println(" count in seg ord first"); |
| |
| // First count in seg-ord space: |
| int[] segCounts = new int[numSegOrds]; |
| int doc; |
| while ((doc = docs.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS) |
| { |
| //System.out.println(" doc=" + doc); |
| segValues.SetDocument(doc); |
| int term = (int)segValues.NextOrd(); |
| while (term != SortedSetDocValues.NO_MORE_ORDS) |
| { |
| //System.out.println(" ord=" + term); |
| segCounts[term]++; |
| term = (int)segValues.NextOrd(); |
| } |
| } |
| |
| // Then, migrate to global ords: |
| for (int ord = 0; ord < numSegOrds; ord++) |
| { |
| int count = segCounts[ord]; |
| if (count != 0) |
| { |
| //System.out.println(" migrate segOrd=" + segOrd + " ord=" + ord + " globalOrd=" + ordinalMap.getGlobalOrd(segOrd, ord)); |
| counts[(int)ordinalMap.GetGlobalOrd(segOrd, ord)] += count; |
| } |
| } |
| } |
| } |
| else |
| { |
| // No ord mapping (e.g., single segment index): |
| // just aggregate directly into counts: |
| int doc; |
| while ((doc = docs.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS) |
| { |
| segValues.SetDocument(doc); |
| int term = (int)segValues.NextOrd(); |
| while (term != SortedSetDocValues.NO_MORE_ORDS) |
| { |
| counts[term]++; |
| term = (int)segValues.NextOrd(); |
| } |
| } |
| } |
| } |
| } |
| |
| public override float GetSpecificValue(string dim, params string[] path) |
| { |
| if (path.Length != 1) |
| { |
| throw new ArgumentException("path must be length=1"); |
| } |
| int ord = (int)dv.LookupTerm(new BytesRef(FacetsConfig.PathToString(dim, path))); |
| if (ord < 0) |
| { |
| return -1; |
| } |
| |
| return counts[ord]; |
| } |
| |
| public override IList<FacetResult> GetAllDims(int topN) |
| { |
| List<FacetResult> results = new List<FacetResult>(); |
| foreach (KeyValuePair<string, OrdRange> ent in state.PrefixToOrdRange) |
| { |
| FacetResult fr = GetDim(ent.Key, ent.Value, topN); |
| if (fr != null) |
| { |
| results.Add(fr); |
| } |
| } |
| |
| // Sort by highest count: |
| results.Sort(Comparer<FacetResult>.Create((a, b) => |
| { |
| if ((int)a.Value > (int)b.Value) |
| { |
| return -1; |
| } |
| else if ((int)b.Value > (int)a.Value) |
| { |
| return 1; |
| } |
| else |
| { |
| return a.Dim.CompareToOrdinal(b.Dim); |
| } |
| })); |
| return results; |
| } |
| } |
| } |