| using Lucene.Net.Diagnostics; |
| using Lucene.Net.Index; |
| using System; |
| using System.Collections.Generic; |
| using System.Diagnostics; |
| using System.IO; |
| using System.Linq; |
| using JCG = J2N.Collections.Generic; |
| |
| namespace Lucene.Net.Search.Grouping |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /// <summary> |
| /// FirstPassGroupingCollector is the first of two passes necessary |
| /// to collect grouped hits. This pass gathers the top N sorted |
| /// groups. Concrete subclasses define what a group is and how it |
| /// is internally collected. |
| /// |
| /// <para> |
| /// See <a href="https://github.com/apache/lucene-solr/blob/releases/lucene-solr/4.8.0/lucene/grouping/src/java/org/apache/lucene/search/grouping/package.html">org.apache.lucene.search.grouping</a> for more |
| /// details including a full code example. |
| /// </para> |
| /// @lucene.experimental |
| /// </summary> |
| /// <typeparam name="TGroupValue"></typeparam> |
| public abstract class AbstractFirstPassGroupingCollector<TGroupValue> : IAbstractFirstPassGroupingCollector<TGroupValue> |
| { |
| private readonly Sort groupSort; |
| private readonly FieldComparer[] comparers; |
| private readonly int[] reversed; |
| private readonly int topNGroups; |
| private readonly IDictionary<TGroupValue, CollectedSearchGroup<TGroupValue>> groupMap; |
| private readonly int compIDXEnd; |
| |
| // Set once we reach topNGroups unique groups: |
| // @lucene.internal |
| protected JCG.SortedSet<CollectedSearchGroup<TGroupValue>> m_orderedGroups; |
| private int docBase; |
| private int spareSlot; |
| |
| /// <summary> |
| /// Create the first pass collector. |
| /// </summary> |
| /// <param name="groupSort"> |
| /// The <see cref="Sort"/> used to sort the |
| /// groups. The top sorted document within each group |
| /// according to groupSort, determines how that group |
| /// sorts against other groups. This must be non-null, |
| /// ie, if you want to groupSort by relevance use |
| /// Sort.RELEVANCE. |
| /// </param> |
| /// <param name="topNGroups">How many top groups to keep.</param> |
| /// <exception cref="IOException">If I/O related errors occur</exception> |
| public AbstractFirstPassGroupingCollector(Sort groupSort, int topNGroups) |
| { |
| if (topNGroups < 1) |
| { |
| throw new ArgumentException("topNGroups must be >= 1 (got " + topNGroups + ")"); |
| } |
| |
| // TODO: allow null groupSort to mean "by relevance", |
| // and specialize it? |
| this.groupSort = groupSort; |
| |
| this.topNGroups = topNGroups; |
| |
| SortField[] sortFields = groupSort.GetSort(); |
| comparers = new FieldComparer[sortFields.Length]; |
| compIDXEnd = comparers.Length - 1; |
| reversed = new int[sortFields.Length]; |
| for (int i = 0; i < sortFields.Length; i++) |
| { |
| SortField sortField = sortFields[i]; |
| |
| // use topNGroups + 1 so we have a spare slot to use for comparing (tracked by this.spareSlot): |
| comparers[i] = sortField.GetComparer(topNGroups + 1, i); |
| reversed[i] = sortField.IsReverse ? -1 : 1; |
| } |
| |
| spareSlot = topNGroups; |
| groupMap = new JCG.Dictionary<TGroupValue, CollectedSearchGroup<TGroupValue>>(topNGroups); |
| } |
| |
| /// <summary> |
| /// Returns top groups, starting from offset. This may |
| /// return null, if no groups were collected, or if the |
| /// number of unique groups collected is <= offset. |
| /// </summary> |
| /// <param name="groupOffset">The offset in the collected groups</param> |
| /// <param name="fillFields">Whether to fill to <see cref="SearchGroup{TGroupValue}.SortValues"/></param> |
| /// <returns>top groups, starting from offset</returns> |
| public virtual IEnumerable<ISearchGroup<TGroupValue>> GetTopGroups(int groupOffset, bool fillFields) |
| { |
| |
| //System.out.println("FP.getTopGroups groupOffset=" + groupOffset + " fillFields=" + fillFields + " groupMap.size()=" + groupMap.size()); |
| |
| if (groupOffset < 0) |
| { |
| throw new ArgumentException("groupOffset must be >= 0 (got " + groupOffset + ")"); |
| } |
| |
| if (groupMap.Count <= groupOffset) |
| { |
| return null; |
| } |
| |
| if (m_orderedGroups == null) |
| { |
| BuildSortedSet(); |
| } |
| |
| ICollection<ISearchGroup<TGroupValue>> result = new List<ISearchGroup<TGroupValue>>(); |
| int upto = 0; |
| int sortFieldCount = groupSort.GetSort().Length; |
| foreach (CollectedSearchGroup<TGroupValue> group in m_orderedGroups) |
| { |
| if (upto++ < groupOffset) |
| { |
| continue; |
| } |
| //System.out.println(" group=" + (group.groupValue == null ? "null" : group.groupValue.utf8ToString())); |
| SearchGroup<TGroupValue> searchGroup = new SearchGroup<TGroupValue>(); |
| searchGroup.GroupValue = group.GroupValue; |
| if (fillFields) |
| { |
| searchGroup.SortValues = new object[sortFieldCount]; |
| for (int sortFieldIDX = 0; sortFieldIDX < sortFieldCount; sortFieldIDX++) |
| { |
| searchGroup.SortValues[sortFieldIDX] = comparers[sortFieldIDX][group.ComparerSlot]; |
| } |
| } |
| result.Add(searchGroup); |
| } |
| //System.out.println(" return " + result.size() + " groups"); |
| return result; |
| } |
| |
| public virtual void SetScorer(Scorer scorer) |
| { |
| foreach (FieldComparer comparer in comparers) |
| { |
| comparer.SetScorer(scorer); |
| } |
| } |
| |
| public virtual void Collect(int doc) |
| { |
| //System.out.println("FP.collect doc=" + doc); |
| |
| // If orderedGroups != null we already have collected N groups and |
| // can short circuit by comparing this document to the bottom group, |
| // without having to find what group this document belongs to. |
| |
| // Even if this document belongs to a group in the top N, we'll know that |
| // we don't have to update that group. |
| |
| // Downside: if the number of unique groups is very low, this is |
| // wasted effort as we will most likely be updating an existing group. |
| if (m_orderedGroups != null) |
| { |
| for (int compIDX = 0; ; compIDX++) |
| { |
| int c = reversed[compIDX] * comparers[compIDX].CompareBottom(doc); |
| if (c < 0) |
| { |
| // Definitely not competitive. So don't even bother to continue |
| return; |
| } |
| else if (c > 0) |
| { |
| // Definitely competitive. |
| break; |
| } |
| else if (compIDX == compIDXEnd) |
| { |
| // Here c=0. If we're at the last comparer, this doc is not |
| // competitive, since docs are visited in doc Id order, which means |
| // this doc cannot compete with any other document in the queue. |
| return; |
| } |
| } |
| } |
| |
| // TODO: should we add option to mean "ignore docs that |
| // don't have the group field" (instead of stuffing them |
| // under null group)? |
| TGroupValue groupValue = GetDocGroupValue(doc); |
| |
| CollectedSearchGroup<TGroupValue> group; |
| if (!groupMap.TryGetValue(groupValue, out group)) |
| { |
| |
| // First time we are seeing this group, or, we've seen |
| // it before but it fell out of the top N and is now |
| // coming back |
| |
| if (groupMap.Count < topNGroups) |
| { |
| |
| // Still in startup transient: we have not |
| // seen enough unique groups to start pruning them; |
| // just keep collecting them |
| |
| // Add a new CollectedSearchGroup: |
| CollectedSearchGroup<TGroupValue> sg = new CollectedSearchGroup<TGroupValue>(); |
| sg.GroupValue = CopyDocGroupValue(groupValue, default(TGroupValue)); |
| sg.ComparerSlot = groupMap.Count; |
| sg.TopDoc = docBase + doc; |
| foreach (FieldComparer fc in comparers) |
| { |
| fc.Copy(sg.ComparerSlot, doc); |
| } |
| groupMap[sg.GroupValue] = sg; |
| |
| if (groupMap.Count == topNGroups) |
| { |
| // End of startup transient: we now have max |
| // number of groups; from here on we will drop |
| // bottom group when we insert new one: |
| BuildSortedSet(); |
| } |
| |
| return; |
| } |
| |
| // We already tested that the document is competitive, so replace |
| // the bottom group with this new group. |
| //CollectedSearchGroup<TGroupValue> bottomGroup = orderedGroups.PollLast(); |
| CollectedSearchGroup<TGroupValue> bottomGroup; |
| lock (m_orderedGroups) |
| { |
| bottomGroup = m_orderedGroups.Last(); |
| m_orderedGroups.Remove(bottomGroup); |
| } |
| if (Debugging.AssertsEnabled) Debugging.Assert(m_orderedGroups.Count == topNGroups - 1); |
| |
| groupMap.Remove(bottomGroup.GroupValue); |
| |
| // reuse the removed CollectedSearchGroup |
| bottomGroup.GroupValue = CopyDocGroupValue(groupValue, bottomGroup.GroupValue); |
| bottomGroup.TopDoc = docBase + doc; |
| |
| foreach (FieldComparer fc in comparers) |
| { |
| fc.Copy(bottomGroup.ComparerSlot, doc); |
| } |
| |
| groupMap[bottomGroup.GroupValue] = bottomGroup; |
| m_orderedGroups.Add(bottomGroup); |
| if (Debugging.AssertsEnabled) Debugging.Assert(m_orderedGroups.Count == topNGroups); |
| |
| int lastComparerSlot = m_orderedGroups.Last().ComparerSlot; |
| foreach (FieldComparer fc in comparers) |
| { |
| fc.SetBottom(lastComparerSlot); |
| } |
| |
| return; |
| } |
| |
| // Update existing group: |
| for (int compIDX = 0; ; compIDX++) |
| { |
| FieldComparer fc = comparers[compIDX]; |
| fc.Copy(spareSlot, doc); |
| |
| int c = reversed[compIDX] * fc.Compare(group.ComparerSlot, spareSlot); |
| if (c < 0) |
| { |
| // Definitely not competitive. |
| return; |
| } |
| else if (c > 0) |
| { |
| // Definitely competitive; set remaining comparers: |
| for (int compIDX2 = compIDX + 1; compIDX2 < comparers.Length; compIDX2++) |
| { |
| comparers[compIDX2].Copy(spareSlot, doc); |
| } |
| break; |
| } |
| else if (compIDX == compIDXEnd) |
| { |
| // Here c=0. If we're at the last comparer, this doc is not |
| // competitive, since docs are visited in doc Id order, which means |
| // this doc cannot compete with any other document in the queue. |
| return; |
| } |
| } |
| |
| // Remove before updating the group since lookup is done via comparers |
| // TODO: optimize this |
| |
| CollectedSearchGroup<TGroupValue> prevLast; |
| if (m_orderedGroups != null) |
| { |
| lock (m_orderedGroups) |
| { |
| prevLast = m_orderedGroups.Last(); |
| m_orderedGroups.Remove(group); |
| } |
| if (Debugging.AssertsEnabled) Debugging.Assert(m_orderedGroups.Count == topNGroups - 1); |
| } |
| else |
| { |
| prevLast = null; |
| } |
| |
| group.TopDoc = docBase + doc; |
| |
| // Swap slots |
| int tmp = spareSlot; |
| spareSlot = group.ComparerSlot; |
| group.ComparerSlot = tmp; |
| |
| // Re-add the changed group |
| if (m_orderedGroups != null) |
| { |
| m_orderedGroups.Add(group); |
| if (Debugging.AssertsEnabled) Debugging.Assert(m_orderedGroups.Count == topNGroups); |
| var newLast = m_orderedGroups.Last(); |
| // If we changed the value of the last group, or changed which group was last, then update bottom: |
| if (group == newLast || prevLast != newLast) |
| { |
| foreach (FieldComparer fc in comparers) |
| { |
| fc.SetBottom(newLast.ComparerSlot); |
| } |
| } |
| } |
| } |
| |
| private class BuildSortedSetComparer : IComparer<ICollectedSearchGroup> |
| { |
| private readonly AbstractFirstPassGroupingCollector<TGroupValue> outerInstance; |
| public BuildSortedSetComparer(AbstractFirstPassGroupingCollector<TGroupValue> outerInstance) |
| { |
| this.outerInstance = outerInstance; |
| } |
| public int Compare(ICollectedSearchGroup o1, ICollectedSearchGroup o2) |
| { |
| for (int compIDX = 0; ; compIDX++) |
| { |
| FieldComparer fc = outerInstance.comparers[compIDX]; |
| int c = outerInstance.reversed[compIDX] * fc.Compare(o1.ComparerSlot, o2.ComparerSlot); |
| if (c != 0) |
| { |
| return c; |
| } |
| else if (compIDX == outerInstance.compIDXEnd) |
| { |
| return o1.TopDoc - o2.TopDoc; |
| } |
| } |
| } |
| } |
| |
| private void BuildSortedSet() |
| { |
| var comparer = new BuildSortedSetComparer(this); |
| m_orderedGroups = new JCG.SortedSet<CollectedSearchGroup<TGroupValue>>(comparer); |
| m_orderedGroups.UnionWith(groupMap.Values); |
| if (Debugging.AssertsEnabled) Debugging.Assert(m_orderedGroups.Count > 0); |
| |
| foreach (FieldComparer fc in comparers) |
| { |
| fc.SetBottom(m_orderedGroups.Last().ComparerSlot); |
| } |
| } |
| |
| public virtual bool AcceptsDocsOutOfOrder => false; |
| |
| public virtual void SetNextReader(AtomicReaderContext context) |
| { |
| docBase = context.DocBase; |
| for (int i = 0; i < comparers.Length; i++) |
| { |
| comparers[i] = comparers[i].SetNextReader(context); |
| } |
| } |
| |
| /// <summary> |
| /// Returns the group value for the specified doc. |
| /// </summary> |
| /// <param name="doc">The specified doc</param> |
| /// <returns>the group value for the specified doc</returns> |
| protected abstract TGroupValue GetDocGroupValue(int doc); |
| |
| /// <summary> |
| /// Returns a copy of the specified group value by creating a new instance and copying the value from the specified |
| /// groupValue in the new instance. Or optionally the reuse argument can be used to copy the group value in. |
| /// </summary> |
| /// <param name="groupValue">The group value to copy</param> |
| /// <param name="reuse">Optionally a reuse instance to prevent a new instance creation</param> |
| /// <returns>a copy of the specified group value</returns> |
| protected abstract TGroupValue CopyDocGroupValue(TGroupValue groupValue, TGroupValue reuse); |
| |
| } |
| |
| /// <summary> |
| /// LUCENENET specific interface used to apply covariance to TGroupValue |
| /// to simulate Java's wildcard generics. |
| /// </summary> |
| /// <typeparam name="TGroupValue"></typeparam> |
| public interface IAbstractFirstPassGroupingCollector<out TGroupValue> : ICollector |
| { |
| /// <summary> |
| /// Returns top groups, starting from offset. This may |
| /// return null, if no groups were collected, or if the |
| /// number of unique groups collected is <= offset. |
| /// </summary> |
| /// <param name="groupOffset">The offset in the collected groups</param> |
| /// <param name="fillFields">Whether to fill to <see cref="SearchGroup{TGroupValue}.SortValues"/></param> |
| /// <returns>top groups, starting from offset</returns> |
| /// <remarks> |
| /// LUCENENET NOTE: We must use <see cref="IEnumerable{TGroupValue}"/> rather than |
| /// <see cref="ICollection{TGroupValue}"/> here because we need this to be covariant |
| /// </remarks> |
| IEnumerable<ISearchGroup<TGroupValue>> GetTopGroups(int groupOffset, bool fillFields); |
| } |
| } |