blob: 5a5db8475d5e082a92808334f2ba384639bdccd2 [file] [log] [blame]
// Lucene version compatibility level 4.8.1
using Lucene.Net.Diagnostics;
using Lucene.Net.Index;
using Lucene.Net.Search;
using Lucene.Net.Search.Grouping;
using Lucene.Net.Support;
using Lucene.Net.Util;
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.IO;
namespace Lucene.Net.Join
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Collects parent document hits for a <see cref="Query"/> containing one more more
/// BlockJoinQuery clauses, sorted by the
/// specified parent <see cref="Sort"/>. Note that this cannot perform
/// arbitrary joins; rather, it requires that all joined
/// documents are indexed as a doc block (using
/// <see cref="IndexWriter.AddDocuments(IEnumerable{IEnumerable{IIndexableField}}, Analysis.Analyzer)"/>
/// or <see cref="IndexWriter.UpdateDocuments(Term, IEnumerable{IEnumerable{IIndexableField}}, Analysis.Analyzer)"/>.
/// Ie, the join is computed
/// at index time.
///
/// <para>The parent <see cref="Sort"/> must only use
/// fields from the parent documents; sorting by field in
/// the child documents is not supported.</para>
///
/// <para>You should only use this
/// collector if one or more of the clauses in the query is
/// a <see cref="ToParentBlockJoinQuery"/>. This collector will find those query
/// clauses and record the matching child documents for the
/// top scoring parent documents.</para>
///
/// <para>Multiple joins (star join) and nested joins and a mix
/// of the two are allowed, as long as in all cases the
/// documents corresponding to a single row of each joined
/// parent table were indexed as a doc block.</para>
///
/// <para>For the simple star join you can retrieve the
/// <see cref="ITopGroups{T}"/> instance containing each <see cref="ToParentBlockJoinQuery"/>'s
/// matching child documents for the top parent groups,
/// using <see cref="GetTopGroups"/>. Ie,
/// a single query, which will contain two or more
/// <see cref="ToParentBlockJoinQuery"/>'s as clauses representing the star join,
/// can then retrieve two or more <see cref="ITopGroups{T}"/> instances.</para>
///
/// <para>For nested joins, the query will run correctly (ie,
/// match the right parent and child documents), however,
/// because <see cref="TopGroups{T}"/> is currently unable to support nesting
/// (each group is not able to hold another <see cref="TopGroups{T}"/>), you
/// are only able to retrieve the <see cref="TopGroups{T}"/> of the first
/// join. The <see cref="TopGroups{T}"/> of the nested joins will not be
/// correct.</para>
///
/// See <a href="http://lucene.apache.org/core/4_8_0/join/">http://lucene.apache.org/core/4_8_0/join/</a> for a code
/// sample.
///
/// @lucene.experimental
/// </summary>
public class ToParentBlockJoinCollector : ICollector
{
private readonly Sort sort;
// Maps each BlockJoinQuery instance to its "slot" in
// joinScorers and in OneGroup's cached doc/scores/count:
private readonly IDictionary<Query, int?> joinQueryID = new Dictionary<Query, int?>();
private readonly int numParentHits;
private readonly FieldValueHitQueue<OneGroup> queue;
private readonly FieldComparer[] comparers;
private readonly int[] reverseMul;
private readonly int compEnd;
private readonly bool trackMaxScore;
private readonly bool trackScores;
private int docBase;
private ToParentBlockJoinQuery.BlockJoinScorer[] joinScorers = Arrays.Empty<ToParentBlockJoinQuery.BlockJoinScorer>();
private AtomicReaderContext currentReaderContext;
private Scorer scorer;
private bool queueFull;
private OneGroup bottom;
private int totalHitCount;
private float maxScore = float.NaN;
/// <summary>
/// Creates a <see cref="ToParentBlockJoinCollector"/>. The provided <paramref name="sort"/> must
/// not be null. If you pass true <paramref name="trackScores"/>, all
/// ToParentBlockQuery instances must not use
/// <see cref="ScoreMode.None"/>.
/// </summary>
public ToParentBlockJoinCollector(Sort sort, int numParentHits, bool trackScores, bool trackMaxScore)
{
// TODO: allow null sort to be specialized to relevance
// only collector
this.sort = sort;
this.trackMaxScore = trackMaxScore;
if (trackMaxScore)
{
maxScore = float.MinValue;
}
//System.out.println("numParentHits=" + numParentHits);
this.trackScores = trackScores;
this.numParentHits = numParentHits;
queue = FieldValueHitQueue.Create<OneGroup>(sort.GetSort(), numParentHits);
comparers = queue.Comparers;
reverseMul = queue.ReverseMul;
compEnd = comparers.Length - 1;
}
private sealed class OneGroup : FieldValueHitQueue.Entry
{
public OneGroup(int comparerSlot, int parentDoc, float parentScore, int numJoins, bool doScores)
: base(comparerSlot, parentDoc, parentScore)
{
//System.out.println("make OneGroup parentDoc=" + parentDoc);
docs = new int[numJoins][];
for (int joinId = 0; joinId < numJoins; joinId++)
{
docs[joinId] = new int[5];
}
if (doScores)
{
scores = new float[numJoins][];
for (int joinId = 0; joinId < numJoins; joinId++)
{
scores[joinId] = new float[5];
}
}
counts = new int[numJoins];
}
internal AtomicReaderContext readerContext;
internal int[][] docs;
internal float[][] scores;
internal int[] counts;
}
public virtual void Collect(int parentDoc)
{
//System.out.println("\nC parentDoc=" + parentDoc);
totalHitCount++;
float score = float.NaN;
if (trackMaxScore)
{
score = scorer.GetScore();
maxScore = Math.Max(maxScore, score);
}
// TODO: we could sweep all joinScorers here and
// aggregate total child hit count, so we can fill this
// in getTopGroups (we wire it to 0 now)
if (queueFull)
{
//System.out.println(" queueFull");
// Fastmatch: return if this hit is not competitive
for (int i = 0; ; i++)
{
int c = reverseMul[i] * comparers[i].CompareBottom(parentDoc);
if (c < 0)
{
// Definitely not competitive.
//System.out.println(" skip");
return;
}
if (c > 0)
{
// Definitely competitive.
break;
}
if (i == compEnd)
{
// Here c=0. If we're at the last comparer, this doc is not
// competitive, since docs are visited in doc Id order, which means
// this doc cannot compete with any other document in the queue.
//System.out.println(" skip");
return;
}
}
//System.out.println(" competes! doc=" + (docBase + parentDoc));
// This hit is competitive - replace bottom element in queue & adjustTop
for (int i = 0; i < comparers.Length; i++)
{
comparers[i].Copy(bottom.Slot, parentDoc);
}
if (!trackMaxScore && trackScores)
{
score = scorer.GetScore();
}
bottom.Doc = docBase + parentDoc;
bottom.readerContext = currentReaderContext;
bottom.Score = score;
CopyGroups(bottom);
bottom = queue.UpdateTop();
for (int i = 0; i < comparers.Length; i++)
{
comparers[i].SetBottom(bottom.Slot);
}
}
else
{
// Startup transient: queue is not yet full:
int comparerSlot = totalHitCount - 1;
// Copy hit into queue
for (int i = 0; i < comparers.Length; i++)
{
comparers[i].Copy(comparerSlot, parentDoc);
}
//System.out.println(" startup: new OG doc=" + (docBase+parentDoc));
if (!trackMaxScore && trackScores)
{
score = scorer.GetScore();
}
OneGroup og = new OneGroup(comparerSlot, docBase + parentDoc, score, joinScorers.Length, trackScores);
og.readerContext = currentReaderContext;
CopyGroups(og);
bottom = queue.Add(og);
queueFull = totalHitCount == numParentHits;
if (queueFull)
{
// End of startup transient: queue just filled up:
for (int i = 0; i < comparers.Length; i++)
{
comparers[i].SetBottom(bottom.Slot);
}
}
}
}
// Pulls out child doc and scores for all join queries:
private void CopyGroups(OneGroup og)
{
// While rare, it's possible top arrays could be too
// short if join query had null scorer on first
// segment(s) but then became non-null on later segments
int numSubScorers = joinScorers.Length;
if (og.docs.Length < numSubScorers)
{
// While rare, this could happen if join query had
// null scorer on first segment(s) but then became
// non-null on later segments
og.docs = ArrayUtil.Grow(og.docs);
}
if (og.counts.Length < numSubScorers)
{
og.counts = ArrayUtil.Grow(og.counts);
}
if (trackScores && og.scores.Length < numSubScorers)
{
og.scores = ArrayUtil.Grow(og.scores);
}
//System.out.println("\ncopyGroups parentDoc=" + og.doc);
for (int scorerIDX = 0; scorerIDX < numSubScorers; scorerIDX++)
{
ToParentBlockJoinQuery.BlockJoinScorer joinScorer = joinScorers[scorerIDX];
//System.out.println(" scorer=" + joinScorer);
if (joinScorer != null && docBase + joinScorer.ParentDoc == og.Doc)
{
og.counts[scorerIDX] = joinScorer.ChildCount;
//System.out.println(" count=" + og.counts[scorerIDX]);
og.docs[scorerIDX] = joinScorer.SwapChildDocs(og.docs[scorerIDX]);
if (Debugging.AssertsEnabled) Debugging.Assert(og.docs[scorerIDX].Length >= og.counts[scorerIDX], "length={0} vs count={1}", og.docs[scorerIDX].Length, og.counts[scorerIDX]);
//System.out.println(" len=" + og.docs[scorerIDX].length);
/*
for(int idx=0;idx<og.counts[scorerIDX];idx++) {
System.out.println(" docs[" + idx + "]=" + og.docs[scorerIDX][idx]);
}
*/
if (trackScores)
{
//System.out.println(" copy scores");
og.scores[scorerIDX] = joinScorer.SwapChildScores(og.scores[scorerIDX]);
if (Debugging.AssertsEnabled) Debugging.Assert(og.scores[scorerIDX].Length >= og.counts[scorerIDX], "length={0} vs count={1}", og.scores[scorerIDX].Length, og.counts[scorerIDX]);
}
}
else
{
og.counts[scorerIDX] = 0;
}
}
}
public virtual void SetNextReader(AtomicReaderContext context)
{
currentReaderContext = context;
docBase = context.DocBase;
for (int compIDX = 0; compIDX < comparers.Length; compIDX++)
{
queue.SetComparer(compIDX, comparers[compIDX].SetNextReader(context));
}
}
public virtual bool AcceptsDocsOutOfOrder => false;
private void Enroll(ToParentBlockJoinQuery query, ToParentBlockJoinQuery.BlockJoinScorer scorer)
{
scorer.TrackPendingChildHits();
if (joinQueryID.TryGetValue(query, out int? slot))
{
joinScorers[(int)slot] = scorer;
}
else
{
joinQueryID[query] = joinScorers.Length;
//System.out.println("found JQ: " + query + " slot=" + joinScorers.length);
ToParentBlockJoinQuery.BlockJoinScorer[] newArray = new ToParentBlockJoinQuery.BlockJoinScorer[1 + joinScorers.Length];
Array.Copy(joinScorers, 0, newArray, 0, joinScorers.Length);
joinScorers = newArray;
joinScorers[joinScorers.Length - 1] = scorer;
}
}
public virtual void SetScorer(Scorer scorer)
{
//System.out.println("C.setScorer scorer=" + value);
// Since we invoke .score(), and the comparers likely
// do as well, cache it so it's only "really" computed
// once:
this.scorer = new ScoreCachingWrappingScorer(scorer);
for (int compIdx = 0; compIdx < comparers.Length; compIdx++)
{
comparers[compIdx].SetScorer(scorer);
}
Arrays.Fill(joinScorers, null);
var queue2 = new ConcurrentQueue<Scorer>();
//System.out.println("\nqueue: add top scorer=" + value);
queue2.Enqueue(scorer);
while (queue2.TryDequeue(out scorer))
{
//System.out.println(" poll: " + value + "; " + value.getWeight().getQuery());
if (scorer is ToParentBlockJoinQuery.BlockJoinScorer blockJoinScorer)
{
Enroll((ToParentBlockJoinQuery)scorer.Weight.Query, blockJoinScorer);
}
foreach (Scorer.ChildScorer sub in scorer.GetChildren())
{
//System.out.println(" add sub: " + sub.child + "; " + sub.child.getWeight().getQuery());
queue2.Enqueue(sub.Child);
}
}
}
private OneGroup[] sortedGroups;
private void SortQueue()
{
sortedGroups = new OneGroup[queue.Count];
for (int downTo = queue.Count - 1; downTo >= 0; downTo--)
{
sortedGroups[downTo] = queue.Pop();
}
}
/// <summary>
/// Returns the <see cref="ITopGroups{T}"/> for the specified
/// BlockJoinQuery. The groupValue of each GroupDocs will
/// be the parent docID for that group.
/// The number of documents within each group is calculated as minimum of <paramref name="maxDocsPerGroup"/>
/// and number of matched child documents for that group.
/// Returns <c>null</c> if no groups matched.
/// </summary>
/// <param name="query"> Search query </param>
/// <param name="withinGroupSort"> Sort criteria within groups </param>
/// <param name="offset"> Parent docs offset </param>
/// <param name="maxDocsPerGroup"> Upper bound of documents per group number </param>
/// <param name="withinGroupOffset"> Offset within each group of child docs </param>
/// <param name="fillSortFields"> Specifies whether to add sort fields or not </param>
/// <returns> <see cref="ITopGroups{T}"/> for specified query </returns>
/// <exception cref="IOException"> if there is a low-level I/O error </exception>
public virtual ITopGroups<int> GetTopGroups(ToParentBlockJoinQuery query, Sort withinGroupSort, int offset, int maxDocsPerGroup, int withinGroupOffset, bool fillSortFields)
{
if (!joinQueryID.TryGetValue(query, out int? slot))
{
if (totalHitCount == 0)
{
return null;
}
}
if (sortedGroups == null)
{
if (offset >= queue.Count)
{
return null;
}
SortQueue();
}
else if (offset > sortedGroups.Length)
{
return null;
}
return AccumulateGroups(slot == null ? -1 : (int)slot, offset, maxDocsPerGroup, withinGroupOffset, withinGroupSort, fillSortFields);
}
/// <summary>
/// Accumulates groups for the BlockJoinQuery specified by its slot.
/// </summary>
/// <param name="slot"> Search query's slot </param>
/// <param name="offset"> Parent docs offset </param>
/// <param name="maxDocsPerGroup"> Upper bound of documents per group number </param>
/// <param name="withinGroupOffset"> Offset within each group of child docs </param>
/// <param name="withinGroupSort"> Sort criteria within groups </param>
/// <param name="fillSortFields"> Specifies whether to add sort fields or not </param>
/// <returns> <see cref="ITopGroups{T}"/> for the query specified by slot </returns>
/// <exception cref="IOException"> if there is a low-level I/O error </exception>
private ITopGroups<int> AccumulateGroups(int slot, int offset, int maxDocsPerGroup, int withinGroupOffset, Sort withinGroupSort, bool fillSortFields)
{
var groups = new GroupDocs<int>[sortedGroups.Length - offset];
var fakeScorer = new FakeScorer();
int totalGroupedHitCount = 0;
//System.out.println("slot=" + slot);
for (int groupIdx = offset; groupIdx < sortedGroups.Length; groupIdx++)
{
OneGroup og = sortedGroups[groupIdx];
int numChildDocs;
if (slot == -1 || slot >= og.counts.Length)
{
numChildDocs = 0;
}
else
{
numChildDocs = og.counts[slot];
}
// Number of documents in group should be bounded to prevent redundant memory allocation
int numDocsInGroup = Math.Max(1, Math.Min(numChildDocs, maxDocsPerGroup));
//System.out.println("parent doc=" + og.doc + " numChildDocs=" + numChildDocs + " maxDocsPG=" + maxDocsPerGroup);
// At this point we hold all docs w/ in each group, unsorted; we now sort them:
ICollector collector;
if (withinGroupSort == null)
{
//System.out.println("sort by score");
// Sort by score
if (!trackScores)
{
throw new ArgumentException("cannot sort by relevance within group: trackScores=false");
}
collector = TopScoreDocCollector.Create(numDocsInGroup, true);
}
else
{
// Sort by fields
collector = TopFieldCollector.Create(withinGroupSort, numDocsInGroup, fillSortFields, trackScores, trackMaxScore, true);
}
collector.SetScorer(fakeScorer);
collector.SetNextReader(og.readerContext);
for (int docIdx = 0; docIdx < numChildDocs; docIdx++)
{
//System.out.println("docIDX=" + docIDX + " vs " + og.docs[slot].length);
int doc = og.docs[slot][docIdx];
fakeScorer.doc = doc;
if (trackScores)
{
fakeScorer._score = og.scores[slot][docIdx];
}
collector.Collect(doc);
}
totalGroupedHitCount += numChildDocs;
object[] groupSortValues;
if (fillSortFields)
{
groupSortValues = new object[comparers.Length];
for (int sortFieldIdx = 0; sortFieldIdx < comparers.Length; sortFieldIdx++)
{
groupSortValues[sortFieldIdx] = comparers[sortFieldIdx][og.Slot];
}
}
else
{
groupSortValues = null;
}
TopDocs topDocs;
if (withinGroupSort == null)
{
var tempCollector = (TopScoreDocCollector) collector;
topDocs = tempCollector.GetTopDocs(withinGroupOffset, numDocsInGroup);
}
else
{
var tempCollector = (TopFieldCollector) collector;
topDocs = tempCollector.GetTopDocs(withinGroupOffset, numDocsInGroup);
}
groups[groupIdx - offset] = new GroupDocs<int>(og.Score, topDocs.MaxScore, numChildDocs, topDocs.ScoreDocs, og.Doc, groupSortValues);
}
return new TopGroups<int>(new TopGroups<int>(sort.GetSort(), withinGroupSort?.GetSort(), 0, totalGroupedHitCount, groups, maxScore), totalHitCount);
}
/// <summary>
/// Returns the <see cref="TopGroups{T}"/> for the specified BlockJoinQuery. The groupValue of each
/// GroupDocs will be the parent docID for that group. The number of documents within
/// each group equals to the total number of matched child documents for that group.
/// Returns <c>null</c> if no groups matched.
/// </summary>
/// <param name="query">Search query</param>
/// <param name="withinGroupSort">Sort criteria within groups</param>
/// <param name="offset">Parent docs offset</param>
/// <param name="withinGroupOffset">Offset within each group of child docs</param>
/// <param name="fillSortFields">Specifies whether to add sort fields or not</param>
/// <returns><see cref="ITopGroups{T}"/> for specified query</returns>
/// <exception cref="IOException"> if there is a low-level I/O error </exception>
public virtual ITopGroups<int> GetTopGroupsWithAllChildDocs(ToParentBlockJoinQuery query, Sort withinGroupSort, int offset, int withinGroupOffset, bool fillSortFields)
{
return GetTopGroups(query, withinGroupSort, offset, int.MaxValue, withinGroupOffset, fillSortFields);
}
/// <summary>
/// Returns the highest score across all collected parent hits, as long as
/// <c>trackMaxScores=true</c> was passed
/// <see cref="ToParentBlockJoinCollector(Sort, int, bool, bool)"/> on
/// construction. Else, this returns <see cref="float.NaN"/>.
/// </summary>
public virtual float MaxScore => maxScore;
}
}