/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.grouping;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.TreeSet;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.FieldComparator;
import org.apache.lucene.search.LeafFieldComparator;
import org.apache.lucene.search.Scorable;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.SimpleCollector;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
/** FirstPassGroupingCollector is the first of two passes necessary
* to collect grouped hits. This pass gathers the top N sorted
* groups. Groups are defined by a {@link GroupSelector}.
*
* <p>See {@link org.apache.lucene.search.grouping} for more
* details including a full code example.</p>
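*
* <p>A minimal sketch of the first pass (it assumes an {@code IndexSearcher searcher}, a
* {@code Query query}, and a {@code "category"} string field; {@link TermGroupSelector}
* is one possible {@link GroupSelector}):</p>
*
* <pre class="prettyprint">
*   GroupSelector&lt;BytesRef&gt; selector = new TermGroupSelector("category");
*   FirstPassGroupingCollector&lt;BytesRef&gt; firstPass =
*       new FirstPassGroupingCollector&lt;&gt;(selector, Sort.RELEVANCE, 10);
*   searcher.search(query, firstPass);
*   Collection&lt;SearchGroup&lt;BytesRef&gt;&gt; topGroups = firstPass.getTopGroups(0);
* </pre>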
*
* @lucene.experimental
*/
public class FirstPassGroupingCollector<T> extends SimpleCollector {
private final GroupSelector<T> groupSelector;
private final FieldComparator<?>[] comparators;
private final LeafFieldComparator[] leafComparators;
private final int[] reversed;
private final int topNGroups;
private final boolean needsScores;
private final HashMap<T, CollectedSearchGroup<T>> groupMap;
private final int compIDXEnd;
// Set once we reach topNGroups unique groups:
/** @lucene.internal */
protected TreeSet<CollectedSearchGroup<T>> orderedGroups;
private int docBase;
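// Extra comparator slot used to stage an incoming doc's sort values when checking
// whether it beats the group it already belongs to; swapped with that group's slot
// when the group is updated: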
private int spareSlot;
/**
* Create the first pass collector.
*
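* <p>For example (a sketch, assuming an indexed numeric {@code "price"} field and a
* {@link TermGroupSelector} over a {@code "category"} field), groups can be ordered
* by a field value rather than by relevance:</p>
*
* <pre class="prettyprint">
*   Sort groupSort = new Sort(new SortField("price", SortField.Type.LONG));
*   FirstPassGroupingCollector&lt;BytesRef&gt; collector =
*       new FirstPassGroupingCollector&lt;&gt;(new TermGroupSelector("category"), groupSort, 10);
* </pre>
*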
* @param groupSelector a GroupSelector used to define groups
* @param groupSort The {@link Sort} used to sort the
* groups. The top sorted document within each group
* according to groupSort determines how that group
* sorts against other groups. This must be non-null,
* i.e., if you want to sort groups by relevance use
* Sort.RELEVANCE.
* @param topNGroups How many top groups to keep.
*/
@SuppressWarnings({"unchecked", "rawtypes"})
public FirstPassGroupingCollector(GroupSelector<T> groupSelector, Sort groupSort, int topNGroups) {
this.groupSelector = groupSelector;
if (topNGroups < 1) {
throw new IllegalArgumentException("topNGroups must be >= 1 (got " + topNGroups + ")");
}
// TODO: allow null groupSort to mean "by relevance",
// and specialize it?
this.topNGroups = topNGroups;
this.needsScores = groupSort.needsScores();
final SortField[] sortFields = groupSort.getSort();
comparators = new FieldComparator[sortFields.length];
leafComparators = new LeafFieldComparator[sortFields.length];
compIDXEnd = comparators.length - 1;
reversed = new int[sortFields.length];
for (int i = 0; i < sortFields.length; i++) {
final SortField sortField = sortFields[i];
// use topNGroups + 1 so we have a spare slot to use for comparing (tracked by this.spareSlot):
comparators[i] = sortField.getComparator(topNGroups + 1, i);
reversed[i] = sortField.getReverse() ? -1 : 1;
}
spareSlot = topNGroups;
groupMap = new HashMap<>(topNGroups);
}
@Override
public ScoreMode scoreMode() {
return needsScores ? ScoreMode.COMPLETE : ScoreMode.COMPLETE_NO_SCORES;
}
/**
* Returns top groups, starting from offset. This may
* return null if no groups were collected, or if the
* number of unique groups collected is &lt;= offset.
*
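* <p>A minimal sketch of consuming the result (the null check matters when fewer
* than {@code groupOffset + 1} unique groups were collected):</p>
*
* <pre class="prettyprint">
*   Collection&lt;SearchGroup&lt;T&gt;&gt; groups = collector.getTopGroups(0);
*   if (groups != null) {
*     // hand the groups to the second collection pass
*   }
* </pre>
*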
* @param groupOffset The offset in the collected groups
* @return top groups, starting from offset
*/
public Collection<SearchGroup<T>> getTopGroups(int groupOffset) throws IOException {
//System.out.println("FP.getTopGroups groupOffset=" + groupOffset + " fillFields=" + fillFields + " groupMap.size()=" + groupMap.size());
if (groupOffset < 0) {
throw new IllegalArgumentException("groupOffset must be >= 0 (got " + groupOffset + ")");
}
if (groupMap.size() <= groupOffset) {
return null;
}
if (orderedGroups == null) {
buildSortedSet();
}
final Collection<SearchGroup<T>> result = new ArrayList<>();
int upto = 0;
final int sortFieldCount = comparators.length;
for (CollectedSearchGroup<T> group : orderedGroups) {
if (upto++ < groupOffset) {
continue;
}
// System.out.println(" group=" + (group.groupValue == null ? "null" : group.groupValue.toString()));
SearchGroup<T> searchGroup = new SearchGroup<>();
searchGroup.groupValue = group.groupValue;
searchGroup.sortValues = new Object[sortFieldCount];
for (int sortFieldIDX = 0; sortFieldIDX < sortFieldCount; sortFieldIDX++) {
searchGroup.sortValues[sortFieldIDX] = comparators[sortFieldIDX].value(group.comparatorSlot);
}
result.add(searchGroup);
}
//System.out.println(" return " + result.size() + " groups");
return result;
}
@Override
public void setScorer(Scorable scorer) throws IOException {
groupSelector.setScorer(scorer);
for (LeafFieldComparator comparator : leafComparators) {
comparator.setScorer(scorer);
}
}
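/**
 * Returns true if this doc could still change the top N groups: either we have not
 * yet seen topNGroups unique groups, or the doc sorts strictly before the current
 * bottom group. Ties with the bottom are not competitive because docs are visited
 * in docID order.
 */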
private boolean isCompetitive(int doc) throws IOException {
// If orderedGroups != null we already have collected N groups and
// can short circuit by comparing this document to the bottom group,
// without having to find what group this document belongs to.
// Even if this document belongs to a group in the top N, we'll know that
// we don't have to update that group.
// Downside: if the number of unique groups is very low, this is
// wasted effort as we will most likely be updating an existing group.
if (orderedGroups != null) {
for (int compIDX = 0;; compIDX++) {
final int c = reversed[compIDX] * leafComparators[compIDX].compareBottom(doc);
if (c < 0) {
// Definitely not competitive. So don't even bother to continue
return false;
} else if (c > 0) {
// Definitely competitive.
break;
} else if (compIDX == compIDXEnd) {
// Here c=0. If we're at the last comparator, this doc is not
// competitive, since docs are visited in doc Id order, which means
// this doc cannot compete with any other document in the queue.
return false;
}
}
}
return true;
}
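// Collects one doc: skip it if it cannot change the top N; otherwise either start a
// new group for it, replace the current bottom group, or update the group it already
// belongs to.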
@Override
public void collect(int doc) throws IOException {
if (isCompetitive(doc) == false)
return;
// TODO: should we add option to mean "ignore docs that
// don't have the group field" (instead of stuffing them
// under null group)?
groupSelector.advanceTo(doc);
T groupValue = groupSelector.currentValue();
final CollectedSearchGroup<T> group = groupMap.get(groupValue);
if (group == null) {
// First time we are seeing this group, or, we've seen
// it before but it fell out of the top N and is now
// coming back
if (groupMap.size() < topNGroups) {
// Still in startup transient: we have not
// seen enough unique groups to start pruning them;
// just keep collecting them
// Add a new CollectedSearchGroup:
CollectedSearchGroup<T> sg = new CollectedSearchGroup<>();
sg.groupValue = groupSelector.copyValue();
sg.comparatorSlot = groupMap.size();
sg.topDoc = docBase + doc;
for (LeafFieldComparator fc : leafComparators) {
fc.copy(sg.comparatorSlot, doc);
}
groupMap.put(sg.groupValue, sg);
if (groupMap.size() == topNGroups) {
// End of startup transient: we now have max
// number of groups; from here on we will drop
// bottom group when we insert new one:
buildSortedSet();
}
return;
}
// We already tested that the document is competitive, so replace
// the bottom group with this new group.
final CollectedSearchGroup<T> bottomGroup = orderedGroups.pollLast();
assert orderedGroups.size() == topNGroups - 1;
groupMap.remove(bottomGroup.groupValue);
// reuse the removed CollectedSearchGroup
bottomGroup.groupValue = groupSelector.copyValue();
bottomGroup.topDoc = docBase + doc;
for (LeafFieldComparator fc : leafComparators) {
fc.copy(bottomGroup.comparatorSlot, doc);
}
groupMap.put(bottomGroup.groupValue, bottomGroup);
orderedGroups.add(bottomGroup);
assert orderedGroups.size() == topNGroups;
final int lastComparatorSlot = orderedGroups.last().comparatorSlot;
for (LeafFieldComparator fc : leafComparators) {
fc.setBottom(lastComparatorSlot);
}
return;
}
// Update existing group:
for (int compIDX = 0;; compIDX++) {
leafComparators[compIDX].copy(spareSlot, doc);
final int c = reversed[compIDX] * comparators[compIDX].compare(group.comparatorSlot, spareSlot);
if (c < 0) {
// Definitely not competitive.
return;
} else if (c > 0) {
// Definitely competitive; set remaining comparators:
for (int compIDX2 = compIDX + 1; compIDX2 < comparators.length; compIDX2++) {
leafComparators[compIDX2].copy(spareSlot, doc);
}
break;
} else if (compIDX == compIDXEnd) {
// Here c=0. If we're at the last comparator, this doc is not
// competitive, since docs are visited in doc Id order, which means
// this doc cannot compete with any other document in the queue.
return;
}
}
// Remove before updating the group since lookup is done via comparators
// TODO: optimize this
final CollectedSearchGroup<T> prevLast;
if (orderedGroups != null) {
prevLast = orderedGroups.last();
orderedGroups.remove(group);
assert orderedGroups.size() == topNGroups - 1;
} else {
prevLast = null;
}
group.topDoc = docBase + doc;
// Swap slots
final int tmp = spareSlot;
spareSlot = group.comparatorSlot;
group.comparatorSlot = tmp;
// Re-add the changed group
if (orderedGroups != null) {
orderedGroups.add(group);
assert orderedGroups.size() == topNGroups;
final CollectedSearchGroup<?> newLast = orderedGroups.last();
// If we changed the value of the last group, or changed which group was last, then update bottom:
if (group == newLast || prevLast != newLast) {
for (LeafFieldComparator fc : leafComparators) {
fc.setBottom(newLast.comparatorSlot);
}
}
}
}
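/**
 * Lazily builds {@link #orderedGroups}: a TreeSet over all collected groups, ordered
 * by the group sort with topDoc as the tie-break, and initializes each leaf
 * comparator's bottom to the last (weakest) group.
 */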
private void buildSortedSet() throws IOException {
final Comparator<CollectedSearchGroup<?>> comparator = new Comparator<CollectedSearchGroup<?>>() {
@Override
public int compare(CollectedSearchGroup<?> o1, CollectedSearchGroup<?> o2) {
for (int compIDX = 0;; compIDX++) {
FieldComparator<?> fc = comparators[compIDX];
final int c = reversed[compIDX] * fc.compare(o1.comparatorSlot, o2.comparatorSlot);
if (c != 0) {
return c;
} else if (compIDX == compIDXEnd) {
return o1.topDoc - o2.topDoc;
}
}
}
};
orderedGroups = new TreeSet<>(comparator);
orderedGroups.addAll(groupMap.values());
assert orderedGroups.size() > 0;
for (LeafFieldComparator fc : leafComparators) {
fc.setBottom(orderedGroups.last().comparatorSlot);
}
}
@Override
protected void doSetNextReader(LeafReaderContext readerContext) throws IOException {
docBase = readerContext.docBase;
for (int i = 0; i < comparators.length; i++) {
leafComparators[i] = comparators[i].getLeafComparator(readerContext);
}
groupSelector.setNextReader(readerContext);
}
/**
* @return the GroupSelector used for this Collector
*/
public GroupSelector<T> getGroupSelector() {
return groupSelector;
}
}