blob: 5228e6dd9102a7a0b2137b4ee5eec323236cead5 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Stack;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.DiversifiedTopDocsCollector.ScoreDocKey;
import org.apache.lucene.util.PriorityQueue;
/**
* A {@link TopDocsCollector} that controls diversity in results by ensuring no
* more than maxHitsPerKey results from a common source are collected in the
* final results.
*
* An example application might be a product search in a marketplace where no
* more than 3 results per retailer are permitted in search results.
*
* <p>
* To compare behaviour with other forms of collector, a useful analogy might be
* the problem of making a compilation album of 1967's top hit records:
* <ol>
* <li>A vanilla query's results might look like a "Best of the Beatles" album -
* high quality but not much diversity</li>
* <li>A GroupingSearch would produce the equivalent of "The 10 top-selling
* artists of 1967 - some killer and quite a lot of filler"</li>
* <li>A "diversified" query would be the top 20 hit records of that year - with
* a max of 3 Beatles hits in order to maintain diversity</li>
* </ol>
* This collector improves on the "GroupingSearch" type queries by
* <ul>
* <li>Working in one pass over the data</li>
* <li>Not requiring the client to guess how many groups are required</li>
* <li>Removing low-scoring "filler" which sits at the end of each group's hits</li>
* </ul>
*
* This is an abstract class and subclasses have to provide a source of keys for
* documents which is then used to help identify duplicate sources.
*
* @lucene.experimental
*
*/
public abstract class DiversifiedTopDocsCollector extends
TopDocsCollector<ScoreDocKey> {
ScoreDocKey spare;
private ScoreDocKeyQueue globalQueue;
private int numHits;
private Map<Long, ScoreDocKeyQueue> perKeyQueues;
protected int maxNumPerKey;
private Stack<ScoreDocKeyQueue> sparePerKeyQueues = new Stack<>();
public DiversifiedTopDocsCollector(int numHits, int maxHitsPerKey) {
super(new ScoreDocKeyQueue(numHits));
// Need to access pq.lessThan() which is protected so have to cast here...
this.globalQueue = (ScoreDocKeyQueue) pq;
perKeyQueues = new HashMap<Long, ScoreDocKeyQueue>();
this.numHits = numHits;
this.maxNumPerKey = maxHitsPerKey;
}
/**
* Get a source of values used for grouping keys
*/
protected abstract NumericDocValues getKeys(LeafReaderContext context);
@Override
public ScoreMode scoreMode() {
return ScoreMode.COMPLETE;
}
@Override
protected TopDocs newTopDocs(ScoreDoc[] results, int start) {
if (results == null) {
return EMPTY_TOPDOCS;
}
return new TopDocs(new TotalHits(totalHits, TotalHits.Relation.EQUAL_TO), results);
}
protected ScoreDocKey insert(ScoreDocKey addition, int docBase,
NumericDocValues keys) throws IOException {
if ((globalQueue.size() >= numHits)
&& (globalQueue.lessThan(addition, globalQueue.top()))) {
// Queue is full and proposed addition is not a globally
// competitive score
return addition;
}
// The addition stands a chance of being entered - check the
// key-specific restrictions.
// We delay fetching the key until we are certain the score is globally
// competitive. We need to adjust the ScoreDoc's global doc value to be
// a leaf reader value when looking up keys
int leafDocID = addition.doc - docBase;
long value;
if (keys.advanceExact(leafDocID)) {
value = keys.longValue();
} else {
value = 0;
}
addition.key = value;
// For this to work the choice of key class needs to implement
// hashcode and equals.
ScoreDocKeyQueue thisKeyQ = perKeyQueues.get(addition.key);
if (thisKeyQ == null) {
if (sparePerKeyQueues.size() == 0) {
thisKeyQ = new ScoreDocKeyQueue(maxNumPerKey);
} else {
thisKeyQ = sparePerKeyQueues.pop();
}
perKeyQueues.put(addition.key, thisKeyQ);
}
ScoreDocKey perKeyOverflow = thisKeyQ.insertWithOverflow(addition);
if (perKeyOverflow == addition) {
// This key group has reached capacity and our proposed addition
// was not competitive in the group - do not insert into the
// main PQ or the key will be overly-populated in final results.
return addition;
}
if (perKeyOverflow == null) {
// This proposed addition is also locally competitive within the
// key group - make a global entry and return
ScoreDocKey globalOverflow = globalQueue.insertWithOverflow(addition);
perKeyGroupRemove(globalOverflow);
return globalOverflow;
}
// For the given key, we have reached max capacity but the new addition
// is better than a prior entry that still exists in the global results
// - request the weaker-scoring entry to be removed from the global
// queue.
globalQueue.remove(perKeyOverflow);
// Add the locally-competitive addition into the globally queue
globalQueue.add(addition);
return perKeyOverflow;
}
private void perKeyGroupRemove(ScoreDocKey globalOverflow) {
if (globalOverflow == null) {
return;
}
ScoreDocKeyQueue q = perKeyQueues.get(globalOverflow.key);
ScoreDocKey perKeyLowest = q.pop();
// The least globally-competitive item should also always be the least
// key-local item
assert (globalOverflow == perKeyLowest);
if (q.size() == 0) {
perKeyQueues.remove(globalOverflow.key);
sparePerKeyQueues.push(q);
}
}
@Override
public LeafCollector getLeafCollector(LeafReaderContext context)
throws IOException {
final int base = context.docBase;
final NumericDocValues keySource = getKeys(context);
return new LeafCollector() {
Scorable scorer;
@Override
public void setScorer(Scorable scorer) throws IOException {
this.scorer = scorer;
}
@Override
public void collect(int doc) throws IOException {
float score = scorer.score();
// This collector cannot handle NaN
assert !Float.isNaN(score);
totalHits++;
doc += base;
if (spare == null) {
spare = new ScoreDocKey(doc, score);
} else {
spare.doc = doc;
spare.score = score;
}
spare = insert(spare, base, keySource);
}
};
}
static class ScoreDocKeyQueue extends PriorityQueue<ScoreDocKey> {
ScoreDocKeyQueue(int size) {
super(size);
}
@Override
protected final boolean lessThan(ScoreDocKey hitA, ScoreDocKey hitB) {
if (hitA.score == hitB.score)
return hitA.doc > hitB.doc;
else
return hitA.score < hitB.score;
}
}
//
/**
* An extension to ScoreDoc that includes a key used for grouping purposes
*/
static public class ScoreDocKey extends ScoreDoc {
Long key;
protected ScoreDocKey(int doc, float score) {
super(doc, score);
}
public Long getKey() {
return key;
}
@Override
public String toString() {
return "key:" + key + " doc=" + doc + " s=" + score;
}
}
}