blob: 0e895001976137f68417adeb43e53ac1130f8c45 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.suggest.document;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.CollectionTerminatedException;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.SimpleCollector;
import org.apache.lucene.search.TotalHits;
import org.apache.lucene.search.suggest.Lookup;
import static org.apache.lucene.search.suggest.document.TopSuggestDocs.SuggestScoreDoc;
/**
* {@link org.apache.lucene.search.Collector} that collects completion and
* score, along with document id
* <p>
* Non scoring collector that collect completions in order of their
* pre-computed scores.
* <p>
* NOTE: One document can be collected multiple times if a document
* is matched for multiple unique completions for a given query
* <p>
* Subclasses should only override
* {@link TopSuggestDocsCollector#collect(int, CharSequence, CharSequence, float)}.
* <p>
* NOTE: {@link #setScorer(org.apache.lucene.search.Scorable)} and
* {@link #collect(int)} is not used
*
* @lucene.experimental
*/
public class TopSuggestDocsCollector extends SimpleCollector {
private final SuggestScoreDocPriorityQueue priorityQueue;
private final int num;
/** Only set if we are deduplicating hits: holds all per-segment hits until the end, when we dedup them */
private final List<SuggestScoreDoc> pendingResults;
/** Only set if we are deduplicating hits: holds all surface forms seen so far in the current segment */
final CharArraySet seenSurfaceForms;
/** Document base offset for the current Leaf */
protected int docBase;
/**
* Sole constructor
*
* Collects at most <code>num</code> completions
* with corresponding document and weight
*/
public TopSuggestDocsCollector(int num, boolean skipDuplicates) {
if (num <= 0) {
throw new IllegalArgumentException("'num' must be > 0");
}
this.num = num;
this.priorityQueue = new SuggestScoreDocPriorityQueue(num);
if (skipDuplicates) {
seenSurfaceForms = new CharArraySet(num, false);
pendingResults = new ArrayList<>();
} else {
seenSurfaceForms = null;
pendingResults = null;
}
}
/** Returns true if duplicates are filtered out */
protected boolean doSkipDuplicates() {
return seenSurfaceForms != null;
}
/**
* Returns the number of results to be collected
*/
public int getCountToCollect() {
return num;
}
@Override
protected void doSetNextReader(LeafReaderContext context) throws IOException {
docBase = context.docBase;
if (seenSurfaceForms != null) {
seenSurfaceForms.clear();
// NOTE: this also clears the priorityQueue:
for (SuggestScoreDoc hit : priorityQueue.getResults()) {
pendingResults.add(hit);
}
}
}
/**
* Called for every matched completion,
* similar to {@link org.apache.lucene.search.LeafCollector#collect(int)}
* but for completions.
*
* NOTE: collection at the leaf level is guaranteed to be in
* descending order of score
*/
public void collect(int docID, CharSequence key, CharSequence context, float score) throws IOException {
SuggestScoreDoc current = new SuggestScoreDoc(docBase + docID, key, context, score);
if (current == priorityQueue.insertWithOverflow(current)) {
// if the current SuggestScoreDoc has overflown from pq,
// we can assume all of the successive collections from
// this leaf will be overflown as well
// TODO: reuse the overflow instance?
throw new CollectionTerminatedException();
}
}
/**
* Returns at most <code>num</code> Top scoring {@link org.apache.lucene.search.suggest.document.TopSuggestDocs}s
*/
public TopSuggestDocs get() throws IOException {
SuggestScoreDoc[] suggestScoreDocs;
if (seenSurfaceForms != null) {
// NOTE: this also clears the priorityQueue:
for (SuggestScoreDoc hit : priorityQueue.getResults()) {
pendingResults.add(hit);
}
// Deduplicate all hits: we already dedup'd efficiently within each segment by
// truncating the FST top paths search, but across segments there may still be dups:
seenSurfaceForms.clear();
// TODO: we could use a priority queue here to make cost O(N * log(num)) instead of O(N * log(N)), where N = O(num *
// numSegments), but typically numSegments is smallish and num is smallish so this won't matter much in practice:
Collections.sort(pendingResults,
(a, b) -> {
// sort by higher score
int cmp = Float.compare(b.score, a.score);
if (cmp == 0) {
// tie break by completion key
cmp = Lookup.CHARSEQUENCE_COMPARATOR.compare(a.key, b.key);
if (cmp == 0) {
// prefer smaller doc id, in case of a tie
cmp = Integer.compare(a.doc, b.doc);
}
}
return cmp;
});
List<SuggestScoreDoc> hits = new ArrayList<>();
for (SuggestScoreDoc hit : pendingResults) {
if (seenSurfaceForms.contains(hit.key) == false) {
seenSurfaceForms.add(hit.key);
hits.add(hit);
if (hits.size() == num) {
break;
}
}
}
suggestScoreDocs = hits.toArray(new SuggestScoreDoc[0]);
} else {
suggestScoreDocs = priorityQueue.getResults();
}
if (suggestScoreDocs.length > 0) {
return new TopSuggestDocs(new TotalHits(suggestScoreDocs.length, TotalHits.Relation.EQUAL_TO), suggestScoreDocs);
} else {
return TopSuggestDocs.EMPTY;
}
}
/**
* Ignored
*/
@Override
public void collect(int doc) throws IOException {
// {@link #collect(int, CharSequence, CharSequence, long)} is used
// instead
}
/**
* Ignored
*/
@Override
public ScoreMode scoreMode() {
return ScoreMode.COMPLETE;
}
}