/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.suggest.document;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.search.suggest.analyzing.FSTUtil;
import org.apache.lucene.search.suggest.document.CompletionPostingsFormat.FSTLoadMode;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.ByteBufferIndexInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.OffHeapFSTStore;
import org.apache.lucene.util.fst.PairOutputs.Pair;
import org.apache.lucene.util.fst.PairOutputs;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
import static org.apache.lucene.search.suggest.document.NRTSuggester.PayLoadProcessor.parseSurfaceForm;
/**
* <p>
* NRTSuggester executes Top N search on a weighted FST specified by a {@link CompletionScorer}
* <p>
* See {@link #lookup(CompletionScorer, Bits, TopSuggestDocsCollector)} for more implementation
* details.
* <p>
* FST Format:
* <ul>
* <li>Input: analyzed forms of input terms</li>
* <li>Output: Pair&lt;Long, BytesRef&gt; containing weight, surface form and docID</li>
* </ul>
* <p>
* NOTE:
* <ul>
* <li>having too many deletions or using a very restrictive filter can make the search inadmissible due to
* over-pruning of potential paths. See {@link CompletionScorer#accept(int, Bits)}</li>
* <li>when matched documents are arbitrarily filtered ({@link CompletionScorer#filtered} set to <code>true</code>),
* it is assumed that the filter will roughly filter out half the number of documents that match
* the provided automaton</li>
* <li>lookup performance will degrade as more accepted completions lead to filtered out documents</li>
* </ul>
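* <p>
* This suggester is typically exercised indirectly through {@link SuggestIndexSearcher}.
* A minimal usage sketch (the reader, analyzer and "suggest" field name are assumptions,
* not fixed by this class):
* <pre class="prettyprint">
*   SuggestIndexSearcher searcher = new SuggestIndexSearcher(reader);
*   CompletionQuery query = new PrefixCompletionQuery(analyzer, new Term("suggest", "mus"));
*   TopSuggestDocs hits = searcher.suggest(query, 5, true); // top 5, skipping duplicates
* </pre>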
*
* @lucene.experimental
*/
public final class NRTSuggester implements Accountable {
/**
* FST<Weight,Surface>:
* input is the analyzed form, with a null byte between terms
* and a {@link NRTSuggesterBuilder#END_BYTE} to denote the
* end of the input
* weight is a long
* surface is the original, unanalyzed form followed by the docID
*/
private final FST<Pair<Long, BytesRef>> fst;
/**
* Highest number of analyzed paths we saw for any single
* input surface form. This can be > 1, when the index analyzer
* creates graphs or when multiple surface forms yield the
* same analyzed form
*/
private final int maxAnalyzedPathsPerOutput;
/**
* Separator used between surface form and its docID in the FST output
*/
private final int payloadSep;
/**
* Maximum queue depth for TopNSearcher
*
* NOTE: value should be &lt;= Integer.MAX_VALUE
*/
private static final long MAX_TOP_N_QUEUE_SIZE = 5000;
private NRTSuggester(FST<Pair<Long, BytesRef>> fst, int maxAnalyzedPathsPerOutput, int payloadSep) {
this.fst = fst;
this.maxAnalyzedPathsPerOutput = maxAnalyzedPathsPerOutput;
this.payloadSep = payloadSep;
}
@Override
public long ramBytesUsed() {
return fst == null ? 0 : fst.ramBytesUsed();
}
@Override
public Collection<Accountable> getChildResources() {
return Collections.emptyList();
}
/**
* Collects at most {@link TopSuggestDocsCollector#getCountToCollect()} completions that
* match the provided {@link CompletionScorer}.
* <p>
* The {@link CompletionScorer#automaton} is intersected with the {@link #fst}.
* {@link CompletionScorer#weight} is used to compute boosts and/or extract context
* for each matched partial path. A top N search is executed on {@link #fst} seeded with
* the matched partial paths. Upon reaching a completed path, {@link CompletionScorer#accept(int, Bits)}
* and {@link CompletionScorer#score(float, float)} are used on the document id, index weight
* and query boost to filter and score the entry before it is collected via
* {@link TopSuggestDocsCollector#collect(int, CharSequence, CharSequence, float)}
*/
public void lookup(final CompletionScorer scorer, Bits acceptDocs, final TopSuggestDocsCollector collector) throws IOException {
final double liveDocsRatio = calculateLiveDocRatio(scorer.reader.numDocs(), scorer.reader.maxDoc());
if (liveDocsRatio == -1) {
return;
}
final List<FSTUtil.Path<Pair<Long, BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(scorer.automaton, fst);
// The topN is increased by a factor of the number of intersected paths
// to ensure search admissibility. For example, one suggestion can
// have multiple contexts, resulting in num_context paths for the
// suggestion instead of 1 in the FST. When queried for the suggestion,
// the topN value ensures that all paths to the suggestion are evaluated
// (in case of a match all context query).
// Note that collectors will early terminate as soon as enough suggestions
// have been collected, regardless of the set topN value. This value is the
// maximum number of suggestions that can be collected.
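// For example, with getCountToCollect() == 5 and 3 intersected prefix paths,
// the search is seeded with topN == 15.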
final int topN = collector.getCountToCollect() * prefixPaths.size();
final int queueSize = getMaxTopNSearcherQueueSize(topN, scorer.reader.numDocs(), liveDocsRatio, scorer.filtered);
final CharsRefBuilder spare = new CharsRefBuilder();
Comparator<Pair<Long, BytesRef>> comparator = getComparator();
Util.TopNSearcher<Pair<Long, BytesRef>> searcher = new Util.TopNSearcher<Pair<Long, BytesRef>>(fst, topN, queueSize, comparator,
new ScoringPathComparator(scorer)) {
private final ByteArrayDataInput scratchInput = new ByteArrayDataInput();
@Override
protected boolean acceptPartialPath(Util.FSTPath<Pair<Long,BytesRef>> path) {
if (collector.doSkipDuplicates()) {
// We are removing dups
if (path.payload == -1) {
// This path didn't yet see the complete surface form; let's see if it just did with the arc output we just added:
BytesRef arcOutput = path.arc.output().output2;
BytesRef output = path.output.output2;
for(int i=0;i<arcOutput.length;i++) {
if (arcOutput.bytes[arcOutput.offset + i] == payloadSep) {
// OK this arc that the path was just extended by contains the payloadSep, so we now have a full surface form in this path
path.payload = output.length - arcOutput.length + i;
assert output.bytes[output.offset + path.payload] == payloadSep;
break;
}
}
}
if (path.payload != -1) {
BytesRef output = path.output.output2;
spare.copyUTF8Bytes(output.bytes, output.offset, path.payload);
if (collector.seenSurfaceForms.contains(spare.chars(), 0, spare.length())) {
return false;
}
}
}
return true;
}
@Override
protected boolean acceptResult(Util.FSTPath<Pair<Long, BytesRef>> path) {
BytesRef output = path.output.output2;
int payloadSepIndex;
if (path.payload != -1) {
payloadSepIndex = path.payload;
spare.copyUTF8Bytes(output.bytes, output.offset, payloadSepIndex);
} else {
assert collector.doSkipDuplicates() == false;
payloadSepIndex = parseSurfaceForm(output, payloadSep, spare);
}
scratchInput.reset(output.bytes, output.offset + payloadSepIndex + 1, output.length - payloadSepIndex - 1);
int docID = scratchInput.readVInt();
if (!scorer.accept(docID, acceptDocs)) {
return false;
}
if (collector.doSkipDuplicates()) {
// now record that we've seen this surface form:
char[] key = new char[spare.length()];
System.arraycopy(spare.chars(), 0, key, 0, spare.length());
if (collector.seenSurfaceForms.contains(key)) {
// we already collected a higher scoring document with this key, in this segment:
return false;
}
collector.seenSurfaceForms.add(key);
}
try {
float score = scorer.score(decode(path.output.output1), path.boost);
collector.collect(docID, spare.toCharsRef(), path.context, score);
return true;
} catch (IOException e) {
throw new RuntimeException(e);
}
}
};
for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
scorer.weight.setNextMatch(path.input.get());
BytesRef output = path.output.output2;
int payload = -1;
if (collector.doSkipDuplicates()) {
for(int j=0;j<output.length;j++) {
if (output.bytes[output.offset+j] == payloadSep) {
// Important to cache this, else we have a possibly O(N^2) cost where N is the length of suggestions
payload = j;
break;
}
}
}
searcher.addStartPaths(path.fstNode, path.output, false, path.input, scorer.weight.boost(),
scorer.weight.context(), payload);
}
// hits are also returned by search()
// we do not use them; instead we collect hits in acceptResult()
searcher.search();
// search admissibility is not guaranteed
// see comment on getMaxTopNSearcherQueueSize
// assert searcher.isComplete;
}
/**
* Compares partial completion paths using {@link CompletionScorer#score(float, float)},
* breaking ties by comparing path inputs
*/
private static class ScoringPathComparator implements Comparator<Util.FSTPath<Pair<Long, BytesRef>>> {
private final CompletionScorer scorer;
public ScoringPathComparator(CompletionScorer scorer) {
this.scorer = scorer;
}
@Override
public int compare(Util.FSTPath<Pair<Long, BytesRef>> first, Util.FSTPath<Pair<Long, BytesRef>> second) {
int cmp = Float.compare(scorer.score(decode(second.output.output1), second.boost),
scorer.score(decode(first.output.output1), first.boost));
return (cmp != 0) ? cmp : first.input.get().compareTo(second.input.get());
}
}
private static Comparator<Pair<Long, BytesRef>> getComparator() {
return new Comparator<Pair<Long, BytesRef>>() {
@Override
public int compare(Pair<Long, BytesRef> o1, Pair<Long, BytesRef> o2) {
return Long.compare(o1.output1, o2.output1);
}
};
}
/**
* A simple heuristic to try to avoid over-pruning of potential suggestions by the
* TopNSearcher. Since suggestion entries can be rejected if they belong
* to a deleted document, the length of the TopNSearcher queue has to
* be increased by some factor to account for the filtered-out suggestions.
* This heuristic tries to make the searcher admissible, but the search
* can still lead to over-pruning
* <p>
* If a <code>filter</code> is applied, the queue size is increased by
* half the number of live documents.
* <p>
* The maximum queue size is {@link #MAX_TOP_N_QUEUE_SIZE}
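* <p>
* For example, with <code>topN = 10</code>, <code>maxAnalyzedPathsPerOutput = 2</code>
* and <code>liveDocsRatio = 0.5</code>, the queue size is <code>10 * 2 / 0.5 = 40</code>;
* with a filter and <code>numDocs = 1000</code>, it grows to <code>40 + 500 = 540</code>,
* capped at {@link #MAX_TOP_N_QUEUE_SIZE}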
*/
private int getMaxTopNSearcherQueueSize(int topN, int numDocs, double liveDocsRatio, boolean filterEnabled) {
long maxQueueSize = topN * maxAnalyzedPathsPerOutput;
// liveDocRatio can be at most 1.0 (if no docs were deleted)
assert liveDocsRatio <= 1.0d;
maxQueueSize = (long) (maxQueueSize / liveDocsRatio);
if (filterEnabled) {
maxQueueSize = maxQueueSize + (numDocs/2);
}
return (int) Math.min(MAX_TOP_N_QUEUE_SIZE, maxQueueSize);
}
private static double calculateLiveDocRatio(int numDocs, int maxDocs) {
return (numDocs > 0) ? ((double) numDocs / maxDocs) : -1;
}
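/**
* Returns true if the FST should be loaded off-heap: never for {@link FSTLoadMode#ON_HEAP},
* always for {@link FSTLoadMode#OFF_HEAP} and, for {@link FSTLoadMode#AUTO}, only when the
* input is memory-mapped (a {@link ByteBufferIndexInput})
*/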
private static boolean shouldLoadFSTOffHeap(IndexInput input, FSTLoadMode fstLoadMode) {
switch (fstLoadMode) {
case ON_HEAP:
return false;
case OFF_HEAP:
return true;
case AUTO:
return input instanceof ByteBufferIndexInput;
default:
throw new IllegalStateException("unknown enum constant: " + fstLoadMode);
}
}
/**
* Loads a {@link NRTSuggester} from {@link org.apache.lucene.store.IndexInput} on or off-heap
* depending on the provided <code>fstLoadMode</code>
*/
public static NRTSuggester load(IndexInput input, FSTLoadMode fstLoadMode) throws IOException {
final FST<Pair<Long, BytesRef>> fst;
if (shouldLoadFSTOffHeap(input, fstLoadMode)) {
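// the FST body stays backed by the clone (read lazily, typically memory-mapped),
// while the original input is advanced past the FST bytes below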
OffHeapFSTStore store = new OffHeapFSTStore();
IndexInput clone = input.clone();
clone.seek(input.getFilePointer());
fst = new FST<>(clone, clone, new PairOutputs<>(
PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()), store);
input.seek(clone.getFilePointer() + store.size());
} else {
fst = new FST<>(input, input, new PairOutputs<>(
PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()));
}
/* read some meta info */
int maxAnalyzedPathsPerOutput = input.readVInt();
/*
* Label used to denote the end of an input in the FST and
* the beginning of dedup bytes
*/
int endByte = input.readVInt();
int payloadSep = input.readVInt();
return new NRTSuggester(fst, maxAnalyzedPathsPerOutput, payloadSep);
}
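/**
* Weights are stored inverted (Integer.MAX_VALUE - weight), so higher-weighted
* suggestions have smaller FST outputs and are visited first by the min-first
* {@link Util.TopNSearcher}; e.g. encode(10) == 2147483637 and decode(2147483637) == 10
*/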
static long encode(long input) {
if (input < 0 || input > Integer.MAX_VALUE) {
throw new UnsupportedOperationException("cannot encode value: " + input);
}
return Integer.MAX_VALUE - input;
}
static long decode(long output) {
assert output >= 0 && output <= Integer.MAX_VALUE :
"decoded output: " + output + " is not within 0 and Integer.MAX_VALUE";
return Integer.MAX_VALUE - output;
}
/**
* Helper to encode/decode payload (surface + PAYLOAD_SEP + docID) output
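* <p>
* e.g. for surface form "nirvana" and docID 42, the payload is the UTF-8 bytes
* of "nirvana", followed by the payloadSep byte, followed by vInt(42)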
*/
static final class PayLoadProcessor {
private static final int MAX_DOC_ID_LEN_WITH_SEP = 6; // vint takes at most 5 bytes
static int parseSurfaceForm(final BytesRef output, int payloadSep, CharsRefBuilder spare) {
int surfaceFormLen = -1;
for (int i = 0; i < output.length; i++) {
if (output.bytes[output.offset + i] == payloadSep) {
surfaceFormLen = i;
break;
}
}
assert surfaceFormLen != -1 : "no payloadSep found, unable to determine surface form";
spare.copyUTF8Bytes(output.bytes, output.offset, surfaceFormLen);
return surfaceFormLen;
}
static BytesRef make(final BytesRef surface, int docID, int payloadSep) throws IOException {
int len = surface.length + MAX_DOC_ID_LEN_WITH_SEP;
byte[] buffer = new byte[len];
ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
output.writeBytes(surface.bytes, surface.offset, surface.length);
output.writeByte((byte) payloadSep);
output.writeVInt(docID);
return new BytesRef(buffer, 0, output.getPosition());
}
}
}