| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.search.suggest.document; |
| |
| import java.io.IOException; |
| import java.util.Collection; |
| import java.util.Collections; |
| import java.util.Comparator; |
| import java.util.List; |
| |
| import org.apache.lucene.search.suggest.analyzing.FSTUtil; |
| import org.apache.lucene.search.suggest.document.CompletionPostingsFormat.FSTLoadMode; |
| import org.apache.lucene.store.ByteArrayDataInput; |
| import org.apache.lucene.store.ByteArrayDataOutput; |
| import org.apache.lucene.store.ByteBufferIndexInput; |
| import org.apache.lucene.store.IndexInput; |
| import org.apache.lucene.util.Accountable; |
| import org.apache.lucene.util.Bits; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.CharsRefBuilder; |
| import org.apache.lucene.util.fst.ByteSequenceOutputs; |
| import org.apache.lucene.util.fst.FST; |
| import org.apache.lucene.util.fst.OffHeapFSTStore; |
| import org.apache.lucene.util.fst.PairOutputs.Pair; |
| import org.apache.lucene.util.fst.PairOutputs; |
| import org.apache.lucene.util.fst.PositiveIntOutputs; |
| import org.apache.lucene.util.fst.Util; |
| |
| import static org.apache.lucene.search.suggest.document.NRTSuggester.PayLoadProcessor.parseSurfaceForm; |
| |
| /** |
| * <p> |
| * NRTSuggester executes Top N search on a weighted FST specified by a {@link CompletionScorer} |
| * <p> |
| * See {@link #lookup(CompletionScorer, Bits, TopSuggestDocsCollector)} for more implementation |
| * details. |
| * <p> |
| * FST Format: |
| * <ul> |
| * <li>Input: analyzed forms of input terms</li> |
| * <li>Output: Pair<Long, BytesRef> containing weight, surface form and docID</li> |
| * </ul> |
| * <p> |
| * NOTE: |
| * <ul> |
| * <li>having too many deletions or using a very restrictive filter can make the search inadmissible due to |
| * over-pruning of potential paths. See {@link CompletionScorer#accept(int, Bits)}</li> |
| * <li>when matched documents are arbitrarily filtered ({@link CompletionScorer#filtered} set to <code>true</code>, |
| * it is assumed that the filter will roughly filter out half the number of documents that match |
| * the provided automaton</li> |
| * <li>lookup performance will degrade as more accepted completions lead to filtered out documents</li> |
| * </ul> |
| * |
| * @lucene.experimental |
| */ |
| public final class NRTSuggester implements Accountable { |
| |
| /** |
| * FST<Weight,Surface>: |
| * input is the analyzed form, with a null byte between terms |
| * and a {@link NRTSuggesterBuilder#END_BYTE} to denote the |
| * end of the input |
| * weight is a long |
| * surface is the original, unanalyzed form followed by the docID |
| */ |
| private final FST<Pair<Long, BytesRef>> fst; |
| |
| /** |
| * Highest number of analyzed paths we saw for any single |
| * input surface form. This can be > 1, when index analyzer |
| * creates graphs or if multiple surface form(s) yields the |
| * same analyzed form |
| */ |
| private final int maxAnalyzedPathsPerOutput; |
| |
| /** |
| * Separator used between surface form and its docID in the FST output |
| */ |
| private final int payloadSep; |
| |
| /** |
| * Maximum queue depth for TopNSearcher |
| * |
| * NOTE: value should be <= Integer.MAX_VALUE |
| */ |
| private static final long MAX_TOP_N_QUEUE_SIZE = 5000; |
| |
| private NRTSuggester(FST<Pair<Long, BytesRef>> fst, int maxAnalyzedPathsPerOutput, int payloadSep) { |
| this.fst = fst; |
| this.maxAnalyzedPathsPerOutput = maxAnalyzedPathsPerOutput; |
| this.payloadSep = payloadSep; |
| } |
| |
| @Override |
| public long ramBytesUsed() { |
| return fst == null ? 0 : fst.ramBytesUsed(); |
| } |
| |
| @Override |
| public Collection<Accountable> getChildResources() { |
| return Collections.emptyList(); |
| } |
| |
| /** |
| * Collects at most {@link TopSuggestDocsCollector#getCountToCollect()} completions that |
| * match the provided {@link CompletionScorer}. |
| * <p> |
| * The {@link CompletionScorer#automaton} is intersected with the {@link #fst}. |
| * {@link CompletionScorer#weight} is used to compute boosts and/or extract context |
| * for each matched partial paths. A top N search is executed on {@link #fst} seeded with |
| * the matched partial paths. Upon reaching a completed path, {@link CompletionScorer#accept(int, Bits)} |
| * and {@link CompletionScorer#score(float, float)} is used on the document id, index weight |
| * and query boost to filter and score the entry, before being collected via |
| * {@link TopSuggestDocsCollector#collect(int, CharSequence, CharSequence, float)} |
| */ |
| public void lookup(final CompletionScorer scorer, Bits acceptDocs, final TopSuggestDocsCollector collector) throws IOException { |
| final double liveDocsRatio = calculateLiveDocRatio(scorer.reader.numDocs(), scorer.reader.maxDoc()); |
| if (liveDocsRatio == -1) { |
| return; |
| } |
| final List<FSTUtil.Path<Pair<Long, BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(scorer.automaton, fst); |
| // The topN is increased by a factor of # of intersected path |
| // to ensure search admissibility. For example, one suggestion can |
| // have multiple contexts, resulting in num_context paths for the |
| // suggestion instead of 1 in the FST. When queried for the suggestion, |
| // the topN value ensures that all paths to the suggestion are evaluated |
| // (in case of a match all context query). |
| // Note that collectors will early terminate as soon as enough suggestions |
| // have been collected, regardless of the set topN value. This value is the |
| // maximum number of suggestions that can be collected. |
| final int topN = collector.getCountToCollect() * prefixPaths.size(); |
| final int queueSize = getMaxTopNSearcherQueueSize(topN, scorer.reader.numDocs(), liveDocsRatio, scorer.filtered); |
| |
| final CharsRefBuilder spare = new CharsRefBuilder(); |
| |
| Comparator<Pair<Long, BytesRef>> comparator = getComparator(); |
| Util.TopNSearcher<Pair<Long, BytesRef>> searcher = new Util.TopNSearcher<Pair<Long, BytesRef>>(fst, topN, queueSize, comparator, |
| new ScoringPathComparator(scorer)) { |
| |
| private final ByteArrayDataInput scratchInput = new ByteArrayDataInput(); |
| |
| @Override |
| protected boolean acceptPartialPath(Util.FSTPath<Pair<Long,BytesRef>> path) { |
| if (collector.doSkipDuplicates()) { |
| // We are removing dups |
| if (path.payload == -1) { |
| // This path didn't yet see the complete surface form; let's see if it just did with the arc output we just added: |
| BytesRef arcOutput = path.arc.output().output2; |
| BytesRef output = path.output.output2; |
| for(int i=0;i<arcOutput.length;i++) { |
| if (arcOutput.bytes[arcOutput.offset + i] == payloadSep) { |
| // OK this arc that the path was just extended by contains the payloadSep, so we now have a full surface form in this path |
| path.payload = output.length - arcOutput.length + i; |
| assert output.bytes[output.offset + path.payload] == payloadSep; |
| break; |
| } |
| } |
| } |
| |
| if (path.payload != -1) { |
| BytesRef output = path.output.output2; |
| spare.copyUTF8Bytes(output.bytes, output.offset, path.payload); |
| if (collector.seenSurfaceForms.contains(spare.chars(), 0, spare.length())) { |
| return false; |
| } |
| } |
| } |
| return true; |
| } |
| |
| @Override |
| protected boolean acceptResult(Util.FSTPath<Pair<Long, BytesRef>> path) { |
| BytesRef output = path.output.output2; |
| int payloadSepIndex; |
| if (path.payload != -1) { |
| payloadSepIndex = path.payload; |
| spare.copyUTF8Bytes(output.bytes, output.offset, payloadSepIndex); |
| } else { |
| assert collector.doSkipDuplicates() == false; |
| payloadSepIndex = parseSurfaceForm(output, payloadSep, spare); |
| } |
| |
| scratchInput.reset(output.bytes, output.offset + payloadSepIndex + 1, output.length - payloadSepIndex - 1); |
| int docID = scratchInput.readVInt(); |
| |
| if (!scorer.accept(docID, acceptDocs)) { |
| return false; |
| } |
| if (collector.doSkipDuplicates()) { |
| // now record that we've seen this surface form: |
| char[] key = new char[spare.length()]; |
| System.arraycopy(spare.chars(), 0, key, 0, spare.length()); |
| if (collector.seenSurfaceForms.contains(key)) { |
| // we already collected a higher scoring document with this key, in this segment: |
| return false; |
| } |
| collector.seenSurfaceForms.add(key); |
| } |
| try { |
| float score = scorer.score(decode(path.output.output1), path.boost); |
| collector.collect(docID, spare.toCharsRef(), path.context, score); |
| return true; |
| } catch (IOException e) { |
| throw new RuntimeException(e); |
| } |
| } |
| }; |
| |
| for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) { |
| scorer.weight.setNextMatch(path.input.get()); |
| BytesRef output = path.output.output2; |
| int payload = -1; |
| if (collector.doSkipDuplicates()) { |
| for(int j=0;j<output.length;j++) { |
| if (output.bytes[output.offset+j] == payloadSep) { |
| // Important to cache this, else we have a possibly O(N^2) cost where N is the length of suggestions |
| payload = j; |
| break; |
| } |
| } |
| } |
| |
| searcher.addStartPaths(path.fstNode, path.output, false, path.input, scorer.weight.boost(), |
| scorer.weight.context(), payload); |
| } |
| // hits are also returned by search() |
| // we do not use it, instead collect at acceptResult |
| searcher.search(); |
| // search admissibility is not guaranteed |
| // see comment on getMaxTopNSearcherQueueSize |
| // assert search.isComplete; |
| } |
| |
| /** |
| * Compares partial completion paths using {@link CompletionScorer#score(float, float)}, |
| * breaks ties comparing path inputs |
| */ |
| private static class ScoringPathComparator implements Comparator<Util.FSTPath<Pair<Long, BytesRef>>> { |
| private final CompletionScorer scorer; |
| |
| public ScoringPathComparator(CompletionScorer scorer) { |
| this.scorer = scorer; |
| } |
| |
| @Override |
| public int compare(Util.FSTPath<Pair<Long, BytesRef>> first, Util.FSTPath<Pair<Long, BytesRef>> second) { |
| int cmp = Float.compare(scorer.score(decode(second.output.output1), second.boost), |
| scorer.score(decode(first.output.output1), first.boost)); |
| return (cmp != 0) ? cmp : first.input.get().compareTo(second.input.get()); |
| } |
| } |
| |
| private static Comparator<Pair<Long, BytesRef>> getComparator() { |
| return new Comparator<Pair<Long, BytesRef>>() { |
| @Override |
| public int compare(Pair<Long, BytesRef> o1, Pair<Long, BytesRef> o2) { |
| return Long.compare(o1.output1, o2.output1); |
| } |
| }; |
| } |
| |
| /** |
| * Simple heuristics to try to avoid over-pruning potential suggestions by the |
| * TopNSearcher. Since suggestion entries can be rejected if they belong |
| * to a deleted document, the length of the TopNSearcher queue has to |
| * be increased by some factor, to account for the filtered out suggestions. |
| * This heuristic will try to make the searcher admissible, but the search |
| * can still lead to over-pruning |
| * <p> |
| * If a <code>filter</code> is applied, the queue size is increased by |
| * half the number of live documents. |
| * <p> |
| * The maximum queue size is {@link #MAX_TOP_N_QUEUE_SIZE} |
| */ |
| private int getMaxTopNSearcherQueueSize(int topN, int numDocs, double liveDocsRatio, boolean filterEnabled) { |
| long maxQueueSize = topN * maxAnalyzedPathsPerOutput; |
| // liveDocRatio can be at most 1.0 (if no docs were deleted) |
| assert liveDocsRatio <= 1.0d; |
| maxQueueSize = (long) (maxQueueSize / liveDocsRatio); |
| if (filterEnabled) { |
| maxQueueSize = maxQueueSize + (numDocs/2); |
| } |
| return (int) Math.min(MAX_TOP_N_QUEUE_SIZE, maxQueueSize); |
| } |
| |
| private static double calculateLiveDocRatio(int numDocs, int maxDocs) { |
| return (numDocs > 0) ? ((double) numDocs / maxDocs) : -1; |
| } |
| |
| private static boolean shouldLoadFSTOffHeap(IndexInput input, FSTLoadMode fstLoadMode) { |
| switch (fstLoadMode) { |
| case ON_HEAP: |
| return false; |
| case OFF_HEAP: |
| return true; |
| case AUTO: |
| return input instanceof ByteBufferIndexInput; |
| default: |
| throw new IllegalStateException("unknown enum constant: " + fstLoadMode); |
| } |
| } |
| |
| /** |
| * Loads a {@link NRTSuggester} from {@link org.apache.lucene.store.IndexInput} on or off-heap |
| * depending on the provided <code>fstLoadMode</code> |
| */ |
| public static NRTSuggester load(IndexInput input, FSTLoadMode fstLoadMode) throws IOException { |
| final FST<Pair<Long, BytesRef>> fst; |
| if (shouldLoadFSTOffHeap(input, fstLoadMode)) { |
| OffHeapFSTStore store = new OffHeapFSTStore(); |
| IndexInput clone = input.clone(); |
| clone.seek(input.getFilePointer()); |
| fst = new FST<>(clone, clone, new PairOutputs<>( |
| PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()), store); |
| input.seek(clone.getFilePointer() + store.size()); |
| } else { |
| fst = new FST<>(input, input, new PairOutputs<>( |
| PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton())); |
| } |
| |
| /* read some meta info */ |
| int maxAnalyzedPathsPerOutput = input.readVInt(); |
| /* |
| * Label used to denote the end of an input in the FST and |
| * the beginning of dedup bytes |
| */ |
| int endByte = input.readVInt(); |
| int payloadSep = input.readVInt(); |
| return new NRTSuggester(fst, maxAnalyzedPathsPerOutput, payloadSep); |
| } |
| |
| static long encode(long input) { |
| if (input < 0 || input > Integer.MAX_VALUE) { |
| throw new UnsupportedOperationException("cannot encode value: " + input); |
| } |
| return Integer.MAX_VALUE - input; |
| } |
| |
| static long decode(long output) { |
| assert output >= 0 && output <= Integer.MAX_VALUE : |
| "decoded output: " + output + " is not within 0 and Integer.MAX_VALUE"; |
| return Integer.MAX_VALUE - output; |
| } |
| |
| /** |
| * Helper to encode/decode payload (surface + PAYLOAD_SEP + docID) output |
| */ |
| static final class PayLoadProcessor { |
| final static private int MAX_DOC_ID_LEN_WITH_SEP = 6; // vint takes at most 5 bytes |
| |
| static int parseSurfaceForm(final BytesRef output, int payloadSep, CharsRefBuilder spare) { |
| int surfaceFormLen = -1; |
| for (int i = 0; i < output.length; i++) { |
| if (output.bytes[output.offset + i] == payloadSep) { |
| surfaceFormLen = i; |
| break; |
| } |
| } |
| assert surfaceFormLen != -1 : "no payloadSep found, unable to determine surface form"; |
| spare.copyUTF8Bytes(output.bytes, output.offset, surfaceFormLen); |
| return surfaceFormLen; |
| } |
| |
| static BytesRef make(final BytesRef surface, int docID, int payloadSep) throws IOException { |
| int len = surface.length + MAX_DOC_ID_LEN_WITH_SEP; |
| byte[] buffer = new byte[len]; |
| ByteArrayDataOutput output = new ByteArrayDataOutput(buffer); |
| output.writeBytes(surface.bytes, surface.length - surface.offset); |
| output.writeByte((byte) payloadSep); |
| output.writeVInt(docID); |
| return new BytesRef(buffer, 0, output.getPosition()); |
| } |
| } |
| } |