blob: 1f01c8025856d32b6c0ce039a5fd941da658f1a8 [file] [log] [blame]
using Lucene.Net.Util;
using Lucene.Net.Util.Fst;
using Lucene.Net.Util.Packed;
using System;
namespace Lucene.Net.Search.Suggest.Fst
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Finite state automata based implementation of "autocomplete" functionality.
///
/// <h2>Implementation details</h2>
///
/// <para>
/// The construction step in the object finalizer works as follows:
/// <list type="bullet">
/// <item><description>A set of input terms and their buckets is given.</description></item>
/// <item><description>All terms in the input are prefixed with a synthetic pseudo-character
/// (code) of the weight bucket the term fell into. For example a term
/// <c>abc</c> with a discretized weight equal '1' would become
/// <c>1abc</c>.</description></item>
/// <item><description>The terms are then sorted by their raw value of UTF-8 character values
/// (including the synthetic bucket code in front).</description></item>
/// <item><description>A finite state automaton (<see cref="FST"/>) is constructed from the input. The
/// root node has arcs labeled with all possible weights. We cache all these
/// arcs, highest-weight first.</description></item>
/// </list>
///
/// </para>
/// <para>
/// At runtime, in <see cref="FSTCompletion.DoLookup(string, int)"/>,
/// the automaton is utilized as follows:
/// <list type="bullet">
/// <item><description>For each possible term weight encoded in the automaton (cached arcs from
/// the root above), starting with the highest one, we descend along the path of
/// the input key. If the key is not a prefix of a sequence in the automaton
/// (path ends prematurely), we exit immediately -- no completions.</description></item>
/// <item><description>Otherwise, we have found an internal automaton node that ends the key.
/// <b>The entire subautomaton (all paths) starting from this node form the key's
/// completions.</b> We start the traversal of this subautomaton. Every time we
/// reach a final state (arc), we add a single suggestion to the list of results
/// (the weight of this suggestion is constant and equal to the root path we
/// started from). The tricky part is that because automaton edges are sorted and
/// we scan depth-first, we can terminate the entire procedure as soon as we
/// collect enough suggestions the user requested.</description></item>
/// <item><description>In case the number of suggestions collected in the step above is still
/// insufficient, we proceed to the next (smaller) weight leaving the root node
/// and repeat the same algorithm again.</description></item>
/// </list>
///
/// <h2>Runtime behavior and performance characteristic</h2>
///
/// The algorithm described above is optimized for finding suggestions to short
/// prefixes in a top-weights-first order. This is probably the most common use
/// case: it allows presenting suggestions early and sorts them by the global
/// frequency (and then alphabetically).
///
/// </para>
/// <para>
/// If there is an exact match in the automaton, it is returned first on the
/// results list (even with by-weight sorting).
///
/// </para>
/// <para>
/// Note that the maximum lookup time for <b>any prefix</b> is the time of
/// descending to the subtree, plus traversal of the subtree up to the number of
/// requested suggestions (because they are already presorted by weight on the
/// root level and alphabetically at any node level).
///
/// </para>
/// <para>
/// To order alphabetically only (no ordering by priorities), use identical term
/// weights for all terms. Alphabetical suggestions are returned even if
/// non-constant weights are used, but the algorithm for doing this is
/// suboptimal.
///
/// </para>
/// <para>
/// "alphabetically" in any of the documentation above indicates UTF-8
/// representation order, nothing else.
///
/// </para>
/// <para>
/// <b>NOTE</b>: the FST file format is experimental and subject to suddenly
/// change, requiring you to rebuild the FST suggest index.
///
/// </para>
/// </summary>
/// <seealso cref="FSTCompletion"/>
/// @lucene.experimental
public class FSTCompletionBuilder
{
/// <summary>
/// Default number of buckets.
/// </summary>
public const int DEFAULT_BUCKETS = 10;
/// <summary>
/// The number of separate buckets for weights (discretization). The more
/// buckets, the more fine-grained term weights (priorities) can be assigned.
/// The speed of lookup will not decrease for prefixes which have
/// highly-weighted completions (because these are filled-in first), but will
/// decrease significantly for low-weighted terms (but these should be
/// infrequent, so it is all right).
///
/// <para>
/// The number of buckets must be within [1, 255] range.
/// </para>
/// </summary>
private readonly int buckets;
/// <summary>
/// Finite state automaton encoding all the lookup terms. See class notes for
/// details.
/// </summary>
internal FST<object> automaton;
/// <summary>
/// FST construction require re-sorting the input. This is the class that
/// collects all the input entries, their weights and then provides sorted
/// order.
/// </summary>
private readonly IBytesRefSorter sorter;
/// <summary>
/// Scratch buffer for <see cref="Add(BytesRef, int)"/>.
/// </summary>
private readonly BytesRef scratch = new BytesRef();
/// <summary>
/// Max tail sharing length.
/// </summary>
private readonly int shareMaxTailLength;
/// <summary>
/// Creates an <see cref="FSTCompletion"/> with default options: 10 buckets, exact match
/// promoted to first position and <see cref="InMemorySorter"/> with a comparer obtained from
/// <see cref="BytesRef.UTF8SortedAsUnicodeComparer"/>.
/// </summary>
public FSTCompletionBuilder()
: this(DEFAULT_BUCKETS, new InMemorySorter(BytesRef.UTF8SortedAsUnicodeComparer), int.MaxValue)
{
}
/// <summary>
/// Creates an FSTCompletion with the specified options. </summary>
/// <param name="buckets">
/// The number of buckets for weight discretization. Buckets are used
/// in <see cref="Add(BytesRef, int)"/> and must be smaller than the number
/// given here.
/// </param>
/// <param name="sorter">
/// <see cref="IBytesRefSorter"/> used for re-sorting input for the automaton.
/// For large inputs, use on-disk sorting implementations. The sorter
/// is closed automatically in <see cref="Build()"/> if it implements
/// <see cref="IDisposable"/>.
/// </param>
/// <param name="shareMaxTailLength">
/// Max shared suffix sharing length.
///
/// See the description of this parameter in <see cref="Builder"/>'s constructor.
/// In general, for very large inputs you'll want to construct a non-minimal
/// automaton which will be larger, but the construction will take far less ram.
/// For minimal automata, set it to <see cref="int.MaxValue"/>. </param>
public FSTCompletionBuilder(int buckets, IBytesRefSorter sorter, int shareMaxTailLength)
{
if (buckets < 1 || buckets > 255)
{
throw new ArgumentException("Buckets must be >= 1 and <= 255: " + buckets);
}
if (sorter == null)
{
throw new ArgumentException("BytesRefSorter must not be null.");
}
this.sorter = sorter;
this.buckets = buckets;
this.shareMaxTailLength = shareMaxTailLength;
}
/// <summary>
/// Appends a single suggestion and its weight to the internal buffers.
/// </summary>
/// <param name="utf8">
/// The suggestion (utf8 representation) to be added. The content is
/// copied and the object can be reused. </param>
/// <param name="bucket">
/// The bucket to place this suggestion in. Must be non-negative and
/// smaller than the number of buckets passed in the constructor.
/// Higher numbers indicate suggestions that should be presented
/// before suggestions placed in smaller buckets. </param>
public virtual void Add(BytesRef utf8, int bucket)
{
if (bucket < 0 || bucket >= buckets)
{
throw new ArgumentException("Bucket outside of the allowed range [0, " + buckets + "): " + bucket);
}
if (scratch.Bytes.Length < utf8.Length + 1)
{
scratch.Grow(utf8.Length + 10);
}
scratch.Length = 1;
scratch.Bytes[0] = (byte)bucket;
scratch.Append(utf8);
sorter.Add(scratch);
}
/// <summary>
/// Builds the final automaton from a list of added entries. This method may
/// take a longer while as it needs to build the automaton.
/// </summary>
public virtual FSTCompletion Build()
{
this.automaton = BuildAutomaton(sorter);
// Dispose of it if it is a disposable
using (sorter as IDisposable)
{
}
return new FSTCompletion(automaton);
}
/// <summary>
/// Builds the final automaton from a list of entries.
/// </summary>
private FST<object> BuildAutomaton(IBytesRefSorter sorter)
{
// Build the automaton.
Outputs<object> outputs = NoOutputs.Singleton;
object empty = outputs.NoOutput;
Builder<object> builder = new Builder<object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, shareMaxTailLength, outputs, null, false, PackedInt32s.DEFAULT, true, 15);
BytesRef scratch = new BytesRef();
BytesRef entry;
Int32sRef scratchIntsRef = new Int32sRef();
int count = 0;
IBytesRefEnumerator iter = sorter.GetEnumerator();
while (iter.MoveNext())
{
entry = iter.Current;
count++;
if (scratch.CompareTo(entry) != 0)
{
builder.Add(Util.Fst.Util.ToInt32sRef(entry, scratchIntsRef), empty);
scratch.CopyBytes(entry);
}
}
return count == 0 ? null : builder.Finish();
}
}
}