src/Lucene.Net.Suggest/Suggest/Fst/FSTCompletionBuilder.cs - lucenenet - Git at Google

 using Lucene.Net.Util;
 using Lucene.Net.Util.Fst;
 using Lucene.Net.Util.Packed;
 using System;

 namespace Lucene.Net.Search.Suggest.Fst
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *     http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     /// <summary>
     /// Finite state automata based implementation of "autocomplete" functionality.
     ///
     /// <h2>Implementation details</h2>
     ///
     /// <para>
     /// The construction step in the object finalizer works as follows:
     /// <list type="bullet">
     /// <item><description>A set of input terms and their buckets is given.</description></item>
     /// <item><description>All terms in the input are prefixed with a synthetic pseudo-character
     /// (code) of the weight bucket the term fell into. For example a term
     /// <c>abc</c> with a discretized weight equal '1' would become
     /// <c>1abc</c>.</description></item>
     /// <item><description>The terms are then sorted by their raw value of UTF-8 character values
     /// (including the synthetic bucket code in front).</description></item>
     /// <item><description>A finite state automaton (<see cref="FST"/>) is constructed from the input. The
     /// root node has arcs labeled with all possible weights. We cache all these
     /// arcs, highest-weight first.</description></item>
     /// </list>
     ///
     /// </para>
     /// <para>
     /// At runtime, in <see cref="FSTCompletion.DoLookup(string, int)"/>,
     /// the automaton is utilized as follows:
     /// <list type="bullet">
     /// <item><description>For each possible term weight encoded in the automaton (cached arcs from
     /// the root above), starting with the highest one, we descend along the path of
     /// the input key. If the key is not a prefix of a sequence in the automaton
     /// (path ends prematurely), we exit immediately -- no completions.</description></item>
     /// <item><description>Otherwise, we have found an internal automaton node that ends the key.
     /// <b>The entire subautomaton (all paths) starting from this node form the key's
     /// completions.</b> We start the traversal of this subautomaton. Every time we
     /// reach a final state (arc), we add a single suggestion to the list of results
     /// (the weight of this suggestion is constant and equal to the root path we
     /// started from). The tricky part is that because automaton edges are sorted and
     /// we scan depth-first, we can terminate the entire procedure as soon as we
     /// collect enough suggestions the user requested.</description></item>
     /// <item><description>In case the number of suggestions collected in the step above is still
     /// insufficient, we proceed to the next (smaller) weight leaving the root node
     /// and repeat the same algorithm again.</description></item>
     /// </list>
     ///
     /// <h2>Runtime behavior and performance characteristic</h2>
     ///
     /// The algorithm described above is optimized for finding suggestions to short
     /// prefixes in a top-weights-first order. This is probably the most common use
     /// case: it allows presenting suggestions early and sorts them by the global
     /// frequency (and then alphabetically).
     ///
     /// </para>
     /// <para>
     /// If there is an exact match in the automaton, it is returned first on the
     /// results list (even with by-weight sorting).
     ///
     /// </para>
     /// <para>
     /// Note that the maximum lookup time for <b>any prefix</b> is the time of
     /// descending to the subtree, plus traversal of the subtree up to the number of
     /// requested suggestions (because they are already presorted by weight on the
     /// root level and alphabetically at any node level).
     ///
     /// </para>
     /// <para>
     /// To order alphabetically only (no ordering by priorities), use identical term
     /// weights for all terms. Alphabetical suggestions are returned even if
     /// non-constant weights are used, but the algorithm for doing this is
     /// suboptimal.
     ///
     /// </para>
     /// <para>
     /// "alphabetically" in any of the documentation above indicates UTF-8
     /// representation order, nothing else.
     ///
     /// </para>
     /// <para>
     /// <b>NOTE</b>: the FST file format is experimental and subject to suddenly
     /// change, requiring you to rebuild the FST suggest index.
     ///
     /// </para>
     /// </summary>
     /// <seealso cref="FSTCompletion"/>
     /// @lucene.experimental
     public class FSTCompletionBuilder
     {
         /// <summary>
         /// Default number of buckets.
         /// </summary>
         public const int DEFAULT_BUCKETS = 10;

         /// <summary>
         /// The number of separate buckets for weights (discretization). The more
         /// buckets, the more fine-grained term weights (priorities) can be assigned.
         /// The speed of lookup will not decrease for prefixes which have
         /// highly-weighted completions (because these are filled-in first), but will
         /// decrease significantly for low-weighted terms (but these should be
         /// infrequent, so it is all right).
         ///
         /// <para>
         /// The number of buckets must be within [1, 255] range.
         /// </para>
         /// </summary>
         private readonly int buckets;

         /// <summary>
         /// Finite state automaton encoding all the lookup terms. See class notes for
         /// details.
         /// </summary>
         internal FST<object> automaton;

         /// <summary>
         /// FST construction require re-sorting the input. This is the class that
         /// collects all the input entries, their weights and then provides sorted
         /// order.
         /// </summary>
         private readonly IBytesRefSorter sorter;

         /// <summary>
         /// Scratch buffer for <see cref="Add(BytesRef, int)"/>.
         /// </summary>
         private readonly BytesRef scratch = new BytesRef();

         /// <summary>
         /// Max tail sharing length.
         /// </summary>
         private readonly int shareMaxTailLength;

         /// <summary>
         /// Creates an <see cref="FSTCompletion"/> with default options: 10 buckets, exact match
         /// promoted to first position and <see cref="InMemorySorter"/> with a comparer obtained from
         /// <see cref="BytesRef.UTF8SortedAsUnicodeComparer"/>.
         /// </summary>
         public FSTCompletionBuilder()
             : this(DEFAULT_BUCKETS, new InMemorySorter(BytesRef.UTF8SortedAsUnicodeComparer), int.MaxValue)
         {
         }

         /// <summary>
         /// Creates an FSTCompletion with the specified options. </summary>
         /// <param name="buckets">
         ///          The number of buckets for weight discretization. Buckets are used
         ///          in <see cref="Add(BytesRef, int)"/> and must be smaller than the number
         ///          given here.
         /// </param>
         /// <param name="sorter">
         ///          <see cref="IBytesRefSorter"/> used for re-sorting input for the automaton.
         ///          For large inputs, use on-disk sorting implementations. The sorter
         ///          is closed automatically in <see cref="Build()"/> if it implements
         ///          <see cref="IDisposable"/>.
         /// </param>
         /// <param name="shareMaxTailLength">
         ///          Max shared suffix sharing length.
         ///
         ///          See the description of this parameter in <see cref="Builder"/>'s constructor.
         ///          In general, for very large inputs you'll want to construct a non-minimal
         ///          automaton which will be larger, but the construction will take far less ram.
         ///          For minimal automata, set it to <see cref="int.MaxValue"/>. </param>
         public FSTCompletionBuilder(int buckets, IBytesRefSorter sorter, int shareMaxTailLength)
         {
             if (buckets < 1 || buckets > 255)
             {
                 throw new ArgumentException("Buckets must be >= 1 and <= 255: " + buckets);
             }

             if (sorter == null)
             {
                 throw new ArgumentException("BytesRefSorter must not be null.");
             }

             this.sorter = sorter;
             this.buckets = buckets;
             this.shareMaxTailLength = shareMaxTailLength;
         }

         /// <summary>
         /// Appends a single suggestion and its weight to the internal buffers.
         /// </summary>
         /// <param name="utf8">
         ///          The suggestion (utf8 representation) to be added. The content is
         ///          copied and the object can be reused. </param>
         /// <param name="bucket">
         ///          The bucket to place this suggestion in. Must be non-negative and
         ///          smaller than the number of buckets passed in the constructor.
         ///          Higher numbers indicate suggestions that should be presented
         ///          before suggestions placed in smaller buckets. </param>
         public virtual void Add(BytesRef utf8, int bucket)
         {
             if (bucket < 0 || bucket >= buckets)
             {
                 throw new ArgumentException("Bucket outside of the allowed range [0, " + buckets + "): " + bucket);
             }

             if (scratch.Bytes.Length < utf8.Length + 1)
             {
                 scratch.Grow(utf8.Length + 10);
             }

             scratch.Length = 1;
             scratch.Bytes[0] = (byte)bucket;
             scratch.Append(utf8);
             sorter.Add(scratch);
         }

         /// <summary>
         /// Builds the final automaton from a list of added entries. This method may
         /// take a longer while as it needs to build the automaton.
         /// </summary>
         public virtual FSTCompletion Build()
         {
             this.automaton = BuildAutomaton(sorter);

             // Dispose of it if it is a disposable
             using (sorter as IDisposable)
             {

             }

             return new FSTCompletion(automaton);
         }

         /// <summary>
         /// Builds the final automaton from a list of entries.
         /// </summary>
         private FST<object> BuildAutomaton(IBytesRefSorter sorter)
         {
             // Build the automaton.
             Outputs<object> outputs = NoOutputs.Singleton;
             object empty = outputs.NoOutput;
             Builder<object> builder = new Builder<object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, shareMaxTailLength, outputs, null, false, PackedInt32s.DEFAULT, true, 15);

             BytesRef scratch = new BytesRef();
             BytesRef entry;
             Int32sRef scratchIntsRef = new Int32sRef();
             int count = 0;
             IBytesRefEnumerator iter = sorter.GetEnumerator();
             while (iter.MoveNext())
             {
                 entry = iter.Current;
                 count++;
                 if (scratch.CompareTo(entry) != 0)
                 {
                     builder.Add(Util.Fst.Util.ToInt32sRef(entry, scratchIntsRef), empty);
                     scratch.CopyBytes(entry);
                 }
             }

             return count == 0 ? null : builder.Finish();
         }
     }
 }
	using Lucene.Net.Util;
	using Lucene.Net.Util.Fst;
	using Lucene.Net.Util.Packed;
	using System;

	namespace Lucene.Net.Search.Suggest.Fst
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/// <summary>
	/// Finite state automata based implementation of "autocomplete" functionality.
	///
	/// <h2>Implementation details</h2>
	///
	/// <para>
	/// The construction step in the object finalizer works as follows:
	/// <list type="bullet">
	/// <item><description>A set of input terms and their buckets is given.</description></item>
	/// <item><description>All terms in the input are prefixed with a synthetic pseudo-character
	/// (code) of the weight bucket the term fell into. For example a term
	/// <c>abc</c> with a discretized weight equal '1' would become
	/// <c>1abc</c>.</description></item>
	/// <item><description>The terms are then sorted by their raw value of UTF-8 character values
	/// (including the synthetic bucket code in front).</description></item>
	/// <item><description>A finite state automaton (<see cref="FST"/>) is constructed from the input. The
	/// root node has arcs labeled with all possible weights. We cache all these
	/// arcs, highest-weight first.</description></item>
	/// </list>
	///
	/// </para>
	/// <para>
	/// At runtime, in <see cref="FSTCompletion.DoLookup(string, int)"/>,
	/// the automaton is utilized as follows:
	/// <list type="bullet">
	/// <item><description>For each possible term weight encoded in the automaton (cached arcs from
	/// the root above), starting with the highest one, we descend along the path of
	/// the input key. If the key is not a prefix of a sequence in the automaton
	/// (path ends prematurely), we exit immediately -- no completions.</description></item>
	/// <item><description>Otherwise, we have found an internal automaton node that ends the key.
	/// <b>The entire subautomaton (all paths) starting from this node form the key's
	/// completions.</b> We start the traversal of this subautomaton. Every time we
	/// reach a final state (arc), we add a single suggestion to the list of results
	/// (the weight of this suggestion is constant and equal to the root path we
	/// started from). The tricky part is that because automaton edges are sorted and
	/// we scan depth-first, we can terminate the entire procedure as soon as we
	/// collect enough suggestions the user requested.</description></item>
	/// <item><description>In case the number of suggestions collected in the step above is still
	/// insufficient, we proceed to the next (smaller) weight leaving the root node
	/// and repeat the same algorithm again.</description></item>
	/// </list>
	///
	/// <h2>Runtime behavior and performance characteristic</h2>
	///
	/// The algorithm described above is optimized for finding suggestions to short
	/// prefixes in a top-weights-first order. This is probably the most common use
	/// case: it allows presenting suggestions early and sorts them by the global
	/// frequency (and then alphabetically).
	///
	/// </para>
	/// <para>
	/// If there is an exact match in the automaton, it is returned first on the
	/// results list (even with by-weight sorting).
	///
	/// </para>
	/// <para>
	/// Note that the maximum lookup time for <b>any prefix</b> is the time of
	/// descending to the subtree, plus traversal of the subtree up to the number of
	/// requested suggestions (because they are already presorted by weight on the
	/// root level and alphabetically at any node level).
	///
	/// </para>
	/// <para>
	/// To order alphabetically only (no ordering by priorities), use identical term
	/// weights for all terms. Alphabetical suggestions are returned even if
	/// non-constant weights are used, but the algorithm for doing this is
	/// suboptimal.
	///
	/// </para>
	/// <para>
	/// "alphabetically" in any of the documentation above indicates UTF-8
	/// representation order, nothing else.
	///
	/// </para>
	/// <para>
	/// <b>NOTE</b>: the FST file format is experimental and subject to suddenly
	/// change, requiring you to rebuild the FST suggest index.
	///
	/// </para>
	/// </summary>
	/// <seealso cref="FSTCompletion"/>
	/// @lucene.experimental
	public class FSTCompletionBuilder
	{
	/// <summary>
	/// Default number of buckets.
	/// </summary>
	public const int DEFAULT_BUCKETS = 10;

	/// <summary>
	/// The number of separate buckets for weights (discretization). The more
	/// buckets, the more fine-grained term weights (priorities) can be assigned.
	/// The speed of lookup will not decrease for prefixes which have
	/// highly-weighted completions (because these are filled-in first), but will
	/// decrease significantly for low-weighted terms (but these should be
	/// infrequent, so it is all right).
	///
	/// <para>
	/// The number of buckets must be within [1, 255] range.
	/// </para>
	/// </summary>
	private readonly int buckets;

	/// <summary>
	/// Finite state automaton encoding all the lookup terms. See class notes for
	/// details.
	/// </summary>
	internal FST<object> automaton;

	/// <summary>
	/// FST construction require re-sorting the input. This is the class that
	/// collects all the input entries, their weights and then provides sorted
	/// order.
	/// </summary>
	private readonly IBytesRefSorter sorter;

	/// <summary>
	/// Scratch buffer for <see cref="Add(BytesRef, int)"/>.
	/// </summary>
	private readonly BytesRef scratch = new BytesRef();

	/// <summary>
	/// Max tail sharing length.
	/// </summary>
	private readonly int shareMaxTailLength;

	/// <summary>
	/// Creates an <see cref="FSTCompletion"/> with default options: 10 buckets, exact match
	/// promoted to first position and <see cref="InMemorySorter"/> with a comparer obtained from
	/// <see cref="BytesRef.UTF8SortedAsUnicodeComparer"/>.
	/// </summary>
	public FSTCompletionBuilder()
	: this(DEFAULT_BUCKETS, new InMemorySorter(BytesRef.UTF8SortedAsUnicodeComparer), int.MaxValue)
	{
	}

	/// <summary>
	/// Creates an FSTCompletion with the specified options. </summary>
	/// <param name="buckets">
	/// The number of buckets for weight discretization. Buckets are used
	/// in <see cref="Add(BytesRef, int)"/> and must be smaller than the number
	/// given here.
	/// </param>
	/// <param name="sorter">
	/// <see cref="IBytesRefSorter"/> used for re-sorting input for the automaton.
	/// For large inputs, use on-disk sorting implementations. The sorter
	/// is closed automatically in <see cref="Build()"/> if it implements
	/// <see cref="IDisposable"/>.
	/// </param>
	/// <param name="shareMaxTailLength">
	/// Max shared suffix sharing length.
	///
	/// See the description of this parameter in <see cref="Builder"/>'s constructor.
	/// In general, for very large inputs you'll want to construct a non-minimal
	/// automaton which will be larger, but the construction will take far less ram.
	/// For minimal automata, set it to <see cref="int.MaxValue"/>. </param>
	public FSTCompletionBuilder(int buckets, IBytesRefSorter sorter, int shareMaxTailLength)
	{
	if (buckets < 1 \|\| buckets > 255)
	{
	throw new ArgumentException("Buckets must be >= 1 and <= 255: " + buckets);
	}

	if (sorter == null)
	{
	throw new ArgumentException("BytesRefSorter must not be null.");
	}

	this.sorter = sorter;
	this.buckets = buckets;
	this.shareMaxTailLength = shareMaxTailLength;
	}

	/// <summary>
	/// Appends a single suggestion and its weight to the internal buffers.
	/// </summary>
	/// <param name="utf8">
	/// The suggestion (utf8 representation) to be added. The content is
	/// copied and the object can be reused. </param>
	/// <param name="bucket">
	/// The bucket to place this suggestion in. Must be non-negative and
	/// smaller than the number of buckets passed in the constructor.
	/// Higher numbers indicate suggestions that should be presented
	/// before suggestions placed in smaller buckets. </param>
	public virtual void Add(BytesRef utf8, int bucket)
	{
	if (bucket < 0 \|\| bucket >= buckets)
	{
	throw new ArgumentException("Bucket outside of the allowed range [0, " + buckets + "): " + bucket);
	}

	if (scratch.Bytes.Length < utf8.Length + 1)
	{
	scratch.Grow(utf8.Length + 10);
	}

	scratch.Length = 1;
	scratch.Bytes[0] = (byte)bucket;
	scratch.Append(utf8);
	sorter.Add(scratch);
	}

	/// <summary>
	/// Builds the final automaton from a list of added entries. This method may
	/// take a longer while as it needs to build the automaton.
	/// </summary>
	public virtual FSTCompletion Build()
	{
	this.automaton = BuildAutomaton(sorter);

	// Dispose of it if it is a disposable
	using (sorter as IDisposable)
	{

	}

	return new FSTCompletion(automaton);
	}

	/// <summary>
	/// Builds the final automaton from a list of entries.
	/// </summary>
	private FST<object> BuildAutomaton(IBytesRefSorter sorter)
	{
	// Build the automaton.
	Outputs<object> outputs = NoOutputs.Singleton;
	object empty = outputs.NoOutput;
	Builder<object> builder = new Builder<object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, shareMaxTailLength, outputs, null, false, PackedInt32s.DEFAULT, true, 15);

	BytesRef scratch = new BytesRef();
	BytesRef entry;
	Int32sRef scratchIntsRef = new Int32sRef();
	int count = 0;
	IBytesRefEnumerator iter = sorter.GetEnumerator();
	while (iter.MoveNext())
	{
	entry = iter.Current;
	count++;
	if (scratch.CompareTo(entry) != 0)
	{
	builder.Add(Util.Fst.Util.ToInt32sRef(entry, scratchIntsRef), empty);
	scratch.CopyBytes(entry);
	}
	}

	return count == 0 ? null : builder.Finish();
	}
	}
	}