src/Lucene.Net.Suggest/Suggest/Fst/FSTCompletionLookup.cs - lucenenet - Git at Google

 using Lucene.Net.Store;
 using Lucene.Net.Support;
 using Lucene.Net.Support.IO;
 using Lucene.Net.Util;
 using Lucene.Net.Util.Fst;
 using System;
 using System.Collections.Generic;
 using System.IO;

 namespace Lucene.Net.Search.Suggest.Fst
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *     http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     /// <summary>
     /// An adapter from <see cref="Lookup"/> API to <see cref="FSTCompletion"/>.
     ///
     /// <para>This adapter differs from <see cref="FSTCompletion"/> in that it attempts
     /// to discretize any "weights" as passed from in <see cref="IInputEnumerator.Weight"/>
     /// to match the number of buckets. For the rationale for bucketing, see
     /// <see cref="FSTCompletion"/>.
     ///
     /// </para>
     /// <para><b>Note:</b>Discretization requires an additional sorting pass.
     ///
     /// </para>
     /// <para>The range of weights for bucketing/ discretization is determined
     /// by sorting the input by weight and then dividing into
     /// equal ranges. Then, scores within each range are assigned to that bucket.
     ///
     /// </para>
     /// <para>Note that this means that even large differences in weights may be lost
     /// during automaton construction, but the overall distinction between "classes"
     /// of weights will be preserved regardless of the distribution of weights.
     ///
     /// </para>
     /// <para>For fine-grained control over which weights are assigned to which buckets,
     /// use <see cref="FSTCompletion"/> directly or <see cref="Tst.TSTLookup"/>, for example.
     ///
     /// </para>
     /// </summary>
     /// <seealso cref="FSTCompletion"/>
     /// @lucene.experimental
     public class FSTCompletionLookup : Lookup
     {
         /// <summary>
         /// An invalid bucket count if we're creating an object
         /// of this class from an existing FST.
         /// </summary>
         /// <seealso cref="FSTCompletionLookup(FSTCompletion, bool)"/>
         private const int INVALID_BUCKETS_COUNT = -1;

         /// <summary>
         /// Shared tail length for conflating in the created automaton. Setting this
         /// to larger values (<see cref="int.MaxValue"/>) will create smaller (or minimal)
         /// automata at the cost of RAM for keeping nodes hash in the <see cref="FST"/>.
         ///
         /// <para>Empirical pick.
         /// </para>
         /// </summary>
         private const int sharedTailLength = 5;

         private readonly int buckets;
         private readonly bool exactMatchFirst;

         /// <summary>
         /// Automaton used for completions with higher weights reordering.
         /// </summary>
         private FSTCompletion higherWeightsCompletion;

         /// <summary>
         /// Automaton used for normal completions.
         /// </summary>
         private FSTCompletion normalCompletion;

         /// <summary>
         /// Number of entries the lookup was built with </summary>
         private long count = 0;


         /// <summary>
         /// This constructor prepares for creating a suggested FST using the
         /// <see cref="Build(IInputEnumerator)"/> method. The number of weight
         /// discretization buckets is set to <see cref="FSTCompletion.DEFAULT_BUCKETS"/> and
         /// exact matches are promoted to the top of the suggestions list.
         /// </summary>
         public FSTCompletionLookup()
             : this(FSTCompletion.DEFAULT_BUCKETS, true)
         {
         }

         /// <summary>
         /// This constructor prepares for creating a suggested FST using the
         /// <see cref="Build(IInputEnumerator)"/> method.
         /// </summary>
         /// <param name="buckets">
         ///          The number of weight discretization buckets (see
         ///          <see cref="FSTCompletion"/> for details).
         /// </param>
         /// <param name="exactMatchFirst">
         ///          If <c>true</c> exact matches are promoted to the top of the
         ///          suggestions list. Otherwise they appear in the order of
         ///          discretized weight and alphabetical within the bucket. </param>
         public FSTCompletionLookup(int buckets, bool exactMatchFirst)
         {
             this.buckets = buckets;
             this.exactMatchFirst = exactMatchFirst;
         }

         /// <summary>
         /// This constructor takes a pre-built automaton.
         /// </summary>
         ///  <param name="completion">
         ///          An instance of <see cref="FSTCompletion"/>. </param>
         ///  <param name="exactMatchFirst">
         ///          If <code>true</code> exact matches are promoted to the top of the
         ///          suggestions list. Otherwise they appear in the order of
         ///          discretized weight and alphabetical within the bucket. </param>
         public FSTCompletionLookup(FSTCompletion completion, bool exactMatchFirst)
             : this(INVALID_BUCKETS_COUNT, exactMatchFirst)
         {
             this.normalCompletion = new FSTCompletion(completion.FST, false, exactMatchFirst);
             this.higherWeightsCompletion = new FSTCompletion(completion.FST, true, exactMatchFirst);
         }

         public override void Build(IInputEnumerator enumerator)
         {
             if (enumerator.HasPayloads)
             {
                 throw new ArgumentException("this suggester doesn't support payloads");
             }
             if (enumerator.HasContexts)
             {
                 throw new ArgumentException("this suggester doesn't support contexts");
             }
             FileInfo tempInput = FileSupport.CreateTempFile(typeof(FSTCompletionLookup).Name, ".input", OfflineSorter.DefaultTempDir());
             FileInfo tempSorted = FileSupport.CreateTempFile(typeof(FSTCompletionLookup).Name, ".sorted", OfflineSorter.DefaultTempDir());

             OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
             OfflineSorter.ByteSequencesReader reader = null;
             ExternalRefSorter sorter = null;

             // Push floats up front before sequences to sort them. For now, assume they are non-negative.
             // If negative floats are allowed some trickery needs to be done to find their byte order.
             bool success = false;
             count = 0;
             try
             {
                 byte[] buffer = Arrays.Empty<byte>();
                 ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
                 BytesRef spare;
                 while (enumerator.MoveNext())
                 {
                     spare = enumerator.Current;
                     if (spare.Length + 4 >= buffer.Length)
                     {
                         buffer = ArrayUtil.Grow(buffer, spare.Length + 4);
                     }

                     output.Reset(buffer);
                     output.WriteInt32(EncodeWeight(enumerator.Weight));
                     output.WriteBytes(spare.Bytes, spare.Offset, spare.Length);
                     writer.Write(buffer, 0, output.Position);
                 }
                 writer.Dispose();

                 // We don't know the distribution of scores and we need to bucket them, so we'll sort
                 // and divide into equal buckets.
                 OfflineSorter.SortInfo info = (new OfflineSorter()).Sort(tempInput, tempSorted);
                 tempInput.Delete();
                 FSTCompletionBuilder builder = new FSTCompletionBuilder(buckets, sorter = new ExternalRefSorter(new OfflineSorter()), sharedTailLength);

                 int inputLines = info.Lines;
                 reader = new OfflineSorter.ByteSequencesReader(tempSorted);
                 long line = 0;
                 int previousBucket = 0;
                 int previousScore = 0;
                 ByteArrayDataInput input = new ByteArrayDataInput();
                 BytesRef tmp1 = new BytesRef();
                 BytesRef tmp2 = new BytesRef();
                 while (reader.Read(tmp1))
                 {
                     input.Reset(tmp1.Bytes);
                     int currentScore = input.ReadInt32();

                     int bucket;
                     if (line > 0 && currentScore == previousScore)
                     {
                         bucket = previousBucket;
                     }
                     else
                     {
                         bucket = (int)(line * buckets / inputLines);
                     }
                     previousScore = currentScore;
                     previousBucket = bucket;

                     // Only append the input, discard the weight.
                     tmp2.Bytes = tmp1.Bytes;
                     tmp2.Offset = input.Position;
                     tmp2.Length = tmp1.Length - input.Position;
                     builder.Add(tmp2, bucket);

                     line++;
                     count++;
                 }

                 // The two FSTCompletions share the same automaton.
                 this.higherWeightsCompletion = builder.Build();
                 this.normalCompletion = new FSTCompletion(higherWeightsCompletion.FST, false, exactMatchFirst);

                 success = true;
             }
             finally
             {
                 if (success)
                 {
                     IOUtils.Dispose(reader, writer, sorter);
                 }
                 else
                 {
                     IOUtils.DisposeWhileHandlingException(reader, writer, sorter);
                 }

                 tempInput.Delete();
                 tempSorted.Delete();
             }
         }

         /// <summary>
         /// weight -> cost </summary>
         private static int EncodeWeight(long value)
         {
             if (value < int.MinValue || value > int.MaxValue)
             {
                 throw new NotSupportedException("cannot encode value: " + value);
             }
             return (int)value;
         }

         public override IList<LookupResult> DoLookup(string key, IEnumerable<BytesRef> contexts, bool higherWeightsFirst, int num)
         {
             if (contexts != null)
             {
                 throw new ArgumentException("this suggester doesn't support contexts");
             }
             IList<FSTCompletion.Completion> completions;
             if (higherWeightsFirst)
             {
                 completions = higherWeightsCompletion.DoLookup(key, num);
             }
             else
             {
                 completions = normalCompletion.DoLookup(key, num);
             }

             List<LookupResult> results = new List<LookupResult>(completions.Count);
             CharsRef spare = new CharsRef();
             foreach (FSTCompletion.Completion c in completions)
             {
                 spare.Grow(c.Utf8.Length);
                 UnicodeUtil.UTF8toUTF16(c.Utf8, spare);
                 results.Add(new LookupResult(spare.ToString(), c.Bucket));
             }
             return results;
         }

         /// <summary>
         /// Returns the bucket (weight) as a Long for the provided key if it exists,
         /// otherwise null if it does not.
         /// </summary>
         public virtual object Get(string key)
         {
             int bucket = normalCompletion.GetBucket(key);
             return bucket == -1 ? (long?)null : bucket;
         }

         public override bool Store(DataOutput output)
         {
             lock (this)
             {
                 output.WriteVInt64(count);
                 if (this.normalCompletion == null || normalCompletion.FST == null)
                 {
                     return false;
                 }
                 normalCompletion.FST.Save(output);
                 return true;
             }
         }

         public override bool Load(DataInput input)
         {
             lock (this)
             {
                 count = input.ReadVInt64();
                 this.higherWeightsCompletion = new FSTCompletion(new FST<object>(input, NoOutputs.Singleton));
                 this.normalCompletion = new FSTCompletion(higherWeightsCompletion.FST, false, exactMatchFirst);
                 return true;
             }
         }

         public override long GetSizeInBytes()
         {
             long mem = RamUsageEstimator.ShallowSizeOf(this) + RamUsageEstimator.ShallowSizeOf(normalCompletion) + RamUsageEstimator.ShallowSizeOf(higherWeightsCompletion);
             if (normalCompletion != null)
             {
                 mem += normalCompletion.FST.GetSizeInBytes();
             }
             if (higherWeightsCompletion != null && (normalCompletion == null || normalCompletion.FST != higherWeightsCompletion.FST))
             {
                 // the fst should be shared between the 2 completion instances, don't count it twice
                 mem += higherWeightsCompletion.FST.GetSizeInBytes();
             }
             return mem;
         }

         public override long Count => count;
     }
 }
	using Lucene.Net.Store;
	using Lucene.Net.Support;
	using Lucene.Net.Support.IO;
	using Lucene.Net.Util;
	using Lucene.Net.Util.Fst;
	using System;
	using System.Collections.Generic;
	using System.IO;

	namespace Lucene.Net.Search.Suggest.Fst
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/// <summary>
	/// An adapter from <see cref="Lookup"/> API to <see cref="FSTCompletion"/>.
	///
	/// <para>This adapter differs from <see cref="FSTCompletion"/> in that it attempts
	/// to discretize any "weights" as passed from in <see cref="IInputEnumerator.Weight"/>
	/// to match the number of buckets. For the rationale for bucketing, see
	/// <see cref="FSTCompletion"/>.
	///
	/// </para>
	/// <para><b>Note:</b>Discretization requires an additional sorting pass.
	///
	/// </para>
	/// <para>The range of weights for bucketing/ discretization is determined
	/// by sorting the input by weight and then dividing into
	/// equal ranges. Then, scores within each range are assigned to that bucket.
	///
	/// </para>
	/// <para>Note that this means that even large differences in weights may be lost
	/// during automaton construction, but the overall distinction between "classes"
	/// of weights will be preserved regardless of the distribution of weights.
	///
	/// </para>
	/// <para>For fine-grained control over which weights are assigned to which buckets,
	/// use <see cref="FSTCompletion"/> directly or <see cref="Tst.TSTLookup"/>, for example.
	///
	/// </para>
	/// </summary>
	/// <seealso cref="FSTCompletion"/>
	/// @lucene.experimental
	public class FSTCompletionLookup : Lookup
	{
	/// <summary>
	/// An invalid bucket count if we're creating an object
	/// of this class from an existing FST.
	/// </summary>
	/// <seealso cref="FSTCompletionLookup(FSTCompletion, bool)"/>
	private const int INVALID_BUCKETS_COUNT = -1;

	/// <summary>
	/// Shared tail length for conflating in the created automaton. Setting this
	/// to larger values (<see cref="int.MaxValue"/>) will create smaller (or minimal)
	/// automata at the cost of RAM for keeping nodes hash in the <see cref="FST"/>.
	///
	/// <para>Empirical pick.
	/// </para>
	/// </summary>
	private const int sharedTailLength = 5;

	private readonly int buckets;
	private readonly bool exactMatchFirst;

	/// <summary>
	/// Automaton used for completions with higher weights reordering.
	/// </summary>
	private FSTCompletion higherWeightsCompletion;

	/// <summary>
	/// Automaton used for normal completions.
	/// </summary>
	private FSTCompletion normalCompletion;

	/// <summary>
	/// Number of entries the lookup was built with </summary>
	private long count = 0;


	/// <summary>
	/// This constructor prepares for creating a suggested FST using the
	/// <see cref="Build(IInputEnumerator)"/> method. The number of weight
	/// discretization buckets is set to <see cref="FSTCompletion.DEFAULT_BUCKETS"/> and
	/// exact matches are promoted to the top of the suggestions list.
	/// </summary>
	public FSTCompletionLookup()
	: this(FSTCompletion.DEFAULT_BUCKETS, true)
	{
	}

	/// <summary>
	/// This constructor prepares for creating a suggested FST using the
	/// <see cref="Build(IInputEnumerator)"/> method.
	/// </summary>
	/// <param name="buckets">
	/// The number of weight discretization buckets (see
	/// <see cref="FSTCompletion"/> for details).
	/// </param>
	/// <param name="exactMatchFirst">
	/// If <c>true</c> exact matches are promoted to the top of the
	/// suggestions list. Otherwise they appear in the order of
	/// discretized weight and alphabetical within the bucket. </param>
	public FSTCompletionLookup(int buckets, bool exactMatchFirst)
	{
	this.buckets = buckets;
	this.exactMatchFirst = exactMatchFirst;
	}

	/// <summary>
	/// This constructor takes a pre-built automaton.
	/// </summary>
	/// <param name="completion">
	/// An instance of <see cref="FSTCompletion"/>. </param>
	/// <param name="exactMatchFirst">
	/// If <code>true</code> exact matches are promoted to the top of the
	/// suggestions list. Otherwise they appear in the order of
	/// discretized weight and alphabetical within the bucket. </param>
	public FSTCompletionLookup(FSTCompletion completion, bool exactMatchFirst)
	: this(INVALID_BUCKETS_COUNT, exactMatchFirst)
	{
	this.normalCompletion = new FSTCompletion(completion.FST, false, exactMatchFirst);
	this.higherWeightsCompletion = new FSTCompletion(completion.FST, true, exactMatchFirst);
	}

	public override void Build(IInputEnumerator enumerator)
	{
	if (enumerator.HasPayloads)
	{
	throw new ArgumentException("this suggester doesn't support payloads");
	}
	if (enumerator.HasContexts)
	{
	throw new ArgumentException("this suggester doesn't support contexts");
	}
	FileInfo tempInput = FileSupport.CreateTempFile(typeof(FSTCompletionLookup).Name, ".input", OfflineSorter.DefaultTempDir());
	FileInfo tempSorted = FileSupport.CreateTempFile(typeof(FSTCompletionLookup).Name, ".sorted", OfflineSorter.DefaultTempDir());

	OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
	OfflineSorter.ByteSequencesReader reader = null;
	ExternalRefSorter sorter = null;

	// Push floats up front before sequences to sort them. For now, assume they are non-negative.
	// If negative floats are allowed some trickery needs to be done to find their byte order.
	bool success = false;
	count = 0;
	try
	{
	byte[] buffer = Arrays.Empty<byte>();
	ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
	BytesRef spare;
	while (enumerator.MoveNext())
	{
	spare = enumerator.Current;
	if (spare.Length + 4 >= buffer.Length)
	{
	buffer = ArrayUtil.Grow(buffer, spare.Length + 4);
	}

	output.Reset(buffer);
	output.WriteInt32(EncodeWeight(enumerator.Weight));
	output.WriteBytes(spare.Bytes, spare.Offset, spare.Length);
	writer.Write(buffer, 0, output.Position);
	}
	writer.Dispose();

	// We don't know the distribution of scores and we need to bucket them, so we'll sort
	// and divide into equal buckets.
	OfflineSorter.SortInfo info = (new OfflineSorter()).Sort(tempInput, tempSorted);
	tempInput.Delete();
	FSTCompletionBuilder builder = new FSTCompletionBuilder(buckets, sorter = new ExternalRefSorter(new OfflineSorter()), sharedTailLength);

	int inputLines = info.Lines;
	reader = new OfflineSorter.ByteSequencesReader(tempSorted);
	long line = 0;
	int previousBucket = 0;
	int previousScore = 0;
	ByteArrayDataInput input = new ByteArrayDataInput();
	BytesRef tmp1 = new BytesRef();
	BytesRef tmp2 = new BytesRef();
	while (reader.Read(tmp1))
	{
	input.Reset(tmp1.Bytes);
	int currentScore = input.ReadInt32();

	int bucket;
	if (line > 0 && currentScore == previousScore)
	{
	bucket = previousBucket;
	}
	else
	{
	bucket = (int)(line * buckets / inputLines);
	}
	previousScore = currentScore;
	previousBucket = bucket;

	// Only append the input, discard the weight.
	tmp2.Bytes = tmp1.Bytes;
	tmp2.Offset = input.Position;
	tmp2.Length = tmp1.Length - input.Position;
	builder.Add(tmp2, bucket);

	line++;
	count++;
	}

	// The two FSTCompletions share the same automaton.
	this.higherWeightsCompletion = builder.Build();
	this.normalCompletion = new FSTCompletion(higherWeightsCompletion.FST, false, exactMatchFirst);

	success = true;
	}
	finally
	{
	if (success)
	{
	IOUtils.Dispose(reader, writer, sorter);
	}
	else
	{
	IOUtils.DisposeWhileHandlingException(reader, writer, sorter);
	}

	tempInput.Delete();
	tempSorted.Delete();
	}
	}

	/// <summary>
	/// weight -> cost </summary>
	private static int EncodeWeight(long value)
	{
	if (value < int.MinValue \|\| value > int.MaxValue)
	{
	throw new NotSupportedException("cannot encode value: " + value);
	}
	return (int)value;
	}

	public override IList<LookupResult> DoLookup(string key, IEnumerable<BytesRef> contexts, bool higherWeightsFirst, int num)
	{
	if (contexts != null)
	{
	throw new ArgumentException("this suggester doesn't support contexts");
	}
	IList<FSTCompletion.Completion> completions;
	if (higherWeightsFirst)
	{
	completions = higherWeightsCompletion.DoLookup(key, num);
	}
	else
	{
	completions = normalCompletion.DoLookup(key, num);
	}

	List<LookupResult> results = new List<LookupResult>(completions.Count);
	CharsRef spare = new CharsRef();
	foreach (FSTCompletion.Completion c in completions)
	{
	spare.Grow(c.Utf8.Length);
	UnicodeUtil.UTF8toUTF16(c.Utf8, spare);
	results.Add(new LookupResult(spare.ToString(), c.Bucket));
	}
	return results;
	}

	/// <summary>
	/// Returns the bucket (weight) as a Long for the provided key if it exists,
	/// otherwise null if it does not.
	/// </summary>
	public virtual object Get(string key)
	{
	int bucket = normalCompletion.GetBucket(key);
	return bucket == -1 ? (long?)null : bucket;
	}

	public override bool Store(DataOutput output)
	{
	lock (this)
	{
	output.WriteVInt64(count);
	if (this.normalCompletion == null \|\| normalCompletion.FST == null)
	{
	return false;
	}
	normalCompletion.FST.Save(output);
	return true;
	}
	}

	public override bool Load(DataInput input)
	{
	lock (this)
	{
	count = input.ReadVInt64();
	this.higherWeightsCompletion = new FSTCompletion(new FST<object>(input, NoOutputs.Singleton));
	this.normalCompletion = new FSTCompletion(higherWeightsCompletion.FST, false, exactMatchFirst);
	return true;
	}
	}

	public override long GetSizeInBytes()
	{
	long mem = RamUsageEstimator.ShallowSizeOf(this) + RamUsageEstimator.ShallowSizeOf(normalCompletion) + RamUsageEstimator.ShallowSizeOf(higherWeightsCompletion);
	if (normalCompletion != null)
	{
	mem += normalCompletion.FST.GetSizeInBytes();
	}
	if (higherWeightsCompletion != null && (normalCompletion == null \|\| normalCompletion.FST != higherWeightsCompletion.FST))
	{
	// the fst should be shared between the 2 completion instances, don't count it twice
	mem += higherWeightsCompletion.FST.GetSizeInBytes();
	}
	return mem;
	}

	public override long Count => count;
	}
	}