blob: bc2f877429300a65307e221087fdd8e611f11eda [file] [log] [blame]
using Lucene.Net.Store;
using Lucene.Net.Support;
using Lucene.Net.Support.IO;
using Lucene.Net.Util;
using Lucene.Net.Util.Fst;
using System;
using System.Collections.Generic;
using System.IO;
namespace Lucene.Net.Search.Suggest.Fst
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// An adapter from <see cref="Lookup"/> API to <see cref="FSTCompletion"/>.
///
/// <para>This adapter differs from <see cref="FSTCompletion"/> in that it attempts
/// to discretize any "weights" as passed from in <see cref="IInputEnumerator.Weight"/>
/// to match the number of buckets. For the rationale for bucketing, see
/// <see cref="FSTCompletion"/>.
///
/// </para>
/// <para><b>Note:</b>Discretization requires an additional sorting pass.
///
/// </para>
/// <para>The range of weights for bucketing/ discretization is determined
/// by sorting the input by weight and then dividing into
/// equal ranges. Then, scores within each range are assigned to that bucket.
///
/// </para>
/// <para>Note that this means that even large differences in weights may be lost
/// during automaton construction, but the overall distinction between "classes"
/// of weights will be preserved regardless of the distribution of weights.
///
/// </para>
/// <para>For fine-grained control over which weights are assigned to which buckets,
/// use <see cref="FSTCompletion"/> directly or <see cref="Tst.TSTLookup"/>, for example.
///
/// </para>
/// </summary>
/// <seealso cref="FSTCompletion"/>
/// @lucene.experimental
public class FSTCompletionLookup : Lookup
{
/// <summary>
/// An invalid bucket count if we're creating an object
/// of this class from an existing FST.
/// </summary>
/// <seealso cref="FSTCompletionLookup(FSTCompletion, bool)"/>
private const int INVALID_BUCKETS_COUNT = -1;
/// <summary>
/// Shared tail length for conflating in the created automaton. Setting this
/// to larger values (<see cref="int.MaxValue"/>) will create smaller (or minimal)
/// automata at the cost of RAM for keeping nodes hash in the <see cref="FST"/>.
///
/// <para>Empirical pick.
/// </para>
/// </summary>
private const int sharedTailLength = 5;
private readonly int buckets;
private readonly bool exactMatchFirst;
/// <summary>
/// Automaton used for completions with higher weights reordering.
/// </summary>
private FSTCompletion higherWeightsCompletion;
/// <summary>
/// Automaton used for normal completions.
/// </summary>
private FSTCompletion normalCompletion;
/// <summary>
/// Number of entries the lookup was built with </summary>
private long count = 0;
/// <summary>
/// This constructor prepares for creating a suggested FST using the
/// <see cref="Build(IInputEnumerator)"/> method. The number of weight
/// discretization buckets is set to <see cref="FSTCompletion.DEFAULT_BUCKETS"/> and
/// exact matches are promoted to the top of the suggestions list.
/// </summary>
public FSTCompletionLookup()
: this(FSTCompletion.DEFAULT_BUCKETS, true)
{
}
/// <summary>
/// This constructor prepares for creating a suggested FST using the
/// <see cref="Build(IInputEnumerator)"/> method.
/// </summary>
/// <param name="buckets">
/// The number of weight discretization buckets (see
/// <see cref="FSTCompletion"/> for details).
/// </param>
/// <param name="exactMatchFirst">
/// If <c>true</c> exact matches are promoted to the top of the
/// suggestions list. Otherwise they appear in the order of
/// discretized weight and alphabetical within the bucket. </param>
public FSTCompletionLookup(int buckets, bool exactMatchFirst)
{
this.buckets = buckets;
this.exactMatchFirst = exactMatchFirst;
}
/// <summary>
/// This constructor takes a pre-built automaton.
/// </summary>
/// <param name="completion">
/// An instance of <see cref="FSTCompletion"/>. </param>
/// <param name="exactMatchFirst">
/// If <code>true</code> exact matches are promoted to the top of the
/// suggestions list. Otherwise they appear in the order of
/// discretized weight and alphabetical within the bucket. </param>
public FSTCompletionLookup(FSTCompletion completion, bool exactMatchFirst)
: this(INVALID_BUCKETS_COUNT, exactMatchFirst)
{
this.normalCompletion = new FSTCompletion(completion.FST, false, exactMatchFirst);
this.higherWeightsCompletion = new FSTCompletion(completion.FST, true, exactMatchFirst);
}
public override void Build(IInputEnumerator enumerator)
{
if (enumerator.HasPayloads)
{
throw new ArgumentException("this suggester doesn't support payloads");
}
if (enumerator.HasContexts)
{
throw new ArgumentException("this suggester doesn't support contexts");
}
FileInfo tempInput = FileSupport.CreateTempFile(typeof(FSTCompletionLookup).Name, ".input", OfflineSorter.DefaultTempDir());
FileInfo tempSorted = FileSupport.CreateTempFile(typeof(FSTCompletionLookup).Name, ".sorted", OfflineSorter.DefaultTempDir());
OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
OfflineSorter.ByteSequencesReader reader = null;
ExternalRefSorter sorter = null;
// Push floats up front before sequences to sort them. For now, assume they are non-negative.
// If negative floats are allowed some trickery needs to be done to find their byte order.
bool success = false;
count = 0;
try
{
byte[] buffer = Arrays.Empty<byte>();
ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
BytesRef spare;
while (enumerator.MoveNext())
{
spare = enumerator.Current;
if (spare.Length + 4 >= buffer.Length)
{
buffer = ArrayUtil.Grow(buffer, spare.Length + 4);
}
output.Reset(buffer);
output.WriteInt32(EncodeWeight(enumerator.Weight));
output.WriteBytes(spare.Bytes, spare.Offset, spare.Length);
writer.Write(buffer, 0, output.Position);
}
writer.Dispose();
// We don't know the distribution of scores and we need to bucket them, so we'll sort
// and divide into equal buckets.
OfflineSorter.SortInfo info = (new OfflineSorter()).Sort(tempInput, tempSorted);
tempInput.Delete();
FSTCompletionBuilder builder = new FSTCompletionBuilder(buckets, sorter = new ExternalRefSorter(new OfflineSorter()), sharedTailLength);
int inputLines = info.Lines;
reader = new OfflineSorter.ByteSequencesReader(tempSorted);
long line = 0;
int previousBucket = 0;
int previousScore = 0;
ByteArrayDataInput input = new ByteArrayDataInput();
BytesRef tmp1 = new BytesRef();
BytesRef tmp2 = new BytesRef();
while (reader.Read(tmp1))
{
input.Reset(tmp1.Bytes);
int currentScore = input.ReadInt32();
int bucket;
if (line > 0 && currentScore == previousScore)
{
bucket = previousBucket;
}
else
{
bucket = (int)(line * buckets / inputLines);
}
previousScore = currentScore;
previousBucket = bucket;
// Only append the input, discard the weight.
tmp2.Bytes = tmp1.Bytes;
tmp2.Offset = input.Position;
tmp2.Length = tmp1.Length - input.Position;
builder.Add(tmp2, bucket);
line++;
count++;
}
// The two FSTCompletions share the same automaton.
this.higherWeightsCompletion = builder.Build();
this.normalCompletion = new FSTCompletion(higherWeightsCompletion.FST, false, exactMatchFirst);
success = true;
}
finally
{
if (success)
{
IOUtils.Dispose(reader, writer, sorter);
}
else
{
IOUtils.DisposeWhileHandlingException(reader, writer, sorter);
}
tempInput.Delete();
tempSorted.Delete();
}
}
/// <summary>
/// weight -> cost </summary>
private static int EncodeWeight(long value)
{
if (value < int.MinValue || value > int.MaxValue)
{
throw new NotSupportedException("cannot encode value: " + value);
}
return (int)value;
}
public override IList<LookupResult> DoLookup(string key, IEnumerable<BytesRef> contexts, bool higherWeightsFirst, int num)
{
if (contexts != null)
{
throw new ArgumentException("this suggester doesn't support contexts");
}
IList<FSTCompletion.Completion> completions;
if (higherWeightsFirst)
{
completions = higherWeightsCompletion.DoLookup(key, num);
}
else
{
completions = normalCompletion.DoLookup(key, num);
}
List<LookupResult> results = new List<LookupResult>(completions.Count);
CharsRef spare = new CharsRef();
foreach (FSTCompletion.Completion c in completions)
{
spare.Grow(c.Utf8.Length);
UnicodeUtil.UTF8toUTF16(c.Utf8, spare);
results.Add(new LookupResult(spare.ToString(), c.Bucket));
}
return results;
}
/// <summary>
/// Returns the bucket (weight) as a Long for the provided key if it exists,
/// otherwise null if it does not.
/// </summary>
public virtual object Get(string key)
{
int bucket = normalCompletion.GetBucket(key);
return bucket == -1 ? (long?)null : bucket;
}
public override bool Store(DataOutput output)
{
lock (this)
{
output.WriteVInt64(count);
if (this.normalCompletion == null || normalCompletion.FST == null)
{
return false;
}
normalCompletion.FST.Save(output);
return true;
}
}
public override bool Load(DataInput input)
{
lock (this)
{
count = input.ReadVInt64();
this.higherWeightsCompletion = new FSTCompletion(new FST<object>(input, NoOutputs.Singleton));
this.normalCompletion = new FSTCompletion(higherWeightsCompletion.FST, false, exactMatchFirst);
return true;
}
}
public override long GetSizeInBytes()
{
long mem = RamUsageEstimator.ShallowSizeOf(this) + RamUsageEstimator.ShallowSizeOf(normalCompletion) + RamUsageEstimator.ShallowSizeOf(higherWeightsCompletion);
if (normalCompletion != null)
{
mem += normalCompletion.FST.GetSizeInBytes();
}
if (higherWeightsCompletion != null && (normalCompletion == null || normalCompletion.FST != higherWeightsCompletion.FST))
{
// the fst should be shared between the 2 completion instances, don't count it twice
mem += higherWeightsCompletion.FST.GetSizeInBytes();
}
return mem;
}
public override long Count => count;
}
}