using J2N.Text;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Shingle;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Codecs;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Index.Extensions;
using Lucene.Net.Store;
using Lucene.Net.Support;
using Lucene.Net.Util;
using Lucene.Net.Util.Fst;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using JCG = J2N.Collections.Generic;
using Directory = Lucene.Net.Store.Directory;
namespace Lucene.Net.Search.Suggest.Analyzing
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// TODO
// - test w/ syns
// - add pruning of low-freq ngrams?
/// <summary>
/// Builds an ngram model from the text sent to <see cref="Build(IInputIterator, double)"/>
/// and predicts based on the last grams-1 tokens in
/// the request sent to <see cref="DoLookup(string, IEnumerable{BytesRef}, bool, int)"/>. This tries to
/// handle the "long tail" of suggestions for when the
/// incoming query is a never-before-seen query string.
///
/// <para>Likely this suggester would only be used as a
/// fallback, when the primary suggester fails to find
/// any suggestions.
///
/// </para>
/// <para>Note that the weight for each suggestion is unused,
/// and the suggestions are the analyzed forms (so your
/// analysis process should normally be very "light").
///
/// </para>
/// <para>This uses the stupid backoff language model to smooth
/// scores across ngram models; see
/// <a href="http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.76.1126">
/// "Large language models in machine translation"</a> for details.
///
/// </para>
/// <para> From <see cref="DoLookup(string, IEnumerable{BytesRef}, bool, int)"/>, the key of each result is the
/// ngram token; the value is <see cref="long.MaxValue"/> * score (fixed
/// point, cast to long). Divide by <see cref="long.MaxValue"/> to get
/// the score back, which ranges from 0.0 to 1.0.
///
/// <c>onlyMorePopular</c> is unused.
///
/// @lucene.experimental
/// </para>
/// </summary>
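/// <example>
/// A minimal usage sketch (the analyzer shown and the
/// <c>inputIterator</c> variable are illustrative assumptions, not
/// part of this class):
/// <code>
/// var suggester = new FreeTextSuggester(new StandardAnalyzer(LuceneVersion.LUCENE_48));
/// suggester.Build(inputIterator); // any IInputIterator over your corpus; weights are ignored
/// foreach (LookupResult result in suggester.DoLookup("wizard of ", 5))
/// {
///     Console.WriteLine(result.Key + " => " + result.Value);
/// }
/// </code>
/// </example>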
public class FreeTextSuggester : Lookup
{
/// <summary>
/// Codec name used in the header for the saved model. </summary>
public const string CODEC_NAME = "freetextsuggest";
/// <summary>
/// Initial version of the saved model file format. </summary>
public const int VERSION_START = 0;
/// <summary>
/// Current version of the saved model file format. </summary>
public const int VERSION_CURRENT = VERSION_START;
/// <summary>
/// By default we use a bigram model. </summary>
public const int DEFAULT_GRAMS = 2;
// In general this could vary with gram, but the
// original paper seems to use this constant:
/// <summary>
/// The constant used for backoff smoothing; during
/// lookup, this means that if a given trigram did not
/// occur, and we backoff to the bigram, the overall score
/// will be 0.4 times what the bigram model would have
/// assigned.
/// </summary>
public const double ALPHA = 0.4;
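// Example: if the trigram "a b c" was never seen, the score backs off
// to ALPHA * score("b c") from the bigram model; backing off again to
// the unigram model gives ALPHA * ALPHA * score("c").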
/// <summary>
/// Holds 1gram, 2gram, 3gram models as a single FST. </summary>
private FST<long?> fst;
/// <summary>
/// Analyzer that will be used for analyzing suggestions at
/// index time.
/// </summary>
private readonly Analyzer indexAnalyzer;
private long totTokens;
/// <summary>
/// Analyzer that will be used for analyzing suggestions at
/// query time.
/// </summary>
private readonly Analyzer queryAnalyzer;
// 2 = bigram, 3 = trigram
private readonly int grams;
private readonly byte separator;
/// <summary>
/// Number of entries the lookup was built with </summary>
private long count = 0;
/// <summary>
/// The default character used to join multiple tokens
/// into a single ngram token (the ASCII "record separator"
/// control character, 0x1e). The input tokens produced
/// by the analyzer must not contain this character.
public const byte DEFAULT_SEPARATOR = 0x1e;
/// <summary>
/// Instantiate, using the provided analyzer for both
/// indexing and lookup, using bigram model by default.
/// </summary>
public FreeTextSuggester(Analyzer analyzer)
: this(analyzer, analyzer, DEFAULT_GRAMS)
{
}
/// <summary>
/// Instantiate, using the provided indexing and lookup
/// analyzers, using bigram model by default.
/// </summary>
public FreeTextSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer)
: this(indexAnalyzer, queryAnalyzer, DEFAULT_GRAMS)
{
}
/// <summary>
/// Instantiate, using the provided indexing and lookup
/// analyzers, with the specified model (2
/// = bigram, 3 = trigram, etc.).
/// </summary>
public FreeTextSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int grams)
: this(indexAnalyzer, queryAnalyzer, grams, DEFAULT_SEPARATOR)
{
}
/// <summary>
/// Instantiate, using the provided indexing and lookup
/// analyzers, and specified model (2 = bigram, 3 =
/// trigram, etc.). The <paramref name="separator"/> is passed to <see cref="ShingleFilter.SetTokenSeparator(string)"/>
/// to join multiple
/// tokens into a single ngram token; it must be an ASCII
/// (7-bit-clean) byte. No input tokens should have this
/// byte, otherwise <see cref="ArgumentException"/> is
/// thrown.
/// </summary>
public FreeTextSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int grams, byte separator)
{
this.grams = grams;
this.indexAnalyzer = AddShingles(indexAnalyzer);
this.queryAnalyzer = AddShingles(queryAnalyzer);
if (grams < 1)
{
throw new System.ArgumentException("grams must be >= 1");
}
if ((separator & 0x80) != 0)
{
throw new System.ArgumentException("separator must be simple ascii character");
}
this.separator = separator;
}
/// <summary>
/// Returns byte size of the underlying FST. </summary>
public override long GetSizeInBytes()
{
if (fst == null)
{
return 0;
}
return fst.GetSizeInBytes();
}
private class AnalyzingComparer : IComparer<BytesRef>
{
private readonly ByteArrayDataInput readerA = new ByteArrayDataInput();
private readonly ByteArrayDataInput readerB = new ByteArrayDataInput();
private readonly BytesRef scratchA = new BytesRef();
private readonly BytesRef scratchB = new BytesRef();
public virtual int Compare(BytesRef a, BytesRef b)
{
readerA.Reset(a.Bytes, a.Offset, a.Length);
readerB.Reset(b.Bytes, b.Offset, b.Length);
// By token:
scratchA.Length = (ushort)readerA.ReadInt16();
scratchA.Bytes = a.Bytes;
scratchA.Offset = readerA.Position;
scratchB.Bytes = b.Bytes;
scratchB.Length = (ushort)readerB.ReadInt16();
scratchB.Offset = readerB.Position;
int cmp = scratchA.CompareTo(scratchB);
if (cmp != 0)
{
return cmp;
}
readerA.SkipBytes(scratchA.Length);
readerB.SkipBytes(scratchB.Length);
// By length (smaller surface forms sorted first):
cmp = a.Length - b.Length;
if (cmp != 0)
{
return cmp;
}
// By surface form:
scratchA.Offset = readerA.Position;
scratchA.Length = a.Length - scratchA.Offset;
scratchB.Offset = readerB.Position;
scratchB.Length = b.Length - scratchB.Offset;
return scratchA.CompareTo(scratchB);
}
}
private Analyzer AddShingles(Analyzer other)
{
if (grams == 1)
{
return other;
}
else
{
// TODO: use ShingleAnalyzerWrapper?
// Tack on ShingleFilter to the end, to generate token ngrams:
return new AnalyzerWrapperAnonymousInnerClassHelper(this, other.Strategy, other);
}
}
private class AnalyzerWrapperAnonymousInnerClassHelper : AnalyzerWrapper
{
private readonly FreeTextSuggester outerInstance;
private readonly Analyzer other;
public AnalyzerWrapperAnonymousInnerClassHelper(FreeTextSuggester outerInstance, ReuseStrategy reuseStrategy, Analyzer other)
: base(reuseStrategy)
{
this.outerInstance = outerInstance;
this.other = other;
}
protected override Analyzer GetWrappedAnalyzer(string fieldName)
{
return other;
}
protected override TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components)
{
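// E.g. with grams=3 and separator <SEP>, the token stream for
// "a b c" becomes: a, a<SEP>b, a<SEP>b<SEP>c, b, b<SEP>c, c.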
ShingleFilter shingles = new ShingleFilter(components.TokenStream, 2, outerInstance.grams);
shingles.SetTokenSeparator(char.ToString((char)outerInstance.separator));
return new TokenStreamComponents(components.Tokenizer, shingles);
}
}
public override void Build(IInputIterator iterator)
{
Build(iterator, IndexWriterConfig.DEFAULT_RAM_BUFFER_SIZE_MB);
}
/// <summary>
/// Build the suggest index, using up to the specified
/// amount of temporary RAM while building. Note that
/// the weights for the suggestions are ignored.
/// </summary>
public virtual void Build(IInputIterator iterator, double ramBufferSizeMB)
{
if (iterator.HasPayloads)
{
throw new System.ArgumentException("this suggester doesn't support payloads");
}
if (iterator.HasContexts)
{
throw new System.ArgumentException("this suggester doesn't support contexts");
}
string prefix = this.GetType().Name;
var directory = OfflineSorter.DefaultTempDir();
// LUCENENET specific - using GetRandomFileName() instead of picking a random int
DirectoryInfo tempIndexPath = null;
while (true)
{
tempIndexPath = new DirectoryInfo(Path.Combine(directory.FullName, prefix + ".index." + Path.GetFileNameWithoutExtension(Path.GetRandomFileName())));
tempIndexPath.Create();
if (System.IO.Directory.Exists(tempIndexPath.FullName))
{
break;
}
}
Directory dir = FSDirectory.Open(tempIndexPath);
try
{
#pragma warning disable 612, 618
IndexWriterConfig iwc = new IndexWriterConfig(LuceneVersion.LUCENE_CURRENT, indexAnalyzer);
#pragma warning restore 612, 618
iwc.SetOpenMode(OpenMode.CREATE);
iwc.SetRAMBufferSizeMB(ramBufferSizeMB);
IndexWriter writer = new IndexWriter(dir, iwc);
var ft = new FieldType(TextField.TYPE_NOT_STORED);
// TODO: if only we had IndexOptions.TERMS_ONLY...
ft.IndexOptions = IndexOptions.DOCS_AND_FREQS;
ft.OmitNorms = true;
ft.Freeze();
Document doc = new Document();
Field field = new Field("body", "", ft);
doc.Add(field);
totTokens = 0;
IndexReader reader = null;
bool success = false;
count = 0;
try
{
while (true)
{
BytesRef surfaceForm = iterator.Next();
if (surfaceForm == null)
{
break;
}
field.SetStringValue(surfaceForm.Utf8ToString());
writer.AddDocument(doc);
count++;
}
reader = DirectoryReader.Open(writer, false);
Terms terms = MultiFields.GetTerms(reader, "body");
if (terms == null)
{
throw new System.ArgumentException("need at least one suggestion");
}
// Move all ngrams into an FST:
TermsEnum termsEnum = terms.GetIterator(null);
Outputs<long?> outputs = PositiveInt32Outputs.Singleton;
Builder<long?> builder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, outputs);
Int32sRef scratchInts = new Int32sRef();
while (true)
{
BytesRef term = termsEnum.Next();
if (term == null)
{
break;
}
int ngramCount = CountGrams(term);
if (ngramCount > grams)
{
throw new System.ArgumentException("tokens must not contain separator byte; got token=" + term + " but gramCount=" + ngramCount + ", which is greater than expected max ngram size=" + grams);
}
if (ngramCount == 1)
{
totTokens += termsEnum.TotalTermFreq;
}
builder.Add(Lucene.Net.Util.Fst.Util.ToInt32sRef(term, scratchInts), EncodeWeight(termsEnum.TotalTermFreq));
}
fst = builder.Finish();
if (fst == null)
{
throw new System.ArgumentException("need at least one suggestion");
}
//System.out.println("FST: " + fst.getNodeCount() + " nodes");
/*
PrintWriter pw = new PrintWriter("/x/tmp/out.dot");
Util.toDot(fst, pw, true, true);
pw.close();
*/
success = true;
}
finally
{
if (success)
{
IOUtils.Dispose(writer, reader);
}
else
{
IOUtils.DisposeWhileHandlingException(writer, reader);
}
}
}
finally
{
try
{
IOUtils.Dispose(dir);
}
finally
{
// LUCENENET specific - since we are removing the entire directory anyway,
// it doesn't make sense to first do a loop in order to remove the files.
// Let the System.IO.Directory.Delete() method handle that.
// We also need to dispose the Directory instance first before deleting from disk.
try
{
System.IO.Directory.Delete(tempIndexPath.FullName, true);
}
catch (Exception e)
{
throw new InvalidOperationException("failed to remove " + tempIndexPath, e);
}
}
}
}
public override bool Store(DataOutput output)
{
CodecUtil.WriteHeader(output, CODEC_NAME, VERSION_CURRENT);
output.WriteVInt64(count);
output.WriteByte(separator);
output.WriteVInt32(grams);
output.WriteVInt64(totTokens);
fst.Save(output);
return true;
}
public override bool Load(DataInput input)
{
CodecUtil.CheckHeader(input, CODEC_NAME, VERSION_START, VERSION_START);
count = input.ReadVInt64();
var separatorOrig = (sbyte)input.ReadByte();
if (separatorOrig != separator)
{
throw new InvalidOperationException("separator=" + separator + " is incorrect: original model was built with separator=" + separatorOrig);
}
int gramsOrig = input.ReadVInt32();
if (gramsOrig != grams)
{
throw new InvalidOperationException("grams=" + grams + " is incorrect: original model was built with grams=" + gramsOrig);
}
totTokens = input.ReadVInt64();
fst = new FST<long?>(input, PositiveInt32Outputs.Singleton);
return true;
}
public override IList<LookupResult> DoLookup(string key, bool onlyMorePopular, int num) // ignored
{
return DoLookup(key, null, onlyMorePopular, num);
}
/// <summary>
/// Lookup, without any context. </summary>
public virtual IList<LookupResult> DoLookup(string key, int num)
{
return DoLookup(key, null, true, num);
}
public override IList<LookupResult> DoLookup(string key, IEnumerable<BytesRef> contexts, /* ignored */ bool onlyMorePopular, int num)
{
try
{
return DoLookup(key, contexts, num);
}
catch (IOException ioe)
{
// bogus:
throw new Exception(ioe.ToString(), ioe);
}
}
public override long Count
{
get
{
return count;
}
}
private int CountGrams(BytesRef token)
{
int count = 1;
for (int i = 0; i < token.Length; i++)
{
if (token.Bytes[token.Offset + i] == separator)
{
count++;
}
}
return count;
}
/// <summary>
/// Retrieve suggestions.
/// </summary>
public virtual IList<LookupResult> DoLookup(string key, IEnumerable<BytesRef> contexts, int num)
{
if (contexts != null)
{
throw new System.ArgumentException("this suggester doesn't support contexts");
}
TokenStream ts = queryAnalyzer.GetTokenStream("", key.ToString());
try
{
ITermToBytesRefAttribute termBytesAtt = ts.AddAttribute<ITermToBytesRefAttribute>();
IOffsetAttribute offsetAtt = ts.AddAttribute<IOffsetAttribute>();
IPositionLengthAttribute posLenAtt = ts.AddAttribute<IPositionLengthAttribute>();
IPositionIncrementAttribute posIncAtt = ts.AddAttribute<IPositionIncrementAttribute>();
ts.Reset();
var lastTokens = new BytesRef[grams];
//System.out.println("lookup: key='" + key + "'");
// Run full analysis, but save only the
// last 1gram, last 2gram, etc.:
BytesRef tokenBytes = termBytesAtt.BytesRef;
int maxEndOffset = -1;
bool sawRealToken = false;
while (ts.IncrementToken())
{
termBytesAtt.FillBytesRef();
sawRealToken |= tokenBytes.Length > 0;
// TODO: this is somewhat iffy; today, ShingleFilter
// sets posLen to the gram count; maybe we should make
// a separate dedicated att for this?
int gramCount = posLenAtt.PositionLength;
Debug.Assert(gramCount <= grams);
// Safety: make sure the recalculated count "agrees":
if (CountGrams(tokenBytes) != gramCount)
{
throw new System.ArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + CountGrams(tokenBytes));
}
maxEndOffset = Math.Max(maxEndOffset, offsetAtt.EndOffset);
lastTokens[gramCount - 1] = BytesRef.DeepCopyOf(tokenBytes);
}
ts.End();
if (!sawRealToken)
{
throw new System.ArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
}
// Carefully fill last tokens with _ tokens;
// ShingleFilter apparently won't emit "only hole"
// tokens:
int endPosInc = posIncAtt.PositionIncrement;
// Note this will also be true if input is the empty
// string (in which case we saw no tokens and
// maxEndOffset is still -1), which in fact works out OK
// because we fill the unigram with an empty BytesRef
// below:
bool lastTokenEnded = offsetAtt.EndOffset > maxEndOffset || endPosInc > 0;
//System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.EndOffset);
if (lastTokenEnded)
{
//System.out.println(" lastTokenEnded");
// If user hit space after the last token, then
// "upgrade" all tokens. This way "foo " will suggest
// all bigrams starting w/ foo, and not any unigrams
// starting with "foo":
for (int i = grams - 1; i > 0; i--)
{
BytesRef token = lastTokens[i - 1];
if (token == null)
{
continue;
}
token.Grow(token.Length + 1);
token.Bytes[token.Length] = separator;
token.Length++;
lastTokens[i] = token;
}
lastTokens[0] = new BytesRef();
}
var arc = new FST.Arc<long?>();
var bytesReader = fst.GetBytesReader();
// Try highest order models first, and if they return
// results, return that; else, fallback:
double backoff = 1.0;
List<LookupResult> results = new List<LookupResult>(num);
// We only add a given suffix once, from the highest
// order model that saw it; for subsequent lower order
// models we skip it:
var seen = new JCG.HashSet<BytesRef>();
for (int gram = grams - 1; gram >= 0; gram--)
{
BytesRef token = lastTokens[gram];
// Don't make unigram predictions from empty string:
if (token == null || (token.Length == 0 && key.Length > 0))
{
// Input didn't have enough tokens:
//System.out.println(" gram=" + gram + ": skip: not enough input");
continue;
}
if (endPosInc > 0 && gram <= endPosInc)
{
// Skip hole-only predictions; in theory we
// shouldn't have to do this, but we'd need to fix
// ShingleFilter to produce only-hole tokens:
//System.out.println(" break: only holes now");
break;
}
//System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());
// TODO: we could add fuzziness here
// match the prefix portion exactly
//Pair<Long,BytesRef> prefixOutput = null;
long? prefixOutput = null;
try
{
prefixOutput = LookupPrefix(fst, bytesReader, token, arc);
}
catch (IOException bogus)
{
throw new Exception(bogus.ToString(), bogus);
}
//System.out.println(" prefixOutput=" + prefixOutput);
if (prefixOutput == null)
{
// This model never saw this prefix, e.g. the
// trigram model never saw context "purple mushroom"
backoff *= ALPHA;
continue;
}
// TODO: we could do this division at build time, and
// bake it into the FST?
// Denominator for computing scores from current
// model's predictions:
long contextCount = totTokens;
BytesRef lastTokenFragment = null;
for (int i = token.Length - 1; i >= 0; i--)
{
if (token.Bytes[token.Offset + i] == separator)
{
BytesRef context = new BytesRef(token.Bytes, token.Offset, i);
long? output = Lucene.Net.Util.Fst.Util.Get(fst, Lucene.Net.Util.Fst.Util.ToInt32sRef(context, new Int32sRef()));
Debug.Assert(output != null);
contextCount = DecodeWeight(output);
lastTokenFragment = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
break;
}
}
BytesRef finalLastToken;
if (lastTokenFragment == null)
{
finalLastToken = BytesRef.DeepCopyOf(token);
}
else
{
finalLastToken = BytesRef.DeepCopyOf(lastTokenFragment);
}
Debug.Assert(finalLastToken.Offset == 0);
CharsRef spare = new CharsRef();
// complete top-N
Util.Fst.Util.TopResults<long?> completions = null;
try
{
// Because we store multiple models in one FST
// (1gram, 2gram, 3gram), we must restrict the
// search so that it only considers the current
// model. For highest order model, this is not
// necessary since all completions in the FST
// must be from this model, but for lower order
// models we have to filter out the higher order
// ones:
// Must do num+seen.size() for queue depth because we may
// reject up to seen.size() paths in acceptResult():
Util.Fst.Util.TopNSearcher<long?> searcher = new TopNSearcherAnonymousInnerClassHelper(this, fst, num, num + seen.Count, weightComparer, seen, finalLastToken);
// since this search is initialized with a single start node
// it is okay to start with an empty input path here
searcher.AddStartPaths(arc, prefixOutput, true, new Int32sRef());
completions = searcher.Search();
Debug.Assert(completions.IsComplete);
}
catch (IOException bogus)
{
throw new Exception(bogus.ToString(), bogus);
}
int prefixLength = token.Length;
BytesRef suffix = new BytesRef(8);
//System.out.println(" " + completions.length + " completions");
foreach (Util.Fst.Util.Result<long?> completion in completions)
{
token.Length = prefixLength;
// append suffix
Util.Fst.Util.ToBytesRef(completion.Input, suffix);
token.Append(suffix);
//System.out.println(" completion " + token.utf8ToString());
// Skip this path if a higher-order model already
// saw/predicted its last token:
BytesRef lastToken = token;
for (int i = token.Length - 1; i >= 0; i--)
{
if (token.Bytes[token.Offset + i] == separator)
{
Debug.Assert(token.Length - i - 1 > 0);
lastToken = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
break;
}
}
if (seen.Contains(lastToken))
{
//System.out.println(" skip dup " + lastToken.utf8ToString());
goto nextCompletionContinue;
}
seen.Add(BytesRef.DeepCopyOf(lastToken));
spare.Grow(token.Length);
UnicodeUtil.UTF8toUTF16(token, spare);
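// Fixed-point score: backoff * ngramFreq / contextFreq lies in
// [0.0, 1.0]; scaling by long.MaxValue encodes it as a long (the
// caller divides by long.MaxValue to recover the score).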
LookupResult result = new LookupResult(spare.ToString(),
// LUCENENET NOTE: We need to calculate this as decimal because when using double it can sometimes
// return numbers that are greater than long.MaxValue, which results in a negative long number.
(long)(long.MaxValue * (decimal)backoff * ((decimal)DecodeWeight(completion.Output)) / contextCount));
results.Add(result);
Debug.Assert(results.Count == seen.Count);
//System.out.println(" add result=" + result);
nextCompletionContinue:;
}
backoff *= ALPHA;
}
results.Sort(new ComparerAnonymousInnerClassHelper(this));
if (results.Count > num)
{
results.SubList(num, results.Count).Clear();
}
return results;
}
finally
{
IOUtils.DisposeWhileHandlingException(ts);
}
}
private class TopNSearcherAnonymousInnerClassHelper : Util.Fst.Util.TopNSearcher<long?>
{
private readonly FreeTextSuggester outerInstance;
private IEnumerable<BytesRef> seen;
private BytesRef finalLastToken;
public TopNSearcherAnonymousInnerClassHelper(
FreeTextSuggester outerInstance,
FST<long?> fst,
int num,
int size,
IComparer<long?> weightComparer,
IEnumerable<BytesRef> seen,
BytesRef finalLastToken)
: base(fst, num, size, weightComparer)
{
this.outerInstance = outerInstance;
this.seen = seen;
this.finalLastToken = finalLastToken;
scratchBytes = new BytesRef();
}
private BytesRef scratchBytes;
protected override void AddIfCompetitive(Util.Fst.Util.FSTPath<long?> path)
{
if (path.Arc.Label != outerInstance.separator)
{
//System.out.println(" keep path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
base.AddIfCompetitive(path);
}
else
{
//System.out.println(" prevent path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
}
}
protected override bool AcceptResult(Int32sRef input, long? output)
{
Util.Fst.Util.ToBytesRef(input, scratchBytes);
finalLastToken.Grow(finalLastToken.Length + scratchBytes.Length);
int lenSav = finalLastToken.Length;
finalLastToken.Append(scratchBytes);
//System.out.println(" accept? input='" + scratchBytes.utf8ToString() + "'; lastToken='" + finalLastToken.utf8ToString() + "'; return " + (seen.contains(finalLastToken) == false));
bool ret = seen.Contains(finalLastToken) == false;
finalLastToken.Length = lenSav;
return ret;
}
}
private sealed class ComparerAnonymousInnerClassHelper : IComparer<Lookup.LookupResult>
{
private readonly FreeTextSuggester outerInstance;
public ComparerAnonymousInnerClassHelper(FreeTextSuggester outerInstance)
{
this.outerInstance = outerInstance;
}
public int Compare(LookupResult a, LookupResult b)
{
if (a.Value > b.Value)
{
return -1;
}
else if (a.Value < b.Value)
{
return 1;
}
else
{
// Tie break by UTF16 sort order:
return a.Key.CompareToOrdinal(b.Key);
}
}
}
/// <summary>
/// weight -> cost </summary>
private long EncodeWeight(long ngramCount)
{
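// Store the count as (long.MaxValue - count) so that more frequent
// ngrams get smaller FST outputs; the top-N search is a min-cost
// search, so the most frequent completions surface first.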
return long.MaxValue - ngramCount;
}
/// <summary>
/// cost -> weight </summary>
//private long decodeWeight(Pair<Long,BytesRef> output) {
private static long DecodeWeight(long? output)
{
Debug.Assert(output != null);
return (int)(long.MaxValue - output); // LUCENENET TODO: Perhaps a Java Lucene bug? Why cast to int when returning long?
}
// NOTE: copied from WFSTCompletionLookup & tweaked
private long? LookupPrefix(FST<long?> fst, FST.BytesReader bytesReader, BytesRef scratch, FST.Arc<long?> arc)
{
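// Walk the FST one input byte at a time, summing arc outputs; if any
// byte has no matching arc, the prefix is absent and we return null.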
long? output = fst.Outputs.NoOutput;
fst.GetFirstArc(arc);
var bytes = scratch.Bytes;
var pos = scratch.Offset;
var end = pos + scratch.Length;
while (pos < end)
{
if (fst.FindTargetArc(bytes[pos++] & 0xff, arc, arc, bytesReader) == null)
{
return null;
}
else
{
output = fst.Outputs.Add(output, arc.Output);
}
}
return output;
}
internal static readonly IComparer<long?> weightComparer = new ComparerAnonymousInnerClassHelper2();
private sealed class ComparerAnonymousInnerClassHelper2 : IComparer<long?>
{
public ComparerAnonymousInnerClassHelper2()
{
}
public int Compare(long? left, long? right)
{
return Comparer<long?>.Default.Compare(left, right);
}
}
/// <summary>
/// Returns the weight associated with an input string,
/// or null if it does not exist.
/// </summary>
public virtual object Get(string key)
{
throw new NotSupportedException();
}
}
}