| using J2N.Text; |
| using Lucene.Net.Analysis; |
| using Lucene.Net.Analysis.Shingle; |
| using Lucene.Net.Analysis.TokenAttributes; |
| using Lucene.Net.Codecs; |
| using Lucene.Net.Documents; |
| using Lucene.Net.Index; |
| using Lucene.Net.Index.Extensions; |
| using Lucene.Net.Store; |
| using Lucene.Net.Support; |
| using Lucene.Net.Util; |
| using Lucene.Net.Util.Fst; |
| using System; |
| using System.Collections.Generic; |
| using System.Diagnostics; |
| using System.IO; |
| using System.Linq; |
| using JCG = J2N.Collections.Generic; |
| using Directory = Lucene.Net.Store.Directory; |
| |
| namespace Lucene.Net.Search.Suggest.Analyzing |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| // TODO |
| // - test w/ syns |
| // - add pruning of low-freq ngrams? |
| |
| /// <summary> |
| /// Builds an ngram model from the text sent to <see cref="Build(IInputIterator, double)"/> |
| /// and predicts based on the last grams-1 tokens in |
| /// the request sent to <see cref="DoLookup(string, IEnumerable{BytesRef}, bool, int)"/>. This tries to |
| /// handle the "long tail" of suggestions for when the |
| /// incoming query is a never before seen query string. |
| /// |
| /// <para>Likely this suggester would only be used as a |
| /// fallback, when the primary suggester fails to find |
| /// any suggestions. |
| /// |
| /// </para> |
| /// <para>Note that the weight for each suggestion is unused, |
| /// and the suggestions are the analyzed forms (so your |
| /// analysis process should normally be very "light"). |
| /// |
| /// </para> |
| /// <para>This uses the stupid backoff language model to smooth |
| /// scores across ngram models; see |
| /// <a href="http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.76.1126"> |
| /// "Large language models in machine translation"</a> for details. |
| /// |
| /// </para> |
| /// <para> From <see cref="DoLookup(string, IEnumerable{BytesRef}, bool, int)"/>, the key of each result is the |
| /// ngram token; the value is <see cref="long.MaxValue"/> * score (fixed |
| /// point, cast to long). Divide by <see cref="long.MaxValue"/> to get |
| /// the score back, which ranges from 0.0 to 1.0. |
| /// |
| /// <c>onlyMorePopular</c> is unused. |
| /// |
| /// @lucene.experimental |
| /// </para> |
| /// </summary> |
| public class FreeTextSuggester : Lookup |
| { |
| |
| /// <summary> |
| /// Codec name used in the header for the saved model. </summary> |
| public const string CODEC_NAME = "freetextsuggest"; |
| |
| /// <summary> |
| /// Initial version of the the saved model file format. </summary> |
| public const int VERSION_START = 0; |
| |
| /// <summary> |
| /// Current version of the the saved model file format. </summary> |
| public const int VERSION_CURRENT = VERSION_START; |
| |
| /// <summary> |
| /// By default we use a bigram model. </summary> |
| public const int DEFAULT_GRAMS = 2; |
| |
| // In general this could vary with gram, but the |
| // original paper seems to use this constant: |
| /// <summary> |
| /// The constant used for backoff smoothing; during |
| /// lookup, this means that if a given trigram did not |
| /// occur, and we backoff to the bigram, the overall score |
| /// will be 0.4 times what the bigram model would have |
| /// assigned. |
| /// </summary> |
| public const double ALPHA = 0.4; |
| |
| /// <summary> |
| /// Holds 1gram, 2gram, 3gram models as a single FST. </summary> |
| private FST<long?> fst; |
| |
| /// <summary> |
| /// Analyzer that will be used for analyzing suggestions at |
| /// index time. |
| /// </summary> |
| private readonly Analyzer indexAnalyzer; |
| |
| private long totTokens; |
| |
| /// <summary> |
| /// Analyzer that will be used for analyzing suggestions at |
| /// query time. |
| /// </summary> |
| private readonly Analyzer queryAnalyzer; |
| |
| // 2 = bigram, 3 = trigram |
| private readonly int grams; |
| |
| private readonly byte separator; |
| |
| /// <summary> |
| /// Number of entries the lookup was built with </summary> |
| private long count = 0; |
| |
| /// <summary> |
| /// The default character used to join multiple tokens |
| /// into a single ngram token. The input tokens produced |
| /// by the analyzer must not contain this character. |
| /// </summary> |
| public const byte DEFAULT_SEPARATOR = 0x1e; |
| |
| /// <summary> |
| /// Instantiate, using the provided analyzer for both |
| /// indexing and lookup, using bigram model by default. |
| /// </summary> |
| public FreeTextSuggester(Analyzer analyzer) |
| : this(analyzer, analyzer, DEFAULT_GRAMS) |
| { |
| } |
| |
| /// <summary> |
| /// Instantiate, using the provided indexing and lookup |
| /// analyzers, using bigram model by default. |
| /// </summary> |
| public FreeTextSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) |
| : this(indexAnalyzer, queryAnalyzer, DEFAULT_GRAMS) |
| { |
| } |
| |
| /// <summary> |
| /// Instantiate, using the provided indexing and lookup |
| /// analyzers, with the specified model (2 |
| /// = bigram, 3 = trigram, etc.). |
| /// </summary> |
| public FreeTextSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int grams) |
| : this(indexAnalyzer, queryAnalyzer, grams, DEFAULT_SEPARATOR) |
| { |
| } |
| |
| /// <summary> |
| /// Instantiate, using the provided indexing and lookup |
| /// analyzers, and specified model (2 = bigram, 3 = |
| /// trigram ,etc.). The <paramref name="separator"/> is passed to <see cref="ShingleFilter.SetTokenSeparator(string)"/> |
| /// to join multiple |
| /// tokens into a single ngram token; it must be an ascii |
| /// (7-bit-clean) byte. No input tokens should have this |
| /// byte, otherwise <see cref="ArgumentException"/> is |
| /// thrown. |
| /// </summary> |
| public FreeTextSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int grams, byte separator) |
| { |
| this.grams = grams; |
| this.indexAnalyzer = AddShingles(indexAnalyzer); |
| this.queryAnalyzer = AddShingles(queryAnalyzer); |
| if (grams < 1) |
| { |
| throw new System.ArgumentException("grams must be >= 1"); |
| } |
| if ((separator & 0x80) != 0) |
| { |
| throw new System.ArgumentException("separator must be simple ascii character"); |
| } |
| this.separator = separator; |
| } |
| |
| /// <summary> |
| /// Returns byte size of the underlying FST. </summary> |
| public override long GetSizeInBytes() |
| { |
| if (fst == null) |
| { |
| return 0; |
| } |
| return fst.GetSizeInBytes(); |
| } |
| |
| private class AnalyzingComparer : IComparer<BytesRef> |
| { |
| private readonly ByteArrayDataInput readerA = new ByteArrayDataInput(); |
| private readonly ByteArrayDataInput readerB = new ByteArrayDataInput(); |
| private readonly BytesRef scratchA = new BytesRef(); |
| private readonly BytesRef scratchB = new BytesRef(); |
| |
| public virtual int Compare(BytesRef a, BytesRef b) |
| { |
| readerA.Reset(a.Bytes, a.Offset, a.Length); |
| readerB.Reset(b.Bytes, b.Offset, b.Length); |
| |
| // By token: |
| scratchA.Length = (ushort)readerA.ReadInt16(); |
| scratchA.Bytes = a.Bytes; |
| scratchA.Offset = readerA.Position; |
| |
| scratchB.Bytes = b.Bytes; |
| scratchB.Length = (ushort)readerB.ReadInt16(); |
| scratchB.Offset = readerB.Position; |
| |
| int cmp = scratchA.CompareTo(scratchB); |
| if (cmp != 0) |
| { |
| return cmp; |
| } |
| readerA.SkipBytes(scratchA.Length); |
| readerB.SkipBytes(scratchB.Length); |
| |
| // By length (smaller surface forms sorted first): |
| cmp = a.Length - b.Length; |
| if (cmp != 0) |
| { |
| return cmp; |
| } |
| |
| // By surface form: |
| scratchA.Offset = readerA.Position; |
| scratchA.Length = a.Length - scratchA.Offset; |
| scratchB.Offset = readerB.Position; |
| scratchB.Length = b.Length - scratchB.Offset; |
| |
| return scratchA.CompareTo(scratchB); |
| } |
| } |
| |
| private Analyzer AddShingles(Analyzer other) |
| { |
| if (grams == 1) |
| { |
| return other; |
| } |
| else |
| { |
| // TODO: use ShingleAnalyzerWrapper? |
| // Tack on ShingleFilter to the end, to generate token ngrams: |
| return new AnalyzerWrapperAnonymousInnerClassHelper(this, other.Strategy, other); |
| } |
| } |
| |
| private class AnalyzerWrapperAnonymousInnerClassHelper : AnalyzerWrapper |
| { |
| private readonly FreeTextSuggester outerInstance; |
| private readonly Analyzer other; |
| |
| public AnalyzerWrapperAnonymousInnerClassHelper(FreeTextSuggester outerInstance, ReuseStrategy reuseStrategy, Analyzer other) |
| : base(reuseStrategy) |
| { |
| this.outerInstance = outerInstance; |
| this.other = other; |
| } |
| |
| protected override Analyzer GetWrappedAnalyzer(string fieldName) |
| { |
| return other; |
| } |
| |
| protected override TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components) |
| { |
| ShingleFilter shingles = new ShingleFilter(components.TokenStream, 2, outerInstance.grams); |
| shingles.SetTokenSeparator(char.ToString((char)outerInstance.separator)); |
| return new TokenStreamComponents(components.Tokenizer, shingles); |
| } |
| } |
| |
| public override void Build(IInputIterator iterator) |
| { |
| Build(iterator, IndexWriterConfig.DEFAULT_RAM_BUFFER_SIZE_MB); |
| } |
| |
| /// <summary> |
| /// Build the suggest index, using up to the specified |
| /// amount of temporary RAM while building. Note that |
| /// the weights for the suggestions are ignored. |
| /// </summary> |
| public virtual void Build(IInputIterator iterator, double ramBufferSizeMB) |
| { |
| if (iterator.HasPayloads) |
| { |
| throw new System.ArgumentException("this suggester doesn't support payloads"); |
| } |
| if (iterator.HasContexts) |
| { |
| throw new System.ArgumentException("this suggester doesn't support contexts"); |
| } |
| |
| string prefix = this.GetType().Name; |
| var directory = OfflineSorter.DefaultTempDir(); |
| |
| // LUCENENET specific - using GetRandomFileName() instead of picking a random int |
| DirectoryInfo tempIndexPath = null; |
| while (true) |
| { |
| tempIndexPath = new DirectoryInfo(Path.Combine(directory.FullName, prefix + ".index." + Path.GetFileNameWithoutExtension(Path.GetRandomFileName()))); |
| tempIndexPath.Create(); |
| if (System.IO.Directory.Exists(tempIndexPath.FullName)) |
| { |
| break; |
| } |
| } |
| |
| Directory dir = FSDirectory.Open(tempIndexPath); |
| try |
| { |
| #pragma warning disable 612, 618 |
| IndexWriterConfig iwc = new IndexWriterConfig(LuceneVersion.LUCENE_CURRENT, indexAnalyzer); |
| #pragma warning restore 612, 618 |
| iwc.SetOpenMode(OpenMode.CREATE); |
| iwc.SetRAMBufferSizeMB(ramBufferSizeMB); |
| IndexWriter writer = new IndexWriter(dir, iwc); |
| |
| var ft = new FieldType(TextField.TYPE_NOT_STORED); |
| // TODO: if only we had IndexOptions.TERMS_ONLY... |
| ft.IndexOptions = IndexOptions.DOCS_AND_FREQS; |
| ft.OmitNorms = true; |
| ft.Freeze(); |
| |
| Document doc = new Document(); |
| Field field = new Field("body", "", ft); |
| doc.Add(field); |
| |
| totTokens = 0; |
| IndexReader reader = null; |
| |
| bool success = false; |
| count = 0; |
| try |
| { |
| while (true) |
| { |
| BytesRef surfaceForm = iterator.Next(); |
| if (surfaceForm == null) |
| { |
| break; |
| } |
| field.SetStringValue(surfaceForm.Utf8ToString()); |
| writer.AddDocument(doc); |
| count++; |
| } |
| reader = DirectoryReader.Open(writer, false); |
| |
| Terms terms = MultiFields.GetTerms(reader, "body"); |
| if (terms == null) |
| { |
| throw new System.ArgumentException("need at least one suggestion"); |
| } |
| |
| // Move all ngrams into an FST: |
| TermsEnum termsEnum = terms.GetIterator(null); |
| |
| Outputs<long?> outputs = PositiveInt32Outputs.Singleton; |
| Builder<long?> builder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, outputs); |
| |
| Int32sRef scratchInts = new Int32sRef(); |
| while (true) |
| { |
| BytesRef term = termsEnum.Next(); |
| if (term == null) |
| { |
| break; |
| } |
| int ngramCount = CountGrams(term); |
| if (ngramCount > grams) |
| { |
| throw new System.ArgumentException("tokens must not contain separator byte; got token=" + term + " but gramCount=" + ngramCount + ", which is greater than expected max ngram size=" + grams); |
| } |
| if (ngramCount == 1) |
| { |
| totTokens += termsEnum.TotalTermFreq; |
| } |
| |
| builder.Add(Lucene.Net.Util.Fst.Util.ToInt32sRef(term, scratchInts), EncodeWeight(termsEnum.TotalTermFreq)); |
| } |
| |
| fst = builder.Finish(); |
| if (fst == null) |
| { |
| throw new System.ArgumentException("need at least one suggestion"); |
| } |
| //System.out.println("FST: " + fst.getNodeCount() + " nodes"); |
| |
| /* |
| PrintWriter pw = new PrintWriter("/x/tmp/out.dot"); |
| Util.toDot(fst, pw, true, true); |
| pw.close(); |
| */ |
| |
| success = true; |
| } |
| finally |
| { |
| if (success) |
| { |
| IOUtils.Dispose(writer, reader); |
| } |
| else |
| { |
| IOUtils.DisposeWhileHandlingException(writer, reader); |
| } |
| } |
| } |
| finally |
| { |
| try |
| { |
| IOUtils.Dispose(dir); |
| } |
| finally |
| { |
| // LUCENENET specific - since we are removing the entire directory anyway, |
| // it doesn't make sense to first do a loop in order remove the files. |
| // Let the System.IO.Directory.Delete() method handle that. |
| // We also need to dispose the Directory instance first before deleting from disk. |
| try |
| { |
| System.IO.Directory.Delete(tempIndexPath.FullName, true); |
| } |
| catch (Exception e) |
| { |
| throw new InvalidOperationException("failed to remove " + tempIndexPath, e); |
| } |
| } |
| } |
| } |
| |
| public override bool Store(DataOutput output) |
| { |
| CodecUtil.WriteHeader(output, CODEC_NAME, VERSION_CURRENT); |
| output.WriteVInt64(count); |
| output.WriteByte(separator); |
| output.WriteVInt32(grams); |
| output.WriteVInt64(totTokens); |
| fst.Save(output); |
| return true; |
| } |
| |
| public override bool Load(DataInput input) |
| { |
| CodecUtil.CheckHeader(input, CODEC_NAME, VERSION_START, VERSION_START); |
| count = input.ReadVInt64(); |
| var separatorOrig = (sbyte)input.ReadByte(); |
| if (separatorOrig != separator) |
| { |
| throw new InvalidOperationException("separator=" + separator + " is incorrect: original model was built with separator=" + separatorOrig); |
| } |
| int gramsOrig = input.ReadVInt32(); |
| if (gramsOrig != grams) |
| { |
| throw new InvalidOperationException("grams=" + grams + " is incorrect: original model was built with grams=" + gramsOrig); |
| } |
| totTokens = input.ReadVInt64(); |
| |
| fst = new FST<long?>(input, PositiveInt32Outputs.Singleton); |
| |
| return true; |
| } |
| |
| public override IList<LookupResult> DoLookup(string key, bool onlyMorePopular, int num) // ignored |
| { |
| return DoLookup(key, null, onlyMorePopular, num); |
| } |
| |
| /// <summary> |
| /// Lookup, without any context. </summary> |
| public virtual IList<LookupResult> DoLookup(string key, int num) |
| { |
| return DoLookup(key, null, true, num); |
| } |
| |
| public override IList<LookupResult> DoLookup(string key, IEnumerable<BytesRef> contexts, /* ignored */ bool onlyMorePopular, int num) |
| { |
| try |
| { |
| return DoLookup(key, contexts, num); |
| } |
| catch (IOException ioe) |
| { |
| // bogus: |
| throw new Exception(ioe.ToString(), ioe); |
| } |
| } |
| |
| public override long Count |
| { |
| get |
| { |
| return count; |
| } |
| } |
| |
| private int CountGrams(BytesRef token) |
| { |
| int count = 1; |
| for (int i = 0; i < token.Length; i++) |
| { |
| if (token.Bytes[token.Offset + i] == separator) |
| { |
| count++; |
| } |
| } |
| |
| return count; |
| } |
| |
| /// <summary> |
| /// Retrieve suggestions. |
| /// </summary> |
| public virtual IList<LookupResult> DoLookup(string key, IEnumerable<BytesRef> contexts, int num) |
| { |
| if (contexts != null) |
| { |
| throw new System.ArgumentException("this suggester doesn't support contexts"); |
| } |
| |
| TokenStream ts = queryAnalyzer.GetTokenStream("", key.ToString()); |
| try |
| { |
| ITermToBytesRefAttribute termBytesAtt = ts.AddAttribute<ITermToBytesRefAttribute>(); |
| IOffsetAttribute offsetAtt = ts.AddAttribute<IOffsetAttribute>(); |
| IPositionLengthAttribute posLenAtt = ts.AddAttribute<IPositionLengthAttribute>(); |
| IPositionIncrementAttribute posIncAtt = ts.AddAttribute<IPositionIncrementAttribute>(); |
| ts.Reset(); |
| |
| var lastTokens = new BytesRef[grams]; |
| //System.out.println("lookup: key='" + key + "'"); |
| |
| // Run full analysis, but save only the |
| // last 1gram, last 2gram, etc.: |
| BytesRef tokenBytes = termBytesAtt.BytesRef; |
| int maxEndOffset = -1; |
| bool sawRealToken = false; |
| while (ts.IncrementToken()) |
| { |
| termBytesAtt.FillBytesRef(); |
| sawRealToken |= tokenBytes.Length > 0; |
| // TODO: this is somewhat iffy; today, ShingleFilter |
| // sets posLen to the gram count; maybe we should make |
| // a separate dedicated att for this? |
| int gramCount = posLenAtt.PositionLength; |
| |
| Debug.Assert(gramCount <= grams); |
| |
| // Safety: make sure the recalculated count "agrees": |
| if (CountGrams(tokenBytes) != gramCount) |
| { |
| throw new System.ArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + CountGrams(tokenBytes)); |
| } |
| maxEndOffset = Math.Max(maxEndOffset, offsetAtt.EndOffset); |
| lastTokens[gramCount - 1] = BytesRef.DeepCopyOf(tokenBytes); |
| } |
| ts.End(); |
| |
| if (!sawRealToken) |
| { |
| throw new System.ArgumentException("no tokens produced by analyzer, or the only tokens were empty strings"); |
| } |
| |
| // Carefully fill last tokens with _ tokens; |
| // ShingleFilter appraently won't emit "only hole" |
| // tokens: |
| int endPosInc = posIncAtt.PositionIncrement; |
| |
| // Note this will also be true if input is the empty |
| // string (in which case we saw no tokens and |
| // maxEndOffset is still -1), which in fact works out OK |
| // because we fill the unigram with an empty BytesRef |
| // below: |
| bool lastTokenEnded = offsetAtt.EndOffset > maxEndOffset || endPosInc > 0; |
| //System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.EndOffset); |
| |
| if (lastTokenEnded) |
| { |
| //System.out.println(" lastTokenEnded"); |
| // If user hit space after the last token, then |
| // "upgrade" all tokens. This way "foo " will suggest |
| // all bigrams starting w/ foo, and not any unigrams |
| // starting with "foo": |
| for (int i = grams - 1; i > 0; i--) |
| { |
| BytesRef token = lastTokens[i - 1]; |
| if (token == null) |
| { |
| continue; |
| } |
| token.Grow(token.Length + 1); |
| token.Bytes[token.Length] = separator; |
| token.Length++; |
| lastTokens[i] = token; |
| } |
| lastTokens[0] = new BytesRef(); |
| } |
| |
| var arc = new FST.Arc<long?>(); |
| |
| var bytesReader = fst.GetBytesReader(); |
| |
| // Try highest order models first, and if they return |
| // results, return that; else, fallback: |
| double backoff = 1.0; |
| |
| List<LookupResult> results = new List<LookupResult>(num); |
| |
| // We only add a given suffix once, from the highest |
| // order model that saw it; for subsequent lower order |
| // models we skip it: |
| var seen = new JCG.HashSet<BytesRef>(); |
| |
| for (int gram = grams - 1; gram >= 0; gram--) |
| { |
| BytesRef token = lastTokens[gram]; |
| // Don't make unigram predictions from empty string: |
| if (token == null || (token.Length == 0 && key.Length > 0)) |
| { |
| // Input didn't have enough tokens: |
| //System.out.println(" gram=" + gram + ": skip: not enough input"); |
| continue; |
| } |
| |
| if (endPosInc > 0 && gram <= endPosInc) |
| { |
| // Skip hole-only predictions; in theory we |
| // shouldn't have to do this, but we'd need to fix |
| // ShingleFilter to produce only-hole tokens: |
| //System.out.println(" break: only holes now"); |
| break; |
| } |
| |
| //System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString()); |
| |
| // TODO: we could add fuzziness here |
| // match the prefix portion exactly |
| //Pair<Long,BytesRef> prefixOutput = null; |
| long? prefixOutput = null; |
| try |
| { |
| prefixOutput = LookupPrefix(fst, bytesReader, token, arc); |
| } |
| catch (IOException bogus) |
| { |
| throw new Exception(bogus.ToString(), bogus); |
| } |
| //System.out.println(" prefixOutput=" + prefixOutput); |
| |
| if (prefixOutput == null) |
| { |
| // This model never saw this prefix, e.g. the |
| // trigram model never saw context "purple mushroom" |
| backoff *= ALPHA; |
| continue; |
| } |
| |
| // TODO: we could do this division at build time, and |
| // bake it into the FST? |
| |
| // Denominator for computing scores from current |
| // model's predictions: |
| long contextCount = totTokens; |
| |
| BytesRef lastTokenFragment = null; |
| |
| for (int i = token.Length - 1; i >= 0; i--) |
| { |
| if (token.Bytes[token.Offset + i] == separator) |
| { |
| BytesRef context = new BytesRef(token.Bytes, token.Offset, i); |
| long? output = Lucene.Net.Util.Fst.Util.Get(fst, Lucene.Net.Util.Fst.Util.ToInt32sRef(context, new Int32sRef())); |
| Debug.Assert(output != null); |
| contextCount = DecodeWeight(output); |
| lastTokenFragment = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1); |
| break; |
| } |
| } |
| |
| BytesRef finalLastToken; |
| |
| if (lastTokenFragment == null) |
| { |
| finalLastToken = BytesRef.DeepCopyOf(token); |
| } |
| else |
| { |
| finalLastToken = BytesRef.DeepCopyOf(lastTokenFragment); |
| } |
| Debug.Assert(finalLastToken.Offset == 0); |
| |
| CharsRef spare = new CharsRef(); |
| |
| // complete top-N |
| Util.Fst.Util.TopResults<long?> completions = null; |
| try |
| { |
| |
| // Because we store multiple models in one FST |
| // (1gram, 2gram, 3gram), we must restrict the |
| // search so that it only considers the current |
| // model. For highest order model, this is not |
| // necessary since all completions in the FST |
| // must be from this model, but for lower order |
| // models we have to filter out the higher order |
| // ones: |
| |
| // Must do num+seen.size() for queue depth because we may |
| // reject up to seen.size() paths in acceptResult(): |
| Util.Fst.Util.TopNSearcher<long?> searcher = new TopNSearcherAnonymousInnerClassHelper(this, fst, num, num + seen.Count, weightComparer, seen, finalLastToken); |
| |
| // since this search is initialized with a single start node |
| // it is okay to start with an empty input path here |
| searcher.AddStartPaths(arc, prefixOutput, true, new Int32sRef()); |
| |
| completions = searcher.Search(); |
| Debug.Assert(completions.IsComplete); |
| } |
| catch (IOException bogus) |
| { |
| throw new Exception(bogus.ToString(), bogus); |
| } |
| |
| int prefixLength = token.Length; |
| |
| BytesRef suffix = new BytesRef(8); |
| //System.out.println(" " + completions.length + " completions"); |
| |
| foreach (Util.Fst.Util.Result<long?> completion in completions) |
| { |
| token.Length = prefixLength; |
| // append suffix |
| Util.Fst.Util.ToBytesRef(completion.Input, suffix); |
| token.Append(suffix); |
| |
| //System.out.println(" completion " + token.utf8ToString()); |
| |
| // Skip this path if a higher-order model already |
| // saw/predicted its last token: |
| BytesRef lastToken = token; |
| for (int i = token.Length - 1; i >= 0; i--) |
| { |
| if (token.Bytes[token.Offset + i] == separator) |
| { |
| Debug.Assert(token.Length - i - 1 > 0); |
| lastToken = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1); |
| break; |
| } |
| } |
| if (seen.Contains(lastToken)) |
| { |
| //System.out.println(" skip dup " + lastToken.utf8ToString()); |
| goto nextCompletionContinue; |
| } |
| seen.Add(BytesRef.DeepCopyOf(lastToken)); |
| spare.Grow(token.Length); |
| UnicodeUtil.UTF8toUTF16(token, spare); |
| LookupResult result = new LookupResult(spare.ToString(), |
| // LUCENENET NOTE: We need to calculate this as decimal because when using double it can sometimes |
| // return numbers that are greater than long.MaxValue, which results in a negative long number. |
| (long)(long.MaxValue * (decimal)backoff * ((decimal)DecodeWeight(completion.Output)) / contextCount)); |
| results.Add(result); |
| Debug.Assert(results.Count == seen.Count); |
| //System.out.println(" add result=" + result); |
| nextCompletionContinue:; |
| } |
| backoff *= ALPHA; |
| } |
| |
| results.Sort(new ComparerAnonymousInnerClassHelper(this)); |
| |
| if (results.Count > num) |
| { |
| results.SubList(num, results.Count).Clear(); |
| } |
| |
| return results; |
| } |
| finally |
| { |
| IOUtils.DisposeWhileHandlingException(ts); |
| } |
| } |
| |
| private class TopNSearcherAnonymousInnerClassHelper : Util.Fst.Util.TopNSearcher<long?> |
| { |
| private readonly FreeTextSuggester outerInstance; |
| |
| private IEnumerable<BytesRef> seen; |
| private BytesRef finalLastToken; |
| |
| public TopNSearcherAnonymousInnerClassHelper( |
| FreeTextSuggester outerInstance, |
| FST<long?> fst, |
| int num, |
| int size, |
| IComparer<long?> weightComparer, |
| IEnumerable<BytesRef> seen, |
| BytesRef finalLastToken) |
| : base(fst, num, size, weightComparer) |
| { |
| this.outerInstance = outerInstance; |
| this.seen = seen; |
| this.finalLastToken = finalLastToken; |
| scratchBytes = new BytesRef(); |
| } |
| |
| |
| private BytesRef scratchBytes; |
| |
| protected override void AddIfCompetitive(Util.Fst.Util.FSTPath<long?> path) |
| { |
| if (path.Arc.Label != outerInstance.separator) |
| { |
| //System.out.println(" keep path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc); |
| base.AddIfCompetitive(path); |
| } |
| else |
| { |
| //System.out.println(" prevent path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc); |
| } |
| } |
| |
| protected override bool AcceptResult(Int32sRef input, long? output) |
| { |
| Util.Fst.Util.ToBytesRef(input, scratchBytes); |
| finalLastToken.Grow(finalLastToken.Length + scratchBytes.Length); |
| int lenSav = finalLastToken.Length; |
| finalLastToken.Append(scratchBytes); |
| //System.out.println(" accept? input='" + scratchBytes.utf8ToString() + "'; lastToken='" + finalLastToken.utf8ToString() + "'; return " + (seen.contains(finalLastToken) == false)); |
| bool ret = seen.Contains(finalLastToken) == false; |
| |
| finalLastToken.Length = lenSav; |
| return ret; |
| } |
| } |
| |
| private sealed class ComparerAnonymousInnerClassHelper : IComparer<Lookup.LookupResult> |
| { |
| private readonly FreeTextSuggester outerInstance; |
| |
| public ComparerAnonymousInnerClassHelper(FreeTextSuggester outerInstance) |
| { |
| this.outerInstance = outerInstance; |
| } |
| |
| public int Compare(LookupResult a, LookupResult b) |
| { |
| if (a.Value > b.Value) |
| { |
| return -1; |
| } |
| else if (a.Value < b.Value) |
| { |
| return 1; |
| } |
| else |
| { |
| // Tie break by UTF16 sort order: |
| return a.Key.CompareToOrdinal(b.Key); |
| } |
| } |
| } |
| |
| /// <summary> |
| /// weight -> cost </summary> |
| private long EncodeWeight(long ngramCount) |
| { |
| return long.MaxValue - ngramCount; |
| } |
| |
| /// <summary> |
| /// cost -> weight </summary> |
| //private long decodeWeight(Pair<Long,BytesRef> output) { |
| private static long DecodeWeight(long? output) |
| { |
| Debug.Assert(output != null); |
| return (int)(long.MaxValue - output); // LUCENENET TODO: Perhaps a Java Lucene bug? Why cast to int when returning long? |
| } |
| |
| // NOTE: copied from WFSTCompletionLookup & tweaked |
| private long? LookupPrefix(FST<long?> fst, FST.BytesReader bytesReader, BytesRef scratch, FST.Arc<long?> arc) |
| { |
| |
| long? output = fst.Outputs.NoOutput; |
| |
| fst.GetFirstArc(arc); |
| |
| var bytes = scratch.Bytes; |
| var pos = scratch.Offset; |
| var end = pos + scratch.Length; |
| while (pos < end) |
| { |
| if (fst.FindTargetArc(bytes[pos++] & 0xff, arc, arc, bytesReader) == null) |
| { |
| return null; |
| } |
| else |
| { |
| output = fst.Outputs.Add(output, arc.Output); |
| } |
| } |
| |
| return output; |
| } |
| |
| internal static readonly IComparer<long?> weightComparer = new ComparerAnonymousInnerClassHelper2(); |
| |
| private sealed class ComparerAnonymousInnerClassHelper2 : IComparer<long?> |
| { |
| public ComparerAnonymousInnerClassHelper2() |
| { |
| } |
| |
| public int Compare(long? left, long? right) |
| { |
| return Comparer<long?>.Default.Compare(left, right); |
| } |
| } |
| |
| /// <summary> |
| /// Returns the weight associated with an input string, |
| /// or null if it does not exist. |
| /// </summary> |
| public virtual object Get(string key) |
| { |
| throw new NotSupportedException(); |
| } |
| } |
| } |