| using J2N.Collections.Generic.Extensions; |
| using Lucene.Net.Analysis; |
| using Lucene.Net.Diagnostics; |
| using Lucene.Net.Store; |
| using Lucene.Net.Support; |
| using Lucene.Net.Support.IO; |
| using Lucene.Net.Util; |
| using Lucene.Net.Util.Automaton; |
| using Lucene.Net.Util.Fst; |
| using System; |
| using System.Collections.Generic; |
| using System.Diagnostics; |
| using System.IO; |
| using JCG = J2N.Collections.Generic; |
| |
| namespace Lucene.Net.Search.Suggest.Analyzing |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /// <summary> |
| /// Suggester that first analyzes the surface form, adds the |
| /// analyzed form to a weighted FST, and then does the same |
| /// thing at lookup time. This means lookup is based on the |
| /// analyzed form while suggestions are still the surface |
| /// form(s). |
| /// |
| /// <para> |
| /// This can result in powerful suggester functionality. For |
| /// example, if you use an analyzer removing stop words, |
| /// then the partial text "ghost chr..." could see the |
| /// suggestion "The Ghost of Christmas Past". Note that |
| /// position increments MUST NOT be preserved for this example |
| /// to work, so you should call the constructor with |
| /// <see cref="preservePositionIncrements"/> parameter set to |
| /// false |
| /// |
| /// </para> |
| /// <para> |
| /// If SynonymFilter is used to map wifi and wireless network to |
| /// hotspot then the partial text "wirele..." could suggest |
| /// "wifi router". Token normalization like stemmers, accent |
| /// removal, etc., would allow suggestions to ignore such |
| /// variations. |
| /// |
| /// </para> |
| /// <para> |
| /// When two matching suggestions have the same weight, they |
| /// are tie-broken by the analyzed form. If their analyzed |
| /// form is the same then the order is undefined. |
| /// |
| /// </para> |
| /// <para> |
| /// There are some limitations: |
| /// <list type="number"> |
| /// |
| /// <item><description> A lookup from a query like "net" in English won't |
| /// be any different than "net " (ie, user added a |
| /// trailing space) because analyzers don't reflect |
| /// when they've seen a token separator and when they |
| /// haven't.</description></item> |
| /// |
| /// <item><description> If you're using <see cref="Analysis.Core.StopFilter"/>, and the user will |
| /// type "fast apple", but so far all they've typed is |
| /// "fast a", again because the analyzer doesn't convey whether |
| /// it's seen a token separator after the "a", |
| /// <see cref="Analysis.Core.StopFilter"/> will remove that "a" causing |
| /// far more matches than you'd expect.</description></item> |
| /// |
| /// <item><description> Lookups with the empty string return no results |
| /// instead of all results.</description></item> |
| /// </list> |
| /// |
| /// @lucene.experimental |
| /// </para> |
| /// </summary> |
| public class AnalyzingSuggester : Lookup |
| { |
| |
        /// <summary>
        /// FST(Weight,Surface):
        /// input is the analyzed form, with a null byte between terms
        /// weights are encoded as costs: (<see cref="int.MaxValue"/> - weight)
        /// surface is the original, unanalyzed form.
        /// Remains <c>null</c> until <see cref="Build(IInputEnumerator)"/> or <see cref="Load(DataInput)"/> succeeds.
        /// </summary>
        private FST<PairOutputs<long?, BytesRef>.Pair> fst = null;

        /// <summary>
        /// Analyzer that will be used for analyzing suggestions at
        /// index time.
        /// </summary>
        private readonly Analyzer indexAnalyzer;

        /// <summary>
        /// Analyzer that will be used for analyzing suggestions at
        /// query time.
        /// </summary>
        private readonly Analyzer queryAnalyzer;

        /// <summary>
        /// True if exact match suggestions should always be returned first.
        /// </summary>
        private readonly bool exactFirst;

        /// <summary>
        /// True if separator between tokens should be preserved.
        /// </summary>
        private readonly bool preserveSep;

        /// <summary>
        /// Represents the separation between tokens, if
        /// <see cref="SuggesterOptions.PRESERVE_SEP"/> was specified.
        /// Note this is the same code point (U+001F) as <see cref="PAYLOAD_SEP"/>.
        /// </summary>
        private const int SEP_LABEL = '\u001F';

        /// <summary>
        /// Marks end of the analyzed input and start of dedup
        /// byte.
        /// </summary>
        private const int END_BYTE = 0x0;

        /// <summary>
        /// Maximum number of dup surface forms (different surface
        /// forms for the same analyzed form).
        /// </summary>
        private readonly int maxSurfaceFormsPerAnalyzedForm;

        /// <summary>
        /// Maximum graph paths to index for a single analyzed
        /// surface form. This only matters if your analyzer
        /// makes lots of alternate paths (e.g. contains
        /// SynonymFilter).
        /// </summary>
        private readonly int maxGraphExpansions;

        /// <summary>
        /// Highest number of analyzed paths we saw for any single
        /// input surface form. For analyzers that never create
        /// graphs this will always be 1.
        /// </summary>
        private int maxAnalyzedPathsForOneInput;

        // True when the input enumerator supplied payloads; affects the on-disk
        // record layout in Build() and how surface/payload are split at lookup time.
        private bool hasPayloads;

        // Separator byte between the surface form and its payload inside the FST
        // output (same code point U+001F as SEP_LABEL).
        private const int PAYLOAD_SEP = '\u001f';

        /// <summary>
        /// Whether position holes should appear in the automaton. </summary>
        private readonly bool preservePositionIncrements; // LUCENENET: marked readonly

        /// <summary>
        /// Number of entries the lookup was built with </summary>
        private long count = 0;
| |
        /// <summary>
        /// Convenience constructor that uses the same analyzer at index and query time.
        /// Calls <see cref="AnalyzingSuggester(Analyzer, Analyzer, SuggesterOptions, int, int, bool)">
        /// AnalyzingSuggester(analyzer, analyzer, SuggesterOptions.EXACT_FIRST | SuggesterOptions.PRESERVE_SEP, 256, -1, true)
        /// </see>
        /// </summary>
        public AnalyzingSuggester(Analyzer analyzer)
            : this(analyzer, analyzer, SuggesterOptions.EXACT_FIRST | SuggesterOptions.PRESERVE_SEP, 256, -1, true)
        {
        }
| |
        /// <summary>
        /// Convenience constructor with default options and limits.
        /// Calls <see cref="AnalyzingSuggester(Analyzer,Analyzer,SuggesterOptions,int,int,bool)">
        /// AnalyzingSuggester(indexAnalyzer, queryAnalyzer, SuggesterOptions.EXACT_FIRST | SuggesterOptions.PRESERVE_SEP, 256, -1, true)
        /// </see>
        /// </summary>
        public AnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer)
            : this(indexAnalyzer, queryAnalyzer, SuggesterOptions.EXACT_FIRST | SuggesterOptions.PRESERVE_SEP, 256, -1, true)
        {
        }
| |
| /// <summary> |
| /// Creates a new suggester. |
| /// </summary> |
| /// <param name="indexAnalyzer"> Analyzer that will be used for |
| /// analyzing suggestions while building the index. </param> |
| /// <param name="queryAnalyzer"> Analyzer that will be used for |
| /// analyzing query text during lookup </param> |
| /// <param name="options"> see <see cref="SuggesterOptions.EXACT_FIRST"/>, <see cref="SuggesterOptions.PRESERVE_SEP"/> </param> |
| /// <param name="maxSurfaceFormsPerAnalyzedForm"> Maximum number of |
| /// surface forms to keep for a single analyzed form. |
| /// When there are too many surface forms we discard the |
| /// lowest weighted ones. </param> |
| /// <param name="maxGraphExpansions"> Maximum number of graph paths |
| /// to expand from the analyzed form. Set this to -1 for |
| /// no limit. </param> |
| /// <param name="preservePositionIncrements"> Whether position holes |
| /// should appear in the automata </param> |
| public AnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, SuggesterOptions options, |
| int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions, bool preservePositionIncrements) |
| { |
| this.indexAnalyzer = indexAnalyzer; |
| this.queryAnalyzer = queryAnalyzer; |
| if ((options & ~(SuggesterOptions.EXACT_FIRST | SuggesterOptions.PRESERVE_SEP)) != 0) |
| { |
| throw new ArgumentException("options should only contain SuggesterOptions.EXACT_FIRST and SuggesterOptions.PRESERVE_SEP; got " + |
| options); |
| } |
| this.exactFirst = (options & SuggesterOptions.EXACT_FIRST) != 0; |
| this.preserveSep = (options & SuggesterOptions.PRESERVE_SEP) != 0; |
| |
| // NOTE: this is just an implementation limitation; if |
| // somehow this is a problem we could fix it by using |
| // more than one byte to disambiguate ... but 256 seems |
| // like it should be way more then enough. |
| if (maxSurfaceFormsPerAnalyzedForm <= 0 || maxSurfaceFormsPerAnalyzedForm > 256) |
| { |
| throw new ArgumentException("maxSurfaceFormsPerAnalyzedForm must be > 0 and < 256 (got: " + |
| maxSurfaceFormsPerAnalyzedForm + ")"); |
| } |
| this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm; |
| |
| if (maxGraphExpansions < 1 && maxGraphExpansions != -1) |
| { |
| throw new ArgumentException("maxGraphExpansions must -1 (no limit) or > 0 (got: " + |
| maxGraphExpansions + ")"); |
| } |
| this.maxGraphExpansions = maxGraphExpansions; |
| this.preservePositionIncrements = preservePositionIncrements; |
| } |
| |
| /// <summary> |
| /// Returns byte size of the underlying FST. </summary> |
| public override long GetSizeInBytes() |
| { |
| return fst == null ? 0 : fst.GetSizeInBytes(); |
| } |
| |
| private static void CopyDestTransitions(State from, State to, IList<Transition> transitions) // LUCENENET: CA1822: Mark members as static |
| { |
| if (to.Accept) |
| { |
| from.Accept = true; |
| } |
| foreach (Transition t in to.GetTransitions()) |
| { |
| transitions.Add(t); |
| } |
| } |
| |
        /// <summary>
        /// Rewrites the token-separator and hole labels produced by
        /// <see cref="TokenStreamToAutomaton"/>: POS_SEP transitions are either remapped to
        /// <see cref="SEP_LABEL"/> (when <see cref="preserveSep"/> is set) or replaced with
        /// epsilon (the destination state is folded into the source); HOLE transitions are
        /// always replaced with epsilon. Mutates <paramref name="a"/> in place.
        /// </summary>
        private void ReplaceSep(Automaton a)
        {

            State[] states = a.GetNumberedStates();

            // Go in reverse topo sort so we know we only have to
            // make one pass:
            for (int stateNumber = states.Length - 1; stateNumber >= 0; stateNumber--)
            {
                State state = states[stateNumber];
                IList<Transition> newTransitions = new List<Transition>();
                foreach (Transition t in state.GetTransitions())
                {
                    // TokenStreamToAutomaton only emits single-label transitions:
                    if (Debugging.AssertsEnabled) Debugging.Assert(t.Min == t.Max);
                    if (t.Min == TokenStreamToAutomaton.POS_SEP)
                    {
                        if (preserveSep)
                        {
                            // Remap to SEP_LABEL:
                            newTransitions.Add(new Transition(SEP_LABEL, t.Dest));
                        }
                        else
                        {
                            // Epsilon-remove the separator; merging the dest state's
                            // transitions may introduce nondeterminism:
                            CopyDestTransitions(state, t.Dest, newTransitions);
                            a.IsDeterministic = false;
                        }
                    }
                    else if (t.Min == TokenStreamToAutomaton.HOLE)
                    {

                        // Just remove the hole: there will then be two
                        // SEP tokens next to each other, which will only
                        // match another hole at search time. Note that
                        // it will also match an empty-string token ... if
                        // that's somehow a problem we can always map HOLE
                        // to a dedicated byte (and escape it in the
                        // input).
                        CopyDestTransitions(state, t.Dest, newTransitions);
                        a.IsDeterministic = false;
                    }
                    else
                    {
                        newTransitions.Add(t);
                    }
                }
                state.SetTransitions(newTransitions.ToArray());
            }
        }
| |
| /// <summary> |
| /// Used by subclass to change the lookup automaton, if |
| /// necessary. |
| /// </summary> |
| protected internal virtual Automaton ConvertAutomaton(Automaton a) |
| { |
| return a; |
| } |
| |
| internal virtual TokenStreamToAutomaton GetTokenStreamToAutomaton() |
| { |
| return new TokenStreamToAutomaton { PreservePositionIncrements = preservePositionIncrements }; |
| } |
| |
        /// <summary>
        /// Compares the on-disk records written by <see cref="Build(IInputEnumerator)"/>
        /// so <see cref="OfflineSorter"/> produces the order the FST builder requires:
        /// first by analyzed form, then by cost (ascending, i.e. highest weight first),
        /// and finally by surface form. Record layout:
        /// <c>short analyzedLength | analyzed bytes | int cost | [short surfaceLength] | surface [| payload]</c>
        /// (the bracketed parts only when payloads are present).
        /// NOTE: not thread-safe — it reuses scratch readers/refs across calls.
        /// </summary>
        private sealed class AnalyzingComparer : IComparer<BytesRef>
        {
            private readonly bool hasPayloads;

            public AnalyzingComparer(bool hasPayloads)
            {
                this.hasPayloads = hasPayloads;
            }

            // Reusable scratch state; scratchA/scratchB alias into a/b's byte arrays
            // rather than copying.
            private readonly ByteArrayDataInput readerA = new ByteArrayDataInput();
            private readonly ByteArrayDataInput readerB = new ByteArrayDataInput();
            private readonly BytesRef scratchA = new BytesRef();
            private readonly BytesRef scratchB = new BytesRef();

            public int Compare(BytesRef a, BytesRef b)
            {

                // First by analyzed form:
                readerA.Reset(a.Bytes, a.Offset, a.Length);
                // cast to ushort: the length was written as a signed short but is
                // logically unsigned (0..65533)
                scratchA.Length = (ushort)readerA.ReadInt16();
                scratchA.Bytes = a.Bytes;
                scratchA.Offset = readerA.Position;

                readerB.Reset(b.Bytes, b.Offset, b.Length);
                scratchB.Bytes = b.Bytes;
                scratchB.Length = (ushort)readerB.ReadInt16();
                scratchB.Offset = readerB.Position;

                int cmp = scratchA.CompareTo(scratchB);
                if (cmp != 0)
                {
                    return cmp;
                }
                readerA.SkipBytes(scratchA.Length);
                readerB.SkipBytes(scratchB.Length);

                // Next by cost: lower cost (= higher weight) sorts first.
                long aCost = readerA.ReadInt32();
                long bCost = readerB.ReadInt32();
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(DecodeWeight(aCost) >= 0);
                    Debugging.Assert(DecodeWeight(bCost) >= 0);
                }
                if (aCost < bCost)
                {
                    return -1;
                }
                else if (aCost > bCost)
                {
                    return 1;
                }

                // Finally by surface form:
                if (hasPayloads)
                {
                    // With payloads the surface length is stored explicitly; the
                    // payload trails it and is excluded from the comparison.
                    scratchA.Length = (ushort)readerA.ReadInt16();
                    scratchB.Length = (ushort)readerB.ReadInt16();
                    scratchA.Offset = readerA.Position;
                    scratchB.Offset = readerB.Position;
                }
                else
                {
                    // Without payloads the surface form runs to the end of the record.
                    scratchA.Offset = readerA.Position;
                    scratchB.Offset = readerB.Position;
                    scratchA.Length = a.Length - scratchA.Offset;
                    scratchB.Length = b.Length - scratchB.Offset;
                }

                return scratchA.CompareTo(scratchB);
            }
        }
| |
        /// <summary>
        /// Builds the suggester: each surface form is analyzed into one or more byte
        /// paths, the resulting records (analyzed form, cost, surface form and optional
        /// payload) are written to a temp file, sorted on disk with
        /// <see cref="AnalyzingComparer"/>, de-duplicated per analyzed form, and loaded
        /// into the FST.
        /// </summary>
        /// <exception cref="ArgumentException"> if the enumerator supplies contexts, if an
        /// analyzed or surface form is too long to encode, or if a surface form contains
        /// the reserved U+001F separator while payloads are enabled. </exception>
        public override void Build(IInputEnumerator enumerator)
        {
            if (enumerator.HasContexts)
            {
                throw new ArgumentException("this suggester doesn't support contexts");
            }
            string prefix = this.GetType().Name;
            var directory = OfflineSorter.DefaultTempDir();
            // Temp files for the unsorted and sorted record streams; both are
            // deleted in the finally block below.
            var tempInput = FileSupport.CreateTempFile(prefix, ".input", directory);
            var tempSorted = FileSupport.CreateTempFile(prefix, ".sorted", directory);

            hasPayloads = enumerator.HasPayloads;

            var writer = new OfflineSorter.ByteSequencesWriter(tempInput);
            OfflineSorter.ByteSequencesReader reader = null;
            var scratch = new BytesRef();

            TokenStreamToAutomaton ts2a = GetTokenStreamToAutomaton();

            bool success = false;
            count = 0;
            byte[] buffer = new byte[8];
            try
            {
                var output = new ByteArrayDataOutput(buffer);
                BytesRef surfaceForm;

                // Phase 1: analyze each input and write one record per analyzed path.
                // Record layout:
                //   short analyzedLength | analyzed | int cost
                //   | [short surfaceLength] | surface [| payload]
                // (bracketed parts only when hasPayloads).
                while (enumerator.MoveNext())
                {
                    surfaceForm = enumerator.Current;
                    ISet<Int32sRef> paths = ToFiniteStrings(surfaceForm, ts2a);

                    maxAnalyzedPathsForOneInput = Math.Max(maxAnalyzedPathsForOneInput, paths.Count);

                    foreach (Int32sRef path in paths)
                    {

                        Util.Fst.Util.ToBytesRef(path, scratch);

                        // length of the analyzed text (FST input)
                        if (scratch.Length > ushort.MaxValue - 2)
                        {
                            throw new ArgumentException("cannot handle analyzed forms > " + (ushort.MaxValue - 2) +
                                " in length (got " + scratch.Length + ")");
                        }
                        ushort analyzedLength = (ushort)scratch.Length;

                        // compute the required length:
                        // analyzed sequence + weight (4) + surface + analyzedLength (short)
                        int requiredLength = analyzedLength + 4 + surfaceForm.Length + 2;

                        BytesRef payload;

                        if (hasPayloads)
                        {
                            if (surfaceForm.Length > (ushort.MaxValue - 2))
                            {
                                throw new ArgumentException("cannot handle surface form > " + (ushort.MaxValue - 2) +
                                    " in length (got " + surfaceForm.Length + ")");
                            }
                            payload = enumerator.Payload;
                            // payload + surfaceLength (short)
                            requiredLength += payload.Length + 2;
                        }
                        else
                        {
                            payload = null;
                        }

                        buffer = ArrayUtil.Grow(buffer, requiredLength);

                        output.Reset(buffer);

                        output.WriteInt16((short)analyzedLength);

                        output.WriteBytes(scratch.Bytes, scratch.Offset, scratch.Length);

                        output.WriteInt32(EncodeWeight(enumerator.Weight));

                        if (hasPayloads)
                        {
                            // U+001F is used to join surface form and payload inside the
                            // FST output, so the surface form must not contain it:
                            for (int i = 0; i < surfaceForm.Length; i++)
                            {
                                if (surfaceForm.Bytes[i] == PAYLOAD_SEP)
                                {
                                    throw new ArgumentException(
                                        "surface form cannot contain unit separator character U+001F; this character is reserved");
                                }
                            }
                            output.WriteInt16((short)surfaceForm.Length);
                            output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                            output.WriteBytes(payload.Bytes, payload.Offset, payload.Length);
                        }
                        else
                        {
                            output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                        }

                        if (Debugging.AssertsEnabled) Debugging.Assert(output.Position == requiredLength, "{0} vs {1}", output.Position, requiredLength);

                        writer.Write(buffer, 0, output.Position);
                    }
                    count++;
                }
                writer.Dispose();

                // Sort all input/output pairs (required by FST.Builder):
                (new OfflineSorter(new AnalyzingComparer(hasPayloads))).Sort(tempInput, tempSorted);

                // Free disk space:
                tempInput.Delete();

                reader = new OfflineSorter.ByteSequencesReader(tempSorted);

                // Phase 2: stream the sorted records into the FST builder.
                var outputs = new PairOutputs<long?, BytesRef>(PositiveInt32Outputs.Singleton,
                    ByteSequenceOutputs.Singleton);
                var builder = new Builder<PairOutputs<long?, BytesRef>.Pair>(FST.INPUT_TYPE.BYTE1, outputs);

                // Build FST:
                BytesRef previousAnalyzed = null;
                BytesRef analyzed = new BytesRef();
                BytesRef surface = new BytesRef();
                Int32sRef scratchInts = new Int32sRef();
                var input = new ByteArrayDataInput();

                // Used to remove duplicate surface forms (but we
                // still index the hightest-weight one). We clear
                // this when we see a new analyzed form, so it cannot
                // grow unbounded (at most 256 entries):
                var seenSurfaceForms = new JCG.HashSet<BytesRef>();

                var dedup = 0;
                while (reader.Read(scratch))
                {
                    input.Reset(scratch.Bytes, scratch.Offset, scratch.Length);
                    ushort analyzedLength = (ushort)input.ReadInt16();
                    // +2 leaves room for the END_BYTE / dedup-byte suffix appended below:
                    analyzed.Grow(analyzedLength + 2);
                    input.ReadBytes(analyzed.Bytes, 0, analyzedLength);
                    analyzed.Length = analyzedLength;

                    long cost = input.ReadInt32();

                    // surface aliases into scratch's bytes; no copy here.
                    surface.Bytes = scratch.Bytes;
                    if (hasPayloads)
                    {
                        surface.Length = (ushort)input.ReadInt16();
                        surface.Offset = input.Position;
                    }
                    else
                    {
                        surface.Offset = input.Position;
                        surface.Length = scratch.Length - surface.Offset;
                    }

                    if (previousAnalyzed == null)
                    {
                        // Very first record:
                        previousAnalyzed = new BytesRef();
                        previousAnalyzed.CopyBytes(analyzed);
                        seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
                    }
                    else if (analyzed.Equals(previousAnalyzed))
                    {
                        // Same analyzed form as the previous record (records are
                        // sorted, so dups are adjacent):
                        dedup++;
                        if (dedup >= maxSurfaceFormsPerAnalyzedForm)
                        {
                            // More than maxSurfaceFormsPerAnalyzedForm
                            // dups: skip the rest:
                            continue;
                        }
                        if (seenSurfaceForms.Contains(surface))
                        {
                            // Exact duplicate surface form; the earlier (higher-weight,
                            // due to sort order) occurrence wins:
                            continue;
                        }
                        seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
                    }
                    else
                    {
                        // New analyzed form; reset dedup state:
                        dedup = 0;
                        previousAnalyzed.CopyBytes(analyzed);
                        seenSurfaceForms.Clear();
                        seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
                    }

                    // TODO: I think we can avoid the extra 2 bytes when
                    // there is no dup (dedup==0), but we'd have to fix
                    // the exactFirst logic ... which would be sort of
                    // hairy because we'd need to special case the two
                    // (dup/not dup)...

                    // NOTE: must be byte 0 so we sort before whatever
                    // is next
                    analyzed.Bytes[analyzed.Offset + analyzed.Length] = 0;
                    analyzed.Bytes[analyzed.Offset + analyzed.Length + 1] = (byte)dedup;
                    analyzed.Length += 2;

                    Util.Fst.Util.ToInt32sRef(analyzed, scratchInts);
                    //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
                    if (!hasPayloads)
                    {
                        builder.Add(scratchInts, outputs.NewPair(cost, BytesRef.DeepCopyOf(surface)));
                    }
                    else
                    {
                        // FST output is surface + PAYLOAD_SEP + payload:
                        int payloadOffset = input.Position + surface.Length;
                        int payloadLength = scratch.Length - payloadOffset;
                        BytesRef br = new BytesRef(surface.Length + 1 + payloadLength);
                        Array.Copy(surface.Bytes, surface.Offset, br.Bytes, 0, surface.Length);
                        br.Bytes[surface.Length] = PAYLOAD_SEP;
                        Array.Copy(scratch.Bytes, payloadOffset, br.Bytes, surface.Length + 1, payloadLength);
                        br.Length = br.Bytes.Length;
                        builder.Add(scratchInts, outputs.NewPair(cost, br));
                    }
                }
                fst = builder.Finish();

                //Util.dotToFile(fst, "/tmp/suggest.dot");

                success = true;
            }
            finally
            {
                if (success)
                {
                    IOUtils.Dispose(reader, writer);
                }
                else
                {
                    IOUtils.DisposeWhileHandlingException(reader, writer);
                }

                // Always clean up the temp files:
                tempInput.Delete();
                tempSorted.Delete();
            }
        }
| |
| public override bool Store(DataOutput output) |
| { |
| output.WriteVInt64(count); |
| if (fst == null) |
| { |
| return false; |
| } |
| |
| fst.Save(output); |
| output.WriteVInt32(maxAnalyzedPathsForOneInput); |
| output.WriteByte((byte)(hasPayloads ? 1 : 0)); |
| return true; |
| } |
| |
        /// <summary>
        /// Restores a suggester previously saved with <see cref="Store(DataOutput)"/>.
        /// Reads, in order: entry count, FST, max analyzed-paths statistic, payload flag.
        /// Always returns <c>true</c>.
        /// </summary>
        public override bool Load(DataInput input)
        {
            count = input.ReadVInt64();
            this.fst = new FST<PairOutputs<long?, BytesRef>.Pair>(input, new PairOutputs<long?, BytesRef>(PositiveInt32Outputs.Singleton, ByteSequenceOutputs.Singleton));
            maxAnalyzedPathsForOneInput = input.ReadVInt32();
            hasPayloads = input.ReadByte() == 1;
            return true;
        }
| |
        /// <summary>
        /// Converts an FST output pair into a <see cref="LookupResult"/>:
        /// <paramref name="output1"/> is the encoded cost (decoded back to a weight) and
        /// <paramref name="output2"/> holds the UTF-8 surface form, followed — when payloads
        /// are enabled — by <see cref="PAYLOAD_SEP"/> and the payload bytes.
        /// <paramref name="spare"/> is scratch space for the UTF-8 to UTF-16 conversion.
        /// </summary>
        private LookupResult GetLookupResult(long? output1, BytesRef output2, CharsRef spare)
        {
            LookupResult result;
            if (hasPayloads)
            {
                // Find the separator between surface form and payload:
                int sepIndex = -1;
                for (int i = 0; i < output2.Length; i++)
                {
                    if (output2.Bytes[output2.Offset + i] == PAYLOAD_SEP)
                    {
                        sepIndex = i;
                        break;
                    }
                }
                // Build() always writes a PAYLOAD_SEP when payloads are enabled:
                if (Debugging.AssertsEnabled) Debugging.Assert(sepIndex != -1);
                spare.Grow(sepIndex);

                int payloadLen = output2.Length - sepIndex - 1;
                UnicodeUtil.UTF8toUTF16(output2.Bytes, output2.Offset, sepIndex, spare);
                BytesRef payload = new BytesRef(payloadLen);
                Array.Copy(output2.Bytes, sepIndex + 1, payload.Bytes, 0, payloadLen);
                payload.Length = payloadLen;
                result = new LookupResult(spare.ToString(), DecodeWeight(output1.GetValueOrDefault()), payload);
            }
            else
            {
                // No payload: the whole output is the surface form.
                spare.Grow(output2.Length);
                UnicodeUtil.UTF8toUTF16(output2, spare);
                result = new LookupResult(spare.ToString(), DecodeWeight(output1.GetValueOrDefault()));
            }

            return result;
        }
| |
| private bool SameSurfaceForm(BytesRef key, BytesRef output2) |
| { |
| if (hasPayloads) |
| { |
| // output2 has at least PAYLOAD_SEP byte: |
| if (key.Length >= output2.Length) |
| { |
| return false; |
| } |
| for (int i = 0; i < key.Length; i++) |
| { |
| if (key.Bytes[key.Offset + i] != output2.Bytes[output2.Offset + i]) |
| { |
| return false; |
| } |
| } |
| return output2.Bytes[output2.Offset + key.Length] == PAYLOAD_SEP; |
| } |
| else |
| { |
| return key.BytesEquals(output2); |
| } |
| } |
| |
        /// <summary>
        /// Looks up suggestions for <paramref name="key"/>: the key is analyzed with the
        /// query analyzer into an automaton, that automaton is intersected with the FST's
        /// prefix paths, and the top-<paramref name="num"/> completions by weight are
        /// returned. When <see cref="exactFirst"/> is set, an exact surface-form match (if
        /// present) is searched for first and placed at the head of the results.
        /// </summary>
        /// <exception cref="ArgumentException"> if <paramref name="onlyMorePopular"/> is true,
        /// <paramref name="contexts"/> is non-null, or the key contains the reserved
        /// characters U+001E / U+001F. </exception>
        public override IList<LookupResult> DoLookup(string key, IEnumerable<BytesRef> contexts, bool onlyMorePopular, int num)
        {
            if (Debugging.AssertsEnabled) Debugging.Assert(num > 0);

            if (onlyMorePopular)
            {
                throw new ArgumentException("this suggester only works with onlyMorePopular=false");
            }
            if (contexts != null)
            {
                throw new ArgumentException("this suggester doesn't support contexts");
            }
            if (fst == null)
            {
                // Not built/loaded yet:
                return Collections.EmptyList<LookupResult>();
            }

            //System.out.println("lookup key=" + key + " num=" + num);
            // Reject keys containing the internal HOLE / separator code points:
            for (var i = 0; i < key.Length; i++)
            {
                if (key[i] == 0x1E)
                {
                    throw new ArgumentException(
                        "lookup key cannot contain HOLE character U+001E; this character is reserved");
                }
                if (key[i] == 0x1F)
                {
                    throw new ArgumentException(
                        "lookup key cannot contain unit separator character U+001F; this character is reserved");
                }
            }

            var utf8Key = new BytesRef(key);
            try
            {

                Automaton lookupAutomaton = ToLookupAutomaton(key);

                var spare = new CharsRef();

                //System.out.println(" now intersect exactFirst=" + exactFirst);

                // Intersect automaton w/ suggest wFST and get all
                // prefix starting nodes & their outputs:
                //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);

                //System.out.println(" prefixPaths: " + prefixPaths.size());

                FST.BytesReader bytesReader = fst.GetBytesReader();

                var scratchArc = new FST.Arc<PairOutputs<long?, BytesRef>.Pair>();

                IList<LookupResult> results = new List<LookupResult>();

                IList<FSTUtil.Path<PairOutputs<long?, BytesRef>.Pair>> prefixPaths =
                    FSTUtil.IntersectPrefixPaths(ConvertAutomaton(lookupAutomaton), fst);

                if (exactFirst)
                {

                    // Pass 1: count how many prefix nodes have an END_BYTE arc,
                    // i.e. complete ("exact") analyzed forms:
                    int count = 0;
                    foreach (FSTUtil.Path<PairOutputs<long?, BytesRef>.Pair> path in prefixPaths)
                    {
                        if (fst.FindTargetArc(END_BYTE, path.FstNode, scratchArc, bytesReader) != null)
                        {
                            // This node has END_BYTE arc leaving, meaning it's an
                            // "exact" match:
                            count++;
                        }
                    }

                    // Searcher just to find the single exact only
                    // match, if present:
                    Util.Fst.Util.TopNSearcher<PairOutputs<long?, BytesRef>.Pair> searcher;
                    searcher = new Util.Fst.Util.TopNSearcher<PairOutputs<long?, BytesRef>.Pair>(fst, count * maxSurfaceFormsPerAnalyzedForm,
                        count * maxSurfaceFormsPerAnalyzedForm, weightComparer);

                    // NOTE: we could almost get away with only using
                    // the first start node. The only catch is if
                    // maxSurfaceFormsPerAnalyzedForm had kicked in and
                    // pruned our exact match from one of these nodes
                    // ...:
                    foreach (var path in prefixPaths)
                    {
                        if (fst.FindTargetArc(END_BYTE, path.FstNode, scratchArc, bytesReader) != null)
                        {
                            // This node has END_BYTE arc leaving, meaning it's an
                            // "exact" match:
                            searcher.AddStartPaths(scratchArc, fst.Outputs.Add(path.Output, scratchArc.Output), false,
                                path.Input);
                        }
                    }

                    var completions = searcher.Search();
                    if (Debugging.AssertsEnabled) Debugging.Assert(completions.IsComplete);

                    // NOTE: this is rather inefficient: we enumerate
                    // every matching "exactly the same analyzed form"
                    // path, and then do linear scan to see if one of
                    // these exactly matches the input. It should be
                    // possible (though hairy) to do something similar
                    // to getByOutput, since the surface form is encoded
                    // into the FST output, so we more efficiently hone
                    // in on the exact surface-form match. Still, I
                    // suspect very little time is spent in this linear
                    // seach: it's bounded by how many prefix start
                    // nodes we have and the
                    // maxSurfaceFormsPerAnalyzedForm:
                    foreach (var completion in completions)
                    {
                        BytesRef output2 = completion.Output.Output2;
                        if (SameSurfaceForm(utf8Key, output2))
                        {
                            results.Add(GetLookupResult(completion.Output.Output1, output2, spare));
                            break;
                        }
                    }

                    if (results.Count == num)
                    {
                        // That was quick:
                        return results;
                    }
                }

                // Pass 2: general top-N search over all prefix paths; the anonymous
                // searcher dedups surface forms and (in exactFirst mode) skips the
                // exact match already added above:
                Util.Fst.Util.TopNSearcher<PairOutputs<long?, BytesRef>.Pair> searcher2;
                searcher2 = new TopNSearcherAnonymousClass(this, fst, num - results.Count,
                    num * maxAnalyzedPathsForOneInput, weightComparer, utf8Key, results);

                prefixPaths = GetFullPrefixPaths(prefixPaths, lookupAutomaton, fst);

                foreach (FSTUtil.Path<PairOutputs<long?, BytesRef>.Pair> path in prefixPaths)
                {
                    searcher2.AddStartPaths(path.FstNode, path.Output, true, path.Input);
                }

                var completions2 = searcher2.Search();
                if (Debugging.AssertsEnabled) Debugging.Assert(completions2.IsComplete);

                foreach (Util.Fst.Util.Result<PairOutputs<long?, BytesRef>.Pair> completion in completions2)
                {

                    LookupResult result = GetLookupResult(completion.Output.Output1, completion.Output.Output2, spare);

                    // TODO: for fuzzy case would be nice to return
                    // how many edits were required

                    //System.out.println(" result=" + result);
                    results.Add(result);

                    if (results.Count == num)
                    {
                        // In the exactFirst=true case the search may
                        // produce one extra path
                        break;
                    }
                }

                return results;
            }
            catch (IOException bogus)
            {
                // The FST is in memory, so IOException is not expected here; wrap
                // and rethrow rather than declaring a checked-style failure mode.
                throw new Exception(bogus.ToString(), bogus);
            }
        }
| |
        /// <summary>
        /// TopN searcher used by the second phase of <see cref="DoLookup"/>
        /// (a LUCENENET port of the Java anonymous subclass): it de-duplicates
        /// surface forms produced by graph analyzers and, in exactFirst mode,
        /// rejects the exact surface-form match that was already emitted by the
        /// first-phase search.
        /// </summary>
        private class TopNSearcherAnonymousClass : Util.Fst.Util.TopNSearcher<PairOutputs<long?, BytesRef>.Pair>
        {
            private readonly AnalyzingSuggester outerInstance;

            // UTF-8 form of the lookup key, used to detect the exact match.
            private readonly BytesRef utf8Key;
            // Results list shared with DoLookup (already holds the exact match, if any).
            private readonly IList<LookupResult> results;

            public TopNSearcherAnonymousClass(
                AnalyzingSuggester outerInstance,
                FST<PairOutputs<long?, BytesRef>.Pair> fst,
                int topN,
                int maxQueueDepth,
                IComparer<PairOutputs<long?, BytesRef>.Pair> comparer,
                BytesRef utf8Key,
                IList<LookupResult> results)
                : base(fst, topN, maxQueueDepth, comparer)
            {
                this.outerInstance = outerInstance;
                this.utf8Key = utf8Key;
                this.results = results;
                seen = new JCG.HashSet<BytesRef>();
            }

            // Surface forms already accepted during this search.
            private readonly ISet<BytesRef> seen;

            protected override bool AcceptResult(Int32sRef input, PairOutputs<long?, BytesRef>.Pair output)
            {

                // Dedup: when the input analyzes to a graph we
                // can get duplicate surface forms:
                if (seen.Contains(output.Output2))
                {
                    return false;
                }
                seen.Add(output.Output2);

                if (!outerInstance.exactFirst)
                {
                    return true;
                }
                else
                {
                    // In exactFirst mode, don't accept any paths
                    // matching the surface form since that will
                    // create duplicate results:
                    if (outerInstance.SameSurfaceForm(utf8Key, output.Output2))
                    {
                        // We found exact match, which means we should
                        // have already found it in the first search:
                        if (Debugging.AssertsEnabled) Debugging.Assert(results.Count == 1);
                        return false;
                    }
                    else
                    {
                        return true;
                    }
                }
            }
        }
| |
        /// <summary>
        /// Gets the number of entries this lookup was built with
        /// (or restored via <see cref="Load(DataInput)"/>). </summary>
        public override long Count => count;
| |
        /// <summary>
        /// Returns all prefix paths to initialize the search. This base
        /// implementation returns <paramref name="prefixPaths"/> unchanged; subclasses
        /// (e.g. fuzzy variants) may expand the set using
        /// <paramref name="lookupAutomaton"/> and <paramref name="fst"/>.
        /// </summary>
        protected internal virtual IList<FSTUtil.Path<PairOutputs<long?, BytesRef>.Pair>> GetFullPrefixPaths(
            IList<FSTUtil.Path<PairOutputs<long?, BytesRef>.Pair>> prefixPaths, Automaton lookupAutomaton,
            FST<PairOutputs<long?, BytesRef>.Pair> fst)
        {
            return prefixPaths;
        }
| |
        /// <summary>
        /// Analyzes <paramref name="surfaceForm"/> with the index analyzer and returns
        /// every distinct byte path through the resulting automaton (bounded by
        /// <see cref="maxGraphExpansions"/>). Graph-producing analyzers (synonyms, WDF)
        /// can yield more than one path per input.
        /// </summary>
        internal ISet<Int32sRef> ToFiniteStrings(BytesRef surfaceForm, TokenStreamToAutomaton ts2a)
        {
            // Analyze surface form:
            Automaton automaton = null;
            TokenStream ts = indexAnalyzer.GetTokenStream("", surfaceForm.Utf8ToString());
            try
            {

                // Create corresponding automaton: labels are bytes
                // from each analyzed token, with byte 0 used as
                // separator between tokens:
                automaton = ts2a.ToAutomaton(ts);
            }
            finally
            {
                IOUtils.DisposeWhileHandlingException(ts);
            }

            // Rewrite separator/hole labels (see ReplaceSep), then let subclasses
            // post-process the automaton:
            ReplaceSep(automaton);
            automaton = ConvertAutomaton(automaton);

            // Must be finite or GetFiniteStrings below would not terminate:
            if (Debugging.AssertsEnabled) Debugging.Assert(SpecialOperations.IsFinite(automaton));

            // Get all paths from the automaton (there can be
            // more than one path, eg if the analyzer created a
            // graph using SynFilter or WDF):

            // TODO: we could walk & add simultaneously, so we
            // don't have to alloc [possibly biggish]
            // intermediate HashSet in RAM:
            return SpecialOperations.GetFiniteStrings(automaton, maxGraphExpansions);
        }
| |
        /// <summary>
        /// Analyzes the lookup <paramref name="key"/> with the query analyzer and converts
        /// it into a determinized automaton suitable for intersecting with the FST.
        /// </summary>
        internal Automaton ToLookupAutomaton(string key)
        {
            // TODO: is there a Reader from a CharSequence?
            // Turn tokenstream into automaton:
            Automaton automaton = null;
            TokenStream ts = queryAnalyzer.GetTokenStream("", key);
            try
            {
                automaton = (GetTokenStreamToAutomaton()).ToAutomaton(ts);
            }
            finally
            {
                // Dispose the stream even if ToAutomaton threw:
                IOUtils.DisposeWhileHandlingException(ts);
            }

            // TODO: we could use the end offset to "guess"
            // whether the final token was a partial token; this
            // would only be a heuristic ... but maybe an OK one.
            // This way we could eg differentiate "net" from "net ",
            // which we can't today...

            // Rewrite separator/hole labels to match what was indexed:
            ReplaceSep(automaton);

            // TODO: we can optimize this somewhat by determinizing
            // while we convert
            BasicOperations.Determinize(automaton);
            return automaton;
        }
| |
        /// <summary>
        /// Returns the weight associated with an input string,
        /// or null if it does not exist.
        /// </summary>
        /// <exception cref="NotSupportedException"> always; this operation is not
        /// implemented by this suggester. </exception>
        public virtual object Get(string key)
        {
            throw new NotSupportedException();
        }
| |
| /// <summary> |
| /// cost -> weight </summary> |
| private static int DecodeWeight(long encoded) |
| { |
| return (int)(int.MaxValue - encoded); |
| } |
| |
| /// <summary> |
| /// weight -> cost </summary> |
| private static int EncodeWeight(long value) |
| { |
| if (value < 0 || value > int.MaxValue) |
| { |
| throw new NotSupportedException("cannot encode value: " + value); |
| } |
| return int.MaxValue - (int)value; |
| } |
| |
        /// <summary>
        /// Orders FST output pairs by cost (Output1) ascending. Since cost is encoded
        /// as int.MaxValue - weight (see <see cref="EncodeWeight(long)"/>), lower cost
        /// means higher weight, so the TopNSearcher yields highest-weight entries first.
        /// </summary>
        internal static readonly IComparer<PairOutputs<long?, BytesRef>.Pair> weightComparer =
            Comparer<PairOutputs<long?, BytesRef>.Pair>.Create((left, right) => Comparer<long?>.Default.Compare(left.Output1, right.Output1));
| } |
| } |