| using J2N.Collections.Generic.Extensions; |
| using Lucene.Net.Analysis; |
| using Lucene.Net.Diagnostics; |
| using Lucene.Net.Store; |
| using Lucene.Net.Support; |
| using Lucene.Net.Support.IO; |
| using Lucene.Net.Util; |
| using Lucene.Net.Util.Automaton; |
| using Lucene.Net.Util.Fst; |
| using System; |
| using System.Collections.Generic; |
| using System.Diagnostics; |
| using System.IO; |
| using JCG = J2N.Collections.Generic; |
| |
| namespace Lucene.Net.Search.Suggest.Analyzing |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /// <summary> |
| /// Suggester that first analyzes the surface form, adds the |
| /// analyzed form to a weighted FST, and then does the same |
| /// thing at lookup time. This means lookup is based on the |
| /// analyzed form while suggestions are still the surface |
| /// form(s). |
| /// |
| /// <para> |
| /// This can result in powerful suggester functionality. For |
| /// example, if you use an analyzer removing stop words, |
| /// then the partial text "ghost chr..." could see the |
| /// suggestion "The Ghost of Christmas Past". Note that |
| /// position increments MUST NOT be preserved for this example |
| /// to work, so you should call the constructor with |
| /// <see cref="preservePositionIncrements"/> parameter set to |
| /// false |
| /// |
| /// </para> |
| /// <para> |
| /// If SynonymFilter is used to map wifi and wireless network to |
| /// hotspot then the partial text "wirele..." could suggest |
| /// "wifi router". Token normalization like stemmers, accent |
| /// removal, etc., would allow suggestions to ignore such |
| /// variations. |
| /// |
| /// </para> |
| /// <para> |
| /// When two matching suggestions have the same weight, they |
| /// are tie-broken by the analyzed form. If their analyzed |
| /// form is the same then the order is undefined. |
| /// |
| /// </para> |
| /// <para> |
| /// There are some limitations: |
| /// <list type="number"> |
| /// |
| /// <item><description> A lookup from a query like "net" in English won't |
| /// be any different than "net " (ie, user added a |
| /// trailing space) because analyzers don't reflect |
| /// when they've seen a token separator and when they |
| /// haven't.</description></item> |
| /// |
| /// <item><description> If you're using <see cref="Analysis.Core.StopFilter"/>, and the user will |
| /// type "fast apple", but so far all they've typed is |
| /// "fast a", again because the analyzer doesn't convey whether |
| /// it's seen a token separator after the "a", |
| /// <see cref="Analysis.Core.StopFilter"/> will remove that "a" causing |
| /// far more matches than you'd expect.</description></item> |
| /// |
| /// <item><description> Lookups with the empty string return no results |
| /// instead of all results.</description></item> |
| /// </list> |
| /// |
| /// @lucene.experimental |
| /// </para> |
| /// </summary> |
| public class AnalyzingSuggester : Lookup |
| { |
| |
        /// <summary>
        /// FST(Weight,Surface):
        /// input is the analyzed form, with a null byte between terms
        /// weights are encoded as costs: (<see cref="int.MaxValue"/> - weight)
        /// surface is the original, unanalyzed form.
        /// Remains <c>null</c> until <see cref="Build(IInputEnumerator)"/> or <see cref="Load(DataInput)"/> succeeds.
        /// </summary>
        private FST<PairOutputs<long?, BytesRef>.Pair> fst = null;

        /// <summary>
        /// Analyzer that will be used for analyzing suggestions at
        /// index time.
        /// </summary>
        private readonly Analyzer indexAnalyzer;

        /// <summary>
        /// Analyzer that will be used for analyzing suggestions at
        /// query time.
        /// </summary>
        private readonly Analyzer queryAnalyzer;

        /// <summary>
        /// True if exact match suggestions should always be returned first.
        /// </summary>
        private readonly bool exactFirst;

        /// <summary>
        /// True if separator between tokens should be preserved.
        /// </summary>
        private readonly bool preserveSep;

        /// <summary>
        /// Represents the separation between tokens, if
        /// <see cref="SuggesterOptions.PRESERVE_SEP"/> was specified.
        /// Note this is the same code point (U+001F) as <see cref="PAYLOAD_SEP"/>.
        /// </summary>
        private const int SEP_LABEL = '\u001F';

        /// <summary>
        /// Marks end of the analyzed input and start of dedup
        /// byte.
        /// </summary>
        private const int END_BYTE = 0x0;

        /// <summary>
        /// Maximum number of dup surface forms (different surface
        /// forms for the same analyzed form).
        /// </summary>
        private readonly int maxSurfaceFormsPerAnalyzedForm;

        /// <summary>
        /// Maximum graph paths to index for a single analyzed
        /// surface form. This only matters if your analyzer
        /// makes lots of alternate paths (e.g. contains
        /// SynonymFilter).
        /// </summary>
        private readonly int maxGraphExpansions;

        /// <summary>
        /// Highest number of analyzed paths we saw for any single
        /// input surface form. For analyzers that never create
        /// graphs this will always be 1.
        /// </summary>
        private int maxAnalyzedPathsForOneInput;

        // True when the input enumerator supplied payloads; affects the on-disk
        // record layout in Build() and how surface/payload are split at lookup time.
        private bool hasPayloads;

        // Separator byte between the surface form and its payload inside the FST
        // output (same code point U+001F as SEP_LABEL).
        private const int PAYLOAD_SEP = '\u001f';

        /// <summary>
        /// Whether position holes should appear in the automaton. </summary>
        private readonly bool preservePositionIncrements; // LUCENENET: marked readonly

        /// <summary>
        /// Number of entries the lookup was built with </summary>
        private long count = 0;
| |
        /// <summary>
        /// Convenience constructor that uses the same analyzer at index and query time.
        /// Calls <see cref="AnalyzingSuggester(Analyzer, Analyzer, SuggesterOptions, int, int, bool)">
        /// AnalyzingSuggester(analyzer, analyzer, SuggesterOptions.EXACT_FIRST | SuggesterOptions.PRESERVE_SEP, 256, -1, true)
        /// </see>
        /// </summary>
        public AnalyzingSuggester(Analyzer analyzer)
            : this(analyzer, analyzer, SuggesterOptions.EXACT_FIRST | SuggesterOptions.PRESERVE_SEP, 256, -1, true)
        {
        }
| |
        /// <summary>
        /// Convenience constructor with default options and limits.
        /// Calls <see cref="AnalyzingSuggester(Analyzer,Analyzer,SuggesterOptions,int,int,bool)">
        /// AnalyzingSuggester(indexAnalyzer, queryAnalyzer, SuggesterOptions.EXACT_FIRST | SuggesterOptions.PRESERVE_SEP, 256, -1, true)
        /// </see>
        /// </summary>
        public AnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer)
            : this(indexAnalyzer, queryAnalyzer, SuggesterOptions.EXACT_FIRST | SuggesterOptions.PRESERVE_SEP, 256, -1, true)
        {
        }
| |
| /// <summary> |
| /// Creates a new suggester. |
| /// </summary> |
| /// <param name="indexAnalyzer"> Analyzer that will be used for |
| /// analyzing suggestions while building the index. </param> |
| /// <param name="queryAnalyzer"> Analyzer that will be used for |
| /// analyzing query text during lookup </param> |
| /// <param name="options"> see <see cref="SuggesterOptions.EXACT_FIRST"/>, <see cref="SuggesterOptions.PRESERVE_SEP"/> </param> |
| /// <param name="maxSurfaceFormsPerAnalyzedForm"> Maximum number of |
| /// surface forms to keep for a single analyzed form. |
| /// When there are too many surface forms we discard the |
| /// lowest weighted ones. </param> |
| /// <param name="maxGraphExpansions"> Maximum number of graph paths |
| /// to expand from the analyzed form. Set this to -1 for |
| /// no limit. </param> |
| /// <param name="preservePositionIncrements"> Whether position holes |
| /// should appear in the automata </param> |
| public AnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, SuggesterOptions options, |
| int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions, bool preservePositionIncrements) |
| { |
| this.indexAnalyzer = indexAnalyzer; |
| this.queryAnalyzer = queryAnalyzer; |
| if ((options & ~(SuggesterOptions.EXACT_FIRST | SuggesterOptions.PRESERVE_SEP)) != 0) |
| { |
| throw new ArgumentException("options should only contain SuggesterOptions.EXACT_FIRST and SuggesterOptions.PRESERVE_SEP; got " + |
| options); |
| } |
| this.exactFirst = (options & SuggesterOptions.EXACT_FIRST) != 0; |
| this.preserveSep = (options & SuggesterOptions.PRESERVE_SEP) != 0; |
| |
| // NOTE: this is just an implementation limitation; if |
| // somehow this is a problem we could fix it by using |
| // more than one byte to disambiguate ... but 256 seems |
| // like it should be way more then enough. |
| if (maxSurfaceFormsPerAnalyzedForm <= 0 || maxSurfaceFormsPerAnalyzedForm > 256) |
| { |
| throw new ArgumentException("maxSurfaceFormsPerAnalyzedForm must be > 0 and < 256 (got: " + |
| maxSurfaceFormsPerAnalyzedForm + ")"); |
| } |
| this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm; |
| |
| if (maxGraphExpansions < 1 && maxGraphExpansions != -1) |
| { |
| throw new ArgumentException("maxGraphExpansions must -1 (no limit) or > 0 (got: " + |
| maxGraphExpansions + ")"); |
| } |
| this.maxGraphExpansions = maxGraphExpansions; |
| this.preservePositionIncrements = preservePositionIncrements; |
| } |
| |
| /// <summary> |
| /// Returns byte size of the underlying FST. </summary> |
| public override long GetSizeInBytes() |
| { |
| return fst == null ? 0 : fst.GetSizeInBytes(); |
| } |
| |
| private static void CopyDestTransitions(State from, State to, IList<Transition> transitions) // LUCENENET: CA1822: Mark members as static |
| { |
| if (to.Accept) |
| { |
| from.Accept = true; |
| } |
| foreach (Transition t in to.GetTransitions()) |
| { |
| transitions.Add(t); |
| } |
| } |
| |
        /// <summary>
        /// Rewrites the token-separator and hole labels produced by
        /// <see cref="TokenStreamToAutomaton"/>: POS_SEP transitions are either remapped to
        /// <see cref="SEP_LABEL"/> (when <see cref="preserveSep"/> is set) or replaced with
        /// epsilon (the destination state is folded into the source); HOLE transitions are
        /// always replaced with epsilon. Mutates <paramref name="a"/> in place.
        /// </summary>
        private void ReplaceSep(Automaton a)
        {

            State[] states = a.GetNumberedStates();

            // Go in reverse topo sort so we know we only have to
            // make one pass:
            for (int stateNumber = states.Length - 1; stateNumber >= 0; stateNumber--)
            {
                State state = states[stateNumber];
                IList<Transition> newTransitions = new List<Transition>();
                foreach (Transition t in state.GetTransitions())
                {
                    // TokenStreamToAutomaton only emits single-label transitions:
                    if (Debugging.AssertsEnabled) Debugging.Assert(t.Min == t.Max);
                    if (t.Min == TokenStreamToAutomaton.POS_SEP)
                    {
                        if (preserveSep)
                        {
                            // Remap to SEP_LABEL:
                            newTransitions.Add(new Transition(SEP_LABEL, t.Dest));
                        }
                        else
                        {
                            // Epsilon-remove the separator; merging the dest state's
                            // transitions may introduce nondeterminism:
                            CopyDestTransitions(state, t.Dest, newTransitions);
                            a.IsDeterministic = false;
                        }
                    }
                    else if (t.Min == TokenStreamToAutomaton.HOLE)
                    {

                        // Just remove the hole: there will then be two
                        // SEP tokens next to each other, which will only
                        // match another hole at search time. Note that
                        // it will also match an empty-string token ... if
                        // that's somehow a problem we can always map HOLE
                        // to a dedicated byte (and escape it in the
                        // input).
                        CopyDestTransitions(state, t.Dest, newTransitions);
                        a.IsDeterministic = false;
                    }
                    else
                    {
                        newTransitions.Add(t);
                    }
                }
                state.SetTransitions(newTransitions.ToArray());
            }
        }
| |
| /// <summary> |
| /// Used by subclass to change the lookup automaton, if |
| /// necessary. |
| /// </summary> |
| protected internal virtual Automaton ConvertAutomaton(Automaton a) |
| { |
| return a; |
| } |
| |
| internal virtual TokenStreamToAutomaton GetTokenStreamToAutomaton() |
| { |
| return new TokenStreamToAutomaton { PreservePositionIncrements = preservePositionIncrements }; |
| } |
| |
        /// <summary>
        /// Compares the on-disk records written by <see cref="Build(IInputEnumerator)"/>
        /// so <see cref="OfflineSorter"/> produces the order the FST builder requires:
        /// first by analyzed form, then by cost (ascending, i.e. highest weight first),
        /// and finally by surface form. Record layout:
        /// <c>short analyzedLength | analyzed bytes | int cost | [short surfaceLength] | surface [| payload]</c>
        /// (the bracketed parts only when payloads are present).
        /// NOTE: not thread-safe — it reuses scratch readers/refs across calls.
        /// </summary>
        private sealed class AnalyzingComparer : IComparer<BytesRef>
        {
            private readonly bool hasPayloads;

            public AnalyzingComparer(bool hasPayloads)
            {
                this.hasPayloads = hasPayloads;
            }

            // Reusable scratch state; scratchA/scratchB alias into a/b's byte arrays
            // rather than copying.
            private readonly ByteArrayDataInput readerA = new ByteArrayDataInput();
            private readonly ByteArrayDataInput readerB = new ByteArrayDataInput();
            private readonly BytesRef scratchA = new BytesRef();
            private readonly BytesRef scratchB = new BytesRef();

            public int Compare(BytesRef a, BytesRef b)
            {

                // First by analyzed form:
                readerA.Reset(a.Bytes, a.Offset, a.Length);
                // cast to ushort: the length was written as a signed short but is
                // logically unsigned (0..65533)
                scratchA.Length = (ushort)readerA.ReadInt16();
                scratchA.Bytes = a.Bytes;
                scratchA.Offset = readerA.Position;

                readerB.Reset(b.Bytes, b.Offset, b.Length);
                scratchB.Bytes = b.Bytes;
                scratchB.Length = (ushort)readerB.ReadInt16();
                scratchB.Offset = readerB.Position;

                int cmp = scratchA.CompareTo(scratchB);
                if (cmp != 0)
                {
                    return cmp;
                }
                readerA.SkipBytes(scratchA.Length);
                readerB.SkipBytes(scratchB.Length);

                // Next by cost: lower cost (= higher weight) sorts first.
                long aCost = readerA.ReadInt32();
                long bCost = readerB.ReadInt32();
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(DecodeWeight(aCost) >= 0);
                    Debugging.Assert(DecodeWeight(bCost) >= 0);
                }
                if (aCost < bCost)
                {
                    return -1;
                }
                else if (aCost > bCost)
                {
                    return 1;
                }

                // Finally by surface form:
                if (hasPayloads)
                {
                    // With payloads the surface length is stored explicitly; the
                    // payload trails it and is excluded from the comparison.
                    scratchA.Length = (ushort)readerA.ReadInt16();
                    scratchB.Length = (ushort)readerB.ReadInt16();
                    scratchA.Offset = readerA.Position;
                    scratchB.Offset = readerB.Position;
                }
                else
                {
                    // Without payloads the surface form runs to the end of the record.
                    scratchA.Offset = readerA.Position;
                    scratchB.Offset = readerB.Position;
                    scratchA.Length = a.Length - scratchA.Offset;
                    scratchB.Length = b.Length - scratchB.Offset;
                }

                return scratchA.CompareTo(scratchB);
            }
        }
| |
        /// <summary>
        /// Builds the suggester: each surface form is analyzed into one or more byte
        /// paths, the resulting records (analyzed form, cost, surface form and optional
        /// payload) are written to a temp file, sorted on disk with
        /// <see cref="AnalyzingComparer"/>, de-duplicated per analyzed form, and loaded
        /// into the FST.
        /// </summary>
        /// <exception cref="ArgumentException"> if the enumerator supplies contexts, if an
        /// analyzed or surface form is too long to encode, or if a surface form contains
        /// the reserved U+001F separator while payloads are enabled. </exception>
        public override void Build(IInputEnumerator enumerator)
        {
            if (enumerator.HasContexts)
            {
                throw new ArgumentException("this suggester doesn't support contexts");
            }
            string prefix = this.GetType().Name;
            var directory = OfflineSorter.DefaultTempDir();
            // Temp files for the unsorted and sorted record streams; both are
            // deleted in the finally block below.
            var tempInput = FileSupport.CreateTempFile(prefix, ".input", directory);
            var tempSorted = FileSupport.CreateTempFile(prefix, ".sorted", directory);

            hasPayloads = enumerator.HasPayloads;

            var writer = new OfflineSorter.ByteSequencesWriter(tempInput);
            OfflineSorter.ByteSequencesReader reader = null;
            var scratch = new BytesRef();

            TokenStreamToAutomaton ts2a = GetTokenStreamToAutomaton();

            bool success = false;
            count = 0;
            byte[] buffer = new byte[8];
            try
            {
                var output = new ByteArrayDataOutput(buffer);
                BytesRef surfaceForm;

                // Phase 1: analyze each input and write one record per analyzed path.
                // Record layout:
                //   short analyzedLength | analyzed | int cost
                //   | [short surfaceLength] | surface [| payload]
                // (bracketed parts only when hasPayloads).
                while (enumerator.MoveNext())
                {
                    surfaceForm = enumerator.Current;
                    ISet<Int32sRef> paths = ToFiniteStrings(surfaceForm, ts2a);

                    maxAnalyzedPathsForOneInput = Math.Max(maxAnalyzedPathsForOneInput, paths.Count);

                    foreach (Int32sRef path in paths)
                    {

                        Util.Fst.Util.ToBytesRef(path, scratch);

                        // length of the analyzed text (FST input)
                        if (scratch.Length > ushort.MaxValue - 2)
                        {
                            throw new ArgumentException("cannot handle analyzed forms > " + (ushort.MaxValue - 2) +
                                " in length (got " + scratch.Length + ")");
                        }
                        ushort analyzedLength = (ushort)scratch.Length;

                        // compute the required length:
                        // analyzed sequence + weight (4) + surface + analyzedLength (short)
                        int requiredLength = analyzedLength + 4 + surfaceForm.Length + 2;

                        BytesRef payload;

                        if (hasPayloads)
                        {
                            if (surfaceForm.Length > (ushort.MaxValue - 2))
                            {
                                throw new ArgumentException("cannot handle surface form > " + (ushort.MaxValue - 2) +
                                    " in length (got " + surfaceForm.Length + ")");
                            }
                            payload = enumerator.Payload;
                            // payload + surfaceLength (short)
                            requiredLength += payload.Length + 2;
                        }
                        else
                        {
                            payload = null;
                        }

                        buffer = ArrayUtil.Grow(buffer, requiredLength);

                        output.Reset(buffer);

                        output.WriteInt16((short)analyzedLength);

                        output.WriteBytes(scratch.Bytes, scratch.Offset, scratch.Length);

                        output.WriteInt32(EncodeWeight(enumerator.Weight));

                        if (hasPayloads)
                        {
                            // U+001F is used to join surface form and payload inside the
                            // FST output, so the surface form must not contain it:
                            for (int i = 0; i < surfaceForm.Length; i++)
                            {
                                if (surfaceForm.Bytes[i] == PAYLOAD_SEP)
                                {
                                    throw new ArgumentException(
                                        "surface form cannot contain unit separator character U+001F; this character is reserved");
                                }
                            }
                            output.WriteInt16((short)surfaceForm.Length);
                            output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                            output.WriteBytes(payload.Bytes, payload.Offset, payload.Length);
                        }
                        else
                        {
                            output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                        }

                        if (Debugging.AssertsEnabled) Debugging.Assert(output.Position == requiredLength, "{0} vs {1}", output.Position, requiredLength);

                        writer.Write(buffer, 0, output.Position);
                    }
                    count++;
                }
                writer.Dispose();

                // Sort all input/output pairs (required by FST.Builder):
                (new OfflineSorter(new AnalyzingComparer(hasPayloads))).Sort(tempInput, tempSorted);

                // Free disk space:
                tempInput.Delete();

                reader = new OfflineSorter.ByteSequencesReader(tempSorted);

                // Phase 2: stream the sorted records into the FST builder.
                var outputs = new PairOutputs<long?, BytesRef>(PositiveInt32Outputs.Singleton,
                    ByteSequenceOutputs.Singleton);
                var builder = new Builder<PairOutputs<long?, BytesRef>.Pair>(FST.INPUT_TYPE.BYTE1, outputs);

                // Build FST:
                BytesRef previousAnalyzed = null;
                BytesRef analyzed = new BytesRef();
                BytesRef surface = new BytesRef();
                Int32sRef scratchInts = new Int32sRef();
                var input = new ByteArrayDataInput();

                // Used to remove duplicate surface forms (but we
                // still index the hightest-weight one). We clear
                // this when we see a new analyzed form, so it cannot
                // grow unbounded (at most 256 entries):
                var seenSurfaceForms = new JCG.HashSet<BytesRef>();

                var dedup = 0;
                while (reader.Read(scratch))
                {
                    input.Reset(scratch.Bytes, scratch.Offset, scratch.Length);
                    ushort analyzedLength = (ushort)input.ReadInt16();
                    // +2 leaves room for the END_BYTE / dedup-byte suffix appended below:
                    analyzed.Grow(analyzedLength + 2);
                    input.ReadBytes(analyzed.Bytes, 0, analyzedLength);
                    analyzed.Length = analyzedLength;

                    long cost = input.ReadInt32();

                    // surface aliases into scratch's bytes; no copy here.
                    surface.Bytes = scratch.Bytes;
                    if (hasPayloads)
                    {
                        surface.Length = (ushort)input.ReadInt16();
                        surface.Offset = input.Position;
                    }
                    else
                    {
                        surface.Offset = input.Position;
                        surface.Length = scratch.Length - surface.Offset;
                    }

                    if (previousAnalyzed == null)
                    {
                        // Very first record:
                        previousAnalyzed = new BytesRef();
                        previousAnalyzed.CopyBytes(analyzed);
                        seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
                    }
                    else if (analyzed.Equals(previousAnalyzed))
                    {
                        // Same analyzed form as the previous record (records are
                        // sorted, so dups are adjacent):
                        dedup++;
                        if (dedup >= maxSurfaceFormsPerAnalyzedForm)
                        {
                            // More than maxSurfaceFormsPerAnalyzedForm
                            // dups: skip the rest:
                            continue;
                        }
                        if (seenSurfaceForms.Contains(surface))
                        {
                            // Exact duplicate surface form; the earlier (higher-weight,
                            // due to sort order) occurrence wins:
                            continue;
                        }
                        seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
                    }
                    else
                    {
                        // New analyzed form; reset dedup state:
                        dedup = 0;
                        previousAnalyzed.CopyBytes(analyzed);
                        seenSurfaceForms.Clear();
                        seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
                    }

                    // TODO: I think we can avoid the extra 2 bytes when
                    // there is no dup (dedup==0), but we'd have to fix
                    // the exactFirst logic ... which would be sort of
                    // hairy because we'd need to special case the two
                    // (dup/not dup)...

                    // NOTE: must be byte 0 so we sort before whatever
                    // is next
                    analyzed.Bytes[analyzed.Offset + analyzed.Length] = 0;
                    analyzed.Bytes[analyzed.Offset + analyzed.Length + 1] = (byte)dedup;
                    analyzed.Length += 2;

                    Util.Fst.Util.ToInt32sRef(analyzed, scratchInts);
                    //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
                    if (!hasPayloads)
                    {
                        builder.Add(scratchInts, outputs.NewPair(cost, BytesRef.DeepCopyOf(surface)));
                    }
                    else
                    {
                        // FST output is surface + PAYLOAD_SEP + payload:
                        int payloadOffset = input.Position + surface.Length;
                        int payloadLength = scratch.Length - payloadOffset;
                        BytesRef br = new BytesRef(surface.Length + 1 + payloadLength);
                        Array.Copy(surface.Bytes, surface.Offset, br.Bytes, 0, surface.Length);
                        br.Bytes[surface.Length] = PAYLOAD_SEP;
                        Array.Copy(scratch.Bytes, payloadOffset, br.Bytes, surface.Length + 1, payloadLength);
                        br.Length = br.Bytes.Length;
                        builder.Add(scratchInts, outputs.NewPair(cost, br));
                    }
                }
                fst = builder.Finish();

                //Util.dotToFile(fst, "/tmp/suggest.dot");

                success = true;
            }
            finally
            {
                if (success)
                {
                    IOUtils.Dispose(reader, writer);
                }
                else
                {
                    IOUtils.DisposeWhileHandlingException(reader, writer);
                }

                // Always clean up the temp files:
                tempInput.Delete();
                tempSorted.Delete();
            }
        }
| |
| public override bool Store(DataOutput output) |
| { |
| output.WriteVInt64(count); |
| if (fst == null) |
| { |
| return false; |
| } |
| |
| fst.Save(output); |
| output.WriteVInt32(maxAnalyzedPathsForOneInput); |
| output.WriteByte((byte)(hasPayloads ? 1 : 0)); |
| return true; |
| } |
| |
        /// <summary>
        /// Restores a suggester previously saved with <see cref="Store(DataOutput)"/>.
        /// Reads, in order: entry count, FST, max analyzed-paths statistic, payload flag.
        /// Always returns <c>true</c>.
        /// </summary>
        public override bool Load(DataInput input)
        {
            count = input.ReadVInt64();
            this.fst = new FST<PairOutputs<long?, BytesRef>.Pair>(input, new PairOutputs<long?, BytesRef>(PositiveInt32Outputs.Singleton, ByteSequenceOutputs.Singleton));
            maxAnalyzedPathsForOneInput = input.ReadVInt32();
            hasPayloads = input.ReadByte() == 1;
            return true;
        }
| |
        /// <summary>
        /// Converts an FST output pair into a <see cref="LookupResult"/>:
        /// <paramref name="output1"/> is the encoded cost (decoded back to a weight) and
        /// <paramref name="output2"/> holds the UTF-8 surface form, followed — when payloads
        /// are enabled — by <see cref="PAYLOAD_SEP"/> and the payload bytes.
        /// <paramref name="spare"/> is scratch space for the UTF-8 to UTF-16 conversion.
        /// </summary>
        private LookupResult GetLookupResult(long? output1, BytesRef output2, CharsRef spare)
        {
            LookupResult result;
            if (hasPayloads)
            {
                // Find the separator between surface form and payload:
                int sepIndex = -1;
                for (int i = 0; i < output2.Length; i++)
                {
                    if (output2.Bytes[output2.Offset + i] == PAYLOAD_SEP)
                    {
                        sepIndex = i;
                        break;
                    }
                }
                // Build() always writes a PAYLOAD_SEP when payloads are enabled:
                if (Debugging.AssertsEnabled) Debugging.Assert(sepIndex != -1);
                spare.Grow(sepIndex);

                int payloadLen = output2.Length - sepIndex - 1;
                UnicodeUtil.UTF8toUTF16(output2.Bytes, output2.Offset, sepIndex, spare);
                BytesRef payload = new BytesRef(payloadLen);
                Array.Copy(output2.Bytes, sepIndex + 1, payload.Bytes, 0, payloadLen);
                payload.Length = payloadLen;
                result = new LookupResult(spare.ToString(), DecodeWeight(output1.GetValueOrDefault()), payload);
            }
            else
            {
                // No payload: the whole output is the surface form.
                spare.Grow(output2.Length);
                UnicodeUtil.UTF8toUTF16(output2, spare);
                result = new LookupResult(spare.ToString(), DecodeWeight(output1.GetValueOrDefault()));
            }

            return result;
        }
| |
| private bool SameSurfaceForm(BytesRef key, BytesRef output2) |
| { |
| if (hasPayloads) |
| { |
| // output2 has at least PAYLOAD_SEP byte: |
| if (key.Length >= output2.Length) |
| { |
| return false; |
| } |
| for (int i = 0; i < key.Length; i++) |
| { |
| if (key.Bytes[key.Offset + i] != output2.Bytes[output2.Offset + i]) |
| { |
| return false; |
| } |
| } |
| return output2.Bytes[output2.Offset + key.Length] == PAYLOAD_SEP; |
| } |
| else |
| { |
| return key.BytesEquals(output2); |
| } |
| } |
| |
        /// <summary>
        /// Looks up suggestions for <paramref name="key"/>: the key is analyzed with the
        /// query analyzer into an automaton, that automaton is intersected with the FST's
        /// prefix paths, and the top-<paramref name="num"/> completions by weight are
        /// returned. When <see cref="exactFirst"/> is set, an exact surface-form match (if
        /// present) is searched for first and placed at the head of the results.
        /// </summary>
        /// <exception cref="ArgumentException"> if <paramref name="onlyMorePopular"/> is true,
        /// <paramref name="contexts"/> is non-null, or the key contains the reserved
        /// characters U+001E / U+001F. </exception>
        public override IList<LookupResult> DoLookup(string key, IEnumerable<BytesRef> contexts, bool onlyMorePopular, int num)
        {
            if (Debugging.AssertsEnabled) Debugging.Assert(num > 0);

            if (onlyMorePopular)
            {
                throw new ArgumentException("this suggester only works with onlyMorePopular=false");
            }
            if (contexts != null)
            {
                throw new ArgumentException("this suggester doesn't support contexts");
            }
            if (fst == null)
            {
                // Not built/loaded yet:
                return Collections.EmptyList<LookupResult>();
            }

            //System.out.println("lookup key=" + key + " num=" + num);
            // Reject keys containing the internal HOLE / separator code points:
            for (var i = 0; i < key.Length; i++)
            {
                if (key[i] == 0x1E)
                {
                    throw new ArgumentException(
                        "lookup key cannot contain HOLE character U+001E; this character is reserved");
                }
                if (key[i] == 0x1F)
                {
                    throw new ArgumentException(
                        "lookup key cannot contain unit separator character U+001F; this character is reserved");
                }
            }

            var utf8Key = new BytesRef(key);
            try
            {

                Automaton lookupAutomaton = ToLookupAutomaton(key);

                var spare = new CharsRef();

                //System.out.println(" now intersect exactFirst=" + exactFirst);

                // Intersect automaton w/ suggest wFST and get all
                // prefix starting nodes & their outputs:
                //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);

                //System.out.println(" prefixPaths: " + prefixPaths.size());

                FST.BytesReader bytesReader = fst.GetBytesReader();

                var scratchArc = new FST.Arc<PairOutputs<long?, BytesRef>.Pair>();

                IList<LookupResult> results = new List<LookupResult>();

                IList<FSTUtil.Path<PairOutputs<long?, BytesRef>.Pair>> prefixPaths =
                    FSTUtil.IntersectPrefixPaths(ConvertAutomaton(lookupAutomaton), fst);

                if (exactFirst)
                {

                    // Pass 1: count how many prefix nodes have an END_BYTE arc,
                    // i.e. complete ("exact") analyzed forms:
                    int count = 0;
                    foreach (FSTUtil.Path<PairOutputs<long?, BytesRef>.Pair> path in prefixPaths)
                    {
                        if (fst.FindTargetArc(END_BYTE, path.FstNode, scratchArc, bytesReader) != null)
                        {
                            // This node has END_BYTE arc leaving, meaning it's an
                            // "exact" match:
                            count++;
                        }
                    }

                    // Searcher just to find the single exact only
                    // match, if present:
                    Util.Fst.Util.TopNSearcher<PairOutputs<long?, BytesRef>.Pair> searcher;
                    searcher = new Util.Fst.Util.TopNSearcher<PairOutputs<long?, BytesRef>.Pair>(fst, count * maxSurfaceFormsPerAnalyzedForm,
                        count * maxSurfaceFormsPerAnalyzedForm, weightComparer);

                    // NOTE: we could almost get away with only using
                    // the first start node. The only catch is if
                    // maxSurfaceFormsPerAnalyzedForm had kicked in and
                    // pruned our exact match from one of these nodes
                    // ...:
                    foreach (var path in prefixPaths)
                    {
                        if (fst.FindTargetArc(END_BYTE, path.FstNode, scratchArc, bytesReader) != null)
                        {
                            // This node has END_BYTE arc leaving, meaning it's an
                            // "exact" match:
                            searcher.AddStartPaths(scratchArc, fst.Outputs.Add(path.Output, scratchArc.Output), false,
                                path.Input);
                        }
                    }

                    var completions = searcher.Search();
                    if (Debugging.AssertsEnabled) Debugging.Assert(completions.IsComplete);

                    // NOTE: this is rather inefficient: we enumerate
                    // every matching "exactly the same analyzed form"
                    // path, and then do linear scan to see if one of
                    // these exactly matches the input. It should be
                    // possible (though hairy) to do something similar
                    // to getByOutput, since the surface form is encoded
                    // into the FST output, so we more efficiently hone
                    // in on the exact surface-form match. Still, I
                    // suspect very little time is spent in this linear
                    // seach: it's bounded by how many prefix start
                    // nodes we have and the
                    // maxSurfaceFormsPerAnalyzedForm:
                    foreach (var completion in completions)
                    {
                        BytesRef output2 = completion.Output.Output2;
                        if (SameSurfaceForm(utf8Key, output2))
                        {
                            results.Add(GetLookupResult(completion.Output.Output1, output2, spare));
                            break;
                        }
                    }

                    if (results.Count == num)
                    {
                        // That was quick:
                        return results;
                    }
                }

                // Pass 2: general top-N search over all prefix paths; the anonymous
                // searcher dedups surface forms and (in exactFirst mode) skips the
                // exact match already added above:
                Util.Fst.Util.TopNSearcher<PairOutputs<long?, BytesRef>.Pair> searcher2;
                searcher2 = new TopNSearcherAnonymousClass(this, fst, num - results.Count,
                    num * maxAnalyzedPathsForOneInput, weightComparer, utf8Key, results);

                prefixPaths = GetFullPrefixPaths(prefixPaths, lookupAutomaton, fst);

                foreach (FSTUtil.Path<PairOutputs<long?, BytesRef>.Pair> path in prefixPaths)
                {
                    searcher2.AddStartPaths(path.FstNode, path.Output, true, path.Input);
                }

                var completions2 = searcher2.Search();
                if (Debugging.AssertsEnabled) Debugging.Assert(completions2.IsComplete);

                foreach (Util.Fst.Util.Result<PairOutputs<long?, BytesRef>.Pair> completion in completions2)
                {

                    LookupResult result = GetLookupResult(completion.Output.Output1, completion.Output.Output2, spare);

                    // TODO: for fuzzy case would be nice to return
                    // how many edits were required

                    //System.out.println(" result=" + result);
                    results.Add(result);

                    if (results.Count == num)
                    {
                        // In the exactFirst=true case the search may
                        // produce one extra path
                        break;
                    }
                }

                return results;
            }
            catch (IOException bogus)
            {
                // The FST is in memory, so IOException is not expected here; wrap
                // and rethrow rather than declaring a checked-style failure mode.
                throw new Exception(bogus.ToString(), bogus);
            }
        }
| |
        /// <summary>
        /// TopN searcher used by the second phase of <see cref="DoLookup"/>
        /// (a LUCENENET port of the Java anonymous subclass): it de-duplicates
        /// surface forms produced by graph analyzers and, in exactFirst mode,
        /// rejects the exact surface-form match that was already emitted by the
        /// first-phase search.
        /// </summary>
        private class TopNSearcherAnonymousClass : Util.Fst.Util.TopNSearcher<PairOutputs<long?, BytesRef>.Pair>
        {
            private readonly AnalyzingSuggester outerInstance;

            // UTF-8 form of the lookup key, used to detect the exact match.
            private readonly BytesRef utf8Key;
            // Results list shared with DoLookup (already holds the exact match, if any).
            private readonly IList<LookupResult> results;

            public TopNSearcherAnonymousClass(
                AnalyzingSuggester outerInstance,
                FST<PairOutputs<long?, BytesRef>.Pair> fst,
                int topN,
                int maxQueueDepth,
                IComparer<PairOutputs<long?, BytesRef>.Pair> comparer,
                BytesRef utf8Key,
                IList<LookupResult> results)
                : base(fst, topN, maxQueueDepth, comparer)
            {
                this.outerInstance = outerInstance;
                this.utf8Key = utf8Key;
                this.results = results;
                seen = new JCG.HashSet<BytesRef>();
            }

            // Surface forms already accepted during this search.
            private readonly ISet<BytesRef> seen;

            protected override bool AcceptResult(Int32sRef input, PairOutputs<long?, BytesRef>.Pair output)
            {

                // Dedup: when the input analyzes to a graph we
                // can get duplicate surface forms:
                if (seen.Contains(output.Output2))
                {
                    return false;
                }
                seen.Add(output.Output2);

                if (!outerInstance.exactFirst)
                {
                    return true;
                }
                else
                {
                    // In exactFirst mode, don't accept any paths
                    // matching the surface form since that will
                    // create duplicate results:
                    if (outerInstance.SameSurfaceForm(utf8Key, output.Output2))
                    {
                        // We found exact match, which means we should
                        // have already found it in the first search:
                        if (Debugging.AssertsEnabled) Debugging.Assert(results.Count == 1);
                        return false;
                    }
                    else
                    {
                        return true;
                    }
                }
            }
        }
| |
        /// <summary>
        /// Gets the number of entries this lookup was built with
        /// (or restored via <see cref="Load(DataInput)"/>). </summary>
        public override long Count => count;
| |
        /// <summary>
        /// Returns all prefix paths to initialize the search. This base
        /// implementation returns <paramref name="prefixPaths"/> unchanged; subclasses
        /// (e.g. fuzzy variants) may expand the set using
        /// <paramref name="lookupAutomaton"/> and <paramref name="fst"/>.
        /// </summary>
        protected internal virtual IList<FSTUtil.Path<PairOutputs<long?, BytesRef>.Pair>> GetFullPrefixPaths(
            IList<FSTUtil.Path<PairOutputs<long?, BytesRef>.Pair>> prefixPaths, Automaton lookupAutomaton,
            FST<PairOutputs<long?, BytesRef>.Pair> fst)
        {
            return prefixPaths;
        }
| |
        /// <summary>
        /// Analyzes <paramref name="surfaceForm"/> with the index analyzer and returns
        /// every distinct byte path through the resulting automaton (bounded by
        /// <see cref="maxGraphExpansions"/>). Graph-producing analyzers (synonyms, WDF)
        /// can yield more than one path per input.
        /// </summary>
        internal ISet<Int32sRef> ToFiniteStrings(BytesRef surfaceForm, TokenStreamToAutomaton ts2a)
        {
            // Analyze surface form:
            Automaton automaton = null;
            TokenStream ts = indexAnalyzer.GetTokenStream("", surfaceForm.Utf8ToString());
            try
            {

                // Create corresponding automaton: labels are bytes
                // from each analyzed token, with byte 0 used as
                // separator between tokens:
                automaton = ts2a.ToAutomaton(ts);
            }
            finally
            {
                IOUtils.DisposeWhileHandlingException(ts);
            }

            // Rewrite separator/hole labels (see ReplaceSep), then let subclasses
            // post-process the automaton:
            ReplaceSep(automaton);
            automaton = ConvertAutomaton(automaton);

            // Must be finite or GetFiniteStrings below would not terminate:
            if (Debugging.AssertsEnabled) Debugging.Assert(SpecialOperations.IsFinite(automaton));

            // Get all paths from the automaton (there can be
            // more than one path, eg if the analyzer created a
            // graph using SynFilter or WDF):

            // TODO: we could walk & add simultaneously, so we
            // don't have to alloc [possibly biggish]
            // intermediate HashSet in RAM:
            return SpecialOperations.GetFiniteStrings(automaton, maxGraphExpansions);
        }
| |
        /// <summary>
        /// Analyzes the lookup <paramref name="key"/> with the query analyzer and converts
        /// it into a determinized automaton suitable for intersecting with the FST.
        /// </summary>
        internal Automaton ToLookupAutomaton(string key)
        {
            // TODO: is there a Reader from a CharSequence?
            // Turn tokenstream into automaton:
            Automaton automaton = null;
            TokenStream ts = queryAnalyzer.GetTokenStream("", key);
            try
            {
                automaton = (GetTokenStreamToAutomaton()).ToAutomaton(ts);
            }
            finally
            {
                // Dispose the stream even if ToAutomaton threw:
                IOUtils.DisposeWhileHandlingException(ts);
            }

            // TODO: we could use the end offset to "guess"
            // whether the final token was a partial token; this
            // would only be a heuristic ... but maybe an OK one.
            // This way we could eg differentiate "net" from "net ",
            // which we can't today...

            // Rewrite separator/hole labels to match what was indexed:
            ReplaceSep(automaton);

            // TODO: we can optimize this somewhat by determinizing
            // while we convert
            BasicOperations.Determinize(automaton);
            return automaton;
        }
| |
        /// <summary>
        /// Returns the weight associated with an input string,
        /// or null if it does not exist.
        /// </summary>
        /// <exception cref="NotSupportedException"> always; this operation is not
        /// implemented by this suggester. </exception>
        public virtual object Get(string key)
        {
            throw new NotSupportedException();
        }
| |
| /// <summary> |
| /// cost -> weight </summary> |
| private static int DecodeWeight(long encoded) |
| { |
| return (int)(int.MaxValue - encoded); |
| } |
| |
| /// <summary> |
| /// weight -> cost </summary> |
| private static int EncodeWeight(long value) |
| { |
| if (value < 0 || value > int.MaxValue) |
| { |
| throw new NotSupportedException("cannot encode value: " + value); |
| } |
| return int.MaxValue - (int)value; |
| } |
| |
        /// <summary>
        /// Orders FST output pairs by cost (Output1) ascending. Since cost is encoded
        /// as int.MaxValue - weight (see <see cref="EncodeWeight(long)"/>), lower cost
        /// means higher weight, so the TopNSearcher yields highest-weight entries first.
        /// </summary>
        internal static readonly IComparer<PairOutputs<long?, BytesRef>.Pair> weightComparer =
            Comparer<PairOutputs<long?, BytesRef>.Pair>.Create((left, right) => Comparer<long?>.Default.Compare(left.Output1, right.Output1));
| } |
| } |