| using Lucene.Net.Util; |
| using Lucene.Net.Util.Fst; |
| using System; |
| using System.Collections.Generic; |
| using System.Diagnostics; |
| using System.Globalization; |
| using System.IO; |
| using System.Linq; |
| |
| namespace Lucene.Net.Search.Suggest.Fst |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /// <summary> |
| /// Finite state automata based implementation of "autocomplete" functionality. |
| /// </summary> |
| /// <seealso cref="FSTCompletionBuilder"/> |
| /// @lucene.experimental |
| |
| // TODO: we could store exact weights as outputs from the FST (int4 encoded |
| // floats). This would provide exact outputs from this method and to some |
| // degree allowed post-sorting on a more fine-grained weight. |
| |
| // TODO: support for Analyzers (infix suggestions, synonyms?) |
| |
| public class FSTCompletion |
| { |
| /// <summary> |
| /// A single completion for a given key. |
| /// </summary> |
| public sealed class Completion : IComparable<Completion> |
| { |
| /// <summary> |
| /// UTF-8 bytes of the suggestion </summary> |
| public BytesRef Utf8 { get; private set; } |
| /// <summary> |
| /// source bucket (weight) of the suggestion </summary> |
| public int Bucket { get; private set; } |
| |
| internal Completion(BytesRef key, int bucket) |
| { |
| this.Utf8 = BytesRef.DeepCopyOf(key); |
| this.Bucket = bucket; |
| } |
| |
| public override string ToString() |
| { |
| return Utf8.Utf8ToString() + "/" + Bucket.ToString("0.0", CultureInfo.InvariantCulture); |
| } |
| |
| /// <seealso cref="BytesRef.CompareTo(object)"></seealso> |
| public int CompareTo(Completion o) |
| { |
| return this.Utf8.CompareTo(o.Utf8); |
| } |
| } |
| |
| /// <summary> |
| /// Default number of buckets. |
| /// </summary> |
| public const int DEFAULT_BUCKETS = 10; |
| |
| /// <summary> |
| /// An empty result. Keep this an <see cref="List{T}"/> to keep all the returned |
| /// lists of single type (monomorphic calls). |
| /// </summary> |
| private static readonly List<Completion> EMPTY_RESULT = new List<Completion>(); |
| |
| /// <summary> |
| /// Finite state automaton encoding all the lookup terms. See class notes for |
| /// details. |
| /// </summary> |
| private readonly FST<object> automaton; |
| |
| /// <summary> |
| /// An array of arcs leaving the root automaton state and encoding weights of |
| /// all completions in their sub-trees. |
| /// </summary> |
| private readonly FST.Arc<object>[] rootArcs; |
| |
| /// <seealso cref="FSTCompletion(FST{object}, bool, bool)" /> |
| private readonly bool exactFirst; |
| |
| /// <seealso cref="FSTCompletion(FST{object}, bool, bool)" /> |
| private readonly bool higherWeightsFirst; |
| |
| // LUCENENET SPECIFIC: We need some thread safety to execute atomic list operations |
| private readonly object syncLock = new object(); |
| |
| /// <summary> |
| /// Constructs an FSTCompletion, specifying higherWeightsFirst and exactFirst. </summary> |
| /// <param name="automaton"> |
| /// Automaton with completions. See <see cref="FSTCompletionBuilder"/>. </param> |
| /// <param name="higherWeightsFirst"> |
| /// Return most popular suggestions first. This is the default |
| /// behavior for this implementation. Setting it to <c>false</c> |
| /// has no effect (use constant term weights to sort alphabetically |
| /// only). </param> |
| /// <param name="exactFirst"> |
| /// Find and push an exact match to the first position of the result |
| /// list if found. </param> |
| public FSTCompletion(FST<object> automaton, bool higherWeightsFirst, bool exactFirst) |
| { |
| this.automaton = automaton; |
| if (automaton != null) |
| { |
| this.rootArcs = CacheRootArcs(automaton); |
| } |
| else |
| { |
| this.rootArcs = new FST.Arc<object>[0]; |
| } |
| this.higherWeightsFirst = higherWeightsFirst; |
| this.exactFirst = exactFirst; |
| } |
| |
| /// <summary> |
| /// Defaults to higher weights first and exact first. </summary> |
| /// <seealso cref="FSTCompletion(FST{object}, bool, bool)"/> |
| public FSTCompletion(FST<object> automaton) |
| : this(automaton, true, true) |
| { |
| } |
| |
| /// <summary> |
| /// Cache the root node's output arcs starting with completions with the |
| /// highest weights. |
| /// </summary> |
| private static FST.Arc<object>[] CacheRootArcs(FST<object> automaton) |
| { |
| try |
| { |
| IList<FST.Arc<object>> rootArcs = new List<FST.Arc<object>>(); |
| FST.Arc<object> arc = automaton.GetFirstArc(new FST.Arc<object>()); |
| FST.BytesReader fstReader = automaton.GetBytesReader(); |
| automaton.ReadFirstTargetArc(arc, arc, fstReader); |
| while (true) |
| { |
| rootArcs.Add((new FST.Arc<object>()).CopyFrom(arc)); |
| if (arc.IsLast) |
| { |
| break; |
| } |
| automaton.ReadNextArc(arc, fstReader); |
| } |
| |
| // we want highest weights first. |
| return rootArcs.Reverse().ToArray(); |
| } |
| catch (IOException e) |
| { |
| throw new Exception(e.ToString(), e); |
| } |
| } |
| |
| /// <summary> |
| /// Returns the first exact match by traversing root arcs, starting from the |
| /// arc <paramref name="rootArcIndex"/>. |
| /// </summary> |
| /// <param name="rootArcIndex"> |
| /// The first root arc index in <see cref="rootArcs"/> to consider when |
| /// matching. |
| /// </param> |
| /// <param name="utf8"> |
| /// The sequence of utf8 bytes to follow. |
| /// </param> |
| /// <returns> Returns the bucket number of the match or <code>-1</code> if no |
| /// match was found. </returns> |
| private int GetExactMatchStartingFromRootArc(int rootArcIndex, BytesRef utf8) |
| { |
| // Get the UTF-8 bytes representation of the input key. |
| try |
| { |
| FST.Arc<object> scratch = new FST.Arc<object>(); |
| FST.BytesReader fstReader = automaton.GetBytesReader(); |
| for (; rootArcIndex < rootArcs.Length; rootArcIndex++) |
| { |
| FST.Arc<object> rootArc = rootArcs[rootArcIndex]; |
| FST.Arc<object> arc = scratch.CopyFrom(rootArc); |
| |
| // Descend into the automaton using the key as prefix. |
| if (DescendWithPrefix(arc, utf8)) |
| { |
| automaton.ReadFirstTargetArc(arc, arc, fstReader); |
| if (arc.Label == Lucene.Net.Util.Fst.FST.END_LABEL) |
| { |
| // Normalize prefix-encoded weight. |
| return rootArc.Label; |
| } |
| } |
| } |
| } |
| catch (IOException e) |
| { |
| // Should never happen, but anyway. |
| throw new Exception(e.ToString(), e); |
| } |
| |
| // No match. |
| return -1; |
| } |
| |
| /// <summary> |
| /// Lookup suggestions to <paramref name="key"/>. |
| /// </summary> |
| /// <param name="key"> |
| /// The prefix to which suggestions should be sought. </param> |
| /// <param name="num"> |
| /// At most this number of suggestions will be returned. </param> |
| /// <returns> Returns the suggestions, sorted by their approximated weight first |
| /// (decreasing) and then alphabetically (UTF-8 codepoint order). </returns> |
| public virtual IList<Completion> DoLookup(string key, int num) |
| { |
| if (key.Length == 0 || automaton == null) |
| { |
| return EMPTY_RESULT; |
| } |
| |
| try |
| { |
| var keyUtf8 = new BytesRef(key); |
| if (!higherWeightsFirst && rootArcs.Length > 1) |
| { |
| // We could emit a warning here (?). An optimal strategy for |
| // alphabetically sorted |
| // suggestions would be to add them with a constant weight -- this saves |
| // unnecessary |
| // traversals and sorting. |
| return LookupSortedAlphabetically(keyUtf8, num); |
| } |
| else |
| { |
| return LookupSortedByWeight(keyUtf8, num, false); |
| } |
| } |
| catch (IOException e) |
| { |
| // Should never happen, but anyway. |
| throw new Exception(e.ToString(), e); |
| } |
| } |
| |
| /// <summary> |
| /// Lookup suggestions sorted alphabetically <c>if weights are not |
| /// constant</c>. This is a workaround: in general, use constant weights for |
| /// alphabetically sorted result. |
| /// </summary> |
| private List<Completion> LookupSortedAlphabetically(BytesRef key, int num) |
| { |
| // Greedily get num results from each weight branch. |
| var res = LookupSortedByWeight(key, num, true); |
| |
| // Sort and trim. |
| res.Sort(); |
| if (res.Count > num) |
| { |
| res = res.GetRange(0, num - 0); |
| } |
| return res; |
| } |
| |
| /// <summary> |
| /// Lookup suggestions sorted by weight (descending order). |
| /// </summary> |
| /// <param name="collectAll"> |
| /// If <c>true</c>, the routine terminates immediately when |
| /// <paramref name="num"/> suggestions have been collected. If |
| /// <c>false</c>, it will collect suggestions from all weight |
| /// arcs (needed for <see cref="LookupSortedAlphabetically"/>. </param> |
| private List<Completion> LookupSortedByWeight(BytesRef key, int num, bool collectAll) |
| { |
| // Don't overallocate the results buffers. This also serves the purpose of |
| // allowing the user of this class to request all matches using Integer.MAX_VALUE as |
| // the number of results. |
| List<Completion> res = new List<Completion>(Math.Min(10, num)); |
| |
| BytesRef output = BytesRef.DeepCopyOf(key); |
| for (int i = 0; i < rootArcs.Length; i++) |
| { |
| FST.Arc<object> rootArc = rootArcs[i]; |
| FST.Arc<object> arc = (new FST.Arc<object>()).CopyFrom(rootArc); |
| |
| // Descend into the automaton using the key as prefix. |
| if (DescendWithPrefix(arc, key)) |
| { |
| // A subgraph starting from the current node has the completions |
| // of the key prefix. The arc we're at is the last key's byte, |
| // so we will collect it too. |
| output.Length = key.Length - 1; |
| if (Collect(res, num, rootArc.Label, output, arc) && !collectAll) |
| { |
| // We have enough suggestions to return immediately. Keep on looking |
| // for an |
| // exact match, if requested. |
| if (exactFirst) |
| { |
| if (!CheckExistingAndReorder(res, key)) |
| { |
| int exactMatchBucket = GetExactMatchStartingFromRootArc(i, key); |
| if (exactMatchBucket != -1) |
| { |
| // Insert as the first result and truncate at num. |
| while (res.Count >= num) |
| { |
| res.RemoveAt(res.Count - 1); |
| } |
| res.Insert(0, new Completion(key, exactMatchBucket)); |
| } |
| } |
| } |
| break; |
| } |
| } |
| } |
| return res; |
| } |
| |
| /// <summary> |
| /// Checks if the list of |
| /// <see cref="Lookup.LookupResult"/>s already has a |
| /// <paramref name="key"/>. If so, reorders that |
| /// <see cref="Lookup.LookupResult"/> to the first |
| /// position. |
| /// </summary> |
| /// <returns> |
| /// Returns <c>true</c> if and only if <paramref name="list"/> contained |
| /// <paramref name="key"/>. |
| /// </returns> |
| private bool CheckExistingAndReorder(IList<Completion> list, BytesRef key) |
| { |
| // We assume list does not have duplicates (because of how the FST is created). |
| for (int i = list.Count; --i >= 0; ) |
| { |
| if (key.Equals(list[i].Utf8)) |
| { |
| // Key found. Unless already at i==0, remove it and push up front so |
| // that the ordering |
| // remains identical with the exception of the exact match. |
| lock (syncLock) |
| { |
| if (key.Equals(list[i].Utf8)) |
| { |
| var element = list.ElementAt(i); |
| list.Remove(element); |
| list.Insert(0, element); |
| } |
| } |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| /// <summary> |
| /// Descend along the path starting at <paramref name="arc"/> and going through bytes |
| /// in the argument. |
| /// </summary> |
| /// <param name="arc"> |
| /// The starting arc. This argument is modified in-place. </param> |
| /// <param name="utf8"> |
| /// The term to descend along. </param> |
| /// <returns> If <c>true</c>, <paramref name="arc"/> will be set to the arc |
| /// matching last byte of <c>term</c>. <c>false</c> is |
| /// returned if no such prefix exists. </returns> |
| private bool DescendWithPrefix(FST.Arc<object> arc, BytesRef utf8) |
| { |
| int max = utf8.Offset + utf8.Length; |
| // Cannot save as instance var since multiple threads |
| // can use FSTCompletion at once... |
| FST.BytesReader fstReader = automaton.GetBytesReader(); |
| for (int i = utf8.Offset; i < max; i++) |
| { |
| if (automaton.FindTargetArc(utf8.Bytes[i] & 0xff, arc, arc, fstReader) == null) |
| { |
| // No matching prefixes, return an empty result. |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| /// <summary> |
| /// Recursive collect lookup results from the automaton subgraph starting at |
| /// <paramref name="arc"/>. |
| /// </summary> |
| /// <param name="num"> |
| /// Maximum number of results needed (early termination). </param> |
| private bool Collect(IList<Completion> res, int num, int bucket, BytesRef output, FST.Arc<object> arc) |
| { |
| if (output.Length == output.Bytes.Length) |
| { |
| output.Bytes = ArrayUtil.Grow(output.Bytes); |
| } |
| Debug.Assert(output.Offset == 0); |
| output.Bytes[output.Length++] = (byte) arc.Label; |
| FST.BytesReader fstReader = automaton.GetBytesReader(); |
| automaton.ReadFirstTargetArc(arc, arc, fstReader); |
| while (true) |
| { |
| if (arc.Label == Lucene.Net.Util.Fst.FST.END_LABEL) |
| { |
| res.Add(new Completion(output, bucket)); |
| if (res.Count >= num) |
| { |
| return true; |
| } |
| } |
| else |
| { |
| int save = output.Length; |
| if (Collect(res, num, bucket, output, (new FST.Arc<object>()).CopyFrom(arc))) |
| { |
| return true; |
| } |
| output.Length = save; |
| } |
| |
| if (arc.IsLast) |
| { |
| break; |
| } |
| automaton.ReadNextArc(arc, fstReader); |
| } |
| return false; |
| } |
| |
| /// <summary> |
| /// Returns the bucket count (discretization thresholds). |
| /// </summary> |
| public virtual int BucketCount |
| { |
| get |
| { |
| return rootArcs.Length; |
| } |
| } |
| |
| /// <summary> |
| /// Returns the bucket assigned to a given key (if found) or <c>-1</c> if |
| /// no exact match exists. |
| /// </summary> |
| public virtual int GetBucket(string key) |
| { |
| return GetExactMatchStartingFromRootArc(0, new BytesRef(key)); |
| } |
| |
| /// <summary> |
| /// Returns the internal automaton. |
| /// </summary> |
| public virtual FST<object> FST |
| { |
| get |
| { |
| return automaton; |
| } |
| } |
| } |
| } |