| using Lucene.Net.Index; |
| using Lucene.Net.Search; |
| using Lucene.Net.Util; |
| using System; |
| using System.IO; |
| using System.Runtime.CompilerServices; |
| |
| namespace Lucene.Net.Sandbox.Queries |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /// <summary> |
| /// Potentially slow fuzzy <see cref="TermsEnum"/> for enumerating all terms that are similar |
| /// to the specified filter term. |
| /// <para/> |
| /// If the minSimilarity or maxEdits is greater than the Automaton's |
| /// allowable range, this backs off to the classic (brute force) |
| /// fuzzy terms enum method by calling <see cref="FuzzyTermsEnum.GetAutomatonEnum(int, BytesRef)"/>. |
| /// <para/> |
| /// Term enumerations are always ordered by |
| /// <see cref="FuzzyTermsEnum.Comparer"/>. Each term in the enumeration is |
| /// greater than all that precede it. |
| /// </summary> |
| [Obsolete("Use FuzzyTermsEnum instead.")] |
| public class SlowFuzzyTermsEnum : FuzzyTermsEnum |
| { |
| public SlowFuzzyTermsEnum(Terms terms, AttributeSource atts, Term term, |
| float minSimilarity, int prefixLength) |
| : base(terms, atts, term, minSimilarity, prefixLength, false) |
| { |
| } |
| |
| protected override void MaxEditDistanceChanged(BytesRef lastTerm, int maxEdits, bool init) |
| { |
| TermsEnum newEnum = GetAutomatonEnum(maxEdits, lastTerm); |
| if (newEnum != null) |
| { |
| SetEnum(newEnum); |
| } |
| else if (init) |
| { |
| SetEnum(new LinearFuzzyTermsEnum(this)); |
| } |
| } |
| |
| /// <summary> |
| /// Implement fuzzy enumeration with linear brute force. |
| /// </summary> |
| private class LinearFuzzyTermsEnum : FilteredTermsEnum |
| { |
| private readonly SlowFuzzyTermsEnum outerInstance; |
| |
| /// <summary> |
| /// Allows us save time required to create a new array |
| /// every time similarity is called. |
| /// </summary> |
| private int[] d; |
| private int[] p; |
| |
| /// <summary>this is the text, minus the prefix</summary> |
| private readonly int[] text; |
| |
| private readonly IBoostAttribute boostAtt; |
| |
| /// <summary> |
| /// Constructor for enumeration of all terms from specified <c>reader</c> which share a prefix of |
| /// length <c>prefixLength</c> with <c>term</c> and which have a fuzzy similarity > |
| /// <c>minSimilarity</c>. |
| /// <para/> |
| /// After calling the constructor the enumeration is already pointing to the first |
| /// valid term if such a term exists. |
| /// </summary> |
| /// <exception cref="IOException">If there is a low-level I/O error.</exception> |
| public LinearFuzzyTermsEnum(SlowFuzzyTermsEnum outerInstance) |
| : base(outerInstance.m_terms.GetEnumerator()) |
| { |
| this.outerInstance = outerInstance; |
| this.boostAtt = Attributes.AddAttribute<IBoostAttribute>(); |
| |
| this.text = new int[outerInstance.m_termLength - outerInstance.m_realPrefixLength]; |
| System.Array.Copy(outerInstance.m_termText, outerInstance.m_realPrefixLength, text, 0, text.Length); |
| string prefix = UnicodeUtil.NewString(outerInstance.m_termText, 0, outerInstance.m_realPrefixLength); |
| prefixBytesRef = new BytesRef(prefix); |
| this.d = new int[this.text.Length + 1]; |
| this.p = new int[this.text.Length + 1]; |
| |
| |
| SetInitialSeekTerm(prefixBytesRef); |
| } |
| |
| private readonly BytesRef prefixBytesRef; |
| /// <summary>used for unicode conversion from BytesRef byte[] to int[]</summary> |
| private readonly Int32sRef utf32 = new Int32sRef(20); |
| |
| /// <summary> |
| /// <para> |
| /// The termCompare method in FuzzyTermEnum uses Levenshtein distance to |
| /// calculate the distance between the given term and the comparing term. |
| /// </para> |
| /// <para> |
| /// If the minSimilarity is >= 1.0, this uses the maxEdits as the comparison. |
| /// Otherwise, this method uses the following logic to calculate similarity. |
| /// <code> |
| /// similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen))); |
| /// </code> |
| /// where distance is the Levenshtein distance for the two words. |
| /// </para> |
| /// </summary> |
| #if NETFRAMEWORK |
| [MethodImpl(MethodImplOptions.NoOptimization)] // LUCENENET specific: comparing float equality fails in x86 on .NET Framework with optimizations enabled. Fixes TestTokenLengthOpt. |
| #endif |
| protected override sealed AcceptStatus Accept(BytesRef term) |
| { |
| if (StringHelper.StartsWith(term, prefixBytesRef)) |
| { |
| UnicodeUtil.UTF8toUTF32(term, utf32); |
| int distance = CalcDistance(utf32.Int32s, outerInstance.m_realPrefixLength, utf32.Length - outerInstance.m_realPrefixLength); |
| |
| //Integer.MIN_VALUE is the sentinel that Levenshtein stopped early |
| if (distance == int.MinValue) |
| { |
| return AcceptStatus.NO; |
| } |
| //no need to calc similarity, if raw is true and distance > maxEdits |
| if (outerInstance.m_raw == true && distance > outerInstance.m_maxEdits) |
| { |
| return AcceptStatus.NO; |
| } |
| float similarity = CalcSimilarity(distance, (utf32.Length - outerInstance.m_realPrefixLength), text.Length); |
| |
| //if raw is true, then distance must also be <= maxEdits by now |
| //given the previous if statement |
| if (outerInstance.m_raw == true || |
| (outerInstance.m_raw == false && similarity > outerInstance.MinSimilarity)) |
| { |
| boostAtt.Boost = (similarity - outerInstance.MinSimilarity) * outerInstance.m_scaleFactor; |
| return AcceptStatus.YES; |
| } |
| else |
| { |
| return AcceptStatus.NO; |
| } |
| } |
| else |
| { |
| return AcceptStatus.END; |
| } |
| } |
| |
| /****************************** |
| * Compute Levenshtein distance |
| ******************************/ |
| |
| /// <summary> |
| /// <para> |
| /// <see cref="CalcDistance(int[], int, int)"/> returns the Levenshtein distance between the query term |
| /// and the target term. |
| /// </para> |
| /// <para> |
| /// Embedded within this algorithm is a fail-fast Levenshtein distance |
| /// algorithm. The fail-fast algorithm differs from the standard Levenshtein |
| /// distance algorithm in that it is aborted if it is discovered that the |
| /// minimum distance between the words is greater than some threshold. |
| /// </para> |
| /// <para> |
| /// Levenshtein distance (also known as edit distance) is a measure of similarity |
| /// between two strings where the distance is measured as the number of character |
| /// deletions, insertions or substitutions required to transform one string to |
| /// the other string. |
| /// </para> |
| /// </summary> |
| /// <param name="target">the target word or phrase</param> |
| /// <param name="offset">the offset at which to start the comparison</param> |
| /// <param name="length">the length of what's left of the string to compare</param> |
| /// <returns> |
| /// the number of edits or <see cref="int.MaxValue"/> if the edit distance is |
| /// greater than maxDistance. |
| /// </returns> |
| private int CalcDistance(int[] target, int offset, int length) |
| { |
| int m = length; |
| int n = text.Length; |
| if (n == 0) |
| { |
| //we don't have anything to compare. That means if we just add |
| //the letters for m we get the new word |
| return m; |
| } |
| if (m == 0) |
| { |
| return n; |
| } |
| |
| int maxDistance = CalculateMaxDistance(m); |
| |
| if (maxDistance < Math.Abs(m - n)) |
| { |
| //just adding the characters of m to n or vice-versa results in |
| //too many edits |
| //for example "pre" length is 3 and "prefixes" length is 8. We can see that |
| //given this optimal circumstance, the edit distance cannot be less than 5. |
| //which is 8-3 or more precisely Math.abs(3-8). |
| //if our maximum edit distance is 4, then we can discard this word |
| //without looking at it. |
| return int.MinValue; |
| } |
| |
| // init matrix d |
| for (int i = 0; i <= n; ++i) |
| { |
| p[i] = i; |
| } |
| |
| // start computing edit distance |
| for (int j = 1; j <= m; ++j) |
| { // iterates through target |
| int bestPossibleEditDistance = m; |
| int t_j = target[offset + j - 1]; // jth character of t |
| d[0] = j; |
| |
| for (int i = 1; i <= n; ++i) |
| { // iterates through text |
| // minimum of cell to the left+1, to the top+1, diagonally left and up +(0|1) |
| if (t_j != text[i - 1]) |
| { |
| d[i] = Math.Min(Math.Min(d[i - 1], p[i]), p[i - 1]) + 1; |
| } |
| else |
| { |
| d[i] = Math.Min(Math.Min(d[i - 1] + 1, p[i] + 1), p[i - 1]); |
| } |
| bestPossibleEditDistance = Math.Min(bestPossibleEditDistance, d[i]); |
| } |
| |
| //After calculating row i, the best possible edit distance |
| //can be found by found by finding the smallest value in a given column. |
| //If the bestPossibleEditDistance is greater than the max distance, abort. |
| |
| if (j > maxDistance && bestPossibleEditDistance > maxDistance) |
| { //equal is okay, but not greater |
| //the closest the target can be to the text is just too far away. |
| //this target is leaving the party early. |
| return int.MinValue; |
| } |
| |
| // copy current distance counts to 'previous row' distance counts: swap p and d |
| int[] _d = p; |
| p = d; |
| d = _d; |
| } |
| |
| // our last action in the above loop was to switch d and p, so p now |
| // actually has the most recent cost counts |
| |
| return p[n]; |
| } |
| |
| private float CalcSimilarity(int edits, int m, int n) |
| { |
| // this will return less than 0.0 when the edit distance is |
| // greater than the number of characters in the shorter word. |
| // but this was the formula that was previously used in FuzzyTermEnum, |
| // so it has not been changed (even though minimumSimilarity must be |
| // greater than 0.0) |
| |
| return 1.0f - ((float)edits / (float)(outerInstance.m_realPrefixLength + Math.Min(n, m))); |
| } |
| |
| /// <summary> |
| /// The max Distance is the maximum Levenshtein distance for the text |
| /// compared to some other value that results in score that is |
| /// better than the minimum similarity. |
| /// </summary> |
| /// <param name="m">the length of the "other value"</param> |
| /// <returns>the maximum levenshtein distance that we care about</returns> |
| private int CalculateMaxDistance(int m) |
| { |
| return outerInstance.m_raw ? outerInstance.m_maxEdits : Math.Min(outerInstance.m_maxEdits, |
| (int)((1 - outerInstance.MinSimilarity) * (Math.Min(text.Length, m) + outerInstance.m_realPrefixLength))); |
| } |
| } |
| } |
| } |