blob: 12a16a0a924122c0f784c4c33c9e986eb97106d9 [file] [log] [blame]
using Lucene.Net.Index;
using Lucene.Net.Search;
using Lucene.Net.Util;
using System;
using System.IO;
using System.Runtime.CompilerServices;
namespace Lucene.Net.Sandbox.Queries
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Potentially slow fuzzy <see cref="TermsEnum"/> for enumerating all terms that are similar
/// to the specified filter term.
/// <para/>
/// If the minSimilarity or maxEdits is greater than the Automaton's
/// allowable range, this backs off to the classic (brute force)
/// fuzzy terms enum method by calling <see cref="FuzzyTermsEnum.GetAutomatonEnum(int, BytesRef)"/>.
/// <para/>
/// Term enumerations are always ordered by
/// <see cref="FuzzyTermsEnum.Comparer"/>. Each term in the enumeration is
/// greater than all that precede it.
/// </summary>
[Obsolete("Use FuzzyTermsEnum instead.")]
public class SlowFuzzyTermsEnum : FuzzyTermsEnum
{
public SlowFuzzyTermsEnum(Terms terms, AttributeSource atts, Term term,
float minSimilarity, int prefixLength)
: base(terms, atts, term, minSimilarity, prefixLength, false)
{
}
protected override void MaxEditDistanceChanged(BytesRef lastTerm, int maxEdits, bool init)
{
TermsEnum newEnum = GetAutomatonEnum(maxEdits, lastTerm);
if (newEnum != null)
{
SetEnum(newEnum);
}
else if (init)
{
SetEnum(new LinearFuzzyTermsEnum(this));
}
}
/// <summary>
/// Implement fuzzy enumeration with linear brute force.
/// </summary>
private class LinearFuzzyTermsEnum : FilteredTermsEnum
{
private readonly SlowFuzzyTermsEnum outerInstance;
/// <summary>
/// Allows us save time required to create a new array
/// every time similarity is called.
/// </summary>
private int[] d;
private int[] p;
/// <summary>this is the text, minus the prefix</summary>
private readonly int[] text;
private readonly IBoostAttribute boostAtt;
/// <summary>
/// Constructor for enumeration of all terms from specified <c>reader</c> which share a prefix of
/// length <c>prefixLength</c> with <c>term</c> and which have a fuzzy similarity &gt;
/// <c>minSimilarity</c>.
/// <para/>
/// After calling the constructor the enumeration is already pointing to the first
/// valid term if such a term exists.
/// </summary>
/// <exception cref="IOException">If there is a low-level I/O error.</exception>
public LinearFuzzyTermsEnum(SlowFuzzyTermsEnum outerInstance)
: base(outerInstance.m_terms.GetEnumerator())
{
this.outerInstance = outerInstance;
this.boostAtt = Attributes.AddAttribute<IBoostAttribute>();
this.text = new int[outerInstance.m_termLength - outerInstance.m_realPrefixLength];
System.Array.Copy(outerInstance.m_termText, outerInstance.m_realPrefixLength, text, 0, text.Length);
string prefix = UnicodeUtil.NewString(outerInstance.m_termText, 0, outerInstance.m_realPrefixLength);
prefixBytesRef = new BytesRef(prefix);
this.d = new int[this.text.Length + 1];
this.p = new int[this.text.Length + 1];
SetInitialSeekTerm(prefixBytesRef);
}
private readonly BytesRef prefixBytesRef;
/// <summary>used for unicode conversion from BytesRef byte[] to int[]</summary>
private readonly Int32sRef utf32 = new Int32sRef(20);
/// <summary>
/// <para>
/// The termCompare method in FuzzyTermEnum uses Levenshtein distance to
/// calculate the distance between the given term and the comparing term.
/// </para>
/// <para>
/// If the minSimilarity is >= 1.0, this uses the maxEdits as the comparison.
/// Otherwise, this method uses the following logic to calculate similarity.
/// <code>
/// similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen)));
/// </code>
/// where distance is the Levenshtein distance for the two words.
/// </para>
/// </summary>
#if NETFRAMEWORK
[MethodImpl(MethodImplOptions.NoOptimization)] // LUCENENET specific: comparing float equality fails in x86 on .NET Framework with optimizations enabled. Fixes TestTokenLengthOpt.
#endif
protected override sealed AcceptStatus Accept(BytesRef term)
{
if (StringHelper.StartsWith(term, prefixBytesRef))
{
UnicodeUtil.UTF8toUTF32(term, utf32);
int distance = CalcDistance(utf32.Int32s, outerInstance.m_realPrefixLength, utf32.Length - outerInstance.m_realPrefixLength);
//Integer.MIN_VALUE is the sentinel that Levenshtein stopped early
if (distance == int.MinValue)
{
return AcceptStatus.NO;
}
//no need to calc similarity, if raw is true and distance > maxEdits
if (outerInstance.m_raw == true && distance > outerInstance.m_maxEdits)
{
return AcceptStatus.NO;
}
float similarity = CalcSimilarity(distance, (utf32.Length - outerInstance.m_realPrefixLength), text.Length);
//if raw is true, then distance must also be <= maxEdits by now
//given the previous if statement
if (outerInstance.m_raw == true ||
(outerInstance.m_raw == false && similarity > outerInstance.MinSimilarity))
{
boostAtt.Boost = (similarity - outerInstance.MinSimilarity) * outerInstance.m_scaleFactor;
return AcceptStatus.YES;
}
else
{
return AcceptStatus.NO;
}
}
else
{
return AcceptStatus.END;
}
}
/******************************
* Compute Levenshtein distance
******************************/
/// <summary>
/// <para>
/// <see cref="CalcDistance(int[], int, int)"/> returns the Levenshtein distance between the query term
/// and the target term.
/// </para>
/// <para>
/// Embedded within this algorithm is a fail-fast Levenshtein distance
/// algorithm. The fail-fast algorithm differs from the standard Levenshtein
/// distance algorithm in that it is aborted if it is discovered that the
/// minimum distance between the words is greater than some threshold.
/// </para>
/// <para>
/// Levenshtein distance (also known as edit distance) is a measure of similarity
/// between two strings where the distance is measured as the number of character
/// deletions, insertions or substitutions required to transform one string to
/// the other string.
/// </para>
/// </summary>
/// <param name="target">the target word or phrase</param>
/// <param name="offset">the offset at which to start the comparison</param>
/// <param name="length">the length of what's left of the string to compare</param>
/// <returns>
/// the number of edits or <see cref="int.MaxValue"/> if the edit distance is
/// greater than maxDistance.
/// </returns>
private int CalcDistance(int[] target, int offset, int length)
{
int m = length;
int n = text.Length;
if (n == 0)
{
//we don't have anything to compare. That means if we just add
//the letters for m we get the new word
return m;
}
if (m == 0)
{
return n;
}
int maxDistance = CalculateMaxDistance(m);
if (maxDistance < Math.Abs(m - n))
{
//just adding the characters of m to n or vice-versa results in
//too many edits
//for example "pre" length is 3 and "prefixes" length is 8. We can see that
//given this optimal circumstance, the edit distance cannot be less than 5.
//which is 8-3 or more precisely Math.abs(3-8).
//if our maximum edit distance is 4, then we can discard this word
//without looking at it.
return int.MinValue;
}
// init matrix d
for (int i = 0; i <= n; ++i)
{
p[i] = i;
}
// start computing edit distance
for (int j = 1; j <= m; ++j)
{ // iterates through target
int bestPossibleEditDistance = m;
int t_j = target[offset + j - 1]; // jth character of t
d[0] = j;
for (int i = 1; i <= n; ++i)
{ // iterates through text
// minimum of cell to the left+1, to the top+1, diagonally left and up +(0|1)
if (t_j != text[i - 1])
{
d[i] = Math.Min(Math.Min(d[i - 1], p[i]), p[i - 1]) + 1;
}
else
{
d[i] = Math.Min(Math.Min(d[i - 1] + 1, p[i] + 1), p[i - 1]);
}
bestPossibleEditDistance = Math.Min(bestPossibleEditDistance, d[i]);
}
//After calculating row i, the best possible edit distance
//can be found by found by finding the smallest value in a given column.
//If the bestPossibleEditDistance is greater than the max distance, abort.
if (j > maxDistance && bestPossibleEditDistance > maxDistance)
{ //equal is okay, but not greater
//the closest the target can be to the text is just too far away.
//this target is leaving the party early.
return int.MinValue;
}
// copy current distance counts to 'previous row' distance counts: swap p and d
int[] _d = p;
p = d;
d = _d;
}
// our last action in the above loop was to switch d and p, so p now
// actually has the most recent cost counts
return p[n];
}
private float CalcSimilarity(int edits, int m, int n)
{
// this will return less than 0.0 when the edit distance is
// greater than the number of characters in the shorter word.
// but this was the formula that was previously used in FuzzyTermEnum,
// so it has not been changed (even though minimumSimilarity must be
// greater than 0.0)
return 1.0f - ((float)edits / (float)(outerInstance.m_realPrefixLength + Math.Min(n, m)));
}
/// <summary>
/// The max Distance is the maximum Levenshtein distance for the text
/// compared to some other value that results in score that is
/// better than the minimum similarity.
/// </summary>
/// <param name="m">the length of the "other value"</param>
/// <returns>the maximum levenshtein distance that we care about</returns>
private int CalculateMaxDistance(int m)
{
return outerInstance.m_raw ? outerInstance.m_maxEdits : Math.Min(outerInstance.m_maxEdits,
(int)((1 - outerInstance.MinSimilarity) * (Math.Min(text.Length, m) + outerInstance.m_realPrefixLength)));
}
}
}
}