src/Lucene.Net.Core/Search/FuzzyTermEnum.cs - lucenenet - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 using System;

 using IndexReader = Lucene.Net.Index.IndexReader;
 using Term = Lucene.Net.Index.Term;

 namespace Lucene.Net.Search
 {

     /// <summary>Subclass of FilteredTermEnum for enumerating all terms that are similiar
     /// to the specified filter term.
     ///
     /// <p/>Term enumerations are always ordered by Term.compareTo().  Each term in
     /// the enumeration is greater than all that precede it.
     /// </summary>
     public sealed class FuzzyTermEnum:FilteredTermEnum
     {
         /* Allows us save time required to create a new array
         * everytime similarity is called.
         */
         private int[] p;
         private int[] d;

         private float similarity;
         private bool endEnum = false;

         private bool isDisposed;

         private Term searchTerm = null;
         private System.String field;
         private System.String text;
         private System.String prefix;

         private float minimumSimilarity;
         private float scale_factor;

         /// <summary> Creates a FuzzyTermEnum with an empty prefix and a minSimilarity of 0.5f.
         /// <p/>
         /// After calling the constructor the enumeration is already pointing to the first
         /// valid term if such a term exists.
         ///
         /// </summary>
         /// <param name="reader">
         /// </param>
         /// <param name="term">
         /// </param>
         /// <throws>  IOException </throws>
         /// <seealso cref="FuzzyTermEnum(IndexReader, Term, float, int)">
         /// </seealso>
         public FuzzyTermEnum(IndexReader reader, Term term):this(reader, term, FuzzyQuery.defaultMinSimilarity, FuzzyQuery.defaultPrefixLength)
         {
         }

         /// <summary> Creates a FuzzyTermEnum with an empty prefix.
         /// <p/>
         /// After calling the constructor the enumeration is already pointing to the first
         /// valid term if such a term exists.
         ///
         /// </summary>
         /// <param name="reader">
         /// </param>
         /// <param name="term">
         /// </param>
         /// <param name="minSimilarity">
         /// </param>
         /// <throws>  IOException </throws>
         /// <seealso cref="FuzzyTermEnum(IndexReader, Term, float, int)">
         /// </seealso>
         public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity):this(reader, term, minSimilarity, FuzzyQuery.defaultPrefixLength)
         {
         }

         /// <summary> Constructor for enumeration of all terms from specified <c>reader</c> which share a prefix of
         /// length <c>prefixLength</c> with <c>term</c> and which have a fuzzy similarity &gt;
         /// <c>minSimilarity</c>.
         /// <p/>
         /// After calling the constructor the enumeration is already pointing to the first
         /// valid term if such a term exists.
         ///
         /// </summary>
         /// <param name="reader">Delivers terms.
         /// </param>
         /// <param name="term">Pattern term.
         /// </param>
         /// <param name="minSimilarity">Minimum required similarity for terms from the reader. Default value is 0.5f.
         /// </param>
         /// <param name="prefixLength">Length of required common prefix. Default value is 0.
         /// </param>
         /// <throws>  IOException </throws>
         public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity, int prefixLength):base()
         {

             if (minSimilarity >= 1.0f)
                 throw new System.ArgumentException("minimumSimilarity cannot be greater than or equal to 1");
             else if (minSimilarity < 0.0f)
                 throw new System.ArgumentException("minimumSimilarity cannot be less than 0");
             if (prefixLength < 0)
                 throw new System.ArgumentException("prefixLength cannot be less than 0");

             this.minimumSimilarity = minSimilarity;
             this.scale_factor = 1.0f / (1.0f - minimumSimilarity);
             this.searchTerm = term;
             this.field = searchTerm.Field;

             //The prefix could be longer than the word.
             //It's kind of silly though.  It means we must match the entire word.
             int fullSearchTermLength = searchTerm.Text.Length;
             int realPrefixLength = prefixLength > fullSearchTermLength?fullSearchTermLength:prefixLength;

             this.text = searchTerm.Text.Substring(realPrefixLength);
             this.prefix = searchTerm.Text.Substring(0, (realPrefixLength) - (0));

             this.p = new int[this.text.Length + 1];
             this.d = new int[this.text.Length + 1];

             SetEnum(reader.Terms(new Term(searchTerm.Field, prefix)));
         }

         /// <summary> The termCompare method in FuzzyTermEnum uses Levenshtein distance to
         /// calculate the distance between the given term and the comparing term.
         /// </summary>
         protected internal override bool TermCompare(Term term)
         {
             if ((System.Object) field == (System.Object) term.Field && term.Text.StartsWith(prefix))
             {
                 System.String target = term.Text.Substring(prefix.Length);
                 this.similarity = Similarity(target);
                 return (similarity > minimumSimilarity);
             }
             endEnum = true;
             return false;
         }

         public override float Difference()
         {
             return ((similarity - minimumSimilarity) * scale_factor);
         }

         public override bool EndEnum()
         {
             return endEnum;
         }

         // <summary>
         // ***************************
         // Compute Levenshtein distance
         // ****************************
         // </summary>

         /// <summary> <p/>Similarity returns a number that is 1.0f or less (including negative numbers)
         /// based on how similar the Term is compared to a target term.  It returns
         /// exactly 0.0f when
         /// <c>
         /// editDistance &gt; maximumEditDistance</c>
         /// Otherwise it returns:
         /// <c>
         /// 1 - (editDistance / length)</c>
         /// where length is the length of the shortest term (text or target) including a
         /// prefix that are identical and editDistance is the Levenshtein distance for
         /// the two words.<p/>
         ///
         /// <p/>Embedded within this algorithm is a fail-fast Levenshtein distance
         /// algorithm.  The fail-fast algorithm differs from the standard Levenshtein
         /// distance algorithm in that it is aborted if it is discovered that the
         /// mimimum distance between the words is greater than some threshold.
         ///
         /// <p/>To calculate the maximum distance threshold we use the following formula:
         /// <c>
         /// (1 - minimumSimilarity) * length</c>
         /// where length is the shortest term including any prefix that is not part of the
         /// similarity comparision.  This formula was derived by solving for what maximum value
         /// of distance returns false for the following statements:
         /// <code>
         /// similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen)));
         /// return (similarity > minimumSimilarity);</code>
         /// where distance is the Levenshtein distance for the two words.
         /// <p/>
         /// <p/>Levenshtein distance (also known as edit distance) is a measure of similiarity
         /// between two strings where the distance is measured as the number of character
         /// deletions, insertions or substitutions required to transform one string to
         /// the other string.
         /// </summary>
         /// <param name="target">the target word or phrase
         /// </param>
         /// <returns> the similarity,  0.0 or less indicates that it matches less than the required
         /// threshold and 1.0 indicates that the text and target are identical
         /// </returns>
         private float Similarity(System.String target)
         {

             int m = target.Length;
             int n = text.Length;
             if (n == 0)
             {
                 //we don't have anything to compare.  That means if we just add
                 //the letters for m we get the new word
                 return prefix.Length == 0 ? 0.0f : 1.0f - ((float)m / prefix.Length);
             }
             if (m == 0)
             {
                 return prefix.Length == 0 ? 0.0f : 1.0f - ((float)n / prefix.Length);
             }

             int maxDistance = CalculateMaxDistance(m);

             if (maxDistance < System.Math.Abs(m - n))
             {
                 //just adding the characters of m to n or vice-versa results in
                 //too many edits
                 //for example "pre" length is 3 and "prefixes" length is 8.  We can see that
                 //given this optimal circumstance, the edit distance cannot be less than 5.
                 //which is 8-3 or more precisesly Math.abs(3-8).
                 //if our maximum edit distance is 4, then we can discard this word
                 //without looking at it.
                 return 0.0f;
             }

             // init matrix d
             for (int i = 0; i < n; ++i)
             {
                 p[i] = i;
             }

                 // start computing edit distance
             for (int j = 1; j <= m; ++j)
             {
                 int bestPossibleEditDistance = m;
                 char t_j = target[j - 1];
                 d[0] = j;
                 for (int i = 1; i <= n; ++i)
                 {
                     // minimum of cell to the left+1, to the top+1, diagonally left and up +(0|1)
                     if (t_j != text[i - 1])
                     {
                         d[i] = Math.Min(Math.Min(d[i - 1], p[i]), p[i - 1]) + 1;
                     }
                     else
                     {
                         d[i] = Math.Min(Math.Min(d[i - 1] + 1, p[i] + 1), p[i - 1]);
                     }
                     bestPossibleEditDistance = System.Math.Min(bestPossibleEditDistance, d[i]);
                 }

                 //After calculating row i, the best possible edit distance
                 //can be found by found by finding the smallest value in a given column.
                 //If the bestPossibleEditDistance is greater than the max distance, abort.

                 if (j > maxDistance && bestPossibleEditDistance > maxDistance)
                 {
                     //equal is okay, but not greater
                     //the closest the target can be to the text is just too far away.
                     //this target is leaving the party early.
                     return 0.0f;
                 }

                 // copy current distance counts to 'previous row' distance counts: swap p and d
                   int[] _d = p;
                   p = d;
                   d = _d;
             }

             // our last action in the above loop was to switch d and p, so p now
             // actually has the most recent cost counts

             // this will return less than 0.0 when the edit distance is
             // greater than the number of characters in the shorter word.
             // but this was the formula that was previously used in FuzzyTermEnum,
             // so it has not been changed (even though minimumSimilarity must be
             // greater than 0.0)
             return 1.0f - (p[n] / (float)(prefix.Length + System.Math.Min(n, m)));

         }

         /// <summary> The max Distance is the maximum Levenshtein distance for the text
         /// compared to some other value that results in score that is
         /// better than the minimum similarity.
         /// </summary>
         /// <param name="m">the length of the "other value"
         /// </param>
         /// <returns> the maximum levenshtein distance that we care about
         /// </returns>
         private int CalculateMaxDistance(int m)
         {
             return (int) ((1 - minimumSimilarity) * (System.Math.Min(text.Length, m) + prefix.Length));
         }

         protected override void Dispose(bool disposing)
         {
             if (isDisposed) return;

             if (disposing)
             {
                 p = null;
                 d = null;
                 searchTerm = null;
             }

             isDisposed = true;
             base.Dispose(disposing); //call super.close() and let the garbage collector do its work.
         }
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	using System;

	using IndexReader = Lucene.Net.Index.IndexReader;
	using Term = Lucene.Net.Index.Term;

	namespace Lucene.Net.Search
	{

	/// <summary>Subclass of FilteredTermEnum for enumerating all terms that are similiar
	/// to the specified filter term.
	///
	/// <p/>Term enumerations are always ordered by Term.compareTo(). Each term in
	/// the enumeration is greater than all that precede it.
	/// </summary>
	public sealed class FuzzyTermEnum:FilteredTermEnum
	{
	/* Allows us save time required to create a new array
	* everytime similarity is called.
	*/
	private int[] p;
	private int[] d;

	private float similarity;
	private bool endEnum = false;

	private bool isDisposed;

	private Term searchTerm = null;
	private System.String field;
	private System.String text;
	private System.String prefix;

	private float minimumSimilarity;
	private float scale_factor;

	/// <summary> Creates a FuzzyTermEnum with an empty prefix and a minSimilarity of 0.5f.
	/// <p/>
	/// After calling the constructor the enumeration is already pointing to the first
	/// valid term if such a term exists.
	///
	/// </summary>
	/// <param name="reader">
	/// </param>
	/// <param name="term">
	/// </param>
	/// <throws> IOException </throws>
	/// <seealso cref="FuzzyTermEnum(IndexReader, Term, float, int)">
	/// </seealso>
	public FuzzyTermEnum(IndexReader reader, Term term):this(reader, term, FuzzyQuery.defaultMinSimilarity, FuzzyQuery.defaultPrefixLength)
	{
	}

	/// <summary> Creates a FuzzyTermEnum with an empty prefix.
	/// <p/>
	/// After calling the constructor the enumeration is already pointing to the first
	/// valid term if such a term exists.
	///
	/// </summary>
	/// <param name="reader">
	/// </param>
	/// <param name="term">
	/// </param>
	/// <param name="minSimilarity">
	/// </param>
	/// <throws> IOException </throws>
	/// <seealso cref="FuzzyTermEnum(IndexReader, Term, float, int)">
	/// </seealso>
	public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity):this(reader, term, minSimilarity, FuzzyQuery.defaultPrefixLength)
	{
	}

	/// <summary> Constructor for enumeration of all terms from specified <c>reader</c> which share a prefix of
	/// length <c>prefixLength</c> with <c>term</c> and which have a fuzzy similarity >
	/// <c>minSimilarity</c>.
	/// <p/>
	/// After calling the constructor the enumeration is already pointing to the first
	/// valid term if such a term exists.
	///
	/// </summary>
	/// <param name="reader">Delivers terms.
	/// </param>
	/// <param name="term">Pattern term.
	/// </param>
	/// <param name="minSimilarity">Minimum required similarity for terms from the reader. Default value is 0.5f.
	/// </param>
	/// <param name="prefixLength">Length of required common prefix. Default value is 0.
	/// </param>
	/// <throws> IOException </throws>
	public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity, int prefixLength):base()
	{

	if (minSimilarity >= 1.0f)
	throw new System.ArgumentException("minimumSimilarity cannot be greater than or equal to 1");
	else if (minSimilarity < 0.0f)
	throw new System.ArgumentException("minimumSimilarity cannot be less than 0");
	if (prefixLength < 0)
	throw new System.ArgumentException("prefixLength cannot be less than 0");

	this.minimumSimilarity = minSimilarity;
	this.scale_factor = 1.0f / (1.0f - minimumSimilarity);
	this.searchTerm = term;
	this.field = searchTerm.Field;

	//The prefix could be longer than the word.
	//It's kind of silly though. It means we must match the entire word.
	int fullSearchTermLength = searchTerm.Text.Length;
	int realPrefixLength = prefixLength > fullSearchTermLength?fullSearchTermLength:prefixLength;

	this.text = searchTerm.Text.Substring(realPrefixLength);
	this.prefix = searchTerm.Text.Substring(0, (realPrefixLength) - (0));

	this.p = new int[this.text.Length + 1];
	this.d = new int[this.text.Length + 1];

	SetEnum(reader.Terms(new Term(searchTerm.Field, prefix)));
	}

	/// <summary> The termCompare method in FuzzyTermEnum uses Levenshtein distance to
	/// calculate the distance between the given term and the comparing term.
	/// </summary>
	protected internal override bool TermCompare(Term term)
	{
	if ((System.Object) field == (System.Object) term.Field && term.Text.StartsWith(prefix))
	{
	System.String target = term.Text.Substring(prefix.Length);
	this.similarity = Similarity(target);
	return (similarity > minimumSimilarity);
	}
	endEnum = true;
	return false;
	}

	public override float Difference()
	{
	return ((similarity - minimumSimilarity) * scale_factor);
	}

	public override bool EndEnum()
	{
	return endEnum;
	}

	// <summary>
	// ***************************
	// Compute Levenshtein distance
	// ****************************
	// </summary>

	/// <summary> <p/>Similarity returns a number that is 1.0f or less (including negative numbers)
	/// based on how similar the Term is compared to a target term. It returns
	/// exactly 0.0f when
	/// <c>
	/// editDistance > maximumEditDistance</c>
	/// Otherwise it returns:
	/// <c>
	/// 1 - (editDistance / length)</c>
	/// where length is the length of the shortest term (text or target) including a
	/// prefix that are identical and editDistance is the Levenshtein distance for
	/// the two words.<p/>
	///
	/// <p/>Embedded within this algorithm is a fail-fast Levenshtein distance
	/// algorithm. The fail-fast algorithm differs from the standard Levenshtein
	/// distance algorithm in that it is aborted if it is discovered that the
	/// mimimum distance between the words is greater than some threshold.
	///
	/// <p/>To calculate the maximum distance threshold we use the following formula:
	/// <c>
	/// (1 - minimumSimilarity) * length</c>
	/// where length is the shortest term including any prefix that is not part of the
	/// similarity comparision. This formula was derived by solving for what maximum value
	/// of distance returns false for the following statements:
	/// <code>
	/// similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen)));
	/// return (similarity > minimumSimilarity);</code>
	/// where distance is the Levenshtein distance for the two words.
	/// <p/>
	/// <p/>Levenshtein distance (also known as edit distance) is a measure of similiarity
	/// between two strings where the distance is measured as the number of character
	/// deletions, insertions or substitutions required to transform one string to
	/// the other string.
	/// </summary>
	/// <param name="target">the target word or phrase
	/// </param>
	/// <returns> the similarity, 0.0 or less indicates that it matches less than the required
	/// threshold and 1.0 indicates that the text and target are identical
	/// </returns>
	private float Similarity(System.String target)
	{

	int m = target.Length;
	int n = text.Length;
	if (n == 0)
	{
	//we don't have anything to compare. That means if we just add
	//the letters for m we get the new word
	return prefix.Length == 0 ? 0.0f : 1.0f - ((float)m / prefix.Length);
	}
	if (m == 0)
	{
	return prefix.Length == 0 ? 0.0f : 1.0f - ((float)n / prefix.Length);
	}

	int maxDistance = CalculateMaxDistance(m);

	if (maxDistance < System.Math.Abs(m - n))
	{
	//just adding the characters of m to n or vice-versa results in
	//too many edits
	//for example "pre" length is 3 and "prefixes" length is 8. We can see that
	//given this optimal circumstance, the edit distance cannot be less than 5.
	//which is 8-3 or more precisesly Math.abs(3-8).
	//if our maximum edit distance is 4, then we can discard this word
	//without looking at it.
	return 0.0f;
	}

	// init matrix d
	for (int i = 0; i < n; ++i)
	{
	p[i] = i;
	}

	// start computing edit distance
	for (int j = 1; j <= m; ++j)
	{
	int bestPossibleEditDistance = m;
	char t_j = target[j - 1];
	d[0] = j;
	for (int i = 1; i <= n; ++i)
	{
	// minimum of cell to the left+1, to the top+1, diagonally left and up +(0\|1)
	if (t_j != text[i - 1])
	{
	d[i] = Math.Min(Math.Min(d[i - 1], p[i]), p[i - 1]) + 1;
	}
	else
	{
	d[i] = Math.Min(Math.Min(d[i - 1] + 1, p[i] + 1), p[i - 1]);
	}
	bestPossibleEditDistance = System.Math.Min(bestPossibleEditDistance, d[i]);
	}

	//After calculating row i, the best possible edit distance
	//can be found by found by finding the smallest value in a given column.
	//If the bestPossibleEditDistance is greater than the max distance, abort.

	if (j > maxDistance && bestPossibleEditDistance > maxDistance)
	{
	//equal is okay, but not greater
	//the closest the target can be to the text is just too far away.
	//this target is leaving the party early.
	return 0.0f;
	}

	// copy current distance counts to 'previous row' distance counts: swap p and d
	int[] _d = p;
	p = d;
	d = _d;
	}

	// our last action in the above loop was to switch d and p, so p now
	// actually has the most recent cost counts

	// this will return less than 0.0 when the edit distance is
	// greater than the number of characters in the shorter word.
	// but this was the formula that was previously used in FuzzyTermEnum,
	// so it has not been changed (even though minimumSimilarity must be
	// greater than 0.0)
	return 1.0f - (p[n] / (float)(prefix.Length + System.Math.Min(n, m)));

	}

	/// <summary> The max Distance is the maximum Levenshtein distance for the text
	/// compared to some other value that results in score that is
	/// better than the minimum similarity.
	/// </summary>
	/// <param name="m">the length of the "other value"
	/// </param>
	/// <returns> the maximum levenshtein distance that we care about
	/// </returns>
	private int CalculateMaxDistance(int m)
	{
	return (int) ((1 - minimumSimilarity) * (System.Math.Min(text.Length, m) + prefix.Length));
	}

	protected override void Dispose(bool disposing)
	{
	if (isDisposed) return;

	if (disposing)
	{
	p = null;
	d = null;
	searchTerm = null;
	}

	isDisposed = true;
	base.Dispose(disposing); //call super.close() and let the garbage collector do its work.
	}
	}
	}