src/java/org/apache/lucene/search/FuzzyTermEnum.java - lucene-solr - Git at Google

 package org.apache.lucene.search;

 /**
  * Copyright 2004 The Apache Software Foundation
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 import java.io.IOException;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;

 /** Subclass of FilteredTermEnum for enumerating all terms that are similiar to the specified filter term.

   <p>Term enumerations are always ordered by Term.compareTo().  Each term in
   the enumeration is greater than all that precede it.  */
 public final class FuzzyTermEnum extends FilteredTermEnum {
     double distance;
     boolean endEnum = false;

     Term searchTerm = null;
     String field = "";
     String text = "";
     int textlen;
     String prefix = "";
     int prefixLength = 0;
     float minimumSimilarity;
     double scale_factor;


     /**
      * Empty prefix and minSimilarity of 0.5f are used.
      *
      * @param reader
      * @param term
      * @throws IOException
      * @see #FuzzyTermEnum(IndexReader, Term, float, int)
      */
     public FuzzyTermEnum(IndexReader reader, Term term) throws IOException {
       this(reader, term, FuzzyQuery.defaultMinSimilarity, 0);
     }

     /**
      * This is the standard FuzzyTermEnum with an empty prefix.
      *
      * @param reader
      * @param term
      * @param minSimilarity
      * @throws IOException
      * @see #FuzzyTermEnum(IndexReader, Term, float, int)
      */
     public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity) throws IOException {
       this(reader, term, minSimilarity, 0);
     }

     /**
      * Constructor for enumeration of all terms from specified <code>reader</code> which share a prefix of
      * length <code>prefixLength</code> with <code>term</code> and which have a fuzzy similarity &gt;
      * <code>minSimilarity</code>.
      *
      * @param reader Delivers terms.
      * @param term Pattern term.
      * @param minSimilarity Minimum required similarity for terms from the reader. Default value is 0.5f.
      * @param prefixLength Length of required common prefix. Default value is 0.
      * @throws IOException
      */
     public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity, int prefixLength) throws IOException {
         super();
         minimumSimilarity = minSimilarity;
         scale_factor = 1.0f / (1.0f - minimumSimilarity);
         searchTerm = term;
         field = searchTerm.field();
         text = searchTerm.text();
         textlen = text.length();
         if(prefixLength > 0 && prefixLength < textlen){
             this.prefixLength = prefixLength;
             prefix = text.substring(0, prefixLength);
             text = text.substring(prefixLength);
             textlen = text.length();
         }
         setEnum(reader.terms(new Term(searchTerm.field(), prefix)));
     }

     /**
      The termCompare method in FuzzyTermEnum uses Levenshtein distance to
      calculate the distance between the given term and the comparing term.
      */
     protected final boolean termCompare(Term term) {
         String termText = term.text();
         if (field == term.field() && termText.startsWith(prefix)) {
             String target = termText.substring(prefixLength);
             int targetlen = target.length();
             int dist = editDistance(text, target, textlen, targetlen);
             distance = 1 - ((double)dist / (double)Math.min(textlen, targetlen));
             return (distance > minimumSimilarity);
         }
         endEnum = true;
         return false;
     }

     protected final float difference() {
         return (float)((distance - minimumSimilarity) * scale_factor);
     }

     public final boolean endEnum() {
         return endEnum;
     }

     /******************************
      * Compute Levenshtein distance
      ******************************/

     /**
      Finds and returns the smallest of three integers
      */
     private static final int min(int a, int b, int c) {
         int t = (a < b) ? a : b;
         return (t < c) ? t : c;
     }

     /**
      * This static array saves us from the time required to create a new array
      * everytime editDistance is called.
      */
     private int e[][] = new int[1][1];

     /**
      Levenshtein distance also known as edit distance is a measure of similiarity
      between two strings where the distance is measured as the number of character
      deletions, insertions or substitutions required to transform one string to
      the other string.
      <p>This method takes in four parameters; two strings and their respective
      lengths to compute the Levenshtein distance between the two strings.
      The result is returned as an integer.
      */
     private final int editDistance(String s, String t, int n, int m) {
         if (e.length <= n || e[0].length <= m) {
             e = new int[Math.max(e.length, n+1)][Math.max(e[0].length, m+1)];
         }
         int d[][] = e; // matrix
         int i; // iterates through s
         int j; // iterates through t
         char s_i; // ith character of s

         if (n == 0) return m;
         if (m == 0) return n;

         // init matrix d
         for (i = 0; i <= n; i++) d[i][0] = i;
         for (j = 0; j <= m; j++) d[0][j] = j;

         // start computing edit distance
         for (i = 1; i <= n; i++) {
             s_i = s.charAt(i - 1);
             for (j = 1; j <= m; j++) {
                 if (s_i != t.charAt(j-1))
                     d[i][j] = min(d[i-1][j], d[i][j-1], d[i-1][j-1])+1;
                 else d[i][j] = min(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]);
             }
         }

         // we got the result!
         return d[n][m];
     }

   public void close() throws IOException {
       super.close();
       searchTerm = null;
       field = null;
       text = null;
   }
 }
	package org.apache.lucene.search;

	/**
	* Copyright 2004 The Apache Software Foundation
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	import java.io.IOException;
	import org.apache.lucene.index.IndexReader;
	import org.apache.lucene.index.Term;

	/** Subclass of FilteredTermEnum for enumerating all terms that are similiar to the specified filter term.

	<p>Term enumerations are always ordered by Term.compareTo(). Each term in
	the enumeration is greater than all that precede it. */
	public final class FuzzyTermEnum extends FilteredTermEnum {
	double distance;
	boolean endEnum = false;

	Term searchTerm = null;
	String field = "";
	String text = "";
	int textlen;
	String prefix = "";
	int prefixLength = 0;
	float minimumSimilarity;
	double scale_factor;


	/**
	* Empty prefix and minSimilarity of 0.5f are used.
	*
	* @param reader
	* @param term
	* @throws IOException
	* @see #FuzzyTermEnum(IndexReader, Term, float, int)
	*/
	public FuzzyTermEnum(IndexReader reader, Term term) throws IOException {
	this(reader, term, FuzzyQuery.defaultMinSimilarity, 0);
	}

	/**
	* This is the standard FuzzyTermEnum with an empty prefix.
	*
	* @param reader
	* @param term
	* @param minSimilarity
	* @throws IOException
	* @see #FuzzyTermEnum(IndexReader, Term, float, int)
	*/
	public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity) throws IOException {
	this(reader, term, minSimilarity, 0);
	}

	/**
	* Constructor for enumeration of all terms from specified <code>reader</code> which share a prefix of
	* length <code>prefixLength</code> with <code>term</code> and which have a fuzzy similarity >
	* <code>minSimilarity</code>.
	*
	* @param reader Delivers terms.
	* @param term Pattern term.
	* @param minSimilarity Minimum required similarity for terms from the reader. Default value is 0.5f.
	* @param prefixLength Length of required common prefix. Default value is 0.
	* @throws IOException
	*/
	public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity, int prefixLength) throws IOException {
	super();
	minimumSimilarity = minSimilarity;
	scale_factor = 1.0f / (1.0f - minimumSimilarity);
	searchTerm = term;
	field = searchTerm.field();
	text = searchTerm.text();
	textlen = text.length();
	if(prefixLength > 0 && prefixLength < textlen){
	this.prefixLength = prefixLength;
	prefix = text.substring(0, prefixLength);
	text = text.substring(prefixLength);
	textlen = text.length();
	}
	setEnum(reader.terms(new Term(searchTerm.field(), prefix)));
	}

	/**
	The termCompare method in FuzzyTermEnum uses Levenshtein distance to
	calculate the distance between the given term and the comparing term.
	*/
	protected final boolean termCompare(Term term) {
	String termText = term.text();
	if (field == term.field() && termText.startsWith(prefix)) {
	String target = termText.substring(prefixLength);
	int targetlen = target.length();
	int dist = editDistance(text, target, textlen, targetlen);
	distance = 1 - ((double)dist / (double)Math.min(textlen, targetlen));
	return (distance > minimumSimilarity);
	}
	endEnum = true;
	return false;
	}

	protected final float difference() {
	return (float)((distance - minimumSimilarity) * scale_factor);
	}

	public final boolean endEnum() {
	return endEnum;
	}

	/******************************
	* Compute Levenshtein distance
	******************************/

	/**
	Finds and returns the smallest of three integers
	*/
	private static final int min(int a, int b, int c) {
	int t = (a < b) ? a : b;
	return (t < c) ? t : c;
	}

	/**
	* This static array saves us from the time required to create a new array
	* everytime editDistance is called.
	*/
	private int e[][] = new int[1][1];

	/**
	Levenshtein distance also known as edit distance is a measure of similiarity
	between two strings where the distance is measured as the number of character
	deletions, insertions or substitutions required to transform one string to
	the other string.
	<p>This method takes in four parameters; two strings and their respective
	lengths to compute the Levenshtein distance between the two strings.
	The result is returned as an integer.
	*/
	private final int editDistance(String s, String t, int n, int m) {
	if (e.length <= n \|\| e[0].length <= m) {
	e = new int[Math.max(e.length, n+1)][Math.max(e[0].length, m+1)];
	}
	int d[][] = e; // matrix
	int i; // iterates through s
	int j; // iterates through t
	char s_i; // ith character of s

	if (n == 0) return m;
	if (m == 0) return n;

	// init matrix d
	for (i = 0; i <= n; i++) d[i][0] = i;
	for (j = 0; j <= m; j++) d[0][j] = j;

	// start computing edit distance
	for (i = 1; i <= n; i++) {
	s_i = s.charAt(i - 1);
	for (j = 1; j <= m; j++) {
	if (s_i != t.charAt(j-1))
	d[i][j] = min(d[i-1][j], d[i][j-1], d[i-1][j-1])+1;
	else d[i][j] = min(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]);
	}
	}

	// we got the result!
	return d[n][m];
	}

	public void close() throws IOException {
	super.close();
	searchTerm = null;
	field = null;
	text = null;
	}
	}