blob: 7ede17243b86ec07e970411bdaf3fe158601ee1d [file] [log] [blame]
package org.apache.lucene.search;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
/** Subclass of FilteredTermEnum for enumerating all terms that are similiar to the specified filter term.
<p>Term enumerations are always ordered by Term.compareTo(). Each term in
the enumeration is greater than all that precede it. */
public final class FuzzyTermEnum extends FilteredTermEnum {
double distance;
boolean endEnum = false;
Term searchTerm = null;
String field = "";
String text = "";
int textlen;
String prefix = "";
int prefixLength = 0;
float minimumSimilarity;
double scale_factor;
/**
* Empty prefix and minSimilarity of 0.5f are used.
*
* @param reader
* @param term
* @throws IOException
* @see #FuzzyTermEnum(IndexReader, Term, float, int)
*/
public FuzzyTermEnum(IndexReader reader, Term term) throws IOException {
this(reader, term, FuzzyQuery.defaultMinSimilarity, 0);
}
/**
* This is the standard FuzzyTermEnum with an empty prefix.
*
* @param reader
* @param term
* @param minSimilarity
* @throws IOException
* @see #FuzzyTermEnum(IndexReader, Term, float, int)
*/
public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity) throws IOException {
this(reader, term, minSimilarity, 0);
}
/**
* Constructor for enumeration of all terms from specified <code>reader</code> which share a prefix of
* length <code>prefixLength</code> with <code>term</code> and which have a fuzzy similarity &gt;
* <code>minSimilarity</code>.
*
* @param reader Delivers terms.
* @param term Pattern term.
* @param minSimilarity Minimum required similarity for terms from the reader. Default value is 0.5f.
* @param prefixLength Length of required common prefix. Default value is 0.
* @throws IOException
*/
public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity, int prefixLength) throws IOException {
super();
minimumSimilarity = minSimilarity;
scale_factor = 1.0f / (1.0f - minimumSimilarity);
searchTerm = term;
field = searchTerm.field();
text = searchTerm.text();
textlen = text.length();
if(prefixLength > 0 && prefixLength < textlen){
this.prefixLength = prefixLength;
prefix = text.substring(0, prefixLength);
text = text.substring(prefixLength);
textlen = text.length();
}
setEnum(reader.terms(new Term(searchTerm.field(), prefix)));
}
/**
The termCompare method in FuzzyTermEnum uses Levenshtein distance to
calculate the distance between the given term and the comparing term.
*/
protected final boolean termCompare(Term term) {
String termText = term.text();
if (field == term.field() && termText.startsWith(prefix)) {
String target = termText.substring(prefixLength);
int targetlen = target.length();
int dist = editDistance(text, target, textlen, targetlen);
distance = 1 - ((double)dist / (double)Math.min(textlen, targetlen));
return (distance > minimumSimilarity);
}
endEnum = true;
return false;
}
protected final float difference() {
return (float)((distance - minimumSimilarity) * scale_factor);
}
public final boolean endEnum() {
return endEnum;
}
/******************************
* Compute Levenshtein distance
******************************/
/**
Finds and returns the smallest of three integers
*/
private static final int min(int a, int b, int c) {
int t = (a < b) ? a : b;
return (t < c) ? t : c;
}
/**
* This static array saves us from the time required to create a new array
* everytime editDistance is called.
*/
private int e[][] = new int[1][1];
/**
Levenshtein distance also known as edit distance is a measure of similiarity
between two strings where the distance is measured as the number of character
deletions, insertions or substitutions required to transform one string to
the other string.
<p>This method takes in four parameters; two strings and their respective
lengths to compute the Levenshtein distance between the two strings.
The result is returned as an integer.
*/
private final int editDistance(String s, String t, int n, int m) {
if (e.length <= n || e[0].length <= m) {
e = new int[Math.max(e.length, n+1)][Math.max(e[0].length, m+1)];
}
int d[][] = e; // matrix
int i; // iterates through s
int j; // iterates through t
char s_i; // ith character of s
if (n == 0) return m;
if (m == 0) return n;
// init matrix d
for (i = 0; i <= n; i++) d[i][0] = i;
for (j = 0; j <= m; j++) d[0][j] = j;
// start computing edit distance
for (i = 1; i <= n; i++) {
s_i = s.charAt(i - 1);
for (j = 1; j <= m; j++) {
if (s_i != t.charAt(j-1))
d[i][j] = min(d[i-1][j], d[i][j-1], d[i-1][j-1])+1;
else d[i][j] = min(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]);
}
}
// we got the result!
return d[n][m];
}
public void close() throws IOException {
super.close();
searchTerm = null;
field = null;
text = null;
}
}