blob: 9335e3182ea0d0b6989445f894c8574c48c1b321 [file] [log] [blame]
package org.apache.lucene.search;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
/** Subclass of FilteredTermEnum for enumerating all terms that are similiar to the specified filter term.
<p>Term enumerations are always ordered by Term.compareTo(). Each term in
the enumeration is greater than all that precede it. */
public final class FuzzyTermEnum extends FilteredTermEnum {
double distance;
boolean fieldMatch = false;
boolean endEnum = false;
Term searchTerm = null;
String field = "";
String text = "";
int textlen;
public FuzzyTermEnum(IndexReader reader, Term term) throws IOException {
super();
searchTerm = term;
field = searchTerm.field();
text = searchTerm.text();
textlen = text.length();
setEnum(reader.terms(new Term(searchTerm.field(), "")));
}
/**
The termCompare method in FuzzyTermEnum uses Levenshtein distance to
calculate the distance between the given term and the comparing term.
*/
protected final boolean termCompare(Term term) {
if (field == term.field()) {
String target = term.text();
int targetlen = target.length();
int dist = editDistance(text, target, textlen, targetlen);
distance = 1 - ((double)dist / (double)Math.min(textlen, targetlen));
return (distance > FUZZY_THRESHOLD);
}
endEnum = true;
return false;
}
protected final float difference() {
return (float)((distance - FUZZY_THRESHOLD) * SCALE_FACTOR);
}
public final boolean endEnum() {
return endEnum;
}
/******************************
* Compute Levenshtein distance
******************************/
public static final double FUZZY_THRESHOLD = 0.5;
public static final double SCALE_FACTOR = 1.0f / (1.0f - FUZZY_THRESHOLD);
/**
Finds and returns the smallest of three integers
*/
private static final int min(int a, int b, int c) {
int t = (a < b) ? a : b;
return (t < c) ? t : c;
}
/**
* This static array saves us from the time required to create a new array
* everytime editDistance is called.
*/
private int e[][] = new int[1][1];
/**
Levenshtein distance also known as edit distance is a measure of similiarity
between two strings where the distance is measured as the number of character
deletions, insertions or substitutions required to transform one string to
the other string.
<p>This method takes in four parameters; two strings and their respective
lengths to compute the Levenshtein distance between the two strings.
The result is returned as an integer.
*/
private final int editDistance(String s, String t, int n, int m) {
if (e.length <= n || e[0].length <= m) {
e = new int[Math.max(e.length, n+1)][Math.max(e[0].length, m+1)];
}
int d[][] = e; // matrix
int i; // iterates through s
int j; // iterates through t
char s_i; // ith character of s
if (n == 0) return m;
if (m == 0) return n;
// init matrix d
for (i = 0; i <= n; i++) d[i][0] = i;
for (j = 0; j <= m; j++) d[0][j] = j;
// start computing edit distance
for (i = 1; i <= n; i++) {
s_i = s.charAt(i - 1);
for (j = 1; j <= m; j++) {
if (s_i != t.charAt(j-1))
d[i][j] = min(d[i-1][j], d[i][j-1], d[i-1][j-1])+1;
else d[i][j] = min(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]);
}
}
// we got the result!
return d[n][m];
}
public void close() throws IOException {
super.close();
searchTerm = null;
field = null;
text = null;
}
}