blob: 98559c47984a60c85a3b03fe9455988f96d872b8 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.spell;
/**
* N-Gram version of edit distance based on paper by Grzegorz Kondrak, "N-gram similarity and
* distance". Proceedings of the Twelfth International Conference on String Processing and
* Information Retrieval (SPIRE 2005), pp. 115-126, Buenos Aires, Argentina, November 2005.
* http://www.cs.ualberta.ca/~kondrak/papers/spire05.pdf
*
* <p>This implementation uses the position-based optimization to compute partial matches of n-gram
* sub-strings and adds a null-character prefix of size n-1 so that the first character is contained
* in the same number of n-grams as a middle character. Null-character prefix matches are discounted
* so that strings with no matching characters will return a distance of 0.
*/
public class NGramDistance implements StringDistance {
private int n;
/**
* Creates an N-Gram distance measure using n-grams of the specified size.
*
* @param size The size of the n-gram to be used to compute the string distance.
*/
public NGramDistance(int size) {
this.n = size;
}
/** Creates an N-Gram distance measure using n-grams of size 2. */
public NGramDistance() {
this(2);
}
@Override
public float getDistance(String source, String target) {
final int sl = source.length();
final int tl = target.length();
if (sl == 0 || tl == 0) {
if (sl == tl) {
return 1;
} else {
return 0;
}
}
int cost = 0;
if (sl < n || tl < n) {
for (int i = 0, ni = Math.min(sl, tl); i < ni; i++) {
if (source.charAt(i) == target.charAt(i)) {
cost++;
}
}
return (float) cost / Math.max(sl, tl);
}
char[] sa = new char[sl + n - 1];
float p[]; // 'previous' cost array, horizontally
float d[]; // cost array, horizontally
float _d[]; // placeholder to assist in swapping p and d
// construct sa with prefix
for (int i = 0; i < sa.length; i++) {
if (i < n - 1) {
sa[i] = 0; // add prefix
} else {
sa[i] = source.charAt(i - n + 1);
}
}
p = new float[sl + 1];
d = new float[sl + 1];
// indexes into strings s and t
int i; // iterates through source
int j; // iterates through target
char[] t_j = new char[n]; // jth n-gram of t
for (i = 0; i <= sl; i++) {
p[i] = i;
}
for (j = 1; j <= tl; j++) {
// construct t_j n-gram
if (j < n) {
for (int ti = 0; ti < n - j; ti++) {
t_j[ti] = 0; // add prefix
}
for (int ti = n - j; ti < n; ti++) {
t_j[ti] = target.charAt(ti - (n - j));
}
} else {
t_j = target.substring(j - n, j).toCharArray();
}
d[0] = j;
for (i = 1; i <= sl; i++) {
cost = 0;
int tn = n;
// compare sa to t_j
for (int ni = 0; ni < n; ni++) {
if (sa[i - 1 + ni] != t_j[ni]) {
cost++;
} else if (sa[i - 1 + ni] == 0) { // discount matches on prefix
tn--;
}
}
float ec = (float) cost / tn;
// minimum of cell to the left+1, to the top+1, diagonally left and up +cost
d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + ec);
}
// copy current distance counts to 'previous row' distance counts
_d = p;
p = d;
d = _d;
}
// our last action in the above loop was to switch d and p, so p now
// actually has the most recent cost counts
return 1.0f - (p[sl] / Math.max(tl, sl));
}
@Override
public int hashCode() {
return 1427 * n * getClass().hashCode();
}
@Override
public boolean equals(Object obj) {
if (this == obj) return true;
if (null == obj || getClass() != obj.getClass()) return false;
NGramDistance o = (NGramDistance) obj;
return o.n == this.n;
}
@Override
public String toString() {
return "ngram(" + n + ")";
}
}