| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.search.spell; |
| |
| /** |
| * N-Gram version of edit distance based on paper by Grzegorz Kondrak, |
| * "N-gram similarity and distance". Proceedings of the Twelfth International |
| * Conference on String Processing and Information Retrieval (SPIRE 2005), pp. 115-126, |
| * Buenos Aires, Argentina, November 2005. |
| * http://www.cs.ualberta.ca/~kondrak/papers/spire05.pdf |
| * |
| * This implementation uses the position-based optimization to compute partial |
| * matches of n-gram sub-strings and adds a null-character prefix of size n-1 |
| * so that the first character is contained in the same number of n-grams as |
| * a middle character. Null-character prefix matches are discounted so that |
| * strings with no matching characters will return a distance of 0. |
| * |
| */ |
| public class NGramDistance implements StringDistance { |
| |
| private int n; |
| |
| /** |
| * Creates an N-Gram distance measure using n-grams of the specified size. |
| * @param size The size of the n-gram to be used to compute the string distance. |
| */ |
| public NGramDistance(int size) { |
| this.n = size; |
| } |
| |
| /** |
| * Creates an N-Gram distance measure using n-grams of size 2. |
| */ |
| public NGramDistance() { |
| this(2); |
| } |
| |
| @Override |
| public float getDistance(String source, String target) { |
| final int sl = source.length(); |
| final int tl = target.length(); |
| |
| if (sl == 0 || tl == 0) { |
| if (sl == tl) { |
| return 1; |
| } |
| else { |
| return 0; |
| } |
| } |
| |
| int cost = 0; |
| if (sl < n || tl < n) { |
| for (int i=0,ni=Math.min(sl,tl);i<ni;i++) { |
| if (source.charAt(i) == target.charAt(i)) { |
| cost++; |
| } |
| } |
| return (float) cost/Math.max(sl, tl); |
| } |
| |
| char[] sa = new char[sl+n-1]; |
| float p[]; //'previous' cost array, horizontally |
| float d[]; // cost array, horizontally |
| float _d[]; //placeholder to assist in swapping p and d |
| |
| //construct sa with prefix |
| for (int i=0;i<sa.length;i++) { |
| if (i < n-1) { |
| sa[i]=0; //add prefix |
| } |
| else { |
| sa[i] = source.charAt(i-n+1); |
| } |
| } |
| p = new float[sl+1]; |
| d = new float[sl+1]; |
| |
| // indexes into strings s and t |
| int i; // iterates through source |
| int j; // iterates through target |
| |
| char[] t_j = new char[n]; // jth n-gram of t |
| |
| for (i = 0; i<=sl; i++) { |
| p[i] = i; |
| } |
| |
| for (j = 1; j<=tl; j++) { |
| //construct t_j n-gram |
| if (j < n) { |
| for (int ti=0;ti<n-j;ti++) { |
| t_j[ti]=0; //add prefix |
| } |
| for (int ti=n-j;ti<n;ti++) { |
| t_j[ti]=target.charAt(ti-(n-j)); |
| } |
| } |
| else { |
| t_j = target.substring(j-n, j).toCharArray(); |
| } |
| d[0] = j; |
| for (i=1; i<=sl; i++) { |
| cost = 0; |
| int tn=n; |
| //compare sa to t_j |
| for (int ni=0;ni<n;ni++) { |
| if (sa[i-1+ni] != t_j[ni]) { |
| cost++; |
| } |
| else if (sa[i-1+ni] == 0) { //discount matches on prefix |
| tn--; |
| } |
| } |
| float ec = (float) cost/tn; |
| // minimum of cell to the left+1, to the top+1, diagonally left and up +cost |
| d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1), p[i-1]+ec); |
| } |
| // copy current distance counts to 'previous row' distance counts |
| _d = p; |
| p = d; |
| d = _d; |
| } |
| |
| // our last action in the above loop was to switch d and p, so p now |
| // actually has the most recent cost counts |
| return 1.0f - (p[sl] / Math.max(tl, sl)); |
| } |
| |
| @Override |
| public int hashCode() { |
| return 1427 * n * getClass().hashCode(); |
| } |
| |
| @Override |
| public boolean equals(Object obj) { |
| if (this == obj) return true; |
| if (null == obj || getClass() != obj.getClass()) return false; |
| |
| NGramDistance o = (NGramDistance)obj; |
| return o.n == this.n; |
| } |
| |
| @Override |
| public String toString() { |
| return "ngram(" + n + ")"; |
| } |
| } |