| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| package org.apache.openjpa.lib.util; |
| |
| import java.util.Arrays; |
| import java.util.Collection; |
| |
| /** |
| * Utilities for calculating string distance. |
| * |
| * @author Marc Prud'hommeaux |
| */ |
| public class StringDistance { |
| |
| /** |
| * Returns the candidate string with the closest Levenshtein distance |
| * to the given string. |
| * |
| * @see #getClosestLevenshteinDistance(String,Collection,int) |
| */ |
| public static String getClosestLevenshteinDistance(String str, |
| String[] candidates) { |
| if (candidates == null) |
| return null; |
| return getClosestLevenshteinDistance(str, Arrays.asList(candidates)); |
| } |
| |
| /** |
| * Returns the candidate string with the closest Levenshtein distance |
| * to the given string. |
| * |
| * @see #getClosestLevenshteinDistance(String,Collection,int) |
| */ |
| public static String getClosestLevenshteinDistance(String str, |
| Collection candidates) { |
| return getClosestLevenshteinDistance(str, candidates, |
| Integer.MAX_VALUE); |
| } |
| |
| /** |
| * Returns the candidate string with the closest Levenshtein distance |
| * to the given string. |
| * |
| * @see #getClosestLevenshteinDistance(String,Collection,int) |
| */ |
| public static String getClosestLevenshteinDistance(String str, |
| String[] candidates, int threshold) { |
| if (candidates == null) |
| return null; |
| return getClosestLevenshteinDistance(str, Arrays.asList(candidates), |
| threshold); |
| } |
| |
| /** |
| * Returns the candidate string with the closest Levenshtein distance |
| * to the given string and using the threshold as the specified |
| * percentage of the length of the candidate string(0.0f-1.0f). |
| * |
| * @see #getClosestLevenshteinDistance(String,Collection,int) |
| */ |
| public static String getClosestLevenshteinDistance(String str, |
| String[] candidates, float thresholdPercentage) { |
| if (candidates == null) |
| return null; |
| |
| return getClosestLevenshteinDistance(str, Arrays.asList(candidates), |
| thresholdPercentage); |
| } |
| |
| /** |
| * Returns the candidate string with the closest Levenshtein distance |
| * to the given string and using the threshold as the specified |
| * percentage of the length of the candidate string(0.0f-1.0f). |
| * |
| * @see #getClosestLevenshteinDistance(String,Collection,int) |
| */ |
| public static String getClosestLevenshteinDistance(String str, |
| Collection candidates, float thresholdPercentage) { |
| if (str == null) |
| return null; |
| |
| thresholdPercentage = Math.min(thresholdPercentage, 1.0f); |
| thresholdPercentage = Math.max(thresholdPercentage, 0.0f); |
| |
| return getClosestLevenshteinDistance(str, candidates, |
| (int) (str.length() * thresholdPercentage)); |
| } |
| |
| /** |
| * Returns the candidate string with the closest Levenshtein distance |
| * to the given string. |
| * |
| * @param str the string to check |
| * @param candidates the list of strings to test against |
| * @param threshold the threshold distance a candidate must meet |
| * @see #getLevenshteinDistance |
| */ |
| public static String getClosestLevenshteinDistance(String str, |
| Collection<String> candidates, int threshold) { |
| if (candidates == null || candidates.isEmpty()) |
| return null; |
| |
| String minString = null; |
| int minValue = Integer.MAX_VALUE; |
| |
| for(String candidate : candidates) { |
| int distance = getLevenshteinDistance(str, candidate); |
| if (distance < minValue) { |
| minValue = distance; |
| minString = candidate; |
| } |
| } |
| |
| // return the lowest close string only if we surpass the threshhold |
| if (minValue <= threshold) |
| return minString; |
| else |
| return null; |
| } |
| |
| /** |
| * Returns the Levenshtein distance between the two strings. |
| * The distance is the minimum number of changes that need to be |
| * applied to the first string in order to get to the second |
| * string. For details of the algorithm, see |
| * <a href="http://en.wikipedia.org/wiki/Levenshtein_distance"> |
| * http://en.wikipedia.org/wiki/Levenshtein_distance</a>. |
| */ |
| public static int getLevenshteinDistance(String s, String t) { |
| int n = s.length(); |
| int m = t.length(); |
| |
| if (n == 0) |
| return m; |
| |
| if (m == 0) |
| return n; |
| |
| int[][] matrix = new int[n + 1][m + 1]; |
| |
| for (int i = 0; i <= n; i++) |
| matrix[i][0] = i; |
| |
| for (int j = 0; j <= m; j++) |
| matrix[0][j] = j; |
| |
| for (int i = 1; i <= n; i++) { |
| int si = s.charAt(i - 1); |
| |
| for (int j = 1; j <= m; j++) { |
| int tj = t.charAt(j - 1); |
| |
| int cost; |
| |
| if (si == tj) |
| cost = 0; |
| else |
| cost = 1; |
| |
| matrix[i][j] = min(matrix[i - 1][j] + 1, matrix[i][j - 1] + 1, |
| matrix[i - 1][j - 1] + cost); |
| } |
| } |
| |
| return matrix[n][m]; |
| } |
| |
| private static int min(int a, int b, int c) { |
| int mi = a; |
| |
| if (b < mi) |
| mi = b; |
| |
| if (c < mi) |
| mi = c; |
| |
| return mi; |
| } |
| } |
| |