| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.commons.text.similarity; |
| |
| /** |
| * A similarity algorithm indicating the length of the longest common subsequence between two strings. |
| * |
| * <p> |
| * The Longest common subsequence algorithm returns the length of the longest subsequence that two strings have in |
| * common. Two strings that are entirely different, return a value of 0, and two strings that return a value |
| * of the commonly shared length implies that the strings are completely the same in value and position. |
| * <i>Note.</i> Generally this algorithm is fairly inefficient, as for length <i>m</i>, <i>n</i> of the input |
| * <code>CharSequence</code>'s <code>left</code> and <code>right</code> respectively, the runtime of the |
| * algorithm is <i>O(m*n)</i>. |
| * </p> |
| * |
| * <p> |
| * This implementation is based on the Longest Commons Substring algorithm |
| * from <a href="https://en.wikipedia.org/wiki/Longest_common_subsequence_problem"> |
| * https://en.wikipedia.org/wiki/Longest_common_subsequence_problem</a>. |
| * </p> |
| * |
| * <p>For further reading see:</p> |
| * |
| * <p>Lothaire, M. <i>Applied combinatorics on words</i>. New York: Cambridge U Press, 2005. <b>12-13</b></p> |
| * |
| * @since 1.0 |
| */ |
| public class LongestCommonSubsequence implements SimilarityScore<Integer> { |
| |
| /** |
| * Calculates longest common subsequence similarity score of two <code>CharSequence</code>'s passed as |
| * input. |
| * |
| * @param left first character sequence |
| * @param right second character sequence |
| * @return longestCommonSubsequenceLength |
| * @throws IllegalArgumentException |
| * if either String input {@code null} |
| */ |
| @Override |
| public Integer apply(final CharSequence left, final CharSequence right) { |
| // Quick return for invalid inputs |
| if (left == null || right == null) { |
| throw new IllegalArgumentException("Inputs must not be null"); |
| } |
| return longestCommonSubsequence(left, right).length(); |
| } |
| |
| /** |
| * Computes the longest common subsequence between the two <code>CharSequence</code>'s passed as input. |
| * |
| * <p> |
| * Note, a substring and subsequence are not necessarily the same thing. Indeed, <code>abcxyzqrs</code> and |
| * <code>xyzghfm</code> have both the same common substring and subsequence, namely <code>xyz</code>. However, |
| * <code>axbyczqrs</code> and <code>abcxyzqtv</code> have the longest common subsequence <code>xyzq</code> because a |
| * subsequence need not have adjacent characters. |
| * </p> |
| * |
| * <p> |
| * For reference, we give the definition of a subsequence for the reader: a <i>subsequence</i> is a sequence that |
| * can be derived from another sequence by deleting some elements without changing the order of the remaining |
| * elements. |
| * </p> |
| * |
| * @param left first character sequence |
| * @param right second character sequence |
| * @return the longest common subsequence found |
| * @throws IllegalArgumentException |
| * if either String input {@code null} |
| * @deprecated Deprecated as of 1.2 due to a typo in the method name. |
| * Use {@link #longestCommonSubsequence(CharSequence, CharSequence)} instead. |
| * This method will be removed in 2.0. |
| */ |
| @Deprecated |
| public CharSequence logestCommonSubsequence(final CharSequence left, final CharSequence right) { |
| return longestCommonSubsequence(left, right); |
| } |
| |
| /** |
| * Computes the longest common subsequence between the two <code>CharSequence</code>'s passed as |
| * input. |
| * |
| * <p> |
| * Note, a substring and subsequence are not necessarily the same thing. Indeed, <code>abcxyzqrs</code> and |
| * <code>xyzghfm</code> have both the same common substring and subsequence, namely <code>xyz</code>. However, |
| * <code>axbyczqrs</code> and <code>abcxyzqtv</code> have the longest common subsequence <code>xyzq</code> because a |
| * subsequence need not have adjacent characters. |
| * </p> |
| * |
| * <p> |
| * For reference, we give the definition of a subsequence for the reader: a <i>subsequence</i> is a sequence that |
| * can be derived from another sequence by deleting some elements without changing the order of the remaining |
| * elements. |
| * </p> |
| * |
| * @param left first character sequence |
| * @param right second character sequence |
| * @return the longest common subsequence found |
| * @throws IllegalArgumentException |
| * if either String input {@code null} |
| * @since 1.2 |
| */ |
| public CharSequence longestCommonSubsequence(final CharSequence left, final CharSequence right) { |
| // Quick return |
| if (left == null || right == null) { |
| throw new IllegalArgumentException("Inputs must not be null"); |
| } |
| final StringBuilder longestCommonSubstringArray = new StringBuilder(Math.max(left.length(), right.length())); |
| final int[][] lcsLengthArray = longestCommonSubstringLengthArray(left, right); |
| int i = left.length() - 1; |
| int j = right.length() - 1; |
| int k = lcsLengthArray[left.length()][right.length()] - 1; |
| while (k >= 0) { |
| if (left.charAt(i) == right.charAt(j)) { |
| longestCommonSubstringArray.append(left.charAt(i)); |
| i = i - 1; |
| j = j - 1; |
| k = k - 1; |
| } else if (lcsLengthArray[i + 1][j] < lcsLengthArray[i][j + 1]) { |
| i = i - 1; |
| } else { |
| j = j - 1; |
| } |
| } |
| return longestCommonSubstringArray.reverse().toString(); |
| } |
| |
| /** |
| * |
| * Computes the lcsLengthArray for the sake of doing the actual lcs calculation. This is the |
| * dynamic programming portion of the algorithm, and is the reason for the runtime complexity being |
| * O(m*n), where m=left.length() and n=right.length(). |
| * |
| * @param left first character sequence |
| * @param right second character sequence |
| * @return lcsLengthArray |
| */ |
| public int[][] longestCommonSubstringLengthArray(final CharSequence left, final CharSequence right) { |
| final int[][] lcsLengthArray = new int[left.length() + 1][right.length() + 1]; |
| for (int i = 0; i < left.length(); i++) { |
| for (int j = 0; j < right.length(); j++) { |
| if (i == 0) { |
| lcsLengthArray[i][j] = 0; |
| } |
| if (j == 0) { |
| lcsLengthArray[i][j] = 0; |
| } |
| if (left.charAt(i) == right.charAt(j)) { |
| lcsLengthArray[i + 1][j + 1] = lcsLengthArray[i][j] + 1; |
| } else { |
| lcsLengthArray[i + 1][j + 1] = Math.max(lcsLengthArray[i + 1][j], lcsLengthArray[i][j + 1]); |
| } |
| } |
| } |
| return lcsLengthArray; |
| } |
| |
| } |