| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.commons.text.similarity; |
| |
| /** |
| * A similarity algorithm indicating the length of the longest common subsequence between two strings. |
| * |
| * <p> |
| * The Longest common subsequence algorithm returns the length of the longest subsequence that two strings have in |
| * common. Two strings that are entirely different, return a value of 0, and two strings that return a value |
| * of the commonly shared length implies that the strings are completely the same in value and position. |
| * <i>Note.</i> Generally this algorithm is fairly inefficient, as for length <i>m</i>, <i>n</i> of the input |
| * {@code CharSequence}'s {@code left} and {@code right} respectively, the runtime of the |
| * algorithm is <i>O(m*n)</i>. |
| * </p> |
| * |
| * <p> |
| * As of version 1.10, a more space-efficient of the algorithm is implemented. The new algorithm has linear space |
| * complexity instead of quadratic. However, time complexity is still quadratic in the size of input strings. |
| * </p> |
| * |
| * <p> |
| * The implementation is based on Hirschberg's Longest Commons Substring algorithm (cited below). |
| * </p> |
| * |
| * <p>For further reading see:</p> |
| * <ul> |
| * <li> |
| * Lothaire, M. <i>Applied combinatorics on words</i>. New York: Cambridge U Press, 2005. <b>12-13</b> |
| * </li> |
| * <li> |
| * D. S. Hirschberg, "A linear space algorithm for computing maximal common subsequences," CACM, 1975, pp. 341--343. |
| * </li> |
| * </ul> |
| * |
| * @since 1.0 |
| */ |
| public class LongestCommonSubsequence implements SimilarityScore<Integer> { |
| /** |
| * Calculates the longest common subsequence similarity score of two {@code CharSequence}'s passed as |
| * input. |
| * |
| * <p> |
| * This method implements a more efficient version of LCS algorithm which has quadratic time and |
| * linear space complexity. |
| * </p> |
| * |
| * <p> |
| * This method is based on newly implemented {@link #algorithmB(CharSequence, CharSequence)}. |
| * An evaluation using JMH revealed that this method is almost two times faster than its previous version. |
| * </p> |
| * |
| * @param left first character sequence |
| * @param right second character sequence |
| * @return length of the longest common subsequence of <code>left</code> and <code>right</code> |
| * @throws IllegalArgumentException if either String input {@code null} |
| */ |
| @Override |
| public Integer apply(final CharSequence left, final CharSequence right) { |
| // Quick return for invalid inputs |
| if (left == null || right == null) { |
| throw new IllegalArgumentException("Inputs must not be null"); |
| } |
| // Find lengths of two strings |
| final int leftSz = left.length(); |
| final int rightSz = right.length(); |
| |
| // Check if we can avoid calling algorithmB which involves heap space allocation |
| if (leftSz == 0 || rightSz == 0) { |
| return 0; |
| } |
| |
| // Check if we can save even more space |
| if (leftSz < rightSz) { |
| return algorithmB(right, left)[leftSz]; |
| } |
| return algorithmB(left, right)[rightSz]; |
| } |
| |
| /** |
| * An implementation of "ALG B" from Hirschberg's CACM '71 paper. |
| * Assuming the first input sequence is of size <code>m</code> and the second input sequence is of size |
| * <code>n</code>, this method returns the last row of the dynamic programming (DP) table when calculating |
| * the LCS of the two sequences in <i>O(m*n)</i> time and <i>O(n)</i> space. |
| * The last element of the returned array, is the size of the LCS of the two input sequences. |
| * |
| * @param left first input sequence. |
| * @param right second input sequence. |
| * @return last row of the dynamic-programming (DP) table for calculating the LCS of <code>left</code> and <code>right</code> |
| * @since 1.10 |
| */ |
| private static int[] algorithmB(final CharSequence left, final CharSequence right) { |
| final int m = left.length(); |
| final int n = right.length(); |
| |
| // Creating an array for storing two rows of DP table |
| final int[][] dpRows = new int[2][1 + n]; |
| |
| for (int i = 1; i <= m; i++) { |
| // K(0, j) <- K(1, j) [j = 0...n], as per the paper: |
| // Since we have references in Java, we can swap references instead of literal copying. |
| // We could also use a "binary index" using modulus operator, but directly swapping the |
| // two rows helps readability and keeps the code consistent with the algorithm description |
| // in the paper. |
| final int[] temp = dpRows[0]; |
| dpRows[0] = dpRows[1]; |
| dpRows[1] = temp; |
| |
| for (int j = 1; j <= n; j++) { |
| if (left.charAt(i - 1) == right.charAt(j - 1)) { |
| dpRows[1][j] = dpRows[0][j - 1] + 1; |
| } else { |
| dpRows[1][j] = Math.max(dpRows[1][j - 1], dpRows[0][j]); |
| } |
| } |
| } |
| |
| // LL(j) <- K(1, j) [j=0...n], as per the paper: |
| // We don't need literal copying of the array, we can just return the reference |
| return dpRows[1]; |
| } |
| |
| /** |
| * Computes the longest common subsequence between the two {@code CharSequence}'s passed as input. |
| * |
| * <p> |
| * Note, a substring and subsequence are not necessarily the same thing. Indeed, {@code abcxyzqrs} and |
| * {@code xyzghfm} have both the same common substring and subsequence, namely {@code xyz}. However, |
| * {@code axbyczqrs} and {@code abcxyzqtv} have the longest common subsequence {@code xyzq} because a |
| * subsequence need not have adjacent characters. |
| * </p> |
| * |
| * <p> |
| * For reference, we give the definition of a subsequence for the reader: a <i>subsequence</i> is a sequence that |
| * can be derived from another sequence by deleting some elements without changing the order of the remaining |
| * elements. |
| * </p> |
| * |
| * @param left first character sequence |
| * @param right second character sequence |
| * @return the longest common subsequence found |
| * @throws IllegalArgumentException if either String input {@code null} |
| * @deprecated Deprecated as of 1.2 due to a typo in the method name. |
| * Use {@link #longestCommonSubsequence(CharSequence, CharSequence)} instead. |
| * This method will be removed in 2.0. |
| */ |
| @Deprecated |
| public CharSequence logestCommonSubsequence(final CharSequence left, final CharSequence right) { |
| return longestCommonSubsequence(left, right); |
| } |
| |
| /** |
| * Computes the longest common subsequence between the two {@code CharSequence}'s passed as |
| * input. |
| * |
| * <p> |
| * This method implements a more efficient version of LCS algorithm which although has quadratic time, it |
| * has linear space complexity. |
| * </p> |
| * |
| * |
| * <p> |
| * Note, a substring and subsequence are not necessarily the same thing. Indeed, {@code abcxyzqrs} and |
| * {@code xyzghfm} have both the same common substring and subsequence, namely {@code xyz}. However, |
| * {@code axbyczqrs} and {@code abcxyzqtv} have the longest common subsequence {@code xyzq} because a |
| * subsequence need not have adjacent characters. |
| * </p> |
| * |
| * <p> |
| * For reference, we give the definition of a subsequence for the reader: a <i>subsequence</i> is a sequence that |
| * can be derived from another sequence by deleting some elements without changing the order of the remaining |
| * elements. |
| * </p> |
| * |
| * @param left first character sequence |
| * @param right second character sequence |
| * @return the longest common subsequence found |
| * @throws IllegalArgumentException if either String input {@code null} |
| * @since 1.2 |
| */ |
| public CharSequence longestCommonSubsequence(final CharSequence left, final CharSequence right) { |
| // Quick return |
| if (left == null || right == null) { |
| throw new IllegalArgumentException("Inputs must not be null"); |
| } |
| // Find lengths of two strings |
| final int leftSz = left.length(); |
| final int rightSz = right.length(); |
| |
| // Check if we can avoid calling algorithmC which involves heap space allocation |
| if (leftSz == 0 || rightSz == 0) { |
| return ""; |
| } |
| |
| // Check if we can save even more space |
| if (leftSz < rightSz) { |
| return algorithmC(right, left); |
| } |
| return algorithmC(left, right); |
| } |
| |
| /** |
| * An implementation of "ALG C" from Hirschberg's CACM '71 paper. |
| * Assuming the first input sequence is of size <code>m</code> and the second input sequence is of size |
| * <code>n</code>, this method returns the Longest Common Subsequence (LCS) of the two sequences in |
| * <i>O(m*n)</i> time and <i>O(m+n)</i> space. |
| * |
| * @param left first input sequence. |
| * @param right second input sequence. |
| * @return the LCS of <code>left</code> and <code>right</code> |
| * @since 1.10 |
| */ |
| private static String algorithmC(final CharSequence left, final CharSequence right) { |
| final int m = left.length(); |
| final int n = right.length(); |
| |
| final StringBuilder out = new StringBuilder(); |
| |
| if (m == 1) { // Handle trivial cases, as per the paper |
| final char leftCh = left.charAt(0); |
| for (int j = 0; j < n; j++) { |
| if (leftCh == right.charAt(j)) { |
| out.append(leftCh); |
| break; |
| } |
| } |
| } else if (n > 0 && m > 1) { |
| final int mid = m / 2; // Find the middle point |
| |
| final CharSequence leftFirstPart = left.subSequence(0, mid); |
| final CharSequence leftSecondPart = left.subSequence(mid, m); |
| |
| // Step 3 of the algorithm: two calls to Algorithm B |
| final int[] l1 = algorithmB(leftFirstPart, right); |
| final int[] l2 = algorithmB(reverse(leftSecondPart), reverse(right)); |
| |
| // Find k, as per the Step 4 of the algorithm |
| int k = 0; |
| int t = 0; |
| for (int j = 0; j <= n; j++) { |
| final int s = l1[j] + l2[n - j]; |
| if (t < s) { |
| t = s; |
| k = j; |
| } |
| } |
| |
| // Step 5: solve simpler problems, recursively |
| out.append(algorithmC(leftFirstPart, right.subSequence(0, k))); |
| out.append(algorithmC(leftSecondPart, right.subSequence(k, n))); |
| } |
| |
| return out.toString(); |
| } |
| |
| // An auxiliary method for CharSequence reversal |
| private static String reverse(final CharSequence s) { |
| return (new StringBuilder(s)).reverse().toString(); |
| } |
| |
| /** |
| * Computes the lcsLengthArray for the sake of doing the actual lcs calculation. This is the |
| * dynamic programming portion of the algorithm, and is the reason for the runtime complexity being |
| * O(m*n), where m=left.length() and n=right.length(). |
| * |
| * @param left first character sequence |
| * @param right second character sequence |
| * @return lcsLengthArray |
| * @deprecated Deprecated as of 1.10. A more efficient implementation for calculating LCS is now available. |
| * Use {@link #longestCommonSubsequence(CharSequence, CharSequence)} instead to directly calculate the LCS. |
| * This method will be removed in 2.0. |
| */ |
| @Deprecated |
| public int[][] longestCommonSubstringLengthArray(final CharSequence left, final CharSequence right) { |
| final int[][] lcsLengthArray = new int[left.length() + 1][right.length() + 1]; |
| for (int i = 0; i < left.length(); i++) { |
| for (int j = 0; j < right.length(); j++) { |
| if (i == 0) { |
| lcsLengthArray[i][j] = 0; |
| } |
| if (j == 0) { |
| lcsLengthArray[i][j] = 0; |
| } |
| if (left.charAt(i) == right.charAt(j)) { |
| lcsLengthArray[i + 1][j + 1] = lcsLengthArray[i][j] + 1; |
| } else { |
| lcsLengthArray[i + 1][j + 1] = Math.max(lcsLengthArray[i + 1][j], lcsLengthArray[i][j + 1]); |
| } |
| } |
| } |
| return lcsLengthArray; |
| } |
| |
| } |