blob: cdb7c9b37372f604825edfe504b4000fa515ef03 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.text.similarity;
import static org.assertj.core.api.Assertions.assertThatIllegalArgumentException;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.function.Function;
import java.util.regex.Pattern;
import org.junit.jupiter.api.Test;
/**
* Unit tests for {@link IntersectionSimilarity}.
*/
public class IntersectionSimilarityTest {
@Test
public void testIntersectionUsingSetCharacter() {
// Compute using single characters.
// This test uses a set and so should not allow duplicates.
final IntersectionSimilarity<Character> similarity =
new IntersectionSimilarity<>(IntersectionSimilarityTest::toCharacterSet);
// Expected:
// size A or B = count of unique characters (exclude duplicates)
// intersection = count of unique matching characters (exclude duplicates)
assertIntersection(similarity, "", "", 0, 0, 0);
assertIntersection(similarity, "a", "", 1, 0, 0);
assertIntersection(similarity, "a", "a", 1, 1, 1);
assertIntersection(similarity, "a", "b", 1, 1, 0);
assertIntersection(similarity, "aa", "ab", 1, 2, 1);
assertIntersection(similarity, "ab", "ab", 2, 2, 2);
assertIntersection(similarity, "aaba", "abaa", 2, 2, 2);
assertIntersection(similarity, "aaaa", "aa", 1, 1, 1);
assertIntersection(similarity, "aa", "aaaa", 1, 1, 1);
assertIntersection(similarity, "aaaa", "aaa", 1, 1, 1);
assertIntersection(similarity, "aabab", "ababa", 2, 2, 2);
assertIntersection(similarity, "the same", "the same", 7, 7, 7);
assertIntersection(similarity, "abcdefghijklm", "ab_defg ijklm", 13, 13, 11);
}
@Test
public void testIntersectionUsingListCharacter() {
// Compute using single characters.
// This test uses a list and so duplicates should be matched.
final IntersectionSimilarity<Character> similarity =
new IntersectionSimilarity<>(IntersectionSimilarityTest::toCharacterList);
// Expected:
// size A or B = sequence length
// intersection = count of matching characters (include duplicates)
assertIntersection(similarity, "", "", 0, 0, 0);
assertIntersection(similarity, "a", "", 1, 0, 0);
assertIntersection(similarity, "a", "a", 1, 1, 1);
assertIntersection(similarity, "a", "b", 1, 1, 0);
assertIntersection(similarity, "aa", "ab", 2, 2, 1);
assertIntersection(similarity, "ab", "ab", 2, 2, 2);
assertIntersection(similarity, "aaba", "abaa", 4, 4, 4);
assertIntersection(similarity, "aaaa", "aa", 4, 2, 2);
assertIntersection(similarity, "aa", "aaaa", 2, 4, 2);
assertIntersection(similarity, "aaaa", "aaa", 4, 3, 3);
assertIntersection(similarity, "aabab", "ababa", 5, 5, 5);
assertIntersection(similarity, "the same", "the same", 8, 8, 8);
assertIntersection(similarity, "abcdefghijklm", "ab_defg ijklm", 13, 13, 11);
}
@Test
public void testIntersectionUsingSetBigrams() {
// Compute using pairs of characters (bigrams).
// This can be done using a 32-bit int to store two 16-bit characters.
// This test uses a set and so should not allow duplicates.
final IntersectionSimilarity<Integer> similarity =
new IntersectionSimilarity<>(IntersectionSimilarityTest::toBigramSet);
// Expected:
// size A or B = count of unique bigrams (exclude duplicates)
// intersection = count of unique matching bigrams (exclude duplicates)
assertIntersection(similarity, "", "", 0, 0, 0);
assertIntersection(similarity, "a", "", 0, 0, 0);
assertIntersection(similarity, "a", "a", 0, 0, 0);
assertIntersection(similarity, "a", "b", 0, 0, 0);
assertIntersection(similarity, "aa", "ab", 1, 1, 0);
assertIntersection(similarity, "ab", "ab", 1, 1, 1);
assertIntersection(similarity, "aaba", "abaa", 3, 3, 3);
assertIntersection(similarity, "aaaa", "aa", 1, 1, 1);
assertIntersection(similarity, "aa", "aaaa", 1, 1, 1);
assertIntersection(similarity, "aaaa", "aaa", 1, 1, 1);
assertIntersection(similarity, "aabab", "ababa", 3, 2, 2);
assertIntersection(similarity, "the same", "the same", 7, 7, 7);
assertIntersection(similarity, "abcdefghijklm", "ab_defg ijklm", 12, 12, 8);
}
@Test
public void testIntersectionUsingListBigrams() {
// Compute using pairs of characters (bigrams).
// This can be done using a 32-bit int to store two 16-bit characters.
// This test uses a list and so duplicates should be matched.
final IntersectionSimilarity<Integer> similarity =
new IntersectionSimilarity<>(IntersectionSimilarityTest::toBigramList);
// Expected:
// size A or B = sequence length - 1
// intersection = count of matching bigrams (include duplicates)
assertIntersection(similarity, "", "", 0, 0, 0);
assertIntersection(similarity, "a", "", 0, 0, 0);
assertIntersection(similarity, "a", "a", 0, 0, 0);
assertIntersection(similarity, "a", "b", 0, 0, 0);
assertIntersection(similarity, "aa", "ab", 1, 1, 0);
assertIntersection(similarity, "ab", "ab", 1, 1, 1);
assertIntersection(similarity, "aaba", "abaa", 3, 3, 3);
assertIntersection(similarity, "aaaa", "aa", 3, 1, 1);
assertIntersection(similarity, "aa", "aaaa", 1, 3, 1);
assertIntersection(similarity, "aaaa", "aaa", 3, 2, 2);
assertIntersection(similarity, "aabab", "ababa", 4, 4, 3);
assertIntersection(similarity, "the same", "the same", 7, 7, 7);
assertIntersection(similarity, "abcdefghijklm", "ab_defg ijklm", 12, 12, 8);
}
@Test
public void testIntersectionUsingSetCharacterListCharacter() {
// Compute using a custom converter that returns a Set and a List.
// This is an edge-case test.
final HashMap<CharSequence, Collection<Character>> converter = new HashMap<>();
final String sequence1 = "aabbccdd";
final String sequence2 = "aaaaaabbbfffff";
converter.put(sequence1, toCharacterSet(sequence1));
converter.put(sequence2, toCharacterList(sequence2));
final IntersectionSimilarity<Character> similarity =
new IntersectionSimilarity<>(converter::get);
// Expected:
// size A = count of unique characters (exclude duplicates)
// size B = sequence length
// intersection = count of matching characters (exclude duplicates)
assertIntersection(similarity, sequence1, sequence2, 4, sequence2.length(), 2);
assertIntersection(similarity, sequence2, sequence1, sequence2.length(), 4, 2);
}
/**
* Convert the {@link CharSequence} to a {@link Set} of {@link Character}s.
*
* @param sequence the sequence
* @return the set
*/
private static Set<Character> toCharacterSet(final CharSequence sequence) {
final int length = sequence.length();
final Set<Character> set = new HashSet<>(length);
for (int i = 0; i < length; i++) {
set.add(sequence.charAt(i));
}
return set;
}
/**
* Convert the {@link CharSequence} to a {@link List} of {@link Character}s.
*
* @param sequence the sequence
* @return the list
*/
private static List<Character> toCharacterList(final CharSequence sequence) {
final int length = sequence.length();
final List<Character> list = new ArrayList<>(length);
for (int i = 0; i < length; i++) {
list.add(sequence.charAt(i));
}
return list;
}
/**
* Convert the {@link CharSequence} to a {@link Set} of bigrams (pairs of characters).
* These are represented using 2 16-bit chars packed into a 32-bit int.
*
* @param sequence the sequence
* @return the set
*/
private static Set<Integer> toBigramSet(final CharSequence sequence) {
final int length = sequence.length();
final Set<Integer> set = new HashSet<>(length);
if (length > 1) {
char ch2 = sequence.charAt(0);
for (int i = 1; i < length; i++) {
final char ch1 = ch2;
ch2 = sequence.charAt(i);
set.add(Integer.valueOf((ch1 << 16) | ch2));
}
}
return set;
}
/**
* Convert the {@link CharSequence} to a {@link List} of bigrams (pairs of characters).
* These are represented using 2 16-bit chars packed into a 32-bit int.
*
* @param sequence the sequence
* @return the list
*/
private static List<Integer> toBigramList(final CharSequence sequence) {
final int length = sequence.length();
final List<Integer> list = new ArrayList<>(length);
if (length > 1) {
char ch2 = sequence.charAt(0);
for (int i = 1; i < length; i++) {
final char ch1 = ch2;
ch2 = sequence.charAt(i);
list.add(Integer.valueOf((ch1 << 16) | ch2));
}
}
return list;
}
private static <T> void assertIntersection(final IntersectionSimilarity<T> similarity,
final CharSequence cs1, final CharSequence cs2, final int sizeA, final int sizeB, final int intersection) {
final IntersectionResult result = similarity.apply(cs1, cs2);
assertEquals(sizeA, result.getSizeA(), "Size A error");
assertEquals(sizeB, result.getSizeB(), "Size B error");
assertEquals(intersection, result.getIntersection(), "Intersection error");
}
@Test
public void testF1ScoreUsingListWordBigrams() {
// Example of a word letter pairs algorithm by Simon White:
// http://www.catalysoft.com/articles/StrikeAMatch.html
// This splits into words using whitespace and then computes uppercase
// bigrams for each word.
// Split on whitespace
final Pattern pattern = Pattern.compile("\\s+");
// Compute using pairs of characters (bigrams) for each word.
// This can be done using a 32-bit int to store two 16-bit characters
final Function<CharSequence, Collection<Integer>> converter = cs -> {
final List<Integer> set = new ArrayList<>();
for (final String word : pattern.split(cs)) {
if (word.length() > 1) {
// The strings are converted to upper case
char ch2 = Character.toUpperCase(word.charAt(0));
for (int i = 1; i < word.length(); i++) {
final char ch1 = ch2;
ch2 = Character.toUpperCase(word.charAt(i));
set.add(Integer.valueOf((ch1 << 16) | ch2));
}
}
}
return set;
};
final IntersectionSimilarity<Integer> similarity = new IntersectionSimilarity<>(converter);
String bookTitle;
final String search1 = "Web Database Applications";
final String search2 = "PHP Web Applications";
final String search3 = "Web Aplications";
bookTitle = "Web Database Applications with PHP & MySQL";
assertEquals(82, toF1ScorePercent(similarity.apply(bookTitle, search1)));
assertEquals(68, toF1ScorePercent(similarity.apply(bookTitle, search2)));
assertEquals(59, toF1ScorePercent(similarity.apply(bookTitle, search3)));
bookTitle = "Creating Database Web Applications with PHP and ASP";
assertEquals(71, toF1ScorePercent(similarity.apply(bookTitle, search1)));
assertEquals(59, toF1ScorePercent(similarity.apply(bookTitle, search2)));
assertEquals(50, toF1ScorePercent(similarity.apply(bookTitle, search3)));
bookTitle = "Building Database Applications on the Web Using PHP3";
assertEquals(70, toF1ScorePercent(similarity.apply(bookTitle, search1)));
assertEquals(58, toF1ScorePercent(similarity.apply(bookTitle, search2)));
assertEquals(49, toF1ScorePercent(similarity.apply(bookTitle, search3)));
bookTitle = "Building Web Database Applications with Visual Studio 6";
assertEquals(67, toF1ScorePercent(similarity.apply(bookTitle, search1)));
assertEquals(47, toF1ScorePercent(similarity.apply(bookTitle, search2)));
assertEquals(46, toF1ScorePercent(similarity.apply(bookTitle, search3)));
bookTitle = "Web Application Development With PHP";
assertEquals(51, toF1ScorePercent(similarity.apply(bookTitle, search1)));
assertEquals(67, toF1ScorePercent(similarity.apply(bookTitle, search2)));
assertEquals(56, toF1ScorePercent(similarity.apply(bookTitle, search3)));
bookTitle = "WebRAD: Building Database Applications on the Web with Visual FoxPro and Web Connection";
assertEquals(49, toF1ScorePercent(similarity.apply(bookTitle, search1)));
assertEquals(34, toF1ScorePercent(similarity.apply(bookTitle, search2)));
assertEquals(32, toF1ScorePercent(similarity.apply(bookTitle, search3)));
bookTitle = "Structural Assessment: The Role of Large and Full-Scale Testing";
assertEquals(12, toF1ScorePercent(similarity.apply(bookTitle, search1)));
assertEquals(7, toF1ScorePercent(similarity.apply(bookTitle, search2)));
assertEquals(7, toF1ScorePercent(similarity.apply(bookTitle, search3)));
bookTitle = "How to Find a Scholarship Online";
assertEquals(10, toF1ScorePercent(similarity.apply(bookTitle, search1)));
assertEquals(11, toF1ScorePercent(similarity.apply(bookTitle, search2)));
assertEquals(12, toF1ScorePercent(similarity.apply(bookTitle, search3)));
}
private static int toF1ScorePercent(final IntersectionResult result) {
final double value = 2.0 * result.getIntersection() / (result.getSizeA() + result.getSizeB());
// Convert to percentage
return (int) Math.round(value * 100);
}
@Test
public void testConstructorWithNullConverterThrows() {
assertThatIllegalArgumentException().isThrownBy(() -> new IntersectionSimilarity<>(null));
}
@Test
public void testApplyNullNull() {
assertThatIllegalArgumentException().isThrownBy(() -> new IntersectionSimilarity<>(cs -> new HashSet<>(Collections.singletonList(cs))).apply(null, null));
}
@Test
public void testApplyStringNull() {
assertThatIllegalArgumentException().isThrownBy(() -> new IntersectionSimilarity<>(cs -> new HashSet<>(Collections.singletonList(cs))).apply("left", null));
}
@Test
public void testApplyNullString() {
assertThatIllegalArgumentException().isThrownBy(() -> new IntersectionSimilarity<>(cs -> new HashSet<>(Collections.singletonList(cs))).apply(null, "right"));
}
}