blob: fe350c2c756541e74f941ae5fa1f75a1c5178e3c [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.datasketches.theta;
import static org.apache.datasketches.theta.JaccardSimilarity.exactlyEqual;
import static org.apache.datasketches.theta.JaccardSimilarity.jaccard;
import static org.testng.Assert.assertFalse;
import static org.testng.Assert.assertTrue;
import org.testng.annotations.Test;
/**
* @author Lee Rhodes
*/
@SuppressWarnings("javadoc")
public class JaccardSimilarityTest {
@Test
public void checkNullsEmpties() {
int minK = 1 << 12;
double threshold = 0.95;
println("Check nulls & empties, minK: " + minK + "\t Th: " + threshold);
//check both null
double[] jResults = jaccard(null, null);
boolean state = jResults[1] > threshold;
println("null \t null:\t" + state + "\t" + jaccardString(jResults));
assertFalse(state);
state = exactlyEqual(null, null);
assertFalse(state);
UpdateSketch measured = UpdateSketch.builder().setNominalEntries(minK).build();
UpdateSketch expected = UpdateSketch.builder().setNominalEntries(minK).build();
//check both empty
jResults = jaccard(measured, expected);
state = jResults[1] > threshold;
println("empty\tempty:\t" + state + "\t" + jaccardString(jResults));
assertTrue(state);
state = exactlyEqual(measured, expected);
assertTrue(state);
state = exactlyEqual(measured, measured);
assertTrue(state);
//adjust one
expected.update(1);
jResults = jaccard(measured, expected);
state = jResults[1] > threshold;
println("empty\t 1:\t" + state + "\t" + jaccardString(jResults));
assertFalse(state);
state = exactlyEqual(measured, expected);
assertFalse(state);
println("");
}
@Test
public void checkExactMode() {
int k = 1 << 12;
int u = k;
double threshold = 0.9999;
println("Exact Mode, minK: " + k + "\t Th: " + threshold);
UpdateSketch measured = UpdateSketch.builder().setNominalEntries(k).build();
UpdateSketch expected = UpdateSketch.builder().setNominalEntries(k).build();
for (int i = 0; i < (u-1); i++) { //one short
measured.update(i);
expected.update(i);
}
double[] jResults = jaccard(measured, expected);
boolean state = jResults[1] > threshold;
println(state + "\t" + jaccardString(jResults));
assertTrue(state);
state = exactlyEqual(measured, expected);
assertTrue(state);
measured.update(u-1); //now exactly k entries
expected.update(u); //now exactly k entries but differs by one
jResults = jaccard(measured, expected);
state = jResults[1] > threshold;
println(state + "\t" + jaccardString(jResults));
assertFalse(state);
state = exactlyEqual(measured, expected);
assertFalse(state);
println("");
}
@Test
public void checkEstMode() {
int k = 1 << 12;
int u = 1 << 20;
double threshold = 0.9999;
println("Estimation Mode, minK: " + k + "\t Th: " + threshold);
UpdateSketch measured = UpdateSketch.builder().setNominalEntries(k).build();
UpdateSketch expected = UpdateSketch.builder().setNominalEntries(k).build();
for (int i = 0; i < u; i++) {
measured.update(i);
expected.update(i);
}
double[] jResults = jaccard(measured, expected);
boolean state = jResults[1] > threshold;
println(state + "\t" + jaccardString(jResults));
assertTrue(state);
state = exactlyEqual(measured, expected);
assertTrue(state);
for (int i = u; i < (u + 50); i++) { //empirically determined
measured.update(i);
}
jResults = jaccard(measured, expected);
state = jResults[1] >= threshold;
println(state + "\t" + jaccardString(jResults));
assertFalse(state);
state = exactlyEqual(measured, expected);
assertFalse(state);
println("");
}
/**
* Enable printing on this test and you will see that the distribution is pretty tight,
* about +/- 0.7%, which is pretty good since the accuracy of the underlying sketch is about
* +/- 1.56%.
*/
@Test
public void checkSimilarity() {
int minK = 1 << 12;
int u1 = 1 << 20;
int u2 = (int) (u1 * 0.95);
double threshold = 0.943;
println("Estimation Mode, minK: " + minK + "\t Th: " + threshold);
UpdateSketch expected = UpdateSketch.builder().setNominalEntries(minK).build();
UpdateSketch measured = UpdateSketch.builder().setNominalEntries(minK).build();
for (int i = 0; i < u1; i++) {
expected.update(i);
}
for (int i = 0; i < u2; i++) {
measured.update(i);
}
double[] jResults = JaccardSimilarity.jaccard(measured, expected);
boolean state = JaccardSimilarity.similarityTest(measured, expected, threshold);
println(state + "\t" + jaccardString(jResults));
assertTrue(state);
//check identity case
state = JaccardSimilarity.similarityTest(measured, measured, threshold);
assertTrue(state);
}
/**
* Enable printing on this test and you will see that the distribution is much looser,
* about +/- 14%. This is due to the fact that intersections loose accuracy as the ratio of
* intersection to the union becomes a small number.
*/
@Test
public void checkDissimilarity() {
int minK = 1 << 12;
int u1 = 1 << 20;
int u2 = (int) (u1 * 0.05);
double threshold = 0.061;
println("Estimation Mode, minK: " + minK + "\t Th: " + threshold);
UpdateSketch expected = UpdateSketch.builder().setNominalEntries(minK).build();
UpdateSketch measured = UpdateSketch.builder().setNominalEntries(minK).build();
for (int i = 0; i < u1; i++) {
expected.update(i);
}
for (int i = 0; i < u2; i++) {
measured.update(i);
}
double[] jResults = JaccardSimilarity.jaccard(measured, expected);
boolean state = JaccardSimilarity.dissimilarityTest(measured, expected, threshold);
println(state + "\t" + jaccardString(jResults));
assertTrue(state);
}
private static String jaccardString(double[] jResults) {
double lb = jResults[0];
double est = jResults[1];
double ub = jResults[2];
return lb + "\t" + est + "\t" + ub + "\t" + ((lb/est) - 1.0) + "\t" + ((ub/est) - 1.0);
}
@Test
public void checkMinK() {
UpdateSketch skA = UpdateSketch.builder().build(); //4096
UpdateSketch skB = UpdateSketch.builder().build(); //4096
skA.update(1);
skB.update(1);
double[] result = JaccardSimilarity.jaccard(skA, skB);
println(result[0] + ", " + result[1] + ", " + result[2]);
for (int i = 1; i < 4096; i++) {
skA.update(i);
skB.update(i);
}
result = JaccardSimilarity.jaccard(skA, skB);
println(result[0] + ", " + result[1] + ", " + result[2]);
}
@Test
public void printlnTest() {
println("PRINTING: "+this.getClass().getName());
}
/**
* @param s value to print
*/
static void println(String s) {
//System.out.println(s); //disable here
}
}