added seed
diff --git a/theta/include/theta_jaccard_similarity_base.hpp b/theta/include/theta_jaccard_similarity_base.hpp
index f6248e6..5e8dcf5 100644
--- a/theta/include/theta_jaccard_similarity_base.hpp
+++ b/theta/include/theta_jaccard_similarity_base.hpp
@@ -46,20 +46,21 @@
*
* @param sketch_a given sketch A
* @param sketch_b given sketch B
+ * @param seed for the hash function that was used to create the sketch
* @return a double array {LowerBound, Estimate, UpperBound} of the Jaccard index.
* The Upper and Lower bounds are for a confidence interval of 95.4% or +/- 2 standard deviations.
*/
template<typename SketchA, typename SketchB>
- static std::array<double, 3> jaccard(const SketchA& sketch_a, const SketchB& sketch_b) {
+ static std::array<double, 3> jaccard(const SketchA& sketch_a, const SketchB& sketch_b, uint64_t seed = DEFAULT_SEED) {
if (reinterpret_cast<const void*>(&sketch_a) == reinterpret_cast<const void*>(&sketch_b)) return {1, 1, 1};
if (sketch_a.is_empty() && sketch_b.is_empty()) return {1, 1, 1};
if (sketch_a.is_empty() || sketch_b.is_empty()) return {0, 0, 0};
- auto union_ab = compute_union(sketch_a, sketch_b);
+ auto union_ab = compute_union(sketch_a, sketch_b, seed);
if (identical_sets(sketch_a, sketch_b, union_ab)) return {1, 1, 1};
// intersection
- Intersection i;
+ Intersection i(seed);
i.update(sketch_a);
i.update(sketch_b);
i.update(union_ab); // ensures that intersection is a subset of the union
@@ -76,15 +77,16 @@
* Returns true if the two given sketches are equivalent.
* @param sketch_a the given sketch A
* @param sketch_b the given sketch B
+ * @param seed for the hash function that was used to create the sketch
* @return true if the two given sketches are exactly equal
*/
template<typename SketchA, typename SketchB>
- static bool exactly_equal(const SketchA& sketch_a, const SketchB& sketch_b) {
+ static bool exactly_equal(const SketchA& sketch_a, const SketchB& sketch_b, uint64_t seed = DEFAULT_SEED) {
if (reinterpret_cast<const void*>(&sketch_a) == reinterpret_cast<const void*>(&sketch_b)) return true;
if (sketch_a.is_empty() && sketch_b.is_empty()) return true;
if (sketch_a.is_empty() || sketch_b.is_empty()) return false;
- auto union_ab = compute_union(sketch_a, sketch_b);
+ auto union_ab = compute_union(sketch_a, sketch_b, seed);
if (identical_sets(sketch_a, sketch_b, union_ab)) return true;
return false;
}
@@ -99,12 +101,13 @@
* @param actual the sketch to be tested
* @param expected the reference sketch that is considered to be correct
* @param threshold a real value between zero and one
+ * @param seed for the hash function that was used to create the sketch
* @return true if the similarity of the two sketches is greater than the given threshold
* with at least 97.7% confidence
*/
template<typename SketchA, typename SketchB>
- static bool similarity_test(const SketchA& actual, const SketchB& expected, double threshold) {
- auto jc = jaccard(actual, expected);
+ static bool similarity_test(const SketchA& actual, const SketchB& expected, double threshold, uint64_t seed = DEFAULT_SEED) {
+ auto jc = jaccard(actual, expected, seed);
return jc[0] >= threshold;
}
@@ -118,23 +121,24 @@
* @param actual the sketch to be tested
* @param expected the reference sketch that is considered to be correct
* @param threshold a real value between zero and one
+ * @param seed for the hash function that was used to create the sketch
* @return true if the dissimilarity of the two sketches is greater than the given threshold
* with at least 97.7% confidence
*/
template<typename SketchA, typename SketchB>
- static bool dissimilarity_test(const SketchA& actual, const SketchB& expected, double threshold) {
- auto jc = jaccard(actual, expected);
+ static bool dissimilarity_test(const SketchA& actual, const SketchB& expected, double threshold, uint64_t seed = DEFAULT_SEED) {
+ auto jc = jaccard(actual, expected, seed);
return jc[2] <= threshold;
}
private:
template<typename SketchA, typename SketchB>
- static typename Union::CompactSketch compute_union(const SketchA& sketch_a, const SketchB& sketch_b) {
+ static typename Union::CompactSketch compute_union(const SketchA& sketch_a, const SketchB& sketch_b, uint64_t seed) {
const auto count_a = sketch_a.get_num_retained();
const auto count_b = sketch_b.get_num_retained();
const uint8_t lg_k = std::min(std::max(log2(ceiling_power_of_2(count_a + count_b)), theta_constants::MIN_LG_K), theta_constants::MAX_LG_K);
- auto u = typename Union::builder().set_lg_k(lg_k).build();
+ auto u = typename Union::builder().set_lg_k(lg_k).set_seed(seed).build();
u.update(sketch_a);
u.update(sketch_b);
return u.get_result(false);
diff --git a/theta/test/theta_jaccard_similarity_test.cpp b/theta/test/theta_jaccard_similarity_test.cpp
index d40a0ce..83e7c6d 100644
--- a/theta/test/theta_jaccard_similarity_test.cpp
+++ b/theta/test/theta_jaccard_similarity_test.cpp
@@ -100,6 +100,28 @@
REQUIRE(jc[2] == Approx(0.33).margin(0.01));
}
+TEST_CASE("theta jaccard: half overlap estimation mode custom seed", "[theta_sketch]") {
+ const uint64_t seed = 123;
+ auto sk_a = update_theta_sketch::builder().set_seed(seed).build();
+ auto sk_b = update_theta_sketch::builder().set_seed(seed).build();
+ for (int i = 0; i < 10000; ++i) {
+ sk_a.update(i);
+ sk_b.update(i + 5000);
+ }
+
+ // update sketches
+ auto jc = theta_jaccard_similarity::jaccard(sk_a, sk_b, seed);
+ REQUIRE(jc[0] == Approx(0.33).margin(0.01));
+ REQUIRE(jc[1] == Approx(0.33).margin(0.01));
+ REQUIRE(jc[2] == Approx(0.33).margin(0.01));
+
+ // compact sketches
+ jc = theta_jaccard_similarity::jaccard(sk_a.compact(), sk_b.compact(), seed);
+ REQUIRE(jc[0] == Approx(0.33).margin(0.01));
+ REQUIRE(jc[1] == Approx(0.33).margin(0.01));
+ REQUIRE(jc[2] == Approx(0.33).margin(0.01));
+}
+
/**
* The distribution is quite tight, about +/- 0.7%, which is pretty good since the accuracy of the
* underlying sketch is about +/- 1.56%.
@@ -120,6 +142,23 @@
REQUIRE(theta_jaccard_similarity::similarity_test(actual, actual, threshold));
}
+TEST_CASE("theta jaccard: similarity test custom seed", "[theta_sketch]") {
+ const int8_t min_lg_k = 12;
+ const int u1 = 1 << 20;
+ const int u2 = static_cast<int>(u1 * 0.95);
+ const double threshold = 0.943;
+ const uint64_t seed = 1234;
+
+ auto expected = update_theta_sketch::builder().set_lg_k(min_lg_k).set_seed(seed).build();
+ for (int i = 0; i < u1; ++i) expected.update(i);
+
+ auto actual = update_theta_sketch::builder().set_lg_k(min_lg_k).set_seed(seed).build();
+ for (int i = 0; i < u2; ++i) actual.update(i);
+
+ REQUIRE(theta_jaccard_similarity::similarity_test(actual, expected, threshold, seed));
+ REQUIRE(theta_jaccard_similarity::similarity_test(actual, actual, threshold, seed));
+}
+
/**
* The distribution is much looser here, about +/- 14%. This is due to the fact that intersections loose accuracy
* as the ratio of intersection to the union becomes a small number.
@@ -140,4 +179,21 @@
REQUIRE_FALSE(theta_jaccard_similarity::dissimilarity_test(actual, actual, threshold));
}
+TEST_CASE("theta jaccard: dissimilarity test custom seed", "[theta_sketch]") {
+ const int8_t min_lg_k = 12;
+ const int u1 = 1 << 20;
+ const int u2 = static_cast<int>(u1 * 0.05);
+ const double threshold = 0.061;
+ const uint64_t seed = 1234;
+
+ auto expected = update_theta_sketch::builder().set_lg_k(min_lg_k).set_seed(seed).build();
+ for (int i = 0; i < u1; ++i) expected.update(i);
+
+ auto actual = update_theta_sketch::builder().set_lg_k(min_lg_k).set_seed(seed).build();
+ for (int i = 0; i < u2; ++i) actual.update(i);
+
+ REQUIRE(theta_jaccard_similarity::dissimilarity_test(actual, expected, threshold, seed));
+ REQUIRE_FALSE(theta_jaccard_similarity::dissimilarity_test(actual, actual, threshold));
+}
+
} /* namespace datasketches */