Merge pull request #197 from apache/python_theta_jaccard

Add jaccard similarity to theta sketch in python
diff --git a/python/src/theta_wrapper.cpp b/python/src/theta_wrapper.cpp
index 53dd5e4..f285793 100644
--- a/python/src/theta_wrapper.cpp
+++ b/python/src/theta_wrapper.cpp
@@ -19,11 +19,13 @@
 
 #include <sstream>
 #include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
 
 #include "theta_sketch.hpp"
 #include "theta_union.hpp"
 #include "theta_intersection.hpp"
 #include "theta_a_not_b.hpp"
+#include "theta_jaccard_similarity.hpp"
 #include "common_defs.hpp"
 
 
@@ -62,6 +64,10 @@
   return compact_theta_sketch::deserialize(skStr.c_str(), skStr.length(), seed);
 }
 
+py::list theta_jaccard_sim_computation(const theta_sketch& sketch_a, const theta_sketch& sketch_b) {
+  return py::cast(theta_jaccard_similarity::jaccard(sketch_a, sketch_b));
+}
+
 }
 }
 
@@ -144,4 +150,23 @@
     .def("compute", &theta_a_not_b::compute<const theta_sketch&, const theta_sketch&>, py::arg("a"), py::arg("b"), py::arg("ordered")=true,
          "Returns a sketch with the reuslt of appying the A-not-B operation on the given inputs")
   ;
+  
+  py::class_<theta_jaccard_similarity>(m, "theta_jaccard_similarity")
+     .def_static("jaccard", &dspy::theta_jaccard_sim_computation,
+                 py::arg("sketch_a"), py::arg("sketch_b"),
+                 "Returns a list with {lower_bound, estimate, upper_bound} of the Jaccard similarity between sketches")
+     .def_static("exactly_equal", &theta_jaccard_similarity::exactly_equal<const theta_sketch&, const theta_sketch&>,
+                 py::arg("sketch_a"), py::arg("sketch_b"),
+                 "Returns True if sketch_a and sketch_b are equivalent, otherwise False")
+     .def_static("similarity_test", &theta_jaccard_similarity::similarity_test<const theta_sketch&, const theta_sketch&>,
+                 py::arg("actual"), py::arg("expected"), py::arg("threshold"),
+                 "Tests similarity of an actual sketch against an expected sketch. Computers the lower bound of the Jaccard "
+                 "index J_{LB} of the actual and expected sketches. If J_{LB} >= threshold, then the sketches are considered "
+                 "to be similar sith a confidence of 97.7% and returns True, otherwise False.")
+     .def_static("dissimilarity_test", &theta_jaccard_similarity::dissimilarity_test<const theta_sketch&, const theta_sketch&>,
+                 py::arg("actual"), py::arg("expected"), py::arg("threshold"),
+                 "Tests dissimilarity of an actual sketch against an expected sketch. Computers the lower bound of the Jaccard "
+                 "index J_{UB} of the actual and expected sketches. If J_{UB} <= threshold, then the sketches are considered "
+                 "to be dissimilar sith a confidence of 97.7% and returns True, otherwise False.")            
+  ;     
 }
diff --git a/python/tests/theta_test.py b/python/tests/theta_test.py
index 31cfcb2..93e37a4 100644
--- a/python/tests/theta_test.py
+++ b/python/tests/theta_test.py
@@ -20,6 +20,7 @@
 from datasketches import theta_sketch, update_theta_sketch
 from datasketches import compact_theta_sketch, theta_union
 from datasketches import theta_intersection, theta_a_not_b
+from datasketches import theta_jaccard_similarity
 
 class ThetaTest(unittest.TestCase):
     def test_theta_basic_example(self):
@@ -109,6 +110,30 @@
         self.assertGreaterEqual(result.get_upper_bound(1), 3 * n / 4)
 
 
+        # JACCARD SIMILARITY
+        # Jaccard Similarity measure returns (lower_bound, estimate, upper_bound)
+        jac = theta_jaccard_similarity.jaccard(sk1, sk2)
+
+        # we can check that results are in the expected order
+        self.assertLess(jac[0], jac[1])
+        self.assertLess(jac[1], jac[2])
+
+        # checks for sketch equivalency
+        self.assertTrue(theta_jaccard_similarity.exactly_equal(sk1, sk1))
+        self.assertFalse(theta_jaccard_similarity.exactly_equal(sk1, sk2))
+
+        # we can apply a check for similarity or dissimilarity at a
+        # given threshhold, at 97.7% confidence.
+
+        # check that the Jaccard Index is at most (upper bound) 0.2.
+        # exact result would be 1/7
+        self.assertTrue(theta_jaccard_similarity.dissimilarity_test(sk1, sk2, 0.2))
+
+        # check that the Jaccard Index is at least (lower bound) 0.7
+        # exact result would be 3/4, using result from A NOT B test
+        self.assertTrue(theta_jaccard_similarity.similarity_test(sk1, result, 0.7))
+
+
     def generate_theta_sketch(self, n, k, offset=0):
       sk = update_theta_sketch(k)
       for i in range(0, n):