Add req python tests, fix minor issues in kll python wrapper/tests

commit: fc6c45e52d0bd44a92897a0b138455a05d8e1eed [log] [tgz]
author: Jon Malkin <jmalkin@users.noreply.github.com> Thu Feb 11 01:50:59 2021 -0800
committer: Jon Malkin <jmalkin@users.noreply.github.com> Thu Feb 11 01:50:59 2021 -0800
tree: 1708e34952c5039057956da3b1eef91de77fa37b
parent: 31c77410f3451b085d439686f579435a0c45a22c [diff]
diff --git a/python/src/kll_wrapper.cpp b/python/src/kll_wrapper.cpp
index 13e6a6f..d356358 100644
--- a/python/src/kll_wrapper.cpp
+++ b/python/src/kll_wrapper.cpp

@@ -200,7 +200,7 @@
          "If pmf is True, returns the 'double-sided' normalized rank error for the get_PMF() function.\n"
          "Otherwise, it is the 'single-sided' normalized rank error for all the other queries.\n"
          "Constants were derived as the best fit to 99 percentile empirically measured max error in thousands of trials")
-    .def("serialize", &dspy::kll_sketch_serialize<T>, "Serailizes the sketch into a bytes object")
+    .def("serialize", &dspy::kll_sketch_serialize<T>, "Serializes the sketch into a bytes object")
     .def_static("deserialize", &dspy::kll_sketch_deserialize<T>, "Deserializes the sketch from a bytes object")
     ;
 }

diff --git a/python/src/req_wrapper.cpp b/python/src/req_wrapper.cpp
index c524416..9ef0b87 100644
--- a/python/src/req_wrapper.cpp
+++ b/python/src/req_wrapper.cpp

@@ -223,7 +223,7 @@
          "Returns an approximate lower bound on the given normalized rank.\n"
          "Normalized rank must be a value between 0.0 and 1.0 (inclusive); "
          "the number of standard deviations must be 1, 2, or 3.")
-    .def("get_rank_lower_bound", &req_sketch<T>::get_rank_upper_bound, py::arg("rank"), py::arg("num_std_dev"),
+    .def("get_rank_upper_bound", &req_sketch<T>::get_rank_upper_bound, py::arg("rank"), py::arg("num_std_dev"),
          "Returns an approximate upper bound on the given normalized rank.\n"
          "Normalized rank must be a value between 0.0 and 1.0 (inclusive); "
          "the number of standard deviations must be 1, 2, or 3.")
@@ -235,12 +235,12 @@
          "Normalized rank must be a value between 0.0 and 1.0 (inclusive). If is_hra is True, uses high "
          "rank accuracy mode, else low rank accuracy. N is an estimate of the total number of points "
          "provided to the sketch.")
-    .def("serialize", &dspy::req_sketch_serialize<T>, "Serailizes the sketch into a bytes object")
+    .def("serialize", &dspy::req_sketch_serialize<T>, "Serializes the sketch into a bytes object")
     .def_static("deserialize", &dspy::req_sketch_deserialize<T>, "Deserializes the sketch from a bytes object")
     ;
 }
 
 void init_req(py::module &m) {
-  //bind_req_sketch<int>(m, "req_ints_sketch");
+  bind_req_sketch<int>(m, "req_ints_sketch");
   bind_req_sketch<float>(m, "req_floats_sketch");
 }

diff --git a/python/tests/kll_test.py b/python/tests/kll_test.py
index 929da6b..696260f 100644
--- a/python/tests/kll_test.py
+++ b/python/tests/kll_test.py

@@ -16,9 +16,7 @@
 # under the License.
 
 import unittest
-from datasketches import (kll_ints_sketch, kll_floats_sketch, 
-                          vector_of_kll_ints_sketches,
-                          vector_of_kll_floats_sketches)
+from datasketches import kll_ints_sketch, kll_floats_sketch
 import numpy as np
 
 class KllTest(unittest.TestCase):
@@ -59,6 +57,7 @@
       self.assertFalse(kll.is_empty())
       self.assertTrue(kll.is_estimation_mode())
       self.assertEqual(kll.get_n(), n)
+      self.assertEqual(kll.get_k(), k)
       self.assertLess(kll.get_num_retained(), n)
 
       # merging itself will double the number of items the sketch has seen

diff --git a/python/tests/req_test.py b/python/tests/req_test.py
new file mode 100644
index 0000000..1e39bb7
--- /dev/null
+++ b/python/tests/req_test.py

@@ -0,0 +1,126 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import unittest
+from datasketches import req_ints_sketch, req_floats_sketch
+import numpy as np
+
+class reqTest(unittest.TestCase):
+    def test_req_example(self):
+      k = 12
+      n = 2 ** 20
+
+      # create a sketch and inject ~1 million N(0,1) points as an array and as a single item
+      req = req_floats_sketch(k, True) # high rank accuracy
+      req.update(np.random.normal(size=n-1))
+      req.update(0.0)
+
+      # 0 should be near the median
+      self.assertAlmostEqual(0.5, req.get_rank(0.0), delta=0.03)
+      
+      # the median should be near 0
+      self.assertAlmostEqual(0.0, req.get_quantile(0.5), delta=0.03)
+
+      # we also track the min/max independently from the rest of the data
+      # which lets us know the full observed data range
+      self.assertLessEqual(req.get_min_value(), req.get_quantile(0.01))
+      self.assertLessEqual(0.0, req.get_rank(req.get_min_value()))
+      self.assertGreaterEqual(req.get_max_value(), req.get_quantile(0.99))
+      self.assertGreaterEqual(1.0, req.get_rank(req.get_max_value()))
+
+      # we can also extract a list of values at a time,
+      # here the values should give us something close to [-2, -1, 0, 1, 2].
+      # then get the CDF, which will return something close to
+      # the original values used in get_quantiles()
+      # finally, can check the normalized rank error bound
+      pts = req.get_quantiles([0.0228, 0.1587, 0.5, 0.8413, 0.9772])
+      cdf = req.get_cdf(pts)  # include 1.0 at end to account for all probability mass
+      self.assertEqual(len(cdf), len(pts)+1)
+      
+      # For relative error quantiles, the error depends on the actual rank
+      # so we need to use that to determine the bounds
+      est = req.get_rank(0.999, True)
+      lb = req.get_rank_lower_bound(est, 1)
+      ub = req.get_rank_upper_bound(est, 1)
+      self.assertLessEqual(lb, est)
+      self.assertLessEqual(est, ub)
+
+      # and a few basic queries about the sketch
+      self.assertFalse(req.is_empty())
+      self.assertTrue(req.is_estimation_mode())
+      self.assertEqual(req.get_n(), n)
+      self.assertLess(req.get_num_retained(), n)
+      self.assertEqual(req.get_k(), k)
+
+      # merging itself will double the number of items the sketch has seen
+      req.merge(req)
+      self.assertEqual(req.get_n(), 2*n)
+
+      # we can then serialize and reconstruct the sketch
+      req_bytes = req.serialize()
+      new_req = req.deserialize(req_bytes)
+      self.assertEqual(req.get_num_retained(), new_req.get_num_retained())
+      self.assertEqual(req.get_min_value(), new_req.get_min_value())
+      self.assertEqual(req.get_max_value(), new_req.get_max_value())
+      self.assertEqual(req.get_quantile(0.7), new_req.get_quantile(0.7))
+      self.assertEqual(req.get_rank(0.0), new_req.get_rank(0.0))
+
+    def test_req_ints_sketch(self):
+        k = 100
+        n = 10
+        req = req_ints_sketch(k)
+        for i in range(0, n):
+          req.update(i)
+
+        self.assertEqual(req.get_min_value(), 0)
+        self.assertEqual(req.get_max_value(), n-1)
+        self.assertEqual(req.get_n(), n)
+        self.assertFalse(req.is_empty())
+        self.assertFalse(req.is_estimation_mode()) # n < k
+        self.assertEqual(req.get_k(), k)
+
+        pmf = req.get_pmf([round(n/2)])
+        self.assertIsNotNone(pmf)
+        self.assertEqual(len(pmf), 2)
+
+        cdf = req.get_cdf([round(n/2)])
+        self.assertIsNotNone(cdf)
+        self.assertEqual(len(cdf), 2)
+
+        self.assertEqual(req.get_quantile(0.5), round(n/2))
+        quants = req.get_quantiles([0.25, 0.5, 0.75])
+        self.assertIsNotNone(quants)
+        self.assertEqual(len(quants), 3)
+
+        self.assertEqual(req.get_rank(round(n/2)), 0.5)
+
+        # merge self
+        req.merge(req)
+        self.assertEqual(req.get_n(), 2 * n)
+
+        sk_bytes = req.serialize()
+        self.assertTrue(isinstance(req_ints_sketch.deserialize(sk_bytes), req_ints_sketch))
+
+    def test_req_floats_sketch(self):
+      # already tested ints and it's templatized, so just make sure it instantiates properly
+      k = 75
+      req = req_floats_sketch(k, False) # low rank accuracy
+      self.assertTrue(req.is_empty())
+      self.assertFalse(req.is_hra())
+
+if __name__ == '__main__':
+    unittest.main()
commit	fc6c45e52d0bd44a92897a0b138455a05d8e1eed	[log] [tgz]
author	Jon Malkin <jmalkin@users.noreply.github.com>	Thu Feb 11 01:50:59 2021 -0800
committer	Jon Malkin <jmalkin@users.noreply.github.com>	Thu Feb 11 01:50:59 2021 -0800
tree	1708e34952c5039057956da3b1eef91de77fa37b
parent	31c77410f3451b085d439686f579435a0c45a22c [diff]