|  | # Licensed to the Apache Software Foundation (ASF) under one | 
|  | # or more contributor license agreements.  See the NOTICE file | 
|  | # distributed with this work for additional information | 
|  | # regarding copyright ownership.  The ASF licenses this file | 
|  | # to you under the Apache License, Version 2.0 (the | 
|  | # "License"); you may not use this file except in compliance | 
|  | # with the License.  You may obtain a copy of the License at | 
|  | # | 
|  | #   http://www.apache.org/licenses/LICENSE-2.0 | 
|  | # | 
|  | # Unless required by applicable law or agreed to in writing, | 
|  | # software distributed under the License is distributed on an | 
|  | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | 
|  | # KIND, either express or implied.  See the License for the | 
|  | # specific language governing permissions and limitations | 
|  | # under the License. | 
|  |  | 
|  | import unittest | 
|  | from datasketches import kll_ints_sketch, kll_floats_sketch, kll_doubles_sketch | 
|  | from datasketches import kll_items_sketch, ks_test, PyStringsSerDe | 
|  | import copy | 
|  | import numpy as np | 
|  |  | 
|  | class KllTest(unittest.TestCase): | 
|  | def test_kll_floats_example(self): | 
|  | k = 160 | 
|  | n = 2 ** 20 | 
|  |  | 
|  | # create a sketch and inject ~1 million N(0,1) points as an array and as a single item | 
|  | kll = kll_floats_sketch(k) | 
|  | kll.update(np.random.normal(size=n-1)) | 
|  | kll.update(0.0) | 
|  |  | 
|  | # 0 should be near the median | 
|  | self.assertAlmostEqual(0.5, kll.get_rank(0.0), delta=0.035) | 
|  |  | 
|  | # the median should be near 0 | 
|  | self.assertAlmostEqual(0.0, kll.get_quantile(0.5), delta=0.035) | 
|  |  | 
|  | # we also track the min/max independently from the rest of the data | 
|  | # which lets us know the full observed data range | 
|  | self.assertLessEqual(kll.get_min_value(), kll.get_quantile(0.01)) | 
|  | self.assertLessEqual(0.0, kll.get_rank(kll.get_min_value())) | 
|  | self.assertGreaterEqual(kll.get_max_value(), kll.get_quantile(0.99)) | 
|  | self.assertGreaterEqual(1.0, kll.get_rank(kll.get_max_value())) | 
|  |  | 
|  | # we can also extract a list of values at a time, | 
|  | # here the values should give us something close to [-2, -1, 0, 1, 2]. | 
|  | # then get the CDF, which will return something close to | 
|  | # the original values used in get_quantiles() | 
|  | # finally, can check the normalized rank error bound | 
|  | pts = kll.get_quantiles([0.0228, 0.1587, 0.5, 0.8413, 0.9772]) | 
|  | cdf = kll.get_cdf(pts)  # include 1.0 at end to account for all probability mass | 
|  | self.assertEqual(len(cdf), len(pts)+1) | 
|  | err = kll.normalized_rank_error(False) | 
|  | self.assertEqual(err, kll_floats_sketch.get_normalized_rank_error(k, False)) | 
|  |  | 
|  | # and a few basic queries about the sketch | 
|  | self.assertFalse(kll.is_empty()) | 
|  | self.assertTrue(kll.is_estimation_mode()) | 
|  | self.assertEqual(kll.n, n) | 
|  | self.assertEqual(kll.k, k) | 
|  | self.assertLess(kll.num_retained, n) | 
|  |  | 
|  | # merging itself will double the number of items the sketch has seen | 
|  | # but need to do that with a copy | 
|  | kll_copy = copy.copy(kll) | 
|  | kll.merge(kll_copy) | 
|  | self.assertEqual(kll.n, 2*n) | 
|  |  | 
|  | # we can then serialize and reconstruct the sketch | 
|  | kll_bytes = kll.serialize() | 
|  | new_kll = kll_floats_sketch.deserialize(kll_bytes) | 
|  | self.assertEqual(kll.num_retained, new_kll.num_retained) | 
|  | self.assertEqual(kll.get_min_value(), new_kll.get_min_value()) | 
|  | self.assertEqual(kll.get_max_value(), new_kll.get_max_value()) | 
|  | self.assertEqual(kll.get_quantile(0.7), new_kll.get_quantile(0.7)) | 
|  | self.assertEqual(kll.get_rank(0.0), new_kll.get_rank(0.0)) | 
|  |  | 
|  | # A Kolmogorov-Smirnov Test of kll and new_kll should match, even for | 
|  | # a fairly small p-value -- cannot reject the null hypothesis that | 
|  | # they come from the same distribution (since they do) | 
|  | self.assertFalse(ks_test(kll, new_kll, 0.001)) | 
|  |  | 
|  | total_weight = 0 | 
|  | for tuple in kll: | 
|  | item = tuple[0] | 
|  | weight = tuple[1] | 
|  | total_weight = total_weight + weight | 
|  | self.assertEqual(total_weight, kll.n) | 
|  |  | 
|  | def test_kll_ints_sketch(self): | 
|  | k = 100 | 
|  | n = 10 | 
|  | kll = kll_ints_sketch(k) | 
|  | for i in range(0, n): | 
|  | kll.update(i) | 
|  |  | 
|  | self.assertEqual(kll.get_min_value(), 0) | 
|  | self.assertEqual(kll.get_max_value(), n-1) | 
|  | self.assertEqual(kll.n, n) | 
|  | self.assertFalse(kll.is_empty()) | 
|  | self.assertFalse(kll.is_estimation_mode()) # n < k | 
|  | self.assertEqual(kll.k, k) | 
|  |  | 
|  | pmf = kll.get_pmf([round(n/2)]) | 
|  | self.assertIsNotNone(pmf) | 
|  | self.assertEqual(len(pmf), 2) | 
|  |  | 
|  | cdf = kll.get_cdf([round(n/2)]) | 
|  | self.assertIsNotNone(cdf) | 
|  | self.assertEqual(len(cdf), 2) | 
|  |  | 
|  | self.assertEqual(kll.get_quantile(0.5), round(n/2)) | 
|  | quants = kll.get_quantiles([0.25, 0.5, 0.75]) | 
|  | self.assertIsNotNone(quants) | 
|  | self.assertEqual(len(quants), 3) | 
|  |  | 
|  | self.assertEqual(kll.get_rank(round(n/2)), 0.5) | 
|  |  | 
|  | # merge copy of self | 
|  | kll_copy = copy.copy(kll) | 
|  | kll.merge(kll_copy) | 
|  | self.assertEqual(kll.n, 2 * n) | 
|  |  | 
|  | sk_bytes = kll.serialize() | 
|  | self.assertTrue(isinstance(kll_ints_sketch.deserialize(sk_bytes), kll_ints_sketch)) | 
|  |  | 
|  | def test_kll_doubles_sketch(self): | 
|  | # already tested float and ints and it's templatized, so just make sure it instantiates properly | 
|  | k = 75 | 
|  | kll = kll_doubles_sketch(k) | 
|  | self.assertTrue(kll.is_empty()) | 
|  |  | 
|  | def test_kll_items_sketch(self): | 
|  | # most functionality has been tested, but we need to ensure objects and sorting work | 
|  | # as well as serialization | 
|  | k = 100 | 
|  | n = 2 ** 16 | 
|  |  | 
|  | # create a sketch and inject enough points to force compaction | 
|  | kll = kll_items_sketch(k) | 
|  | for i in range(0, n): | 
|  | kll.update(str(i)) | 
|  |  | 
|  | # check iterator for items | 
|  | total_weight = 0 | 
|  | for tuple in kll: | 
|  | item = tuple[0] | 
|  | weight = tuple[1] | 
|  | total_weight = total_weight + weight | 
|  | self.assertEqual(total_weight, kll.n) | 
|  |  | 
|  | kll_copy = copy.copy(kll) | 
|  | kll.merge(kll_copy) | 
|  | self.assertEqual(kll.n, 2 * n) | 
|  |  | 
|  | kll_bytes = kll.serialize(PyStringsSerDe()) | 
|  | new_kll = kll_items_sketch.deserialize(kll_bytes, PyStringsSerDe()) | 
|  | self.assertEqual(kll.num_retained, new_kll.num_retained) | 
|  | self.assertEqual(kll.get_min_value(), new_kll.get_min_value()) | 
|  | self.assertEqual(kll.get_max_value(), new_kll.get_max_value()) | 
|  | self.assertEqual(kll.get_quantile(0.7), new_kll.get_quantile(0.7)) | 
|  | self.assertEqual(kll.get_rank(str(n/4)), new_kll.get_rank(str(n/4))) | 
|  |  | 
|  | # The sketches are identical so we cannot reject the null hypothesis that they | 
|  | # reflect the same underlying distribtion | 
|  | self.assertFalse(ks_test(kll, new_kll, 0.001)) | 
|  |  | 
|  | # check that printing works as expected | 
|  | self.assertGreater(len(kll.to_string(True, True)), 0) | 
|  | self.assertEqual(len(kll.__str__()), len(kll.to_string())) | 
|  |  | 
|  | if __name__ == '__main__': | 
|  | unittest.main() |