|  | # Licensed to the Apache Software Foundation (ASF) under one | 
|  | # or more contributor license agreements.  See the NOTICE file | 
|  | # distributed with this work for additional information | 
|  | # regarding copyright ownership.  The ASF licenses this file | 
|  | # to you under the Apache License, Version 2.0 (the | 
|  | # "License"); you may not use this file except in compliance | 
|  | # with the License.  You may obtain a copy of the License at | 
|  | # | 
|  | #   http://www.apache.org/licenses/LICENSE-2.0 | 
|  | # | 
|  | # Unless required by applicable law or agreed to in writing, | 
|  | # software distributed under the License is distributed on an | 
|  | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | 
|  | # KIND, either express or implied.  See the License for the | 
|  | # specific language governing permissions and limitations | 
|  | # under the License. | 
|  |  | 
|  | import unittest | 
|  | from datasketches import count_min_sketch | 
|  |  | 
|  | class CountMinTest(unittest.TestCase): | 
|  | def test_count_min_example(self): | 
|  | # we'll define target confidence and relative error and use the built-in | 
|  | # methods to determine how many hashes and buckets to use | 
|  | confidence = 0.95 | 
|  | num_hashes = count_min_sketch.suggest_num_hashes(confidence) | 
|  | relative_error = 0.01 | 
|  | num_buckets = count_min_sketch.suggest_num_buckets(relative_error) | 
|  |  | 
|  | # now we can create a few empty sketches | 
|  | cm = count_min_sketch(num_hashes, num_buckets) | 
|  | cm2 = count_min_sketch(num_hashes, num_buckets) | 
|  | self.assertTrue(cm.is_empty()) | 
|  |  | 
|  | # we'll use a moderate number of distinct items with | 
|  | # increasing weights, with each item's weight being | 
|  | # equal to its value | 
|  | n = 1000 | 
|  | total_wt = 0 | 
|  | for i in range(1, n+1): | 
|  | cm.update(i, i) | 
|  | total_wt += i | 
|  | self.assertFalse(cm.is_empty()) | 
|  | self.assertEqual(cm.total_weight, total_wt) | 
|  |  | 
|  | # querying the items, each of them should | 
|  | # have a non-zero count.  the estimate should | 
|  | # be at least i with appropriately behaved bounds. | 
|  | for i in range(1, n+1): | 
|  | val = cm.get_estimate(i) | 
|  | self.assertGreaterEqual(val, i) | 
|  | self.assertGreaterEqual(val, cm.get_lower_bound(i)) | 
|  | self.assertGreater(cm.get_upper_bound(i), val) | 
|  |  | 
|  | # values not in the sketch should have lower estimates, but | 
|  | # are not guaranteed to be zero and will succeed | 
|  | self.assertIsNotNone(cm.get_estimate("not in set")) | 
|  |  | 
|  | # we can create another sketch with partial overlap | 
|  | # and merge them | 
|  | for i in range(int(n / 2), int(3 * n / 2)): | 
|  | cm2.update(i, i) | 
|  | cm.merge(cm2) | 
|  |  | 
|  | # and the estimated weight for the overlapped meerged values | 
|  | # (n/2 to n) should now be at least 2x the value | 
|  | self.assertGreaterEqual(cm.get_estimate(n), 2 * n) | 
|  |  | 
|  | # finally, serialize and reconstruct | 
|  | cm_bytes = cm.serialize() | 
|  | self.assertEqual(cm.get_serialized_size_bytes(), len(cm_bytes)) | 
|  | new_cm = count_min_sketch.deserialize(cm_bytes) | 
|  |  | 
|  | # and now interrogate the sketch | 
|  | self.assertFalse(new_cm.is_empty()) | 
|  | self.assertEqual(new_cm.num_hashes, cm.num_hashes) | 
|  | self.assertEqual(new_cm.num_buckets, cm.num_buckets) | 
|  | self.assertEqual(new_cm.total_weight, cm.total_weight) | 
|  |  | 
|  | # we can also iterate through values in and out of the sketch to ensure | 
|  | # the estimates match | 
|  | for i in range(0, 2 * n): | 
|  | self.assertEqual(cm.get_estimate(i), new_cm.get_estimate(i)) | 
|  |  | 
|  | # finally just check that printing works as expected | 
|  | self.assertGreater(len(cm.to_string()), 0) | 
|  | self.assertEqual(len(cm.__str__()), len(cm.to_string())) | 
|  |  | 
|  | if __name__ == '__main__': | 
|  | unittest.main() |