blob: a14c0c10a5cb4c3c24b01ab80309f48b5d2fa8a8 [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import unittest
from datasketches import count_min_sketch
class CountMinTest(unittest.TestCase):
def test_count_min_example(self):
# we'll define target confidence and relative error and use the built-in
# methods to determine how many hashes and buckets to use
confidence = 0.95
num_hashes = count_min_sketch.suggest_num_hashes(confidence)
relative_error = 0.01
num_buckets = count_min_sketch.suggest_num_buckets(relative_error)
# now we can create a few empty sketches
cm = count_min_sketch(num_hashes, num_buckets)
cm2 = count_min_sketch(num_hashes, num_buckets)
self.assertTrue(cm.is_empty())
# we'll use a moderate number of distinct items with
# increasing weights, with each item's weight being
# equal to its value
n = 1000
total_wt = 0
for i in range(1, n+1):
cm.update(i, i)
total_wt += i
self.assertFalse(cm.is_empty())
self.assertEqual(cm.get_total_weight(), total_wt)
# querying the items, each of them should
# have a non-zero count. the estimate should
# be at least i with appropriately behaved bounds.
for i in range(1, n+1):
val = cm.get_estimate(i)
self.assertGreaterEqual(val, i)
self.assertGreaterEqual(val, cm.get_lower_bound(i))
self.assertGreater(cm.get_upper_bound(i), val)
# values not in the sketch should have lower estimates, but
# are not guaranteed to be zero and will succeed
self.assertIsNotNone(cm.get_estimate("not in set"))
# we can create another sketch with partial overlap
# and merge them
for i in range(int(n / 2), int(3 * n / 2)):
cm2.update(i, i)
cm.merge(cm2)
# and the estimated weight for the overlapped meerged values
# (n/2 to n) should now be at least 2x the value
self.assertGreaterEqual(cm.get_estimate(n), 2 * n)
# finally, serialize and reconstruct
cm_bytes = cm.serialize()
self.assertEqual(cm.get_serialized_size_bytes(), len(cm_bytes))
new_cm = count_min_sketch.deserialize(cm_bytes)
# and now interrogate the sketch
self.assertFalse(new_cm.is_empty())
self.assertEqual(new_cm.get_num_hashes(), cm.get_num_hashes())
self.assertEqual(new_cm.get_num_buckets(), cm.get_num_buckets())
self.assertEqual(new_cm.get_total_weight(), cm.get_total_weight())
# we can also iterate through values in and out of the sketch to ensure
# the estimates match
for i in range(0, 2 * n):
self.assertEqual(cm.get_estimate(i), new_cm.get_estimate(i))
if __name__ == '__main__':
unittest.main()