tests/count_min_test.py - datasketches-python - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 import unittest
 from datasketches import count_min_sketch

 class CountMinTest(unittest.TestCase):
   def test_count_min_example(self):
     # we'll define target confidence and relative error and use the built-in
     # methods to determine how many hashes and buckets to use
     confidence = 0.95
     num_hashes = count_min_sketch.suggest_num_hashes(confidence)
     relative_error = 0.01
     num_buckets = count_min_sketch.suggest_num_buckets(relative_error)

     # now we can create a few empty sketches
     cm = count_min_sketch(num_hashes, num_buckets)
     cm2 = count_min_sketch(num_hashes, num_buckets)
     self.assertTrue(cm.is_empty())

     # we'll use a moderate number of distinct items with
     # increasing weights, with each item's weight being
     # equal to its value
     n = 1000
     total_wt = 0
     for i in range(1, n+1):
       cm.update(i, i)
       total_wt += i
     self.assertFalse(cm.is_empty())
     self.assertEqual(cm.total_weight, total_wt)

     # querying the items, each of them should
     # have a non-zero count.  the estimate should
     # be at least i with appropriately behaved bounds.
     for i in range(1, n+1):
       val = cm.get_estimate(i)
       self.assertGreaterEqual(val, i)
       self.assertGreaterEqual(val, cm.get_lower_bound(i))
       self.assertGreater(cm.get_upper_bound(i), val)

     # values not in the sketch should have lower estimates, but
     # are not guaranteed to be zero and will succeed
     self.assertIsNotNone(cm.get_estimate("not in set"))

     # we can create another sketch with partial overlap
     # and merge them
     for i in range(int(n / 2), int(3 * n / 2)):
       cm2.update(i, i)
     cm.merge(cm2)

     # and the estimated weight for the overlapped meerged values
     # (n/2 to n) should now be at least 2x the value
     self.assertGreaterEqual(cm.get_estimate(n), 2 * n)

     # finally, serialize and reconstruct
     cm_bytes = cm.serialize()
     self.assertEqual(cm.get_serialized_size_bytes(), len(cm_bytes))
     new_cm = count_min_sketch.deserialize(cm_bytes)

     # and now interrogate the sketch
     self.assertFalse(new_cm.is_empty())
     self.assertEqual(new_cm.num_hashes, cm.num_hashes)
     self.assertEqual(new_cm.num_buckets, cm.num_buckets)
     self.assertEqual(new_cm.total_weight, cm.total_weight)

     # we can also iterate through values in and out of the sketch to ensure
     # the estimates match
     for i in range(0, 2 * n):
       self.assertEqual(cm.get_estimate(i), new_cm.get_estimate(i))

     # finally just check that printing works as expected
     self.assertGreater(len(cm.to_string()), 0)
     self.assertEqual(len(cm.__str__()), len(cm.to_string()))

 if __name__ == '__main__':
     unittest.main()
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	import unittest
	from datasketches import count_min_sketch

	class CountMinTest(unittest.TestCase):
	def test_count_min_example(self):
	# we'll define target confidence and relative error and use the built-in
	# methods to determine how many hashes and buckets to use
	confidence = 0.95
	num_hashes = count_min_sketch.suggest_num_hashes(confidence)
	relative_error = 0.01
	num_buckets = count_min_sketch.suggest_num_buckets(relative_error)

	# now we can create a few empty sketches
	cm = count_min_sketch(num_hashes, num_buckets)
	cm2 = count_min_sketch(num_hashes, num_buckets)
	self.assertTrue(cm.is_empty())

	# we'll use a moderate number of distinct items with
	# increasing weights, with each item's weight being
	# equal to its value
	n = 1000
	total_wt = 0
	for i in range(1, n+1):
	cm.update(i, i)
	total_wt += i
	self.assertFalse(cm.is_empty())
	self.assertEqual(cm.total_weight, total_wt)

	# querying the items, each of them should
	# have a non-zero count. the estimate should
	# be at least i with appropriately behaved bounds.
	for i in range(1, n+1):
	val = cm.get_estimate(i)
	self.assertGreaterEqual(val, i)
	self.assertGreaterEqual(val, cm.get_lower_bound(i))
	self.assertGreater(cm.get_upper_bound(i), val)

	# values not in the sketch should have lower estimates, but
	# are not guaranteed to be zero and will succeed
	self.assertIsNotNone(cm.get_estimate("not in set"))

	# we can create another sketch with partial overlap
	# and merge them
	for i in range(int(n / 2), int(3 * n / 2)):
	cm2.update(i, i)
	cm.merge(cm2)

	# and the estimated weight for the overlapped meerged values
	# (n/2 to n) should now be at least 2x the value
	self.assertGreaterEqual(cm.get_estimate(n), 2 * n)

	# finally, serialize and reconstruct
	cm_bytes = cm.serialize()
	self.assertEqual(cm.get_serialized_size_bytes(), len(cm_bytes))
	new_cm = count_min_sketch.deserialize(cm_bytes)

	# and now interrogate the sketch
	self.assertFalse(new_cm.is_empty())
	self.assertEqual(new_cm.num_hashes, cm.num_hashes)
	self.assertEqual(new_cm.num_buckets, cm.num_buckets)
	self.assertEqual(new_cm.total_weight, cm.total_weight)

	# we can also iterate through values in and out of the sketch to ensure
	# the estimates match
	for i in range(0, 2 * n):
	self.assertEqual(cm.get_estimate(i), new_cm.get_estimate(i))

	# finally just check that printing works as expected
	self.assertGreater(len(cm.to_string()), 0)
	self.assertEqual(len(cm.__str__()), len(cm.to_string()))

	if __name__ == '__main__':
	unittest.main()