blob: a7c085d18ef816e5bade06894ad91c135903b851 [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import unittest
from datasketches import hll_sketch, hll_union, tgt_hll_type
class HllTest(unittest.TestCase):
def test_hll_example(self):
k = 12 # 2^k = 4096 rows in the table
n = 1 << 18 # ~256k unique values
# create a couple sketches and inject some values
# we'll have 1/4 of the values overlap
hll = hll_sketch(k, tgt_hll_type.HLL_8)
hll2 = hll_sketch(k, tgt_hll_type.HLL_6)
offset = int(3 * n / 4) # it's a float w/o cast
# because we hash on the bits, not an abstract numeric value,
# hll.update(1) and hll.update(1.0) give different results.
for i in range(0, n):
hll.update(i)
hll2.update(i + offset)
# we can check that the upper and lower bounds bracket the
# estimate, without needing to know the exact value.
self.assertLessEqual(hll.get_lower_bound(1), hll.get_estimate())
self.assertGreaterEqual(hll.get_upper_bound(1), hll.get_estimate())
# unioning uses a separate class, and we can either get a result
# sketch or query the union object directly
union = hll_union(k)
union.update(hll)
union.update(hll2)
result = union.get_result()
self.assertEqual(result.get_estimate(), union.get_estimate())
# since our process here (including post-union HLL) is
# deterministic, we have checked and know the exact
# answer is within one standard deviation of the estimate
self.assertLessEqual(union.get_lower_bound(1), 7 * n / 4)
self.assertGreaterEqual(union.get_upper_bound(1), 7 * n / 4)
# serialize for storage and reconstruct
sk_bytes = result.serialize_compact()
self.assertEqual(len(sk_bytes), result.get_compact_serialization_bytes())
new_hll = hll_sketch.deserialize(sk_bytes)
# the sketch can self-report its configuation and status
self.assertEqual(new_hll.lg_config_k, k)
self.assertEqual(new_hll.tgt_type, tgt_hll_type.HLL_4)
self.assertFalse(new_hll.is_empty())
# if we want to reduce some object overhead, we can also reset
new_hll.reset()
self.assertTrue(new_hll.is_empty())
def test_hll_sketch(self):
k = 8
n = 117
hll = self.generate_sketch(n, k, tgt_hll_type.HLL_6)
hll.update('string data')
hll.update(3.14159) # double data
self.assertLessEqual(hll.get_lower_bound(1), hll.get_estimate())
self.assertGreaterEqual(hll.get_upper_bound(1), hll.get_estimate())
self.assertEqual(hll.lg_config_k, k)
self.assertEqual(hll.tgt_type, tgt_hll_type.HLL_6)
bytes_compact = hll.serialize_compact()
bytes_update = hll.serialize_updatable()
self.assertEqual(len(bytes_compact), hll.get_compact_serialization_bytes())
self.assertEqual(len(bytes_update), hll.get_updatable_serialization_bytes())
self.assertFalse(hll.is_compact())
self.assertFalse(hll.is_empty())
self.assertTrue(isinstance(hll_sketch.deserialize(bytes_compact), hll_sketch))
self.assertTrue(isinstance(hll_sketch.deserialize(bytes_update), hll_sketch))
self.assertIsNotNone(hll_sketch.get_rel_err(True, False, 12, 1))
self.assertIsNotNone(hll_sketch.get_max_updatable_serialization_bytes(20, tgt_hll_type.HLL_6))
hll.reset()
self.assertTrue(hll.is_empty())
def test_hll_union(self):
k = 7
n = 53
union = hll_union(k)
sk = self.generate_sketch(n, k, tgt_hll_type.HLL_4, 0)
union.update(sk)
sk = self.generate_sketch(3 * n, k, tgt_hll_type.HLL_4, n)
union.update(sk)
union.update('string data')
union.update(1.4142136)
self.assertLessEqual(union.get_lower_bound(1), union.get_estimate())
self.assertGreaterEqual(union.get_upper_bound(1), union.get_estimate())
self.assertEqual(union.lg_config_k, k)
self.assertFalse(union.is_empty())
sk = union.get_result()
self.assertTrue(isinstance(sk, hll_sketch))
self.assertEqual(sk.tgt_type, tgt_hll_type.HLL_4)
def generate_sketch(self, n, k, sk_type=tgt_hll_type.HLL_4, st_idx=0):
sk = hll_sketch(k, sk_type)
for i in range(st_idx, st_idx + n):
sk.update(i)
return sk
if __name__ == '__main__':
unittest.main()