# blob: 9029ecf3735f9b86609035a37cfee0467e580bc4 [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import unittest
from datasketches import cpc_sketch, cpc_union
class CpcTest(unittest.TestCase):
    """Example-style test for the CPC sketch and CPC union bindings."""

    def test_cpc_example(self):
        """Sketch two overlapping streams, union them, and round-trip the result."""
        lg_k = 12  # 2^lg_k = 4096 rows in the table
        num_values = 1 << 18  # ~256k unique values fed to each sketch

        # build a couple of sketches; the second stream is shifted by
        # 3/4 of the stream length so that 1/4 of the values overlap.
        # floor division keeps the shift an int without a cast.
        first = cpc_sketch(lg_k)
        second = cpc_sketch(lg_k)
        shift = 3 * num_values // 4

        # because we hash on the bits, not an abstract numeric value,
        # update(1) and update(1.0) give different results.
        for value in range(num_values):
            first.update(value)
            second.update(value + shift)

        # although both get_composite_estimate() and get_estimate() are
        # provided, the latter will always give the best available
        # estimate, so get_estimate() is the recommended call.
        # without knowing the exact value we can still check that the
        # lower and upper bounds bracket the estimate.
        estimate = first.get_estimate()
        self.assertLessEqual(first.get_lower_bound(1), estimate)
        self.assertGreaterEqual(first.get_upper_bound(1), estimate)

        # unioning uses a separate class, and we need get_result()
        # to query the unioned sketches
        merger = cpc_union(lg_k)
        for sketch in (first, second):
            merger.update(sketch)
        merged = merger.get_result()

        # the two streams cover [0, n) and [3n/4, 7n/4), so the exact
        # number of distinct values is 7n/4. since the process here
        # (including post-union CPC) is deterministic, we have checked
        # and know the exact answer is within one standard deviation
        # of the estimate.
        exact_distinct = 7 * num_values / 4
        self.assertLessEqual(merged.get_lower_bound(1), exact_distinct)
        self.assertGreaterEqual(merged.get_upper_bound(1), exact_distinct)

        # serialize for storage, then reconstruct from the bytes
        restored = cpc_sketch.deserialize(merged.serialize())
        self.assertFalse(restored.is_empty())
# Run the test suite when this file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()