blob: f30e45d75883a113ccdf4cb404f5c5c19b7b9d79 [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import unittest
from datasketches import var_opt_sketch, var_opt_union
class VoTest(unittest.TestCase):
def test_vo_example(self):
k = 50 # a small value so we can easily fill the sketch
vo = var_opt_sketch(k)
# varopt sampling reduces to standard reservoir sampling
# if the items are all equally weighted, although the
# algorithm will be significantly slower than an optimized
# reservoir sampler
n = 5 * k
for i in range(0, n):
vo.update(i)
# we can also add a heavy item, using a negative weight for
# easy filtering later. keep in mind that "heavy" is a
# relative concept, so using a fixed multiple of n may not
# be considered a heavy item for larger values of n
vo.update(-1, 1000 * n)
self.assertEqual(k, vo.k)
self.assertEqual(k, vo.num_samples)
self.assertEqual(n + 1, vo.n)
self.assertFalse(vo.is_empty())
# we can easily get the list of items in the sample
items = vo.get_samples()
self.assertEqual(len(items), k)
# we can also apply a predicate to the sketch to get an estimate
# (with optimially minimal variance) of the subset sum of items
# matching that predicate among the entire population
# we'll use a lambda here, but any function operating on a single
# item which returns a boolean value should work
summary = vo.estimate_subset_sum(lambda x: x < 0)
self.assertEqual(summary['estimate'], 1000 * n)
self.assertEqual(summary['total_sketch_weight'], 1001 * n)
# a regular function is similarly handled
def geq_zero(x):
return x >= 0
summary = vo.estimate_subset_sum(geq_zero)
self.assertEqual(summary['estimate'], n)
self.assertEqual(summary['total_sketch_weight'], 1001 * n)
# next we'll create a second, smaller sketch with
# only heavier items relative to the previous sketch,
# but with the sketch in sampling mode
k2 = 5
vo2 = var_opt_sketch(k2)
# for weight, use the estimate of all items >=0 from before
wt = summary['estimate']
for i in range(0, k2 + 1):
vo2.update((2 * n) + i, wt)
# now union the sketches, demonstrating how the
# union's k may not be equal to that of either
# input value
union = var_opt_union(k)
union.update(vo)
union.update(vo2)
result = union.get_result()
self.assertEqual(n + k2 + 2, result.n)
self.assertFalse(result.is_empty())
self.assertGreater(result.k, k2)
self.assertLess(result.k, k)
# we can compare what information is available from both
# the union and a sketch.
print(union)
# if we want to print the list of itmes, there must be a
# __str__() method for each item (which need not be the same
# type; they're all generic python objects when used from
# python), otherwise you may trigger an exception.
# to_string() is provided as a convenince to avoid direct
# calls to __str__() with parameters.
print(result.to_string(True))
if __name__ == '__main__':
unittest.main()