apache / datasketches-cpp / 6f04d3f16938a3a7dc62a0c3c912b92bcb5d289f / . / python / tests / vo_test.py

# Licensed to the Apache Software Foundation (ASF) under one | |

# or more contributor license agreements. See the NOTICE file | |

# distributed with this work for additional information | |

# regarding copyright ownership. The ASF licenses this file | |

# to you under the Apache License, Version 2.0 (the | |

# "License"); you may not use this file except in compliance | |

# with the License. You may obtain a copy of the License at | |

# | |

# http://www.apache.org/licenses/LICENSE-2.0 | |

# | |

# Unless required by applicable law or agreed to in writing, | |

# software distributed under the License is distributed on an | |

# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |

# KIND, either express or implied. See the License for the | |

# specific language governing permissions and limitations | |

# under the License. | |

import unittest | |

from datasketches import var_opt_sketch, var_opt_union | |

class VoTest(unittest.TestCase): | |

def test_vo_example(self): | |

k = 50 # a small value so we can easily fill the sketch | |

vo = var_opt_sketch(k) | |

# varopt sampling reduces to standard reservoir sampling | |

# if the items are all equally weighted, although the | |

# algorithm will be significantly slower than an optimized | |

# reservoir sampler | |

n = 5 * k | |

for i in range(0, n): | |

vo.update(i) | |

# we can also add a heavy item, using a negative weight for | |

# easy filtering later. keep in mind that "heavy" is a | |

# relative concept, so using a fixed multiple of n may not | |

# be considered a heavy item for larger values of n | |

vo.update(-1, 1000 * n) | |

self.assertEqual(k, vo.k) | |

self.assertEqual(k, vo.num_samples) | |

self.assertEqual(n + 1, vo.n) | |

self.assertFalse(vo.is_empty()) | |

# we can easily get the list of items in the sample | |

items = vo.get_samples() | |

self.assertEqual(len(items), k) | |

# we can also apply a predicate to the sketch to get an estimate | |

# (with optimially minimal variance) of the subset sum of items | |

# matching that predicate among the entire population | |

# we'll use a lambda here, but any function operating on a single | |

# item which returns a boolean value should work | |

summary = vo.estimate_subset_sum(lambda x: x < 0) | |

self.assertEqual(summary['estimate'], 1000 * n) | |

self.assertEqual(summary['total_sketch_weight'], 1001 * n) | |

# a regular function is similarly handled | |

def geq_zero(x): | |

return x >= 0 | |

summary = vo.estimate_subset_sum(geq_zero) | |

self.assertEqual(summary['estimate'], n) | |

self.assertEqual(summary['total_sketch_weight'], 1001 * n) | |

# next we'll create a second, smaller sketch with | |

# only heavier items relative to the previous sketch, | |

# but with the sketch in sampling mode | |

k2 = 5 | |

vo2 = var_opt_sketch(k2) | |

# for weight, use the estimate of all items >=0 from before | |

wt = summary['estimate'] | |

for i in range(0, k2 + 1): | |

vo2.update((2 * n) + i, wt) | |

# now union the sketches, demonstrating how the | |

# union's k may not be equal to that of either | |

# input value | |

union = var_opt_union(k) | |

union.update(vo) | |

union.update(vo2) | |

result = union.get_result() | |

self.assertEqual(n + k2 + 2, result.n) | |

self.assertFalse(result.is_empty()) | |

self.assertGreater(result.k, k2) | |

self.assertLess(result.k, k) | |

# we can compare what information is available from both | |

# the union and a sketch. | |

print(union) | |

# if we want to print the list of itmes, there must be a | |

# __str__() method for each item (which need not be the same | |

# type; they're all generic python objects when used from | |

# python), otherwise you may trigger an exception. | |

# to_string() is provided as a convenince to avoid direct | |

# calls to __str__() with parameters. | |

print(result.to_string(True)) | |

if __name__ == '__main__': | |

unittest.main() |