blob: 61ee03c94449484887ddd709c8d1df31bae93872 [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import unittest
from datasketches import frequent_strings_sketch, frequent_items_error_type
class FiTest(unittest.TestCase):
def test_fi_example(self):
k = 3 # a small value so we can easily fill the sketch
fi = frequent_strings_sketch(k)
# we'll use a small number of distinct items so we
# can use exponentially increasing weights and have
# some frequent items, decreasing so we have some
# small items inserted after a purge
n = 8
for i in range(0, n):
fi.update(str(i), 2 ** (n - i))
# there are two ways to extract items :
# * NO_FALSE_POSITIVES includes all items with a lower bound
# above the a posteriori error
# * NO_FALSE_NEGATIVES includes all items with an uper bound
# above the a posteriori error
# a more complete discussion may be found at
# https://datasketches.github.io/docs/Frequency/FrequentItemsOverview.html
items_no_fp = fi.get_frequent_items(frequent_items_error_type.NO_FALSE_POSITIVES)
items_no_fn = fi.get_frequent_items(frequent_items_error_type.NO_FALSE_NEGATIVES)
self.assertLessEqual(len(items_no_fp), len(items_no_fn))
# the items list returns a decreasing weight-sorted list, and
# for each item we have (item, estimate, lower_bound, upper_bound)
item = items_no_fp[1]
self.assertLessEqual(item[2], item[1]) # lower bound vs estimate
self.assertLessEqual(item[1], item[3]) # estimate vs upper bound
# we can also query directly for a specific item
id = items_no_fn[0][0]
est = fi.get_estimate(id)
lb = fi.get_lower_bound(id)
ub = fi.get_upper_bound(id)
self.assertLessEqual(lb, est)
self.assertLessEqual(est, ub)
# the values are zero if the item isn't in our list
self.assertEqual(fi.get_estimate("NaN"), 0)
# now create a second sketch with a lot of unique
# values but all with equal weight (of 1) such that
# the total weight is much larger than the first sketch
fi2 = frequent_strings_sketch(k)
wt = fi.get_total_weight()
for i in range(0, 4*wt):
fi2.update(str(i))
# merge the second sketch into the first
fi.merge(fi2)
# we can see that the weight is much larger
self.assertEqual(5 * wt, fi.get_total_weight())
# querying with NO_FALSE_POSITIVES means we don't find anything
# heavy enough to return
items_no_fp = fi.get_frequent_items(frequent_items_error_type.NO_FALSE_POSITIVES)
self.assertEqual(len(items_no_fp), 0)
# we do, however, find a few potential heavy items
# if querying with NO_FALSE_NEGATIVES
items_no_fn = fi.get_frequent_items(frequent_items_error_type.NO_FALSE_NEGATIVES)
self.assertGreater(len(items_no_fn), 0)
# finally, serialize and reconstruct
fi_bytes = fi.serialize()
self.assertEqual(len(fi_bytes), fi.get_serialized_size_bytes())
new_fi = frequent_strings_sketch.deserialize(fi_bytes)
# and now interrogate the sketch
self.assertFalse(new_fi.is_empty())
self.assertGreater(new_fi.get_num_active_items(), 0)
self.assertEqual(5 * wt, new_fi.get_total_weight())
def test_fi_sketch(self):
# only testing a few things not used in the above example
k = 12
wt = 10000
fi = frequent_strings_sketch(k)
self.assertAlmostEqual(fi.get_sketch_epsilon(), 0.0008545, delta=1e-6)
sk_apriori_error = fi.get_sketch_epsilon() * wt
reference_apriori_error = frequent_strings_sketch.get_apriori_error(k, wt)
self.assertAlmostEqual(sk_apriori_error, reference_apriori_error, delta=1e-6)
if __name__ == '__main__':
unittest.main()