python/tests/req_test.py - datasketches-cpp - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 import unittest
 from datasketches import req_ints_sketch, req_floats_sketch
 import numpy as np

 class reqTest(unittest.TestCase):
     """Unit tests for req_ints_sketch and req_floats_sketch from datasketches."""

     def test_req_example(self):
         """Walk a float sketch through update, query, merge and serialization."""
         k = 12
         n = 2 ** 20

         # create a sketch and inject ~1 million N(0,1) points as an array and as a single item
         req = req_floats_sketch(k, True)  # high rank accuracy
         req.update(np.random.normal(size=n - 1))
         req.update(0.0)

         # 0 should be near the median
         self.assertAlmostEqual(0.5, req.get_rank(0.0), delta=0.03)

         # the median should be near 0
         self.assertAlmostEqual(0.0, req.get_quantile(0.5), delta=0.03)

         # we also track the min/max independently from the rest of the data
         # which lets us know the full observed data range
         self.assertLessEqual(req.get_min_value(), req.get_quantile(0.01))
         self.assertLessEqual(0.0, req.get_rank(req.get_min_value()))
         self.assertGreaterEqual(req.get_max_value(), req.get_quantile(0.99))
         self.assertGreaterEqual(1.0, req.get_rank(req.get_max_value()))

         # we can also extract a list of values at a time,
         # here the values should give us something close to [-2, -1, 0, 1, 2].
         # then get the CDF, which will return something close to
         # the original values used in get_quantiles()
         # finally, can check the normalized rank error bound
         pts = req.get_quantiles([0.0228, 0.1587, 0.5, 0.8413, 0.9772])
         cdf = req.get_cdf(pts)  # include 1.0 at end to account for all probability mass
         self.assertEqual(len(cdf), len(pts) + 1)

         # For relative error quantiles, the error depends on the actual rank
         # so we need to use that to determine the bounds
         est = req.get_rank(0.999, True)
         lb = req.get_rank_lower_bound(est, 1)
         ub = req.get_rank_upper_bound(est, 1)
         self.assertLessEqual(lb, est)
         self.assertLessEqual(est, ub)

         # and a few basic queries about the sketch
         self.assertFalse(req.is_empty())
         self.assertTrue(req.is_estimation_mode())
         self.assertEqual(req.get_n(), n)
         self.assertLess(req.get_num_retained(), n)
         self.assertEqual(req.get_k(), k)

         # merging itself will double the number of items the sketch has seen
         req.merge(req)
         self.assertEqual(req.get_n(), 2 * n)

         # we can then serialize and reconstruct the sketch;
         # deserialize is invoked on the class, as in test_req_ints_sketch
         req_bytes = req.serialize()
         new_req = req_floats_sketch.deserialize(req_bytes)
         self.assertEqual(req.get_num_retained(), new_req.get_num_retained())
         self.assertEqual(req.get_min_value(), new_req.get_min_value())
         self.assertEqual(req.get_max_value(), new_req.get_max_value())
         self.assertEqual(req.get_quantile(0.7), new_req.get_quantile(0.7))
         self.assertEqual(req.get_rank(0.0), new_req.get_rank(0.0))

     def test_req_ints_sketch(self):
         """Check the basic query, merge and serialization calls on an int sketch."""
         k = 100
         n = 10
         req = req_ints_sketch(k)
         for i in range(n):
             req.update(i)

         self.assertEqual(req.get_min_value(), 0)
         self.assertEqual(req.get_max_value(), n - 1)
         self.assertEqual(req.get_n(), n)
         self.assertFalse(req.is_empty())
         self.assertFalse(req.is_estimation_mode())  # n < k
         self.assertEqual(req.get_k(), k)

         # a single split point is used for all of the queries below
         mid = round(n / 2)

         pmf = req.get_pmf([mid])
         self.assertIsNotNone(pmf)
         self.assertEqual(len(pmf), 2)

         cdf = req.get_cdf([mid])
         self.assertIsNotNone(cdf)
         self.assertEqual(len(cdf), 2)

         self.assertEqual(req.get_quantile(0.5), mid)
         quants = req.get_quantiles([0.25, 0.5, 0.75])
         self.assertIsNotNone(quants)
         self.assertEqual(len(quants), 3)

         self.assertEqual(req.get_rank(mid), 0.5)

         # merge self
         req.merge(req)
         self.assertEqual(req.get_n(), 2 * n)

         # assertIsInstance reports the actual type if the check fails
         sk_bytes = req.serialize()
         self.assertIsInstance(req_ints_sketch.deserialize(sk_bytes), req_ints_sketch)

     def test_req_floats_sketch(self):
         """Confirm that a low-rank-accuracy float sketch can be constructed."""
         # already tested ints and it's templatized, so just make sure it instantiates properly
         k = 75
         req = req_floats_sketch(k, False)  # low rank accuracy
         self.assertTrue(req.is_empty())
         self.assertFalse(req.is_hra())

 if __name__ == "__main__":
     # Executed as a script: hand control to unittest's runner.
     unittest.main()
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	import unittest
	from datasketches import req_ints_sketch, req_floats_sketch
	import numpy as np

	class reqTest(unittest.TestCase):
	    """Unit tests for req_ints_sketch and req_floats_sketch from datasketches."""

	    def test_req_example(self):
	        """Walk a float sketch through update, query, merge and serialization."""
	        k = 12
	        n = 2 ** 20

	        # create a sketch and inject ~1 million N(0,1) points as an array and as a single item
	        req = req_floats_sketch(k, True)  # high rank accuracy
	        req.update(np.random.normal(size=n - 1))
	        req.update(0.0)

	        # 0 should be near the median
	        self.assertAlmostEqual(0.5, req.get_rank(0.0), delta=0.03)

	        # the median should be near 0
	        self.assertAlmostEqual(0.0, req.get_quantile(0.5), delta=0.03)

	        # we also track the min/max independently from the rest of the data
	        # which lets us know the full observed data range
	        self.assertLessEqual(req.get_min_value(), req.get_quantile(0.01))
	        self.assertLessEqual(0.0, req.get_rank(req.get_min_value()))
	        self.assertGreaterEqual(req.get_max_value(), req.get_quantile(0.99))
	        self.assertGreaterEqual(1.0, req.get_rank(req.get_max_value()))

	        # we can also extract a list of values at a time,
	        # here the values should give us something close to [-2, -1, 0, 1, 2].
	        # then get the CDF, which will return something close to
	        # the original values used in get_quantiles()
	        # finally, can check the normalized rank error bound
	        pts = req.get_quantiles([0.0228, 0.1587, 0.5, 0.8413, 0.9772])
	        cdf = req.get_cdf(pts)  # include 1.0 at end to account for all probability mass
	        self.assertEqual(len(cdf), len(pts) + 1)

	        # For relative error quantiles, the error depends on the actual rank
	        # so we need to use that to determine the bounds
	        est = req.get_rank(0.999, True)
	        lb = req.get_rank_lower_bound(est, 1)
	        ub = req.get_rank_upper_bound(est, 1)
	        self.assertLessEqual(lb, est)
	        self.assertLessEqual(est, ub)

	        # and a few basic queries about the sketch
	        self.assertFalse(req.is_empty())
	        self.assertTrue(req.is_estimation_mode())
	        self.assertEqual(req.get_n(), n)
	        self.assertLess(req.get_num_retained(), n)
	        self.assertEqual(req.get_k(), k)

	        # merging itself will double the number of items the sketch has seen
	        req.merge(req)
	        self.assertEqual(req.get_n(), 2 * n)

	        # we can then serialize and reconstruct the sketch;
	        # deserialize is invoked on the class, as in test_req_ints_sketch
	        req_bytes = req.serialize()
	        new_req = req_floats_sketch.deserialize(req_bytes)
	        self.assertEqual(req.get_num_retained(), new_req.get_num_retained())
	        self.assertEqual(req.get_min_value(), new_req.get_min_value())
	        self.assertEqual(req.get_max_value(), new_req.get_max_value())
	        self.assertEqual(req.get_quantile(0.7), new_req.get_quantile(0.7))
	        self.assertEqual(req.get_rank(0.0), new_req.get_rank(0.0))

	    def test_req_ints_sketch(self):
	        """Check the basic query, merge and serialization calls on an int sketch."""
	        k = 100
	        n = 10
	        req = req_ints_sketch(k)
	        for i in range(n):
	            req.update(i)

	        self.assertEqual(req.get_min_value(), 0)
	        self.assertEqual(req.get_max_value(), n - 1)
	        self.assertEqual(req.get_n(), n)
	        self.assertFalse(req.is_empty())
	        self.assertFalse(req.is_estimation_mode())  # n < k
	        self.assertEqual(req.get_k(), k)

	        # a single split point is used for all of the queries below
	        mid = round(n / 2)

	        pmf = req.get_pmf([mid])
	        self.assertIsNotNone(pmf)
	        self.assertEqual(len(pmf), 2)

	        cdf = req.get_cdf([mid])
	        self.assertIsNotNone(cdf)
	        self.assertEqual(len(cdf), 2)

	        self.assertEqual(req.get_quantile(0.5), mid)
	        quants = req.get_quantiles([0.25, 0.5, 0.75])
	        self.assertIsNotNone(quants)
	        self.assertEqual(len(quants), 3)

	        self.assertEqual(req.get_rank(mid), 0.5)

	        # merge self
	        req.merge(req)
	        self.assertEqual(req.get_n(), 2 * n)

	        # assertIsInstance reports the actual type if the check fails
	        sk_bytes = req.serialize()
	        self.assertIsInstance(req_ints_sketch.deserialize(sk_bytes), req_ints_sketch)

	    def test_req_floats_sketch(self):
	        """Confirm that a low-rank-accuracy float sketch can be constructed."""
	        # already tested ints and it's templatized, so just make sure it instantiates properly
	        k = 75
	        req = req_floats_sketch(k, False)  # low rank accuracy
	        self.assertTrue(req.is_empty())
	        self.assertFalse(req.is_hra())

	if __name__ == '__main__':
	    # Executed as a script: hand control to unittest's runner.
	    unittest.main()