blob: c14bb03a96ce32e0a111d503d583e21d44a99e5a [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#include <cppunit/TestFixture.h>
#include <cppunit/extensions/HelperMacros.h>
#include <cmath>
#include <cstring>
#include <kll_sketch.hpp>
#include <test_allocator.hpp>
namespace datasketches {
static const double RANK_EPS_FOR_K_200 = 0.0133;
static const double NUMERIC_NOISE_TOLERANCE = 1E-6;
#ifdef TEST_BINARY_INPUT_PATH
static std::string testBinaryInputPath = TEST_BINARY_INPUT_PATH;
#else
static std::string testBinaryInputPath = "test/";
#endif
// typical usage would be just kll_sketch<float> or kll_sketch<std::string>, but here we use test_allocator
typedef kll_sketch<float, std::less<float>, serde<float>, test_allocator<float>> kll_float_sketch;
// let std::string use the default allocator for simplicity, otherwise we need to define "less" and "serde"
typedef kll_sketch<std::string, std::less<std::string>, serde<std::string>, test_allocator<std::string>> kll_string_sketch;
class kll_sketch_test: public CppUnit::TestFixture {
CPPUNIT_TEST_SUITE(kll_sketch_test);
CPPUNIT_TEST(k_limits);
CPPUNIT_TEST(empty);
CPPUNIT_TEST(bad_get_quantile);
CPPUNIT_TEST(one_item);
CPPUNIT_TEST(many_items_exact_mode);
CPPUNIT_TEST(many_items_estimation_mode);
CPPUNIT_TEST(consistency_between_get_rank_and_get_PMF_CDF);
CPPUNIT_TEST(deserialize_from_java);
CPPUNIT_TEST(serialize_deserialize_empty);
CPPUNIT_TEST(serialize_deserialize_one_item);
CPPUNIT_TEST(deserialize_one_item_v1);
CPPUNIT_TEST(serialize_deserialize_stream);
CPPUNIT_TEST(serialize_deserialize_bytes);
CPPUNIT_TEST(floor_of_log2_of_fraction);
CPPUNIT_TEST(out_of_order_split_points_float);
CPPUNIT_TEST(out_of_order_split_points_int);
CPPUNIT_TEST(nan_split_point);
CPPUNIT_TEST(merge);
CPPUNIT_TEST(merge_lower_k);
CPPUNIT_TEST(merge_exact_mode_lower_k);
CPPUNIT_TEST(merge_min_value_from_other);
CPPUNIT_TEST(merge_min_and_max_from_other);
CPPUNIT_TEST(sketch_of_ints);
CPPUNIT_TEST(sketch_of_strings_stream);
CPPUNIT_TEST(sketch_of_strings_bytes);
CPPUNIT_TEST(sketch_of_strings_single_item_bytes);
CPPUNIT_TEST(copy);
CPPUNIT_TEST_SUITE_END();
public:
void setUp() {
test_allocator_total_bytes = 0;
}
void tearDown() {
if (test_allocator_total_bytes != 0) {
CPPUNIT_ASSERT_EQUAL((long long) 0, test_allocator_total_bytes);
}
}
void k_limits() {
kll_float_sketch sketch1(kll_float_sketch::MIN_K); // this should work
kll_float_sketch sketch2(kll_float_sketch::MAX_K); // this should work
CPPUNIT_ASSERT_THROW(new kll_float_sketch(kll_float_sketch::MIN_K - 1), std::invalid_argument);
// MAX_K + 1 makes no sense because k is uint16_t
}
void empty() {
kll_float_sketch sketch;
CPPUNIT_ASSERT(sketch.is_empty());
CPPUNIT_ASSERT(!sketch.is_estimation_mode());
CPPUNIT_ASSERT_EQUAL((uint64_t) 0, sketch.get_n());
CPPUNIT_ASSERT_EQUAL(0u, sketch.get_num_retained());
CPPUNIT_ASSERT(std::isnan(sketch.get_rank(0)));
CPPUNIT_ASSERT(std::isnan(sketch.get_min_value()));
CPPUNIT_ASSERT(std::isnan(sketch.get_max_value()));
CPPUNIT_ASSERT(std::isnan(sketch.get_quantile(0.5)));
const double fractions[3] {0, 0.5, 1};
CPPUNIT_ASSERT(!sketch.get_quantiles(fractions, 3));
const float split_points[1] {0};
CPPUNIT_ASSERT(!sketch.get_PMF(split_points, 1));
CPPUNIT_ASSERT(!sketch.get_CDF(split_points, 1));
int count = 0;
for (auto& it: sketch) {
(void) it; // to suppress "unused" warning
++count;
}
CPPUNIT_ASSERT_EQUAL(0, count);
}
void bad_get_quantile() {
kll_float_sketch sketch;
sketch.update(0); // has to be non-empty to reach the check
CPPUNIT_ASSERT_THROW(sketch.get_quantile(-1), std::invalid_argument);
}
void one_item() {
kll_float_sketch sketch;
sketch.update(1);
CPPUNIT_ASSERT(!sketch.is_empty());
CPPUNIT_ASSERT(!sketch.is_estimation_mode());
CPPUNIT_ASSERT_EQUAL((uint64_t) 1, sketch.get_n());
CPPUNIT_ASSERT_EQUAL(1u, sketch.get_num_retained());
CPPUNIT_ASSERT_EQUAL(0.0, sketch.get_rank(1));
CPPUNIT_ASSERT_EQUAL(1.0, sketch.get_rank(2));
CPPUNIT_ASSERT_EQUAL(1.0f, sketch.get_min_value());
CPPUNIT_ASSERT_EQUAL(1.0f, sketch.get_max_value());
CPPUNIT_ASSERT_EQUAL(1.0f, sketch.get_quantile(0.5));
const double fractions[3] {0, 0.5, 1};
auto quantiles(sketch.get_quantiles(fractions, 3));
CPPUNIT_ASSERT(quantiles);
CPPUNIT_ASSERT_EQUAL(1.0f, quantiles[0]);
CPPUNIT_ASSERT_EQUAL(1.0f, quantiles[1]);
CPPUNIT_ASSERT_EQUAL(1.0f, quantiles[2]);
int count = 0;
for (auto& it: sketch) {
CPPUNIT_ASSERT_EQUAL(1, (int) it.second);
++count;
}
CPPUNIT_ASSERT_EQUAL(1, count);
}
void many_items_exact_mode() {
kll_float_sketch sketch;
const uint32_t n(200);
for (uint32_t i = 0; i < n; i++) {
sketch.update(i);
CPPUNIT_ASSERT_EQUAL((uint64_t) i + 1, sketch.get_n());
}
CPPUNIT_ASSERT(!sketch.is_empty());
CPPUNIT_ASSERT(!sketch.is_estimation_mode());
CPPUNIT_ASSERT_EQUAL(n, sketch.get_num_retained());
CPPUNIT_ASSERT_EQUAL(0.0f, sketch.get_min_value());
CPPUNIT_ASSERT_EQUAL(0.0f, sketch.get_quantile(0));
CPPUNIT_ASSERT_EQUAL((float) n - 1, sketch.get_max_value());
CPPUNIT_ASSERT_EQUAL((float) n - 1, sketch.get_quantile(1));
const double fractions[3] {0, 0.5, 1};
auto quantiles(sketch.get_quantiles(fractions, 3));
CPPUNIT_ASSERT(quantiles);
CPPUNIT_ASSERT_EQUAL(0.0f, quantiles[0]);
CPPUNIT_ASSERT_EQUAL((float) n / 2, quantiles[1]);
CPPUNIT_ASSERT_EQUAL((float) n - 1 , quantiles[2]);
for (uint32_t i = 0; i < n; i++) {
const double trueRank = (double) i / n;
CPPUNIT_ASSERT_EQUAL(trueRank, sketch.get_rank(i));
}
}
void many_items_estimation_mode() {
kll_float_sketch sketch;
const int n(1000000);
for (int i = 0; i < n; i++) {
sketch.update(i);
CPPUNIT_ASSERT_EQUAL((uint64_t) i + 1, sketch.get_n());
}
CPPUNIT_ASSERT(!sketch.is_empty());
CPPUNIT_ASSERT(sketch.is_estimation_mode());
CPPUNIT_ASSERT_EQUAL(0.0f, sketch.get_min_value()); // min value is exact
CPPUNIT_ASSERT_EQUAL(0.0f, sketch.get_quantile(0)); // min value is exact
CPPUNIT_ASSERT_EQUAL((float) n - 1, sketch.get_max_value()); // max value is exact
CPPUNIT_ASSERT_EQUAL((float) n - 1, sketch.get_quantile(1)); // max value is exact
// test rank
for (int i = 0; i < n; i++) {
const double trueRank = (double) i / n;
CPPUNIT_ASSERT_DOUBLES_EQUAL(trueRank, sketch.get_rank(i), RANK_EPS_FOR_K_200);
}
// test quantiles at every 0.1 percentage point
double fractions[1001];
double reverse_fractions[1001]; // check that ordering does not matter
for (int i = 0; i < 1001; i++) {
fractions[i] = (double) i / 1000;
reverse_fractions[1000 - i] = fractions[i];
}
auto quantiles = sketch.get_quantiles(fractions, 1001);
auto reverse_quantiles = sketch.get_quantiles(reverse_fractions, 1001);
float previous_quantile(0);
for (int i = 0; i < 1001; i++) {
// expensive in a loop, just to check the equivalence here, not advised for real code
const float quantile = sketch.get_quantile(fractions[i]);
CPPUNIT_ASSERT_EQUAL(quantile, quantiles[i]);
CPPUNIT_ASSERT_EQUAL(quantile, reverse_quantiles[1000 - i]);
CPPUNIT_ASSERT(previous_quantile <= quantile);
previous_quantile = quantile;
}
//sketch.to_stream(std::cout);
}
void consistency_between_get_rank_and_get_PMF_CDF() {
kll_float_sketch sketch;
const int n = 1000;
float values[n];
for (int i = 0; i < n; i++) {
sketch.update(i);
values[i] = i;
}
const auto ranks(sketch.get_CDF(values, n));
const auto pmf(sketch.get_PMF(values, n));
double subtotal_pmf(0);
for (int i = 0; i < n; i++) {
CPPUNIT_ASSERT_EQUAL_MESSAGE("rank vs CDF for value " + std::to_string(i), ranks[i], sketch.get_rank(values[i]));
subtotal_pmf += pmf[i];
CPPUNIT_ASSERT_DOUBLES_EQUAL_MESSAGE("CDF vs PMF for value " + std::to_string(i), ranks[i], subtotal_pmf, NUMERIC_NOISE_TOLERANCE);
}
}
void deserialize_from_java() {
std::ifstream is;
is.exceptions(std::ios::failbit | std::ios::badbit);
is.open(testBinaryInputPath + "kll_sketch_from_java.bin", std::ios::binary);
auto sketch = kll_float_sketch::deserialize(is);
CPPUNIT_ASSERT(!sketch.is_empty());
CPPUNIT_ASSERT(sketch.is_estimation_mode());
CPPUNIT_ASSERT_EQUAL((uint64_t) 1000000, sketch.get_n());
CPPUNIT_ASSERT_EQUAL(614u, sketch.get_num_retained());
CPPUNIT_ASSERT_EQUAL(0.0f, sketch.get_min_value());
CPPUNIT_ASSERT_EQUAL(999999.0f, sketch.get_max_value());
}
void serialize_deserialize_empty() {
kll_float_sketch sketch;
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
sketch.serialize(s);
CPPUNIT_ASSERT_EQUAL(sketch.get_serialized_size_bytes(), (uint32_t) s.tellp());
auto sketch2 = kll_float_sketch::deserialize(s);
CPPUNIT_ASSERT_EQUAL(sketch2.get_serialized_size_bytes(), (uint32_t) s.tellg());
CPPUNIT_ASSERT_EQUAL(sketch.is_empty(), sketch2.is_empty());
CPPUNIT_ASSERT_EQUAL(sketch.is_estimation_mode(), sketch2.is_estimation_mode());
CPPUNIT_ASSERT_EQUAL(sketch.get_n(), sketch2.get_n());
CPPUNIT_ASSERT_EQUAL(sketch.get_num_retained(), sketch2.get_num_retained());
CPPUNIT_ASSERT(std::isnan(sketch2.get_min_value()));
CPPUNIT_ASSERT(std::isnan(sketch2.get_max_value()));
CPPUNIT_ASSERT_EQUAL(sketch.get_normalized_rank_error(false), sketch2.get_normalized_rank_error(false));
CPPUNIT_ASSERT_EQUAL(sketch.get_normalized_rank_error(true), sketch2.get_normalized_rank_error(true));
}
void serialize_deserialize_one_item() {
kll_float_sketch sketch;
sketch.update(1);
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
sketch.serialize(s);
CPPUNIT_ASSERT_EQUAL(sketch.get_serialized_size_bytes(), (uint32_t) s.tellp());
auto sketch2 = kll_float_sketch::deserialize(s);
CPPUNIT_ASSERT_EQUAL(sketch2.get_serialized_size_bytes(), (uint32_t) s.tellg());
CPPUNIT_ASSERT_EQUAL(s.tellp(), s.tellg());
CPPUNIT_ASSERT(!sketch2.is_empty());
CPPUNIT_ASSERT(!sketch2.is_estimation_mode());
CPPUNIT_ASSERT_EQUAL((uint64_t) 1, sketch2.get_n());
CPPUNIT_ASSERT_EQUAL(1u, sketch2.get_num_retained());
CPPUNIT_ASSERT_EQUAL(1.0f, sketch2.get_min_value());
CPPUNIT_ASSERT_EQUAL(1.0f, sketch2.get_max_value());
CPPUNIT_ASSERT_EQUAL(1.0f, sketch2.get_quantile(0.5));
CPPUNIT_ASSERT_EQUAL(0.0, sketch2.get_rank(1));
CPPUNIT_ASSERT_EQUAL(1.0, sketch2.get_rank(2));
}
void deserialize_one_item_v1() {
std::ifstream is;
is.exceptions(std::ios::failbit | std::ios::badbit);
is.open(testBinaryInputPath + "kll_sketch_float_one_item_v1.bin", std::ios::binary);
auto sketch = kll_float_sketch::deserialize(is);
CPPUNIT_ASSERT(!sketch.is_empty());
CPPUNIT_ASSERT(!sketch.is_estimation_mode());
CPPUNIT_ASSERT_EQUAL((uint64_t) 1, sketch.get_n());
CPPUNIT_ASSERT_EQUAL(1u, sketch.get_num_retained());
CPPUNIT_ASSERT_EQUAL(1.0f, sketch.get_min_value());
CPPUNIT_ASSERT_EQUAL(1.0f, sketch.get_max_value());
}
void serialize_deserialize_stream() {
kll_float_sketch sketch;
const int n(1000);
for (int i = 0; i < n; i++) sketch.update(i);
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
sketch.serialize(s);
CPPUNIT_ASSERT_EQUAL(sketch.get_serialized_size_bytes(), (uint32_t) s.tellp());
auto sketch2 = kll_float_sketch::deserialize(s);
CPPUNIT_ASSERT_EQUAL(sketch2.get_serialized_size_bytes(), (uint32_t) s.tellg());
CPPUNIT_ASSERT_EQUAL(s.tellp(), s.tellg());
CPPUNIT_ASSERT_EQUAL(sketch.is_empty(), sketch2.is_empty());
CPPUNIT_ASSERT_EQUAL(sketch.is_estimation_mode(), sketch2.is_estimation_mode());
CPPUNIT_ASSERT_EQUAL(sketch.get_n(), sketch2.get_n());
CPPUNIT_ASSERT_EQUAL(sketch.get_num_retained(), sketch2.get_num_retained());
CPPUNIT_ASSERT_EQUAL(sketch.get_min_value(), sketch2.get_min_value());
CPPUNIT_ASSERT_EQUAL(sketch.get_max_value(), sketch2.get_max_value());
CPPUNIT_ASSERT_EQUAL(sketch.get_normalized_rank_error(false), sketch2.get_normalized_rank_error(false));
CPPUNIT_ASSERT_EQUAL(sketch.get_normalized_rank_error(true), sketch2.get_normalized_rank_error(true));
CPPUNIT_ASSERT_EQUAL(sketch.get_quantile(0.5), sketch2.get_quantile(0.5));
CPPUNIT_ASSERT_EQUAL(sketch.get_rank(0), sketch2.get_rank(0));
CPPUNIT_ASSERT_EQUAL(sketch.get_rank(n), sketch2.get_rank(n));
}
void serialize_deserialize_bytes() {
kll_float_sketch sketch;
const int n(1000);
for (int i = 0; i < n; i++) sketch.update(i);
auto data = sketch.serialize();
CPPUNIT_ASSERT_EQUAL(sketch.get_serialized_size_bytes(), (uint32_t) data.second);
auto sketch2 = kll_float_sketch::deserialize(data.first.get(), data.second);
CPPUNIT_ASSERT_EQUAL(sketch2.get_serialized_size_bytes(), (uint32_t) data.second);
CPPUNIT_ASSERT_EQUAL(sketch.is_empty(), sketch2.is_empty());
CPPUNIT_ASSERT_EQUAL(sketch.is_estimation_mode(), sketch2.is_estimation_mode());
CPPUNIT_ASSERT_EQUAL(sketch.get_n(), sketch2.get_n());
CPPUNIT_ASSERT_EQUAL(sketch.get_num_retained(), sketch2.get_num_retained());
CPPUNIT_ASSERT_EQUAL(sketch.get_min_value(), sketch2.get_min_value());
CPPUNIT_ASSERT_EQUAL(sketch.get_max_value(), sketch2.get_max_value());
CPPUNIT_ASSERT_EQUAL(sketch.get_normalized_rank_error(false), sketch2.get_normalized_rank_error(false));
CPPUNIT_ASSERT_EQUAL(sketch.get_normalized_rank_error(true), sketch2.get_normalized_rank_error(true));
CPPUNIT_ASSERT_EQUAL(sketch.get_quantile(0.5), sketch2.get_quantile(0.5));
CPPUNIT_ASSERT_EQUAL(sketch.get_rank(0), sketch2.get_rank(0));
CPPUNIT_ASSERT_EQUAL(sketch.get_rank(n), sketch2.get_rank(n));
}
void floor_of_log2_of_fraction() {
CPPUNIT_ASSERT_EQUAL((uint8_t) 0, kll_helper::floor_of_log2_of_fraction(0, 1));
CPPUNIT_ASSERT_EQUAL((uint8_t) 0, kll_helper::floor_of_log2_of_fraction(1, 2));
CPPUNIT_ASSERT_EQUAL((uint8_t) 0, kll_helper::floor_of_log2_of_fraction(2, 2));
CPPUNIT_ASSERT_EQUAL((uint8_t) 0, kll_helper::floor_of_log2_of_fraction(3, 2));
CPPUNIT_ASSERT_EQUAL((uint8_t) 1, kll_helper::floor_of_log2_of_fraction(4, 2));
CPPUNIT_ASSERT_EQUAL((uint8_t) 1, kll_helper::floor_of_log2_of_fraction(5, 2));
CPPUNIT_ASSERT_EQUAL((uint8_t) 1, kll_helper::floor_of_log2_of_fraction(6, 2));
CPPUNIT_ASSERT_EQUAL((uint8_t) 1, kll_helper::floor_of_log2_of_fraction(7, 2));
CPPUNIT_ASSERT_EQUAL((uint8_t) 2, kll_helper::floor_of_log2_of_fraction(8, 2));
}
void out_of_order_split_points_float() {
kll_float_sketch sketch;
sketch.update(0); // has too be non-empty to reach the check
float split_points[2] = {1, 0};
CPPUNIT_ASSERT_THROW(sketch.get_CDF(split_points, 2), std::invalid_argument);
}
void out_of_order_split_points_int() {
kll_sketch<int> sketch;
sketch.update(0); // has too be non-empty to reach the check
int split_points[2] = {1, 0};
CPPUNIT_ASSERT_THROW(sketch.get_CDF(split_points, 2), std::invalid_argument);
}
void nan_split_point() {
kll_float_sketch sketch;
sketch.update(0); // has too be non-empty to reach the check
float split_points[1] = {std::numeric_limits<float>::quiet_NaN()};
CPPUNIT_ASSERT_THROW(sketch.get_CDF(split_points, 1), std::invalid_argument);
}
void merge() {
kll_float_sketch sketch1;
kll_float_sketch sketch2;
const int n = 10000;
for (int i = 0; i < n; i++) {
sketch1.update(i);
sketch2.update((2 * n) - i - 1);
}
CPPUNIT_ASSERT_EQUAL(0.0f, sketch1.get_min_value());
CPPUNIT_ASSERT_EQUAL((float) n - 1, sketch1.get_max_value());
CPPUNIT_ASSERT_EQUAL((float) n, sketch2.get_min_value());
CPPUNIT_ASSERT_EQUAL(2.0f * n - 1, sketch2.get_max_value());
sketch1.merge(sketch2);
CPPUNIT_ASSERT(!sketch1.is_empty());
CPPUNIT_ASSERT_EQUAL((uint64_t) 2 * n, sketch1.get_n());
CPPUNIT_ASSERT_EQUAL(0.0f, sketch1.get_min_value());
CPPUNIT_ASSERT_EQUAL(2.0f * n - 1, sketch1.get_max_value());
CPPUNIT_ASSERT_DOUBLES_EQUAL(n, sketch1.get_quantile(0.5), n * RANK_EPS_FOR_K_200);
}
void merge_lower_k() {
kll_float_sketch sketch1(256);
kll_float_sketch sketch2(128);
const int n = 10000;
for (int i = 0; i < n; i++) {
sketch1.update(i);
sketch2.update((2 * n) - i - 1);
}
CPPUNIT_ASSERT_EQUAL(0.0f, sketch1.get_min_value());
CPPUNIT_ASSERT_EQUAL((float) n - 1, sketch1.get_max_value());
CPPUNIT_ASSERT_EQUAL((float) n, sketch2.get_min_value());
CPPUNIT_ASSERT_EQUAL(2.0f * n - 1, sketch2.get_max_value());
CPPUNIT_ASSERT(sketch1.get_normalized_rank_error(false) < sketch2.get_normalized_rank_error(false));
CPPUNIT_ASSERT(sketch1.get_normalized_rank_error(true) < sketch2.get_normalized_rank_error(true));
sketch1.merge(sketch2);
// sketch1 must get "contaminated" by the lower K in sketch2
CPPUNIT_ASSERT_EQUAL(sketch1.get_normalized_rank_error(false), sketch2.get_normalized_rank_error(false));
CPPUNIT_ASSERT_EQUAL(sketch1.get_normalized_rank_error(true), sketch2.get_normalized_rank_error(true));
CPPUNIT_ASSERT(!sketch1.is_empty());
CPPUNIT_ASSERT_EQUAL((uint64_t) 2 * n, sketch1.get_n());
CPPUNIT_ASSERT_EQUAL(0.0f, sketch1.get_min_value());
CPPUNIT_ASSERT_EQUAL(2.0f * n - 1, sketch1.get_max_value());
CPPUNIT_ASSERT_DOUBLES_EQUAL(n, sketch1.get_quantile(0.5), n * RANK_EPS_FOR_K_200);
}
void merge_exact_mode_lower_k() {
kll_float_sketch sketch1(256);
kll_float_sketch sketch2(128);
const int n = 10000;
for (int i = 0; i < n; i++) {
sketch1.update(i);
}
// rank error should not be affected by a merge with an empty sketch with lower k
const double rank_error_before_merge = sketch1.get_normalized_rank_error(true);
sketch1.merge(sketch2);
CPPUNIT_ASSERT_EQUAL(rank_error_before_merge, sketch1.get_normalized_rank_error(true));
CPPUNIT_ASSERT(!sketch1.is_empty());
CPPUNIT_ASSERT_EQUAL((uint64_t) n, sketch1.get_n());
CPPUNIT_ASSERT_EQUAL(0.0f, sketch1.get_min_value());
CPPUNIT_ASSERT_EQUAL((float) n - 1, sketch1.get_max_value());
CPPUNIT_ASSERT_DOUBLES_EQUAL(n / 2, sketch1.get_quantile(0.5), n / 2 * RANK_EPS_FOR_K_200);
sketch2.update(0);
sketch1.merge(sketch2);
// rank error should not be affected by a merge with a sketch in exact mode with lower k
CPPUNIT_ASSERT_EQUAL(rank_error_before_merge, sketch1.get_normalized_rank_error(true));
}
void merge_min_value_from_other() {
kll_float_sketch sketch1;
kll_float_sketch sketch2;
sketch1.update(1);
sketch2.update(2);
sketch2.merge(sketch1);
CPPUNIT_ASSERT_EQUAL(1.0f, sketch2.get_min_value());
CPPUNIT_ASSERT_EQUAL(2.0f, sketch2.get_max_value());
}
void merge_min_and_max_from_other() {
kll_float_sketch sketch1;
for (int i = 0; i < 1000000; i++) sketch1.update(i);
kll_float_sketch sketch2;
sketch2.merge(sketch1);
CPPUNIT_ASSERT_EQUAL(0.0f, sketch2.get_min_value());
CPPUNIT_ASSERT_EQUAL(999999.0f, sketch2.get_max_value());
}
void sketch_of_ints() {
kll_sketch<int> sketch;
CPPUNIT_ASSERT_THROW(sketch.get_quantile(0), std::runtime_error);
CPPUNIT_ASSERT_THROW(sketch.get_min_value(), std::runtime_error);
CPPUNIT_ASSERT_THROW(sketch.get_max_value(), std::runtime_error);
const int n(1000);
for (int i = 0; i < n; i++) sketch.update(i);
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
sketch.serialize(s);
CPPUNIT_ASSERT_EQUAL(sketch.get_serialized_size_bytes(), (uint32_t) s.tellp());
auto sketch2 = kll_sketch<int>::deserialize(s);
CPPUNIT_ASSERT_EQUAL(sketch2.get_serialized_size_bytes(), (uint32_t) s.tellg());
CPPUNIT_ASSERT_EQUAL(s.tellp(), s.tellg());
CPPUNIT_ASSERT_EQUAL(sketch.is_empty(), sketch2.is_empty());
CPPUNIT_ASSERT_EQUAL(sketch.is_estimation_mode(), sketch2.is_estimation_mode());
CPPUNIT_ASSERT_EQUAL(sketch.get_n(), sketch2.get_n());
CPPUNIT_ASSERT_EQUAL(sketch.get_num_retained(), sketch2.get_num_retained());
CPPUNIT_ASSERT_EQUAL(sketch.get_min_value(), sketch2.get_min_value());
CPPUNIT_ASSERT_EQUAL(sketch.get_max_value(), sketch2.get_max_value());
CPPUNIT_ASSERT_EQUAL(sketch.get_normalized_rank_error(false), sketch2.get_normalized_rank_error(false));
CPPUNIT_ASSERT_EQUAL(sketch.get_normalized_rank_error(true), sketch2.get_normalized_rank_error(true));
CPPUNIT_ASSERT_EQUAL(sketch.get_quantile(0.5), sketch2.get_quantile(0.5));
CPPUNIT_ASSERT_EQUAL(sketch.get_rank(0), sketch2.get_rank(0));
CPPUNIT_ASSERT_EQUAL(sketch.get_rank(n), sketch2.get_rank(n));
}
void sketch_of_strings_stream() {
kll_string_sketch sketch1;
CPPUNIT_ASSERT_THROW(sketch1.get_quantile(0), std::runtime_error);
CPPUNIT_ASSERT_THROW(sketch1.get_min_value(), std::runtime_error);
CPPUNIT_ASSERT_THROW(sketch1.get_max_value(), std::runtime_error);
CPPUNIT_ASSERT_EQUAL(8u, sketch1.get_serialized_size_bytes());
const int n = 1000;
for (int i = 0; i < n; i++) sketch1.update(std::to_string(i));
CPPUNIT_ASSERT_EQUAL(std::string("0"), sketch1.get_min_value());
CPPUNIT_ASSERT_EQUAL(std::string("999"), sketch1.get_max_value());
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
sketch1.serialize(s);
CPPUNIT_ASSERT_EQUAL(sketch1.get_serialized_size_bytes(), (uint32_t) s.tellp());
auto sketch2 = kll_string_sketch::deserialize(s);
CPPUNIT_ASSERT_EQUAL(sketch2.get_serialized_size_bytes(), (uint32_t) s.tellg());
CPPUNIT_ASSERT_EQUAL(s.tellp(), s.tellg());
CPPUNIT_ASSERT_EQUAL(sketch1.is_empty(), sketch2.is_empty());
CPPUNIT_ASSERT_EQUAL(sketch1.is_estimation_mode(), sketch2.is_estimation_mode());
CPPUNIT_ASSERT_EQUAL(sketch1.get_n(), sketch2.get_n());
CPPUNIT_ASSERT_EQUAL(sketch1.get_num_retained(), sketch2.get_num_retained());
CPPUNIT_ASSERT_EQUAL(sketch1.get_min_value(), sketch2.get_min_value());
CPPUNIT_ASSERT_EQUAL(sketch1.get_max_value(), sketch2.get_max_value());
CPPUNIT_ASSERT_EQUAL(sketch1.get_normalized_rank_error(false), sketch2.get_normalized_rank_error(false));
CPPUNIT_ASSERT_EQUAL(sketch1.get_normalized_rank_error(true), sketch2.get_normalized_rank_error(true));
CPPUNIT_ASSERT_EQUAL(sketch1.get_quantile(0.5), sketch2.get_quantile(0.5));
CPPUNIT_ASSERT_EQUAL(sketch1.get_rank(std::to_string(0)), sketch2.get_rank(std::to_string(0)));
CPPUNIT_ASSERT_EQUAL(sketch1.get_rank(std::to_string(n)), sketch2.get_rank(std::to_string(n)));
// to take a look using hexdump
//std::ofstream os("kll-string.bin");
//sketch1.serialize(os);
// debug print
//sketch1.to_stream(std::cout);
}
void sketch_of_strings_bytes() {
kll_string_sketch sketch1;
CPPUNIT_ASSERT_THROW(sketch1.get_quantile(0), std::runtime_error);
CPPUNIT_ASSERT_THROW(sketch1.get_min_value(), std::runtime_error);
CPPUNIT_ASSERT_THROW(sketch1.get_max_value(), std::runtime_error);
CPPUNIT_ASSERT_EQUAL(8u, sketch1.get_serialized_size_bytes());
const int n = 1000;
for (int i = 0; i < n; i++) sketch1.update(std::to_string(i));
CPPUNIT_ASSERT_EQUAL(std::string("0"), sketch1.get_min_value());
CPPUNIT_ASSERT_EQUAL(std::string("999"), sketch1.get_max_value());
auto data = sketch1.serialize();
CPPUNIT_ASSERT_EQUAL((size_t) sketch1.get_serialized_size_bytes(), data.second);
auto sketch2 = kll_string_sketch::deserialize(data.first.get(), data.second);
CPPUNIT_ASSERT_EQUAL(sketch2.get_serialized_size_bytes(), (uint32_t) data.second);
CPPUNIT_ASSERT_EQUAL(sketch1.is_empty(), sketch2.is_empty());
CPPUNIT_ASSERT_EQUAL(sketch1.is_estimation_mode(), sketch2.is_estimation_mode());
CPPUNIT_ASSERT_EQUAL(sketch1.get_n(), sketch2.get_n());
CPPUNIT_ASSERT_EQUAL(sketch1.get_num_retained(), sketch2.get_num_retained());
CPPUNIT_ASSERT_EQUAL(sketch1.get_min_value(), sketch2.get_min_value());
CPPUNIT_ASSERT_EQUAL(sketch1.get_max_value(), sketch2.get_max_value());
CPPUNIT_ASSERT_EQUAL(sketch1.get_normalized_rank_error(false), sketch2.get_normalized_rank_error(false));
CPPUNIT_ASSERT_EQUAL(sketch1.get_normalized_rank_error(true), sketch2.get_normalized_rank_error(true));
CPPUNIT_ASSERT_EQUAL(sketch1.get_quantile(0.5), sketch2.get_quantile(0.5));
CPPUNIT_ASSERT_EQUAL(sketch1.get_rank(std::to_string(0)), sketch2.get_rank(std::to_string(0)));
CPPUNIT_ASSERT_EQUAL(sketch1.get_rank(std::to_string(n)), sketch2.get_rank(std::to_string(n)));
}
void sketch_of_strings_single_item_bytes() {
kll_string_sketch sketch1;
sketch1.update("a");
auto data = sketch1.serialize();
CPPUNIT_ASSERT_EQUAL((size_t) sketch1.get_serialized_size_bytes(), data.second);
auto sketch2 = kll_string_sketch::deserialize(data.first.get(), data.second);
CPPUNIT_ASSERT_EQUAL(sketch2.get_serialized_size_bytes(), (uint32_t) data.second);
}
void copy() {
kll_sketch<int> sketch1;
const int n(1000);
for (int i = 0; i < n; i++) sketch1.update(i);
// copy constructor
kll_sketch<int> sketch2(sketch1);
for (int i = 0; i < n; i++) {
CPPUNIT_ASSERT_EQUAL(sketch1.get_rank(i), sketch2.get_rank(i));
}
// copy assignment
kll_sketch<int> sketch3;
sketch3 = sketch1;
for (int i = 0; i < n; i++) {
CPPUNIT_ASSERT_EQUAL(sketch1.get_rank(i), sketch3.get_rank(i));
}
}
};
CPPUNIT_TEST_SUITE_REGISTRATION(kll_sketch_test);
} /* namespace datasketches */