blob: b8d0b03167e5fe914fb2a7f7ae938b55d28f7195 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#include <catch2/catch.hpp>
#include <cmath>
#include <cstring>
#include <sstream>
#include <fstream>
#include <stdexcept>
#include <kll_sketch.hpp>
#include <test_allocator.hpp>
namespace datasketches {
static const double RANK_EPS_FOR_K_200 = 0.0133;
static const double NUMERIC_NOISE_TOLERANCE = 1E-6;
#ifdef TEST_BINARY_INPUT_PATH
static std::string testBinaryInputPath = TEST_BINARY_INPUT_PATH;
#else
static std::string testBinaryInputPath = "test/";
#endif
// typical usage would be just kll_sketch<float> or kll_sketch<std::string>, but here we use test_allocator
using kll_float_sketch = kll_sketch<float, std::less<float>, test_allocator<float>>;
// let std::string use the default allocator for simplicity, otherwise we need to define "less" and "serde"
using kll_string_sketch = kll_sketch<std::string, std::less<std::string>, test_allocator<std::string>>;
TEST_CASE("kll sketch", "[kll_sketch]") {
// setup
test_allocator_total_bytes = 0;
SECTION("k limits") {
kll_float_sketch sketch1(kll_constants::MIN_K, std::less<float>(), 0); // this should work
kll_float_sketch sketch2(kll_constants::MAX_K, std::less<float>(), 0); // this should work
REQUIRE_THROWS_AS(new kll_float_sketch(kll_constants::MIN_K - 1, std::less<float>(), 0), std::invalid_argument);
// MAX_K + 1 makes no sense because k is uint16_t
//std::cout << "sizeof(kll_sketch<float>)=" << sizeof(kll_sketch<float>) << "\n";
//std::cout << "sizeof(kll_sketch<double>)=" << sizeof(kll_sketch<double>) << "\n";
}
SECTION("empty") {
kll_float_sketch sketch(200, std::less<float>(), 0);
REQUIRE(sketch.is_empty());
REQUIRE_FALSE(sketch.is_estimation_mode());
REQUIRE(sketch.get_n() == 0);
REQUIRE(sketch.get_num_retained() == 0);
REQUIRE_THROWS_AS(sketch.get_min_item(), std::runtime_error);
REQUIRE_THROWS_AS(sketch.get_max_item(), std::runtime_error);
REQUIRE_THROWS_AS(sketch.get_rank(0), std::runtime_error);
REQUIRE_THROWS_AS(sketch.get_quantile(0.5), std::runtime_error);
const float split_points[1] {0};
REQUIRE_THROWS_AS(sketch.get_PMF(split_points, 1), std::runtime_error);
REQUIRE_THROWS_AS(sketch.get_CDF(split_points, 1), std::runtime_error);
for (auto pair: sketch) {
unused(pair); // to suppress "unused" warning
FAIL("should be no iterations over an empty sketch");
}
}
SECTION("get bad quantile") {
kll_float_sketch sketch(200, std::less<float>(), 0);
sketch.update(0); // has to be non-empty to reach the check
REQUIRE_THROWS_AS(sketch.get_quantile(-1), std::invalid_argument);
}
SECTION("one item") {
kll_float_sketch sketch(200, std::less<float>(), 0);
sketch.update(1.0f);
REQUIRE_FALSE(sketch.is_empty());
REQUIRE_FALSE(sketch.is_estimation_mode());
REQUIRE(sketch.get_n() == 1);
REQUIRE(sketch.get_num_retained() == 1);
REQUIRE(sketch.get_rank(1.0f, false) == 0.0);
REQUIRE(sketch.get_rank(1.0f) == 1.0);
REQUIRE(sketch.get_rank(2.0f, false) == 1.0);
REQUIRE(sketch.get_rank(std::numeric_limits<float>::infinity()) == 1.0);
REQUIRE(sketch.get_min_item() == 1.0);
REQUIRE(sketch.get_max_item() == 1.0);
REQUIRE(sketch.get_quantile(0.5) == 1.0);
int count = 0;
for (auto pair: sketch) {
REQUIRE(pair.second == 1);
++count;
}
REQUIRE(count == 1);
// iterator dereferencing
auto it = sketch.begin();
REQUIRE(it->first == 1.0f);
REQUIRE((*it).first == 1.0f);
}
SECTION("NaN") {
kll_float_sketch sketch(200, std::less<float>(), 0);
sketch.update(std::numeric_limits<float>::quiet_NaN());
REQUIRE(sketch.is_empty());
sketch.update(0);
sketch.update(std::numeric_limits<float>::quiet_NaN());
REQUIRE(sketch.get_n() == 1);
}
SECTION("many items, exact mode") {
kll_float_sketch sketch(200, std::less<float>(), 0);
const uint32_t n = 200;
for (uint32_t i = 1; i <= n; i++) {
sketch.update(static_cast<float>(i));
REQUIRE(sketch.get_n() == i);
}
REQUIRE_FALSE(sketch.is_empty());
REQUIRE_FALSE(sketch.is_estimation_mode());
REQUIRE(sketch.get_num_retained() == n);
REQUIRE(sketch.get_min_item() == 1);
REQUIRE(sketch.get_quantile(0) == 1);
REQUIRE(sketch.get_max_item() == n);
REQUIRE(sketch.get_quantile(1) == n);
for (uint32_t i = 1; i <= n; i++) {
const double true_rank_inclusive = static_cast<double>(i) / n;
REQUIRE(sketch.get_rank(static_cast<float>(i)) == true_rank_inclusive);
const double true_rank_exclusive = static_cast<double>(i - 1) / n;
REQUIRE(sketch.get_rank(static_cast<float>(i), false) == true_rank_exclusive);
}
}
SECTION("10 items") {
kll_float_sketch sketch(200, std::less<float>(), 0);
sketch.update(1.0f);
sketch.update(2.0f);
sketch.update(3.0f);
sketch.update(4.0f);
sketch.update(5.0f);
sketch.update(6.0f);
sketch.update(7.0f);
sketch.update(8.0f);
sketch.update(9.0f);
sketch.update(10.0f);
REQUIRE(sketch.get_quantile(0) == 1.0);
REQUIRE(sketch.get_quantile(0.5) == 5.0);
REQUIRE(sketch.get_quantile(0.99) == 10.0);
REQUIRE(sketch.get_quantile(1) == 10.0);
}
SECTION("100 items") {
kll_float_sketch sketch(200, std::less<float>(), 0);
for (int i = 0; i < 100; ++i) sketch.update(static_cast<float>(i));
REQUIRE(sketch.get_quantile(0) == 0);
REQUIRE(sketch.get_quantile(0.01) == 0);
REQUIRE(sketch.get_quantile(0.5) == 49);
REQUIRE(sketch.get_quantile(0.99) == 98.0);
REQUIRE(sketch.get_quantile(1) == 99.0);
}
SECTION("many items, estimation mode") {
kll_float_sketch sketch(200, std::less<float>(), 0);
const int n = 1000000;
for (int i = 0; i < n; i++) {
sketch.update(static_cast<float>(i));
REQUIRE(sketch.get_n() == static_cast<uint64_t>(i + 1));
}
REQUIRE_FALSE(sketch.is_empty());
REQUIRE(sketch.is_estimation_mode());
REQUIRE(sketch.get_min_item() == 0.0); // min value is exact
REQUIRE(sketch.get_max_item() == n - 1); // max value is exact
// test rank
for (int i = 0; i < n; i++) {
const double trueRank = (double) i / n;
REQUIRE(sketch.get_rank(static_cast<float>(i), false) == Approx(trueRank).margin(RANK_EPS_FOR_K_200));
}
//std::cout << sketch.to_string();
uint32_t count = 0;
uint64_t total_weight = 0;
for (auto pair: sketch) {
++count;
total_weight += pair.second;
}
REQUIRE(count == sketch.get_num_retained());
REQUIRE(total_weight == sketch.get_n());
}
SECTION("consistency between get_rank and get_PMF/CDF") {
kll_float_sketch sketch(200, std::less<float>(), 0);
const int n = 200;
float values[n];
for (int i = 0; i < n; i++) {
sketch.update(static_cast<float>(i));
values[i] = static_cast<float>(i);
}
{ // inclusive=false
const auto ranks(sketch.get_CDF(values, n, false));
const auto pmf(sketch.get_PMF(values, n, false));
double subtotal_pmf = 0;
for (int i = 0; i < n; i++) {
if (sketch.get_rank(values[i], false) != ranks[i]) {
FAIL("checking rank vs CDF for value " + std::to_string(i));
}
subtotal_pmf += pmf[i];
if (std::abs(ranks[i] - subtotal_pmf) > NUMERIC_NOISE_TOLERANCE) {
FAIL("CDF vs PMF for value " + std::to_string(i));
}
}
}
{ // inclusive=true (default)
const auto ranks(sketch.get_CDF(values, n));
const auto pmf(sketch.get_PMF(values, n));
double subtotal_pmf = 0;
for (int i = 0; i < n; i++) {
if (sketch.get_rank(values[i]) != ranks[i]) {
FAIL("checking rank vs CDF for value " + std::to_string(i));
}
subtotal_pmf += pmf[i];
if (std::abs(ranks[i] - subtotal_pmf) > NUMERIC_NOISE_TOLERANCE) {
FAIL("CDF vs PMF for value " + std::to_string(i));
}
}
}
}
SECTION("stream serialize deserialize empty") {
kll_float_sketch sketch(200, std::less<float>(), 0);
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
sketch.serialize(s);
REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
auto sketch2 = kll_float_sketch::deserialize(s, serde<float>(), std::less<float>(), 0);
REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
REQUIRE(s.tellg() == s.tellp());
REQUIRE(sketch2.is_empty() == sketch.is_empty());
REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
REQUIRE(sketch2.get_n() == sketch.get_n());
REQUIRE(sketch2.get_num_retained() == sketch.get_num_retained());
REQUIRE_THROWS_AS(sketch2.get_min_item(), std::runtime_error);
REQUIRE_THROWS_AS(sketch2.get_max_item(), std::runtime_error);
REQUIRE(sketch2.get_normalized_rank_error(false) == sketch.get_normalized_rank_error(false));
REQUIRE(sketch2.get_normalized_rank_error(true) == sketch.get_normalized_rank_error(true));
}
SECTION("bytes serialize deserialize empty") {
kll_float_sketch sketch(200, std::less<float>(), 0);
auto bytes = sketch.serialize();
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), serde<float>(), std::less<float>(), 0);
REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
REQUIRE(sketch2.is_empty() == sketch.is_empty());
REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
REQUIRE(sketch2.get_n() == sketch.get_n());
REQUIRE(sketch2.get_num_retained() == sketch.get_num_retained());
REQUIRE_THROWS_AS(sketch2.get_min_item(), std::runtime_error);
REQUIRE_THROWS_AS(sketch2.get_max_item(), std::runtime_error);
REQUIRE(sketch2.get_normalized_rank_error(false) == sketch.get_normalized_rank_error(false));
REQUIRE(sketch2.get_normalized_rank_error(true) == sketch.get_normalized_rank_error(true));
}
SECTION("stream serialize deserialize one item") {
kll_float_sketch sketch(200, std::less<float>(), 0);
sketch.update(1.0f);
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
sketch.serialize(s);
REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
auto sketch2 = kll_float_sketch::deserialize(s, serde<float>(), std::less<float>(), 0);
REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
REQUIRE(s.tellg() == s.tellp());
REQUIRE_FALSE(sketch2.is_empty());
REQUIRE_FALSE(sketch2.is_estimation_mode());
REQUIRE(sketch2.get_n() == 1);
REQUIRE(sketch2.get_num_retained() == 1);
REQUIRE(sketch2.get_min_item() == 1.0);
REQUIRE(sketch2.get_max_item() == 1.0);
REQUIRE(sketch2.get_quantile(0.5) == 1.0);
REQUIRE(sketch2.get_rank(1, false) == 0.0);
REQUIRE(sketch2.get_rank(2, false) == 1.0);
}
SECTION("bytes serialize deserialize one item") {
kll_float_sketch sketch(200, std::less<float>(), 0);
sketch.update(1.0f);
auto bytes = sketch.serialize();
REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), serde<float>(), std::less<float>(), 0);
REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
REQUIRE_FALSE(sketch2.is_empty());
REQUIRE_FALSE(sketch2.is_estimation_mode());
REQUIRE(sketch2.get_n() == 1);
REQUIRE(sketch2.get_num_retained() == 1);
REQUIRE(sketch2.get_min_item() == 1.0);
REQUIRE(sketch2.get_max_item() == 1.0);
REQUIRE(sketch2.get_quantile(0.5) == 1.0);
REQUIRE(sketch2.get_rank(1, false) == 0.0);
REQUIRE(sketch2.get_rank(2, false) == 1.0);
}
SECTION("deserialize one item v1") {
std::ifstream is;
is.exceptions(std::ios::failbit | std::ios::badbit);
is.open(testBinaryInputPath + "kll_sketch_float_one_item_v1.sk", std::ios::binary);
auto sketch = kll_float_sketch::deserialize(is, serde<float>(), std::less<float>(), 0);
REQUIRE_FALSE(sketch.is_empty());
REQUIRE_FALSE(sketch.is_estimation_mode());
REQUIRE(sketch.get_n() == 1);
REQUIRE(sketch.get_num_retained() == 1);
REQUIRE(sketch.get_min_item() == 1.0);
REQUIRE(sketch.get_max_item() == 1.0);
}
SECTION("stream serialize deserialize three items") {
kll_float_sketch sketch(200, std::less<float>(), 0);
sketch.update(1.0f);
sketch.update(2.0f);
sketch.update(3.0f);
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
sketch.serialize(s);
REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
auto sketch2 = kll_float_sketch::deserialize(s, serde<float>(), std::less<float>(), 0);
REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
REQUIRE(s.tellg() == s.tellp());
REQUIRE_FALSE(sketch2.is_empty());
REQUIRE_FALSE(sketch2.is_estimation_mode());
REQUIRE(sketch2.get_n() == 3);
REQUIRE(sketch2.get_num_retained() == 3);
REQUIRE(sketch2.get_min_item() == 1.0);
REQUIRE(sketch2.get_max_item() == 3.0);
}
SECTION("bytes serialize deserialize three items") {
kll_float_sketch sketch(200, std::less<float>(), 0);
sketch.update(1.0f);
sketch.update(2.0f);
sketch.update(3.0f);
auto bytes = sketch.serialize();
REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), serde<float>(), std::less<float>(), 0);
REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
REQUIRE_FALSE(sketch2.is_empty());
REQUIRE_FALSE(sketch2.is_estimation_mode());
REQUIRE(sketch2.get_n() == 3);
REQUIRE(sketch2.get_num_retained() == 3);
REQUIRE(sketch2.get_min_item() == 1.0);
REQUIRE(sketch2.get_max_item() == 3.0);
}
SECTION("stream serialize deserialize many floats") {
kll_float_sketch sketch(200, std::less<float>(), 0);
const int n = 1000;
for (int i = 0; i < n; i++) sketch.update(static_cast<float>(i));
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
sketch.serialize(s);
REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
auto sketch2 = kll_float_sketch::deserialize(s, serde<float>(), std::less<float>(), 0);
REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
REQUIRE(s.tellg() == s.tellp());
REQUIRE(sketch2.is_empty() == sketch.is_empty());
REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
REQUIRE(sketch2.get_n() == sketch.get_n());
REQUIRE(sketch2.get_num_retained() == sketch.get_num_retained());
REQUIRE(sketch2.get_min_item() == sketch.get_min_item());
REQUIRE(sketch2.get_max_item() == sketch.get_max_item());
REQUIRE(sketch2.get_normalized_rank_error(false) == sketch.get_normalized_rank_error(false));
REQUIRE(sketch2.get_normalized_rank_error(true) == sketch.get_normalized_rank_error(true));
REQUIRE(sketch2.get_quantile(0.5) == sketch.get_quantile(0.5));
REQUIRE(sketch2.get_rank(0) == sketch.get_rank(0));
REQUIRE(sketch2.get_rank(static_cast<float>(n)) == sketch.get_rank(static_cast<float>(n)));
}
SECTION("bytes serialize deserialize many floats") {
kll_float_sketch sketch(200, std::less<float>(), 0);
const int n = 1000;
for (int i = 0; i < n; i++) sketch.update(static_cast<float>(i));
auto bytes = sketch.serialize();
REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), serde<float>(), std::less<float>(), 0);
REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
REQUIRE(sketch2.is_empty() == sketch.is_empty());
REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
REQUIRE(sketch2.get_n() == sketch.get_n());
REQUIRE(sketch2.get_num_retained() == sketch.get_num_retained());
REQUIRE(sketch2.get_min_item() == sketch.get_min_item());
REQUIRE(sketch2.get_max_item() == sketch.get_max_item());
REQUIRE(sketch2.get_normalized_rank_error(false) == sketch.get_normalized_rank_error(false));
REQUIRE(sketch2.get_normalized_rank_error(true) == sketch.get_normalized_rank_error(true));
REQUIRE(sketch2.get_quantile(0.5) == sketch.get_quantile(0.5));
REQUIRE(sketch2.get_rank(0) == sketch.get_rank(0));
REQUIRE(sketch2.get_rank(static_cast<float>(n)) == sketch.get_rank(static_cast<float>(n)));
REQUIRE_THROWS_AS(kll_float_sketch::deserialize(bytes.data(), 7, serde<float>(), std::less<float>(), 0), std::out_of_range);
REQUIRE_THROWS_AS(kll_float_sketch::deserialize(bytes.data(), 15, serde<float>(), std::less<float>(), 0), std::out_of_range);
REQUIRE_THROWS_AS(kll_float_sketch::deserialize(bytes.data(), bytes.size() - 1, serde<float>(), std::less<float>(), 0), std::out_of_range);
}
SECTION("bytes serialize deserialize many ints") {
kll_sketch<int> sketch;
const int n = 1000;
for (int i = 0; i < n; i++) sketch.update(i);
auto bytes = sketch.serialize();
REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
auto sketch2 = kll_sketch<int>::deserialize(bytes.data(), bytes.size());
REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
REQUIRE(sketch2.is_empty() == sketch.is_empty());
REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
REQUIRE(sketch2.get_n() == sketch.get_n());
REQUIRE(sketch2.get_num_retained() == sketch.get_num_retained());
REQUIRE(sketch2.get_min_item() == sketch.get_min_item());
REQUIRE(sketch2.get_max_item() == sketch.get_max_item());
REQUIRE(sketch2.get_normalized_rank_error(false) == sketch.get_normalized_rank_error(false));
REQUIRE(sketch2.get_normalized_rank_error(true) == sketch.get_normalized_rank_error(true));
REQUIRE(sketch2.get_quantile(0.5) == sketch.get_quantile(0.5));
REQUIRE(sketch2.get_rank(0) == sketch.get_rank(0));
REQUIRE(sketch2.get_rank(n) == sketch.get_rank(n));
REQUIRE_THROWS_AS(kll_sketch<int>::deserialize(bytes.data(), 7), std::out_of_range);
REQUIRE_THROWS_AS(kll_sketch<int>::deserialize(bytes.data(), 15), std::out_of_range);
REQUIRE_THROWS_AS(kll_sketch<int>::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
}
SECTION("floor of log2 of fraction") {
REQUIRE(kll_helper::floor_of_log2_of_fraction(0, 1) == 0);
REQUIRE(kll_helper::floor_of_log2_of_fraction(1, 2) == 0);
REQUIRE(kll_helper::floor_of_log2_of_fraction(2, 2) == 0);
REQUIRE(kll_helper::floor_of_log2_of_fraction(3, 2) == 0);
REQUIRE(kll_helper::floor_of_log2_of_fraction(4, 2) == 1);
REQUIRE(kll_helper::floor_of_log2_of_fraction(5, 2) == 1);
REQUIRE(kll_helper::floor_of_log2_of_fraction(6, 2) == 1);
REQUIRE(kll_helper::floor_of_log2_of_fraction(7, 2) == 1);
REQUIRE(kll_helper::floor_of_log2_of_fraction(8, 2) == 2);
}
SECTION("out of order split points, float") {
kll_float_sketch sketch(200, std::less<float>(), 0);
sketch.update(0); // has too be non-empty to reach the check
float split_points[2] = {1, 0};
REQUIRE_THROWS_AS(sketch.get_CDF(split_points, 2), std::invalid_argument);
}
SECTION("out of order split points, int") {
kll_sketch<int> sketch;
sketch.update(0); // has too be non-empty to reach the check
int split_points[2] = {1, 0};
REQUIRE_THROWS_AS(sketch.get_CDF(split_points, 2), std::invalid_argument);
}
SECTION("NaN split point") {
kll_float_sketch sketch(200, std::less<float>(), 0);
sketch.update(0); // has too be non-empty to reach the check
float split_points[1] = {std::numeric_limits<float>::quiet_NaN()};
REQUIRE_THROWS_AS(sketch.get_CDF(split_points, 1), std::invalid_argument);
}
SECTION("merge") {
kll_float_sketch sketch1(200, std::less<float>(), 0);
kll_float_sketch sketch2(200, std::less<float>(), 0);
const int n = 10000;
for (int i = 0; i < n; i++) {
sketch1.update(static_cast<float>(i));
sketch2.update(static_cast<float>((2 * n) - i - 1));
}
REQUIRE(sketch1.get_min_item() == 0.0f);
REQUIRE(sketch1.get_max_item() == n - 1);
REQUIRE(sketch2.get_min_item() == n);
REQUIRE(sketch2.get_max_item() == 2.0f * n - 1);
sketch1.merge(sketch2);
REQUIRE_FALSE(sketch1.is_empty());
REQUIRE(sketch1.get_n() == 2 * n);
REQUIRE(sketch1.get_min_item() == 0.0f);
REQUIRE(sketch1.get_max_item() == 2.0f * n - 1);
REQUIRE(sketch1.get_quantile(0.5) == Approx(n).margin(n * RANK_EPS_FOR_K_200));
}
SECTION("merge lower k") {
kll_float_sketch sketch1(256, std::less<float>(), 0);
kll_float_sketch sketch2(128, std::less<float>(), 0);
const int n = 10000;
for (int i = 0; i < n; i++) {
sketch1.update(static_cast<float>(i));
sketch2.update(static_cast<float>((2 * n) - i - 1));
}
REQUIRE(sketch1.get_min_item() == 0.0f);
REQUIRE(sketch1.get_max_item() == n - 1);
REQUIRE(sketch2.get_min_item() == n);
REQUIRE(sketch2.get_max_item() == 2.0f * n - 1);
REQUIRE(sketch1.get_k() == 256);
REQUIRE(sketch2.get_k() == 128);
REQUIRE(sketch1.get_normalized_rank_error(false) < sketch2.get_normalized_rank_error(false));
REQUIRE(sketch1.get_normalized_rank_error(true) < sketch2.get_normalized_rank_error(true));
sketch1.merge(sketch2);
// sketch1 must get "contaminated" by the lower K in sketch2
REQUIRE(sketch2.get_normalized_rank_error(false) == sketch1.get_normalized_rank_error(false));
REQUIRE(sketch2.get_normalized_rank_error(true) == sketch1.get_normalized_rank_error(true));
REQUIRE_FALSE(sketch1.is_empty());
REQUIRE(sketch1.get_n() == 2 * n);
REQUIRE(sketch1.get_min_item() == 0.0f);
REQUIRE(sketch1.get_max_item() == 2.0f * n - 1);
REQUIRE(sketch1.get_quantile(0.5) == Approx(n).margin(n * RANK_EPS_FOR_K_200));
}
SECTION("merge exact mode, lower k") {
kll_float_sketch sketch1(256, std::less<float>(), 0);
kll_float_sketch sketch2(128, std::less<float>(), 0);
const int n = 10000;
for (int i = 0; i < n; i++) {
sketch1.update(static_cast<float>(i));
}
// rank error should not be affected by a merge with an empty sketch with lower k
const double rank_error_before_merge = sketch1.get_normalized_rank_error(true);
sketch1.merge(sketch2);
REQUIRE(sketch1.get_normalized_rank_error(true) == rank_error_before_merge);
REQUIRE_FALSE(sketch1.is_empty());
REQUIRE(sketch1.get_n() == n);
REQUIRE(sketch1.get_min_item() == 0.0f);
REQUIRE(sketch1.get_max_item() == n - 1);
REQUIRE(sketch1.get_quantile(0.5) == Approx(n / 2).margin(n / 2 * RANK_EPS_FOR_K_200));
sketch2.update(0);
sketch1.merge(sketch2);
// rank error should not be affected by a merge with a sketch in exact mode with lower k
REQUIRE(sketch1.get_normalized_rank_error(true) == rank_error_before_merge);
}
SECTION("merge min value from other") {
kll_float_sketch sketch1(200, std::less<float>(), 0);
kll_float_sketch sketch2(200, std::less<float>(), 0);
sketch1.update(1.0f);
sketch2.update(2.0f);
sketch2.merge(sketch1);
REQUIRE(sketch2.get_min_item() == 1.0f);
REQUIRE(sketch2.get_max_item() == 2.0f);
}
SECTION("merge min and max values from other") {
kll_float_sketch sketch1(200, std::less<float>(), 0);
for (int i = 0; i < 1000000; i++) sketch1.update(static_cast<float>(i));
kll_float_sketch sketch2(200, std::less<float>(), 0);
sketch2.merge(sketch1);
REQUIRE(sketch2.get_min_item() == 0.0f);
REQUIRE(sketch2.get_max_item() == 999999.0f);
}
SECTION("sketch of ints") {
kll_sketch<int> sketch;
REQUIRE_THROWS_AS(sketch.get_quantile(0), std::runtime_error);
REQUIRE_THROWS_AS(sketch.get_min_item(), std::runtime_error);
REQUIRE_THROWS_AS(sketch.get_max_item(), std::runtime_error);
const int n = 1000;
for (int i = 0; i < n; i++) sketch.update(i);
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
sketch.serialize(s);
REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
auto sketch2 = kll_sketch<int>::deserialize(s);
REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
REQUIRE(s.tellg() == s.tellp());
REQUIRE(sketch2.is_empty() == sketch.is_empty());
REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
REQUIRE(sketch2.get_n() == sketch.get_n());
REQUIRE(sketch2.get_num_retained() == sketch.get_num_retained());
REQUIRE(sketch2.get_min_item() == sketch.get_min_item());
REQUIRE(sketch2.get_max_item() == sketch.get_max_item());
REQUIRE(sketch2.get_normalized_rank_error(false) == sketch.get_normalized_rank_error(false));
REQUIRE(sketch2.get_normalized_rank_error(true) == sketch.get_normalized_rank_error(true));
REQUIRE(sketch2.get_quantile(0.5) == sketch.get_quantile(0.5));
REQUIRE(sketch2.get_rank(0) == sketch.get_rank(0));
REQUIRE(sketch2.get_rank(n) == sketch.get_rank(n));
}
SECTION("sketch of strings stream") {
kll_string_sketch sketch1(200, std::less<std::string>(), 0);
REQUIRE_THROWS_AS(sketch1.get_quantile(0), std::runtime_error);
REQUIRE_THROWS_AS(sketch1.get_min_item(), std::runtime_error);
REQUIRE_THROWS_AS(sketch1.get_max_item(), std::runtime_error);
REQUIRE(sketch1.get_serialized_size_bytes() == 8);
const int n = 1000;
for (int i = 0; i < n; i++) sketch1.update(std::to_string(i));
REQUIRE(sketch1.get_min_item() == std::string("0"));
REQUIRE(sketch1.get_max_item() == std::string("999"));
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
sketch1.serialize(s);
REQUIRE(static_cast<size_t>(s.tellp()) == sketch1.get_serialized_size_bytes());
auto sketch2 = kll_string_sketch::deserialize(s, serde<std::string>(), std::less<std::string>(), 0);
REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
REQUIRE(s.tellg() == s.tellp());
REQUIRE(sketch2.is_empty() == sketch1.is_empty());
REQUIRE(sketch2.is_estimation_mode() == sketch1.is_estimation_mode());
REQUIRE(sketch2.get_n() == sketch1.get_n());
REQUIRE(sketch2.get_num_retained() == sketch1.get_num_retained());
REQUIRE(sketch2.get_min_item() == sketch1.get_min_item());
REQUIRE(sketch2.get_max_item() == sketch1.get_max_item());
REQUIRE(sketch2.get_normalized_rank_error(false) == sketch1.get_normalized_rank_error(false));
REQUIRE(sketch2.get_normalized_rank_error(true) == sketch1.get_normalized_rank_error(true));
REQUIRE(sketch2.get_quantile(0.5) == sketch1.get_quantile(0.5));
REQUIRE(sketch2.get_rank(std::to_string(0)) == sketch1.get_rank(std::to_string(0)));
REQUIRE(sketch2.get_rank(std::to_string(n)) == sketch1.get_rank(std::to_string(n)));
// to take a look using hexdump
//std::ofstream os("kll-string.sk");
//sketch1.serialize(os);
// debug print
//sketch1.to_stream(std::cout);
}
SECTION("sketch of strings bytes") {
kll_string_sketch sketch1(200, std::less<std::string>(), 0);
REQUIRE_THROWS_AS(sketch1.get_quantile(0), std::runtime_error);
REQUIRE_THROWS_AS(sketch1.get_min_item(), std::runtime_error);
REQUIRE_THROWS_AS(sketch1.get_max_item(), std::runtime_error);
REQUIRE(sketch1.get_serialized_size_bytes() == 8);
const int n = 1000;
for (int i = 0; i < n; i++) sketch1.update(std::to_string(i));
REQUIRE(sketch1.get_min_item() == std::string("0"));
REQUIRE(sketch1.get_max_item() == std::string("999"));
auto bytes = sketch1.serialize();
REQUIRE(bytes.size() == sketch1.get_serialized_size_bytes());
auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size(), serde<std::string>(),
std::less<std::string>(), 0);
REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
REQUIRE(sketch2.is_empty() == sketch1.is_empty());
REQUIRE(sketch2.is_estimation_mode() == sketch1.is_estimation_mode());
REQUIRE(sketch2.get_n() == sketch1.get_n());
REQUIRE(sketch2.get_num_retained() == sketch1.get_num_retained());
REQUIRE(sketch2.get_min_item() == sketch1.get_min_item());
REQUIRE(sketch2.get_max_item() == sketch1.get_max_item());
REQUIRE(sketch2.get_normalized_rank_error(false) == sketch1.get_normalized_rank_error(false));
REQUIRE(sketch2.get_normalized_rank_error(true) == sketch1.get_normalized_rank_error(true));
REQUIRE(sketch2.get_quantile(0.5) == sketch1.get_quantile(0.5));
REQUIRE(sketch2.get_rank(std::to_string(0)) == sketch1.get_rank(std::to_string(0)));
REQUIRE(sketch2.get_rank(std::to_string(n)) == sketch1.get_rank(std::to_string(n)));
}
SECTION("sketch of strings, single item, bytes") {
kll_string_sketch sketch1(200, std::less<std::string>(), 0);
sketch1.update("a");
auto bytes = sketch1.serialize();
REQUIRE(bytes.size() == sketch1.get_serialized_size_bytes());
auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size(), serde<std::string>(),
std::less<std::string>(), 0);
REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
}
SECTION("copy") {
kll_sketch<int> sketch1;
const int n(1000);
for (int i = 0; i < n; i++) sketch1.update(i);
// copy constructor
kll_sketch<int> sketch2(sketch1);
for (int i = 0; i < n; i++) {
REQUIRE(sketch2.get_rank(i) == sketch1.get_rank(i));
}
// copy assignment
kll_sketch<int> sketch3;
sketch3 = sketch1;
for (int i = 0; i < n; i++) {
REQUIRE(sketch3.get_rank(i) == sketch1.get_rank(i));
}
}
SECTION("move") {
kll_sketch<int> sketch1;
const int n(100);
for (int i = 0; i < n; i++) sketch1.update(i);
// move constructor
kll_sketch<int> sketch2(std::move(sketch1));
for (int i = 0; i < n; i++) {
REQUIRE(sketch2.get_rank(i, false) == (double) i / n);
}
// move assignment
kll_sketch<int> sketch3;
sketch3 = std::move(sketch2);
for (int i = 0; i < n; i++) {
REQUIRE(sketch3.get_rank(i, false) == (double) i / n);
}
}
SECTION("max serialized size arithmetic type") {
REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 10) == 1968);
REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 100) == 2316);
REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 1000) == 2440);
REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 1000000) == 2800);
REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 1000000000) == 3160);
}
SECTION("max serialized size non-arithmetic type") {
REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 10, 4) == 1968);
REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 100, 4) == 2316);
REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 1000, 4) == 2440);
REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 1000000, 4) == 2800);
REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 1000000000, 4) == 3160);
}
SECTION("issue #236") {
kll_sketch<int8_t> kll;
kll.update(1);
kll.update(2);
kll.update(3);
auto blob = kll.serialize();
auto kll2 = kll_sketch<int8_t>::deserialize(blob.data(), blob.size());
}
SECTION("sorted view") {
kll_sketch<int> kll;
kll.update(2);
kll.update(3);
kll.update(1);
{
auto view = kll.get_sorted_view();
REQUIRE(view.size() == 3);
auto it = view.begin();
REQUIRE(it->first == 1); // operator->
REQUIRE((*it).first == 1); // operator*
REQUIRE(it->second == 1);
REQUIRE(it.get_weight() == 1);
++it;
REQUIRE(it->first == 2);
REQUIRE(it->second == 2);
REQUIRE(it.get_weight() == 1);
++it;
REQUIRE(it->first == 3);
REQUIRE(it->second == 3);
REQUIRE(it.get_weight() == 1);
++it;
REQUIRE(it == view.end());
}
}
SECTION("type conversion: empty") {
kll_sketch<double> kll_double;
kll_sketch<float> kll_float(kll_double);
REQUIRE(kll_float.is_empty());
REQUIRE(kll_float.get_k() == kll_double.get_k());
REQUIRE(kll_float.get_n() == 0);
REQUIRE(kll_float.get_num_retained() == 0);
}
SECTION("type conversion: over k") {
kll_sketch<double> kll_double;
for (int i = 0; i < 1000; ++i) kll_double.update(static_cast<double>(i));
kll_sketch<float> kll_float(kll_double);
REQUIRE(!kll_float.is_empty());
REQUIRE(kll_float.get_k() == kll_double.get_k());
REQUIRE(kll_float.get_n() == kll_double.get_n());
REQUIRE(kll_float.get_num_retained() == kll_double.get_num_retained());
auto sv_float = kll_float.get_sorted_view();
auto sv_double = kll_double.get_sorted_view();
auto sv_float_it = sv_float.begin();
auto sv_double_it = sv_double.begin();
while (sv_float_it != sv_float.end()) {
REQUIRE(sv_double_it != sv_double.end());
auto float_pair = *sv_float_it;
auto double_pair = *sv_double_it;
REQUIRE(float_pair.first == Approx(double_pair.first).margin(0.01));
REQUIRE(float_pair.second == double_pair.second);
++sv_float_it;
++sv_double_it;
}
REQUIRE(sv_double_it == sv_double.end());
}
class A {
int val;
public:
A(int val): val(val) {}
int get_val() const { return val; }
};
struct less_A {
bool operator()(const A& a1, const A& a2) const { return a1.get_val() < a2.get_val(); }
};
class B {
int val;
public:
explicit B(const A& a): val(a.get_val()) {}
int get_val() const { return val; }
};
struct less_B {
bool operator()(const B& b1, const B& b2) const { return b1.get_val() < b2.get_val(); }
};
SECTION("type conversion: custom types") {
kll_sketch<A, less_A> sa;
sa.update(1);
sa.update(2);
sa.update(3);
kll_sketch<B, less_B> sb(sa);
REQUIRE(sb.get_n() == 3);
}
// cleanup
REQUIRE(test_allocator_total_bytes == 0);
}
} /* namespace datasketches */