blob: fc3f5d1c97598a08930f2f89bf369ffdf6d94471 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#include <catch2/catch.hpp>
#include <iostream>
#include <fstream>
#include "tdigest.hpp"
namespace datasketches {
TEST_CASE("empty", "[tdigest]") {
tdigest_double td(10);
// std::cout << td.to_string();
REQUIRE(td.is_empty());
REQUIRE(td.get_k() == 10);
REQUIRE(td.get_total_weight() == 0);
REQUIRE_THROWS_AS(td.get_min_value(), std::runtime_error);
REQUIRE_THROWS_AS(td.get_max_value(), std::runtime_error);
REQUIRE_THROWS_AS(td.get_rank(0), std::runtime_error);
REQUIRE_THROWS_AS(td.get_quantile(0.5), std::runtime_error);
const double split_points[1] {0};
REQUIRE_THROWS_AS(td.get_PMF(split_points, 1), std::runtime_error);
REQUIRE_THROWS_AS(td.get_CDF(split_points, 1), std::runtime_error);
}
TEST_CASE("one value", "[tdigest]") {
tdigest_double td(100);
td.update(1);
REQUIRE(td.get_k() == 100);
REQUIRE(td.get_total_weight() == 1);
REQUIRE(td.get_min_value() == 1);
REQUIRE(td.get_max_value() == 1);
REQUIRE(td.get_rank(0.99) == 0);
REQUIRE(td.get_rank(1) == 0.5);
REQUIRE(td.get_rank(1.01) == 1);
REQUIRE(td.get_quantile(0) == 1);
REQUIRE(td.get_quantile(0.5) == 1);
REQUIRE(td.get_quantile(1) == 1);
}
TEST_CASE("many values", "[tdigest]") {
const size_t n = 10000;
tdigest_double td;
for (size_t i = 0; i < n; ++i) td.update(i);
REQUIRE_FALSE(td.is_empty());
REQUIRE(td.get_total_weight() == n);
REQUIRE(td.get_min_value() == 0);
REQUIRE(td.get_max_value() == n - 1);
REQUIRE(td.get_rank(0) == Approx(0).margin(0.0001));
REQUIRE(td.get_rank(n / 4) == Approx(0.25).margin(0.0001));
REQUIRE(td.get_rank(n / 2) == Approx(0.5).margin(0.0001));
REQUIRE(td.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001));
REQUIRE(td.get_rank(n) == 1);
REQUIRE(td.get_quantile(0) == 0);
REQUIRE(td.get_quantile(0.5) == Approx(n / 2).epsilon(0.03));
REQUIRE(td.get_quantile(0.9) == Approx(n * 0.9).epsilon(0.01));
REQUIRE(td.get_quantile(0.95) == Approx(n * 0.95).epsilon(0.01));
REQUIRE(td.get_quantile(1) == n - 1);
const double split_points[1] {n / 2};
const auto pmf = td.get_PMF(split_points, 1);
REQUIRE(pmf.size() == 2);
REQUIRE(pmf[0] == Approx(0.5).margin(0.0001));
REQUIRE(pmf[1] == Approx(0.5).margin(0.0001));
const auto cdf = td.get_CDF(split_points, 1);
REQUIRE(cdf.size() == 2);
REQUIRE(cdf[0] == Approx(0.5).margin(0.0001));
REQUIRE(cdf[1] == 1);
}
TEST_CASE("rank - two values", "[tdigest]") {
tdigest_double td(100);
td.update(1);
td.update(2);
// td.compress();
// std::cout << td.to_string(true);
REQUIRE(td.get_rank(0.99) == 0);
REQUIRE(td.get_rank(1) == 0.25);
REQUIRE(td.get_rank(1.25) == 0.375);
REQUIRE(td.get_rank(1.5) == 0.5);
REQUIRE(td.get_rank(1.75) == 0.625);
REQUIRE(td.get_rank(2) == 0.75);
REQUIRE(td.get_rank(2.01) == 1);
}
TEST_CASE("rank - repeated value", "[tdigest]") {
tdigest_double td(100);
td.update(1);
td.update(1);
td.update(1);
td.update(1);
// td.compress();
// std::cout << td.to_string(true);
REQUIRE(td.get_rank(0.99) == 0);
REQUIRE(td.get_rank(1) == 0.5);
REQUIRE(td.get_rank(1.01) == 1);
}
TEST_CASE("rank - repeated block", "[tdigest]") {
tdigest_double td(100);
td.update(1);
td.update(2);
td.update(2);
td.update(3);
// td.compress();
// std::cout << td.to_string(true);
REQUIRE(td.get_rank(0.99) == 0);
REQUIRE(td.get_rank(1) == 0.125);
REQUIRE(td.get_rank(2) == 0.5);
REQUIRE(td.get_rank(3) == 0.875);
REQUIRE(td.get_rank(3.01) == 1);
}
TEST_CASE("merge small", "[tdigest]") {
tdigest_double td1(10);
td1.update(1);
td1.update(2);
tdigest_double td2(10);
td2.update(2);
td2.update(3);
td1.merge(td2);
REQUIRE(td1.get_min_value() == 1);
REQUIRE(td1.get_max_value() == 3);
REQUIRE(td1.get_total_weight() == 4);
REQUIRE(td1.get_rank(0.99) == 0);
REQUIRE(td1.get_rank(1) == 0.125);
REQUIRE(td1.get_rank(2) == 0.5);
REQUIRE(td1.get_rank(3) == 0.875);
REQUIRE(td1.get_rank(3.01) == 1);
}
TEST_CASE("merge large", "[tdigest]") {
const size_t n = 10000;
tdigest_double td1;
tdigest_double td2;
for (size_t i = 0; i < n / 2; ++i) {
td1.update(i);
td2.update(n / 2 + i);
}
// std::cout << td1.to_string();
// std::cout << td2.to_string();
td1.merge(td2);
// td1.compress();
// std::cout << td1.to_string(true);
REQUIRE(td1.get_total_weight() == n);
REQUIRE(td1.get_min_value() == 0);
REQUIRE(td1.get_max_value() == n - 1);
REQUIRE(td1.get_rank(0) == Approx(0).margin(0.0001));
REQUIRE(td1.get_rank(n / 4) == Approx(0.25).margin(0.0001));
REQUIRE(td1.get_rank(n / 2) == Approx(0.5).margin(0.0001));
REQUIRE(td1.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001));
REQUIRE(td1.get_rank(n) == 1);
}
TEST_CASE("serialize deserialize stream empty", "[tdigest]") {
tdigest<double> td(100);
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
td.serialize(s);
auto deserialized_td = tdigest<double>::deserialize(s);
REQUIRE(td.get_k() == deserialized_td.get_k());
REQUIRE(td.get_total_weight() == deserialized_td.get_total_weight());
REQUIRE(td.is_empty() == deserialized_td.is_empty());
}
TEST_CASE("serialize deserialize stream single value", "[tdigest]") {
tdigest<double> td;
td.update(123);
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
td.serialize(s);
auto deserialized_td = tdigest<double>::deserialize(s);
REQUIRE(deserialized_td.get_k() == 200);
REQUIRE(deserialized_td.get_total_weight() == 1);
REQUIRE_FALSE(deserialized_td.is_empty());
REQUIRE(deserialized_td.get_min_value() == 123);
REQUIRE(deserialized_td.get_max_value() == 123);
}
TEST_CASE("serialize deserialize stream single value buffered", "[tdigest]") {
tdigest<double> td;
td.update(123);
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
td.serialize(s, true);
auto deserialized_td = tdigest<double>::deserialize(s);
REQUIRE(deserialized_td.get_k() == 200);
REQUIRE(deserialized_td.get_total_weight() == 1);
REQUIRE_FALSE(deserialized_td.is_empty());
REQUIRE(deserialized_td.get_min_value() == 123);
REQUIRE(deserialized_td.get_max_value() == 123);
}
TEST_CASE("serialize deserialize stream many values", "[tdigest]") {
tdigest<double> td(100);
for (int i = 0; i < 1000; ++i) td.update(i);
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
td.serialize(s);
auto deserialized_td = tdigest<double>::deserialize(s);
REQUIRE(td.get_k() == deserialized_td.get_k());
REQUIRE(td.get_total_weight() == deserialized_td.get_total_weight());
REQUIRE(td.is_empty() == deserialized_td.is_empty());
REQUIRE(td.get_min_value() == deserialized_td.get_min_value());
REQUIRE(td.get_max_value() == deserialized_td.get_max_value());
REQUIRE(td.get_rank(500) == deserialized_td.get_rank(500));
REQUIRE(td.get_quantile(0.5) == deserialized_td.get_quantile(0.5));
}
TEST_CASE("serialize deserialize stream many values with buffer", "[tdigest]") {
tdigest<double> td(100);
for (int i = 0; i < 10000; ++i) td.update(i);
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
td.serialize(s, true);
auto deserialized_td = tdigest<double>::deserialize(s);
REQUIRE(td.get_k() == deserialized_td.get_k());
REQUIRE(td.get_total_weight() == deserialized_td.get_total_weight());
REQUIRE(td.is_empty() == deserialized_td.is_empty());
REQUIRE(td.get_min_value() == deserialized_td.get_min_value());
REQUIRE(td.get_max_value() == deserialized_td.get_max_value());
REQUIRE(td.get_rank(500) == deserialized_td.get_rank(500));
REQUIRE(td.get_quantile(0.5) == deserialized_td.get_quantile(0.5));
}
TEST_CASE("serialize deserialize bytes empty", "[tdigest]") {
tdigest<double> td(100);
auto bytes = td.serialize();
auto deserialized_td = tdigest<double>::deserialize(bytes.data(), bytes.size());
REQUIRE(td.get_k() == deserialized_td.get_k());
REQUIRE(td.get_total_weight() == deserialized_td.get_total_weight());
REQUIRE(td.is_empty() == deserialized_td.is_empty());
}
TEST_CASE("serialize deserialize bytes single value", "[tdigest]") {
tdigest<double> td(200);
td.update(123);
auto bytes = td.serialize();
auto deserialized_td = tdigest<double>::deserialize(bytes.data(), bytes.size());
REQUIRE(deserialized_td.get_k() == 200);
REQUIRE(deserialized_td.get_total_weight() == 1);
REQUIRE_FALSE(deserialized_td.is_empty());
REQUIRE(deserialized_td.get_min_value() == 123);
REQUIRE(deserialized_td.get_max_value() == 123);
}
TEST_CASE("serialize deserialize bytes single value buffered", "[tdigest]") {
tdigest<double> td(200);
td.update(123);
auto bytes = td.serialize(0, true);
auto deserialized_td = tdigest<double>::deserialize(bytes.data(), bytes.size());
REQUIRE(deserialized_td.get_k() == 200);
REQUIRE(deserialized_td.get_total_weight() == 1);
REQUIRE_FALSE(deserialized_td.is_empty());
REQUIRE(deserialized_td.get_min_value() == 123);
REQUIRE(deserialized_td.get_max_value() == 123);
}
TEST_CASE("serialize deserialize bytes many values", "[tdigest]") {
tdigest<double> td(100);
for (int i = 0; i < 1000; ++i) td.update(i);
auto bytes = td.serialize();
auto deserialized_td = tdigest<double>::deserialize(bytes.data(), bytes.size());
REQUIRE(td.get_k() == deserialized_td.get_k());
REQUIRE(td.get_total_weight() == deserialized_td.get_total_weight());
REQUIRE(td.is_empty() == deserialized_td.is_empty());
REQUIRE(td.get_min_value() == deserialized_td.get_min_value());
REQUIRE(td.get_max_value() == deserialized_td.get_max_value());
REQUIRE(td.get_rank(500) == deserialized_td.get_rank(500));
REQUIRE(td.get_quantile(0.5) == deserialized_td.get_quantile(0.5));
}
TEST_CASE("serialize deserialize bytes many values with buffer", "[tdigest]") {
tdigest<double> td(100);
for (int i = 0; i < 10000; ++i) td.update(i);
auto bytes = td.serialize();
auto deserialized_td = tdigest<double>::deserialize(bytes.data(), bytes.size());
REQUIRE(td.get_k() == deserialized_td.get_k());
REQUIRE(td.get_total_weight() == deserialized_td.get_total_weight());
REQUIRE(td.is_empty() == deserialized_td.is_empty());
REQUIRE(td.get_min_value() == deserialized_td.get_min_value());
REQUIRE(td.get_max_value() == deserialized_td.get_max_value());
REQUIRE(td.get_rank(500) == deserialized_td.get_rank(500));
REQUIRE(td.get_quantile(0.5) == deserialized_td.get_quantile(0.5));
}
TEST_CASE("serialize deserialize steam and bytes equivalence empty", "[tdigest]") {
tdigest<double> td(100);
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
td.serialize(s);
auto bytes = td.serialize();
REQUIRE(bytes.size() == static_cast<size_t>(s.tellp()));
for (size_t i = 0; i < bytes.size(); ++i) {
REQUIRE(((char*)bytes.data())[i] == (char)s.get());
}
s.seekg(0); // rewind
auto deserialized_td1 = tdigest<double>::deserialize(s);
auto deserialized_td2 = tdigest<double>::deserialize(bytes.data(), bytes.size());
REQUIRE(bytes.size() == static_cast<size_t>(s.tellg()));
REQUIRE(deserialized_td1.is_empty());
REQUIRE(deserialized_td2.is_empty());
REQUIRE(deserialized_td1.get_k() == 100);
REQUIRE(deserialized_td2.get_k() == 100);
REQUIRE(deserialized_td1.get_total_weight() == 0);
REQUIRE(deserialized_td2.get_total_weight() == 0);
}
TEST_CASE("serialize deserialize steam and bytes equivalence", "[tdigest]") {
tdigest<double> td(100);
const int n = 1000;
for (int i = 0; i < n; ++i) td.update(i);
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
td.serialize(s);
auto bytes = td.serialize();
REQUIRE(bytes.size() == static_cast<size_t>(s.tellp()));
for (size_t i = 0; i < bytes.size(); ++i) {
REQUIRE(((char*)bytes.data())[i] == (char)s.get());
}
s.seekg(0); // rewind
auto deserialized_td1 = tdigest<double>::deserialize(s);
auto deserialized_td2 = tdigest<double>::deserialize(bytes.data(), bytes.size());
REQUIRE(bytes.size() == static_cast<size_t>(s.tellg()));
REQUIRE_FALSE(deserialized_td1.is_empty());
REQUIRE(deserialized_td1.get_k() == 100);
REQUIRE(deserialized_td1.get_total_weight() == n);
REQUIRE(deserialized_td1.get_min_value() == 0);
REQUIRE(deserialized_td1.get_max_value() == n - 1);
REQUIRE_FALSE(deserialized_td2.is_empty());
REQUIRE(deserialized_td2.get_k() == 100);
REQUIRE(deserialized_td2.get_total_weight() == n);
REQUIRE(deserialized_td2.get_min_value() == 0);
REQUIRE(deserialized_td2.get_max_value() == n - 1);
REQUIRE(deserialized_td1.get_rank(n / 2) == deserialized_td2.get_rank(n / 2));
REQUIRE(deserialized_td1.get_quantile(0.5) == deserialized_td2.get_quantile(0.5));
}
TEST_CASE("serialize deserialize steam and bytes equivalence with buffer", "[tdigest]") {
tdigest<double> td(100);
const int n = 10000;
for (int i = 0; i < n; ++i) td.update(i);
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
td.serialize(s, true);
auto bytes = td.serialize(0, true);
REQUIRE(bytes.size() == static_cast<size_t>(s.tellp()));
for (size_t i = 0; i < bytes.size(); ++i) {
REQUIRE(((char*)bytes.data())[i] == (char)s.get());
}
s.seekg(0); // rewind
auto deserialized_td1 = tdigest<double>::deserialize(s);
auto deserialized_td2 = tdigest<double>::deserialize(bytes.data(), bytes.size());
REQUIRE(bytes.size() == static_cast<size_t>(s.tellg()));
REQUIRE_FALSE(deserialized_td1.is_empty());
REQUIRE(deserialized_td1.get_k() == 100);
REQUIRE(deserialized_td1.get_total_weight() == n);
REQUIRE(deserialized_td1.get_min_value() == 0);
REQUIRE(deserialized_td1.get_max_value() == n - 1);
REQUIRE_FALSE(deserialized_td2.is_empty());
REQUIRE(deserialized_td2.get_k() == 100);
REQUIRE(deserialized_td2.get_total_weight() == n);
REQUIRE(deserialized_td2.get_min_value() == 0);
REQUIRE(deserialized_td2.get_max_value() == n - 1);
REQUIRE(deserialized_td1.get_rank(n / 2) == deserialized_td2.get_rank(n / 2));
REQUIRE(deserialized_td1.get_quantile(0.5) == deserialized_td2.get_quantile(0.5));
}
TEST_CASE("deserialize from reference implementation stream double", "[tdigest]") {
std::ifstream is;
is.exceptions(std::ios::failbit | std::ios::badbit);
is.open(std::string(TEST_BINARY_INPUT_PATH) + "tdigest_ref_k100_n10000_double.sk", std::ios::binary);
const auto td = tdigest<double>::deserialize(is);
const size_t n = 10000;
REQUIRE(td.get_total_weight() == n);
REQUIRE(td.get_min_value() == 0);
REQUIRE(td.get_max_value() == n - 1);
REQUIRE(td.get_rank(0) == Approx(0).margin(0.0001));
REQUIRE(td.get_rank(n / 4) == Approx(0.25).margin(0.0001));
REQUIRE(td.get_rank(n / 2) == Approx(0.5).margin(0.0001));
REQUIRE(td.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001));
REQUIRE(td.get_rank(n) == 1);
}
TEST_CASE("deserialize from reference implementation stream float", "[tdigest]") {
std::ifstream is;
is.exceptions(std::ios::failbit | std::ios::badbit);
is.open(std::string(TEST_BINARY_INPUT_PATH) + "tdigest_ref_k100_n10000_float.sk", std::ios::binary);
const auto td = tdigest<float>::deserialize(is);
const size_t n = 10000;
REQUIRE(td.get_total_weight() == n);
REQUIRE(td.get_min_value() == 0);
REQUIRE(td.get_max_value() == n - 1);
REQUIRE(td.get_rank(0) == Approx(0).margin(0.0001));
REQUIRE(td.get_rank(n / 4) == Approx(0.25).margin(0.0001));
REQUIRE(td.get_rank(n / 2) == Approx(0.5).margin(0.0001));
REQUIRE(td.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001));
REQUIRE(td.get_rank(n) == 1);
}
TEST_CASE("deserialize from reference implementation bytes double", "[tdigest]") {
std::ifstream is;
is.exceptions(std::ios::failbit | std::ios::badbit);
is.open(std::string(TEST_BINARY_INPUT_PATH) + "tdigest_ref_k100_n10000_double.sk", std::ios::binary);
std::vector<char> bytes((std::istreambuf_iterator<char>(is)), (std::istreambuf_iterator<char>()));
const auto td = tdigest<double>::deserialize(bytes.data(), bytes.size());
const size_t n = 10000;
REQUIRE(td.get_total_weight() == n);
REQUIRE(td.get_min_value() == 0);
REQUIRE(td.get_max_value() == n - 1);
REQUIRE(td.get_rank(0) == Approx(0).margin(0.0001));
REQUIRE(td.get_rank(n / 4) == Approx(0.25).margin(0.0001));
REQUIRE(td.get_rank(n / 2) == Approx(0.5).margin(0.0001));
REQUIRE(td.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001));
REQUIRE(td.get_rank(n) == 1);
}
TEST_CASE("deserialize from reference implementation bytes float", "[tdigest]") {
std::ifstream is;
is.exceptions(std::ios::failbit | std::ios::badbit);
is.open(std::string(TEST_BINARY_INPUT_PATH) + "tdigest_ref_k100_n10000_float.sk", std::ios::binary);
std::vector<char> bytes((std::istreambuf_iterator<char>(is)), (std::istreambuf_iterator<char>()));
const auto td = tdigest<double>::deserialize(bytes.data(), bytes.size());
const size_t n = 10000;
REQUIRE(td.get_total_weight() == n);
REQUIRE(td.get_min_value() == 0);
REQUIRE(td.get_max_value() == n - 1);
REQUIRE(td.get_rank(0) == Approx(0).margin(0.0001));
REQUIRE(td.get_rank(n / 4) == Approx(0.25).margin(0.0001));
REQUIRE(td.get_rank(n / 2) == Approx(0.5).margin(0.0001));
REQUIRE(td.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001));
REQUIRE(td.get_rank(n) == 1);
}
} /* namespace datasketches */