be/test/util/tdigest_test.cpp - doris - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 #include "util/tdigest.h"

 #include <gtest/gtest.h>

 #include <random>

 #include "test_util/test_util.h"

 namespace doris {

 class TDigestTest : public ::testing::Test {
 protected:
     // You can remove any or all of the following functions if its body
     // is empty.
     TDigestTest() {
         // You can do set-up work for each test here.
     }

     virtual ~TDigestTest() {
         // You can do clean-up work that doesn't throw exceptions here.
     }

     // If the constructor and destructor are not enough for setting up
     // and cleaning up each test, you can define the following methods:

     virtual void SetUp() {
         // Code here will be called immediately after the constructor (right
         // before each test).
     }

     virtual void TearDown() {
         // Code here will be called immediately after each test (right
         // before the destructor).
     }

     static void SetUpTestCase() {
         static bool initialized = false;
         if (!initialized) {
             FLAGS_logtostderr = true;
             google::InstallFailureSignalHandler();
             google::InitGoogleLogging("testing::TDigestTest");
             initialized = true;
         }
     }

     // Objects declared here can be used by all tests in the test case for Foo.
 };

 static double quantile(const double q, const std::vector<double>& values) {
     double q1;
     if (values.size() == 0) {
         q1 = NAN;
     } else if (q == 1 || values.size() == 1) {
         q1 = values[values.size() - 1];
     } else {
         auto index = q * values.size();
         if (index < 0.5) {
             q1 = values[0];
         } else if (values.size() - index < 0.5) {
             q1 = values[values.size() - 1];
         } else {
             index -= 0.5;
             const int intIndex = static_cast<int>(index);
             q1 = values[intIndex + 1] * (index - intIndex) +
                  values[intIndex] * (intIndex + 1 - index);
         }
     }
     return q1;
 }

 TEST_F(TDigestTest, CrashAfterMerge) {
     TDigest digest(1000);
     std::uniform_real_distribution<> reals(0.0, 1.0);
     std::random_device gen;
     for (int i = 0; i < LOOP_LESS_OR_MORE(100, 100000); i++) {
         digest.add(reals(gen));
     }
     digest.compress();

     TDigest digest2(1000);
     digest2.merge(&digest);
     digest2.quantile(0.5);
 }

 TEST_F(TDigestTest, EmptyDigest) {
     TDigest digest(100);
     EXPECT_EQ(0, digest.processed().size());
 }

 TEST_F(TDigestTest, SingleValue) {
     TDigest digest(100);
     std::random_device gen;
     std::uniform_real_distribution<> dist(0, 1000);
     const auto value = dist(gen);
     digest.add(value);
     std::uniform_real_distribution<> dist2(0, 1.0);
     const double q = dist2(gen);
     EXPECT_NEAR(value, digest.quantile(0.0), 0.001f);
     EXPECT_NEAR(value, digest.quantile(q), 0.001f);
     EXPECT_NEAR(value, digest.quantile(1.0), 0.001f);
 }

 TEST_F(TDigestTest, FewValues) {
     // When there are few values in the tree, quantiles should be exact
     TDigest digest(1000);

     std::random_device gen;
     std::uniform_real_distribution<> reals(0.0, 100.0);
     std::uniform_int_distribution<> dist(0, 10);
     std::uniform_int_distribution<> bools(0, 1);
     std::uniform_real_distribution<> qvalue(0.0, 1.0);

     const auto length = 10; //dist(gen);

     std::vector<double> values;
     values.reserve(length);
     for (int i = 0; i < length; ++i) {
         auto const value = (i == 0 || bools(gen)) ? reals(gen) : values[i - 1];
         digest.add(value);
         values.push_back(value);
     }
     std::sort(values.begin(), values.end());
     digest.compress();

     EXPECT_EQ(digest.processed().size(), values.size());

     std::vector<double> testValues{0.0, 1.0e-10, qvalue(gen), 0.5, 1.0 - 1e-10, 1.0};
     for (auto q : testValues) {
         double q1 = quantile(q, values);
         auto q2 = digest.quantile(q);
         if (std::isnan(q1)) {
             EXPECT_TRUE(std::isnan(q2));
         } else {
             EXPECT_NEAR(q1, q2, 0.03) << "q = " << q;
         }
     }
 }

 TEST_F(TDigestTest, MoreThan2BValues) {
     TDigest digest(1000);

     std::random_device gen;
     std::uniform_real_distribution<> reals(0.0, 1.0);
     for (int i = 0; i < 1000; ++i) {
         const double next = reals(gen);
         digest.add(next);
     }
     for (int i = 0; i < 10; ++i) {
         const double next = reals(gen);
         const auto count = 1L << 28;
         digest.add(next, count);
     }
     EXPECT_EQ(static_cast<long>(1000 + float(10L * (1 << 28))), digest.totalWeight());
     EXPECT_GT(digest.totalWeight(), std::numeric_limits<int32_t>::max());
     std::vector<double> quantiles{0, 0.1, 0.5, 0.9, 1, reals(gen)};
     std::sort(quantiles.begin(), quantiles.end());
     auto prev = std::numeric_limits<double>::min();
     for (double q : quantiles) {
         const double v = digest.quantile(q);
         EXPECT_GE(v, prev) << "q = " << q;
         prev = v;
     }
 }

 TEST_F(TDigestTest, MergeTest) {
     TDigest digest1(1000);
     TDigest digest2(1000);

     digest2.add(std::vector<const TDigest*>{&digest1});
 }

 TEST_F(TDigestTest, TestSorted) {
     TDigest digest(1000);
     std::uniform_real_distribution<> reals(0.0, 1.0);
     std::uniform_int_distribution<> ints(0, 10);

     std::random_device gen;
     for (int i = 0; i < 10000; ++i) {
         digest.add(reals(gen), 1 + ints(gen));
     }
     digest.compress();
     Centroid previous(0, 0);
     for (auto centroid : digest.processed()) {
         if (previous.weight() != 0) {
             CHECK_LE(previous.mean(), centroid.mean());
         }
         previous = centroid;
     }
 }

 TEST_F(TDigestTest, ExtremeQuantiles) {
     TDigest digest(1000);
     // t-digest shouldn't merge extreme nodes, but let's still test how it would
     // answer to extreme quantiles in that case ('extreme' in the sense that the
     // quantile is either before the first node or after the last one)

     digest.add(10, 3);
     digest.add(20, 1);
     digest.add(40, 5);
     // this group tree is roughly equivalent to the following sorted array:
     // [ ?, 10, ?, 20, ?, ?, 50, ?, ? ]
     // and we expect it to compute approximate missing values:
     // [ 5, 10, 15, 20, 30, 40, 50, 60, 70]
     std::vector<double> values{5.0, 10.0, 15.0, 20.0, 30.0, 35.0, 40.0, 45.0, 50.0};
     std::vector<double> quantiles{1.5 / 9.0, 3.5 / 9.0, 6.5 / 9.0};
     for (auto q : quantiles) {
         EXPECT_NEAR(quantile(q, values), digest.quantile(q), 0.01) << "q = " << q;
     }
 }

 TEST_F(TDigestTest, Montonicity) {
     TDigest digest(1000);
     std::uniform_real_distribution<> reals(0.0, 1.0);
     std::random_device gen;
     for (int i = 0; i < LOOP_LESS_OR_MORE(10, 100000); i++) {
         digest.add(reals(gen));
     }

     double lastQuantile = -1;
     double lastX = -1;
     for (double z = 0; z <= 1; z += LOOP_LESS_OR_MORE(0.1, 1e-5)) {
         double x = digest.quantile(z);
         EXPECT_GE(x, lastX);
         lastX = x;

         double q = digest.cdf(z);
         EXPECT_GE(q, lastQuantile);
         lastQuantile = q;
     }
 }

 } // namespace doris

 int main(int argc, char** argv) {
     ::testing::InitGoogleTest(&argc, argv);
     return RUN_ALL_TESTS();
 }
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.

	#include "util/tdigest.h"

	#include <gtest/gtest.h>

	#include <random>

	#include "test_util/test_util.h"

	namespace doris {

	class TDigestTest : public ::testing::Test {
	protected:
	// You can remove any or all of the following functions if its body
	// is empty.
	TDigestTest() {
	// You can do set-up work for each test here.
	}

	virtual ~TDigestTest() {
	// You can do clean-up work that doesn't throw exceptions here.
	}

	// If the constructor and destructor are not enough for setting up
	// and cleaning up each test, you can define the following methods:

	virtual void SetUp() {
	// Code here will be called immediately after the constructor (right
	// before each test).
	}

	virtual void TearDown() {
	// Code here will be called immediately after each test (right
	// before the destructor).
	}

	static void SetUpTestCase() {
	static bool initialized = false;
	if (!initialized) {
	FLAGS_logtostderr = true;
	google::InstallFailureSignalHandler();
	google::InitGoogleLogging("testing::TDigestTest");
	initialized = true;
	}
	}

	// Objects declared here can be used by all tests in the test case for Foo.
	};

	static double quantile(const double q, const std::vector<double>& values) {
	double q1;
	if (values.size() == 0) {
	q1 = NAN;
	} else if (q == 1 \|\| values.size() == 1) {
	q1 = values[values.size() - 1];
	} else {
	auto index = q * values.size();
	if (index < 0.5) {
	q1 = values[0];
	} else if (values.size() - index < 0.5) {
	q1 = values[values.size() - 1];
	} else {
	index -= 0.5;
	const int intIndex = static_cast<int>(index);
	q1 = values[intIndex + 1] * (index - intIndex) +
	values[intIndex] * (intIndex + 1 - index);
	}
	}
	return q1;
	}

	TEST_F(TDigestTest, CrashAfterMerge) {
	TDigest digest(1000);
	std::uniform_real_distribution<> reals(0.0, 1.0);
	std::random_device gen;
	for (int i = 0; i < LOOP_LESS_OR_MORE(100, 100000); i++) {
	digest.add(reals(gen));
	}
	digest.compress();

	TDigest digest2(1000);
	digest2.merge(&digest);
	digest2.quantile(0.5);
	}

	TEST_F(TDigestTest, EmptyDigest) {
	TDigest digest(100);
	EXPECT_EQ(0, digest.processed().size());
	}

	TEST_F(TDigestTest, SingleValue) {
	TDigest digest(100);
	std::random_device gen;
	std::uniform_real_distribution<> dist(0, 1000);
	const auto value = dist(gen);
	digest.add(value);
	std::uniform_real_distribution<> dist2(0, 1.0);
	const double q = dist2(gen);
	EXPECT_NEAR(value, digest.quantile(0.0), 0.001f);
	EXPECT_NEAR(value, digest.quantile(q), 0.001f);
	EXPECT_NEAR(value, digest.quantile(1.0), 0.001f);
	}

	TEST_F(TDigestTest, FewValues) {
	// When there are few values in the tree, quantiles should be exact
	TDigest digest(1000);

	std::random_device gen;
	std::uniform_real_distribution<> reals(0.0, 100.0);
	std::uniform_int_distribution<> dist(0, 10);
	std::uniform_int_distribution<> bools(0, 1);
	std::uniform_real_distribution<> qvalue(0.0, 1.0);

	const auto length = 10; //dist(gen);

	std::vector<double> values;
	values.reserve(length);
	for (int i = 0; i < length; ++i) {
	auto const value = (i == 0 \|\| bools(gen)) ? reals(gen) : values[i - 1];
	digest.add(value);
	values.push_back(value);
	}
	std::sort(values.begin(), values.end());
	digest.compress();

	EXPECT_EQ(digest.processed().size(), values.size());

	std::vector<double> testValues{0.0, 1.0e-10, qvalue(gen), 0.5, 1.0 - 1e-10, 1.0};
	for (auto q : testValues) {
	double q1 = quantile(q, values);
	auto q2 = digest.quantile(q);
	if (std::isnan(q1)) {
	EXPECT_TRUE(std::isnan(q2));
	} else {
	EXPECT_NEAR(q1, q2, 0.03) << "q = " << q;
	}
	}
	}

	TEST_F(TDigestTest, MoreThan2BValues) {
	TDigest digest(1000);

	std::random_device gen;
	std::uniform_real_distribution<> reals(0.0, 1.0);
	for (int i = 0; i < 1000; ++i) {
	const double next = reals(gen);
	digest.add(next);
	}
	for (int i = 0; i < 10; ++i) {
	const double next = reals(gen);
	const auto count = 1L << 28;
	digest.add(next, count);
	}
	EXPECT_EQ(static_cast<long>(1000 + float(10L * (1 << 28))), digest.totalWeight());
	EXPECT_GT(digest.totalWeight(), std::numeric_limits<int32_t>::max());
	std::vector<double> quantiles{0, 0.1, 0.5, 0.9, 1, reals(gen)};
	std::sort(quantiles.begin(), quantiles.end());
	auto prev = std::numeric_limits<double>::min();
	for (double q : quantiles) {
	const double v = digest.quantile(q);
	EXPECT_GE(v, prev) << "q = " << q;
	prev = v;
	}
	}

	TEST_F(TDigestTest, MergeTest) {
	TDigest digest1(1000);
	TDigest digest2(1000);

	digest2.add(std::vector<const TDigest*>{&digest1});
	}

	TEST_F(TDigestTest, TestSorted) {
	TDigest digest(1000);
	std::uniform_real_distribution<> reals(0.0, 1.0);
	std::uniform_int_distribution<> ints(0, 10);

	std::random_device gen;
	for (int i = 0; i < 10000; ++i) {
	digest.add(reals(gen), 1 + ints(gen));
	}
	digest.compress();
	Centroid previous(0, 0);
	for (auto centroid : digest.processed()) {
	if (previous.weight() != 0) {
	CHECK_LE(previous.mean(), centroid.mean());
	}
	previous = centroid;
	}
	}

	TEST_F(TDigestTest, ExtremeQuantiles) {
	TDigest digest(1000);
	// t-digest shouldn't merge extreme nodes, but let's still test how it would
	// answer to extreme quantiles in that case ('extreme' in the sense that the
	// quantile is either before the first node or after the last one)

	digest.add(10, 3);
	digest.add(20, 1);
	digest.add(40, 5);
	// this group tree is roughly equivalent to the following sorted array:
	// [ ?, 10, ?, 20, ?, ?, 50, ?, ? ]
	// and we expect it to compute approximate missing values:
	// [ 5, 10, 15, 20, 30, 40, 50, 60, 70]
	std::vector<double> values{5.0, 10.0, 15.0, 20.0, 30.0, 35.0, 40.0, 45.0, 50.0};
	std::vector<double> quantiles{1.5 / 9.0, 3.5 / 9.0, 6.5 / 9.0};
	for (auto q : quantiles) {
	EXPECT_NEAR(quantile(q, values), digest.quantile(q), 0.01) << "q = " << q;
	}
	}

	TEST_F(TDigestTest, Montonicity) {
	TDigest digest(1000);
	std::uniform_real_distribution<> reals(0.0, 1.0);
	std::random_device gen;
	for (int i = 0; i < LOOP_LESS_OR_MORE(10, 100000); i++) {
	digest.add(reals(gen));
	}

	double lastQuantile = -1;
	double lastX = -1;
	for (double z = 0; z <= 1; z += LOOP_LESS_OR_MORE(0.1, 1e-5)) {
	double x = digest.quantile(z);
	EXPECT_GE(x, lastX);
	lastX = x;

	double q = digest.cdf(z);
	EXPECT_GE(q, lastQuantile);
	lastQuantile = q;
	}
	}

	} // namespace doris

	int main(int argc, char** argv) {
	::testing::InitGoogleTest(&argc, argv);
	return RUN_ALL_TESTS();
	}