blob: b454fabd6ce4a1a699ae8021e05920fc718c8df4 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "vec/utils/histogram_helpers.hpp"
#include <gtest/gtest-message.h>
#include <gtest/gtest-test-part.h>
#include <algorithm>
#include <map>
#include <vector>
#include "gtest/gtest_pred_impl.h"
#include "vec/data_types/data_type_number.h"
namespace doris::vectorized {
// Define a helper function to create a test map with specified number of distinct values
// and total counts.
template <typename T>
std::map<T, size_t> create_test_map(const std::vector<T>& values,
const std::vector<size_t>& counts) {
std::map<T, size_t> test_map;
for (size_t i = 0; i < values.size(); ++i) {
test_map[values[i]] = counts[i];
}
return test_map;
}
/******************test build_histogram******************/
// Test case 1: Test when input map is empty.
TEST(BuildHistogramTest, empty_map) {
std::vector<Bucket<int>> buckets;
std::map<int, size_t> ordered_map;
size_t num_buckets = 10;
EXPECT_FALSE(build_histogram(buckets, ordered_map, num_buckets));
EXPECT_EQ(buckets.size(), 0);
}
// Test case 2: Test when input map has only one element.
TEST(BuildHistogramTest, single_element) {
std::vector<Bucket<int>> buckets;
std::map<int, size_t> ordered_map = {{1, 5}};
size_t num_buckets = 10;
EXPECT_TRUE(build_histogram(buckets, ordered_map, num_buckets));
EXPECT_EQ(buckets.size(), 1);
EXPECT_EQ(buckets[0].lower, 1);
EXPECT_EQ(buckets[0].upper, 1);
EXPECT_EQ(buckets[0].ndv, 1);
EXPECT_EQ(buckets[0].count, 5);
EXPECT_EQ(buckets[0].pre_sum, 0);
}
// Test case 3: Test when num_buckets >= number of distinct elements in input map.
TEST(BuildHistogramTest, enough_buckets) {
std::vector<Bucket<int>> buckets;
std::map<int, size_t> test_map = create_test_map<int>({1, 2, 3}, {10, 20, 30});
size_t num_buckets = test_map.size();
EXPECT_TRUE(build_histogram(buckets, test_map, num_buckets));
EXPECT_EQ(buckets.size(), num_buckets);
EXPECT_EQ(buckets[0].lower, 1);
EXPECT_EQ(buckets[0].upper, 1);
EXPECT_EQ(buckets[0].ndv, 1);
EXPECT_EQ(buckets[0].count, 10);
EXPECT_EQ(buckets[0].pre_sum, 0);
EXPECT_EQ(buckets[1].lower, 2);
EXPECT_EQ(buckets[1].upper, 2);
EXPECT_EQ(buckets[1].ndv, 1);
EXPECT_EQ(buckets[1].count, 20);
EXPECT_EQ(buckets[1].pre_sum, 10);
EXPECT_EQ(buckets[2].lower, 3);
EXPECT_EQ(buckets[2].upper, 3);
EXPECT_EQ(buckets[2].ndv, 1);
EXPECT_EQ(buckets[2].count, 30);
EXPECT_EQ(buckets[2].pre_sum, 30);
}
// Test case 4: Test when num_buckets < number of distinct elements in input map.
TEST(BuildHistogramTest, not_enough_buckets) {
std::vector<Bucket<int>> buckets;
std::map<int, size_t> test_map = create_test_map<int>({1, 2, 3, 4}, {5, 10, 20, 30});
size_t num_buckets = 2;
EXPECT_TRUE(build_histogram(buckets, test_map, num_buckets));
EXPECT_EQ(buckets.size(), num_buckets);
EXPECT_EQ(buckets[0].lower, 1);
EXPECT_EQ(buckets[0].upper, 3);
EXPECT_EQ(buckets[0].ndv, 3);
EXPECT_EQ(buckets[0].count, 35);
EXPECT_EQ(buckets[0].pre_sum, 0);
EXPECT_EQ(buckets[1].lower, 4);
EXPECT_EQ(buckets[1].upper, 4);
EXPECT_EQ(buckets[1].ndv, 1);
EXPECT_EQ(buckets[1].count, 30);
EXPECT_EQ(buckets[1].pre_sum, 35);
}
// Test case 5: Test when the sum of counts of two adjacent elements is larger than a single bucket's capacity.
TEST(BuildHistogramTest, two_adjacent_values_larger_than_capacity) {
std::vector<Bucket<int>> buckets;
std::map<int, size_t> test_map = create_test_map<int>({1, 2, 3, 4}, {20, 30, 40, 50});
size_t num_buckets = 3;
EXPECT_TRUE(build_histogram(buckets, test_map, num_buckets));
EXPECT_EQ(buckets[0].lower, 1);
EXPECT_EQ(buckets[0].upper, 2);
EXPECT_EQ(buckets.size(), num_buckets);
EXPECT_EQ(buckets[0].ndv, 2);
EXPECT_EQ(buckets[0].count, 50);
EXPECT_EQ(buckets[0].pre_sum, 0);
EXPECT_EQ(buckets[1].lower, 3);
EXPECT_EQ(buckets[1].upper, 3);
EXPECT_EQ(buckets[1].ndv, 1);
EXPECT_EQ(buckets[1].count, 40);
EXPECT_EQ(buckets[1].pre_sum, 50);
EXPECT_EQ(buckets[2].lower, 4);
EXPECT_EQ(buckets[2].upper, 4);
EXPECT_EQ(buckets[2].ndv, 1);
EXPECT_EQ(buckets[2].count, 50);
EXPECT_EQ(buckets[2].pre_sum, 90);
}
// Test case 6: Test when the sum of counts of all elements is smaller than a single bucket's capacity.
TEST(BuildHistogramTest, all_values_in_one_bucket) {
std::vector<Bucket<int>> buckets;
std::map<int, size_t> test_map = create_test_map<int>({1, 2, 3}, {5, 10, 15});
size_t num_buckets = 1;
EXPECT_TRUE(build_histogram(buckets, test_map, num_buckets));
EXPECT_EQ(buckets[0].lower, 1);
EXPECT_EQ(buckets[0].upper, 3);
EXPECT_EQ(buckets.size(), num_buckets);
EXPECT_EQ(buckets[0].ndv, 3);
EXPECT_EQ(buckets[0].count, 30);
EXPECT_EQ(buckets[0].pre_sum, 0);
}
// Test case 7: Test when the sum of counts of all elements is larger than the total capacity of all buckets.
TEST(BuildHistogramTest, all_values_in_multiple_buckets) {
std::vector<Bucket<int>> buckets;
std::map<int, size_t> test_map =
create_test_map<int>({1, 2, 3, 4, 5, 6}, {100, 200, 300, 400, 500, 600});
size_t num_buckets = 4;
EXPECT_TRUE(build_histogram(buckets, test_map, num_buckets));
// Check if the buckets are constructed correctly
EXPECT_EQ(buckets.size(), num_buckets);
EXPECT_EQ(buckets[0].lower, 1);
EXPECT_EQ(buckets[0].upper, 3);
EXPECT_EQ(buckets[0].ndv, 3);
EXPECT_EQ(buckets[0].count, 600);
EXPECT_EQ(buckets[0].pre_sum, 0);
EXPECT_EQ(buckets[1].lower, 4);
EXPECT_EQ(buckets[1].upper, 4);
EXPECT_EQ(buckets[1].ndv, 1);
EXPECT_EQ(buckets[1].count, 400);
EXPECT_EQ(buckets[1].pre_sum, 600);
EXPECT_EQ(buckets[2].lower, 5);
EXPECT_EQ(buckets[2].upper, 5);
EXPECT_EQ(buckets[2].ndv, 1);
EXPECT_EQ(buckets[2].count, 500);
EXPECT_EQ(buckets[2].pre_sum, 1000);
EXPECT_EQ(buckets[3].lower, 6);
EXPECT_EQ(buckets[3].upper, 6);
EXPECT_EQ(buckets[3].ndv, 1);
EXPECT_EQ(buckets[3].count, 600);
EXPECT_EQ(buckets[3].pre_sum, 1500);
// Check if the pre_sum values are correct.
std::vector<size_t> pre_sum(num_buckets + 1, 0);
for (size_t i = 0; i < num_buckets; ++i) {
pre_sum[i + 1] = pre_sum[i] + buckets[i].count;
}
for (size_t i = 0; i < buckets.size(); ++i) {
EXPECT_EQ(buckets[i].pre_sum, pre_sum[i]);
}
}
/*****************test histogram_to_json*****************/
// Test case 1: Empty buckets
TEST(HistogramToJsonTest, empty_buckets) {
rapidjson::StringBuffer buffer;
std::vector<Bucket<int>> buckets;
auto data_type = std::make_shared<DataTypeInt8>();
bool result = histogram_to_json(buffer, buckets, data_type);
std::string result_str = std::string(buffer.GetString());
std::string expect_result_str = "{\"num_buckets\":0,\"buckets\":[]}";
EXPECT_FALSE(result);
EXPECT_EQ(result_str, expect_result_str);
}
// Test case 2: Non-empty buckets
TEST(HistogramToJsonTest, non_empty_buckets) {
rapidjson::StringBuffer buffer;
std::vector<Bucket<int>> buckets = {{0, 10, 5, 20, 100}, {10, 20, 10, 30, 200}};
auto data_type = std::make_shared<DataTypeInt8>();
bool result = histogram_to_json(buffer, buckets, data_type);
std::string result_str = std::string(buffer.GetString());
std::string expect_result_str =
"{\"num_buckets\":2,\"buckets\":[{\"lower\":\"0\",\"upper\":\"10\",\"ndv\":5,"
"\"count\":20,\"pre_sum\":100},{\"lower\":\"10\",\"upper\":\"20\",\"ndv\":10,"
"\"count\":30,\"pre_sum\":200}]}";
EXPECT_TRUE(result);
EXPECT_EQ(result_str, expect_result_str);
}
/************test calculate_bucket_max_values************/
// Test case 1: Test with a map of size 1 and num_buckets = 1
TEST(CalculateBucketMaxValuesTest, single_value_one_bucket) {
std::map<int, size_t> value_map = {{1, 1}};
const size_t num_buckets = 1;
const size_t expected_max_values = 1;
EXPECT_EQ(calculate_bucket_max_values(value_map, num_buckets), expected_max_values);
}
// Test case 2: Test with a map of size 1 and num_buckets > 1
TEST(CalculateBucketMaxValuesTest, single_value_multiple_buckets) {
std::map<int, size_t> value_map = {{1, 1}};
const size_t num_buckets = 10;
const size_t expected_max_values = 1;
EXPECT_EQ(calculate_bucket_max_values(value_map, num_buckets), expected_max_values);
}
// Test case 3: Test with a map of size 2 and num_buckets = 1
TEST(CalculateBucketMaxValuesTest, multiple_values_one_bucket) {
std::map<int, size_t> value_map = {{1, 1}, {2, 2}};
const size_t num_buckets = 1;
const size_t expected_max_values = 3;
EXPECT_EQ(calculate_bucket_max_values(value_map, num_buckets), expected_max_values);
}
// Test case 4: Test with a map of size 2 and num_buckets > 1
TEST(CalculateBucketMaxValuesTest, multiple_values_multiple_buckets) {
std::map<int, size_t> value_map = {{1, 1}, {2, 2}};
const size_t num_buckets = 2;
const size_t expected_max_values = 1;
EXPECT_EQ(calculate_bucket_max_values(value_map, num_buckets), expected_max_values);
}
// Test case 5: Test with a map of size 3 and num_buckets > 1
TEST(CalculateBucketMaxValuesTest, multiple_values_multiple_buckets2) {
std::map<int, size_t> value_map = create_test_map<int>({1, 2, 3}, {2, 2, 2});
const size_t num_buckets = 3;
const size_t expected_max_values = 2;
EXPECT_EQ(calculate_bucket_max_values(value_map, num_buckets), expected_max_values);
}
// Test case 6: Test with a map of size 3 and num_buckets > 1
TEST(CalculateBucketMaxValuesTest, multiple_values_multiple_buckets3) {
std::map<int, size_t> value_map = create_test_map<int>({1, 2, 3}, {2, 2, 2});
const size_t num_buckets = 4;
const size_t expected_max_values = 1;
EXPECT_EQ(calculate_bucket_max_values(value_map, num_buckets), expected_max_values);
}
/**************test can_assign_into_buckets**************/
// Test case 1: Test with empty input map.
TEST(CanAssignIntoBucketsTest, empty_map) {
std::map<int, size_t> empty_map;
EXPECT_FALSE(can_assign_into_buckets(empty_map, 1, 1));
}
// Test case 2: Test with a single element in input map.
TEST(CanAssignIntoBucketsTest, one_element_map) {
std::map<int, size_t> test_map = {{42, 10}};
EXPECT_FALSE(can_assign_into_buckets(test_map, 1, 1));
EXPECT_FALSE(can_assign_into_buckets(test_map, 5, 1));
EXPECT_TRUE(can_assign_into_buckets(test_map, 5, 2));
}
// Test case 3: Test with enough buckets to store all distinct elements.
TEST(CanAssignIntoBucketsTest, enough_buckets) {
std::map<int, size_t> test_map = create_test_map<int>({1, 2, 3}, {10, 20, 30});
EXPECT_FALSE(can_assign_into_buckets(test_map, 1, 3));
EXPECT_TRUE(can_assign_into_buckets(test_map, 10, 3));
EXPECT_TRUE(can_assign_into_buckets(test_map, 20, 3));
}
// Test case 4: Test with not enough buckets to store all distinct elements.
TEST(CanAssignIntoBucketsTest, not_enough_buckets) {
std::map<int, size_t> test_map = create_test_map<int>({1, 2, 3, 4}, {5, 10, 15, 20});
EXPECT_TRUE(can_assign_into_buckets(test_map, 30, 2));
EXPECT_FALSE(can_assign_into_buckets(test_map, 30, 1));
}
// Test case 5: Test with all values in a single bucket.
TEST(CanAssignIntoBucketsTest, all_values_in_one_bucket) {
std::map<int, size_t> test_map = create_test_map<int>({1, 2, 3, 4}, {5, 10, 15, 20});
EXPECT_TRUE(can_assign_into_buckets(test_map, 100, 1));
EXPECT_FALSE(can_assign_into_buckets(test_map, 30, 1));
}
// Test case 6: Test with a single element that has a count greater than the total bucket capacity.
TEST(CanAssignIntoBucketsTest, single_element_larger_than_bucket_capacity) {
std::map<int, size_t> test_map = {{42, 100}};
EXPECT_FALSE(can_assign_into_buckets(test_map, 99, 1));
EXPECT_TRUE(can_assign_into_buckets(test_map, 100, 1));
EXPECT_TRUE(can_assign_into_buckets(test_map, 200, 1));
}
} // namespace doris::vectorized