blob: 14b42c51819f7973d4732a10813316d60749ae51 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "vec/columns/column_dictionary.h"
#include <gmock/gmock-more-matchers.h>
#include <gtest/gtest.h>
#include <cstddef>
#include <cstdint>
#include "common/exception.h"
#include "runtime/primitive_type.h"
#include "vec/columns/column_vector.h"
#include "vec/columns/common_column_test.h"
#include "vec/common/sip_hash.h"
#include "vec/common/string_ref.h"
#include "vec/core/field.h"
#include "vec/core/types.h"
#include "vec/data_types/data_type_factory.hpp"
#include "vec/data_types/data_type_string.h"
using namespace doris;
namespace doris::vectorized {
class ColumnDictionaryTest : public CommonColumnTest {
protected:
static std::string test_data_dir;
static std::string test_result_dir;
static void SetUpTestSuite() {
// void SetUp() override {
auto root_dir = std::string(getenv("ROOT"));
test_data_dir = root_dir + "/be/test/data/vec/columns";
test_result_dir = root_dir + "/be/test/expected_result/vec/columns";
dt_str = DataTypeFactory::instance().create_data_type(FieldType::OLAP_FIELD_TYPE_STRING, 0,
0);
dt_int32 =
DataTypeFactory::instance().create_data_type(FieldType::OLAP_FIELD_TYPE_INT, 0, 0);
column_dict_data = ColumnString::create();
column_dict_indices = ColumnInt32::create();
column_dict_char = ColumnDictI32::create(FieldType::OLAP_FIELD_TYPE_CHAR);
EXPECT_TRUE(column_dict_char->is_dict_empty());
column_dict_varchar = ColumnDictI32::create(FieldType::OLAP_FIELD_TYPE_STRING);
column_dict_str = ColumnDictI32::create(FieldType::OLAP_FIELD_TYPE_STRING);
load_columns_data();
}
static DataTypePtr dt_str;
static DataTypePtr dt_int32;
static ColumnString::MutablePtr column_dict_data;
static ColumnInt32::MutablePtr column_dict_indices;
static std::vector<StringRef> dict_array;
static ColumnDictI32::MutablePtr column_dict_char;
static ColumnDictI32::MutablePtr column_dict_varchar;
static ColumnDictI32::MutablePtr column_dict_str;
static void load_columns_data() {
std::cout << "loading test dataset" << std::endl;
{
MutableColumns columns;
columns.push_back(column_dict_data->get_ptr());
DataTypeSerDeSPtrs serde = {dt_str->get_serde()};
std::string data_file = test_data_dir + "/DICT_STR.csv";
load_columns_data_from_file(columns, serde, ';', {0}, data_file);
}
{
MutableColumns columns;
columns.push_back(column_dict_indices->get_ptr());
DataTypeSerDeSPtrs serde = {dt_int32->get_serde()};
std::string data_file = test_data_dir + "/DICT_INDICES.csv";
load_columns_data_from_file(columns, serde, ';', {0}, data_file);
}
const auto& dict_indices_data = column_dict_indices->get_data();
auto dict_data_row_count = column_dict_data->size();
auto dict_indices_row_count = column_dict_indices->size();
std::cout << "dict size: " << dict_data_row_count
<< ", dict indices size: " << dict_indices_row_count << std::endl;
dict_array.resize(dict_data_row_count);
for (size_t i = 0; i != dict_data_row_count; ++i) {
auto dict_item = column_dict_data->get_data_at(i);
dict_array[i].data = dict_item.data;
dict_array[i].size = dict_item.size;
}
auto check_func = [&](auto& col) {
col->reserve(dict_indices_row_count);
col->insert_many_dict_data(column_dict_indices->get_data().data(), 0, dict_array.data(),
dict_indices_row_count, dict_data_row_count);
(void)col->dict_debug_string();
EXPECT_EQ(col->dict_size(), dict_data_row_count);
EXPECT_EQ(col->size(), dict_indices_row_count);
for (size_t i = 0; i < dict_indices_row_count; ++i) {
EXPECT_EQ(col->get_value(dict_indices_data[i]), dict_array[dict_indices_data[i]]);
}
};
check_func(column_dict_char);
check_func(column_dict_varchar);
check_func(column_dict_str);
}
};
TEST_F(ColumnDictionaryTest, is_column_dictionary) {
EXPECT_TRUE(column_dict_char->is_column_dictionary());
EXPECT_TRUE(column_dict_varchar->is_column_dictionary());
EXPECT_TRUE(column_dict_str->is_column_dictionary());
}
TEST_F(ColumnDictionaryTest, get_data_at) {
EXPECT_THROW(column_dict_char->get_data_at(0), Exception);
EXPECT_THROW(column_dict_varchar->get_data_at(1), Exception);
EXPECT_THROW(column_dict_str->get_data_at(2), Exception);
}
TEST_F(ColumnDictionaryTest, insert_from) {
EXPECT_THROW(column_dict_char->insert_from(*column_dict_char, 0), Exception);
EXPECT_THROW(column_dict_varchar->insert_from(*column_dict_char, 1), Exception);
EXPECT_THROW(column_dict_str->insert_from(*column_dict_char, 2), Exception);
}
TEST_F(ColumnDictionaryTest, insert_range_from) {
EXPECT_THROW(column_dict_char->insert_range_from(*column_dict_char, 0, 1), Exception);
EXPECT_THROW(column_dict_varchar->insert_range_from(*column_dict_char, 1, 2), Exception);
EXPECT_THROW(column_dict_str->insert_range_from(*column_dict_char, 3, 4), Exception);
}
TEST_F(ColumnDictionaryTest, insert_indices_from) {
EXPECT_THROW(column_dict_char->insert_indices_from(*column_dict_char, nullptr, nullptr),
Exception);
EXPECT_THROW(column_dict_varchar->insert_indices_from(*column_dict_char, nullptr, nullptr),
Exception);
EXPECT_THROW(column_dict_str->insert_indices_from(*column_dict_char, nullptr, nullptr),
Exception);
}
TEST_F(ColumnDictionaryTest, update_hash_with_value) {
SipHash h;
EXPECT_THROW(column_dict_char->update_hash_with_value(0, h), Exception);
EXPECT_THROW(column_dict_varchar->update_hash_with_value(1, h), Exception);
EXPECT_THROW(column_dict_str->update_hash_with_value(2, h), Exception);
}
TEST_F(ColumnDictionaryTest, insert_data) {
EXPECT_THROW(column_dict_char->insert_data(nullptr, 0), Exception);
}
/*
TEST_F(ColumnDictionaryTest, insert_default) {
auto test_func = [](const ColumnDictI32::MutablePtr& source_column) {
auto* col_vec_src = assert_cast<ColumnDictI32*>(source_column.get());
auto src_size = source_column->size();
auto check_func = [&](size_t clone_count) {
size_t actual_clone_count = std::min(clone_count, src_size);
auto target_column = source_column->clone_resized(actual_clone_count);
auto* col_vec_target = assert_cast<ColumnDictI32*>(target_column.get());
const auto& codes_data = col_vec_target->get_data();
col_vec_target->insert_default();
auto target_size = col_vec_target->size();
EXPECT_EQ(target_size, actual_clone_count + 1);
size_t i = 0;
for (; i < actual_clone_count; ++i) {
EXPECT_EQ(col_vec_target->get_value(codes_data[i]),
col_vec_src->get_value(codes_data[i]));
}
EXPECT_EQ(col_vec_target->get_value(codes_data[i]), StringRef {});
};
check_func(0);
check_func(10);
};
test_func(column_dict_char);
test_func(column_dict_varchar);
test_func(column_dict_str);
}
*/
TEST_F(ColumnDictionaryTest, clear) {
auto target_column = ColumnDictI32::create(FieldType::OLAP_FIELD_TYPE_CHAR);
target_column->insert_many_dict_data(dict_array.data(), dict_array.size());
target_column->clear();
EXPECT_EQ(target_column->size(), 0);
}
TEST_F(ColumnDictionaryTest, byte_size) {
EXPECT_EQ(column_dict_char->byte_size(), column_dict_char->size() * 4);
}
TEST_F(ColumnDictionaryTest, allocated_bytes) {
EXPECT_EQ(column_dict_char->allocated_bytes(), column_dict_char->size() * 4);
}
TEST_F(ColumnDictionaryTest, has_enough_capacity) {
EXPECT_THROW(column_dict_char->has_enough_capacity(
ColumnDictI32(FieldType::OLAP_FIELD_TYPE_VARCHAR)),
Exception);
}
TEST_F(ColumnDictionaryTest, pop_back) {
EXPECT_THROW(column_dict_char->pop_back(9), Exception);
}
/*
TEST_F(ColumnDictionaryTest, reserve) {
auto target_column = column_dict_char->clone();
target_column->reserve(target_column->size() + 9);
auto* col_vec_target = assert_cast<ColumnDictI32*>(target_column.get());
EXPECT_EQ(col_vec_target->get_data().capacity(), target_column->size() + 9);
}
*/
TEST_F(ColumnDictionaryTest, insert) {
EXPECT_THROW(column_dict_char->insert(Field {}), Exception);
}
TEST_F(ColumnDictionaryTest, field) {
auto count = column_dict_char->size();
auto* col_vec = assert_cast<ColumnDictI32*>(column_dict_char.get());
const auto& codes_data = col_vec->get_data();
for (size_t i = 0; i != count; ++i) {
Field f;
column_dict_char->get(i, f);
EXPECT_EQ(f.get<int32_t>(), codes_data[i]);
}
}
TEST_F(ColumnDictionaryTest, serialize_value_into_arena) {
Arena arena;
const char* ptr = nullptr;
EXPECT_THROW(column_dict_char->serialize_value_into_arena(0, arena, ptr), Exception);
}
TEST_F(ColumnDictionaryTest, deserialize_and_insert_from_arena) {
const char* ptr = nullptr;
EXPECT_THROW(column_dict_char->deserialize_and_insert_from_arena(ptr), Exception);
}
TEST_F(ColumnDictionaryTest, get_raw_data) {
EXPECT_THROW(column_dict_char->get_raw_data(), Exception);
}
TEST_F(ColumnDictionaryTest, filter) {
IColumn::Filter filt;
EXPECT_THROW(column_dict_char->filter(filt, 10), Exception);
EXPECT_THROW(column_dict_char->filter(filt), Exception);
}
TEST_F(ColumnDictionaryTest, clone) {
EXPECT_THROW(column_dict_char->clone(), Exception);
}
TEST_F(ColumnDictionaryTest, permute) {
IColumn::Permutation perm;
EXPECT_THROW(column_dict_char->permute(perm, 1), Exception);
}
TEST_F(ColumnDictionaryTest, filter_by_selector) {
auto test_func = [&](const auto& source_column) {
auto src_size = source_column->size();
const auto& codes_data = source_column->get_data();
EXPECT_TRUE(src_size <= UINT16_MAX);
auto target_column = ColumnString::create();
std::vector<uint16_t> indices(src_size);
std::iota(indices.begin(), indices.end(), 0);
std::random_device rd;
std::mt19937 g(rd());
std::shuffle(indices.begin(), indices.end(), g);
size_t sel_size = src_size / 2;
indices.resize(sel_size);
auto status =
source_column->filter_by_selector(indices.data(), sel_size, target_column.get());
EXPECT_TRUE(status.ok());
EXPECT_EQ(target_column->size(), sel_size);
for (size_t i = 0; i != sel_size; ++i) {
auto real_data = target_column->get_data_at(i);
auto expect_data = source_column->get_value(codes_data[indices[i]]);
if (real_data != expect_data) {
std::cout << "index: " << i << ", real_data: " << real_data.to_string()
<< "\nexpect_data: " << expect_data.to_string() << std::endl;
}
EXPECT_EQ(real_data, expect_data);
}
};
test_func(column_dict_char);
}
TEST_F(ColumnDictionaryTest, insert_many_dict_data) {
ColumnDictI32::MutablePtr tmp_column_dict =
ColumnDictI32::create(FieldType::OLAP_FIELD_TYPE_CHAR);
tmp_column_dict->insert_many_dict_data(dict_array.data(), dict_array.size());
for (size_t i = 0; i != dict_array.size(); ++i) {
EXPECT_EQ(tmp_column_dict->get_value(i), dict_array[i]);
}
}
TEST_F(ColumnDictionaryTest, convert_dict_codes_if_necessary) {
{
ColumnDictI32::MutablePtr tmp_column_dict =
ColumnDictI32::create(FieldType::OLAP_FIELD_TYPE_CHAR);
tmp_column_dict->convert_dict_codes_if_necessary();
EXPECT_FALSE(tmp_column_dict->is_dict_sorted());
EXPECT_FALSE(tmp_column_dict->is_dict_code_converted());
auto dict_data_row_count = column_dict_data->size();
auto dict_indices_row_count = column_dict_indices->size();
tmp_column_dict->reserve(dict_indices_row_count);
tmp_column_dict->insert_many_dict_data(column_dict_indices->get_data().data(), 0,
dict_array.data(), dict_indices_row_count,
dict_data_row_count);
tmp_column_dict->convert_dict_codes_if_necessary();
EXPECT_TRUE(tmp_column_dict->is_dict_sorted());
EXPECT_TRUE(tmp_column_dict->is_dict_code_converted());
}
}
TEST_F(ColumnDictionaryTest, find_code) {
auto size = dict_array.size();
for (size_t i = 0; i != size; ++i) {
EXPECT_EQ(column_dict_char->find_code(dict_array[i]), i);
}
}
/*
TEST_F(ColumnDictionaryTest, initialize_hash_values_for_runtime_filter) {
ColumnDictI32::MutablePtr tmp_column_dict =
ColumnDictI32::create(FieldType::OLAP_FIELD_TYPE_CHAR);
auto dict_data_row_count = column_dict_data->size();
auto dict_indices_row_count = column_dict_indices->size();
tmp_column_dict->reserve(dict_indices_row_count);
tmp_column_dict->insert_many_dict_data(column_dict_indices->get_data().data(), 0,
dict_array.data(), dict_indices_row_count,
dict_data_row_count);
tmp_column_dict->initialize_hash_values_for_runtime_filter();
for (size_t i = 0; i != dict_data_row_count; ++i) {
EXPECT_EQ(tmp_column_dict->_dict._compute_hash_value_flags[i], 0);
}
for (size_t i = 0; i != dict_data_row_count; ++i) {
EXPECT_NE(tmp_column_dict->get_hash_value(i), 0);
}
for (size_t i = 0; i != dict_data_row_count; ++i) {
EXPECT_EQ(tmp_column_dict->_dict._compute_hash_value_flags[i], 1);
}
}
*/
TEST_F(ColumnDictionaryTest, rowset_segment_id) {
RowsetId rowset_id {1, 2, 3};
uint32_t segment_id = 100;
column_dict_char->set_rowset_segment_id({rowset_id, segment_id});
auto ids = column_dict_char->get_rowset_segment_id();
EXPECT_EQ(ids.first, rowset_id);
EXPECT_EQ(ids.second, segment_id);
}
TEST_F(ColumnDictionaryTest, convert_to_predicate_column_if_dictionary) {
ColumnDictI32::MutablePtr tmp_column_dict =
ColumnDictI32::create(FieldType::OLAP_FIELD_TYPE_CHAR);
auto dict_data_row_count = column_dict_data->size();
auto dict_indices_row_count = column_dict_indices->size();
tmp_column_dict->reserve(dict_indices_row_count);
tmp_column_dict->insert_many_dict_data(column_dict_indices->get_data().data(), 0,
dict_array.data(), dict_indices_row_count,
dict_data_row_count);
auto col_converted = tmp_column_dict->convert_to_predicate_column_if_dictionary();
for (size_t i = 0; i != dict_indices_row_count; ++i) {
// EXPECT_EQ(col_converted->get_data_at(i), column_dict_data->get_data_at(i));
}
}
std::string ColumnDictionaryTest::test_data_dir;
std::string ColumnDictionaryTest::test_result_dir;
DataTypePtr ColumnDictionaryTest::dt_str;
DataTypePtr ColumnDictionaryTest::dt_int32;
ColumnString::MutablePtr ColumnDictionaryTest::column_dict_data;
ColumnInt32::MutablePtr ColumnDictionaryTest::column_dict_indices;
std::vector<StringRef> ColumnDictionaryTest::dict_array;
ColumnDictI32::MutablePtr ColumnDictionaryTest::column_dict_char;
ColumnDictI32::MutablePtr ColumnDictionaryTest::column_dict_varchar;
ColumnDictI32::MutablePtr ColumnDictionaryTest::column_dict_str;
} // namespace doris::vectorized