blob: 2dd8ed8cb08770935ee3253d1630db574cbe3da2 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "vec/columns/column_string.h"
#include <gmock/gmock-more-matchers.h>
#include <gtest/gtest.h>
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include "vec/columns/column_vector.h"
#include "vec/columns/common_column_test.h"
#include "vec/common/string_ref.h"
#include "vec/core/block.h"
#include "vec/core/field.h"
#include "vec/core/types.h"
#include "vec/data_types/data_type_factory.hpp"
#include "vec/data_types/data_type_string.h"
#include "vec/functions/function_string.h"
using namespace doris;
namespace doris::vectorized {
// Directory containing the input fixtures (e.g. STRING.csv); assigned in
// ColumnStringTest::SetUpTestSuite().
static std::string test_data_dir;
// Directory containing the expected-result "*.out" files; assigned in
// ColumnStringTest::SetUpTestSuite().
static std::string test_result_dir;
// Data type whose SerDe is used to load and check the plain string columns.
static DataTypePtr dt_str =
DataTypeFactory::instance().create_data_type(FieldType::OLAP_FIELD_TYPE_STRING, 0, 0);
// Data type whose SerDe is used to load and check the JSONB columns.
static DataTypePtr dt_jsonb =
DataTypeFactory::instance().create_data_type(FieldType::OLAP_FIELD_TYPE_JSONB, 0, 0);
// Suite-wide fixture columns. They are created and filled once in SetUpTestSuite();
// the "64" variants are ColumnString64 columns filled from the ColumnString ones.
static ColumnString::MutablePtr column_str32;
static ColumnString64::MutablePtr column_str64;
static ColumnString::MutablePtr column_str32_json;
static ColumnString64::MutablePtr column_str64_json;
// Fixture shared by every ColumnString / ColumnString64 test in this file.
//
// SetUpTestSuite() fills the file-level columns (column_str32, column_str64,
// column_str32_json, column_str64_json) once for the whole suite; the tests read them.
class ColumnStringTest : public CommonColumnTest {
protected:
    // Returns the value of the ROOT environment variable.
    //
    // getenv() returns nullptr when the variable is unset, and constructing a std::string
    // from a null pointer is undefined behavior. That case is therefore recorded as a test
    // failure and an empty string is returned, so the subsequent file loads fail cleanly.
    static std::string root_dir_from_env() {
        const char* root = getenv("ROOT");
        EXPECT_TRUE(root != nullptr) << "environment variable ROOT is not set";
        return root != nullptr ? std::string(root) : std::string();
    }

    // Resolves the fixture directories from ROOT, then creates and loads all four columns.
    static void SetUpTestSuite() {
        const auto root_dir = root_dir_from_env();
        test_data_dir = root_dir + "/be/test/data/vec/columns";
        test_result_dir = root_dir + "/be/test/expected_result/vec/columns";
        column_str32 = ColumnString::create();
        column_str64 = ColumnString64::create();
        load_columns_data();
        column_str32_json = ColumnString::create();
        column_str64_json = ColumnString64::create();
        load_json_columns_data();
    }

    // Loads STRING.csv into column_str32, appends one default row and copies everything
    // into column_str64.
    static void load_columns_data() {
        std::cout << "loading test dataset" << std::endl;
        {
            MutableColumns columns;
            columns.push_back(column_str32->get_ptr());
            DataTypeSerDeSPtrs serde = {dt_str->get_serde()};
            std::string data_file = test_data_dir + "/STRING.csv";
            load_columns_data_from_file(columns, serde, ';', {0}, data_file);
            EXPECT_TRUE(!column_str32->empty());
            column_str32->insert_default();
            column_str64->insert_range_from(*column_str32, 0, column_str32->size());
        }
        std::cout << "column str size: " << column_str32->size() << std::endl;
    }

    // Loads every JSON boundary file of the regression data set into column_str32_json and
    // mirrors the rows into column_str64_json.
    static void load_json_columns_data() {
        std::cout << "loading json dataset" << std::endl;
        {
            MutableColumns columns;
            columns.push_back(column_str32_json->get_ptr());
            DataTypeSerDeSPtrs serde = {dt_jsonb->get_serde()};
            std::string test_data_dir_json =
                    root_dir_from_env() + "/regression-test/data/nereids_function_p0/";
            std::vector<std::string> json_files = {
                    test_data_dir_json + "json_variant/boolean_boundary.jsonl",
                    test_data_dir_json + "json_variant/null_boundary.jsonl",
                    test_data_dir_json + "json_variant/number_boundary.jsonl",
                    test_data_dir_json + "json_variant/string_boundary.jsonl",
                    test_data_dir_json + "json_variant/array_boolean_boundary.jsonl",
                    test_data_dir_json + "json_variant/array_nullable_null_boundary.jsonl",
                    test_data_dir_json + "json_variant/array_number_boundary.jsonl",
                    test_data_dir_json + "json_variant/array_string_boundary.jsonl",
                    test_data_dir_json + "json_variant/array_object_boundary.jsonl",
                    test_data_dir_json + "json_variant/array_nullable_boolean_boundary.jsonl",
                    test_data_dir_json + "json_variant/array_nullable_number_boundary.jsonl",
                    test_data_dir_json + "json_variant/array_nullable_string_boundary.jsonl",
                    test_data_dir_json + "json_variant/array_nullable_object_boundary.jsonl",
                    test_data_dir_json + "json_variant/array_array_boolean_boundary.jsonl",
                    test_data_dir_json + "json_variant/array_array_number_boundary.jsonl",
                    test_data_dir_json + "json_variant/array_array_string_boundary.jsonl",
                    test_data_dir_json +
                            "json_variant/array_nullable_array_nullable_boolean_boundary.jsonl",
                    test_data_dir_json +
                            "json_variant/array_nullable_array_nullable_null_boundary.jsonl",
                    test_data_dir_json +
                            "json_variant/array_nullable_array_nullable_number_boundary.jsonl",
                    test_data_dir_json +
                            "json_variant/array_nullable_array_nullable_string_boundary.jsonl",
                    test_data_dir_json + "json_variant/object_boundary.jsonl",
            };
            for (const auto& json_file : json_files) {
                load_columns_data_from_file(columns, serde, '\n', {0}, json_file);
                EXPECT_TRUE(!column_str32_json->empty());
                column_str32_json->insert_default();
                // NOTE(review): the whole range [0, size) of the 32-bit column is appended on
                // every iteration, so (assuming the loader appends) the 64-bit column ends up
                // with repeated prefixes rather than a 1:1 copy. The expected-result files
                // presumably encode this; confirm before changing it.
                column_str64_json->insert_range_from(*column_str32_json, 0,
                                                     column_str32_json->size());
                std::cout << "column str size: " << column_str32_json->size() << std::endl;
                std::cout << "column str64 size: " << column_str64_json->size() << std::endl;
            }
        }
    }

// Invokes callback<TYPE_STRING> on column_str32 and, unless only_str32 is true, on
// column_str64 as well. It expands to two statements (a call followed by an `if`), so it
// must be used as a full statement inside a braced block.
#define column_string_common_test(callback, only_str32)                     \
    callback<TYPE_STRING>(ColumnString(), column_str32->get_ptr());         \
    if (!only_str32) {                                                      \
        callback<TYPE_STRING>(ColumnString64(), column_str64->get_ptr());   \
    }

    // Runs assert_callback once per fixture column (str32, str64, str32_json, str64_json).
    //
    // function_name    - suffix used to build the expected-result file name
    //                    "<test_result_dir>/column_<kind>_<function_name>.out".
    // assert_callback  - receives the single-column set, the matching SerDe list and the
    //                    result file path.
    void hash_common_test(
            const std::string& function_name,
            std::function<void(const MutableColumns& load_cols, DataTypeSerDeSPtrs serders,
                               const std::string& res_file_name)>
                    assert_callback) {
        {
            MutableColumns columns;
            columns.push_back(column_str32->get_ptr());
            DataTypeSerDeSPtrs serdes = {dt_str->get_serde()};
            assert_callback(columns, serdes,
                            test_result_dir + "/column_str32_" + function_name + ".out");
        }
        {
            MutableColumns columns;
            columns.push_back(column_str64->get_ptr());
            DataTypeSerDeSPtrs serdes = {dt_str->get_serde()};
            assert_callback(columns, serdes,
                            test_result_dir + "/column_str64_" + function_name + ".out");
        }
        {
            MutableColumns columns;
            columns.push_back(column_str32_json->get_ptr());
            DataTypeSerDeSPtrs serdes = {dt_jsonb->get_serde()};
            assert_callback(columns, serdes,
                            test_result_dir + "/column_str32_json_" + function_name + ".out");
        }
        {
            MutableColumns columns;
            columns.push_back(column_str64_json->get_ptr());
            DataTypeSerDeSPtrs serdes = {dt_jsonb->get_serde()};
            assert_callback(columns, serdes,
                            test_result_dir + "/column_str64_json_" + function_name + ".out");
        }
    }
};
// check_chars_length is called with totals up to UINT32_MAX without expecting an exception,
// and is expected to throw for UINT32_MAX + 1.
TEST_F(ColumnStringTest, check_chars_length) {
    constexpr size_t max_u32 = std::numeric_limits<uint32_t>::max();
    ColumnString::check_chars_length(0, 0);
    ColumnString::check_chars_length(max_u32 - 1, 0);
    ColumnString::check_chars_length(max_u32, 0);
    // One past the 32-bit limit; computed in size_t so it does not wrap.
    EXPECT_THROW(ColumnString::check_chars_length(max_u32 + 1, 0), Exception);
}
// Both offset widths must report themselves as variable-length columns.
TEST_F(ColumnStringTest, is_variable_length) {
    auto narrow_column = ColumnString::create();
    auto wide_column = ColumnString64::create();
    EXPECT_TRUE(narrow_column->is_variable_length());
    EXPECT_TRUE(wide_column->is_variable_length());
}
// Corrupts the offsets of a small column in three different ways and expects
// sanity_check() to throw each time.
TEST_F(ColumnStringTest, sanity_check) {
    auto corrupt_and_verify = [](auto& column) {
        auto& char_buf = column->get_chars();
        auto& offset_buf = column->get_offsets();
        // A freshly created (empty) column passes.
        column->sanity_check();
        const std::string payload = "123";
        column->insert_data(payload.data(), payload.size());
        column->sanity_check();

        // Case 1: the last offset no longer matches chars.size().
        offset_buf[0] = 1;
        EXPECT_THROW(column->sanity_check(), Exception);
        offset_buf[0] = char_buf.size();

        // Case 2: the slot in front of the first offset is expected to be 0.
        // (Index -1 is written on purpose; presumably the offsets container is left-padded.)
        offset_buf[-1] = 1;
        EXPECT_THROW(column->sanity_check(), Exception);
        offset_buf[-1] = 0;

        // Case 3: offsets are no longer non-decreasing (offsets[1] < offsets[0]).
        column->insert_data(payload.data(), payload.size());
        offset_buf[0] = 1000;
        EXPECT_THROW(column->sanity_check(), Exception);
    };

    auto narrow_column = ColumnString::create();
    corrupt_and_verify(narrow_column);

    auto wide_column = ColumnString64::create();
    corrupt_and_verify(wide_column);
}
// byte_size() must be 0 for an empty column and otherwise equal the character bytes plus
// the bytes occupied by the offsets.
TEST_F(ColumnStringTest, byte_size) {
    auto verify = [](auto&& column) {
        EXPECT_EQ(column->byte_size(), 0);
        column->insert_data("123", 3);
        column->insert_data("456", 3);
        const auto offsets_bytes =
                column->get_offsets().size() * sizeof(column->get_offsets()[0]);
        EXPECT_EQ(column->byte_size(), column->get_chars().size() + offsets_bytes);
    };
    verify(ColumnString::create());
    verify(ColumnString64::create());
}
// An empty clone must not report enough capacity for a two-row source until reserve()
// has been called with the source row count.
TEST_F(ColumnStringTest, has_enough_capacity) {
    auto verify = [](auto&& filled) {
        filled->insert_data("123", 3);
        filled->insert_data("456", 3);
        const auto row_count = filled->size();
        auto empty_clone = filled->clone_empty();
        ASSERT_FALSE(empty_clone->has_enough_capacity(*filled));
        empty_clone->reserve(row_count);
        ASSERT_TRUE(empty_clone->has_enough_capacity(*filled));
    };
    verify(ColumnString::create());
    verify(ColumnString64::create());
}
// allocated_bytes() must be 0 for an empty column and otherwise equal the sum of what the
// chars and offsets containers report.
TEST_F(ColumnStringTest, allocated_bytes) {
    auto verify = [](auto&& column) {
        EXPECT_EQ(column->allocated_bytes(), 0);
        column->insert_data("123", 3);
        column->insert_data("456", 3);
        const auto chars_bytes = column->get_chars().allocated_bytes();
        const auto offsets_bytes = column->get_offsets().allocated_bytes();
        EXPECT_EQ(column->allocated_bytes(), chars_bytes + offsets_bytes);
    };
    verify(ColumnString::create());
    verify(ColumnString64::create());
}
// Runs assert_column_vector_clone_resized_callback (defined outside this section) on the
// 32-bit and the 64-bit string fixture columns.
TEST_F(ColumnStringTest, clone_resized) {
    assert_column_vector_clone_resized_callback<TYPE_STRING>(ColumnString(),
                                                             column_str32->get_ptr());
    assert_column_vector_clone_resized_callback<TYPE_STRING>(ColumnString64(),
                                                             column_str64->get_ptr());
}
// Round-trips every row of each fixture column through a Field (get -> insert -> get) and
// checks the bytes are unchanged, first with a default Field and then with a JSONB Field.
TEST_F(ColumnStringTest, field_test) {
    auto round_trip = [](const auto& src) {
        const auto row_count = src->size();

        // Pass 1: default-constructed Field, read back as StringRef.
        {
            auto copy = src->clone_empty();
            for (size_t row = 0; row < row_count; ++row) {
                Field value;
                src->get(row, value);
                copy->insert(value);
            }
            for (size_t row = 0; row < row_count; ++row) {
                Field value;
                copy->get(row, value);
                ASSERT_EQ(value.get<StringRef>(), src->get_data_at(row));
            }
        }

        // Pass 2: Field pre-created as TYPE_JSONB, read back as JsonbField.
        {
            auto copy = src->clone_empty();
            for (size_t row = 0; row < row_count; ++row) {
                Field value = Field::create_field<TYPE_JSONB>(JsonbField());
                src->get(row, value);
                copy->insert(value);
            }
            for (size_t row = 0; row < row_count; ++row) {
                Field value = Field::create_field<TYPE_JSONB>(JsonbField());
                copy->get(row, value);
                const auto& jsonb = vectorized::get<const JsonbField&>(value);
                const StringRef actual(jsonb.get_value(), jsonb.get_size());
                ASSERT_EQ(actual, src->get_data_at(row));
            }
        }
    };
    round_trip(column_str32);
    round_trip(column_str64);
    round_trip(column_str32_json);
    round_trip(column_str64_json);
}
// Runs assert_column_vector_insert_many_from_callback (defined outside this section) on
// the 32-bit and the 64-bit string fixture columns.
TEST_F(ColumnStringTest, insert_many_from) {
    assert_column_vector_insert_many_from_callback<TYPE_STRING>(ColumnString(),
                                                                column_str32->get_ptr());
    assert_column_vector_insert_many_from_callback<TYPE_STRING>(ColumnString64(),
                                                                column_str64->get_ptr());
}
// Only the ColumnString64 fixtures may identify themselves as 64-bit string columns.
TEST_F(ColumnStringTest, is_column_string64) {
    // ColumnString fixtures.
    EXPECT_FALSE(column_str32->is_column_string64());
    EXPECT_FALSE(column_str32_json->is_column_string64());
    // ColumnString64 fixtures.
    EXPECT_TRUE(column_str64->is_column_string64());
    EXPECT_TRUE(column_str64_json->is_column_string64());
}
// Runs assert_column_vector_insert_from_callback for each fixture column and, right after
// it, for an empty column of the same kind.
TEST_F(ColumnStringTest, insert_from) {
    // String data, ColumnString.
    assert_column_vector_insert_from_callback<TYPE_STRING>(ColumnString(),
                                                           column_str32->get_ptr());
    auto empty_str32 = ColumnString::create();
    assert_column_vector_insert_from_callback<TYPE_STRING>(ColumnString(),
                                                           empty_str32->get_ptr());

    // String data, ColumnString64.
    assert_column_vector_insert_from_callback<TYPE_STRING>(ColumnString64(),
                                                           column_str64->get_ptr());
    auto empty_str64 = ColumnString64::create();
    assert_column_vector_insert_from_callback<TYPE_STRING>(ColumnString64(),
                                                           empty_str64->get_ptr());

    // JSONB data, ColumnString.
    assert_column_vector_insert_from_callback<TYPE_JSONB>(ColumnString(),
                                                          column_str32_json->get_ptr());
    auto empty_json32 = ColumnString::create();
    assert_column_vector_insert_from_callback<TYPE_JSONB>(ColumnString(),
                                                          empty_json32->get_ptr());

    // JSONB data, ColumnString64.
    assert_column_vector_insert_from_callback<TYPE_JSONB>(ColumnString64(),
                                                          column_str64_json->get_ptr());
    auto empty_json64 = ColumnString64::create();
    assert_column_vector_insert_from_callback<TYPE_JSONB>(ColumnString64(),
                                                          empty_json64->get_ptr());
}
// Runs assert_column_vector_insert_data_callback (defined outside this section) on the
// 32-bit and the 64-bit string fixture columns.
TEST_F(ColumnStringTest, insert_data) {
    assert_column_vector_insert_data_callback<TYPE_STRING>(ColumnString(),
                                                           column_str32->get_ptr());
    assert_column_vector_insert_data_callback<TYPE_STRING>(ColumnString64(),
                                                           column_str64->get_ptr());
}
// Reserves offsets and chars up front, appends the same 10-byte string ten times through
// insert_data_without_reserve() and checks the row count and every row's content.
TEST_F(ColumnStringTest, insert_data_without_reserve) {
    auto test_func = [](auto& col) {
        const size_t input_rows_count = 10;
        const std::string test_str = "1234567890";
        // The API under test does not grow the buffers itself, so reserve exactly what the
        // loop below is going to write.
        col->get_offsets().reserve(input_rows_count);
        col->get_chars().reserve(input_rows_count * test_str.length());
        // size_t index: the bound is a size_t, so an int counter would mix signedness.
        for (size_t i = 0; i < input_rows_count; ++i) {
            col->insert_data_without_reserve(test_str.data(), test_str.size());
        }
        EXPECT_EQ(col->size(), input_rows_count);
        for (size_t i = 0; i < input_rows_count; ++i) {
            EXPECT_EQ(col->get_data_at(i), StringRef(test_str));
        }
    };
    {
        auto col = ColumnString::create();
        test_func(col);
    }
    {
        auto col = ColumnString64::create();
        test_func(col);
    }
}
// Appends half of the source rows to a clone of the first `clone_count` rows through
// insert_many_strings_without_reserve(), once with randomly chosen rows and once with the
// leading rows, and checks both the kept prefix and the appended rows.
TEST_F(ColumnStringTest, insert_many_strings_without_reserve) {
    auto run_case = [&](size_t clone_count, auto column_tag, const auto& src) {
        using ColumnType = decltype(column_tag);
        const auto src_rows = src->size();
        const size_t kept_rows = std::min(clone_count, src_rows);
        const size_t picked_rows = src_rows / 2;

        // Points refs[k] at source row row_of(k) and returns the summed byte length.
        auto gather = [&](std::vector<StringRef>& refs, auto row_of) {
            size_t total_bytes = 0;
            for (size_t k = 0; k < refs.size(); ++k) {
                const auto cell = src->get_data_at(row_of(k));
                refs[k].data = cell.data;
                refs[k].size = cell.size;
                total_bytes += cell.size;
            }
            return total_bytes;
        };

        // Reserves room for the new rows and bytes, then appends without further reserving.
        auto reserve_and_append = [](ColumnType* dst, std::vector<StringRef>& refs,
                                     size_t total_bytes) {
            dst->get_offsets().reserve(refs.size() + dst->get_offsets().size());
            dst->get_chars().reserve(total_bytes + dst->get_chars().size());
            dst->insert_many_strings_without_reserve(refs.data(), refs.size());
        };

        // Phase 1: randomly selected rows.
        {
            auto target = src->clone_resized(kept_rows);
            auto* typed_target = assert_cast<ColumnType*>(target.get());
            for (size_t row = 0; row < kept_rows; ++row) {
                EXPECT_EQ(typed_target->get_data_at(row), src->get_data_at(row));
            }

            std::vector<size_t> order(src_rows);
            std::iota(order.begin(), order.end(), 0);
            std::random_device seed_source;
            std::mt19937 rng(seed_source());
            std::shuffle(order.begin(), order.end(), rng);
            order.resize(picked_rows);

            std::vector<StringRef> refs(picked_rows);
            const size_t total_bytes = gather(refs, [&](size_t k) { return order[k]; });
            reserve_and_append(typed_target, refs, total_bytes);

            EXPECT_EQ(typed_target->size(), kept_rows + picked_rows);
            for (size_t row = 0; row < kept_rows; ++row) {
                auto real_data = typed_target->get_data_at(row);
                auto expect_data = src->get_data_at(row);
                if (real_data != expect_data) {
                    std::cout << "index: " << row << ", real_data: " << real_data.to_string()
                              << "\nexpect_data: " << expect_data.to_string() << std::endl;
                }
                EXPECT_EQ(real_data, expect_data);
            }
            for (size_t k = 0; k < picked_rows; ++k) {
                EXPECT_EQ(typed_target->get_data_at(kept_rows + k), refs[k]);
            }
        }

        // Phase 2: the first picked_rows rows, i.e. strings that are consecutive in the source.
        {
            auto target = src->clone_resized(kept_rows);
            auto* typed_target = assert_cast<ColumnType*>(target.get());

            std::vector<StringRef> refs(picked_rows);
            const size_t total_bytes = gather(refs, [](size_t k) { return k; });
            reserve_and_append(typed_target, refs, total_bytes);

            EXPECT_EQ(typed_target->size(), kept_rows + picked_rows);
            for (size_t row = 0; row < kept_rows; ++row) {
                EXPECT_EQ(typed_target->get_data_at(row), src->get_data_at(row));
            }
            for (size_t k = 0; k < picked_rows; ++k) {
                EXPECT_EQ(typed_target->get_data_at(kept_rows + k), refs[k]);
            }
        }
    };
    run_case(0, ColumnString(), column_str32);
    run_case(10, ColumnString(), column_str32);
    run_case(0, ColumnString64(), column_str64);
    run_case(10, ColumnString64(), column_str64);
    run_case(0, ColumnString(), column_str32_json);
    run_case(10, ColumnString(), column_str32_json);
    run_case(0, ColumnString64(), column_str64_json);
    run_case(10, ColumnString64(), column_str64_json);
}
// Appends the tail of the source column (from a random start row to the end) to a clone of
// the first `clone_count` rows via insert_many_continuous_binary_data(), then repeats the
// call with a count of 0 and expects the target to stay unchanged.
TEST_F(ColumnStringTest, insert_many_continuous_binary_data) {
    auto test_func = [&](size_t clone_count, auto x, const auto& source_column) {
        using ColumnType = decltype(x);
        const auto src_size = source_column->size();
        // A start row can only be drawn from a non-empty source.
        ASSERT_TRUE(src_size > 0);
        auto* col_vec_src = assert_cast<ColumnType*>(source_column.get());
        size_t actual_clone_count = std::min(clone_count, src_size);
        auto target_column = source_column->clone_resized(actual_clone_count);
        auto* col_vec_target = assert_cast<ColumnType*>(target_column.get());
        for (size_t i = 0; i != actual_clone_count; ++i) {
            EXPECT_EQ(col_vec_target->get_data_at(i), source_column->get_data_at(i));
        }

        // Same random source as the sibling tests, instead of reseeding the global C
        // generator with srand(time(nullptr)).
        std::random_device rd;
        std::mt19937 g(rd());
        std::uniform_int_distribution<size_t> pick_start(0, src_size - 1);
        const size_t start_offset = pick_start(g);
        auto insert_count = src_size - start_offset;
        const auto* insert_data =
                reinterpret_cast<const char*>(col_vec_src->get_chars().data());
        // The offsets pointer starts one entry before the first inserted row. For
        // start_offset == 0 that is the slot in front of the offsets array, which this
        // test relies on being readable (presumably via the container's left padding).
        const auto* insert_offsets = col_vec_src->get_offsets().data() + start_offset - 1;
        col_vec_target->insert_many_continuous_binary_data(insert_data, insert_offsets,
                                                           insert_count);
        EXPECT_EQ(col_vec_target->size(), actual_clone_count + insert_count);
        size_t i = 0;
        for (; i != actual_clone_count; ++i) {
            EXPECT_EQ(col_vec_target->get_data_at(i), source_column->get_data_at(i));
        }
        for (size_t j = start_offset; i != col_vec_target->size(); ++i, ++j) {
            auto real_data = col_vec_target->get_data_at(i);
            auto expect_data = source_column->get_data_at(j);
            if (real_data != expect_data) {
                std::cout << "index: " << i << ", real_data: " << real_data.to_string()
                          << "\nexpect_data: " << expect_data.to_string() << std::endl;
            }
            EXPECT_EQ(real_data, expect_data);
        }

        // Inserting 0 items must leave a fresh clone untouched.
        insert_count = 0;
        target_column = source_column->clone_resized(actual_clone_count);
        col_vec_target = assert_cast<ColumnType*>(target_column.get());
        col_vec_target->insert_many_continuous_binary_data(insert_data, insert_offsets,
                                                           insert_count);
        EXPECT_EQ(col_vec_target->size(), actual_clone_count);
        for (i = 0; i != actual_clone_count; ++i) {
            EXPECT_EQ(col_vec_target->get_data_at(i), source_column->get_data_at(i));
        }
    };
    test_func(0, ColumnString(), column_str32);
    test_func(10, ColumnString(), column_str32);
    test_func(0, ColumnString(), column_str32_json);
    test_func(10, ColumnString(), column_str32_json);
}
// Appends a random half of the source rows to a clone of the first `clone_count` rows via
// insert_many_strings() and checks both the kept prefix and the appended rows.
TEST_F(ColumnStringTest, insert_many_strings) {
    auto test_func = [&](size_t clone_count, auto x, const auto& source_column) {
        using ColumnType = decltype(x);
        auto src_size = source_column->size();
        size_t actual_clone_count = std::min(clone_count, src_size);
        auto target_column = source_column->clone_resized(actual_clone_count);
        auto* col_vec_target = assert_cast<ColumnType*>(target_column.get());
        for (size_t i = 0; i != actual_clone_count; ++i) {
            std::cout << "index: " << i
                      << ", real_data: " << col_vec_target->get_data_at(i).to_string() << "\n";
            EXPECT_EQ(col_vec_target->get_data_at(i), source_column->get_data_at(i));
        }

        // Pick src_size / 2 distinct source rows in random order.
        std::vector<size_t> indices(src_size);
        std::iota(indices.begin(), indices.end(), 0);
        std::random_device rd;
        std::mt19937 g(rd());
        std::shuffle(indices.begin(), indices.end(), g);
        size_t sel_size = src_size / 2;
        indices.resize(sel_size);

        // insert_many_strings() is called without any prior reserve here, so no running
        // byte total is needed.
        std::vector<StringRef> strings(sel_size);
        for (size_t i = 0; i != sel_size; ++i) {
            auto value = source_column->get_data_at(indices[i]);
            strings[i].data = value.data;
            strings[i].size = value.size;
        }
        col_vec_target->insert_many_strings(strings.data(), sel_size);
        EXPECT_EQ(col_vec_target->size(), actual_clone_count + sel_size);

        // The cloned prefix must be untouched ...
        for (size_t i = 0; i != actual_clone_count; ++i) {
            auto real_data = col_vec_target->get_data_at(i);
            auto expect_data = source_column->get_data_at(i);
            if (real_data != expect_data) {
                std::cout << "index: " << i << ", real_data: " << real_data.to_string()
                          << "\nexpect_data: " << expect_data.to_string() << std::endl;
            }
            EXPECT_EQ(real_data, expect_data);
        }
        // ... and the appended rows must match the selected strings in order.
        for (size_t i = 0; i != sel_size; ++i) {
            EXPECT_EQ(col_vec_target->get_data_at(actual_clone_count + i), strings[i]);
        }
    };
    test_func(0, ColumnString(), column_str32);
    test_func(10, ColumnString(), column_str32);
    test_func(0, ColumnString64(), column_str64);
    test_func(10, ColumnString64(), column_str64);
    test_func(0, ColumnString(), column_str32_json);
    test_func(10, ColumnString(), column_str32_json);
    test_func(0, ColumnString64(), column_str64_json);
    test_func(10, ColumnString64(), column_str64_json);
}
// Feeds insert_many_strings_overflow() with a random half of the source rows, each forced
// to exactly max_length bytes (truncated or padded with 'a'), for a range of max_length
// values, and checks the kept prefix and the appended rows.
TEST_F(ColumnStringTest, insert_many_strings_overflow) {
    auto run_case = [&](size_t clone_count, auto column_tag, const auto& src,
                        size_t max_length) {
        using ColumnType = decltype(column_tag);
        const auto src_rows = src->size();
        const size_t kept_rows = std::min(clone_count, src_rows);
        auto target = src->clone_resized(kept_rows);
        auto* typed_target = assert_cast<ColumnType*>(target.get());
        for (size_t row = 0; row < kept_rows; ++row) {
            EXPECT_EQ(typed_target->get_data_at(row), src->get_data_at(row));
        }

        // Random selection of src_rows / 2 distinct rows.
        std::vector<size_t> order(src_rows);
        std::iota(order.begin(), order.end(), 0);
        std::random_device seed_source;
        std::mt19937 rng(seed_source());
        std::shuffle(order.begin(), order.end(), rng);
        const size_t picked_rows = src_rows / 2;
        order.resize(picked_rows);

        std::vector<StringRef> refs(picked_rows);
        // Backing storage for the padded copies; the refs below point into it.
        // NOTE(review): this presumably relies on the reserve() call keeping those pointers
        // stable across the following insert_data() calls; confirm against ColumnString.
        ColumnString padded;
        padded.reserve(picked_rows * max_length);
        for (size_t k = 0; k < picked_rows; ++k) {
            const auto cell = src->get_data_at(order[k]);
            refs[k].data = cell.data;
            if (cell.size < max_length) {
                // Too short: pad a copy with 'a' and reference the stored copy instead.
                std::string grown(cell.data, cell.size);
                grown.resize(max_length, 'a');
                padded.insert_data(grown.data(), grown.size());
                refs[k].data = padded.get_data_at(padded.size() - 1).data;
            }
            // Longer values are truncated, shorter ones were padded and equal ones already
            // match, so every ref ends up exactly max_length bytes long.
            refs[k].size = max_length;
        }

        typed_target->insert_many_strings_overflow(refs.data(), picked_rows, max_length);
        EXPECT_EQ(typed_target->size(), kept_rows + picked_rows);
        for (size_t row = 0; row < kept_rows; ++row) {
            EXPECT_EQ(typed_target->get_data_at(row), src->get_data_at(row));
        }
        for (size_t k = 0; k < picked_rows; ++k) {
            EXPECT_EQ(typed_target->get_data_at(kept_rows + k), refs[k]);
        }
    };

    const std::vector<size_t> clone_counts = {0, 10};
    const std::vector<size_t> max_lengths = {0, 3, 8, 13, 16, 29, 32, 33, 64, 66, 100, 128, 256};
    for (const auto clone_count : clone_counts) {
        for (const auto max_length : max_lengths) {
            run_case(clone_count, ColumnString(), column_str32, max_length);
            run_case(clone_count, ColumnString(), column_str32_json, max_length);
        }
        for (const auto max_length : max_lengths) {
            run_case(clone_count, ColumnString64(), column_str64, max_length);
            run_case(clone_count, ColumnString64(), column_str64_json, max_length);
        }
    }
}
// Uses every source row as a dictionary entry, shuffles the dictionary codes and appends
// them with insert_many_dict_data(); the appended rows must equal the source rows the codes
// refer to.
TEST_F(ColumnStringTest, insert_many_dict_data) {
    auto run_case = [&](size_t clone_count, auto column_tag, const auto& src) {
        using ColumnType = decltype(column_tag);
        const auto src_rows = src->size();
        const size_t kept_rows = std::min(clone_count, src_rows);

        // Dictionary entry i is a view of source row i.
        std::vector<StringRef> dictionary(src_rows);
        for (size_t row = 0; row < src_rows; ++row) {
            const auto cell = src->get_data_at(row);
            dictionary[row].data = cell.data;
            dictionary[row].size = cell.size;
        }

        auto target = src->clone_resized(kept_rows);
        auto* typed_target = assert_cast<ColumnType*>(target.get());
        for (size_t row = 0; row < kept_rows; ++row) {
            std::cout << "index: " << row
                      << ", real_data: " << typed_target->get_data_at(row).to_string() << "\n";
            EXPECT_EQ(typed_target->get_data_at(row), src->get_data_at(row));
        }

        // Every code exactly once, in random order.
        std::vector<int32_t> codes(src_rows);
        std::iota(codes.begin(), codes.end(), 0);
        std::random_device seed_source;
        std::mt19937 rng(seed_source());
        std::shuffle(codes.begin(), codes.end(), rng);

        const size_t first_code = 0;
        const size_t code_count = src_rows - first_code;
        typed_target->insert_many_dict_data(codes.data(), first_code, dictionary.data(),
                                            code_count, dictionary.size());
        EXPECT_EQ(typed_target->size(), kept_rows + code_count);

        for (size_t row = 0; row < kept_rows; ++row) {
            auto real_data = typed_target->get_data_at(row);
            auto expect_data = src->get_data_at(row);
            if (real_data != expect_data) {
                std::cout << "index: " << row << ", real_data: " << real_data.to_string()
                          << "\nexpect_data: " << expect_data.to_string() << std::endl;
            }
            EXPECT_EQ(real_data, expect_data);
        }
        for (size_t k = 0; k < code_count; ++k) {
            const auto actual = typed_target->get_data_at(kept_rows + k);
            const auto expected = src->get_data_at(codes[first_code + k]);
            EXPECT_EQ(actual, expected);
        }
    };
    run_case(0, ColumnString(), column_str32);
    run_case(10, ColumnString(), column_str32);
    run_case(0, ColumnString64(), column_str64);
    run_case(10, ColumnString64(), column_str64);
    run_case(0, ColumnString(), column_str32_json);
    run_case(10, ColumnString(), column_str32_json);
    run_case(0, ColumnString64(), column_str64_json);
    run_case(10, ColumnString64(), column_str64_json);
}
// Runs assert_column_vector_pop_back_callback (defined outside this section) on the 32-bit
// and the 64-bit string fixture columns.
TEST_F(ColumnStringTest, pop_back_test) {
    assert_column_vector_pop_back_callback<TYPE_STRING>(ColumnString(),
                                                        column_str32->get_ptr());
    assert_column_vector_pop_back_callback<TYPE_STRING>(ColumnString64(),
                                                        column_str64->get_ptr());
}
// Passes each fixture column, together with its data type, to
// ser_deserialize_with_arena_impl (defined outside this section).
TEST_F(ColumnStringTest, ser_deser_test) {
    auto round_trip = [&](const auto& column, const DataTypePtr& type) {
        MutableColumns columns;
        columns.push_back(column->get_ptr());
        ser_deserialize_with_arena_impl(columns, {type});
    };
    round_trip(column_str32, dt_str);
    round_trip(column_str64, dt_str);
    round_trip(column_str32_json, dt_jsonb);
    round_trip(column_str64_json, dt_jsonb);
}
// Runs assert_column_vector_serialize_vec_callback (defined outside this section) on the
// 32-bit and the 64-bit string fixture columns.
TEST_F(ColumnStringTest, ser_deser_vec_test) {
    assert_column_vector_serialize_vec_callback<TYPE_STRING>(ColumnString(),
                                                             column_str32->get_ptr());
    assert_column_vector_serialize_vec_callback<TYPE_STRING>(ColumnString64(),
                                                             column_str64->get_ptr());
}
// get_max_row_byte_size() must equal the largest size_at() over all rows plus
// sizeof(uint32_t), for every fixture column.
TEST_F(ColumnStringTest, get_max_row_byte_size) {
    auto verify = [](const auto& column) {
        const size_t row_count = column->size();
        size_t widest_row = 0;
        for (size_t row = 0; row < row_count; ++row) {
            widest_row = std::max<size_t>(widest_row, column->size_at(row));
        }
        EXPECT_EQ(column->get_max_row_byte_size(), widest_row + sizeof(uint32_t));
    };
    verify(column_str32);
    verify(column_str64);
    verify(column_str32_json);
    verify(column_str64_json);
}
// Checks update_xxHash_with_value for all four fixture columns against the
// "*_update_xxHash_with_value.out" result files.
TEST_F(ColumnStringTest, update_xxHash_with_value) {
    const std::string function_name = "update_xxHash_with_value";
    hash_common_test(function_name, assert_update_xxHash_with_value_callback);
}
// Checks the sip-hash update for all four fixture columns against the
// "*_update_sip_hash_with_value.out" result files.
TEST_F(ColumnStringTest, update_sip_hash_with_value_test) {
    const std::string function_name = "update_sip_hash_with_value";
    hash_common_test(function_name, assert_column_vector_update_siphashes_with_value_callback);
}
// Checks update_hashes_with_value for all four fixture columns against the
// "*_update_hashes_with_value.out" result files.
TEST_F(ColumnStringTest, update_hashes_with_value_test) {
    const std::string function_name = "update_hashes_with_value";
    hash_common_test(function_name, assert_column_vector_update_hashes_with_value_callback);
}
// Checks update_crc_with_value for all four fixture columns against the
// "*_update_crc_with_value.out" result files.
TEST_F(ColumnStringTest, update_crc_with_value_test) {
    const std::string function_name = "update_crc_with_value";
    hash_common_test(function_name, assert_update_crc_with_value_callback);
}
// Runs assert_column_vector_update_crc_hashes_callback for each fixture column with the
// matching SerDe and primitive type, against the "*_update_crcs_with_value.out" files.
TEST_F(ColumnStringTest, update_crcs_with_value_test) {
    const std::string function_name = "update_crcs_with_value";
    auto verify = [&](const auto& column, const DataTypePtr& type, PrimitiveType primitive,
                      const std::string& file_prefix) {
        MutableColumns columns;
        columns.push_back(column->get_ptr());
        DataTypeSerDeSPtrs serdes = {type->get_serde()};
        // One primitive type entry per column in the set.
        std::vector<PrimitiveType> pts(columns.size(), primitive);
        assert_column_vector_update_crc_hashes_callback(
                columns, serdes, pts, test_result_dir + file_prefix + function_name + ".out");
    };
    verify(column_str32, dt_str, PrimitiveType::TYPE_STRING, "/column_str32_");
    verify(column_str64, dt_str, PrimitiveType::TYPE_STRING, "/column_str64_");
    verify(column_str32_json, dt_jsonb, PrimitiveType::TYPE_JSONB, "/column_str32_json_");
    verify(column_str64_json, dt_jsonb, PrimitiveType::TYPE_JSONB, "/column_str64_json_");
}
// Runs assert_column_vector_insert_range_from_callback (defined outside this section) on
// the 32-bit and the 64-bit string fixture columns.
TEST_F(ColumnStringTest, insert_range_from) {
    assert_column_vector_insert_range_from_callback<TYPE_STRING>(ColumnString(),
                                                                 column_str32->get_ptr());
    assert_column_vector_insert_range_from_callback<TYPE_STRING>(ColumnString64(),
                                                                 column_str64->get_ptr());
}
// Runs assert_column_vector_insert_range_from_ignore_overflow_callback (defined outside
// this section) on the 32-bit and the 64-bit string fixture columns.
TEST_F(ColumnStringTest, insert_range_from_ignore_overflow) {
    assert_column_vector_insert_range_from_ignore_overflow_callback<TYPE_STRING>(
            ColumnString(), column_str32->get_ptr());
    assert_column_vector_insert_range_from_ignore_overflow_callback<TYPE_STRING>(
            ColumnString64(), column_str64->get_ptr());
}
// Exercises insert_indices_from() for every target/source combination of the fixture
// columns with: an empty index range, all indices in order, all indices shuffled, and a
// short list containing duplicates. Each non-empty case runs on an empty target and on a
// target pre-filled with the first 10 target rows.
TEST_F(ColumnStringTest, insert_indices_from) {
    auto run_pair = [](auto& target_proto, const auto& src) {
        const auto src_rows = src->size();

        // Empty index range: nothing may be appended.
        {
            auto dst = target_proto->clone_empty();
            std::vector<uint32_t> no_indices;
            dst->insert_indices_from(*src, no_indices.data(), no_indices.data());
            EXPECT_EQ(dst->size(), 0);
        }

        // Appends the rows named by `picks` to a clone of the first kept_rows target rows,
        // then checks the kept prefix and the appended rows.
        auto append_and_verify = [&](size_t kept_rows, std::vector<uint32_t>& picks) {
            auto dst = target_proto->clone_resized(kept_rows);
            dst->insert_indices_from(*src, picks.data(), picks.data() + picks.size());
            EXPECT_EQ(dst->size(), kept_rows + picks.size());
            for (size_t row = 0; row < kept_rows; ++row) {
                EXPECT_EQ(dst->get_data_at(row), target_proto->get_data_at(row));
            }
            for (size_t k = 0; k < picks.size(); ++k) {
                EXPECT_EQ(dst->get_data_at(kept_rows + k), src->get_data_at(picks[k]));
            }
        };

        auto run_with_prefix = [&](size_t clone_count) {
            const size_t kept_rows = std::min(clone_count, src_rows);

            // Every source row, in order.
            {
                std::vector<uint32_t> picks(src_rows);
                std::iota(picks.begin(), picks.end(), 0);
                append_and_verify(kept_rows, picks);
            }
            // Every source row, in random order.
            {
                std::vector<uint32_t> picks(src_rows);
                std::iota(picks.begin(), picks.end(), 0);
                std::random_device seed_source;
                std::mt19937 rng(seed_source());
                std::shuffle(picks.begin(), picks.end(), rng);
                append_and_verify(kept_rows, picks);
            }
            // First, last and a middle row, with the first and last repeated.
            {
                std::vector<uint32_t> picks = {0, static_cast<uint32_t>(src->size() - 1),
                                               static_cast<uint32_t>((src->size() + 1) >> 1),
                                               static_cast<uint32_t>(src->size() - 1), 0};
                append_and_verify(kept_rows, picks);
            }
        };
        run_with_prefix(0);
        run_with_prefix(10);
    };
    run_pair(column_str32, column_str32);
    run_pair(column_str32, column_str64);
    run_pair(column_str64, column_str32);
    run_pair(column_str64, column_str64);
    run_pair(column_str32, column_str32_json);
    run_pair(column_str32, column_str64_json);
    run_pair(column_str64, column_str32_json);
    run_pair(column_str64, column_str64_json);
}
// Runs the shared filter assertion callback over the string test columns, then checks that
// both filter() overloads of column_str64 throw when handed a default-constructed filter.
TEST_F(ColumnStringTest, filter) {
    column_string_common_test(assert_column_vector_filter_callback, true);

    // The filter is left default-constructed on purpose; the test expects an Exception
    // from both overloads in that situation.
    IColumn::Filter empty_filter;
    const auto row_count = column_str64->size();
    EXPECT_THROW(column_str64->filter(empty_filter, row_count), Exception);
    EXPECT_THROW(column_str64->filter(empty_filter), Exception);
}
// Checks filter_by_selector(): on column_str32 / column_str32_json a random, sorted selection of
// half of the rows must be copied to the target in order; on column_str64 / column_str64_json
// the call is expected to return a non-OK status.
TEST_F(ColumnStringTest, filter_by_selector) {
    auto verify_selection = [&](const auto& src) {
        const auto total_rows = src->size();
        // The selector holds uint16_t row numbers, so the source must fit in that range.
        EXPECT_TRUE(total_rows <= UINT16_MAX);
        auto dst = src->clone_empty();

        // Start from all row numbers, shuffle them, keep the first half and sort that half.
        std::vector<uint16_t> selector(total_rows);
        std::iota(selector.begin(), selector.end(), 0);
        std::random_device seed_source;
        std::mt19937 rng(seed_source());
        std::shuffle(selector.begin(), selector.end(), rng);
        const size_t picked = total_rows / 2;
        selector.resize(picked);
        std::sort(selector.begin(), selector.end());

        // Log the selection so a failing run can be analysed despite the random seed.
        std::cout << "selection count: " << picked << ", indices: ";
        for (auto row : selector) {
            std::cout << row << ",";
        }
        std::cout << std::endl;

        auto status = src->filter_by_selector(selector.data(), picked, dst.get());
        EXPECT_TRUE(status.ok());
        EXPECT_EQ(dst->size(), picked);
        for (size_t pos = 0; pos < picked; ++pos) {
            auto actual = dst->get_data_at(pos);
            auto expected = src->get_data_at(selector[pos]);
            if (actual != expected) {
                std::cout << "index: " << pos << ", real_data: " << actual.to_string()
                          << "\nexpect_data: " << expected.to_string() << std::endl;
            }
            EXPECT_EQ(actual, expected);
        }
    };
    verify_selection(column_str32);
    verify_selection(column_str32_json);

    // For these columns the test expects filter_by_selector() to report a failure status.
    auto expect_rejected = [](auto& col, size_t sel_size) {
        auto dst = col->clone_empty();
        std::vector<uint16_t> selector(10, 0);
        auto status = col->filter_by_selector(selector.data(), sel_size, dst.get());
        EXPECT_FALSE(status.ok());
    };
    expect_rejected(column_str64, 10);
    expect_rejected(column_str64_json, 0);
}
// Covers permute(): the empty-column / limit 0 case, the case where an empty permutation is
// used with limit 10 (an Exception is expected), and the shared permutation assertions for
// several limits.
TEST_F(ColumnStringTest, permute) {
    // test empty column and limit == 0
    auto expect_empty_permute = [](auto& source) {
        IColumn::Permutation no_rows(0);
        auto empty_clone = source->clone_empty();
        empty_clone->permute(no_rows, 0);
        EXPECT_EQ(empty_clone->size(), 0);
    };
    // Empty permutation with limit 10 on the test column: the test expects an Exception.
    auto expect_permute_throws = [](auto& source) {
        IColumn::Permutation no_rows(0);
        EXPECT_THROW(source->permute(no_rows, 10), Exception);
    };

    expect_empty_permute(column_str32);
    expect_empty_permute(column_str64);
    expect_permute_throws(column_str32);
    expect_permute_throws(column_str64);

    expect_empty_permute(column_str32_json);
    expect_empty_permute(column_str64_json);
    expect_permute_throws(column_str32_json);
    expect_permute_throws(column_str64_json);

    MutableColumns all_columns;
    all_columns.push_back(column_str32->get_ptr());
    all_columns.push_back(column_str64->get_ptr());
    all_columns.push_back(column_str32_json->get_ptr());
    all_columns.push_back(column_str64_json->get_ptr());

    // Limits exercised: zero, one, the row count of column_str32, and the largest 64-bit value.
    const size_t limits[] = {0, 1, column_str32->size(), UINT64_MAX};
    for (const size_t limit : limits) {
        assert_column_vector_permute(all_columns, limit);
    }
}
// Runs column_string_common_test with assert_column_vector_insert_default_callback.
// NOTE(review): the meaning of the trailing `false` flag is defined by
// column_string_common_test, which is declared outside this part of the file.
TEST_F(ColumnStringTest, insert_default) {
column_string_common_test(assert_column_vector_insert_default_callback, false);
}
// Runs column_string_common_test with assert_column_vector_insert_many_defaults_callback.
// NOTE(review): the meaning of the trailing `false` flag is defined by
// column_string_common_test, which is declared outside this part of the file.
TEST_F(ColumnStringTest, insert_many_default) {
column_string_common_test(assert_column_vector_insert_many_defaults_callback, false);
}
// Runs the shared permutation assertions on each test column, pairing the plain string
// columns with dt_str and the JSON-backed columns with dt_jsonb.
TEST_F(ColumnStringTest, get_permutation) {
    auto check_permutations = [&](auto& column, auto& data_type) {
        assert_column_permutations2(*column, data_type);
    };

    check_permutations(column_str32, dt_str);
    check_permutations(column_str64, dt_str);
    check_permutations(column_str32_json, dt_jsonb);
    check_permutations(column_str64_json, dt_jsonb);
}
// Every test column, regardless of offset width or content, must report is_column_string().
TEST_F(ColumnStringTest, is_column_string) {
    auto expect_string_column = [](auto& column) { EXPECT_TRUE(column->is_column_string()); };

    expect_string_column(column_str32);
    expect_string_column(column_str64);
    expect_string_column(column_str32_json);
    expect_string_column(column_str64_json);
}
// structure_equals() is expected to accept a column of the same string class (even an empty
// one) and to reject the other string class as well as a non-string column.
TEST_F(ColumnStringTest, structure_equals) {
    // Empty reference columns used as comparison partners.
    ColumnString empty_str32;
    ColumnString64 empty_str64;
    ColumnInt32 empty_int32;

    // Plain string data.
    EXPECT_TRUE(column_str32->structure_equals(empty_str32));
    EXPECT_FALSE(column_str32->structure_equals(*column_str64));
    EXPECT_TRUE(column_str64->structure_equals(empty_str64));
    EXPECT_FALSE(column_str64->structure_equals(*column_str32));

    // JSON-backed data.
    EXPECT_TRUE(column_str32_json->structure_equals(empty_str32));
    EXPECT_FALSE(column_str32_json->structure_equals(*column_str64_json));
    EXPECT_TRUE(column_str64_json->structure_equals(empty_str64));
    EXPECT_FALSE(column_str64_json->structure_equals(*column_str32_json));

    // A numeric column never matches a string column.
    EXPECT_FALSE(column_str32->structure_equals(empty_int32));
}
// clear() must leave a cloned column with zero rows, zero offsets and zero characters.
TEST_F(ColumnStringTest, clear) {
    auto expect_cleared = [](const auto& source) {
        // Concrete column class behind the pointer (ColumnString or ColumnString64).
        using ConcreteColumn = std::remove_reference_t<decltype(*source)>;

        auto copy = source->clone();
        EXPECT_EQ(copy->size(), source->size());
        auto* typed_copy = assert_cast<ConcreteColumn*>(copy.get());
        // Before clearing, the clone carries one offset per source row.
        EXPECT_EQ(typed_copy->get_offsets().size(), source->size());

        copy->clear();
        EXPECT_EQ(copy->size(), 0);
        EXPECT_EQ(typed_copy->get_offsets().size(), 0);
        EXPECT_EQ(typed_copy->get_chars().size(), 0);
    };

    expect_cleared(column_str32);
    expect_cleared(column_str32_json);
    expect_cleared(column_str64);
    expect_cleared(column_str64_json);
}
// replace_column_data() is expected to throw an Exception on every string test column.
TEST_F(ColumnStringTest, replace_column_data) {
    auto expect_throws = [](auto& column) {
        EXPECT_THROW(column->replace_column_data(ColumnString(), 0, 0), Exception);
    };

    expect_throws(column_str32);
    expect_throws(column_str64);
    expect_throws(column_str32_json);
    expect_throws(column_str64_json);
}
// Runs column_string_common_test with assert_column_vector_compare_internal_callback.
// NOTE(review): the meaning of the trailing `false` flag is defined by
// column_string_common_test, which is declared outside this part of the file.
TEST_F(ColumnStringTest, compare_internal) {
column_string_common_test(assert_column_vector_compare_internal_callback, false);
}
// convert_column_if_overflow(): a small ColumnString must stay a (non-64) string column, while
// a column whose character buffer exceeds config::string_overflow_size must come back as a
// ColumnString64 holding the same row values.
TEST_F(ColumnStringTest, convert_column_if_overflow) {
    {
        // Small column: no conversion expected.
        auto small_col = ColumnString::create();
        small_col->insert_data("abc", 3);
        auto row_count = small_col->size();
        auto converted = small_col->convert_column_if_overflow();
        EXPECT_TRUE(converted->is_column_string());
        EXPECT_FALSE(converted->is_column_string64());
        for (size_t row = 0; row < row_count; ++row) {
            EXPECT_EQ(converted->get_data_at(row), small_col->get_data_at(row));
        }
    }

    auto expect_string64_after_overflow = [](const auto& source) {
        // Concrete column class behind the pointer (ColumnString or ColumnString64).
        using ConcreteColumn = std::remove_reference_t<decltype(*source)>;

        auto grown = source->clone();
        auto* typed_grown = assert_cast<ConcreteColumn*>(grown.get());
        auto source_rows = source->size();
        auto chars_size = source->get_chars().size();
        auto overflow_limit = config::string_overflow_size;

        // Keep appending whole copies of the source until the limit is reached ...
        while (chars_size < overflow_limit) {
            grown->insert_range_from_ignore_overflow(*source, 0, source->size());
            chars_size = typed_grown->get_chars().size();
        }
        // ... then append once more so the buffer is strictly above the limit.
        grown->insert_range_from_ignore_overflow(*source, 0, source->size());
        auto grown_rows = grown->size();
        chars_size = typed_grown->get_chars().size();
        EXPECT_GT(chars_size, overflow_limit);

        auto converted = grown->convert_column_if_overflow();
        EXPECT_TRUE(converted->is_column_string64());
        // The grown column is the source repeated, so row i maps to source row i % source_rows.
        for (size_t row = 0; row < grown_rows; ++row) {
            EXPECT_EQ(converted->get_data_at(row), source->get_data_at(row % source_rows));
        }
    };

    expect_string64_after_overflow(column_str32);
    expect_string64_after_overflow(column_str32_json);
    expect_string64_after_overflow(column_str64_json);
}
// Growing a cloned column with resize() must keep the existing rows and append empty strings.
TEST_F(ColumnStringTest, resize) {
    auto check_growth = [](const auto& original) {
        const size_t extra_rows = 10;
        const auto original_rows = original->size();
        const auto grown_rows = original_rows + extra_rows;

        auto grown = original->clone();
        grown->resize(grown_rows);
        EXPECT_EQ(grown->size(), grown_rows);

        // Rows that existed before the resize are untouched.
        for (size_t row = 0; row < original_rows; ++row) {
            EXPECT_EQ(grown->get_data_at(row), original->get_data_at(row));
        }
        // Rows added by the resize read back as empty strings.
        for (size_t row = original_rows; row < grown_rows; ++row) {
            EXPECT_EQ(grown->get_data_at(row).to_string(), "");
        }
    };

    check_growth(column_str32);
    check_growth(column_str64);
    check_growth(column_str32_json);
    check_growth(column_str64_json);
}
// Executes FunctionStringConcat on two three-row string columns and checks that the result
// column has three rows and passes ColumnString::sanity_check().
TEST_F(ColumnStringTest, TestConcat) {
    // Builds a ColumnString holding the given values in order.
    auto make_string_column = [](const std::vector<std::string>& values) {
        auto column = ColumnString::create();
        for (const auto& value : values) {
            column->insert_data(value.data(), value.size());
        }
        return column;
    };

    Block block;
    vectorized::DataTypePtr str_type = std::make_shared<vectorized::DataTypeString>();

    // Positions 0 and 1 hold the arguments, position 2 receives the result.
    block.insert({make_string_column({"aaa", "bb", "cccc"}), str_type, "test_str_col0"});
    block.insert({make_string_column({"3", "2", "4"}), str_type, "test_str_col1"});
    block.insert({ColumnString::create(), str_type, "test_str_res"});

    ColumnNumbers arguments = {0, 1};
    FunctionStringConcat func_concat;
    auto fn_ctx = FunctionContext::create_context(nullptr, nullptr, {});

    auto open_status =
            func_concat.open(fn_ctx.get(), FunctionContext::FunctionStateScope::FRAGMENT_LOCAL);
    EXPECT_TRUE(open_status.ok());

    auto exec_status = func_concat.execute_impl(fn_ctx.get(), block, arguments, 2, 3);
    EXPECT_TRUE(exec_status.ok());

    auto result_column = block.get_by_position(2).column;
    EXPECT_EQ(result_column->size(), 3);
    auto result_strings = assert_cast<const ColumnString*>(result_column.get());
    result_strings->sanity_check();
}
// insert_range_from() into a ColumnString, called twice with the same source, must yield the
// source values twice in order. The source is a ColumnString in the first run and a
// ColumnString64 in the second.
TEST_F(ColumnStringTest, TestStringInsert) {
    auto insert_twice_and_verify = [](auto staging_column) {
        const std::vector<std::string> samples = {"x", "yy", "zzz", ""};
        for (const auto& sample : samples) {
            staging_column->insert_data(sample.data(), sample.size());
        }

        auto destination = ColumnString::create();
        destination->insert_range_from(*staging_column, 0, samples.size());
        destination->insert_range_from(*staging_column, 0, samples.size());

        auto total_rows = destination->size();
        EXPECT_EQ(total_rows, samples.size() * 2);
        for (size_t row = 0; row < total_rows; ++row) {
            auto value = destination->get_data_at(row);
            EXPECT_EQ(value.to_string(), samples[row % samples.size()]);
        }
    };

    insert_twice_and_verify(ColumnString::create());
    // test insert ColumnString64 to ColumnString
    insert_twice_and_verify(ColumnString64::create());
}
// shrink_padding_chars(): the test expects every value to be cut at its first NUL byte, the
// call to be harmless on a column without rows, and the column to accept inserts afterwards.
TEST_F(ColumnStringTest, shrink_padding_chars) {
    ColumnString::MutablePtr col = ColumnString::create();
    // Must be a no-op on a column with no rows.
    col->shrink_padding_chars();

    // Values with an embedded NUL followed by padding. The byte counts are taken from sizeof
    // (minus the implicit terminator) instead of hand-written numbers: a hard-coded count that
    // is larger than the literal makes insert_data() read past the end of the array.
    static constexpr char kSpacePadded[] = "123\0   ";
    static constexpr char kJunkPadded[] = "456\0xx";
    col->insert_data(kSpacePadded, sizeof(kSpacePadded) - 1);
    col->insert_data(kJunkPadded, sizeof(kJunkPadded) - 1);
    col->insert_data("78", 2);
    col->shrink_padding_chars();

    EXPECT_EQ(col->size(), 3);
    EXPECT_EQ(col->get_data_at(0), StringRef("123"));
    EXPECT_EQ(col->get_data_at(0).size, 3);
    EXPECT_EQ(col->get_data_at(1), StringRef("456"));
    EXPECT_EQ(col->get_data_at(1).size, 3);
    // A value without any NUL byte keeps its full length.
    EXPECT_EQ(col->get_data_at(2), StringRef("78"));
    EXPECT_EQ(col->get_data_at(2).size, 2);

    // Inserting after the shrink still works; only the first two bytes are taken here.
    col->insert_data("xyz", 2); // only xy
    EXPECT_EQ(col->size(), 4);
    EXPECT_EQ(col->get_data_at(3), StringRef("xy"));
}
// Runs column_string_common_test with assert_sort_column_callback.
// NOTE(review): the meaning of the trailing `false` flag is defined by
// column_string_common_test, which is declared outside this part of the file.
TEST_F(ColumnStringTest, sort_column) {
column_string_common_test(assert_sort_column_callback, false);
}
// erase(0, 2) on a five-row column must drop the first two rows and keep the last three,
// both when the first row is non-empty and when it is an empty string.
TEST_F(ColumnStringTest, ScalaTypeStringTesterase) {
    auto erase_front_and_verify = [](const std::vector<StringRef>& rows) {
        auto column = ColumnString::create();
        for (const auto& row : rows) {
            column->insert_data(row.data, row.size);
        }

        column->erase(0, 2);
        EXPECT_EQ(column->size(), 3);
        // Remaining row i corresponds to input row i + 2.
        for (size_t i = 0; i < column->size(); ++i) {
            std::cout << column->get_data_at(i).to_string() << std::endl;
            EXPECT_EQ(column->get_data_at(i).to_string(), rows[i + 2].to_string());
        }
    };

    erase_front_and_verify({StringRef("asd"), StringRef("1234567"), StringRef("3"),
                            StringRef("4"), StringRef("5")});
    erase_front_and_verify({StringRef(""), StringRef("1234567"), StringRef("asd"),
                            StringRef("4"), StringRef("5")});
}
// erase(2, 2) on a five-row column must remove rows 2 and 3 and keep rows 0, 1 and 4,
// both when the first row is non-empty and when it is an empty string.
TEST_F(ColumnStringTest, ScalaTypeStringTest2erase) {
    auto erase_middle_and_verify = [](const std::vector<StringRef>& rows,
                                      const std::vector<StringRef>& expected) {
        auto column = ColumnString::create();
        for (const auto& row : rows) {
            column->insert_data(row.data, row.size);
        }

        column->erase(2, 2);
        EXPECT_EQ(column->size(), 3);
        for (size_t i = 0; i < column->size(); ++i) {
            std::cout << column->get_data_at(i).to_string() << std::endl;
            EXPECT_EQ(column->get_data_at(i).to_string(), expected[i].to_string());
        }
    };

    erase_middle_and_verify({StringRef("asd"), StringRef("1234567"), StringRef("3"),
                             StringRef("4"), StringRef("5")},
                            {StringRef("asd"), StringRef("1234567"), StringRef("5")});
    erase_middle_and_verify({StringRef(""), StringRef("1234567"), StringRef("asd"),
                             StringRef("4"), StringRef("5")},
                            {StringRef(""), StringRef("1234567"), StringRef("5")});
}
// is_ascii(): true for ASCII-only rows, false as soon as one row holds non-ASCII bytes, and
// true both for a column without rows and for a column whose rows are all empty strings.
TEST_F(ColumnStringTest, is_ascii) {
    // Builds a ColumnString holding the given rows in order.
    auto build_column = [](const std::vector<StringRef>& rows) {
        auto column = ColumnString::create();
        for (const auto& row : rows) {
            column->insert_data(row.data, row.size);
        }
        return column;
    };

    {
        auto column = build_column({StringRef("asd"), StringRef("1234567"), StringRef("3"),
                                    StringRef("4"), StringRef("5")});
        EXPECT_TRUE(column->is_ascii());
    }
    {
        // The last row is multi-byte text, so the column is no longer pure ASCII.
        auto column = build_column({StringRef("asd"), StringRef("1234567"),
                                    StringRef("3"), StringRef("4"),
                                    StringRef("5"), StringRef("你好世界")});
        EXPECT_FALSE(column->is_ascii());
    }
    {
        // Column without any rows.
        auto column = ColumnString::create();
        EXPECT_TRUE(column->is_ascii());
    }
    {
        // Rows exist but every one of them is an empty string. The rows have to be inserted
        // for this case to differ from the no-rows case above.
        auto column = build_column({StringRef(""), StringRef(""), StringRef(""),
                                    StringRef(""), StringRef(""), StringRef("")});
        EXPECT_EQ(column->size(), 6);
        EXPECT_TRUE(column->is_ascii());
    }
}
} // namespace doris::vectorized