blob: e3a8d242d2112379904222a764d011a80844e460 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "vec/utils/stringop_substring.h"
#include <gtest/gtest.h>
#include <cstdint>
#include <string>
#include <vector>
#include "common/logging.h"
#include "runtime/primitive_type.h"
#include "testutil/column_helper.h" // ColumnHelper is used for constructing columns and Block
#include "vec/columns/column_nullable.h"
#include "vec/columns/column_string.h"
#include "vec/core/block.h"
#include "vec/data_types/data_type_number.h"
#include "vec/data_types/data_type_string.h"
namespace doris::vectorized {
// Fills an initially empty string column row by row through
// StringOP::push_null_string, StringOP::push_empty_string and
// StringOP::push_value_string, then checks the null map, the offsets and the
// stored bytes of every row against the inputs.
TEST(StringOPTest, testStringPushOperations) {
    // Expected rows, covering a range of string shapes
    std::vector<std::string> expected_strings = {
            "",                               // Empty string
            "abc",                            // Simple ASCII string
            "中文测试",                       // UTF-8 multi-byte string
            "",                               // NULL marker
            " ",                              // Whitespace only
            "!@#$%^&*()",                     // Special characters
            "123456789012345678901234567890", // Medium-length string
            std::string(10000, 'x'),          // Very long string
            "混合English和中文",              // Mixed language
            "包含\n换行\t制表符",             // Contains escape characters
            "末尾有空格 "                     // Trailing spaces
    };
    std::vector<uint8_t> null_flags = {false, false, false, true,  false, false,
                                       false, false, false, false, false}; // The fourth one is NULL
    ASSERT_EQ(null_flags.size(), expected_strings.size());

    // Start from an empty nullable string column and write into its nested
    // ColumnString buffers directly.
    auto test_col = ColumnHelper::create_nullable_column<DataTypeString>({}, {});
    auto* test_column_nullable = dynamic_cast<const ColumnNullable*>(test_col.get());
    ASSERT_TRUE(test_column_nullable != nullptr) << "Test column is not a ColumnNullable.";
    auto& test_nested_column = const_cast<ColumnString&>(
            static_cast<const ColumnString&>(test_column_nullable->get_nested_column()));
    auto& test_chars = test_nested_column.get_chars();
    auto& test_offsets = test_nested_column.get_offsets();
    // The push functions take a row index, so the offsets and the null map are
    // sized for all rows up front.
    test_offsets.resize(expected_strings.size());
    NullMap test_null_map(expected_strings.size(), false);

    // Pick the push function that matches each row's kind
    for (size_t i = 0; i < expected_strings.size(); ++i) {
        if (null_flags[i]) {
            // Use push_null_string for NULL values
            StringOP::push_null_string(i, test_chars, test_offsets, test_null_map);
        } else if (expected_strings[i].empty()) {
            // Use push_empty_string for empty strings
            StringOP::push_empty_string(i, test_chars, test_offsets);
        } else {
            // Use push_value_string for normal strings
            StringOP::push_value_string(expected_strings[i], i, test_chars, test_offsets);
        }
    }

    for (size_t i = 0; i < expected_strings.size(); ++i) {
        // The null flag is checked for every row, NULL rows included, so that
        // the effect of push_null_string on the null map is covered as well.
        ASSERT_EQ(static_cast<bool>(test_null_map[i]), static_cast<bool>(null_flags[i]))
                << "Row " << i << " null flag mismatch.";
        if (null_flags[i]) {
            continue; // Skip content validation for NULL values
        }
        // The first row starts at byte 0; every other row starts where the
        // previous one ended. Spelled out so that i == 0 never indexes with a
        // wrapped-around size_t.
        const size_t row_begin = (i == 0) ? 0 : test_offsets[i - 1];
        const size_t row_length = test_offsets[i] - row_begin;
        ASSERT_EQ(row_length, expected_strings[i].size())
                << "Row " << i << " length mismatch: " << row_length << " vs "
                << expected_strings[i].size();
        std::string actual(
                static_cast<const char*>(static_cast<const void*>(test_chars.data() + row_begin)),
                row_length);
        ASSERT_EQ(actual, expected_strings[i]) << "Row " << i << " content mismatch.";
    }
}
// Same scenario as the plain push test, but non-empty rows are written with
// StringOP::push_value_string_reserved_and_allow_overflow after the character
// buffer has been reserved for the combined length of all rows.
TEST(StringOPTest, testPushValueStringReservedAndAllowOverFlow) {
    // Expected rows, covering a range of string shapes
    std::vector<std::string> expected_strings = {"",  "abc",        "中文测试",   "",
                                                 " ", "!@#$%^&*()", "1234567890", "xxxxxx",
                                                 "a", " ",          "!@#$%^&*()"};
    std::vector<uint8_t> null_flags = {false, false, false, true,  false, false,
                                       false, false, false, false, false}; // The fourth one is NULL
    ASSERT_EQ(null_flags.size(), expected_strings.size());

    // Start from an empty nullable string column and write into its nested
    // ColumnString buffers directly.
    auto test_col = ColumnHelper::create_nullable_column<DataTypeString>({}, {});
    auto* test_column_nullable = dynamic_cast<const ColumnNullable*>(test_col.get());
    ASSERT_TRUE(test_column_nullable != nullptr) << "Test column is not a ColumnNullable.";
    auto& test_nested_column = const_cast<ColumnString&>(
            static_cast<const ColumnString&>(test_column_nullable->get_nested_column()));
    auto& test_chars = test_nested_column.get_chars();
    auto& test_offsets = test_nested_column.get_offsets();
    // The push functions take a row index, so the offsets and the null map are
    // sized for all rows up front.
    test_offsets.resize(expected_strings.size());
    NullMap test_null_map(expected_strings.size(), false);

    // Reserve the character buffer once for the total length of all rows
    size_t total_length = 0;
    for (const auto& str : expected_strings) {
        total_length += str.size();
    }
    test_chars.reserve(total_length);

    for (size_t i = 0; i < expected_strings.size(); ++i) {
        if (null_flags[i]) {
            // Use push_null_string for NULL values
            StringOP::push_null_string(i, test_chars, test_offsets, test_null_map);
        } else if (expected_strings[i].empty()) {
            // Use push_empty_string for empty strings
            StringOP::push_empty_string(i, test_chars, test_offsets);
        } else {
            // Append into the capacity reserved above.
            // NOTE(review): presumably this variant skips its own capacity
            // growth and depends on that reservation - confirm against StringOP.
            StringOP::push_value_string_reserved_and_allow_overflow(expected_strings[i], i,
                                                                    test_chars, test_offsets);
        }
    }

    for (size_t i = 0; i < expected_strings.size(); ++i) {
        // The null flag is checked for every row, NULL rows included, so that
        // the effect of push_null_string on the null map is covered as well.
        ASSERT_EQ(static_cast<bool>(test_null_map[i]), static_cast<bool>(null_flags[i]))
                << "Row " << i << " null flag mismatch.";
        if (null_flags[i]) {
            continue; // Skip content validation for NULL values
        }
        // The first row starts at byte 0; every other row starts where the
        // previous one ended. Spelled out so that i == 0 never indexes with a
        // wrapped-around size_t.
        const size_t row_begin = (i == 0) ? 0 : test_offsets[i - 1];
        const size_t row_length = test_offsets[i] - row_begin;
        ASSERT_EQ(row_length, expected_strings[i].size())
                << "Row " << i << " length mismatch: " << row_length << " vs "
                << expected_strings[i].size();
        std::string actual(
                static_cast<const char*>(static_cast<const void*>(test_chars.data() + row_begin)),
                row_length);
        ASSERT_EQ(actual, expected_strings[i]) << "Row " << i << " content mismatch.";
    }
}
// Compares the output of StringOP::fast_repeat with a plain concatenation of
// the source string for repeat counts 0 through 10.
TEST(StringOPTest, testFastRepeat) {
    const std::string src = "example";
    // Byte view of the source, shared by every call below
    const auto* src_bytes = static_cast<const uint8_t*>(static_cast<const void*>(src.data()));

    // Zero repetitions: the destination buffer is sized src.size() * 0, i.e. it
    // is empty, and the call is made with that empty buffer.
    {
        const int32_t zero_times = 0;
        std::vector<uint8_t> out(src.size() * zero_times);
        StringOP::fast_repeat(out.data(), src_bytes, src.size(), zero_times);
        ASSERT_EQ(out.size(), 0);
    }

    // One to ten repetitions. The reference string grows by one copy of src per
    // iteration, so it always holds exactly `times` copies.
    std::string expected;
    for (int32_t times = 1; times <= 10; ++times) {
        expected += src;

        std::vector<uint8_t> out(src.size() * times);
        StringOP::fast_repeat(out.data(), src_bytes, src.size(), times);

        const auto* out_chars = static_cast<const char*>(static_cast<const void*>(out.data()));
        const std::string result(out_chars, out.size());
        ASSERT_EQ(result, expected) << "Failed for repeat_times = " << times;
    }
}
// Runs SubstringUtil::substring_execute on single-row blocks and compares the
// produced string with the expected one. Each case is a tuple of
// (input string, start, length, expected result). As the tables below show,
// start is 1-based, a negative start counts from the end of the string, and
// positions/lengths are counted in characters rather than bytes.
TEST(StringOPTest, testSubstringExecute) {
    using SubstringCase = std::tuple<std::string, int32_t, int32_t, std::string>;

    // Test case 1: Test empty string with various parameters
    const std::vector<SubstringCase> test_cases = {
            {"", 0, 0, ""},  // Empty string, zero parameters
            {"", 1, 5, ""},  // Empty string, positive start position
            {"", -1, 3, ""}, // Empty string, negative start position
            {"", 2, -1, ""}  // Empty string, negative length
    };
    // Test case 2: Add non-empty string test cases
    const std::vector<SubstringCase> more_test_cases = {
            {"hello", 1, 2, "he"},           // Normal substring from start
            {"hello", 2, 3, "ell"},          // Substring from middle
            {"hello", -3, 2, "ll"},          // Negative start position
            {"hello", 10, 2, ""},            // Start beyond string length
            {"hello", 1, 10, "hello"},       // Length beyond string end
            {"hello", 1, -1, ""},            // Negative length
            {"中文测试", 1, 2, "中文"},      // UTF-8 multi-byte string
            {"中文测试", 3, 2, "测试"},      // UTF-8 multi-byte string, partial
            {"混合English", 3, 7, "English"} // Mixed language string
    };

    // Both tables go through the same steps: empty-string cases first, then the
    // non-empty ones. A failed ASSERT returns from the test body, so the first
    // failing case stops the run.
    for (const auto* cases : {&test_cases, &more_test_cases}) {
        for (const auto& [input_str, start_val, len_val, expected] : *cases) {
            std::vector<std::string> input_strings = {input_str};
            std::vector<int32_t> starts = {start_val};
            std::vector<int32_t> lengths = {len_val};

            // Single-row, non-nullable argument columns
            auto col_strings = ColumnHelper::create_column<DataTypeString>(input_strings);
            auto col_starts = ColumnHelper::create_column<DataTypeInt32>(starts);
            auto col_lengths = ColumnHelper::create_column<DataTypeInt32>(lengths);

            // Block layout: 0 = str, 1 = start, 2 = len, 3 = result placeholder
            Block block;
            block.insert(ColumnWithTypeAndName(col_strings->clone(),
                                               std::make_shared<DataTypeString>(), "str"));
            block.insert(ColumnWithTypeAndName(col_starts->clone(),
                                               std::make_shared<DataTypeInt32>(), "start"));
            block.insert(ColumnWithTypeAndName(col_lengths->clone(),
                                               std::make_shared<DataTypeInt32>(), "len"));
            auto res_col = DataTypeString().create_column();
            block.insert(ColumnWithTypeAndName(std::move(res_col),
                                               std::make_shared<DataTypeString>(), "result"));

            ColumnNumbers arguments = {0, 1, 2};
            uint32_t result_index = 3;
            SubstringUtil::substring_execute(block, arguments, result_index, input_strings.size());

            // Verify the result
            auto result_column = block.get_by_position(result_index).column;
            ASSERT_EQ(result_column->size(), 1);
            ASSERT_EQ(result_column->get_data_at(0).to_string(), expected)
                    << "Failed for input='" << input_str << "', start=" << start_val
                    << ", len=" << len_val;
        }
    }
}
} // namespace doris::vectorized