// blob: d903d3abeff2fdc82a2279f4ee917f01110635b4 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "vec/columns/column_varbinary.h"
#include <gtest/gtest-message.h>
#include <gtest/gtest-test-part.h>
#include <gtest/gtest.h>
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>
#include "common/exception.h"
#include "runtime/primitive_type.h"
#include "vec/columns/column.h"
#include "vec/columns/column_string.h"
#include "vec/common/assert_cast.h"
#include "vec/common/string_ref.h"
#include "vec/common/string_view.h"
#include "vec/core/types.h"
namespace doris::vectorized {
// Fixture shared by all ColumnVarbinary tests. It holds no state; its only
// purpose is to expose the make_bytes() payload generator to the test bodies.
class ColumnVarbinaryTest : public ::testing::Test {
protected:
    void SetUp() override {}
    void TearDown() override {}

    // Builds a deterministic n-byte payload: byte i is (seed + i) narrowed to
    // one char. When n >= 3 the bytes at positions n/3 and 2n/3 are overwritten
    // with '\0' so the payload carries embedded zero bytes.
    static std::string make_bytes(size_t n, uint8_t seed = 0xAB) {
        std::string bytes(n, '\0');
        size_t pos = 0;
        for (char& ch : bytes) {
            ch = static_cast<char>(seed + pos);
            ++pos;
        }
        if (n < 3) {
            return bytes;
        }
        bytes[n / 3] = '\0';
        bytes[(2 * n) / 3] = '\0';
        return bytes;
    }
};
// Walks a fresh ColumnVarbinary through insert_data (short and long payloads),
// insert_default, pop_back and clear, checking row contents and byte_size()
// accounting after each step.
TEST_F(ColumnVarbinaryTest, BasicInsertGetPopClear) {
    auto col = ColumnVarbinary::create();
    EXPECT_EQ(col->get_name(), std::string("ColumnVarbinary"));
    EXPECT_EQ(col->size(), 0U);

    const size_t inline_len = std::min<size_t>(doris::StringView::kInlineSize, 8);
    const std::string small = make_bytes(inline_len, 0x11);
    const std::string big = make_bytes(doris::StringView::kInlineSize + 32, 0x22);

    const size_t before_bytes = col->byte_size();

    // Short payload: byte_size() must grow by exactly one StringView slot.
    col->insert_data(small.data(), small.size());
    EXPECT_EQ(col->size(), 1U);
    auto r0 = col->get_data_at(0);
    ASSERT_EQ(r0.size, small.size());
    ASSERT_EQ(r0.size, 8U);
    ASSERT_EQ(memcmp(r0.data, small.data(), small.size()), 0);
    const size_t after_small_bytes = col->byte_size();
    ASSERT_EQ(after_small_bytes - before_bytes, sizeof(doris::StringView));
    // Unsigned literal keeps both operands of the comparison the same
    // signedness, matching the other size checks in this file.
    ASSERT_EQ(after_small_bytes - before_bytes, 16U);

    // Default row reads back as an empty value.
    col->insert_default();
    EXPECT_EQ(col->size(), 2U);
    auto r1 = col->get_data_at(1);
    ASSERT_EQ(r1.size, 0U);

    // Long payload (kInlineSize + 32 bytes).
    col->insert_data(big.data(), big.size());
    EXPECT_EQ(col->size(), 3U);
    auto r2 = col->get_data_at(2);
    ASSERT_EQ(r2.size, big.size());
    ASSERT_EQ(r2.size, 44U);
    ASSERT_EQ(memcmp(r2.data, big.data(), big.size()), 0);
    const size_t after_big_bytes = col->byte_size();
    // big insert adds one StringView slot + big payload in arena (Arena may add alignment/overhead)
    const size_t diff = after_big_bytes - after_small_bytes;
    // The byte counters are attached to the assertion so they are reported only
    // when the check fails, instead of being printed to stdout on every run.
    ASSERT_GE(diff, sizeof(doris::StringView) + big.size())
            << "after_big_bytes: " << after_big_bytes
            << " after_small_bytes: " << after_small_bytes << " diff: " << diff;

    // pop_back removes the long row; the default row becomes the last one.
    col->pop_back(1);
    EXPECT_EQ(col->size(), 2U);
    auto r_back = col->get_data_at(1);
    ASSERT_EQ(r_back.size, 0U);

    // clear resets sizes (arena used_size becomes 0)
    col->clear();
    EXPECT_EQ(col->size(), 0U);
    EXPECT_EQ(col->byte_size(), 0U);
}
// Checks the three row-copy entry points -- insert_from, insert_range_from and
// insert_indices_from -- against a source column whose values have lengths
// 1, 2, kInlineSize + 5, 0 and 7.
TEST_F(ColumnVarbinaryTest, InsertFromAndRanges) {
    const std::vector<std::string> payloads = {
            make_bytes(1, 0x01), make_bytes(2, 0x02),
            make_bytes(doris::StringView::kInlineSize + 5, 0x03), make_bytes(0, 0x00),
            make_bytes(7, 0x05)};
    const size_t row_count = payloads.size();

    auto source = ColumnVarbinary::create();
    for (const auto& payload : payloads) {
        source->insert_data(payload.data(), payload.size());
    }

    // Row-by-row copy through insert_from must reproduce every value.
    auto copied_rows = ColumnVarbinary::create();
    for (size_t row = 0; row != row_count; ++row) {
        copied_rows->insert_from(*source, row);
    }
    ASSERT_EQ(copied_rows->size(), payloads.size());
    for (size_t row = 0; row != row_count; ++row) {
        const auto& want = payloads[row];
        auto got = copied_rows->get_data_at(row);
        ASSERT_EQ(got.size, want.size());
        ASSERT_EQ(std::memcmp(got.data, want.data(), got.size), 0);
    }

    // insert_range_from(start=1, length=3) must copy source rows 1, 2 and 3.
    auto copied_range = ColumnVarbinary::create();
    copied_range->insert_range_from(*source, 1, 3);
    ASSERT_EQ(copied_range->size(), 3U);
    for (size_t src_row = 1; src_row <= 3; ++src_row) {
        const auto& want = payloads[src_row];
        auto got = copied_range->get_data_at(src_row - 1);
        ASSERT_EQ(got.size, want.size());
        ASSERT_EQ(std::memcmp(got.data, want.data(), got.size), 0);
    }

    // insert_indices_from with a repeated index and a non-monotonic order.
    std::vector<uint32_t> picks = {4, 2, 2, 0};
    auto copied_picks = ColumnVarbinary::create();
    copied_picks->insert_indices_from(*source, picks.data(), picks.data() + picks.size());
    ASSERT_EQ(copied_picks->size(), picks.size());
    for (size_t pos = 0; pos != picks.size(); ++pos) {
        const auto& want = payloads[picks[pos]];
        auto got = copied_picks->get_data_at(pos);
        ASSERT_EQ(got.size, want.size());
        ASSERT_EQ(std::memcmp(got.data, want.data(), got.size), 0);
    }
}
// Applies the same 0/1 filter through both filter() overloads -- the const one
// returning a new column and the one returning the new size after filtering
// the column itself -- and checks that the surviving rows are 0, 2, 3 and 5.
TEST_F(ColumnVarbinaryTest, FilterBothModes) {
    // Short values mixed with values longer than kInlineSize.
    const std::vector<std::string> payloads = {
            make_bytes(1, 0x10),                                   // inline
            make_bytes(doris::StringView::kInlineSize + 5, 0x91), // non-inline (dropped)
            make_bytes(3, 0x12),                                   // inline
            make_bytes(doris::StringView::kInlineSize + 7, 0x92), // non-inline
            make_bytes(0, 0x00),                                   // empty (dropped)
            make_bytes(doris::StringView::kInlineSize + 9, 0x93)  // non-inline
    };
    IColumn::Filter mask = {1, 0, 1, 1, 0, 1};
    const size_t kept_count = 4; // number of ones in the mask
    const std::vector<size_t> surviving_rows = {0, 2, 3, 5};

    // Mode 1: const filter producing a separate result column.
    auto source = ColumnVarbinary::create();
    for (const auto& payload : payloads) {
        source->insert_data(payload.data(), payload.size());
    }
    const auto& const_source = assert_cast<const ColumnVarbinary&>(*source);
    ColumnPtr filtered_ptr = const_source.filter(mask, -1);
    const auto& filtered = assert_cast<const ColumnVarbinary&>(*filtered_ptr);
    ASSERT_EQ(filtered.size(), kept_count);
    for (size_t pos = 0; pos != surviving_rows.size(); ++pos) {
        const auto& want = payloads[surviving_rows[pos]];
        auto got = filtered.get_data_at(pos);
        ASSERT_EQ(got.size, want.size());
        ASSERT_EQ(std::memcmp(got.data, want.data(), got.size), 0);
    }

    // Mode 2: filter applied to the column itself, returning the new row count.
    auto inplace = ColumnVarbinary::create();
    for (const auto& payload : payloads) {
        inplace->insert_data(payload.data(), payload.size());
    }
    size_t reported_rows = inplace->filter(mask);
    ASSERT_EQ(reported_rows, kept_count);
    ASSERT_EQ(inplace->size(), kept_count);
    for (size_t pos = 0; pos != surviving_rows.size(); ++pos) {
        const auto& want = payloads[surviving_rows[pos]];
        auto got = inplace->get_data_at(pos);
        ASSERT_EQ(got.size, want.size());
        ASSERT_EQ(std::memcmp(got.data, want.data(), got.size), 0);
    }
}
// Checks permute() with a limit smaller than the column and with a limit equal
// to the column size; output row i must equal input row perm[i].
TEST_F(ColumnVarbinaryTest, Permute) {
    // Two short values and two values longer than kInlineSize.
    const std::vector<std::string> payloads = {
            make_bytes(1, 0x20),                                   // inline
            make_bytes(doris::StringView::kInlineSize + 3, 0xA0), // non-inline
            make_bytes(3, 0x22),                                   // inline
            make_bytes(doris::StringView::kInlineSize + 8, 0xA1)  // non-inline
    };
    auto source = ColumnVarbinary::create();
    for (const auto& payload : payloads) {
        source->insert_data(payload.data(), payload.size());
    }

    IColumn::Permutation order = {3, 1, 2, 0};

    // Limit below the column size: only the first three permuted rows come back.
    ColumnPtr limited_ptr = source->permute(order, 3);
    const auto& limited = assert_cast<const ColumnVarbinary&>(*limited_ptr);
    ASSERT_EQ(limited.size(), 3U);
    for (size_t pos = 0; pos != 3; ++pos) {
        const auto& want = payloads[order[pos]];
        auto got = limited.get_data_at(pos);
        ASSERT_EQ(got.size, want.size());
        ASSERT_EQ(std::memcmp(got.data, want.data(), got.size), 0);
    }

    // Limit equal to the column size: every row comes back, reordered.
    ColumnPtr complete_ptr = source->permute(order, payloads.size());
    const auto& complete = assert_cast<const ColumnVarbinary&>(*complete_ptr);
    ASSERT_EQ(complete.size(), payloads.size());
    for (size_t pos = 0; pos != payloads.size(); ++pos) {
        const auto& want = payloads[order[pos]];
        auto got = complete.get_data_at(pos);
        ASSERT_EQ(got.size, want.size());
        ASSERT_EQ(std::memcmp(got.data, want.data(), got.size), 0);
    }
}
// Checks clone_resized() when growing (3 -> 5 rows, extra rows must be empty)
// and when shrinking (3 -> 2 rows, leading rows must be preserved).
TEST_F(ColumnVarbinaryTest, CloneResized) {
    const std::vector<std::string> payloads = {
            make_bytes(1, 0x30), make_bytes(0, 0x00),
            make_bytes(doris::StringView::kInlineSize + 1, 0x31)};
    auto source = ColumnVarbinary::create();
    for (const auto& payload : payloads) {
        source->insert_data(payload.data(), payload.size());
    }

    // Grow: original rows first, then default rows up to the requested size.
    auto grown_ptr = source->clone_resized(5);
    const auto& grown = assert_cast<const ColumnVarbinary&>(*grown_ptr);
    ASSERT_EQ(grown.size(), 5U);
    for (size_t row = 0; row != 5; ++row) {
        auto got = grown.get_data_at(row);
        if (row < payloads.size()) {
            ASSERT_EQ(got.size, payloads[row].size());
            ASSERT_EQ(std::memcmp(got.data, payloads[row].data(), got.size), 0);
        } else {
            ASSERT_EQ(got.size, 0U); // default rows
        }
    }

    // Shrink: only the first two rows remain.
    auto shrunk_ptr = source->clone_resized(2);
    const auto& shrunk = assert_cast<const ColumnVarbinary&>(*shrunk_ptr);
    ASSERT_EQ(shrunk.size(), 2U);
    for (size_t row = 0; row != 2; ++row) {
        auto got = shrunk.get_data_at(row);
        ASSERT_EQ(got.size, payloads[row].size());
        ASSERT_EQ(std::memcmp(got.data, payloads[row].data(), got.size), 0);
    }
}
// Checks replace_column_data() when the overwritten row holds a short value and
// the replacement is either short or longer than kInlineSize.
TEST_F(ColumnVarbinaryTest, ReplaceColumnData) {
    // Target column: short, long, short.
    const std::vector<std::string> target_vals = {
            make_bytes(2, 0x40),                                   // inline
            make_bytes(doris::StringView::kInlineSize + 4, 0xB0), // non-inline
            make_bytes(4, 0x42)                                    // inline
    };
    auto target = ColumnVarbinary::create();
    for (const auto& payload : target_vals) {
        target->insert_data(payload.data(), payload.size());
    }

    // Replacement column: long, short.
    const std::vector<std::string> donor_vals = {
            make_bytes(doris::StringView::kInlineSize + 7, 0xC0), // non-inline
            make_bytes(1, 0x51)                                    // inline
    };
    auto donor = ColumnVarbinary::create();
    for (const auto& payload : donor_vals) {
        donor->insert_data(payload.data(), payload.size());
    }

    // Short target row 0 <- short donor row 1.
    target->replace_column_data(*donor, /*row=*/1, /*self_row=*/0);
    auto first = target->get_data_at(0);
    ASSERT_EQ(first.size, donor_vals[1].size());
    ASSERT_EQ(std::memcmp(first.data, donor_vals[1].data(), first.size), 0);

    // Short target row 2 <- long donor row 0.
    target->replace_column_data(*donor, /*row=*/0, /*self_row=*/2);
    auto third = target->get_data_at(2);
    ASSERT_EQ(third.size, donor_vals[0].size());
    ASSERT_EQ(std::memcmp(third.data, donor_vals[0].data(), third.size), 0);
}
// Hand-builds a serialized row (uint32_t length header followed by the payload)
// and checks that deserialize_and_insert_from_arena() consumes exactly that many
// bytes and appends the payload as a new row.
TEST_F(ColumnVarbinaryTest, SerializeDeserializeRoundtripManual) {
    std::string payload = make_bytes(doris::StringView::kInlineSize + 17, 0x60);
    const auto header = static_cast<uint32_t>(payload.size());
    const size_t encoded_len = sizeof(uint32_t) + payload.size();

    // Layout: [header][payload bytes].
    std::vector<char> encoded(encoded_len);
    std::memcpy(encoded.data(), &header, sizeof(uint32_t));
    std::memcpy(encoded.data() + sizeof(uint32_t), payload.data(), payload.size());

    auto col = ColumnVarbinary::create();
    const char* start = encoded.data();
    const char* stop = col->deserialize_and_insert_from_arena(start);

    // The returned pointer must sit right after the consumed header + payload.
    ASSERT_EQ(static_cast<size_t>(stop - start), sizeof(uint32_t) + payload.size());
    ASSERT_EQ(col->size(), 1U);
    auto got = col->get_data_at(0);
    ASSERT_EQ(got.size, payload.size());
    ASSERT_EQ(std::memcmp(got.data, payload.data(), got.size), 0);
}
// serialize_size_at() for a 9-byte row must report the payload length plus the
// 4-byte (uint32_t) length header.
TEST_F(ColumnVarbinaryTest, SerializeSizeAtShouldIncludeLengthHeader) {
    const std::string payload = make_bytes(9, 0x70);
    auto col = ColumnVarbinary::create();
    col->insert_data(payload.data(), payload.size());

    const size_t reported = col->serialize_size_at(0);
    EXPECT_EQ(reported, payload.size() + sizeof(uint32_t));
}
// Reads every row back as a Field through both operator[] and get(size_t, Field&)
// and checks that the StringViewField inside carries the inserted bytes.
TEST_F(ColumnVarbinaryTest, FieldAccessOperatorAndGet) {
    // Short, empty, and longer-than-kInlineSize values.
    const std::vector<std::string> payloads = {
            make_bytes(1, 0x11), make_bytes(0, 0x00),
            make_bytes(doris::StringView::kInlineSize + 6, 0x12)};
    auto col = ColumnVarbinary::create();
    for (const auto& payload : payloads) {
        col->insert_data(payload.data(), payload.size());
    }

    for (size_t row = 0; row != payloads.size(); ++row) {
        const auto& want = payloads[row];

        // Access path 1: operator[].
        Field via_index = (*col)[row];
        const auto& indexed = vectorized::get<const StringViewField&>(via_index);
        ASSERT_EQ(indexed.size(), want.size());
        ASSERT_EQ(std::memcmp(indexed.data(), want.data(), indexed.size()), 0);

        // Access path 2: get() filling a caller-provided Field.
        Field via_get;
        col->get(row, via_get);
        const auto& fetched = vectorized::get<const StringViewField&>(via_get);
        ASSERT_EQ(fetched.size(), want.size());
        ASSERT_EQ(std::memcmp(fetched.data(), want.data(), fetched.size()), 0);
    }
}
// Inserts two TYPE_VARBINARY Fields (one short, one longer than kInlineSize)
// via insert(Field) and checks both rows read back byte-for-byte.
TEST_F(ColumnVarbinaryTest, InsertField) {
    std::string short_bytes = make_bytes(2, 0x21);
    std::string long_bytes = make_bytes(doris::StringView::kInlineSize + 10, 0x22);

    // Wrap each payload in a StringView-backed Field.
    const auto short_len = static_cast<uint32_t>(short_bytes.size());
    const auto long_len = static_cast<uint32_t>(long_bytes.size());
    Field short_field =
            Field::create_field<TYPE_VARBINARY>(doris::StringView(short_bytes.data(), short_len));
    Field long_field =
            Field::create_field<TYPE_VARBINARY>(doris::StringView(long_bytes.data(), long_len));

    auto col = ColumnVarbinary::create();
    col->insert(short_field);
    col->insert(long_field);
    ASSERT_EQ(col->size(), 2U);

    auto first = col->get_data_at(0);
    auto second = col->get_data_at(1);
    ASSERT_EQ(first.size, short_bytes.size());
    ASSERT_EQ(std::memcmp(first.data, short_bytes.data(), first.size), 0);
    ASSERT_EQ(second.size, long_bytes.size());
    ASSERT_EQ(std::memcmp(second.data, long_bytes.data(), second.size), 0);
}
// Checks the serialized row layout (uint32_t length header + payload) produced
// by serialize_value_into_arena() for a short and a long row, and by a direct
// serialize_impl() call for the long row.
TEST_F(ColumnVarbinaryTest, SerializeValueIntoArenaAndImpl) {
    const std::string short_bytes = make_bytes(3, 0x31);                                  // inline
    const std::string long_bytes = make_bytes(doris::StringView::kInlineSize + 12, 0x32); // non-inline
    auto col = ColumnVarbinary::create();
    col->insert_data(short_bytes.data(), short_bytes.size());
    col->insert_data(long_bytes.data(), long_bytes.size());

    // serialize_value_into_arena (covers serialize_impl indirectly)
    Arena arena;
    const char* begin = nullptr;

    auto short_ref = col->serialize_value_into_arena(0, arena, begin);
    ASSERT_EQ(short_ref.size, short_bytes.size() + sizeof(uint32_t));
    uint32_t short_header;
    std::memcpy(&short_header, short_ref.data, sizeof(uint32_t));
    ASSERT_EQ(short_header, short_bytes.size());
    ASSERT_EQ(std::memcmp(short_ref.data + sizeof(uint32_t), short_bytes.data(),
                          short_bytes.size()),
              0);

    auto long_ref = col->serialize_value_into_arena(1, arena, begin);
    ASSERT_EQ(long_ref.size, long_bytes.size() + sizeof(uint32_t));
    uint32_t long_header;
    std::memcpy(&long_header, long_ref.data, sizeof(uint32_t));
    ASSERT_EQ(long_header, long_bytes.size());
    ASSERT_EQ(std::memcmp(long_ref.data + sizeof(uint32_t), long_bytes.data(), long_bytes.size()),
              0);

    // Direct serialize_impl call into a caller-owned stack buffer.
    char scratch[4096];
    size_t emitted = col->serialize_impl(scratch, 1);
    ASSERT_EQ(emitted, long_bytes.size() + sizeof(uint32_t));
    uint32_t direct_header;
    std::memcpy(&direct_header, scratch, sizeof(uint32_t));
    ASSERT_EQ(direct_header, long_bytes.size());
    ASSERT_EQ(std::memcmp(scratch + sizeof(uint32_t), long_bytes.data(), long_bytes.size()), 0);
}
// Checks allocated_bytes() >= byte_size() and the has_enough_capacity() boundary:
// a source with (free slots - 1) rows fits, a source with exactly (free slots)
// rows is reported as not fitting.
TEST_F(ColumnVarbinaryTest, AllocatedBytesAndHasEnoughCapacity) {
    auto dest = ColumnVarbinary::create();

    // 64 short rows (lengths 1..5) so the destination ends up with spare capacity.
    for (int i = 0; i != 64; ++i) {
        const auto seed = static_cast<uint8_t>(0x40 + i);
        std::string row = make_bytes((i % 5) + 1, seed); // mostly inline
        dest->insert_data(row.data(), row.size());
    }
    // Three rows longer than kInlineSize so non-inline storage is used as well.
    for (int i = 0; i != 3; ++i) {
        const auto seed = static_cast<uint8_t>(0x90 + i);
        auto row = make_bytes(doris::StringView::kInlineSize + 20 + i, seed);
        dest->insert_data(row.data(), row.size());
    }

    // Snapshot slot capacity and row count; the test needs at least one free slot.
    const size_t capacity = dest->get_data().capacity();
    const size_t rows = dest->size();
    ASSERT_GT(capacity, rows);

    // Allocated memory can never be smaller than the bytes in use.
    ASSERT_GE(dest->allocated_bytes(), dest->byte_size());

    const size_t free_slots = capacity - rows;

    // One row fewer than the free slots -> must fit.
    const std::string small_filler = make_bytes(1, 0x55);
    auto src_small = ColumnVarbinary::create();
    for (size_t remaining = free_slots - 1; remaining > 0; --remaining) { // leave at least 1 slot
        src_small->insert_data(small_filler.data(), small_filler.size());
    }
    ASSERT_TRUE(dest->has_enough_capacity(*src_small));

    // src_big exactly fills free slots -> expect false (need strictly greater)
    const std::string big_filler = make_bytes(1, 0x66);
    auto src_big = ColumnVarbinary::create();
    for (size_t remaining = free_slots; remaining > 0; --remaining) {
        src_big->insert_data(big_filler.data(), big_filler.size());
    }
    ASSERT_FALSE(dest->has_enough_capacity(*src_big));
}
// insert_range_from() with a range reaching past the end of a 2-row source
// (start 1, length 5) is expected to throw doris::Exception.
TEST_F(ColumnVarbinaryTest, InsertRangeFromOutOfBoundsThrows) {
    const std::vector<std::string> payloads = {make_bytes(2, 0x10), make_bytes(3, 0x20)};
    auto source = ColumnVarbinary::create();
    for (const auto& payload : payloads) {
        source->insert_data(payload.data(), payload.size());
    }

    auto target = ColumnVarbinary::create();
    EXPECT_THROW(target->insert_range_from(*source, /*start=*/1, /*length=*/5), doris::Exception);
}
// get_max_row_byte_size() over an empty row, a short row, a row with an explicit
// embedded '\0' and a row longer than kInlineSize must equal the longest payload
// plus the uint32_t length header.
TEST_F(ColumnVarbinaryTest, GetMaxRowByteSizeMix) {
    const std::string no_bytes;
    const std::string short_bytes = make_bytes(3, 0x01);  // inline (<= kInlineSize)
    const std::string zero_inside = std::string("AB\0CD", 5); // explicit embedded zero
    const std::string long_bytes = make_bytes(doris::StringView::kInlineSize + 15, 0x11); // non-inline

    auto col = ColumnVarbinary::create();
    col->insert_data(no_bytes.data(), no_bytes.size());
    col->insert_data(short_bytes.data(), short_bytes.size());
    col->insert_data(zero_inside.data(), zero_inside.size());
    col->insert_data(long_bytes.data(), long_bytes.size());

    const size_t longest = std::max(
            {no_bytes.size(), short_bytes.size(), zero_inside.size(), long_bytes.size()});
    const size_t want = longest + sizeof(uint32_t);
    ASSERT_EQ(col->get_max_row_byte_size(), want);
}
// Round-trips a column through the StringRef-array serialize()/deserialize()
// pair: each row is written into its own buffer as [uint32_t length][payload],
// then read back into a second column.
TEST_F(ColumnVarbinaryTest, SerializeDeserializeKeysArray) {
    const std::vector<std::string> payloads = {
            std::string(),                                         // empty
            std::string("Z", 1),                                   // single char inline
            std::string("A\0B", 3),                                // inline with zero
            make_bytes(doris::StringView::kInlineSize + 2, 0x33),  // non-inline small
            make_bytes(doris::StringView::kInlineSize + 25, 0x44)  // non-inline larger
    };
    auto source = ColumnVarbinary::create();
    for (const auto& payload : payloads) {
        source->insert_data(payload.data(), payload.size());
    }

    const size_t row_count = payloads.size();
    const size_t row_capacity = source->get_max_row_byte_size();

    // One independent buffer per row, each large enough for the biggest row;
    // the key's size field starts at 0 (no bytes written yet).
    std::vector<std::vector<char>> storage(row_count);
    std::vector<StringRef> out_keys(row_count);
    for (size_t row = 0; row != row_count; ++row) {
        storage[row].resize(row_capacity);
        out_keys[row].data = storage[row].data();
        out_keys[row].size = 0;
    }

    // serialize each row independently
    source->serialize(out_keys.data(), row_count);
    for (size_t row = 0; row != row_count; ++row) {
        const auto& want = payloads[row];
        const char* raw = storage[row].data();

        // The key records how many bytes were written.
        ASSERT_EQ(out_keys[row].size, want.size() + sizeof(uint32_t));

        // Header holds the payload length; payload follows immediately.
        uint32_t header;
        std::memcpy(&header, raw, sizeof(uint32_t));
        ASSERT_EQ(header, want.size());
        ASSERT_EQ(std::memcmp(raw + sizeof(uint32_t), want.data(), want.size()), 0);
    }

    // Keys for reading: same buffers, size = bytes still to be consumed.
    auto restored = ColumnVarbinary::create();
    std::vector<StringRef> in_keys(row_count);
    for (size_t row = 0; row != row_count; ++row) {
        in_keys[row].data = storage[row].data();
        in_keys[row].size = out_keys[row].size;
    }
    restored->deserialize(in_keys.data(), row_count);
    ASSERT_EQ(restored->size(), row_count);
    for (size_t row = 0; row != row_count; ++row) {
        const auto& want = payloads[row];
        auto got = restored->get_data_at(row);
        ASSERT_EQ(got.size, want.size());
        ASSERT_EQ(std::memcmp(got.data, want.data(), got.size), 0);
        // Every serialized byte of this key must have been consumed.
        ASSERT_EQ(in_keys[row].size, 0U);
    }
}
// permute() with a one-entry permutation and a limit of 2 is expected to throw
// doris::Exception.
TEST_F(ColumnVarbinaryTest, PermuteThrowsOnShortPermutation) {
    const std::vector<std::string> payloads = {
            make_bytes(1, 0x31), make_bytes(1, 0x32),
            make_bytes(doris::StringView::kInlineSize + 2, 0x33)};
    auto source = ColumnVarbinary::create();
    for (const auto& payload : payloads) {
        source->insert_data(payload.data(), payload.size());
    }

    IColumn::Permutation too_short = {1};
    EXPECT_THROW(source->permute(too_short, 2), doris::Exception);
}
// Checks replace_column_data() when the overwritten row currently holds a value
// longer than kInlineSize: first replaced by a short value, then by another
// long value.
TEST_F(ColumnVarbinaryTest, ReplaceColumnDataOnNonInlineTarget) {
    const std::string target_short = make_bytes(3, 0x41);                                  // inline
    const std::string target_long = make_bytes(doris::StringView::kInlineSize + 5, 0xB1); // non-inline
    const std::string donor_long = make_bytes(doris::StringView::kInlineSize + 7, 0xB2);  // non-inline
    const std::string donor_short = make_bytes(2, 0x51);                                   // inline

    auto target = ColumnVarbinary::create();
    target->insert_data(target_short.data(), target_short.size());
    target->insert_data(target_long.data(), target_long.size()); // row 1: non-inline target

    auto donor = ColumnVarbinary::create();
    donor->insert_data(donor_short.data(), donor_short.size()); // row 0 inline
    donor->insert_data(donor_long.data(), donor_long.size());   // row 1 non-inline

    // Long target row 1 <- short donor row 0.
    target->replace_column_data(*donor, /*row=*/0, /*self_row=*/1);
    auto after_short = target->get_data_at(1);
    ASSERT_EQ(after_short.size, donor_short.size());
    ASSERT_EQ(std::memcmp(after_short.data, donor_short.data(), after_short.size), 0);

    // Same target row 1 <- long donor row 1.
    target->replace_column_data(*donor, /*row=*/1, /*self_row=*/1);
    auto after_long = target->get_data_at(1);
    ASSERT_EQ(after_long.size, donor_long.size());
    ASSERT_EQ(std::memcmp(after_long.data, donor_long.data(), after_long.size), 0);
}
// serialize_size_at() must report payload length + uint32_t header for both a
// 4-byte row and a row longer than kInlineSize.
TEST_F(ColumnVarbinaryTest, SerializeSizeAtForNonInline) {
    const std::string short_bytes = make_bytes(4, 0x61);
    const std::string long_bytes = make_bytes(doris::StringView::kInlineSize + 9, 0x62);

    auto col = ColumnVarbinary::create();
    col->insert_data(short_bytes.data(), short_bytes.size());
    col->insert_data(long_bytes.data(), long_bytes.size());

    const size_t header_len = sizeof(uint32_t);
    EXPECT_EQ(col->serialize_size_at(0), short_bytes.size() + header_len);
    EXPECT_EQ(col->serialize_size_at(1), long_bytes.size() + header_len);
}
// clone_resized(0) on a two-row column must yield a column with no rows.
TEST_F(ColumnVarbinaryTest, CloneResizedZero) {
    auto source = ColumnVarbinary::create();
    source->insert_data("a", 1);
    source->insert_data("", 0);

    auto emptied_ptr = source->clone_resized(0);
    const auto& emptied = assert_cast<const ColumnVarbinary&>(*emptied_ptr);
    EXPECT_EQ(emptied.size(), 0U);
}
// Checks get_permutation() in ascending and descending mode. The returned
// permutation is expected to cover every row even when a smaller limit is
// passed, and neighbouring entries must be ordered according to compare_at().
TEST_F(ColumnVarbinaryTest, GetPermutationAscDescIgnoreLimit) {
    // Values sharing prefixes, some with embedded zero bytes, plus one value
    // longer than kInlineSize.
    const std::vector<std::string> payloads = {
            std::string("aa"),
            std::string("aa\0", 3),
            std::string("aa\0b", 4),
            std::string("aaa"),
            make_bytes(doris::StringView::kInlineSize + 5, 0x50), // non-inline high bytes
            std::string("aab"),
            std::string("aa\0aa", 5)};
    auto col = ColumnVarbinary::create();
    for (const auto& payload : payloads) {
        col->insert_data(payload.data(), payload.size());
    }

    // Ascending, with limit 3: the test still expects a full-size permutation.
    IColumn::Permutation ascending;
    col->get_permutation(/*reverse=*/false, /*limit=*/3, /*nan_hint=*/0,
                         ascending); // limit ignored by impl
    ASSERT_EQ(ascending.size(), payloads.size());
    for (size_t pos = 1; pos < ascending.size(); ++pos) {
        int order = col->compare_at(ascending[pos - 1], ascending[pos], *col, 0);
        ASSERT_LE(order, 0) << "Permutation not ascending at position " << pos;
    }

    // Descending, with limit equal to the row count.
    IColumn::Permutation descending;
    col->get_permutation(/*reverse=*/true, /*limit=*/payloads.size(), /*nan_hint=*/0,
                         descending);
    ASSERT_EQ(descending.size(), payloads.size());
    for (size_t pos = 1; pos < descending.size(); ++pos) {
        int order = col->compare_at(descending[pos - 1], descending[pos], *col, 0);
        ASSERT_GE(order, 0) << "Permutation not descending at position " << pos;
    }
}
// Exercises insert_many_strings() on one accumulating column with: an empty
// batch, a single value, short values, values longer than kInlineSize, values
// containing zero bytes, a mixed batch, and 16-byte UUID-like values. Every
// inserted row is checked for both length and payload bytes.
TEST_F(ColumnVarbinaryTest, InsertManyStrings) {
    auto col = ColumnVarbinary::create();

    // Test 1: Insert empty array
    {
        std::vector<StringRef> empty_refs;
        col->insert_many_strings(empty_refs.data(), empty_refs.size());
        EXPECT_EQ(col->size(), 0U);
    }

    // Test 2: Insert single string
    {
        std::string s1 = "hello";
        StringRef ref1(s1.data(), s1.size());
        col->insert_many_strings(&ref1, 1);
        EXPECT_EQ(col->size(), 1U);
        auto data = col->get_data_at(0);
        EXPECT_EQ(data.size, 5U);
        EXPECT_EQ(memcmp(data.data, "hello", 5), 0);
    }

    // Test 3: Insert multiple inline strings (size <= kInlineSize)
    {
        std::string s2 = "abc";
        std::string s3 = "def";
        std::string s4 = make_bytes(doris::StringView::kInlineSize, 0xAA);
        std::vector<StringRef> refs = {StringRef(s2.data(), s2.size()),
                                       StringRef(s3.data(), s3.size()),
                                       StringRef(s4.data(), s4.size())};
        col->insert_many_strings(refs.data(), refs.size());
        EXPECT_EQ(col->size(), 4U); // 1 from test 2 + 3 new
        auto data1 = col->get_data_at(1);
        EXPECT_EQ(data1.size, 3U);
        EXPECT_EQ(memcmp(data1.data, "abc", 3), 0);
        auto data2 = col->get_data_at(2);
        EXPECT_EQ(data2.size, 3U);
        EXPECT_EQ(memcmp(data2.data, "def", 3), 0);
        auto data3 = col->get_data_at(3);
        EXPECT_EQ(data3.size, doris::StringView::kInlineSize);
        EXPECT_EQ(memcmp(data3.data, s4.data(), s4.size()), 0);
    }

    // Test 4: Insert multiple large strings (size > kInlineSize)
    {
        std::string large1 = make_bytes(doris::StringView::kInlineSize + 10, 0x11);
        std::string large2 = make_bytes(doris::StringView::kInlineSize + 20, 0x22);
        std::string large3 = make_bytes(doris::StringView::kInlineSize + 30, 0x33);
        std::vector<StringRef> large_refs = {StringRef(large1.data(), large1.size()),
                                             StringRef(large2.data(), large2.size()),
                                             StringRef(large3.data(), large3.size())};
        size_t before_size = col->size();
        col->insert_many_strings(large_refs.data(), large_refs.size());
        EXPECT_EQ(col->size(), before_size + 3);
        auto data_large1 = col->get_data_at(before_size);
        EXPECT_EQ(data_large1.size, large1.size());
        EXPECT_EQ(memcmp(data_large1.data, large1.data(), large1.size()), 0);
        auto data_large2 = col->get_data_at(before_size + 1);
        EXPECT_EQ(data_large2.size, large2.size());
        EXPECT_EQ(memcmp(data_large2.data, large2.data(), large2.size()), 0);
        auto data_large3 = col->get_data_at(before_size + 2);
        EXPECT_EQ(data_large3.size, large3.size());
        EXPECT_EQ(memcmp(data_large3.data, large3.data(), large3.size()), 0);
    }

    // Test 5: Insert strings with null bytes
    {
        std::string null_str1 = std::string("abc\0def", 7);
        std::string null_str2 = std::string("\0\0\0", 3);
        std::vector<StringRef> null_refs = {StringRef(null_str1.data(), null_str1.size()),
                                            StringRef(null_str2.data(), null_str2.size())};
        size_t before_size = col->size();
        col->insert_many_strings(null_refs.data(), null_refs.size());
        EXPECT_EQ(col->size(), before_size + 2);
        auto data_null1 = col->get_data_at(before_size);
        EXPECT_EQ(data_null1.size, 7U);
        EXPECT_EQ(memcmp(data_null1.data, null_str1.data(), 7), 0);
        auto data_null2 = col->get_data_at(before_size + 1);
        EXPECT_EQ(data_null2.size, 3U);
        EXPECT_EQ(memcmp(data_null2.data, null_str2.data(), 3), 0);
    }

    // Test 6: Insert mixed inline and non-inline strings
    {
        std::string small = "xy";
        std::string medium = make_bytes(doris::StringView::kInlineSize, 0xBB);
        std::string large = make_bytes(doris::StringView::kInlineSize + 50, 0xCC);
        std::vector<StringRef> mixed_refs = {StringRef(small.data(), small.size()),
                                             StringRef(medium.data(), medium.size()),
                                             StringRef(large.data(), large.size())};
        size_t before_size = col->size();
        col->insert_many_strings(mixed_refs.data(), mixed_refs.size());
        EXPECT_EQ(col->size(), before_size + 3);
        auto data_small = col->get_data_at(before_size);
        EXPECT_EQ(data_small.size, 2U);
        EXPECT_EQ(memcmp(data_small.data, "xy", 2), 0);
        auto data_medium = col->get_data_at(before_size + 1);
        EXPECT_EQ(data_medium.size, doris::StringView::kInlineSize);
        // The exactly-kInlineSize row sits between a short and a long value in
        // this batch; verify its bytes too, not just its length.
        EXPECT_EQ(memcmp(data_medium.data, medium.data(), medium.size()), 0);
        auto data_large = col->get_data_at(before_size + 2);
        EXPECT_EQ(data_large.size, large.size());
        EXPECT_EQ(memcmp(data_large.data, large.data(), large.size()), 0);
    }

    // Test 7: Insert UUID-like binary data (16 bytes)
    {
        std::string uuid1 = make_bytes(16, 0x55);
        std::string uuid2 = make_bytes(16, 0x12);
        std::vector<StringRef> uuid_refs = {StringRef(uuid1.data(), uuid1.size()),
                                            StringRef(uuid2.data(), uuid2.size())};
        size_t before_size = col->size();
        col->insert_many_strings(uuid_refs.data(), uuid_refs.size());
        EXPECT_EQ(col->size(), before_size + 2);
        auto data_uuid1 = col->get_data_at(before_size);
        EXPECT_EQ(data_uuid1.size, 16U);
        EXPECT_EQ(memcmp(data_uuid1.data, uuid1.data(), 16), 0);
        auto data_uuid2 = col->get_data_at(before_size + 1);
        EXPECT_EQ(data_uuid2.size, 16U);
        EXPECT_EQ(memcmp(data_uuid2.data, uuid2.data(), 16), 0);
    }
}
// Exercises insert_many_strings_overflow() with various max_length arguments on
// one accumulating column. The assertions pin the behaviour that values are
// stored in full, including when max_length is smaller than the value.
TEST_F(ColumnVarbinaryTest, InsertManyStringsOverflow) {
    auto col = ColumnVarbinary::create();

    // Case 1: max_length (100) well above the value lengths.
    {
        std::string first = "hello";
        std::string second = "world";
        std::vector<StringRef> batch = {StringRef(first.data(), first.size()),
                                        StringRef(second.data(), second.size())};
        col->insert_many_strings_overflow(batch.data(), batch.size(), 100);
        EXPECT_EQ(col->size(), 2U);

        auto got_first = col->get_data_at(0);
        EXPECT_EQ(got_first.size, 5U);
        EXPECT_EQ(std::memcmp(got_first.data, "hello", 5), 0);

        auto got_second = col->get_data_at(1);
        EXPECT_EQ(got_second.size, 5U);
        EXPECT_EQ(std::memcmp(got_second.data, "world", 5), 0);
    }

    // Case 2: max_length equal to the value length (exact fit).
    {
        std::string exact = "test123";
        StringRef exact_ref(exact.data(), exact.size());
        col->insert_many_strings_overflow(&exact_ref, 1, 7);
        EXPECT_EQ(col->size(), 3U);

        auto got = col->get_data_at(2);
        EXPECT_EQ(got.size, 7U);
        EXPECT_EQ(std::memcmp(got.data, "test123", 7), 0);
    }

    // Case 3: max_length (50) below the value length.
    // Note: Current implementation doesn't actually truncate, it just calls insert_many_strings
    // This case verifies the current behavior
    {
        std::string oversized = make_bytes(doris::StringView::kInlineSize + 100, 0xAA);
        StringRef oversized_ref(oversized.data(), oversized.size());
        const size_t first_new_row = col->size();
        col->insert_many_strings_overflow(&oversized_ref, 1, 50);
        EXPECT_EQ(col->size(), first_new_row + 1);

        auto got = col->get_data_at(first_new_row);
        // Current implementation doesn't truncate, so full size is preserved
        EXPECT_EQ(got.size, oversized.size());
        EXPECT_EQ(std::memcmp(got.data, oversized.data(), oversized.size()), 0);
    }

    // Case 4: several values (20, 30 and 40 bytes) in one call.
    {
        std::string bytes20 = make_bytes(20, 0x11);
        std::string bytes30 = make_bytes(30, 0x22);
        std::string bytes40 = make_bytes(40, 0x33);
        std::vector<StringRef> batch = {StringRef(bytes20.data(), bytes20.size()),
                                        StringRef(bytes30.data(), bytes30.size()),
                                        StringRef(bytes40.data(), bytes40.size())};
        const size_t first_new_row = col->size();
        col->insert_many_strings_overflow(batch.data(), batch.size(), 100);
        EXPECT_EQ(col->size(), first_new_row + 3);

        // Each value must come back unchanged, in insertion order.
        auto got20 = col->get_data_at(first_new_row);
        EXPECT_EQ(got20.size, 20U);
        EXPECT_EQ(std::memcmp(got20.data, bytes20.data(), 20), 0);

        auto got30 = col->get_data_at(first_new_row + 1);
        EXPECT_EQ(got30.size, 30U);
        EXPECT_EQ(std::memcmp(got30.data, bytes30.data(), 30), 0);

        auto got40 = col->get_data_at(first_new_row + 2);
        EXPECT_EQ(got40.size, 40U);
        EXPECT_EQ(std::memcmp(got40.data, bytes40.data(), 40), 0);
    }

    // Case 5: 16-byte UUID-like binary value with max_length 32.
    {
        std::string uuid_bytes = make_bytes(16, 0x55);
        StringRef uuid_key(uuid_bytes.data(), uuid_bytes.size());
        const size_t first_new_row = col->size();
        col->insert_many_strings_overflow(&uuid_key, 1, 32);
        EXPECT_EQ(col->size(), first_new_row + 1);

        auto got = col->get_data_at(first_new_row);
        EXPECT_EQ(got.size, 16U);
        EXPECT_EQ(std::memcmp(got.data, uuid_bytes.data(), 16), 0);
    }

    // Case 6: two empty values with max_length 10.
    {
        std::string blank_a;
        std::string blank_b;
        std::vector<StringRef> batch = {StringRef(blank_a.data(), blank_a.size()),
                                        StringRef(blank_b.data(), blank_b.size())};
        const size_t first_new_row = col->size();
        col->insert_many_strings_overflow(batch.data(), batch.size(), 10);
        EXPECT_EQ(col->size(), first_new_row + 2);

        auto got_a = col->get_data_at(first_new_row);
        EXPECT_EQ(got_a.size, 0U);
        auto got_b = col->get_data_at(first_new_row + 1);
        EXPECT_EQ(got_b.size, 0U);
    }

    // Case 7: value containing zero bytes, max_length 20.
    {
        std::string with_zeros = std::string("abc\0\0\0def", 9);
        StringRef zeros_ref(with_zeros.data(), with_zeros.size());
        const size_t first_new_row = col->size();
        col->insert_many_strings_overflow(&zeros_ref, 1, 20);
        EXPECT_EQ(col->size(), first_new_row + 1);

        auto got = col->get_data_at(first_new_row);
        EXPECT_EQ(got.size, 9U);
        EXPECT_EQ(std::memcmp(got.data, with_zeros.data(), 9), 0);
    }
}
} // namespace doris::vectorized