blob: b51760fbd362f5ca04b7ec9bc6b6d8f72a0b0f09 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "olap/delete_bitmap_calculator.h"
#include <gen_cpp/olap_file.pb.h>
#include <gen_cpp/segment_v2.pb.h>
#include <gtest/gtest-message.h>
#include <gtest/gtest-test-part.h>
#include <algorithm>
#include <memory>
#include <random>
#include <string>
#include <vector>
#include "gtest/gtest_pred_impl.h"
#include "io/fs/file_reader.h"
#include "io/fs/file_writer.h"
#include "io/fs/local_file_system.h"
#include "olap/primary_key_index.h"
#include "olap/row_cursor.h"
#include "olap/rowset/segment_v2/segment.h"
#include "olap/rowset/segment_v2/segment_writer.h"
#include "olap/storage_engine.h"
#include "olap/tablet_meta.h"
#include "olap/tablet_schema.h"
#include "olap/tablet_schema_helper.h"
#include "runtime/exec_env.h"
namespace doris {
using namespace ErrorCode;
static std::string kSegmentDir = "./ut_dir/delete_bitmap_calculator_test";
static RowsetId rowset_id {0};
using Generator = std::function<void(size_t rid, int cid, RowCursorCell& cell)>;
TabletColumnPtr create_int_sequence_value(int32_t id, bool is_nullable = true,
bool is_bf_column = false) {
TabletColumnPtr column = std::make_shared<TabletColumn>();
column->_unique_id = id;
column->_col_name = std::to_string(id);
column->_type = FieldType::OLAP_FIELD_TYPE_INT;
column->_is_key = false;
column->_is_nullable = is_nullable;
column->_length = 4;
column->_index_length = 4;
column->_is_bf_column = is_bf_column;
column->set_name(SEQUENCE_COL);
return column;
}
void build_segment(SegmentWriterOptions opts, TabletSchemaSPtr build_schema, size_t segment_id,
TabletSchemaSPtr query_schema, size_t nrows, Generator generator,
std::shared_ptr<Segment>* res, std::string segment_dir) {
std::string filename = fmt::format("{}_{}.dat", rowset_id.to_string(), segment_id);
std::string path = fmt::format("{}/{}", segment_dir, filename);
auto fs = io::global_local_filesystem();
io::FileWriterPtr file_writer;
Status st = fs->create_file(path, &file_writer);
EXPECT_TRUE(st.ok()) << st.to_string();
SegmentWriter writer(file_writer.get(), segment_id, build_schema, nullptr, nullptr, opts,
nullptr);
st = writer.init();
EXPECT_TRUE(st.ok());
RowCursor row;
auto olap_st = row.init(build_schema);
EXPECT_EQ(Status::OK(), olap_st);
for (size_t rid = 0; rid < nrows; ++rid) {
for (int cid = 0; cid < build_schema->num_columns(); ++cid) {
RowCursorCell cell = row.cell(cid);
generator(rid, cid, cell);
}
EXPECT_TRUE(writer.append_row(row).ok());
}
uint64_t file_size, index_size;
st = writer.finalize(&file_size, &index_size);
EXPECT_TRUE(st.ok());
EXPECT_TRUE(file_writer->close().ok());
EXPECT_NE("", writer.min_encoded_key().to_string());
EXPECT_NE("", writer.max_encoded_key().to_string());
int64_t tablet_id = 100;
st = segment_v2::Segment::open(fs, path, tablet_id, segment_id, rowset_id, query_schema,
io::FileReaderOptions {}, res);
EXPECT_TRUE(st.ok());
EXPECT_EQ(nrows, (*res)->num_rows());
}
// Convenience overload that also returns the on-disk path of the built segment.
void build_segment(SegmentWriterOptions opts, TabletSchemaSPtr build_schema, size_t segment_id,
TabletSchemaSPtr query_schema, size_t nrows, Generator generator,
std::shared_ptr<Segment>* res, std::string segment_dir, std::string* out_path) {
std::string filename = fmt::format("{}_{}.dat", rowset_id.to_string(), segment_id);
std::string path = fmt::format("{}/{}", segment_dir, filename);
if (out_path != nullptr) {
*out_path = path;
}
build_segment(std::move(opts), std::move(build_schema), segment_id, std::move(query_schema),
nrows, std::move(generator), res, std::move(segment_dir));
}
TabletSchemaSPtr create_schema(const std::vector<TabletColumnPtr>& columns,
KeysType keys_type = UNIQUE_KEYS) {
TabletSchemaSPtr res = std::make_shared<TabletSchema>();
for (auto& col : columns) {
res->append_column(*col);
}
res->_keys_type = keys_type;
return res;
}
class DeleteBitmapCalculatorTest : public testing::Test {
public:
void SetUp() override {
auto st = io::global_local_filesystem()->delete_directory(kSegmentDir);
ASSERT_TRUE(st.ok()) << st;
st = io::global_local_filesystem()->create_directory(kSegmentDir);
ASSERT_TRUE(st.ok()) << st;
ExecEnv::GetInstance()->set_storage_engine(
std::make_unique<StorageEngine>(EngineOptions {}));
}
void TearDown() override {
EXPECT_TRUE(io::global_local_filesystem()->delete_directory(kSegmentDir).ok());
}
void run_test(size_t const num_segments, size_t const max_rows_per_segment,
size_t const num_key_columns, bool has_sequence_col,
size_t const num_value_columns, int const random_seed, int const min_value,
int const max_value) {
SegmentWriterOptions opts;
opts.enable_unique_key_merge_on_write = true;
size_t const num_columns = num_key_columns + has_sequence_col + num_value_columns;
size_t const seq_col_idx = has_sequence_col ? num_key_columns : -1;
std::vector<TabletColumnPtr> columns;
for (int i = 0; i < num_key_columns; ++i) {
columns.emplace_back(create_int_key(i));
}
if (has_sequence_col) {
columns.emplace_back(create_int_sequence_value(num_key_columns));
}
for (int i = 0; i < num_value_columns; ++i) {
columns.emplace_back(create_int_value(num_key_columns + has_sequence_col));
}
TabletSchemaSPtr tablet_schema = create_schema(columns, UNIQUE_KEYS);
std::mt19937 rng(random_seed);
std::uniform_int_distribution<int> gen(min_value, max_value);
std::vector<std::shared_ptr<segment_v2::Segment>> segments(num_segments);
std::vector<std::vector<std::vector<int>>> datas(num_segments);
std::map<std::pair<size_t, size_t>, std::vector<int>> data_map;
// each flat_data of data will be a tuple of (column1, column2, ..., segment_id, row_id)
std::vector<std::vector<int>> flat_data;
size_t seq_counter = 0;
// Generate random data, ensuring that there are no identical keys within each segment
// and the keys within each segment are ordered.
// Also, ensure that the sequence values are not equal.
for (size_t sid = 0; sid < num_segments; ++sid) {
auto& segment_data = datas[sid];
for (size_t rid = 0; rid < max_rows_per_segment; ++rid) {
std::vector<int> row;
for (size_t cid = 0; cid < num_columns; ++cid) {
if (cid == seq_col_idx) {
row.emplace_back(++seq_counter);
} else {
row.emplace_back(gen(rng));
}
}
segment_data.emplace_back(row);
}
std::sort(segment_data.begin(), segment_data.end());
segment_data.erase(
std::unique(segment_data.begin(), segment_data.end(),
[&](std::vector<int> const& lhs, std::vector<int> const& rhs) {
return std::vector<int>(lhs.begin(),
lhs.begin() + num_key_columns) ==
std::vector<int>(rhs.begin(),
rhs.begin() + num_key_columns);
}),
segment_data.end());
for (size_t rid = 0; rid < segment_data.size(); ++rid) {
data_map[{sid, rid}] = segment_data[rid];
auto row = segment_data[rid];
row.emplace_back(sid);
row.emplace_back(rid);
flat_data.emplace_back(row);
}
}
// Construct segments using the data generated before.
for (size_t sid = 0; sid < num_segments; ++sid) {
auto& segment = segments[sid];
std::vector<int> row_data;
auto generator = [&](size_t rid, int cid, RowCursorCell& cell) {
cell.set_not_null();
*(int*)cell.mutable_cell_ptr() = data_map[{sid, rid}][cid];
};
build_segment(opts, tablet_schema, sid, tablet_schema, datas[sid].size(), generator,
&segment, kSegmentDir);
}
// find the location of rows to be deleted using `MergeIndexDeleteBitmapCalculator`
// and the result is `result1`
MergeIndexDeleteBitmapCalculator calculator;
size_t seq_col_len = 0;
if (has_sequence_col) {
seq_col_len = tablet_schema->column(tablet_schema->sequence_col_idx()).length() + 1;
}
ASSERT_TRUE(calculator.init(rowset_id, segments, seq_col_len).ok());
DeleteBitmapPtr delete_bitmap = std::make_shared<DeleteBitmap>(0);
ASSERT_TRUE(calculator.calculate_all(delete_bitmap).ok());
std::set<std::pair<size_t, size_t>> result1;
for (auto [bitmap_key, row_ids] : delete_bitmap->delete_bitmap) {
auto segment_id = std::get<1>(bitmap_key);
for (auto row_id : row_ids) {
result1.emplace(segment_id, row_id);
}
}
// find the location of rows to be deleted using naive algorithm
// and the result is `result2`
std::set<std::pair<size_t, size_t>> result2;
std::sort(flat_data.begin(), flat_data.end(),
[&](std::vector<int> const& lhs, std::vector<int> const& rhs) -> bool {
for (size_t cid = 0; cid < num_key_columns; ++cid) {
if (lhs[cid] != rhs[cid]) {
return lhs[cid] < rhs[cid];
}
}
return has_sequence_col ? lhs[seq_col_idx] > rhs[seq_col_idx]
: lhs[num_columns] > rhs[num_columns];
});
for (size_t i = 1; i < flat_data.size(); ++i) {
bool to_delete = true;
for (size_t cid = 0; cid < num_key_columns; ++cid) {
if (flat_data[i][cid] != flat_data[i - 1][cid]) {
to_delete = false;
}
}
if (to_delete) {
result2.emplace(flat_data[i][num_columns], flat_data[i][num_columns + 1]);
}
}
LOG(INFO) << fmt::format("result1.size(): {}, result2.size(): {}", result1.size(),
result2.size());
// if result1 is equal to result2,
// we assume the result of `MergeIndexDeleteBitmapCalculator` is correct.
ASSERT_EQ(result1, result2);
}
};
TEST_F(DeleteBitmapCalculatorTest, no_sequence_column) {
run_test(2, 10, 2, false, 1, 4933, 1, 3);
run_test(4, 100, 2, false, 1, 4933, 1, 15);
run_test(10, 1000, 2, false, 1, 4933, 1, 50);
run_test(10, 8192, 2, false, 1, 4933, 1, 100);
}
TEST_F(DeleteBitmapCalculatorTest, has_sequence_column) {
run_test(2, 10, 2, true, 1, 4933, 1, 3);
run_test(4, 100, 2, true, 1, 4933, 1, 15);
run_test(10, 1000, 2, true, 1, 4933, 1, 50);
run_test(10, 8192, 2, true, 1, 4933, 1, 100);
}
} // namespace doris