| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| #include "vec/common/schema_util.h" |
| |
| #include <gmock/gmock-more-matchers.h> |
| #include <gtest/gtest.h> |
| |
| #include "olap/rowset/beta_rowset.h" |
| #include "olap/rowset/rowset_fwd.h" |
| #include "olap/rowset/segment_v2/variant/variant_column_writer_impl.h" |
| #include "testutil/variant_util.h" |
| #include "vec/columns/column_nothing.h" |
| #include "vec/columns/column_object.h" |
| #include "vec/common/schema_util.h" |
| #include "vec/data_types/data_type_array.h" |
| #include "vec/data_types/data_type_date_time.h" |
| #include "vec/data_types/data_type_decimal.h" |
| #include "vec/data_types/data_type_ipv4.h" |
| #include "vec/data_types/data_type_nothing.h" |
| #include "vec/data_types/data_type_time_v2.h" |
| #include "vec/data_types/data_type_variant.h" |
| |
| using namespace doris::vectorized; |
| |
| using namespace doris::segment_v2; |
| |
| using namespace doris; |
| |
| class SchemaUtilTest : public testing::Test { |
| public: |
| SchemaUtilTest() = default; |
| virtual ~SchemaUtilTest() = default; |
| }; |
| |
| void construct_column(ColumnPB* column_pb, TabletIndexPB* tablet_index, int64_t index_id, |
| const std::string& index_name, int32_t col_unique_id, |
| const std::string& column_type, const std::string& column_name, |
| const IndexType& index_type) { |
| column_pb->set_unique_id(col_unique_id); |
| column_pb->set_name(column_name); |
| column_pb->set_type(column_type); |
| column_pb->set_is_nullable(true); |
| column_pb->set_is_bf_column(true); |
| tablet_index->set_index_id(index_id); |
| tablet_index->set_index_name(index_name); |
| tablet_index->set_index_type(index_type); |
| tablet_index->add_col_unique_id(col_unique_id); |
| } |
| |
| void construct_subcolumn(TabletSchemaSPtr schema, const FieldType& type, int32_t col_unique_id, |
| std::string_view path, std::vector<TabletColumn>* subcolumns) { |
| TabletColumn subcol; |
| subcol.set_type(type); |
| subcol.set_is_nullable(true); |
| subcol.set_unique_id(-1); |
| subcol.set_parent_unique_id(col_unique_id); |
| vectorized::PathInData col_path(path); |
| subcol.set_path_info(col_path); |
| subcol.set_name(col_path.get_path()); |
| |
| if (type == FieldType::OLAP_FIELD_TYPE_ARRAY) { |
| TabletColumn array_item_col; |
| // double not support inverted index |
| array_item_col.set_type(FieldType::OLAP_FIELD_TYPE_DOUBLE); |
| array_item_col.set_is_nullable(true); |
| array_item_col.set_unique_id(-1); |
| array_item_col.set_parent_unique_id(col_unique_id); |
| |
| subcol.add_sub_column(array_item_col); |
| } |
| |
| schema->append_column(subcol); |
| subcolumns->emplace_back(std::move(subcol)); |
| } |
| |
| // void construct_subcolumn(TabletSchemaSPtr schema, const FieldType& type, |
| // int32_t col_unique_id, std::string_view path, |
| // std::vector<TabletColumn>* subcolumns) { |
| // TabletColumn subcol; |
| // subcol.set_type(type); |
| // subcol.set_is_nullable(true); |
| // subcol.set_unique_id(-1); |
| // subcol.set_parent_unique_id(col_unique_id); |
| // vectorized::PathInData col_path(path); |
| // subcol.set_path_info(col_path); |
| // subcol.set_name(col_path.get_path()); |
| // schema->append_column(subcol); |
| // subcolumns->emplace_back(std::move(subcol)); |
| // } |
| |
| TEST_F(SchemaUtilTest, TestInheritIndex) { |
| // 1. Test basic index inheritance for non-extracted column |
| std::vector<const TabletIndex*> parent_indexes; |
| TabletIndexes subcolumns_indexes; |
| |
| // Create parent index |
| TabletIndexPB pb1; |
| pb1.set_index_id(1); |
| pb1.set_index_name("test_index"); |
| pb1.set_index_type(IndexType::INVERTED); |
| |
| TabletIndex parent_index; |
| parent_index.init_from_pb(pb1); |
| parent_indexes.push_back(&parent_index); |
| |
| // Test index inheritance for normal column (non-extracted) |
| TabletColumn normal_column; |
| normal_column.set_type(FieldType::OLAP_FIELD_TYPE_STRING); |
| normal_column.set_name("test_col"); |
| normal_column.set_unique_id(1); |
| |
| bool result = schema_util::inherit_index(parent_indexes, subcolumns_indexes, normal_column); |
| EXPECT_FALSE(result); |
| |
| // 2. Test index inheritance for extracted column |
| TabletColumn extracted_column; |
| extracted_column.set_type(FieldType::OLAP_FIELD_TYPE_STRING); |
| extracted_column.set_name("extracted_col"); |
| extracted_column.set_unique_id(2); |
| extracted_column.set_parent_unique_id(1); // Set parent column id |
| vectorized::PathInData path("parent.path"); |
| extracted_column.set_path_info(path); |
| |
| result = schema_util::inherit_index(parent_indexes, subcolumns_indexes, extracted_column); |
| EXPECT_TRUE(result); |
| EXPECT_EQ(subcolumns_indexes.size(), 1); |
| EXPECT_EQ(subcolumns_indexes[0]->index_id(), 1); |
| EXPECT_EQ(subcolumns_indexes[0]->index_name(), "test_index"); |
| EXPECT_EQ(subcolumns_indexes[0]->index_type(), IndexType::INVERTED); |
| |
| // 3. Test index inheritance for array type with empty subcolumns |
| TabletColumn empty_array_column; |
| empty_array_column.set_type(FieldType::OLAP_FIELD_TYPE_ARRAY); |
| empty_array_column.set_name("empty_array"); |
| vectorized::PathInData pat("parent.a"); |
| empty_array_column.set_path_info(pat); |
| empty_array_column.set_unique_id(3); |
| // No subcolumns added, so get_sub_columns() will be empty |
| |
| result = schema_util::inherit_index(parent_indexes, subcolumns_indexes, empty_array_column); |
| EXPECT_FALSE(result); |
| |
| // 4. Test index inheritance for array type with non-empty subcolumns |
| TabletColumn array_column; |
| array_column.set_type(FieldType::OLAP_FIELD_TYPE_ARRAY); |
| array_column.set_name("array_with_subcolumns"); |
| array_column.set_unique_id(4); |
| array_column.set_parent_unique_id(1); // Set parent column id |
| vectorized::PathInData path1("parent.a"); |
| array_column.set_path_info(path1); |
| |
| // Add subcolumn to array |
| TabletColumn sub_column; |
| sub_column.set_type(FieldType::OLAP_FIELD_TYPE_INT); |
| sub_column.set_name("sub_col"); |
| sub_column.set_unique_id(5); |
| array_column.add_sub_column(sub_column); |
| |
| result = schema_util::inherit_index(parent_indexes, subcolumns_indexes, array_column); |
| EXPECT_TRUE(result); |
| EXPECT_EQ(subcolumns_indexes.size(), 1); |
| EXPECT_EQ(subcolumns_indexes[0]->index_id(), 1); |
| EXPECT_EQ(subcolumns_indexes[0]->index_name(), "test_index"); |
| EXPECT_EQ(subcolumns_indexes[0]->index_type(), IndexType::INVERTED); |
| |
| // 4.1 Add String subcolumn to array |
| TabletColumn array_column1; |
| array_column1.set_type(FieldType::OLAP_FIELD_TYPE_ARRAY); |
| array_column1.set_name("array_with_subcolumns"); |
| array_column1.set_unique_id(4); |
| array_column1.set_parent_unique_id(1); // Set parent column id |
| array_column1.set_path_info(path1); |
| TabletColumn sub_column1; |
| sub_column1.set_type(FieldType::OLAP_FIELD_TYPE_STRING); |
| sub_column1.set_name("sub_col1"); |
| sub_column1.set_unique_id(6); |
| array_column1.add_sub_column(sub_column1); |
| result = schema_util::inherit_index(parent_indexes, subcolumns_indexes, array_column1); |
| EXPECT_TRUE(result); |
| EXPECT_EQ(subcolumns_indexes.size(), 1); |
| EXPECT_EQ(subcolumns_indexes[0]->index_id(), 1); |
| EXPECT_EQ(subcolumns_indexes[0]->index_name(), "test_index"); |
| EXPECT_EQ(subcolumns_indexes[0]->index_type(), IndexType::INVERTED); |
| |
| // 5. Test empty parent index list |
| std::vector<const TabletIndex*> empty_indexes; |
| TabletIndexes empty_subcolumns_indexes; |
| |
| result = schema_util::inherit_index(empty_indexes, empty_subcolumns_indexes, normal_column); |
| EXPECT_FALSE(result); |
| EXPECT_EQ(empty_subcolumns_indexes.size(), 0); |
| |
| // 6. Test binary Type |
| TabletColumn hll_column; |
| hll_column.set_type(FieldType::OLAP_FIELD_TYPE_HLL); |
| hll_column.set_name("hll_col"); |
| hll_column.set_unique_id(7); |
| hll_column.set_parent_unique_id(1); // Set parent column id |
| vectorized::PathInData decimal_path("parent.hll"); |
| hll_column.set_path_info(decimal_path); |
| result = schema_util::inherit_index(parent_indexes, subcolumns_indexes, hll_column); |
| EXPECT_FALSE(result); |
| } |
| |
| TEST_F(SchemaUtilTest, inherit_column_attributes) { |
| TabletSchemaPB schema_pb; |
| schema_pb.set_keys_type(KeysType::DUP_KEYS); |
| schema_pb.set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V2); |
| |
| construct_column(schema_pb.add_column(), schema_pb.add_index(), 10000, "key_index", 0, "INT", |
| "key", IndexType::INVERTED); |
| construct_column(schema_pb.add_column(), schema_pb.add_index(), 10001, "v1_index", 1, "VARIANT", |
| "v1", IndexType::INVERTED); |
| construct_column(schema_pb.add_column(), schema_pb.add_index(), 10003, "v3_index", 3, "VARIANT", |
| "v3", IndexType::INVERTED); |
| |
| TabletSchemaSPtr tablet_schema = std::make_shared<TabletSchema>(); |
| tablet_schema->init_from_pb(schema_pb); |
| std::vector<TabletColumn> subcolumns; |
| |
| construct_subcolumn(tablet_schema, FieldType::OLAP_FIELD_TYPE_STRING, 1, "v1.b", &subcolumns); |
| construct_subcolumn(tablet_schema, FieldType::OLAP_FIELD_TYPE_INT, 1, "v1.c", &subcolumns); |
| |
| construct_subcolumn(tablet_schema, FieldType::OLAP_FIELD_TYPE_ARRAY, 3, "v3.d", &subcolumns); |
| construct_subcolumn(tablet_schema, FieldType::OLAP_FIELD_TYPE_FLOAT, 3, "v3.a", &subcolumns); |
| |
| schema_util::inherit_column_attributes(tablet_schema); |
| for (const auto& col : subcolumns) { |
| switch (col._parent_col_unique_id) { |
| case 1: |
| EXPECT_EQ(tablet_schema->inverted_indexs(col).size(), 1); |
| break; |
| case 3: |
| EXPECT_EQ(tablet_schema->inverted_indexs(col).size(), 1); |
| break; |
| default: |
| EXPECT_TRUE(false); |
| } |
| } |
| EXPECT_EQ(tablet_schema->inverted_indexes().size(), 7); |
| |
| for (const auto& col : tablet_schema->_cols) { |
| if (!col->is_extracted_column()) { |
| continue; |
| } |
| switch (col->_parent_col_unique_id) { |
| case 1: |
| EXPECT_TRUE(col->is_bf_column()); |
| break; |
| case 3: |
| EXPECT_TRUE(!col->is_bf_column()); |
| break; |
| default: |
| EXPECT_TRUE(false); |
| } |
| } |
| } |
| |
| TEST_F(SchemaUtilTest, test_multiple_index_inheritance) { |
| TabletSchemaPB schema_pb; |
| schema_pb.set_keys_type(KeysType::DUP_KEYS); |
| schema_pb.set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V2); |
| |
| construct_column(schema_pb.add_column(), schema_pb.add_index(), 10000, "v1_index_alpha", 1, |
| "VARIANT", "v1", IndexType::INVERTED); |
| construct_column(schema_pb.add_column(), schema_pb.add_index(), 10001, "v1_index_beta", 1, |
| "VARIANT", "v1", IndexType::INVERTED); |
| |
| TabletSchemaSPtr tablet_schema = std::make_shared<TabletSchema>(); |
| tablet_schema->init_from_pb(schema_pb); |
| std::vector<TabletColumn> subcolumns; |
| |
| construct_subcolumn(tablet_schema, FieldType::OLAP_FIELD_TYPE_STRING, 1, "v1.name", |
| &subcolumns); |
| |
| vectorized::schema_util::inherit_column_attributes(tablet_schema); |
| |
| const auto& subcol = subcolumns[0]; |
| auto inherited_indexes = tablet_schema->inverted_indexs(subcol); |
| |
| EXPECT_EQ(inherited_indexes.size(), 2); |
| EXPECT_EQ(inherited_indexes[0]->index_name(), "v1_index_alpha"); |
| EXPECT_EQ(inherited_indexes[1]->index_name(), "v1_index_beta"); |
| |
| for (const auto& index : inherited_indexes) { |
| EXPECT_EQ(index->get_index_suffix(), "v1%2Ename"); |
| } |
| } |
| |
| TEST_F(SchemaUtilTest, test_index_update_logic) { |
| TabletSchemaPB schema_pb; |
| schema_pb.set_keys_type(KeysType::DUP_KEYS); |
| schema_pb.set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V2); |
| |
| construct_column(schema_pb.add_column(), schema_pb.add_index(), 10000, "v1_index_orig1", 1, |
| "VARIANT", "v1", IndexType::INVERTED); |
| construct_column(schema_pb.add_column(), schema_pb.add_index(), 10001, "v1_index_orig2", 1, |
| "VARIANT", "v1", IndexType::INVERTED); |
| |
| TabletSchemaSPtr tablet_schema = std::make_shared<TabletSchema>(); |
| tablet_schema->init_from_pb(schema_pb); |
| std::vector<TabletColumn> subcolumns; |
| |
| construct_subcolumn(tablet_schema, FieldType::OLAP_FIELD_TYPE_STRING, 1, "v1.name", |
| &subcolumns); |
| vectorized::schema_util::inherit_column_attributes(tablet_schema); |
| |
| const auto& subcol = subcolumns[0]; |
| auto initial_indexes = tablet_schema->inverted_indexs(subcol); |
| ASSERT_EQ(initial_indexes.size(), 2); |
| EXPECT_EQ(initial_indexes[0]->index_name(), "v1_index_orig1"); |
| EXPECT_EQ(initial_indexes[1]->index_name(), "v1_index_orig2"); |
| |
| std::vector<TabletIndex> updated_indexes; |
| TabletIndexPB tablet_index_pb1; |
| tablet_index_pb1.set_index_id(10002); |
| tablet_index_pb1.set_index_name("v1_index_updated1"); |
| tablet_index_pb1.set_index_type(IndexType::INVERTED); |
| tablet_index_pb1.add_col_unique_id(1); |
| TabletIndex tablet_index1; |
| tablet_index1.init_from_pb(tablet_index_pb1); |
| updated_indexes.emplace_back(std::move(tablet_index1)); |
| |
| TabletIndexPB tablet_index_pb2; |
| tablet_index_pb2.set_index_id(10003); |
| tablet_index_pb2.set_index_name("v1_index_updated2"); |
| tablet_index_pb2.set_index_type(IndexType::INVERTED); |
| tablet_index_pb2.add_col_unique_id(1); |
| TabletIndex tablet_index2; |
| tablet_index2.init_from_pb(tablet_index_pb2); |
| updated_indexes.emplace_back(std::move(tablet_index2)); |
| |
| tablet_schema->update_index(tablet_schema->column(1), IndexType::INVERTED, |
| std::move(updated_indexes)); |
| |
| vectorized::schema_util::inherit_column_attributes(tablet_schema); |
| auto updated_subcol_indexes = tablet_schema->inverted_indexs(subcol); |
| |
| EXPECT_EQ(updated_subcol_indexes.size(), 2); |
| EXPECT_EQ(updated_subcol_indexes[0]->index_name(), "v1_index_updated1"); |
| EXPECT_EQ(updated_subcol_indexes[1]->index_name(), "v1_index_updated2"); |
| EXPECT_EQ(updated_subcol_indexes[0]->get_index_suffix(), "v1%2Ename"); |
| } |
| |
| static std::unordered_map<std::string, int> construct_column_map_with_random_values( |
| auto& column_map, int key_size, int value_size, const std::string& prefix) { |
| std::unordered_map<std::string, int> key_value_counts; |
| auto& key = assert_cast<ColumnString&>(column_map->get_keys()); |
| auto& value = assert_cast<ColumnString&>(column_map->get_values()); |
| auto& offsets = column_map->get_offsets(); |
| |
| std::srand(42); |
| |
| for (int i = 0; i < key_size; ++i) { |
| std::string current_key = prefix + std::to_string(i); |
| |
| int value_count = std::rand() % value_size + 1; |
| key_value_counts[current_key] = value_count; |
| |
| for (int j = 0; j < value_count; ++j) { |
| key.insert_data(current_key.data(), current_key.size()); |
| auto value_str = prefix + std::to_string(j); |
| value.insert_data(value_str.data(), value_str.size()); |
| } |
| offsets.push_back(key.size()); |
| } |
| |
| return key_value_counts; |
| } |
| |
| TEST_F(SchemaUtilTest, calculate_variant_stats) { |
| VariantStatisticsPB stats; |
| auto column_map = ColumnMap::create(ColumnString::create(), ColumnString::create(), |
| ColumnArray::ColumnOffsets::create()); |
| |
| const auto& key_value_counts = |
| construct_column_map_with_random_values(column_map, 200, 100, "key_"); |
| |
| // calculate stats |
| schema_util::calculate_variant_stats(*column_map, &stats, 0, 200); |
| EXPECT_EQ(stats.sparse_column_non_null_size_size(), key_value_counts.size()); |
| |
| for (const auto& kv : key_value_counts) { |
| auto it = stats.sparse_column_non_null_size().find(kv.first); |
| EXPECT_NE(it, stats.sparse_column_non_null_size().end()); |
| EXPECT_EQ(it->second, kv.second); |
| } |
| |
| // test with different key size |
| column_map->clear(); |
| const auto& key_value_counts2 = |
| construct_column_map_with_random_values(column_map, 3000, 100, "key_"); |
| schema_util::calculate_variant_stats(*column_map, &stats, 0, 3000); |
| EXPECT_EQ(stats.sparse_column_non_null_size_size(), 3000); |
| |
| for (const auto& [path, size] : stats.sparse_column_non_null_size()) { |
| auto first_size = key_value_counts.find(path) == key_value_counts.end() |
| ? 0 |
| : key_value_counts.find(path)->second; |
| auto second_size = key_value_counts2.find(path) == key_value_counts2.end() |
| ? 0 |
| : key_value_counts2.find(path)->second; |
| EXPECT_EQ(size, first_size + second_size); |
| } |
| |
| // test with max size |
| column_map->clear(); |
| const auto& key_value_counts3 = construct_column_map_with_random_values( |
| column_map, config::variant_max_sparse_column_statistics_size, 5, "key2_"); |
| schema_util::calculate_variant_stats(*column_map, &stats, 0, |
| config::variant_max_sparse_column_statistics_size); |
| EXPECT_EQ(config::variant_max_sparse_column_statistics_size, |
| stats.sparse_column_non_null_size_size()); |
| |
| for (const auto& [path, size] : stats.sparse_column_non_null_size()) { |
| auto first_size = key_value_counts.find(path) == key_value_counts.end() |
| ? 0 |
| : key_value_counts.find(path)->second; |
| auto second_size = key_value_counts2.find(path) == key_value_counts2.end() |
| ? 0 |
| : key_value_counts2.find(path)->second; |
| auto third_size = key_value_counts3.find(path) == key_value_counts3.end() |
| ? 0 |
| : key_value_counts3.find(path)->second; |
| EXPECT_EQ(size, first_size + second_size + third_size); |
| } |
| } |
| |
| TEST_F(SchemaUtilTest, get_subpaths) { |
| TabletSchema schema; |
| TabletColumn variant; |
| variant.set_unique_id(1); |
| variant.set_variant_max_subcolumns_count(3); |
| schema.append_column(variant); |
| std::unordered_map<int32_t, schema_util::PathToNoneNullValues> path_stats; |
| path_stats[1] = { |
| {"path1", 1000}, {"path2", 800}, {"path3", 500}, {"path4", 300}, {"path5", 200}}; |
| |
| // get subpaths |
| std::unordered_map<int32_t, TabletSchema::PathsSetInfo> uid_to_paths_set_info; |
| schema_util::get_subpaths(3, path_stats[1], uid_to_paths_set_info[1]); |
| |
| EXPECT_EQ(uid_to_paths_set_info[1].sub_path_set.size(), 3); |
| EXPECT_EQ(uid_to_paths_set_info[1].sparse_path_set.size(), 2); |
| |
| EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path1") != |
| uid_to_paths_set_info[1].sub_path_set.end()); |
| EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path2") != |
| uid_to_paths_set_info[1].sub_path_set.end()); |
| EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path3") != |
| uid_to_paths_set_info[1].sub_path_set.end()); |
| |
| EXPECT_TRUE(uid_to_paths_set_info[1].sparse_path_set.find("path4") != |
| uid_to_paths_set_info[1].sparse_path_set.end()); |
| EXPECT_TRUE(uid_to_paths_set_info[1].sparse_path_set.find("path5") != |
| uid_to_paths_set_info[1].sparse_path_set.end()); |
| } |
| |
| TEST_F(SchemaUtilTest, get_subpaths_equal_to_max) { |
| TabletSchema schema; |
| TabletColumn variant; |
| variant.set_unique_id(1); |
| variant.set_variant_max_subcolumns_count(3); |
| schema.append_column(variant); |
| |
| std::unordered_map<int32_t, schema_util::PathToNoneNullValues> path_stats; |
| path_stats[1] = {{"path1", 1000}, {"path2", 800}, {"path3", 500}}; |
| |
| std::unordered_map<int32_t, TabletSchema::PathsSetInfo> uid_to_paths_set_info; |
| schema_util::get_subpaths(3, path_stats[1], uid_to_paths_set_info[1]); |
| |
| EXPECT_EQ(uid_to_paths_set_info[1].sub_path_set.size(), 3); |
| EXPECT_EQ(uid_to_paths_set_info[1].sparse_path_set.size(), 0); |
| |
| EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path1") != |
| uid_to_paths_set_info[1].sub_path_set.end()); |
| EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path2") != |
| uid_to_paths_set_info[1].sub_path_set.end()); |
| EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path3") != |
| uid_to_paths_set_info[1].sub_path_set.end()); |
| } |
| |
| TEST_F(SchemaUtilTest, get_subpaths_multiple_variants) { |
| TabletSchema schema; |
| TabletColumn variant1; |
| |
| variant1.set_unique_id(1); |
| variant1.set_variant_max_subcolumns_count(3); |
| schema.append_column(variant1); |
| |
| TabletColumn variant2; |
| variant2.set_unique_id(2); |
| variant2.set_variant_max_subcolumns_count(2); |
| schema.append_column(variant2); |
| |
| TabletColumn variant3; |
| variant3.set_unique_id(3); |
| variant3.set_variant_max_subcolumns_count(4); |
| schema.append_column(variant3); |
| |
| std::unordered_map<int32_t, schema_util::PathToNoneNullValues> path_stats; |
| path_stats[1] = { |
| {"path1", 1000}, {"path2", 800}, {"path3", 500}, {"path4", 300}, {"path5", 200}}; |
| path_stats[2] = {{"path1", 1000}, {"path2", 800}}; |
| path_stats[3] = {{"path1", 1000}, {"path2", 800}, {"path3", 500}, {"path4", 300}}; |
| path_stats[4] = { |
| {"path1", 1000}, {"path2", 800}, {"path3", 500}, {"path4", 300}, {"path5", 200}}; |
| |
| std::unordered_map<int32_t, TabletSchema::PathsSetInfo> uid_to_paths_set_info; |
| schema_util::get_subpaths(3, path_stats[1], uid_to_paths_set_info[1]); |
| schema_util::get_subpaths(2, path_stats[2], uid_to_paths_set_info[2]); |
| schema_util::get_subpaths(4, path_stats[3], uid_to_paths_set_info[3]); |
| |
| EXPECT_EQ(uid_to_paths_set_info[1].sub_path_set.size(), 3); |
| EXPECT_EQ(uid_to_paths_set_info[1].sparse_path_set.size(), 2); |
| |
| EXPECT_EQ(uid_to_paths_set_info[2].sub_path_set.size(), 2); |
| EXPECT_EQ(uid_to_paths_set_info[2].sparse_path_set.size(), 0); |
| |
| EXPECT_EQ(uid_to_paths_set_info[3].sub_path_set.size(), 4); |
| EXPECT_EQ(uid_to_paths_set_info[3].sparse_path_set.size(), 0); |
| |
| EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path1") != |
| uid_to_paths_set_info[1].sub_path_set.end()); |
| EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path2") != |
| uid_to_paths_set_info[1].sub_path_set.end()); |
| EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path3") != |
| uid_to_paths_set_info[1].sub_path_set.end()); |
| |
| EXPECT_TRUE(uid_to_paths_set_info[1].sparse_path_set.find("path4") != |
| uid_to_paths_set_info[1].sparse_path_set.end()); |
| EXPECT_TRUE(uid_to_paths_set_info[1].sparse_path_set.find("path5") != |
| uid_to_paths_set_info[1].sparse_path_set.end()); |
| |
| EXPECT_TRUE(uid_to_paths_set_info[2].sub_path_set.find("path1") != |
| uid_to_paths_set_info[2].sub_path_set.end()); |
| EXPECT_TRUE(uid_to_paths_set_info[2].sub_path_set.find("path2") != |
| uid_to_paths_set_info[2].sub_path_set.end()); |
| |
| EXPECT_TRUE(uid_to_paths_set_info[3].sub_path_set.find("path1") != |
| uid_to_paths_set_info[3].sub_path_set.end()); |
| EXPECT_TRUE(uid_to_paths_set_info[3].sub_path_set.find("path2") != |
| uid_to_paths_set_info[3].sub_path_set.end()); |
| EXPECT_TRUE(uid_to_paths_set_info[3].sub_path_set.find("path3") != |
| uid_to_paths_set_info[3].sub_path_set.end()); |
| EXPECT_TRUE(uid_to_paths_set_info[3].sub_path_set.find("path4") != |
| uid_to_paths_set_info[3].sub_path_set.end()); |
| } |
| |
| TEST_F(SchemaUtilTest, get_subpaths_no_path_stats) { |
| TabletSchema schema; |
| TabletColumn variant; |
| variant.set_unique_id(1); |
| variant.set_variant_max_subcolumns_count(3); |
| schema.append_column(variant); |
| |
| std::unordered_map<int32_t, schema_util::PathToNoneNullValues> path_stats; |
| path_stats[2] = {{"path1", 1000}, {"path2", 800}}; |
| |
| std::unordered_map<int32_t, TabletSchema::PathsSetInfo> uid_to_paths_set_info; |
| schema_util::get_subpaths(3, path_stats[2], uid_to_paths_set_info[2]); |
| |
| EXPECT_EQ(uid_to_paths_set_info[1].sub_path_set.size(), 0); |
| EXPECT_EQ(uid_to_paths_set_info[1].sparse_path_set.size(), 0); |
| } |
| |
| TEST_F(SchemaUtilTest, generate_sub_column_info_based) { |
| TabletColumn variant; |
| variant.set_unique_id(10); |
| variant.set_variant_max_subcolumns_count(3); |
| |
| TabletColumn subcolumn; |
| subcolumn.set_name("profile.id.*"); |
| subcolumn.set_type(FieldType::OLAP_FIELD_TYPE_INT); |
| variant.add_sub_column(subcolumn); |
| |
| TabletColumn subcolumn2; |
| subcolumn2.set_name("profile.name.?"); |
| subcolumn2.set_type(FieldType::OLAP_FIELD_TYPE_STRING); |
| variant.add_sub_column(subcolumn2); |
| |
| TabletColumn subcolumn3; |
| subcolumn3.set_name("id[0-9]"); |
| subcolumn3.set_type(FieldType::OLAP_FIELD_TYPE_INT); |
| variant.add_sub_column(subcolumn3); |
| |
| TabletColumn subcolumn4; |
| subcolumn4.set_name("id[0-9].*"); |
| subcolumn4.set_type(FieldType::OLAP_FIELD_TYPE_INT); |
| variant.add_sub_column(subcolumn4); |
| |
| TabletSchema schema; |
| schema.append_column(variant); |
| |
| TabletSchema::SubColumnInfo sub_column_info; |
| bool match = |
| schema_util::generate_sub_column_info(schema, 10, "profile.id.name", &sub_column_info); |
| EXPECT_TRUE(match); |
| EXPECT_EQ(sub_column_info.column.parent_unique_id(), 10); |
| |
| match = schema_util::generate_sub_column_info(schema, 10, "profile.name.x", &sub_column_info); |
| EXPECT_TRUE(match); |
| EXPECT_EQ(sub_column_info.column.parent_unique_id(), 10); |
| |
| match = schema_util::generate_sub_column_info(schema, 10, "profile.name.xx", &sub_column_info); |
| EXPECT_FALSE(match); |
| |
| match = schema_util::generate_sub_column_info(schema, 10, "id5", &sub_column_info); |
| EXPECT_TRUE(match); |
| |
| match = schema_util::generate_sub_column_info(schema, 10, "id5.profile.name", &sub_column_info); |
| EXPECT_TRUE(match); |
| } |
| |
| TEST_F(SchemaUtilTest, generate_sub_column_info_advanced) { |
| TabletColumn variant; |
| variant.set_unique_id(10); |
| variant.set_variant_max_subcolumns_count(3); |
| |
| TabletColumn subcolumn; |
| subcolumn.set_name("profile?id"); |
| subcolumn.set_type(FieldType::OLAP_FIELD_TYPE_ARRAY); |
| TabletColumn subcolumn_item; |
| subcolumn_item.set_type(FieldType::OLAP_FIELD_TYPE_INT); |
| subcolumn.add_sub_column(subcolumn_item); |
| variant.add_sub_column(subcolumn); |
| |
| TabletColumn subcolumn2; |
| subcolumn2.set_name("profile?id.*"); |
| subcolumn2.set_type(FieldType::OLAP_FIELD_TYPE_ARRAY); |
| TabletColumn subcolumn2_item; |
| subcolumn2_item.set_type(FieldType::OLAP_FIELD_TYPE_STRING); |
| subcolumn2.add_sub_column(subcolumn2_item); |
| variant.add_sub_column(subcolumn2); |
| |
| TabletColumn subcolumn3; |
| subcolumn3.set_name("profile.id[0-9]"); |
| subcolumn3.set_type(FieldType::OLAP_FIELD_TYPE_DECIMAL64); |
| variant.add_sub_column(subcolumn3); |
| |
| TabletSchema schema; |
| schema.append_column(variant); |
| |
| TabletIndex index; |
| index._properties["field_pattern"] = "profile?id.*"; |
| index._col_unique_ids = {10}; |
| schema.append_index(std::move(index)); |
| |
| TabletIndex index2; |
| index2._properties["field_pattern"] = "profile.id[0-9]"; |
| index2._col_unique_ids = {10}; |
| schema.append_index(std::move(index2)); |
| |
| TabletSchema::SubColumnInfo sub_column_info; |
| bool match = |
| schema_util::generate_sub_column_info(schema, 10, "profile.id.name", &sub_column_info); |
| EXPECT_TRUE(match); |
| EXPECT_EQ(sub_column_info.column.parent_unique_id(), 10); |
| EXPECT_FALSE(sub_column_info.indexes.empty()); |
| |
| match = schema_util::generate_sub_column_info(schema, 10, "profile.id2", &sub_column_info); |
| EXPECT_TRUE(match); |
| EXPECT_EQ(sub_column_info.column.parent_unique_id(), 10); |
| EXPECT_FALSE(sub_column_info.indexes.empty()); |
| |
| match = schema_util::generate_sub_column_info(schema, 10, "profilexid", &sub_column_info); |
| EXPECT_TRUE(match); |
| EXPECT_EQ(sub_column_info.column.parent_unique_id(), 10); |
| EXPECT_TRUE(sub_column_info.indexes.empty()); |
| } |
| |
| TEST_F(SchemaUtilTest, TestArrayDimensions) { |
| // Test get_number_of_dimensions for DataType |
| auto array_type = std::make_shared<DataTypeArray>(std::make_shared<DataTypeInt32>()); |
| auto nested_array_type = std::make_shared<DataTypeArray>(array_type); |
| |
| EXPECT_EQ(schema_util::get_number_of_dimensions(*array_type), 1); |
| EXPECT_EQ(schema_util::get_number_of_dimensions(*nested_array_type), 2); |
| EXPECT_EQ(schema_util::get_number_of_dimensions(*std::make_shared<DataTypeInt32>()), 0); |
| |
| // Test get_number_of_dimensions for Column |
| auto array_column = |
| ColumnArray::create(ColumnInt32::create(), ColumnArray::ColumnOffsets::create()); |
| auto nested_array_column = |
| ColumnArray::create(array_column->get_ptr(), ColumnArray::ColumnOffsets::create()); |
| |
| EXPECT_EQ(schema_util::get_number_of_dimensions(*array_column), 1); |
| EXPECT_EQ(schema_util::get_number_of_dimensions(*nested_array_column), 2); |
| EXPECT_EQ(schema_util::get_number_of_dimensions(*ColumnInt32::create()), 0); |
| |
| // Test get_base_type_of_array |
| auto base_type = schema_util::get_base_type_of_array(array_type); |
| EXPECT_EQ(base_type->get_type_id(), TypeIndex::Int32); |
| |
| base_type = schema_util::get_base_type_of_array(nested_array_type); |
| EXPECT_EQ(base_type->get_type_id(), TypeIndex::Int32); |
| |
| // Test create_empty_array_field |
| auto array_field = schema_util::create_empty_array_field(2); |
| EXPECT_EQ(array_field.size(), 1); |
| EXPECT_TRUE(array_field[0].get<Array>().empty()); |
| } |
| |
| TEST_F(SchemaUtilTest, TestIntegerConversion) { |
| // Test conversion between integers |
| EXPECT_FALSE(schema_util::is_conversion_required_between_integers(TypeIndex::Int8, |
| TypeIndex::Int16)); |
| EXPECT_FALSE(schema_util::is_conversion_required_between_integers(TypeIndex::Int8, |
| TypeIndex::Int32)); |
| EXPECT_FALSE(schema_util::is_conversion_required_between_integers(TypeIndex::Int16, |
| TypeIndex::Int32)); |
| |
| EXPECT_TRUE(schema_util::is_conversion_required_between_integers(TypeIndex::Int32, |
| TypeIndex::Int16)); |
| EXPECT_TRUE(schema_util::is_conversion_required_between_integers(TypeIndex::Int64, |
| TypeIndex::Int32)); |
| |
| EXPECT_FALSE(schema_util::is_conversion_required_between_integers(TypeIndex::UInt8, |
| TypeIndex::UInt16)); |
| EXPECT_TRUE(schema_util::is_conversion_required_between_integers(TypeIndex::UInt32, |
| TypeIndex::UInt16)); |
| } |
| |
| TEST_F(SchemaUtilTest, TestColumnCasting) { |
| // Test cast_column |
| auto src_type = std::make_shared<DataTypeInt32>(); |
| auto dst_type = std::make_shared<DataTypeInt64>(); |
| |
| auto column = ColumnInt32::create(); |
| column->insert(42); |
| |
| ColumnWithTypeAndName src_col; |
| src_col.type = src_type; |
| src_col.column = column->get_ptr(); |
| src_col.name = "test_col"; |
| |
| ColumnPtr result; |
| auto status = schema_util::cast_column(src_col, dst_type, &result); |
| |
| EXPECT_TRUE(status.ok()); |
| EXPECT_EQ(result->get_int(0), 42); |
| EXPECT_EQ(result->get_name(), TypeName<Int64>::get()); |
| } |
| |
| TEST_F(SchemaUtilTest, TestGetColumnByType) { |
| // Test get_column_by_type |
| auto int_type = std::make_shared<DataTypeInt32>(); |
| auto string_type = std::make_shared<DataTypeString>(); |
| auto array_type = std::make_shared<DataTypeArray>(std::make_shared<DataTypeInt32>()); |
| auto nullable_type = make_nullable(int_type); |
| |
| schema_util::ExtraInfo ext_info; |
| ext_info.unique_id = 1; |
| ext_info.parent_unique_id = 2; |
| ext_info.path_info = PathInData("test.path"); |
| |
| // Test integer type |
| auto int_column = schema_util::get_column_by_type(int_type, "int_col", ext_info); |
| EXPECT_EQ(int_column.name(), "int_col"); |
| EXPECT_EQ(int_column.type(), FieldType::OLAP_FIELD_TYPE_INT); |
| EXPECT_EQ(int_column.unique_id(), 1); |
| EXPECT_EQ(int_column.parent_unique_id(), 2); |
| EXPECT_EQ(int_column.path_info_ptr()->get_path(), "test.path"); |
| |
| // Test string type |
| auto string_column = schema_util::get_column_by_type(string_type, "string_col", ext_info); |
| EXPECT_EQ(string_column.type(), FieldType::OLAP_FIELD_TYPE_STRING); |
| EXPECT_EQ(string_column.length(), INT_MAX); |
| |
| // Test array type |
| auto array_column = schema_util::get_column_by_type(array_type, "array_col", ext_info); |
| EXPECT_EQ(array_column.type(), FieldType::OLAP_FIELD_TYPE_ARRAY); |
| EXPECT_EQ(array_column.get_sub_column(0).type(), FieldType::OLAP_FIELD_TYPE_INT); |
| |
| // Test nullable type |
| auto nullable_column = schema_util::get_column_by_type(nullable_type, "nullable_col", ext_info); |
| EXPECT_TRUE(nullable_column.is_nullable()); |
| EXPECT_EQ(nullable_column.type(), FieldType::OLAP_FIELD_TYPE_INT); |
| } |
| |
| //TEST_F(SchemaUtilTest, TestGetSortedSubcolumns) { |
| // // Create test subcolumns |
| // vectorized::ColumnObject::Subcolumns subcolumns; |
| // |
| // auto create_subcolumn = [](const std::string& path) { |
| // auto subcol = std::make_shared<vectorized::ColumnObject::Subcolumn>(); |
| // subcol->path = path; |
| // return subcol; |
| // }; |
| // |
| // subcolumns.push_back(create_subcolumn("c")); |
| // subcolumns.push_back(create_subcolumn("a")); |
| // subcolumns.push_back(create_subcolumn("b")); |
| // |
| // auto sorted = schema_util::get_sorted_subcolumns(subcolumns); |
| // |
| // EXPECT_EQ(sorted.size(), 3); |
| // EXPECT_EQ(sorted[0]->path, "a"); |
| // EXPECT_EQ(sorted[1]->path, "b"); |
| // EXPECT_EQ(sorted[2]->path, "c"); |
| //} |
| |
| TEST_F(SchemaUtilTest, TestHasSchemaIndexDiff) { |
| TabletSchemaPB schema1_pb; |
| TabletSchemaPB schema2_pb; |
| |
| // Setup first schema |
| construct_column(schema1_pb.add_column(), schema1_pb.add_index(), 10000, "test_index", 1, "INT", |
| "test_col", IndexType::INVERTED); |
| auto* col1 = schema1_pb.mutable_column(0); |
| col1->set_is_bf_column(false); |
| |
| // Setup second schema with different index |
| construct_column(schema2_pb.add_column(), schema2_pb.add_index(), 10000, "test_index", 1, "INT", |
| "test_col", IndexType::BLOOMFILTER); |
| auto* col2 = schema2_pb.mutable_column(0); |
| col2->set_is_bf_column(true); |
| |
| TabletSchemaSPtr schema1 = std::make_shared<TabletSchema>(); |
| TabletSchemaSPtr schema2 = std::make_shared<TabletSchema>(); |
| schema1->init_from_pb(schema1_pb); |
| schema2->init_from_pb(schema2_pb); |
| |
| EXPECT_TRUE(schema_util::has_schema_index_diff(schema1.get(), schema2.get(), 0, 0)); |
| } |
| |
| TEST_F(SchemaUtilTest, TestParseVariantColumns) { |
| // Create a block with variant column |
| Block block; |
| |
| // Create a variant column with JSON string data |
| auto variant_type = std::make_shared<DataTypeObject>(10); |
| auto variant_column = ColumnObject::create(10); |
| auto root_column = ColumnString::create(); |
| root_column->insert("{'a': 1, 'b': 'test'}"); |
| variant_column->create_root(std::make_shared<DataTypeString>(), root_column->get_ptr()); |
| |
| block.insert({variant_column->get_ptr(), variant_type, "variant_col"}); |
| |
| std::vector<int> variant_pos {0}; |
| ParseConfig config; |
| |
| auto status = schema_util::parse_variant_columns(block, variant_pos, config); |
| EXPECT_TRUE(status.ok()); |
| |
| // Check the parsed variant column |
| const auto& result_column = block.get_by_position(0).column; |
| std::cout << "Result column name: " << result_column->get_name() << std::endl; |
| EXPECT_TRUE(result_column->get_name().find("variant") == std::string::npos); |
| |
| const auto& obj_column = assert_cast<const ColumnObject&>(*result_column); |
| EXPECT_TRUE(obj_column.is_scalar_variant()); |
| } |
| |
| TEST_F(SchemaUtilTest, TestGetLeastCommonSchema) { |
| // Create test schemas |
| TabletSchemaPB schema1_pb; |
| schema1_pb.set_keys_type(KeysType::DUP_KEYS); |
| construct_column(schema1_pb.add_column(), schema1_pb.add_index(), 10000, "v1_index", 1, |
| "VARIANT", "v1", IndexType::INVERTED); |
| |
| TabletSchemaPB schema2_pb; |
| schema2_pb.set_keys_type(KeysType::DUP_KEYS); |
| construct_column(schema2_pb.add_column(), schema2_pb.add_index(), 10000, "v1_index", 1, |
| "VARIANT", "v1", IndexType::INVERTED); |
| |
| TabletSchemaSPtr schema1 = std::make_shared<TabletSchema>(); |
| TabletSchemaSPtr schema2 = std::make_shared<TabletSchema>(); |
| schema1->init_from_pb(schema1_pb); |
| schema2->init_from_pb(schema2_pb); |
| |
| std::vector<TabletSchemaSPtr> schemas {schema1, schema2}; |
| TabletSchemaSPtr result_schema; |
| |
| auto status = schema_util::get_least_common_schema(schemas, nullptr, result_schema); |
| EXPECT_TRUE(status.ok()); |
| EXPECT_EQ(result_schema->num_columns(), 1); |
| } |
| |
| TEST_F(SchemaUtilTest, TestGetSizeOfInteger) { |
| // Test all integer types |
| EXPECT_EQ(schema_util::get_size_of_interger(TypeIndex::Int8), sizeof(int8_t)); |
| EXPECT_EQ(schema_util::get_size_of_interger(TypeIndex::Int16), sizeof(int16_t)); |
| EXPECT_EQ(schema_util::get_size_of_interger(TypeIndex::Int32), sizeof(int32_t)); |
| EXPECT_EQ(schema_util::get_size_of_interger(TypeIndex::Int64), sizeof(int64_t)); |
| EXPECT_EQ(schema_util::get_size_of_interger(TypeIndex::Int128), sizeof(int128_t)); |
| |
| EXPECT_EQ(schema_util::get_size_of_interger(TypeIndex::UInt8), sizeof(uint8_t)); |
| EXPECT_EQ(schema_util::get_size_of_interger(TypeIndex::UInt16), sizeof(uint16_t)); |
| EXPECT_EQ(schema_util::get_size_of_interger(TypeIndex::UInt32), sizeof(uint32_t)); |
| EXPECT_EQ(schema_util::get_size_of_interger(TypeIndex::UInt64), sizeof(uint64_t)); |
| EXPECT_EQ(schema_util::get_size_of_interger(TypeIndex::UInt128), sizeof(uint128_t)); |
| |
| // Test invalid type |
| // EXPECT_THROW(schema_util::get_size_of_interger(TypeIndex::String), Exception); |
| } |
| |
| TEST_F(SchemaUtilTest, TestCastColumnEdgeCases) { |
| // Test casting from Nothing type |
| auto nothing_type = std::make_shared<DataTypeNothing>(); |
| auto dst_type = std::make_shared<DataTypeInt32>(); |
| |
| auto nothing_column = ColumnNothing::create(1); |
| ColumnWithTypeAndName src_col; |
| src_col.type = nothing_type; |
| src_col.column = nothing_column->get_ptr(); |
| src_col.name = "nothing_col"; |
| |
| ColumnPtr result; |
| auto status = schema_util::cast_column(src_col, dst_type, &result); |
| EXPECT_TRUE(status.ok()); |
| EXPECT_EQ(result->size(), 1); |
| |
| // Test casting to variant type |
| auto variant_type = std::make_shared<DataTypeObject>(10); |
| auto nullable_array_type = |
| make_nullable(std::make_shared<DataTypeArray>(std::make_shared<DataTypeInt32>())); |
| auto array_column = |
| ColumnArray::create(ColumnInt32::create(), ColumnArray::ColumnOffsets::create()); |
| auto nullable_array_column = make_nullable(array_column->get_ptr()); |
| |
| ColumnWithTypeAndName array_col; |
| array_col.type = nullable_array_type; |
| array_col.column = nullable_array_column; |
| array_col.name = "array_col"; |
| |
| // test Array Type cast Int will throw Exception |
| auto int_type = std::make_shared<DataTypeInt32>(); |
| Status st = schema_util::cast_column(array_col, int_type, &result); |
| EXPECT_FALSE(st.ok()); |
| |
| ColumnPtr result1; |
| status = schema_util::cast_column(array_col, variant_type, &result1); |
| EXPECT_TRUE(status.ok()); |
| EXPECT_FALSE(result1->is_nullable()); |
| |
| auto variant_type_nullable = make_nullable(variant_type); |
| status = schema_util::cast_column(array_col, variant_type_nullable, &result1); |
| EXPECT_TRUE(status.ok()); |
| EXPECT_TRUE(result1->is_nullable()); |
| |
| // Test casting from variant to variant |
| auto variant_column = ColumnObject::create(10); |
| variant_column->create_root(nullable_array_type, nullable_array_column->assume_mutable()); |
| |
| ColumnWithTypeAndName variant_col; |
| variant_col.type = variant_type; |
| variant_col.column = variant_column->get_ptr(); |
| variant_col.name = "variant_col"; |
| |
| ColumnPtr result2; |
| status = schema_util::cast_column(variant_col, variant_type, &result2); |
| EXPECT_TRUE(status.ok()); |
| EXPECT_FALSE(result2->is_nullable()); |
| } |
| |
| TEST_F(SchemaUtilTest, TestCastColumnWithExecuteFailure) { |
| // Create a complex type to simple type conversion scenario, this conversion usually fails |
| auto complex_type = std::make_shared<DataTypeArray>(std::make_shared<DataTypeIPv4>()); |
| auto simple_type = std::make_shared<DataTypeJsonb>(); |
| |
| // Insert some test dataset |
| auto nested_array = |
| ColumnArray::create(ColumnIPv4::create(), ColumnArray::ColumnOffsets::create()); |
| nested_array->insert(Array(IPv4(1))); |
| nested_array->insert(Array(IPv4(2))); |
| |
| ColumnWithTypeAndName src_col; |
| src_col.type = complex_type; |
| src_col.column = nested_array->get_ptr(); |
| src_col.name = "array_col"; |
| |
| // Try converting to a simple type, which should fail and return the default value |
| ColumnPtr result; |
| auto status = schema_util::cast_column(src_col, simple_type, &result); |
| |
| // Check result |
| EXPECT_TRUE(status.ok()); |
| EXPECT_EQ(result->size(), 2); |
| EXPECT_EQ(result->get_data_at(0).size, 26); |
| } |
| |
| TEST_F(SchemaUtilTest, TestGetColumnByTypeEdgeCases) { |
| // Test decimal type |
| auto decimal_type = std::make_shared<DataTypeDecimal<Decimal128V2>>(18, 2); |
| schema_util::ExtraInfo ext_info; |
| auto decimal_column = schema_util::get_column_by_type(decimal_type, "decimal_col", ext_info); |
| EXPECT_EQ(decimal_column.type(), FieldType::OLAP_FIELD_TYPE_DECIMAL); |
| EXPECT_EQ(decimal_column.precision(), 18); |
| EXPECT_EQ(decimal_column.frac(), 2); |
| |
| // Test datetime type |
| auto datetime_type = std::make_shared<DataTypeDateTime>(); |
| auto datetime_column = schema_util::get_column_by_type(datetime_type, "datetime_col", ext_info); |
| EXPECT_EQ(datetime_column.type(), FieldType::OLAP_FIELD_TYPE_DATETIME); |
| |
| // Test datetime v2 type |
| auto datetime_v2_type = std::make_shared<DataTypeDateTimeV2>(6); |
| auto datetime_v2_column = |
| schema_util::get_column_by_type(datetime_v2_type, "datetime_v2_col", ext_info); |
| EXPECT_EQ(datetime_v2_column.type(), FieldType::OLAP_FIELD_TYPE_DATETIMEV2); |
| EXPECT_EQ(datetime_v2_column.precision(), -1); |
| EXPECT_EQ(datetime_v2_column.frac(), 6); |
| |
| // Test invalid type |
| auto invalid_type = std::make_shared<DataTypeNothing>(); |
| EXPECT_THROW(schema_util::get_column_by_type(invalid_type, "invalid_col", ext_info), Exception); |
| } |
| |
| TEST_F(SchemaUtilTest, TestUpdateLeastSchemaInternal) { |
| // Create test schemas and types |
| std::map<PathInData, DataTypes> subcolumns_types; |
| auto schema = std::make_shared<TabletSchema>(); |
| |
| // Add some test columns |
| TabletColumn base_col; |
| base_col.set_unique_id(1); |
| base_col.set_name("test_variant"); |
| base_col.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); |
| schema->append_column(base_col); |
| |
| // Add different types for same path |
| PathInData test_path("test_variant.a"); |
| subcolumns_types[test_path] = {std::make_shared<DataTypeInt32>(), |
| std::make_shared<DataTypeInt64>()}; |
| |
| // Add array types with different dimensions |
| PathInData array_path("test_variant.b"); |
| subcolumns_types[array_path] = { |
| std::make_shared<DataTypeArray>(std::make_shared<DataTypeInt32>()), |
| std::make_shared<DataTypeArray>( |
| std::make_shared<DataTypeArray>(std::make_shared<DataTypeInt32>()))}; |
| |
| // Add path with single type |
| PathInData single_path("test_variant.c"); |
| subcolumns_types[single_path] = {std::make_shared<DataTypeString>()}; |
| |
| std::map<std::string, TabletColumnPtr> typed_columns; |
| schema_util::update_least_schema_internal(subcolumns_types, schema, false, 1, typed_columns); |
| |
| // Check results |
| EXPECT_EQ(schema->num_columns(), 4); // base + 3 subcolumns |
| |
| // Check that array path was converted to JSONB due to dimension mismatch |
| int array_col_idx = schema->field_index("test_variant.b"); |
| EXPECT_GE(array_col_idx, 0); |
| EXPECT_EQ(schema->column(array_col_idx).type(), FieldType::OLAP_FIELD_TYPE_JSONB); |
| |
| // Check that mixed integer types were promoted |
| int int_col_idx = schema->field_index("test_variant.a"); |
| EXPECT_GE(int_col_idx, 0); |
| EXPECT_EQ(schema->column(int_col_idx).type(), FieldType::OLAP_FIELD_TYPE_BIGINT); |
| } |
| |
| TEST_F(SchemaUtilTest, TestUpdateLeastCommonSchema) { |
| // Create test schemas |
| std::vector<TabletSchemaSPtr> schemas; |
| auto schema1 = std::make_shared<TabletSchema>(); |
| auto schema2 = std::make_shared<TabletSchema>(); |
| |
| // Add variant column to both schemas |
| TabletColumn variant_col; |
| variant_col.set_unique_id(1); |
| variant_col.set_name("test_variant"); |
| variant_col.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); |
| schema1->append_column(variant_col); |
| schema2->append_column(variant_col); |
| |
| // Add different subcolumns to schemas |
| TabletColumn subcol1; |
| subcol1.set_name("test_variant.a"); |
| subcol1.set_type(FieldType::OLAP_FIELD_TYPE_INT); |
| subcol1.set_parent_unique_id(1); |
| subcol1.set_path_info(PathInData("test_variant.a")); |
| schema1->append_column(subcol1); |
| |
| TabletColumn subcol2; |
| subcol2.set_name("test_variant.a"); |
| subcol2.set_type(FieldType::OLAP_FIELD_TYPE_BIGINT); |
| subcol2.set_parent_unique_id(1); |
| subcol2.set_path_info(PathInData("test_variant.a")); |
| schema2->append_column(subcol2); |
| |
| schemas.push_back(schema1); |
| schemas.push_back(schema2); |
| |
| auto result_schema = std::make_shared<TabletSchema>(); |
| result_schema->append_column(variant_col); |
| |
| std::set<PathInData> path_set; |
| schema_util::update_least_common_schema(schemas, result_schema, 1, &path_set); |
| |
| // Check results |
| EXPECT_EQ(result_schema->num_columns(), 2); // variant + subcolumn |
| EXPECT_EQ(path_set.size(), 1); |
| EXPECT_TRUE(path_set.find(PathInData("test_variant.a")) != path_set.end()); |
| |
| // Check that subcolumn type was promoted to BIGINT |
| int subcol_idx = result_schema->field_index("test_variant.a"); |
| EXPECT_GE(subcol_idx, 0); |
| EXPECT_EQ(result_schema->column(subcol_idx).type(), FieldType::OLAP_FIELD_TYPE_BIGINT); |
| } |
| |
| TEST_F(SchemaUtilTest, TestUpdateLeastCommonSchema2) { |
| // Create common schema with a variant column |
| TabletSchemaSPtr common_schema = std::make_shared<TabletSchema>(); |
| TabletColumn variant_col; |
| variant_col.set_unique_id(1); |
| variant_col.set_name("test_variant"); |
| variant_col.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); |
| |
| // Create subcolumns for variant column in common_schema |
| TabletColumn sub_col1; |
| sub_col1.set_name("test_variant.field1"); |
| sub_col1.set_type(FieldType::OLAP_FIELD_TYPE_INT); |
| sub_col1.set_parent_unique_id(1); |
| vectorized::PathInData path1("test_variant.field1"); |
| sub_col1.set_path_info(path1); |
| variant_col.add_sub_column(sub_col1); |
| |
| TabletColumn sub_col2; |
| sub_col2.set_name("test_variant.field2"); |
| sub_col2.set_type(FieldType::OLAP_FIELD_TYPE_STRING); |
| sub_col2.set_parent_unique_id(1); |
| vectorized::PathInData path2("test_variant.field2"); |
| sub_col2.set_path_info(path2); |
| variant_col.add_sub_column(sub_col2); |
| |
| common_schema->append_column(variant_col); |
| |
| // Create schemas vector with two schemas |
| std::vector<TabletSchemaSPtr> schemas; |
| // Schema1: doesn't have the variant column |
| auto schema1 = std::make_shared<TabletSchema>(); |
| schemas.push_back(schema1); |
| |
| // Schema2: has variant column with different subcolumns |
| auto schema2 = std::make_shared<TabletSchema>(); |
| TabletColumn variant_col2; |
| variant_col2.set_unique_id(1); |
| variant_col2.set_name("test_variant"); |
| variant_col2.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); |
| |
| // Add subcolumns to schema2's variant column |
| TabletColumn sub_col3; |
| sub_col3.set_name("test_variant.field3"); |
| sub_col3.set_type(FieldType::OLAP_FIELD_TYPE_INT); |
| sub_col3.set_parent_unique_id(1); |
| vectorized::PathInData path3("test_variant.field3"); |
| sub_col3.set_path_info(path3); |
| variant_col2.add_sub_column(sub_col3); |
| |
| // Add a subcolumn with same path but different type |
| TabletColumn sub_col1_different_type; |
| sub_col1_different_type.set_name("test_variant.field1"); |
| sub_col1_different_type.set_type(FieldType::OLAP_FIELD_TYPE_BIGINT); |
| sub_col1_different_type.set_parent_unique_id(1); |
| sub_col1_different_type.set_path_info(path1); |
| variant_col2.add_sub_column(sub_col1_different_type); |
| |
| schema2->append_column(variant_col2); |
| schemas.push_back(schema2); |
| |
| // Create path_set that contains some paths |
| std::set<PathInData> path_set; |
| path_set.insert(path1); |
| path_set.insert(path2); |
| path_set.insert(path3); |
| |
| // Test update_least_common_schema |
| // This should cover: |
| // 1. schema->field_index(variant_col_unique_id) == -1 branch (via schema1) |
| // 2. The for loop over sparse_columns() (via schema2) |
| // 3. subcolumns_types.find(*col->path_info_ptr()) != subcolumns_types.end() branch |
| schema_util::update_least_common_schema(schemas, common_schema, 1, &path_set); |
| |
| // Verify results |
| const auto& result_variant = common_schema->column_by_uid(1); |
| |
| // Check that all subcolumns are present |
| EXPECT_EQ(result_variant.get_sub_columns().size(), 2); |
| |
| // Check that field1 has the most compatible type (should be BIGINT due to type promotion) |
| bool found_field1 = false; |
| bool found_field2 = false; |
| bool found_field3 = false; |
| |
| for (const auto& col : result_variant.get_sub_columns()) { |
| if (col->name() == "test_variant.field1") { |
| found_field1 = true; |
| EXPECT_EQ(col->type(), FieldType::OLAP_FIELD_TYPE_INT); |
| } else if (col->name() == "test_variant.field2") { |
| found_field2 = true; |
| EXPECT_EQ(col->type(), FieldType::OLAP_FIELD_TYPE_STRING); |
| } else if (col->name() == "test_variant.field3") { |
| EXPECT_EQ(col->type(), FieldType::OLAP_FIELD_TYPE_INT); |
| } |
| } |
| |
| EXPECT_TRUE(found_field1); |
| EXPECT_TRUE(found_field2); |
| EXPECT_FALSE(found_field3); |
| } |
| |
| TEST_F(SchemaUtilTest, TestUpdateLeastCommonSchema3) { |
| // Create common schema with a variant column |
| TabletSchemaSPtr common_schema = std::make_shared<TabletSchema>(); |
| TabletColumn variant_col; |
| variant_col.set_unique_id(1); |
| variant_col.set_name("test_variant"); |
| variant_col.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); |
| |
| // Create sparse columns for variant column in common_schema |
| TabletColumn sparse_col1; |
| sparse_col1.set_name("test_variant.sparse1"); |
| sparse_col1.set_type(FieldType::OLAP_FIELD_TYPE_INT); |
| sparse_col1.set_parent_unique_id(1); |
| vectorized::PathInData path1("test_variant.sparse1"); |
| sparse_col1.set_path_info(path1); |
| variant_col.append_sparse_column(sparse_col1); |
| |
| TabletColumn sparse_col2; |
| sparse_col2.set_name("test_variant.sparse2"); |
| sparse_col2.set_type(FieldType::OLAP_FIELD_TYPE_STRING); |
| sparse_col2.set_parent_unique_id(1); |
| vectorized::PathInData path2("test_variant.sparse2"); |
| sparse_col2.set_path_info(path2); |
| variant_col.append_sparse_column(sparse_col2); |
| |
| common_schema->append_column(variant_col); |
| |
| // Create schemas vector with two schemas |
| std::vector<TabletSchemaSPtr> schemas; |
| |
| // Schema1: doesn't have the variant column |
| auto schema1 = std::make_shared<TabletSchema>(); |
| schemas.push_back(schema1); |
| |
| // Schema2: has variant column with different sparse columns |
| auto schema2 = std::make_shared<TabletSchema>(); |
| TabletColumn variant_col2; |
| variant_col2.set_unique_id(1); |
| variant_col2.set_name("test_variant"); |
| variant_col2.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); |
| |
| // Add sparse columns to schema2's variant column |
| TabletColumn sparse_col3; |
| sparse_col3.set_name("test_variant.sparse3"); |
| sparse_col3.set_type(FieldType::OLAP_FIELD_TYPE_INT); |
| sparse_col3.set_parent_unique_id(1); |
| vectorized::PathInData path3("test_variant.sparse3"); |
| sparse_col3.set_path_info(path3); |
| variant_col2.append_sparse_column(sparse_col3); |
| |
| TabletColumn sparse_col4; |
| sparse_col4.set_name("test_variant.sparse4"); |
| sparse_col4.set_type(FieldType::OLAP_FIELD_TYPE_DOUBLE); |
| sparse_col4.set_parent_unique_id(1); |
| vectorized::PathInData path4("test_variant.sparse4"); |
| sparse_col4.set_path_info(path4); |
| variant_col2.append_sparse_column(sparse_col4); |
| |
| schema2->append_column(variant_col2); |
| schemas.push_back(schema2); |
| |
| // Create path_set that contains some but not all sparse column paths |
| std::set<PathInData> path_set; |
| path_set.insert(path1); // from common_schema |
| path_set.insert(path3); // from schema2 |
| |
| // Test update_least_sparse_column |
| // This should cover: |
| // 1. schema->field_index(variant_col_unique_id) == -1 branch (via schema1) |
| // 2. The for loop over sparse_columns() (via schema2) |
| // 3. path_set.find(*col->path_info_ptr()) == path_set.end() branch (via sparse_col4) |
| schema_util::update_least_common_schema(schemas, common_schema, 1, &path_set); |
| |
| // Verify that only sparse columns not in path_set are kept |
| const auto& result_variant = common_schema->column_by_uid(1); |
| EXPECT_EQ(result_variant.sparse_columns().size(), 2); |
| |
| // Check that sparse_col2 and sparse_col4 are kept (they weren't in path_set) |
| bool found_sparse2 = false; |
| bool found_sparse4 = false; |
| for (const auto& col : result_variant.sparse_columns()) { |
| if (col->name() == "test_variant.sparse2") { |
| found_sparse2 = true; |
| } else if (col->name() == "test_variant.sparse4") { |
| found_sparse4 = true; |
| } |
| } |
| EXPECT_TRUE(found_sparse2); |
| EXPECT_FALSE(found_sparse4); |
| } |
| |
| TEST_F(SchemaUtilTest, TestUpdateLeastSparseColumn) { |
| // Create test schemas |
| std::vector<TabletSchemaSPtr> schemas; |
| auto schema1 = std::make_shared<TabletSchema>(); |
| auto schema2 = std::make_shared<TabletSchema>(); |
| |
| // Add variant column to both schemas |
| TabletColumn variant_col; |
| variant_col.set_unique_id(1); |
| variant_col.set_name("test_variant"); |
| variant_col.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); |
| |
| // Add sparse columns to schemas |
| TabletColumn sparse_col1; |
| sparse_col1.set_name("test_variant.sparse1"); |
| sparse_col1.set_type(FieldType::OLAP_FIELD_TYPE_INT); |
| sparse_col1.set_parent_unique_id(1); |
| sparse_col1.set_path_info(PathInData("test_variant.sparse1")); |
| |
| TabletColumn sparse_col2; |
| sparse_col2.set_name("test_variant.sparse2"); |
| sparse_col2.set_type(FieldType::OLAP_FIELD_TYPE_BIGINT); |
| sparse_col2.set_parent_unique_id(1); |
| sparse_col2.set_path_info(PathInData("test_variant.sparse2")); |
| |
| schema1->append_column(variant_col); |
| schema1->append_column(sparse_col1); |
| schema2->append_column(variant_col); |
| schema2->append_column(sparse_col2); |
| |
| schemas.push_back(schema1); |
| schemas.push_back(schema2); |
| |
| auto result_schema = std::make_shared<TabletSchema>(); |
| result_schema->append_column(variant_col); |
| |
| std::set<PathInData> path_set; |
| path_set.insert(PathInData("test_variant.other_path")); // This path should be excluded |
| |
| schema_util::update_least_sparse_column(schemas, result_schema, 1, path_set); |
| |
| // Check results : why 0? |
| EXPECT_EQ(result_schema->column_by_uid(1).sparse_columns().size(), 0); |
| } |
| |
| TEST_F(SchemaUtilTest, TestUpdateLeastSparseColumn2) { |
| // Test case 1: schema doesn't have the variant column |
| TabletSchema schema; |
| TabletColumn variant; |
| variant.set_unique_id(2); // Different ID than what we'll search for |
| schema.append_column(variant); |
| |
| std::vector<TabletSchemaSPtr> schemas; |
| auto schema1 = std::make_shared<TabletSchema>(); |
| auto schema2 = std::make_shared<TabletSchema>(); |
| schemas.push_back(schema1); |
| schemas.push_back(schema2); |
| |
| auto result_schema = std::make_shared<TabletSchema>(); |
| std::set<PathInData> path_set; |
| path_set.insert(PathInData("test.path")); |
| |
| // This should handle the case where field_index returns -1 |
| // schema_util::update_least_sparse_column(schemas, result_schema, 1, path_set); |
| // EXPECT_EQ(result_schema->num_columns(), 0); |
| |
| // Test case 2: schema has variant column but no sparse columns |
| TabletColumn variant2; |
| variant2.set_unique_id(1); |
| variant2.set_name("test_variant"); |
| variant2.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); |
| result_schema->append_column(variant2); |
| |
| // This should handle the case where sparse_columns is empty |
| schema_util::update_least_sparse_column(schemas, result_schema, 1, path_set); |
| EXPECT_EQ(result_schema->column_by_uid(1).sparse_columns().size(), 0); |
| |
| // Test case 3: schema has variant column with sparse columns but empty path_set |
| TabletColumn sparse_col; |
| sparse_col.set_name("test_variant.sparse"); |
| sparse_col.set_type(FieldType::OLAP_FIELD_TYPE_INT); |
| sparse_col.set_parent_unique_id(1); |
| sparse_col.set_path_info(PathInData("test_variant.sparse")); |
| variant2.append_sparse_column(sparse_col); |
| |
| // dropped Variant Col |
| |
| std::set<PathInData> empty_path_set; |
| schema_util::update_least_sparse_column(schemas, result_schema, 1, empty_path_set); |
| EXPECT_EQ(result_schema->column_by_uid(1).sparse_columns().size(), 0); |
| } |
| |
| TEST_F(SchemaUtilTest, TestUpdateLeastSparseColumn3) { |
| // Create common schema with a variant column |
| TabletSchemaSPtr common_schema = std::make_shared<TabletSchema>(); |
| TabletColumn variant_col; |
| variant_col.set_unique_id(1); |
| variant_col.set_name("test_variant"); |
| variant_col.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); |
| |
| // Create sparse columns for variant column in common_schema |
| TabletColumn sparse_col1; |
| sparse_col1.set_name("test_variant.sparse1"); |
| sparse_col1.set_type(FieldType::OLAP_FIELD_TYPE_INT); |
| sparse_col1.set_parent_unique_id(1); |
| vectorized::PathInData path1("test_variant.sparse1"); |
| sparse_col1.set_path_info(path1); |
| variant_col.append_sparse_column(sparse_col1); |
| |
| TabletColumn sparse_col2; |
| sparse_col2.set_name("test_variant.sparse2"); |
| sparse_col2.set_type(FieldType::OLAP_FIELD_TYPE_STRING); |
| sparse_col2.set_parent_unique_id(1); |
| vectorized::PathInData path2("test_variant.sparse2"); |
| sparse_col2.set_path_info(path2); |
| variant_col.append_sparse_column(sparse_col2); |
| |
| common_schema->append_column(variant_col); |
| |
| // Create schemas vector with two schemas |
| std::vector<TabletSchemaSPtr> schemas; |
| |
| // Schema1: doesn't have the variant column |
| auto schema1 = std::make_shared<TabletSchema>(); |
| schemas.push_back(schema1); |
| |
| // Schema2: has variant column with different sparse columns |
| auto schema2 = std::make_shared<TabletSchema>(); |
| TabletColumn variant_col2; |
| variant_col2.set_unique_id(1); |
| variant_col2.set_name("test_variant"); |
| variant_col2.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); |
| |
| // Add sparse columns to schema2's variant column |
| TabletColumn sparse_col3; |
| sparse_col3.set_name("test_variant.sparse3"); |
| sparse_col3.set_type(FieldType::OLAP_FIELD_TYPE_INT); |
| sparse_col3.set_parent_unique_id(1); |
| vectorized::PathInData path3("test_variant.sparse3"); |
| sparse_col3.set_path_info(path3); |
| variant_col2.append_sparse_column(sparse_col3); |
| |
| TabletColumn sparse_col4; |
| sparse_col4.set_name("test_variant.sparse4"); |
| sparse_col4.set_type(FieldType::OLAP_FIELD_TYPE_DOUBLE); |
| sparse_col4.set_parent_unique_id(1); |
| vectorized::PathInData path4("test_variant.sparse4"); |
| sparse_col4.set_path_info(path4); |
| variant_col2.append_sparse_column(sparse_col4); |
| |
| schema2->append_column(variant_col2); |
| schemas.push_back(schema2); |
| |
| // Create path_set that contains some but not all sparse column paths |
| std::set<PathInData> path_set; |
| path_set.insert(path1); // from common_schema |
| path_set.insert(path3); // from schema2 |
| |
| // Test update_least_sparse_column |
| // This should cover: |
| // 1. schema->field_index(variant_col_unique_id) == -1 branch (via schema1) |
| // 2. The for loop over sparse_columns() (via schema2) |
| // 3. path_set.find(*col->path_info_ptr()) == path_set.end() branch (via sparse_col4) |
| schema_util::update_least_sparse_column(schemas, common_schema, 1, path_set); |
| |
| // Verify that only sparse columns not in path_set are kept |
| const auto& result_variant = common_schema->column_by_uid(1); |
| EXPECT_EQ(result_variant.sparse_columns().size(), 3); |
| |
| // Check that sparse_col2 and sparse_col4 are kept (they weren't in path_set) |
| bool found_sparse2 = false; |
| bool found_sparse4 = false; |
| for (const auto& col : result_variant.sparse_columns()) { |
| if (col->name() == "test_variant.sparse2") { |
| found_sparse2 = true; |
| } else if (col->name() == "test_variant.sparse4") { |
| found_sparse4 = true; |
| } |
| } |
| EXPECT_TRUE(found_sparse2); |
| EXPECT_TRUE(found_sparse4); |
| } |
| |
| TEST_F(SchemaUtilTest, TestGetCompactionSchema) { |
| // Create test rowsets |
| std::vector<RowsetSharedPtr> rowsets; |
| RowsetMetaSharedPtr rowset_meta = std::make_shared<RowsetMeta>(); |
| |
| // Create schema for rowsets |
| TabletSchemaPB schema_pb; |
| schema_pb.set_keys_type(KeysType::DUP_KEYS); |
| construct_column(schema_pb.add_column(), schema_pb.add_index(), 10000, "v1_index", 1, "VARIANT", |
| "v1", IndexType::INVERTED); |
| |
| auto schema = std::make_shared<TabletSchema>(); |
| schema->init_from_pb(schema_pb); |
| |
| // Add path statistics |
| std::unordered_map<int32_t, schema_util::PathToNoneNullValues> path_stats; |
| path_stats[1] = {{"v1.a", 1000}, {"v1.b", 800}, {"v1.c", 500}, {"v1.d", 300}, {"v1.e", 200}}; |
| |
| // Mock rowset behavior |
| // BetaRowset rowset1(schema, rowset_meta, ""); |
| // BetaRowset rowset2(schema, rowset_meta, ""); |
| auto rowset1 = std::make_shared<BetaRowset>(schema, rowset_meta, ""); |
| auto rowset2 = std::make_shared<BetaRowset>(schema, rowset_meta, ""); |
| rowsets.push_back(rowset1); |
| rowsets.push_back(rowset2); |
| |
| auto target_schema = std::make_shared<TabletSchema>(); |
| target_schema->init_from_pb(schema_pb); |
| |
| auto status = schema_util::get_extended_compaction_schema(rowsets, target_schema); |
| EXPECT_TRUE(status.ok()); |
| |
| // Check that paths were properly distributed between subcolumns and sparse columns |
| const auto& variant_col = target_schema->column_by_uid(1); |
| // this is not work!!! |
| EXPECT_EQ(variant_col.get_sub_columns().size(), 0); |
| EXPECT_EQ(variant_col.sparse_columns().size(), 0); |
| } |
| |
| TEST_F(SchemaUtilTest, TestGetSortedSubcolumns) { |
| // Create test subcolumns |
| vectorized::ColumnObject::Subcolumns subcolumns; |
| auto obj = VariantUtil::construct_dst_varint_column(); |
| |
| auto sorted = schema_util::get_sorted_subcolumns(obj->get_subcolumns()); |
| std::vector<std::string> expected_paths = {"", "v.b", "v.b.d", "v.c.d", "v.e", "v.f"}; |
| EXPECT_EQ(sorted.size(), 6); |
| int i = 0; |
| for (auto iter = sorted.begin(); iter != sorted.end(); ++iter) { |
| EXPECT_EQ(iter.operator*()->path.get_path(), expected_paths[i++]); |
| } |
| } |
| |
| TEST_F(SchemaUtilTest, TestCreateSparseColumn) { |
| TabletColumn variant; |
| variant.set_name("test_variant"); |
| variant.set_unique_id(42); |
| variant.set_aggregation_method(FieldAggregationMethod::OLAP_FIELD_AGGREGATION_GENERIC); |
| |
| auto sparse_column = schema_util::create_sparse_column(variant); |
| |
| EXPECT_EQ(sparse_column.name(), "test_variant." + SPARSE_COLUMN_PATH); |
| EXPECT_EQ(sparse_column.type(), FieldType::OLAP_FIELD_TYPE_MAP); |
| EXPECT_EQ(sparse_column.aggregation(), FieldAggregationMethod::OLAP_FIELD_AGGREGATION_GENERIC); |
| EXPECT_EQ(sparse_column.parent_unique_id(), 42); |
| EXPECT_EQ(sparse_column.path_info_ptr()->get_path(), "test_variant." + SPARSE_COLUMN_PATH); |
| |
| // Check map value columns |
| EXPECT_EQ(sparse_column.get_sub_column(0).type(), FieldType::OLAP_FIELD_TYPE_STRING); |
| EXPECT_EQ(sparse_column.get_sub_column(1).type(), FieldType::OLAP_FIELD_TYPE_STRING); |
| } |
| |
| TEST_F(SchemaUtilTest, TestParseVariantColumnsEdgeCases) { |
| Block block; |
| |
| // Test parsing from string to variant |
| auto variant_type = std::make_shared<DataTypeObject>(10); |
| auto variant_column = ColumnObject::create(10); |
| auto root_column = ColumnString::create(); |
| |
| // Add some test JSON data |
| root_column->insert("{'a': 1, 'b': 'test'}"); |
| root_column->insert("{'a': 2, 'c': [1,2,3]}"); |
| root_column->insert("{'a': 3, 'd': {'x': 1}}"); |
| |
| variant_column->create_root(std::make_shared<DataTypeString>(), root_column->get_ptr()); |
| block.insert({variant_column->get_ptr(), variant_type, "variant_col"}); |
| |
| std::vector<int> variant_pos {0}; |
| ParseConfig config; |
| |
| auto status = schema_util::parse_variant_columns(block, variant_pos, config); |
| EXPECT_TRUE(status.ok()); |
| |
| // Test parsing from JSONB to variant |
| auto jsonb_type = std::make_shared<DataTypeJsonb>(); |
| auto jsonb_column = ColumnString::create(); |
| jsonb_column->insert("{'x': 1}"); |
| |
| auto variant_column2 = ColumnObject::create(10); |
| variant_column2->create_root(jsonb_type, jsonb_column->get_ptr()); |
| |
| Block block2; |
| block2.insert({variant_column2->get_ptr(), variant_type, "variant_col2"}); |
| |
| status = schema_util::parse_variant_columns(block2, {0}, config); |
| EXPECT_TRUE(status.ok()); |
| |
| // Test parsing already parsed variant |
| auto variant_column3 = ColumnObject::create(10); |
| variant_column3->finalize(); |
| |
| Block block3; |
| block3.insert({variant_column3->get_ptr(), variant_type, "variant_col3"}); |
| |
| status = schema_util::parse_variant_columns(block3, {0}, config); |
| EXPECT_TRUE(status.ok()); |
| } |
| |
| TEST_F(SchemaUtilTest, TestParseVariantColumnsWithNulls) { |
| Block block; |
| |
| // Create a nullable variant column |
| auto variant_type = make_nullable(std::make_shared<DataTypeObject>(10)); |
| auto string_type = make_nullable(std::make_shared<DataTypeString>()); |
| |
| auto string_column = ColumnString::create(); |
| string_column->insert("{'a': 1}"); |
| auto nullable_string = make_nullable(string_column->get_ptr()); |
| |
| auto variant_column = ColumnObject::create(10); |
| variant_column->create_root(string_type, nullable_string->assume_mutable()); |
| auto nullable_variant = make_nullable(variant_column->get_ptr()); |
| |
| block.insert({nullable_variant, variant_type, "nullable_variant"}); |
| |
| std::vector<int> variant_pos {0}; |
| ParseConfig config; |
| |
| auto status = schema_util::parse_variant_columns(block, variant_pos, config); |
| EXPECT_TRUE(status.ok()); |
| |
| const auto& result_column = block.get_by_position(0).column; |
| EXPECT_TRUE(result_column->is_nullable()); |
| } |
| |
| TEST_F(SchemaUtilTest, get_compaction_typed_columns) { |
| TabletColumn variant; |
| variant.set_unique_id(10); |
| variant.set_variant_max_subcolumns_count(3); |
| |
| TabletColumn subcolumn; |
| subcolumn.set_name("profile.id.*"); |
| subcolumn.set_type(FieldType::OLAP_FIELD_TYPE_INT); |
| variant.add_sub_column(subcolumn); |
| |
| TabletColumn subcolumn2; |
| subcolumn2.set_name("profile.name.?"); |
| subcolumn2.set_type(FieldType::OLAP_FIELD_TYPE_STRING); |
| variant.add_sub_column(subcolumn2); |
| |
| TabletSchemaSPtr schema = std::make_shared<TabletSchema>(); |
| schema->append_column(variant); |
| |
| std::unordered_set<std::string> typed_paths; |
| typed_paths.insert("profile.id.name"); |
| TabletSchemaSPtr output_schema = std::make_shared<TabletSchema>(); |
| TabletColumnPtr parent_column = std::make_shared<TabletColumn>(variant); |
| TabletSchema::PathsSetInfo paths_set_info; |
| EXPECT_TRUE(schema_util::get_compaction_typed_columns(schema, typed_paths, parent_column, |
| output_schema, paths_set_info) |
| .ok()); |
| EXPECT_EQ(output_schema->num_columns(), 1); |
| EXPECT_EQ(output_schema->column(0).type(), FieldType::OLAP_FIELD_TYPE_INT); |
| EXPECT_EQ(paths_set_info.typed_path_set.size(), 1); |
| |
| typed_paths.insert("abc"); |
| EXPECT_FALSE(schema_util::get_compaction_typed_columns(schema, typed_paths, parent_column, |
| output_schema, paths_set_info) |
| .ok()); |
| } |
| |
| TEST_F(SchemaUtilTest, get_compaction_nested_columns) { |
| TabletColumn variant; |
| variant.set_unique_id(20); |
| variant.set_variant_max_subcolumns_count(3); |
| |
| TabletSchemaSPtr schema = std::make_shared<TabletSchema>(); |
| schema->append_column(variant); |
| |
| std::unordered_set<vectorized::PathInData, vectorized::PathInData::Hash> nested_paths; |
| vectorized::PathInData path1("profile.address"); |
| vectorized::PathInData path2("profile.phone"); |
| nested_paths.insert(path1); |
| nested_paths.insert(path2); |
| |
| TabletSchemaSPtr output_schema = std::make_shared<TabletSchema>(); |
| TabletSchema::PathsSetInfo paths_set_info; |
| |
| doris::vectorized::schema_util::PathToDataTypes path_to_data_types; |
| path_to_data_types[path1] = {std::make_shared<vectorized::DataTypeInt32>(), |
| std::make_shared<vectorized::DataTypeString>()}; |
| path_to_data_types[path2] = {std::make_shared<vectorized::DataTypeString>(), |
| std::make_shared<vectorized::DataTypeString>()}; |
| TabletColumnPtr parent_column = std::make_shared<TabletColumn>(variant); |
| |
| Status st = schema_util::get_compaction_nested_columns( |
| nested_paths, path_to_data_types, parent_column, output_schema, paths_set_info); |
| |
| EXPECT_TRUE(st.ok()); |
| EXPECT_EQ(output_schema->num_columns(), 2); |
| for (const auto& column : output_schema->columns()) { |
| // std::cout << "column name: " << column->name() << " type: " << (int)column->type() << std::endl; |
| if (column->name().ends_with("address")) { |
| EXPECT_EQ(column->type(), FieldType::OLAP_FIELD_TYPE_JSONB); |
| } else if (column->name().ends_with("phone")) { |
| EXPECT_EQ(column->type(), FieldType::OLAP_FIELD_TYPE_STRING); |
| } |
| } |
| |
| std::unordered_set<vectorized::PathInData, vectorized::PathInData::Hash> bad_nested_paths; |
| bad_nested_paths.insert(vectorized::PathInData("not_exist")); |
| TabletSchemaSPtr bad_output_schema = std::make_shared<TabletSchema>(); |
| TabletSchema::PathsSetInfo bad_paths_set_info; |
| Status st2 = schema_util::get_compaction_nested_columns(bad_nested_paths, path_to_data_types, |
| parent_column, bad_output_schema, |
| bad_paths_set_info); |
| EXPECT_FALSE(st2.ok()); |
| } |
| |
| TEST_F(SchemaUtilTest, get_compaction_subcolumns) { |
| TabletColumn variant; |
| variant.set_unique_id(30); |
| variant.set_variant_max_subcolumns_count(3); |
| variant.set_aggregation_method(FieldAggregationMethod::OLAP_FIELD_AGGREGATION_NONE); |
| |
| TabletSchemaSPtr schema = std::make_shared<TabletSchema>(); |
| schema->append_column(variant); |
| |
| TabletColumnPtr parent_column = std::make_shared<TabletColumn>(variant); |
| |
| TabletSchema::PathsSetInfo paths_set_info; |
| paths_set_info.sub_path_set.insert("a"); |
| paths_set_info.sub_path_set.insert("b"); |
| doris::vectorized::schema_util::PathToDataTypes path_to_data_types; |
| std::unordered_set<std::string> sparse_paths; |
| TabletSchemaSPtr output_schema = std::make_shared<TabletSchema>(); |
| |
| schema_util::get_compaction_subcolumns(paths_set_info, parent_column, schema, |
| path_to_data_types, sparse_paths, output_schema); |
| EXPECT_EQ(output_schema->num_columns(), 2); |
| for (const auto& column : output_schema->columns()) { |
| EXPECT_EQ(column->type(), FieldType::OLAP_FIELD_TYPE_VARIANT); |
| } |
| |
| output_schema = std::make_shared<TabletSchema>(); |
| path_to_data_types.clear(); |
| path_to_data_types[vectorized::PathInData("a")] = { |
| std::make_shared<vectorized::DataTypeInt32>()}; |
| path_to_data_types[vectorized::PathInData("b")] = { |
| std::make_shared<vectorized::DataTypeString>()}; |
| schema_util::get_compaction_subcolumns(paths_set_info, parent_column, schema, |
| path_to_data_types, sparse_paths, output_schema); |
| EXPECT_EQ(output_schema->num_columns(), 2); |
| bool found_int = false, found_str = false; |
| for (const auto& column : output_schema->columns()) { |
| if (column->name().ends_with("a")) { |
| EXPECT_EQ(column->type(), FieldType::OLAP_FIELD_TYPE_INT); |
| found_int = true; |
| } else if (column->name().ends_with("b")) { |
| EXPECT_EQ(column->type(), FieldType::OLAP_FIELD_TYPE_STRING); |
| found_str = true; |
| } |
| } |
| EXPECT_TRUE(found_int && found_str); |
| |
| output_schema = std::make_shared<TabletSchema>(); |
| sparse_paths.insert("a"); |
| schema_util::get_compaction_subcolumns(paths_set_info, parent_column, schema, |
| path_to_data_types, sparse_paths, output_schema); |
| EXPECT_EQ(output_schema->num_columns(), 2); |
| for (const auto& column : output_schema->columns()) { |
| if (column->name().ends_with("a")) { |
| EXPECT_EQ(column->type(), FieldType::OLAP_FIELD_TYPE_VARIANT); |
| } else if (column->name().ends_with("b")) { |
| EXPECT_EQ(column->type(), FieldType::OLAP_FIELD_TYPE_STRING); |
| } |
| } |
| |
| output_schema = std::make_shared<TabletSchema>(); |
| sparse_paths.clear(); |
| |
| for (int i = 0; i < config::variant_max_sparse_column_statistics_size + 1; ++i) { |
| sparse_paths.insert("dummy" + std::to_string(i)); |
| } |
| schema_util::get_compaction_subcolumns(paths_set_info, parent_column, schema, |
| path_to_data_types, sparse_paths, output_schema); |
| EXPECT_EQ(output_schema->num_columns(), 2); |
| for (const auto& column : output_schema->columns()) { |
| EXPECT_EQ(column->type(), FieldType::OLAP_FIELD_TYPE_VARIANT); |
| } |
| } |
| |
| TEST_F(SchemaUtilTest, get_compaction_subcolumns_advanced) { |
| TabletColumn variant; |
| variant.set_unique_id(30); |
| variant.set_variant_max_subcolumns_count(3); |
| variant.set_aggregation_method(FieldAggregationMethod::OLAP_FIELD_AGGREGATION_NONE); |
| variant.set_variant_enable_typed_paths_to_sparse(true); |
| TabletColumn subcolumn; |
| subcolumn.set_name("c"); |
| subcolumn.set_type(FieldType::OLAP_FIELD_TYPE_DATEV2); |
| variant.add_sub_column(subcolumn); |
| TabletColumn subcolumn2; |
| subcolumn2.set_name("d"); |
| subcolumn2.set_type(FieldType::OLAP_FIELD_TYPE_DATEV2); |
| variant.add_sub_column(subcolumn2); |
| |
| TabletSchemaSPtr schema = std::make_shared<TabletSchema>(); |
| schema->append_column(variant); |
| |
| TabletColumnPtr parent_column = std::make_shared<TabletColumn>(variant); |
| |
| TabletSchema::PathsSetInfo paths_set_info; |
| paths_set_info.sub_path_set.insert("a"); |
| paths_set_info.sub_path_set.insert("b"); |
| paths_set_info.sub_path_set.insert("c"); |
| paths_set_info.sub_path_set.insert("d"); |
| doris::vectorized::schema_util::PathToDataTypes path_to_data_types; |
| std::unordered_set<std::string> sparse_paths; |
| TabletSchemaSPtr output_schema = std::make_shared<TabletSchema>(); |
| |
| schema_util::get_compaction_subcolumns(paths_set_info, parent_column, schema, |
| path_to_data_types, sparse_paths, output_schema); |
| EXPECT_EQ(output_schema->num_columns(), 4); |
| for (const auto& column : output_schema->columns()) { |
| if (column->name().ends_with("a") || column->name().ends_with("b")) { |
| EXPECT_EQ(column->type(), FieldType::OLAP_FIELD_TYPE_VARIANT); |
| } else if (column->name().ends_with("c") || column->name().ends_with("d")) { |
| EXPECT_EQ(column->type(), FieldType::OLAP_FIELD_TYPE_DATEV2); |
| } |
| } |
| |
| output_schema = std::make_shared<TabletSchema>(); |
| path_to_data_types.clear(); |
| path_to_data_types[vectorized::PathInData("a")] = { |
| std::make_shared<vectorized::DataTypeInt32>()}; |
| path_to_data_types[vectorized::PathInData("b")] = { |
| std::make_shared<vectorized::DataTypeString>()}; |
| schema_util::get_compaction_subcolumns(paths_set_info, parent_column, schema, |
| path_to_data_types, sparse_paths, output_schema); |
| EXPECT_EQ(output_schema->num_columns(), 4); |
| bool found_int = false, found_str = false; |
| for (const auto& column : output_schema->columns()) { |
| if (column->name().ends_with("a")) { |
| EXPECT_EQ(column->type(), FieldType::OLAP_FIELD_TYPE_INT); |
| found_int = true; |
| } else if (column->name().ends_with("b")) { |
| EXPECT_EQ(column->type(), FieldType::OLAP_FIELD_TYPE_STRING); |
| found_str = true; |
| } else if (column->name().ends_with("c") || column->name().ends_with("d")) { |
| EXPECT_EQ(column->type(), FieldType::OLAP_FIELD_TYPE_DATEV2); |
| } |
| } |
| EXPECT_TRUE(found_int && found_str); |
| |
| output_schema = std::make_shared<TabletSchema>(); |
| sparse_paths.insert("a"); |
| schema_util::get_compaction_subcolumns(paths_set_info, parent_column, schema, |
| path_to_data_types, sparse_paths, output_schema); |
| EXPECT_EQ(output_schema->num_columns(), 4); |
| for (const auto& column : output_schema->columns()) { |
| if (column->name().ends_with("a")) { |
| EXPECT_EQ(column->type(), FieldType::OLAP_FIELD_TYPE_VARIANT); |
| } else if (column->name().ends_with("b")) { |
| EXPECT_EQ(column->type(), FieldType::OLAP_FIELD_TYPE_STRING); |
| } else if (column->name().ends_with("c") || column->name().ends_with("d")) { |
| EXPECT_EQ(column->type(), FieldType::OLAP_FIELD_TYPE_DATEV2); |
| } |
| } |
| |
| output_schema = std::make_shared<TabletSchema>(); |
| sparse_paths.clear(); |
| |
| for (int i = 0; i < config::variant_max_sparse_column_statistics_size + 1; ++i) { |
| sparse_paths.insert("dummy" + std::to_string(i)); |
| } |
| schema_util::get_compaction_subcolumns(paths_set_info, parent_column, schema, |
| path_to_data_types, sparse_paths, output_schema); |
| EXPECT_EQ(output_schema->num_columns(), 4); |
| for (const auto& column : output_schema->columns()) { |
| if (column->name().ends_with("a") || column->name().ends_with("b")) { |
| EXPECT_EQ(column->type(), FieldType::OLAP_FIELD_TYPE_VARIANT); |
| } else if (column->name().ends_with("c") || column->name().ends_with("d")) { |
| EXPECT_EQ(column->type(), FieldType::OLAP_FIELD_TYPE_DATEV2); |
| } |
| } |
| } |