blob: 31932c2e9c20101c57a3c2c1807586fd21108782 [file]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "format/table/hive_reader.h"
#include <cctz/time_zone.h>
#include <gen_cpp/Descriptors_types.h>
#include <gen_cpp/PaloInternalService_types.h>
#include <gen_cpp/PlanNodes_types.h>
#include <gen_cpp/Types_types.h>
#include <gtest/gtest.h>
#include <iostream>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "common/object_pool.h"
#include "core/block/block.h"
#include "core/block/column_with_type_and_name.h"
#include "core/column/column.h"
#include "core/column/column_array.h"
#include "core/column/column_nullable.h"
#include "core/column/column_struct.h"
#include "core/data_type/data_type.h"
#include "core/data_type/data_type_array.h"
#include "core/data_type/data_type_factory.hpp"
#include "core/data_type/data_type_nullable.h"
#include "core/data_type/data_type_number.h"
#include "core/data_type/data_type_string.h"
#include "core/data_type/data_type_struct.h"
#include "format/orc/vorc_reader.h"
#include "format/parquet/vparquet_reader.h"
#include "io/fs/file_meta_cache.h"
#include "io/fs/file_reader_writer_fwd.h"
#include "io/fs/file_system.h"
#include "io/fs/local_file_system.h"
#include "runtime/descriptors.h"
#include "runtime/runtime_state.h"
#include "storage/olap_scan_common.h"
#include "util/timezone_utils.h"
namespace doris::table {
class HiveReaderTest : public ::testing::Test {
protected:
void SetUp() override {
cache = std::make_unique<doris::FileMetaCache>(1024);
// Setup timezone
doris::TimezoneUtils::find_cctz_time_zone(doris::TimezoneUtils::default_time_zone,
timezone_obj);
}
void TearDown() override { cache.reset(); }
// Helper function to create complex struct types for testing
void create_complex_struct_types(DataTypePtr& coordinates_struct_type,
DataTypePtr& address_struct_type,
DataTypePtr& phone_struct_type,
DataTypePtr& contact_struct_type,
DataTypePtr& hobby_element_struct_type,
DataTypePtr& hobbies_array_type,
DataTypePtr& profile_struct_type, DataTypePtr& name_type) {
// Create name column type (direct field)
name_type = make_nullable(std::make_shared<DataTypeString>());
// First create coordinates struct type
std::vector<DataTypePtr> coordinates_types = {
make_nullable(std::make_shared<DataTypeFloat64>()), // lat (field ID 10)
make_nullable(std::make_shared<DataTypeFloat64>()) // lng (field ID 11)
};
std::vector<std::string> coordinates_names = {"lat", "lng"};
coordinates_struct_type = make_nullable(
std::make_shared<DataTypeStruct>(coordinates_types, coordinates_names));
// Create address struct type (with street, city, coordinates)
std::vector<DataTypePtr> address_types = {
make_nullable(std::make_shared<DataTypeString>()), // street (field ID 7)
make_nullable(std::make_shared<DataTypeString>()), // city (field ID 8)
coordinates_struct_type // coordinates (field ID 9)
};
std::vector<std::string> address_names = {"street", "city", "coordinates"};
address_struct_type =
make_nullable(std::make_shared<DataTypeStruct>(address_types, address_names));
// Create phone struct type
std::vector<DataTypePtr> phone_types = {
make_nullable(std::make_shared<DataTypeString>()), // country_code (field ID 14)
make_nullable(std::make_shared<DataTypeString>()) // number (field ID 15)
};
std::vector<std::string> phone_names = {"country_code", "number"};
phone_struct_type =
make_nullable(std::make_shared<DataTypeStruct>(phone_types, phone_names));
// Create contact struct type (with email, phone)
std::vector<DataTypePtr> contact_types = {
make_nullable(std::make_shared<DataTypeString>()), // email (field ID 12)
phone_struct_type // phone (field ID 13)
};
std::vector<std::string> contact_names = {"email", "phone"};
contact_struct_type =
make_nullable(std::make_shared<DataTypeStruct>(contact_types, contact_names));
// Create hobby element struct type for array elements
std::vector<DataTypePtr> hobby_element_types = {
make_nullable(std::make_shared<DataTypeString>()), // name (field ID 17)
make_nullable(std::make_shared<DataTypeInt32>()) // level (field ID 18)
};
std::vector<std::string> hobby_element_names = {"name", "level"};
hobby_element_struct_type = make_nullable(
std::make_shared<DataTypeStruct>(hobby_element_types, hobby_element_names));
// Create hobbies array type
hobbies_array_type =
make_nullable(std::make_shared<DataTypeArray>(hobby_element_struct_type));
// Create complete profile struct type (with address, contact, hobbies)
std::vector<DataTypePtr> profile_types = {
address_struct_type, // address (field ID 4)
contact_struct_type, // contact (field ID 5)
hobbies_array_type // hobbies (field ID 6)
};
std::vector<std::string> profile_names = {"address", "contact", "hobbies"};
profile_struct_type =
make_nullable(std::make_shared<DataTypeStruct>(profile_types, profile_names));
}
// Helper function to create tuple descriptor
const TupleDescriptor* create_tuple_descriptor(
DescriptorTbl** desc_tbl, ObjectPool& obj_pool, TDescriptorTable& t_desc_table,
TTableDescriptor& t_table_desc, const std::vector<std::string>& column_names,
const std::vector<int>& column_positions,
const std::vector<TPrimitiveType::type>& column_types) {
// Create table descriptor with complex schema
auto create_table_desc = [](TDescriptorTable& t_desc_table, TTableDescriptor& t_table_desc,
const std::vector<std::string>& table_column_names,
const std::vector<int>& table_column_positions,
const std::vector<TPrimitiveType::type>& types) {
t_table_desc.__set_id(0);
t_table_desc.__set_tableType(TTableType::OLAP_TABLE);
t_table_desc.__set_numCols(0);
t_table_desc.__set_numClusteringCols(0);
t_desc_table.tableDescriptors.push_back(t_table_desc);
t_desc_table.__isset.tableDescriptors = true;
for (int i = 0; i < table_column_names.size(); i++) {
TSlotDescriptor tslot_desc;
TTypeDesc type;
if (table_column_names[i] == "profile") {
// STRUCT/ARRAY nodes set contains_nulls; SCALAR nodes do not
TTypeNode struct_node;
struct_node.__set_type(TTypeNodeType::STRUCT);
std::vector<TStructField> struct_fields;
TStructField address_field;
address_field.__set_name("address");
address_field.__set_contains_null(true);
struct_fields.push_back(address_field);
TStructField contact_field;
contact_field.__set_name("contact");
contact_field.__set_contains_null(true);
struct_fields.push_back(contact_field);
TStructField hobbies_field;
hobbies_field.__set_name("hobbies");
hobbies_field.__set_contains_null(true);
struct_fields.push_back(hobbies_field);
struct_node.__set_struct_fields(struct_fields);
type.types.push_back(struct_node);
TTypeNode address_node;
address_node.__set_type(TTypeNodeType::STRUCT);
std::vector<TStructField> address_fields;
TStructField street_field;
street_field.__set_name("street");
street_field.__set_contains_null(true);
address_fields.push_back(street_field);
TStructField city_field;
city_field.__set_name("city");
city_field.__set_contains_null(true);
address_fields.push_back(city_field);
TStructField coordinates_field;
coordinates_field.__set_name("coordinates");
coordinates_field.__set_contains_null(true);
address_fields.push_back(coordinates_field);
address_node.__set_struct_fields(address_fields);
type.types.push_back(address_node);
TTypeNode street_node;
street_node.__set_type(TTypeNodeType::SCALAR);
TScalarType street_scalar;
street_scalar.__set_type(TPrimitiveType::STRING);
street_node.__set_scalar_type(street_scalar);
type.types.push_back(street_node);
TTypeNode city_node;
city_node.__set_type(TTypeNodeType::SCALAR);
TScalarType city_scalar;
city_scalar.__set_type(TPrimitiveType::STRING);
city_node.__set_scalar_type(city_scalar);
type.types.push_back(city_node);
TTypeNode coordinates_node;
coordinates_node.__set_type(TTypeNodeType::STRUCT);
std::vector<TStructField> coordinates_fields;
TStructField lat_field;
lat_field.__set_name("lat");
lat_field.__set_contains_null(true);
coordinates_fields.push_back(lat_field);
TStructField lng_field;
lng_field.__set_name("lng");
lng_field.__set_contains_null(true);
coordinates_fields.push_back(lng_field);
coordinates_node.__set_struct_fields(coordinates_fields);
type.types.push_back(coordinates_node);
TTypeNode lat_node;
lat_node.__set_type(TTypeNodeType::SCALAR);
TScalarType lat_scalar;
lat_scalar.__set_type(TPrimitiveType::DOUBLE);
lat_node.__set_scalar_type(lat_scalar);
type.types.push_back(lat_node);
TTypeNode lng_node;
lng_node.__set_type(TTypeNodeType::SCALAR);
TScalarType lng_scalar;
lng_scalar.__set_type(TPrimitiveType::DOUBLE);
lng_node.__set_scalar_type(lng_scalar);
type.types.push_back(lng_node);
TTypeNode contact_node;
contact_node.__set_type(TTypeNodeType::STRUCT);
std::vector<TStructField> contact_fields;
TStructField email_field;
email_field.__set_name("email");
email_field.__set_contains_null(true);
contact_fields.push_back(email_field);
TStructField phone_field;
phone_field.__set_name("phone");
phone_field.__set_contains_null(true);
contact_fields.push_back(phone_field);
contact_node.__set_struct_fields(contact_fields);
type.types.push_back(contact_node);
TTypeNode email_node;
email_node.__set_type(TTypeNodeType::SCALAR);
TScalarType email_scalar;
email_scalar.__set_type(TPrimitiveType::STRING);
email_node.__set_scalar_type(email_scalar);
type.types.push_back(email_node);
TTypeNode phone_node;
phone_node.__set_type(TTypeNodeType::STRUCT);
std::vector<TStructField> phone_fields;
TStructField country_code_field;
country_code_field.__set_name("country_code");
country_code_field.__set_contains_null(true);
phone_fields.push_back(country_code_field);
TStructField number_field;
number_field.__set_name("number");
number_field.__set_contains_null(true);
phone_fields.push_back(number_field);
phone_node.__set_struct_fields(phone_fields);
type.types.push_back(phone_node);
TTypeNode country_code_node;
country_code_node.__set_type(TTypeNodeType::SCALAR);
TScalarType country_code_scalar;
country_code_scalar.__set_type(TPrimitiveType::STRING);
country_code_node.__set_scalar_type(country_code_scalar);
type.types.push_back(country_code_node);
TTypeNode number_node;
number_node.__set_type(TTypeNodeType::SCALAR);
TScalarType number_scalar;
number_scalar.__set_type(TPrimitiveType::STRING);
number_node.__set_scalar_type(number_scalar);
type.types.push_back(number_node);
TTypeNode hobbies_node;
hobbies_node.__set_type(TTypeNodeType::ARRAY);
hobbies_node.__set_contains_nulls({true});
type.types.push_back(hobbies_node);
TTypeNode hobby_element_node;
hobby_element_node.__set_type(TTypeNodeType::STRUCT);
std::vector<TStructField> hobby_element_fields;
TStructField hobby_name_field;
hobby_name_field.__set_name("name");
hobby_name_field.__set_contains_null(true);
hobby_element_fields.push_back(hobby_name_field);
TStructField hobby_level_field;
hobby_level_field.__set_name("level");
hobby_level_field.__set_contains_null(true);
hobby_element_fields.push_back(hobby_level_field);
hobby_element_node.__set_struct_fields(hobby_element_fields);
type.types.push_back(hobby_element_node);
TTypeNode hobby_name_node;
hobby_name_node.__set_type(TTypeNodeType::SCALAR);
TScalarType hobby_name_scalar;
hobby_name_scalar.__set_type(TPrimitiveType::STRING);
hobby_name_node.__set_scalar_type(hobby_name_scalar);
type.types.push_back(hobby_name_node);
TTypeNode hobby_level_node;
hobby_level_node.__set_type(TTypeNodeType::SCALAR);
TScalarType hobby_level_scalar;
hobby_level_scalar.__set_type(TPrimitiveType::INT);
hobby_level_node.__set_scalar_type(hobby_level_scalar);
type.types.push_back(hobby_level_node);
tslot_desc.__set_slotType(type);
} else {
// Regular type
TTypeNode node;
node.__set_type(TTypeNodeType::SCALAR);
TScalarType scalar_type;
scalar_type.__set_type(types[i]);
node.__set_scalar_type(scalar_type);
type.types.push_back(node);
tslot_desc.__set_slotType(type);
}
tslot_desc.__set_id(i);
tslot_desc.__set_parent(0);
tslot_desc.__set_colName(table_column_names[i]);
tslot_desc.__set_columnPos(table_column_positions[i]);
tslot_desc.__set_byteOffset(0);
tslot_desc.__set_nullIndicatorByte(0);
tslot_desc.__set_nullIndicatorBit(-1);
tslot_desc.__set_slotIdx(0);
tslot_desc.__set_isMaterialized(true);
// Set column_access_paths only for the profile field
if (table_column_names[i] == "profile") {
{
std::vector<TColumnAccessPath> access_paths;
// address.coordinates.lat
TColumnAccessPath path1;
path1.__set_type(doris::TAccessPathType::DATA);
TDataAccessPath data_path1;
data_path1.__set_path({"profile", "address", "coordinates", "lat"});
path1.__set_data_access_path(data_path1);
access_paths.push_back(path1);
// address.coordinates.lng
TColumnAccessPath path2;
path2.__set_type(doris::TAccessPathType::DATA);
TDataAccessPath data_path2;
data_path2.__set_path({"profile", "address", "coordinates", "lng"});
path2.__set_data_access_path(data_path2);
access_paths.push_back(path2);
// contact.email
TColumnAccessPath path3;
path3.__set_type(doris::TAccessPathType::DATA);
TDataAccessPath data_path3;
data_path3.__set_path({"profile", "contact", "email"});
path3.__set_data_access_path(data_path3);
access_paths.push_back(path3);
// hobbies[].element.level
TColumnAccessPath path4;
path4.__set_type(doris::TAccessPathType::DATA);
TDataAccessPath data_path4;
data_path4.__set_path({"profile", "hobbies", "*", "level"});
path4.__set_data_access_path(data_path4);
access_paths.push_back(path4);
tslot_desc.__set_all_access_paths(access_paths);
}
}
t_desc_table.slotDescriptors.push_back(tslot_desc);
}
t_desc_table.__isset.slotDescriptors = true;
TTupleDescriptor t_tuple_desc;
t_tuple_desc.__set_id(0);
t_tuple_desc.__set_byteSize(16);
t_tuple_desc.__set_numNullBytes(0);
t_tuple_desc.__set_tableId(0);
t_tuple_desc.__isset.tableId = true;
t_desc_table.tupleDescriptors.push_back(t_tuple_desc);
};
create_table_desc(t_desc_table, t_table_desc, column_names, column_positions, column_types);
EXPECT_TRUE(DescriptorTbl::create(&obj_pool, t_desc_table, desc_tbl).ok());
return (*desc_tbl)->get_tuple_descriptor(0);
}
// Helper function to recursively print column row counts and check size > 0
void verify_test_results(Block& block, size_t read_rows) {
// Verify that we read some data
EXPECT_GT(read_rows, 0) << "Should read at least one row";
EXPECT_EQ(block.rows(), read_rows);
// Verify column count matches expected (2 columns: name, profile)
EXPECT_EQ(block.columns(), 2);
// Verify column names and types
auto columns_with_names = block.get_columns_with_type_and_name();
std::vector<std::string> expected_column_names = {"name", "profile"};
for (size_t i = 0; i < expected_column_names.size(); i++) {
EXPECT_EQ(columns_with_names[i].name, expected_column_names[i]);
}
// Verify column types
EXPECT_TRUE(columns_with_names[0].type->get_name().find("String") !=
std::string::npos); // name is STRING
EXPECT_TRUE(columns_with_names[1].type->get_name().find("Struct") !=
std::string::npos); // profile is STRUCT
// Print row count for each column and nested subcolumns
std::cout << "Block rows: " << block.rows() << std::endl;
// Helper function to recursively print column row counts
std::function<void(const ColumnPtr&, const DataTypePtr&, const std::string&, int)>
print_column_rows = [&](const ColumnPtr& col, const DataTypePtr& type,
const std::string& name, int depth) {
std::string indent(depth * 2, ' ');
std::cout << indent << name << " row count: " << col->size() << std::endl;
EXPECT_GT(col->size(), 0) << name << " column/subcolumn size should be > 0";
// Check if it's a nullable column
if (const auto* nullable_col =
check_and_get_column<ColumnNullable>(col.get())) {
auto nested_type =
assert_cast<const DataTypeNullable*>(type.get())->get_nested_type();
// Only add ".nested" suffix for non-leaf (complex) nullable columns
// Leaf columns like String, Int, etc. should not get the ".nested" suffix
bool is_complex_type =
(typeid_cast<const DataTypeStruct*>(nested_type.get()) !=
nullptr) ||
(typeid_cast<const DataTypeArray*>(nested_type.get()) != nullptr) ||
(typeid_cast<const DataTypeMap*>(nested_type.get()) != nullptr);
std::string nested_name = is_complex_type ? name + ".nested" : name;
print_column_rows(nullable_col->get_nested_column_ptr(), nested_type,
nested_name, depth + (is_complex_type ? 1 : 0));
}
// Check if it's a struct column
else if (const auto* struct_col =
check_and_get_column<ColumnStruct>(col.get())) {
auto struct_type = assert_cast<const DataTypeStruct*>(type.get());
for (size_t i = 0; i < struct_col->tuple_size(); ++i) {
std::string field_name = struct_type->get_element_name(i);
auto field_type = struct_type->get_element(i);
print_column_rows(struct_col->get_column_ptr(i), field_type,
name + "." + field_name, depth + 1);
}
}
// Check if it's an array column
else if (const auto* array_col = check_and_get_column<ColumnArray>(col.get())) {
auto array_type = assert_cast<const DataTypeArray*>(type.get());
auto element_type = array_type->get_nested_type();
print_column_rows(array_col->get_data_ptr(), element_type, name + ".data",
depth + 1);
}
};
// Print row counts for all columns
for (size_t i = 0; i < block.columns(); ++i) {
const auto& column_with_name = block.get_by_position(i);
print_column_rows(column_with_name.column, column_with_name.type, column_with_name.name,
0);
EXPECT_EQ(column_with_name.column->size(), block.rows())
<< "Column " << column_with_name.name << " size mismatch";
}
}
std::unique_ptr<doris::FileMetaCache> cache;
cctz::time_zone timezone_obj;
};
// Test reading real Hive Parquet file using HiveTableReader
TEST_F(HiveReaderTest, read_hive_parquet_file) {
// Read only: name, profile.address.coordinates.lat, profile.address.coordinates.lng, profile.contact.email
// Setup table descriptor for test columns with new schema:
/**
Schema:
message table {
required int64 id = 1;
required binary name (STRING) = 2;
required group profile = 3 {
optional group address = 4 {
optional binary street (STRING) = 7;
optional binary city (STRING) = 8;
optional group coordinates = 9 {
optional double lat = 10;
optional double lng = 11;
}
}
optional group contact = 5 {
optional binary email (STRING) = 12;
optional group phone = 13 {
optional binary country_code (STRING) = 14;
optional binary number (STRING) = 15;
}
}
optional group hobbies (LIST) = 6 {
repeated group list {
optional group element = 16 {
optional binary name (STRING) = 17;
optional int32 level = 18;
}
}
}
}
}
*/
// Open the Hive Parquet test file
auto local_fs = io::global_local_filesystem();
io::FileReaderSPtr file_reader;
std::string test_file =
"./be/test/exec/test_data/complex_user_profiles_iceberg_parquet/data/"
"00000-0-a0022aad-d3b6-4e73-b181-f0a09aac7034-0-00001.parquet";
auto st = local_fs->open_file(test_file, &file_reader);
if (!st.ok()) {
GTEST_SKIP() << "Test file not found: " << test_file;
return;
}
// Setup runtime state
RuntimeState runtime_state = RuntimeState(TQueryOptions(), TQueryGlobals());
// Setup scan parameters
TFileScanRangeParams scan_params;
scan_params.format_type = TFileFormatType::FORMAT_PARQUET;
TFileRangeDesc scan_range;
scan_range.start_offset = 0;
scan_range.size = file_reader->size(); // Read entire file
scan_range.path = test_file;
// Create mock profile
RuntimeProfile profile("test_profile");
// Create HiveParquetReader (directly inherits ParquetReader)
cctz::time_zone ctz;
TimezoneUtils::find_cctz_time_zone(TimezoneUtils::default_time_zone, ctz);
auto hive_reader =
std::make_unique<HiveParquetReader>(&profile, scan_params, scan_range, 1024, &ctz,
nullptr, &runtime_state, nullptr, cache.get());
// Set file reader for the hive reader (inherited from ParquetReader)
hive_reader->set_file_reader(file_reader);
// Create complex struct types using helper function
DataTypePtr coordinates_struct_type, address_struct_type, phone_struct_type;
DataTypePtr contact_struct_type, hobby_element_struct_type, hobbies_array_type;
DataTypePtr profile_struct_type, name_type;
create_complex_struct_types(coordinates_struct_type, address_struct_type, phone_struct_type,
contact_struct_type, hobby_element_struct_type, hobbies_array_type,
profile_struct_type, name_type);
// Create tuple descriptor using helper function
DescriptorTbl* desc_tbl;
ObjectPool obj_pool;
TDescriptorTable t_desc_table;
TTableDescriptor t_table_desc;
std::vector<std::string> table_column_names = {"name", "profile"};
std::vector<int> table_column_positions = {1, 2};
std::vector<TPrimitiveType::type> table_column_types = {
TPrimitiveType::STRING, TPrimitiveType::STRUCT // profile 用 STRUCT 类型
};
const TupleDescriptor* tuple_descriptor =
create_tuple_descriptor(&desc_tbl, obj_pool, t_desc_table, t_table_desc,
table_column_names, table_column_positions, table_column_types);
std::unordered_map<std::string, uint32_t> col_name_to_block_idx = {{"name", 0}, {"profile", 1}};
// Use the template method init_reader (inherited from ParquetReader)
// on_before_init_columns hook in HiveParquetReader will do schema matching
ParquetInitContext pq_ctx;
pq_ctx.column_names = table_column_names;
pq_ctx.col_name_to_block_idx = &col_name_to_block_idx;
pq_ctx.tuple_descriptor = tuple_descriptor;
pq_ctx.params = &scan_params;
pq_ctx.range = &scan_range;
st = hive_reader->init_reader(&pq_ctx);
ASSERT_TRUE(st.ok()) << st;
// set_fill_columns logic is now inlined in _do_init_reader,
// so no separate call is needed.
// Create block for reading nested structure (not flattened)
Block block;
{
MutableColumnPtr name_column = name_type->create_column();
block.insert(ColumnWithTypeAndName(std::move(name_column), name_type, "name"));
// Add profile column (nested struct)
MutableColumnPtr profile_column = profile_struct_type->create_column();
block.insert(
ColumnWithTypeAndName(std::move(profile_column), profile_struct_type, "profile"));
}
// Read data from the file
size_t read_rows = 0;
bool eof = false;
st = hive_reader->get_next_block(&block, &read_rows, &eof);
ASSERT_TRUE(st.ok()) << st;
// Verify test results using helper function
verify_test_results(block, read_rows);
}
// Test reading real Hive Orc file using HiveTableReader
TEST_F(HiveReaderTest, read_hive_rrc_file) {
// Read only: name, profile.address.coordinates.lat, profile.address.coordinates.lng, profile.contact.email
// Setup table descriptor for test columns with new schema:
/**
Schema:
message table {
required int64 id = 1;
required binary name (STRING) = 2;
required group profile = 3 {
optional group address = 4 {
optional binary street (STRING) = 7;
optional binary city (STRING) = 8;
optional group coordinates = 9 {
optional double lat = 10;
optional double lng = 11;
}
}
optional group contact = 5 {
optional binary email (STRING) = 12;
optional group phone = 13 {
optional binary country_code (STRING) = 14;
optional binary number (STRING) = 15;
}
}
optional group hobbies (LIST) = 6 {
repeated group list {
optional group element = 16 {
optional binary name (STRING) = 17;
optional int32 level = 18;
}
}
}
}
}
*/
// Open the Hive Orc test file
auto local_fs = io::global_local_filesystem();
io::FileReaderSPtr file_reader;
std::string test_file =
"./be/test/exec/test_data/complex_user_profiles_iceberg_orc/data/"
"00000-0-e4897963-0081-4127-bebe-35dc7dc1edeb-0-00001.orc";
auto st = local_fs->open_file(test_file, &file_reader);
if (!st.ok()) {
GTEST_SKIP() << "Test file not found: " << test_file;
return;
}
// Setup runtime state
RuntimeState runtime_state = RuntimeState(TQueryOptions(), TQueryGlobals());
// Setup scan parameters
TFileScanRangeParams scan_params;
scan_params.format_type = TFileFormatType::FORMAT_ORC;
TFileRangeDesc scan_range;
scan_range.start_offset = 0;
scan_range.size = file_reader->size(); // Read entire file
scan_range.path = test_file;
// Create mock profile
RuntimeProfile profile("test_profile");
// Create HiveOrcReader (directly inherits OrcReader)
auto hive_reader =
std::make_unique<HiveOrcReader>(&profile, &runtime_state, scan_params, scan_range, 1024,
"CST", nullptr, nullptr, cache.get());
// Create complex struct types using helper function
DataTypePtr coordinates_struct_type, address_struct_type, phone_struct_type;
DataTypePtr contact_struct_type, hobby_element_struct_type, hobbies_array_type;
DataTypePtr profile_struct_type, name_type;
create_complex_struct_types(coordinates_struct_type, address_struct_type, phone_struct_type,
contact_struct_type, hobby_element_struct_type, hobbies_array_type,
profile_struct_type, name_type);
// Create tuple descriptor using helper function
DescriptorTbl* desc_tbl;
ObjectPool obj_pool;
TDescriptorTable t_desc_table;
TTableDescriptor t_table_desc;
std::vector<std::string> table_column_names = {"name", "profile"};
std::vector<int> table_column_positions = {1, 2};
std::vector<TPrimitiveType::type> table_column_types = {
TPrimitiveType::STRING, TPrimitiveType::STRUCT // profile 用 STRUCT 类型
};
const TupleDescriptor* tuple_descriptor =
create_tuple_descriptor(&desc_tbl, obj_pool, t_desc_table, t_table_desc,
table_column_names, table_column_positions, table_column_types);
std::unordered_map<std::string, uint32_t> col_name_to_block_idx = {{"name", 0}, {"profile", 1}};
OrcInitContext orc_ctx;
orc_ctx.column_names = table_column_names;
orc_ctx.col_name_to_block_idx = &col_name_to_block_idx;
orc_ctx.tuple_descriptor = tuple_descriptor;
orc_ctx.params = &scan_params;
orc_ctx.range = &scan_range;
st = hive_reader->init_reader(&orc_ctx);
ASSERT_TRUE(st.ok()) << st;
// set_fill_columns logic is now inlined in _do_init_reader,
// so no separate call is needed.
// Create block for reading nested structure (not flattened)
Block block;
{
MutableColumnPtr name_column = name_type->create_column();
block.insert(ColumnWithTypeAndName(std::move(name_column), name_type, "name"));
// Add profile column (nested struct)
MutableColumnPtr profile_column = profile_struct_type->create_column();
block.insert(
ColumnWithTypeAndName(std::move(profile_column), profile_struct_type, "profile"));
}
// Read data from the file
size_t read_rows = 0;
bool eof = false;
st = hive_reader->get_next_block(&block, &read_rows, &eof);
ASSERT_TRUE(st.ok()) << st;
// Verify test results using helper function
verify_test_results(block, read_rows);
}
} // namespace doris::table