// blob: 8841ab2df1a46746d46a2712cf1dd92a81175659 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "exec/csv_scan_node.h"

#include <gtest/gtest.h>

#include <boost/scoped_ptr.hpp>
#include <cstdlib>
#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <vector>

#include "gen_cpp/PlanNodes_types.h"
#include "gen_cpp/Types_types.h"
#include "runtime/row_batch.h"
#include "runtime/runtime_state.h"
#include "runtime/tuple_row.h"
#include "util/cpu_info.h"
#include "util/debug_util.h"
#include "util/disk_info.h"
#include "util/logging.h"
namespace doris {
class CsvScanNodeTest : public testing::Test {
public:
CsvScanNodeTest() {}
~CsvScanNodeTest() {}
protected:
virtual void SetUp() {
config::periodic_counter_update_period_ms = 500;
config::storage_root_path = "./data";
_env.reset(new ExecEnv());
system("mkdir -p ./test_run/output/");
system("pwd");
system("cp -r ./be/test/exec/test_data/csv_scan_node ./test_run/.");
init();
}
virtual void TearDown() {
_obj_pool.clear();
_env.reset();
// system("rm -rf ./test_run");
}
void init();
void init_desc_tbl();
void init_runtime_state();
private:
ObjectPool _obj_pool;
TDescriptorTable _t_desc_table;
DescriptorTbl* _desc_tbl;
TPlanNode _tnode;
boost::scoped_ptr<ExecEnv> _env;
RuntimeState* _state;
}; // end class CsvScanNodeTest
void CsvScanNodeTest::init() {
_env->init_for_tests();
init_desc_tbl();
init_runtime_state();
}
void CsvScanNodeTest::init_runtime_state() {
_state = _obj_pool.add(new RuntimeState(TUniqueId(), TQueryOptions(), "", _env.get()));
_state->set_desc_tbl(_desc_tbl);
_state->_load_dir = "./test_run/output/";
_state->init_mem_trackers(TUniqueId());
}
void CsvScanNodeTest::init_desc_tbl() {
// TTableDescriptor
TTableDescriptor t_table_desc;
t_table_desc.id = 0;
t_table_desc.tableType = TTableType::OLAP_TABLE;
t_table_desc.numCols = 0;
t_table_desc.numClusteringCols = 0;
t_table_desc.olapTable.tableName = "test";
t_table_desc.tableName = "test_table_name";
t_table_desc.dbName = "test_db_name";
t_table_desc.__isset.olapTable = true;
_t_desc_table.tableDescriptors.push_back(t_table_desc);
_t_desc_table.__isset.tableDescriptors = true;
// TSlotDescriptor
std::vector<TSlotDescriptor> slot_descs;
int offset = 1;
int i = 0;
// int_column
{
TSlotDescriptor t_slot_desc;
t_slot_desc.__set_id(i);
t_slot_desc.__set_slotType(gen_type_desc(TPrimitiveType::INT));
t_slot_desc.__set_columnPos(i);
t_slot_desc.__set_byteOffset(offset);
t_slot_desc.__set_nullIndicatorByte(0);
t_slot_desc.__set_nullIndicatorBit(-1);
t_slot_desc.__set_slotIdx(i);
t_slot_desc.__set_isMaterialized(true);
t_slot_desc.__set_colName("int_column");
slot_descs.push_back(t_slot_desc);
offset += sizeof(int32_t);
}
++i;
// decimal_column
{
TSlotDescriptor t_slot_desc;
t_slot_desc.__set_id(i);
TTypeDesc ttype = gen_type_desc(TPrimitiveType::DECIMAL);
ttype.types[0].scalar_type.__set_precision(10);
ttype.types[0].scalar_type.__set_scale(5);
t_slot_desc.__set_slotType(ttype);
t_slot_desc.__set_columnPos(i);
t_slot_desc.__set_byteOffset(offset);
t_slot_desc.__set_nullIndicatorByte(0);
t_slot_desc.__set_nullIndicatorBit(-1);
t_slot_desc.__set_slotIdx(i);
t_slot_desc.__set_isMaterialized(true);
t_slot_desc.__set_colName("decimal_column");
slot_descs.push_back(t_slot_desc);
offset += sizeof(DecimalValue);
}
++i;
// date_column
{
TSlotDescriptor t_slot_desc;
t_slot_desc.__set_id(i);
t_slot_desc.__set_slotType(gen_type_desc(TPrimitiveType::DATE));
t_slot_desc.__set_columnPos(i);
t_slot_desc.__set_byteOffset(offset);
t_slot_desc.__set_nullIndicatorByte(0);
t_slot_desc.__set_nullIndicatorBit(-1);
t_slot_desc.__set_slotIdx(i);
t_slot_desc.__set_isMaterialized(true);
t_slot_desc.__set_colName("date_column");
slot_descs.push_back(t_slot_desc);
offset += sizeof(DateTimeValue);
}
++i;
// fix_len_string_column
{
TSlotDescriptor t_slot_desc;
t_slot_desc.__set_id(i);
TTypeDesc ttype = gen_type_desc(TPrimitiveType::CHAR);
ttype.types[0].scalar_type.__set_len(5);
t_slot_desc.__set_slotType(ttype);
t_slot_desc.__set_columnPos(i);
t_slot_desc.__set_byteOffset(offset);
t_slot_desc.__set_nullIndicatorByte(0);
t_slot_desc.__set_nullIndicatorBit(-1);
t_slot_desc.__set_slotIdx(i);
t_slot_desc.__set_isMaterialized(true);
t_slot_desc.__set_colName("fix_len_string_column");
slot_descs.push_back(t_slot_desc);
offset += sizeof(StringValue);
}
_t_desc_table.__set_slotDescriptors(slot_descs);
// TTupleDescriptor
TTupleDescriptor t_tuple_desc;
t_tuple_desc.id = 0;
t_tuple_desc.byteSize = offset;
t_tuple_desc.numNullBytes = 1;
t_tuple_desc.tableId = 0;
t_tuple_desc.__isset.tableId = true;
_t_desc_table.tupleDescriptors.push_back(t_tuple_desc);
DescriptorTbl::create(&_obj_pool, _t_desc_table, &_desc_tbl);
// node
_tnode.node_id = 0;
_tnode.node_type = TPlanNodeType::CSV_SCAN_NODE;
_tnode.num_children = 0;
_tnode.limit = -1;
_tnode.row_tuples.push_back(0);
_tnode.nullable_tuples.push_back(false);
_tnode.csv_scan_node.tuple_id = 0;
_tnode.csv_scan_node.__set_column_separator(",");
_tnode.csv_scan_node.__set_line_delimiter("\n");
// column_type_mapping
std::map<std::string, TColumnType> column_type_map;
{
TColumnType column_type;
column_type.__set_type(TPrimitiveType::INT);
column_type_map["int_column"] = column_type;
}
{
TColumnType column_type;
column_type.__set_type(TPrimitiveType::DECIMAL);
column_type.__set_precision(10);
column_type.__set_scale(5);
column_type_map["decimal_column"] = column_type;
}
{
TColumnType column_type;
column_type.__set_type(TPrimitiveType::DATE);
column_type_map["date_column"] = column_type;
}
{
TColumnType column_type;
column_type.__set_type(TPrimitiveType::BIGINT);
column_type.__set_len(5);
column_type_map["fix_len_string_column"] = column_type;
}
_tnode.csv_scan_node.__set_column_type_mapping(column_type_map);
std::vector<std::string> columns;
columns.push_back("int_column");
columns.push_back("date_column");
columns.push_back("decimal_column");
columns.push_back("fix_len_string_column");
_tnode.csv_scan_node.__set_columns(columns);
_tnode.csv_scan_node.__isset.unspecified_columns = true;
_tnode.csv_scan_node.__isset.default_values = true;
_tnode.csv_scan_node.max_filter_ratio = 0.5;
_tnode.__isset.csv_scan_node = true;
}
// Scans the "normal_use" test file. Each get_next() call must succeed and
// leave exactly 6 rows in the batch, and the first row must print as
// "[(1 -12345.67891 2015-04-20 abc\0\0)]" (35 bytes including the two NULs).
TEST_F(CsvScanNodeTest, NormalUse) {
    const std::vector<std::string> file_paths{"./test_run/csv_scan_node/normal_use"};
    _tnode.csv_scan_node.__set_file_paths(file_paths);

    CsvScanNode scan_node(&_obj_pool, _tnode, *_desc_tbl);
    const Status prepared = scan_node.prepare(_state);
    ASSERT_TRUE(prepared.ok());
    const Status opened = scan_node.open(_state);
    ASSERT_TRUE(opened.ok());

    auto tracker = std::make_shared<MemTracker>();
    const auto& row_desc = scan_node._row_descriptor;
    RowBatch row_batch(row_desc, _state->batch_size(), tracker.get());

    // Explicit length keeps the embedded NUL bytes in the expected value.
    const std::string expected_first_row("[(1 -12345.67891 2015-04-20 abc\0\0)]", 35);

    for (bool eos = false; !eos;) {
        const Status fetched = scan_node.get_next(_state, &row_batch, &eos);
        ASSERT_TRUE(fetched.ok());

        const int num = row_batch.num_rows();
        std::cout << "num: " << num << std::endl;
        ASSERT_EQ(num, 6);

        for (int row_idx = 0; row_idx < num; ++row_idx) {
            TupleRow* row = row_batch.get_row(row_idx);
            const std::string printed = print_row(row, row_desc);
            std::cout << "input row: " << printed << std::endl;
            if (row_idx != 0) {
                continue;
            }
            // Only the first row's content is verified.
            ASSERT_EQ(expected_first_row, printed);
        }
    }

    ASSERT_TRUE(scan_node.close(_state).ok());
}
// Scans the "continuous_delim" test file. Each get_next() call must succeed
// and leave exactly 1 row in the batch, whose string column prints as five
// NUL bytes: "[(1 -12345.67891 2015-04-20 \0\0\0\0\0)]" (35 bytes).
TEST_F(CsvScanNodeTest, continuousDelim) {
    std::vector<std::string> file_paths;
    file_paths.push_back("./test_run/csv_scan_node/continuous_delim");
    _tnode.csv_scan_node.__set_file_paths(file_paths);

    CsvScanNode scan_node(&_obj_pool, _tnode, *_desc_tbl);
    Status status = scan_node.prepare(_state);
    ASSERT_TRUE(status.ok());
    status = scan_node.open(_state);
    ASSERT_TRUE(status.ok());

    // The row batch needs a memory tracker; it was referenced here without
    // ever being declared. Declared before the batch so it outlives it.
    auto tracker = std::make_shared<MemTracker>();
    RowBatch row_batch(scan_node._row_descriptor, _state->batch_size(), tracker.get());

    bool eos = false;
    while (!eos) {
        status = scan_node.get_next(_state, &row_batch, &eos);
        ASSERT_TRUE(status.ok());

        int num = row_batch.num_rows();
        std::cout << "num: " << num << std::endl;
        ASSERT_EQ(num, 1);

        for (int i = 0; i < num; ++i) {
            TupleRow* row = row_batch.get_row(i);
            std::cout << "input row: " << print_row(row, scan_node._row_descriptor) << std::endl;
            if (i == 0) {
                // Explicit length keeps the embedded NUL bytes in the expected value.
                ASSERT_EQ(std::string("[(1 -12345.67891 2015-04-20 \0\0\0\0\0)]", 35),
                          print_row(row, scan_node._row_descriptor));
            }
        }
    }

    ASSERT_TRUE(scan_node.close(_state).ok());
}
// Scans the "wrong_decimal_format" test file. prepare/open/get_next must all
// succeed, no rows may be produced, and close() must report an error.
TEST_F(CsvScanNodeTest, wrong_decimal_format_test) {
    const std::vector<std::string> file_paths{"./test_run/csv_scan_node/wrong_decimal_format"};
    _tnode.csv_scan_node.__set_file_paths(file_paths);

    CsvScanNode scan_node(&_obj_pool, _tnode, *_desc_tbl);
    const Status prepared = scan_node.prepare(_state);
    ASSERT_TRUE(prepared.ok());
    const Status opened = scan_node.open(_state);
    ASSERT_TRUE(opened.ok());

    auto tracker = std::make_shared<MemTracker>();
    RowBatch row_batch(scan_node._row_descriptor, _state->batch_size(), tracker.get());

    for (bool eos = false; !eos;) {
        const Status fetched = scan_node.get_next(_state, &row_batch, &eos);
        ASSERT_TRUE(fetched.ok());

        const int num = row_batch.num_rows();
        std::cout << "num: " << num << std::endl;
        ASSERT_EQ(0, num);
    }

    // close() is expected to fail because max_filter_ratio is reached.
    ASSERT_FALSE(scan_node.close(_state).ok());
}
// Scans the "fill_string_len" test file. Every batch must be non-empty; the
// first row must print as "[(1 12345.67891 2015-04-20 ab\0\0\0)]" (34 bytes)
// and its string slot must report a length of 5.
TEST_F(CsvScanNodeTest, fill_fix_len_stringi_test) {
    const std::vector<std::string> file_paths{"./test_run/csv_scan_node/fill_string_len"};
    _tnode.csv_scan_node.__set_file_paths(file_paths);

    CsvScanNode scan_node(&_obj_pool, _tnode, *_desc_tbl);
    const Status prepared = scan_node.prepare(_state);
    ASSERT_TRUE(prepared.ok());
    const Status opened = scan_node.open(_state);
    ASSERT_TRUE(opened.ok());

    auto tracker = std::make_shared<MemTracker>();
    const auto& row_desc = scan_node._row_descriptor;
    RowBatch row_batch(row_desc, _state->batch_size(), tracker.get());

    // Explicit length keeps the embedded NUL bytes in the expected value.
    const std::string expected_first_row("[(1 12345.67891 2015-04-20 ab\0\0\0)]", 34);

    for (bool eos = false; !eos;) {
        const Status fetched = scan_node.get_next(_state, &row_batch, &eos);
        ASSERT_TRUE(fetched.ok());

        const int num = row_batch.num_rows();
        std::cout << "num: " << num << std::endl;
        ASSERT_TRUE(num > 0);

        // Original note on the input line layout: 1,2015-04-20,12345.67891,abcdefg
        // NOTE(review): the expected output shows "ab" padded to 5 bytes -
        // confirm against the fill_string_len test file.
        for (int row_idx = 0; row_idx < num; ++row_idx) {
            TupleRow* row = row_batch.get_row(row_idx);
            LOG(WARNING) << "input row[" << row_idx << "]: " << print_row(row, row_desc);
            std::cout << "input row: " << print_row(row, row_desc) << std::endl;
            if (row_idx != 0) {
                continue;
            }

            ASSERT_EQ(expected_first_row, print_row(row, row_desc));

            // Slot 3 is fix_len_string_column; read it at its declared byte offset.
            Tuple* tuple = row->get_tuple(0);
            const int string_offset = _t_desc_table.slotDescriptors[3].byteOffset;
            StringValue* str_slot = tuple->get_string_slot(string_offset);
            std::cout << "str_slot len: " << str_slot->len << std::endl;
            ASSERT_EQ(5, str_slot->len);
        }
    }

    ASSERT_TRUE(scan_node.close(_state).ok());
}
// Scans the "wrong_fix_len_string" test file. prepare/open/get_next must all
// succeed, no rows may be produced, and close() must report an error.
TEST_F(CsvScanNodeTest, wrong_fix_len_string_format_test) {
    const std::vector<std::string> file_paths{"./test_run/csv_scan_node/wrong_fix_len_string"};
    _tnode.csv_scan_node.__set_file_paths(file_paths);

    CsvScanNode scan_node(&_obj_pool, _tnode, *_desc_tbl);
    const Status prepared = scan_node.prepare(_state);
    ASSERT_TRUE(prepared.ok());
    const Status opened = scan_node.open(_state);
    ASSERT_TRUE(opened.ok());

    auto tracker = std::make_shared<MemTracker>();
    RowBatch row_batch(scan_node._row_descriptor, _state->batch_size(), tracker.get());

    for (bool eos = false; !eos;) {
        const Status fetched = scan_node.get_next(_state, &row_batch, &eos);
        ASSERT_TRUE(fetched.ok());

        const int num = row_batch.num_rows();
        std::cout << "num: " << num << std::endl;
        ASSERT_EQ(0, num);
    }

    // close() is expected to fail because max_filter_ratio is reached.
    ASSERT_FALSE(scan_node.close(_state).ok());
}
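// Test cases still to be added:
// 1. Importing string columns
// 2. Leaving out columns that have default values
// 3. Columns present in the file but not in the table, skipped by the load command
// 4. max_filter_ratio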
} // end namespace doris
// Test entry point: assigns two config values, initialises logging,
// GoogleTest, CpuInfo and DiskInfo (in that order), then runs all tests.
int main(int argc, char** argv) {
    // Config-file loading is commented out; read_size and min_buffer_size are
    // assigned directly below.
    // std::string conffile = std::string(getenv("DORIS_HOME")) + "/conf/be.conf";
    // if (!doris::config::init(conffile.c_str(), false)) {
    //     fprintf(stderr, "error read config file. \n");
    //     return -1;
    // }
    doris::config::read_size = 8388608; // 8 * 1024 * 1024
    doris::config::min_buffer_size = 1024;

    doris::init_glog("be-test");
    ::testing::InitGoogleTest(&argc, argv);
    doris::CpuInfo::init();
    doris::DiskInfo::init();

    const int exit_code = RUN_ALL_TESTS();
    return exit_code;
}