blob: d83445129cbb602df8f6f07a6d3d15950779cbad [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "config.h"
#if USE_PARQUET && USE_ARROW
#include <iostream>
#include <Core/Block.h>
#include <Core/ColumnWithTypeAndName.h>
#include <Core/Field.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypeMap.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeString.h>
#include <Processors/Formats/Impl/ArrowColumnToCHColumn.h>
#include <Processors/Formats/Impl/CHColumnToArrowColumn.h>
#include <arrow/table.h>
#include <gtest/gtest.h>
using namespace DB;
template <typename Column>
static void printColumns(const std::vector<Column> & columns)
{
for (size_t col_i = 0; col_i < columns.size(); ++col_i)
for (size_t row_i = 0; row_i < columns[col_i]->size(); ++row_i)
std::cout << "col:" << col_i << ",row:" << row_i << "," << toString((*columns[col_i])[row_i]) << std::endl;
}
template <typename Column>
static bool haveEqualColumns(const Column & lhs, const Column & rhs)
{
if (lhs->size() != rhs->size())
return false;
for (size_t row_i = 0; row_i < lhs->size(); ++row_i)
if ((*lhs)[row_i] != (*rhs)[row_i])
return false;
return true;
}
TEST(ParquetWrite, ComplexTypes)
{
Block header;
/// map field
{
ColumnWithTypeAndName col;
col.name = "map_field";
String str_type = "Nullable(Map(Int32, Nullable(Int64)))";
col.type = DataTypeFactory::instance().get(str_type);
header.insert(std::move(col));
}
/// map field with null
{
ColumnWithTypeAndName col;
col.name = "map_field_with_null";
String str_type = "Nullable(Map(Int32, Nullable(Int64)))";
col.type = DataTypeFactory::instance().get(str_type);
header.insert(std::move(col));
}
/// map field with empty map
{
ColumnWithTypeAndName col;
col.name = "map_field_with_empty_map";
String str_type = "Nullable(Map(Int32, Nullable(Int64)))";
col.type = DataTypeFactory::instance().get(str_type);
header.insert(std::move(col));
}
/// array field
{
ColumnWithTypeAndName col;
col.name = "array_field";
String str_type = "Nullable(Array(Nullable(Int32)))";
col.type = DataTypeFactory::instance().get(str_type);
header.insert(std::move(col));
}
/// array field with null
{
ColumnWithTypeAndName col;
col.name = "array_field_with_null";
String str_type = "Nullable(Array(Nullable(Int32)))";
col.type = DataTypeFactory::instance().get(str_type);
header.insert(std::move(col));
}
/// array field with empty array
{
ColumnWithTypeAndName col;
col.name = "array_field_with_empty_array";
String str_type = "Nullable(Array(Nullable(Int32)))";
col.type = DataTypeFactory::instance().get(str_type);
header.insert(std::move(col));
}
/// struct_field
{
ColumnWithTypeAndName col;
col.name = "struct_field";
String str_type = "Nullable(Tuple(Nullable(Int32), Nullable(Int64)))";
col.type = DataTypeFactory::instance().get(str_type);
header.insert(std::move(col));
}
/// struct field with null
{
ColumnWithTypeAndName col;
col.name = "struct_field_with_null";
String str_type = "Nullable(Tuple(Nullable(Int32), Nullable(Int64)))";
col.type = DataTypeFactory::instance().get(str_type);
header.insert(std::move(col));
}
auto mutable_columns = header.mutateColumns();
size_t rows = 10;
for (size_t i = 0; i < rows; i++)
{
/// map_field
{
Map map;
Tuple tuple1{Int32(i), Int64(i + 1)};
map.emplace_back(std::move(tuple1));
Tuple tuple2{Int32(i + 2), Int64(i + 3)};
map.emplace_back(std::move(tuple2));
mutable_columns[0]->insert(std::move(map));
}
/// map_field_with_null
{
mutable_columns[1]->insert({});
}
/// map_field_with_empty_map
{
mutable_columns[2]->insert(Map{});
}
/// array_field
{
Array array;
array.emplace_back(Int32(i));
array.emplace_back(Int32(i + 1));
mutable_columns[3]->insert(std::move(array));
}
/// array_field_with_null
{
mutable_columns[4]->insert({});
}
/// array_field_with_empty_array
{
mutable_columns[5]->insert(Array{});
}
/// struct_field
{
Tuple tuple;
tuple.emplace_back(Int32(i));
tuple.emplace_back(Int64(i + 1));
mutable_columns[6]->insert(std::move(tuple));
}
/// struct_field_with_null
{
mutable_columns[7]->insert({});
}
}
/// Convert CH Block to Arrow Table
std::shared_ptr<arrow::Table> arrow_table;
FormatSettings format_settings;
CHColumnToArrowColumn ch2arrow(
header,
"Parquet",
CHColumnToArrowColumn::Settings{
format_settings.arrow.output_string_as_string,
format_settings.arrow.output_fixed_string_as_fixed_byte_array,
format_settings.arrow.low_cardinality_as_dictionary,
format_settings.arrow.use_signed_indexes_for_dictionary,
format_settings.arrow.use_64_bit_indexes_for_dictionary});
Chunk input_chunk{std::move(mutable_columns), rows};
std::vector<Chunk> input_chunks;
input_chunks.push_back(std::move(input_chunk));
ch2arrow.chChunkToArrowTable(arrow_table, input_chunks, header.columns());
/// Convert Arrow Table to CH Block
ArrowColumnToCHColumn arrow2ch(
header, "Parquet", format_settings, std::nullopt, std::nullopt, true, true, FormatSettings::DateTimeOverflowBehavior::Ignore, false);
Chunk output_chunk = arrow2ch.arrowTableToCHChunk(arrow_table, arrow_table->num_rows(), nullptr, nullptr);
/// Compare input and output columns
const auto & input_columns = input_chunks.back().getColumns();
const auto & output_columns = output_chunk.getColumns();
for (size_t col_i = 0; col_i < header.columns(); ++col_i)
EXPECT_TRUE(haveEqualColumns(input_columns[col_i], output_columns[col_i]));
}
#endif