blob: 29ae716ce7921d5225f352035995ec9c7ad5066a [file] [log] [blame]
/*
* Copyright 2024-present Alibaba Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <memory>
#include "arrow/api.h"
#include "paimon/common/utils/arrow/status_utils.h"
#include "paimon/result.h"
namespace paimon::test {
class DictArrayConverter {
public:
DictArrayConverter() = delete;
~DictArrayConverter() = delete;
// deep copy dictionary array to string array/binary array
static Result<std::shared_ptr<arrow::Array>> ConvertDictArray(
const std::shared_ptr<arrow::Array>& array, arrow::MemoryPool* pool) {
arrow::Type::type kind = array->type_id();
switch (kind) {
case arrow::Type::type::STRUCT: {
// convert array
auto struct_array =
arrow::internal::checked_pointer_cast<arrow::StructArray>(array);
arrow::ArrayVector new_children;
std::size_t size = struct_array->fields().size();
for (size_t i = 0; i < size; i++) {
std::shared_ptr<arrow::Array> child = struct_array->field(static_cast<int>(i));
PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> new_child,
ConvertDictArray(child, pool));
new_children.push_back(new_child);
}
// convert type
arrow::FieldVector fields;
fields.reserve(new_children.size());
for (size_t i = 0; i < new_children.size(); i++) {
// Note: For test consistency, intentionally left nullable unspecified, as ORC
// discard nullable information, making it impossible to align.
// Moreover, this detail is currently not important for users.
fields.push_back(arrow::field(struct_array->type()->field(i)->name(),
new_children[i]->type()));
}
return std::make_shared<arrow::StructArray>(arrow::struct_(fields),
struct_array->length(), new_children,
struct_array->null_bitmap());
}
case arrow::Type::type::LIST: {
auto list_array = arrow::internal::checked_pointer_cast<arrow::ListArray>(array);
PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> value_array,
ConvertDictArray(list_array->values(), pool));
return std::make_shared<arrow::ListArray>(
arrow::list(value_array->type()), list_array->length(),
list_array->value_offsets(), value_array, list_array->null_bitmap(),
list_array->null_count(), list_array->offset());
}
case arrow::Type::type::MAP: {
auto map_array = arrow::internal::checked_pointer_cast<arrow::MapArray>(array);
PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> key_array,
ConvertDictArray(map_array->keys(), pool));
PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> item_array,
ConvertDictArray(map_array->items(), pool));
auto map_type =
arrow::internal::checked_pointer_cast<arrow::MapType>(map_array->type());
auto new_map_type = std::make_shared<arrow::MapType>(
key_array->type(), item_array->type(), map_type->keys_sorted());
return std::make_shared<arrow::MapArray>(
new_map_type, map_array->length(), map_array->value_offsets(), key_array,
item_array, map_array->null_bitmap(), map_array->null_count(),
map_array->offset());
}
case arrow::Type::type::DICTIONARY: {
auto dict_array =
arrow::internal::checked_pointer_cast<arrow::DictionaryArray>(array);
auto dict_type = arrow::internal::checked_pointer_cast<arrow::DictionaryType>(
dict_array->type());
auto value_type_id = dict_type->value_type()->id();
auto index_type_id = dict_type->index_type()->id();
if (value_type_id == arrow::Type::type::STRING &&
index_type_id == arrow::Type::type::INT32) {
return ConvertDictionaryArrayToBinaryArray<
arrow::StringArray, arrow::Int32Array, arrow::StringBuilder>(dict_array,
pool);
} else if (value_type_id == arrow::Type::type::LARGE_STRING &&
index_type_id == arrow::Type::type::INT64) {
return ConvertDictionaryArrayToBinaryArray<
arrow::LargeStringArray, arrow::Int64Array, arrow::StringBuilder>(
dict_array, pool);
} else {
return Status::Invalid(
"only support [STRING, INT32] or [LARGE_STRING, INT64] for "
"DictionaryArray");
}
}
default: {
return array;
}
}
}
private:
template <typename DictArrayType, typename IndicesArrayType, typename BuilderType>
static Result<std::shared_ptr<arrow::Array>> ConvertDictionaryArrayToBinaryArray(
const std::shared_ptr<arrow::DictionaryArray>& dict_array, arrow::MemoryPool* pool) {
auto dictionary = std::dynamic_pointer_cast<DictArrayType>(dict_array->dictionary());
auto indices = std::dynamic_pointer_cast<IndicesArrayType>(dict_array->indices());
auto string_builder = std::make_shared<BuilderType>(pool);
for (int64_t i = 0; i < dict_array->length(); ++i) {
if (dict_array->IsNull(i)) {
PAIMON_RETURN_NOT_OK_FROM_ARROW(string_builder->AppendNull());
} else {
int64_t dict_index = indices->Value(i);
PAIMON_RETURN_NOT_OK_FROM_ARROW(
string_builder->Append(dictionary->GetString(dict_index)));
}
}
std::shared_ptr<arrow::Array> string_array;
PAIMON_RETURN_NOT_OK_FROM_ARROW(string_builder->Finish(&string_array));
return string_array;
}
};
} // namespace paimon::test