blob: f7ffe5f0529003988226fb335284fe88f5ae4f83 [file] [log] [blame]
/*
* Copyright 2024-present Alibaba Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "paimon/common/data/columnar/columnar_array.h"
#include <utility>
#include "arrow/api.h"
#include "arrow/array/array_decimal.h"
#include "arrow/array/array_nested.h"
#include "arrow/array/array_primitive.h"
#include "arrow/type_traits.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/decimal.h"
#include "fmt/format.h"
#include "paimon/common/data/columnar/columnar_map.h"
#include "paimon/common/data/columnar/columnar_row.h"
#include "paimon/common/utils/date_time_utils.h"
namespace paimon {
Status ColumnarArray::CheckNoNull() const {
for (int i = 0; i < length_; i++) {
if (IsNullAt(i)) {
return Status::Invalid(fmt::format("row {} is null", i));
}
}
return Status::OK();
}
Decimal ColumnarArray::GetDecimal(int32_t pos, int32_t precision, int32_t scale) const {
using ArrayType = typename arrow::TypeTraits<arrow::Decimal128Type>::ArrayType;
auto array = arrow::internal::checked_cast<const ArrayType*>(array_);
assert(array);
arrow::Decimal128 decimal(array->GetValue(offset_ + pos));
return Decimal(precision, scale,
static_cast<Decimal::int128_t>(decimal.high_bits()) << 64 | decimal.low_bits());
}
Timestamp ColumnarArray::GetTimestamp(int32_t pos, int32_t precision) const {
using ArrayType = typename arrow::TypeTraits<arrow::TimestampType>::ArrayType;
auto array = arrow::internal::checked_cast<const ArrayType*>(array_);
assert(array);
int64_t data = array->Value(offset_ + pos);
auto timestamp_type =
arrow::internal::checked_pointer_cast<arrow::TimestampType>(array->type());
// for orc format, data is saved as nano, therefore, Timestamp convert should consider precision
// in arrow array rather than input precision
DateTimeUtils::TimeType time_type = DateTimeUtils::GetTimeTypeFromArrowType(timestamp_type);
auto [milli, nano] = DateTimeUtils::TimestampConverter(
data, time_type, DateTimeUtils::TimeType::MILLISECOND, DateTimeUtils::TimeType::NANOSECOND);
return Timestamp(milli, nano);
}
std::shared_ptr<InternalArray> ColumnarArray::GetArray(int32_t pos) const {
auto list_array = arrow::internal::checked_cast<const arrow::ListArray*>(array_);
assert(list_array);
int32_t offset = list_array->value_offset(offset_ + pos);
int32_t length = list_array->value_length(offset_ + pos);
return std::make_shared<ColumnarArray>(list_array->values(), pool_, offset, length);
}
std::shared_ptr<InternalMap> ColumnarArray::GetMap(int32_t pos) const {
auto map_array = arrow::internal::checked_cast<const arrow::MapArray*>(array_);
assert(map_array);
int32_t offset = map_array->value_offset(offset_ + pos);
int32_t length = map_array->value_length(offset_ + pos);
return std::make_shared<ColumnarMap>(map_array->keys(), map_array->items(), pool_, offset,
length);
}
std::shared_ptr<InternalRow> ColumnarArray::GetRow(int32_t pos, int32_t num_fields) const {
auto struct_array = arrow::internal::checked_cast<const arrow::StructArray*>(array_);
assert(struct_array);
return std::make_shared<ColumnarRow>(struct_array->fields(), pool_, offset_ + pos);
}
Result<std::vector<char>> ColumnarArray::ToBooleanArray() const {
PAIMON_RETURN_NOT_OK(CheckNoNull());
std::vector<char> res(length_);
for (int i = 0; i < length_; i++) {
bool element = GetBoolean(i);
res[i] = element ? static_cast<char>(1) : static_cast<char>(0);
}
return res;
}
Result<std::vector<char>> ColumnarArray::ToByteArray() const {
PAIMON_RETURN_NOT_OK(CheckNoNull());
std::vector<char> res(length_);
for (int i = 0; i < length_; i++) {
res[i] = GetByte(i);
}
return res;
}
Result<std::vector<int16_t>> ColumnarArray::ToShortArray() const {
PAIMON_RETURN_NOT_OK(CheckNoNull());
std::vector<int16_t> res(length_);
for (int i = 0; i < length_; i++) {
res[i] = GetShort(i);
}
return res;
}
Result<std::vector<int32_t>> ColumnarArray::ToIntArray() const {
PAIMON_RETURN_NOT_OK(CheckNoNull());
std::vector<int32_t> res(length_);
for (int i = 0; i < length_; i++) {
res[i] = GetInt(i);
}
return res;
}
Result<std::vector<int64_t>> ColumnarArray::ToLongArray() const {
PAIMON_RETURN_NOT_OK(CheckNoNull());
std::vector<int64_t> res(length_);
for (int i = 0; i < length_; i++) {
res[i] = GetLong(i);
}
return res;
}
Result<std::vector<float>> ColumnarArray::ToFloatArray() const {
PAIMON_RETURN_NOT_OK(CheckNoNull());
std::vector<float> res(length_);
for (int i = 0; i < length_; i++) {
res[i] = GetFloat(i);
}
return res;
}
Result<std::vector<double>> ColumnarArray::ToDoubleArray() const {
PAIMON_RETURN_NOT_OK(CheckNoNull());
std::vector<double> res(length_);
for (int i = 0; i < length_; i++) {
res[i] = GetDouble(i);
}
return res;
}
} // namespace paimon