blob: c210f9ed273e1c565c43c62f43ee26144c0520fe [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#include "iceberg/transform.h"
#include <format>
#include <regex>
#include <utility>
#include "iceberg/expression/predicate.h"
#include "iceberg/expression/term.h"
#include "iceberg/result.h"
#include "iceberg/transform_function.h"
#include "iceberg/type.h"
#include "iceberg/util/checked_cast.h"
#include "iceberg/util/macros.h"
#include "iceberg/util/projection_util_internal.h"
#include "iceberg/util/string_util.h"
#include "iceberg/util/transform_util.h"
namespace iceberg {
namespace {

// Textual names of the individual transform kinds. TransformFromString below
// compares its input against these (all but kUnknownName).
constexpr std::string_view kUnknownName{"unknown"};
constexpr std::string_view kIdentityName{"identity"};
constexpr std::string_view kBucketName{"bucket"};
constexpr std::string_view kTruncateName{"truncate"};
constexpr std::string_view kYearName{"year"};
constexpr std::string_view kMonthName{"month"};
constexpr std::string_view kDayName{"day"};
constexpr std::string_view kHourName{"hour"};
constexpr std::string_view kVoidName{"void"};

}  // namespace
std::shared_ptr<Transform> Transform::Identity() {
static auto instance =
std::shared_ptr<Transform>(new Transform(TransformType::kIdentity));
return instance;
}
std::shared_ptr<Transform> Transform::Year() {
static auto instance = std::shared_ptr<Transform>(new Transform(TransformType::kYear));
return instance;
}
std::shared_ptr<Transform> Transform::Month() {
static auto instance = std::shared_ptr<Transform>(new Transform(TransformType::kMonth));
return instance;
}
std::shared_ptr<Transform> Transform::Day() {
static auto instance = std::shared_ptr<Transform>(new Transform(TransformType::kDay));
return instance;
}
std::shared_ptr<Transform> Transform::Hour() {
static auto instance = std::shared_ptr<Transform>(new Transform(TransformType::kHour));
return instance;
}
std::shared_ptr<Transform> Transform::Void() {
static auto instance = std::shared_ptr<Transform>(new Transform(TransformType::kVoid));
return instance;
}
std::shared_ptr<Transform> Transform::Bucket(int32_t num_buckets) {
return std::shared_ptr<Transform>(new Transform(TransformType::kBucket, num_buckets));
}
std::shared_ptr<Transform> Transform::Truncate(int32_t width) {
return std::shared_ptr<Transform>(new Transform(TransformType::kTruncate, width));
}
Transform::Transform(TransformType transform_type) : transform_type_(transform_type) {}
Transform::Transform(TransformType transform_type, int32_t param)
: transform_type_(transform_type), param_(param) {}
/// Returns the kind of this transform.
TransformType Transform::transform_type() const {
  return transform_type_;
}
/// Creates the transform function for this transform applied to `source_type`.
///
/// Each transform kind delegates to the matching `*Transform::Make` factory.
/// Bucket and truncate additionally pass the stored int32 parameter and yield
/// InvalidArgument when no int32 parameter is stored. Any kind not listed in
/// the switch yields NotSupported.
Result<std::shared_ptr<TransformFunction>> Transform::Bind(
    const std::shared_ptr<Type>& source_type) const {
  auto kind_name = TransformTypeToString(transform_type_);
  // Null unless param_ currently holds an int32_t.
  const int32_t* const int_param = std::get_if<int32_t>(&param_);

  switch (transform_type_) {
    case TransformType::kIdentity:
      return IdentityTransform::Make(source_type);
    case TransformType::kYear:
      return YearTransform::Make(source_type);
    case TransformType::kMonth:
      return MonthTransform::Make(source_type);
    case TransformType::kDay:
      return DayTransform::Make(source_type);
    case TransformType::kHour:
      return HourTransform::Make(source_type);
    case TransformType::kVoid:
      return VoidTransform::Make(source_type);
    case TransformType::kBucket:
      if (int_param == nullptr) {
        return InvalidArgument("Bucket requires int32 param, none found in transform '{}'",
                               kind_name);
      }
      return BucketTransform::Make(source_type, *int_param);
    case TransformType::kTruncate:
      if (int_param == nullptr) {
        return InvalidArgument(
            "Truncate requires int32 param, none found in transform '{}'", kind_name);
      }
      return TruncateTransform::Make(source_type, *int_param);
    default:
      return NotSupported("Unsupported transform type: '{}'", kind_name);
  }
}
/// Reports whether this transform accepts values of `source_type`.
///
/// - void / unknown: always true.
/// - identity: true exactly when `source_type.is_primitive()` is true.
/// - every other kind: true when the source type id is in that kind's
///   accepted list below.
bool Transform::CanTransform(const Type& source_type) const {
  switch (transform_type_) {
    case TransformType::kVoid:
    case TransformType::kUnknown:
      return true;
    case TransformType::kIdentity:
      return source_type.is_primitive();
    case TransformType::kBucket: {
      const auto id = source_type.type_id();
      return id == TypeId::kInt || id == TypeId::kLong || id == TypeId::kDecimal ||
             id == TypeId::kDate || id == TypeId::kTime || id == TypeId::kTimestamp ||
             id == TypeId::kTimestampTz || id == TypeId::kString ||
             id == TypeId::kUuid || id == TypeId::kFixed || id == TypeId::kBinary;
    }
    case TransformType::kTruncate: {
      const auto id = source_type.type_id();
      return id == TypeId::kInt || id == TypeId::kLong || id == TypeId::kString ||
             id == TypeId::kBinary || id == TypeId::kDecimal;
    }
    case TransformType::kYear:
    case TransformType::kMonth:
    case TransformType::kDay: {
      // Year, month and day accept the same set of source types.
      const auto id = source_type.type_id();
      return id == TypeId::kDate || id == TypeId::kTimestamp ||
             id == TypeId::kTimestampTz;
    }
    case TransformType::kHour: {
      const auto id = source_type.type_id();
      return id == TypeId::kTimestamp || id == TypeId::kTimestampTz;
    }
  }
  std::unreachable();
}
/// Reports whether this transform is classified as order preserving.
///
/// Identity, truncate and the four temporal kinds report true; bucket, void
/// and unknown report false.
bool Transform::PreservesOrder() const {
  switch (transform_type_) {
    case TransformType::kIdentity:
    case TransformType::kTruncate:
    case TransformType::kYear:
    case TransformType::kMonth:
    case TransformType::kDay:
    case TransformType::kHour:
      return true;
    case TransformType::kBucket:
    case TransformType::kVoid:
    case TransformType::kUnknown:
      return false;
  }
  std::unreachable();
}
bool Transform::SatisfiesOrderOf(const Transform& other) const {
auto other_type = other.transform_type();
switch (transform_type_) {
case TransformType::kIdentity:
// ordering by value is the same as long as the other preserves order
return other.PreservesOrder();
case TransformType::kTruncate: {
if (other_type != TransformType::kTruncate) {
return false;
}
return std::get<int32_t>(param_) >= std::get<int32_t>(other.param_);
}
case TransformType::kHour:
return other_type == TransformType::kHour || other_type == TransformType::kDay ||
other_type == TransformType::kMonth || other_type == TransformType::kYear;
case TransformType::kDay:
return other_type == TransformType::kDay || other_type == TransformType::kMonth ||
other_type == TransformType::kYear;
case TransformType::kMonth:
return other_type == TransformType::kMonth || other_type == TransformType::kYear;
case TransformType::kYear:
case TransformType::kBucket:
case TransformType::kUnknown:
case TransformType::kVoid:
return *this == other;
}
std::unreachable();
}
/// Builds an unbound predicate named `name` from `predicate` according to the
/// kind of this transform.
///
/// - identity: delegates to ProjectionUtil::IdentityProject.
/// - bucket / truncate / year / month / day / hour:
///   - if the predicate's term is itself a transform term, returns
///     ProjectionUtil::RemoveTransform when that transform equals this one
///     and nullptr otherwise;
///   - otherwise binds this transform to the term's type and delegates to
///     BucketProject, TruncateProject or TemporalProject respectively. The
///     Bind result goes through ICEBERG_ASSIGN_OR_RAISE (presumably
///     propagating a failed Bind to the caller).
/// - unknown / void: returns nullptr.
Result<std::unique_ptr<UnboundPredicate>> Transform::Project(
    std::string_view name, const std::shared_ptr<BoundPredicate>& predicate) {
  switch (transform_type_) {
    case TransformType::kUnknown:
    case TransformType::kVoid:
      return nullptr;
    case TransformType::kIdentity:
      return ProjectionUtil::IdentityProject(name, predicate);
    case TransformType::kBucket:
    case TransformType::kTruncate:
    case TransformType::kYear:
    case TransformType::kMonth:
    case TransformType::kDay:
    case TransformType::kHour: {
      // A predicate over an already-transformed term can only be projected
      // when that transform is this very transform.
      if (predicate->term()->kind() == Term::Kind::kTransform) {
        const auto bound_transform =
            internal::checked_pointer_cast<BoundTransform>(predicate->term());
        if (*this == *bound_transform->transform()) {
          return ProjectionUtil::RemoveTransform(name, predicate);
        }
        return nullptr;
      }
      ICEBERG_ASSIGN_OR_RAISE(auto func, Bind(predicate->term()->type()));
      if (transform_type_ == TransformType::kBucket) {
        return ProjectionUtil::BucketProject(name, predicate, func);
      }
      if (transform_type_ == TransformType::kTruncate) {
        return ProjectionUtil::TruncateProject(name, predicate, func);
      }
      return ProjectionUtil::TemporalProject(name, predicate, func);
    }
  }
  std::unreachable();
}
/// Strict counterpart of Project: builds an unbound predicate named `name`
/// from `predicate` according to the kind of this transform.
///
/// - identity: delegates to ProjectionUtil::IdentityProject.
/// - bucket / truncate / year / month / day / hour:
///   - if the predicate's term is itself a transform term, returns
///     ProjectionUtil::RemoveTransform when that transform equals this one
///     and nullptr otherwise;
///   - otherwise binds this transform to the term's type and delegates to
///     BucketProjectStrict, TruncateProjectStrict or TemporalProjectStrict
///     respectively. The Bind result goes through ICEBERG_ASSIGN_OR_RAISE
///     (presumably propagating a failed Bind to the caller).
/// - unknown / void: returns nullptr.
Result<std::unique_ptr<UnboundPredicate>> Transform::ProjectStrict(
    std::string_view name, const std::shared_ptr<BoundPredicate>& predicate) {
  switch (transform_type_) {
    case TransformType::kUnknown:
    case TransformType::kVoid:
      return nullptr;
    case TransformType::kIdentity:
      return ProjectionUtil::IdentityProject(name, predicate);
    case TransformType::kBucket:
    case TransformType::kTruncate:
    case TransformType::kYear:
    case TransformType::kMonth:
    case TransformType::kDay:
    case TransformType::kHour: {
      // A predicate over an already-transformed term can only be projected
      // when that transform is this very transform.
      if (predicate->term()->kind() == Term::Kind::kTransform) {
        const auto bound_transform =
            internal::checked_pointer_cast<BoundTransform>(predicate->term());
        if (*this == *bound_transform->transform()) {
          return ProjectionUtil::RemoveTransform(name, predicate);
        }
        return nullptr;
      }
      ICEBERG_ASSIGN_OR_RAISE(auto func, Bind(predicate->term()->type()));
      if (transform_type_ == TransformType::kBucket) {
        return ProjectionUtil::BucketProjectStrict(name, predicate, func);
      }
      if (transform_type_ == TransformType::kTruncate) {
        return ProjectionUtil::TruncateProjectStrict(name, predicate, func);
      }
      return ProjectionUtil::TemporalProjectStrict(name, predicate, func);
    }
  }
  std::unreachable();
}
/// Renders `value` as a human-readable string for this transform.
///
/// - A null literal renders as "null".
/// - Above-max / below-min literals are rejected with NotSupported.
/// - year / month / day / hour: the literal must hold an int32_t, which is
///   formatted by the matching TransformUtil::Human* helper; any other held
///   type is rejected with NotSupported.
/// - identity / bucket / truncate / unknown / void: formatting is chosen by
///   the literal's type id (dates, times, timestamps, binary as Base64,
///   decimals with their scale, strings verbatim); all other types fall back
///   to `value.ToString()`.
Result<std::string> Transform::ToHumanString(const Literal& value) {
  if (value.IsNull()) {
    return "null";
  }
  if (value.IsAboveMax() || value.IsBelowMin()) [[unlikely]] {
    return NotSupported("Cannot transform human string for value: {}", value.ToString());
  }

  switch (transform_type_) {
    case TransformType::kYear: {
      if (!std::holds_alternative<int32_t>(value.value())) [[unlikely]] {
        return NotSupported("Transform human year from type {} is not supported",
                            value.type()->ToString());
      }
      return TransformUtil::HumanYear(std::get<int32_t>(value.value()));
    }
    case TransformType::kMonth: {
      if (!std::holds_alternative<int32_t>(value.value())) [[unlikely]] {
        return NotSupported("Transform human month from type {} is not supported",
                            value.type()->ToString());
      }
      return TransformUtil::HumanMonth(std::get<int32_t>(value.value()));
    }
    case TransformType::kDay: {
      if (!std::holds_alternative<int32_t>(value.value())) [[unlikely]] {
        return NotSupported("Transform human day from type {} is not supported",
                            value.type()->ToString());
      }
      return TransformUtil::HumanDay(std::get<int32_t>(value.value()));
    }
    case TransformType::kHour: {
      if (!std::holds_alternative<int32_t>(value.value())) [[unlikely]] {
        return NotSupported("Transform human hour from type {} is not supported",
                            value.type()->ToString());
      }
      return TransformUtil::HumanHour(std::get<int32_t>(value.value()));
    }
    case TransformType::kIdentity:
    case TransformType::kBucket:
    case TransformType::kTruncate:
    case TransformType::kUnknown:
    case TransformType::kVoid: {
      // These kinds format purely by the literal's own type.
      switch (value.type()->type_id()) {
        case TypeId::kDate:
          return TransformUtil::HumanDay(std::get<int32_t>(value.value()));
        case TypeId::kTime:
          return TransformUtil::HumanTime(std::get<int64_t>(value.value()));
        case TypeId::kTimestamp:
          return TransformUtil::HumanTimestamp(std::get<int64_t>(value.value()));
        case TypeId::kTimestampTz:
          return TransformUtil::HumanTimestampWithZone(std::get<int64_t>(value.value()));
        case TypeId::kFixed:
        case TypeId::kBinary: {
          const auto& binary_data = std::get<std::vector<uint8_t>>(value.value());
          // Base64Encode is handed the raw bytes as a char pointer plus length.
          return TransformUtil::Base64Encode(
              {reinterpret_cast<const char*>(binary_data.data()), binary_data.size()});
        }
        case TypeId::kDecimal: {
          const auto& decimal_type = internal::checked_cast<DecimalType&>(*value.type());
          const auto& decimal = std::get<::iceberg::Decimal>(value.value());
          return decimal.ToString(decimal_type.scale());
        }
        case TypeId::kString:
          return std::get<std::string>(value.value());
        default:
          return value.ToString();
      }
    }
  }
  std::unreachable();
}
/// Two transform functions are equal when they have the same transform kind
/// and their source types (the pointed-to objects) compare equal.
bool TransformFunction::Equals(const TransformFunction& other) const {
  if (transform_type_ != other.transform_type_) {
    return false;
  }
  return *source_type_ == *other.source_type_;
}
/// Returns the textual form of this transform.
///
/// Bucket and truncate render as `name[param]` using the stored int32
/// parameter; every other kind renders as just its name.
std::string Transform::ToString() const {
  auto kind_name = TransformTypeToString(transform_type_);
  switch (transform_type_) {
    case TransformType::kBucket:
    case TransformType::kTruncate:
      return std::format("{}[{}]", kind_name, std::get<int32_t>(param_));
    case TransformType::kIdentity:
    case TransformType::kYear:
    case TransformType::kMonth:
    case TransformType::kDay:
    case TransformType::kHour:
    case TransformType::kVoid:
    case TransformType::kUnknown:
      return std::format("{}", kind_name);
  }
  std::unreachable();
}
/// Returns the name used for de-duplication; currently identical to ToString().
std::string Transform::DedupName() const {
  return ToString();
}
/// Derives a partition field name from `source_name` for this transform.
///
/// - identity: `source_name` unchanged.
/// - bucket:   `<source>_bucket_<param>`.
/// - truncate: `<source>_trunc_<param>`.
/// - year / month / day / hour: `<source>_<transform name>`.
/// - void:     `<source>_null`.
/// - unknown:  an Invalid error.
Result<std::string> Transform::GeneratePartitionName(std::string_view source_name) const {
  switch (transform_type_) {
    case TransformType::kUnknown:
      return Invalid("Cannot generate partition name for unknown transform");
    case TransformType::kVoid:
      return std::format("{}_null", source_name);
    case TransformType::kIdentity:
      return std::string(source_name);
    case TransformType::kYear:
    case TransformType::kMonth:
    case TransformType::kDay:
    case TransformType::kHour:
      return std::format("{}_{}", source_name, TransformTypeToString(transform_type_));
    case TransformType::kBucket: {
      const int32_t num_buckets = std::get<int32_t>(param_);
      return std::format("{}_bucket_{}", source_name, num_buckets);
    }
    case TransformType::kTruncate: {
      const int32_t width = std::get<int32_t>(param_);
      return std::format("{}_trunc_{}", source_name, width);
    }
  }
  std::unreachable();
}
TransformFunction::TransformFunction(TransformType transform_type,
std::shared_ptr<Type> source_type)
: transform_type_(transform_type), source_type_(std::move(source_type)) {}
/// Returns the kind of transform this function implements.
TransformType TransformFunction::transform_type() const {
  return transform_type_;
}
/// Returns a reference to the stored source type pointer (no copy is made).
const std::shared_ptr<Type>& TransformFunction::source_type() const {
  return source_type_;
}
/// Two transforms are equal when both their kind and their stored parameter
/// compare equal.
bool Transform::Equals(const Transform& other) const {
  if (transform_type_ != other.transform_type_) {
    return false;
  }
  return param_ == other.param_;
}
/// Parses the textual form of a transform.
///
/// Accepted inputs are the exact names of the parameterless transforms
/// (identity, year, month, day, hour, void) and the parameterized forms
/// `bucket[N]` / `truncate[N]`, where N is one or more digits parsed as an
/// int32. A failure from the number parser goes through
/// ICEBERG_ASSIGN_OR_RAISE (presumably returned to the caller). Anything else
/// yields InvalidArgument.
Result<std::shared_ptr<Transform>> TransformFromString(std::string_view transform_str) {
  using Factory = std::shared_ptr<Transform> (*)();
  static constexpr std::pair<std::string_view, Factory> kParameterless[] = {
      {kIdentityName, &Transform::Identity}, {kYearName, &Transform::Year},
      {kMonthName, &Transform::Month},       {kDayName, &Transform::Day},
      {kHourName, &Transform::Hour},         {kVoidName, &Transform::Void},
  };
  for (const auto& [label, make] : kParameterless) {
    if (transform_str == label) {
      return make();
    }
  }

  // Match bucket[16] or truncate[4]
  static const std::regex param_regex(
      std::format(R"(({}|{})\[(\d+)\])", kBucketName, kTruncateName));
  const std::string input(transform_str);
  std::smatch groups;
  if (!std::regex_match(input, groups, param_regex)) {
    return InvalidArgument("Invalid Transform string: {}", transform_str);
  }

  const std::string kind = groups[1].str();
  ICEBERG_ASSIGN_OR_RAISE(const auto param,
                          StringUtils::ParseNumber<int32_t>(groups[2].str()));
  if (kind == kBucketName) {
    return Transform::Bucket(param);
  }
  if (kind == kTruncateName) {
    return Transform::Truncate(param);
  }
  return InvalidArgument("Invalid Transform string: {}", transform_str);
}
} // namespace iceberg