// blob: 873b3ca6e88ee5f57785bd76fa64079d258fd310 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#pragma once
/// \file iceberg/transform.h
#include <cstdint>
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <variant>

#include "iceberg/expression/literal.h"
#include "iceberg/iceberg_export.h"
#include "iceberg/result.h"
#include "iceberg/type_fwd.h"
#include "iceberg/util/formattable.h"
namespace iceberg {
/// \brief Transform types used for partitioning
///
/// TransformTypeToString() maps each enumerator to its lower-case name.
enum class TransformType {
/// Placeholder for a customized transform that can't be recognized or supported
/// now.
kUnknown,
/// Equal to the source value, unmodified
kIdentity,
/// Hash of the source value, mod `N` (`N` being the number of buckets)
kBucket,
/// Source value truncated to width `W`
kTruncate,
/// Extract a date or timestamp year, as years from 1970
kYear,
/// Extract a date or timestamp month, as months from 1970-01
kMonth,
/// Extract a date or timestamp day, as days from 1970-01-01
kDay,
/// Extract a timestamp hour, as hours from 1970-01-01 00:00:00
kHour,
/// Always produces `null`, whatever the source value is
kVoid,
};
/// \brief Look up the lower-case name of a transform type.
///
/// \param type The transform type to name.
/// \return A view over a string literal, e.g. "identity" for
/// TransformType::kIdentity. Parameterized transforms yield only the bare name
/// ("bucket", "truncate"), without any argument.
ICEBERG_EXPORT constexpr std::string_view TransformTypeToString(TransformType type) {
  // Single-exit form: each enumerator assigns its name and leaves the switch.
  // There is deliberately no `default:` label, so a compiler's switch-coverage
  // warning can still flag an enumerator that has no matching case.
  std::string_view name;
  switch (type) {
    case TransformType::kUnknown: name = "unknown"; break;
    case TransformType::kIdentity: name = "identity"; break;
    case TransformType::kBucket: name = "bucket"; break;
    case TransformType::kTruncate: name = "truncate"; break;
    case TransformType::kYear: name = "year"; break;
    case TransformType::kMonth: name = "month"; break;
    case TransformType::kDay: name = "day"; break;
    case TransformType::kHour: name = "hour"; break;
    case TransformType::kVoid: name = "void"; break;
  }
  if (name.data() == nullptr) {
    // No case assigned a literal, so `type` holds a value outside the enumeration.
    std::unreachable();
  }
  return name;
}
/// \brief Represents a transform used in partitioning or sorting in Iceberg.
///
/// This class supports binding to a source type and instantiating the corresponding
/// TransformFunction, as well as serialization-friendly introspection.
///
/// The constructors taking a TransformType are private; instances are obtained through
/// the static factory functions below.
class ICEBERG_EXPORT Transform : public util::Formattable {
public:
/// \brief Returns a shared singleton instance of the Identity transform.
///
/// This transform leaves values unchanged and is commonly used for direct partitioning.
/// \return A shared pointer to the Identity transform.
static std::shared_ptr<Transform> Identity();
/// \brief Creates a shared instance of the Bucket transform.
///
/// Buckets values using a hash modulo operation. Commonly used for distributing data.
/// \param num_buckets The number of buckets.
/// \return A shared pointer to the Bucket transform.
static std::shared_ptr<Transform> Bucket(int32_t num_buckets);
/// \brief Creates a shared instance of the Truncate transform.
///
/// Truncates values to a fixed width (e.g., for strings or binary data).
/// \param width The width to truncate to.
/// \return A shared pointer to the Truncate transform.
static std::shared_ptr<Transform> Truncate(int32_t width);
/// \brief Returns a shared singleton instance of the Year transform.
///
/// Extracts the number of years from a date or timestamp since the epoch.
/// \return A shared pointer to the Year transform.
static std::shared_ptr<Transform> Year();
/// \brief Returns a shared singleton instance of the Month transform.
///
/// Extracts the number of months from a date or timestamp since the epoch.
/// \return A shared pointer to the Month transform.
static std::shared_ptr<Transform> Month();
/// \brief Returns a shared singleton instance of the Day transform.
///
/// Extracts the number of days from a date or timestamp since the epoch.
/// \return A shared pointer to the Day transform.
static std::shared_ptr<Transform> Day();
/// \brief Returns a shared singleton instance of the Hour transform.
///
/// Extracts the number of hours from a timestamp since the epoch.
/// \return A shared pointer to the Hour transform.
static std::shared_ptr<Transform> Hour();
/// \brief Returns a shared singleton instance of the Void transform.
///
/// Ignores values and always returns null. Useful for testing or special cases.
/// \return A shared pointer to the Void transform.
static std::shared_ptr<Transform> Void();
/// \brief Returns the transform type.
TransformType transform_type() const;
/// \brief Binds this transform to a source type, returning a typed TransformFunction.
///
/// This creates a concrete transform implementation based on the transform type and
/// parameter.
/// \param source_type The source column type to bind to.
/// \return A Result holding the bound TransformFunction, or an error on failure.
Result<std::shared_ptr<TransformFunction>> Bind(
const std::shared_ptr<Type>& source_type) const;
/// \brief Checks whether this transform can be applied to the given Type.
/// \param source_type The source type to check.
/// \return true if this transform can be applied to the type, false otherwise
bool CanTransform(const Type& source_type) const;
/// \brief Whether the transform preserves the order of values (is monotonic).
bool PreservesOrder() const;
/// \brief Whether ordering by this transform's result satisfies the ordering of another
/// transform's result.
///
/// For example, sorting by day(ts) will produce an ordering that is also by month(ts)
/// or year(ts). However, sorting by day(ts) will not satisfy the order of hour(ts) or
/// identity(ts).
/// \param other The other transform to compare with.
/// \return true if ordering by this transform satisfies the ordering of the other
/// transform. The relation is not symmetric, as the example above shows.
bool SatisfiesOrderOf(const Transform& other) const;
/// \brief Transforms a BoundPredicate to an inclusive predicate on the partition values
/// produced by the transform.
///
/// The inclusive projection guarantees that if predicate->Test(value) is true, then
/// Projected(transform(value)) is true.
/// \param name The name of the partition column.
/// \param predicate The predicate to project.
/// \return A Result holding the projected predicate, a null pointer if the predicate
/// cannot be projected, or an Error if the projection fails.
///
/// NOTE(review): Project, ProjectStrict and ToHumanString are not declared const,
/// unlike the other member functions of this class; confirm that is intentional.
Result<std::unique_ptr<UnboundPredicate>> Project(
std::string_view name, const std::shared_ptr<BoundPredicate>& predicate);
/// \brief Transforms a BoundPredicate to a strict predicate on the partition values
/// produced by the transform.
///
/// The strict projection guarantees that if Projected(transform(value)) is true, then
/// predicate->Test(value) is also true.
/// \param name The name of the partition column.
/// \param predicate The predicate to project.
/// \return A Result holding the projected predicate, a null pointer if the predicate
/// cannot be projected, or an Error if the projection fails.
Result<std::unique_ptr<UnboundPredicate>> ProjectStrict(
std::string_view name, const std::shared_ptr<BoundPredicate>& predicate);
/// \brief Returns a human-readable string representation of a transformed value.
///
/// \param value The literal value to render. NOTE(review): presumably this is already
/// the transform's output (a partition value) rather than the source value; confirm
/// against the implementation.
/// \return A Result holding the human-readable string, or an error on failure.
Result<std::string> ToHumanString(const Literal& value);
/// \brief Returns a string representation of this transform (e.g., "bucket[16]").
std::string ToString() const override;
/// \brief Returns the unique transform name used by the partition spec builder to
/// detect similar transforms added more than once for the same source field.
std::string DedupName() const;
/// \brief Generates a partition name for the transform.
/// \param source_name The name of the source column.
/// \return A Result holding the partition name, or an error on failure.
Result<std::string> GeneratePartitionName(std::string_view source_name) const;
/// \brief Equality comparison; delegates to the virtual Equals().
friend bool operator==(const Transform& lhs, const Transform& rhs) {
return lhs.Equals(rhs);
}
private:
/// \brief Constructs a Transform of the specified type (for non-parametric types).
/// \param transform_type The transform type (e.g., identity, year, day).
explicit Transform(TransformType transform_type);
/// \brief Constructs a parameterized Transform (e.g., bucket(16), truncate(4)).
/// \param transform_type The transform type.
/// \param param The integer parameter associated with the transform.
Transform(TransformType transform_type, int32_t param);
/// \brief Checks equality with another Transform instance.
/// \param other The transform to compare against.
[[nodiscard]] virtual bool Equals(const Transform& other) const;
/// The kind of transform this instance represents.
TransformType transform_type_;
/// Optional parameter (e.g., num_buckets, width): holds either nothing
/// (std::monostate) or an int32_t value.
std::variant<std::monostate, int32_t> param_;
};
/// \brief Converts a string representation of a transform into a Transform instance.
///
/// This function parses the provided string to identify the corresponding transform type
/// (e.g., "identity", "year", "bucket[16]"), and creates a shared pointer to the
/// corresponding Transform object. It supports both simple transforms (like "identity")
/// and parameterized transforms (like "bucket[16]" or "truncate[4]").
///
/// \param transform_str The string representation of the transform, including any
/// bracketed parameter (e.g., "bucket[16]").
/// \return A Result containing either a shared pointer to the corresponding Transform
/// instance or an Error if the string does not match any valid transform type.
ICEBERG_EXPORT Result<std::shared_ptr<Transform>> TransformFromString(
std::string_view transform_str);
/// \brief A transform function used for partitioning.
///
/// Abstract base class: subclasses implement Transform() and ResultType().
class ICEBERG_EXPORT TransformFunction {
public:
virtual ~TransformFunction() = default;
/// \brief Constructs a transform function from a transform type and a source type.
/// \param transform_type The kind of transform this function implements.
/// \param source_type The type of the values this function takes as input.
TransformFunction(TransformType transform_type, std::shared_ptr<Type> source_type);
/// \brief Transform an input Literal to a new Literal
///
/// All transforms must return null for a null input value.
/// \param literal The input value to transform.
/// \return A Result holding the transformed Literal, or an error on failure.
virtual Result<Literal> Transform(const Literal& literal) = 0;
/// \brief Get the transform type
TransformType transform_type() const;
/// \brief Get the source type of transform function
const std::shared_ptr<Type>& source_type() const;
/// \brief Get the result type of transform function
///
/// Note: This method defines both the physical and display representation of the
/// partition field. The physical representation must conform to the Iceberg spec. The
/// display representation can deviate from the spec, such as by transforming the value
/// into a more human-readable format.
virtual std::shared_ptr<Type> ResultType() const = 0;
/// \brief Equality comparison; delegates to the virtual Equals().
friend bool operator==(const TransformFunction& lhs, const TransformFunction& rhs) {
return lhs.Equals(rhs);
}
private:
/// \brief Checks equality with another TransformFunction instance.
/// \param other The transform function to compare against.
[[nodiscard]] virtual bool Equals(const TransformFunction& other) const;
/// The kind of transform this function implements.
TransformType transform_type_;
/// The type of the values this function takes as input.
std::shared_ptr<Type> source_type_;
};
} // namespace iceberg