// blob: 89c4cdc83dffc37692ac378bf5627fd76f3fc00d [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#include "iceberg/partition_spec.h"
#include <format>
#include <memory>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include <nlohmann/json.hpp>
#include "iceberg/json_serde_internal.h"
#include "iceberg/partition_field.h"
#include "iceberg/row/partition_values.h"
#include "iceberg/schema.h"
#include "iceberg/schema_field.h"
#include "iceberg/test/matchers.h"
#include "iceberg/transform.h"
#include "iceberg/util/formatter.h" // IWYU pragma: keep
namespace iceberg {
// Covers the basic surface of a PartitionSpec built without a schema:
// self-equality, spec_id(), fields(), ToString() and std::format output.
TEST(PartitionSpecTest, Basics) {
  auto identity_transform = Transform::Identity();
  // Both partition fields reference source id 5; only field id and name differ.
  PartitionField pt_field1(5, 1000, "day", identity_transform);
  PartitionField pt_field2(5, 1001, "hour", identity_transform);
  ICEBERG_UNWRAP_OR_FAIL(auto spec, PartitionSpec::Make(100, {pt_field1, pt_field2}));

  ASSERT_EQ(*spec, *spec);
  ASSERT_EQ(100, spec->spec_id());

  // The fields are expected back in the order they were passed to Make().
  std::span<const PartitionField> fields = spec->fields();
  ASSERT_EQ(2, fields.size());
  ASSERT_EQ(pt_field1, fields[0]);
  ASSERT_EQ(pt_field2, fields[1]);

  // ToString() and the std::format formatter must render the same text.
  auto spec_str =
      "partition_spec[spec_id<100>,\n day (1000 identity(5))\n hour (1001 "
      "identity(5))\n]";
  EXPECT_EQ(spec_str, spec->ToString());
  EXPECT_EQ(spec_str, std::format("{}", *spec));
}
// Checks operator== / operator!= on PartitionSpec in both argument orders.
// Relative to spec1 (id 100, fields {1, 2}):
//   spec2 - different spec id, same fields
//   spec3 - different spec id, fewer fields
//   spec4 - different spec id, fields in reverse order
//   spec5 - same spec id and same fields (the only spec expected to be equal)
//   spec6 - same spec id, fields in reverse order
TEST(PartitionSpecTest, Equality) {
  auto identity_transform = Transform::Identity();
  PartitionField pt_field1(5, 1000, "day", identity_transform);
  PartitionField pt_field2(7, 1001, "hour", identity_transform);
  // Constructed with the same arguments as pt_field2.
  PartitionField pt_field3(7, 1001, "hour", identity_transform);

  ICEBERG_UNWRAP_OR_FAIL(auto spec1, PartitionSpec::Make(100, {pt_field1, pt_field2}));
  ICEBERG_UNWRAP_OR_FAIL(auto spec2, PartitionSpec::Make(101, {pt_field1, pt_field2}));
  ICEBERG_UNWRAP_OR_FAIL(auto spec3, PartitionSpec::Make(101, {pt_field1}));
  ICEBERG_UNWRAP_OR_FAIL(auto spec4, PartitionSpec::Make(101, {pt_field3, pt_field1}));
  ICEBERG_UNWRAP_OR_FAIL(auto spec5, PartitionSpec::Make(100, {pt_field1, pt_field2}));
  ICEBERG_UNWRAP_OR_FAIL(auto spec6, PartitionSpec::Make(100, {pt_field2, pt_field1}));

  ASSERT_EQ(*spec1, *spec1);

  ASSERT_NE(*spec1, *spec2);
  ASSERT_NE(*spec2, *spec1);

  ASSERT_NE(*spec1, *spec3);
  ASSERT_NE(*spec3, *spec1);

  ASSERT_NE(*spec1, *spec4);
  ASSERT_NE(*spec4, *spec1);

  ASSERT_EQ(*spec1, *spec5);
  ASSERT_EQ(*spec5, *spec1);

  // Field order is significant: the same fields reversed compare unequal.
  ASSERT_NE(*spec1, *spec6);
  ASSERT_NE(*spec6, *spec1);
}
// PartitionType() should produce one struct field per partition field, each
// carrying the partition field's name and field id.
TEST(PartitionSpecTest, PartitionSchemaTest) {
  SchemaField ts_column(5, "ts", timestamp(), true);
  SchemaField bar_column(7, "bar", string(), true);
  Schema schema({ts_column, bar_column}, 100);

  auto identity = Transform::Identity();
  PartitionField day_field(5, 1000, "day", identity);
  PartitionField hour_field(7, 1001, "hour", identity);

  ICEBERG_UNWRAP_OR_FAIL(auto spec, PartitionSpec::Make(100, {day_field, hour_field}));
  ICEBERG_UNWRAP_OR_FAIL(auto partition_type, spec->PartitionType(schema));

  ASSERT_EQ(2, partition_type->fields().size());

  // First partition field.
  EXPECT_EQ(day_field.name(), partition_type->fields()[0].name());
  EXPECT_EQ(day_field.field_id(), partition_type->fields()[0].field_id());

  // Second partition field.
  EXPECT_EQ(hour_field.name(), partition_type->fields()[1].name());
  EXPECT_EQ(hour_field.field_id(), partition_type->fields()[1].field_id());
}
// Parses a three-field partition spec from JSON and checks the field types
// PartitionType() reports for the day, bucket[16] and truncate[4] transforms.
TEST(PartitionSpecTest, PartitionTypeTest) {
  nlohmann::json spec_json = R"(
{
"spec-id": 1,
"fields": [ {
"source-id": 4,
"field-id": 1000,
"name": "__ts_day",
"transform": "day"
}, {
"source-id": 1,
"field-id": 1001,
"name": "__id_bucket",
"transform": "bucket[16]"
}, {
"source-id": 2,
"field-id": 1002,
"name": "__id_truncate",
"transform": "truncate[4]"
} ]
})"_json;

  // Source schema; the JSON above references source ids 4, 1 and 2.
  const auto schema = std::make_shared<Schema>(
      std::vector<SchemaField>{
          SchemaField::MakeRequired(1, "id", int32()),
          SchemaField::MakeRequired(2, "name", string()),
          SchemaField::MakeRequired(3, "ts", timestamp()),
          SchemaField::MakeRequired(4, "ts_day", timestamp()),
          SchemaField::MakeRequired(5, "id_bucket", int32()),
          SchemaField::MakeRequired(6, "id_truncate", int32()),
      },
      Schema::kInitialSchemaId);

  ICEBERG_UNWRAP_OR_FAIL(auto parsed_spec, PartitionSpecFromJson(schema, spec_json, 1));
  ICEBERG_UNWRAP_OR_FAIL(auto partition_type, parsed_spec->PartitionType(*schema));

  // Expected partition struct fields, in spec order.
  SchemaField expected_day(1000, "__ts_day", date(), true);
  SchemaField expected_bucket(1001, "__id_bucket", int32(), true);
  SchemaField expected_truncate(1002, "__id_truncate", string(), true);

  ASSERT_EQ(3, partition_type->fields().size());
  EXPECT_EQ(expected_day, partition_type->fields()[0]);
  EXPECT_EQ(expected_bucket, partition_type->fields()[1]);
  EXPECT_EQ(expected_truncate, partition_type->fields()[2]);
}
// A Day transform over a string column is rejected, while a Void transform over
// the same column is accepted.
TEST(PartitionSpecTest, InvalidTransformForType) {
  auto string_column = SchemaField::MakeRequired(6, "s", string());
  Schema string_schema({string_column}, Schema::kInitialSchemaId);

  // Day transform on a string source: expected to fail.
  PartitionField day_on_string(6, 1005, "s_day", Transform::Day());
  auto day_result = PartitionSpec::Make(string_schema, 1, {day_on_string}, false);
  EXPECT_THAT(day_result, IsError(ErrorKind::kInvalidArgument));
  EXPECT_THAT(day_result, HasErrorMessage("Invalid source type"));

  // Void transform must not error out even with an incompatible source type;
  // it is used for V1 partition specs when a field gets deleted.
  PartitionField void_on_string(6, 1006, "s_void", Transform::Void());
  auto void_result = PartitionSpec::Make(string_schema, 1, {void_on_string}, false);
  EXPECT_THAT(void_result, IsOk());
}
// A partition field whose source id does not exist in the schema is rejected.
TEST(PartitionSpecTest, SourceIdNotFound) {
  auto id_column = SchemaField::MakeRequired(1, "id", int64());
  auto ts_column = SchemaField::MakeRequired(2, "ts", timestamp());
  Schema schema({id_column, ts_column}, Schema::kInitialSchemaId);

  // Source id 99 matches neither of the two columns above.
  PartitionField dangling_field(99, 1000, "Test", Transform::Identity());
  auto make_result = PartitionSpec::Make(schema, 1, {dangling_field}, false);

  EXPECT_THAT(make_result, IsError(ErrorKind::kInvalidArgument));
  EXPECT_THAT(make_result,
              HasErrorMessage("Cannot find source column for partition field"));
}
// The allow_missing_fields flag decides whether a partition field with an unknown
// source id is an error.
TEST(PartitionSpecTest, AllowMissingFields) {
  auto id_column = SchemaField::MakeRequired(1, "id", int64());
  auto ts_column = SchemaField::MakeRequired(2, "ts", timestamp());
  Schema schema({id_column, ts_column}, Schema::kInitialSchemaId);

  // Source id 99 is not part of the schema.
  PartitionField orphan_field(99, 1000, "Test", Transform::Identity());

  // Flag off: building the spec must fail.
  auto strict_result =
      PartitionSpec::Make(schema, 1, {orphan_field}, false, std::nullopt);
  EXPECT_THAT(strict_result, IsError(ErrorKind::kInvalidArgument));
  EXPECT_THAT(strict_result,
              HasErrorMessage("Cannot find source column for partition field"));

  // Flag on: building the spec must succeed (e.g. for evolved schemas where the
  // source field was dropped).
  auto lenient_result =
      PartitionSpec::Make(schema, 1, {orphan_field}, true, std::nullopt);
  EXPECT_THAT(lenient_result, IsOk());
}
// A partition field may reference a column nested one level inside a struct.
TEST(PartitionSpecTest, PartitionFieldInStruct) {
  auto field1 = SchemaField::MakeRequired(1, "id", int64());
  auto field2 = SchemaField::MakeRequired(2, "ts", timestamp());

  // The schema's only top-level column is a struct wrapping both fields.
  auto struct_type =
      std::make_shared<StructType>(std::vector<SchemaField>{field1, field2});
  auto outer_struct = SchemaField::MakeRequired(11, "MyStruct", struct_type);
  Schema schema({outer_struct}, Schema::kInitialSchemaId);

  // Source id 1 is the nested "id" field.
  PartitionField pt_field(1, 1000, "id_partition", Transform::Identity());
  EXPECT_THAT(PartitionSpec::Make(schema, 1, {pt_field}, false), IsOk());
}
// A partition field may reference a column nested two struct levels deep.
TEST(PartitionSpecTest, PartitionFieldInStructInStruct) {
  auto id_column = SchemaField::MakeRequired(1, "id", int64());
  auto ts_column = SchemaField::MakeRequired(2, "ts", timestamp());

  // Inner struct holding the two leaf columns.
  auto inner_type =
      std::make_shared<StructType>(std::vector<SchemaField>{id_column, ts_column});
  auto inner_column = SchemaField::MakeRequired(11, "Inner", inner_type);

  // Outer struct wrapping the inner struct.
  auto outer_type = std::make_shared<StructType>(std::vector<SchemaField>{inner_column});
  SchemaField outer_column(12, "Outer", outer_type, true);
  Schema schema({outer_column}, Schema::kInitialSchemaId);

  // Source id 1 is the "id" leaf inside Outer.Inner.
  PartitionField nested_id_field(1, 1000, "id_partition", Transform::Identity());
  auto make_result = PartitionSpec::Make(schema, 1, {nested_id_field}, false);
  EXPECT_THAT(make_result, IsOk());
}
// Partitioning on a list element is rejected.
TEST(PartitionSpecTest, PartitionFieldInList) {
  auto int_list = std::make_shared<ListType>(1, int32(), /*element_required=*/false);
  auto list_column = SchemaField::MakeRequired(2, "MyList", int_list);
  Schema schema({list_column}, Schema::kInitialSchemaId);

  // Field id 1 is the list's element field.
  PartitionField element_field(1, 1000, "element_partition", Transform::Identity());
  auto make_result = PartitionSpec::Make(schema, 1, {element_field}, false);

  EXPECT_THAT(make_result, IsError(ErrorKind::kInvalidArgument));
  EXPECT_THAT(make_result, HasErrorMessage("Invalid partition field parent"));
}
// Partitioning on a field of a struct that is itself a list element is rejected.
TEST(PartitionSpecTest, PartitionFieldInStructInList) {
  // List element type: a struct with a single field "Foo" (id 1).
  auto element_struct = std::make_shared<StructType>(
      std::vector<SchemaField>{SchemaField(1, "Foo", int32(), true)});
  auto struct_list = std::make_shared<ListType>(2, element_struct,
                                                /*element_required=*/false);
  auto list_column = SchemaField::MakeRequired(3, "MyList", struct_list);
  Schema schema({list_column}, Schema::kInitialSchemaId);

  PartitionField foo_field(1, 1000, "foo_partition", Transform::Identity());
  auto make_result = PartitionSpec::Make(schema, 1, {foo_field}, false);

  EXPECT_THAT(make_result, IsError(ErrorKind::kInvalidArgument));
  EXPECT_THAT(make_result, HasErrorMessage("Invalid partition field parent"));
}
// Partitioning on either the key or the value of a map is rejected.
TEST(PartitionSpecTest, PartitionFieldInMap) {
  auto map_key = SchemaField::MakeRequired(1, "key", int32());
  auto map_value = SchemaField::MakeRequired(2, "value", int32());
  auto int_map = std::make_shared<MapType>(map_key, map_value);
  auto map_column = SchemaField::MakeRequired(3, "MyMap", int_map);
  Schema schema({map_column}, Schema::kInitialSchemaId);

  // Builds a single-field spec against `schema` and expects the parent check to fail.
  const auto expect_invalid_parent = [&schema](const PartitionField& candidate) {
    auto make_result = PartitionSpec::Make(schema, 1, {candidate}, false);
    EXPECT_THAT(make_result, IsError(ErrorKind::kInvalidArgument));
    EXPECT_THAT(make_result, HasErrorMessage("Invalid partition field parent"));
  };

  // Source id 1 is the map key, source id 2 is the map value.
  expect_invalid_parent(
      PartitionField(1, 1000, "key_partition", Transform::Identity()));
  expect_invalid_parent(
      PartitionField(2, 1001, "value_partition", Transform::Identity()));
}
// Partitioning on a field of a struct used as a map key or map value is rejected.
TEST(PartitionSpecTest, PartitionFieldInStructInMap) {
  // Key struct holds "Foo" (id 1); value struct holds "Bar" (id 2).
  auto key_struct = std::make_shared<StructType>(
      std::vector<SchemaField>{SchemaField(1, "Foo", int32(), true)});
  auto value_struct = std::make_shared<StructType>(
      std::vector<SchemaField>{SchemaField(2, "Bar", int32(), true)});

  auto map_key = SchemaField::MakeRequired(3, "key", key_struct);
  auto map_value = SchemaField::MakeRequired(4, "value", value_struct);
  auto struct_map = std::make_shared<MapType>(map_key, map_value);
  auto map_column = SchemaField::MakeRequired(5, "MyMap", struct_map);
  Schema schema({map_column}, Schema::kInitialSchemaId);

  // Builds a single-field spec against `schema` and expects the parent check to fail.
  const auto expect_invalid_parent = [&schema](const PartitionField& candidate) {
    auto make_result = PartitionSpec::Make(schema, 1, {candidate}, false);
    EXPECT_THAT(make_result, IsError(ErrorKind::kInvalidArgument));
    EXPECT_THAT(make_result, HasErrorMessage("Invalid partition field parent"));
  };

  // Field inside the key struct.
  expect_invalid_parent(
      PartitionField(1, 1000, "foo_partition", Transform::Identity()));
  // Field inside the value struct.
  expect_invalid_parent(
      PartitionField(2, 1001, "bar_partition", Transform::Identity()));
}
// Two partition fields applying the same transform to the same source column
// are reported as redundant, regardless of their names and field ids.
TEST(PartitionSpecTest, ValidateRedundantPartitionsExactDuplicates) {
  auto ts_column = SchemaField::MakeRequired(1, "ts", timestamp());
  auto id_column = SchemaField::MakeRequired(2, "id", int64());
  Schema schema({ts_column, id_column}, Schema::kInitialSchemaId);

  // Day applied twice to "ts".
  {
    PartitionField first_day(1, 1000, "ts_day_1_0", Transform::Day());
    PartitionField second_day(1, 1001, "ts_day_1_1", Transform::Day());
    auto make_result = PartitionSpec::Make(schema, 1, {first_day, second_day}, false);
    EXPECT_THAT(make_result, IsError(ErrorKind::kValidationFailed));
    EXPECT_THAT(make_result, HasErrorMessage("Cannot add redundant partition"));
    EXPECT_THAT(make_result, HasErrorMessage("conflicts with"));
  }

  // Bucket(16) applied twice to "id".
  {
    PartitionField first_bucket(2, 1000, "id_bucket_16_2_0", Transform::Bucket(16));
    PartitionField second_bucket(2, 1001, "id_bucket_16_2_1", Transform::Bucket(16));
    auto make_result =
        PartitionSpec::Make(schema, 1, {first_bucket, second_bucket}, false);
    EXPECT_THAT(make_result, IsError(ErrorKind::kValidationFailed));
    EXPECT_THAT(make_result, HasErrorMessage("Cannot add redundant partition"));
  }

  // Truncate(4) applied twice to a string column, using a dedicated schema.
  {
    auto name_column = SchemaField::MakeRequired(3, "name", string());
    Schema string_schema({name_column}, Schema::kInitialSchemaId);
    PartitionField first_truncate(3, 1000, "name_trunc_4_3_1", Transform::Truncate(4));
    PartitionField second_truncate(3, 1001, "name_trunc_4_3_2", Transform::Truncate(4));
    auto make_result =
        PartitionSpec::Make(string_schema, 1, {first_truncate, second_truncate}, false);
    EXPECT_THAT(make_result, IsError(ErrorKind::kValidationFailed));
    EXPECT_THAT(make_result, HasErrorMessage("Cannot add redundant partition"));
  }
}
// Combinations of partition fields that must NOT be flagged as redundant.
TEST(PartitionSpecTest, ValidateRedundantPartitionsAllowedCases) {
  auto ts_column = SchemaField::MakeRequired(1, "ts", timestamp());
  auto id_column = SchemaField::MakeRequired(2, "id", int64());
  auto name_column = SchemaField::MakeRequired(3, "name", string());
  Schema schema({ts_column, id_column, name_column}, Schema::kInitialSchemaId);

  // Same source column, bucket transforms with different bucket counts.
  {
    PartitionField bucket_16(2, 1000, "id_bucket_16_2", Transform::Bucket(16));
    PartitionField bucket_32(2, 1001, "id_bucket_32_2", Transform::Bucket(32));
    auto make_result = PartitionSpec::Make(schema, 1, {bucket_16, bucket_32}, false);
    EXPECT_THAT(make_result, IsOk());
  }

  // Same source column, truncate transforms with different widths.
  {
    PartitionField truncate_4(3, 1000, "name_trunc_4_3", Transform::Truncate(4));
    PartitionField truncate_8(3, 1001, "name_trunc_8_3", Transform::Truncate(8));
    auto make_result = PartitionSpec::Make(schema, 1, {truncate_4, truncate_8}, false);
    EXPECT_THAT(make_result, IsOk());
  }

  // Transforms applied to different source columns (Day on "ts", Bucket on "id").
  {
    PartitionField day_on_ts(1, 1000, "ts_day_1", Transform::Day());
    PartitionField bucket_on_id(2, 1001, "id_bucket_2", Transform::Bucket(16));
    auto make_result = PartitionSpec::Make(schema, 1, {day_on_ts, bucket_on_id}, false);
    EXPECT_THAT(make_result, IsOk());
  }

  // Same source column, different transforms (Day and Month on "ts").
  {
    PartitionField day_on_ts(1, 1000, "ts_day_1", Transform::Day());
    PartitionField month_on_ts(1, 1001, "ts_month_1", Transform::Month());
    // Allowed because Day and Month have different dedup names; the Java logic
    // only checks for exact dedup name matches.
    auto make_result = PartitionSpec::Make(schema, 1, {day_on_ts, month_on_ts}, false);
    EXPECT_THAT(make_result, IsOk());
  }

  // A lone partition field cannot be redundant with anything.
  {
    PartitionField year_on_ts(1, 1000, "ts_year_1", Transform::Year());
    auto make_result = PartitionSpec::Make(schema, 1, {year_on_ts}, false);
    EXPECT_THAT(make_result, IsOk());
  }
}
// Redundancy rules for identity transforms: twice on one column is rejected,
// once each on two different columns is accepted.
TEST(PartitionSpecTest, ValidateRedundantPartitionsIdentityTransforms) {
  auto id_column = SchemaField::MakeRequired(1, "id", int64());
  auto name_column = SchemaField::MakeRequired(2, "name", string());
  Schema schema({id_column, name_column}, Schema::kInitialSchemaId);

  // Identity applied twice to "id".
  {
    PartitionField first_identity(1, 1000, "id_1_0", Transform::Identity());
    PartitionField second_identity(1, 1001, "id_1_1", Transform::Identity());
    auto make_result =
        PartitionSpec::Make(schema, 1, {first_identity, second_identity}, false);
    EXPECT_THAT(make_result, IsError(ErrorKind::kValidationFailed));
    EXPECT_THAT(make_result, HasErrorMessage("Cannot add redundant partition"));
  }

  // Identity applied once to "id" and once to "name".
  {
    PartitionField identity_on_id(1, 1000, "id_1", Transform::Identity());
    PartitionField identity_on_name(2, 1001, "name_2", Transform::Identity());
    auto make_result =
        PartitionSpec::Make(schema, 1, {identity_on_id, identity_on_name}, false);
    EXPECT_THAT(make_result, IsOk());
  }
}
// Exercises PartitionSpec::PartitionPath(): value-count mismatch, a plain path,
// and a path whose string value contains a character that gets escaped.
TEST(PartitionSpecTest, PartitionPath) {
  // Source schema with three columns.
  auto id_column = SchemaField::MakeRequired(1, "id", int64());
  auto name_column = SchemaField::MakeRequired(2, "name", string());
  auto ts_column = SchemaField::MakeRequired(3, "ts", timestamp());
  Schema schema({id_column, name_column, ts_column}, Schema::kInitialSchemaId);

  // One partition field per column: identity, identity, day.
  PartitionField id_part(1, 1000, "id_partition", Transform::Identity());
  PartitionField name_part(2, 1001, "name_partition", Transform::Identity());
  PartitionField ts_part(3, 1002, "ts_partition", Transform::Day());

  ICEBERG_UNWRAP_OR_FAIL(
      auto spec, PartitionSpec::Make(schema, 1, {id_part, name_part, ts_part}, false));

  // A single value for a three-field spec is a mismatch.
  {
    PartitionValues too_few({Literal::Int(123)});
    auto path_result = spec->PartitionPath(too_few);
    EXPECT_THAT(path_result, IsError(ErrorKind::kInvalidArgument));
    EXPECT_THAT(path_result, HasErrorMessage("Partition spec and data mismatch"));
  }

  // One value per partition field, no special characters.
  {
    PartitionValues plain_values(
        {Literal::Int(123), Literal::String("val2"), Literal::Date(19489)});
    ICEBERG_UNWRAP_OR_FAIL(auto path, spec->PartitionPath(plain_values));
    std::string expected = "id_partition=123/name_partition=val2/ts_partition=2023-05-12";
    EXPECT_EQ(expected, path);
  }

  // The '#' in the string value is expected to appear as "%23" in the path.
  {
    PartitionValues special_values(
        {Literal::Int(123), Literal::String("val#2"), Literal::Date(19489)});
    ICEBERG_UNWRAP_OR_FAIL(auto path, spec->PartitionPath(special_values));
    std::string expected =
        "id_partition=123/name_partition=val%232/ts_partition=2023-05-12";
    EXPECT_EQ(expected, path);
  }
}
} // namespace iceberg