// blob: 7b9d4f582623a668845b712411797e4df3d44322 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#include "iceberg/update/update_statistics.h"
#include <algorithm>
#include <memory>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "iceberg/result.h"
#include "iceberg/statistics_file.h"
#include "iceberg/test/matchers.h"
#include "iceberg/test/update_test_base.h"
namespace iceberg {
class UpdateStatisticsTest : public UpdateTestBase {
protected:
// Helper function to create a statistics file
std::shared_ptr<StatisticsFile> MakeStatisticsFile(int64_t snapshot_id,
const std::string& path,
int64_t file_size = 1024,
int64_t footer_size = 128) {
auto stats_file = std::make_shared<StatisticsFile>();
stats_file->snapshot_id = snapshot_id;
stats_file->path = path;
stats_file->file_size_in_bytes = file_size;
stats_file->file_footer_size_in_bytes = footer_size;
BlobMetadata blob;
blob.type = "apache-datasketches-theta-v1";
blob.source_snapshot_id = snapshot_id;
blob.source_snapshot_sequence_number = 1;
blob.fields = {1, 2};
blob.properties = {{"ndv", "100"}};
stats_file->blob_metadata.push_back(blob);
return stats_file;
}
// Helper to find statistics file by snapshot_id in the result vector
std::shared_ptr<StatisticsFile> FindStatistics(
const std::vector<std::pair<int64_t, std::shared_ptr<StatisticsFile>>>& to_set,
int64_t snapshot_id) {
auto it = std::ranges::find_if(
to_set, [snapshot_id](const auto& p) { return p.first == snapshot_id; });
return it != to_set.end() ? it->second : nullptr;
}
};
// Applying an update with no changes is expected to produce nothing to set and
// nothing to remove.
TEST_F(UpdateStatisticsTest, EmptyUpdate) {
  ICEBERG_UNWRAP_OR_FAIL(auto pending, table_->NewUpdateStatistics());

  ICEBERG_UNWRAP_OR_FAIL(auto applied, pending->Apply());

  EXPECT_THAT(applied.to_set, ::testing::IsEmpty());
  EXPECT_THAT(applied.to_remove, ::testing::IsEmpty());
}
// Setting one statistics file is expected to yield exactly that file under its
// snapshot id, with nothing scheduled for removal.
TEST_F(UpdateStatisticsTest, SetStatistics) {
  ICEBERG_UNWRAP_OR_FAIL(auto pending, table_->NewUpdateStatistics());
  auto file = MakeStatisticsFile(1, "/warehouse/test_table/metadata/stats-1.puffin");

  pending->SetStatistics(file);

  ICEBERG_UNWRAP_OR_FAIL(auto applied, pending->Apply());
  EXPECT_THAT(applied.to_set, ::testing::SizeIs(1));
  EXPECT_THAT(applied.to_remove, ::testing::IsEmpty());
  EXPECT_EQ(FindStatistics(applied.to_set, 1), file);
}
// Statistics set for two different snapshots in one chained call are both
// expected in the applied result, each under its own snapshot id.
TEST_F(UpdateStatisticsTest, SetMultipleStatistics) {
  ICEBERG_UNWRAP_OR_FAIL(auto pending, table_->NewUpdateStatistics());
  auto first = MakeStatisticsFile(1, "/warehouse/test_table/metadata/stats-1.puffin");
  auto second = MakeStatisticsFile(2, "/warehouse/test_table/metadata/stats-2.puffin");

  // Kept as a single fluent chain so the returned reference is exercised too.
  pending->SetStatistics(first)
      .SetStatistics(second);

  ICEBERG_UNWRAP_OR_FAIL(auto applied, pending->Apply());
  EXPECT_THAT(applied.to_set, ::testing::SizeIs(2));
  EXPECT_THAT(applied.to_remove, ::testing::IsEmpty());
  EXPECT_EQ(FindStatistics(applied.to_set, 1), first);
  EXPECT_EQ(FindStatistics(applied.to_set, 2), second);
}
// Removing statistics for one snapshot is expected to list only that snapshot id
// in `to_remove` and leave `to_set` empty.
TEST_F(UpdateStatisticsTest, RemoveStatistics) {
  ICEBERG_UNWRAP_OR_FAIL(auto pending, table_->NewUpdateStatistics());

  pending->RemoveStatistics(1);

  ICEBERG_UNWRAP_OR_FAIL(auto applied, pending->Apply());
  EXPECT_THAT(applied.to_set, ::testing::IsEmpty());
  EXPECT_THAT(applied.to_remove, ::testing::SizeIs(1));
  EXPECT_THAT(applied.to_remove, ::testing::Contains(1));
}
// Three chained removals are expected to produce exactly those three snapshot
// ids in `to_remove`, in any order.
TEST_F(UpdateStatisticsTest, RemoveMultipleStatistics) {
  ICEBERG_UNWRAP_OR_FAIL(auto pending, table_->NewUpdateStatistics());

  // Kept as a single fluent chain so the returned reference is exercised too.
  pending->RemoveStatistics(1)
      .RemoveStatistics(2)
      .RemoveStatistics(3);

  ICEBERG_UNWRAP_OR_FAIL(auto applied, pending->Apply());
  EXPECT_THAT(applied.to_set, ::testing::IsEmpty());
  EXPECT_THAT(applied.to_remove, ::testing::SizeIs(3));
  EXPECT_THAT(applied.to_remove, ::testing::UnorderedElementsAre(1, 2, 3));
}
// A set for snapshot 1 combined with a removal for snapshot 2 is expected to
// show up independently: one entry in `to_set`, one id in `to_remove`.
TEST_F(UpdateStatisticsTest, SetAndRemoveDifferentSnapshots) {
  ICEBERG_UNWRAP_OR_FAIL(auto pending, table_->NewUpdateStatistics());
  auto file = MakeStatisticsFile(1, "/warehouse/test_table/metadata/stats-1.puffin");

  pending->SetStatistics(file)
      .RemoveStatistics(2);

  ICEBERG_UNWRAP_OR_FAIL(auto applied, pending->Apply());
  EXPECT_THAT(applied.to_set, ::testing::SizeIs(1));
  EXPECT_EQ(FindStatistics(applied.to_set, 1), file);
  EXPECT_THAT(applied.to_remove, ::testing::SizeIs(1));
  EXPECT_THAT(applied.to_remove, ::testing::Contains(2));
}
// Setting statistics twice for the same snapshot is expected to keep only the
// most recently set file.
TEST_F(UpdateStatisticsTest, ReplaceStatistics) {
  ICEBERG_UNWRAP_OR_FAIL(auto pending, table_->NewUpdateStatistics());
  auto original_file =
      MakeStatisticsFile(1, "/warehouse/test_table/metadata/stats-1a.puffin");
  auto replacement_file =
      MakeStatisticsFile(1, "/warehouse/test_table/metadata/stats-1b.puffin", 2048, 256);

  // Both files target snapshot 1; the second call replaces the first.
  pending->SetStatistics(original_file)
      .SetStatistics(replacement_file);

  ICEBERG_UNWRAP_OR_FAIL(auto applied, pending->Apply());
  EXPECT_THAT(applied.to_set, ::testing::SizeIs(1));
  EXPECT_THAT(applied.to_remove, ::testing::IsEmpty());

  // The surviving entry must be the replacement, not the original.
  EXPECT_EQ(FindStatistics(applied.to_set, 1), replacement_file);
  EXPECT_NE(FindStatistics(applied.to_set, 1), original_file);
}
// A set followed by a removal for the same snapshot is expected to leave only
// the removal in the applied result.
TEST_F(UpdateStatisticsTest, SetThenRemoveSameSnapshot) {
  ICEBERG_UNWRAP_OR_FAIL(auto pending, table_->NewUpdateStatistics());
  auto file = MakeStatisticsFile(1, "/warehouse/test_table/metadata/stats-1.puffin");

  // Snapshot 1 is first given statistics, then has them removed.
  pending->SetStatistics(file)
      .RemoveStatistics(1);

  ICEBERG_UNWRAP_OR_FAIL(auto applied, pending->Apply());
  EXPECT_THAT(applied.to_set, ::testing::IsEmpty());
  EXPECT_THAT(applied.to_remove, ::testing::SizeIs(1));
  EXPECT_THAT(applied.to_remove, ::testing::Contains(1));
}
// A removal followed by a set for the same snapshot is expected to leave only
// the newly set file in the applied result.
TEST_F(UpdateStatisticsTest, RemoveThenSetSameSnapshot) {
  ICEBERG_UNWRAP_OR_FAIL(auto pending, table_->NewUpdateStatistics());
  auto file = MakeStatisticsFile(1, "/warehouse/test_table/metadata/stats-1.puffin");

  // Snapshot 1 is first marked for removal, then given new statistics.
  pending->RemoveStatistics(1)
      .SetStatistics(file);

  ICEBERG_UNWRAP_OR_FAIL(auto applied, pending->Apply());
  EXPECT_THAT(applied.to_set, ::testing::SizeIs(1));
  EXPECT_THAT(applied.to_remove, ::testing::IsEmpty());
  EXPECT_EQ(FindStatistics(applied.to_set, 1), file);
}
// Passing a null statistics file is expected to make Apply() return a
// validation error with the message checked below.
TEST_F(UpdateStatisticsTest, SetNullStatistics) {
  ICEBERG_UNWRAP_OR_FAIL(auto pending, table_->NewUpdateStatistics());

  pending->SetStatistics(nullptr);

  // Apply() is not unwrapped here because the error itself is under test.
  auto outcome = pending->Apply();
  EXPECT_THAT(outcome, IsError(ErrorKind::kValidationFailed));
  EXPECT_THAT(outcome, HasErrorMessage("Statistics file cannot be null"));
}
// Commits an empty update, then commits a statistics file for the table's
// current snapshot, and checks that a freshly loaded table exposes exactly that
// statistics file in its metadata.
TEST_F(UpdateStatisticsTest, CommitSuccess) {
  // Test empty commit
  ICEBERG_UNWRAP_OR_FAIL(auto empty_update, table_->NewUpdateStatistics());
  EXPECT_THAT(empty_update->Commit(), IsOk());

  // Reload table after first commit
  ICEBERG_UNWRAP_OR_FAIL(auto reloaded, catalog_->LoadTable(table_ident_));

  // Get a valid snapshot ID from the table
  ICEBERG_UNWRAP_OR_FAIL(auto current_snapshot, reloaded->current_snapshot());
  int64_t snapshot_id = current_snapshot->snapshot_id;

  // Test commit with statistics changes
  ICEBERG_UNWRAP_OR_FAIL(auto update, reloaded->NewUpdateStatistics());
  auto stats_file =
      MakeStatisticsFile(snapshot_id, "/warehouse/test_table/metadata/stats-1.puffin");
  update->SetStatistics(stats_file);
  EXPECT_THAT(update->Commit(), IsOk());

  // Verify the statistics were committed and persisted
  ICEBERG_UNWRAP_OR_FAIL(auto final_table, catalog_->LoadTable(table_ident_));
  const auto& statistics = final_table->metadata()->statistics;

  // Fatal assertions: the element access and dereference below would be
  // undefined behaviour if the list were empty or its entry were null, so the
  // test must stop here rather than continue after a non-fatal failure.
  ASSERT_EQ(statistics.size(), 1u);
  ASSERT_NE(statistics[0], nullptr);
  EXPECT_EQ(*statistics[0], *stats_file);
}
} // namespace iceberg