blob: 60539adfa47537b956bae39c8cd06b4597fd2238 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#include "iceberg/util/data_file_set.h"
#include <gtest/gtest.h>
#include "iceberg/file_format.h"
#include "iceberg/manifest/manifest_entry.h"
#include "iceberg/row/partition_values.h"
namespace iceberg {
class DataFileSetTest : public ::testing::Test {
protected:
std::shared_ptr<DataFile> CreateDataFile(const std::string& path, int64_t size = 100) {
auto file = std::make_shared<DataFile>();
file->file_path = path;
file->file_format = FileFormatType::kParquet;
file->file_size_in_bytes = size;
file->record_count = 10;
file->content = DataFile::Content::kData;
return file;
}
};
TEST_F(DataFileSetTest, EmptySet) {
DataFileSet set;
EXPECT_TRUE(set.empty());
EXPECT_EQ(set.size(), 0);
EXPECT_EQ(set.begin(), set.end());
EXPECT_TRUE(set.as_span().empty());
}
TEST_F(DataFileSetTest, InsertSingleFile) {
DataFileSet set;
auto file = CreateDataFile("/path/to/file.parquet");
auto [iter, inserted] = set.insert(file);
EXPECT_TRUE(inserted);
EXPECT_EQ(*iter, file);
EXPECT_FALSE(set.empty());
EXPECT_EQ(set.size(), 1);
}
TEST_F(DataFileSetTest, InsertDuplicateFile) {
DataFileSet set;
auto file1 = CreateDataFile("/path/to/file.parquet");
auto file2 = CreateDataFile("/path/to/file.parquet"); // Same path
auto [iter1, inserted1] = set.insert(file1);
EXPECT_TRUE(inserted1);
auto [iter2, inserted2] = set.insert(file2);
EXPECT_FALSE(inserted2);
EXPECT_EQ(iter1, iter2); // Should point to the same element
EXPECT_EQ(set.size(), 1); // Should still be size 1
}
TEST_F(DataFileSetTest, InsertDifferentFiles) {
DataFileSet set;
auto file1 = CreateDataFile("/path/to/file1.parquet");
auto file2 = CreateDataFile("/path/to/file2.parquet");
auto file3 = CreateDataFile("/path/to/file3.parquet");
set.insert(file1);
set.insert(file2);
set.insert(file3);
EXPECT_EQ(set.size(), 3);
EXPECT_FALSE(set.empty());
}
TEST_F(DataFileSetTest, InsertionOrderPreserved) {
DataFileSet set;
auto file1 = CreateDataFile("/path/to/file1.parquet");
auto file2 = CreateDataFile("/path/to/file2.parquet");
auto file3 = CreateDataFile("/path/to/file3.parquet");
set.insert(file1);
set.insert(file2);
set.insert(file3);
// Iterate and verify order
std::vector<std::string> paths;
for (const auto& file : set) {
paths.push_back(file->file_path);
}
EXPECT_EQ(paths.size(), 3);
EXPECT_EQ(paths[0], "/path/to/file1.parquet");
EXPECT_EQ(paths[1], "/path/to/file2.parquet");
EXPECT_EQ(paths[2], "/path/to/file3.parquet");
}
TEST_F(DataFileSetTest, AsSpan) {
DataFileSet set;
EXPECT_TRUE(set.as_span().empty());
// Single element
auto file0 = CreateDataFile("/path/to/file0.parquet");
set.insert(file0);
{
auto span = set.as_span();
EXPECT_EQ(span.size(), 1);
EXPECT_EQ(span[0]->file_path, "/path/to/file0.parquet");
EXPECT_EQ(span[0], file0); // Same pointer, span is a view
}
// Multiple elements
auto file1 = CreateDataFile("/path/to/file1.parquet");
auto file2 = CreateDataFile("/path/to/file2.parquet");
set.insert(file1);
set.insert(file2);
auto span = set.as_span();
EXPECT_EQ(span.size(), 3);
EXPECT_EQ(span[0]->file_path, "/path/to/file0.parquet");
EXPECT_EQ(span[1]->file_path, "/path/to/file1.parquet");
EXPECT_EQ(span[2]->file_path, "/path/to/file2.parquet");
// Span matches set iteration order and identity
size_t i = 0;
for (const auto& file : set) {
EXPECT_EQ(span[i], file) << "Span element " << i << " should match set iterator";
++i;
}
EXPECT_EQ(i, span.size());
// Span works with range-for
i = 0;
for (const auto& file : span) {
EXPECT_EQ(file->file_path, span[i]->file_path);
++i;
}
EXPECT_EQ(i, 3);
set.clear();
EXPECT_TRUE(set.as_span().empty());
}
TEST_F(DataFileSetTest, InsertDuplicatePreservesOrder) {
DataFileSet set;
auto file1 = CreateDataFile("/path/to/file1.parquet");
auto file2 = CreateDataFile("/path/to/file2.parquet");
auto file3 = CreateDataFile("/path/to/file1.parquet"); // Duplicate of file1
set.insert(file1);
set.insert(file2);
set.insert(file3); // Should not insert, but order should be preserved
EXPECT_EQ(set.size(), 2);
std::vector<std::string> paths;
for (const auto& file : set) {
paths.push_back(file->file_path);
}
EXPECT_EQ(paths[0], "/path/to/file1.parquet");
EXPECT_EQ(paths[1], "/path/to/file2.parquet");
}
TEST_F(DataFileSetTest, InsertNullFile) {
DataFileSet set;
std::shared_ptr<DataFile> null_file = nullptr;
auto [iter, inserted] = set.insert(null_file);
EXPECT_FALSE(inserted);
EXPECT_EQ(iter, set.end());
EXPECT_TRUE(set.empty());
EXPECT_EQ(set.size(), 0);
}
TEST_F(DataFileSetTest, InsertMoveSemantics) {
DataFileSet set;
auto file1 = CreateDataFile("/path/to/file1.parquet");
auto file2 = CreateDataFile("/path/to/file2.parquet");
// Insert using move
auto [iter1, inserted1] = set.insert(std::move(file1));
EXPECT_TRUE(inserted1);
EXPECT_EQ(file1, nullptr); // Should be moved
// Insert using copy
auto [iter2, inserted2] = set.insert(file2);
EXPECT_TRUE(inserted2);
EXPECT_NE(file2, nullptr); // Should still be valid
EXPECT_EQ(set.size(), 2);
}
TEST_F(DataFileSetTest, Clear) {
DataFileSet set;
set.insert(CreateDataFile("/path/to/file1.parquet"));
set.insert(CreateDataFile("/path/to/file2.parquet"));
EXPECT_EQ(set.size(), 2);
set.clear();
EXPECT_TRUE(set.empty());
EXPECT_EQ(set.size(), 0);
EXPECT_EQ(set.begin(), set.end());
}
TEST_F(DataFileSetTest, IteratorOperations) {
DataFileSet set;
auto file1 = CreateDataFile("/path/to/file1.parquet");
auto file2 = CreateDataFile("/path/to/file2.parquet");
auto file3 = CreateDataFile("/path/to/file3.parquet");
set.insert(file1);
set.insert(file2);
set.insert(file3);
// Test const iterators
const auto& const_set = set;
EXPECT_NE(const_set.begin(), const_set.end());
EXPECT_NE(const_set.cbegin(), const_set.cend());
// Test iterator increment
auto it = set.begin();
EXPECT_EQ((*it)->file_path, "/path/to/file1.parquet");
++it;
EXPECT_EQ((*it)->file_path, "/path/to/file2.parquet");
++it;
EXPECT_EQ((*it)->file_path, "/path/to/file3.parquet");
++it;
EXPECT_EQ(it, set.end());
}
TEST_F(DataFileSetTest, RangeBasedForLoop) {
DataFileSet set;
set.insert(CreateDataFile("/path/to/file1.parquet"));
set.insert(CreateDataFile("/path/to/file2.parquet"));
set.insert(CreateDataFile("/path/to/file3.parquet"));
int count = 0;
for (const auto& file : set) {
EXPECT_NE(file, nullptr);
++count;
}
EXPECT_EQ(count, 3);
}
TEST_F(DataFileSetTest, CaseSensitivePaths) {
DataFileSet set;
auto file1 = CreateDataFile("/path/to/file.parquet");
auto file2 = CreateDataFile("/path/to/FILE.parquet"); // Different case
set.insert(file1);
set.insert(file2);
// Should be treated as different files
EXPECT_EQ(set.size(), 2);
}
TEST_F(DataFileSetTest, MultipleInsertsSameFile) {
DataFileSet set;
auto file = CreateDataFile("/path/to/file.parquet");
// Insert the same file multiple times
set.insert(file);
set.insert(file);
set.insert(file);
EXPECT_EQ(set.size(), 1);
}
} // namespace iceberg