blob: de128cef19549704f90463e5a9e9a7e4c261be91 [file] [log] [blame]
/*
* Copyright 2024-present Alibaba Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "paimon/common/data/blob_utils.h"
#include "arrow/api.h"
#include "arrow/c/bridge.h"
#include "gtest/gtest.h"
#include "paimon/common/data/blob_defs.h"
#include "paimon/data/blob.h"
#include "paimon/testing/utils/testharness.h"
namespace paimon::test {
class BlobUtilsTest : public ::testing::Test {
private:
std::shared_ptr<arrow::KeyValueMetadata> CreateBlobMetadata() {
std::unordered_map<std::string, std::string> blob_metadata_map = {
{BLOB_EXTENSION_TYPE_KEY, BLOB_EXTENSION_TYPE_VALUE}};
return std::make_shared<arrow::KeyValueMetadata>(blob_metadata_map);
}
};
// Exercises BlobUtils::IsBlobMetadata with four inputs: metadata built by the fixture
// helper, a null pointer, metadata whose extension-type value differs, and metadata
// that does not contain the extension-type key at all.
TEST_F(BlobUtilsTest, IsBlobMetadata) {
    using StringMap = std::unordered_map<std::string, std::string>;

    // Metadata produced by the fixture helper is accepted.
    EXPECT_TRUE(BlobUtils::IsBlobMetadata(CreateBlobMetadata()));

    // A null metadata pointer is rejected.
    EXPECT_FALSE(BlobUtils::IsBlobMetadata(nullptr));

    // Right key, but the value names a different type.
    auto mismatched_value = std::make_shared<arrow::KeyValueMetadata>(
        StringMap{{BLOB_EXTENSION_TYPE_KEY, "paimon.type.varchar"}});
    EXPECT_FALSE(BlobUtils::IsBlobMetadata(mismatched_value));

    // Right value, but stored under an unrelated key.
    auto missing_key = std::make_shared<arrow::KeyValueMetadata>(
        StringMap{{"other_key", BLOB_EXTENSION_TYPE_VALUE}});
    EXPECT_FALSE(BlobUtils::IsBlobMetadata(missing_key));
}
// Exercises BlobUtils::IsBlobField with a field built by BlobUtils::ToArrowField, an
// int32 field, a large_binary field without metadata, and a large_binary field whose
// metadata holds only an unrelated key.
TEST_F(BlobUtilsTest, IsBlobField) {
    // Field created through BlobUtils::ToArrowField is expected to be a blob field.
    std::shared_ptr<arrow::Field> from_blob_utils = BlobUtils::ToArrowField("f1", true);
    EXPECT_TRUE(BlobUtils::IsBlobField(from_blob_utils));

    // Plain int32 field.
    auto plain_int = arrow::field("i_int", arrow::int32());
    EXPECT_FALSE(BlobUtils::IsBlobField(plain_int));

    // large_binary field carrying no metadata.
    auto bare_binary = arrow::field("b_no_meta", arrow::large_binary());
    EXPECT_FALSE(BlobUtils::IsBlobField(bare_binary));

    // large_binary field whose metadata has only an unrelated entry.
    using StringMap = std::unordered_map<std::string, std::string>;
    auto unrelated_meta =
        std::make_shared<arrow::KeyValueMetadata>(StringMap{{"other_key", "value"}});
    auto binary_with_unrelated_meta =
        arrow::field("b_wrong_meta", arrow::large_binary(), false, unrelated_meta);
    EXPECT_FALSE(BlobUtils::IsBlobField(binary_with_unrelated_meta));
}
// Exercises BlobUtils::SeparateBlobSchema on three schemas: one mixing ordinary and
// blob fields, one with no blob field, and one made of a blob field only.
TEST_F(BlobUtilsTest, SeparateBlobSchema) {
    auto id_field = arrow::field("f1_int", arrow::int32());
    auto text_field = arrow::field("f2_string", arrow::utf8());
    std::shared_ptr<arrow::Field> blob_field = BlobUtils::ToArrowField("f3_blob_1", true);

    // Mixed schema: ordinary fields end up in main_schema, the blob field in blob_schema.
    {
        auto mixed = arrow::schema({id_field, text_field, blob_field});
        auto expected_main = arrow::schema({id_field, text_field});
        auto expected_blob = arrow::schema({blob_field});

        BlobUtils::SeparatedSchemas result = BlobUtils::SeparateBlobSchema(mixed);
        ASSERT_TRUE(result.main_schema->Equals(*expected_main));
        ASSERT_TRUE(result.blob_schema->Equals(*expected_blob));
    }

    // No blob field: main_schema equals the input and blob_schema has no fields.
    {
        auto without_blob = arrow::schema({id_field, text_field});

        BlobUtils::SeparatedSchemas result = BlobUtils::SeparateBlobSchema(without_blob);
        ASSERT_TRUE(result.main_schema->Equals(*without_blob));
        ASSERT_EQ(result.blob_schema->num_fields(), 0);
    }

    // Blob field only: blob_schema equals the input and main_schema has no fields.
    {
        auto blob_only = arrow::schema({blob_field});

        BlobUtils::SeparatedSchemas result = BlobUtils::SeparateBlobSchema(blob_only);
        ASSERT_TRUE(result.blob_schema->Equals(*blob_only));
        ASSERT_EQ(result.main_schema->num_fields(), 0);
    }
}
// Exercises BlobUtils::SeparateBlobArray on a three-row struct array laid out as
// (int32, blob, utf8) and checks the type, child count and child contents of both the
// main array and the blob array it returns.
TEST_F(BlobUtilsTest, SeparateBlobArray) {
    // Column layout: the blob column sits between two ordinary columns.
    auto id_field = arrow::field("f1_int", arrow::int32());
    std::shared_ptr<arrow::Field> blob_field = BlobUtils::ToArrowField("f2_blob", false);
    auto text_field = arrow::field("f3_string", arrow::utf8());
    auto input_schema = arrow::schema({id_field, blob_field, text_field});

    // int32 column: 1, 2, 3.
    arrow::Int32Builder id_builder;
    ASSERT_TRUE(id_builder.AppendValues({1, 2, 3}).ok());
    auto id_column = id_builder.Finish().ValueOrDie();

    // utf8 column: "a", "b", "c".
    arrow::StringBuilder text_builder;
    ASSERT_TRUE(text_builder.AppendValues({"a", "b", "c"}).ok());
    auto text_column = text_builder.Finish().ValueOrDie();

    // large_binary column: three one-byte payloads.
    arrow::LargeBinaryBuilder blob_builder;
    for (const char* payload : {"1", "2", "3"}) {
        ASSERT_TRUE(blob_builder.Append(payload, 1).ok());
    }
    auto blob_column = blob_builder.Finish().ValueOrDie();

    // Assemble the input struct array in schema order.
    std::shared_ptr<arrow::StructArray> input_array = std::static_pointer_cast<arrow::StructArray>(
        arrow::StructArray::Make({id_column, blob_column, text_column}, input_schema->fields())
            .ValueOrDie());

    ASSERT_OK_AND_ASSIGN(auto separated, BlobUtils::SeparateBlobArray(input_array));

    // Main array keeps the two ordinary columns, in their original relative order.
    std::shared_ptr<arrow::DataType> main_type = arrow::struct_({id_field, text_field});
    ASSERT_TRUE(separated.main_array->type()->Equals(*main_type));
    ASSERT_EQ(separated.main_array->num_fields(), 2);
    ASSERT_TRUE(separated.main_array->field(0)->Equals(*id_column));
    ASSERT_TRUE(separated.main_array->field(1)->Equals(*text_column));

    // Blob array holds only the blob column.
    std::shared_ptr<arrow::DataType> blob_type = arrow::struct_({blob_field});
    ASSERT_TRUE(separated.blob_array->type()->Equals(*blob_type));
    ASSERT_EQ(separated.blob_array->num_fields(), 1);
    ASSERT_TRUE(separated.blob_array->field(0)->Equals(*blob_column));
}
} // namespace paimon::test