blob: 7ccbaa2cc1721bf2723c4313055dcec1c2ad3e58 [file]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cstdint>
#include <map>
#include <memory>
#include <optional>
#include <string>
#include <utility>
#include <vector>
#include "arrow/api.h"
#include "arrow/array/array_base.h"
#include "arrow/array/array_nested.h"
#include "arrow/c/abi.h"
#include "arrow/c/bridge.h"
#include "arrow/ipc/json_simple.h"
#include "gtest/gtest.h"
#include "paimon/common/utils/arrow/arrow_input_stream_adapter.h"
#include "paimon/common/utils/arrow/mem_utils.h"
#include "paimon/common/utils/decimal_utils.h"
#include "paimon/data/decimal.h"
#include "paimon/data/timestamp.h"
#include "paimon/defs.h"
#include "paimon/format/parquet/parquet_file_batch_reader.h"
#include "paimon/format/parquet/parquet_format_defs.h"
#include "paimon/format/parquet/parquet_format_writer.h"
#include "paimon/fs/file_system.h"
#include "paimon/memory/memory_pool.h"
#include "paimon/predicate/literal.h"
#include "paimon/predicate/predicate_builder.h"
#include "paimon/result.h"
#include "paimon/testing/utils/read_result_collector.h"
#include "paimon/testing/utils/testharness.h"
#include "parquet/properties.h"
namespace paimon {
class Predicate;
} // namespace paimon
namespace paimon::parquet::test {
class PredicatePushdownTest : public ::testing::Test {
public:
void SetUp() override {
pool_ = GetDefaultPool();
arrow_pool_ = GetArrowPool(pool_);
batch_size_ = 10;
arrow::FieldVector fields = {
arrow::field("f0", arrow::utf8()), arrow::field("f1", arrow::float32()),
arrow::field("f2", arrow::int64()), arrow::field("f3", arrow::boolean()),
arrow::field("f4", arrow::int64()), arrow::field("f5", arrow::binary())};
struct_array_ = std::dynamic_pointer_cast<arrow::StructArray>(
arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), R"([
["apple", 4.0, 4, true, null, "add"], ["banana", 4.0, 6, true, null, "bad"],
["camera", 4.0, 8, true, null, "cat"], ["data", null, 10, true, null, "dad"]
])")
.ValueOrDie());
dir_ = paimon::test::UniqueTestDirectory::Create();
ASSERT_TRUE(dir_);
file_name_ = dir_->Str() + "/test.data";
fs_ = dir_->GetFileSystem();
}
void TearDown() override {}
void PrepareTestData(const std::shared_ptr<arrow::StructArray>& struct_array) {
auto data_type = struct_array->struct_type();
auto data_schema = arrow::schema(data_type->fields());
auto data_arrow_array = std::make_unique<ArrowArray>();
ASSERT_TRUE(arrow::ExportArray(*struct_array, data_arrow_array.get()).ok());
ASSERT_OK_AND_ASSIGN(std::shared_ptr<OutputStream> out,
fs_->Create(file_name_, /*overwrite=*/false));
::parquet::WriterProperties::Builder builder;
builder.write_batch_size(batch_size_);
auto writer_properties = builder.build();
ASSERT_OK_AND_ASSIGN(
auto format_writer,
ParquetFormatWriter::Create(out, data_schema, writer_properties,
DEFAULT_PARQUET_WRITER_MAX_MEMORY_USE, arrow_pool_));
ASSERT_OK(format_writer->AddBatch(data_arrow_array.get()));
ASSERT_OK(format_writer->Finish());
ASSERT_OK(out->Close());
}
void CheckResult(const std::shared_ptr<arrow::Schema>& read_schema,
const std::shared_ptr<Predicate>& predicate,
const std::shared_ptr<arrow::Array>& expected_array,
uint32_t predicate_node_count_limit =
paimon::parquet::DEFAULT_PARQUET_READ_PREDICATE_NODE_COUNT_LIMIT) {
ASSERT_OK_AND_ASSIGN(std::shared_ptr<InputStream> in, fs_->Open(file_name_));
ASSERT_OK_AND_ASSIGN(uint64_t length, in->Length());
auto in_stream = std::make_shared<ArrowInputStreamAdapter>(in, arrow_pool_, length);
std::map<std::string, std::string> options;
options[paimon::parquet::PARQUET_READ_PREDICATE_NODE_COUNT_LIMIT] =
std::to_string(predicate_node_count_limit);
ASSERT_OK_AND_ASSIGN(auto batch_reader,
ParquetFileBatchReader::Create(std::move(in_stream), arrow_pool_,
options, batch_size_));
std::unique_ptr<ArrowSchema> c_schema = std::make_unique<ArrowSchema>();
auto arrow_status = arrow::ExportSchema(*read_schema, c_schema.get());
ASSERT_TRUE(arrow_status.ok());
ASSERT_OK(batch_reader->SetReadSchema(c_schema.get(), predicate,
/*selection_bitmap=*/std::nullopt));
ASSERT_OK_AND_ASSIGN(auto arrow_array,
paimon::test::ReadResultCollector::CollectResult(batch_reader.get()));
if (expected_array) {
ASSERT_TRUE(arrow_array);
auto expected_chunk_array = std::make_shared<arrow::ChunkedArray>(expected_array);
ASSERT_TRUE(expected_chunk_array->Equals(arrow_array)) << arrow_array->ToString();
} else {
ASSERT_FALSE(arrow_array);
}
}
private:
std::shared_ptr<arrow::MemoryPool> arrow_pool_;
std::shared_ptr<MemoryPool> pool_;
int32_t batch_size_;
std::shared_ptr<arrow::StructArray> struct_array_;
std::shared_ptr<FileSystem> fs_;
std::unique_ptr<paimon::test::UniqueTestDirectory> dir_;
std::string file_name_;
};
TEST_F(PredicatePushdownTest, TestIntDoubleData) {
PrepareTestData(struct_array_);
auto data_type = struct_array_->struct_type();
arrow::FieldVector fields = {data_type->GetFieldByName("f0"), data_type->GetFieldByName("f1"),
data_type->GetFieldByName("f2"), data_type->GetFieldByName("f3"),
data_type->GetFieldByName("f4")};
auto read_schema = arrow::schema(fields);
std::shared_ptr<arrow::Array> expected_array =
arrow::StructArray::Make(
{struct_array_->GetFieldByName("f0"), struct_array_->GetFieldByName("f1"),
struct_array_->GetFieldByName("f2"), struct_array_->GetFieldByName("f3"),
struct_array_->GetFieldByName("f4")},
fields)
.ValueOrDie();
{
// f1 == 4, has data
auto predicate =
PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", FieldType::FLOAT,
Literal(static_cast<float>(4.0)));
CheckResult(read_schema, predicate, expected_array);
}
{
// f1 == 6, no data
auto predicate =
PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", FieldType::FLOAT,
Literal(static_cast<float>(6.0)));
CheckResult(read_schema, predicate, /*expected_array=*/
nullptr);
}
{
// f1 != 4, no data
auto predicate = PredicateBuilder::NotEqual(
/*field_index=*/1, /*field_name=*/"f1", FieldType::FLOAT,
Literal(static_cast<float>(4.0)));
CheckResult(read_schema, predicate, /*expected_array=*/nullptr);
}
{
// f2 != 4, has data
auto predicate = PredicateBuilder::NotEqual(/*field_index=*/2, /*field_name=*/"f2",
FieldType::BIGINT, Literal(4l));
CheckResult(read_schema, predicate, expected_array);
}
{
// f2 == 6, has data
auto predicate = PredicateBuilder::Equal(/*field_index=*/2, /*field_name=*/"f2",
FieldType::BIGINT, Literal(6l));
CheckResult(read_schema, predicate, expected_array);
}
{
// f2 == 1, no data
auto predicate = PredicateBuilder::Equal(/*field_index=*/2, /*field_name=*/"f2",
FieldType::BIGINT, Literal(1l));
CheckResult(read_schema, predicate, /*expected_array=*/
nullptr);
}
{
// f2 in [1,2,3], no data
auto predicate =
PredicateBuilder::In(/*field_index=*/2, /*field_name=*/"f2", FieldType::BIGINT,
{Literal(1l), Literal(2l), Literal(3l)});
CheckResult(read_schema, predicate, /*expected_array=*/nullptr);
}
{
// f2 in [1,2,3] but has small predicate node limit, has data
auto predicate =
PredicateBuilder::In(/*field_index=*/2, /*field_name=*/"f2", FieldType::BIGINT,
{Literal(1l), Literal(2l), Literal(3l)});
CheckResult(read_schema, predicate, expected_array,
/*predicate_node_count_limit=*/1);
}
{
// f2 not in [1,2,3], has data
auto predicate =
PredicateBuilder::NotIn(/*field_index=*/2, /*field_name=*/"f2", FieldType::BIGINT,
{Literal(1l), Literal(2l), Literal(3l)});
CheckResult(read_schema, predicate, expected_array);
}
{
// f2 in [2,3,4], has data
auto predicate =
PredicateBuilder::In(/*field_index=*/2, /*field_name=*/"f2", FieldType::BIGINT,
{Literal(2l), Literal(3l), Literal(4l)});
CheckResult(read_schema, predicate, expected_array);
}
{
// f2 not in [2,3,4], has data
auto predicate =
PredicateBuilder::NotIn(/*field_index=*/2, /*field_name=*/"f2", FieldType::BIGINT,
{Literal(2l), Literal(3l), Literal(4l)});
CheckResult(read_schema, predicate, expected_array);
}
}
TEST_F(PredicatePushdownTest, TestBoolData) {
PrepareTestData(struct_array_);
auto data_type = struct_array_->struct_type();
arrow::FieldVector fields = {data_type->GetFieldByName("f0"), data_type->GetFieldByName("f1"),
data_type->GetFieldByName("f2"), data_type->GetFieldByName("f3"),
data_type->GetFieldByName("f4")};
auto read_schema = arrow::schema(fields);
std::shared_ptr<arrow::Array> expected_array =
arrow::StructArray::Make(
{struct_array_->GetFieldByName("f0"), struct_array_->GetFieldByName("f1"),
struct_array_->GetFieldByName("f2"), struct_array_->GetFieldByName("f3"),
struct_array_->GetFieldByName("f4")},
fields)
.ValueOrDie();
{
// f3 is null, no data
auto predicate =
PredicateBuilder::IsNull(/*field_index=*/3, /*field_name=*/"f3", FieldType::BOOLEAN);
CheckResult(read_schema, predicate, /*expected_array=*/nullptr);
}
{
// f3 is not null, has data
auto predicate =
PredicateBuilder::IsNotNull(/*field_index=*/3, /*field_name=*/"f3", FieldType::BOOLEAN);
CheckResult(read_schema, predicate, expected_array);
}
{
// f3 == true, has data
auto predicate = PredicateBuilder::Equal(/*field_index=*/3, /*field_name=*/"f3",
FieldType::BOOLEAN, Literal(true));
CheckResult(read_schema, predicate, expected_array);
}
{
auto predicate = PredicateBuilder::In(/*field_index=*/3, /*field_name=*/"f3",
FieldType::BOOLEAN, {Literal(false)});
CheckResult(read_schema, predicate, /*expected_array=*/nullptr);
}
}
TEST_F(PredicatePushdownTest, TestStringData) {
PrepareTestData(struct_array_);
auto data_type = struct_array_->struct_type();
arrow::FieldVector fields = {data_type->GetFieldByName("f0"), data_type->GetFieldByName("f1"),
data_type->GetFieldByName("f2"), data_type->GetFieldByName("f3"),
data_type->GetFieldByName("f4")};
auto read_schema = arrow::schema(fields);
std::shared_ptr<arrow::Array> expected_array =
arrow::StructArray::Make(
{struct_array_->GetFieldByName("f0"), struct_array_->GetFieldByName("f1"),
struct_array_->GetFieldByName("f2"), struct_array_->GetFieldByName("f3"),
struct_array_->GetFieldByName("f4")},
fields)
.ValueOrDie();
{
// f0 is null, no data
auto predicate =
PredicateBuilder::IsNull(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING);
CheckResult(read_schema, predicate, /*expected_array=*/nullptr);
}
{
// f0 is not null, has data
auto predicate =
PredicateBuilder::IsNotNull(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING);
CheckResult(read_schema, predicate, expected_array);
}
{
// f0 == apple, has data
auto predicate =
PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING,
Literal(FieldType::STRING, "apple", 5));
CheckResult(read_schema, predicate, expected_array);
}
{
// f0 == anything, no data
auto predicate =
PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING,
Literal(FieldType::STRING, "anything", 8));
CheckResult(read_schema, predicate, /*expected_array=*/nullptr);
}
{
// f0 > zooooooo, no data
auto predicate = PredicateBuilder::GreaterThan(
/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING,
Literal(FieldType::STRING, "zooooooo", 8));
CheckResult(read_schema, predicate, /*expected_array=*/nullptr);
}
{
// f0 like 'ba%', has data
ASSERT_OK_AND_ASSIGN(const auto predicate,
PredicateBuilder::StartsWith(
/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING,
Literal(FieldType::STRING, "ba", 2)));
CheckResult(read_schema, predicate, /*expected_array=*/expected_array);
}
{
// f0 like '%ta', has data
ASSERT_OK_AND_ASSIGN(const auto predicate,
PredicateBuilder::EndsWith(
/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING,
Literal(FieldType::STRING, "ta", 2)));
CheckResult(read_schema, predicate, /*expected_array=*/expected_array);
}
{
// f0 like '%me%', has data
ASSERT_OK_AND_ASSIGN(const auto predicate,
PredicateBuilder::Contains(
/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING,
Literal(FieldType::STRING, "me", 2)));
CheckResult(read_schema, predicate, /*expected_array=*/expected_array);
}
{
// f0 like 'me', has data
ASSERT_OK_AND_ASSIGN(const auto predicate,
PredicateBuilder::Like(
/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING,
Literal(FieldType::STRING, "me", 2)));
CheckResult(read_schema, predicate, /*expected_array=*/expected_array);
}
}
TEST_F(PredicatePushdownTest, TestBinaryData) {
PrepareTestData(struct_array_);
auto data_type = struct_array_->struct_type();
arrow::FieldVector fields = {data_type->GetFieldByName("f5")};
auto read_schema = arrow::schema(fields);
std::shared_ptr<arrow::Array> expected_array =
arrow::StructArray::Make({struct_array_->GetFieldByName("f5")}, fields).ValueOrDie();
// paimon does not pushdown binary type to parquet, will skip this predicate
{
// f5 < aaa, do not have data, but binary predicate will be skipped
auto predicate =
PredicateBuilder::LessThan(/*field_index=*/5, /*field_name=*/"f5", FieldType::BINARY,
Literal(FieldType::BINARY, "aaa", 3));
CheckResult(read_schema, predicate, expected_array);
}
{
// f5 >= zoo, do not have data, but binary predicate will be skipped
auto predicate = PredicateBuilder::GreaterOrEqual(
/*field_index=*/5, /*field_name=*/"f5", FieldType::BINARY,
Literal(FieldType::BINARY, "zoo", 3));
CheckResult(read_schema, predicate, expected_array);
}
}
TEST_F(PredicatePushdownTest, TestPredicatePushdownWithAllDataNull) {
PrepareTestData(struct_array_);
auto data_type = struct_array_->struct_type();
arrow::FieldVector fields = {data_type->GetFieldByName("f0"), data_type->GetFieldByName("f1"),
data_type->GetFieldByName("f2"), data_type->GetFieldByName("f3"),
data_type->GetFieldByName("f4")};
auto read_schema = arrow::schema(fields);
std::shared_ptr<arrow::Array> expected_array =
arrow::StructArray::Make(
{struct_array_->GetFieldByName("f0"), struct_array_->GetFieldByName("f1"),
struct_array_->GetFieldByName("f2"), struct_array_->GetFieldByName("f3"),
struct_array_->GetFieldByName("f4")},
fields)
.ValueOrDie();
{
// f4 is null, has data
auto predicate =
PredicateBuilder::IsNull(/*field_index=*/4, /*field_name=*/"f4", FieldType::BIGINT);
CheckResult(read_schema, predicate, expected_array);
}
// other predicate, always return IS_NULL (no data)
{
// f4 in [1,2], no data
auto predicate = PredicateBuilder::In(/*field_index=*/4, /*field_name=*/"f4",
FieldType::BIGINT, {Literal(1l), Literal(2l)});
CheckResult(read_schema, predicate, /*expected_array=*/nullptr);
}
{
// f4 not in [1,2], no data
auto predicate = PredicateBuilder::NotIn(/*field_index=*/4, /*field_name=*/"f4",
FieldType::BIGINT, {Literal(1l), Literal(2l)});
CheckResult(read_schema, predicate, /*expected_array=*/nullptr);
}
{
// f4 >= 3, no data
auto predicate = PredicateBuilder::GreaterOrEqual(/*field_index=*/4, /*field_name=*/"f4",
FieldType::BIGINT, Literal(3l));
CheckResult(read_schema, predicate, /*expected_array=*/nullptr);
}
{
// f4 <= 3, no data
auto predicate = PredicateBuilder::LessOrEqual(/*field_index=*/4, /*field_name=*/"f4",
FieldType::BIGINT, Literal(3l));
CheckResult(read_schema, predicate, /*expected_array=*/nullptr);
}
}
TEST_F(PredicatePushdownTest, TestCompoundPredicate) {
PrepareTestData(struct_array_);
auto read_schema = arrow::schema(struct_array_->struct_type()->fields());
std::shared_ptr<arrow::Array> expected_array = struct_array_;
{
// f2 < 6 and f1 == 4 and f3 == true, has data
ASSERT_OK_AND_ASSIGN(
auto predicate,
PredicateBuilder::And(
{PredicateBuilder::LessThan(/*field_index=*/2, /*field_name=*/"f2",
FieldType::BIGINT, Literal(6l)),
PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", FieldType::FLOAT,
Literal(static_cast<float>(4.0))),
PredicateBuilder::Equal(/*field_index=*/3, /*field_name=*/"f3", FieldType::BOOLEAN,
Literal(true))}));
ASSERT_TRUE(predicate);
CheckResult(read_schema, predicate, expected_array);
}
{
// f2 < 6 and f1 == 4 and f3 is null, no data
ASSERT_OK_AND_ASSIGN(
auto predicate,
PredicateBuilder::And(
{PredicateBuilder::LessThan(/*field_index=*/2, /*field_name=*/"f2",
FieldType::BIGINT, Literal(6l)),
PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", FieldType::FLOAT,
Literal(static_cast<float>(4.0))),
PredicateBuilder::IsNull(/*field_index=*/3, /*field_name=*/"f3",
FieldType::BOOLEAN)}));
ASSERT_TRUE(predicate);
CheckResult(read_schema, predicate, /*expected_array=*/nullptr);
}
{
// f2 < 6 and f1 == 4 and f5 is null, no data
ASSERT_OK_AND_ASSIGN(
auto predicate,
PredicateBuilder::And(
{PredicateBuilder::LessThan(/*field_index=*/2, /*field_name=*/"f2",
FieldType::BIGINT, Literal(6l)),
PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", FieldType::FLOAT,
Literal(static_cast<float>(4.0))),
PredicateBuilder::IsNull(/*field_index=*/5, /*field_name=*/"f5",
FieldType::BINARY)}));
ASSERT_TRUE(predicate);
CheckResult(read_schema, predicate, /*expected_array=*/nullptr);
}
{
// f2 < 6 and f1 == 4 and f5 == zoo, has data
ASSERT_OK_AND_ASSIGN(
auto predicate,
PredicateBuilder::And(
{PredicateBuilder::LessThan(/*field_index=*/2, /*field_name=*/"f2",
FieldType::BIGINT, Literal(6l)),
PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", FieldType::FLOAT,
Literal(static_cast<float>(4.0))),
PredicateBuilder::Equal(/*field_index=*/5, /*field_name=*/"f5", FieldType::BINARY,
Literal(FieldType::BINARY, "zoo", 3))}));
ASSERT_TRUE(predicate);
CheckResult(read_schema, predicate, expected_array);
}
{
// f2 < 6 and f1 == 5 and f5 is null, no data
ASSERT_OK_AND_ASSIGN(
auto predicate,
PredicateBuilder::And(
{PredicateBuilder::LessThan(/*field_index=*/2, /*field_name=*/"f2",
FieldType::BIGINT, Literal(6l)),
PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", FieldType::FLOAT,
Literal(static_cast<float>(5.0))),
PredicateBuilder::IsNull(/*field_index=*/5, /*field_name=*/"f5",
FieldType::BINARY)}));
ASSERT_TRUE(predicate);
CheckResult(read_schema, predicate, /*expected_array=*/nullptr);
}
{
// f2 < 6 or f1 == 4, has data
ASSERT_OK_AND_ASSIGN(
auto predicate,
PredicateBuilder::Or(
{PredicateBuilder::LessThan(/*field_index=*/2, /*field_name=*/"f2",
FieldType::BIGINT, Literal(6l)),
PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", FieldType::FLOAT,
Literal(static_cast<float>(4.0)))}));
ASSERT_TRUE(predicate);
CheckResult(read_schema, predicate, expected_array);
}
{
// f2 < 6 or f1 == 5, has data
ASSERT_OK_AND_ASSIGN(
auto predicate,
PredicateBuilder::Or(
{PredicateBuilder::LessThan(/*field_index=*/2, /*field_name=*/"f2",
FieldType::BIGINT, Literal(6l)),
PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", FieldType::FLOAT,
Literal(static_cast<float>(5.0)))}));
ASSERT_TRUE(predicate);
CheckResult(read_schema, predicate, expected_array);
}
{
// f2 < 2 or f5 is null, no data
ASSERT_OK_AND_ASSIGN(
auto predicate,
PredicateBuilder::Or({PredicateBuilder::LessThan(/*field_index=*/2, /*field_name=*/"f2",
FieldType::BIGINT, Literal(2l)),
PredicateBuilder::IsNull(/*field_index=*/5, /*field_name=*/"f5",
FieldType::BINARY)}));
ASSERT_TRUE(predicate);
CheckResult(read_schema, predicate, nullptr);
}
{
// f2 < 2 or f5 = zoo, ignore <or predicate> as it contains binary
ASSERT_OK_AND_ASSIGN(
auto predicate,
PredicateBuilder::Or(
{PredicateBuilder::LessThan(/*field_index=*/2, /*field_name=*/"f2",
FieldType::BIGINT, Literal(2l)),
PredicateBuilder::Equal(/*field_index=*/5, /*field_name=*/"f5", FieldType::BINARY,
Literal(FieldType::BINARY, "zoo", 3))}));
ASSERT_TRUE(predicate);
CheckResult(read_schema, predicate, expected_array);
}
{
// f2 < 2 or f1 == 4 or f3 == false, has data
ASSERT_OK_AND_ASSIGN(
auto predicate,
PredicateBuilder::Or(
{PredicateBuilder::LessThan(/*field_index=*/2, /*field_name=*/"f2",
FieldType::BIGINT, Literal(2l)),
PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", FieldType::FLOAT,
Literal(static_cast<float>(4.0))),
PredicateBuilder::Equal(/*field_index=*/3, /*field_name=*/"f3", FieldType::BOOLEAN,
Literal(false))}));
ASSERT_TRUE(predicate);
CheckResult(read_schema, predicate, expected_array);
}
{
// f2 < 2 or f1 == 5 or f3 is null, no data
ASSERT_OK_AND_ASSIGN(
auto predicate,
PredicateBuilder::Or(
{PredicateBuilder::LessThan(/*field_index=*/2, /*field_name=*/"f2",
FieldType::BIGINT, Literal(2l)),
PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", FieldType::FLOAT,
Literal(static_cast<float>(5.0))),
PredicateBuilder::IsNull(/*field_index=*/3, /*field_name=*/"f3",
FieldType::BOOLEAN)}));
ASSERT_TRUE(predicate);
CheckResult(read_schema, predicate, /*expected_array=*/nullptr);
}
}
TEST_F(PredicatePushdownTest, TestComplexType) {
arrow::FieldVector fields = {
arrow::field("f1", arrow::int32()),
arrow::field("f2", arrow::int32()),
arrow::field("f3", arrow::date32()),
arrow::field("f4", arrow::timestamp(arrow::TimeUnit::NANO)),
arrow::field("f5", arrow::decimal128(23, 5)),
};
auto read_schema = arrow::schema(fields);
auto expected_array = std::dynamic_pointer_cast<arrow::StructArray>(
arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), R"([
[10, 1, 1234, "2033-05-18 03:33:20.0", "123456789987654321.45678"],
[10, 1, 19909, "2033-05-18 03:33:20.000001001", "12.30000"],
[10, 1, 0, "2008-12-28 00:00:00.000123456", "12.30000"],
[10, 1, 100, "2008-12-28 00:00:00.00012345", "-123.45000"],
[10, 1, 100, "1899-01-01 00:59:20.001001001", "0.00000"],
[10, 1, 20006, "2024-10-10 10:10:10.100100100", "1728551410100.10010"]
])")
.ValueOrDie());
PrepareTestData(expected_array);
// date
{
auto predicate =
PredicateBuilder::IsNull(/*field_index=*/2, /*field_name=*/"f3", FieldType::DATE);
CheckResult(read_schema, predicate, /*expected_array=*/nullptr);
}
{
auto predicate =
PredicateBuilder::IsNotNull(/*field_index=*/2, /*field_name=*/"f3", FieldType::DATE);
CheckResult(read_schema, predicate, expected_array);
}
{
auto predicate = PredicateBuilder::Equal(/*field_index=*/2, /*field_name=*/"f3",
FieldType::DATE, Literal(FieldType::DATE, 4));
CheckResult(read_schema, predicate, expected_array);
}
{
auto predicate = PredicateBuilder::Equal(/*field_index=*/2, /*field_name=*/"f3",
FieldType::DATE, Literal(FieldType::DATE, -111));
CheckResult(read_schema, predicate, /*expected_array=*/nullptr);
}
{
auto predicate = PredicateBuilder::LessThan(
/*field_index=*/2, /*field_name=*/"f3", FieldType::DATE,
Literal(FieldType::DATE, 20006));
CheckResult(read_schema, predicate, expected_array);
}
{
auto predicate = PredicateBuilder::GreaterThan(
/*field_index=*/2, /*field_name=*/"f3", FieldType::DATE,
Literal(FieldType::DATE, 20006));
CheckResult(read_schema, predicate, /*expected_array=*/nullptr);
}
// timestamp, always be ignored
{
auto predicate =
PredicateBuilder::IsNull(/*field_index=*/3, /*field_name=*/"f4", FieldType::TIMESTAMP);
CheckResult(read_schema, predicate, /*expected_array=*/
expected_array);
}
{
auto predicate = PredicateBuilder::IsNotNull(/*field_index=*/3, /*field_name=*/"f4",
FieldType::TIMESTAMP);
CheckResult(read_schema, predicate, /*expected_array=*/
expected_array);
}
{
auto predicate =
PredicateBuilder::Equal(/*field_index=*/3, /*field_name=*/"f4", FieldType::TIMESTAMP,
Literal(Timestamp(2240521239999l, 0)));
CheckResult(read_schema, predicate, /*expected_array=*/expected_array);
}
{
auto predicate =
PredicateBuilder::LessThan(/*field_index=*/3, /*field_name=*/"f4", FieldType::TIMESTAMP,
Literal(Timestamp(1230422400000l, 123460)));
CheckResult(read_schema, predicate, expected_array);
}
{
auto predicate = PredicateBuilder::GreaterThan(/*field_index=*/3, /*field_name=*/"f4",
FieldType::TIMESTAMP,
Literal(Timestamp(2000000000000l, 1001)));
CheckResult(read_schema, predicate, /*expected_array=*/expected_array);
}
// decimal
{
auto predicate =
PredicateBuilder::IsNull(/*field_index=*/4, /*field_name=*/"f5", FieldType::DECIMAL);
CheckResult(read_schema, predicate, /*expected_array=*/nullptr);
}
{
auto predicate =
PredicateBuilder::IsNotNull(/*field_index=*/4, /*field_name=*/"f5", FieldType::DECIMAL);
CheckResult(read_schema, predicate, expected_array);
}
{
auto predicate = PredicateBuilder::Equal(
/*field_index=*/4, /*field_name=*/"f5", FieldType::DECIMAL,
Literal(Decimal(23, 5, 123456)));
CheckResult(read_schema, predicate, expected_array);
}
{
auto predicate = PredicateBuilder::Equal(
/*field_index=*/4, /*field_name=*/"f5", FieldType::DECIMAL,
Literal(Decimal(22, 3, -123456)));
CheckResult(read_schema, predicate, /*expected_array=*/nullptr);
}
{
auto predicate = PredicateBuilder::LessThan(
/*field_index=*/4, /*field_name=*/"f5", FieldType::DECIMAL,
Literal(Decimal(23, 3, DecimalUtils::StrToInt128("-123456789987654321567").value())));
CheckResult(read_schema, predicate, /*expected_array=*/nullptr);
}
{
auto predicate = PredicateBuilder::LessThan(
/*field_index=*/4, /*field_name=*/"f5", FieldType::DECIMAL,
Literal(Decimal(23, 3, DecimalUtils::StrToInt128("123456789987654321567").value())));
CheckResult(read_schema, predicate, expected_array);
}
{
auto predicate = PredicateBuilder::GreaterThan(
/*field_index=*/4, /*field_name=*/"f5", FieldType::DECIMAL,
Literal(Decimal(23, 3, DecimalUtils::StrToInt128("123456789987654321567").value())));
CheckResult(read_schema, predicate, /*expected_array=*/nullptr);
}
{
auto predicate = PredicateBuilder::In(
/*field_index=*/4, /*field_name=*/"f5", FieldType::DECIMAL,
{Literal(Decimal(23, 5, DecimalUtils::StrToInt128("-12345678998765432134567").value())),
Literal(Decimal(23, 5, 1234567))});
CheckResult(read_schema, predicate, expected_array);
}
{
auto predicate = PredicateBuilder::NotIn(
/*field_index=*/4, /*field_name=*/"f5", FieldType::DECIMAL,
{Literal(Decimal(23, 5, DecimalUtils::StrToInt128("-12345678998765432134567").value())),
Literal(Decimal(23, 5, 1234567))});
CheckResult(read_schema, predicate, expected_array);
}
}
TEST_F(PredicatePushdownTest, TestAllNullOrAllSameValue) {
arrow::FieldVector fields = {arrow::field("f1", arrow::int32()),
arrow::field("f2", arrow::int32())};
auto read_schema = arrow::schema(fields);
auto expected_array = std::dynamic_pointer_cast<arrow::StructArray>(
arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), R"([
[null, 10],
[null, 10],
[null, 10]
])")
.ValueOrDie());
PrepareTestData(expected_array);
// for f1
{
auto predicate =
PredicateBuilder::IsNull(/*field_index=*/0, /*field_name=*/"f1", FieldType::INT);
CheckResult(read_schema, predicate, expected_array);
}
{
auto predicate =
PredicateBuilder::IsNotNull(/*field_index=*/0, /*field_name=*/"f1", FieldType::INT);
CheckResult(read_schema, predicate, /*expected_array=*/nullptr);
}
{
auto predicate = PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f1",
FieldType::INT, Literal(10));
CheckResult(read_schema, predicate, /*expected_array=*/nullptr);
}
{
auto predicate = PredicateBuilder::NotEqual(/*field_index=*/0, /*field_name=*/"f1",
FieldType::INT, Literal(10));
CheckResult(read_schema, predicate, /*expected_array=*/nullptr);
}
{
auto predicate = PredicateBuilder::GreaterThan(/*field_index=*/0, /*field_name=*/"f1",
FieldType::INT, Literal(10));
CheckResult(read_schema, predicate, /*expected_array=*/nullptr);
}
{
auto predicate = PredicateBuilder::GreaterOrEqual(/*field_index=*/0, /*field_name=*/"f1",
FieldType::INT, Literal(10));
CheckResult(read_schema, predicate, /*expected_array=*/nullptr);
}
{
auto predicate = PredicateBuilder::LessThan(/*field_index=*/0, /*field_name=*/"f1",
FieldType::INT, Literal(10));
CheckResult(read_schema, predicate, /*expected_array=*/nullptr);
}
{
auto predicate = PredicateBuilder::LessOrEqual(/*field_index=*/0, /*field_name=*/"f1",
FieldType::INT, Literal(10));
CheckResult(read_schema, predicate, /*expected_array=*/nullptr);
}
{
auto predicate = PredicateBuilder::In(/*field_index=*/0, /*field_name=*/"f1",
FieldType::INT, {Literal(10), Literal(20)});
CheckResult(read_schema, predicate, /*expected_array=*/nullptr);
}
{
auto predicate = PredicateBuilder::NotIn(/*field_index=*/0, /*field_name=*/"f1",
FieldType::INT, {Literal(10), Literal(20)});
CheckResult(read_schema, predicate, /*expected_array=*/nullptr);
}
// for f2
{
auto predicate = PredicateBuilder::NotEqual(/*field_index=*/1, /*field_name=*/"f2",
FieldType::INT, Literal(10));
CheckResult(read_schema, predicate, /*expected_array=*/nullptr);
}
{
auto predicate = PredicateBuilder::NotEqual(/*field_index=*/1, /*field_name=*/"f2",
FieldType::INT, Literal(30));
CheckResult(read_schema, predicate, expected_array);
}
{
auto predicate = PredicateBuilder::In(/*field_index=*/1, /*field_name=*/"f2",
FieldType::INT, {Literal(10), Literal(20)});
CheckResult(read_schema, predicate, expected_array);
}
{
auto predicate = PredicateBuilder::NotIn(/*field_index=*/1, /*field_name=*/"f2",
FieldType::INT, {Literal(10), Literal(20)});
CheckResult(read_schema, predicate, /*expected_array=*/nullptr);
}
{
auto predicate = PredicateBuilder::NotIn(/*field_index=*/1, /*field_name=*/"f2",
FieldType::INT, {Literal(20), Literal(30)});
CheckResult(read_schema, predicate, expected_array);
}
}
} // namespace paimon::parquet::test