// blob: 4b5a9d2f6a138fe3581e0d93e75f4eee04d88469 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <algorithm>
#include <iterator>
#include <random>
#include <ranges>
#include <Core/Block.h>
#include <DataTypes/DataTypeDate32.h>
#include <DataTypes/DataTypeString.h>
#include <IO/ReadBufferFromFile.h>
#include <Interpreters/JoinInfo.h>
#include <Processors/Executors/PullingPipelineExecutor.h>
#include <Processors/Formats/Impl/ArrowColumnToCHColumn.h>
#include <Processors/Formats/Impl/ParquetBlockInputFormat.h>
#include <QueryPipeline/QueryPipeline.h>
#include <QueryPipeline/QueryPipelineBuilder.h>
#include <Storages/Output/NormalFileWriter.h>
#include <Storages/Parquet/ParquetMeta.h>
#include <Storages/Parquet/VectorizedParquetRecordReader.h>
#include <Storages/SubstraitSource/Iceberg/IcebergMetadataColumn.h>
#include <Storages/SubstraitSource/SubstraitFileSource.h>
#include <Storages/SubstraitSource/substrait_fwd.h>
#include <benchmark/benchmark.h>
#include <substrait/plan.pb.h>
#include <tests/utils/TempFilePath.h>
#include <tests/utils/gluten_test_util.h>
#include <Poco/Path.h>
#include <Poco/URI.h>
#include <Poco/Util/MapConfiguration.h>
#include <Common/BlockTypeUtils.h>
#include <Common/DebugUtils.h>
#include <Common/QueryContext.h>
namespace
{
/// Benchmark: read the whole lineitem file with the local vectorized parquet
/// reader, collecting the page index but applying no filter.
void BM_ColumnIndexRead_NoFilter(benchmark::State & state)
{
    using namespace DB;
    const std::string parquet_path = local_engine::test::third_party_data(
        "benchmark/column_index/lineitem/part-00000-9395e12a-3620-4085-9677-c63b920353f4-c000.snappy.parquet");
    // Sample header derived from the parquet schema of the file itself.
    auto sample_header = local_engine::toShared(local_engine::toSampleBlock(local_engine::test::readParquetSchema(parquet_path)));
    FormatSettings settings;
    Block block;
    for (auto _ : state)
    {
        // Collect metadata per iteration so metadata cost is part of the measurement.
        local_engine::ParquetMetaBuilder meta_builder{
            .format_settings = settings,
            .collectPageIndex = true,
            .collectSkipRowGroup = false,
            .case_insensitive = settings.parquet.case_insensitive_column_matching,
            .allow_missing_columns = settings.parquet.allow_missing_columns};
        ReadBufferFromFilePRead read_buffer(parquet_path);
        meta_builder.build(read_buffer, *sample_header);
        local_engine::ColumnIndexRowRangesProvider row_ranges_provider{meta_builder};
        auto input_format
            = std::make_shared<local_engine::VectorizedParquetBlockInputFormat>(read_buffer, sample_header, row_ranges_provider, settings);
        auto pipeline = QueryPipeline(std::move(input_format));
        PullingPipelineExecutor executor(pipeline);
        // Drain every block produced by the reader.
        while (executor.pull(block))
        {
            // debug::headBlock(block);
        }
    }
}
/// Benchmark: read the whole lineitem file through the stock ClickHouse
/// ParquetBlockInputFormat, for comparison against the vectorized reader.
void BM_ColumnIndexRead_Old(benchmark::State & state)
{
    using namespace DB;
    std::string file = local_engine::test::third_party_data(
        "benchmark/column_index/lineitem/part-00000-9395e12a-3620-4085-9677-c63b920353f4-c000.snappy.parquet");
    auto header = local_engine::toShared(local_engine::toSampleBlock(local_engine::test::readParquetSchema(file)));
    FormatSettings format_settings;
    Block res;
    for (auto _ : state)
    {
        ReadBufferFromFilePRead fileReader(file);
        auto global_context = local_engine::QueryContext::globalContext();
        auto parser_group = std::make_shared<FormatFilterInfo>(nullptr, global_context, nullptr);
        auto parser_shared_resources
            = std::make_shared<FormatParserSharedResources>(global_context->getSettingsRef(), /*num_streams_=*/1);
        // BUG FIX: the format must read from the locally declared `fileReader`;
        // the previous code referenced an undeclared buffer `in`.
        auto format
            = std::make_shared<ParquetBlockInputFormat>(fileReader, header, format_settings, parser_shared_resources, parser_group, 8192);
        auto pipeline = QueryPipeline(std::move(format));
        auto reader = std::make_unique<PullingPipelineExecutor>(pipeline);
        while (reader->pull(res))
        {
            // debug::headBlock(res);
        }
    }
}
/// Benchmark: read three Date32 columns from the TPC-H lineitem file through
/// the stock ClickHouse ParquetBlockInputFormat.
void BM_ParquetReadDate32(benchmark::State & state)
{
    using namespace DB;
    auto header = local_engine::toShared(Block{
        ColumnWithTypeAndName(DataTypeDate32().createColumn(), std::make_shared<DataTypeDate32>(), "l_shipdate"),
        ColumnWithTypeAndName(DataTypeDate32().createColumn(), std::make_shared<DataTypeDate32>(), "l_commitdate"),
        ColumnWithTypeAndName(DataTypeDate32().createColumn(), std::make_shared<DataTypeDate32>(), "l_receiptdate")});
    std::string file{GLUTEN_SOURCE_TPCH_DIR("lineitem/part-00000-d08071cb-0dfa-42dc-9198-83cb334ccda3-c000.snappy.parquet")};
    FormatSettings format_settings;
    Block res;
    for (auto _ : state)
    {
        auto in = std::make_unique<ReadBufferFromFile>(file);
        auto global_context = local_engine::QueryContext::globalContext();
        auto parser_group = std::make_shared<FormatFilterInfo>(nullptr, global_context, nullptr);
        auto parser_shared_resources
            = std::make_shared<FormatParserSharedResources>(global_context->getSettingsRef(), /*num_streams_=*/1);
        // BUG FIX: the format must read from the buffer created above (`*in`);
        // the previous code referenced an undeclared `fileReader`.
        auto format = std::make_shared<ParquetBlockInputFormat>(*in, header, format_settings, parser_shared_resources, parser_group, 8192);
        auto pipeline = QueryPipeline(std::move(format));
        auto reader = std::make_unique<PullingPipelineExecutor>(pipeline);
        while (reader->pull(res))
        {
            // debug::headBlock(res);
        }
    }
}
/// Benchmark: read two String columns from lineitem via SubstraitFileSource.
void BM_OptimizedParquetReadString(benchmark::State & state)
{
    using namespace DB;
    using namespace local_engine;
    Block header{
        ColumnWithTypeAndName(DataTypeString().createColumn(), std::make_shared<DataTypeString>(), "l_returnflag"),
        ColumnWithTypeAndName(DataTypeString().createColumn(), std::make_shared<DataTypeString>(), "l_linestatus")};
    std::string file{GLUTEN_SOURCE_TPCH_URI("lineitem/part-00000-d08071cb-0dfa-42dc-9198-83cb334ccda3-c000.snappy.parquet")};
    Block res;
    for (auto _ : state)
    {
        // Build a single-file LocalFiles message with default parquet options.
        substrait::ReadRel::LocalFiles local_files;
        auto * item = local_files.add_items();
        item->set_uri_file(file);
        substrait::ReadRel::LocalFiles::FileOrFiles::ParquetReadOptions parquet_options;
        item->mutable_parquet()->CopyFrom(parquet_options);
        auto pipeline_builder = std::make_unique<QueryPipelineBuilder>();
        pipeline_builder->init(Pipe(std::make_shared<SubstraitFileSource>(QueryContext::globalContext(), header, local_files)));
        auto pipeline = QueryPipelineBuilder::getPipeline(std::move(*pipeline_builder));
        auto executor = PullingPipelineExecutor(pipeline);
        // Drain the source completely.
        while (executor.pull(res))
        {
            // debug::headBlock(res);
        }
    }
}
/// Benchmark: read three Date32 columns from lineitem via SubstraitFileSource.
void BM_OptimizedParquetReadDate32(benchmark::State & state)
{
    using namespace DB;
    using namespace local_engine;
    Block header{
        ColumnWithTypeAndName(DataTypeDate32().createColumn(), std::make_shared<DataTypeDate32>(), "l_shipdate"),
        ColumnWithTypeAndName(DataTypeDate32().createColumn(), std::make_shared<DataTypeDate32>(), "l_commitdate"),
        ColumnWithTypeAndName(DataTypeDate32().createColumn(), std::make_shared<DataTypeDate32>(), "l_receiptdate")};
    std::string file{GLUTEN_SOURCE_TPCH_URI("lineitem/part-00000-d08071cb-0dfa-42dc-9198-83cb334ccda3-c000.snappy.parquet")};
    Block res;
    for (auto _ : state)
    {
        // Build a single-file LocalFiles message with default parquet options.
        substrait::ReadRel::LocalFiles local_files;
        auto * item = local_files.add_items();
        item->set_uri_file(file);
        substrait::ReadRel::LocalFiles::FileOrFiles::ParquetReadOptions parquet_options;
        item->mutable_parquet()->CopyFrom(parquet_options);
        auto pipeline_builder = std::make_unique<QueryPipelineBuilder>();
        pipeline_builder->init(Pipe(std::make_shared<SubstraitFileSource>(QueryContext::globalContext(), header, local_files)));
        auto pipeline = QueryPipelineBuilder::getPipeline(std::move(*pipeline_builder));
        auto executor = PullingPipelineExecutor(pipeline);
        // Drain the source completely.
        while (executor.pull(res))
        {
            // debug::headBlock(res);
        }
    }
}
/// Build a LocalFiles message describing one on-disk parquet file over its
/// entire byte range, and set the global `use_local_format` config flag that
/// selects between the local (vectorized) and the stock parquet reader.
substrait::ReadRel::LocalFiles createLocalFiles(const std::string & filename, const bool use_local_format)
{
    substrait::ReadRel::LocalFiles result;
    auto * item = result.add_items();
    item->set_uri_file("file://" + filename);
    item->set_start(0);
    item->set_length(std::filesystem::file_size(filename));
    const substrait::ReadRel::LocalFiles::FileOrFiles::ParquetReadOptions parquet_options;
    item->mutable_parquet()->CopyFrom(parquet_options);
    // Publish the reader selection through the global mutable context.
    Poco::AutoPtr<Poco::Util::MapConfiguration> config(new Poco::Util::MapConfiguration());
    config->setBool("use_local_format", use_local_format);
    local_engine::QueryContext::globalMutableContext()->setConfig(config);
    return result;
}
/// Read every block produced by a SubstraitFileSource over `files`, optionally
/// applying a pushed-down key condition, and count the rows pulled.
void doRead(const substrait::ReadRel::LocalFiles & files, const std::shared_ptr<const DB::ActionsDAG> & pushDown, const DB::Block & header)
{
    auto file_source = std::make_shared<local_engine::SubstraitFileSource>(local_engine::QueryContext::globalContext(), header, files);
    file_source->setKeyCondition(pushDown, local_engine::QueryContext::globalContext());
    const auto pipeline_builder = std::make_unique<DB::QueryPipelineBuilder>();
    pipeline_builder->init(DB::Pipe(file_source));
    auto pipeline = DB::QueryPipelineBuilder::getPipeline(std::move(*pipeline_builder));
    auto executor = DB::PullingPipelineExecutor(pipeline);
    auto block = header.cloneEmpty();
    size_t total_rows = 0;
    while (executor.pull(block))
    {
#ifndef NDEBUG
        debug::headBlock(block);
#endif
        total_rows += block.rows();
    }
#ifndef NDEBUG
    // Debug builds report the row count for a quick sanity check.
    std::cerr << "rows:" << total_rows << std::endl;
#endif
}
/// Benchmark: column-index read with a pushed-down filter that matches
/// (virtually) every row, so the page index prunes nothing.
void BM_ColumnIndexRead_Filter_ReturnAllResult(benchmark::State & state)
{
    using namespace DB;
    const std::string filename = local_engine::test::third_party_data(
        "benchmark/column_index/lineitem/part-00000-9395e12a-3620-4085-9677-c63b920353f4-c000.snappy.parquet");
    const std::string predicate = "l_shipdate is not null AND l_shipdate <= toDate32('1998-09-01')";
    const substrait::ReadRel::LocalFiles files = createLocalFiles(filename, true);
    const local_engine::RowType schema = local_engine::test::readParquetSchema(filename);
    std::shared_ptr<const ActionsDAG> push_down;
    if (auto dag = local_engine::test::parseFilter(predicate, schema))
        push_down = std::make_shared<const ActionsDAG>(std::move(*dag));
    const Block header = {local_engine::toSampleBlock(schema)};
    for (auto _ : state)
        doRead(files, push_down, header);
    // Restore a clean global config so subsequent benchmarks are unaffected.
    local_engine::QueryContext::globalMutableContext()->setConfig(Poco::AutoPtr(new Poco::Util::MapConfiguration()));
}
/// Benchmark: column-index read with a pushed-down filter that selects roughly
/// half the rows, so the page index can prune part of the file.
void BM_ColumnIndexRead_Filter_ReturnHalfResult(benchmark::State & state)
{
    using namespace DB;
    const std::string filename = local_engine::test::third_party_data(
        "benchmark/column_index/lineitem/part-00000-9395e12a-3620-4085-9677-c63b920353f4-c000.snappy.parquet");
    const std::string predicate = "l_orderkey is not null AND l_orderkey > 300977829";
    const substrait::ReadRel::LocalFiles files = createLocalFiles(filename, true);
    const local_engine::RowType schema = local_engine::test::readParquetSchema(filename);
    std::shared_ptr<const ActionsDAG> push_down;
    if (auto dag = local_engine::test::parseFilter(predicate, schema))
        push_down = std::make_shared<const ActionsDAG>(std::move(*dag));
    const Block header = {local_engine::toSampleBlock(schema)};
    for (auto _ : state)
        doRead(files, push_down, header);
    // Restore a clean global config so subsequent benchmarks are unaffected.
    local_engine::QueryContext::globalMutableContext()->setConfig(Poco::AutoPtr(new Poco::Util::MapConfiguration()));
}
// ===== Iceberg performance benchmarks (position / equality deletes) =====
/// Read `file` as an iceberg table with an optional delete file attached and
/// return the number of rows that survive the deletes.
size_t readFileWithDeletesAndGetRowCount(
    const std::string & file, const DB::Block & header, const local_engine::SubstraitIcebergDeleteFile * delete_file)
{
    using namespace DB;
    using namespace local_engine;
    using namespace local_engine::test;
    // Describe the data file as an iceberg read over its whole byte range.
    substrait::ReadRel::LocalFiles files;
    auto * item = files.add_items();
    item->set_uri_file("file://" + file);
    item->set_start(0);
    item->set_length(std::filesystem::file_size(file));
    substrait::ReadRel::LocalFiles::FileOrFiles::IcebergReadOptions iceberg_options;
    substrait::ReadRel::LocalFiles::FileOrFiles::ParquetReadOptions parquet_options;
    iceberg_options.mutable_parquet()->CopyFrom(parquet_options);
    if (delete_file != nullptr)
        iceberg_options.add_delete_files()->CopyFrom(*delete_file);
    item->mutable_iceberg()->CopyFrom(iceberg_options);
    auto pipeline_builder = std::make_unique<QueryPipelineBuilder>();
    pipeline_builder->init(Pipe(std::make_shared<SubstraitFileSource>(QueryContext::globalContext(), header, files)));
    auto pipeline = QueryPipelineBuilder::getPipeline(std::move(*pipeline_builder));
    auto executor = PullingPipelineExecutor(pipeline);
    // Count every surviving row.
    size_t rows_read = 0;
    DB::Block block;
    while (executor.pull(block))
        rows_read += block.rows();
    return rows_read;
}
/// Return {total rows in the parquet file, number of rows to delete}, where
/// the delete count is derived from the benchmark argument interpreted as a
/// percentage (state.range(0)).
std::pair<size_t, int64_t> calculateRowsAndDeleteCount(benchmark::State & state, const std::string & file_path, const DB::Block & header)
{
    using namespace DB;
    using namespace local_engine;
    // Read row-group metadata to determine the total row count.
    FormatSettings format_settings;
    ParquetMetaBuilder meta_builder{
        .format_settings = format_settings,
        .case_insensitive = format_settings.parquet.case_insensitive_column_matching,
        .allow_missing_columns = format_settings.parquet.allow_missing_columns};
    ReadBufferFromFilePRead read_buffer(file_path);
    meta_builder.build(read_buffer, header);
    size_t total_rows = 0;
    for (const auto & row_group : meta_builder.readRowGroups)
        total_rows += row_group.num_rows;
    // The benchmark argument is the percentage of rows to delete.
    const int64_t delete_percentage = state.range(0);
    const int64_t delete_count = total_rows * delete_percentage / 100;
    return {total_rows, delete_count};
}
// Helper function to write a Parquet file with position deletes
std::shared_ptr<local_engine::test::TempFilePath>
writePositionDeleteFile(const std::string & base_file_path, const std::vector<int64_t> & delete_positions)
{
using namespace local_engine;
using namespace local_engine::test;
using namespace local_engine::iceberg;
auto delete_file_path = TempFilePath::tmp("parquet");
// Create the file path column with base file path repeated
std::string uri_file_path = "file://" + base_file_path;
auto file_path_vector = createColumn<std::string>(delete_positions.size(), [&](size_t /*row*/) { return uri_file_path; });
// Create the position column with delete positions
auto deletePosVector = createColumn<int64_t>(delete_positions);
// Create the block with both columns
DB::Block delete_block{
{file_path_vector,
IcebergMetadataColumn::icebergDeleteFilePathColumn()->type,
IcebergMetadataColumn::icebergDeleteFilePathColumn()->name},
{deletePosVector, IcebergMetadataColumn::icebergDeletePosColumn()->type, IcebergMetadataColumn::icebergDeletePosColumn()->name}};
// Write the block to the delete file
const DB::ContextPtr context = QueryContext::globalContext();
const Poco::Path file{delete_file_path->string()};
const Poco::URI fileUri{file};
auto writer = NormalFileWriter::create(context, fileUri.toString(), delete_block, "parquet");
writer->write(delete_block);
writer->close();
return delete_file_path;
}
std::pair<local_engine::SubstraitIcebergDeleteFile, std::shared_ptr<local_engine::test::TempFilePath>>
createPositionDeleteFile(int64_t delete_count, size_t total_rows, const std::string & base_file_path)
{
if (delete_count == 0)
return {{}, nullptr};
assert(delete_count > 0);
using namespace local_engine;
using namespace local_engine::test;
std::vector<int64_t> delete_positions;
delete_positions.reserve(delete_count);
std::vector<int64_t> all_positions(total_rows);
std::iota(all_positions.begin(), all_positions.end(), 0);
std::mt19937 g(std::random_device{}());
std::ranges::shuffle(all_positions, g);
delete_positions.assign(all_positions.begin(), all_positions.begin() + delete_count);
std::ranges::sort(delete_positions);
std::shared_ptr<TempFilePath> delete_file_path = writePositionDeleteFile(base_file_path, delete_positions);
SubstraitIcebergDeleteFile delete_file;
delete_file.set_filecontent(IcebergReadOptions::POSITION_DELETES);
delete_file.set_filepath("file://" + delete_file_path->string());
delete_file.set_recordcount(delete_count);
delete_file.set_filesize(std::filesystem::file_size(delete_file_path->string()));
substrait::ReadRel::LocalFiles::FileOrFiles::ParquetReadOptions parquet_format;
delete_file.mutable_parquet()->CopyFrom(parquet_format);
return {delete_file, delete_file_path};
}
// Helper function to write a Parquet file with equality deletes
std::shared_ptr<local_engine::test::TempFilePath>
writeEqualityDeleteFile(const std::vector<int64_t> & delete_values, const std::string & column_name = "l_shipdate")
{
using namespace local_engine;
using namespace local_engine::test;
auto delete_file_path = TempFilePath::tmp("parquet");
// Create the column with values to be deleted
auto delete_values_vector = createColumn<int64_t>(delete_values);
// Create the block with the delete values
DB::Block delete_block{{delete_values_vector, std::make_shared<DB::DataTypeInt64>(), column_name}};
// Write the block to the delete file
const DB::ContextPtr context = QueryContext::globalContext();
const Poco::Path file{delete_file_path->string()};
const Poco::URI fileUri{file};
auto writer = NormalFileWriter::create(context, fileUri.toString(), delete_block, "parquet");
writer->write(delete_block);
writer->close();
return delete_file_path;
}
std::pair<local_engine::SubstraitIcebergDeleteFile, std::shared_ptr<local_engine::test::TempFilePath>>
createEqualityDeleteFile(int64_t delete_count)
{
if (delete_count == 0)
return {{}, nullptr};
assert(delete_count > 0);
using namespace local_engine;
using namespace local_engine::test;
std::vector<int64_t> delete_values;
delete_values.reserve(delete_count);
// For simplicity in the benchmark, we uniformly select deletion values from the minimum and maximum values.
// Therefore, some deletion values do not exist, and we cannot determine whether the benchmarking is correct
// based on the number of deletions.
//
// This is acceptable for identifying performance issues with EqualityDeletes.
// +-------+-------------------------+---------------+---------------+
// |count()|countDistinct(l_orderkey)|min(l_orderkey)|max(l_orderkey)|
// +-------+-------------------------+---------------+---------------+
// |8333867| 8105793| 7| 599999972|
// +-------+-------------------------+---------------+---------------+
constexpr int64_t min_orderkey = 7;
constexpr int64_t max_orderkey = 599999972;
std::uniform_int_distribution<int64_t> distrib(min_orderkey, max_orderkey);
std::mt19937 gen(std::random_device{}());
for (int64_t i = 0; i < delete_count; ++i)
delete_values.push_back(distrib(gen));
std::ranges::sort(delete_values);
std::shared_ptr<TempFilePath> delete_file_path = writeEqualityDeleteFile(delete_values, "l_orderkey");
SubstraitIcebergDeleteFile delete_file;
delete_file.set_filecontent(IcebergReadOptions::EQUALITY_DELETES);
delete_file.set_filepath("file://" + delete_file_path->string());
delete_file.set_recordcount(delete_count);
delete_file.set_filesize(std::filesystem::file_size(delete_file_path->string()));
delete_file.add_equalityfieldids(1); // l_orderkey
substrait::ReadRel::LocalFiles::FileOrFiles::ParquetReadOptions parquet_format;
delete_file.mutable_parquet()->CopyFrom(parquet_format);
return {delete_file, delete_file_path};
}
/// Benchmark: iceberg read of lineitem's l_shipdate with either position or
/// equality deletes applied; state.range(0) is the delete percentage.
/// Position deletes are exact, so the surviving row count is verified exactly;
/// equality deletes may reference non-existent keys (see
/// `createEqualityDeleteFile`), so only a lower bound is checked.
template <bool is_position_delete>
void BM_IcebergReadWithDeletes(benchmark::State & state)
{
    using namespace DB;
    using namespace local_engine;
    using namespace local_engine::test;
    std::string file
        = third_party_data("benchmark/column_index/lineitem/part-00000-9395e12a-3620-4085-9677-c63b920353f4-c000.snappy.parquet");
    Block header{ColumnWithTypeAndName(DataTypeDate32().createColumn(), std::make_shared<DataTypeDate32>(), "l_shipdate")};
    auto [total_rows, delete_count] = calculateRowsAndDeleteCount(state, file, header);
    auto [delete_file, delete_file_path]
        = is_position_delete ? createPositionDeleteFile(delete_count, total_rows, file) : createEqualityDeleteFile(delete_count);
    for (auto _ : state)
    {
        size_t total_read_rows = readFileWithDeletesAndGetRowCount(file, header, delete_count ? &delete_file : nullptr);
        // The delete mode is a template parameter, so resolve the check at compile time.
        if constexpr (is_position_delete)
        {
            if (total_read_rows != total_rows - delete_count)
                throw Exception(DB::ErrorCodes::LOGICAL_ERROR, "Expected {}, but got {} ", total_rows - delete_count, total_read_rows);
        }
        else
        {
            // see `createEqualityDeleteFile` for `total_read_rows < total_rows - delete_count`
            if (total_read_rows < total_rows - delete_count)
                throw Exception(DB::ErrorCodes::LOGICAL_ERROR, "Read at least {}, but got {} ", total_rows - delete_count, total_read_rows);
        }
    }
}
/// Named entry point so the position-delete instantiation shows up with a
/// readable benchmark name.
void BM_IcebergReadWithPositionDeletes(benchmark::State & state)
{
    BM_IcebergReadWithDeletes<true>(state);
}
/// Named entry point so the equality-delete instantiation shows up with a
/// readable benchmark name.
void BM_IcebergReadWithEqualityDeletes(benchmark::State & state)
{
    BM_IcebergReadWithDeletes<false>(state);
}
}
// Benchmark registrations. Iteration counts are fixed (rather than letting the
// framework auto-tune) so runs stay comparable across code changes; all times
// are reported in milliseconds.
BENCHMARK(BM_ColumnIndexRead_Old)->Unit(benchmark::kMillisecond)->Iterations(20);
BENCHMARK(BM_ColumnIndexRead_NoFilter)->Unit(benchmark::kMillisecond)->Iterations(20);
BENCHMARK(BM_ColumnIndexRead_Filter_ReturnAllResult)->Unit(benchmark::kMillisecond)->Iterations(20);
BENCHMARK(BM_ColumnIndexRead_Filter_ReturnHalfResult)->Unit(benchmark::kMillisecond)->Iterations(20);
BENCHMARK(BM_ParquetReadDate32)->Unit(benchmark::kMillisecond)->Iterations(10);
BENCHMARK(BM_OptimizedParquetReadString)->Unit(benchmark::kMillisecond)->Iterations(10);
BENCHMARK(BM_OptimizedParquetReadDate32)->Unit(benchmark::kMillisecond)->Iterations(200);
// For the iceberg benchmarks, Arg(...) is the percentage of rows deleted
// (consumed via state.range(0) in calculateRowsAndDeleteCount).
BENCHMARK(BM_IcebergReadWithPositionDeletes)->Unit(benchmark::kMillisecond)->Iterations(10)->Arg(0)->Arg(1)->Arg(10)->Arg(50)->Arg(100);
BENCHMARK(BM_IcebergReadWithEqualityDeletes)->Unit(benchmark::kMillisecond)->Iterations(10)->Arg(0)->Arg(1)->Arg(5)->Arg(10)->Arg(50);