blob: 5b15f3d79f49cb267d9709c30b64dc8c23b4f5ef [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <string>
#include <vector>
#include <Core/Block.h>
#include <DataTypes/DataTypeFactory.h>
#include <IO/ReadBufferFromFile.h>
#include <Interpreters/Context.h>
#include <Parser/CHColumnToSparkRow.h>
#include <Parser/SparkRowToCHColumn.h>
#include <Processors/Executors/PullingPipelineExecutor.h>
#include <Processors/Formats/Impl/ArrowColumnToCHColumn.h>
#include <Processors/Formats/Impl/ParquetBlockInputFormat.h>
#include <QueryPipeline/QueryPipeline.h>
#include <base/types.h>
#include <benchmark/benchmark.h>
#include <Common/BlockTypeUtils.h>
#include <Common/QueryContext.h>
using namespace DB;
using namespace local_engine;
/// A column name paired with its ClickHouse data-type name
/// (e.g. {"l_orderkey", "Nullable(Int64)"}); resolved later via DataTypeFactory.
struct NameType
{
String name;
String type;
};
/// Ordered column schema used to build a Block header.
using NameTypes = std::vector<NameType>;
/// Builds an empty Block (header only: names + types, no data) from the given
/// schema. Type names are resolved through DataTypeFactory.
static Block getLineitemHeader(const NameTypes & name_types)
{
    auto & factory = DataTypeFactory::instance();
    ColumnsWithTypeAndName columns(name_types.size());
    for (size_t i = 0; i < columns.size(); ++i)
    {
        columns[i].name = name_types[i].name;
        columns[i].type = factory.get(name_types[i].type);
    }
    // Return the prvalue directly: wrapping it in std::move() pessimizes by
    // disabling copy elision (clang's -Wpessimizing-move flags this).
    return Block(columns);
}
static void readParquetFile(const SharedHeader & header, const String & file, Block & block)
{
auto in = std::make_unique<ReadBufferFromFile>(file);
FormatSettings format_settings;
auto global_context = QueryContext::globalContext();
auto parser_group = std::make_shared<FormatFilterInfo>(nullptr, global_context, nullptr);
auto parser_shared_resources
= std::make_shared<FormatParserSharedResources>(global_context->getSettingsRef(), /*num_streams_=*/1);
auto format = std::make_shared<ParquetBlockInputFormat>(*in, header, format_settings, parser_shared_resources, std::move(parser_group), 8192);
auto pipeline = QueryPipeline(std::move(format));
auto reader = std::make_unique<PullingPipelineExecutor>(pipeline);
while (reader->pull(block))
return;
}
/// Benchmark: converting one CH Block of TPCH lineitem data into Spark unsafe
/// rows. Reads the input block once up front; each iteration converts it and
/// releases the resulting row buffer.
static void BM_CHColumnToSparkRow_Lineitem(benchmark::State & state)
{
    const NameTypes lineitem_schema = {
        {"l_orderkey", "Nullable(Int64)"},
        {"l_partkey", "Nullable(Int64)"},
        {"l_suppkey", "Nullable(Int64)"},
        {"l_linenumber", "Nullable(Int64)"},
        {"l_quantity", "Nullable(Float64)"},
        {"l_extendedprice", "Nullable(Float64)"},
        {"l_discount", "Nullable(Float64)"},
        {"l_tax", "Nullable(Float64)"},
        {"l_returnflag", "Nullable(String)"},
        {"l_linestatus", "Nullable(String)"},
        {"l_shipdate", "Nullable(Date32)"},
        {"l_commitdate", "Nullable(Date32)"},
        {"l_receiptdate", "Nullable(Date32)"},
        {"l_shipinstruct", "Nullable(String)"},
        {"l_shipmode", "Nullable(String)"},
        {"l_comment", "Nullable(String)"},
    };
    auto lineitem_header = toShared(getLineitemHeader(lineitem_schema));
    // NOTE(review): hard-coded developer-local path; the benchmark only runs
    // on a machine where this file exists.
    const String parquet_path = "/data1/liyang/cppproject/gluten/gluten-core/src/test/resources/tpch-data/lineitem/"
                                "part-00000-d08071cb-0dfa-42dc-9198-83cb334ccda3-c000.snappy.parquet";
    Block input_block;
    readParquetFile(lineitem_header, parquet_path, input_block);
    CHColumnToSparkRow to_spark_row;
    for (auto _ : state)
    {
        auto row_info = to_spark_row.convertCHColumnToSparkRow(input_block);
        to_spark_row.freeMem(row_info->getBufferAddress(), row_info->getTotalBytes());
    }
}
/// Benchmark: converting Spark unsafe rows (built once from TPCH lineitem
/// data) back into a CH Block. Each iteration performs one full conversion.
static void BM_SparkRowToCHColumn_Lineitem(benchmark::State & state)
{
    const NameTypes name_types = {
        {"l_orderkey", "Nullable(Int64)"},
        {"l_partkey", "Nullable(Int64)"},
        {"l_suppkey", "Nullable(Int64)"},
        {"l_linenumber", "Nullable(Int64)"},
        {"l_quantity", "Nullable(Float64)"},
        {"l_extendedprice", "Nullable(Float64)"},
        {"l_discount", "Nullable(Float64)"},
        {"l_tax", "Nullable(Float64)"},
        {"l_returnflag", "Nullable(String)"},
        {"l_linestatus", "Nullable(String)"},
        {"l_shipdate", "Nullable(Date32)"},
        {"l_commitdate", "Nullable(Date32)"},
        {"l_receiptdate", "Nullable(Date32)"},
        {"l_shipinstruct", "Nullable(String)"},
        {"l_shipmode", "Nullable(String)"},
        {"l_comment", "Nullable(String)"},
    };
    auto header = toShared(getLineitemHeader(name_types));
    // NOTE(review): hard-coded developer-local path; the benchmark only runs
    // on a machine where this file exists.
    const String file = "/data1/liyang/cppproject/gluten/gluten-core/src/test/resources/tpch-data/lineitem/"
                        "part-00000-d08071cb-0dfa-42dc-9198-83cb334ccda3-c000.snappy.parquet";
    Block in_block;
    readParquetFile(header, file, in_block);
    CHColumnToSparkRow spark_row_converter;
    auto spark_row_info = spark_row_converter.convertCHColumnToSparkRow(in_block);
    for (auto _ : state)
    {
        auto out_block = SparkRowToCHColumn::convertSparkRowInfoToCHColumn(*spark_row_info, *header);
        // Keep the result observable so the compiler cannot dead-code-eliminate
        // the conversion being measured.
        benchmark::DoNotOptimize(out_block);
    }
    // Release the row buffer, mirroring BM_CHColumnToSparkRow_Lineitem
    // (the original leaked it for the process lifetime).
    spark_row_converter.freeMem(spark_row_info->getBufferAddress(), spark_row_info->getTotalBytes());
}
/// Register both benchmarks: fixed at 10 iterations each, times reported in milliseconds.
BENCHMARK(BM_CHColumnToSparkRow_Lineitem)->Unit(benchmark::kMillisecond)->Iterations(10);
BENCHMARK(BM_SparkRowToCHColumn_Lineitem)->Unit(benchmark::kMillisecond)->Iterations(10);