PARQUET-1094: Add benchmark for boolean Arrow column I/O
Author: Uwe L. Korn <uwe@apache.org>
Closes #391 from xhochy/PARQUET-1094 and squashes the following commits:
089bb3c [Uwe L. Korn] PARQUET-1094: Add benchmark for boolean Arrow column I/O
diff --git a/src/parquet/arrow/arrow-reader-writer-benchmark.cc b/src/parquet/arrow/arrow-reader-writer-benchmark.cc
index 84a6fb3..e899e10 100644
--- a/src/parquet/arrow/arrow-reader-writer-benchmark.cc
+++ b/src/parquet/arrow/arrow-reader-writer-benchmark.cc
@@ -27,6 +27,7 @@
#include "arrow/api.h"
+using arrow::BooleanBuilder;
using arrow::NumericBuilder;
#define ABORT_NOT_OK(s) \
@@ -66,6 +67,11 @@
using arrow_type = ::arrow::DoubleType;
};
+template <>
+struct benchmark_traits<BooleanType> {
+ using arrow_type = ::arrow::BooleanType;
+};
+
template <typename ParquetType>
using ArrowType = typename benchmark_traits<ParquetType>::arrow_type;
@@ -86,11 +92,11 @@
state.SetBytesProcessed(bytes_processed);
}
-template <bool nullable, typename ParquetType>
+template <typename ParquetType>
std::shared_ptr<::arrow::Table> TableFromVector(
- const std::vector<typename ParquetType::c_type>& vec) {
+ const std::vector<typename ParquetType::c_type>& vec, bool nullable) {
::arrow::TypePtr type = std::make_shared<ArrowType<ParquetType>>();
- NumericBuilder<ArrowType<ParquetType>> builder(type, ::arrow::default_memory_pool());
+ NumericBuilder<ArrowType<ParquetType>> builder;
if (nullable) {
std::vector<uint8_t> valid_bytes(BENCHMARK_SIZE, 0);
int n = {0};
@@ -101,7 +107,32 @@
}
std::shared_ptr<::arrow::Array> array;
ABORT_NOT_OK(builder.Finish(&array));
- auto field = std::make_shared<::arrow::Field>("column", type, nullable);
+
+ auto field = ::arrow::field("column", type, nullable);
+ auto schema = std::make_shared<::arrow::Schema>(
+ std::vector<std::shared_ptr<::arrow::Field>>({field}));
+ auto column = std::make_shared<::arrow::Column>(field, array);
+ return std::make_shared<::arrow::Table>(
+ schema, std::vector<std::shared_ptr<::arrow::Column>>({column}));
+}
+
+template <>
+std::shared_ptr<::arrow::Table> TableFromVector<BooleanType>(const std::vector<bool>& vec,
+ bool nullable) {
+ BooleanBuilder builder;
+ if (nullable) {
+ std::vector<bool> valid_bytes(BENCHMARK_SIZE, 0);
+ int n = {0};
+ std::generate(valid_bytes.begin(), valid_bytes.end(),
+ [&n] { return (n++ % 2) != 0; });
+ ABORT_NOT_OK(builder.Append(vec, valid_bytes));
+ } else {
+ ABORT_NOT_OK(builder.Append(vec));
+ }
+ std::shared_ptr<::arrow::Array> array;
+ ABORT_NOT_OK(builder.Finish(&array));
+
+ auto field = ::arrow::field("column", ::arrow::boolean(), nullable);
auto schema = std::make_shared<::arrow::Schema>(
std::vector<std::shared_ptr<::arrow::Field>>({field}));
auto column = std::make_shared<::arrow::Column>(field, array);
@@ -113,7 +144,7 @@
static void BM_WriteColumn(::benchmark::State& state) {
format::ColumnChunk thrift_metadata;
std::vector<typename ParquetType::c_type> values(BENCHMARK_SIZE, 128);
- std::shared_ptr<::arrow::Table> table = TableFromVector<nullable, ParquetType>(values);
+ std::shared_ptr<::arrow::Table> table = TableFromVector<ParquetType>(values, nullable);
while (state.KeepRunning()) {
auto output = std::make_shared<InMemoryOutputStream>();
@@ -132,10 +163,13 @@
BENCHMARK_TEMPLATE2(BM_WriteColumn, false, DoubleType);
BENCHMARK_TEMPLATE2(BM_WriteColumn, true, DoubleType);
+BENCHMARK_TEMPLATE2(BM_WriteColumn, false, BooleanType);
+BENCHMARK_TEMPLATE2(BM_WriteColumn, true, BooleanType);
+
template <bool nullable, typename ParquetType>
static void BM_ReadColumn(::benchmark::State& state) {
std::vector<typename ParquetType::c_type> values(BENCHMARK_SIZE, 128);
- std::shared_ptr<::arrow::Table> table = TableFromVector<nullable, ParquetType>(values);
+ std::shared_ptr<::arrow::Table> table = TableFromVector<ParquetType>(values, nullable);
auto output = std::make_shared<InMemoryOutputStream>();
ABORT_NOT_OK(
WriteTable(*table, ::arrow::default_memory_pool(), output, BENCHMARK_SIZE));
@@ -160,6 +194,9 @@
BENCHMARK_TEMPLATE2(BM_ReadColumn, false, DoubleType);
BENCHMARK_TEMPLATE2(BM_ReadColumn, true, DoubleType);
+BENCHMARK_TEMPLATE2(BM_ReadColumn, false, BooleanType);
+BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType);
+
} // namespace benchmark
} // namespace parquet