// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "benchmark/benchmark.h"
#include <array>
#include <bitset>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <memory>
#include <utility>
#include "arrow/array/array_base.h"
#include "arrow/array/array_primitive.h"
#include "arrow/buffer.h"
#include "arrow/result.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/testing/random.h"
#include "arrow/testing/util.h"
#include "arrow/type_fwd.h"
#include "arrow/util/bit_run_reader.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap.h"
#include "arrow/util/bitmap_generate.h"
#include "arrow/util/bitmap_ops.h"
#include "arrow/util/bitmap_reader.h"
#include "arrow/util/bitmap_visit.h"
#include "arrow/util/bitmap_writer.h"
namespace arrow {
namespace BitUtil {
constexpr int64_t kBufferSize = 1024 * 8;
#ifdef ARROW_WITH_BENCHMARKS_REFERENCE
// A naive bitmap reader implementation, meant as a baseline against
// internal::BitmapReader
class NaiveBitmapReader {
public:
NaiveBitmapReader(const uint8_t* bitmap, int64_t start_offset, int64_t length)
: bitmap_(bitmap), position_(0) {}
bool IsSet() const { return BitUtil::GetBit(bitmap_, position_); }
bool IsNotSet() const { return !IsSet(); }
void Next() { ++position_; }
private:
const uint8_t* bitmap_;
uint64_t position_;
};
// A naive bitmap writer implementation, meant as a baseline against
// internal::BitmapWriter
class NaiveBitmapWriter {
public:
NaiveBitmapWriter(uint8_t* bitmap, int64_t start_offset, int64_t length)
: bitmap_(bitmap), position_(0) {}
void Set() {
const int64_t byte_offset = position_ / 8;
const int64_t bit_offset = position_ % 8;
auto bit_set_mask = (1U << bit_offset);
bitmap_[byte_offset] = static_cast<uint8_t>(bitmap_[byte_offset] | bit_set_mask);
}
void Clear() {
const int64_t byte_offset = position_ / 8;
const int64_t bit_offset = position_ % 8;
auto bit_clear_mask = 0xFFU ^ (1U << bit_offset);
bitmap_[byte_offset] = static_cast<uint8_t>(bitmap_[byte_offset] & bit_clear_mask);
}
void Next() { ++position_; }
void Finish() {}
int64_t position() const { return position_; }
private:
uint8_t* bitmap_;
int64_t position_;
};
#endif
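
// Return a buffer of `nbytes` pseudo-random bytes, generated with a fixed seed
// so that results are reproducible across runs.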
static std::shared_ptr<Buffer> CreateRandomBuffer(int64_t nbytes) {
auto buffer = *AllocateBuffer(nbytes);
memset(buffer->mutable_data(), 0, nbytes);
random_bytes(nbytes, /*seed=*/0, buffer->mutable_data());
return std::move(buffer);
}
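
// Return a bitmap of `nbits` bits where roughly `set_percentage` percent of the
// bits are set. A `set_percentage` of -1 instead produces a strictly alternating
// 0/1 pattern, presumably the worst case for run-oriented readers.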
static std::shared_ptr<Buffer> CreateRandomBitsBuffer(int64_t nbits,
int64_t set_percentage) {
::arrow::random::RandomArrayGenerator rag(/*seed=*/23);
double set_probability =
static_cast<double>(set_percentage == -1 ? 0 : set_percentage) / 100.0;
std::shared_ptr<Buffer> buffer =
rag.Boolean(nbits, set_probability)->data()->buffers[1];
if (set_percentage == -1) {
internal::BitmapWriter writer(buffer->mutable_data(), /*start_offset=*/0,
/*length=*/nbits);
for (int x = 0; x < nbits; x++) {
if (x % 2 == 0) {
writer.Set();
} else {
writer.Clear();
}
writer.Next();
}
}
return buffer;
}
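
// AND two random bitmaps (the second one offset by state.range(1) bits) into a
// third bitmap using the strategy supplied in `do_and`, then count the set bits
// of the result so the work cannot be optimized away.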
template <typename DoAnd>
static void BenchmarkAndImpl(benchmark::State& state, DoAnd&& do_and) {
int64_t nbytes = state.range(0);
int64_t offset = state.range(1);
std::shared_ptr<Buffer> buffer_1 = CreateRandomBuffer(nbytes);
std::shared_ptr<Buffer> buffer_2 = CreateRandomBuffer(nbytes);
std::shared_ptr<Buffer> buffer_3 = CreateRandomBuffer(nbytes);
const int64_t num_bits = nbytes * 8 - offset;
internal::Bitmap bitmap_1{buffer_1, 0, num_bits};
internal::Bitmap bitmap_2{buffer_2, offset, num_bits};
internal::Bitmap bitmap_3{buffer_3, 0, num_bits};
for (auto _ : state) {
do_and({bitmap_1, bitmap_2}, &bitmap_3);
auto total = internal::CountSetBits(bitmap_3.buffer()->data(), bitmap_3.offset(),
bitmap_3.length());
benchmark::DoNotOptimize(total);
}
state.SetBytesProcessed(state.iterations() * nbytes);
}
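
// The variants below compare different AND strategies: the dedicated
// internal::BitmapAnd kernel, bit-at-a-time visitation through
// Bitmap::VisitBits, and word-at-a-time visitation through Bitmap::VisitWords
// with uint8_t and uint64_t words.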
static void BenchmarkBitmapAnd(benchmark::State& state) {
BenchmarkAndImpl(state, [](const internal::Bitmap(&bitmaps)[2], internal::Bitmap* out) {
internal::BitmapAnd(bitmaps[0].buffer()->data(), bitmaps[0].offset(),
bitmaps[1].buffer()->data(), bitmaps[1].offset(),
bitmaps[0].length(), 0, out->buffer()->mutable_data());
});
}
static void BenchmarkBitmapVisitBitsetAnd(benchmark::State& state) {
BenchmarkAndImpl(state, [](const internal::Bitmap(&bitmaps)[2], internal::Bitmap* out) {
int64_t i = 0;
internal::Bitmap::VisitBits(
bitmaps, [&](std::bitset<2> bits) { out->SetBitTo(i++, bits[0] && bits[1]); });
});
}
static void BenchmarkBitmapVisitUInt8And(benchmark::State& state) {
BenchmarkAndImpl(state, [](const internal::Bitmap(&bitmaps)[2], internal::Bitmap* out) {
int64_t i = 0;
internal::Bitmap::VisitWords(bitmaps, [&](std::array<uint8_t, 2> uint8s) {
reinterpret_cast<uint8_t*>(out->buffer()->mutable_data())[i++] =
uint8s[0] & uint8s[1];
});
});
}
static void BenchmarkBitmapVisitUInt64And(benchmark::State& state) {
BenchmarkAndImpl(state, [](const internal::Bitmap(&bitmaps)[2], internal::Bitmap* out) {
int64_t i = 0;
internal::Bitmap::VisitWords(bitmaps, [&](std::array<uint64_t, 2> uint64s) {
reinterpret_cast<uint64_t*>(out->buffer()->mutable_data())[i++] =
uint64s[0] & uint64s[1];
});
});
}
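
// Read every bit of a random bitmap with the given reader type, once through
// IsSet() and once through IsNotSet(); SetBytesProcessed accounts for both
// passes.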
template <typename BitmapReaderType>
static void BenchmarkBitmapReader(benchmark::State& state, int64_t nbytes) {
std::shared_ptr<Buffer> buffer = CreateRandomBuffer(nbytes);
const int64_t num_bits = nbytes * 8;
const uint8_t* bitmap = buffer->data();
for (auto _ : state) {
{
BitmapReaderType reader(bitmap, 0, num_bits);
int64_t total = 0;
for (int64_t i = 0; i < num_bits; i++) {
total += reader.IsSet();
reader.Next();
}
benchmark::DoNotOptimize(total);
}
{
BitmapReaderType reader(bitmap, 0, num_bits);
int64_t total = 0;
for (int64_t i = 0; i < num_bits; i++) {
total += !reader.IsNotSet();
reader.Next();
}
benchmark::DoNotOptimize(total);
}
}
state.SetBytesProcessed(2LL * state.iterations() * nbytes);
}
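
// Iterate over all bit runs of a 4096-bit bitmap with the given run reader,
// accumulating the number of set bits.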
template <typename BitRunReaderType>
static void BenchmarkBitRunReader(benchmark::State& state, int64_t set_percentage) {
constexpr int64_t kNumBits = 4096;
auto buffer = CreateRandomBitsBuffer(kNumBits, set_percentage);
for (auto _ : state) {
{
BitRunReaderType reader(buffer->data(), 0, kNumBits);
int64_t set_total = 0;
internal::BitRun br;
do {
br = reader.NextRun();
set_total += br.set ? br.length : 0;
} while (br.length != 0);
benchmark::DoNotOptimize(set_total);
}
}
state.SetBytesProcessed(state.iterations() * (kNumBits / 8));
}
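
// Same as above, but SetBitRunReader implementations only report runs of set
// bits, so every returned run length is accumulated.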
template <typename SetBitRunReaderType>
static void BenchmarkSetBitRunReader(benchmark::State& state, int64_t set_percentage) {
constexpr int64_t kNumBits = 4096;
auto buffer = CreateRandomBitsBuffer(kNumBits, set_percentage);
for (auto _ : state) {
{
SetBitRunReaderType reader(buffer->data(), 0, kNumBits);
int64_t set_total = 0;
internal::SetBitRun br;
do {
br = reader.NextRun();
set_total += br.length;
} while (br.length != 0);
benchmark::DoNotOptimize(set_total);
}
}
state.SetBytesProcessed(state.iterations() * (kNumBits / 8));
}
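
// Visit every bit of a random bitmap with the given VisitBits functor. The
// visit is done twice per iteration, presumably so the reported throughput is
// comparable with BenchmarkBitmapReader (hence the 2x factor below).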
template <typename VisitBitsFunctorType>
static void BenchmarkVisitBits(benchmark::State& state, int64_t nbytes) {
std::shared_ptr<Buffer> buffer = CreateRandomBuffer(nbytes);
const int64_t num_bits = nbytes * 8;
const uint8_t* bitmap = buffer->data();
for (auto _ : state) {
{
int64_t total = 0;
const auto visit = [&total](bool value) -> void { total += value; };
VisitBitsFunctorType()(bitmap, 0, num_bits, visit);
benchmark::DoNotOptimize(total);
}
{
int64_t total = 0;
const auto visit = [&total](bool value) -> void { total += value; };
VisitBitsFunctorType()(bitmap, 0, num_bits, visit);
benchmark::DoNotOptimize(total);
}
}
state.SetBytesProcessed(2LL * state.iterations() * nbytes);
}
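
// Repeating bit pattern fed to the writer and generator benchmarks below.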
constexpr bool pattern[] = {false, false, false, true, true, true};
static_assert(
(sizeof(pattern) / sizeof(pattern[0])) % 8 != 0,
"pattern must not be a multiple of 8, otherwise gcc can optimize with a memset");
template <typename BitmapWriterType>
static void BenchmarkBitmapWriter(benchmark::State& state, int64_t nbytes) {
std::shared_ptr<Buffer> buffer = CreateRandomBuffer(nbytes);
const int64_t num_bits = nbytes * 8;
uint8_t* bitmap = buffer->mutable_data();
for (auto _ : state) {
BitmapWriterType writer(bitmap, 0, num_bits);
int64_t pattern_index = 0;
for (int64_t i = 0; i < num_bits; i++) {
if (pattern[pattern_index++]) {
writer.Set();
} else {
writer.Clear();
}
if (pattern_index == sizeof(pattern) / sizeof(bool)) {
pattern_index = 0;
}
writer.Next();
}
writer.Finish();
benchmark::ClobberMemory();
}
state.SetBytesProcessed(state.iterations() * nbytes);
}
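
// Same workload as BenchmarkBitmapWriter, but the bits are produced by a
// generator callback handed to the GenerateBits / GenerateBitsUnrolled functor.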
template <typename GenerateBitsFunctorType>
static void BenchmarkGenerateBits(benchmark::State& state, int64_t nbytes) {
std::shared_ptr<Buffer> buffer = CreateRandomBuffer(nbytes);
const int64_t num_bits = nbytes * 8;
uint8_t* bitmap = buffer->mutable_data();
  for (auto _ : state) {
int64_t pattern_index = 0;
const auto generate = [&]() -> bool {
bool b = pattern[pattern_index++];
if (pattern_index == sizeof(pattern) / sizeof(bool)) {
pattern_index = 0;
}
return b;
};
GenerateBitsFunctorType()(bitmap, 0, num_bits, generate);
benchmark::ClobberMemory();
}
state.SetBytesProcessed(state.iterations() * nbytes);
}
static void BitmapReader(benchmark::State& state) {
BenchmarkBitmapReader<internal::BitmapReader>(state, state.range(0));
}
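
// Read a random bitmap 64 bits at a time with internal::BitmapUInt64Reader.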
static void BitmapUInt64Reader(benchmark::State& state) {
const int64_t nbytes = state.range(0);
std::shared_ptr<Buffer> buffer = CreateRandomBuffer(nbytes);
const int64_t num_bits = nbytes * 8;
const uint8_t* bitmap = buffer->data();
for (auto _ : state) {
{
internal::BitmapUInt64Reader reader(bitmap, 0, num_bits);
uint64_t total = 0;
for (int64_t i = 0; i < num_bits; i += 64) {
total += reader.NextWord();
}
benchmark::DoNotOptimize(total);
}
}
state.SetBytesProcessed(state.iterations() * nbytes);
}
static void BitRunReader(benchmark::State& state) {
BenchmarkBitRunReader<internal::BitRunReader>(state, state.range(0));
}
static void BitRunReaderLinear(benchmark::State& state) {
BenchmarkBitRunReader<internal::BitRunReaderLinear>(state, state.range(0));
}
static void SetBitRunReader(benchmark::State& state) {
BenchmarkSetBitRunReader<internal::SetBitRunReader>(state, state.range(0));
}
static void ReverseSetBitRunReader(benchmark::State& state) {
BenchmarkSetBitRunReader<internal::ReverseSetBitRunReader>(state, state.range(0));
}
static void BitmapWriter(benchmark::State& state) {
BenchmarkBitmapWriter<internal::BitmapWriter>(state, state.range(0));
}
static void FirstTimeBitmapWriter(benchmark::State& state) {
BenchmarkBitmapWriter<internal::FirstTimeBitmapWriter>(state, state.range(0));
}
struct GenerateBitsFunctor {
template <class Generator>
void operator()(uint8_t* bitmap, int64_t start_offset, int64_t length, Generator&& g) {
return internal::GenerateBits(bitmap, start_offset, length, g);
}
};
struct GenerateBitsUnrolledFunctor {
template <class Generator>
void operator()(uint8_t* bitmap, int64_t start_offset, int64_t length, Generator&& g) {
return internal::GenerateBitsUnrolled(bitmap, start_offset, length, g);
}
};
struct VisitBitsFunctor {
template <class Visitor>
void operator()(const uint8_t* bitmap, int64_t start_offset, int64_t length,
Visitor&& g) {
return internal::VisitBits(bitmap, start_offset, length, g);
}
};
struct VisitBitsUnrolledFunctor {
template <class Visitor>
void operator()(const uint8_t* bitmap, int64_t start_offset, int64_t length,
Visitor&& g) {
return internal::VisitBitsUnrolled(bitmap, start_offset, length, g);
}
};
static void GenerateBits(benchmark::State& state) {
BenchmarkGenerateBits<GenerateBitsFunctor>(state, state.range(0));
}
static void GenerateBitsUnrolled(benchmark::State& state) {
BenchmarkGenerateBits<GenerateBitsUnrolledFunctor>(state, state.range(0));
}
static void VisitBits(benchmark::State& state) {
BenchmarkVisitBits<VisitBitsFunctor>(state, state.range(0));
}
static void VisitBitsUnrolled(benchmark::State& state) {
BenchmarkVisitBits<VisitBitsUnrolledFunctor>(state, state.range(0));
}
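
// Set an entire bitmap to 1 with BitUtil::SetBitsTo.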
static void SetBitsTo(benchmark::State& state) {
int64_t nbytes = state.range(0);
std::shared_ptr<Buffer> buffer = CreateRandomBuffer(nbytes);
for (auto _ : state) {
BitUtil::SetBitsTo(buffer->mutable_data(), /*offset=*/0, nbytes * 8, true);
}
state.SetBytesProcessed(state.iterations() * nbytes);
}
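
// Copy a bitmap with the given source and destination bit offsets; non-zero
// offsets exercise the unaligned paths of internal::CopyBitmap.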
template <int64_t OffsetSrc, int64_t OffsetDest = 0>
static void CopyBitmap(benchmark::State& state) { // NOLINT non-const reference
const int64_t buffer_size = state.range(0);
const int64_t bits_size = buffer_size * 8;
std::shared_ptr<Buffer> buffer = CreateRandomBuffer(buffer_size);
const uint8_t* src = buffer->data();
const int64_t length = bits_size - OffsetSrc;
  // Size the destination for the offset as well, so a non-zero OffsetDest does
  // not write past the end of the allocated bitmap.
  auto copy = *AllocateEmptyBitmap(length + OffsetDest);
for (auto _ : state) {
internal::CopyBitmap(src, OffsetSrc, length, copy->mutable_data(), OffsetDest);
}
state.SetBytesProcessed(state.iterations() * buffer_size);
}
static void CopyBitmapWithoutOffset(
benchmark::State& state) { // NOLINT non-const reference
CopyBitmap<0>(state);
}
// Trigger the slow path where the source buffer is not byte aligned.
static void CopyBitmapWithOffset(benchmark::State& state) { // NOLINT non-const reference
CopyBitmap<4>(state);
}
// Trigger the slow path where both the source and destination buffers are not
// byte aligned.
static void CopyBitmapWithOffsetBoth(benchmark::State& state) { CopyBitmap<3, 7>(state); }
// Benchmark the worst case of comparing two identical bitmaps.
template <int64_t Offset = 0>
static void BitmapEquals(benchmark::State& state) {
const int64_t buffer_size = state.range(0);
const int64_t bits_size = buffer_size * 8;
std::shared_ptr<Buffer> buffer = CreateRandomBuffer(buffer_size);
const uint8_t* src = buffer->data();
const int64_t offset = Offset;
const int64_t length = bits_size - offset;
auto copy = *AllocateEmptyBitmap(length + offset);
internal::CopyBitmap(src, 0, length, copy->mutable_data(), offset);
for (auto _ : state) {
auto is_same = internal::BitmapEquals(src, 0, copy->data(), offset, length);
benchmark::DoNotOptimize(is_same);
}
state.SetBytesProcessed(state.iterations() * buffer_size);
}
static void BitmapEqualsWithoutOffset(benchmark::State& state) { BitmapEquals<0>(state); }
static void BitmapEqualsWithOffset(benchmark::State& state) { BitmapEquals<4>(state); }
#ifdef ARROW_WITH_BENCHMARKS_REFERENCE
static void ReferenceNaiveBitmapReader(benchmark::State& state) {
BenchmarkBitmapReader<NaiveBitmapReader>(state, state.range(0));
}
BENCHMARK(ReferenceNaiveBitmapReader)->Arg(kBufferSize);
#endif
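
// The benchmark argument is the percentage of set bits; -1 selects the
// alternating 0/1 pattern (see CreateRandomBitsBuffer).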
void SetBitRunReaderPercentageArg(benchmark::internal::Benchmark* bench) {
bench->Arg(-1)->Arg(0)->Arg(10)->Arg(25)->Arg(50)->Arg(60)->Arg(75)->Arg(99);
}
BENCHMARK(BitmapReader)->Arg(kBufferSize);
BENCHMARK(BitmapUInt64Reader)->Arg(kBufferSize);
BENCHMARK(BitRunReader)->Apply(SetBitRunReaderPercentageArg);
BENCHMARK(BitRunReaderLinear)->Apply(SetBitRunReaderPercentageArg);
BENCHMARK(SetBitRunReader)->Apply(SetBitRunReaderPercentageArg);
BENCHMARK(ReverseSetBitRunReader)->Apply(SetBitRunReaderPercentageArg);
BENCHMARK(VisitBits)->Arg(kBufferSize);
BENCHMARK(VisitBitsUnrolled)->Arg(kBufferSize);
BENCHMARK(SetBitsTo)->Arg(2)->Arg(1 << 4)->Arg(1 << 10)->Arg(1 << 17);
#ifdef ARROW_WITH_BENCHMARKS_REFERENCE
static void ReferenceNaiveBitmapWriter(benchmark::State& state) {
BenchmarkBitmapWriter<NaiveBitmapWriter>(state, state.range(0));
}
BENCHMARK(ReferenceNaiveBitmapWriter)->Arg(kBufferSize);
#endif
BENCHMARK(BitmapWriter)->Arg(kBufferSize);
BENCHMARK(FirstTimeBitmapWriter)->Arg(kBufferSize);
BENCHMARK(GenerateBits)->Arg(kBufferSize);
BENCHMARK(GenerateBitsUnrolled)->Arg(kBufferSize);
BENCHMARK(CopyBitmapWithoutOffset)->Arg(kBufferSize);
BENCHMARK(CopyBitmapWithOffset)->Arg(kBufferSize);
BENCHMARK(CopyBitmapWithOffsetBoth)->Arg(kBufferSize);
BENCHMARK(BitmapEqualsWithoutOffset)->Arg(kBufferSize);
BENCHMARK(BitmapEqualsWithOffset)->Arg(kBufferSize);
#define AND_BENCHMARK_RANGES \
{ \
{kBufferSize * 4, kBufferSize * 16}, { 0, 2 } \
}
BENCHMARK(BenchmarkBitmapAnd)->Ranges(AND_BENCHMARK_RANGES);
BENCHMARK(BenchmarkBitmapVisitBitsetAnd)->Ranges(AND_BENCHMARK_RANGES);
BENCHMARK(BenchmarkBitmapVisitUInt8And)->Ranges(AND_BENCHMARK_RANGES);
BENCHMARK(BenchmarkBitmapVisitUInt64And)->Ranges(AND_BENCHMARK_RANGES);
} // namespace BitUtil
} // namespace arrow