blob: 014b3fe412ee227009e06b3755e845932c38e023 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <cmath>
#include <unordered_set>
#include <vector>
#include "common/names.h"
#include "kudu/util/random.h"
#include "testutil/gtest-util.h"
#include "util/parquet-bloom-filter.h"
namespace impala {
TEST(ParquetBloomFilter, OptimalByteSize) {
// Check that results are consistent with kudu::BlockBloomFilter::MinLogSpace(ndv, fpp).
std::vector<size_t> ndvs = {1, 100, 500, 1000, 10000, 100000, 1024*1024};
std::vector<double> fpps = {0.01, 0.001};
for (uint64_t ndv : ndvs) {
for (double fpp : fpps) {
const int expected_log_bytes = ParquetBloomFilter::MinLogSpace(ndv, fpp);
const int expected_bytes = std::pow(2, expected_log_bytes);
const int actual_bytes = ParquetBloomFilter::OptimalByteSize(ndv, fpp);
if (expected_bytes < ParquetBloomFilter::MIN_BYTES) {
EXPECT_EQ(actual_bytes, ParquetBloomFilter::MIN_BYTES)
<< "NDV: " << ndv << ", FPP: " << fpp << ".";
} else if (expected_bytes > ParquetBloomFilter::MAX_BYTES) {
EXPECT_EQ(actual_bytes, ParquetBloomFilter::MAX_BYTES)
<< "NDV: " << ndv << ", FPP: " << fpp << ".";
} else {
EXPECT_EQ(actual_bytes, expected_bytes)
<< "NDV: " << ndv << ", FPP: " << fpp << ".";
}
}
}
}
TEST(ParquetBloomFilter, OptimalByteSizeEdgeCase) {
const int space = ParquetBloomFilter::OptimalByteSize(1, 0.75);
EXPECT_GE(space, ParquetBloomFilter::MIN_BYTES)
<< "OptimalByteSize should always be >= ParquetBloomFilter::MIN_BYTES";
}
// A ParquetBloomFilter with storage.
struct BloomWrapper {
std::unique_ptr<ParquetBloomFilter> bloom;
std::unique_ptr<vector<uint8_t>> storage;
};
BloomWrapper CreateBloomFilter(int ndv, double fpp) {
const int storage_size = ParquetBloomFilter::OptimalByteSize(ndv, fpp);
BloomWrapper res;
res.bloom = std::make_unique<ParquetBloomFilter>();
res.storage = std::make_unique<vector<uint8_t>>(storage_size, 0);
Status status = res.bloom->Init(res.storage->data(), storage_size, true);
EXPECT_TRUE(status.ok()) << status.GetDetail();
return res;
}
// Make a random uint64_t, avoiding the absent high bit and the low-entropy low bits
// produced by rand().
uint64_t MakeRand() {
uint32_t high = (rand() >> 8) & 0xffff;
high <<= 16;
high |= (rand() >> 8) & 0xffff;
uint32_t low = (rand() >> 8) & 0xffff;
low <<= 16;
low |= (rand() >> 8) & 0xffff;
uint64_t result = high;
result <<= 32;
result |= low;
return result;
}
// We can Insert() hashes into a Bloom filter with different spaces.
TEST(ParquetBloomFilter, Insert) {
srand(0);
for (int ndv = 100; ndv <= 100000; ndv *= 10) {
BloomWrapper wrapper = CreateBloomFilter(ndv, 0.01);
for (int i = 0; i < ndv; ++i) {
wrapper.bloom->Insert(MakeRand());
}
}
}
// After Insert()ing something into a Bloom filter, it can be found again immediately.
TEST(ParquetBloomFilter, Find) {
srand(0);
for (int ndv = 100; ndv <= 100000; ndv *= 10) {
BloomWrapper wrapper = CreateBloomFilter(ndv, 0.01);
for (int i = 0; i < ndv; ++i) {
const uint64_t to_insert = MakeRand();
wrapper.bloom->Insert(to_insert);
EXPECT_TRUE(wrapper.bloom->Find(to_insert));
}
}
}
TEST(ParquetBloomFilter, HashAndFind) {
srand(0);
for (int ndv = 100; ndv <= 100000; ndv *= 10) {
BloomWrapper wrapper = CreateBloomFilter(ndv, 0.01);
for (int i = 0; i < ndv; ++i) {
const uint64_t to_insert = MakeRand();
wrapper.bloom->HashAndInsert(reinterpret_cast<const uint8_t*>(&to_insert),
sizeof(uint64_t));
EXPECT_TRUE(wrapper.bloom->HashAndFind(reinterpret_cast<const uint8_t*>(&to_insert),
sizeof(uint64_t)));
}
}
}
// After Insert()ing something into a Bloom filter, it can be found again much later.
TEST(ParquetBloomFilter, CumulativeFind) {
srand(0);
for (int ndv = 100; ndv <= 1000; ndv *= 10) {
std::vector<uint64_t> inserted;
BloomWrapper wrapper = CreateBloomFilter(ndv, 0.01);
for (int i = 0; i < ndv; ++i) {
const uint64_t to_insert = MakeRand();
inserted.push_back(to_insert);
wrapper.bloom->Insert(to_insert);
for (int n = 0; n < inserted.size(); ++n) {
EXPECT_TRUE(wrapper.bloom->Find(inserted[n]));
}
}
}
}
// The empirical false positives we find when looking for random items is with a constant
// factor of the false positive probability the Bloom filter was constructed for.
TEST(ParquetBloomFilter, FindInvalid) {
// We use a deterministic pseudorandom number generator with a set seed. The reason is
// that with a run-dependent seed, there will always be inputs that can fail. That's a
// potential argument for this to be a benchmark rather than a test, although the
// measured quantity would be not time but deviation from predicted fpp.
kudu::Random rgen(867 + 5309);
static const int find_limit = 1 << 22;
std::unordered_set<uint64_t> to_find;
while (to_find.size() < find_limit) {
to_find.insert(rgen.Next64());
}
static const int max_log_ndv = 19;
std::unordered_set<uint64_t> to_insert;
while (to_insert.size() < (1ull << max_log_ndv)) {
const uint64_t candidate = rgen.Next64();
if (to_find.find(candidate) == to_find.end()) {
to_insert.insert(candidate);
}
}
// NOTE: Even though this was generated with a deterministic pseudorandom number
// generator, the order of items in the vector is dependent on the implementation
// of unordered_set, which can change with different libstdc++ versions. Since
// the tests below use the first X entries, changes in the order also change the
// test.
vector<uint64_t> shuffled_insert(to_insert.begin(), to_insert.end());
for (int log_ndv = 12; log_ndv < max_log_ndv; ++log_ndv) {
for (int log_fpp = 4; log_fpp < 12; ++log_fpp) {
double fpp = 1.0 / (1 << log_fpp);
const size_t ndv = 1 << log_ndv;
BloomWrapper wrapper = CreateBloomFilter(ndv, fpp);
ParquetBloomFilter* bloom = wrapper.bloom.get();
// Fill up a BF with exactly as much ndv as we planned for it:
for (size_t i = 0; i < ndv; ++i) {
bloom->Insert(shuffled_insert[i]);
}
int found = 0;
// Now we sample from the set of possible hashes, looking for hits.
for (const uint64_t hash_to_find : to_find) {
bool elem_found = bloom->Find(hash_to_find);
found += elem_found;
}
EXPECT_LE(found, find_limit * fpp * 2)
<< "Too many false positives with -log2(fpp) = " << log_fpp
<< " and ndv = " << ndv;
// Because the space is rounded up to a power of 2, we might actually get a lower
// fpp than the one passed to MinLogSpace().
const int log_space = log2(wrapper.storage->size());
const double expected_fpp =
ParquetBloomFilter::FalsePositiveProb(ndv, log_space);
// Fudge factors are present because filter characteristics are true in the limit,
// and will deviate for small samples.
EXPECT_GE(found, find_limit * expected_fpp * 0.75)
<< "Too few false positives with -log2(fpp) = " << log_fpp
<< " expected_fpp = " << expected_fpp;
EXPECT_LE(found, find_limit * expected_fpp * 1.25)
<< "Too many false positives with -log2(fpp) = " << log_fpp
<< " expected_fpp = " << expected_fpp;
}
}
}
} // namespace impala