be/src/util/parquet-bloom-filter-test.cc - impala - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 #include <cmath>
 #include <unordered_set>
 #include <vector>

 #include "common/names.h"
 #include "kudu/util/random.h"
 #include "testutil/gtest-util.h"
 #include "util/parquet-bloom-filter.h"

 namespace impala {

 TEST(ParquetBloomFilter, OptimalByteSize) {
   // Check that results are consistent with kudu::BlockBloomFilter::MinLogSpace(ndv, fpp).
   std::vector<size_t> ndvs = {1, 100, 500, 1000, 10000, 100000, 1024*1024};
   std::vector<double> fpps = {0.01, 0.001};

   for (uint64_t ndv : ndvs) {
     for (double fpp : fpps) {
       const int expected_log_bytes = ParquetBloomFilter::MinLogSpace(ndv, fpp);
       const int expected_bytes = std::pow(2, expected_log_bytes);
       const int actual_bytes = ParquetBloomFilter::OptimalByteSize(ndv, fpp);

       if (expected_bytes < ParquetBloomFilter::MIN_BYTES) {
         EXPECT_EQ(actual_bytes, ParquetBloomFilter::MIN_BYTES)
             << "NDV: " << ndv << ", FPP: " << fpp << ".";
       } else if (expected_bytes > ParquetBloomFilter::MAX_BYTES) {
         EXPECT_EQ(actual_bytes, ParquetBloomFilter::MAX_BYTES)
             << "NDV: " << ndv << ", FPP: " << fpp << ".";
       } else {
         EXPECT_EQ(actual_bytes, expected_bytes)
             << "NDV: " << ndv << ", FPP: " << fpp << ".";
       }
     }
   }
 }

 TEST(ParquetBloomFilter, OptimalByteSizeEdgeCase) {
   const int space = ParquetBloomFilter::OptimalByteSize(1, 0.75);
   EXPECT_GE(space, ParquetBloomFilter::MIN_BYTES)
       << "OptimalByteSize should always be >= ParquetBloomFilter::MIN_BYTES";
 }

 // A ParquetBloomFilter with storage.
 struct BloomWrapper {
   std::unique_ptr<ParquetBloomFilter> bloom;
   std::unique_ptr<vector<uint8_t>> storage;
 };

 BloomWrapper CreateBloomFilter(int ndv, double fpp) {
   const int storage_size = ParquetBloomFilter::OptimalByteSize(ndv, fpp);
   BloomWrapper res;
   res.bloom = std::make_unique<ParquetBloomFilter>();
   res.storage = std::make_unique<vector<uint8_t>>(storage_size, 0);
   Status status = res.bloom->Init(res.storage->data(), storage_size, true);
   EXPECT_TRUE(status.ok()) << status.GetDetail();
   return res;
 }

 // Make a random uint64_t, avoiding the absent high bit and the low-entropy low bits
 // produced by rand().
 uint64_t MakeRand() {
   uint32_t high = (rand() >> 8) & 0xffff;
   high <<= 16;
   high |= (rand() >> 8) & 0xffff;

   uint32_t low = (rand() >> 8) & 0xffff;
   low <<= 16;
   low |= (rand() >> 8) & 0xffff;

   uint64_t result = high;
   result <<= 32;
   result |= low;
   return result;
 }

 // We can Insert() hashes into a Bloom filter with different spaces.
 TEST(ParquetBloomFilter, Insert) {
   srand(0);
   for (int ndv = 100; ndv <= 100000; ndv *= 10) {
     BloomWrapper wrapper = CreateBloomFilter(ndv, 0.01);
     for (int i = 0; i < ndv; ++i) {
       wrapper.bloom->Insert(MakeRand());
     }
   }
 }

 // After Insert()ing something into a Bloom filter, it can be found again immediately.
 TEST(ParquetBloomFilter, Find) {
   srand(0);
   for (int ndv = 100; ndv <= 100000; ndv *= 10) {
     BloomWrapper wrapper = CreateBloomFilter(ndv, 0.01);
     for (int i = 0; i < ndv; ++i) {
       const uint64_t to_insert = MakeRand();
       wrapper.bloom->Insert(to_insert);
       EXPECT_TRUE(wrapper.bloom->Find(to_insert));
     }
   }
 }

 TEST(ParquetBloomFilter, HashAndFind) {
   srand(0);
   for (int ndv = 100; ndv <= 100000; ndv *= 10) {
     BloomWrapper wrapper = CreateBloomFilter(ndv, 0.01);
     for (int i = 0; i < ndv; ++i) {
       const uint64_t to_insert = MakeRand();
       wrapper.bloom->HashAndInsert(reinterpret_cast<const uint8_t*>(&to_insert),
           sizeof(uint64_t));
       EXPECT_TRUE(wrapper.bloom->HashAndFind(reinterpret_cast<const uint8_t*>(&to_insert),
               sizeof(uint64_t)));
     }
   }
 }


 // After Insert()ing something into a Bloom filter, it can be found again much later.
 TEST(ParquetBloomFilter, CumulativeFind) {
   srand(0);
   for (int ndv = 100; ndv <= 1000; ndv *= 10) {
     std::vector<uint64_t> inserted;
     BloomWrapper wrapper = CreateBloomFilter(ndv, 0.01);
     for (int i = 0; i < ndv; ++i) {
       const uint64_t to_insert = MakeRand();
       inserted.push_back(to_insert);
       wrapper.bloom->Insert(to_insert);
       for (int n = 0; n < inserted.size(); ++n) {
         EXPECT_TRUE(wrapper.bloom->Find(inserted[n]));
       }
     }
   }
 }

 // The empirical false positives we find when looking for random items is with a constant
 // factor of the false positive probability the Bloom filter was constructed for.
 TEST(ParquetBloomFilter, FindInvalid) {
   // We use a deterministic pseudorandom number generator with a set seed. The reason is
   // that with a run-dependent seed, there will always be inputs that can fail. That's a
   // potential argument for this to be a benchmark rather than a test, although the
   // measured quantity would be not time but deviation from predicted fpp.
   kudu::Random rgen(867 + 5309);
   static const int find_limit = 1 << 22;
   std::unordered_set<uint64_t> to_find;
   while (to_find.size() < find_limit) {
     to_find.insert(rgen.Next64());
   }
   static const int max_log_ndv = 19;
   std::unordered_set<uint64_t> to_insert;
   while (to_insert.size() < (1ull << max_log_ndv)) {
     const uint64_t candidate = rgen.Next64();
     if (to_find.find(candidate) == to_find.end()) {
       to_insert.insert(candidate);
     }
   }
   // NOTE: Even though this was generated with a deterministic pseudorandom number
   // generator, the order of items in the vector is dependent on the implementation
   // of unordered_set, which can change with different libstdc++ versions. Since
   // the tests below use the first X entries, changes in the order also change the
   // test.
   vector<uint64_t> shuffled_insert(to_insert.begin(), to_insert.end());
   for (int log_ndv = 12; log_ndv < max_log_ndv; ++log_ndv) {
     for (int log_fpp = 4; log_fpp < 12; ++log_fpp) {
       double fpp = 1.0 / (1 << log_fpp);
       const size_t ndv = 1 << log_ndv;
       BloomWrapper wrapper = CreateBloomFilter(ndv, fpp);
       ParquetBloomFilter* bloom = wrapper.bloom.get();

       // Fill up a BF with exactly as much ndv as we planned for it:
       for (size_t i = 0; i < ndv; ++i) {
         bloom->Insert(shuffled_insert[i]);
       }
       int found = 0;
       // Now we sample from the set of possible hashes, looking for hits.
       for (const uint64_t hash_to_find : to_find) {
         bool elem_found = bloom->Find(hash_to_find);
         found += elem_found;
       }
       EXPECT_LE(found, find_limit * fpp * 2)
           << "Too many false positives with -log2(fpp) = " << log_fpp
           << " and ndv = " << ndv;

       // Because the space is rounded up to a power of 2, we might actually get a lower
       // fpp than the one passed to MinLogSpace().
       const int log_space = log2(wrapper.storage->size());
       const double expected_fpp =
           ParquetBloomFilter::FalsePositiveProb(ndv, log_space);
       // Fudge factors are present because filter characteristics are true in the limit,
       // and will deviate for small samples.
       EXPECT_GE(found, find_limit * expected_fpp * 0.75)
           << "Too few false positives with -log2(fpp) = " << log_fpp
           << " expected_fpp = " << expected_fpp;
       EXPECT_LE(found, find_limit * expected_fpp * 1.25)
           << "Too many false positives with -log2(fpp) = " << log_fpp
           << " expected_fpp = " << expected_fpp;
     }
   }
 }

 } // namespace impala
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.

	#include <cmath>
	#include <unordered_set>
	#include <vector>

	#include "common/names.h"
	#include "kudu/util/random.h"
	#include "testutil/gtest-util.h"
	#include "util/parquet-bloom-filter.h"

	namespace impala {

	TEST(ParquetBloomFilter, OptimalByteSize) {
	// Check that results are consistent with kudu::BlockBloomFilter::MinLogSpace(ndv, fpp).
	std::vector<size_t> ndvs = {1, 100, 500, 1000, 10000, 100000, 1024*1024};
	std::vector<double> fpps = {0.01, 0.001};

	for (uint64_t ndv : ndvs) {
	for (double fpp : fpps) {
	const int expected_log_bytes = ParquetBloomFilter::MinLogSpace(ndv, fpp);
	const int expected_bytes = std::pow(2, expected_log_bytes);
	const int actual_bytes = ParquetBloomFilter::OptimalByteSize(ndv, fpp);

	if (expected_bytes < ParquetBloomFilter::MIN_BYTES) {
	EXPECT_EQ(actual_bytes, ParquetBloomFilter::MIN_BYTES)
	<< "NDV: " << ndv << ", FPP: " << fpp << ".";
	} else if (expected_bytes > ParquetBloomFilter::MAX_BYTES) {
	EXPECT_EQ(actual_bytes, ParquetBloomFilter::MAX_BYTES)
	<< "NDV: " << ndv << ", FPP: " << fpp << ".";
	} else {
	EXPECT_EQ(actual_bytes, expected_bytes)
	<< "NDV: " << ndv << ", FPP: " << fpp << ".";
	}
	}
	}
	}

	TEST(ParquetBloomFilter, OptimalByteSizeEdgeCase) {
	const int space = ParquetBloomFilter::OptimalByteSize(1, 0.75);
	EXPECT_GE(space, ParquetBloomFilter::MIN_BYTES)
	<< "OptimalByteSize should always be >= ParquetBloomFilter::MIN_BYTES";
	}

	// A ParquetBloomFilter with storage.
	struct BloomWrapper {
	std::unique_ptr<ParquetBloomFilter> bloom;
	std::unique_ptr<vector<uint8_t>> storage;
	};

	BloomWrapper CreateBloomFilter(int ndv, double fpp) {
	const int storage_size = ParquetBloomFilter::OptimalByteSize(ndv, fpp);
	BloomWrapper res;
	res.bloom = std::make_unique<ParquetBloomFilter>();
	res.storage = std::make_unique<vector<uint8_t>>(storage_size, 0);
	Status status = res.bloom->Init(res.storage->data(), storage_size, true);
	EXPECT_TRUE(status.ok()) << status.GetDetail();
	return res;
	}

	// Make a random uint64_t, avoiding the absent high bit and the low-entropy low bits
	// produced by rand().
	uint64_t MakeRand() {
	uint32_t high = (rand() >> 8) & 0xffff;
	high <<= 16;
	high \|= (rand() >> 8) & 0xffff;

	uint32_t low = (rand() >> 8) & 0xffff;
	low <<= 16;
	low \|= (rand() >> 8) & 0xffff;

	uint64_t result = high;
	result <<= 32;
	result \|= low;
	return result;
	}

	// We can Insert() hashes into a Bloom filter with different spaces.
	TEST(ParquetBloomFilter, Insert) {
	srand(0);
	for (int ndv = 100; ndv <= 100000; ndv *= 10) {
	BloomWrapper wrapper = CreateBloomFilter(ndv, 0.01);
	for (int i = 0; i < ndv; ++i) {
	wrapper.bloom->Insert(MakeRand());
	}
	}
	}

	// After Insert()ing something into a Bloom filter, it can be found again immediately.
	TEST(ParquetBloomFilter, Find) {
	srand(0);
	for (int ndv = 100; ndv <= 100000; ndv *= 10) {
	BloomWrapper wrapper = CreateBloomFilter(ndv, 0.01);
	for (int i = 0; i < ndv; ++i) {
	const uint64_t to_insert = MakeRand();
	wrapper.bloom->Insert(to_insert);
	EXPECT_TRUE(wrapper.bloom->Find(to_insert));
	}
	}
	}

	TEST(ParquetBloomFilter, HashAndFind) {
	srand(0);
	for (int ndv = 100; ndv <= 100000; ndv *= 10) {
	BloomWrapper wrapper = CreateBloomFilter(ndv, 0.01);
	for (int i = 0; i < ndv; ++i) {
	const uint64_t to_insert = MakeRand();
	wrapper.bloom->HashAndInsert(reinterpret_cast<const uint8_t*>(&to_insert),
	sizeof(uint64_t));
	EXPECT_TRUE(wrapper.bloom->HashAndFind(reinterpret_cast<const uint8_t*>(&to_insert),
	sizeof(uint64_t)));
	}
	}
	}


	// After Insert()ing something into a Bloom filter, it can be found again much later.
	TEST(ParquetBloomFilter, CumulativeFind) {
	srand(0);
	for (int ndv = 100; ndv <= 1000; ndv *= 10) {
	std::vector<uint64_t> inserted;
	BloomWrapper wrapper = CreateBloomFilter(ndv, 0.01);
	for (int i = 0; i < ndv; ++i) {
	const uint64_t to_insert = MakeRand();
	inserted.push_back(to_insert);
	wrapper.bloom->Insert(to_insert);
	for (int n = 0; n < inserted.size(); ++n) {
	EXPECT_TRUE(wrapper.bloom->Find(inserted[n]));
	}
	}
	}
	}

	// The empirical false positives we find when looking for random items is with a constant
	// factor of the false positive probability the Bloom filter was constructed for.
	TEST(ParquetBloomFilter, FindInvalid) {
	// We use a deterministic pseudorandom number generator with a set seed. The reason is
	// that with a run-dependent seed, there will always be inputs that can fail. That's a
	// potential argument for this to be a benchmark rather than a test, although the
	// measured quantity would be not time but deviation from predicted fpp.
	kudu::Random rgen(867 + 5309);
	static const int find_limit = 1 << 22;
	std::unordered_set<uint64_t> to_find;
	while (to_find.size() < find_limit) {
	to_find.insert(rgen.Next64());
	}
	static const int max_log_ndv = 19;
	std::unordered_set<uint64_t> to_insert;
	while (to_insert.size() < (1ull << max_log_ndv)) {
	const uint64_t candidate = rgen.Next64();
	if (to_find.find(candidate) == to_find.end()) {
	to_insert.insert(candidate);
	}
	}
	// NOTE: Even though this was generated with a deterministic pseudorandom number
	// generator, the order of items in the vector is dependent on the implementation
	// of unordered_set, which can change with different libstdc++ versions. Since
	// the tests below use the first X entries, changes in the order also change the
	// test.
	vector<uint64_t> shuffled_insert(to_insert.begin(), to_insert.end());
	for (int log_ndv = 12; log_ndv < max_log_ndv; ++log_ndv) {
	for (int log_fpp = 4; log_fpp < 12; ++log_fpp) {
	double fpp = 1.0 / (1 << log_fpp);
	const size_t ndv = 1 << log_ndv;
	BloomWrapper wrapper = CreateBloomFilter(ndv, fpp);
	ParquetBloomFilter* bloom = wrapper.bloom.get();

	// Fill up a BF with exactly as much ndv as we planned for it:
	for (size_t i = 0; i < ndv; ++i) {
	bloom->Insert(shuffled_insert[i]);
	}
	int found = 0;
	// Now we sample from the set of possible hashes, looking for hits.
	for (const uint64_t hash_to_find : to_find) {
	bool elem_found = bloom->Find(hash_to_find);
	found += elem_found;
	}
	EXPECT_LE(found, find_limit * fpp * 2)
	<< "Too many false positives with -log2(fpp) = " << log_fpp
	<< " and ndv = " << ndv;

	// Because the space is rounded up to a power of 2, we might actually get a lower
	// fpp than the one passed to MinLogSpace().
	const int log_space = log2(wrapper.storage->size());
	const double expected_fpp =
	ParquetBloomFilter::FalsePositiveProb(ndv, log_space);
	// Fudge factors are present because filter characteristics are true in the limit,
	// and will deviate for small samples.
	EXPECT_GE(found, find_limit * expected_fpp * 0.75)
	<< "Too few false positives with -log2(fpp) = " << log_fpp
	<< " expected_fpp = " << expected_fpp;
	EXPECT_LE(found, find_limit * expected_fpp * 1.25)
	<< "Too many false positives with -log2(fpp) = " << log_fpp
	<< " expected_fpp = " << expected_fpp;
	}
	}
	}

	} // namespace impala