blob: 41b63e649875c48702cf3bf275db033c3036394b [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#include <catch2/catch.hpp>
#include "bloom_filter.hpp"
#ifdef TEST_BINARY_INPUT_PATH
static std::string testBinaryInputPath = TEST_BINARY_INPUT_PATH;
#else
static std::string testBinaryInputPath = "test/";
#endif
namespace datasketches {
TEST_CASE("bloom_filter: invalid constructor args", "[bloom_filter]") {
REQUIRE_THROWS_AS(bloom_filter::builder::create_by_size(0, 4), std::invalid_argument);
REQUIRE_THROWS_AS(bloom_filter::builder::create_by_size(1L << 60, 4), std::invalid_argument);
REQUIRE_THROWS_AS(bloom_filter::builder::create_by_size(65535, 0), std::invalid_argument);
}
TEST_CASE("bloom_filter: standard constructors", "[bloom_filter]") {
uint64_t num_items = 4000;
double fpp = 0.01;
uint64_t num_bits = bloom_filter::builder::suggest_num_filter_bits(num_items, fpp);
uint16_t num_hashes = bloom_filter::builder::suggest_num_hashes(num_items, num_bits);
uint64_t seed = 89023;
auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes, seed);
uint64_t adjusted_num_bits = (num_bits + 63) & ~0x3F; // round up to the nearest multiple of 64
REQUIRE(bf.get_capacity() == adjusted_num_bits);
REQUIRE(bf.get_num_hashes() == num_hashes);
REQUIRE(bf.get_seed() == seed);
REQUIRE(bf.is_empty());
// should match above
bf = bloom_filter::builder::create_by_accuracy(num_items, fpp, seed);
REQUIRE(bf.get_capacity() == adjusted_num_bits);
REQUIRE(bf.get_num_hashes() == num_hashes);
REQUIRE(bf.get_seed() == seed);
REQUIRE(bf.is_empty());
// same for initializing memory in-place
size_t serialized_size_bytes = bloom_filter::get_serialized_size_bytes(num_bits);
uint8_t* bytes = new uint8_t[serialized_size_bytes];
bf = bloom_filter::builder::initialize_by_size(bytes, serialized_size_bytes, num_bits, num_hashes, seed);
REQUIRE(bf.get_capacity() == adjusted_num_bits);
REQUIRE(bf.get_num_hashes() == num_hashes);
REQUIRE(bf.get_seed() == seed);
REQUIRE(bf.is_empty());
bf = bloom_filter::builder::initialize_by_accuracy(bytes, serialized_size_bytes, num_items, fpp, seed);
REQUIRE(bf.get_capacity() == adjusted_num_bits);
REQUIRE(bf.get_num_hashes() == num_hashes);
REQUIRE(bf.get_seed() == seed);
REQUIRE(bf.is_empty());
delete [] bytes;
}
TEST_CASE("bloom_filter: basic operations", "[bloom_filter]") {
uint64_t num_items = 5000;
double fpp = 0.01;
uint64_t seed = 4897301548054ULL;
auto bf = bloom_filter::builder::create_by_accuracy(num_items, fpp, seed);
REQUIRE(bf.is_empty());
REQUIRE(bf.get_bits_used() == 0);
for (uint64_t i = 0; i < num_items; ++i) {
bf.query_and_update(i);
}
REQUIRE(!bf.is_empty());
// filter is about 50% full at target capacity
// since seed is fixed we expect an exact value every time
// but leaving the approximate test in since that's more the "expectation"
REQUIRE(bf.get_bits_used() == 24793); // exact value is not important but should be consistent
REQUIRE(bf.get_bits_used() == Approx(0.5 * bf.get_capacity()).epsilon(0.05)); // just over 3.3% in practice
uint32_t num_found = 0;
for (uint64_t i = num_items; i < bf.get_capacity(); ++i) {
if (bf.query(i)) {
++num_found;
}
}
// fpp is average with significant variance -- even at 12% it would fail occasionally
REQUIRE(num_found == 423);
//REQUIRE(num_found == Approx((bf.get_capacity() - num_items) * fpp).epsilon(0.12));
auto bytes = bf.serialize();
// initialize in memory and run the same tests
// also checking against the results from the first part
uint8_t* bf_memory = new uint8_t[bytes.size()];
auto bf2 = bloom_filter::builder::initialize_by_accuracy(bf_memory, bytes.size(), num_items, fpp, bf.get_seed());
REQUIRE(bf2.is_empty());
REQUIRE(bf2.get_bits_used() == 0);
for (uint64_t i = 0; i < num_items; ++i) {
bf2.query_and_update(i);
}
REQUIRE(!bf2.is_empty());
REQUIRE(bf2.get_bits_used() == bf.get_bits_used()); // should exactly match above
uint32_t num_found2 = 0;
for (uint64_t i = num_items; i < bf2.get_capacity(); ++i) {
if (bf2.query(i)) {
++num_found2;
}
}
REQUIRE(num_found == num_found2); // should exactly match above
auto bytes2 = bf2.serialize();
REQUIRE(bytes.size() == bytes2.size());
for (size_t i = 0; i < bytes.size(); ++i) {
REQUIRE(bytes[i] == bytes2[i]);
}
// check that raw memory also matches serialized sketch
const uint8_t* bf_bytes = bf2.get_wrapped_memory();
REQUIRE(bf_bytes == bf_memory);
for (size_t i = 0; i < bytes.size(); ++i) {
REQUIRE(bf_bytes[i] == bytes[i]);
}
// ensure the filters reset properly
bf.reset();
REQUIRE(bf.is_empty());
REQUIRE(bf.get_bits_used() == 0);
bf2.reset();
REQUIRE(bf2.is_empty());
REQUIRE(bf2.get_bits_used() == 0);
delete [] bf_memory;
}
TEST_CASE("bloom_filter: inversion", "[bloom_filter]") {
uint64_t num_bits = 8192;
uint16_t num_hashes = 3;
auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes);
uint64_t n = 500;
for (uint64_t i = 0; i < n; ++i) {
bf.update(i);
}
uint64_t num_bits_set = bf.get_bits_used();
bf.invert();
REQUIRE(bf.get_bits_used() == num_bits - num_bits_set);
// original items should be mostly not-present
uint32_t num_found = 0;
for (uint64_t i = 0; i < n; ++i) {
if (bf.query(i)) {
++num_found;
}
}
REQUIRE(num_found < n / 10);
// many other items should be "present"
num_found = 0;
for (uint64_t i = n; i < num_bits; ++i) {
if (bf.query(i)) {
++num_found;
}
}
REQUIRE(num_found > n);
}
TEST_CASE("bloom_filter: incompatible set operations", "[bloom_filter]") {
uint64_t num_bits = 32768;
uint16_t num_hashes = 4;
auto bf1 = bloom_filter::builder::create_by_size(num_bits, num_hashes);
// mismatched num bits
auto bf2 = bloom_filter::builder::create_by_size(2 * num_bits, num_hashes);
REQUIRE_THROWS_AS(bf1.union_with(bf2), std::invalid_argument);
// mismatched num hashes
auto bf3 = bloom_filter::builder::create_by_size(num_bits, 2 * num_hashes);
REQUIRE_THROWS_AS(bf1.intersect(bf2), std::invalid_argument);
// mismatched seed
auto bf4 = bloom_filter::builder::create_by_size(num_bits, num_hashes, bf1.get_seed() + 1);
REQUIRE_THROWS_AS(bf1.union_with(bf4), std::invalid_argument);
}
TEST_CASE("bloom_filter: basic union", "[bloom_filter]") {
const uint64_t num_bits = 12288;
const uint16_t num_hashes = 4;
auto bf1 = bloom_filter::builder::create_by_size(num_bits, num_hashes);
auto bf2 = bloom_filter::builder::create_by_size(num_bits, num_hashes, bf1.get_seed());
const uint64_t n = 1000;
const uint32_t max_item = 3 * n / 2 - 1;
for (uint64_t i = 0; i < n; ++i) {
bf1.query_and_update(i);
bf2.update(n / 2 + i);
}
bf1.union_with(bf2);
for (uint64_t i = 0; i < max_item; ++i) {
REQUIRE(bf1.query(i));
}
uint32_t num_found = 0;
for (uint64_t i = max_item; i < num_bits; ++i) {
if (bf1.query(i)) {
++num_found;
}
}
REQUIRE(num_found < num_bits / 10); // not being super strict
}
TEST_CASE("bloom_filter: basic intersection", "[bloom_filter]") {
const uint64_t num_bits = 8192;
const uint16_t num_hahes = 5;
auto bf1 = bloom_filter::builder::create_by_size(num_bits, num_hahes);
auto bf2 = bloom_filter::builder::create_by_size(num_bits, num_hahes, bf1.get_seed());
const uint64_t n = 1024;
const uint32_t max_item = 3 * n / 2 - 1;
for (uint64_t i = 0; i < n; ++i) {
bf1.update(i);
bf2.update(n / 2 + i);
}
bf1.intersect(bf2);
// overlap bit should all be set
for (uint64_t i = n / 2; i < n; ++i) {
REQUIRE(bf1.query(i));
}
uint32_t num_found = 0;
for (uint64_t i = 0; i < n / 2; ++i) {
if (bf1.query(i)) {
++num_found;
}
}
for (uint64_t i = max_item; i < num_bits; ++i) {
if (bf1.query(i)) {
++num_found;
}
}
REQUIRE(num_found < num_bits / 10); // not being super strict
}
TEST_CASE("bloom_filter: empty serialization", "[bloom_filter]") {
const uint64_t num_bits = 32769;
const uint16_t num_hashes = 7;
auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes);
auto bytes = bf.serialize();
REQUIRE(bytes.size() == bf.get_serialized_size_bytes());
auto bf_bytes = bloom_filter::deserialize(bytes.data(), bytes.size());
REQUIRE(bf.get_capacity() == bf_bytes.get_capacity());
REQUIRE(bf.get_seed() == bf_bytes.get_seed());
REQUIRE(bf.get_num_hashes() == bf_bytes.get_num_hashes());
REQUIRE(bf_bytes.is_empty());
std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
bf.serialize(ss);
auto bf_stream = bloom_filter::deserialize(ss);
REQUIRE(bf.get_capacity() == bf_stream.get_capacity());
REQUIRE(bf.get_seed() == bf_stream.get_seed());
REQUIRE(bf.get_num_hashes() == bf_stream.get_num_hashes());
REQUIRE(bf_stream.is_empty());
// read-only wrap should work
auto bf_wrap = bloom_filter::wrap(bytes.data(), bytes.size());
REQUIRE(bf.get_capacity() == bf_wrap.get_capacity());
REQUIRE(bf.get_seed() == bf_wrap.get_seed());
REQUIRE(bf.get_num_hashes() == bf_wrap.get_num_hashes());
REQUIRE(bf_wrap.is_empty());
// writable wrap should not
REQUIRE_THROWS_AS(bloom_filter::writable_wrap(bytes.data(), bytes.size()), std::invalid_argument);
}
TEST_CASE("bloom_filter: non-empty serialization", "[bloom_filter]") {
const uint64_t num_bits = 32768;
const uint16_t num_hashes = 5;
auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes);
const uint64_t n = 1000;
for (uint64_t i = 0; i < n; ++i) {
bf.update(0.5 + i); // testing floats
}
// test more items without updating, assuming some false positives
// so we can check that we get the same number of false positives
// with the same query items
uint64_t fp_count = 0;
for (uint64_t i = n; i < num_bits; ++i) {
fp_count += bf.query(0.5 + i) ? 1 : 0;
}
auto bytes = bf.serialize();
REQUIRE(bytes.size() == bf.get_serialized_size_bytes());
auto bf_bytes = bloom_filter::deserialize(bytes.data(), bytes.size());
REQUIRE(bf.get_capacity() == bf_bytes.get_capacity());
REQUIRE(bf.get_seed() == bf_bytes.get_seed());
REQUIRE(bf.get_num_hashes() == bf_bytes.get_num_hashes());
REQUIRE(!bf_bytes.is_empty());
REQUIRE(bf.is_memory_owned());
uint64_t fp_count_bytes = 0;
for (uint64_t i = 0; i < num_bits; ++i) {
bool val = bf_bytes.query(0.5 + i);
if (i < n)
REQUIRE(val);
else if (val)
++fp_count_bytes;
}
REQUIRE(fp_count_bytes == fp_count);
std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
bf.serialize(ss);
auto bf_stream = bloom_filter::deserialize(ss);
REQUIRE(bf.get_capacity() == bf_stream.get_capacity());
REQUIRE(bf.get_seed() == bf_stream.get_seed());
REQUIRE(bf.get_num_hashes() == bf_stream.get_num_hashes());
REQUIRE(!bf_stream.is_empty());
REQUIRE(bf_stream.is_memory_owned());
uint64_t fp_count_stream = 0;
for (uint64_t i = 0; i < num_bits; ++i) {
bool val = bf_stream.query(0.5 + i);
if (i < n)
REQUIRE(val);
else if (val)
++fp_count_stream;
}
REQUIRE(fp_count_stream == fp_count);
// read-only wrap
auto bf_wrap = bloom_filter::wrap(bytes.data(), bytes.size());
REQUIRE(bf.get_capacity() == bf_wrap.get_capacity());
REQUIRE(bf.get_seed() == bf_wrap.get_seed());
REQUIRE(bf.get_num_hashes() == bf_wrap.get_num_hashes());
REQUIRE(!bf_wrap.is_empty());
REQUIRE(!bf_wrap.is_memory_owned());
uint64_t fp_count_wrap = 0;
for (uint64_t i = 0; i < num_bits; ++i) {
bool val = bf_wrap.query(0.5 + i);
if (i < n)
REQUIRE(val);
else if (val)
++fp_count_wrap;
}
REQUIRE(fp_count_wrap == fp_count);
REQUIRE_THROWS_AS(bf_wrap.update(-1.0), std::logic_error);
REQUIRE_THROWS_AS(bf_wrap.query_and_update(-2.0), std::logic_error);
REQUIRE_THROWS_AS(bf_wrap.reset(), std::logic_error);
// writable wrap
auto bf_writable = bloom_filter::writable_wrap(bytes.data(), bytes.size());
REQUIRE(bf.get_capacity() == bf_writable.get_capacity());
REQUIRE(bf.get_seed() == bf_writable.get_seed());
REQUIRE(bf.get_num_hashes() == bf_writable.get_num_hashes());
REQUIRE(!bf_writable.is_empty());
REQUIRE(!bf_writable.is_memory_owned());
uint64_t fp_count_writable = 0;
for (uint64_t i = 0; i < num_bits; ++i) {
bool val = bf_writable.query(0.5 + i);
if (i < n)
REQUIRE(val);
else if (val)
++fp_count_writable;
}
REQUIRE(fp_count_writable == fp_count);
REQUIRE(!bf_writable.query(-1.0));
bf_writable.update(-1.0);
REQUIRE(bf_writable.query(-1.0));
// not good memory management to do this, but because we wrapped the same bytes as both
// read-only adn writable, that update should ahve changed the read-only version, too
REQUIRE(bf_wrap.query(-1.0));
}
} // namespace datasketches