blob: ad4e3eb9520e181465201dccfd17e88a37f8ee70 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#ifndef KUDU_UTIL_BLOOM_FILTER_H
#define KUDU_UTIL_BLOOM_FILTER_H
#include <cstddef>
#include <cstdint>
#include "kudu/gutil/gscoped_ptr.h"
#include "kudu/gutil/hash/city.h"
#include "kudu/gutil/macros.h"
#include "kudu/gutil/port.h"
#include "kudu/util/bitmap.h"
#include "kudu/util/slice.h"
namespace kudu {
// Probe calculated from a given key. This caches the calculated
// hash values which are necessary for probing into a Bloom Filter,
// so that when many bloom filters have to be consulted for a given
// key, we only need to calculate the hashes once.
//
// This is implemented based on the idea of double-hashing from the following paper:
// "Less Hashing, Same Performance: Building a Better Bloom Filter"
// Kirsch and Mitzenmacher, ESA 2006
// https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf
//
// Currently, the implementation uses the 64-bit City Hash.
// TODO: an SSE CRC32 hash is probably ~20% faster. Come back to this
// at some point.
class BloomKeyProbe {
public:
// Default constructor - this is only used to instantiate an object
// and later reassign by assignment from another instance
BloomKeyProbe() {}
// Construct a probe from the given key.
//
// NOTE: proper operation requires that the referenced memory remain
// valid for the lifetime of this object.
explicit BloomKeyProbe(const Slice &key) : key_(key) {
uint64_t h = util_hash::CityHash64(
reinterpret_cast<const char *>(key.data()),
key.size());
// Use the top and bottom halves of the 64-bit hash
// as the two independent hash functions for mixing.
h_1_ = static_cast<uint32_t>(h);
h_2_ = static_cast<uint32_t>(h >> 32);
}
const Slice &key() const { return key_; }
// The initial hash value. See MixHash() for usage example.
uint32_t initial_hash() const {
return h_1_;
}
// Mix the given hash function with the second calculated hash
// value. A sequence of independent hashes can be calculated
// by repeatedly calling MixHash() on its previous result.
ATTRIBUTE_NO_SANITIZE_INTEGER
uint32_t MixHash(uint32_t h) const {
return h + h_2_;
}
private:
Slice key_;
// The two hashes.
uint32_t h_1_;
uint32_t h_2_;
};
// Sizing parameters for the constructor to BloomFilterBuilder.
// This is simply to provide a nicer API than a bunch of overloaded
// constructors.
class BloomFilterSizing {
public:
// Size the bloom filter by a fixed size and false positive rate.
//
// Picks the number of entries to achieve the above.
static BloomFilterSizing BySizeAndFPRate(size_t n_bytes, double fp_rate);
// Size the bloom filer by an expected count and false positive rate.
//
// Picks the number of bytes to achieve the above.
static BloomFilterSizing ByCountAndFPRate(size_t expected_count, double fp_rate);
size_t n_bytes() const { return n_bytes_; }
size_t expected_count() const { return expected_count_; }
private:
BloomFilterSizing(size_t n_bytes, size_t expected_count) :
n_bytes_(n_bytes),
expected_count_(expected_count)
{}
size_t n_bytes_;
size_t expected_count_;
};
// Builder for a BloomFilter structure.
class BloomFilterBuilder {
public:
// Create a bloom filter.
// See BloomFilterSizing static methods to specify this argument.
explicit BloomFilterBuilder(const BloomFilterSizing &sizing);
// Clear all entries, reset insertion count.
void Clear();
// Add the given key to the bloom filter.
void AddKey(const BloomKeyProbe &probe);
// Return an estimate of the false positive rate.
double false_positive_rate() const;
int n_bytes() const {
return n_bits_ / 8;
}
int n_bits() const {
return n_bits_;
}
// Return a slice view into this Bloom Filter, suitable for
// writing out to a file.
const Slice slice() const {
return Slice(&bitmap_[0], n_bytes());
}
// Return the number of hashes that are calculated for each entry
// in the bloom filter.
size_t n_hashes() const { return n_hashes_; }
size_t expected_count() const { return expected_count_; }
// Return the number of keys inserted.
size_t count() const { return n_inserted_; }
private:
DISALLOW_COPY_AND_ASSIGN(BloomFilterBuilder);
size_t n_bits_;
gscoped_array<uint8_t> bitmap_;
// The number of hash functions to compute.
size_t n_hashes_;
// The expected number of elements, for which the bloom is optimized.
size_t expected_count_;
// The number of elements inserted so far since the last Reset.
size_t n_inserted_;
};
// Wrapper around a byte array for reading it as a bloom filter.
class BloomFilter {
public:
BloomFilter() : bitmap_(nullptr) {}
BloomFilter(const Slice &data, size_t n_hashes);
// Return true if the filter may contain the given key.
bool MayContainKey(const BloomKeyProbe &probe) const;
private:
friend class BloomFilterBuilder;
static uint32_t PickBit(uint32_t hash, size_t n_bits);
size_t n_bits_;
const uint8_t *bitmap_;
size_t n_hashes_;
};
////////////////////////////////////////////////////////////
// Inline implementations
////////////////////////////////////////////////////////////
inline uint32_t BloomFilter::PickBit(uint32_t hash, size_t n_bits) {
switch (n_bits) {
// Fast path for the default bloom filter block size. Bitwise math
// is much faster than division.
case 4096 * 8:
return hash & (n_bits - 1);
default:
return hash % n_bits;
}
}
inline void BloomFilterBuilder::AddKey(const BloomKeyProbe &probe) {
uint32_t h = probe.initial_hash();
for (size_t i = 0; i < n_hashes_; i++) {
uint32_t bitpos = BloomFilter::PickBit(h, n_bits_);
BitmapSet(&bitmap_[0], bitpos);
h = probe.MixHash(h);
}
n_inserted_++;
}
inline bool BloomFilter::MayContainKey(const BloomKeyProbe &probe) const {
uint32_t h = probe.initial_hash();
// Basic unrolling by 2s gives a small benefit here since the two bit positions
// can be calculated in parallel -- it's a 50% chance that the first will be
// set even if it's a bloom miss, in which case we can parallelize the load.
int rem_hashes = n_hashes_;
while (rem_hashes >= 2) {
uint32_t bitpos1 = PickBit(h, n_bits_);
h = probe.MixHash(h);
uint32_t bitpos2 = PickBit(h, n_bits_);
h = probe.MixHash(h);
if (!BitmapTest(&bitmap_[0], bitpos1) ||
!BitmapTest(&bitmap_[0], bitpos2)) {
return false;
}
rem_hashes -= 2;
}
while (rem_hashes) {
uint32_t bitpos = PickBit(h, n_bits_);
if (!BitmapTest(&bitmap_[0], bitpos)) {
return false;
}
h = probe.MixHash(h);
rem_hashes--;
}
return true;
}
} // namespace kudu
#endif