cpp/src/parquet/bloom_filter.h - arrow - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 #ifndef PARQUET_BLOOM_FILTER_H
 #define PARQUET_BLOOM_FILTER_H

 #include <cmath>
 #include <cstdint>
 #include <memory>

 #include "arrow/util/logging.h"
 #include "parquet/hasher.h"
 #include "parquet/platform.h"
 #include "parquet/types.h"

 namespace arrow {

 class MemoryPool;

 }  // namespace arrow

 namespace parquet {

 // A Bloom filter is a compact structure to indicate whether an item is not in a set or
 // probably in a set. The Bloom filter usually consists of a bit set that represents a
 // set of elements, a hash strategy and a Bloom filter algorithm.
 class PARQUET_EXPORT BloomFilter {
  public:
   // Maximum Bloom filter size, it sets to HDFS default block size 128MB
   // This value will be reconsidered when implementing Bloom filter producer.
   static constexpr uint32_t kMaximumBloomFilterBytes = 128 * 1024 * 1024;

   /// Determine whether an element exist in set or not.
   ///
   /// @param hash the element to contain.
   /// @return false if value is definitely not in set, and true means PROBABLY
   /// in set.
   virtual bool FindHash(uint64_t hash) const = 0;

   /// Insert element to set represented by Bloom filter bitset.
   /// @param hash the hash of value to insert into Bloom filter.
   virtual void InsertHash(uint64_t hash) = 0;

   /// Write this Bloom filter to an output stream. A Bloom filter structure should
   /// include bitset length, hash strategy, algorithm, and bitset.
   ///
   /// @param sink the output stream to write
   virtual void WriteTo(ArrowOutputStream* sink) const = 0;

   /// Get the number of bytes of bitset
   virtual uint32_t GetBitsetSize() const = 0;

   /// Compute hash for 32 bits value by using its plain encoding result.
   ///
   /// @param value the value to hash.
   /// @return hash result.
   virtual uint64_t Hash(int32_t value) const = 0;

   /// Compute hash for 64 bits value by using its plain encoding result.
   ///
   /// @param value the value to hash.
   /// @return hash result.
   virtual uint64_t Hash(int64_t value) const = 0;

   /// Compute hash for float value by using its plain encoding result.
   ///
   /// @param value the value to hash.
   /// @return hash result.
   virtual uint64_t Hash(float value) const = 0;

   /// Compute hash for double value by using its plain encoding result.
   ///
   /// @param value the value to hash.
   /// @return hash result.
   virtual uint64_t Hash(double value) const = 0;

   /// Compute hash for Int96 value by using its plain encoding result.
   ///
   /// @param value the value to hash.
   /// @return hash result.
   virtual uint64_t Hash(const Int96* value) const = 0;

   /// Compute hash for ByteArray value by using its plain encoding result.
   ///
   /// @param value the value to hash.
   /// @return hash result.
   virtual uint64_t Hash(const ByteArray* value) const = 0;

   /// Compute hash for fixed byte array value by using its plain encoding result.
   ///
   /// @param value the value address.
   /// @param len the value length.
   /// @return hash result.
   virtual uint64_t Hash(const FLBA* value, uint32_t len) const = 0;

   virtual ~BloomFilter() {}

  protected:
   // Hash strategy available for Bloom filter.
   enum class HashStrategy : uint32_t { MURMUR3_X64_128 = 0 };

   // Bloom filter algorithm.
   enum class Algorithm : uint32_t { BLOCK = 0 };
 };

 // The BlockSplitBloomFilter is implemented using block-based Bloom filters from
 // Putze et al.'s "Cache-,Hash- and Space-Efficient Bloom filters". The basic idea is to
 // hash the item to a tiny Bloom filter which size fit a single cache line or smaller.
 //
 // This implementation sets 8 bits in each tiny Bloom filter. Each tiny Bloom
 // filter is 32 bytes to take advantage of 32-byte SIMD instructions.
 class PARQUET_EXPORT BlockSplitBloomFilter : public BloomFilter {
  public:
   /// The constructor of BlockSplitBloomFilter. It uses murmur3_x64_128 as hash function.
   BlockSplitBloomFilter();

   /// Initialize the BlockSplitBloomFilter. The range of num_bytes should be within
   /// [kMinimumBloomFilterBytes, kMaximumBloomFilterBytes], it will be
   /// rounded up/down to lower/upper bound if num_bytes is out of range and also
   /// will be rounded up to a power of 2.
   ///
   /// @param num_bytes The number of bytes to store Bloom filter bitset.
   void Init(uint32_t num_bytes);

   /// Initialize the BlockSplitBloomFilter. It copies the bitset as underlying
   /// bitset because the given bitset may not satisfy the 32-byte alignment requirement
   /// which may lead to segfault when performing SIMD instructions. It is the caller's
   /// responsibility to free the bitset passed in. This is used when reconstructing
   /// a Bloom filter from a parquet file.
   ///
   /// @param bitset The given bitset to initialize the Bloom filter.
   /// @param num_bytes  The number of bytes of given bitset.
   void Init(const uint8_t* bitset, uint32_t num_bytes);

   // Minimum Bloom filter size, it sets to 32 bytes to fit a tiny Bloom filter.
   static constexpr uint32_t kMinimumBloomFilterBytes = 32;

   /// Calculate optimal size according to the number of distinct values and false
   /// positive probability.
   ///
   /// @param ndv The number of distinct values.
   /// @param fpp The false positive probability.
   /// @return it always return a value between kMinimumBloomFilterBytes and
   /// kMaximumBloomFilterBytes, and the return value is always a power of 2
   static uint32_t OptimalNumOfBits(uint32_t ndv, double fpp) {
     DCHECK(fpp > 0.0 && fpp < 1.0);
     const double m = -8.0 * ndv / log(1 - pow(fpp, 1.0 / 8));
     uint32_t num_bits;

     // Handle overflow.
     if (m < 0 || m > kMaximumBloomFilterBytes << 3) {
       num_bits = static_cast<uint32_t>(kMaximumBloomFilterBytes << 3);
     } else {
       num_bits = static_cast<uint32_t>(m);
     }

     // Round up to lower bound
     if (num_bits < kMinimumBloomFilterBytes << 3) {
       num_bits = kMinimumBloomFilterBytes << 3;
     }

     // Get next power of 2 if bits is not power of 2.
     if ((num_bits & (num_bits - 1)) != 0) {
       num_bits = static_cast<uint32_t>(::arrow::BitUtil::NextPower2(num_bits));
     }

     // Round down to upper bound
     if (num_bits > kMaximumBloomFilterBytes << 3) {
       num_bits = kMaximumBloomFilterBytes << 3;
     }

     return num_bits;
   }

   bool FindHash(uint64_t hash) const override;
   void InsertHash(uint64_t hash) override;
   void WriteTo(ArrowOutputStream* sink) const override;
   uint32_t GetBitsetSize() const override { return num_bytes_; }

   uint64_t Hash(int64_t value) const override { return hasher_->Hash(value); }
   uint64_t Hash(float value) const override { return hasher_->Hash(value); }
   uint64_t Hash(double value) const override { return hasher_->Hash(value); }
   uint64_t Hash(const Int96* value) const override { return hasher_->Hash(value); }
   uint64_t Hash(const ByteArray* value) const override { return hasher_->Hash(value); }
   uint64_t Hash(int32_t value) const override { return hasher_->Hash(value); }
   uint64_t Hash(const FLBA* value, uint32_t len) const override {
     return hasher_->Hash(value, len);
   }

   /// Deserialize the Bloom filter from an input stream. It is used when reconstructing
   /// a Bloom filter from a parquet filter.
   ///
   /// @param input_stream The input stream from which to construct the Bloom filter
   /// @return The BlockSplitBloomFilter.
   static BlockSplitBloomFilter Deserialize(ArrowInputStream* input_stream);

  private:
   // Bytes in a tiny Bloom filter block.
   static constexpr int kBytesPerFilterBlock = 32;

   // The number of bits to be set in each tiny Bloom filter
   static constexpr int kBitsSetPerBlock = 8;

   // A mask structure used to set bits in each tiny Bloom filter.
   struct BlockMask {
     uint32_t item[kBitsSetPerBlock];
   };

   // The block-based algorithm needs eight odd SALT values to calculate eight indexes
   // of bit to set, one bit in each 32-bit word.
   static constexpr uint32_t SALT[kBitsSetPerBlock] = {
       0x47b6137bU, 0x44974d91U, 0x8824ad5bU, 0xa2b7289dU,
       0x705495c7U, 0x2df1424bU, 0x9efc4947U, 0x5c6bfb31U};

   /// Set bits in mask array according to input key.
   /// @param key the value to calculate mask values.
   /// @param mask the mask array is used to set inside a block
   void SetMask(uint32_t key, BlockMask& mask) const;

   // Memory pool to allocate aligned buffer for bitset
   ::arrow::MemoryPool* pool_;

   // The underlying buffer of bitset.
   std::shared_ptr<Buffer> data_;

   // The number of bytes of Bloom filter bitset.
   uint32_t num_bytes_;

   // Hash strategy used in this Bloom filter.
   HashStrategy hash_strategy_;

   // Algorithm used in this Bloom filter.
   Algorithm algorithm_;

   // The hash pointer points to actual hash class used.
   std::unique_ptr<Hasher> hasher_;
 };

 }  // namespace parquet

 #endif  // PARQUET_BLOOM_FILTER_H
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.

	#ifndef PARQUET_BLOOM_FILTER_H
	#define PARQUET_BLOOM_FILTER_H

	#include <cmath>
	#include <cstdint>
	#include <memory>

	#include "arrow/util/logging.h"
	#include "parquet/hasher.h"
	#include "parquet/platform.h"
	#include "parquet/types.h"

	namespace arrow {

	class MemoryPool;

	} // namespace arrow

	namespace parquet {

	// A Bloom filter is a compact structure to indicate whether an item is not in a set or
	// probably in a set. The Bloom filter usually consists of a bit set that represents a
	// set of elements, a hash strategy and a Bloom filter algorithm.
	class PARQUET_EXPORT BloomFilter {
	public:
	// Maximum Bloom filter size, it sets to HDFS default block size 128MB
	// This value will be reconsidered when implementing Bloom filter producer.
	static constexpr uint32_t kMaximumBloomFilterBytes = 128 * 1024 * 1024;

	/// Determine whether an element exist in set or not.
	///
	/// @param hash the element to contain.
	/// @return false if value is definitely not in set, and true means PROBABLY
	/// in set.
	virtual bool FindHash(uint64_t hash) const = 0;

	/// Insert element to set represented by Bloom filter bitset.
	/// @param hash the hash of value to insert into Bloom filter.
	virtual void InsertHash(uint64_t hash) = 0;

	/// Write this Bloom filter to an output stream. A Bloom filter structure should
	/// include bitset length, hash strategy, algorithm, and bitset.
	///
	/// @param sink the output stream to write
	virtual void WriteTo(ArrowOutputStream* sink) const = 0;

	/// Get the number of bytes of bitset
	virtual uint32_t GetBitsetSize() const = 0;

	/// Compute hash for 32 bits value by using its plain encoding result.
	///
	/// @param value the value to hash.
	/// @return hash result.
	virtual uint64_t Hash(int32_t value) const = 0;

	/// Compute hash for 64 bits value by using its plain encoding result.
	///
	/// @param value the value to hash.
	/// @return hash result.
	virtual uint64_t Hash(int64_t value) const = 0;

	/// Compute hash for float value by using its plain encoding result.
	///
	/// @param value the value to hash.
	/// @return hash result.
	virtual uint64_t Hash(float value) const = 0;

	/// Compute hash for double value by using its plain encoding result.
	///
	/// @param value the value to hash.
	/// @return hash result.
	virtual uint64_t Hash(double value) const = 0;

	/// Compute hash for Int96 value by using its plain encoding result.
	///
	/// @param value the value to hash.
	/// @return hash result.
	virtual uint64_t Hash(const Int96* value) const = 0;

	/// Compute hash for ByteArray value by using its plain encoding result.
	///
	/// @param value the value to hash.
	/// @return hash result.
	virtual uint64_t Hash(const ByteArray* value) const = 0;

	/// Compute hash for fixed byte array value by using its plain encoding result.
	///
	/// @param value the value address.
	/// @param len the value length.
	/// @return hash result.
	virtual uint64_t Hash(const FLBA* value, uint32_t len) const = 0;

	virtual ~BloomFilter() {}

	protected:
	// Hash strategy available for Bloom filter.
	enum class HashStrategy : uint32_t { MURMUR3_X64_128 = 0 };

	// Bloom filter algorithm.
	enum class Algorithm : uint32_t { BLOCK = 0 };
	};

	// The BlockSplitBloomFilter is implemented using block-based Bloom filters from
	// Putze et al.'s "Cache-,Hash- and Space-Efficient Bloom filters". The basic idea is to
	// hash the item to a tiny Bloom filter which size fit a single cache line or smaller.
	//
	// This implementation sets 8 bits in each tiny Bloom filter. Each tiny Bloom
	// filter is 32 bytes to take advantage of 32-byte SIMD instructions.
	class PARQUET_EXPORT BlockSplitBloomFilter : public BloomFilter {
	public:
	/// The constructor of BlockSplitBloomFilter. It uses murmur3_x64_128 as hash function.
	BlockSplitBloomFilter();

	/// Initialize the BlockSplitBloomFilter. The range of num_bytes should be within
	/// [kMinimumBloomFilterBytes, kMaximumBloomFilterBytes], it will be
	/// rounded up/down to lower/upper bound if num_bytes is out of range and also
	/// will be rounded up to a power of 2.
	///
	/// @param num_bytes The number of bytes to store Bloom filter bitset.
	void Init(uint32_t num_bytes);

	/// Initialize the BlockSplitBloomFilter. It copies the bitset as underlying
	/// bitset because the given bitset may not satisfy the 32-byte alignment requirement
	/// which may lead to segfault when performing SIMD instructions. It is the caller's
	/// responsibility to free the bitset passed in. This is used when reconstructing
	/// a Bloom filter from a parquet file.
	///
	/// @param bitset The given bitset to initialize the Bloom filter.
	/// @param num_bytes The number of bytes of given bitset.
	void Init(const uint8_t* bitset, uint32_t num_bytes);

	// Minimum Bloom filter size, it sets to 32 bytes to fit a tiny Bloom filter.
	static constexpr uint32_t kMinimumBloomFilterBytes = 32;

	/// Calculate optimal size according to the number of distinct values and false
	/// positive probability.
	///
	/// @param ndv The number of distinct values.
	/// @param fpp The false positive probability.
	/// @return it always return a value between kMinimumBloomFilterBytes and
	/// kMaximumBloomFilterBytes, and the return value is always a power of 2
	static uint32_t OptimalNumOfBits(uint32_t ndv, double fpp) {
	DCHECK(fpp > 0.0 && fpp < 1.0);
	const double m = -8.0 * ndv / log(1 - pow(fpp, 1.0 / 8));
	uint32_t num_bits;

	// Handle overflow.
	if (m < 0 \|\| m > kMaximumBloomFilterBytes << 3) {
	num_bits = static_cast<uint32_t>(kMaximumBloomFilterBytes << 3);
	} else {
	num_bits = static_cast<uint32_t>(m);
	}

	// Round up to lower bound
	if (num_bits < kMinimumBloomFilterBytes << 3) {
	num_bits = kMinimumBloomFilterBytes << 3;
	}

	// Get next power of 2 if bits is not power of 2.
	if ((num_bits & (num_bits - 1)) != 0) {
	num_bits = static_cast<uint32_t>(::arrow::BitUtil::NextPower2(num_bits));
	}

	// Round down to upper bound
	if (num_bits > kMaximumBloomFilterBytes << 3) {
	num_bits = kMaximumBloomFilterBytes << 3;
	}

	return num_bits;
	}

	bool FindHash(uint64_t hash) const override;
	void InsertHash(uint64_t hash) override;
	void WriteTo(ArrowOutputStream* sink) const override;
	uint32_t GetBitsetSize() const override { return num_bytes_; }

	uint64_t Hash(int64_t value) const override { return hasher_->Hash(value); }
	uint64_t Hash(float value) const override { return hasher_->Hash(value); }
	uint64_t Hash(double value) const override { return hasher_->Hash(value); }
	uint64_t Hash(const Int96* value) const override { return hasher_->Hash(value); }
	uint64_t Hash(const ByteArray* value) const override { return hasher_->Hash(value); }
	uint64_t Hash(int32_t value) const override { return hasher_->Hash(value); }
	uint64_t Hash(const FLBA* value, uint32_t len) const override {
	return hasher_->Hash(value, len);
	}

	/// Deserialize the Bloom filter from an input stream. It is used when reconstructing
	/// a Bloom filter from a parquet filter.
	///
	/// @param input_stream The input stream from which to construct the Bloom filter
	/// @return The BlockSplitBloomFilter.
	static BlockSplitBloomFilter Deserialize(ArrowInputStream* input_stream);

	private:
	// Bytes in a tiny Bloom filter block.
	static constexpr int kBytesPerFilterBlock = 32;

	// The number of bits to be set in each tiny Bloom filter
	static constexpr int kBitsSetPerBlock = 8;

	// A mask structure used to set bits in each tiny Bloom filter.
	struct BlockMask {
	uint32_t item[kBitsSetPerBlock];
	};

	// The block-based algorithm needs eight odd SALT values to calculate eight indexes
	// of bit to set, one bit in each 32-bit word.
	static constexpr uint32_t SALT[kBitsSetPerBlock] = {
	0x47b6137bU, 0x44974d91U, 0x8824ad5bU, 0xa2b7289dU,
	0x705495c7U, 0x2df1424bU, 0x9efc4947U, 0x5c6bfb31U};

	/// Set bits in mask array according to input key.
	/// @param key the value to calculate mask values.
	/// @param mask the mask array is used to set inside a block
	void SetMask(uint32_t key, BlockMask& mask) const;

	// Memory pool to allocate aligned buffer for bitset
	::arrow::MemoryPool* pool_;

	// The underlying buffer of bitset.
	std::shared_ptr<Buffer> data_;

	// The number of bytes of Bloom filter bitset.
	uint32_t num_bytes_;

	// Hash strategy used in this Bloom filter.
	HashStrategy hash_strategy_;

	// Algorithm used in this Bloom filter.
	Algorithm algorithm_;

	// The hash pointer points to actual hash class used.
	std::unique_ptr<Hasher> hasher_;
	};

	} // namespace parquet

	#endif // PARQUET_BLOOM_FILTER_H