| /** |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #ifndef ORC_BLOOMFILTER_IMPL_HH |
| #define ORC_BLOOMFILTER_IMPL_HH |
| |
| #include "orc/BloomFilter.hh" |
| #include "wrap/orc-proto-wrapper.hh" |
| |
| #include <cmath> |
| #include <sstream> |
| #include <vector> |
| |
| namespace orc { |
| |
| /** |
| * Bare metal bit set implementation. For performance reasons, this implementation does not check |
| * for index bounds nor expand the bit set size if the specified index is greater than the size. |
| */ |
| class BitSet { |
| public: |
| /** |
| * Creates an empty BitSet |
| * |
| * @param numBits - number of bits used |
| */ |
| BitSet(uint64_t numBits); |
| |
| /** |
| * Creates BitSet from serialized uint64_t buffer |
| * |
| * @param bits - serialized uint64_t buffer of bitset |
| * @param numBits - number of bits used |
| */ |
| BitSet(const uint64_t * bits, uint64_t numBits); |
| |
| /** |
| * Sets the bit at specified index. |
| * |
| * @param index - position |
| */ |
| void set(uint64_t index); |
| |
| /** |
| * Returns true if the bit is set in the specified index. |
| * |
| * @param index - position |
| * @return - value at the bit position |
| */ |
| bool get(uint64_t index); |
| |
| /** |
| * Number of bits |
| */ |
| uint64_t bitSize(); |
| |
| /** |
| * Combines the two BitSets using bitwise OR. |
| */ |
| void merge(const BitSet& other); |
| |
| /** |
| * Clears the bit set. |
| */ |
| void clear(); |
| |
| /** |
| * Gets underlying raw data |
| */ |
| const uint64_t * getData() const; |
| |
| /** |
| * Compares two BitSets |
| */ |
| bool operator==(const BitSet& other) const; |
| |
| private: |
| std::vector<uint64_t> mData; |
| }; |
| |
| /** |
| * BloomFilter is a probabilistic data structure for set membership check. |
| * BloomFilters are highly space efficient when compared to using a HashSet. |
| * Because of the probabilistic nature of bloom filter false positive (element |
| * not present in bloom filter but test() says true) are possible but false |
| * negatives are not possible (if element is present then test() will never |
| * say false). The false positive probability is configurable (default: 5%) |
| * depending on which storage requirement may increase or decrease. Lower the |
| * false positive probability greater is the space requirement. |
| * |
| * Bloom filters are sensitive to number of elements that will be inserted in |
| * the bloom filter. During the creation of bloom filter expected number of |
| * entries must be specified. If the number of insertions exceed the specified |
| * initial number of entries then false positive probability will increase |
| * accordingly. |
| * |
| * Internally, this implementation of bloom filter uses Murmur3 fast |
| * non-cryptographic hash algorithm. Although Murmur2 is slightly faster than |
| * Murmur3 in Java, it suffers from hash collisions for specific sequence of |
| * repeating bytes. Check the following link for more info |
| * https://code.google.com/p/smhasher/wiki/MurmurHash2Flaw |
| * |
| * Note that this class is here for backwards compatibility, because it uses |
| * the JVM default character set for strings. All new users should |
| * BloomFilterUtf8, which always uses UTF8 for the encoding. |
| */ |
| class BloomFilterImpl : public BloomFilter { |
| public: |
| /** |
| * Creates an empty BloomFilter |
| * |
| * @param expectedEntries - number of entries it will hold |
| * @param fpp - false positive probability |
| */ |
| BloomFilterImpl(uint64_t expectedEntries, double fpp=DEFAULT_FPP); |
| |
| /** |
| * Creates a BloomFilter by deserializing the proto-buf version |
| * |
| * caller should make sure input proto::BloomFilter is valid |
| */ |
| BloomFilterImpl(const proto::BloomFilter& bloomFilter); |
| |
| /** |
| * Adds a new element to the BloomFilter |
| */ |
| void addBytes(const char * data, int64_t length); |
| void addLong(int64_t data); |
| void addDouble(double data); |
| |
| /** |
| * Test if the element exists in BloomFilter |
| */ |
| bool testBytes(const char * data, int64_t length) const override; |
| bool testLong(int64_t data) const override; |
| bool testDouble(double data) const override; |
| |
| uint64_t sizeInBytes() const; |
| uint64_t getBitSize() const; |
| int32_t getNumHashFunctions() const; |
| |
| void merge(const BloomFilterImpl& other); |
| |
| void reset(); |
| |
| bool operator==(const BloomFilterImpl& other) const; |
| |
| private: |
| friend struct BloomFilterUTF8Utils; |
| |
| // compute k hash values from hash64 and set bits |
| void addHash(uint64_t hash64); |
| |
| // compute k hash values from hash64 and check bits |
| bool testHash(uint64_t hash64) const; |
| |
| void serialize(proto::BloomFilter& bloomFilter) const; |
| |
| private: |
| static constexpr double DEFAULT_FPP = 0.05; |
| uint64_t mNumBits; |
| int32_t mNumHashFunctions; |
| std::unique_ptr<BitSet> mBitSet; |
| }; |
| |
| struct BloomFilterUTF8Utils { |
| // serialize BloomFilter in protobuf |
| static void serialize(const BloomFilterImpl& in, proto::BloomFilter& out) { |
| in.serialize(out); |
| } |
| |
| // deserialize BloomFilter from protobuf |
| static std::unique_ptr<BloomFilter> |
| deserialize(const proto::Stream_Kind& streamKind, |
| const proto::ColumnEncoding& columnEncoding, |
| const proto::BloomFilter& bloomFilter); |
| }; |
| |
| } |
| |
| #endif //ORC_BLOOMFILTER_IMPL_HH |