blob: 3da58346a09bd699d2ff3bfb586ae9cd2242f1fc [file] [log] [blame]
// @@@ START COPYRIGHT @@@
//
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//
// @@@ END COPYRIGHT @@@
/*
**************************************************************************
* *
* Counting Bloom Filters *
* *
* *
* *
**************************************************************************
*/
#ifndef INCLUDE_BLOOM_FILTER_H
#define INCLUDE_BLOOM_FILTER_H
#include "Platform.h"
#include "NABoolean.h"
#include "Collections.h"
#include "NAMemory.h"
#include "VarInt.h"
#define MAX_NUM_HASH_FUNCS 5
const UInt32 char_size = 0x08; // 8 bits in 1 char(unsigned)
class BloomFilter : public NABasicObject
{
public:
BloomFilter(NAHeap* heap, UInt32 maxHashFuncs,
UInt32 n, // # of distinct elements
float p // probability of false positives
);
BloomFilter(NAHeap* heap);
~BloomFilter();
double density();
// set the key length info for all keys, useful when dealing
// keys of fixed length (e.g. the info is SWAP_FOUR for SQL INTEGER).
void setKenLengthInfo(Int32 x) { keyLenInfo_ = x; };
virtual ULng32 packIntoBuffer(char*& buffer, NABoolean swapBytes);
virtual ULng32 unpackBuffer(char*& buffer);
protected:
UInt32 computeFinalHash(UInt32 hash, Lng32 hashFunc);
protected:
UInt32 m_; // hash table size in bits
UInt16 k_; // the number of hash functions
Int32 keyLenInfo_;
NAHeap* heap_;
};
class simple_cbf_key : public NABasicObject
{
public:
simple_cbf_key(char* key = NULL, UInt32 key_len = 0, NAHeap* heap = NULL);
simple_cbf_key(const simple_cbf_key&);
~simple_cbf_key();
// equality operator
NABoolean operator ==(const simple_cbf_key& other) const
{
return ( keyLen_ == other.keyLen_ &&
strncmp(key_, other.key_, keyLen_) == 0 );
}
simple_cbf_key& operator =(const simple_cbf_key& other);
char* getKey() const { return key_; };
UInt32 getKeyLen() const { return keyLen_; };
friend ULng32 scbfHashFunc(const simple_cbf_key& key);
protected:
UInt32 keyLen_;
char* key_;
NAHeap* heap_;
};
// The data structure to represent a key to look up the
// overflow table.
//
// The bucket_ (0-based) field stores the bucket number (the interval in
// a histogram) that the key is associated with.
class cbf_key : public simple_cbf_key
{
public:
enum MFV_ENUM { NONE=2, MFV=0, MFV2=1 };
cbf_key(char* key, UInt32 key_len, UInt32 bucket, cbf_key::MFV_ENUM mfv, NAHeap* heap);
// This one declared but not defined in fast-stats prototype.
cbf_key(char* key = NULL, UInt32 key_len = 0, NAHeap* heap = NULL);
cbf_key(const cbf_key&);
~cbf_key() {}
cbf_key& operator =(const cbf_key& other);
// equality operator
// NABoolean operator ==(const cbf_key& other) const
// {
// return ( keyLen_ == other.keyLen_ &&
// strncmp(key_, other.key_, keyLen_) == 0 );
// }
//char* getKey() const { return key_; };
//UInt32 getKeyLen() const { return keyLen_; };
UInt32 getBucket() const { return bucket_; };
MFV_ENUM getMFV() const { return mfv_; }
friend ULng32 cbfHashFunc(const cbf_key& key);
protected:
//char* key_;
//UInt32 keyLen_;
UInt32 bucket_;
MFV_ENUM mfv_;
//NAHeap* heap_;
};
class freqStruct : public NABasicObject
{
public:
freqStruct(UInt64 freq=0, UInt32 bucket=0) :
freq_(freq), bucket_(bucket) {};
virtual ~freqStruct() {};
void setFreq(UInt64 f) { freq_ = f; };
void setBucket(UInt32 b) { bucket_ = b; };
UInt64 getFreq() { return freq_; };
UInt32 getBucket() { return bucket_; };
Int32 operator==(freqStruct&);
protected:
UInt64 freq_;
Int32 bucket_;
};
class CountingBloomFilter : public BloomFilter
{
public:
CountingBloomFilter(NAHeap* heap, UInt32 maxHashFuncs,
UInt32 n, // # of distinct elements
float p, // probability of false positives
UInt32 nonOverflowFreq, // non-overflow freq of n elements
UInt32 avgSkewedElements,
// # of buckets that keys will fall in.
// Assume each key maps to one
// bucket only
UInt32 numBuckets=1
);
CountingBloomFilter(NAHeap* heap);
~CountingBloomFilter();
// Clear the data structure by removing all frequency info.
void clear();
enum INSERT_ENUM { NORMAL=0, NEW_MFV=1, NO_SLOT=2, PARAM_ERROR=3 };
// Insert into CBF a key. The bucketIdx is the bucket number
// (0-based) of the bucket that the key falls in. A key belongs to one
// bucket only.
//
// If the key is ready present in CBF, its frequency increases.
//
// CBF returns TRUE most of time, except when a key is with a NONE MFV status
// and its frequency overflows into the synoposis table when the CBF is
// CountingBloomFilterWithKnownSkews.
CountingBloomFilter::INSERT_ENUM
insert(char * key, UInt32 key_len, UInt32 bucketIdx=0, cbf_key::MFV_ENUM = cbf_key::NONE);
// Remove from CBF a key. The bucketIdx is the bucket number
// (0-based) of the bucket that the key falls in. A key belongs to one
// bucket only.
//
// If the key is ready present in CBF with a positive frequency , its frequency decreases.
// When the frequency becomes zero, the key is removed from CBF.
//
// Return TRUE when the frequency is positive before the deletion
// FALSE when the frequency is 0 before the deletion
//
NABoolean remove(char * key, UInt32 key_len, UInt32 bucketIdx=0, cbf_key::MFV_ENUM = cbf_key::NONE);
// Check if the key is present in CBF. Returns TRUE
NABoolean contain(char *, UInt32 key_len, UInt64* freq = NULL, UInt32* bucket = NULL);
// Method to fetch freq of freq in non-overflow area
// Usage:
// for ( CollIndex b = 0; b<cbf.numBuckets(); b++ ) {
// for ( CollIndex i = 0; i<cbf.lowF2s(b).entries(); i++ ) {
// UInt32 freq = cbf.lowF2s(b)[i]; // freq is fi
// }
// UInt32 uec = cbf.uec(b); // uec is # of unique value in bucket b
// UInt32 tf = cbf.totalFreq(b); // tf is total frequency in bucket b
// }
UInt32 numBuckets() { return numBuckets_; };
VarUIntArray& lowF2s(UInt32 bucket = 1) { return *freq2sL_[bucket]; } ;
UInt32 uec(UInt32 bucket = 1) { return uecs_[bucket]; } ;
UInt32 totalFreq(UInt32 bucket = 1) { return totalFreqs_[bucket]; } ;
// fetch a particular freq and freqfreq from overflow area.
//
// Usage:
// cbf.computeOverflowF2s();
// for (COllIndex i=0; i<cbf.getOverflowEntries(); i++) {
// x=cbf.highF2(i, y, b); // x, y and b are freqFreq, freq and
// bucket# of the ith entry
// in overflow area, respectively.
// }
virtual UInt32 getOverflowEntries() = 0;
virtual UInt64 highF2(UInt32 i, UInt64& freq, UInt32& bucket);
// Compute f2s for overflow entries. Must be called before
// the f2s can be fetched through highF2() calls.
virtual void computeOverflowF2s() = 0;
// return total freq for both low and high freq items.
UInt64 totalFreqForAll();
// debug helper methods
void printfreqfreq();
void display_key(const char* msg, char *, UInt32 key_len, UInt32 bucket=0);
void display_hash_entry(const char* msg, UInt32 hash);
void display_msg(const char* msg);
virtual NABoolean canHandleArbitrarySkewedValue() = 0;
void computeSumOfFrequencySquared(double* sumSq, Int32 sz);
// Save the current Fi values. Only one copy is saved.
void saveFi();
// Compute the max std dev of [fi(after) - fi(before)] for all buckets
double computeMaxStdDevForDeltaOfFi();
// Compute the standard deviation of variable = [fi(after) - fi(before)].
// fi(before) is the Fi copy saved with call to saveACopyOfFi(),
// fi(after) is the fi currently contained in cbf.
double computeStdDevForDeltaOfFi(Lng32 i);
static
UInt64 estimateMemoryInBytes(
UInt32 maxHashFuncs,
UInt32 n, // # of distinct elements
float p, // probability of false positives
// non-overflow freq of n elements
UInt32 nonOverflowFreq,
UInt32 avgSkewedElements,
// # of buckets that keys will fall in.
// Assume each key maps to one
// bucket only
UInt32 numBuckets=1
);
ULng32 packIntoBuffer(char*& buffer, NABoolean swapBytes);
ULng32 unpackBuffer(char*& buffer);
void setTotalMemSize(Lng32 x) { totalMemSz_ = x; };
Lng32 getTotalMemSize() { return totalMemSz_ ; };
void outputParams(char* buffer) { strcpy(buffer, paramBuf); };
void setLogFile(char* filename) { filename_ = filename; };
protected:
virtual void insertIntoOverflowTable(const cbf_key&) = 0;
virtual void removeFromOverflowTable(const cbf_key&) = 0;
virtual UInt64* searchOverflowTable(const cbf_key&) = 0;
virtual void computeSumOfFrequencySquaredHighFreq(double* sumSq) = 0;
protected:
VarUIntArray freqsL_; // counters for low freq keys
VarUIntArrayPtr* freq2sL_; // freq of freq in non-overflow area
VarUIntArray uecs_; // unique values per bucket
VarUIntArray totalFreqs_; // total frequency per bucket
UInt32 numBuckets_; // # of buckets dividing keys into disjoint sets
freqStruct* freqsH_; // freq in overflow area.
UInt64* freq2sH_; // freq of freq in overflow area
UInt32 actualOverflowF2s_; // actual # of freq of freq in overflow area.
VarUIntArrayPtr* savedFreq2sL_; // Saved copy of freq of freq in
// non-overflow area
Lng32 totalMemSz_; // total mem footprint when pack into a buffer
VarUIntArray bucketNums_; // Bucket nums for low freq keys. That is,
// bucketNums_[x] is the bucket # for
// the key whose low frequency is
// freqsL_[x]. We do not allow two
// keys from different buckets to collide.
// bucketNums_[x] == 0 when the slot
// is not taken by any key.
char paramBuf[200]; // to save the params used by debugging
char* filename_;
};
// A general CBF where the # of potential overflow keys is unbounded.
class GeneralCountingBloomFilter : public CountingBloomFilter
{
public:
GeneralCountingBloomFilter(NAHeap* heap,
UInt32 maxHashFuncs, // # of distinct elements
UInt32 n, // # of distinct elements
float p, // probability of false positives
UInt32 nonOverflowFreq, // non-overflow freq of n elements
UInt32 avgSkewedElements,
// # of buckets that keys will fall in.
// Assume each key maps to one
// bucket only
UInt32 numBuckets=1
);
~GeneralCountingBloomFilter();
virtual UInt32 getOverflowEntries()
{ return overflowCountTable_.entries(); };
NABoolean canHandleArbitrarySkewedValue() { return TRUE; } ;
void computeSumOfFrequencySquaredHighFreq(double* sumSq);
protected:
virtual void insertIntoOverflowTable(const cbf_key&);
virtual void removeFromOverflowTable(const cbf_key&);
virtual UInt64* searchOverflowTable(const cbf_key& key)
{ return overflowCountTable_.getFirstValue(&key); };
virtual void computeOverflowF2s();
static
UInt64 estimateMemoryInBytesForOverflowTable(UInt32 avgSkewedElements,
UInt32 numBuckets
);
protected:
NAHashDictionary<cbf_key, UInt64> overflowCountTable_;
};
//
// A CBF where the # of overflow keys is bounded (to 2 per bucket now).
//
class CountingBloomFilterWithKnownSkews : public CountingBloomFilter
{
public:
CountingBloomFilterWithKnownSkews(NAHeap* heap,
UInt32 maxHashFuncs,
UInt32 n, // expected # of distinct keys with low frequency
float p, // probability of false positives
UInt32 nonOverflowFreq, // avg frequency of keys with low frequency
UInt32 avgSkewedElements, // expected # of distinct keys with high frequency
UInt32 numBuckets=1 // # of buckets that keys will fall in. Assume each key
// maps to one bucket only
);
CountingBloomFilterWithKnownSkews(NAHeap* heap);
~CountingBloomFilterWithKnownSkews();
static
UInt64 estimateMemoryInBytes(
UInt32 maxHashFuncs,
UInt32 n, // # of distinct elements
float p, // probability of false positives
// non-overflow freq of n elements
UInt32 nonOverflowFreq,
UInt32 avgSkewedElements,
// # of buckets that keys will fall in.
// Assume each key maps to one
// bucket only
UInt32 numBuckets=1
);
virtual UInt32 getOverflowEntries() { return actualOverflowF2s_; };
virtual UInt64 highF2(UInt32 i, UInt64& freq, UInt32& bucket);
ULng32 packIntoBuffer(char*& buffer, NABoolean swapBytes);
ULng32 unpackBuffer(char*& buffer);
protected:
virtual void insertIntoOverflowTable(const cbf_key&);
virtual void removeFromOverflowTable(const cbf_key&);
virtual UInt64* searchOverflowTable(const cbf_key& key);
virtual void computeOverflowF2s() {};
NABoolean canHandleArbitrarySkewedValue() { return FALSE; } ;
UInt32 indexInOverflowArray(const cbf_key& key)
{ return 2*key.getBucket() + (UInt32)key.getMFV(); }
static
UInt64 estimateMemoryInBytesForOverflowTable(UInt32 avgSkewedElements,
UInt32 numBuckets
);
void computeSumOfFrequencySquaredHighFreq(double* sumSq);
protected:
};
//----------------------- new code for faststats ---------------------------
class FastStatsCountingBloomFilter : public BloomFilter
{
public:
FastStatsCountingBloomFilter(NAHeap* heap,
UInt32 maxHashFuncs, // # of distinct elements
UInt32 n, // # of distinct elements
float p, // probability of false positives
UInt32 maxNonOverflowFreq // max of non-overflow
// freq of n elements
);
~FastStatsCountingBloomFilter();
NABoolean insert(char * key, UInt32 key_len, UInt32 frequency = 1);
NABoolean remove(char * key, UInt32 key_len);
NABoolean contain(char * key, UInt32 key_len, UInt64* freq = 0);
virtual UInt32 getOverflowEntries()
{ return overflowCountTable_.entries(); };
NABoolean canHandleArbitrarySkewedValue() { return TRUE; } ;
void computeSumOfFrequencySquaredHighFreq(double* sumSq);
void printfreq(const char* colNames);
const NAList<simple_cbf_key>& getAllKeys() { return keys_; };
UInt64 getSizeInBytes(
UInt32 maxHashFuncs,
UInt32 n, // # of distinct elements
float p, // probability of false positives
UInt32 nonOverflowFreq, // non-overflow freq
NABoolean isChar, Int32 actualFixAmount
);
protected:
virtual void insertIntoOverflowTable(const simple_cbf_key&, UInt32 freq = 1);
virtual void removeFromOverflowTable(const simple_cbf_key&);
virtual UInt64* searchOverflowTable(const simple_cbf_key& key)
{ return overflowCountTable_.getFirstValue(&key); };
virtual void computeOverflowF2s();
static
UInt64 estimateMemoryInBytesForOverflowTable(UInt32 avgSkewedElements,
UInt32 numBuckets
);
protected:
VarUIntArray counters_; // counters for low freq keys
NAList<simple_cbf_key> keys_;
NAHashDictionary<simple_cbf_key, UInt64> overflowCountTable_;
};
#endif