blob: 2f487bdb6f9bd92bcdebd3ce542ee8e14c4b8463 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#ifndef COUNT_MIN_HPP_
#define COUNT_MIN_HPP_
#include <iterator>
#include "common_defs.hpp"
namespace datasketches {
/*
* C++ implementation of the CountMin sketch data structure of Cormode and Muthukrishnan.
* [1] - http://dimacs.rutgers.edu/~graham/pubs/papers/cm-full.pdf
* The template type W is the type of the vector that contains the weights of the objects inserted into the sketch,
* not the type of the input items themselves.
* @author Charlie Dickens
*/
template <typename W,
typename Allocator = std::allocator<W>>
class count_min_sketch{
static_assert(std::is_arithmetic<W>::value, "Arithmetic type expected");
public:
using allocator_type = Allocator;
/**
* Creates an instance of the sketch given parameters _num_hashes, _num_buckets and hash seed, `seed`.
* @param num_hashes : number of hash functions in the sketch. Equivalently the number of rows in the array
* @param num_buckets : number of buckets that hash functions map into. Equivalently the number of columns in the array
* @param seed for hash function
*
* The items inserted into the sketch can be arbitrary type, so long as they are hashable via murmurhash.
* Only update and estimate methods are added for uint64_t and string types.
*/
count_min_sketch(uint8_t num_hashes, uint32_t num_buckets, uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator()) ;
/**
* @return configured _num_hashes of this sketch
*/
uint8_t get_num_hashes() const;
/**
* @return configured _num_buckets of this sketch
*/
uint32_t get_num_buckets() const;
/**
* @return configured seed of this sketch
*/
uint64_t get_seed() const;
/**
* @return epsilon : double
* The maximum permissible error for any frequency estimate query.
* epsilon = ceil(e / _num_buckets)
*/
double get_relative_error() const;
/**
* @return _total_weight : typename W
* The total weight currently inserted into the stream.
*/
W get_total_weight() const;
/*
* @param relative_error : double -- the desired accuracy within which estimates should lie.
* For example, when relative_error = 0.05, then the returned frequency estimates satisfy the
* `relative_error` guarantee that never overestimates the weights but may underestimate the weights
* by 5% of the total weight in the sketch.
* @return number_of_buckets : the number of hash buckets at every level of the
* sketch required in order to obtain the specified relative error.
* [1] - Section 3 ``Data Structure'', page 6.
*/
static uint32_t suggest_num_buckets(double relative_error) ;
/*
* @param confidence : double -- the desired confidence with which estimates should be correct.
* For example, with 95% confidence, frequency estimates satisfy the `relative_error` guarantee.
* @return number_of_hashes : the number of hash functions that are required in
* order to achieve the specified confidence of the sketch.
* confidence = 1 - delta, with delta denoting the sketch failure probability in the literature.
* [1] - Section 3 ``Data Structure'', page 6.
*/
static uint8_t suggest_num_hashes(double confidence) ;
/**
* Specific get_estimate function for uint64_t type
* see generic get_estimate function
* @param item : uint64_t type.
* @return an estimate of the item's frequency.
*/
W get_estimate(uint64_t item) const ;
/**
* Specific get_estimate function for int64_t type
* see generic get_estimate function
* @param item : uint64_t type.
* @return an estimate of the item's frequency.
*/
W get_estimate(int64_t item) const ;
/**
* Specific get_estimate function for std::string type
* see generic get_estimate function
* @param item : std::string type
* @return an estimate of the item's frequency.
*/
W get_estimate(const std::string& item) const;
/**
* This is the generic estimate query function for any of the given datatypes.
* Query the sketch for the estimate of a given item.
* @param item : pointer to the data item to be query from the sketch.
* @param size : size_t
* @return the estimated frequency of the item denoted f_est satisfying
* f_true - relative_error*_total_weight <= f_est <= f_true
*/
W get_estimate(const void* item, size_t size) const ;
/**
* Query the sketch for the upper bound of a given item.
* @param item : uint64_t or std::string to query
* @return the upper bound on the true frequency of the item
* f_true <= f_est + relative_error*_total_weight
*/
W get_upper_bound(const void* item, size_t size) const;
W get_upper_bound(int64_t) const ;
W get_upper_bound(uint64_t) const ;
W get_upper_bound(const std::string& item) const;
/**
* Query the sketch for the lower bound of a given item.
* @param item : uint64_t or std::string to query
* @return the lower bound for the query result, f_est, on the true frequency, f_est of the item
* f_true - relative_error*_total_weight <= f_est
*/
W get_lower_bound(const void* item, size_t size) const ;
W get_lower_bound(int64_t) const ;
W get_lower_bound(uint64_t) const ;
W get_lower_bound(const std::string& item) const ;
/*
* Update this sketch with given data of any type.
* This is a "universal" update that covers all cases above,
* but may produce different hashes.
* @param item pointer to the data item to be inserted into the sketch.
* @param size of the data in bytes
* @return vector of uint64_t which each represent the index to which `value' must update in the sketch
*/
void update(const void* item, size_t size, W weight) ;
/**
* Update this sketch with a given uint64_t item.
* @param item : uint64_t to update the sketch with
* @param weight : arithmetic type
* void function which inserts an item of type uint64_t into the sketch
*/
void update(uint64_t item, W weight) ;
void update(uint64_t item) ;
void update(int64_t item, W weight) ;
void update(int64_t item) ;
/**
* Update this sketch with a given string.
* @param item : string to update the sketch with
* @param weight : arithmetic type
* void function which inserts an item of type std::string into the sketch
*/
void update(const std::string& item, W weight) ;
void update(const std::string& item) ;
/*
* merges a separate count_min_sketch into this count_min_sketch.
*/
void merge(const count_min_sketch &other_sketch) ;
/**
* Returns true if this sketch is empty.
* A Count Min Sketch is defined to be empty iff weight == 0
* This can only ever happen if all items inserted to the sketch have weights that cancel each other out.
* @return empty flag
*/
bool is_empty() const ;
/**
* @brief Returns a string describing the sketch
* @return A string with a human-readable description of the sketch
*/
string<Allocator> to_string() const;
// Iterators
using const_iterator = typename std::vector<W, Allocator>::const_iterator ;
const_iterator begin() const;
const_iterator end() const;
/**
* This method serializes the sketch into a given stream in a binary form
* @param os output stream
* The byte output has the following structure
* Byte 0:
* 1 - if and only if the sketch is empty
* 0 - otherwise
*
* Byte 1 (serial version), byte 2 (family id), byte 3 (flags):
* 00000001 - default for now.
*
* Bytes 4 - 7:
* uint8_t zero corresponding to ``empty''
*
* Byte 8:
* uint_8 for number of hash functions
*
* Bytes 9, 13
* 4 bytes : uint32 for number of buckets.
*
* Bytes 14, 15:
* seed_hash
*
* Byte 16:
* uint8_t zero corresponding to ``empty''
*
* All remaining bytes from 17-24 follow the pattern of
* Bytes 17-24:
* Sketch array entry
*
0 || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
||is_empty|ser__ver|familyId| flags |xxxxxxxx|xxxxxxxx|xxxxxxxx|xxxxxxxx|
1 || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
||---------- _num_buckets -----------|num_hash|__seed__ __hash__|xxxxxxxx|
2 || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
||---------------------------- total weight ----------------------------|
3 || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
||---------------------------- sketch entries ---------------------------|
...
*
*
*/
/**
* Computes size needed to serialize the current state of the sketch.
* @return size in bytes needed to serialize this sketch
*/
size_t get_serialized_size_bytes() const;
/**
* This method serializes a binary image of the sketch to an output stream.
*/
void serialize(std::ostream& os) const;
// This is a convenience alias for users
// The type returned by the following serialize method
using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>>;
/**
* This method serializes the sketch as a vector of bytes.
* An optional header can be reserved in front of the sketch.
* It is an uninitialized space of a given size.
* This header is used in Datasketches PostgreSQL extension.
* @param header_size_bytes space to reserve in front of the sketch
*/
vector_bytes serialize(unsigned header_size_bytes = 0) const;
/**
* This method deserializes a sketch from a given stream.
* @param is input stream
* @param seed the seed for the hash function that was used to create the sketch
* @return an instance of a sketch
*/
static count_min_sketch deserialize(std::istream& is, uint64_t seed=DEFAULT_SEED, const Allocator& allocator = Allocator());
/**
* This method deserializes a sketch from a given array of bytes.
* @param bytes pointer to the array of bytes
* @param size the size of the array
* @param seed the seed for the hash function that was used to create the sketch
* @return an instance of the sketch
*/
static count_min_sketch deserialize(const void* bytes, size_t size, uint64_t seed=DEFAULT_SEED, const Allocator& allocator = Allocator());
/**
* Returns the allocator for this sketch.
* @return allocator
*/
allocator_type get_allocator() const;
private:
Allocator _allocator;
uint8_t _num_hashes ;
uint32_t _num_buckets ;
std::vector<W, Allocator> _sketch_array ; // the array stored by the sketch
uint64_t _seed ;
W _total_weight ;
std::vector<uint64_t> hash_seeds ;
enum flags {IS_EMPTY};
static const uint8_t PREAMBLE_LONGS_SHORT = 2; // Empty -> need second byte for sketch parameters
static const uint8_t PREAMBLE_LONGS_FULL = 3; // Not empty -> need (at least) third byte for total weight.
static const uint8_t SERIAL_VERSION_1 = 1;
static const uint8_t FAMILY_ID = 18;
static const uint8_t NULL_8 = 0;
static const uint32_t NULL_32 = 0;
/**
* Throws an error if the header is not valid.
* @param preamble_longs
* @param serial_version
* @param flags_byte
*/
static void check_header_validity(uint8_t preamble_longs, uint8_t serial_version, uint8_t family_id, uint8_t flags_byte);
/*
* Obtain the hash values when inserting an item into the sketch.
* @param item pointer to the data item to be inserted into the sketch.
* @param size of the data in bytes
* @return vector of uint64_t which each represent the index to which `value' must update in the sketch
*/
std::vector<uint64_t> get_hashes(const void* item, size_t size) const;
};
} /* namespace datasketches */
#include "count_min_impl.hpp"
#endif