blob: 383e5736ed7121e8833a8b0a26ad2291388513b0 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#ifndef TUPLE_SKETCH_HPP_
#define TUPLE_SKETCH_HPP_
#include <string>
#include "serde.hpp"
#include "theta_update_sketch_base.hpp"
namespace datasketches {
// forward declarations
template<typename S, typename A> class tuple_sketch;
template<typename S, typename U, typename P, typename A> class update_tuple_sketch;
template<typename S, typename A> class compact_tuple_sketch;
template<typename A> class theta_sketch_alloc;
template<typename K, typename V>
struct pair_extract_key {
K& operator()(std::pair<K, V>& entry) const {
return entry.first;
}
const K& operator()(const std::pair<K, V>& entry) const {
return entry.first;
}
};
/**
* Base class for Tuple sketch.
* This is an extension of Theta sketch that allows keeping arbitrary Summary associated with each retained key.
*/
template<
typename Summary,
typename Allocator = std::allocator<Summary>
>
class tuple_sketch {
public:
using Entry = std::pair<uint64_t, Summary>;
using ExtractKey = pair_extract_key<uint64_t, Summary>;
using iterator = theta_iterator<Entry, ExtractKey>;
using const_iterator = theta_const_iterator<Entry, ExtractKey>;
virtual ~tuple_sketch() = default;
/**
* @return allocator
*/
virtual Allocator get_allocator() const = 0;
/**
* @return true if this sketch represents an empty set (not the same as no retained entries!)
*/
virtual bool is_empty() const = 0;
/**
* @return estimate of the distinct count of the input stream
*/
double get_estimate() const;
/**
* Returns the approximate lower error bound given a number of standard deviations over an arbitrary number of
* items stored in the sketch.
* This parameter is similar to the number of standard deviations of the normal distribution
* and corresponds to approximately 67%, 95% and 99% confidence intervals.
* @param num_std_devs number of Standard Deviations (1, 2 or 3)
* @param num_subset_entries number of items from {0, 1, ..., get_num_retained()} over which to estimate the bound
* @return the lower bound
*/
double get_lower_bound(uint8_t num_std_devs, uint32_t num_subset_entries) const ;
/**
* Returns the approximate lower error bound given a number of standard deviations.
* This parameter is similar to the number of standard deviations of the normal distribution
* and corresponds to approximately 67%, 95% and 99% confidence intervals.
* @param num_std_devs number of Standard Deviations (1, 2 or 3)
* @return the lower bound
*/
double get_lower_bound(uint8_t num_std_devs) const;
/**
* Returns the approximate upper error bound given a number of standard deviations over an arbitrary number of
* items stored in the sketch.
* This parameter is similar to the number of standard deviations of the normal distribution
* and corresponds to approximately 67%, 95% and 99% confidence intervals.
* @param num_std_devs number of Standard Deviations (1, 2 or 3)
* @param num_subset_entries number of items from {0, 1, ..., get_num_retained()} over which to estimate the bound
* @return the lower bound
*/
double get_upper_bound(uint8_t num_std_devs, uint32_t num_subset_entries) const ;
/**
* Returns the approximate upper error bound given a number of standard deviations.
* This parameter is similar to the number of standard deviations of the normal distribution
* and corresponds to approximately 67%, 95% and 99% confidence intervals.
* @param num_std_devs number of Standard Deviations (1, 2 or 3)
* @return the upper bound
*/
double get_upper_bound(uint8_t num_std_devs) const;
/**
* @return true if the sketch is in estimation mode (as opposed to exact mode)
*/
bool is_estimation_mode() const;
/**
* @return theta as a fraction from 0 to 1 (effective sampling rate)
*/
double get_theta() const;
/**
* @return theta as a positive integer between 0 and LLONG_MAX
*/
virtual uint64_t get_theta64() const = 0;
/**
* @return the number of retained entries in the sketch
*/
virtual uint32_t get_num_retained() const = 0;
/**
* @return hash of the seed that was used to hash the input
*/
virtual uint16_t get_seed_hash() const = 0;
/**
* @return true if retained entries are ordered
*/
virtual bool is_ordered() const = 0;
/**
* Provides a human-readable summary of this sketch as a string
* @param print_items if true include the list of items retained by the sketch
* @return sketch summary as a string
*/
string<Allocator> to_string(bool print_items = false) const;
/**
* Iterator over entries in this sketch.
* @return begin iterator
*/
virtual iterator begin() = 0;
/**
* Iterator pointing past the valid range.
* Not to be incremented or dereferenced.
* @return end iterator
*/
virtual iterator end() = 0;
/**
* Const iterator over entries in this sketch.
* @return begin const iterator
*/
virtual const_iterator begin() const = 0;
/**
* Const iterator pointing past the valid range.
* Not to be incremented or dereferenced.
* @return end const iterator
*/
virtual const_iterator end() const = 0;
protected:
virtual void print_specifics(std::ostringstream& os) const = 0;
static uint16_t get_seed_hash(uint64_t seed);
static void check_sketch_type(uint8_t actual, uint8_t expected);
static void check_serial_version(uint8_t actual, uint8_t expected);
static void check_seed_hash(uint16_t actual, uint16_t expected);
};
// update sketch
// for types with defined default constructor and + operation
template<typename Summary, typename Update>
struct default_tuple_update_policy {
Summary create() const {
return Summary();
}
void update(Summary& summary, const Update& update) const {
summary += update;
}
};
/**
* Update Tuple sketch.
* The purpose of this class is to build a Tuple sketch from input data via the update() methods.
* There is no constructor. Use builder instead.
*/
template<
typename Summary,
typename Update = Summary,
typename Policy = default_tuple_update_policy<Summary, Update>,
typename Allocator = std::allocator<Summary>
>
class update_tuple_sketch: public tuple_sketch<Summary, Allocator> {
public:
using Base = tuple_sketch<Summary, Allocator>;
using Entry = typename Base::Entry;
using ExtractKey = typename Base::ExtractKey;
using iterator = typename Base::iterator;
using const_iterator = typename Base::const_iterator;
using AllocEntry = typename std::allocator_traits<Allocator>::template rebind_alloc<Entry>;
using tuple_map = theta_update_sketch_base<Entry, ExtractKey, AllocEntry>;
using resize_factor = typename tuple_map::resize_factor;
// No constructor here. Use builder instead.
class builder;
update_tuple_sketch(const update_tuple_sketch&) = default;
update_tuple_sketch(update_tuple_sketch&&) noexcept = default;
virtual ~update_tuple_sketch() = default;
update_tuple_sketch& operator=(const update_tuple_sketch&) = default;
update_tuple_sketch& operator=(update_tuple_sketch&&) = default;
virtual Allocator get_allocator() const;
virtual bool is_empty() const;
virtual bool is_ordered() const;
virtual uint64_t get_theta64() const;
virtual uint32_t get_num_retained() const;
virtual uint16_t get_seed_hash() const;
/**
* @return configured nominal number of entries in the sketch
*/
uint8_t get_lg_k() const;
/**
* @return configured resize factor of the sketch
*/
resize_factor get_rf() const;
/**
* Update this sketch with a given string.
* @param key string to update the sketch with
* @param value to update the sketch with
*/
template<typename FwdUpdate>
inline void update(const std::string& key, FwdUpdate&& value);
/**
* Update this sketch with a given unsigned 64-bit integer.
* @param key uint64_t to update the sketch with
* @param value to update the sketch with
*/
template<typename FwdUpdate>
inline void update(uint64_t key, FwdUpdate&& value);
/**
* Update this sketch with a given signed 64-bit integer.
* @param key int64_t to update the sketch with
* @param value to update the sketch with
*/
template<typename FwdUpdate>
inline void update(int64_t key, FwdUpdate&& value);
/**
* Update this sketch with a given unsigned 32-bit integer.
* For compatibility with Java implementation.
* @param key uint32_t to update the sketch with
* @param value to update the sketch with
*/
template<typename FwdUpdate>
inline void update(uint32_t key, FwdUpdate&& value);
/**
* Update this sketch with a given signed 32-bit integer.
* For compatibility with Java implementation.
* @param key int32_t to update the sketch with
* @param value to update the sketch with
*/
template<typename FwdUpdate>
inline void update(int32_t key, FwdUpdate&& value);
/**
* Update this sketch with a given unsigned 16-bit integer.
* For compatibility with Java implementation.
* @param key uint16_t to update the sketch with
* @param value to update the sketch with
*/
template<typename FwdUpdate>
inline void update(uint16_t key, FwdUpdate&& value);
/**
* Update this sketch with a given signed 16-bit integer.
* For compatibility with Java implementation.
* @param key int16_t to update the sketch with
* @param value to update the sketch with
*/
template<typename FwdUpdate>
inline void update(int16_t key, FwdUpdate&& value);
/**
* Update this sketch with a given unsigned 8-bit integer.
* For compatibility with Java implementation.
* @param key uint8_t to update the sketch with
* @param value to update the sketch with
*/
template<typename FwdUpdate>
inline void update(uint8_t key, FwdUpdate&& value);
/**
* Update this sketch with a given signed 8-bit integer.
* For compatibility with Java implementation.
* @param key int8_t to update the sketch with
* @param value to update the sketch with
*/
template<typename FwdUpdate>
inline void update(int8_t key, FwdUpdate&& value);
/**
* Update this sketch with a given double-precision floating point value.
* For compatibility with Java implementation.
* @param key double to update the sketch with
* @param value to update the sketch with
*/
template<typename FwdUpdate>
inline void update(double key, FwdUpdate&& value);
/**
* Update this sketch with a given floating point value.
* For compatibility with Java implementation.
* @param key float to update the sketch with
* @param value to update the sketch with
*/
template<typename FwdUpdate>
inline void update(float key, FwdUpdate&& value);
/**
* Update this sketch with given data of any type.
* This is a "universal" update that covers all cases above,
* but may produce different hashes.
* Be very careful to hash input values consistently using the same approach
* both over time and on different platforms
* and while passing sketches between C++ environment and Java environment.
* Otherwise two sketches that should represent overlapping sets will be disjoint
* For instance, for signed 32-bit values call update(int32_t) method above,
* which does widening conversion to int64_t, if compatibility with Java is expected
* @param key pointer to the data
* @param length of the data in bytes
* @param value to update the sketch with
*/
template<typename FwdUpdate>
void update(const void* key, size_t length, FwdUpdate&& value);
/**
* Remove retained entries in excess of the nominal size k (if any)
*/
void trim();
/**
* Reset the sketch to the initial empty state
*/
void reset();
/**
* Converts this sketch to a compact sketch (ordered or unordered).
* @param ordered optional flag to specify if an ordered sketch should be produced
* @return compact sketch
*/
compact_tuple_sketch<Summary, Allocator> compact(bool ordered = true) const;
virtual iterator begin();
virtual iterator end();
virtual const_iterator begin() const;
virtual const_iterator end() const;
protected:
Policy policy_;
tuple_map map_;
// for builder
update_tuple_sketch(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const Policy& policy, const Allocator& allocator);
virtual void print_specifics(std::ostringstream& os) const;
};
/**
* Compact Tuple sketch.
* This is an immutable form of the Tuple sketch, the form that can be serialized and deserialized.
*/
template<
typename Summary,
typename Allocator = std::allocator<Summary>
>
class compact_tuple_sketch: public tuple_sketch<Summary, Allocator> {
public:
using Base = tuple_sketch<Summary, Allocator>;
using Entry = typename Base::Entry;
using ExtractKey = typename Base::ExtractKey;
using iterator = typename Base::iterator;
using const_iterator = typename Base::const_iterator;
using AllocEntry = typename std::allocator_traits<Allocator>::template rebind_alloc<Entry>;
using AllocU64 = typename std::allocator_traits<Allocator>::template rebind_alloc<uint64_t>;
using AllocBytes = typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>;
using vector_bytes = std::vector<uint8_t, AllocBytes>;
using comparator = compare_by_key<ExtractKey>;
static const uint8_t SERIAL_VERSION_LEGACY = 1;
static const uint8_t SERIAL_VERSION = 3;
static const uint8_t SKETCH_FAMILY = 9;
static const uint8_t SKETCH_TYPE = 1;
static const uint8_t SKETCH_TYPE_LEGACY = 5;
enum flags { IS_BIG_ENDIAN, IS_READ_ONLY, IS_EMPTY, IS_COMPACT, IS_ORDERED };
// Instances of this type can be obtained:
// - by compacting an update_tuple_sketch
// - as a result of a set operation
// - by deserializing a previously serialized compact sketch
/**
* Copy constructor.
* Constructs a compact sketch from another sketch (either update or compact)
* @param other sketch to be copied
* @param ordered if true make the resulting sketch ordered
*/
compact_tuple_sketch(const Base& other, bool ordered);
/**
* Copy constructor.
* @param other sketch to be copied
*/
compact_tuple_sketch(const compact_tuple_sketch& other) = default;
/**
* Move constructor.
* @param other sketch to be moved
*/
compact_tuple_sketch(compact_tuple_sketch&&) noexcept;
virtual ~compact_tuple_sketch() = default;
/**
* Copy assignment
* @param other sketch to be copied
* @return reference to this sketch
*/
compact_tuple_sketch& operator=(const compact_tuple_sketch& other) = default;
/**
* Move assignment
* @param other sketch to be moved
* @return reference to this sketch
*/
compact_tuple_sketch& operator=(compact_tuple_sketch&& other) = default;
/**
* Constructor from Theta sketch
* @param other Theta sketch to be constructed from
* @param summary Summary instance to be associated with each entry
* @param ordered if true make the resulting sketch ordered
*/
compact_tuple_sketch(const theta_sketch_alloc<AllocU64>& other, const Summary& summary, bool ordered = true);
virtual Allocator get_allocator() const;
virtual bool is_empty() const;
virtual bool is_ordered() const;
virtual uint64_t get_theta64() const;
virtual uint32_t get_num_retained() const;
virtual uint16_t get_seed_hash() const;
/**
* This method serializes the sketch into a given stream in a binary form
* @param os output stream
* @param sd instance of a SerDe
*/
template<typename SerDe = serde<Summary>>
void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
/**
* This method serializes the sketch as a vector of bytes.
* An optional header can be reserved in front of the sketch.
* It is a blank space of a given size.
* This header is used in Datasketches PostgreSQL extension.
* @param header_size_bytes space to reserve in front of the sketch
* @param sd instance of a SerDe
* @return serialized sketch as a vector of bytes
*/
template<typename SerDe = serde<Summary>>
vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
virtual iterator begin();
virtual iterator end();
virtual const_iterator begin() const;
virtual const_iterator end() const;
/**
* This method deserializes a sketch from a given stream.
* @param is input stream
* @param seed the seed for the hash function that was used to create the sketch
* @param sd instance of a SerDe
* @param allocator instance of an Allocator
* @return an instance of a sketch
*/
template<typename SerDe = serde<Summary>>
static compact_tuple_sketch deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED,
const SerDe& sd = SerDe(), const Allocator& allocator = Allocator());
/**
* This method deserializes a sketch from a given array of bytes.
* @param bytes pointer to the array of bytes
* @param size the size of the array
* @param seed the seed for the hash function that was used to create the sketch
* @param sd instance of a SerDe
* @param allocator instance of an Allocator
* @return an instance of the sketch
*/
template<typename SerDe = serde<Summary>>
static compact_tuple_sketch deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED,
const SerDe& sd = SerDe(), const Allocator& allocator = Allocator());
protected:
bool is_empty_;
bool is_ordered_;
uint16_t seed_hash_;
uint64_t theta_;
std::vector<Entry, AllocEntry> entries_;
/**
* Computes size needed to serialize summaries in the sketch.
* This version is for fixed-size arithmetic types (integral and floating point).
* @return size in bytes needed to serialize summaries in this sketch
*/
template<typename SerDe, typename SS = Summary, typename std::enable_if<std::is_arithmetic<SS>::value, int>::type = 0>
size_t get_serialized_size_summaries_bytes(const SerDe& sd) const;
/**
* Computes size needed to serialize summaries in the sketch.
* This version is for all other types and can be expensive since every item needs to be looked at.
* @return size in bytes needed to serialize summaries in this sketch
*/
template<typename SerDe, typename SS = Summary, typename std::enable_if<!std::is_arithmetic<SS>::value, int>::type = 0>
size_t get_serialized_size_summaries_bytes(const SerDe& sd) const;
// for deserialize
class deleter_of_summaries {
public:
deleter_of_summaries(uint32_t num, bool destroy, const Allocator& allocator):
allocator_(allocator), num_(num), destroy_(destroy) {}
void set_destroy(bool destroy) { destroy_ = destroy; }
void operator() (Summary* ptr) {
if (ptr != nullptr) {
if (destroy_) {
for (uint32_t i = 0; i < num_; ++i) ptr[i].~Summary();
}
allocator_.deallocate(ptr, num_);
}
}
private:
Allocator allocator_;
uint32_t num_;
bool destroy_;
};
virtual void print_specifics(std::ostringstream& os) const;
template<typename E, typename EK, typename P, typename S, typename CS, typename A> friend class theta_union_base;
template<typename E, typename EK, typename P, typename S, typename CS, typename A> friend class theta_intersection_base;
template<typename E, typename EK, typename CS, typename A> friend class theta_set_difference_base;
compact_tuple_sketch(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector<Entry, AllocEntry>&& entries);
};
/// Tuple base builder
template<typename Derived, typename Policy, typename Allocator>
class tuple_base_builder: public theta_base_builder<Derived, Allocator> {
public:
tuple_base_builder(const Policy& policy, const Allocator& allocator);
protected:
Policy policy_;
};
/// Update Tuple sketch builder
template<typename S, typename U, typename P, typename A>
class update_tuple_sketch<S, U, P, A>::builder: public tuple_base_builder<builder, P, A> {
public:
/**
* Constructor
* Creates and instance of the builder with default parameters.
* @param policy user-defined way of creating and updating Summary
* @param allocator instance of an Allocator to pass to created sketches
*/
builder(const P& policy = P(), const A& allocator = A());
/**
* This is to create an instance of the sketch with predefined parameters.
* @return an instance of the sketch
*/
update_tuple_sketch<S, U, P, A> build() const;
};
} /* namespace datasketches */
#include "tuple_sketch_impl.hpp"
#endif