blob: 734f4778051d26cd0e8b3d056ebf80c45fabeb85 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#ifndef DENSITY_SKETCH_HPP_
#define DENSITY_SKETCH_HPP_
#include <type_traits>
#include <vector>
#include <functional>
#include <numeric>
#include <cmath>
#include "common_defs.hpp"
/*
* Based on the following paper:
* Zohar Karnin, Edo Liberty "Discrepancy, Coresets, and Sketches in Machine Learning"
* https://proceedings.mlr.press/v99/karnin19a/karnin19a.pdf
*
* Inspired by the following implementation:
* https://github.com/edoliberty/streaming-quantiles/blob/f688c8161a25582457b0a09deb4630a81406293b/gde.py
*/
namespace datasketches {
template<typename T>
struct gaussian_kernel {
T operator()(const std::vector<T>& v1, const std::vector<T>& v2) const {
return exp(-std::inner_product(v1.begin(), v1.end(), v2.begin(), 0.0, std::plus<T>(), [](T a, T b){return (a-b)*(a-b);}));
}
};
template<
typename T,
typename Kernel = gaussian_kernel<T>,
typename Allocator = std::allocator<T>
>
class density_sketch {
static_assert(std::is_floating_point<T>::value, "Floating point type expected");
public:
using Vector = std::vector<T, Allocator>;
using Level = std::vector<Vector, typename std::allocator_traits<Allocator>::template rebind_alloc<Vector>>;
using Levels = std::vector<Level, typename std::allocator_traits<Allocator>::template rebind_alloc<Level>>;
/**
* Constructor
* @param k controls the size and error of the sketch.
* @param dim dimension of the input domain
* @param kernel to use by this instance
* @param allocator to use by this instance
*/
density_sketch(uint16_t k, uint32_t dim, const Kernel& kernel = Kernel(), const Allocator& allocator = Allocator());
/**
* Returns configured parameter K
* @return parameter K
*/
uint16_t get_k() const;
/**
* Returns configured dimensions
* @return dimensions
*/
uint32_t get_dim() const;
/**
* Returns true if this sketch is empty.
* @return empty flag
*/
bool is_empty() const;
/**
* Returns the length of the input stream (number of points observed by this sketch).
* @return stream length
*/
uint64_t get_n() const;
/**
* Returns the number of retained points in the sketch.
* @return number of retained points
*/
uint32_t get_num_retained() const;
/**
* Returns true if this sketch is in estimation mode.
* @return estimation mode flag
*/
bool is_estimation_mode() const;
/**
* Updates this sketch with a given point.
* @param point given point
*/
template<typename FwdVector>
void update(FwdVector&& point);
/**
* Merges another sketch into this one.
* @param other sketch to merge into this one
*/
template<typename FwdSketch>
void merge(FwdSketch&& other);
T get_estimate(const std::vector<T>& point) const;
/**
* Returns an instance of the allocator for this sketch.
* @return allocator
*/
Allocator get_allocator() const;
/**
* This method serializes the sketch into a given stream in a binary form
* @param os output stream
*/
void serialize(std::ostream& os) const;
using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>>;
/**
* This method serializes the sketch as a vector of bytes.
* An optional header can be reserved in front of the sketch.
* It is an uninitialized space of a given size.
* This header is used in Datasketches PostgreSQL extension.
* @param header_size_bytes space to reserve in front of the sketch
*/
vector_bytes serialize(unsigned header_size_bytes = 0) const;
/**
* This method deserializes a sketch from a given stream.
* @param is input stream
* @param kernel the kernel function to use for this sketch
* @param allocator the memory allocator to use with this sketch
* @return an instance of the sketch
*/
static density_sketch deserialize(std::istream& is,
const Kernel& kernel=Kernel(), const Allocator& allocator = Allocator());
/**
* This method deserializes a sketch from a given array of bytes.
* @param bytes pointer to the array of bytes
* @param size the size of the array
* @param kernel the kernel function to use for this sketch
* @param allocator the memory allocator to use with this sketch
* @return an instance of the sketch
*/
static density_sketch deserialize(const void* bytes, size_t size,
const Kernel& kernel=Kernel(), const Allocator& allocator = Allocator());
/**
* Prints a summary of the sketch.
* @param print_levels if true include information about levels
* @param print_items if true include sketch data
*/
string<Allocator> to_string(bool print_levels = false, bool print_items = false) const;
class const_iterator;
const_iterator begin() const;
const_iterator end() const;
private:
enum flags { RESERVED0, RESERVED1, IS_EMPTY };
static const uint8_t PREAMBLE_INTS_SHORT = 3;
static const uint8_t PREAMBLE_INTS_LONG = 6;
static const uint8_t FAMILY_ID = 19;
static const uint8_t SERIAL_VERSION = 1;
static const size_t LEVELS_ARRAY_START = 5;
Allocator allocator_;
Kernel kernel_;
uint16_t k_;
uint32_t dim_;
uint32_t num_retained_;
uint64_t n_;
Levels levels_;
void compact();
void compact_level(unsigned height);
static void check_k(uint16_t k);
static void check_serial_version(uint8_t serial_version);
static void check_family_id(uint8_t family_id);
static void check_header_validity(uint8_t preamble_ints, uint8_t flags_byte, uint8_t serial_version);
density_sketch(uint16_t k, uint32_t dim, uint32_t num_retained, uint64_t n, Levels&& levels,
const Kernel& kernel = Kernel());
};
template<typename T, typename K, typename A>
class density_sketch<T, K, A>::const_iterator {
public:
using Vector = density_sketch<T, K, A>::Vector;
using iterator_category = std::input_iterator_tag;
using value_type = std::pair<const Vector&, const uint64_t>;
using difference_type = void;
using pointer = return_value_holder<value_type>;
using reference = const value_type;
const_iterator& operator++();
const_iterator& operator++(int);
bool operator==(const const_iterator& other) const;
bool operator!=(const const_iterator& other) const;
const value_type operator*() const;
const return_value_holder<value_type> operator->() const;
private:
using LevelsIterator = typename density_sketch<T, K, A>::Levels::const_iterator;
using LevelIterator = typename density_sketch<T, K, A>::Level::const_iterator;
LevelsIterator levels_it_;
LevelsIterator levels_end_;
LevelIterator level_it_;
unsigned height_;
friend class density_sketch<T, K, A>;
const_iterator(LevelsIterator begin, LevelsIterator end);
};
} /* namespace datasketches */
#include "density_sketch_impl.hpp"
#endif