blob: 7ddb28285c694719c38386cf0031468a193fb8fd [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#ifndef _EBPPS_SAMPLE_HPP_
#define _EBPPS_SAMPLE_HPP_
#include "common_defs.hpp"
#include "optional.hpp"
#include "serde.hpp"
#include <memory>
#include <vector>
namespace datasketches {
template<
typename T,
typename A = std::allocator<T>
>
class ebpps_sample {
public:
explicit ebpps_sample(uint32_t k, const A& allocator = A());
// for deserialization
class items_deleter;
ebpps_sample(std::vector<T, A>&& data, optional<T>&& partial_item, double c, const A& allocator = A());
// used instead of having a single-item constructor for update/merge calls
template<typename TT>
void replace_content(TT&& item, double theta);
void reset();
void downsample(double theta);
template<typename FwdSample>
void merge(FwdSample&& other);
// standard way to query the sample
using result_type = std::vector<T, A>;
result_type get_sample() const;
double get_c() const;
// intended for internal use
// returns only full items
result_type get_full_items() const;
// intended for internal use
// handles only the partial item
bool has_partial_item() const;
T get_partial_item() const;
string<A> to_string() const;
/**
* @brief Returns the number of items contained in the sample
* Computes the number of items, full or partial, currently in the sample.
* The result should match ceiling(c);
* @return the number of items contained in the sample
*/
inline uint32_t get_num_retained_items() const;
/**
* Computes size needed to serialize the current state of the sample.
* This version is for fixed-size arithmetic types (integral and floating point).
* @param sd instance of a SerDe
* @return size in bytes needed to serialize the items in this sample
*/
template<typename TT = T, typename SerDe = serde<T>, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
inline size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
/**
* Computes size needed to serialize the items in the sample.
* This version is for all other types and can be expensive since every item needs to be looked at.
* @param sd instance of a SerDe
* @return size in bytes needed to serialize the items in this sample
*/
template<typename TT = T, typename SerDe = serde<T>, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
inline size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
// This is a convenience alias for users
// The type returned by the following serialize method
using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<A>::template rebind_alloc<uint8_t>>;
/**
* This method serializes the sample as a vector of bytes.
* @param ptr pointer to start of pre-allocated memory to use
* @param end_ptr pointer to end of pre-allocated memory to use
* @param sd instance of a SerDe
* @return number of bytes written
*/
template<typename SerDe = serde<T>>
size_t serialize(uint8_t* ptr, const uint8_t* end_ptr, const SerDe& sd = SerDe()) const;
/**
* This method serializes the sample into a given stream in a binary form
* @param os output stream
* @param sd instance of a SerDe
*/
template<typename SerDe = serde<T>>
void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
/**
* This method deserializes a sample from a given array of bytes.
* @param ptr pointer to the array of bytes
* @param size the size of the array
* @param sd instance of a SerDe
* @param allocator instance of an allocator
* @return a std::pair with a sample and the number of bytes read
*/
template<typename SerDe = serde<T>>
static std::pair<ebpps_sample, size_t> deserialize(const uint8_t* ptr, size_t size, const SerDe& sd = SerDe(), const A& allocator = A());
/**
* This method deserializes a sample from a given stream.
* @param is input stream
* @param sd instance of a SerDe
* @param allocator instance of an allocator
* @return an instance of a sample
*/
template<typename SerDe = serde<T>>
static ebpps_sample deserialize(std::istream& is, const SerDe& sd = SerDe(), const A& allocator = A());
class const_iterator;
/**
* Iterator pointing to the first item in the sample.
* If the sample is empty, the returned iterator must not be dereferenced or incremented
* @param force_partial forces the inclusion of the partial item, if one exists
* @return iterator pointing to the first item in the sample
*/
const_iterator begin() const;
/**
* Iterator pointing to the past-the-end item in the sample.
* The past-the-end item is the hypothetical item that would follow the last item.
* It does not point to any item, and must not be dereferenced or incremented.
* @return iterator pointing to the past-the-end item in the sample
*/
const_iterator end() const;
private:
A allocator_;
double c_; // Current sample size, including fractional part
optional<T> partial_item_; // a sample item corresponding to a partial weight
std::vector<T, A> data_; // stored sampled items
template<typename FwdItem>
inline void set_partial(FwdItem&& item);
void swap_with_partial();
void move_one_to_partial();
void subsample(uint32_t num_samples);
static inline uint32_t random_idx(uint32_t max);
static inline double next_double();
friend class const_iterator;
};
template<typename T, typename A>
class ebpps_sample<T, A>::const_iterator {
public:
using iterator_category = std::input_iterator_tag;
using value_type = const T&;
using difference_type = void;
using pointer = const return_value_holder<value_type>;
using reference = value_type;
const_iterator(const const_iterator& other);
const_iterator& operator++();
const_iterator& operator++(int);
bool operator==(const const_iterator& other) const;
bool operator!=(const const_iterator& other) const;
reference operator*() const;
pointer operator->() const;
private:
static const size_t PARTIAL_IDX = static_cast<size_t>(-1);
// default iterator over sample
const_iterator(const ebpps_sample<T, A>* sample);
const ebpps_sample<T, A>* sample_;
size_t idx_;
bool use_partial_;
friend class ebpps_sample;
};
} // namespace datasketches
#include "ebpps_sample_impl.hpp"
#endif // _EBPPS_SAMPLE_HPP_