blob: c3ab6af0be835e60b2eaeb67905fcb6eb50947a7 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#ifndef QUANTILES_SORTED_VIEW_HPP_
#define QUANTILES_SORTED_VIEW_HPP_
#include <functional>
#include <cmath>
#include "common_defs.hpp"
namespace datasketches {
/**
* Sorted view for quantiles sketches (REQ, KLL and Quantiles)
*/
template<
typename T,
typename Comparator, // strict weak ordering function (see C++ named requirements: Compare)
typename Allocator
>
class quantiles_sorted_view {
public:
/// Entry type
using Entry = typename std::conditional<std::is_arithmetic<T>::value, std::pair<T, uint64_t>, std::pair<const T*, uint64_t>>::type;
using AllocEntry = typename std::allocator_traits<Allocator>::template rebind_alloc<Entry>;
using Container = std::vector<Entry, AllocEntry>;
/// @private
quantiles_sorted_view(uint32_t num, const Comparator& comparator, const Allocator& allocator);
/// @private
template<typename Iterator>
void add(Iterator begin, Iterator end, uint64_t weight);
/// @private
void convert_to_cummulative();
class const_iterator;
/**
* Iterator pointing to the first entry in the view.
* If the view is empty, the returned iterator must not be dereferenced or incremented.
* @return iterator pointing to the first entry
*/
const_iterator begin() const;
/**
* Iterator pointing to the past-the-end entry in the view.
* The past-the-end entry is the hypothetical entry that would follow the last entry.
* It does not point to any entry, and must not be dereferenced or incremented.
* @return iterator pointing to the past-the-end entry
*/
const_iterator end() const;
/// @return size of the view
size_t size() const;
/**
* Returns an approximation to the normalized rank of the given item.
*
* <p>If the view is empty this throws std::runtime_error.
*
* @param item to be ranked
* @param inclusive if true the weight of the given item is included into the rank.
* Otherwise the rank equals the sum of the weights of all items that are less than the given item
* according to the Comparator.
*
* @return an approximate normalized rank of the given item (0 to 1 inclusive)
*/
double get_rank(const T& item, bool inclusive = true) const;
/**
* Quantile return type.
* This is to return quantiles either by value (for arithmetic types) or by const reference (for all other types)
*/
using quantile_return_type = typename std::conditional<std::is_arithmetic<T>::value, T, const T&>::type;
/**
* Returns an item from the sketch that is the best approximation to an item
* from the original stream with the given normalized rank.
*
* <p>If the view is empty this throws std::runtime_error.
*
* @param rank of an item in the hypothetical sorted stream.
* @param inclusive if true, the given rank is considered inclusive (includes weight of an item)
*
* @return approximate quantile associated with the given normalized rank
*/
quantile_return_type get_quantile(double rank, bool inclusive = true) const;
using vector_double = std::vector<double, typename std::allocator_traits<Allocator>::template rebind_alloc<double>>;
/**
* Returns an approximation to the Cumulative Distribution Function (CDF), which is the
* cumulative analog of the PMF, of the input stream given a set of split points (items).
*
* <p>If the view is empty this throws std::runtime_error.
*
* @param split_points an array of <i>m</i> unique, monotonically increasing items
* that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
*
* @param size the number of split points in the array
*
* @param inclusive if true the rank of an item includes its own weight, and therefore
* if the sketch contains items equal to a slit point, then in CDF such items are
* included into the interval to the left of split point. Otherwise they are included into
* the interval to the right of split point.
*
* @return an array of m+1 doubles, which are a consecutive approximation to the CDF
* of the input stream given the split_points. The value at array position j of the returned
* CDF array is the sum of the returned values in positions 0 through j of the returned PMF
* array. This can be viewed as array of ranks of the given split points plus one more value
* that is always 1.
*/
vector_double get_CDF(const T* split_points, uint32_t size, bool inclusive = true) const;
/**
* Returns an approximation to the Probability Mass Function (PMF) of the input stream
* given a set of split points (items).
*
* <p>If the view is empty this throws std::runtime_error.
*
* @param split_points an array of <i>m</i> unique, monotonically increasing items
* that divide the input domain into <i>m+1</i> consecutive disjoint intervals (bins).
*
* @param size the number of split points in the array
*
* @param inclusive if true the rank of an item includes its own weight, and therefore
* if the sketch contains items equal to a slit point, then in PMF such items are
* included into the interval to the left of split point. Otherwise they are included into the interval
* to the right of split point.
*
* @return an array of m+1 doubles each of which is an approximation
* to the fraction of the input stream items (the mass) that fall into one of those intervals.
*/
vector_double get_PMF(const T* split_points, uint32_t size, bool inclusive = true) const;
private:
Comparator comparator_;
uint64_t total_weight_;
Container entries_;
static inline const T& deref_helper(const T* t) { return *t; }
static inline T deref_helper(T t) { return t; }
struct compare_pairs_by_first {
explicit compare_pairs_by_first(const Comparator& comparator): comparator_(comparator) {}
bool operator()(const Entry& a, const Entry& b) const {
return comparator_(deref_helper(a.first), deref_helper(b.first));
}
Comparator comparator_;
};
struct compare_pairs_by_second {
bool operator()(const Entry& a, const Entry& b) const {
return a.second < b.second;
}
};
template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
static inline T ref_helper(const T& t) { return t; }
template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
static inline const T* ref_helper(const T& t) { return std::addressof(t); }
template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
static inline Entry make_dummy_entry(uint64_t weight) { return Entry(0, weight); }
template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
static inline Entry make_dummy_entry(uint64_t weight) { return Entry(nullptr, weight); }
template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0>
static inline void check_split_points(const T* items, uint32_t size) {
for (uint32_t i = 0; i < size ; i++) {
if (std::isnan(items[i])) {
throw std::invalid_argument("Values must not be NaN");
}
if ((i < (size - 1)) && !(Comparator()(items[i], items[i + 1]))) {
throw std::invalid_argument("Values must be unique and monotonically increasing");
}
}
}
template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0>
static inline void check_split_points(const T* items, uint32_t size) {
for (uint32_t i = 0; i < size ; i++) {
if ((i < (size - 1)) && !(Comparator()(items[i], items[i + 1]))) {
throw std::invalid_argument("Items must be unique and monotonically increasing");
}
}
}
};
template<typename T, typename C, typename A>
class quantiles_sorted_view<T, C, A>::const_iterator: public quantiles_sorted_view<T, C, A>::Container::const_iterator {
public:
using Base = typename quantiles_sorted_view<T, C, A>::Container::const_iterator;
using value_type = typename std::conditional<std::is_arithmetic<T>::value, typename Base::value_type, std::pair<const T&, const uint64_t>>::type;
template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
const value_type operator*() const { return Base::operator*(); }
template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
const value_type operator*() const { return value_type(*(Base::operator*().first), Base::operator*().second); }
template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
const value_type* operator->() const { return Base::operator->(); }
template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
const return_value_holder<value_type> operator->() const { return **this; }
uint64_t get_weight() const {
if (*this == begin) return Base::operator*().second;
return Base::operator*().second - (*this - 1).operator*().second;
}
uint64_t get_cumulative_weight(bool inclusive = true) const {
return inclusive ? Base::operator*().second : Base::operator*().second - get_weight();
}
private:
Base begin;
friend class quantiles_sorted_view<T, C, A>;
const_iterator(const Base& it, const Base& begin): Base(it), begin(begin) {}
};
} /* namespace datasketches */
#include "quantiles_sorted_view_impl.hpp"
#endif