req/include/req_sketch_impl.hpp - datasketches-cpp - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 #ifndef REQ_SKETCH_IMPL_HPP_
 #define REQ_SKETCH_IMPL_HPP_

 #include <stdexcept>
 #include <cmath>
 #include <sstream>
 #include <algorithm>

 #include "count_zeros.hpp"

 namespace datasketches {

 template<typename T, bool H, typename C, typename A>
 req_compactor<T, H, C, A>::req_compactor(uint8_t lg_weight, uint32_t section_size, const A& allocator):
 lg_weight_(lg_weight),
 coin_(false),
 sorted_(true),
 section_size_raw_(section_size),
 section_size_(section_size),
 num_sections_(req_constants::INIT_NUM_SECTIONS),
 num_compactions_(0),
 state_(0),
 items_(allocator)
 {}

 template<typename T, bool H, typename C, typename A>
 bool req_compactor<T, H, C, A>::is_sorted() const {
   return sorted_;
 }

 template<typename T, bool H, typename C, typename A>
 uint32_t req_compactor<T, H, C, A>::get_num_items() const {
   return items_.size();
 }

 template<typename T, bool H, typename C, typename A>
 uint32_t req_compactor<T, H, C, A>::get_nom_capacity() const {
   return 2 * num_sections_ * section_size_;
 }

 //template<typename T, bool H, typename C, typename A>
 //uint8_t req_compactor<T, H, C, A>::get_lg_weight() const {
 //  return lg_weight_;
 //}

 template<typename T, bool H, typename C, typename A>
 template<bool inclusive>
 uint64_t req_compactor<T, H, C, A>::compute_weight(const T& item) const {
   if (!sorted_) const_cast<req_compactor*>(this)->sort(); // allow sorting as a side effect
   auto it = inclusive ?
       std::upper_bound(items_.begin(), items_.end(), item, C()) :
       std::lower_bound(items_.begin(), items_.end(), item, C());
   return std::distance(items_.begin(), it) << lg_weight_;
 }

 template<typename T, bool H, typename C, typename A>
 template<typename FwdT>
 void req_compactor<T, H, C, A>::append(FwdT&& item) {
   items_.push_back(std::forward<FwdT>(item));
   sorted_ = false;
 }

 template<typename T, bool H, typename C, typename A>
 void req_compactor<T, H, C, A>::sort() {
   std::sort(items_.begin(), items_.end(), C());
   sorted_ = true;
 }

 template<typename T, bool H, typename C, typename A>
 void req_compactor<T, H, C, A>::merge_sort_in(std::vector<T, A>&& items) {
   if (!sorted_) throw std::logic_error("compactor must be sorted at this point");
   if (items_.capacity() < items_.size() + items.size()) items_.reserve(items_.size() + items.size());
   auto middle = items_.end();
   std::move(items.begin(), items.end(), std::back_inserter(items_));
   std::inplace_merge(items_.begin(), middle, items_.end());
 }

 template<typename T, bool H, typename C, typename A>
 std::vector<T, A> req_compactor<T, H, C, A>::compact() {
   // choose a part of the buffer to compact
   const uint32_t secs_to_compact = std::min(static_cast<uint32_t>(count_trailing_zeros_in_u32(~state_) + 1), num_sections_);
   const size_t compaction_range = compute_compaction_range(secs_to_compact);
   const uint32_t compact_from = compaction_range & 0xFFFFFFFFLL; // low 32
   const uint32_t compact_to = compaction_range >> 32; // high 32
   if (compact_to - compact_from < 2) throw std::logic_error("compaction range error");

   if ((num_compactions_ & 1) == 1) { coin_ = !coin_; } // for odd flip coin;
   else { coin_ = req_random_bit(); } // random coin flip

   auto promote = get_evens_or_odds(items_.begin() + compact_from, items_.begin() + compact_to, coin_);
   items_.erase(items_.begin() + compact_from, items_.begin() + compact_to);

   ++num_compactions_;
   ++state_;
   ensure_enough_sections();
   return promote;
 }

 template<typename T, bool H, typename C, typename A>
 bool req_compactor<T, H, C, A>::ensure_enough_sections() {
   const double ssr = section_size_raw_ / sqrt(2);
   const uint32_t ne = nearest_even(ssr);
   if (num_compactions_ >= 1 << (num_sections_ - 1) && ne >= req_constants::MIN_K) {
     section_size_raw_ = ssr;
     section_size_ = ne;
     num_sections_ <<= 1;
     //ensure_capacity(2 * get_nom_capacity());
     return true;
   }
   return false;
 }

 template<typename T, bool H, typename C, typename A>
 size_t req_compactor<T, H, C, A>::compute_compaction_range(uint32_t secs_to_compact) const {
   const uint32_t num_items = items_.size();
   uint32_t non_compact = get_nom_capacity() / 2 + (num_sections_ - secs_to_compact) * section_size_;
   // make compacted region even
   if ((num_items - non_compact & 1) == 1) ++non_compact;
   const size_t low = H ? 0 : non_compact;
   const size_t high = H ? num_items - non_compact : num_items;
   return (high << 32) + low;
 }

 template<typename T, bool H, typename C, typename A>
 uint32_t req_compactor<T, H, C, A>::nearest_even(double value) {
   return static_cast<uint32_t>(round(value / 2)) << 1;
 }

 template<typename T, bool H, typename C, typename A>
 template<typename Iter>
 std::vector<T, A> req_compactor<T, H, C, A>::get_evens_or_odds(Iter from, Iter to, bool odds) {
   std::vector<T, A> result;
   if (from == to) return result;
   Iter i = from;
   if (odds) ++i;
   while (i != to) {
     result.push_back(*i);
     ++i;
     if (i == to) break;
     ++i;
   }
   return result;
 }

 // sketch

 template<typename T, bool H, typename C, typename S, typename A>
 req_sketch<T, H, C, S, A>::req_sketch(uint32_t k, const A& allocator):
 allocator_(allocator),
 k_(k),
 max_nom_size_(0),
 num_retained_(0),
 n_(0)
 {
   grow();
 }

 template<typename T, bool H, typename C, typename S, typename A>
 bool req_sketch<T, H, C, S, A>::is_empty() const {
   return n_ == 0;
 }

 template<typename T, bool H, typename C, typename S, typename A>
 uint64_t req_sketch<T, H, C, S, A>::get_n() const {
   return n_;
 }

 template<typename T, bool H, typename C, typename S, typename A>
 uint32_t req_sketch<T, H, C, S, A>::get_num_retained() const {
   return num_retained_;
 }

 template<typename T, bool H, typename C, typename S, typename A>
 bool req_sketch<T, H, C, S, A>::is_estimation_mode() const {
   return compactors_.size() > 1;
 }

 template<typename T, bool H, typename C, typename S, typename A>
 template<typename FwdT>
 void req_sketch<T, H, C, S, A>::update(FwdT&& item) {
 //  if (Float.isNaN(item)) { return; }
 //  if (isEmpty()) {
 //    minValue = item;
 //    maxValue = item;
 //  } else {
 //    if (item < minValue) { minValue = item; }
 //    if (item > maxValue) { maxValue = item; }
 //  }
   compactors_[0].append(item);
   ++num_retained_;
   ++n_;
   if (num_retained_ == max_nom_size_) {
     compactors_[0].sort();
     compress();
   }
   //  aux = null;
 }


 template<typename T, bool H, typename C, typename S, typename A>
 template<bool inclusive>
 double req_sketch<T, H, C, S, A>::get_rank(const T& item) const {
   uint64_t weight = 0;
   for (const auto& compactor: compactors_) {
     weight += compactor.template compute_weight<inclusive>(item);
   }
   return static_cast<double>(weight) / n_;
 }

 template<typename T, bool H, typename C, typename S, typename A>
 void req_sketch<T, H, C, S, A>::grow() {
   const uint8_t lg_weight = get_num_levels();
   compactors_.push_back(Compactor(lg_weight, k_, allocator_));
   update_max_nom_size();
 }

 template<typename T, bool H, typename C, typename S, typename A>
 uint8_t req_sketch<T, H, C, S, A>::get_num_levels() const {
   return compactors_.size();
 }

 template<typename T, bool H, typename C, typename S, typename A>
 void req_sketch<T, H, C, S, A>::update_max_nom_size() {
   max_nom_size_ = 0;
   for (const auto& compactor: compactors_) max_nom_size_ += compactor.get_nom_capacity();
 }

 template<typename T, bool H, typename C, typename S, typename A>
 void req_sketch<T, H, C, S, A>::update_num_retained() {
   num_retained_ = 0;
   for (const auto& compactor: compactors_) num_retained_ += compactor.get_num_items();
 }

 template<typename T, bool H, typename C, typename S, typename A>
 void req_sketch<T, H, C, S, A>::compress() {
   for (size_t h = 0; h < compactors_.size(); ++h) {
     if (compactors_[h].get_num_items() >= compactors_[h].get_nom_capacity()) {
       if (h + 1 >= get_num_levels()) { // at the top?
         grow(); // add a level, increases max_nom_size
       }
       auto promoted = compactors_[h].compact();
       compactors_[h + 1].merge_sort_in(std::move(promoted));
       update_num_retained();
       if (num_retained_ < max_nom_size_) break;
     }
   }
   update_max_nom_size();
   // aux = null;
 }

 template<typename T, bool H, typename C, typename S, typename A>
 string<A> req_sketch<T, H, C, S, A>::to_string(bool print_levels, bool print_items) const {
   std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
   os << "### REQ sketch summary:" << std::endl;
   os << "   K              : " << k_ << std::endl;
   os << "   N              : " << n_ << std::endl;
   os << "   Empty          : " << (is_empty() ? "true" : "false") << std::endl;
   os << "   Estimation mode: " << (is_estimation_mode() ? "true" : "false") << std::endl;
   os << "   Levels         : " << compactors_.size() << std::endl;
   os << "   Sorted         : " << (compactors_[0].is_sorted() ? "true" : "false") << std::endl;
   os << "   Capacity items : " << max_nom_size_ << std::endl;
   os << "   Retained items : " << num_retained_ << std::endl;
 //  os << "   Storage bytes  : " << get_serialized_size_bytes() << std::endl;
 //  if (!is_empty()) {
 //    os << "   Min value      : " << *min_value_ << std::endl;
 //    os << "   Max value      : " << *max_value_ << std::endl;
 //  }
   os << "### End sketch summary" << std::endl;

   if (print_levels) {
     os << "### REQ sketch levels:" << std::endl;
     os << "   index: nominal capacity, actual size" << std::endl;
     for (uint8_t i = 0; i < compactors_.size(); i++) {
       os << "   " << (unsigned int) i << ": "
         << compactors_[i].get_nom_capacity() << ", "
         << compactors_[i].get_num_items() << std::endl;
     }
     os << "### End sketch levels" << std::endl;
   }

   if (print_items) {
     os << "### REQ sketch data:" << std::endl;
     os << "### End sketch data" << std::endl;
   }
   return os.str();
 }

 } /* namespace datasketches */

 #endif
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	#ifndef REQ_SKETCH_IMPL_HPP_
	#define REQ_SKETCH_IMPL_HPP_

	#include <stdexcept>
	#include <cmath>
	#include <sstream>
	#include <algorithm>

	#include "count_zeros.hpp"

	namespace datasketches {

	template<typename T, bool H, typename C, typename A>
	req_compactor<T, H, C, A>::req_compactor(uint8_t lg_weight, uint32_t section_size, const A& allocator):
	lg_weight_(lg_weight),
	coin_(false),
	sorted_(true),
	section_size_raw_(section_size),
	section_size_(section_size),
	num_sections_(req_constants::INIT_NUM_SECTIONS),
	num_compactions_(0),
	state_(0),
	items_(allocator)
	{}

	template<typename T, bool H, typename C, typename A>
	bool req_compactor<T, H, C, A>::is_sorted() const {
	return sorted_;
	}

	template<typename T, bool H, typename C, typename A>
	uint32_t req_compactor<T, H, C, A>::get_num_items() const {
	return items_.size();
	}

	template<typename T, bool H, typename C, typename A>
	uint32_t req_compactor<T, H, C, A>::get_nom_capacity() const {
	return 2 * num_sections_ * section_size_;
	}

	//template<typename T, bool H, typename C, typename A>
	//uint8_t req_compactor<T, H, C, A>::get_lg_weight() const {
	// return lg_weight_;
	//}

	template<typename T, bool H, typename C, typename A>
	template<bool inclusive>
	uint64_t req_compactor<T, H, C, A>::compute_weight(const T& item) const {
	if (!sorted_) const_cast<req_compactor*>(this)->sort(); // allow sorting as a side effect
	auto it = inclusive ?
	std::upper_bound(items_.begin(), items_.end(), item, C()) :
	std::lower_bound(items_.begin(), items_.end(), item, C());
	return std::distance(items_.begin(), it) << lg_weight_;
	}

	template<typename T, bool H, typename C, typename A>
	template<typename FwdT>
	void req_compactor<T, H, C, A>::append(FwdT&& item) {
	items_.push_back(std::forward<FwdT>(item));
	sorted_ = false;
	}

	template<typename T, bool H, typename C, typename A>
	void req_compactor<T, H, C, A>::sort() {
	std::sort(items_.begin(), items_.end(), C());
	sorted_ = true;
	}

	template<typename T, bool H, typename C, typename A>
	void req_compactor<T, H, C, A>::merge_sort_in(std::vector<T, A>&& items) {
	if (!sorted_) throw std::logic_error("compactor must be sorted at this point");
	if (items_.capacity() < items_.size() + items.size()) items_.reserve(items_.size() + items.size());
	auto middle = items_.end();
	std::move(items.begin(), items.end(), std::back_inserter(items_));
	std::inplace_merge(items_.begin(), middle, items_.end());
	}

	template<typename T, bool H, typename C, typename A>
	std::vector<T, A> req_compactor<T, H, C, A>::compact() {
	// choose a part of the buffer to compact
	const uint32_t secs_to_compact = std::min(static_cast<uint32_t>(count_trailing_zeros_in_u32(~state_) + 1), num_sections_);
	const size_t compaction_range = compute_compaction_range(secs_to_compact);
	const uint32_t compact_from = compaction_range & 0xFFFFFFFFLL; // low 32
	const uint32_t compact_to = compaction_range >> 32; // high 32
	if (compact_to - compact_from < 2) throw std::logic_error("compaction range error");

	if ((num_compactions_ & 1) == 1) { coin_ = !coin_; } // for odd flip coin;
	else { coin_ = req_random_bit(); } // random coin flip

	auto promote = get_evens_or_odds(items_.begin() + compact_from, items_.begin() + compact_to, coin_);
	items_.erase(items_.begin() + compact_from, items_.begin() + compact_to);

	++num_compactions_;
	++state_;
	ensure_enough_sections();
	return promote;
	}

	template<typename T, bool H, typename C, typename A>
	bool req_compactor<T, H, C, A>::ensure_enough_sections() {
	const double ssr = section_size_raw_ / sqrt(2);
	const uint32_t ne = nearest_even(ssr);
	if (num_compactions_ >= 1 << (num_sections_ - 1) && ne >= req_constants::MIN_K) {
	section_size_raw_ = ssr;
	section_size_ = ne;
	num_sections_ <<= 1;
	//ensure_capacity(2 * get_nom_capacity());
	return true;
	}
	return false;
	}

	template<typename T, bool H, typename C, typename A>
	size_t req_compactor<T, H, C, A>::compute_compaction_range(uint32_t secs_to_compact) const {
	const uint32_t num_items = items_.size();
	uint32_t non_compact = get_nom_capacity() / 2 + (num_sections_ - secs_to_compact) * section_size_;
	// make compacted region even
	if ((num_items - non_compact & 1) == 1) ++non_compact;
	const size_t low = H ? 0 : non_compact;
	const size_t high = H ? num_items - non_compact : num_items;
	return (high << 32) + low;
	}

	template<typename T, bool H, typename C, typename A>
	uint32_t req_compactor<T, H, C, A>::nearest_even(double value) {
	return static_cast<uint32_t>(round(value / 2)) << 1;
	}

	template<typename T, bool H, typename C, typename A>
	template<typename Iter>
	std::vector<T, A> req_compactor<T, H, C, A>::get_evens_or_odds(Iter from, Iter to, bool odds) {
	std::vector<T, A> result;
	if (from == to) return result;
	Iter i = from;
	if (odds) ++i;
	while (i != to) {
	result.push_back(*i);
	++i;
	if (i == to) break;
	++i;
	}
	return result;
	}

	// sketch

	template<typename T, bool H, typename C, typename S, typename A>
	req_sketch<T, H, C, S, A>::req_sketch(uint32_t k, const A& allocator):
	allocator_(allocator),
	k_(k),
	max_nom_size_(0),
	num_retained_(0),
	n_(0)
	{
	grow();
	}

	template<typename T, bool H, typename C, typename S, typename A>
	bool req_sketch<T, H, C, S, A>::is_empty() const {
	return n_ == 0;
	}

	template<typename T, bool H, typename C, typename S, typename A>
	uint64_t req_sketch<T, H, C, S, A>::get_n() const {
	return n_;
	}

	template<typename T, bool H, typename C, typename S, typename A>
	uint32_t req_sketch<T, H, C, S, A>::get_num_retained() const {
	return num_retained_;
	}

	template<typename T, bool H, typename C, typename S, typename A>
	bool req_sketch<T, H, C, S, A>::is_estimation_mode() const {
	return compactors_.size() > 1;
	}

	template<typename T, bool H, typename C, typename S, typename A>
	template<typename FwdT>
	void req_sketch<T, H, C, S, A>::update(FwdT&& item) {
	// if (Float.isNaN(item)) { return; }
	// if (isEmpty()) {
	// minValue = item;
	// maxValue = item;
	// } else {
	// if (item < minValue) { minValue = item; }
	// if (item > maxValue) { maxValue = item; }
	// }
	compactors_[0].append(item);
	++num_retained_;
	++n_;
	if (num_retained_ == max_nom_size_) {
	compactors_[0].sort();
	compress();
	}
	// aux = null;
	}


	template<typename T, bool H, typename C, typename S, typename A>
	template<bool inclusive>
	double req_sketch<T, H, C, S, A>::get_rank(const T& item) const {
	uint64_t weight = 0;
	for (const auto& compactor: compactors_) {
	weight += compactor.template compute_weight<inclusive>(item);
	}
	return static_cast<double>(weight) / n_;
	}

	template<typename T, bool H, typename C, typename S, typename A>
	void req_sketch<T, H, C, S, A>::grow() {
	const uint8_t lg_weight = get_num_levels();
	compactors_.push_back(Compactor(lg_weight, k_, allocator_));
	update_max_nom_size();
	}

	template<typename T, bool H, typename C, typename S, typename A>
	uint8_t req_sketch<T, H, C, S, A>::get_num_levels() const {
	return compactors_.size();
	}

	template<typename T, bool H, typename C, typename S, typename A>
	void req_sketch<T, H, C, S, A>::update_max_nom_size() {
	max_nom_size_ = 0;
	for (const auto& compactor: compactors_) max_nom_size_ += compactor.get_nom_capacity();
	}

	template<typename T, bool H, typename C, typename S, typename A>
	void req_sketch<T, H, C, S, A>::update_num_retained() {
	num_retained_ = 0;
	for (const auto& compactor: compactors_) num_retained_ += compactor.get_num_items();
	}

	template<typename T, bool H, typename C, typename S, typename A>
	void req_sketch<T, H, C, S, A>::compress() {
	for (size_t h = 0; h < compactors_.size(); ++h) {
	if (compactors_[h].get_num_items() >= compactors_[h].get_nom_capacity()) {
	if (h + 1 >= get_num_levels()) { // at the top?
	grow(); // add a level, increases max_nom_size
	}
	auto promoted = compactors_[h].compact();
	compactors_[h + 1].merge_sort_in(std::move(promoted));
	update_num_retained();
	if (num_retained_ < max_nom_size_) break;
	}
	}
	update_max_nom_size();
	// aux = null;
	}

	template<typename T, bool H, typename C, typename S, typename A>
	string<A> req_sketch<T, H, C, S, A>::to_string(bool print_levels, bool print_items) const {
	std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
	os << "### REQ sketch summary:" << std::endl;
	os << " K : " << k_ << std::endl;
	os << " N : " << n_ << std::endl;
	os << " Empty : " << (is_empty() ? "true" : "false") << std::endl;
	os << " Estimation mode: " << (is_estimation_mode() ? "true" : "false") << std::endl;
	os << " Levels : " << compactors_.size() << std::endl;
	os << " Sorted : " << (compactors_[0].is_sorted() ? "true" : "false") << std::endl;
	os << " Capacity items : " << max_nom_size_ << std::endl;
	os << " Retained items : " << num_retained_ << std::endl;
	// os << " Storage bytes : " << get_serialized_size_bytes() << std::endl;
	// if (!is_empty()) {
	// os << " Min value : " << *min_value_ << std::endl;
	// os << " Max value : " << *max_value_ << std::endl;
	// }
	os << "### End sketch summary" << std::endl;

	if (print_levels) {
	os << "### REQ sketch levels:" << std::endl;
	os << " index: nominal capacity, actual size" << std::endl;
	for (uint8_t i = 0; i < compactors_.size(); i++) {
	os << " " << (unsigned int) i << ": "
	<< compactors_[i].get_nom_capacity() << ", "
	<< compactors_[i].get_num_items() << std::endl;
	}
	os << "### End sketch levels" << std::endl;
	}

	if (print_items) {
	os << "### REQ sketch data:" << std::endl;
	os << "### End sketch data" << std::endl;
	}
	return os.str();
	}

	} /* namespace datasketches */

	#endif