Merge branch 'master' into theta_tuple_common_base
diff --git a/theta/CMakeLists.txt b/theta/CMakeLists.txt
index a68b70b..7ba60bc 100644
--- a/theta/CMakeLists.txt
+++ b/theta/CMakeLists.txt
@@ -33,9 +33,21 @@
target_compile_features(theta INTERFACE cxx_std_11)
set(theta_HEADERS "")
-list(APPEND theta_HEADERS "include/theta_sketch.hpp;include/theta_union.hpp;include/theta_intersection.hpp")
-list(APPEND theta_HEADERS "include/theta_a_not_b.hpp;include/theta_sketch_impl.hpp")
-list(APPEND theta_HEADERS "include/theta_union_impl.hpp;include/theta_intersection_impl.hpp;include/theta_a_not_b_impl.hpp")
+list(APPEND theta_HEADERS "include/theta_sketch.hpp;include/theta_sketch_impl.hpp")
+list(APPEND theta_HEADERS "include/theta_union.hpp;include/theta_union_impl.hpp")
+list(APPEND theta_HEADERS "include/theta_intersection.hpp;include/theta_intersection_impl.hpp")
+list(APPEND theta_HEADERS "include/theta_a_not_b.hpp;include/theta_a_not_b_impl.hpp")
+list(APPEND tuple_HEADERS "include/theta_jaccard_similarity.hpp")
+list(APPEND tuple_HEADERS "include/theta_comparators.hpp")
+list(APPEND tuple_HEADERS "include/theta_constants.hpp")
+list(APPEND tuple_HEADERS "include/theta_helpers.hpp")
+list(APPEND tuple_HEADERS "include/theta_update_sketch_base.hpp;include/theta_update_sketch_base_impl.hpp")
+list(APPEND tuple_HEADERS "include/theta_union_base.hpp;include/theta_union_base_impl.hpp")
+list(APPEND tuple_HEADERS "include/theta_intersection_base.hpp;include/theta_intersection_base_impl.hpp")
+list(APPEND tuple_HEADERS "include/theta_set_difference_base.hpp;include/theta_set_difference_base_impl.hpp")
+list(APPEND tuple_HEADERS "include/theta_jaccard_similarity_base.hpp")
+list(APPEND tuple_HEADERS "include/bounds_on_ratios_in_sampled_sets.hpp")
+list(APPEND tuple_HEADERS "include/bounds_on_ratios_in_theta_sketched_sets.hpp")
install(TARGETS theta
EXPORT ${PROJECT_NAME}
@@ -54,4 +66,19 @@
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_impl.hpp
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_impl.hpp
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_a_not_b_impl.hpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_jaccard_similarity.hpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_comparators.hpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_constants.hpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_helpers.hpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_update_sketch_base.hpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_update_sketch_base_impl.hpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_base.hpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_base_impl.hpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_base.hpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_base_impl.hpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_set_difference_base.hpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_set_difference_base_impl.hpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_jaccard_similarity_base.hpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/bounds_on_ratios_in_sampled_sets.hpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/bounds_on_ratios_in_theta_sketched_sets.hpp
)
diff --git a/tuple/include/bounds_on_ratios_in_sampled_sets.hpp b/theta/include/bounds_on_ratios_in_sampled_sets.hpp
similarity index 98%
rename from tuple/include/bounds_on_ratios_in_sampled_sets.hpp
rename to theta/include/bounds_on_ratios_in_sampled_sets.hpp
index 39accd5..e2c5433 100644
--- a/tuple/include/bounds_on_ratios_in_sampled_sets.hpp
+++ b/theta/include/bounds_on_ratios_in_sampled_sets.hpp
@@ -21,8 +21,9 @@
#define BOUNDS_ON_RATIOS_IN_SAMPLED_SETS_HPP_
#include <cstdint>
+#include <string>
-#include <bounds_binomial_proportions.hpp>
+#include "bounds_binomial_proportions.hpp"
namespace datasketches {
diff --git a/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp b/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp
similarity index 98%
rename from tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp
rename to theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp
index 3cf9cb4..1779ec1 100644
--- a/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp
+++ b/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp
@@ -23,7 +23,7 @@
#include <cstdint>
#include <stdexcept>
-#include <bounds_on_ratios_in_sampled_sets.hpp>
+#include "bounds_on_ratios_in_sampled_sets.hpp"
namespace datasketches {
diff --git a/theta/include/theta_a_not_b.hpp b/theta/include/theta_a_not_b.hpp
index db66ac7..4beef60 100644
--- a/theta/include/theta_a_not_b.hpp
+++ b/theta/include/theta_a_not_b.hpp
@@ -20,51 +20,34 @@
#ifndef THETA_A_NOT_B_HPP_
#define THETA_A_NOT_B_HPP_
-#include <memory>
-#include <functional>
-#include <climits>
-
#include "theta_sketch.hpp"
-#include "common_defs.hpp"
+#include "theta_set_difference_base.hpp"
namespace datasketches {
-/*
- * author Alexander Saydakov
- * author Lee Rhodes
- * author Kevin Lang
- */
-
-template<typename A>
+template<typename Allocator = std::allocator<uint64_t>>
class theta_a_not_b_alloc {
public:
- /**
- * Creates an instance of the a-not-b operation (set difference) with a given has seed.
- * @param seed hash seed
- */
- explicit theta_a_not_b_alloc(uint64_t seed = DEFAULT_SEED);
+ using Entry = uint64_t;
+ using ExtractKey = trivial_extract_key;
+ using CompactSketch = compact_theta_sketch_alloc<Allocator>;
+ using State = theta_set_difference_base<Entry, ExtractKey, CompactSketch, Allocator>;
+
+ explicit theta_a_not_b_alloc(uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
/**
* Computes the a-not-b set operation given two sketches.
* @return the result of a-not-b
*/
- compact_theta_sketch_alloc<A> compute(const theta_sketch_alloc<A>& a, const theta_sketch_alloc<A>& b, bool ordered = true) const;
+ template<typename FwdSketch, typename Sketch>
+ CompactSketch compute(FwdSketch&& a, const Sketch& b, bool ordered = true) const;
private:
- typedef typename std::allocator_traits<A>::template rebind_alloc<uint64_t> AllocU64;
- uint16_t seed_hash_;
-
- class less_than {
- public:
- explicit less_than(uint64_t value): value(value) {}
- bool operator()(uint64_t value) const { return value < this->value; }
- private:
- uint64_t value;
- };
+ State state_;
};
// alias with default allocator for convenience
-typedef theta_a_not_b_alloc<std::allocator<void>> theta_a_not_b;
+using theta_a_not_b = theta_a_not_b_alloc<std::allocator<uint64_t>>;
} /* namespace datasketches */
diff --git a/theta/include/theta_a_not_b_impl.hpp b/theta/include/theta_a_not_b_impl.hpp
index 4343ee3..4c17bbf 100644
--- a/theta/include/theta_a_not_b_impl.hpp
+++ b/theta/include/theta_a_not_b_impl.hpp
@@ -26,56 +26,15 @@
namespace datasketches {
-/*
- * author Alexander Saydakov
- * author Lee Rhodes
- * author Kevin Lang
- */
-
template<typename A>
-theta_a_not_b_alloc<A>::theta_a_not_b_alloc(uint64_t seed):
-seed_hash_(theta_sketch_alloc<A>::get_seed_hash(seed))
+theta_a_not_b_alloc<A>::theta_a_not_b_alloc(uint64_t seed, const A& allocator):
+state_(seed, allocator)
{}
template<typename A>
-compact_theta_sketch_alloc<A> theta_a_not_b_alloc<A>::compute(const theta_sketch_alloc<A>& a, const theta_sketch_alloc<A>& b, bool ordered) const {
- if (a.is_empty() || a.get_num_retained() == 0 || b.is_empty()) return compact_theta_sketch_alloc<A>(a, ordered);
- if (a.get_seed_hash() != seed_hash_) throw std::invalid_argument("A seed hash mismatch");
- if (b.get_seed_hash() != seed_hash_) throw std::invalid_argument("B seed hash mismatch");
-
- const uint64_t theta = std::min(a.get_theta64(), b.get_theta64());
- vector_u64<A> keys;
- bool is_empty = a.is_empty();
-
- if (b.get_num_retained() == 0) {
- std::copy_if(a.begin(), a.end(), std::back_inserter(keys), less_than(theta));
- } else {
- if (a.is_ordered() && b.is_ordered()) { // sort-based
- std::set_difference(a.begin(), a.end(), b.begin(), b.end(), conditional_back_inserter(keys, less_than(theta)));
- } else { // hash-based
- const uint8_t lg_size = lg_size_from_count(b.get_num_retained(), update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
- vector_u64<A> b_hash_table(1 << lg_size, 0);
- for (auto key: b) {
- if (key < theta) {
- update_theta_sketch_alloc<A>::hash_search_or_insert(key, b_hash_table.data(), lg_size);
- } else if (b.is_ordered()) {
- break; // early stop
- }
- }
-
- // scan A lookup B
- for (auto key: a) {
- if (key < theta) {
- if (!update_theta_sketch_alloc<A>::hash_search(key, b_hash_table.data(), lg_size)) keys.push_back(key);
- } else if (a.is_ordered()) {
- break; // early stop
- }
- }
- }
- }
- if (keys.empty() && theta == theta_sketch_alloc<A>::MAX_THETA) is_empty = true;
- if (ordered && !a.is_ordered()) std::sort(keys.begin(), keys.end());
- return compact_theta_sketch_alloc<A>(is_empty, theta, std::move(keys), seed_hash_, a.is_ordered() || ordered);
+template<typename FwdSketch, typename Sketch>
+auto theta_a_not_b_alloc<A>::compute(FwdSketch&& a, const Sketch& b, bool ordered) const -> CompactSketch {
+ return state_.compute(std::forward<FwdSketch>(a), b, ordered);
}
} /* namespace datasketches */
diff --git a/tuple/include/theta_comparators.hpp b/theta/include/theta_comparators.hpp
similarity index 100%
rename from tuple/include/theta_comparators.hpp
rename to theta/include/theta_comparators.hpp
diff --git a/tuple/include/theta_constants.hpp b/theta/include/theta_constants.hpp
similarity index 97%
rename from tuple/include/theta_constants.hpp
rename to theta/include/theta_constants.hpp
index 989681f..d5d6fd9 100644
--- a/tuple/include/theta_constants.hpp
+++ b/theta/include/theta_constants.hpp
@@ -20,6 +20,8 @@
#ifndef THETA_CONSTANTS_HPP_
#define THETA_CONSTANTS_HPP_
+#include <climits>
+
namespace datasketches {
namespace theta_constants {
diff --git a/tuple/include/theta_helpers.hpp b/theta/include/theta_helpers.hpp
similarity index 100%
rename from tuple/include/theta_helpers.hpp
rename to theta/include/theta_helpers.hpp
diff --git a/theta/include/theta_intersection.hpp b/theta/include/theta_intersection.hpp
index 5945c52..98a8bf1 100644
--- a/theta/include/theta_intersection.hpp
+++ b/theta/include/theta_intersection.hpp
@@ -20,29 +20,28 @@
#ifndef THETA_INTERSECTION_HPP_
#define THETA_INTERSECTION_HPP_
-#include <memory>
-#include <functional>
-#include <climits>
-
#include "theta_sketch.hpp"
-#include "common_defs.hpp"
+#include "theta_intersection_base.hpp"
namespace datasketches {
-/*
- * author Alexander Saydakov
- * author Lee Rhodes
- * author Kevin Lang
- */
-
-template<typename A>
+template<typename Allocator = std::allocator<uint64_t>>
class theta_intersection_alloc {
public:
- /**
- * Creates an instance of the intersection with a given hash seed.
- * @param seed hash seed
- */
- explicit theta_intersection_alloc(uint64_t seed = DEFAULT_SEED);
+ using Entry = uint64_t;
+ using ExtractKey = trivial_extract_key;
+ using Sketch = theta_sketch_alloc<Allocator>;
+ using CompactSketch = compact_theta_sketch_alloc<Allocator>;
+
+ struct pass_through_policy {
+ uint64_t operator()(uint64_t internal_entry, uint64_t incoming_entry) const {
+ unused(incoming_entry);
+ return internal_entry;
+ }
+ };
+ using State = theta_intersection_base<Entry, ExtractKey, pass_through_policy, Sketch, CompactSketch, Allocator>;
+
+ explicit theta_intersection_alloc(uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
/**
* Updates the intersection with a given sketch.
@@ -50,7 +49,8 @@
* can reduce the current set to leave the overlapping subset only.
* @param sketch represents input set for the intersection
*/
- void update(const theta_sketch_alloc<A>& sketch);
+ template<typename FwdSketch>
+ void update(FwdSketch&& sketch);
/**
* Produces a copy of the current state of the intersection.
@@ -59,7 +59,7 @@
* @param ordered optional flag to specify if ordered sketch should be produced
* @return the result of the intersection
*/
- compact_theta_sketch_alloc<A> get_result(bool ordered = true) const;
+ CompactSketch get_result(bool ordered = true) const;
/**
* Returns true if the state of the intersection is defined (not infinite "universe").
@@ -68,21 +68,14 @@
bool has_result() const;
private:
- typedef typename std::allocator_traits<A>::template rebind_alloc<uint64_t> AllocU64;
- bool is_valid_;
- bool is_empty_;
- uint64_t theta_;
- uint8_t lg_size_;
- vector_u64<A> keys_;
- uint32_t num_keys_;
- uint16_t seed_hash_;
+ State state_;
};
// alias with default allocator for convenience
-typedef theta_intersection_alloc<std::allocator<void>> theta_intersection;
+using theta_intersection = theta_intersection_alloc<std::allocator<uint64_t>>;
} /* namespace datasketches */
#include "theta_intersection_impl.hpp"
-# endif
+#endif
diff --git a/tuple/include/theta_intersection_base.hpp b/theta/include/theta_intersection_base.hpp
similarity index 100%
rename from tuple/include/theta_intersection_base.hpp
rename to theta/include/theta_intersection_base.hpp
diff --git a/tuple/include/theta_intersection_base_impl.hpp b/theta/include/theta_intersection_base_impl.hpp
similarity index 100%
rename from tuple/include/theta_intersection_base_impl.hpp
rename to theta/include/theta_intersection_base_impl.hpp
diff --git a/theta/include/theta_intersection_impl.hpp b/theta/include/theta_intersection_impl.hpp
index d090b3a..a0c4291 100644
--- a/theta/include/theta_intersection_impl.hpp
+++ b/theta/include/theta_intersection_impl.hpp
@@ -20,109 +20,27 @@
#ifndef THETA_INTERSECTION_IMPL_HPP_
#define THETA_INTERSECTION_IMPL_HPP_
-#include <algorithm>
-
namespace datasketches {
-/*
- * author Alexander Saydakov
- * author Lee Rhodes
- * author Kevin Lang
- */
-
template<typename A>
-theta_intersection_alloc<A>::theta_intersection_alloc(uint64_t seed):
-is_valid_(false),
-is_empty_(false),
-theta_(theta_sketch_alloc<A>::MAX_THETA),
-lg_size_(0),
-keys_(),
-num_keys_(0),
-seed_hash_(theta_sketch_alloc<A>::get_seed_hash(seed))
+theta_intersection_alloc<A>::theta_intersection_alloc(uint64_t seed, const A& allocator):
+state_(seed, pass_through_policy(), allocator)
{}
template<typename A>
-void theta_intersection_alloc<A>::update(const theta_sketch_alloc<A>& sketch) {
- if (is_empty_) return;
- if (!sketch.is_empty() && sketch.get_seed_hash() != seed_hash_) throw std::invalid_argument("seed hash mismatch");
- is_empty_ |= sketch.is_empty();
- theta_ = std::min(theta_, sketch.get_theta64());
- if (is_valid_ && num_keys_ == 0) return;
- if (sketch.get_num_retained() == 0) {
- is_valid_ = true;
- if (keys_.size() > 0) {
- keys_.resize(0);
- lg_size_ = 0;
- num_keys_ = 0;
- }
- return;
- }
- if (!is_valid_) { // first update, clone incoming sketch
- is_valid_ = true;
- lg_size_ = lg_size_from_count(sketch.get_num_retained(), update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
- keys_.resize(1 << lg_size_, 0);
- for (auto key: sketch) {
- if (!update_theta_sketch_alloc<A>::hash_search_or_insert(key, keys_.data(), lg_size_)) {
- throw std::invalid_argument("duplicate key, possibly corrupted input sketch");
- }
- ++num_keys_;
- }
- if (num_keys_ != sketch.get_num_retained()) throw std::invalid_argument("num keys mismatch, possibly corrupted input sketch");
- } else { // intersection
- const uint32_t max_matches = std::min(num_keys_, sketch.get_num_retained());
- vector_u64<A> matched_keys(max_matches);
- uint32_t match_count = 0;
- uint32_t count = 0;
- for (auto key: sketch) {
- if (key < theta_) {
- if (update_theta_sketch_alloc<A>::hash_search(key, keys_.data(), lg_size_)) {
- if (match_count == max_matches) throw std::invalid_argument("max matches exceeded, possibly corrupted input sketch");
- matched_keys[match_count++] = key;
- }
- } else if (sketch.is_ordered()) {
- break; // early stop
- }
- ++count;
- }
- if (count > sketch.get_num_retained()) {
- throw std::invalid_argument(" more keys then expected, possibly corrupted input sketch");
- } else if (!sketch.is_ordered() && count < sketch.get_num_retained()) {
- throw std::invalid_argument(" fewer keys then expected, possibly corrupted input sketch");
- }
- if (match_count == 0) {
- keys_.resize(0);
- lg_size_ = 0;
- num_keys_ = 0;
- if (theta_ == theta_sketch_alloc<A>::MAX_THETA) is_empty_ = true;
- } else {
- const uint8_t lg_size = lg_size_from_count(match_count, update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
- if (lg_size != lg_size_) {
- lg_size_ = lg_size;
- keys_.resize(1 << lg_size_);
- }
- std::fill(keys_.begin(), keys_.end(), 0);
- for (uint32_t i = 0; i < match_count; i++) {
- update_theta_sketch_alloc<A>::hash_search_or_insert(matched_keys[i], keys_.data(), lg_size_);
- }
- num_keys_ = match_count;
- }
- }
+template<typename SS>
+void theta_intersection_alloc<A>::update(SS&& sketch) {
+ state_.update(std::forward<SS>(sketch));
}
template<typename A>
-compact_theta_sketch_alloc<A> theta_intersection_alloc<A>::get_result(bool ordered) const {
- if (!is_valid_) throw std::invalid_argument("calling get_result() before calling update() is undefined");
- vector_u64<A> keys(num_keys_);
- if (num_keys_ > 0) {
- std::copy_if(keys_.begin(), keys_.end(), keys.begin(), [](uint64_t key) { return key != 0; });
- if (ordered) std::sort(keys.begin(), keys.end());
- }
- return compact_theta_sketch_alloc<A>(is_empty_, theta_, std::move(keys), seed_hash_, ordered);
+auto theta_intersection_alloc<A>::get_result(bool ordered) const -> CompactSketch {
+ return state_.get_result(ordered);
}
template<typename A>
bool theta_intersection_alloc<A>::has_result() const {
- return is_valid_;
+ return state_.has_result();
}
} /* namespace datasketches */
diff --git a/tuple/include/theta_constants.hpp b/theta/include/theta_jaccard_similarity.hpp
similarity index 61%
copy from tuple/include/theta_constants.hpp
copy to theta/include/theta_jaccard_similarity.hpp
index 989681f..417ed54 100644
--- a/tuple/include/theta_constants.hpp
+++ b/theta/include/theta_jaccard_similarity.hpp
@@ -17,18 +17,21 @@
* under the License.
*/
-#ifndef THETA_CONSTANTS_HPP_
-#define THETA_CONSTANTS_HPP_
+#ifndef THETA_JACCARD_SIMILARITY_HPP_
+#define THETA_JACCARD_SIMILARITY_HPP_
+
+#include "theta_jaccard_similarity_base.hpp"
+#include "theta_union.hpp"
+#include "theta_intersection.hpp"
namespace datasketches {
-namespace theta_constants {
- enum resize_factor { X1, X2, X4, X8 };
- static const uint64_t MAX_THETA = LLONG_MAX; // signed max for compatibility with Java
- static const uint8_t MIN_LG_K = 5;
- static const uint8_t MAX_LG_K = 26;
-}
+template<typename Allocator = std::allocator<uint64_t>>
+using theta_jaccard_similarity_alloc = jaccard_similarity_base<theta_union_alloc<Allocator>, theta_intersection_alloc<Allocator>, trivial_extract_key>;
+
+// alias with default allocator for convenience
+using theta_jaccard_similarity = theta_jaccard_similarity_alloc<std::allocator<uint64_t>>;
} /* namespace datasketches */
-#endif
+# endif
diff --git a/tuple/include/jaccard_similarity.hpp b/theta/include/theta_jaccard_similarity_base.hpp
similarity index 85%
rename from tuple/include/jaccard_similarity.hpp
rename to theta/include/theta_jaccard_similarity_base.hpp
index a77884b..cb18601 100644
--- a/tuple/include/jaccard_similarity.hpp
+++ b/theta/include/theta_jaccard_similarity_base.hpp
@@ -17,19 +17,16 @@
* under the License.
*/
-#ifndef JACCARD_SIMILARITY_BASE_HPP_
-#define JACCARD_SIMILARITY_BASE_HPP_
+#ifndef THETA_JACCARD_SIMILARITY_BASE_HPP_
+#define THETA_JACCARD_SIMILARITY_BASE_HPP_
#include <memory>
#include <array>
-#include <theta_union_experimental.hpp>
-#include <theta_intersection_experimental.hpp>
-#include <tuple_union.hpp>
-#include <tuple_intersection.hpp>
-#include <bounds_on_ratios_in_theta_sketched_sets.hpp>
-#include <ceiling_power_of_2.hpp>
-#include <common_defs.hpp>
+#include "theta_constants.hpp"
+#include "bounds_on_ratios_in_theta_sketched_sets.hpp"
+#include "ceiling_power_of_2.hpp"
+#include "common_defs.hpp"
namespace datasketches {
@@ -154,19 +151,6 @@
};
-template<typename Allocator>
-using theta_jaccard_similarity_alloc = jaccard_similarity_base<theta_union_experimental<Allocator>, theta_intersection_experimental<Allocator>, trivial_extract_key>;
-
-// alias with default allocator for convenience
-using theta_jaccard_similarity = theta_jaccard_similarity_alloc<std::allocator<uint64_t>>;
-
-template<
- typename Summary,
- typename IntersectionPolicy,
- typename UnionPolicy = default_union_policy<Summary>,
- typename Allocator = std::allocator<Summary>>
-using tuple_jaccard_similarity = jaccard_similarity_base<tuple_union<Summary, UnionPolicy, Allocator>, tuple_intersection<Summary, IntersectionPolicy, Allocator>, pair_extract_key<uint64_t, Summary>>;
-
} /* namespace datasketches */
# endif
diff --git a/tuple/include/theta_set_difference_base.hpp b/theta/include/theta_set_difference_base.hpp
similarity index 100%
rename from tuple/include/theta_set_difference_base.hpp
rename to theta/include/theta_set_difference_base.hpp
diff --git a/tuple/include/theta_set_difference_base_impl.hpp b/theta/include/theta_set_difference_base_impl.hpp
similarity index 100%
rename from tuple/include/theta_set_difference_base_impl.hpp
rename to theta/include/theta_set_difference_base_impl.hpp
diff --git a/theta/include/theta_sketch.hpp b/theta/include/theta_sketch.hpp
index b809f71..2e24168 100644
--- a/theta/include/theta_sketch.hpp
+++ b/theta/include/theta_sketch.hpp
@@ -20,45 +20,29 @@
#ifndef THETA_SKETCH_HPP_
#define THETA_SKETCH_HPP_
-#include <memory>
-#include <functional>
-#include <climits>
-#include <vector>
-
-#include "common_defs.hpp"
+#include "theta_update_sketch_base.hpp"
namespace datasketches {
-/*
- * author Alexander Saydakov
- * author Lee Rhodes
- * author Kevin Lang
- */
-
-// forward-declarations
-template<typename A> class theta_sketch_alloc;
-template<typename A> class update_theta_sketch_alloc;
-template<typename A> class compact_theta_sketch_alloc;
-template<typename A> class theta_union_alloc;
-template<typename A> class theta_intersection_alloc;
-template<typename A> class theta_a_not_b_alloc;
-
-// for serialization as raw bytes
-template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
-template<typename A> using vector_u8 = std::vector<uint8_t, AllocU8<A>>;
-
-template<typename A>
+template<typename Allocator = std::allocator<uint64_t>>
class theta_sketch_alloc {
public:
- static const uint64_t MAX_THETA = LLONG_MAX; // signed max for compatibility with Java
- static const uint8_t SERIAL_VERSION = 3;
+ using Entry = uint64_t;
+ using ExtractKey = trivial_extract_key;
+ using iterator = theta_iterator<Entry, ExtractKey>;
+ using const_iterator = theta_const_iterator<Entry, ExtractKey>;
virtual ~theta_sketch_alloc() = default;
/**
+ * @return allocator
+ */
+ virtual Allocator get_allocator() const = 0;
+
+ /**
* @return true if this sketch represents an empty set (not the same as no retained entries!)
*/
- bool is_empty() const;
+ virtual bool is_empty() const = 0;
/**
* @return estimate of the distinct count of the input stream
@@ -96,13 +80,16 @@
/**
* @return theta as a positive integer between 0 and LLONG_MAX
*/
- uint64_t get_theta64() const;
+ virtual uint64_t get_theta64() const = 0;
/**
* @return the number of retained entries in the sketch
*/
virtual uint32_t get_num_retained() const = 0;
+ /**
+ * @return hash of the seed that was used to hash the input
+ */
virtual uint16_t get_seed_hash() const = 0;
/**
@@ -111,109 +98,82 @@
virtual bool is_ordered() const = 0;
/**
- * Writes a human-readable summary of this sketch to a given stream
+ * Provides a human-readable summary of this sketch as a string
* @param print_items if true include the list of items retained by the sketch
+ * @return sketch summary as a string
*/
- virtual string<A> to_string(bool print_items = false) const = 0;
-
- /**
- * This method serializes the sketch into a given stream in a binary form
- * @param os output stream
- */
- virtual void serialize(std::ostream& os) const = 0;
-
- // This is a convenience alias for users
- // The type returned by the following serialize method
- typedef vector_u8<A> vector_bytes;
-
- /**
- * This method serializes the sketch as a vector of bytes.
- * An optional header can be reserved in front of the sketch.
- * It is an uninitialized space of a given size.
- * This header is used in Datasketches PostgreSQL extension.
- * @param header_size_bytes space to reserve in front of the sketch
- */
- virtual vector_bytes serialize(unsigned header_size_bytes = 0) const = 0;
-
- // This is a convenience alias for users
- // The type returned by the following deserialize methods
- // It is not possible to return instances of an abstract type, so this has to be a pointer
- typedef std::unique_ptr<theta_sketch_alloc<A>, std::function<void(theta_sketch_alloc<A>*)>> unique_ptr;
-
- /**
- * This method deserializes a sketch from a given stream.
- * @param is input stream
- * @param seed the seed for the hash function that was used to create the sketch
- * @return an instance of a sketch as a unique_ptr
- */
- static unique_ptr deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED);
-
- /**
- * This method deserializes a sketch from a given array of bytes.
- * @param bytes pointer to the array of bytes
- * @param size the size of the array
- * @param seed the seed for the hash function that was used to create the sketch
- * @return an instance of the sketch
- */
- static unique_ptr deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED);
-
- class const_iterator;
+ virtual string<Allocator> to_string(bool print_items = false) const;
/**
* Iterator over hash values in this sketch.
* @return begin iterator
*/
- virtual const_iterator begin() const = 0;
+ virtual iterator begin() = 0;
/**
* Iterator pointing past the valid range.
* Not to be incremented or dereferenced.
* @return end iterator
*/
+ virtual iterator end() = 0;
+
+ /**
+ * Const iterator over hash values in this sketch.
+ * @return begin iterator
+ */
+ virtual const_iterator begin() const = 0;
+
+ /**
+ * Const iterator pointing past the valid range.
+ * Not to be incremented or dereferenced.
+ * @return end iterator
+ */
virtual const_iterator end() const = 0;
protected:
- enum flags { IS_BIG_ENDIAN, IS_READ_ONLY, IS_EMPTY, IS_COMPACT, IS_ORDERED };
-
- bool is_empty_;
- uint64_t theta_;
-
- theta_sketch_alloc(bool is_empty, uint64_t theta);
-
- static uint16_t get_seed_hash(uint64_t seed);
-
- static void check_sketch_type(uint8_t actual, uint8_t expected);
- static void check_serial_version(uint8_t actual, uint8_t expected);
- static void check_seed_hash(uint16_t actual, uint16_t expected);
-
- friend theta_intersection_alloc<A>;
- friend theta_a_not_b_alloc<A>;
+ using ostrstream = std::basic_ostringstream<char, std::char_traits<char>, AllocChar<Allocator>>;
+ virtual void print_specifics(ostrstream& os) const = 0;
};
-// update sketch
+// forward declaration
+template<typename A> class compact_theta_sketch_alloc;
-template<typename A> using AllocU64 = typename std::allocator_traits<A>::template rebind_alloc<uint64_t>;
-template<typename A> using vector_u64 = std::vector<uint64_t, AllocU64<A>>;
-
-template<typename A>
-class update_theta_sketch_alloc: public theta_sketch_alloc<A> {
+template<typename Allocator = std::allocator<uint64_t>>
+class update_theta_sketch_alloc: public theta_sketch_alloc<Allocator> {
public:
- class builder;
- enum resize_factor { X1, X2, X4, X8 };
- static const uint8_t SKETCH_TYPE = 2;
+ using Base = theta_sketch_alloc<Allocator>;
+ using Entry = typename Base::Entry;
+ using ExtractKey = typename Base::ExtractKey;
+ using iterator = typename Base::iterator;
+ using const_iterator = typename Base::const_iterator;
+ using theta_table = theta_update_sketch_base<Entry, ExtractKey, Allocator>;
+ using resize_factor = typename theta_table::resize_factor;
// No constructor here. Use builder instead.
+ class builder;
+ update_theta_sketch_alloc(const update_theta_sketch_alloc&) = default;
+ update_theta_sketch_alloc(update_theta_sketch_alloc&&) noexcept = default;
virtual ~update_theta_sketch_alloc() = default;
+ update_theta_sketch_alloc& operator=(const update_theta_sketch_alloc&) = default;
+ update_theta_sketch_alloc& operator=(update_theta_sketch_alloc&&) = default;
- virtual uint32_t get_num_retained() const;
- virtual uint16_t get_seed_hash() const;
+ virtual Allocator get_allocator() const;
+ virtual bool is_empty() const;
virtual bool is_ordered() const;
- virtual string<A> to_string(bool print_items = false) const;
- virtual void serialize(std::ostream& os) const;
- typedef vector_u8<A> vector_bytes; // alias for users
- // header space is reserved, but not initialized
- virtual vector_bytes serialize(unsigned header_size_bytes = 0) const;
+ virtual uint16_t get_seed_hash() const;
+ virtual uint64_t get_theta64() const;
+ virtual uint32_t get_num_retained() const;
+
+ /**
+ * @return configured nominal number of entries in the sketch
+ */
+ uint8_t get_lg_k() const;
+
+ /**
+ * @return configured resize factor of the sketch
+ */
+ resize_factor get_rf() const;
/**
* Update this sketch with a given string.
@@ -302,7 +262,7 @@
* @param data pointer to the data
* @param length of the data in bytes
*/
- void update(const void* data, unsigned length);
+ void update(const void* data, size_t length);
/**
* Remove retained entries in excess of the nominal size k (if any)
@@ -314,105 +274,85 @@
* @param ordered optional flag to specify if ordered sketch should be produced
* @return compact sketch
*/
- compact_theta_sketch_alloc<A> compact(bool ordered = true) const;
+ compact_theta_sketch_alloc<Allocator> compact(bool ordered = true) const;
- virtual typename theta_sketch_alloc<A>::const_iterator begin() const;
- virtual typename theta_sketch_alloc<A>::const_iterator end() const;
-
- /**
- * This method deserializes a sketch from a given stream.
- * @param is input stream
- * @param seed the seed for the hash function that was used to create the sketch
- * @return an instance of a sketch
- */
- static update_theta_sketch_alloc<A> deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED);
-
- /**
- * This method deserializes a sketch from a given array of bytes.
- * @param bytes pointer to the array of bytes
- * @param size the size of the array
- * @param seed the seed for the hash function that was used to create the sketch
- * @return an instance of the sketch
- */
- static update_theta_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED);
+ virtual iterator begin();
+ virtual iterator end();
+ virtual const_iterator begin() const;
+ virtual const_iterator end() const;
private:
- // resize threshold = 0.5 tuned for speed
- static constexpr double RESIZE_THRESHOLD = 0.5;
- // hash table rebuild threshold = 15/16
- static constexpr double REBUILD_THRESHOLD = 15.0 / 16.0;
-
- static constexpr uint8_t STRIDE_HASH_BITS = 7;
- static constexpr uint32_t STRIDE_MASK = (1 << STRIDE_HASH_BITS) - 1;
-
- uint8_t lg_cur_size_;
- uint8_t lg_nom_size_;
- vector_u64<A> keys_;
- uint32_t num_keys_;
- resize_factor rf_;
- float p_;
- uint64_t seed_;
- uint32_t capacity_;
+ theta_table table_;
// for builder
- update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t seed);
+ update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta,
+ uint64_t seed, const Allocator& allocator);
- // for deserialize
- update_theta_sketch_alloc(bool is_empty, uint64_t theta, uint8_t lg_cur_size, uint8_t lg_nom_size, vector_u64<A>&& keys, uint32_t num_keys, resize_factor rf, float p, uint64_t seed);
-
- void resize();
- void rebuild();
-
- friend theta_union_alloc<A>;
- void internal_update(uint64_t hash);
-
- friend theta_intersection_alloc<A>;
- friend theta_a_not_b_alloc<A>;
- static inline uint32_t get_capacity(uint8_t lg_cur_size, uint8_t lg_nom_size);
- static inline uint32_t get_stride(uint64_t hash, uint8_t lg_size);
- static bool hash_search_or_insert(uint64_t hash, uint64_t* table, uint8_t lg_size);
- static bool hash_search(uint64_t hash, const uint64_t* table, uint8_t lg_size);
-
- friend theta_sketch_alloc<A>;
- static update_theta_sketch_alloc<A> internal_deserialize(std::istream& is, resize_factor rf, uint8_t lg_cur_size, uint8_t lg_nom_size, uint8_t flags_byte, uint64_t seed);
- static update_theta_sketch_alloc<A> internal_deserialize(const void* bytes, size_t size, resize_factor rf, uint8_t lg_cur_size, uint8_t lg_nom_size, uint8_t flags_byte, uint64_t seed);
+ using ostrstream = typename Base::ostrstream;
+ virtual void print_specifics(ostrstream& os) const;
};
// compact sketch
-template<typename A>
-class compact_theta_sketch_alloc: public theta_sketch_alloc<A> {
+template<typename Allocator = std::allocator<uint64_t>>
+class compact_theta_sketch_alloc: public theta_sketch_alloc<Allocator> {
public:
+ using Base = theta_sketch_alloc<Allocator>;
+ using iterator = typename Base::iterator;
+ using const_iterator = typename Base::const_iterator;
+ using AllocBytes = typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>;
+ using vector_bytes = std::vector<uint8_t, AllocBytes>;
+
+ static const uint8_t SERIAL_VERSION = 3;
static const uint8_t SKETCH_TYPE = 3;
- // No constructor here.
// Instances of this type can be obtained:
- // - by compacting an update_theta_sketch
+ // - by compacting an update_theta_sketch_alloc
// - as a result of a set operation
// - by deserializing a previously serialized compact sketch
- compact_theta_sketch_alloc(const theta_sketch_alloc<A>& other, bool ordered);
+ compact_theta_sketch_alloc(const Base& other, bool ordered);
+ compact_theta_sketch_alloc(const compact_theta_sketch_alloc&) = default;
+ compact_theta_sketch_alloc(compact_theta_sketch_alloc&&) noexcept = default;
virtual ~compact_theta_sketch_alloc() = default;
+ compact_theta_sketch_alloc& operator=(const compact_theta_sketch_alloc&) = default;
+ compact_theta_sketch_alloc& operator=(compact_theta_sketch_alloc&&) = default;
+ virtual Allocator get_allocator() const;
+ virtual bool is_empty() const;
+ virtual bool is_ordered() const;
+ virtual uint64_t get_theta64() const;
virtual uint32_t get_num_retained() const;
virtual uint16_t get_seed_hash() const;
- virtual bool is_ordered() const;
- virtual string<A> to_string(bool print_items = false) const;
- virtual void serialize(std::ostream& os) const;
- typedef vector_u8<A> vector_bytes; // alias for users
- // header space is reserved, but not initialized
- virtual vector_bytes serialize(unsigned header_size_bytes = 0) const;
- virtual typename theta_sketch_alloc<A>::const_iterator begin() const;
- virtual typename theta_sketch_alloc<A>::const_iterator end() const;
+ /**
+ * This method serializes the sketch into a given stream in a binary form
+ * @param os output stream
+ */
+ void serialize(std::ostream& os) const;
+
+ /**
+ * This method serializes the sketch as a vector of bytes.
+ * An optional header can be reserved in front of the sketch.
+ * It is an uninitialized space of a given size.
+ * This header is used in Datasketches PostgreSQL extension.
+ * @param header_size_bytes space to reserve in front of the sketch
+ */
+ vector_bytes serialize(unsigned header_size_bytes = 0) const;
+
+ virtual iterator begin();
+ virtual iterator end();
+ virtual const_iterator begin() const;
+ virtual const_iterator end() const;
/**
* This method deserializes a sketch from a given stream.
* @param is input stream
* @param seed the seed for the hash function that was used to create the sketch
- * @return an instance of a sketch
+ * @return an instance of the sketch
*/
- static compact_theta_sketch_alloc<A> deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED);
+ static compact_theta_sketch_alloc deserialize(std::istream& is,
+ uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
/**
* This method deserializes a sketch from a given array of bytes.
@@ -421,110 +361,36 @@
* @param seed the seed for the hash function that was used to create the sketch
* @return an instance of the sketch
*/
- static compact_theta_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED);
+ static compact_theta_sketch_alloc deserialize(const void* bytes, size_t size,
+ uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
+
+ // for internal use
+ compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector<uint64_t, Allocator>&& entries);
private:
- typedef typename std::allocator_traits<A>::template rebind_alloc<uint64_t> AllocU64;
+ enum flags { IS_BIG_ENDIAN, IS_READ_ONLY, IS_EMPTY, IS_COMPACT, IS_ORDERED };
- vector_u64<A> keys_;
- uint16_t seed_hash_;
+ bool is_empty_;
bool is_ordered_;
+ uint16_t seed_hash_;
+ uint64_t theta_;
+ std::vector<uint64_t, Allocator> entries_;
- friend theta_sketch_alloc<A>;
- friend update_theta_sketch_alloc<A>;
- friend theta_union_alloc<A>;
- friend theta_intersection_alloc<A>;
- friend theta_a_not_b_alloc<A>;
- compact_theta_sketch_alloc(bool is_empty, uint64_t theta, vector_u64<A>&& keys, uint16_t seed_hash, bool is_ordered);
- static compact_theta_sketch_alloc<A> internal_deserialize(std::istream& is, uint8_t preamble_longs, uint8_t flags_byte, uint16_t seed_hash);
- static compact_theta_sketch_alloc<A> internal_deserialize(const void* bytes, size_t size, uint8_t preamble_longs, uint8_t flags_byte, uint16_t seed_hash);
+ using ostrstream = typename Base::ostrstream;
+ virtual void print_specifics(ostrstream& os) const;
};
-// builder
-
-template<typename A>
-class update_theta_sketch_alloc<A>::builder {
+template<typename Allocator>
+class update_theta_sketch_alloc<Allocator>::builder: public theta_base_builder<builder, Allocator> {
public:
- static const uint8_t MIN_LG_K = 5;
- static const uint8_t DEFAULT_LG_K = 12;
- static const resize_factor DEFAULT_RESIZE_FACTOR = X8;
-
- /**
- * Creates and instance of the builder with default parameters.
- */
- builder();
-
- /**
- * Set log2(k), where k is a nominal number of entries in the sketch
- * @param lg_k base 2 logarithm of nominal number of entries
- * @return this builder
- */
- builder& set_lg_k(uint8_t lg_k);
-
- /**
- * Set resize factor for the internal hash table (defaults to 8)
- * @param rf resize factor
- * @return this builder
- */
- builder& set_resize_factor(resize_factor rf);
-
- /**
- * Set sampling probability (initial theta). The default is 1, so the sketch retains
- * all entries until it reaches the limit, at which point it goes into the estimation mode
- * and reduces the effective sampling probability (theta) as necessary.
- * @param p sampling probability
- * @return this builder
- */
- builder& set_p(float p);
-
- /**
- * Set the seed for the hash function. Should be used carefully if needed.
- * Sketches produced with different seed are not compatible
- * and cannot be mixed in set operations.
- * @param seed hash seed
- * @return this builder
- */
- builder& set_seed(uint64_t seed);
-
- /**
- * This is to create an instance of the sketch with predefined parameters.
- * @return and instance of the sketch
- */
- update_theta_sketch_alloc<A> build() const;
-
-private:
- uint8_t lg_k_;
- resize_factor rf_;
- float p_;
- uint64_t seed_;
-
- static uint8_t starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf);
+ builder(const Allocator& allocator = Allocator());
+ update_theta_sketch_alloc build() const;
};
-// iterator
-template<typename A>
-class theta_sketch_alloc<A>::const_iterator: public std::iterator<std::input_iterator_tag, uint64_t> {
-public:
- const_iterator& operator++();
- const_iterator operator++(int);
- bool operator==(const const_iterator& other) const;
- bool operator!=(const const_iterator& other) const;
- uint64_t operator*() const;
-
-private:
- const uint64_t* keys_;
- uint32_t size_;
- uint32_t index_;
- const_iterator(const uint64_t* keys, uint32_t size, uint32_t index);
- friend class update_theta_sketch_alloc<A>;
- friend class compact_theta_sketch_alloc<A>;
-};
-
-
// aliases with default allocator for convenience
-typedef theta_sketch_alloc<std::allocator<void>> theta_sketch;
-typedef update_theta_sketch_alloc<std::allocator<void>> update_theta_sketch;
-typedef compact_theta_sketch_alloc<std::allocator<void>> compact_theta_sketch;
+using theta_sketch = theta_sketch_alloc<std::allocator<uint64_t>>;
+using update_theta_sketch = update_theta_sketch_alloc<std::allocator<uint64_t>>;
+using compact_theta_sketch = compact_theta_sketch_alloc<std::allocator<uint64_t>>;
} /* namespace datasketches */
diff --git a/theta/include/theta_sketch_impl.hpp b/theta/include/theta_sketch_impl.hpp
index 579a675..0bee4c9 100644
--- a/theta/include/theta_sketch_impl.hpp
+++ b/theta/include/theta_sketch_impl.hpp
@@ -20,35 +20,23 @@
#ifndef THETA_SKETCH_IMPL_HPP_
#define THETA_SKETCH_IMPL_HPP_
-#include <algorithm>
-#include <cmath>
-#include <memory>
-#include <functional>
-#include <istream>
-#include <ostream>
#include <sstream>
+#include <vector>
-#include "MurmurHash3.h"
#include "serde.hpp"
#include "binomial_bounds.hpp"
-#include "memory_operations.hpp"
+#include "theta_helpers.hpp"
namespace datasketches {
-/*
- * author Alexander Saydakov
- * author Lee Rhodes
- * author Kevin Lang
- */
+template<typename A>
+bool theta_sketch_alloc<A>::is_estimation_mode() const {
+ return get_theta64() < theta_constants::MAX_THETA && !is_empty();
+}
template<typename A>
-theta_sketch_alloc<A>::theta_sketch_alloc(bool is_empty, uint64_t theta):
-is_empty_(is_empty), theta_(theta)
-{}
-
-template<typename A>
-bool theta_sketch_alloc<A>::is_empty() const {
- return is_empty_;
+double theta_sketch_alloc<A>::get_theta() const {
+ return static_cast<double>(get_theta64()) / theta_constants::MAX_THETA;
}
template<typename A>
@@ -69,182 +57,47 @@
}
template<typename A>
-bool theta_sketch_alloc<A>::is_estimation_mode() const {
- return theta_ < MAX_THETA && !is_empty_;
-}
-
-template<typename A>
-double theta_sketch_alloc<A>::get_theta() const {
- return (double) theta_ / MAX_THETA;
-}
-
-template<typename A>
-uint64_t theta_sketch_alloc<A>::get_theta64() const {
- return theta_;
-}
-
-template<typename A>
-typename theta_sketch_alloc<A>::unique_ptr theta_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed) {
- uint8_t preamble_longs;
- is.read((char*)&preamble_longs, sizeof(preamble_longs));
- uint8_t serial_version;
- is.read((char*)&serial_version, sizeof(serial_version));
- uint8_t type;
- is.read((char*)&type, sizeof(type));
- uint8_t lg_nom_size;
- is.read((char*)&lg_nom_size, sizeof(lg_nom_size));
- uint8_t lg_cur_size;
- is.read((char*)&lg_cur_size, sizeof(lg_cur_size));
- uint8_t flags_byte;
- is.read((char*)&flags_byte, sizeof(flags_byte));
- uint16_t seed_hash;
- is.read((char*)&seed_hash, sizeof(seed_hash));
-
- check_serial_version(serial_version, SERIAL_VERSION);
-
- if (type == update_theta_sketch_alloc<A>::SKETCH_TYPE) {
- check_seed_hash(seed_hash, get_seed_hash(seed));
- typename update_theta_sketch_alloc<A>::resize_factor rf = static_cast<typename update_theta_sketch_alloc<A>::resize_factor>(preamble_longs >> 6);
- typedef typename std::allocator_traits<A>::template rebind_alloc<update_theta_sketch_alloc<A>> AU;
- return unique_ptr(
- static_cast<theta_sketch_alloc<A>*>(new (AU().allocate(1)) update_theta_sketch_alloc<A>(update_theta_sketch_alloc<A>::internal_deserialize(is, rf, lg_cur_size, lg_nom_size, flags_byte, seed))),
- [](theta_sketch_alloc<A>* ptr) {
- ptr->~theta_sketch_alloc();
- AU().deallocate(static_cast<update_theta_sketch_alloc<A>*>(ptr), 1);
- }
- );
- } else if (type == compact_theta_sketch_alloc<A>::SKETCH_TYPE) {
- const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
- if (!is_empty) check_seed_hash(seed_hash, get_seed_hash(seed));
- typedef typename std::allocator_traits<A>::template rebind_alloc<compact_theta_sketch_alloc<A>> AC;
- return unique_ptr(
- static_cast<theta_sketch_alloc<A>*>(new (AC().allocate(1)) compact_theta_sketch_alloc<A>(compact_theta_sketch_alloc<A>::internal_deserialize(is, preamble_longs, flags_byte, seed_hash))),
- [](theta_sketch_alloc<A>* ptr) {
- ptr->~theta_sketch_alloc();
- AC().deallocate(static_cast<compact_theta_sketch_alloc<A>*>(ptr), 1);
- }
- );
+string<A> theta_sketch_alloc<A>::to_string(bool detail) const {
+ ostrstream os;
+ os << "### Theta sketch summary:" << std::endl;
+ os << " num retained entries : " << get_num_retained() << std::endl;
+ os << " seed hash : " << get_seed_hash() << std::endl;
+ os << " empty? : " << (is_empty() ? "true" : "false") << std::endl;
+ os << " ordered? : " << (is_ordered() ? "true" : "false") << std::endl;
+ os << " estimation mode? : " << (is_estimation_mode() ? "true" : "false") << std::endl;
+ os << " theta (fraction) : " << get_theta() << std::endl;
+ os << " theta (raw 64-bit) : " << get_theta64() << std::endl;
+ os << " estimate : " << this->get_estimate() << std::endl;
+ os << " lower bound 95% conf : " << this->get_lower_bound(2) << std::endl;
+ os << " upper bound 95% conf : " << this->get_upper_bound(2) << std::endl;
+ print_specifics(os);
+ os << "### End sketch summary" << std::endl;
+ if (detail) {
+ os << "### Retained entries" << std::endl;
+ for (const auto& hash: *this) {
+ os << hash << std::endl;
+ }
+ os << "### End retained entries" << std::endl;
}
- throw std::invalid_argument("unsupported sketch type " + std::to_string((int) type));
-}
-
-template<typename A>
-typename theta_sketch_alloc<A>::unique_ptr theta_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed) {
- ensure_minimum_memory(size, static_cast<size_t>(8));
- const char* ptr = static_cast<const char*>(bytes);
- uint8_t preamble_longs;
- ptr += copy_from_mem(ptr, &preamble_longs, sizeof(preamble_longs));
- uint8_t serial_version;
- ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
- uint8_t type;
- ptr += copy_from_mem(ptr, &type, sizeof(type));
- uint8_t lg_nom_size;
- ptr += copy_from_mem(ptr, &lg_nom_size, sizeof(lg_nom_size));
- uint8_t lg_cur_size;
- ptr += copy_from_mem(ptr, &lg_cur_size, sizeof(lg_cur_size));
- uint8_t flags_byte;
- ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
- uint16_t seed_hash;
- ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
-
- check_serial_version(serial_version, SERIAL_VERSION);
-
- if (type == update_theta_sketch_alloc<A>::SKETCH_TYPE) {
- check_seed_hash(seed_hash, get_seed_hash(seed));
- typename update_theta_sketch_alloc<A>::resize_factor rf = static_cast<typename update_theta_sketch_alloc<A>::resize_factor>(preamble_longs >> 6);
- typedef typename std::allocator_traits<A>::template rebind_alloc<update_theta_sketch_alloc<A>> AU;
- return unique_ptr(
- static_cast<theta_sketch_alloc<A>*>(new (AU().allocate(1)) update_theta_sketch_alloc<A>(
- update_theta_sketch_alloc<A>::internal_deserialize(ptr, size - (ptr - static_cast<const char*>(bytes)), rf, lg_cur_size, lg_nom_size, flags_byte, seed))
- ),
- [](theta_sketch_alloc<A>* ptr) {
- ptr->~theta_sketch_alloc();
- AU().deallocate(static_cast<update_theta_sketch_alloc<A>*>(ptr), 1);
- }
- );
- } else if (type == compact_theta_sketch_alloc<A>::SKETCH_TYPE) {
- const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
- if (!is_empty) check_seed_hash(seed_hash, get_seed_hash(seed));
- typedef typename std::allocator_traits<A>::template rebind_alloc<compact_theta_sketch_alloc<A>> AC;
- return unique_ptr(
- static_cast<theta_sketch_alloc<A>*>(new (AC().allocate(1)) compact_theta_sketch_alloc<A>(
- compact_theta_sketch_alloc<A>::internal_deserialize(ptr, size - (ptr - static_cast<const char*>(bytes)), preamble_longs, flags_byte, seed_hash))
- ),
- [](theta_sketch_alloc<A>* ptr) {
- ptr->~theta_sketch_alloc();
- AC().deallocate(static_cast<compact_theta_sketch_alloc<A>*>(ptr), 1);
- }
- );
- }
- throw std::invalid_argument("unsupported sketch type " + std::to_string((int) type));
-}
-
-template<typename A>
-uint16_t theta_sketch_alloc<A>::get_seed_hash(uint64_t seed) {
- HashState hashes;
- MurmurHash3_x64_128(&seed, sizeof(seed), 0, hashes);
- return hashes.h1;
-}
-
-template<typename A>
-void theta_sketch_alloc<A>::check_sketch_type(uint8_t actual, uint8_t expected) {
- if (actual != expected) {
- throw std::invalid_argument("Sketch type mismatch: expected " + std::to_string((int)expected) + ", actual " + std::to_string((int)actual));
- }
-}
-
-template<typename A>
-void theta_sketch_alloc<A>::check_serial_version(uint8_t actual, uint8_t expected) {
- if (actual != expected) {
- throw std::invalid_argument("Sketch serial version mismatch: expected " + std::to_string((int)expected) + ", actual " + std::to_string((int)actual));
- }
-}
-
-template<typename A>
-void theta_sketch_alloc<A>::check_seed_hash(uint16_t actual, uint16_t expected) {
- if (actual != expected) {
- throw std::invalid_argument("Sketch seed hash mismatch: expected " + std::to_string(expected) + ", actual " + std::to_string(actual));
- }
+ return os.str();
}
// update sketch
template<typename A>
-update_theta_sketch_alloc<A>::update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t seed):
-theta_sketch_alloc<A>(true, theta_sketch_alloc<A>::MAX_THETA),
-lg_cur_size_(lg_cur_size),
-lg_nom_size_(lg_nom_size),
-keys_(1 << lg_cur_size_, 0),
-num_keys_(0),
-rf_(rf),
-p_(p),
-seed_(seed),
-capacity_(get_capacity(lg_cur_size, lg_nom_size))
-{
- if (p < 1) this->theta_ *= p;
-}
-
-template<typename A>
-update_theta_sketch_alloc<A>::update_theta_sketch_alloc(bool is_empty, uint64_t theta, uint8_t lg_cur_size, uint8_t lg_nom_size, vector_u64<A>&& keys, uint32_t num_keys, resize_factor rf, float p, uint64_t seed):
-theta_sketch_alloc<A>(is_empty, theta),
-lg_cur_size_(lg_cur_size),
-lg_nom_size_(lg_nom_size),
-keys_(std::move(keys)),
-num_keys_(num_keys),
-rf_(rf),
-p_(p),
-seed_(seed),
-capacity_(get_capacity(lg_cur_size, lg_nom_size))
+update_theta_sketch_alloc<A>::update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
+ uint64_t theta, uint64_t seed, const A& allocator):
+table_(lg_cur_size, lg_nom_size, rf, theta, seed, allocator)
{}
template<typename A>
-uint32_t update_theta_sketch_alloc<A>::get_num_retained() const {
- return num_keys_;
+A update_theta_sketch_alloc<A>::get_allocator() const {
+ return table_.allocator_;
}
template<typename A>
-uint16_t update_theta_sketch_alloc<A>::get_seed_hash() const {
- return theta_sketch_alloc<A>::get_seed_hash(seed_);
+bool update_theta_sketch_alloc<A>::is_empty() const {
+ return table_.is_empty_;
}
template<typename A>
@@ -253,169 +106,28 @@
}
template<typename A>
-string<A> update_theta_sketch_alloc<A>::to_string(bool print_items) const {
- std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
- os << "### Update Theta sketch summary:" << std::endl;
- os << " lg nominal size : " << (int) lg_nom_size_ << std::endl;
- os << " lg current size : " << (int) lg_cur_size_ << std::endl;
- os << " num retained keys : " << num_keys_ << std::endl;
- os << " resize factor : " << (1 << rf_) << std::endl;
- os << " sampling probability : " << p_ << std::endl;
- os << " seed hash : " << this->get_seed_hash() << std::endl;
- os << " empty? : " << (this->is_empty() ? "true" : "false") << std::endl;
- os << " ordered? : " << (this->is_ordered() ? "true" : "false") << std::endl;
- os << " estimation mode? : " << (this->is_estimation_mode() ? "true" : "false") << std::endl;
- os << " theta (fraction) : " << this->get_theta() << std::endl;
- os << " theta (raw 64-bit) : " << this->theta_ << std::endl;
- os << " estimate : " << this->get_estimate() << std::endl;
- os << " lower bound 95% conf : " << this->get_lower_bound(2) << std::endl;
- os << " upper bound 95% conf : " << this->get_upper_bound(2) << std::endl;
- os << "### End sketch summary" << std::endl;
- if (print_items) {
- os << "### Retained keys" << std::endl;
- for (auto key: *this) os << " " << key << std::endl;
- os << "### End retained keys" << std::endl;
- }
- return os.str();
+uint64_t update_theta_sketch_alloc<A>::get_theta64() const {
+ return table_.theta_;
}
template<typename A>
-void update_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
- const uint8_t preamble_longs_and_rf = 3 | (rf_ << 6);
- os.write((char*)&preamble_longs_and_rf, sizeof(preamble_longs_and_rf));
- const uint8_t serial_version = theta_sketch_alloc<A>::SERIAL_VERSION;
- os.write((char*)&serial_version, sizeof(serial_version));
- const uint8_t type = SKETCH_TYPE;
- os.write((char*)&type, sizeof(type));
- os.write((char*)&lg_nom_size_, sizeof(lg_nom_size_));
- os.write((char*)&lg_cur_size_, sizeof(lg_cur_size_));
- const uint8_t flags_byte(
- (this->is_empty() ? 1 << theta_sketch_alloc<A>::flags::IS_EMPTY : 0)
- );
- os.write((char*)&flags_byte, sizeof(flags_byte));
- const uint16_t seed_hash = get_seed_hash();
- os.write((char*)&seed_hash, sizeof(seed_hash));
- os.write((char*)&num_keys_, sizeof(num_keys_));
- os.write((char*)&p_, sizeof(p_));
- os.write((char*)&(this->theta_), sizeof(uint64_t));
- os.write((char*)keys_.data(), sizeof(uint64_t) * keys_.size());
+uint32_t update_theta_sketch_alloc<A>::get_num_retained() const {
+ return table_.num_entries_;
}
template<typename A>
-vector_u8<A> update_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const {
- const uint8_t preamble_longs = 3;
- const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs + sizeof(uint64_t) * keys_.size();
- vector_u8<A> bytes(size);
- uint8_t* ptr = bytes.data() + header_size_bytes;
-
- const uint8_t preamble_longs_and_rf = preamble_longs | (rf_ << 6);
- ptr += copy_to_mem(&preamble_longs_and_rf, ptr, sizeof(preamble_longs_and_rf));
- const uint8_t serial_version = theta_sketch_alloc<A>::SERIAL_VERSION;
- ptr += copy_to_mem(&serial_version, ptr, sizeof(serial_version));
- const uint8_t type = SKETCH_TYPE;
- ptr += copy_to_mem(&type, ptr, sizeof(type));
- ptr += copy_to_mem(&lg_nom_size_, ptr, sizeof(lg_nom_size_));
- ptr += copy_to_mem(&lg_cur_size_, ptr, sizeof(lg_cur_size_));
- const uint8_t flags_byte(
- (this->is_empty() ? 1 << theta_sketch_alloc<A>::flags::IS_EMPTY : 0)
- );
- ptr += copy_to_mem(&flags_byte, ptr, sizeof(flags_byte));
- const uint16_t seed_hash = get_seed_hash();
- ptr += copy_to_mem(&seed_hash, ptr, sizeof(seed_hash));
- ptr += copy_to_mem(&num_keys_, ptr, sizeof(num_keys_));
- ptr += copy_to_mem(&p_, ptr, sizeof(p_));
- ptr += copy_to_mem(&(this->theta_), ptr, sizeof(uint64_t));
- ptr += copy_to_mem(keys_.data(), ptr, sizeof(uint64_t) * keys_.size());
-
- return bytes;
+uint16_t update_theta_sketch_alloc<A>::get_seed_hash() const {
+ return compute_seed_hash(table_.seed_);
}
template<typename A>
-update_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed) {
- uint8_t preamble_longs;
- is.read((char*)&preamble_longs, sizeof(preamble_longs));
- resize_factor rf = static_cast<resize_factor>(preamble_longs >> 6);
- preamble_longs &= 0x3f; // remove resize factor
- uint8_t serial_version;
- is.read((char*)&serial_version, sizeof(serial_version));
- uint8_t type;
- is.read((char*)&type, sizeof(type));
- uint8_t lg_nom_size;
- is.read((char*)&lg_nom_size, sizeof(lg_nom_size));
- uint8_t lg_cur_size;
- is.read((char*)&lg_cur_size, sizeof(lg_cur_size));
- uint8_t flags_byte;
- is.read((char*)&flags_byte, sizeof(flags_byte));
- uint16_t seed_hash;
- is.read((char*)&seed_hash, sizeof(seed_hash));
- theta_sketch_alloc<A>::check_sketch_type(type, SKETCH_TYPE);
- theta_sketch_alloc<A>::check_serial_version(serial_version, theta_sketch_alloc<A>::SERIAL_VERSION);
- theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
- return internal_deserialize(is, rf, lg_cur_size, lg_nom_size, flags_byte, seed);
+uint8_t update_theta_sketch_alloc<A>::get_lg_k() const {
+ return table_.lg_nom_size_;
}
template<typename A>
-update_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::internal_deserialize(std::istream& is, resize_factor rf, uint8_t lg_cur_size, uint8_t lg_nom_size, uint8_t flags_byte, uint64_t seed) {
- uint32_t num_keys;
- is.read((char*)&num_keys, sizeof(num_keys));
- float p;
- is.read((char*)&p, sizeof(p));
- uint64_t theta;
- is.read((char*)&theta, sizeof(theta));
- vector_u64<A> keys(1 << lg_cur_size);
- is.read((char*)keys.data(), sizeof(uint64_t) * keys.size());
- const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
- if (!is.good()) throw std::runtime_error("error reading from std::istream");
- return update_theta_sketch_alloc<A>(is_empty, theta, lg_cur_size, lg_nom_size, std::move(keys), num_keys, rf, p, seed);
-}
-
-template<typename A>
-update_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed) {
- ensure_minimum_memory(size, 8);
- const char* ptr = static_cast<const char*>(bytes);
- uint8_t preamble_longs;
- ptr += copy_from_mem(ptr, &preamble_longs, sizeof(preamble_longs));
- resize_factor rf = static_cast<resize_factor>(preamble_longs >> 6);
- preamble_longs &= 0x3f; // remove resize factor
- uint8_t serial_version;
- ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
- uint8_t type;
- ptr += copy_from_mem(ptr, &type, sizeof(type));
- uint8_t lg_nom_size;
- ptr += copy_from_mem(ptr, &lg_nom_size, sizeof(lg_nom_size));
- uint8_t lg_cur_size;
- ptr += copy_from_mem(ptr, &lg_cur_size, sizeof(lg_cur_size));
- uint8_t flags_byte;
- ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
- uint16_t seed_hash;
- ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
- theta_sketch_alloc<A>::check_sketch_type(type, SKETCH_TYPE);
- theta_sketch_alloc<A>::check_serial_version(serial_version, theta_sketch_alloc<A>::SERIAL_VERSION);
- theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
- return internal_deserialize(ptr, size - (ptr - static_cast<const char*>(bytes)), rf, lg_cur_size, lg_nom_size, flags_byte, seed);
-}
-
-template<typename A>
-update_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::internal_deserialize(const void* bytes, size_t size, resize_factor rf, uint8_t lg_cur_size, uint8_t lg_nom_size, uint8_t flags_byte, uint64_t seed) {
- const uint32_t table_size = 1 << lg_cur_size;
- ensure_minimum_memory(size, 16 + sizeof(uint64_t) * table_size);
- const char* ptr = static_cast<const char*>(bytes);
- uint32_t num_keys;
- ptr += copy_from_mem(ptr, &num_keys, sizeof(num_keys));
- float p;
- ptr += copy_from_mem(ptr, &p, sizeof(p));
- uint64_t theta;
- ptr += copy_from_mem(ptr, &theta, sizeof(theta));
- vector_u64<A> keys(table_size);
- ptr += copy_from_mem(ptr, keys.data(), sizeof(uint64_t) * table_size);
- const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
- return update_theta_sketch_alloc<A>(is_empty, theta, lg_cur_size, lg_nom_size, std::move(keys), num_keys, rf, p, seed);
-}
-
-template<typename A>
-void update_theta_sketch_alloc<A>::update(const std::string& value) {
- if (value.empty()) return;
- update(value.c_str(), value.length());
+auto update_theta_sketch_alloc<A>::get_rf() const -> resize_factor {
+ return table_.rf_;
}
template<typename A>
@@ -460,19 +172,7 @@
template<typename A>
void update_theta_sketch_alloc<A>::update(double value) {
- union {
- int64_t long_value;
- double double_value;
- } long_double_union;
-
- if (value == 0.0) {
- long_double_union.double_value = 0.0; // canonicalize -0.0 to 0.0
- } else if (std::isnan(value)) {
- long_double_union.long_value = 0x7ff8000000000000L; // canonicalize NaN using value from Java's Double.doubleToLongBits()
- } else {
- long_double_union.double_value = value;
- }
- update(&long_double_union, sizeof(long_double_union));
+ update(canonical_double(value));
}
template<typename A>
@@ -481,162 +181,100 @@
}
template<typename A>
-void update_theta_sketch_alloc<A>::update(const void* data, unsigned length) {
- HashState hashes;
- MurmurHash3_x64_128(data, length, seed_, hashes);
- const uint64_t hash = hashes.h1 >> 1; // Java implementation does logical shift >>> to make values positive
- internal_update(hash);
+void update_theta_sketch_alloc<A>::update(const std::string& value) {
+ if (value.empty()) return;
+ update(value.c_str(), value.length());
}
template<typename A>
+void update_theta_sketch_alloc<A>::update(const void* data, size_t length) {
+ const uint64_t hash = table_.hash_and_screen(data, length);
+ if (hash == 0) return;
+ auto result = table_.find(hash);
+ if (!result.second) {
+ table_.insert(result.first, hash);
+ }
+}
+
+template<typename A>
+void update_theta_sketch_alloc<A>::trim() {
+ table_.trim();
+}
+
+template<typename A>
+auto update_theta_sketch_alloc<A>::begin() -> iterator {
+ return iterator(table_.entries_, 1 << table_.lg_cur_size_, 0);
+}
+
+template<typename A>
+auto update_theta_sketch_alloc<A>::end() -> iterator {
+ return iterator(nullptr, 0, 1 << table_.lg_cur_size_);
+}
+
+template<typename A>
+auto update_theta_sketch_alloc<A>::begin() const -> const_iterator {
+ return const_iterator(table_.entries_, 1 << table_.lg_cur_size_, 0);
+}
+
+template<typename A>
+auto update_theta_sketch_alloc<A>::end() const -> const_iterator {
+ return const_iterator(nullptr, 0, 1 << table_.lg_cur_size_);
+}
+template<typename A>
compact_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::compact(bool ordered) const {
return compact_theta_sketch_alloc<A>(*this, ordered);
}
template<typename A>
-void update_theta_sketch_alloc<A>::internal_update(uint64_t hash) {
- this->is_empty_ = false;
- if (hash >= this->theta_ || hash == 0) return; // hash == 0 is reserved to mark empty slots in the table
- if (hash_search_or_insert(hash, keys_.data(), lg_cur_size_)) {
- num_keys_++;
- if (num_keys_ > capacity_) {
- if (lg_cur_size_ <= lg_nom_size_) {
- resize();
- } else {
- rebuild();
- }
- }
- }
+void update_theta_sketch_alloc<A>::print_specifics(ostrstream& os) const {
+ os << " lg nominal size : " << static_cast<int>(table_.lg_nom_size_) << std::endl;
+ os << " lg current size : " << static_cast<int>(table_.lg_cur_size_) << std::endl;
+ os << " resize factor : " << (1 << table_.rf_) << std::endl;
}
-template<typename A>
-void update_theta_sketch_alloc<A>::trim() {
- if (num_keys_ > static_cast<uint32_t>(1 << lg_nom_size_)) rebuild();
-}
+// builder
template<typename A>
-void update_theta_sketch_alloc<A>::resize() {
- const uint8_t lg_tgt_size = lg_nom_size_ + 1;
- const uint8_t factor = std::max(1, std::min(static_cast<int>(rf_), lg_tgt_size - lg_cur_size_));
- const uint8_t lg_new_size = lg_cur_size_ + factor;
- const uint32_t new_size = 1 << lg_new_size;
- vector_u64<A> new_keys(new_size, 0);
- for (uint32_t i = 0; i < keys_.size(); i++) {
- if (keys_[i] != 0) {
- hash_search_or_insert(keys_[i], new_keys.data(), lg_new_size); // TODO hash_insert
- }
- }
- keys_ = std::move(new_keys);
- lg_cur_size_ += factor;
- capacity_ = get_capacity(lg_cur_size_, lg_nom_size_);
-}
+update_theta_sketch_alloc<A>::builder::builder(const A& allocator): theta_base_builder<builder, A>(allocator) {}
template<typename A>
-void update_theta_sketch_alloc<A>::rebuild() {
- const uint32_t pivot = (1 << lg_nom_size_) + keys_.size() - num_keys_;
- std::nth_element(keys_.begin(), keys_.begin() + pivot, keys_.end());
- this->theta_ = keys_[pivot];
- vector_u64<A> new_keys(keys_.size(), 0);
- num_keys_ = 0;
- for (uint32_t i = 0; i < keys_.size(); i++) {
- if (keys_[i] != 0 && keys_[i] < this->theta_) {
- hash_search_or_insert(keys_[i], new_keys.data(), lg_cur_size_); // TODO hash_insert
- num_keys_++;
- }
- }
- keys_ = std::move(new_keys);
-}
-
-template<typename A>
-uint32_t update_theta_sketch_alloc<A>::get_capacity(uint8_t lg_cur_size, uint8_t lg_nom_size) {
- const double fraction = (lg_cur_size <= lg_nom_size) ? RESIZE_THRESHOLD : REBUILD_THRESHOLD;
- return std::floor(fraction * (1 << lg_cur_size));
-}
-
-template<typename A>
-uint32_t update_theta_sketch_alloc<A>::get_stride(uint64_t hash, uint8_t lg_size) {
- // odd and independent of index assuming lg_size lowest bits of the hash were used for the index
- return (2 * static_cast<uint32_t>((hash >> lg_size) & STRIDE_MASK)) + 1;
-}
-
-template<typename A>
-bool update_theta_sketch_alloc<A>::hash_search_or_insert(uint64_t hash, uint64_t* table, uint8_t lg_size) {
- const uint32_t mask = (1 << lg_size) - 1;
- const uint32_t stride = get_stride(hash, lg_size);
- uint32_t cur_probe = static_cast<uint32_t>(hash) & mask;
-
- // search for duplicate or zero
- const uint32_t loop_index = cur_probe;
- do {
- const uint64_t value = table[cur_probe];
- if (value == 0) {
- table[cur_probe] = hash; // insert value
- return true;
- } else if (value == hash) {
- return false; // found a duplicate
- }
- cur_probe = (cur_probe + stride) & mask;
- } while (cur_probe != loop_index);
- throw std::logic_error("key not found and no empty slots!");
-}
-
-template<typename A>
-bool update_theta_sketch_alloc<A>::hash_search(uint64_t hash, const uint64_t* table, uint8_t lg_size) {
- const uint32_t mask = (1 << lg_size) - 1;
- const uint32_t stride = update_theta_sketch_alloc<A>::get_stride(hash, lg_size);
- uint32_t cur_probe = static_cast<uint32_t>(hash) & mask;
- const uint32_t loop_index = cur_probe;
- do {
- const uint64_t value = table[cur_probe];
- if (value == 0) {
- return false;
- } else if (value == hash) {
- return true;
- }
- cur_probe = (cur_probe + stride) & mask;
- } while (cur_probe != loop_index);
- throw std::logic_error("key not found and search wrapped");
-}
-
-template<typename A>
-typename theta_sketch_alloc<A>::const_iterator update_theta_sketch_alloc<A>::begin() const {
- return typename theta_sketch_alloc<A>::const_iterator(keys_.data(), keys_.size(), 0);
-}
-
-template<typename A>
-typename theta_sketch_alloc<A>::const_iterator update_theta_sketch_alloc<A>::end() const {
- return typename theta_sketch_alloc<A>::const_iterator(keys_.data(), keys_.size(), keys_.size());
+update_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::builder::build() const {
+ return update_theta_sketch_alloc(this->starting_lg_size(), this->lg_k_, this->rf_, this->starting_theta(), this->seed_, this->allocator_);
}
// compact sketch
template<typename A>
-compact_theta_sketch_alloc<A>::compact_theta_sketch_alloc(bool is_empty, uint64_t theta, vector_u64<A>&& keys, uint16_t seed_hash, bool is_ordered):
-theta_sketch_alloc<A>(is_empty, theta),
-keys_(std::move(keys)),
+compact_theta_sketch_alloc<A>::compact_theta_sketch_alloc(const Base& other, bool ordered):
+is_empty_(other.is_empty()),
+is_ordered_(other.is_ordered() || ordered),
+seed_hash_(other.get_seed_hash()),
+theta_(other.get_theta64()),
+entries_(other.get_allocator())
+{
+ entries_.reserve(other.get_num_retained());
+ std::copy(other.begin(), other.end(), std::back_inserter(entries_));
+ if (ordered && !other.is_ordered()) std::sort(entries_.begin(), entries_.end());
+}
+
+template<typename A>
+compact_theta_sketch_alloc<A>::compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta,
+ std::vector<uint64_t, A>&& entries):
+is_empty_(is_empty),
+is_ordered_(is_ordered),
seed_hash_(seed_hash),
-is_ordered_(is_ordered)
+theta_(theta),
+entries_(std::move(entries))
{}
template<typename A>
-compact_theta_sketch_alloc<A>::compact_theta_sketch_alloc(const theta_sketch_alloc<A>& other, bool ordered):
-theta_sketch_alloc<A>(other),
-keys_(other.get_num_retained()),
-seed_hash_(other.get_seed_hash()),
-is_ordered_(other.is_ordered() || ordered)
-{
- std::copy(other.begin(), other.end(), keys_.begin());
- if (ordered && !other.is_ordered()) std::sort(keys_.begin(), keys_.end());
+A compact_theta_sketch_alloc<A>::get_allocator() const {
+ return entries_.get_allocator();
}
template<typename A>
-uint32_t compact_theta_sketch_alloc<A>::get_num_retained() const {
- return keys_.size();
-}
-
-template<typename A>
-uint16_t compact_theta_sketch_alloc<A>::get_seed_hash() const {
- return seed_hash_;
+bool compact_theta_sketch_alloc<A>::is_empty() const {
+ return is_empty_;
}
template<typename A>
@@ -645,153 +283,163 @@
}
template<typename A>
-string<A> compact_theta_sketch_alloc<A>::to_string(bool print_items) const {
- std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
- os << "### Compact Theta sketch summary:" << std::endl;
- os << " num retained keys : " << keys_.size() << std::endl;
- os << " seed hash : " << this->get_seed_hash() << std::endl;
- os << " empty? : " << (this->is_empty() ? "true" : "false") << std::endl;
- os << " ordered? : " << (this->is_ordered() ? "true" : "false") << std::endl;
- os << " estimation mode? : " << (this->is_estimation_mode() ? "true" : "false") << std::endl;
- os << " theta (fraction) : " << this->get_theta() << std::endl;
- os << " theta (raw 64-bit) : " << this->theta_ << std::endl;
- os << " estimate : " << this->get_estimate() << std::endl;
- os << " lower bound 95% conf : " << this->get_lower_bound(2) << std::endl;
- os << " upper bound 95% conf : " << this->get_upper_bound(2) << std::endl;
- os << "### End sketch summary" << std::endl;
- if (print_items) {
- os << "### Retained keys" << std::endl;
- for (auto key: *this) os << " " << key << std::endl;
- os << "### End retained keys" << std::endl;
- }
- return os.str();
+uint64_t compact_theta_sketch_alloc<A>::get_theta64() const {
+ return theta_;
}
template<typename A>
+uint32_t compact_theta_sketch_alloc<A>::get_num_retained() const {
+ return entries_.size();
+}
+
+template<typename A>
+uint16_t compact_theta_sketch_alloc<A>::get_seed_hash() const {
+ return seed_hash_;
+}
+
+template<typename A>
+auto compact_theta_sketch_alloc<A>::begin() -> iterator {
+ return iterator(entries_.data(), entries_.size(), 0);
+}
+
+template<typename A>
+auto compact_theta_sketch_alloc<A>::end() -> iterator {
+ return iterator(nullptr, 0, entries_.size());
+}
+
+template<typename A>
+auto compact_theta_sketch_alloc<A>::begin() const -> const_iterator {
+ return const_iterator(entries_.data(), entries_.size(), 0);
+}
+
+template<typename A>
+auto compact_theta_sketch_alloc<A>::end() const -> const_iterator {
+ return const_iterator(nullptr, 0, entries_.size());
+}
+
+template<typename A>
+void compact_theta_sketch_alloc<A>::print_specifics(ostrstream&) const {}
+
+template<typename A>
void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
- const bool is_single_item = keys_.size() == 1 && !this->is_estimation_mode();
+ const bool is_single_item = entries_.size() == 1 && !this->is_estimation_mode();
const uint8_t preamble_longs = this->is_empty() || is_single_item ? 1 : this->is_estimation_mode() ? 3 : 2;
os.write(reinterpret_cast<const char*>(&preamble_longs), sizeof(preamble_longs));
- const uint8_t serial_version = theta_sketch_alloc<A>::SERIAL_VERSION;
+ const uint8_t serial_version = SERIAL_VERSION;
os.write(reinterpret_cast<const char*>(&serial_version), sizeof(serial_version));
const uint8_t type = SKETCH_TYPE;
os.write(reinterpret_cast<const char*>(&type), sizeof(type));
const uint16_t unused16 = 0;
os.write(reinterpret_cast<const char*>(&unused16), sizeof(unused16));
const uint8_t flags_byte(
- (1 << theta_sketch_alloc<A>::flags::IS_COMPACT) |
- (1 << theta_sketch_alloc<A>::flags::IS_READ_ONLY) |
- (this->is_empty() ? 1 << theta_sketch_alloc<A>::flags::IS_EMPTY : 0) |
- (this->is_ordered() ? 1 << theta_sketch_alloc<A>::flags::IS_ORDERED : 0)
+ (1 << flags::IS_COMPACT) |
+ (1 << flags::IS_READ_ONLY) |
+ (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
+ (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
);
os.write(reinterpret_cast<const char*>(&flags_byte), sizeof(flags_byte));
const uint16_t seed_hash = get_seed_hash();
- os.write((char*)&seed_hash, sizeof(seed_hash));
+ os.write(reinterpret_cast<const char*>(&seed_hash), sizeof(seed_hash));
if (!this->is_empty()) {
if (!is_single_item) {
- const uint32_t num_keys = keys_.size();
- os.write((char*)&num_keys, sizeof(num_keys));
+ const uint32_t num_entries = entries_.size();
+ os.write(reinterpret_cast<const char*>(&num_entries), sizeof(num_entries));
const uint32_t unused32 = 0;
- os.write((char*)&unused32, sizeof(unused32));
+ os.write(reinterpret_cast<const char*>(&unused32), sizeof(unused32));
if (this->is_estimation_mode()) {
- os.write((char*)&(this->theta_), sizeof(uint64_t));
+ os.write(reinterpret_cast<const char*>(&(this->theta_)), sizeof(uint64_t));
}
}
- os.write((char*)keys_.data(), sizeof(uint64_t) * keys_.size());
+ os.write(reinterpret_cast<const char*>(entries_.data()), entries_.size() * sizeof(uint64_t));
}
}
template<typename A>
-vector_u8<A> compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const {
- const bool is_single_item = keys_.size() == 1 && !this->is_estimation_mode();
+auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const -> vector_bytes {
+ const bool is_single_item = entries_.size() == 1 && !this->is_estimation_mode();
const uint8_t preamble_longs = this->is_empty() || is_single_item ? 1 : this->is_estimation_mode() ? 3 : 2;
- const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs + sizeof(uint64_t) * keys_.size();
- vector_u8<A> bytes(size);
+ const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs
+ + sizeof(uint64_t) * entries_.size();
+ vector_bytes bytes(size, 0, entries_.get_allocator());
uint8_t* ptr = bytes.data() + header_size_bytes;
ptr += copy_to_mem(&preamble_longs, ptr, sizeof(preamble_longs));
- const uint8_t serial_version = theta_sketch_alloc<A>::SERIAL_VERSION;
+ const uint8_t serial_version = SERIAL_VERSION;
ptr += copy_to_mem(&serial_version, ptr, sizeof(serial_version));
const uint8_t type = SKETCH_TYPE;
ptr += copy_to_mem(&type, ptr, sizeof(type));
const uint16_t unused16 = 0;
ptr += copy_to_mem(&unused16, ptr, sizeof(unused16));
const uint8_t flags_byte(
- (1 << theta_sketch_alloc<A>::flags::IS_COMPACT) |
- (1 << theta_sketch_alloc<A>::flags::IS_READ_ONLY) |
- (this->is_empty() ? 1 << theta_sketch_alloc<A>::flags::IS_EMPTY : 0) |
- (this->is_ordered() ? 1 << theta_sketch_alloc<A>::flags::IS_ORDERED : 0)
+ (1 << flags::IS_COMPACT) |
+ (1 << flags::IS_READ_ONLY) |
+ (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
+ (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
);
ptr += copy_to_mem(&flags_byte, ptr, sizeof(flags_byte));
const uint16_t seed_hash = get_seed_hash();
ptr += copy_to_mem(&seed_hash, ptr, sizeof(seed_hash));
if (!this->is_empty()) {
if (!is_single_item) {
- const uint32_t num_keys = keys_.size();
- ptr += copy_to_mem(&num_keys, ptr, sizeof(num_keys));
+ const uint32_t num_entries = entries_.size();
+ ptr += copy_to_mem(&num_entries, ptr, sizeof(num_entries));
const uint32_t unused32 = 0;
ptr += copy_to_mem(&unused32, ptr, sizeof(unused32));
if (this->is_estimation_mode()) {
- ptr += copy_to_mem(&(this->theta_), ptr, sizeof(uint64_t));
+ ptr += copy_to_mem(&theta_, ptr, sizeof(uint64_t));
}
}
- ptr += copy_to_mem(keys_.data(), ptr, sizeof(uint64_t) * keys_.size());
+ ptr += copy_to_mem(entries_.data(), ptr, entries_.size() * sizeof(uint64_t));
}
-
return bytes;
}
template<typename A>
-compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed) {
+compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed, const A& allocator) {
uint8_t preamble_longs;
- is.read((char*)&preamble_longs, sizeof(preamble_longs));
+ is.read(reinterpret_cast<char*>(&preamble_longs), sizeof(preamble_longs));
uint8_t serial_version;
- is.read((char*)&serial_version, sizeof(serial_version));
+ is.read(reinterpret_cast<char*>(&serial_version), sizeof(serial_version));
uint8_t type;
- is.read((char*)&type, sizeof(type));
+ is.read(reinterpret_cast<char*>(&type), sizeof(type));
uint16_t unused16;
- is.read((char*)&unused16, sizeof(unused16));
+ is.read(reinterpret_cast<char*>(&unused16), sizeof(unused16));
uint8_t flags_byte;
- is.read((char*)&flags_byte, sizeof(flags_byte));
+ is.read(reinterpret_cast<char*>(&flags_byte), sizeof(flags_byte));
uint16_t seed_hash;
- is.read((char*)&seed_hash, sizeof(seed_hash));
- theta_sketch_alloc<A>::check_sketch_type(type, SKETCH_TYPE);
- theta_sketch_alloc<A>::check_serial_version(serial_version, theta_sketch_alloc<A>::SERIAL_VERSION);
- const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
- if (!is_empty) theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
- return internal_deserialize(is, preamble_longs, flags_byte, seed_hash);
-}
+ is.read(reinterpret_cast<char*>(&seed_hash), sizeof(seed_hash));
+ checker<true>::check_sketch_type(type, SKETCH_TYPE);
+ checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
+ const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
+ if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
-template<typename A>
-compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::internal_deserialize(std::istream& is, uint8_t preamble_longs, uint8_t flags_byte, uint16_t seed_hash) {
- uint64_t theta = theta_sketch_alloc<A>::MAX_THETA;
- uint32_t num_keys = 0;
-
- const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
+ uint64_t theta = theta_constants::MAX_THETA;
+ uint32_t num_entries = 0;
if (!is_empty) {
if (preamble_longs == 1) {
- num_keys = 1;
+ num_entries = 1;
} else {
- is.read((char*)&num_keys, sizeof(num_keys));
+ is.read(reinterpret_cast<char*>(&num_entries), sizeof(num_entries));
uint32_t unused32;
- is.read((char*)&unused32, sizeof(unused32));
+ is.read(reinterpret_cast<char*>(&unused32), sizeof(unused32));
if (preamble_longs > 2) {
- is.read((char*)&theta, sizeof(theta));
+ is.read(reinterpret_cast<char*>(&theta), sizeof(theta));
}
}
}
- vector_u64<A> keys(num_keys);
- if (!is_empty) is.read((char*)keys.data(), sizeof(uint64_t) * keys.size());
+ std::vector<uint64_t, A> entries(num_entries, 0, allocator);
+ if (!is_empty) is.read(reinterpret_cast<char*>(entries.data()), sizeof(uint64_t) * entries.size());
- const bool is_ordered = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_ORDERED);
- if (!is.good()) throw std::runtime_error("error reading from std::istream");
- return compact_theta_sketch_alloc<A>(is_empty, theta, std::move(keys), seed_hash, is_ordered);
+ const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
+ return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
}
template<typename A>
-compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed) {
+compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed, const A& allocator) {
ensure_minimum_memory(size, 8);
const char* ptr = static_cast<const char*>(bytes);
+ const char* base = ptr;
uint8_t preamble_longs;
ptr += copy_from_mem(ptr, &preamble_longs, sizeof(preamble_longs));
uint8_t serial_version;
@@ -804,28 +452,19 @@
ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
uint16_t seed_hash;
ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
- theta_sketch_alloc<A>::check_sketch_type(type, SKETCH_TYPE);
- theta_sketch_alloc<A>::check_serial_version(serial_version, theta_sketch_alloc<A>::SERIAL_VERSION);
- const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
- if (!is_empty) theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
- return internal_deserialize(ptr, size - (ptr - static_cast<const char*>(bytes)), preamble_longs, flags_byte, seed_hash);
-}
+ checker<true>::check_sketch_type(type, SKETCH_TYPE);
+ checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
+ const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
+ if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
-template<typename A>
-compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::internal_deserialize(const void* bytes, size_t size, uint8_t preamble_longs, uint8_t flags_byte, uint16_t seed_hash) {
- const char* ptr = static_cast<const char*>(bytes);
- const char* base = ptr;
-
- uint64_t theta = theta_sketch_alloc<A>::MAX_THETA;
- uint32_t num_keys = 0;
-
- const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
+ uint64_t theta = theta_constants::MAX_THETA;
+ uint32_t num_entries = 0;
if (!is_empty) {
if (preamble_longs == 1) {
- num_keys = 1;
+ num_entries = 1;
} else {
ensure_minimum_memory(size, 8); // read the first prelong before this method
- ptr += copy_from_mem(ptr, &num_keys, sizeof(num_keys));
+ ptr += copy_from_mem(ptr, &num_entries, sizeof(num_entries));
uint32_t unused32;
ptr += copy_from_mem(ptr, &unused32, sizeof(unused32));
if (preamble_longs > 2) {
@@ -834,106 +473,16 @@
}
}
}
- const size_t keys_size_bytes = sizeof(uint64_t) * num_keys;
- check_memory_size(ptr - base + keys_size_bytes, size);
- vector_u64<A> keys(num_keys);
- if (!is_empty) ptr += copy_from_mem(ptr, keys.data(), keys_size_bytes);
+ const size_t entries_size_bytes = sizeof(uint64_t) * num_entries;
+ check_memory_size(ptr - base + entries_size_bytes, size);
+ std::vector<uint64_t, A> entries(num_entries, 0, allocator);
+ if (!is_empty) ptr += copy_from_mem(ptr, entries.data(), entries_size_bytes);
- const bool is_ordered = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_ORDERED);
- return compact_theta_sketch_alloc<A>(is_empty, theta, std::move(keys), seed_hash, is_ordered);
-}
-
-template<typename A>
-typename theta_sketch_alloc<A>::const_iterator compact_theta_sketch_alloc<A>::begin() const {
- return typename theta_sketch_alloc<A>::const_iterator(keys_.data(), keys_.size(), 0);
-}
-
-template<typename A>
-typename theta_sketch_alloc<A>::const_iterator compact_theta_sketch_alloc<A>::end() const {
- return typename theta_sketch_alloc<A>::const_iterator(keys_.data(), keys_.size(), keys_.size());
-}
-
-// builder
-
-template<typename A>
-update_theta_sketch_alloc<A>::builder::builder():
-lg_k_(DEFAULT_LG_K), rf_(DEFAULT_RESIZE_FACTOR), p_(1), seed_(DEFAULT_SEED) {}
-
-template<typename A>
-typename update_theta_sketch_alloc<A>::builder& update_theta_sketch_alloc<A>::builder::set_lg_k(uint8_t lg_k) {
- if (lg_k < MIN_LG_K) {
- throw std::invalid_argument("lg_k must not be less than " + std::to_string(MIN_LG_K) + ": " + std::to_string(lg_k));
- }
- lg_k_ = lg_k;
- return *this;
-}
-
-template<typename A>
-typename update_theta_sketch_alloc<A>::builder& update_theta_sketch_alloc<A>::builder::set_resize_factor(resize_factor rf) {
- rf_ = rf;
- return *this;
-}
-
-template<typename A>
-typename update_theta_sketch_alloc<A>::builder& update_theta_sketch_alloc<A>::builder::set_p(float p) {
- p_ = p;
- return *this;
-}
-
-template<typename A>
-typename update_theta_sketch_alloc<A>::builder& update_theta_sketch_alloc<A>::builder::set_seed(uint64_t seed) {
- seed_ = seed;
- return *this;
-}
-
-template<typename A>
-uint8_t update_theta_sketch_alloc<A>::builder::starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf) {
- return (lg_tgt <= lg_min) ? lg_min : (lg_rf == 0) ? lg_tgt : ((lg_tgt - lg_min) % lg_rf) + lg_min;
-}
-
-template<typename A>
-update_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::builder::build() const {
- return update_theta_sketch_alloc<A>(starting_sub_multiple(lg_k_ + 1, MIN_LG_K, static_cast<uint8_t>(rf_)), lg_k_, rf_, p_, seed_);
-}
-
-// iterator
-
-template<typename A>
-theta_sketch_alloc<A>::const_iterator::const_iterator(const uint64_t* keys, uint32_t size, uint32_t index):
-keys_(keys), size_(size), index_(index) {
- while (index_ < size_ && keys_[index_] == 0) ++index_;
-}
-
-template<typename A>
-typename theta_sketch_alloc<A>::const_iterator& theta_sketch_alloc<A>::const_iterator::operator++() {
- do {
- ++index_;
- } while (index_ < size_ && keys_[index_] == 0);
- return *this;
-}
-
-template<typename A>
-typename theta_sketch_alloc<A>::const_iterator theta_sketch_alloc<A>::const_iterator::operator++(int) {
- const_iterator tmp(*this);
- operator++();
- return tmp;
-}
-
-template<typename A>
-bool theta_sketch_alloc<A>::const_iterator::operator==(const const_iterator& other) const {
- return index_ == other.index_;
-}
-
-template<typename A>
-bool theta_sketch_alloc<A>::const_iterator::operator!=(const const_iterator& other) const {
- return index_ != other.index_;
-}
-
-template<typename A>
-uint64_t theta_sketch_alloc<A>::const_iterator::operator*() const {
- return keys_[index_];
+ const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
+ return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
}
} /* namespace datasketches */
#endif
+
diff --git a/theta/include/theta_union.hpp b/theta/include/theta_union.hpp
index 6cf8ccc..74716e0 100644
--- a/theta/include/theta_union.hpp
+++ b/theta/include/theta_union.hpp
@@ -20,103 +20,69 @@
#ifndef THETA_UNION_HPP_
#define THETA_UNION_HPP_
-#include <memory>
-#include <functional>
-#include <climits>
-
+#include "serde.hpp"
#include "theta_sketch.hpp"
+#include "theta_union_base.hpp"
namespace datasketches {
-/*
- * author Alexander Saydakov
- * author Lee Rhodes
- * author Kevin Lang
- */
-
-template<typename A>
+template<typename Allocator = std::allocator<uint64_t>>
class theta_union_alloc {
public:
- class builder;
+ using Entry = uint64_t;
+ using ExtractKey = trivial_extract_key;
+ using Sketch = theta_sketch_alloc<Allocator>;
+ using CompactSketch = compact_theta_sketch_alloc<Allocator>;
+ using resize_factor = theta_constants::resize_factor;
+
+ struct pass_through_policy {
+ uint64_t operator()(uint64_t internal_entry, uint64_t incoming_entry) const {
+ unused(incoming_entry);
+ return internal_entry;
+ }
+ };
+ using State = theta_union_base<Entry, ExtractKey, pass_through_policy, Sketch, CompactSketch, Allocator>;
// No constructor here. Use builder instead.
+ class builder;
/**
* This method is to update the union with a given sketch
* @param sketch to update the union with
*/
- void update(const theta_sketch_alloc<A>& sketch);
+ void update(const Sketch& sketch);
/**
* This method produces a copy of the current state of the union as a compact sketch.
* @param ordered optional flag to specify if ordered sketch should be produced
* @return the result of the union
*/
- compact_theta_sketch_alloc<A> get_result(bool ordered = true) const;
+ CompactSketch get_result(bool ordered = true) const;
private:
- bool is_empty_;
- uint64_t theta_;
- update_theta_sketch_alloc<A> state_;
+ State state_;
// for builder
- theta_union_alloc(uint64_t theta, update_theta_sketch_alloc<A>&& state);
+ theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const Allocator& allocator);
};
-// builder
-
template<typename A>
-class theta_union_alloc<A>::builder {
+class theta_union_alloc<A>::builder: public theta_base_builder<builder, A> {
public:
- typedef typename update_theta_sketch_alloc<A>::resize_factor resize_factor;
-
- /**
- * Set log2(k), where k is a nominal number of entries in the sketch
- * @param lg_k base 2 logarithm of nominal number of entries
- * @return this builder
- */
- builder& set_lg_k(uint8_t lg_k);
-
- /**
- * Set resize factor for the internal hash table (defaults to 8)
- * @param rf resize factor
- * @return this builder
- */
- builder& set_resize_factor(resize_factor rf);
-
- /**
- * Set sampling probability (initial theta). The default is 1, so the sketch retains
- * all entries until it reaches the limit, at which point it goes into the estimation mode
- * and reduces the effective sampling probability (theta) as necessary.
- * @param p sampling probability
- * @return this builder
- */
- builder& set_p(float p);
-
- /**
- * Set the seed for the hash function. Should be used carefully if needed.
- * Sketches produced with different seed are not compatible
- * and cannot be mixed in set operations.
- * @param seed hash seed
- * @return this builder
- */
- builder& set_seed(uint64_t seed);
+ builder(const A& allocator = A());
/**
* This is to create an instance of the union with predefined parameters.
- * @return and instance of the union
+ * @return an instance of the union
*/
theta_union_alloc<A> build() const;
-
-private:
- typename update_theta_sketch_alloc<A>::builder sketch_builder;
};
// alias with default allocator for convenience
-typedef theta_union_alloc<std::allocator<void>> theta_union;
+using theta_union = theta_union_alloc<std::allocator<uint64_t>>;
} /* namespace datasketches */
#include "theta_union_impl.hpp"
-# endif
+#endif
diff --git a/tuple/include/theta_union_base.hpp b/theta/include/theta_union_base.hpp
similarity index 97%
rename from tuple/include/theta_union_base.hpp
rename to theta/include/theta_union_base.hpp
index 3072630..d41f5bd 100644
--- a/tuple/include/theta_union_base.hpp
+++ b/theta/include/theta_union_base.hpp
@@ -30,7 +30,7 @@
typename Policy,
typename Sketch,
typename CompactSketch,
- typename Allocator = std::allocator<Entry>
+ typename Allocator
>
class theta_union_base {
public:
diff --git a/tuple/include/theta_union_base_impl.hpp b/theta/include/theta_union_base_impl.hpp
similarity index 97%
rename from tuple/include/theta_union_base_impl.hpp
rename to theta/include/theta_union_base_impl.hpp
index a86ba3e..ec8ce56 100644
--- a/tuple/include/theta_union_base_impl.hpp
+++ b/theta/include/theta_union_base_impl.hpp
@@ -17,6 +17,9 @@
* under the License.
*/
+#ifndef THETA_UNION_BASE_IMPL_HPP_
+#define THETA_UNION_BASE_IMPL_HPP_
+
#include <algorithm>
#include "conditional_forward.hpp"
@@ -82,3 +85,5 @@
}
} /* namespace datasketches */
+
+#endif
diff --git a/theta/include/theta_union_impl.hpp b/theta/include/theta_union_impl.hpp
index 4d8ebaa..88de353 100644
--- a/theta/include/theta_union_impl.hpp
+++ b/theta/include/theta_union_impl.hpp
@@ -22,86 +22,29 @@
namespace datasketches {
-/*
- * author Alexander Saydakov
- * author Lee Rhodes
- * author Kevin Lang
- */
+template<typename A>
+theta_union_alloc<A>::theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const A& allocator):
+state_(lg_cur_size, lg_nom_size, rf, theta, seed, pass_through_policy(), allocator)
+{}
template<typename A>
-theta_union_alloc<A>::theta_union_alloc(uint64_t theta, update_theta_sketch_alloc<A>&& state):
-is_empty_(true), theta_(theta), state_(std::move(state)) {}
-
-template<typename A>
-void theta_union_alloc<A>::update(const theta_sketch_alloc<A>& sketch) {
- if (sketch.is_empty()) return;
- if (sketch.get_seed_hash() != state_.get_seed_hash()) throw std::invalid_argument("seed hash mismatch");
- is_empty_ = false;
- if (sketch.get_theta64() < theta_) theta_ = sketch.get_theta64();
- if (sketch.is_ordered()) {
- for (auto hash: sketch) {
- if (hash >= theta_) break; // early stop
- state_.internal_update(hash);
- }
- } else {
- for (auto hash: sketch) if (hash < theta_) state_.internal_update(hash);
- }
- if (state_.get_theta64() < theta_) theta_ = state_.get_theta64();
+void theta_union_alloc<A>::update(const Sketch& sketch) {
+ state_.update(sketch);
}
template<typename A>
-compact_theta_sketch_alloc<A> theta_union_alloc<A>::get_result(bool ordered) const {
- if (is_empty_) return state_.compact(ordered);
- const uint32_t nom_num_keys = 1 << state_.lg_nom_size_;
- if (theta_ >= state_.theta_ && state_.get_num_retained() <= nom_num_keys) return state_.compact(ordered);
- uint64_t theta = std::min(theta_, state_.get_theta64());
- vector_u64<A> keys(state_.get_num_retained());
- uint32_t num_keys = 0;
- for (auto key: state_) {
- if (key < theta) keys[num_keys++] = key;
- }
- if (num_keys > nom_num_keys) {
- std::nth_element(keys.begin(), keys.begin() + nom_num_keys, keys.begin() + num_keys);
- theta = keys[nom_num_keys];
- num_keys = nom_num_keys;
- }
- if (num_keys != state_.get_num_retained()) {
- keys.resize(num_keys);
- }
- if (ordered) std::sort(keys.begin(), keys.end());
- return compact_theta_sketch_alloc<A>(false, theta, std::move(keys), state_.get_seed_hash(), ordered);
-}
-
-// builder
-
-template<typename A>
-typename theta_union_alloc<A>::builder& theta_union_alloc<A>::builder::set_lg_k(uint8_t lg_k) {
- sketch_builder.set_lg_k(lg_k);
- return *this;
+auto theta_union_alloc<A>::get_result(bool ordered) const -> CompactSketch {
+ return state_.get_result(ordered);
}
template<typename A>
-typename theta_union_alloc<A>::builder& theta_union_alloc<A>::builder::set_resize_factor(resize_factor rf) {
- sketch_builder.set_resize_factor(rf);
- return *this;
-}
+theta_union_alloc<A>::builder::builder(const A& allocator): theta_base_builder<builder, A>(allocator) {}
template<typename A>
-typename theta_union_alloc<A>::builder& theta_union_alloc<A>::builder::set_p(float p) {
- sketch_builder.set_p(p);
- return *this;
-}
-
-template<typename A>
-typename theta_union_alloc<A>::builder& theta_union_alloc<A>::builder::set_seed(uint64_t seed) {
- sketch_builder.set_seed(seed);
- return *this;
-}
-
-template<typename A>
-theta_union_alloc<A> theta_union_alloc<A>::builder::build() const {
- update_theta_sketch_alloc<A> sketch = sketch_builder.build();
- return theta_union_alloc(sketch.get_theta64(), std::move(sketch));
+auto theta_union_alloc<A>::builder::build() const -> theta_union_alloc {
+ return theta_union_alloc(
+ this->starting_sub_multiple(this->lg_k_ + 1, this->MIN_LG_K, static_cast<uint8_t>(this->rf_)),
+ this->lg_k_, this->rf_, this->starting_theta(), this->seed_, this->allocator_);
}
} /* namespace datasketches */
diff --git a/tuple/include/theta_update_sketch_base.hpp b/theta/include/theta_update_sketch_base.hpp
similarity index 95%
rename from tuple/include/theta_update_sketch_base.hpp
rename to theta/include/theta_update_sketch_base.hpp
index 425a8bd..337f68f 100644
--- a/tuple/include/theta_update_sketch_base.hpp
+++ b/theta/include/theta_update_sketch_base.hpp
@@ -34,7 +34,7 @@
template<
typename Entry,
typename ExtractKey,
- typename Allocator = std::allocator<Entry>
+ typename Allocator
>
struct theta_update_sketch_base {
using resize_factor = theta_constants::resize_factor;
@@ -147,7 +147,7 @@
static uint8_t starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf);
};
-// key extractors
+// key extractor
struct trivial_extract_key {
template<typename T>
@@ -156,17 +156,7 @@
}
};
-template<typename K, typename V>
-struct pair_extract_key {
- K& operator()(std::pair<K, V>& entry) const {
- return entry.first;
- }
- const K& operator()(const std::pair<K, V>& entry) const {
- return entry.first;
- }
-};
-
-// not zero
+// key not zero
template<typename Entry, typename ExtractKey>
class key_not_zero {
diff --git a/tuple/include/theta_update_sketch_base_impl.hpp b/theta/include/theta_update_sketch_base_impl.hpp
similarity index 98%
rename from tuple/include/theta_update_sketch_base_impl.hpp
rename to theta/include/theta_update_sketch_base_impl.hpp
index bbf845b..a343c78 100644
--- a/tuple/include/theta_update_sketch_base_impl.hpp
+++ b/theta/include/theta_update_sketch_base_impl.hpp
@@ -17,6 +17,9 @@
* under the License.
*/
+#ifndef THETA_UPDATE_SKETCH_BASE_IMPL_HPP_
+#define THETA_UPDATE_SKETCH_BASE_IMPL_HPP_
+
#include <iostream>
#include <sstream>
#include <algorithm>
@@ -387,3 +390,5 @@
}
} /* namespace datasketches */
+
+#endif
diff --git a/theta/test/CMakeLists.txt b/theta/test/CMakeLists.txt
index c7d3a5d..7df65c4 100644
--- a/theta/test/CMakeLists.txt
+++ b/theta/test/CMakeLists.txt
@@ -42,4 +42,5 @@
theta_union_test.cpp
theta_intersection_test.cpp
theta_a_not_b_test.cpp
+ theta_jaccard_similarity_test.cpp
)
diff --git a/tuple/test/theta_jaccard_similarity_test.cpp b/theta/test/theta_jaccard_similarity_test.cpp
similarity index 97%
rename from tuple/test/theta_jaccard_similarity_test.cpp
rename to theta/test/theta_jaccard_similarity_test.cpp
index fda1a6d..9354d1c 100644
--- a/tuple/test/theta_jaccard_similarity_test.cpp
+++ b/theta/test/theta_jaccard_similarity_test.cpp
@@ -20,12 +20,11 @@
#include <iostream>
#include <catch.hpp>
-#include <jaccard_similarity.hpp>
+
+#include "theta_jaccard_similarity.hpp"
namespace datasketches {
-using update_theta_sketch = update_theta_sketch_experimental<>;
-
TEST_CASE("theta jaccard: empty", "[theta_sketch]") {
auto sk_a = update_theta_sketch::builder().build();
auto sk_b = update_theta_sketch::builder().build();
diff --git a/theta/test/theta_sketch_test.cpp b/theta/test/theta_sketch_test.cpp
index 6fc4ac7..f817a3e 100644
--- a/theta/test/theta_sketch_test.cpp
+++ b/theta/test/theta_sketch_test.cpp
@@ -17,10 +17,10 @@
* under the License.
*/
-#include <catch.hpp>
#include <fstream>
#include <sstream>
+#include <catch.hpp>
#include <theta_sketch.hpp>
namespace datasketches {
@@ -134,75 +134,7 @@
REQUIRE(compact_sketch.get_upper_bound(1) > n);
}
-TEST_CASE("theta sketch: deserialize update empty from java as base", "[theta_sketch]") {
- std::ifstream is;
- is.exceptions(std::ios::failbit | std::ios::badbit);
- is.open(inputPath + "theta_update_empty_from_java.sk", std::ios::binary);
- auto sketchptr = theta_sketch::deserialize(is);
- REQUIRE(sketchptr->is_empty());
- REQUIRE_FALSE(sketchptr->is_estimation_mode());
- REQUIRE(sketchptr->get_num_retained() == 0);
- REQUIRE(sketchptr->get_theta() == 1.0);
- REQUIRE(sketchptr->get_estimate() == 0.0);
- REQUIRE(sketchptr->get_lower_bound(1) == 0.0);
- REQUIRE(sketchptr->get_upper_bound(1) == 0.0);
-}
-
-TEST_CASE("theta sketch: deserialize update empty from java as subclass", "[theta_sketch]") {
- std::ifstream is;
- is.exceptions(std::ios::failbit | std::ios::badbit);
- is.open(inputPath + "theta_update_empty_from_java.sk", std::ios::binary);
- auto sketch = update_theta_sketch::deserialize(is);
- REQUIRE(sketch.is_empty());
- REQUIRE_FALSE(sketch.is_estimation_mode());
- REQUIRE(sketch.get_num_retained() == 0);
- REQUIRE(sketch.get_theta() == 1.0);
- REQUIRE(sketch.get_estimate() == 0.0);
- REQUIRE(sketch.get_lower_bound(1) == 0.0);
- REQUIRE(sketch.get_upper_bound(1) == 0.0);
-}
-
-TEST_CASE("theta sketch: deserialize update estimation from java as base", "[theta_sketch]") {
- std::ifstream is;
- is.exceptions(std::ios::failbit | std::ios::badbit);
- is.open(inputPath + "theta_update_estimation_from_java.sk", std::ios::binary);
- auto sketchptr = theta_sketch::deserialize(is);
- REQUIRE_FALSE(sketchptr->is_empty());
- REQUIRE(sketchptr->is_estimation_mode());
- REQUIRE(sketchptr->get_num_retained() == 5324);
- REQUIRE(sketchptr->get_estimate() == Approx(10000.0).margin(10000 * 0.01));
- REQUIRE(sketchptr->get_lower_bound(1) < 10000);
- REQUIRE(sketchptr->get_upper_bound(1) > 10000);
-}
-
-TEST_CASE("theta sketch: deserialize update estimation from java as subclass", "[theta_sketch]") {
- std::ifstream is;
- is.exceptions(std::ios::failbit | std::ios::badbit);
- is.open(inputPath + "theta_update_estimation_from_java.sk", std::ios::binary);
- auto sketch = update_theta_sketch::deserialize(is);
- REQUIRE_FALSE(sketch.is_empty());
- REQUIRE(sketch.is_estimation_mode());
- REQUIRE(sketch.get_num_retained() == 5324);
- REQUIRE(sketch.get_estimate() == Approx(10000.0).margin(10000 * 0.01));
- REQUIRE(sketch.get_lower_bound(1) < 10000);
- REQUIRE(sketch.get_upper_bound(1) > 10000);
-}
-
-TEST_CASE("theta sketch: deserialize compact empty from java as base", "[theta_sketch]") {
- std::ifstream is;
- is.exceptions(std::ios::failbit | std::ios::badbit);
- is.open(inputPath + "theta_compact_empty_from_java.sk", std::ios::binary);
- auto sketchptr = theta_sketch::deserialize(is);
- REQUIRE(sketchptr->is_empty());
- REQUIRE_FALSE(sketchptr->is_estimation_mode());
- REQUIRE(sketchptr->get_num_retained() == 0);
- REQUIRE(sketchptr->get_theta() == 1.0);
- REQUIRE(sketchptr->get_estimate() == 0.0);
- REQUIRE(sketchptr->get_lower_bound(1) == 0.0);
- REQUIRE(sketchptr->get_upper_bound(1) == 0.0);
-}
-
-TEST_CASE("theta sketch: deserialize compact empty from java as subclass", "[theta_sketch]") {
+TEST_CASE("theta sketch: deserialize compact empty from java", "[theta_sketch]") {
std::ifstream is;
is.exceptions(std::ios::failbit | std::ios::badbit);
is.open(inputPath + "theta_compact_empty_from_java.sk", std::ios::binary);
@@ -216,21 +148,7 @@
REQUIRE(sketch.get_upper_bound(1) == 0.0);
}
-TEST_CASE("theta sketch: deserialize single item from java as base", "[theta_sketch]") {
- std::ifstream is;
- is.exceptions(std::ios::failbit | std::ios::badbit);
- is.open(inputPath + "theta_compact_single_item_from_java.sk", std::ios::binary);
- auto sketchptr = theta_sketch::deserialize(is);
- REQUIRE_FALSE(sketchptr->is_empty());
- REQUIRE_FALSE(sketchptr->is_estimation_mode());
- REQUIRE(sketchptr->get_num_retained() == 1);
- REQUIRE(sketchptr->get_theta() == 1.0);
- REQUIRE(sketchptr->get_estimate() == 1.0);
- REQUIRE(sketchptr->get_lower_bound(1) == 1.0);
- REQUIRE(sketchptr->get_upper_bound(1) == 1.0);
-}
-
-TEST_CASE("theta sketch: deserialize single item from java as subclass", "[theta_sketch]") {
+TEST_CASE("theta sketch: deserialize single item from java", "[theta_sketch]") {
std::ifstream is;
is.exceptions(std::ios::failbit | std::ios::badbit);
is.open(inputPath + "theta_compact_single_item_from_java.sk", std::ios::binary);
@@ -244,55 +162,21 @@
REQUIRE(sketch.get_upper_bound(1) == 1.0);
}
-TEST_CASE("theta sketch: deserialize compact estimation from java as base", "[theta_sketch]") {
- std::ifstream is;
- is.exceptions(std::ios::failbit | std::ios::badbit);
- is.open(inputPath + "theta_compact_estimation_from_java.sk", std::ios::binary);
- auto sketchptr = theta_sketch::deserialize(is);
- REQUIRE_FALSE(sketchptr->is_empty());
- REQUIRE(sketchptr->is_estimation_mode());
- REQUIRE(sketchptr->is_ordered());
- REQUIRE(sketchptr->get_num_retained() == 4342);
- REQUIRE(sketchptr->get_theta() == Approx(0.531700444213199).margin(1e-10));
- REQUIRE(sketchptr->get_estimate() == Approx(8166.25234614053).margin(1e-10));
- REQUIRE(sketchptr->get_lower_bound(2) == Approx(7996.956955317471).margin(1e-10));
- REQUIRE(sketchptr->get_upper_bound(2) == Approx(8339.090301078124).margin(1e-10));
-
- // the same construction process in Java must have produced exactly the same sketch
- update_theta_sketch update_sketch = update_theta_sketch::builder().build();
- const int n = 8192;
- for (int i = 0; i < n; i++) update_sketch.update(i);
- REQUIRE(sketchptr->get_num_retained() == update_sketch.get_num_retained());
- REQUIRE(sketchptr->get_theta() == Approx(update_sketch.get_theta()).margin(1e-10));
- REQUIRE(sketchptr->get_estimate() == Approx(update_sketch.get_estimate()).margin(1e-10));
- REQUIRE(sketchptr->get_lower_bound(1) == Approx(update_sketch.get_lower_bound(1)).margin(1e-10));
- REQUIRE(sketchptr->get_upper_bound(1) == Approx(update_sketch.get_upper_bound(1)).margin(1e-10));
- REQUIRE(sketchptr->get_lower_bound(2) == Approx(update_sketch.get_lower_bound(2)).margin(1e-10));
- REQUIRE(sketchptr->get_upper_bound(2) == Approx(update_sketch.get_upper_bound(2)).margin(1e-10));
- REQUIRE(sketchptr->get_lower_bound(3) == Approx(update_sketch.get_lower_bound(3)).margin(1e-10));
- REQUIRE(sketchptr->get_upper_bound(3) == Approx(update_sketch.get_upper_bound(3)).margin(1e-10));
- compact_theta_sketch compact_sketch = update_sketch.compact();
- // the sketches are ordered, so the iteration sequence must match exactly
- auto iter = sketchptr->begin();
- for (auto key: compact_sketch) {
- REQUIRE(*iter == key);
- ++iter;
- }
-}
-
-TEST_CASE("theta sketch: deserialize compact estimation from java as subclass", "[theta_sketch]") {
+TEST_CASE("theta sketch: deserialize compact estimation from java", "[theta_sketch]") {
std::ifstream is;
is.exceptions(std::ios::failbit | std::ios::badbit);
is.open(inputPath + "theta_compact_estimation_from_java.sk", std::ios::binary);
auto sketch = compact_theta_sketch::deserialize(is);
REQUIRE_FALSE(sketch.is_empty());
REQUIRE(sketch.is_estimation_mode());
+ REQUIRE(sketch.is_ordered());
REQUIRE(sketch.get_num_retained() == 4342);
REQUIRE(sketch.get_theta() == Approx(0.531700444213199).margin(1e-10));
REQUIRE(sketch.get_estimate() == Approx(8166.25234614053).margin(1e-10));
REQUIRE(sketch.get_lower_bound(2) == Approx(7996.956955317471).margin(1e-10));
REQUIRE(sketch.get_upper_bound(2) == Approx(8339.090301078124).margin(1e-10));
+ // the same construction process in Java must have produced exactly the same sketch
update_theta_sketch update_sketch = update_theta_sketch::builder().build();
const int n = 8192;
for (int i = 0; i < n; i++) update_sketch.update(i);
@@ -305,132 +189,51 @@
REQUIRE(sketch.get_upper_bound(2) == Approx(update_sketch.get_upper_bound(2)).margin(1e-10));
REQUIRE(sketch.get_lower_bound(3) == Approx(update_sketch.get_lower_bound(3)).margin(1e-10));
REQUIRE(sketch.get_upper_bound(3) == Approx(update_sketch.get_upper_bound(3)).margin(1e-10));
+ compact_theta_sketch compact_sketch = update_sketch.compact();
+ // the sketches are ordered, so the iteration sequence must match exactly
+ auto iter = sketch.begin();
+ for (const auto& key: compact_sketch) {
+ REQUIRE(*iter == key);
+ ++iter;
+ }
}
-TEST_CASE("theta sketch: serialize deserialize stream and bytes equivalency", "[theta_sketch]") {
+TEST_CASE("theta sketch: serialize deserialize stream and bytes equivalence", "[theta_sketch]") {
update_theta_sketch update_sketch = update_theta_sketch::builder().build();
const int n = 8192;
for (int i = 0; i < n; i++) update_sketch.update(i);
- // update sketch stream and bytes comparison
- {
- std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
- update_sketch.serialize(s);
- auto bytes = update_sketch.serialize();
- REQUIRE(bytes.size() == static_cast<size_t>(s.tellp()));
- for (size_t i = 0; i < bytes.size(); ++i) {
- REQUIRE(((char*)bytes.data())[i] == (char)s.get());
- }
-
- // deserialize as base class
- {
- s.seekg(0); // rewind
- auto deserialized_sketch_ptr1 = theta_sketch::deserialize(s);
- auto deserialized_sketch_ptr2 = theta_sketch::deserialize(bytes.data(), bytes.size());
- REQUIRE(bytes.size() == static_cast<size_t>(s.tellg()));
- REQUIRE(deserialized_sketch_ptr2->is_empty() == deserialized_sketch_ptr1->is_empty());
- REQUIRE(deserialized_sketch_ptr2->is_ordered() == deserialized_sketch_ptr1->is_ordered());
- REQUIRE(deserialized_sketch_ptr2->get_num_retained() == deserialized_sketch_ptr1->get_num_retained());
- REQUIRE(deserialized_sketch_ptr2->get_theta() == deserialized_sketch_ptr1->get_theta());
- REQUIRE(deserialized_sketch_ptr2->get_estimate() == deserialized_sketch_ptr1->get_estimate());
- REQUIRE(deserialized_sketch_ptr2->get_lower_bound(1) == deserialized_sketch_ptr1->get_lower_bound(1));
- REQUIRE(deserialized_sketch_ptr2->get_upper_bound(1) == deserialized_sketch_ptr1->get_upper_bound(1));
- // hash tables must be identical since they are restored from dumps, and iteration is deterministic
- auto iter = deserialized_sketch_ptr1->begin();
- for (auto key: *deserialized_sketch_ptr2) {
- REQUIRE(*iter == key);
- ++iter;
- }
- }
-
- // deserialize as subclass
- {
- s.seekg(0); // rewind
- update_theta_sketch deserialized_sketch1 = update_theta_sketch::deserialize(s);
- update_theta_sketch deserialized_sketch2 = update_theta_sketch::deserialize(bytes.data(), bytes.size());
- REQUIRE(bytes.size() == static_cast<size_t>(s.tellg()));
- REQUIRE(deserialized_sketch2.is_empty() == deserialized_sketch1.is_empty());
- REQUIRE(deserialized_sketch2.is_ordered() == deserialized_sketch1.is_ordered());
- REQUIRE(deserialized_sketch2.get_num_retained() == deserialized_sketch1.get_num_retained());
- REQUIRE(deserialized_sketch2.get_theta() == deserialized_sketch1.get_theta());
- REQUIRE(deserialized_sketch2.get_estimate() == deserialized_sketch1.get_estimate());
- REQUIRE(deserialized_sketch2.get_lower_bound(1) == deserialized_sketch1.get_lower_bound(1));
- REQUIRE(deserialized_sketch2.get_upper_bound(1) == deserialized_sketch1.get_upper_bound(1));
- // hash tables must be identical since they are restored from dumps, and iteration is deterministic
- auto iter = deserialized_sketch1.begin();
- for (auto key: deserialized_sketch2) {
- REQUIRE(*iter == key);
- ++iter;
- }
- }
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
+ update_sketch.compact().serialize(s);
+ auto bytes = update_sketch.compact().serialize();
+ REQUIRE(bytes.size() == static_cast<size_t>(s.tellp()));
+ for (size_t i = 0; i < bytes.size(); ++i) {
+ REQUIRE(((char*)bytes.data())[i] == (char)s.get());
}
- // compact sketch stream and bytes comparison
- {
- std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
- update_sketch.compact().serialize(s);
- auto bytes = update_sketch.compact().serialize();
- REQUIRE(bytes.size() == static_cast<size_t>(s.tellp()));
- for (size_t i = 0; i < bytes.size(); ++i) {
- REQUIRE(((char*)bytes.data())[i] == (char)s.get());
- }
-
- // deserialize as base class
- {
- s.seekg(0); // rewind
- auto deserialized_sketch_ptr1 = theta_sketch::deserialize(s);
- auto deserialized_sketch_ptr2 = theta_sketch::deserialize(bytes.data(), bytes.size());
- REQUIRE(bytes.size() == static_cast<size_t>(s.tellg()));
- REQUIRE(deserialized_sketch_ptr2->is_empty() == deserialized_sketch_ptr1->is_empty());
- REQUIRE(deserialized_sketch_ptr2->is_ordered() == deserialized_sketch_ptr1->is_ordered());
- REQUIRE(deserialized_sketch_ptr2->get_num_retained() == deserialized_sketch_ptr1->get_num_retained());
- REQUIRE(deserialized_sketch_ptr2->get_theta() == deserialized_sketch_ptr1->get_theta());
- REQUIRE(deserialized_sketch_ptr2->get_estimate() == deserialized_sketch_ptr1->get_estimate());
- REQUIRE(deserialized_sketch_ptr2->get_lower_bound(1) == deserialized_sketch_ptr1->get_lower_bound(1));
- REQUIRE(deserialized_sketch_ptr2->get_upper_bound(1) == deserialized_sketch_ptr1->get_upper_bound(1));
- // the sketches are ordered, so the iteration sequence must match exactly
- auto iter = deserialized_sketch_ptr1->begin();
- for (auto key: *deserialized_sketch_ptr2) {
- REQUIRE(*iter == key);
- ++iter;
- }
- }
-
- // deserialize as subclass
- {
- s.seekg(0); // rewind
- compact_theta_sketch deserialized_sketch1 = compact_theta_sketch::deserialize(s);
- compact_theta_sketch deserialized_sketch2 = compact_theta_sketch::deserialize(bytes.data(), bytes.size());
- REQUIRE(bytes.size() == static_cast<size_t>(s.tellg()));
- REQUIRE(deserialized_sketch2.is_empty() == deserialized_sketch1.is_empty());
- REQUIRE(deserialized_sketch2.is_ordered() == deserialized_sketch1.is_ordered());
- REQUIRE(deserialized_sketch2.get_num_retained() == deserialized_sketch1.get_num_retained());
- REQUIRE(deserialized_sketch2.get_theta() == deserialized_sketch1.get_theta());
- REQUIRE(deserialized_sketch2.get_estimate() == deserialized_sketch1.get_estimate());
- REQUIRE(deserialized_sketch2.get_lower_bound(1) == deserialized_sketch1.get_lower_bound(1));
- REQUIRE(deserialized_sketch2.get_upper_bound(1) == deserialized_sketch1.get_upper_bound(1));
- // the sketches are ordered, so the iteration sequence must match exactly
- auto iter = deserialized_sketch1.begin();
- for (auto key: deserialized_sketch2) {
- REQUIRE(*iter == key);
- ++iter;
- }
- }
+ s.seekg(0); // rewind
+ compact_theta_sketch deserialized_sketch1 = compact_theta_sketch::deserialize(s);
+ compact_theta_sketch deserialized_sketch2 = compact_theta_sketch::deserialize(bytes.data(), bytes.size());
+ REQUIRE(bytes.size() == static_cast<size_t>(s.tellg()));
+ REQUIRE(deserialized_sketch2.is_empty() == deserialized_sketch1.is_empty());
+ REQUIRE(deserialized_sketch2.is_ordered() == deserialized_sketch1.is_ordered());
+ REQUIRE(deserialized_sketch2.get_num_retained() == deserialized_sketch1.get_num_retained());
+ REQUIRE(deserialized_sketch2.get_theta() == deserialized_sketch1.get_theta());
+ REQUIRE(deserialized_sketch2.get_estimate() == deserialized_sketch1.get_estimate());
+ REQUIRE(deserialized_sketch2.get_lower_bound(1) == deserialized_sketch1.get_lower_bound(1));
+ REQUIRE(deserialized_sketch2.get_upper_bound(1) == deserialized_sketch1.get_upper_bound(1));
+ // the sketches are ordered, so the iteration sequence must match exactly
+ auto iter = deserialized_sketch1.begin();
+ for (auto key: deserialized_sketch2) {
+ REQUIRE(*iter == key);
+ ++iter;
}
}
-TEST_CASE("theta sketch: deserialize update single item buffer overrun", "[theta_sketch]") {
- update_theta_sketch update_sketch = update_theta_sketch::builder().build();
- update_sketch.update(1);
- theta_sketch::vector_bytes bytes = update_sketch.serialize();
- REQUIRE_THROWS_AS(update_theta_sketch::deserialize(bytes.data(), 7), std::out_of_range);
- REQUIRE_THROWS_AS(update_theta_sketch::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
-}
-
TEST_CASE("theta sketch: deserialize compact single item buffer overrun", "[theta_sketch]") {
update_theta_sketch update_sketch = update_theta_sketch::builder().build();
update_sketch.update(1);
- theta_sketch::vector_bytes bytes = update_sketch.compact().serialize();
+ auto bytes = update_sketch.compact().serialize();
REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), 7), std::out_of_range);
REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
}
diff --git a/tuple/CMakeLists.txt b/tuple/CMakeLists.txt
index ea03566..2c3f111 100644
--- a/tuple/CMakeLists.txt
+++ b/tuple/CMakeLists.txt
@@ -29,7 +29,7 @@
$<INSTALL_INTERFACE:$<INSTALL_PREFIX>/include>
)
-target_link_libraries(tuple INTERFACE common)
+target_link_libraries(tuple INTERFACE common theta)
target_compile_features(tuple INTERFACE cxx_std_11)
set(tuple_HEADERS "")
@@ -37,23 +37,11 @@
list(APPEND tuple_HEADERS "include/tuple_union.hpp;include/tuple_union_impl.hpp")
list(APPEND tuple_HEADERS "include/tuple_intersection.hpp;include/tuple_intersection_impl.hpp")
list(APPEND tuple_HEADERS "include/tuple_a_not_b.hpp;include/tuple_a_not_b_impl.hpp")
+list(APPEND tuple_HEADERS "include/tuple_jaccard_similarity.hpp")
list(APPEND tuple_HEADERS "include/array_of_doubles_sketch.hpp;include/array_of_doubles_sketch_impl.hpp")
list(APPEND tuple_HEADERS "include/array_of_doubles_union.hpp;include/array_of_doubles_union_impl.hpp")
list(APPEND tuple_HEADERS "include/array_of_doubles_intersection.hpp;include/array_of_doubles_intersection_impl.hpp")
list(APPEND tuple_HEADERS "include/array_of_doubles_a_not_b.hpp;include/array_of_doubles_a_not_b_impl.hpp")
-list(APPEND tuple_HEADERS "include/theta_update_sketch_base.hpp;include/theta_update_sketch_base_impl.hpp")
-list(APPEND tuple_HEADERS "include/theta_union_base.hpp;include/theta_union_base_impl.hpp")
-list(APPEND tuple_HEADERS "include/theta_intersection_base.hpp;include/theta_intersection_base_impl.hpp")
-list(APPEND tuple_HEADERS "include/theta_set_difference_base.hpp;include/theta_set_difference_base_impl.hpp")
-list(APPEND tuple_HEADERS "include/theta_sketch_experimental.hpp;include/theta_sketch_experimental_impl.hpp")
-list(APPEND tuple_HEADERS "include/theta_union_experimental.hpp;include/theta_union_experimental_impl.hpp")
-list(APPEND tuple_HEADERS "include/theta_intersection_experimental.hpp;include/theta_intersection_experimental_impl.hpp")
-list(APPEND tuple_HEADERS "include/theta_a_not_b_experimental.hpp;include/theta_a_not_b_experimental_impl.hpp")
-list(APPEND tuple_HEADERS "include/bounds_on_ratios_in_sampled_sets.hpp")
-list(APPEND tuple_HEADERS "include/bounds_on_ratios_in_theta_sketched_sets.hpp")
-list(APPEND tuple_HEADERS "include/jaccard_similarity.hpp")
-list(APPEND tuple_HEADERS "include/theta_comparators.hpp")
-list(APPEND tuple_HEADERS "include/theta_constants.hpp")
install(TARGETS tuple
EXPORT ${PROJECT_NAME}
@@ -72,6 +60,7 @@
${CMAKE_CURRENT_SOURCE_DIR}/include/tuple_intersection_impl.hpp
${CMAKE_CURRENT_SOURCE_DIR}/include/tuple_a_not_b.hpp
${CMAKE_CURRENT_SOURCE_DIR}/include/tuple_a_not_b_impl.hpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/tuple_jaccard_similarity.hpp
${CMAKE_CURRENT_SOURCE_DIR}/include/array_of_doubles_sketch.hpp
${CMAKE_CURRENT_SOURCE_DIR}/include/array_of_doubles_sketch_impl.hpp
${CMAKE_CURRENT_SOURCE_DIR}/include/array_of_doubles_union.hpp
@@ -80,25 +69,4 @@
${CMAKE_CURRENT_SOURCE_DIR}/include/array_of_doubles_intersection_impl.hpp
${CMAKE_CURRENT_SOURCE_DIR}/include/array_of_doubles_a_not_b.hpp
${CMAKE_CURRENT_SOURCE_DIR}/include/array_of_doubles_a_not_b_impl.hpp
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_update_sketch_base.hpp
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_update_sketch_base_impl.hpp
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_base.hpp
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_base_impl.hpp
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_base.hpp
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_base_impl.hpp
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_set_difference_base.hpp
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_set_difference_base_impl.hpp
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_sketch_experimental.hpp
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_sketch_experimental_impl.hpp
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_experimental.hpp
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_experimental_impl.hpp
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_experimental.hpp
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_experimental_impl.hpp
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_a_not_b_experimental.hpp
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_a_not_b_experimental_impl.hpp
- ${CMAKE_CURRENT_SOURCE_DIR}/include/bounds_on_ratios_in_sampled_sets.hpp
- ${CMAKE_CURRENT_SOURCE_DIR}/include/bounds_on_ratios_in_theta_sketched_sets.hpp
- ${CMAKE_CURRENT_SOURCE_DIR}/include/jaccard_similarity.hpp
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_comparators.hpp
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_constants.hpp
)
diff --git a/tuple/include/theta_a_not_b_experimental.hpp b/tuple/include/theta_a_not_b_experimental.hpp
deleted file mode 100644
index ba35dc7..0000000
--- a/tuple/include/theta_a_not_b_experimental.hpp
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#ifndef THETA_A_NOT_B_EXPERIMENTAL_HPP_
-#define THETA_A_NOT_B_EXPERIMENTAL_HPP_
-
-#include "theta_sketch_experimental.hpp"
-#include "theta_set_difference_base.hpp"
-
-namespace datasketches {
-
-template<typename Allocator = std::allocator<uint64_t>>
-class theta_a_not_b_experimental {
-public:
- using Entry = uint64_t;
- using ExtractKey = trivial_extract_key;
- using CompactSketch = compact_theta_sketch_experimental<Allocator>;
- using State = theta_set_difference_base<Entry, ExtractKey, CompactSketch, Allocator>;
-
- explicit theta_a_not_b_experimental(uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
-
- /**
- * Computes the a-not-b set operation given two sketches.
- * @return the result of a-not-b
- */
- template<typename FwdSketch, typename Sketch>
- CompactSketch compute(FwdSketch&& a, const Sketch& b, bool ordered = true) const;
-
-private:
- State state_;
-};
-
-} /* namespace datasketches */
-
-#include "theta_a_not_b_experimental_impl.hpp"
-
-#endif
diff --git a/tuple/include/theta_a_not_b_experimental_impl.hpp b/tuple/include/theta_a_not_b_experimental_impl.hpp
deleted file mode 100644
index 1fc321b..0000000
--- a/tuple/include/theta_a_not_b_experimental_impl.hpp
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-namespace datasketches {
-
-template<typename A>
-theta_a_not_b_experimental<A>::theta_a_not_b_experimental(uint64_t seed, const A& allocator):
-state_(seed, allocator)
-{}
-
-template<typename A>
-template<typename FwdSketch, typename Sketch>
-auto theta_a_not_b_experimental<A>::compute(FwdSketch&& a, const Sketch& b, bool ordered) const -> CompactSketch {
- return state_.compute(std::forward<FwdSketch>(a), b, ordered);
-}
-
-} /* namespace datasketches */
diff --git a/tuple/include/theta_intersection_experimental.hpp b/tuple/include/theta_intersection_experimental.hpp
deleted file mode 100644
index 293b2e9..0000000
--- a/tuple/include/theta_intersection_experimental.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#ifndef THETA_INTERSECTION_EXPERIMENTAL_HPP_
-#define THETA_INTERSECTION_EXPERIMENTAL_HPP_
-
-#include "theta_sketch_experimental.hpp"
-#include "theta_intersection_base.hpp"
-
-namespace datasketches {
-
-template<typename Allocator = std::allocator<uint64_t>>
-class theta_intersection_experimental {
-public:
- using Entry = uint64_t;
- using ExtractKey = trivial_extract_key;
- using Sketch = theta_sketch_experimental<Allocator>;
- using CompactSketch = compact_theta_sketch_experimental<Allocator>;
-
- struct pass_through_policy {
- uint64_t operator()(uint64_t internal_entry, uint64_t incoming_entry) const {
- unused(incoming_entry);
- return internal_entry;
- }
- };
- using State = theta_intersection_base<Entry, ExtractKey, pass_through_policy, Sketch, CompactSketch, Allocator>;
-
- explicit theta_intersection_experimental(uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
-
- /**
- * Updates the intersection with a given sketch.
- * The intersection can be viewed as starting from the "universe" set, and every update
- * can reduce the current set to leave the overlapping subset only.
- * @param sketch represents input set for the intersection
- */
- template<typename FwdSketch>
- void update(FwdSketch&& sketch);
-
- /**
- * Produces a copy of the current state of the intersection.
- * If update() was not called, the state is the infinite "universe",
- * which is considered an undefined state, and throws an exception.
- * @param ordered optional flag to specify if ordered sketch should be produced
- * @return the result of the intersection
- */
- CompactSketch get_result(bool ordered = true) const;
-
- /**
- * Returns true if the state of the intersection is defined (not infinite "universe").
- * @return true if the state is valid
- */
- bool has_result() const;
-
-private:
- State state_;
-};
-
-} /* namespace datasketches */
-
-#include "theta_intersection_experimental_impl.hpp"
-
-#endif
diff --git a/tuple/include/theta_intersection_experimental_impl.hpp b/tuple/include/theta_intersection_experimental_impl.hpp
deleted file mode 100644
index e8bcfbb..0000000
--- a/tuple/include/theta_intersection_experimental_impl.hpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-namespace datasketches {
-
-template<typename A>
-theta_intersection_experimental<A>::theta_intersection_experimental(uint64_t seed, const A& allocator):
-state_(seed, pass_through_policy(), allocator)
-{}
-
-template<typename A>
-template<typename SS>
-void theta_intersection_experimental<A>::update(SS&& sketch) {
- state_.update(std::forward<SS>(sketch));
-}
-
-template<typename A>
-auto theta_intersection_experimental<A>::get_result(bool ordered) const -> CompactSketch {
- return state_.get_result(ordered);
-}
-
-template<typename A>
-bool theta_intersection_experimental<A>::has_result() const {
- return state_.has_result();
-}
-
-} /* namespace datasketches */
diff --git a/tuple/include/theta_sketch_experimental.hpp b/tuple/include/theta_sketch_experimental.hpp
deleted file mode 100644
index 2056687..0000000
--- a/tuple/include/theta_sketch_experimental.hpp
+++ /dev/null
@@ -1,393 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#ifndef THETA_SKETCH_EXPERIMENTAL_HPP_
-#define THETA_SKETCH_EXPERIMENTAL_HPP_
-
-#include "theta_update_sketch_base.hpp"
-
-namespace datasketches {
-
-// experimental theta sketch derived from the same base as tuple sketch
-
-template<typename Allocator = std::allocator<uint64_t>>
-class theta_sketch_experimental {
-public:
- using Entry = uint64_t;
- using ExtractKey = trivial_extract_key;
- using iterator = theta_iterator<Entry, ExtractKey>;
- using const_iterator = theta_const_iterator<Entry, ExtractKey>;
-
- virtual ~theta_sketch_experimental() = default;
-
- /**
- * @return allocator
- */
- virtual Allocator get_allocator() const = 0;
-
- /**
- * @return true if this sketch represents an empty set (not the same as no retained entries!)
- */
- virtual bool is_empty() const = 0;
-
- /**
- * @return estimate of the distinct count of the input stream
- */
- double get_estimate() const;
-
- /**
- * Returns the approximate lower error bound given a number of standard deviations.
- * This parameter is similar to the number of standard deviations of the normal distribution
- * and corresponds to approximately 67%, 95% and 99% confidence intervals.
- * @param num_std_devs number of Standard Deviations (1, 2 or 3)
- * @return the lower bound
- */
- double get_lower_bound(uint8_t num_std_devs) const;
-
- /**
- * Returns the approximate upper error bound given a number of standard deviations.
- * This parameter is similar to the number of standard deviations of the normal distribution
- * and corresponds to approximately 67%, 95% and 99% confidence intervals.
- * @param num_std_devs number of Standard Deviations (1, 2 or 3)
- * @return the upper bound
- */
- double get_upper_bound(uint8_t num_std_devs) const;
-
- /**
- * @return true if the sketch is in estimation mode (as opposed to exact mode)
- */
- bool is_estimation_mode() const;
-
- /**
- * @return theta as a fraction from 0 to 1 (effective sampling rate)
- */
- double get_theta() const;
-
- /**
- * @return theta as a positive integer between 0 and LLONG_MAX
- */
- virtual uint64_t get_theta64() const = 0;
-
- /**
- * @return the number of retained entries in the sketch
- */
- virtual uint32_t get_num_retained() const = 0;
-
- /**
- * @return hash of the seed that was used to hash the input
- */
- virtual uint16_t get_seed_hash() const = 0;
-
- /**
- * @return true if retained entries are ordered
- */
- virtual bool is_ordered() const = 0;
-
- /**
- * Provides a human-readable summary of this sketch as a string
- * @param print_items if true include the list of items retained by the sketch
- * @return sketch summary as a string
- */
- virtual string<Allocator> to_string(bool print_items = false) const;
-
- /**
- * Iterator over hash values in this sketch.
- * @return begin iterator
- */
- virtual iterator begin() = 0;
-
- /**
- * Iterator pointing past the valid range.
- * Not to be incremented or dereferenced.
- * @return end iterator
- */
- virtual iterator end() = 0;
-
- /**
- * Const iterator over hash values in this sketch.
- * @return begin iterator
- */
- virtual const_iterator begin() const = 0;
-
- /**
- * Const iterator pointing past the valid range.
- * Not to be incremented or dereferenced.
- * @return end iterator
- */
- virtual const_iterator end() const = 0;
-
-protected:
- virtual void print_specifics(std::ostringstream& os) const = 0;
-};
-
-// forward declaration
-template<typename A> class compact_theta_sketch_experimental;
-
-template<typename Allocator = std::allocator<uint64_t>>
-class update_theta_sketch_experimental: public theta_sketch_experimental<Allocator> {
-public:
- using Base = theta_sketch_experimental<Allocator>;
- using Entry = typename Base::Entry;
- using ExtractKey = typename Base::ExtractKey;
- using iterator = typename Base::iterator;
- using const_iterator = typename Base::const_iterator;
- using theta_table = theta_update_sketch_base<Entry, ExtractKey, Allocator>;
- using resize_factor = typename theta_table::resize_factor;
-
- // No constructor here. Use builder instead.
- class builder;
-
- update_theta_sketch_experimental(const update_theta_sketch_experimental&) = default;
- update_theta_sketch_experimental(update_theta_sketch_experimental&&) noexcept = default;
- virtual ~update_theta_sketch_experimental() = default;
- update_theta_sketch_experimental& operator=(const update_theta_sketch_experimental&) = default;
- update_theta_sketch_experimental& operator=(update_theta_sketch_experimental&&) = default;
-
- virtual Allocator get_allocator() const;
- virtual bool is_empty() const;
- virtual bool is_ordered() const;
- virtual uint16_t get_seed_hash() const;
- virtual uint64_t get_theta64() const;
- virtual uint32_t get_num_retained() const;
-
- /**
- * @return configured nominal number of entries in the sketch
- */
- uint8_t get_lg_k() const;
-
- /**
- * @return configured resize factor of the sketch
- */
- resize_factor get_rf() const;
-
- /**
- * Update this sketch with a given string.
- * @param value string to update the sketch with
- */
- void update(const std::string& value);
-
- /**
- * Update this sketch with a given unsigned 64-bit integer.
- * @param value uint64_t to update the sketch with
- */
- void update(uint64_t value);
-
- /**
- * Update this sketch with a given signed 64-bit integer.
- * @param value int64_t to update the sketch with
- */
- void update(int64_t value);
-
- /**
- * Update this sketch with a given unsigned 32-bit integer.
- * For compatibility with Java implementation.
- * @param value uint32_t to update the sketch with
- */
- void update(uint32_t value);
-
- /**
- * Update this sketch with a given signed 32-bit integer.
- * For compatibility with Java implementation.
- * @param value int32_t to update the sketch with
- */
- void update(int32_t value);
-
- /**
- * Update this sketch with a given unsigned 16-bit integer.
- * For compatibility with Java implementation.
- * @param value uint16_t to update the sketch with
- */
- void update(uint16_t value);
-
- /**
- * Update this sketch with a given signed 16-bit integer.
- * For compatibility with Java implementation.
- * @param value int16_t to update the sketch with
- */
- void update(int16_t value);
-
- /**
- * Update this sketch with a given unsigned 8-bit integer.
- * For compatibility with Java implementation.
- * @param value uint8_t to update the sketch with
- */
- void update(uint8_t value);
-
- /**
- * Update this sketch with a given signed 8-bit integer.
- * For compatibility with Java implementation.
- * @param value int8_t to update the sketch with
- */
- void update(int8_t value);
-
- /**
- * Update this sketch with a given double-precision floating point value.
- * For compatibility with Java implementation.
- * @param value double to update the sketch with
- */
- void update(double value);
-
- /**
- * Update this sketch with a given floating point value.
- * For compatibility with Java implementation.
- * @param value float to update the sketch with
- */
- void update(float value);
-
- /**
- * Update this sketch with given data of any type.
- * This is a "universal" update that covers all cases above,
- * but may produce different hashes.
- * Be very careful to hash input values consistently using the same approach
- * both over time and on different platforms
- * and while passing sketches between C++ environment and Java environment.
- * Otherwise two sketches that should represent overlapping sets will be disjoint
- * For instance, for signed 32-bit values call update(int32_t) method above,
- * which does widening conversion to int64_t, if compatibility with Java is expected
- * @param data pointer to the data
- * @param length of the data in bytes
- */
- void update(const void* data, size_t length);
-
- /**
- * Remove retained entries in excess of the nominal size k (if any)
- */
- void trim();
-
- /**
- * Converts this sketch to a compact sketch (ordered or unordered).
- * @param ordered optional flag to specify if ordered sketch should be produced
- * @return compact sketch
- */
- compact_theta_sketch_experimental<Allocator> compact(bool ordered = true) const;
-
- virtual iterator begin();
- virtual iterator end();
- virtual const_iterator begin() const;
- virtual const_iterator end() const;
-
-private:
- theta_table table_;
-
- // for builder
- update_theta_sketch_experimental(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta,
- uint64_t seed, const Allocator& allocator);
-
- virtual void print_specifics(std::ostringstream& os) const;
-};
-
-// compact sketch
-
-template<typename Allocator = std::allocator<uint64_t>>
-class compact_theta_sketch_experimental: public theta_sketch_experimental<Allocator> {
-public:
- using Base = theta_sketch_experimental<Allocator>;
- using iterator = typename Base::iterator;
- using const_iterator = typename Base::const_iterator;
- using AllocBytes = typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>;
- using vector_bytes = std::vector<uint8_t, AllocBytes>;
-
- static const uint8_t SERIAL_VERSION = 3;
- static const uint8_t SKETCH_TYPE = 3;
-
- // Instances of this type can be obtained:
- // - by compacting an update_theta_sketch
- // - as a result of a set operation
- // - by deserializing a previously serialized compact sketch
-
- compact_theta_sketch_experimental(const Base& other, bool ordered);
- compact_theta_sketch_experimental(const compact_theta_sketch_experimental&) = default;
- compact_theta_sketch_experimental(compact_theta_sketch_experimental&&) noexcept = default;
- virtual ~compact_theta_sketch_experimental() = default;
- compact_theta_sketch_experimental& operator=(const compact_theta_sketch_experimental&) = default;
- compact_theta_sketch_experimental& operator=(compact_theta_sketch_experimental&&) = default;
-
- virtual Allocator get_allocator() const;
- virtual bool is_empty() const;
- virtual bool is_ordered() const;
- virtual uint64_t get_theta64() const;
- virtual uint32_t get_num_retained() const;
- virtual uint16_t get_seed_hash() const;
-
- /**
- * This method serializes the sketch into a given stream in a binary form
- * @param os output stream
- */
- void serialize(std::ostream& os) const;
-
- /**
- * This method serializes the sketch as a vector of bytes.
- * An optional header can be reserved in front of the sketch.
- * It is an uninitialized space of a given size.
- * This header is used in Datasketches PostgreSQL extension.
- * @param header_size_bytes space to reserve in front of the sketch
- */
- vector_bytes serialize(unsigned header_size_bytes = 0) const;
-
- virtual iterator begin();
- virtual iterator end();
- virtual const_iterator begin() const;
- virtual const_iterator end() const;
-
- /**
- * This method deserializes a sketch from a given stream.
- * @param is input stream
- * @param seed the seed for the hash function that was used to create the sketch
- * @return an instance of the sketch
- */
- static compact_theta_sketch_experimental deserialize(std::istream& is,
- uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
-
- /**
- * This method deserializes a sketch from a given array of bytes.
- * @param bytes pointer to the array of bytes
- * @param size the size of the array
- * @param seed the seed for the hash function that was used to create the sketch
- * @return an instance of the sketch
- */
- static compact_theta_sketch_experimental deserialize(const void* bytes, size_t size,
- uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
-
- // for internal use
- compact_theta_sketch_experimental(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector<uint64_t, Allocator>&& entries);
-
-private:
- enum flags { IS_BIG_ENDIAN, IS_READ_ONLY, IS_EMPTY, IS_COMPACT, IS_ORDERED };
-
- bool is_empty_;
- bool is_ordered_;
- uint16_t seed_hash_;
- uint64_t theta_;
- std::vector<uint64_t, Allocator> entries_;
-
- virtual void print_specifics(std::ostringstream& os) const;
-};
-
-template<typename Allocator>
-class update_theta_sketch_experimental<Allocator>::builder: public theta_base_builder<builder, Allocator> {
-public:
- builder(const Allocator& allocator = Allocator());
- update_theta_sketch_experimental build() const;
-};
-
-} /* namespace datasketches */
-
-#include "theta_sketch_experimental_impl.hpp"
-
-#endif
diff --git a/tuple/include/theta_sketch_experimental_impl.hpp b/tuple/include/theta_sketch_experimental_impl.hpp
deleted file mode 100644
index 1fa4652..0000000
--- a/tuple/include/theta_sketch_experimental_impl.hpp
+++ /dev/null
@@ -1,481 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include <sstream>
-
-#include "serde.hpp"
-#include "binomial_bounds.hpp"
-#include "theta_helpers.hpp"
-
-namespace datasketches {
-
-template<typename A>
-bool theta_sketch_experimental<A>::is_estimation_mode() const {
- return get_theta64() < theta_constants::MAX_THETA && !is_empty();
-}
-
-template<typename A>
-double theta_sketch_experimental<A>::get_theta() const {
- return static_cast<double>(get_theta64()) / theta_constants::MAX_THETA;
-}
-
-template<typename A>
-double theta_sketch_experimental<A>::get_estimate() const {
- return get_num_retained() / get_theta();
-}
-
-template<typename A>
-double theta_sketch_experimental<A>::get_lower_bound(uint8_t num_std_devs) const {
- if (!is_estimation_mode()) return get_num_retained();
- return binomial_bounds::get_lower_bound(get_num_retained(), get_theta(), num_std_devs);
-}
-
-template<typename A>
-double theta_sketch_experimental<A>::get_upper_bound(uint8_t num_std_devs) const {
- if (!is_estimation_mode()) return get_num_retained();
- return binomial_bounds::get_upper_bound(get_num_retained(), get_theta(), num_std_devs);
-}
-
-template<typename A>
-string<A> theta_sketch_experimental<A>::to_string(bool detail) const {
- std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
- os << "### Theta sketch summary:" << std::endl;
- os << " num retained entries : " << get_num_retained() << std::endl;
- os << " seed hash : " << get_seed_hash() << std::endl;
- os << " empty? : " << (is_empty() ? "true" : "false") << std::endl;
- os << " ordered? : " << (is_ordered() ? "true" : "false") << std::endl;
- os << " estimation mode? : " << (is_estimation_mode() ? "true" : "false") << std::endl;
- os << " theta (fraction) : " << get_theta() << std::endl;
- os << " theta (raw 64-bit) : " << get_theta64() << std::endl;
- os << " estimate : " << this->get_estimate() << std::endl;
- os << " lower bound 95% conf : " << this->get_lower_bound(2) << std::endl;
- os << " upper bound 95% conf : " << this->get_upper_bound(2) << std::endl;
- print_specifics(os);
- os << "### End sketch summary" << std::endl;
- if (detail) {
- os << "### Retained entries" << std::endl;
- for (const auto& hash: *this) {
- os << hash << std::endl;
- }
- os << "### End retained entries" << std::endl;
- }
- return os.str();
-}
-
-// update sketch
-
-template<typename A>
-update_theta_sketch_experimental<A>::update_theta_sketch_experimental(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
- uint64_t theta, uint64_t seed, const A& allocator):
-table_(lg_cur_size, lg_nom_size, rf, theta, seed, allocator)
-{}
-
-template<typename A>
-A update_theta_sketch_experimental<A>::get_allocator() const {
- return table_.allocator_;
-}
-
-template<typename A>
-bool update_theta_sketch_experimental<A>::is_empty() const {
- return table_.is_empty_;
-}
-
-template<typename A>
-bool update_theta_sketch_experimental<A>::is_ordered() const {
- return false;
-}
-
-template<typename A>
-uint64_t update_theta_sketch_experimental<A>::get_theta64() const {
- return table_.theta_;
-}
-
-template<typename A>
-uint32_t update_theta_sketch_experimental<A>::get_num_retained() const {
- return table_.num_entries_;
-}
-
-template<typename A>
-uint16_t update_theta_sketch_experimental<A>::get_seed_hash() const {
- return compute_seed_hash(table_.seed_);
-}
-
-template<typename A>
-uint8_t update_theta_sketch_experimental<A>::get_lg_k() const {
- return table_.lg_nom_size_;
-}
-
-template<typename A>
-auto update_theta_sketch_experimental<A>::get_rf() const -> resize_factor {
- return table_.rf_;
-}
-
-template<typename A>
-void update_theta_sketch_experimental<A>::update(uint64_t value) {
- update(&value, sizeof(value));
-}
-
-template<typename A>
-void update_theta_sketch_experimental<A>::update(int64_t value) {
- update(&value, sizeof(value));
-}
-
-template<typename A>
-void update_theta_sketch_experimental<A>::update(uint32_t value) {
- update(static_cast<int32_t>(value));
-}
-
-template<typename A>
-void update_theta_sketch_experimental<A>::update(int32_t value) {
- update(static_cast<int64_t>(value));
-}
-
-template<typename A>
-void update_theta_sketch_experimental<A>::update(uint16_t value) {
- update(static_cast<int16_t>(value));
-}
-
-template<typename A>
-void update_theta_sketch_experimental<A>::update(int16_t value) {
- update(static_cast<int64_t>(value));
-}
-
-template<typename A>
-void update_theta_sketch_experimental<A>::update(uint8_t value) {
- update(static_cast<int8_t>(value));
-}
-
-template<typename A>
-void update_theta_sketch_experimental<A>::update(int8_t value) {
- update(static_cast<int64_t>(value));
-}
-
-template<typename A>
-void update_theta_sketch_experimental<A>::update(double value) {
- update(canonical_double(value));
-}
-
-template<typename A>
-void update_theta_sketch_experimental<A>::update(float value) {
- update(static_cast<double>(value));
-}
-
-template<typename A>
-void update_theta_sketch_experimental<A>::update(const std::string& value) {
- if (value.empty()) return;
- update(value.c_str(), value.length());
-}
-
-template<typename A>
-void update_theta_sketch_experimental<A>::update(const void* data, size_t length) {
- const uint64_t hash = table_.hash_and_screen(data, length);
- if (hash == 0) return;
- auto result = table_.find(hash);
- if (!result.second) {
- table_.insert(result.first, hash);
- }
-}
-
-template<typename A>
-void update_theta_sketch_experimental<A>::trim() {
- table_.trim();
-}
-
-template<typename A>
-auto update_theta_sketch_experimental<A>::begin() -> iterator {
- return iterator(table_.entries_, 1 << table_.lg_cur_size_, 0);
-}
-
-template<typename A>
-auto update_theta_sketch_experimental<A>::end() -> iterator {
- return iterator(nullptr, 0, 1 << table_.lg_cur_size_);
-}
-
-template<typename A>
-auto update_theta_sketch_experimental<A>::begin() const -> const_iterator {
- return const_iterator(table_.entries_, 1 << table_.lg_cur_size_, 0);
-}
-
-template<typename A>
-auto update_theta_sketch_experimental<A>::end() const -> const_iterator {
- return const_iterator(nullptr, 0, 1 << table_.lg_cur_size_);
-}
-template<typename A>
-compact_theta_sketch_experimental<A> update_theta_sketch_experimental<A>::compact(bool ordered) const {
- return compact_theta_sketch_experimental<A>(*this, ordered);
-}
-
-template<typename A>
-void update_theta_sketch_experimental<A>::print_specifics(std::ostringstream& os) const {
- os << " lg nominal size : " << static_cast<int>(table_.lg_nom_size_) << std::endl;
- os << " lg current size : " << static_cast<int>(table_.lg_cur_size_) << std::endl;
- os << " resize factor : " << (1 << table_.rf_) << std::endl;
-}
-
-// builder
-
-template<typename A>
-update_theta_sketch_experimental<A>::builder::builder(const A& allocator): theta_base_builder<builder, A>(allocator) {}
-
-template<typename A>
-update_theta_sketch_experimental<A> update_theta_sketch_experimental<A>::builder::build() const {
- return update_theta_sketch_experimental(this->starting_lg_size(), this->lg_k_, this->rf_, this->starting_theta(), this->seed_, this->allocator_);
-}
-
-// experimental compact theta sketch
-
-template<typename A>
-compact_theta_sketch_experimental<A>::compact_theta_sketch_experimental(const Base& other, bool ordered):
-is_empty_(other.is_empty()),
-is_ordered_(other.is_ordered() || ordered),
-seed_hash_(other.get_seed_hash()),
-theta_(other.get_theta64()),
-entries_(other.get_allocator())
-{
- entries_.reserve(other.get_num_retained());
- std::copy(other.begin(), other.end(), std::back_inserter(entries_));
- if (ordered && !other.is_ordered()) std::sort(entries_.begin(), entries_.end());
-}
-
-template<typename A>
-compact_theta_sketch_experimental<A>::compact_theta_sketch_experimental(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta,
- std::vector<uint64_t, A>&& entries):
-is_empty_(is_empty),
-is_ordered_(is_ordered),
-seed_hash_(seed_hash),
-theta_(theta),
-entries_(std::move(entries))
-{}
-
-template<typename A>
-A compact_theta_sketch_experimental<A>::get_allocator() const {
- return entries_.get_allocator();
-}
-
-template<typename A>
-bool compact_theta_sketch_experimental<A>::is_empty() const {
- return is_empty_;
-}
-
-template<typename A>
-bool compact_theta_sketch_experimental<A>::is_ordered() const {
- return is_ordered_;
-}
-
-template<typename A>
-uint64_t compact_theta_sketch_experimental<A>::get_theta64() const {
- return theta_;
-}
-
-template<typename A>
-uint32_t compact_theta_sketch_experimental<A>::get_num_retained() const {
- return entries_.size();
-}
-
-template<typename A>
-uint16_t compact_theta_sketch_experimental<A>::get_seed_hash() const {
- return seed_hash_;
-}
-
-template<typename A>
-auto compact_theta_sketch_experimental<A>::begin() -> iterator {
- return iterator(entries_.data(), entries_.size(), 0);
-}
-
-template<typename A>
-auto compact_theta_sketch_experimental<A>::end() -> iterator {
- return iterator(nullptr, 0, entries_.size());
-}
-
-template<typename A>
-auto compact_theta_sketch_experimental<A>::begin() const -> const_iterator {
- return const_iterator(entries_.data(), entries_.size(), 0);
-}
-
-template<typename A>
-auto compact_theta_sketch_experimental<A>::end() const -> const_iterator {
- return const_iterator(nullptr, 0, entries_.size());
-}
-
-template<typename A>
-void compact_theta_sketch_experimental<A>::print_specifics(std::ostringstream&) const {}
-
-template<typename A>
-void compact_theta_sketch_experimental<A>::serialize(std::ostream& os) const {
- const bool is_single_item = entries_.size() == 1 && !this->is_estimation_mode();
- const uint8_t preamble_longs = this->is_empty() || is_single_item ? 1 : this->is_estimation_mode() ? 3 : 2;
- os.write(reinterpret_cast<const char*>(&preamble_longs), sizeof(preamble_longs));
- const uint8_t serial_version = SERIAL_VERSION;
- os.write(reinterpret_cast<const char*>(&serial_version), sizeof(serial_version));
- const uint8_t type = SKETCH_TYPE;
- os.write(reinterpret_cast<const char*>(&type), sizeof(type));
- const uint16_t unused16 = 0;
- os.write(reinterpret_cast<const char*>(&unused16), sizeof(unused16));
- const uint8_t flags_byte(
- (1 << flags::IS_COMPACT) |
- (1 << flags::IS_READ_ONLY) |
- (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
- (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
- );
- os.write(reinterpret_cast<const char*>(&flags_byte), sizeof(flags_byte));
- const uint16_t seed_hash = get_seed_hash();
- os.write(reinterpret_cast<const char*>(&seed_hash), sizeof(seed_hash));
- if (!this->is_empty()) {
- if (!is_single_item) {
- const uint32_t num_entries = entries_.size();
- os.write(reinterpret_cast<const char*>(&num_entries), sizeof(num_entries));
- const uint32_t unused32 = 0;
- os.write(reinterpret_cast<const char*>(&unused32), sizeof(unused32));
- if (this->is_estimation_mode()) {
- os.write(reinterpret_cast<const char*>(&(this->theta_)), sizeof(uint64_t));
- }
- }
- os.write(reinterpret_cast<const char*>(entries_.data()), entries_.size() * sizeof(uint64_t));
- }
-}
-
-template<typename A>
-auto compact_theta_sketch_experimental<A>::serialize(unsigned header_size_bytes) const -> vector_bytes {
- const bool is_single_item = entries_.size() == 1 && !this->is_estimation_mode();
- const uint8_t preamble_longs = this->is_empty() || is_single_item ? 1 : this->is_estimation_mode() ? 3 : 2;
- const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs
- + sizeof(uint64_t) * entries_.size();
- vector_bytes bytes(size, 0, entries_.get_allocator());
- uint8_t* ptr = bytes.data() + header_size_bytes;
-
- ptr += copy_to_mem(&preamble_longs, ptr, sizeof(preamble_longs));
- const uint8_t serial_version = SERIAL_VERSION;
- ptr += copy_to_mem(&serial_version, ptr, sizeof(serial_version));
- const uint8_t type = SKETCH_TYPE;
- ptr += copy_to_mem(&type, ptr, sizeof(type));
- const uint16_t unused16 = 0;
- ptr += copy_to_mem(&unused16, ptr, sizeof(unused16));
- const uint8_t flags_byte(
- (1 << flags::IS_COMPACT) |
- (1 << flags::IS_READ_ONLY) |
- (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
- (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
- );
- ptr += copy_to_mem(&flags_byte, ptr, sizeof(flags_byte));
- const uint16_t seed_hash = get_seed_hash();
- ptr += copy_to_mem(&seed_hash, ptr, sizeof(seed_hash));
- if (!this->is_empty()) {
- if (!is_single_item) {
- const uint32_t num_entries = entries_.size();
- ptr += copy_to_mem(&num_entries, ptr, sizeof(num_entries));
- const uint32_t unused32 = 0;
- ptr += copy_to_mem(&unused32, ptr, sizeof(unused32));
- if (this->is_estimation_mode()) {
- ptr += copy_to_mem(&theta_, ptr, sizeof(uint64_t));
- }
- }
- ptr += copy_to_mem(entries_.data(), ptr, entries_.size() * sizeof(uint64_t));
- }
- return bytes;
-}
-
-template<typename A>
-compact_theta_sketch_experimental<A> compact_theta_sketch_experimental<A>::deserialize(std::istream& is, uint64_t seed, const A& allocator) {
- uint8_t preamble_longs;
- is.read(reinterpret_cast<char*>(&preamble_longs), sizeof(preamble_longs));
- uint8_t serial_version;
- is.read(reinterpret_cast<char*>(&serial_version), sizeof(serial_version));
- uint8_t type;
- is.read(reinterpret_cast<char*>(&type), sizeof(type));
- uint16_t unused16;
- is.read(reinterpret_cast<char*>(&unused16), sizeof(unused16));
- uint8_t flags_byte;
- is.read(reinterpret_cast<char*>(&flags_byte), sizeof(flags_byte));
- uint16_t seed_hash;
- is.read(reinterpret_cast<char*>(&seed_hash), sizeof(seed_hash));
- checker<true>::check_sketch_type(type, SKETCH_TYPE);
- checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
- const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
- if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
-
- uint64_t theta = theta_constants::MAX_THETA;
- uint32_t num_entries = 0;
- if (!is_empty) {
- if (preamble_longs == 1) {
- num_entries = 1;
- } else {
- is.read(reinterpret_cast<char*>(&num_entries), sizeof(num_entries));
- uint32_t unused32;
- is.read(reinterpret_cast<char*>(&unused32), sizeof(unused32));
- if (preamble_longs > 2) {
- is.read(reinterpret_cast<char*>(&theta), sizeof(theta));
- }
- }
- }
- std::vector<uint64_t, A> entries(num_entries, 0, allocator);
- if (!is_empty) is.read(reinterpret_cast<char*>(entries.data()), sizeof(uint64_t) * entries.size());
-
- const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
- if (!is.good()) throw std::runtime_error("error reading from std::istream");
- return compact_theta_sketch_experimental(is_empty, is_ordered, seed_hash, theta, std::move(entries));
-}
-
-template<typename A>
-compact_theta_sketch_experimental<A> compact_theta_sketch_experimental<A>::deserialize(const void* bytes, size_t size, uint64_t seed, const A& allocator) {
- ensure_minimum_memory(size, 8);
- const char* ptr = static_cast<const char*>(bytes);
- const char* base = ptr;
- uint8_t preamble_longs;
- ptr += copy_from_mem(ptr, &preamble_longs, sizeof(preamble_longs));
- uint8_t serial_version;
- ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
- uint8_t type;
- ptr += copy_from_mem(ptr, &type, sizeof(type));
- uint16_t unused16;
- ptr += copy_from_mem(ptr, &unused16, sizeof(unused16));
- uint8_t flags_byte;
- ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
- uint16_t seed_hash;
- ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
- checker<true>::check_sketch_type(type, SKETCH_TYPE);
- checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
- const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
- if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
-
- uint64_t theta = theta_constants::MAX_THETA;
- uint32_t num_entries = 0;
- if (!is_empty) {
- if (preamble_longs == 1) {
- num_entries = 1;
- } else {
- ensure_minimum_memory(size, 8); // read the first prelong before this method
- ptr += copy_from_mem(ptr, &num_entries, sizeof(num_entries));
- uint32_t unused32;
- ptr += copy_from_mem(ptr, &unused32, sizeof(unused32));
- if (preamble_longs > 2) {
- ensure_minimum_memory(size, (preamble_longs - 1) << 3);
- ptr += copy_from_mem(ptr, &theta, sizeof(theta));
- }
- }
- }
- const size_t entries_size_bytes = sizeof(uint64_t) * num_entries;
- check_memory_size(ptr - base + entries_size_bytes, size);
- std::vector<uint64_t, A> entries(num_entries, 0, allocator);
- if (!is_empty) ptr += copy_from_mem(ptr, entries.data(), entries_size_bytes);
-
- const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
- return compact_theta_sketch_experimental(is_empty, is_ordered, seed_hash, theta, std::move(entries));
-}
-
-} /* namespace datasketches */
diff --git a/tuple/include/theta_union_experimental.hpp b/tuple/include/theta_union_experimental.hpp
deleted file mode 100644
index 0849f70..0000000
--- a/tuple/include/theta_union_experimental.hpp
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#ifndef THETA_UNION_EXPERIMENTAL_HPP_
-#define THETA_UNION_EXPERIMENTAL_HPP_
-
-#include "serde.hpp"
-#include "tuple_sketch.hpp"
-#include "theta_union_base.hpp"
-#include "theta_sketch_experimental.hpp"
-
-namespace datasketches {
-
-// experimental theta union derived from the same base as tuple union
-
-template<typename Allocator = std::allocator<uint64_t>>
-class theta_union_experimental {
-public:
- using Entry = uint64_t;
- using ExtractKey = trivial_extract_key;
- using Sketch = theta_sketch_experimental<Allocator>;
- using CompactSketch = compact_theta_sketch_experimental<Allocator>;
- using resize_factor = theta_constants::resize_factor;
-
- struct pass_through_policy {
- uint64_t operator()(uint64_t internal_entry, uint64_t incoming_entry) const {
- unused(incoming_entry);
- return internal_entry;
- }
- };
- using State = theta_union_base<Entry, ExtractKey, pass_through_policy, Sketch, CompactSketch, Allocator>;
-
- // No constructor here. Use builder instead.
- class builder;
-
- /**
- * This method is to update the union with a given sketch
- * @param sketch to update the union with
- */
- void update(const Sketch& sketch);
-
- /**
- * This method produces a copy of the current state of the union as a compact sketch.
- * @param ordered optional flag to specify if ordered sketch should be produced
- * @return the result of the union
- */
- CompactSketch get_result(bool ordered = true) const;
-
-private:
- State state_;
-
- // for builder
- theta_union_experimental(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const Allocator& allocator);
-};
-
-template<typename A>
-class theta_union_experimental<A>::builder: public theta_base_builder<builder, A> {
-public:
- builder(const A& allocator = A());
-
- /**
- * This is to create an instance of the union with predefined parameters.
- * @return an instance of the union
- */
- theta_union_experimental<A> build() const;
-};
-
-} /* namespace datasketches */
-
-#include "theta_union_experimental_impl.hpp"
-
-#endif
diff --git a/tuple/include/theta_union_experimental_impl.hpp b/tuple/include/theta_union_experimental_impl.hpp
deleted file mode 100644
index f80afe4..0000000
--- a/tuple/include/theta_union_experimental_impl.hpp
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-namespace datasketches {
-
-template<typename A>
-theta_union_experimental<A>::theta_union_experimental(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const A& allocator):
-state_(lg_cur_size, lg_nom_size, rf, theta, seed, pass_through_policy(), allocator)
-{}
-
-template<typename A>
-void theta_union_experimental<A>::update(const Sketch& sketch) {
- state_.update(sketch);
-}
-
-template<typename A>
-auto theta_union_experimental<A>::get_result(bool ordered) const -> CompactSketch {
- return state_.get_result(ordered);
-}
-
-template<typename A>
-theta_union_experimental<A>::builder::builder(const A& allocator): theta_base_builder<builder, A>(allocator) {}
-
-template<typename A>
-auto theta_union_experimental<A>::builder::build() const -> theta_union_experimental {
- return theta_union_experimental(
- this->starting_sub_multiple(this->lg_k_ + 1, this->MIN_LG_K, static_cast<uint8_t>(this->rf_)),
- this->lg_k_, this->rf_, this->starting_theta(), this->seed_, this->allocator_);
-}
-
-} /* namespace datasketches */
diff --git a/tuple/test/theta_union_experimental_test.cpp b/tuple/include/tuple_jaccard_similarity.hpp
similarity index 60%
rename from tuple/test/theta_union_experimental_test.cpp
rename to tuple/include/tuple_jaccard_similarity.hpp
index c270a11..0a6633c 100644
--- a/tuple/test/theta_union_experimental_test.cpp
+++ b/tuple/include/tuple_jaccard_similarity.hpp
@@ -17,28 +17,22 @@
* under the License.
*/
-#include <iostream>
+#ifndef TUPLE_JACCARD_SIMILARITY_HPP_
+#define TUPLE_JACCARD_SIMILARITY_HPP_
-#include <catch.hpp>
-#include <tuple_union.hpp>
-
-#include <theta_union_experimental.hpp>
+#include "theta_jaccard_similarity_base.hpp"
+#include "tuple_union.hpp"
+#include "tuple_intersection.hpp"
namespace datasketches {
-TEST_CASE("theta_union_exeperimental") {
- auto update_sketch1 = update_theta_sketch_experimental<>::builder().build();
- update_sketch1.update(1);
- update_sketch1.update(2);
-
- auto update_sketch2 = update_theta_sketch_experimental<>::builder().build();
- update_sketch2.update(1);
- update_sketch2.update(3);
-
- auto u = theta_union_experimental<>::builder().build();
- u.update(update_sketch1);
- u.update(update_sketch2);
- auto r = u.get_result();
-}
+template<
+ typename Summary,
+ typename IntersectionPolicy,
+ typename UnionPolicy = default_union_policy<Summary>,
+ typename Allocator = std::allocator<Summary>>
+using tuple_jaccard_similarity = jaccard_similarity_base<tuple_union<Summary, UnionPolicy, Allocator>, tuple_intersection<Summary, IntersectionPolicy, Allocator>, pair_extract_key<uint64_t, Summary>>;
} /* namespace datasketches */
+
+# endif
diff --git a/tuple/include/tuple_sketch.hpp b/tuple/include/tuple_sketch.hpp
index 4966b74..7777606 100644
--- a/tuple/include/tuple_sketch.hpp
+++ b/tuple/include/tuple_sketch.hpp
@@ -31,7 +31,17 @@
template<typename S, typename A> class tuple_sketch;
template<typename S, typename U, typename P, typename A> class update_tuple_sketch;
template<typename S, typename A> class compact_tuple_sketch;
-template<typename A> class theta_sketch_experimental;
+template<typename A> class theta_sketch_alloc;
+
+template<typename K, typename V>
+struct pair_extract_key {
+ K& operator()(std::pair<K, V>& entry) const {
+ return entry.first;
+ }
+ const K& operator()(const std::pair<K, V>& entry) const {
+ return entry.first;
+ }
+};
template<
typename Summary,
@@ -143,7 +153,8 @@
virtual const_iterator end() const = 0;
protected:
- virtual void print_specifics(std::basic_ostream<char>& os) const = 0;
+ using ostrstream = std::basic_ostringstream<char, std::char_traits<char>, AllocChar<Allocator>>;
+ virtual void print_specifics(ostrstream& os) const = 0;
static uint16_t get_seed_hash(uint64_t seed);
@@ -333,7 +344,8 @@
// for builder
update_tuple_sketch(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const Policy& policy, const Allocator& allocator);
- virtual void print_specifics(std::basic_ostream<char>& os) const;
+ using ostrstream = typename Base::ostrstream;
+ virtual void print_specifics(ostrstream& os) const;
};
// compact sketch
@@ -372,7 +384,7 @@
compact_tuple_sketch& operator=(const compact_tuple_sketch&) = default;
compact_tuple_sketch& operator=(compact_tuple_sketch&&) = default;
- compact_tuple_sketch(const theta_sketch_experimental<AllocU64>& other, const Summary& summary, bool ordered = true);
+ compact_tuple_sketch(const theta_sketch_alloc<AllocU64>& other, const Summary& summary, bool ordered = true);
virtual Allocator get_allocator() const;
virtual bool is_empty() const;
@@ -461,7 +473,8 @@
bool destroy_;
};
- virtual void print_specifics(std::basic_ostream<char>& os) const;
+ using ostrstream = typename Base::ostrstream;
+ virtual void print_specifics(ostrstream& os) const;
};
diff --git a/tuple/include/tuple_sketch_impl.hpp b/tuple/include/tuple_sketch_impl.hpp
index 52e2ebf..51d79ad 100644
--- a/tuple/include/tuple_sketch_impl.hpp
+++ b/tuple/include/tuple_sketch_impl.hpp
@@ -53,7 +53,7 @@
template<typename S, typename A>
string<A> tuple_sketch<S, A>::to_string(bool detail) const {
- std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
+ ostrstream os;
os << "### Tuple sketch summary:" << std::endl;
os << " num retained entries : " << get_num_retained() << std::endl;
os << " seed hash : " << get_seed_hash() << std::endl;
@@ -238,7 +238,7 @@
}
template<typename S, typename U, typename P, typename A>
-void update_tuple_sketch<S, U, P, A>::print_specifics(std::basic_ostream<char>& os) const {
+void update_tuple_sketch<S, U, P, A>::print_specifics(ostrstream& os) const {
os << " lg nominal size : " << (int) map_.lg_nom_size_ << std::endl;
os << " lg current size : " << (int) map_.lg_cur_size_ << std::endl;
os << " resize factor : " << (1 << map_.rf_) << std::endl;
@@ -279,7 +279,7 @@
{}
template<typename S, typename A>
-compact_tuple_sketch<S, A>::compact_tuple_sketch(const theta_sketch_experimental<AllocU64>& other, const S& summary, bool ordered):
+compact_tuple_sketch<S, A>::compact_tuple_sketch(const theta_sketch_alloc<AllocU64>& other, const S& summary, bool ordered):
is_empty_(other.is_empty()),
is_ordered_(other.is_ordered() || ordered),
seed_hash_(other.get_seed_hash()),
@@ -567,7 +567,7 @@
}
template<typename S, typename A>
-void compact_tuple_sketch<S, A>::print_specifics(std::basic_ostream<char>&) const {}
+void compact_tuple_sketch<S, A>::print_specifics(ostrstream&) const {}
// builder
diff --git a/tuple/test/CMakeLists.txt b/tuple/test/CMakeLists.txt
index cf87aaa..af01c34 100644
--- a/tuple/test/CMakeLists.txt
+++ b/tuple/test/CMakeLists.txt
@@ -43,11 +43,6 @@
tuple_union_test.cpp
tuple_intersection_test.cpp
tuple_a_not_b_test.cpp
- array_of_doubles_sketch_test.cpp
- theta_sketch_experimental_test.cpp
- theta_union_experimental_test.cpp
- theta_intersection_experimental_test.cpp
- theta_a_not_b_experimental_test.cpp
- theta_jaccard_similarity_test.cpp
tuple_jaccard_similarity_test.cpp
+ array_of_doubles_sketch_test.cpp
)
diff --git a/tuple/test/theta_a_not_b_experimental_test.cpp b/tuple/test/theta_a_not_b_experimental_test.cpp
deleted file mode 100644
index 6b44f8b..0000000
--- a/tuple/test/theta_a_not_b_experimental_test.cpp
+++ /dev/null
@@ -1,250 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include <catch.hpp>
-
-#include <theta_a_not_b_experimental.hpp>
-
-namespace datasketches {
-
-// These tests have been copied from the existing theta sketch implementation.
-
-using update_theta_sketch = update_theta_sketch_experimental<>;
-using compact_theta_sketch = compact_theta_sketch_experimental<>;
-using theta_a_not_b = theta_a_not_b_experimental<>;
-
-TEST_CASE("theta a-not-b: empty", "[theta_a_not_b]") {
- theta_a_not_b a_not_b;
- auto a = update_theta_sketch::builder().build();
- auto b = update_theta_sketch::builder().build();
- compact_theta_sketch result = a_not_b.compute(a, b);
- REQUIRE(result.get_num_retained() == 0);
- REQUIRE(result.is_empty());
- REQUIRE_FALSE(result.is_estimation_mode());
- REQUIRE(result.get_estimate() == 0.0);
-}
-
-TEST_CASE("theta a-not-b: non empty no retained keys", "[theta_a_not_b]") {
- update_theta_sketch a = update_theta_sketch::builder().build();
- a.update(1);
- update_theta_sketch b = update_theta_sketch::builder().set_p(0.001).build();
- theta_a_not_b a_not_b;
-
- // B is still empty
- compact_theta_sketch result = a_not_b.compute(a, b);
- REQUIRE_FALSE(result.is_empty());
- REQUIRE_FALSE(result.is_estimation_mode());
- REQUIRE(result.get_num_retained() == 1);
- REQUIRE(result.get_theta() == Approx(1).margin(1e-10));
- REQUIRE(result.get_estimate() == 1.0);
-
- // B is not empty in estimation mode and no entries
- b.update(1);
- REQUIRE(b.get_num_retained() == 0U);
-
- result = a_not_b.compute(a, b);
- REQUIRE_FALSE(result.is_empty());
- REQUIRE(result.is_estimation_mode());
- REQUIRE(result.get_num_retained() == 0);
- REQUIRE(result.get_theta() == Approx(0.001).margin(1e-10));
- REQUIRE(result.get_estimate() == 0.0);
-}
-
-TEST_CASE("theta a-not-b: exact mode half overlap", "[theta_a_not_b]") {
- update_theta_sketch a = update_theta_sketch::builder().build();
- int value = 0;
- for (int i = 0; i < 1000; i++) a.update(value++);
-
- update_theta_sketch b = update_theta_sketch::builder().build();
- value = 500;
- for (int i = 0; i < 1000; i++) b.update(value++);
-
- theta_a_not_b a_not_b;
-
- // unordered inputs, ordered result
- compact_theta_sketch result = a_not_b.compute(a, b);
- REQUIRE_FALSE(result.is_empty());
- REQUIRE_FALSE(result.is_estimation_mode());
- REQUIRE(result.is_ordered());
- REQUIRE(result.get_estimate() == 500.0);
-
- // unordered inputs, unordered result
- result = a_not_b.compute(a, b, false);
- REQUIRE_FALSE(result.is_empty());
- REQUIRE_FALSE(result.is_estimation_mode());
- REQUIRE_FALSE(result.is_ordered());
- REQUIRE(result.get_estimate() == 500.0);
-
- // ordered inputs
- result = a_not_b.compute(a.compact(), b.compact());
- REQUIRE_FALSE(result.is_empty());
- REQUIRE_FALSE(result.is_estimation_mode());
- REQUIRE(result.is_ordered());
- REQUIRE(result.get_estimate() == 500.0);
-
- // A is ordered, so the result is ordered regardless
- result = a_not_b.compute(a.compact(), b, false);
- REQUIRE_FALSE(result.is_empty());
- REQUIRE_FALSE(result.is_estimation_mode());
- REQUIRE(result.is_ordered());
- REQUIRE(result.get_estimate() == 500.0);
-}
-
-TEST_CASE("theta a-not-b: exact mode disjoint", "[theta_a_not_b]") {
- update_theta_sketch a = update_theta_sketch::builder().build();
- int value = 0;
- for (int i = 0; i < 1000; i++) a.update(value++);
-
- update_theta_sketch b = update_theta_sketch::builder().build();
- for (int i = 0; i < 1000; i++) b.update(value++);
-
- theta_a_not_b a_not_b;
-
- // unordered inputs
- compact_theta_sketch result = a_not_b.compute(a, b);
- REQUIRE_FALSE(result.is_empty());
- REQUIRE_FALSE(result.is_estimation_mode());
- REQUIRE(result.get_estimate() == 1000.0);
-
- // ordered inputs
- result = a_not_b.compute(a.compact(), b.compact());
- REQUIRE_FALSE(result.is_empty());
- REQUIRE_FALSE(result.is_estimation_mode());
- REQUIRE(result.get_estimate() == 1000.0);
-}
-
-TEST_CASE("theta a-not-b: exact mode full overlap", "[theta_a_not_b]") {
- update_theta_sketch sketch = update_theta_sketch::builder().build();
- int value = 0;
- for (int i = 0; i < 1000; i++) sketch.update(value++);
-
- theta_a_not_b a_not_b;
-
- // unordered inputs
- compact_theta_sketch result = a_not_b.compute(sketch, sketch);
- REQUIRE(result.is_empty());
- REQUIRE_FALSE(result.is_estimation_mode());
- REQUIRE(result.get_estimate() == 0.0);
-
- // ordered inputs
- result = a_not_b.compute(sketch.compact(), sketch.compact());
- REQUIRE(result.is_empty());
- REQUIRE_FALSE(result.is_estimation_mode());
- REQUIRE(result.get_estimate() == 0.0);
-}
-
-TEST_CASE("theta a-not-b: estimation mode half overlap", "[theta_a_not_b]") {
- update_theta_sketch a = update_theta_sketch::builder().build();
- int value = 0;
- for (int i = 0; i < 10000; i++) a.update(value++);
-
- update_theta_sketch b = update_theta_sketch::builder().build();
- value = 5000;
- for (int i = 0; i < 10000; i++) b.update(value++);
-
- theta_a_not_b a_not_b;
-
- // unordered inputs
- compact_theta_sketch result = a_not_b.compute(a, b);
- REQUIRE_FALSE(result.is_empty());
- REQUIRE(result.is_estimation_mode());
- REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.02));
-
- // ordered inputs
- result = a_not_b.compute(a.compact(), b.compact());
- REQUIRE_FALSE(result.is_empty());
- REQUIRE(result.is_estimation_mode());
- REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.02));
-}
-
-TEST_CASE("theta a-not-b: estimation mode disjoint", "[theta_a_not_b]") {
- update_theta_sketch a = update_theta_sketch::builder().build();
- int value = 0;
- for (int i = 0; i < 10000; i++) a.update(value++);
-
- update_theta_sketch b = update_theta_sketch::builder().build();
- for (int i = 0; i < 10000; i++) b.update(value++);
-
- theta_a_not_b a_not_b;
-
- // unordered inputs
- compact_theta_sketch result = a_not_b.compute(a, b);
- REQUIRE_FALSE(result.is_empty());
- REQUIRE(result.is_estimation_mode());
- REQUIRE(result.get_estimate() == Approx(10000).margin(10000 * 0.02));
-
- // ordered inputs
- result = a_not_b.compute(a.compact(), b.compact());
- REQUIRE_FALSE(result.is_empty());
- REQUIRE(result.is_estimation_mode());
- REQUIRE(result.get_estimate() == Approx(10000).margin(10000 * 0.02));
-}
-
-TEST_CASE("theta a-not-b: estimation mode full overlap", "[theta_a_not_b]") {
- update_theta_sketch sketch = update_theta_sketch::builder().build();
- int value = 0;
- for (int i = 0; i < 10000; i++) sketch.update(value++);
-
- theta_a_not_b a_not_b;
-
- // unordered inputs
- compact_theta_sketch result = a_not_b.compute(sketch, sketch);
- REQUIRE_FALSE(result.is_empty());
- REQUIRE(result.is_estimation_mode());
- REQUIRE(result.get_estimate() == 0.0);
-
- // ordered inputs
- result = a_not_b.compute(sketch.compact(), sketch.compact());
- REQUIRE_FALSE(result.is_empty());
- REQUIRE(result.is_estimation_mode());
- REQUIRE(result.get_estimate() == 0.0);
-}
-
-TEST_CASE("theta a-not-b: seed mismatch", "[theta_a_not_b]") {
- update_theta_sketch sketch = update_theta_sketch::builder().build();
- sketch.update(1); // non-empty should not be ignored
- theta_a_not_b a_not_b(123);
- REQUIRE_THROWS_AS(a_not_b.compute(sketch, sketch), std::invalid_argument);
-}
-
-TEST_CASE("theta a-not-b: issue #152", "[theta_a_not_b]") {
- update_theta_sketch a = update_theta_sketch::builder().build();
- int value = 0;
- for (int i = 0; i < 10000; i++) a.update(value++);
-
- update_theta_sketch b = update_theta_sketch::builder().build();
- value = 5000;
- for (int i = 0; i < 25000; i++) b.update(value++);
-
- theta_a_not_b a_not_b;
-
- // unordered inputs
- compact_theta_sketch result = a_not_b.compute(a, b);
- REQUIRE_FALSE(result.is_empty());
- REQUIRE(result.is_estimation_mode());
- REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.03));
-
- // ordered inputs
- result = a_not_b.compute(a.compact(), b.compact());
- REQUIRE_FALSE(result.is_empty());
- REQUIRE(result.is_estimation_mode());
- REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.03));
-}
-
-} /* namespace datasketches */
diff --git a/tuple/test/theta_compact_empty_from_java.sk b/tuple/test/theta_compact_empty_from_java.sk
deleted file mode 100644
index f6c647f..0000000
--- a/tuple/test/theta_compact_empty_from_java.sk
+++ /dev/null
Binary files differ
diff --git a/tuple/test/theta_compact_estimation_from_java.sk b/tuple/test/theta_compact_estimation_from_java.sk
deleted file mode 100644
index 7c6babf..0000000
--- a/tuple/test/theta_compact_estimation_from_java.sk
+++ /dev/null
Binary files differ
diff --git a/tuple/test/theta_compact_single_item_from_java.sk b/tuple/test/theta_compact_single_item_from_java.sk
deleted file mode 100644
index be5ee68..0000000
--- a/tuple/test/theta_compact_single_item_from_java.sk
+++ /dev/null
Binary files differ
diff --git a/tuple/test/theta_intersection_experimental_test.cpp b/tuple/test/theta_intersection_experimental_test.cpp
deleted file mode 100644
index 3337636..0000000
--- a/tuple/test/theta_intersection_experimental_test.cpp
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include <catch.hpp>
-
-#include <theta_intersection_experimental.hpp>
-
-namespace datasketches {
-
-// These tests have been copied from the existing theta sketch implementation.
-
-using update_theta_sketch = update_theta_sketch_experimental<>;
-using compact_theta_sketch = compact_theta_sketch_experimental<>;
-using theta_intersection = theta_intersection_experimental<>;
-
-TEST_CASE("theta intersection: invalid", "[theta_intersection]") {
- theta_intersection intersection;
- REQUIRE_FALSE(intersection.has_result());
- REQUIRE_THROWS_AS(intersection.get_result(), std::invalid_argument);
-}
-
-TEST_CASE("theta intersection: empty", "[theta_intersection]") {
- theta_intersection intersection;
- update_theta_sketch sketch = update_theta_sketch::builder().build();
- intersection.update(sketch);
- compact_theta_sketch result = intersection.get_result();
- REQUIRE(result.get_num_retained() == 0);
- REQUIRE(result.is_empty());
- REQUIRE_FALSE(result.is_estimation_mode());
- REQUIRE(result.get_estimate() == 0.0);
-
- intersection.update(sketch);
- result = intersection.get_result();
- REQUIRE(result.get_num_retained() == 0);
- REQUIRE(result.is_empty());
- REQUIRE_FALSE(result.is_estimation_mode());
- REQUIRE(result.get_estimate() == 0.0);
-}
-
-TEST_CASE("theta intersection: non empty no retained keys", "[theta_intersection]") {
- update_theta_sketch sketch = update_theta_sketch::builder().set_p(0.001).build();
- sketch.update(1);
- theta_intersection intersection;
- intersection.update(sketch);
- compact_theta_sketch result = intersection.get_result();
- REQUIRE(result.get_num_retained() == 0);
- REQUIRE_FALSE(result.is_empty());
- REQUIRE(result.is_estimation_mode());
- REQUIRE(result.get_theta() == Approx(0.001).margin(1e-10));
- REQUIRE(result.get_estimate() == 0.0);
-
- intersection.update(sketch);
- result = intersection.get_result();
- REQUIRE(result.get_num_retained() == 0);
- REQUIRE_FALSE(result.is_empty());
- REQUIRE(result.is_estimation_mode());
- REQUIRE(result.get_theta() == Approx(0.001).margin(1e-10));
- REQUIRE(result.get_estimate() == 0.0);
-}
-
-TEST_CASE("theta intersection: exact mode half overlap unordered", "[theta_intersection]") {
- update_theta_sketch sketch1 = update_theta_sketch::builder().build();
- int value = 0;
- for (int i = 0; i < 1000; i++) sketch1.update(value++);
-
- update_theta_sketch sketch2 = update_theta_sketch::builder().build();
- value = 500;
- for (int i = 0; i < 1000; i++) sketch2.update(value++);
-
- theta_intersection intersection;
- intersection.update(sketch1);
- intersection.update(sketch2);
- compact_theta_sketch result = intersection.get_result();
- REQUIRE_FALSE(result.is_empty());
- REQUIRE_FALSE(result.is_estimation_mode());
- REQUIRE(result.get_estimate() == 500.0);
-}
-
-TEST_CASE("theta intersection: exact mode half overlap ordered", "[theta_intersection]") {
- update_theta_sketch sketch1 = update_theta_sketch::builder().build();
- int value = 0;
- for (int i = 0; i < 1000; i++) sketch1.update(value++);
-
- update_theta_sketch sketch2 = update_theta_sketch::builder().build();
- value = 500;
- for (int i = 0; i < 1000; i++) sketch2.update(value++);
-
- theta_intersection intersection;
- intersection.update(sketch1.compact());
- intersection.update(sketch2.compact());
- compact_theta_sketch result = intersection.get_result();
- REQUIRE_FALSE(result.is_empty());
- REQUIRE_FALSE(result.is_estimation_mode());
- REQUIRE(result.get_estimate() == 500.0);
-}
-
-TEST_CASE("theta intersection: exact mode disjoint unordered", "[theta_intersection]") {
- update_theta_sketch sketch1 = update_theta_sketch::builder().build();
- int value = 0;
- for (int i = 0; i < 1000; i++) sketch1.update(value++);
-
- update_theta_sketch sketch2 = update_theta_sketch::builder().build();
- for (int i = 0; i < 1000; i++) sketch2.update(value++);
-
- theta_intersection intersection;
- intersection.update(sketch1);
- intersection.update(sketch2);
- compact_theta_sketch result = intersection.get_result();
- REQUIRE(result.is_empty());
- REQUIRE_FALSE(result.is_estimation_mode());
- REQUIRE(result.get_estimate() == 0.0);
-}
-
-TEST_CASE("theta intersection: exact mode disjoint ordered", "[theta_intersection]") {
- update_theta_sketch sketch1 = update_theta_sketch::builder().build();
- int value = 0;
- for (int i = 0; i < 1000; i++) sketch1.update(value++);
-
- update_theta_sketch sketch2 = update_theta_sketch::builder().build();
- for (int i = 0; i < 1000; i++) sketch2.update(value++);
-
- theta_intersection intersection;
- intersection.update(sketch1.compact());
- intersection.update(sketch2.compact());
- compact_theta_sketch result = intersection.get_result();
- REQUIRE(result.is_empty());
- REQUIRE_FALSE(result.is_estimation_mode());
- REQUIRE(result.get_estimate() == 0.0);
-}
-
-TEST_CASE("theta intersection: estimation mode half overlap unordered", "[theta_intersection]") {
- update_theta_sketch sketch1 = update_theta_sketch::builder().build();
- int value = 0;
- for (int i = 0; i < 10000; i++) sketch1.update(value++);
-
- update_theta_sketch sketch2 = update_theta_sketch::builder().build();
- value = 5000;
- for (int i = 0; i < 10000; i++) sketch2.update(value++);
-
- theta_intersection intersection;
- intersection.update(sketch1);
- intersection.update(sketch2);
- compact_theta_sketch result = intersection.get_result();
- REQUIRE_FALSE(result.is_empty());
- REQUIRE(result.is_estimation_mode());
- REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.02));
-}
-
-TEST_CASE("theta intersection: estimation mode half overlap ordered", "[theta_intersection]") {
- update_theta_sketch sketch1 = update_theta_sketch::builder().build();
- int value = 0;
- for (int i = 0; i < 10000; i++) sketch1.update(value++);
-
- update_theta_sketch sketch2 = update_theta_sketch::builder().build();
- value = 5000;
- for (int i = 0; i < 10000; i++) sketch2.update(value++);
-
- theta_intersection intersection;
- intersection.update(sketch1.compact());
- intersection.update(sketch2.compact());
- compact_theta_sketch result = intersection.get_result();
- REQUIRE_FALSE(result.is_empty());
- REQUIRE(result.is_estimation_mode());
- REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.02));
-}
-
-TEST_CASE("theta intersection: estimation mode disjoint unordered", "[theta_intersection]") {
- update_theta_sketch sketch1 = update_theta_sketch::builder().build();
- int value = 0;
- for (int i = 0; i < 10000; i++) sketch1.update(value++);
-
- update_theta_sketch sketch2 = update_theta_sketch::builder().build();
- for (int i = 0; i < 10000; i++) sketch2.update(value++);
-
- theta_intersection intersection;
- intersection.update(sketch1);
- intersection.update(sketch2);
- compact_theta_sketch result = intersection.get_result();
- REQUIRE_FALSE(result.is_empty());
- REQUIRE(result.is_estimation_mode());
- REQUIRE(result.get_estimate() == 0.0);
-}
-
-TEST_CASE("theta intersection: estimation mode disjoint ordered", "[theta_intersection]") {
- update_theta_sketch sketch1 = update_theta_sketch::builder().build();
- int value = 0;
- for (int i = 0; i < 10000; i++) sketch1.update(value++);
-
- update_theta_sketch sketch2 = update_theta_sketch::builder().build();
- for (int i = 0; i < 10000; i++) sketch2.update(value++);
-
- theta_intersection intersection;
- intersection.update(sketch1.compact());
- intersection.update(sketch2.compact());
- compact_theta_sketch result = intersection.get_result();
- REQUIRE_FALSE(result.is_empty());
- REQUIRE(result.is_estimation_mode());
- REQUIRE(result.get_estimate() == 0.0);
-}
-
-TEST_CASE("theta intersection: seed mismatch", "[theta_intersection]") {
- update_theta_sketch sketch = update_theta_sketch::builder().build();
- sketch.update(1); // non-empty should not be ignored
- theta_intersection intersection(123);
- REQUIRE_THROWS_AS(intersection.update(sketch), std::invalid_argument);
-}
-
-} /* namespace datasketches */
diff --git a/tuple/test/theta_sketch_experimental_test.cpp b/tuple/test/theta_sketch_experimental_test.cpp
deleted file mode 100644
index 61435a4..0000000
--- a/tuple/test/theta_sketch_experimental_test.cpp
+++ /dev/null
@@ -1,247 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include <fstream>
-#include <sstream>
-
-#include <catch.hpp>
-#include <theta_sketch_experimental.hpp>
-
-namespace datasketches {
-
-#ifdef TEST_BINARY_INPUT_PATH
-const std::string inputPath = TEST_BINARY_INPUT_PATH;
-#else
-const std::string inputPath = "test/";
-#endif
-
-// These tests have been copied from the existing theta sketch implementation.
-// Serialization as base class and serialization of update sketch have been removed.
-
-using update_theta_sketch = update_theta_sketch_experimental<>;
-using compact_theta_sketch = compact_theta_sketch_experimental<>;
-
-TEST_CASE("theta sketch: empty", "[theta_sketch]") {
- update_theta_sketch update_sketch = update_theta_sketch::builder().build();
- REQUIRE(update_sketch.is_empty());
- REQUIRE_FALSE(update_sketch.is_estimation_mode());
- REQUIRE(update_sketch.get_theta() == 1.0);
- REQUIRE(update_sketch.get_estimate() == 0.0);
- REQUIRE(update_sketch.get_lower_bound(1) == 0.0);
- REQUIRE(update_sketch.get_upper_bound(1) == 0.0);
-
- compact_theta_sketch compact_sketch = update_sketch.compact();
- REQUIRE(compact_sketch.is_empty());
- REQUIRE_FALSE(compact_sketch.is_estimation_mode());
- REQUIRE(compact_sketch.get_theta() == 1.0);
- REQUIRE(compact_sketch.get_estimate() == 0.0);
- REQUIRE(compact_sketch.get_lower_bound(1) == 0.0);
- REQUIRE(compact_sketch.get_upper_bound(1) == 0.0);
-}
-
-TEST_CASE("theta sketch: non empty no retained keys", "[theta_sketch]") {
- update_theta_sketch update_sketch = update_theta_sketch::builder().set_p(0.001).build();
- update_sketch.update(1);
- //std::cerr << update_sketch.to_string();
- REQUIRE(update_sketch.get_num_retained() == 0);
- REQUIRE_FALSE(update_sketch.is_empty());
- REQUIRE(update_sketch.is_estimation_mode());
- REQUIRE(update_sketch.get_estimate() == 0.0);
- REQUIRE(update_sketch.get_lower_bound(1) == 0.0);
- REQUIRE(update_sketch.get_upper_bound(1) > 0);
-
- compact_theta_sketch compact_sketch = update_sketch.compact();
- REQUIRE(compact_sketch.get_num_retained() == 0);
- REQUIRE_FALSE(compact_sketch.is_empty());
- REQUIRE(compact_sketch.is_estimation_mode());
- REQUIRE(compact_sketch.get_estimate() == 0.0);
- REQUIRE(compact_sketch.get_lower_bound(1) == 0.0);
- REQUIRE(compact_sketch.get_upper_bound(1) > 0);
-}
-
-TEST_CASE("theta sketch: single item", "[theta_sketch]") {
- update_theta_sketch update_sketch = update_theta_sketch::builder().build();
- update_sketch.update(1);
- REQUIRE_FALSE(update_sketch.is_empty());
- REQUIRE_FALSE(update_sketch.is_estimation_mode());
- REQUIRE(update_sketch.get_theta() == 1.0);
- REQUIRE(update_sketch.get_estimate() == 1.0);
- REQUIRE(update_sketch.get_lower_bound(1) == 1.0);
- REQUIRE(update_sketch.get_upper_bound(1) == 1.0);
-
- compact_theta_sketch compact_sketch = update_sketch.compact();
- REQUIRE_FALSE(compact_sketch.is_empty());
- REQUIRE_FALSE(compact_sketch.is_estimation_mode());
- REQUIRE(compact_sketch.get_theta() == 1.0);
- REQUIRE(compact_sketch.get_estimate() == 1.0);
- REQUIRE(compact_sketch.get_lower_bound(1) == 1.0);
- REQUIRE(compact_sketch.get_upper_bound(1) == 1.0);
-}
-
-TEST_CASE("theta sketch: resize exact", "[theta_sketch]") {
- update_theta_sketch update_sketch = update_theta_sketch::builder().build();
- for (int i = 0; i < 2000; i++) update_sketch.update(i);
- REQUIRE_FALSE(update_sketch.is_empty());
- REQUIRE_FALSE(update_sketch.is_estimation_mode());
- REQUIRE(update_sketch.get_theta() == 1.0);
- REQUIRE(update_sketch.get_estimate() == 2000.0);
- REQUIRE(update_sketch.get_lower_bound(1) == 2000.0);
- REQUIRE(update_sketch.get_upper_bound(1) == 2000.0);
-
- compact_theta_sketch compact_sketch = update_sketch.compact();
- REQUIRE_FALSE(compact_sketch.is_empty());
- REQUIRE_FALSE(compact_sketch.is_estimation_mode());
- REQUIRE(compact_sketch.get_theta() == 1.0);
- REQUIRE(compact_sketch.get_estimate() == 2000.0);
- REQUIRE(compact_sketch.get_lower_bound(1) == 2000.0);
- REQUIRE(compact_sketch.get_upper_bound(1) == 2000.0);
-}
-
-TEST_CASE("theta sketch: estimation", "[theta_sketch]") {
- update_theta_sketch update_sketch = update_theta_sketch::builder().set_resize_factor(update_theta_sketch::resize_factor::X1).build();
- const int n = 8000;
- for (int i = 0; i < n; i++) update_sketch.update(i);
- //std::cerr << update_sketch.to_string();
- REQUIRE_FALSE(update_sketch.is_empty());
- REQUIRE(update_sketch.is_estimation_mode());
- REQUIRE(update_sketch.get_theta() < 1.0);
- REQUIRE(update_sketch.get_estimate() == Approx((double) n).margin(n * 0.01));
- REQUIRE(update_sketch.get_lower_bound(1) < n);
- REQUIRE(update_sketch.get_upper_bound(1) > n);
-
- const uint32_t k = 1 << update_theta_sketch::builder::DEFAULT_LG_K;
- REQUIRE(update_sketch.get_num_retained() >= k);
- update_sketch.trim();
- REQUIRE(update_sketch.get_num_retained() == k);
-
- compact_theta_sketch compact_sketch = update_sketch.compact();
- REQUIRE_FALSE(compact_sketch.is_empty());
- REQUIRE(compact_sketch.is_ordered());
- REQUIRE(compact_sketch.is_estimation_mode());
- REQUIRE(compact_sketch.get_theta() < 1.0);
- REQUIRE(compact_sketch.get_estimate() == Approx((double) n).margin(n * 0.01));
- REQUIRE(compact_sketch.get_lower_bound(1) < n);
- REQUIRE(compact_sketch.get_upper_bound(1) > n);
-}
-
-TEST_CASE("theta sketch: deserialize compact empty from java", "[theta_sketch]") {
- std::ifstream is;
- is.exceptions(std::ios::failbit | std::ios::badbit);
- is.open(inputPath + "theta_compact_empty_from_java.sk", std::ios::binary);
- auto sketch = compact_theta_sketch::deserialize(is);
- REQUIRE(sketch.is_empty());
- REQUIRE_FALSE(sketch.is_estimation_mode());
- REQUIRE(sketch.get_num_retained() == 0);
- REQUIRE(sketch.get_theta() == 1.0);
- REQUIRE(sketch.get_estimate() == 0.0);
- REQUIRE(sketch.get_lower_bound(1) == 0.0);
- REQUIRE(sketch.get_upper_bound(1) == 0.0);
-}
-
-TEST_CASE("theta sketch: deserialize single item from java", "[theta_sketch]") {
- std::ifstream is;
- is.exceptions(std::ios::failbit | std::ios::badbit);
- is.open(inputPath + "theta_compact_single_item_from_java.sk", std::ios::binary);
- auto sketch = compact_theta_sketch::deserialize(is);
- REQUIRE_FALSE(sketch.is_empty());
- REQUIRE_FALSE(sketch.is_estimation_mode());
- REQUIRE(sketch.get_num_retained() == 1);
- REQUIRE(sketch.get_theta() == 1.0);
- REQUIRE(sketch.get_estimate() == 1.0);
- REQUIRE(sketch.get_lower_bound(1) == 1.0);
- REQUIRE(sketch.get_upper_bound(1) == 1.0);
-}
-
-TEST_CASE("theta sketch: deserialize compact estimation from java", "[theta_sketch]") {
- std::ifstream is;
- is.exceptions(std::ios::failbit | std::ios::badbit);
- is.open(inputPath + "theta_compact_estimation_from_java.sk", std::ios::binary);
- auto sketch = compact_theta_sketch::deserialize(is);
- REQUIRE_FALSE(sketch.is_empty());
- REQUIRE(sketch.is_estimation_mode());
- REQUIRE(sketch.is_ordered());
- REQUIRE(sketch.get_num_retained() == 4342);
- REQUIRE(sketch.get_theta() == Approx(0.531700444213199).margin(1e-10));
- REQUIRE(sketch.get_estimate() == Approx(8166.25234614053).margin(1e-10));
- REQUIRE(sketch.get_lower_bound(2) == Approx(7996.956955317471).margin(1e-10));
- REQUIRE(sketch.get_upper_bound(2) == Approx(8339.090301078124).margin(1e-10));
-
- // the same construction process in Java must have produced exactly the same sketch
- update_theta_sketch update_sketch = update_theta_sketch::builder().build();
- const int n = 8192;
- for (int i = 0; i < n; i++) update_sketch.update(i);
- REQUIRE(sketch.get_num_retained() == update_sketch.get_num_retained());
- REQUIRE(sketch.get_theta() == Approx(update_sketch.get_theta()).margin(1e-10));
- REQUIRE(sketch.get_estimate() == Approx(update_sketch.get_estimate()).margin(1e-10));
- REQUIRE(sketch.get_lower_bound(1) == Approx(update_sketch.get_lower_bound(1)).margin(1e-10));
- REQUIRE(sketch.get_upper_bound(1) == Approx(update_sketch.get_upper_bound(1)).margin(1e-10));
- REQUIRE(sketch.get_lower_bound(2) == Approx(update_sketch.get_lower_bound(2)).margin(1e-10));
- REQUIRE(sketch.get_upper_bound(2) == Approx(update_sketch.get_upper_bound(2)).margin(1e-10));
- REQUIRE(sketch.get_lower_bound(3) == Approx(update_sketch.get_lower_bound(3)).margin(1e-10));
- REQUIRE(sketch.get_upper_bound(3) == Approx(update_sketch.get_upper_bound(3)).margin(1e-10));
- compact_theta_sketch compact_sketch = update_sketch.compact();
- // the sketches are ordered, so the iteration sequence must match exactly
- auto iter = sketch.begin();
- for (const auto& key: compact_sketch) {
- REQUIRE(*iter == key);
- ++iter;
- }
-}
-
-TEST_CASE("theta sketch: serialize deserialize stream and bytes equivalence", "[theta_sketch]") {
- update_theta_sketch update_sketch = update_theta_sketch::builder().build();
- const int n = 8192;
- for (int i = 0; i < n; i++) update_sketch.update(i);
-
- std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
- update_sketch.compact().serialize(s);
- auto bytes = update_sketch.compact().serialize();
- REQUIRE(bytes.size() == static_cast<size_t>(s.tellp()));
- for (size_t i = 0; i < bytes.size(); ++i) {
- REQUIRE(((char*)bytes.data())[i] == (char)s.get());
- }
-
- s.seekg(0); // rewind
- compact_theta_sketch deserialized_sketch1 = compact_theta_sketch::deserialize(s);
- compact_theta_sketch deserialized_sketch2 = compact_theta_sketch::deserialize(bytes.data(), bytes.size());
- REQUIRE(bytes.size() == static_cast<size_t>(s.tellg()));
- REQUIRE(deserialized_sketch2.is_empty() == deserialized_sketch1.is_empty());
- REQUIRE(deserialized_sketch2.is_ordered() == deserialized_sketch1.is_ordered());
- REQUIRE(deserialized_sketch2.get_num_retained() == deserialized_sketch1.get_num_retained());
- REQUIRE(deserialized_sketch2.get_theta() == deserialized_sketch1.get_theta());
- REQUIRE(deserialized_sketch2.get_estimate() == deserialized_sketch1.get_estimate());
- REQUIRE(deserialized_sketch2.get_lower_bound(1) == deserialized_sketch1.get_lower_bound(1));
- REQUIRE(deserialized_sketch2.get_upper_bound(1) == deserialized_sketch1.get_upper_bound(1));
- // the sketches are ordered, so the iteration sequence must match exactly
- auto iter = deserialized_sketch1.begin();
- for (auto key: deserialized_sketch2) {
- REQUIRE(*iter == key);
- ++iter;
- }
-}
-
-TEST_CASE("theta sketch: deserialize compact single item buffer overrun", "[theta_sketch]") {
- update_theta_sketch update_sketch = update_theta_sketch::builder().build();
- update_sketch.update(1);
- auto bytes = update_sketch.compact().serialize();
- REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), 7), std::out_of_range);
- REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
-}
-
-} /* namespace datasketches */
diff --git a/tuple/test/tuple_a_not_b_test.cpp b/tuple/test/tuple_a_not_b_test.cpp
index 1c56102..7c9446c 100644
--- a/tuple/test/tuple_a_not_b_test.cpp
+++ b/tuple/test/tuple_a_not_b_test.cpp
@@ -21,7 +21,7 @@
#include <catch.hpp>
#include <tuple_a_not_b.hpp>
-#include <theta_sketch_experimental.hpp>
+#include <theta_sketch.hpp>
namespace datasketches {
@@ -102,9 +102,6 @@
REQUIRE(result.get_estimate() == 500.0);
}
-// needed until promotion of experimental to replace existing theta sketch
-using update_theta_sketch = update_theta_sketch_experimental<>;
-
TEST_CASE("mixed a-not-b: exact mode half overlap", "[tuple_a_not_b]") {
auto a = update_tuple_sketch<float>::builder().build();
int value = 0;
diff --git a/tuple/test/tuple_intersection_test.cpp b/tuple/test/tuple_intersection_test.cpp
index 9796cb3..06ccd76 100644
--- a/tuple/test/tuple_intersection_test.cpp
+++ b/tuple/test/tuple_intersection_test.cpp
@@ -21,7 +21,7 @@
#include <catch.hpp>
#include <tuple_intersection.hpp>
-#include <theta_sketch_experimental.hpp>
+#include <theta_sketch.hpp>
namespace datasketches {
@@ -136,9 +136,6 @@
}
}
-// needed until promotion of experimental to replace existing theta sketch
-using update_theta_sketch = update_theta_sketch_experimental<>;
-
TEST_CASE("mixed intersection: exact mode half overlap", "[tuple_intersection]") {
auto sketch1 = update_tuple_sketch<float>::builder().build();
int value = 0;
diff --git a/tuple/test/tuple_jaccard_similarity_test.cpp b/tuple/test/tuple_jaccard_similarity_test.cpp
index 9545593..2b3efbb 100644
--- a/tuple/test/tuple_jaccard_similarity_test.cpp
+++ b/tuple/test/tuple_jaccard_similarity_test.cpp
@@ -20,7 +20,8 @@
#include <iostream>
#include <catch.hpp>
-#include <jaccard_similarity.hpp>
+
+#include "tuple_jaccard_similarity.hpp"
namespace datasketches {
diff --git a/tuple/test/tuple_sketch_allocation_test.cpp b/tuple/test/tuple_sketch_allocation_test.cpp
index d87c06e..a8e279a 100644
--- a/tuple/test/tuple_sketch_allocation_test.cpp
+++ b/tuple/test/tuple_sketch_allocation_test.cpp
@@ -51,7 +51,7 @@
test_allocator_total_bytes = 0;
test_allocator_net_allocations = 0;
{
- auto update_sketch = update_tuple_sketch_test::builder().build();
+ auto update_sketch = update_tuple_sketch_test::builder(test_type_replace_policy(), test_allocator<test_type>(0)).build();
for (int i = 0; i < 10000; ++i) update_sketch.update(i, 1);
for (int i = 0; i < 10000; ++i) update_sketch.update(i, 2);
REQUIRE(!update_sketch.is_empty());
@@ -77,7 +77,7 @@
REQUIRE(count == update_sketch.get_num_retained());
auto bytes = compact_sketch.serialize(0, test_type_serde());
- auto deserialized_sketch = compact_tuple_sketch_test::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED, test_type_serde());
+ auto deserialized_sketch = compact_tuple_sketch_test::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED, test_type_serde(), test_allocator<test_type>(0));
REQUIRE(deserialized_sketch.get_estimate() == compact_sketch.get_estimate());
// update sketch copy
diff --git a/tuple/test/tuple_union_test.cpp b/tuple/test/tuple_union_test.cpp
index 281b37c..4088fa2 100644
--- a/tuple/test/tuple_union_test.cpp
+++ b/tuple/test/tuple_union_test.cpp
@@ -21,7 +21,7 @@
#include <catch.hpp>
#include <tuple_union.hpp>
-#include <theta_sketch_experimental.hpp>
+#include <theta_sketch.hpp>
namespace datasketches {
@@ -37,9 +37,6 @@
REQUIRE(result.get_estimate() == 0);
}
-// needed until promotion of experimental to replace existing theta sketch
-using update_theta_sketch = update_theta_sketch_experimental<>;
-
TEST_CASE("tupe_union float: empty theta sketch", "[tuple union]") {
auto update_sketch = update_theta_sketch::builder().build();