Merge pull request #222 from apache/cpc_max_serialized_size
CPC max serialized size bytes
diff --git a/cpc/include/cpc_sketch.hpp b/cpc/include/cpc_sketch.hpp
index 651c254..cdc4d16 100644
--- a/cpc/include/cpc_sketch.hpp
+++ b/cpc/include/cpc_sketch.hpp
@@ -235,6 +235,17 @@
*/
static cpc_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
+ /**
+ * The actual size of a compressed CPC sketch has a small random variance, but the following
+ * empirically measured size should be large enough for at least 99.9 percent of sketches.
+ *
+ * <p>For small values of <i>n</i> the size can be much smaller.
+ *
+ * @param lg_k the given value of lg_k.
+ * @return the estimated maximum compressed serialized size of a sketch.
+ */
+ static size_t get_max_serialized_size_bytes(uint8_t lg_k);
+
// for internal use
uint32_t get_num_coupons() const;
@@ -303,6 +314,8 @@
inline void write_hip(std::ostream& os) const;
inline size_t copy_hip_to_mem(void* dst) const;
+ static void check_lg_k(uint8_t lg_k);
+
friend cpc_compressor<A>;
friend cpc_union_alloc<A>;
};
diff --git a/cpc/include/cpc_sketch_impl.hpp b/cpc/include/cpc_sketch_impl.hpp
index 1bb1be1..2ad977a 100644
--- a/cpc/include/cpc_sketch_impl.hpp
+++ b/cpc/include/cpc_sketch_impl.hpp
@@ -53,9 +53,7 @@
kxp(1 << lg_k),
hip_est_accum(0)
{
- if (lg_k < CPC_MIN_LG_K || lg_k > CPC_MAX_LG_K) {
- throw std::invalid_argument("lg_k must be >= " + std::to_string(CPC_MIN_LG_K) + " and <= " + std::to_string(CPC_MAX_LG_K) + ": " + std::to_string(lg_k));
- }
+ check_lg_k(lg_k);
}
template<typename A>
@@ -681,6 +679,49 @@
std::move(uncompressed.window), has_hip, kxp, hip_est_accum, seed);
}
+/*
+ * These empirical values for the 99.9th percentile of size in bytes were measured using 100,000
+ * trials. The value for each trial is the maximum of 5*16=80 measurements that were equally
+ * spaced over values of the quantity C/K between 3.0 and 8.0. This table does not include the
+ * worst-case space for the preamble, which is added by the function.
+ */
+static const uint8_t CPC_EMPIRICAL_SIZE_MAX_LGK = 19;
+static const size_t CPC_EMPIRICAL_MAX_SIZE_BYTES[] = {
+ 24, // lg_k = 4
+ 36, // lg_k = 5
+ 56, // lg_k = 6
+ 100, // lg_k = 7
+ 180, // lg_k = 8
+ 344, // lg_k = 9
+ 660, // lg_k = 10
+ 1292, // lg_k = 11
+ 2540, // lg_k = 12
+ 5020, // lg_k = 13
+ 9968, // lg_k = 14
+ 19836, // lg_k = 15
+ 39532, // lg_k = 16
+ 78880, // lg_k = 17
+ 157516, // lg_k = 18
+ 314656 // lg_k = 19
+};
+static const double CPC_EMPIRICAL_MAX_SIZE_FACTOR = 0.6; // 0.6 = 4.8 / 8.0
+static const size_t CPC_MAX_PREAMBLE_SIZE_BYTES = 40;
+
+template<typename A>
+size_t cpc_sketch_alloc<A>::get_max_serialized_size_bytes(uint8_t lg_k) {
+ check_lg_k(lg_k);
+ if (lg_k <= CPC_EMPIRICAL_SIZE_MAX_LGK) return CPC_EMPIRICAL_MAX_SIZE_BYTES[lg_k - CPC_MIN_LG_K] + CPC_MAX_PREAMBLE_SIZE_BYTES;
+ const uint32_t k = 1 << lg_k;
+ return (int) (CPC_EMPIRICAL_MAX_SIZE_FACTOR * k) + CPC_MAX_PREAMBLE_SIZE_BYTES;
+}
+
+template<typename A>
+void cpc_sketch_alloc<A>::check_lg_k(uint8_t lg_k) {
+ if (lg_k < CPC_MIN_LG_K || lg_k > CPC_MAX_LG_K) {
+ throw std::invalid_argument("lg_k must be >= " + std::to_string(CPC_MIN_LG_K) + " and <= " + std::to_string(CPC_MAX_LG_K) + ": " + std::to_string(lg_k));
+ }
+}
+
template<typename A>
uint32_t cpc_sketch_alloc<A>::get_num_coupons() const {
return num_coupons;
diff --git a/cpc/test/cpc_sketch_test.cpp b/cpc/test/cpc_sketch_test.cpp
index 0a2ca74..f0e5d7a 100644
--- a/cpc/test/cpc_sketch_test.cpp
+++ b/cpc/test/cpc_sketch_test.cpp
@@ -398,4 +398,9 @@
REQUIRE(sketch.get_estimate() == Approx(1).margin(RELATIVE_ERROR_FOR_LG_K_11));
}
+TEST_CASE("cpc sketch: max serialized size", "[cpc_sketch]") {
+ REQUIRE(cpc_sketch::get_max_serialized_size_bytes(4) == 24 + 40);
+ REQUIRE(cpc_sketch::get_max_serialized_size_bytes(26) == static_cast<size_t>((0.6 * (1 << 26)) + 40));
+}
+
} /* namespace datasketches */