Merge pull request #222 from apache/cpc_max_serialized_size

CPC max serialized size bytes
diff --git a/cpc/include/cpc_sketch.hpp b/cpc/include/cpc_sketch.hpp
index 651c254..cdc4d16 100644
--- a/cpc/include/cpc_sketch.hpp
+++ b/cpc/include/cpc_sketch.hpp
@@ -235,6 +235,17 @@
    */
   static cpc_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
 
+  /**
+   * The actual size of a compressed CPC sketch has a small random variance, but the following
+   * empirically measured size should be large enough for at least 99.9 percent of sketches.
+   *
+   * <p>For small values of <i>n</i> the size can be much smaller.
+   *
+   * @param lg_k the given value of lg_k.
+   * @return the estimated maximum compressed serialized size of a sketch.
+   */
+  static size_t get_max_serialized_size_bytes(uint8_t lg_k);
+
   // for internal use
   uint32_t get_num_coupons() const;
 
@@ -303,6 +314,8 @@
   inline void write_hip(std::ostream& os) const;
   inline size_t copy_hip_to_mem(void* dst) const;
 
+  static void check_lg_k(uint8_t lg_k);
+
   friend cpc_compressor<A>;
   friend cpc_union_alloc<A>;
 };
diff --git a/cpc/include/cpc_sketch_impl.hpp b/cpc/include/cpc_sketch_impl.hpp
index 1bb1be1..2ad977a 100644
--- a/cpc/include/cpc_sketch_impl.hpp
+++ b/cpc/include/cpc_sketch_impl.hpp
@@ -53,9 +53,7 @@
 kxp(1 << lg_k),
 hip_est_accum(0)
 {
-  if (lg_k < CPC_MIN_LG_K || lg_k > CPC_MAX_LG_K) {
-    throw std::invalid_argument("lg_k must be >= " + std::to_string(CPC_MIN_LG_K) + " and <= " + std::to_string(CPC_MAX_LG_K) + ": " + std::to_string(lg_k));
-  }
+  check_lg_k(lg_k);
 }
 
 template<typename A>
@@ -681,6 +679,49 @@
       std::move(uncompressed.window), has_hip, kxp, hip_est_accum, seed);
 }
 
+/*
+ * These empirical values for the 99.9th percentile of size in bytes were measured using 100,000
+ * trials. The value for each trial is the maximum of 5*16=80 measurements that were equally
+ * spaced over values of the quantity C/K between 3.0 and 8.0. This table does not include the
+ * worst-case space for the preamble, which is added by the function.
+ */
+static const uint8_t CPC_EMPIRICAL_SIZE_MAX_LGK = 19;
+static const size_t CPC_EMPIRICAL_MAX_SIZE_BYTES[]  = {
+    24,     // lg_k = 4
+    36,     // lg_k = 5
+    56,     // lg_k = 6
+    100,    // lg_k = 7
+    180,    // lg_k = 8
+    344,    // lg_k = 9
+    660,    // lg_k = 10
+    1292,   // lg_k = 11
+    2540,   // lg_k = 12
+    5020,   // lg_k = 13
+    9968,   // lg_k = 14
+    19836,  // lg_k = 15
+    39532,  // lg_k = 16
+    78880,  // lg_k = 17
+    157516, // lg_k = 18
+    314656  // lg_k = 19
+};
+static const double CPC_EMPIRICAL_MAX_SIZE_FACTOR = 0.6; // 0.6 = 4.8 / 8.0
+static const size_t CPC_MAX_PREAMBLE_SIZE_BYTES = 40;
+
+template<typename A>
+size_t cpc_sketch_alloc<A>::get_max_serialized_size_bytes(uint8_t lg_k) {
+  check_lg_k(lg_k);
+  if (lg_k <= CPC_EMPIRICAL_SIZE_MAX_LGK) return CPC_EMPIRICAL_MAX_SIZE_BYTES[lg_k - CPC_MIN_LG_K] + CPC_MAX_PREAMBLE_SIZE_BYTES;
+  const uint32_t k = 1 << lg_k;
+  return (int) (CPC_EMPIRICAL_MAX_SIZE_FACTOR * k) + CPC_MAX_PREAMBLE_SIZE_BYTES;
+}
+
+template<typename A>
+void cpc_sketch_alloc<A>::check_lg_k(uint8_t lg_k) {
+  if (lg_k < CPC_MIN_LG_K || lg_k > CPC_MAX_LG_K) {
+    throw std::invalid_argument("lg_k must be >= " + std::to_string(CPC_MIN_LG_K) + " and <= " + std::to_string(CPC_MAX_LG_K) + ": " + std::to_string(lg_k));
+  }
+}
+
 template<typename A>
 uint32_t cpc_sketch_alloc<A>::get_num_coupons() const {
   return num_coupons;
diff --git a/cpc/test/cpc_sketch_test.cpp b/cpc/test/cpc_sketch_test.cpp
index 0a2ca74..f0e5d7a 100644
--- a/cpc/test/cpc_sketch_test.cpp
+++ b/cpc/test/cpc_sketch_test.cpp
@@ -398,4 +398,9 @@
   REQUIRE(sketch.get_estimate() == Approx(1).margin(RELATIVE_ERROR_FOR_LG_K_11));
 }
 
+TEST_CASE("cpc sketch: max serialized size", "[cpc_sketch]") {
+  REQUIRE(cpc_sketch::get_max_serialized_size_bytes(4) == 24 + 40);
+  REQUIRE(cpc_sketch::get_max_serialized_size_bytes(26) == static_cast<size_t>((0.6 * (1 << 26)) + 40));
+}
+
 } /* namespace datasketches */