Merge pull request #176 from apache/req_sketch Req sketch

commit: 202bc2cb9606723d686fe2a05684f3b8ff0a9b5a [log] [tgz]
author: Alexander Saydakov <13126686+AlexanderSaydakov@users.noreply.github.com> Mon Jan 25 14:45:51 2021 -0800
committer: GitHub <noreply@github.com> Mon Jan 25 14:45:51 2021 -0800
tree: e6ae4803813f77f4ac16a3b5a1eabcf7064a9adc
parent: b2f749ed5ce6ba650f4259602b133c310c3a5ee4 [diff]
parent: 9b99b9be6e45a623388a8d6eb2438130abe88f1c [diff]
diff --git a/.asf.yaml b/.asf.yaml
new file mode 100644
index 0000000..15e33a5
--- /dev/null
+++ b/.asf.yaml

@@ -0,0 +1,2 @@
+github:
+  homepage: https://datasketches.apache.org
\ No newline at end of file

diff --git a/DISCLAIMER-WIP b/DISCLAIMER-WIP
deleted file mode 100644
index ae9f942..0000000
--- a/DISCLAIMER-WIP
+++ /dev/null

@@ -1,26 +0,0 @@
-Apache DataSketches (incubating) is an effort undergoing incubation 
-at The Apache Software Foundation (ASF), sponsored by the Apache Incubator. 
-
-Incubation is required of all newly accepted projects until a further review 
-indicates that the infrastructure, communications, and decision making process 
-have stabilized in a manner consistent with other successful ASF projects.
-
-While incubation status is not necessarily a reflection of the
-completeness or stability of the code, it does indicate that the
-project has yet to be fully endorsed by the ASF.
-
-Some of the incubating project's releases may not be fully compliant
-with ASF policy. For example, releases may have incomplete or
-un-reviewed licensing conditions. What follows is a list of known
-issues the project is currently aware of (note that this list, by
-definition, is likely to be incomplete): 
-
- * No issues are known at this time. 
-
-If you are planning to incorporate this work into your
-product or project, please be aware that you will need to conduct a
-thorough licensing review to determine the overall implications of
-including this work. For the current status of this project through the Apache
-Incubator visit: 
-
-http://incubator.apache.org/projects/datasketches.html

diff --git a/LICENSE b/LICENSE
index 5947c4b..a52b236 100644
--- a/LICENSE
+++ b/LICENSE

@@ -247,7 +247,7 @@
     SOFTWARE.
     -------------------------------------------------------------
     Code locations:
-      * https://github.com/apache/incubator-datasketches-cpp/blob/master/setup.py
+      * https://github.com/apache/datasketches-cpp/blob/master/setup.py
     that is adapted from the above.
 
 
@@ -285,7 +285,7 @@
     DEALINGS IN THE SOFTWARE.
     -------------------------------------------------------------
     Code Locations
-      * https://github.com/apache/incubator-datasketches-cpp/blob/master/common/test/catch.hpp
+      * https://github.com/apache/datasketches-cpp/blob/master/common/test/catch.hpp
     that is adapted from the above.
 
 

diff --git a/NOTICE b/NOTICE
index 41a4505..d971e7b 100644
--- a/NOTICE
+++ b/NOTICE

@@ -1,4 +1,4 @@
-Apache DataSketches-cpp (incubating)
+Apache DataSketches-cpp
 Copyright 2020 The Apache Software Foundation
 
 Copyright 2015-2018 Yahoo

diff --git a/README.md b/README.md
index 44d4646..a92d62f 100644
--- a/README.md
+++ b/README.md

@@ -4,7 +4,7 @@
 This component is also a dependency of other components of the library that create adaptors for target systems, such as PostgreSQL.
 
 Note that we have a parallel core component for Java implementations of the same sketch algorithms, 
-[incubator-datasketches-java](https://github.com/apache/incubator-datasketches-java).
+[datasketches-java](https://github.com/apache/datasketches-java).
 
 Please visit the main [DataSketches website](https://datasketches.apache.org) for more information. 
 
@@ -14,7 +14,7 @@
 
 This code requires C++11. It was tested with GCC 4.8.5 (standard in RedHat at the time of this writing), GCC 8.2.0 and Apple LLVM version 10.0.1 (clang-1001.0.46.4)
 
-This includes Python bindings. For the Python interface, see the README notes in [the python subdirectory](https://github.com/apache/incubator-datasketches-cpp/tree/master/python).
+This includes Python bindings. For the Python interface, see the README notes in [the python subdirectory](https://github.com/apache/datasketches-cpp/tree/master/python).
 
 This library is header-only. The build process provided is only for building unit tests and the python library.
 
@@ -40,7 +40,3 @@
 	$ cmake --build build --config Release
 	$ cmake --build build --config Release --target RUN_TESTS
 ```
-
-----
-
-Disclaimer: Apache DataSketches is an effort undergoing incubation at The Apache Software Foundation (ASF), sponsored by the Apache Incubator. Incubation is required of all newly accepted projects until a further review indicates that the infrastructure, communications, and decision making process have stabilized in a manner consistent with other successful ASF projects. While incubation status is not necessarily a reflection of the completeness or stability of the code, it does indicate that the project has yet to be fully endorsed by the ASF.
\ No newline at end of file

diff --git a/cpc/include/cpc_sketch_impl.hpp b/cpc/include/cpc_sketch_impl.hpp
index ebb6948..e6bc010 100644
--- a/cpc/include/cpc_sketch_impl.hpp
+++ b/cpc/include/cpc_sketch_impl.hpp

@@ -410,20 +410,20 @@
   const bool has_table = compressed.table_data.size() > 0;
   const bool has_window = compressed.window_data.size() > 0;
   const uint8_t preamble_ints = get_preamble_ints(num_coupons, has_hip, has_table, has_window);
-  os.write((char*)&preamble_ints, sizeof(preamble_ints));
+  os.write(reinterpret_cast<const char*>(&preamble_ints), sizeof(preamble_ints));
   const uint8_t serial_version = SERIAL_VERSION;
-  os.write((char*)&serial_version, sizeof(serial_version));
+  os.write(reinterpret_cast<const char*>(&serial_version), sizeof(serial_version));
   const uint8_t family = FAMILY;
-  os.write((char*)&family, sizeof(family));
-  os.write((char*)&lg_k, sizeof(lg_k));
-  os.write((char*)&first_interesting_column, sizeof(first_interesting_column));
+  os.write(reinterpret_cast<const char*>(&family), sizeof(family));
+  os.write(reinterpret_cast<const char*>(&lg_k), sizeof(lg_k));
+  os.write(reinterpret_cast<const char*>(&first_interesting_column), sizeof(first_interesting_column));
   const uint8_t flags_byte(
     (1 << flags::IS_COMPRESSED)
     | (has_hip ? 1 << flags::HAS_HIP : 0)
     | (has_table ? 1 << flags::HAS_TABLE : 0)
     | (has_window ? 1 << flags::HAS_WINDOW : 0)
   );
-  os.write((char*)&flags_byte, sizeof(flags_byte));
+  os.write(reinterpret_cast<const char*>(&flags_byte), sizeof(flags_byte));
   const uint16_t seed_hash(compute_seed_hash(seed));
   os.write((char*)&seed_hash, sizeof(seed_hash));
   if (!is_empty()) {
@@ -794,8 +794,8 @@
 
 template<typename A>
 void cpc_sketch_alloc<A>::write_hip(std::ostream& os) const {
-  os.write((char*)&kxp, sizeof(kxp));
-  os.write((char*)&hip_est_accum, sizeof(hip_est_accum));
+  os.write(reinterpret_cast<const char*>(&kxp), sizeof(kxp));
+  os.write(reinterpret_cast<const char*>(&hip_est_accum), sizeof(hip_est_accum));
 }
 
 template<typename A>

diff --git a/hll/include/HllUnion-internal.hpp b/hll/include/HllUnion-internal.hpp
index 5aa184f..0d12fd3 100644
--- a/hll/include/HllUnion-internal.hpp
+++ b/hll/include/HllUnion-internal.hpp

@@ -38,34 +38,6 @@
 {}
 
 template<typename A>
-hll_union_alloc<A> hll_union_alloc<A>::deserialize(const void* bytes, size_t len) {
-  hll_sketch_alloc<A> sk(hll_sketch_alloc<A>::deserialize(bytes, len));
-  // we're using the sketch's lg_config_k to initialize the union so
-  // we can initialize the Union with it as long as it's HLL_8.
-  hll_union_alloc<A> hllUnion(sk.get_lg_config_k());
-  if (sk.get_target_type() == HLL_8) {
-    std::swap(hllUnion.gadget.sketch_impl, sk.sketch_impl);
-  } else {
-    hllUnion.update(sk);
-  }
-  return hllUnion;
-}
-
-template<typename A>
-hll_union_alloc<A> hll_union_alloc<A>::deserialize(std::istream& is) {
-  hll_sketch_alloc<A> sk(hll_sketch_alloc<A>::deserialize(is));
-  // we're using the sketch's lg_config_k to initialize the union so
-  // we can initialize the Union with it as long as it's HLL_8.
-  hll_union_alloc<A> hllUnion(sk.get_lg_config_k());
-  if (sk.get_target_type() == HLL_8) {    
-    std::swap(hllUnion.gadget.sketch_impl, sk.sketch_impl);
-  } else {
-    hllUnion.update(sk);
-  }
-  return hllUnion;
-}
-
-template<typename A>
 hll_sketch_alloc<A> hll_union_alloc<A>::get_result(target_hll_type target_type) const {
   return hll_sketch_alloc<A>(gadget, target_type);
 }

diff --git a/hll/include/hll.hpp b/hll/include/hll.hpp
index c76ba48..3898dda 100644
--- a/hll/include/hll.hpp
+++ b/hll/include/hll.hpp

@@ -434,21 +434,6 @@
     explicit hll_union_alloc(int lg_max_k);
 
     /**
-     * Construct an hll_union operator from the given std::istream, which
-     * must be a valid serialized image of an hll_union.
-     * @param is The input stream from which to read.
-     */
-    static hll_union_alloc deserialize(std::istream& is);
-
-  /**
-     * Construct an hll_union operator from the given byte array, which
-     * must be a valid serialized image of an hll_union.
-     * @param bytes The byte array to read.
-     * @param len Byte array length in bytes.
-     */
-    static hll_union_alloc deserialize(const void* bytes, size_t len);
-
-    /**
      * Returns the current cardinality estimate
      * @return the cardinality estimate
      */

diff --git a/kll/include/kll_quantile_calculator_impl.hpp b/kll/include/kll_quantile_calculator_impl.hpp
index cb92674..f580819 100644
--- a/kll/include/kll_quantile_calculator_impl.hpp
+++ b/kll/include/kll_quantile_calculator_impl.hpp

@@ -81,7 +81,7 @@
 void kll_quantile_calculator<T, C, A>::convert_to_preceding_cummulative() {
   uint64_t subtotal = 0;
   for (auto& entry: entries_) {
-    const uint32_t new_subtotal = subtotal + entry.second;
+    const uint64_t new_subtotal = subtotal + entry.second;
     entry.second = subtotal;
     subtotal = new_subtotal;
   }

diff --git a/kll/include/kll_sketch.hpp b/kll/include/kll_sketch.hpp
index 8b3409c..a4530c9 100644
--- a/kll/include/kll_sketch.hpp
+++ b/kll/include/kll_sketch.hpp

@@ -315,8 +315,8 @@
      *
      * <p>If the sketch is empty this returns an empty vector.
      *
-     * @param split_points an array of <i>m</i> unique, monotonically increasing float values
-     * that divide the real number line into <i>m+1</i> consecutive disjoint intervals.
+     * @param split_points an array of <i>m</i> unique, monotonically increasing values
+     * that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
      * The definition of an "interval" is inclusive of the left split point (or minimum value) and
      * exclusive of the right split point, with the exception that the last interval will include
      * the maximum value.
@@ -338,8 +338,8 @@
      *
      * <p>If the sketch is empty this returns an empty vector.
      *
-     * @param split_points an array of <i>m</i> unique, monotonically increasing float values
-     * that divide the real number line into <i>m+1</i> consecutive disjoint intervals.
+     * @param split_points an array of <i>m</i> unique, monotonically increasing values
+     * that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
      * The definition of an "interval" is inclusive of the left split point (or minimum value) and
      * exclusive of the right split point, with the exception that the last interval will include
      * the maximum value.
@@ -401,7 +401,7 @@
      * @param is input stream
      * @return an instance of a sketch
      */
-    static kll_sketch<T, C, S, A> deserialize(std::istream& is);
+    static kll_sketch deserialize(std::istream& is);
 
     /**
      * This method deserializes a sketch from a given array of bytes.
@@ -409,7 +409,7 @@
      * @param size the size of the array
      * @return an instance of a sketch
      */
-    static kll_sketch<T, C, S, A> deserialize(const void* bytes, size_t size);
+    static kll_sketch deserialize(const void* bytes, size_t size);
 
     /*
      * Gets the normalized rank error given k and pmf.

diff --git a/kll/include/kll_sketch_impl.hpp b/kll/include/kll_sketch_impl.hpp
index d051135..f0c5ff3 100644
--- a/kll/include/kll_sketch_impl.hpp
+++ b/kll/include/kll_sketch_impl.hpp

@@ -91,7 +91,7 @@
 
 template<typename T, typename C, typename S, typename A>
 kll_sketch<T, C, S, A>& kll_sketch<T, C, S, A>::operator=(const kll_sketch& other) {
-  kll_sketch<T, C, S, A> copy(other);
+  kll_sketch copy(other);
   std::swap(k_, copy.k_);
   std::swap(m_, copy.m_);
   std::swap(min_k_, copy.min_k_);
@@ -271,6 +271,7 @@
 template<typename T, typename C, typename S, typename A>
 std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(const double* fractions, uint32_t size) const {
   std::vector<T, A> quantiles;
+  quantiles.reserve(size);
   if (is_empty()) return quantiles;
   std::unique_ptr<kll_quantile_calculator<T, C, A>, std::function<void(kll_quantile_calculator<T, C, A>*)>> quantile_calculator;
   quantiles.reserve(size);
@@ -469,9 +470,9 @@
   check_serial_version(serial_version);
   check_family_id(family_id);
 
+  if (!is.good()) throw std::runtime_error("error reading from std::istream");
   const bool is_empty(flags_byte & (1 << flags::IS_EMPTY));
-  if (!is.good()) throw std::runtime_error("error reading from std::istream"); 
-  if (is_empty) return kll_sketch<T, C, S, A>(k);
+  if (is_empty) return kll_sketch(k);
 
   uint64_t n;
   uint16_t min_k;
@@ -524,8 +525,7 @@
     // copy did not throw, repackage with destrtuctor
     max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter());
   }
-  if (!is.good())
-    throw std::runtime_error("error reading from std::istream");
+  if (!is.good()) throw std::runtime_error("error reading from std::istream");
   return kll_sketch(k, min_k, n, num_levels, std::move(levels), std::move(items), capacity,
       std::move(min_value), std::move(max_value), is_level_zero_sorted);
 }

diff --git a/kll/test/kll_sketch_test.cpp b/kll/test/kll_sketch_test.cpp
index ac71757..08e9abe 100644
--- a/kll/test/kll_sketch_test.cpp
+++ b/kll/test/kll_sketch_test.cpp

@@ -156,6 +156,34 @@
     REQUIRE(quantiles[2] == quantiles2[2]);
   }
 
+  SECTION("10 items") {
+    kll_float_sketch sketch;
+    sketch.update(1);
+    sketch.update(2);
+    sketch.update(3);
+    sketch.update(4);
+    sketch.update(5);
+    sketch.update(6);
+    sketch.update(7);
+    sketch.update(8);
+    sketch.update(9);
+    sketch.update(10);
+    REQUIRE(sketch.get_quantile(0) == 1.0);
+    REQUIRE(sketch.get_quantile(0.5) == 6.0);
+    REQUIRE(sketch.get_quantile(0.99) == 10.0);
+    REQUIRE(sketch.get_quantile(1) == 10.0);
+  }
+
+  SECTION("100 items") {
+    kll_float_sketch sketch;
+    for (int i = 0; i < 100; ++i) sketch.update(i);
+    REQUIRE(sketch.get_quantile(0) == 0);
+    REQUIRE(sketch.get_quantile(0.01) == 1);
+    REQUIRE(sketch.get_quantile(0.5) == 50);
+    REQUIRE(sketch.get_quantile(0.99) == 99.0);
+    REQUIRE(sketch.get_quantile(1) == 99.0);
+  }
+
   SECTION("many items, estimation mode") {
     kll_float_sketch sketch;
     const int n(1000000);

diff --git a/python/README.md b/python/README.md
index 6d8700a..8acd4c2 100644
--- a/python/README.md
+++ b/python/README.md

@@ -8,7 +8,7 @@
 An official pypi build is eventually planned but not yet available.
 
 If you instead want to take a (possibly ill-advised) gamble on the current state of the master branch being useable, you can run:
-```pip install git+https://github.com/apache/incubator-datasketches-cpp.git```
+```pip install git+https://github.com/apache/datasketches-cpp.git```
 
 ## Developer Instructions
 
@@ -16,8 +16,8 @@
 
 When cloning the source repository, you should include the pybind11 submodule with the `--recursive` option to the clone command:
 ```
-git clone --recursive https://github.com/apache/incubator-datasketches-cpp.git
-cd incubator-datasketches-cpp
+git clone --recursive https://github.com/apache/datasketches-cpp.git
+cd datasketches-cpp
 python -m pip install --upgrade pip setuptools wheel numpy
 python setup.py build
 ```

diff --git a/setup.py b/setup.py
index fdec2ba..47ba477 100644
--- a/setup.py
+++ b/setup.py

@@ -77,7 +77,7 @@
 
 setup(
     name='datasketches',
-    version='2.2.0-incubating-SNAPSHOT',
+    version='2.2.0-SNAPSHOT',
     author='Datasketches Developers',
     author_email='dev@datasketches.apache.org',
     description='A wrapper for the C++ Datasketches library',

diff --git a/theta/include/theta_sketch_impl.hpp b/theta/include/theta_sketch_impl.hpp
index c099e22..579a675 100644
--- a/theta/include/theta_sketch_impl.hpp
+++ b/theta/include/theta_sketch_impl.hpp

@@ -671,20 +671,20 @@
 void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
   const bool is_single_item = keys_.size() == 1 && !this->is_estimation_mode();
   const uint8_t preamble_longs = this->is_empty() || is_single_item ? 1 : this->is_estimation_mode() ? 3 : 2;
-  os.write((char*)&preamble_longs, sizeof(preamble_longs));
+  os.write(reinterpret_cast<const char*>(&preamble_longs), sizeof(preamble_longs));
   const uint8_t serial_version = theta_sketch_alloc<A>::SERIAL_VERSION;
-  os.write((char*)&serial_version, sizeof(serial_version));
+  os.write(reinterpret_cast<const char*>(&serial_version), sizeof(serial_version));
   const uint8_t type = SKETCH_TYPE;
-  os.write((char*)&type, sizeof(type));
+  os.write(reinterpret_cast<const char*>(&type), sizeof(type));
   const uint16_t unused16 = 0;
-  os.write((char*)&unused16, sizeof(unused16));
+  os.write(reinterpret_cast<const char*>(&unused16), sizeof(unused16));
   const uint8_t flags_byte(
     (1 << theta_sketch_alloc<A>::flags::IS_COMPACT) |
     (1 << theta_sketch_alloc<A>::flags::IS_READ_ONLY) |
     (this->is_empty() ? 1 << theta_sketch_alloc<A>::flags::IS_EMPTY : 0) |
     (this->is_ordered() ? 1 << theta_sketch_alloc<A>::flags::IS_ORDERED : 0)
   );
-  os.write((char*)&flags_byte, sizeof(flags_byte));
+  os.write(reinterpret_cast<const char*>(&flags_byte), sizeof(flags_byte));
   const uint16_t seed_hash = get_seed_hash();
   os.write((char*)&seed_hash, sizeof(seed_hash));
   if (!this->is_empty()) {

diff --git a/theta/include/theta_union.hpp b/theta/include/theta_union.hpp
index 7fa2095..6cf8ccc 100644
--- a/theta/include/theta_union.hpp
+++ b/theta/include/theta_union.hpp

@@ -24,7 +24,7 @@
 #include <functional>
 #include <climits>
 
-#include <theta_sketch.hpp>
+#include "theta_sketch.hpp"
 
 namespace datasketches {
 

diff --git a/tuple/include/array_of_doubles_a_not_b.hpp b/tuple/include/array_of_doubles_a_not_b.hpp
index c2bbc4e..67e14ca 100644
--- a/tuple/include/array_of_doubles_a_not_b.hpp
+++ b/tuple/include/array_of_doubles_a_not_b.hpp

@@ -29,10 +29,10 @@
 namespace datasketches {
 
 template<typename Allocator = std::allocator<double>>
-class array_of_doubles_a_not_b_alloc: tuple_a_not_b<std::vector<double, Allocator>, AllocVectorDouble<Allocator>> {
+class array_of_doubles_a_not_b_alloc: tuple_a_not_b<aod<Allocator>, AllocAOD<Allocator>> {
 public:
-  using Summary = std::vector<double, Allocator>;
-  using AllocSummary = AllocVectorDouble<Allocator>;
+  using Summary = aod<Allocator>;
+  using AllocSummary = AllocAOD<Allocator>;
   using Base = tuple_a_not_b<Summary, AllocSummary>;
   using CompactSketch = compact_array_of_doubles_sketch_alloc<Allocator>;
 

diff --git a/tuple/test/array_of_doubles_sketch_test.cpp b/tuple/test/array_of_doubles_sketch_test.cpp
index fa5fc92..7a5e359 100644
--- a/tuple/test/array_of_doubles_sketch_test.cpp
+++ b/tuple/test/array_of_doubles_sketch_test.cpp

@@ -26,6 +26,7 @@
 #include <array_of_doubles_sketch.hpp>
 #include <array_of_doubles_union.hpp>
 #include <array_of_doubles_intersection.hpp>
+#include <array_of_doubles_a_not_b.hpp>
 
 namespace datasketches {
 
@@ -280,4 +281,18 @@
   REQUIRE(result.get_estimate() == Approx(500).margin(0.01));
 }
 
+TEST_CASE("aod a-not-b: half overlap", "[tuple_sketch]") {
+  double a[1] = {1};
+
+  auto update_sketch1 = update_array_of_doubles_sketch::builder().build();
+  for (int i = 0; i < 1000; ++i) update_sketch1.update(i, a);
+
+  auto update_sketch2 = update_array_of_doubles_sketch::builder().build();
+  for (int i = 500; i < 1500; ++i) update_sketch2.update(i, a);
+
+  array_of_doubles_a_not_b a_not_b;
+  auto result = a_not_b.compute(update_sketch1, update_sketch2);
+  REQUIRE(result.get_estimate() == Approx(500).margin(0.01));
+}
+
 } /* namespace datasketches */
commit	202bc2cb9606723d686fe2a05684f3b8ff0a9b5a	[log] [tgz]
author	Alexander Saydakov <13126686+AlexanderSaydakov@users.noreply.github.com>	Mon Jan 25 14:45:51 2021 -0800
committer	GitHub <noreply@github.com>	Mon Jan 25 14:45:51 2021 -0800
tree	e6ae4803813f77f4ac16a3b5a1eabcf7064a9adc
parent	b2f749ed5ce6ba650f4259602b133c310c3a5ee4 [diff]
parent	9b99b9be6e45a623388a8d6eb2438130abe88f1c [diff]