Merge pull request #176 from apache/req_sketch
Req sketch
diff --git a/.asf.yaml b/.asf.yaml
new file mode 100644
index 0000000..15e33a5
--- /dev/null
+++ b/.asf.yaml
@@ -0,0 +1,2 @@
+github:
+ homepage: https://datasketches.apache.org
\ No newline at end of file
diff --git a/DISCLAIMER-WIP b/DISCLAIMER-WIP
deleted file mode 100644
index ae9f942..0000000
--- a/DISCLAIMER-WIP
+++ /dev/null
@@ -1,26 +0,0 @@
-Apache DataSketches (incubating) is an effort undergoing incubation
-at The Apache Software Foundation (ASF), sponsored by the Apache Incubator.
-
-Incubation is required of all newly accepted projects until a further review
-indicates that the infrastructure, communications, and decision making process
-have stabilized in a manner consistent with other successful ASF projects.
-
-While incubation status is not necessarily a reflection of the
-completeness or stability of the code, it does indicate that the
-project has yet to be fully endorsed by the ASF.
-
-Some of the incubating project's releases may not be fully compliant
-with ASF policy. For example, releases may have incomplete or
-un-reviewed licensing conditions. What follows is a list of known
-issues the project is currently aware of (note that this list, by
-definition, is likely to be incomplete):
-
- * No issues are known at this time.
-
-If you are planning to incorporate this work into your
-product or project, please be aware that you will need to conduct a
-thorough licensing review to determine the overall implications of
-including this work. For the current status of this project through the Apache
-Incubator visit:
-
-http://incubator.apache.org/projects/datasketches.html
diff --git a/LICENSE b/LICENSE
index 5947c4b..a52b236 100644
--- a/LICENSE
+++ b/LICENSE
@@ -247,7 +247,7 @@
SOFTWARE.
-------------------------------------------------------------
Code locations:
- * https://github.com/apache/incubator-datasketches-cpp/blob/master/setup.py
+ * https://github.com/apache/datasketches-cpp/blob/master/setup.py
that is adapted from the above.
@@ -285,7 +285,7 @@
DEALINGS IN THE SOFTWARE.
-------------------------------------------------------------
Code Locations
- * https://github.com/apache/incubator-datasketches-cpp/blob/master/common/test/catch.hpp
+ * https://github.com/apache/datasketches-cpp/blob/master/common/test/catch.hpp
that is adapted from the above.
diff --git a/NOTICE b/NOTICE
index 41a4505..d971e7b 100644
--- a/NOTICE
+++ b/NOTICE
@@ -1,4 +1,4 @@
-Apache DataSketches-cpp (incubating)
+Apache DataSketches-cpp
Copyright 2020 The Apache Software Foundation
Copyright 2015-2018 Yahoo
diff --git a/README.md b/README.md
index 44d4646..a92d62f 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
This component is also a dependency of other components of the library that create adaptors for target systems, such as PostgreSQL.
Note that we have a parallel core component for Java implementations of the same sketch algorithms,
-[incubator-datasketches-java](https://github.com/apache/incubator-datasketches-java).
+[datasketches-java](https://github.com/apache/datasketches-java).
Please visit the main [DataSketches website](https://datasketches.apache.org) for more information.
@@ -14,7 +14,7 @@
This code requires C++11. It was tested with GCC 4.8.5 (standard in RedHat at the time of this writing), GCC 8.2.0 and Apple LLVM version 10.0.1 (clang-1001.0.46.4)
-This includes Python bindings. For the Python interface, see the README notes in [the python subdirectory](https://github.com/apache/incubator-datasketches-cpp/tree/master/python).
+This includes Python bindings. For the Python interface, see the README notes in [the python subdirectory](https://github.com/apache/datasketches-cpp/tree/master/python).
This library is header-only. The build process provided is only for building unit tests and the python library.
@@ -40,7 +40,3 @@
$ cmake --build build --config Release
$ cmake --build build --config Release --target RUN_TESTS
```
-
-----
-
-Disclaimer: Apache DataSketches is an effort undergoing incubation at The Apache Software Foundation (ASF), sponsored by the Apache Incubator. Incubation is required of all newly accepted projects until a further review indicates that the infrastructure, communications, and decision making process have stabilized in a manner consistent with other successful ASF projects. While incubation status is not necessarily a reflection of the completeness or stability of the code, it does indicate that the project has yet to be fully endorsed by the ASF.
\ No newline at end of file
diff --git a/cpc/include/cpc_sketch_impl.hpp b/cpc/include/cpc_sketch_impl.hpp
index ebb6948..e6bc010 100644
--- a/cpc/include/cpc_sketch_impl.hpp
+++ b/cpc/include/cpc_sketch_impl.hpp
@@ -410,20 +410,20 @@
const bool has_table = compressed.table_data.size() > 0;
const bool has_window = compressed.window_data.size() > 0;
const uint8_t preamble_ints = get_preamble_ints(num_coupons, has_hip, has_table, has_window);
- os.write((char*)&preamble_ints, sizeof(preamble_ints));
+ os.write(reinterpret_cast<const char*>(&preamble_ints), sizeof(preamble_ints));
const uint8_t serial_version = SERIAL_VERSION;
- os.write((char*)&serial_version, sizeof(serial_version));
+ os.write(reinterpret_cast<const char*>(&serial_version), sizeof(serial_version));
const uint8_t family = FAMILY;
- os.write((char*)&family, sizeof(family));
- os.write((char*)&lg_k, sizeof(lg_k));
- os.write((char*)&first_interesting_column, sizeof(first_interesting_column));
+ os.write(reinterpret_cast<const char*>(&family), sizeof(family));
+ os.write(reinterpret_cast<const char*>(&lg_k), sizeof(lg_k));
+ os.write(reinterpret_cast<const char*>(&first_interesting_column), sizeof(first_interesting_column));
const uint8_t flags_byte(
(1 << flags::IS_COMPRESSED)
| (has_hip ? 1 << flags::HAS_HIP : 0)
| (has_table ? 1 << flags::HAS_TABLE : 0)
| (has_window ? 1 << flags::HAS_WINDOW : 0)
);
- os.write((char*)&flags_byte, sizeof(flags_byte));
+ os.write(reinterpret_cast<const char*>(&flags_byte), sizeof(flags_byte));
const uint16_t seed_hash(compute_seed_hash(seed));
os.write((char*)&seed_hash, sizeof(seed_hash));
if (!is_empty()) {
@@ -794,8 +794,8 @@
template<typename A>
void cpc_sketch_alloc<A>::write_hip(std::ostream& os) const {
- os.write((char*)&kxp, sizeof(kxp));
- os.write((char*)&hip_est_accum, sizeof(hip_est_accum));
+ os.write(reinterpret_cast<const char*>(&kxp), sizeof(kxp));
+ os.write(reinterpret_cast<const char*>(&hip_est_accum), sizeof(hip_est_accum));
}
template<typename A>
diff --git a/hll/include/HllUnion-internal.hpp b/hll/include/HllUnion-internal.hpp
index 5aa184f..0d12fd3 100644
--- a/hll/include/HllUnion-internal.hpp
+++ b/hll/include/HllUnion-internal.hpp
@@ -38,34 +38,6 @@
{}
template<typename A>
-hll_union_alloc<A> hll_union_alloc<A>::deserialize(const void* bytes, size_t len) {
- hll_sketch_alloc<A> sk(hll_sketch_alloc<A>::deserialize(bytes, len));
- // we're using the sketch's lg_config_k to initialize the union so
- // we can initialize the Union with it as long as it's HLL_8.
- hll_union_alloc<A> hllUnion(sk.get_lg_config_k());
- if (sk.get_target_type() == HLL_8) {
- std::swap(hllUnion.gadget.sketch_impl, sk.sketch_impl);
- } else {
- hllUnion.update(sk);
- }
- return hllUnion;
-}
-
-template<typename A>
-hll_union_alloc<A> hll_union_alloc<A>::deserialize(std::istream& is) {
- hll_sketch_alloc<A> sk(hll_sketch_alloc<A>::deserialize(is));
- // we're using the sketch's lg_config_k to initialize the union so
- // we can initialize the Union with it as long as it's HLL_8.
- hll_union_alloc<A> hllUnion(sk.get_lg_config_k());
- if (sk.get_target_type() == HLL_8) {
- std::swap(hllUnion.gadget.sketch_impl, sk.sketch_impl);
- } else {
- hllUnion.update(sk);
- }
- return hllUnion;
-}
-
-template<typename A>
hll_sketch_alloc<A> hll_union_alloc<A>::get_result(target_hll_type target_type) const {
return hll_sketch_alloc<A>(gadget, target_type);
}
diff --git a/hll/include/hll.hpp b/hll/include/hll.hpp
index c76ba48..3898dda 100644
--- a/hll/include/hll.hpp
+++ b/hll/include/hll.hpp
@@ -434,21 +434,6 @@
explicit hll_union_alloc(int lg_max_k);
/**
- * Construct an hll_union operator from the given std::istream, which
- * must be a valid serialized image of an hll_union.
- * @param is The input stream from which to read.
- */
- static hll_union_alloc deserialize(std::istream& is);
-
- /**
- * Construct an hll_union operator from the given byte array, which
- * must be a valid serialized image of an hll_union.
- * @param bytes The byte array to read.
- * @param len Byte array length in bytes.
- */
- static hll_union_alloc deserialize(const void* bytes, size_t len);
-
- /**
* Returns the current cardinality estimate
* @return the cardinality estimate
*/
diff --git a/kll/include/kll_quantile_calculator_impl.hpp b/kll/include/kll_quantile_calculator_impl.hpp
index cb92674..f580819 100644
--- a/kll/include/kll_quantile_calculator_impl.hpp
+++ b/kll/include/kll_quantile_calculator_impl.hpp
@@ -81,7 +81,7 @@
void kll_quantile_calculator<T, C, A>::convert_to_preceding_cummulative() {
uint64_t subtotal = 0;
for (auto& entry: entries_) {
- const uint32_t new_subtotal = subtotal + entry.second;
+ const uint64_t new_subtotal = subtotal + entry.second;
entry.second = subtotal;
subtotal = new_subtotal;
}
diff --git a/kll/include/kll_sketch.hpp b/kll/include/kll_sketch.hpp
index 8b3409c..a4530c9 100644
--- a/kll/include/kll_sketch.hpp
+++ b/kll/include/kll_sketch.hpp
@@ -315,8 +315,8 @@
*
* <p>If the sketch is empty this returns an empty vector.
*
- * @param split_points an array of <i>m</i> unique, monotonically increasing float values
- * that divide the real number line into <i>m+1</i> consecutive disjoint intervals.
+ * @param split_points an array of <i>m</i> unique, monotonically increasing values
+ * that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
* The definition of an "interval" is inclusive of the left split point (or minimum value) and
* exclusive of the right split point, with the exception that the last interval will include
* the maximum value.
@@ -338,8 +338,8 @@
*
* <p>If the sketch is empty this returns an empty vector.
*
- * @param split_points an array of <i>m</i> unique, monotonically increasing float values
- * that divide the real number line into <i>m+1</i> consecutive disjoint intervals.
+ * @param split_points an array of <i>m</i> unique, monotonically increasing values
+ * that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
* The definition of an "interval" is inclusive of the left split point (or minimum value) and
* exclusive of the right split point, with the exception that the last interval will include
* the maximum value.
@@ -401,7 +401,7 @@
* @param is input stream
* @return an instance of a sketch
*/
- static kll_sketch<T, C, S, A> deserialize(std::istream& is);
+ static kll_sketch deserialize(std::istream& is);
/**
* This method deserializes a sketch from a given array of bytes.
@@ -409,7 +409,7 @@
* @param size the size of the array
* @return an instance of a sketch
*/
- static kll_sketch<T, C, S, A> deserialize(const void* bytes, size_t size);
+ static kll_sketch deserialize(const void* bytes, size_t size);
/*
* Gets the normalized rank error given k and pmf.
diff --git a/kll/include/kll_sketch_impl.hpp b/kll/include/kll_sketch_impl.hpp
index d051135..f0c5ff3 100644
--- a/kll/include/kll_sketch_impl.hpp
+++ b/kll/include/kll_sketch_impl.hpp
@@ -91,7 +91,7 @@
template<typename T, typename C, typename S, typename A>
kll_sketch<T, C, S, A>& kll_sketch<T, C, S, A>::operator=(const kll_sketch& other) {
- kll_sketch<T, C, S, A> copy(other);
+ kll_sketch copy(other);
std::swap(k_, copy.k_);
std::swap(m_, copy.m_);
std::swap(min_k_, copy.min_k_);
@@ -271,6 +271,7 @@
template<typename T, typename C, typename S, typename A>
std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(const double* fractions, uint32_t size) const {
std::vector<T, A> quantiles;
+ quantiles.reserve(size);
if (is_empty()) return quantiles;
std::unique_ptr<kll_quantile_calculator<T, C, A>, std::function<void(kll_quantile_calculator<T, C, A>*)>> quantile_calculator;
quantiles.reserve(size);
@@ -469,9 +470,9 @@
check_serial_version(serial_version);
check_family_id(family_id);
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
const bool is_empty(flags_byte & (1 << flags::IS_EMPTY));
- if (!is.good()) throw std::runtime_error("error reading from std::istream");
- if (is_empty) return kll_sketch<T, C, S, A>(k);
+ if (is_empty) return kll_sketch(k);
uint64_t n;
uint16_t min_k;
@@ -524,8 +525,7 @@
// copy did not throw, repackage with destrtuctor
max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter());
}
- if (!is.good())
- throw std::runtime_error("error reading from std::istream");
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
return kll_sketch(k, min_k, n, num_levels, std::move(levels), std::move(items), capacity,
std::move(min_value), std::move(max_value), is_level_zero_sorted);
}
diff --git a/kll/test/kll_sketch_test.cpp b/kll/test/kll_sketch_test.cpp
index ac71757..08e9abe 100644
--- a/kll/test/kll_sketch_test.cpp
+++ b/kll/test/kll_sketch_test.cpp
@@ -156,6 +156,34 @@
REQUIRE(quantiles[2] == quantiles2[2]);
}
+ SECTION("10 items") {
+ kll_float_sketch sketch;
+ sketch.update(1);
+ sketch.update(2);
+ sketch.update(3);
+ sketch.update(4);
+ sketch.update(5);
+ sketch.update(6);
+ sketch.update(7);
+ sketch.update(8);
+ sketch.update(9);
+ sketch.update(10);
+ REQUIRE(sketch.get_quantile(0) == 1.0);
+ REQUIRE(sketch.get_quantile(0.5) == 6.0);
+ REQUIRE(sketch.get_quantile(0.99) == 10.0);
+ REQUIRE(sketch.get_quantile(1) == 10.0);
+ }
+
+ SECTION("100 items") {
+ kll_float_sketch sketch;
+ for (int i = 0; i < 100; ++i) sketch.update(i);
+ REQUIRE(sketch.get_quantile(0) == 0);
+ REQUIRE(sketch.get_quantile(0.01) == 1);
+ REQUIRE(sketch.get_quantile(0.5) == 50);
+ REQUIRE(sketch.get_quantile(0.99) == 99.0);
+ REQUIRE(sketch.get_quantile(1) == 99.0);
+ }
+
SECTION("many items, estimation mode") {
kll_float_sketch sketch;
const int n(1000000);
diff --git a/python/README.md b/python/README.md
index 6d8700a..8acd4c2 100644
--- a/python/README.md
+++ b/python/README.md
@@ -8,7 +8,7 @@
An official pypi build is eventually planned but not yet available.
If you instead want to take a (possibly ill-advised) gamble on the current state of the master branch being useable, you can run:
-```pip install git+https://github.com/apache/incubator-datasketches-cpp.git```
+```pip install git+https://github.com/apache/datasketches-cpp.git```
## Developer Instructions
@@ -16,8 +16,8 @@
When cloning the source repository, you should include the pybind11 submodule with the `--recursive` option to the clone command:
```
-git clone --recursive https://github.com/apache/incubator-datasketches-cpp.git
-cd incubator-datasketches-cpp
+git clone --recursive https://github.com/apache/datasketches-cpp.git
+cd datasketches-cpp
python -m pip install --upgrade pip setuptools wheel numpy
python setup.py build
```
diff --git a/setup.py b/setup.py
index fdec2ba..47ba477 100644
--- a/setup.py
+++ b/setup.py
@@ -77,7 +77,7 @@
setup(
name='datasketches',
- version='2.2.0-incubating-SNAPSHOT',
+ version='2.2.0-SNAPSHOT',
author='Datasketches Developers',
author_email='dev@datasketches.apache.org',
description='A wrapper for the C++ Datasketches library',
diff --git a/theta/include/theta_sketch_impl.hpp b/theta/include/theta_sketch_impl.hpp
index c099e22..579a675 100644
--- a/theta/include/theta_sketch_impl.hpp
+++ b/theta/include/theta_sketch_impl.hpp
@@ -671,20 +671,20 @@
void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
const bool is_single_item = keys_.size() == 1 && !this->is_estimation_mode();
const uint8_t preamble_longs = this->is_empty() || is_single_item ? 1 : this->is_estimation_mode() ? 3 : 2;
- os.write((char*)&preamble_longs, sizeof(preamble_longs));
+ os.write(reinterpret_cast<const char*>(&preamble_longs), sizeof(preamble_longs));
const uint8_t serial_version = theta_sketch_alloc<A>::SERIAL_VERSION;
- os.write((char*)&serial_version, sizeof(serial_version));
+ os.write(reinterpret_cast<const char*>(&serial_version), sizeof(serial_version));
const uint8_t type = SKETCH_TYPE;
- os.write((char*)&type, sizeof(type));
+ os.write(reinterpret_cast<const char*>(&type), sizeof(type));
const uint16_t unused16 = 0;
- os.write((char*)&unused16, sizeof(unused16));
+ os.write(reinterpret_cast<const char*>(&unused16), sizeof(unused16));
const uint8_t flags_byte(
(1 << theta_sketch_alloc<A>::flags::IS_COMPACT) |
(1 << theta_sketch_alloc<A>::flags::IS_READ_ONLY) |
(this->is_empty() ? 1 << theta_sketch_alloc<A>::flags::IS_EMPTY : 0) |
(this->is_ordered() ? 1 << theta_sketch_alloc<A>::flags::IS_ORDERED : 0)
);
- os.write((char*)&flags_byte, sizeof(flags_byte));
+ os.write(reinterpret_cast<const char*>(&flags_byte), sizeof(flags_byte));
const uint16_t seed_hash = get_seed_hash();
os.write((char*)&seed_hash, sizeof(seed_hash));
if (!this->is_empty()) {
diff --git a/theta/include/theta_union.hpp b/theta/include/theta_union.hpp
index 7fa2095..6cf8ccc 100644
--- a/theta/include/theta_union.hpp
+++ b/theta/include/theta_union.hpp
@@ -24,7 +24,7 @@
#include <functional>
#include <climits>
-#include <theta_sketch.hpp>
+#include "theta_sketch.hpp"
namespace datasketches {
diff --git a/tuple/include/array_of_doubles_a_not_b.hpp b/tuple/include/array_of_doubles_a_not_b.hpp
index c2bbc4e..67e14ca 100644
--- a/tuple/include/array_of_doubles_a_not_b.hpp
+++ b/tuple/include/array_of_doubles_a_not_b.hpp
@@ -29,10 +29,10 @@
namespace datasketches {
template<typename Allocator = std::allocator<double>>
-class array_of_doubles_a_not_b_alloc: tuple_a_not_b<std::vector<double, Allocator>, AllocVectorDouble<Allocator>> {
+class array_of_doubles_a_not_b_alloc: tuple_a_not_b<aod<Allocator>, AllocAOD<Allocator>> {
public:
- using Summary = std::vector<double, Allocator>;
- using AllocSummary = AllocVectorDouble<Allocator>;
+ using Summary = aod<Allocator>;
+ using AllocSummary = AllocAOD<Allocator>;
using Base = tuple_a_not_b<Summary, AllocSummary>;
using CompactSketch = compact_array_of_doubles_sketch_alloc<Allocator>;
diff --git a/tuple/test/array_of_doubles_sketch_test.cpp b/tuple/test/array_of_doubles_sketch_test.cpp
index fa5fc92..7a5e359 100644
--- a/tuple/test/array_of_doubles_sketch_test.cpp
+++ b/tuple/test/array_of_doubles_sketch_test.cpp
@@ -26,6 +26,7 @@
#include <array_of_doubles_sketch.hpp>
#include <array_of_doubles_union.hpp>
#include <array_of_doubles_intersection.hpp>
+#include <array_of_doubles_a_not_b.hpp>
namespace datasketches {
@@ -280,4 +281,18 @@
REQUIRE(result.get_estimate() == Approx(500).margin(0.01));
}
+TEST_CASE("aod a-not-b: half overlap", "[tuple_sketch]") {
+ double a[1] = {1};
+
+ auto update_sketch1 = update_array_of_doubles_sketch::builder().build();
+ for (int i = 0; i < 1000; ++i) update_sketch1.update(i, a);
+
+ auto update_sketch2 = update_array_of_doubles_sketch::builder().build();
+ for (int i = 500; i < 1500; ++i) update_sketch2.update(i, a);
+
+ array_of_doubles_a_not_b a_not_b;
+ auto result = a_not_b.compute(update_sketch1, update_sketch2);
+ REQUIRE(result.get_estimate() == Approx(500).margin(0.01));
+}
+
} /* namespace datasketches */