Update workflows to skip creating build/, add python descriptions
diff --git a/.github/workflows/build_cmake.yml b/.github/workflows/build_cmake.yml
index be1f1b5..742a712 100644
--- a/.github/workflows/build_cmake.yml
+++ b/.github/workflows/build_cmake.yml
@@ -44,7 +44,7 @@
with:
submodules: true
- name: Configure
- run: mkdir build && cd build && cmake ..
+ run: cd build && cmake ..
- name: Build C++ unit tests
run: cmake --build build --config Release
- name: Run C++ tests
diff --git a/.github/workflows/code_coverage.yml b/.github/workflows/code_coverage.yml
index 6dfbbcb..b05bca0 100644
--- a/.github/workflows/code_coverage.yml
+++ b/.github/workflows/code_coverage.yml
@@ -24,7 +24,7 @@
cd "lcov-$VERSION"
sudo make install
- name: Configure
- run: mkdir build && cd build && cmake .. -DCOVERAGE=ON
+ run: cd build && cmake .. -DCOVERAGE=ON
- name: Build unit tests
run: cmake --build build
- name: Run tests
diff --git a/python/src/cpc_wrapper.cpp b/python/src/cpc_wrapper.cpp
index 2a8ffa4..673e3c7 100644
--- a/python/src/cpc_wrapper.cpp
+++ b/python/src/cpc_wrapper.cpp
@@ -53,25 +53,38 @@
using namespace datasketches;
py::class_<cpc_sketch>(m, "cpc_sketch")
- .def(py::init<uint8_t, uint64_t>(), py::arg("lg_k"), py::arg("seed")=DEFAULT_SEED)
+ .def(py::init<uint8_t, uint64_t>(), py::arg("lg_k")=CPC_DEFAULT_LG_K, py::arg("seed")=DEFAULT_SEED)
.def(py::init<const cpc_sketch&>())
- .def("__str__", &cpc_sketch::to_string)
- .def("to_string", &cpc_sketch::to_string)
- .def("serialize", &dspy::cpc_sketch_serialize)
- .def_static("deserialize", &dspy::cpc_sketch_deserialize)
- .def<void (cpc_sketch::*)(uint64_t)>("update", &cpc_sketch::update, py::arg("datum"))
- .def<void (cpc_sketch::*)(double)>("update", &cpc_sketch::update, py::arg("datum"))
- .def<void (cpc_sketch::*)(const std::string&)>("update", &cpc_sketch::update, py::arg("datum"))
- .def("is_empty", &cpc_sketch::is_empty)
- .def("get_estimate", &cpc_sketch::get_estimate)
- .def("get_lower_bound", &cpc_sketch::get_lower_bound, py::arg("kappa"))
- .def("get_upper_bound", &cpc_sketch::get_upper_bound, py::arg("kappa"))
+ .def("__str__", &cpc_sketch::to_string,
+ "Produces a string summary of the sketch")
+ .def("to_string", &cpc_sketch::to_string,
+ "Produces a string summary of the sketch")
+ .def("serialize", &dspy::cpc_sketch_serialize,
+ "Serializes the sketch into a bytes object")
+ .def_static("deserialize", &dspy::cpc_sketch_deserialize,
+ "Reads a bytes object and returns the corresponding cpc_sketch")
+ .def<void (cpc_sketch::*)(uint64_t)>("update", &cpc_sketch::update, py::arg("datum"),
+ "Updates the sketch with the given 64-bit integer value")
+ .def<void (cpc_sketch::*)(double)>("update", &cpc_sketch::update, py::arg("datum"),
+ "Updates the sketch with the given 64-bit floating point")
+ .def<void (cpc_sketch::*)(const std::string&)>("update", &cpc_sketch::update, py::arg("datum"),
+ "Updates the sketch with the given string")
+ .def("is_empty", &cpc_sketch::is_empty,
+ "Returns True if the sketch is empty, otherwise False")
+ .def("get_estimate", &cpc_sketch::get_estimate,
+ "Estimate of the distinct count of the input stream")
+ .def("get_lower_bound", &cpc_sketch::get_lower_bound, py::arg("kappa"),
+ "Returns an approximate lower bound on the estimate for kappa values in {1, 2, 3}, roughly corresponding to standard deviations")
+ .def("get_upper_bound", &cpc_sketch::get_upper_bound, py::arg("kappa"),
+ "Returns an approximate upper bound on the estimate for kappa values in {1, 2, 3}, roughly corresponding to standard deviations")
;
py::class_<cpc_union>(m, "cpc_union")
.def(py::init<uint8_t, uint64_t>(), py::arg("lg_k"), py::arg("seed")=DEFAULT_SEED)
.def(py::init<const cpc_union&>())
- .def("update", (void (cpc_union::*)(const cpc_sketch&)) &cpc_union::update, py::arg("sketch"))
- .def("get_result", &dspy::cpc_union_get_result)
+ .def("update", (void (cpc_union::*)(const cpc_sketch&)) &cpc_union::update, py::arg("sketch"),
+ "Updates the union with the provided CPC sketch")
+ .def("get_result", &dspy::cpc_union_get_result,
+ "Returns a CPC sketch with the result of the union")
;
}
diff --git a/python/src/fi_wrapper.cpp b/python/src/fi_wrapper.cpp
index 33b725d..c4066b7 100644
--- a/python/src/fi_wrapper.cpp
+++ b/python/src/fi_wrapper.cpp
@@ -75,23 +75,39 @@
py::class_<frequent_items_sketch<T>>(m, name)
.def(py::init<uint8_t>(), py::arg("lg_max_k"))
- .def("__str__", &frequent_items_sketch<T>::to_string, py::arg("print_items")=false)
- .def("to_string", &frequent_items_sketch<T>::to_string, py::arg("print_items")=false)
- .def("update", (void (frequent_items_sketch<T>::*)(const T&, uint64_t)) &frequent_items_sketch<T>::update, py::arg("item"), py::arg("weight")=1)
+ .def("__str__", &frequent_items_sketch<T>::to_string, py::arg("print_items")=false,
+ "Produces a string summary of the sketch")
+ .def("to_string", &frequent_items_sketch<T>::to_string, py::arg("print_items")=false,
+ "Produces a string summary of the sketch")
+ .def("update", (void (frequent_items_sketch<T>::*)(const T&, uint64_t)) &frequent_items_sketch<T>::update, py::arg("item"), py::arg("weight")=1,
+ "Updates the sketch with the given string and, optionally, a weight")
.def("get_frequent_items", &dspy::fi_sketch_get_frequent_items<T>, py::arg("err_type"), py::arg("threshold")=0)
- .def("merge", (void (frequent_items_sketch<T>::*)(const frequent_items_sketch<T>&)) &frequent_items_sketch<T>::merge)
- .def("is_empty", &frequent_items_sketch<T>::is_empty)
- .def("get_num_active_items", &frequent_items_sketch<T>::get_num_active_items)
- .def("get_total_weight", &frequent_items_sketch<T>::get_total_weight)
- .def("get_estimate", &frequent_items_sketch<T>::get_estimate, py::arg("item"))
- .def("get_lower_bound", &frequent_items_sketch<T>::get_lower_bound, py::arg("item"))
- .def("get_upper_bound", &frequent_items_sketch<T>::get_upper_bound, py::arg("item"))
- .def("get_sketch_epsilon", (double (frequent_items_sketch<T>::*)(void) const) &frequent_items_sketch<T>::get_epsilon)
- .def_static("get_epsilon_for_lg_size", &dspy::fi_sketch_get_generic_epsilon<T>, py::arg("lg_max_map_size"))
- .def_static("get_apriori_error", &frequent_items_sketch<T>::get_apriori_error, py::arg("lg_max_map_size"), py::arg("estimated_total_weight"))
- .def("get_serialized_size_bytes", &frequent_items_sketch<T>::get_serialized_size_bytes)
- .def("serialize", &dspy::fi_sketch_serialize<T>)
- .def_static("deserialize", &dspy::fi_sketch_deserialize<T>)
+ .def("merge", (void (frequent_items_sketch<T>::*)(const frequent_items_sketch<T>&)) &frequent_items_sketch<T>::merge,
+ "Merges the given sketch into this one")
+ .def("is_empty", &frequent_items_sketch<T>::is_empty,
+ "Returns True if the sketch is empty, otherwise False")
+ .def("get_num_active_items", &frequent_items_sketch<T>::get_num_active_items,
+ "Returns the number of active items in the sketch")
+ .def("get_total_weight", &frequent_items_sketch<T>::get_total_weight,
+ "Returns the sum of the weights (frequencies) in the stream seen so far by the sketch")
+ .def("get_estimate", &frequent_items_sketch<T>::get_estimate, py::arg("item"),
+ "Returns the estimate of the weight (frequency) of the given item.\n"
+ "Note: The true frequency of an item would be the sum of the counts as a result of the "
+ "two update functions.")
+ .def("get_lower_bound", &frequent_items_sketch<T>::get_lower_bound, py::arg("item"),
+ "Returns the guaranteed lower bound weight (frequency) of the given item.")
+ .def("get_upper_bound", &frequent_items_sketch<T>::get_upper_bound, py::arg("item"),
+ "Returns the guaranteed upper bound weight (frequency) of the given item.")
+ .def("get_sketch_epsilon", (double (frequent_items_sketch<T>::*)(void) const) &frequent_items_sketch<T>::get_epsilon,
+ "Returns the epsilon value used by the sketch to compute error")
+ .def_static("get_epsilon_for_lg_size", &dspy::fi_sketch_get_generic_epsilon<T>, py::arg("lg_max_map_size"),
+ "Returns the epsilon value used to compute a priori error for a given log2(max_map_size)")
+ .def_static("get_apriori_error", &frequent_items_sketch<T>::get_apriori_error, py::arg("lg_max_map_size"), py::arg("estimated_total_weight"),
+ "Returns the estimated a priori error given the max_map_size for the sketch and the estimated_total_stream_weight.")
+ .def("get_serialized_size_bytes", &frequent_items_sketch<T>::get_serialized_size_bytes,
+ "Computes the size needed to serialize the current state of the sketch. This can be expensive since every item needs to be looked at.")
+ .def("serialize", &dspy::fi_sketch_serialize<T>, "Serializes the sketch into a bytes object")
+ .def_static("deserialize", &dspy::fi_sketch_deserialize<T>, "Reads a bytes object and returns the corresponding frequent_strings_sketch")
;
}
diff --git a/python/src/hll_wrapper.cpp b/python/src/hll_wrapper.cpp
index e3c752a..c4c97fe 100644
--- a/python/src/hll_wrapper.cpp
+++ b/python/src/hll_wrapper.cpp
@@ -59,51 +59,78 @@
.def(py::init<int>(), py::arg("lg_k"))
.def(py::init<int, target_hll_type>(), py::arg("lg_k"), py::arg("tgt_type"))
.def(py::init<int, target_hll_type, bool>(), py::arg("lg_k"), py::arg("tgt_type"), py::arg("start_max_size")=false)
- .def_static("deserialize", &dspy::hll_sketch_deserialize)
- .def("serialize_compact", &dspy::hll_sketch_serialize_compact)
- .def("serialize_updatable", &dspy::hll_sketch_serialize_updatable)
+ .def_static("deserialize", &dspy::hll_sketch_deserialize,
+ "Reads a bytes object and returns the corresponding hll_sketch")
+ .def("serialize_compact", &dspy::hll_sketch_serialize_compact,
+ "Serializes the sketch into a bytes object, compressing the exception table if HLL_4")
+ .def("serialize_updatable", &dspy::hll_sketch_serialize_updatable,
+ "Serializes the sketch into a bytes object")
.def("__str__", (std::string (hll_sketch::*)(bool,bool,bool,bool) const) &hll_sketch::to_string,
- py::arg("summary")=true, py::arg("detail")=false, py::arg("aux_detail")=false, py::arg("all")=false)
+ py::arg("summary")=true, py::arg("detail")=false, py::arg("aux_detail")=false, py::arg("all")=false,
+ "Produces a string summary of the sketch")
.def("to_string", (std::string (hll_sketch::*)(bool,bool,bool,bool) const) &hll_sketch::to_string,
- py::arg("summary")=true, py::arg("detail")=false, py::arg("aux_detail")=false, py::arg("all")=false)
- .def_property_readonly("lg_config_k", &hll_sketch::get_lg_config_k)
- .def_property_readonly("tgt_type", &hll_sketch::get_target_type)
- .def("get_estimate", &hll_sketch::get_estimate)
- .def("get_lower_bound", &hll_sketch::get_lower_bound, py::arg("num_std_devs"))
- .def("get_upper_bound", &hll_sketch::get_upper_bound, py::arg("num_std_devs"))
- .def("is_compact", &hll_sketch::is_compact)
- .def("is_empty", &hll_sketch::is_empty)
- .def("get_updatable_serialization_bytes", &hll_sketch::get_updatable_serialization_bytes)
- .def("get_compact_serialization_bytes", &hll_sketch::get_compact_serialization_bytes)
- .def("reset", &hll_sketch::reset)
- .def("update", (void (hll_sketch::*)(int64_t)) &hll_sketch::update, py::arg("datum"))
- .def("update", (void (hll_sketch::*)(double)) &hll_sketch::update, py::arg("datum"))
- .def("update", (void (hll_sketch::*)(const std::string&)) &hll_sketch::update, py::arg("datum"))
+ py::arg("summary")=true, py::arg("detail")=false, py::arg("aux_detail")=false, py::arg("all")=false,
+ "Produces a string summary of the sketch")
+ .def_property_readonly("lg_config_k", &hll_sketch::get_lg_config_k, "Configured lg_k value for the sketch")
+ .def_property_readonly("tgt_type", &hll_sketch::get_target_type, "Returns the HLL type (4, 6, or 8) when in estimation mode")
+ .def("get_estimate", &hll_sketch::get_estimate,
+ "Estimate of the distinct count of the input stream")
+ .def("get_lower_bound", &hll_sketch::get_lower_bound, py::arg("num_std_devs"),
+ "Returns the approximate lower error bound given the specified number of standard deviations in {1, 2, 3}")
+ .def("get_upper_bound", &hll_sketch::get_upper_bound, py::arg("num_std_devs"),
+ "Returns the approximate upper error bound given the specified number of standard deviations in {1, 2, 3}")
+ .def("is_compact", &hll_sketch::is_compact,
+ "True if the sketch is compact, otherwise False")
+ .def("is_empty", &hll_sketch::is_empty,
+ "True if the sketch is empty, otherwise False")
+ .def("get_updatable_serialization_bytes", &hll_sketch::get_updatable_serialization_bytes,
+ "Returns the size of the serialized sketch")
+ .def("get_compact_serialization_bytes", &hll_sketch::get_compact_serialization_bytes,
+ "Returns the size of the serialized sketch when compressing the exception table if HLL_4")
+ .def("reset", &hll_sketch::reset,
+ "Resets the sketch to the empty state in coupon collection mode")
+ .def("update", (void (hll_sketch::*)(int64_t)) &hll_sketch::update, py::arg("datum"),
+ "Updates the sketch with the given integral value")
+ .def("update", (void (hll_sketch::*)(double)) &hll_sketch::update, py::arg("datum"),
+ "Updates the sketch with the given floating point value")
+ .def("update", (void (hll_sketch::*)(const std::string&)) &hll_sketch::update, py::arg("datum"),
+ "Updates the sketch with the given string value")
.def_static("get_max_updatable_serialization_bytes", &hll_sketch::get_max_updatable_serialization_bytes,
- py::arg("lg_k"), py::arg("tgt_type"))
+ py::arg("lg_k"), py::arg("tgt_type"),
+ "Provides a likely upper bound on serialization size for the given parameters")
.def_static("get_rel_err", &hll_sketch::get_rel_err,
- py::arg("upper_bound"), py::arg("unioned"), py::arg("lg_k"), py::arg("num_std_devs"))
+ py::arg("upper_bound"), py::arg("unioned"), py::arg("lg_k"), py::arg("num_std_devs"),
+ "Returns the a priori relative error bound for the given parameters")
;
py::class_<hll_union>(m, "hll_union")
.def(py::init<int>(), py::arg("lg_max_k"))
- .def_property_readonly("lg_config_k", &hll_union::get_lg_config_k)
- .def_property_readonly("tgt_type", &hll_union::get_target_type)
- .def("get_estimate", &hll_union::get_estimate)
- .def("get_lower_bound", &hll_union::get_lower_bound, py::arg("num_std_devs"))
- .def("get_upper_bound", &hll_union::get_upper_bound, py::arg("num_std_devs"))
- .def("is_compact", &hll_union::is_compact)
- .def("is_empty", &hll_union::is_empty)
- .def("get_updatable_serialization_bytes", &hll_union::get_updatable_serialization_bytes)
- .def("get_compact_serialization_bytes", &hll_union::get_compact_serialization_bytes)
- .def("reset", &hll_union::reset)
- .def("get_result", &hll_union::get_result, py::arg("tgt_type")=HLL_4)
- .def<void (hll_union::*)(const hll_sketch&)>("update", &hll_union::update, py::arg("sketch"))
- .def<void (hll_union::*)(int64_t)>("update", &hll_union::update, py::arg("datum"))
- .def<void (hll_union::*)(double)>("update", &hll_union::update, py::arg("datum"))
- .def<void (hll_union::*)(const std::string&)>("update", &hll_union::update, py::arg("datum"))
- .def_static("get_max_serialization_bytes", &hll_union::get_max_serialization_bytes, py::arg("lg_k"))
+ .def_property_readonly("lg_config_k", &hll_union::get_lg_config_k, "Configured lg_k value for the union")
+ .def_property_readonly("tgt_type", &hll_union::get_target_type, "Returns the HLL type (4, 6, or 8) when in estimation mode")
+ .def("get_estimate", &hll_union::get_estimate,
+ "Estimate of the distinct count of the input stream")
+ .def("get_lower_bound", &hll_union::get_lower_bound, py::arg("num_std_devs"),
+ "Returns the approximate lower error bound given the specified number of standard deviations in {1, 2, 3}")
+ .def("get_upper_bound", &hll_union::get_upper_bound, py::arg("num_std_devs"),
+ "Returns the approximate upper error bound given the specified number of standard deviations in {1, 2, 3}")
+ .def("is_compact", &hll_union::is_compact,
+ "True if the union is compact, otherwise False")
+ .def("is_empty", &hll_union::is_empty,
+ "True if the union is empty, otherwise False")
+ .def("reset", &hll_union::reset,
+ "Resets the union to the empty state")
+ .def("get_result", &hll_union::get_result, py::arg("tgt_type")=HLL_4,
+ "Returns a sketch of the target type representing the current union state")
+ .def<void (hll_union::*)(const hll_sketch&)>("update", &hll_union::update, py::arg("sketch"),
+ "Updates the union with the given HLL sketch")
+ .def<void (hll_union::*)(int64_t)>("update", &hll_union::update, py::arg("datum"),
+ "Updates the union with the given integral value")
+ .def<void (hll_union::*)(double)>("update", &hll_union::update, py::arg("datum"),
+ "Updates the union with the given floating point value")
+ .def<void (hll_union::*)(const std::string&)>("update", &hll_union::update, py::arg("datum"),
+ "Updates the union with the given string value")
.def_static("get_rel_err", &hll_union::get_rel_err,
- py::arg("upper_bound"), py::arg("unioned"), py::arg("lg_k"), py::arg("num_std_devs"))
+ py::arg("upper_bound"), py::arg("unioned"), py::arg("lg_k"), py::arg("num_std_devs"),
+ "Returns the a priori relative error bound for the given parameters")
;
}
diff --git a/python/src/kll_wrapper.cpp b/python/src/kll_wrapper.cpp
index 361cc5c..8ffdc12 100644
--- a/python/src/kll_wrapper.cpp
+++ b/python/src/kll_wrapper.cpp
@@ -116,30 +116,90 @@
using namespace datasketches;
py::class_<kll_sketch<T>>(m, name)
- .def(py::init<uint16_t>(), py::arg("k"))
+ .def(py::init<uint16_t>(), py::arg("k")=kll_sketch<T>::DEFAULT_K)
.def(py::init<const kll_sketch<T>&>())
- .def("update", (void (kll_sketch<T>::*)(const T&)) &kll_sketch<T>::update, py::arg("item"))
- .def("update", &dspy::kll_sketch_update<T>, py::arg("array"))
- .def("merge", (void (kll_sketch<T>::*)(const kll_sketch<T>&)) &kll_sketch<T>::merge, py::arg("sketch"))
- .def("__str__", &kll_sketch<T>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false)
- .def("to_string", &kll_sketch<T>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false)
- .def("is_empty", &kll_sketch<T>::is_empty)
- .def("get_n", &kll_sketch<T>::get_n)
- .def("get_num_retained", &kll_sketch<T>::get_num_retained)
- .def("is_estimation_mode", &kll_sketch<T>::is_estimation_mode)
- .def("get_min_value", &kll_sketch<T>::get_min_value)
- .def("get_max_value", &kll_sketch<T>::get_max_value)
- .def("get_quantile", &kll_sketch<T>::get_quantile, py::arg("fraction"))
- .def("get_quantiles", &dspy::kll_sketch_get_quantiles<T>, py::arg("fractions"))
- .def("get_rank", &kll_sketch<T>::get_rank, py::arg("value"))
- .def("get_pmf", &dspy::kll_sketch_get_pmf<T>, py::arg("split_points"))
- .def("get_cdf", &dspy::kll_sketch_get_cdf<T>, py::arg("split_points"))
+ .def("update", (void (kll_sketch<T>::*)(const T&)) &kll_sketch<T>::update, py::arg("item"),
+ "Updates the sketch with the given value")
+ .def("update", &dspy::kll_sketch_update<T>, py::arg("array"),
+ "Updates the sketch with the values in the given array")
+ .def("merge", (void (kll_sketch<T>::*)(const kll_sketch<T>&)) &kll_sketch<T>::merge, py::arg("sketch"),
+ "Merges the provided sketch into this one")
+ .def("__str__", &kll_sketch<T>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false,
+ "Produces a string summary of the sketch")
+ .def("to_string", &kll_sketch<T>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false,
+ "Produces a string summary of the sketch")
+ .def("is_empty", &kll_sketch<T>::is_empty,
+ "Returns True if the sketch is empty, otherwise False")
+ .def("get_n", &kll_sketch<T>::get_n,
+ "Returns the length of the input stream")
+ .def("get_num_retained", &kll_sketch<T>::get_num_retained,
+ "Returns the number of retained items (samples) in the sketch")
+ .def("is_estimation_mode", &kll_sketch<T>::is_estimation_mode,
+ "Returns True if the sketch is in estimation mode, otherwise False")
+ .def("get_min_value", &kll_sketch<T>::get_min_value,
+ "Returns the minimum value from the stream. If empty, kll_floats_sketch returns nan; kll_ints_sketch throws a RuntimeError")
+ .def("get_max_value", &kll_sketch<T>::get_max_value,
+ "Returns the maximum value from the stream. If empty, kll_floats_sketch returns nan; kll_ints_sketch throws a RuntimeError")
+ .def("get_quantile", &kll_sketch<T>::get_quantile, py::arg("fraction"),
+ "Returns an approximation to the value of the data item "
+ "that would be preceded by the given fraction of a hypothetical sorted "
+ "version of the input stream so far.\n"
+ "Note that this method has a fairly large overhead (microseconds instead of nanoseconds) "
+ "so it should not be called multiple times to get different quantiles from the same "
+ "sketch. Instead use get_quantiles(), which pays the overhead only once.\n"
+ "For kll_floats_sketch: if the sketch is empty this returns nan. "
+ "For kll_ints_sketch: if the sketch is empty this throws a RuntimeError.")
+ .def("get_quantiles", &dspy::kll_sketch_get_quantiles<T>, py::arg("fractions"),
+ "This is a more efficient multiple-query version of get_quantile().\n"
+ "This returns an array that could have been generated by using get_quantile() for each "
+ "fractional rank separately, but would be very inefficient. "
+ "This method incurs the internal set-up overhead once and obtains multiple quantile values in "
+ "a single query. It is strongly recommended that this method be used instead of multiple calls "
+ "to get_quantile().\n"
+ "If the sketch is empty this returns an empty vector.")
+ .def("get_rank", &kll_sketch<T>::get_rank, py::arg("value"),
+ "Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1, inclusive.\n"
+ "The resulting approximation has a probabilistic guarantee that can be obtained from the "
+ "normalized_rank_error(False) function.\n"
+ "If the sketch is empty this returns nan.")
+ .def("get_pmf", &dspy::kll_sketch_get_pmf<T>, py::arg("split_points"),
+ "Returns an approximation to the Probability Mass Function (PMF) of the input stream "
+ "given a set of split points (values).\n"
+ "The resulting approximations have a probabilistic guarantee that can be obtained from the "
+ "normalized_rank_error(True) function.\n"
+ "If the sketch is empty this returns an empty vector.\n"
+ "split_points is an array of m unique, monotonically increasing float values "
+ "that divide the real number line into m+1 consecutive disjoint intervals.\n"
+ "The definition of an 'interval' is inclusive of the left split point (or minimum value) and "
+ "exclusive of the right split point, with the exception that the last interval will include "
+ "the maximum value.\n"
+ "It is not necessary to include either the min or max values in these split points.")
+ .def("get_cdf", &dspy::kll_sketch_get_cdf<T>, py::arg("split_points"),
+ "Returns an approximation to the Cumulative Distribution Function (CDF), which is the "
+ "cumulative analog of the PMF, of the input stream given a set of split points (values).\n"
+ "The resulting approximations have a probabilistic guarantee that can be obtained from the "
+ "normalized_rank_error(True) function.\n"
+ "If the sketch is empty this returns an empty vector.\n"
+ "split_points is an array of m unique, monotonically increasing float values "
+ "that divide the real number line into m+1 consecutive disjoint intervals.\n"
+ "The definition of an 'interval' is inclusive of the left split point (or minimum value) and "
+ "exclusive of the right split point, with the exception that the last interval will include "
+ "the maximum value.\n"
+ "It is not necessary to include either the min or max values in these split points.")
.def("normalized_rank_error", (double (kll_sketch<T>::*)(bool) const) &kll_sketch<T>::get_normalized_rank_error,
- py::arg("as_pmf"))
+ py::arg("as_pmf"),
+ "Gets the normalized rank error for this sketch.\n"
+ "If as_pmf is True, returns the 'double-sided' normalized rank error for the get_pmf() function.\n"
+ "Otherwise, it is the 'single-sided' normalized rank error for all the other queries.\n"
+ "Constants were derived as the best fit to 99 percentile empirically measured max error in thousands of trials")
.def_static("get_normalized_rank_error", &dspy::kll_sketch_generic_normalized_rank_error<T>,
- py::arg("k"), py::arg("as_pmf"))
- .def("serialize", &dspy::kll_sketch_serialize<T>)
- .def_static("deserialize", &dspy::kll_sketch_deserialize<T>)
+ py::arg("k"), py::arg("as_pmf"),
+ "Gets the normalized rank error given parameters k and the pmf flag.\n"
+ "If as_pmf is True, returns the 'double-sided' normalized rank error for the get_pmf() function.\n"
+ "Otherwise, it is the 'single-sided' normalized rank error for all the other queries.\n"
+ "Constants were derived as the best fit to 99 percentile empirically measured max error in thousands of trials")
+ .def("serialize", &dspy::kll_sketch_serialize<T>, "Serializes the sketch into a bytes object")
+ .def_static("deserialize", &dspy::kll_sketch_deserialize<T>, "Deserializes the sketch from a bytes object")
;
}
diff --git a/python/src/theta_wrapper.cpp b/python/src/theta_wrapper.cpp
index cc08e72..37d7045 100644
--- a/python/src/theta_wrapper.cpp
+++ b/python/src/theta_wrapper.cpp
@@ -81,57 +81,82 @@
using namespace datasketches;
py::class_<theta_sketch>(m, "theta_sketch")
- .def("serialize", &dspy::theta_sketch_serialize)
- .def_static("deserialize", &dspy::theta_sketch_deserialize, py::arg("bytes"), py::arg("seed")=DEFAULT_SEED)
- .def("__str__", &theta_sketch::to_string, py::arg("print_items")=false)
- .def("to_string", &theta_sketch::to_string, py::arg("print_items")=false)
- .def("is_empty", &theta_sketch::is_empty)
- .def("get_estimate", &theta_sketch::get_estimate)
- .def("get_upper_bound", &theta_sketch::get_upper_bound, py::arg("num_std_devs"))
- .def("get_lower_bound", &theta_sketch::get_lower_bound, py::arg("num_std_devs"))
- .def("is_estimation_mode", &theta_sketch::is_estimation_mode)
- .def("get_theta", &theta_sketch::get_theta)
- .def("get_num_retained", &theta_sketch::get_num_retained)
- .def("get_seed_hash", &dspy::theta_sketch_get_seed_hash)
- .def("is_ordered", &theta_sketch::is_ordered)
+ .def("serialize", &dspy::theta_sketch_serialize,
+ "Serializes the sketch into a bytes object")
+ .def_static("deserialize", &dspy::theta_sketch_deserialize, py::arg("bytes"), py::arg("seed")=DEFAULT_SEED,
+ "Reads a bytes object and returns the corresponding theta_sketch")
+ .def("__str__", &theta_sketch::to_string, py::arg("print_items")=false,
+ "Produces a string summary of the sketch")
+ .def("to_string", &theta_sketch::to_string, py::arg("print_items")=false,
+ "Produces a string summary of the sketch")
+ .def("is_empty", &theta_sketch::is_empty,
+ "Returns True if the sketch is empty, otherwise False")
+ .def("get_estimate", &theta_sketch::get_estimate,
+ "Estimate of the distinct count of the input stream")
+ .def("get_upper_bound", &theta_sketch::get_upper_bound, py::arg("num_std_devs"),
+ "Returns an approximate upper bound on the estimate at standard deviations in {1, 2, 3}")
+ .def("get_lower_bound", &theta_sketch::get_lower_bound, py::arg("num_std_devs"),
+ "Returns an approximate lower bound on the estimate at standard deviations in {1, 2, 3}")
+ .def("is_estimation_mode", &theta_sketch::is_estimation_mode,
+ "Returns True if sketch is in estimation mode, otherwise False")
+ .def("get_theta", &theta_sketch::get_theta,
+ "Returns theta (effective sampling rate) as a fraction from 0 to 1")
+ .def("get_num_retained", &theta_sketch::get_num_retained,
+ "Returns the number of items currently in the sketch")
+ .def("get_seed_hash", &dspy::theta_sketch_get_seed_hash,
+ "Returns a hash of the seed used in the sketch")
+ .def("is_ordered", &theta_sketch::is_ordered,
+ "Returns True if the sketch entries are sorted, otherwise False")
;
py::class_<update_theta_sketch, theta_sketch>(m, "update_theta_sketch")
.def(py::init(&dspy::update_theta_sketch_factory),
py::arg("lg_k")=update_theta_sketch::builder::DEFAULT_LG_K, py::arg("p")=1.0, py::arg("seed")=DEFAULT_SEED)
.def(py::init<const update_theta_sketch&>())
- .def("update", (void (update_theta_sketch::*)(int64_t)) &update_theta_sketch::update, py::arg("datum"))
- .def("update", (void (update_theta_sketch::*)(double)) &update_theta_sketch::update, py::arg("datum"))
- .def("update", (void (update_theta_sketch::*)(const std::string&)) &update_theta_sketch::update, py::arg("datum"))
- .def("compact", &update_theta_sketch::compact, py::arg("ordered")=true)
+ .def("update", (void (update_theta_sketch::*)(int64_t)) &update_theta_sketch::update, py::arg("datum"),
+ "Updates the sketch with the given integral value")
+ .def("update", (void (update_theta_sketch::*)(double)) &update_theta_sketch::update, py::arg("datum"),
+ "Updates the sketch with the given floating point value")
+ .def("update", (void (update_theta_sketch::*)(const std::string&)) &update_theta_sketch::update, py::arg("datum"),
+ "Updates the sketch with the given string")
+ .def("compact", &update_theta_sketch::compact, py::arg("ordered")=true,
+ "Returns a compacted form of the sketch, optionally sorting it")
.def_static("deserialize", &dspy::update_theta_sketch_deserialize,
- py::arg("bytes"), py::arg("seed")=DEFAULT_SEED)
+ py::arg("bytes"), py::arg("seed")=DEFAULT_SEED,
+ "Reads a bytes object and returns the corresponding update_theta_sketch")
;
py::class_<compact_theta_sketch, theta_sketch>(m, "compact_theta_sketch")
.def(py::init<const compact_theta_sketch&>())
.def(py::init<const theta_sketch&, bool>())
.def_static("deserialize", &dspy::compact_theta_sketch_deserialize,
- py::arg("bytes"), py::arg("seed")=DEFAULT_SEED)
+ py::arg("bytes"), py::arg("seed")=DEFAULT_SEED,
+ "Reads a bytes object and returns the corresponding compact_theta_sketch")
;
py::class_<theta_union>(m, "theta_union")
.def(py::init(&dspy::theta_union_factory),
py::arg("lg_k")=update_theta_sketch::builder::DEFAULT_LG_K, py::arg("p")=1.0, py::arg("seed")=DEFAULT_SEED)
- .def("update", &theta_union::update, py::arg("sketch"))
- .def("get_result", &theta_union::get_result, py::arg("ordered")=true)
+ .def("update", &theta_union::update, py::arg("sketch"),
+ "Updates the union with the given sketch")
+ .def("get_result", &theta_union::get_result, py::arg("ordered")=true,
+ "Returns the sketch corresponding to the union result")
;
py::class_<theta_intersection>(m, "theta_intersection")
.def(py::init<uint64_t>(), py::arg("seed")=DEFAULT_SEED)
.def(py::init<const theta_intersection&>())
- .def("update", &theta_intersection::update, py::arg("sketch"))
- .def("get_result", &theta_intersection::get_result, py::arg("ordered")=true)
- .def("has_result", &theta_intersection::has_result)
+ .def("update", &theta_intersection::update, py::arg("sketch"),
+ "Intersects the provided sketch with the current intersection state")
+ .def("get_result", &theta_intersection::get_result, py::arg("ordered")=true,
+ "Returns the sketch corresponding to the intersection result")
+ .def("has_result", &theta_intersection::has_result,
+ "Returns True if the intersection has a valid result, otherwise False")
;
py::class_<theta_a_not_b>(m, "theta_a_not_b")
.def(py::init<uint64_t>(), py::arg("seed")=DEFAULT_SEED)
- .def("compute", &theta_a_not_b::compute, py::arg("a"), py::arg("b"), py::arg("ordered")=true)
+ .def("compute", &theta_a_not_b::compute, py::arg("a"), py::arg("b"), py::arg("ordered")=true,
+ "Returns a sketch with the result of applying the A-not-B operation on the given inputs")
;
}
diff --git a/python/src/vo_wrapper.cpp b/python/src/vo_wrapper.cpp
index e7c462b..2bec977 100644
--- a/python/src/vo_wrapper.cpp
+++ b/python/src/vo_wrapper.cpp
@@ -83,15 +83,25 @@
py::class_<var_opt_sketch<T>>(m, name)
.def(py::init<uint32_t>(), py::arg("k"))
- .def("__str__", &dspy::vo_sketch_to_string<T>, py::arg("print_items")=false)
- .def("to_string", &dspy::vo_sketch_to_string<T>, py::arg("print_items")=false)
- .def("update", (void (var_opt_sketch<T>::*)(const T&, double)) &var_opt_sketch<T>::update, py::arg("item"), py::arg("weight")=1.0)
- .def_property_readonly("k", &var_opt_sketch<T>::get_k)
- .def_property_readonly("n", &var_opt_sketch<T>::get_n)
- .def_property_readonly("num_samples", &var_opt_sketch<T>::get_num_samples)
- .def("get_samples", &dspy::vo_sketch_get_samples<T>)
- .def("is_empty", &var_opt_sketch<T>::is_empty)
- .def("estimate_subset_sum", &dspy::vo_sketch_estimate_subset_sum<T>)
+ .def("__str__", &dspy::vo_sketch_to_string<T>, py::arg("print_items")=false,
+ "Produces a string summary of the sketch")
+ .def("to_string", &dspy::vo_sketch_to_string<T>, py::arg("print_items")=false,
+ "Produces a string summary of the sketch")
+ .def("update", (void (var_opt_sketch<T>::*)(const T&, double)) &var_opt_sketch<T>::update, py::arg("item"), py::arg("weight")=1.0,
+ "Updates the sketch with the given value and weight")
+ .def_property_readonly("k", &var_opt_sketch<T>::get_k,
+ "Returns the sketch's maximum configured sample size")
+ .def_property_readonly("n", &var_opt_sketch<T>::get_n,
+ "Returns the total stream length")
+ .def_property_readonly("num_samples", &var_opt_sketch<T>::get_num_samples,
+ "Returns the number of samples currently in the sketch")
+ .def("get_samples", &dspy::vo_sketch_get_samples<T>,
+ "Returns the set of samples in the sketch")
+ .def("is_empty", &var_opt_sketch<T>::is_empty,
+ "Returns True if the sketch is empty, otherwise False")
+ .def("estimate_subset_sum", &dspy::vo_sketch_estimate_subset_sum<T>,
+ "Applies a provided predicate to the sketch and returns the estimated total weight matching the predicate, as well "
+ "as upper and lower bounds on the estimate and the total weight processed by the sketch")
// As of writing, not yet clear how to serialize arbitrary python objects,
// especially in any sort of language-portable way
//.def("get_serialized_size_bytes", &var_opt_sketch<T>::get_serialized_size_bytes)
@@ -106,11 +116,16 @@
py::class_<var_opt_union<T>>(m, name)
.def(py::init<uint32_t>(), py::arg("max_k"))
- .def("__str__", &var_opt_union<T>::to_string)
- .def("to_string", &var_opt_union<T>::to_string)
- .def("update", (void (var_opt_union<T>::*)(const var_opt_sketch<T>& sk)) &var_opt_union<T>::update, py::arg("sketch"))
- .def("get_result", &var_opt_union<T>::get_result)
- .def("reset", &var_opt_union<T>::reset)
+ .def("__str__", &var_opt_union<T>::to_string,
+ "Produces a string summary of the union")
+ .def("to_string", &var_opt_union<T>::to_string,
+ "Produces a string summary of the union")
+ .def("update", (void (var_opt_union<T>::*)(const var_opt_sketch<T>& sk)) &var_opt_union<T>::update, py::arg("sketch"),
+ "Updates the union with the given sketch")
+ .def("get_result", &var_opt_union<T>::get_result,
+ "Returns a sketch corresponding to the union result")
+ .def("reset", &var_opt_union<T>::reset,
+ "Resets the union to the empty state")
// As of writing, not yet clear how to serialize arbitrary python objects,
// especially in any sort of language-portable way
//.def("get_serialized_size_bytes", &var_opt_sketch<T>::get_serialized_size_bytes)
@@ -119,7 +134,6 @@
;
}
-
void init_vo(py::module &m) {
bind_vo_sketch<py::object>(m, "var_opt_sketch");
bind_vo_union<py::object>(m, "var_opt_union");