Merge pull request #19 from apache/histogram
added histogram function
diff --git a/sql/datasketches_kll_float_sketch.sql b/sql/datasketches_kll_float_sketch.sql
index b179a75..332496f 100644
--- a/sql/datasketches_kll_float_sketch.sql
+++ b/sql/datasketches_kll_float_sketch.sql
@@ -105,3 +105,11 @@
CREATE OR REPLACE FUNCTION kll_float_sketch_get_quantiles(kll_float_sketch, double precision[]) RETURNS real[]
AS '$libdir/datasketches', 'pg_kll_float_sketch_get_quantiles'
LANGUAGE C STRICT IMMUTABLE;
+
+-- histogram with the bin count left to the C function (no second argument is passed)
+CREATE OR REPLACE FUNCTION kll_float_sketch_get_histogram(kll_float_sketch) RETURNS double precision[]
+  AS '$libdir/datasketches', 'pg_kll_float_sketch_get_histogram' LANGUAGE C STRICT IMMUTABLE;
+
+-- histogram with an explicit number of bins given as the second argument
+CREATE OR REPLACE FUNCTION kll_float_sketch_get_histogram(kll_float_sketch, int) RETURNS double precision[]
+  AS '$libdir/datasketches', 'pg_kll_float_sketch_get_histogram' LANGUAGE C STRICT IMMUTABLE;
diff --git a/src/kll_float_sketch_c_adapter.cpp b/src/kll_float_sketch_c_adapter.cpp
index 721c359..1eac1b2 100644
--- a/src/kll_float_sketch_c_adapter.cpp
+++ b/src/kll_float_sketch_c_adapter.cpp
@@ -131,14 +131,19 @@
pg_unreachable();
}
-Datum* kll_float_sketch_get_pmf_or_cdf(const void* sketchptr, const float* split_points, unsigned num_split_points, bool is_cdf) {
+Datum* kll_float_sketch_get_pmf_or_cdf(const void* sketchptr, const float* split_points, unsigned num_split_points, bool is_cdf, bool scale) {
try {
auto array = is_cdf ?
static_cast<const kll_float_sketch*>(sketchptr)->get_CDF(split_points, num_split_points) :
static_cast<const kll_float_sketch*>(sketchptr)->get_PMF(split_points, num_split_points);
Datum* pmf = (Datum*) palloc(sizeof(Datum) * (num_split_points + 1));
+ const uint64_t n = static_cast<const kll_float_sketch*>(sketchptr)->get_n();
for (unsigned i = 0; i < num_split_points + 1; i++) {
- pmf[i] = pg_float8_get_datum(array[i]);
+ if (scale) {
+ pmf[i] = pg_float8_get_datum(array[i] * n);
+ } else {
+ pmf[i] = pg_float8_get_datum(array[i]);
+ }
}
return pmf;
} catch (std::exception& e) {
diff --git a/src/kll_float_sketch_c_adapter.h b/src/kll_float_sketch_c_adapter.h
index 75dd62b..ff9723c 100644
--- a/src/kll_float_sketch_c_adapter.h
+++ b/src/kll_float_sketch_c_adapter.h
@@ -43,7 +43,7 @@
void* kll_float_sketch_deserialize(const char* buffer, unsigned length);
unsigned kll_float_sketch_get_serialized_size_bytes(const void* sketchptr);
-void** kll_float_sketch_get_pmf_or_cdf(const void* sketchptr, const float* split_points, unsigned num_split_points, bool is_cdf);
+void** kll_float_sketch_get_pmf_or_cdf(const void* sketchptr, const float* split_points, unsigned num_split_points, bool is_cdf, bool scale); // scale=true: each returned value is multiplied by the sketch's get_n()
void** kll_float_sketch_get_quantiles(const void* sketchptr, const double* fractions, unsigned num_fractions);
#ifdef __cplusplus
diff --git a/src/kll_float_sketch_pg_functions.c b/src/kll_float_sketch_pg_functions.c
index 14cc3c1..08eb22c 100644
--- a/src/kll_float_sketch_pg_functions.c
+++ b/src/kll_float_sketch_pg_functions.c
@@ -38,6 +38,7 @@
PG_FUNCTION_INFO_V1(pg_kll_float_sketch_get_pmf);
PG_FUNCTION_INFO_V1(pg_kll_float_sketch_get_cdf);
PG_FUNCTION_INFO_V1(pg_kll_float_sketch_get_quantiles);
+PG_FUNCTION_INFO_V1(pg_kll_float_sketch_get_histogram);
/* function declarations */
Datum pg_kll_float_sketch_recv(PG_FUNCTION_ARGS);
@@ -52,8 +53,10 @@
Datum pg_kll_float_sketch_get_pmf(PG_FUNCTION_ARGS);
Datum pg_kll_float_sketch_get_cdf(PG_FUNCTION_ARGS);
Datum pg_kll_float_sketch_get_quantiles(PG_FUNCTION_ARGS);
+Datum pg_kll_float_sketch_get_histogram(PG_FUNCTION_ARGS);
static const unsigned DEFAULT_K = 200;
+static const unsigned DEFAULT_NUM_BINS = 10; // bin count pg_kll_float_sketch_get_histogram uses when num_bins is 0
Datum pg_kll_float_sketch_add_item(PG_FUNCTION_ARGS) {
void* sketchptr;
@@ -227,7 +230,7 @@
for (i = 0; i < arr_len_in; i++) {
split_points[i] = DatumGetFloat4(data_in[i]);
}
- result = (Datum*) kll_float_sketch_get_pmf_or_cdf(sketchptr, split_points, arr_len_in, false);
+ result = (Datum*) kll_float_sketch_get_pmf_or_cdf(sketchptr, split_points, arr_len_in, false, false);
pfree(split_points);
// construct output array of fractions
@@ -277,7 +280,7 @@
for (i = 0; i < arr_len_in; i++) {
split_points[i] = DatumGetFloat4(data_in[i]);
}
- result = (Datum*) kll_float_sketch_get_pmf_or_cdf(sketchptr, split_points, arr_len_in, true);
+ result = (Datum*) kll_float_sketch_get_pmf_or_cdf(sketchptr, split_points, arr_len_in, true, false);
pfree(split_points);
// construct output array of fractions
@@ -337,3 +340,47 @@
PG_RETURN_ARRAYTYPE_P(arr_out);
}
+
+/*
+ * Equal-width histogram of a serialized sketch, returned as float8[]: bins evenly split
+ * the range between quantile 0 and quantile 1, and each entry is that bin's PMF value
+ * scaled by the adapter (scale=true). Arg 0: sketch. Optional arg 1: number of bins;
+ * absent or 0 selects DEFAULT_NUM_BINS, any other value below 2 raises an error.
+ */
+Datum pg_kll_float_sketch_get_histogram(PG_FUNCTION_ARGS) {
+  const bytea* bytes_in;
+  void* sketchptr;
+  int num_bins;
+  Datum* result; // one Datum per bin
+  ArrayType* arr_out;
+  int16 elmlen_out;
+  bool elmbyval_out;
+  char elmalign_out;
+  int i;
+
+  // the one-argument SQL overload passes no arg 1, so it must not be read
+  num_bins = PG_NARGS() > 1 ? PG_GETARG_INT32(1) : 0;
+  if (num_bins == 0) num_bins = DEFAULT_NUM_BINS;
+  // validate before deserializing: elog(ERROR) does not return, so a sketch built earlier would never be deleted
+  if (num_bins < 2) {
+    elog(ERROR, "at least two bins expected");
+  }
+
+  bytes_in = PG_GETARG_BYTEA_P(0);
+  sketchptr = kll_float_sketch_deserialize(VARDATA(bytes_in), VARSIZE(bytes_in) - VARHDRSZ);
+
+  float* split_points = palloc(sizeof(float) * (num_bins - 1));
+  const float min_value = kll_float_sketch_get_quantile(sketchptr, 0);
+  const float max_value = kll_float_sketch_get_quantile(sketchptr, 1);
+  const float delta = (max_value - min_value) / num_bins;
+  for (i = 0; i < num_bins - 1; i++) {
+    split_points[i] = min_value + delta * (i + 1);
+  }
+  result = (Datum*) kll_float_sketch_get_pmf_or_cdf(sketchptr, split_points, num_bins - 1, false, true);
+  pfree(split_points);
+
+  get_typlenbyvalalign(FLOAT8OID, &elmlen_out, &elmbyval_out, &elmalign_out);
+  arr_out = construct_array(result, num_bins, FLOAT8OID, elmlen_out, elmbyval_out, elmalign_out);
+  kll_float_sketch_delete(sketchptr);
+  PG_RETURN_ARRAYTYPE_P(arr_out);
+}