IMPALA-10020: Implement ds_kll_cdf_as_string() function
This adds support for the Cumulative Distribution Function (CDF) from
the Apache DataSketches KLL algorithm collection. The function receives
a serialized KLL sketch and one or more float values that define ranges
over the sketched values.
E.g. [1, 5, 10] will mean the following ranges:
(-inf, 1), (-inf, 5), (-inf, 10), (-inf, +inf)
Returns a comma-separated string where each value is a number in the
range [0,1] that shows what fraction of the data falls in the
corresponding range.
Note, ds_kll_cdf() should return an Array of doubles as the result, but
for that we have to wait for complex type support. Until then, we
provide ds_kll_cdf_as_string(), which can be deprecated once we
have array support. The tracking Jira for returning complex types from
functions is IMPALA-9520.
Example:
select ds_kll_cdf_as_string(ds_kll_sketch(float_col), 2, 4, 10)
from alltypes;
+----------------------------------------------------------+
| ds_kll_cdf_as_string(ds_kll_sketch(float_col), 2, 4, 10) |
+----------------------------------------------------------+
| 0.2,0.401644,1,1 |
+----------------------------------------------------------+
Change-Id: I77e6afc4556ad05a295b89f6d06c2e4a6bb2cf82
Reviewed-on: http://gerrit.cloudera.org:8080/16359
Reviewed-by: Gabor Kaszab <gaborkaszab@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
diff --git a/be/src/exprs/datasketches-functions-ir.cc b/be/src/exprs/datasketches-functions-ir.cc
index a6c0b98..1c2cb68 100644
--- a/be/src/exprs/datasketches-functions-ir.cc
+++ b/be/src/exprs/datasketches-functions-ir.cc
@@ -104,8 +104,9 @@
}
}
-StringVal DataSketchesFunctions::DsKllPMFAsString(FunctionContext* ctx,
- const StringVal& serialized_sketch, int num_args, const FloatVal* args) {
+StringVal DataSketchesFunctions::GetDsKllPMFOrCDF(FunctionContext* ctx,
+ const StringVal& serialized_sketch, int num_args, const FloatVal* args,
+ PMFCDF mode) {
DCHECK(num_args > 0);
if (args == nullptr || args->is_null) return StringVal::null();
if (serialized_sketch.is_null || serialized_sketch.len == 0) return StringVal::null();
@@ -115,18 +116,29 @@
LogSketchDeserializationError(ctx);
return StringVal::null();
}
- float pmf_input[(unsigned int)num_args];
- for (int i = 0; i < num_args; ++i) pmf_input[i] = args[i].val;
+ float input_ranges[(unsigned int)num_args];
+ for (int i = 0; i < num_args; ++i) input_ranges[i] = args[i].val;
try {
- std::vector<double> results = sketch.get_PMF(pmf_input, num_args);
+ std::vector<double> results = (mode == PMF) ?
+ sketch.get_PMF(input_ranges, num_args) : sketch.get_CDF(input_ranges, num_args);
return DsKllVectorResultToStringVal(ctx, results);
} catch(const std::exception& e) {
- ctx->SetError(Substitute("Error while running PMF from DataSketches KLL. "
+ ctx->SetError(Substitute("Error while running DataSketches KLL function. "
"Message: $0", e.what()).c_str());
return StringVal::null();
}
return StringVal::null();
}
+StringVal DataSketchesFunctions::DsKllPMFAsString(FunctionContext* ctx,
+ const StringVal& serialized_sketch, int num_args, const FloatVal* args) {
+ return GetDsKllPMFOrCDF(ctx, serialized_sketch, num_args, args, PMF);
+}
+
+StringVal DataSketchesFunctions::DsKllCDFAsString(FunctionContext* ctx,
+ const StringVal& serialized_sketch, int num_args, const FloatVal* args) {
+ return GetDsKllPMFOrCDF(ctx, serialized_sketch, num_args, args, CDF);
+}
+
}
diff --git a/be/src/exprs/datasketches-functions.h b/be/src/exprs/datasketches-functions.h
index dbb1fdc..93f9815 100644
--- a/be/src/exprs/datasketches-functions.h
+++ b/be/src/exprs/datasketches-functions.h
@@ -76,6 +76,32 @@
/// that we have to wait for the complex type support. Tracking Jira is IMPALA-9520.
static StringVal DsKllPMFAsString(FunctionContext* ctx,
const StringVal& serialized_sketch, int num_args, const FloatVal* args);
+
+
+ /// 'serialized_sketch' is expected as a serialized Apache DataSketches KLL sketch. If
+ /// it is not, then the query fails.
+ /// 'args' holds one or more numbers that will be used as ranges to divide the input
+ /// of the sketch. E.g. [1.0, 3.5, 10.1] will create the following ranges:
+ /// (-inf, 1.0), (-inf, 3.5), (-inf, 10.1), (-inf, +inf)
+ /// This function returns a comma separated string that contains the probability of
+ /// having an item in each of the received ranges. E.g. a return value of 0.2 means
+ /// that approximately 20% of the items are in that given range.
+ /// Note, this function is meant to return an Array of doubles as the result but with
+ /// that we have to wait for the complex type support. Tracking Jira is IMPALA-9520.
+ static StringVal DsKllCDFAsString(FunctionContext* ctx,
+ const StringVal& serialized_sketch, int num_args, const FloatVal* args);
+
+private:
+ enum PMFCDF {
+ PMF,
+ CDF
+ };
+
+ /// Helper function for DsKllPMFAsString() and DsKllCDFAsString(). 'mode' indicates
+ /// whether get_PMF() or get_CDF() should be invoked on the KLL sketch.
+ static StringVal GetDsKllPMFOrCDF(FunctionContext* ctx,
+ const StringVal& serialized_sketch, int num_args, const FloatVal* args,
+ PMFCDF mode);
};
}
diff --git a/common/function-registry/impala_functions.py b/common/function-registry/impala_functions.py
index 3334eb0..7216a48 100644
--- a/common/function-registry/impala_functions.py
+++ b/common/function-registry/impala_functions.py
@@ -943,6 +943,8 @@
'_ZN6impala21DataSketchesFunctions22DsKllQuantilesAsStringEPN10impala_udf15FunctionContextERKNS1_9StringValEiPKNS1_9DoubleValE'],
[['ds_kll_pmf_as_string'], 'STRING', ['STRING', 'FLOAT', '...'],
'_ZN6impala21DataSketchesFunctions16DsKllPMFAsStringEPN10impala_udf15FunctionContextERKNS1_9StringValEiPKNS1_8FloatValE'],
+ [['ds_kll_cdf_as_string'], 'STRING', ['STRING', 'FLOAT', '...'],
+ '_ZN6impala21DataSketchesFunctions16DsKllCDFAsStringEPN10impala_udf15FunctionContextERKNS1_9StringValEiPKNS1_8FloatValE'],
]
invisible_functions = [
diff --git a/testdata/workloads/functional-query/queries/QueryTest/datasketches-kll.test b/testdata/workloads/functional-query/queries/QueryTest/datasketches-kll.test
index d1ac0c2..1a7d934 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/datasketches-kll.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/datasketches-kll.test
@@ -475,12 +475,96 @@
ds_kll_pmf_as_string(ds_kll_sketch(float_col), 1, 5, 3)
from functional_parquet.alltypessmall;
---- CATCH
-UDF ERROR: Error while running PMF from DataSketches KLL. Message: Values must be unique and monotonically increasing
+UDF ERROR: Error while running DataSketches KLL function. Message: Values must be unique and monotonically increasing
====
---- QUERY
select
ds_kll_pmf_as_string(ds_kll_sketch(float_col), 1, 5, 5)
from functional_parquet.alltypessmall;
---- CATCH
-UDF ERROR: Error while running PMF from DataSketches KLL. Message: Values must be unique and monotonically increasing
+UDF ERROR: Error while running DataSketches KLL function. Message: Values must be unique and monotonically increasing
+====
+---- QUERY
+# Checks that ds_kll_cdf_as_string() produces NULL for an empty dataset.
+select
+ ds_kll_cdf_as_string(ds_kll_sketch(cast(f2 as float)), 0.5)
+from functional_parquet.emptytable;
+---- TYPES
+STRING
+---- RESULTS
+'NULL'
+====
+---- QUERY
+# Check that ds_kll_cdf_as_string() returns null for a null input.
+select ds_kll_cdf_as_string(c, 0.5) from functional_parquet.nulltable;
+---- RESULTS
+'NULL'
+---- TYPES
+STRING
+====
+---- QUERY
+# Check that ds_kll_cdf_as_string() returns error for strings that are not serialized
+# sketches.
+select ds_kll_cdf_as_string(date_string_col, 0.5) from functional_parquet.alltypestiny;
+---- CATCH
+UDF ERROR: Unable to deserialize sketch
+====
+---- QUERY
+select
+ ds_kll_cdf_as_string(ds_kll_sketch(float_col), 0, 0.5, cast('nan' as float), 1)
+from functional_parquet.alltypessmall;
+---- CATCH
+UDF ERROR: NULL or NaN provided in the input list.
+====
+---- QUERY
+select
+ ds_kll_cdf_as_string(ds_kll_sketch(float_col), 0, 0.5, 1, cast('nan' as float))
+from functional_parquet.alltypessmall;
+---- CATCH
+UDF ERROR: NULL or NaN provided in the input list.
+====
+---- QUERY
+select
+ ds_kll_cdf_as_string(ds_kll_sketch(float_col), cast('nan' as float))
+from functional_parquet.alltypessmall;
+---- CATCH
+UDF ERROR: NULL or NaN provided in the input list.
+====
+---- QUERY
+select
+ ds_kll_cdf_as_string(ds_kll_sketch(float_col), 0, 0.5, NULL, 1)
+from functional_parquet.alltypessmall;
+---- CATCH
+UDF ERROR: NULL or NaN provided in the input list.
+====
+---- QUERY
+select
+ ds_kll_cdf_as_string(ds_kll_sketch(float_col), 5)
+from functional_parquet.alltypessmall;
+---- RESULTS
+'0.6,1'
+---- TYPES
+STRING
+====
+---- QUERY
+select ds_kll_cdf_as_string(ds_kll_sketch(float_col), 1, 5, 8, 9)
+from functional_parquet.alltypessmall;
+---- RESULTS
+'0.12,0.6,0.84,0.92,1'
+---- TYPES
+STRING
+====
+---- QUERY
+select
+ ds_kll_cdf_as_string(ds_kll_sketch(float_col), 1, 5, 3)
+from functional_parquet.alltypessmall;
+---- CATCH
+UDF ERROR: Error while running DataSketches KLL function. Message: Values must be unique and monotonically increasing
+====
+---- QUERY
+select
+ ds_kll_cdf_as_string(ds_kll_sketch(float_col), 1, 5, 5)
+from functional_parquet.alltypessmall;
+---- CATCH
+UDF ERROR: Error while running DataSketches KLL function. Message: Values must be unique and monotonically increasing
====