IMPALA-10020: Implement ds_kll_cdf_as_string() function

This adds support for the Cumulative Distribution Function (CDF) from
the Apache DataSketches KLL algorithm collection. It receives a
serialized KLL sketch and one or more float values that define ranges
over the sketched values.
E.g. [1, 5, 10] will mean the following ranges:
(-inf, 1), (-inf, 5), (-inf, 10), (-inf, +inf)
Returns a comma separated string where each value in the string is a
number in the range of [0,1] and shows what fraction of the data
falls into the corresponding range.

Note, ds_kll_cdf() should return an Array of doubles as the result but
for that we have to wait for complex type support. Until then, we
provide ds_kll_cdf_as_string(), which can be deprecated once we
have array support. Tracking Jira for returning complex types from
functions is IMPALA-9520.

Example:
select ds_kll_cdf_as_string(ds_kll_sketch(float_col), 2, 4, 10)
from alltypes;
+----------------------------------------------------------+
| ds_kll_cdf_as_string(ds_kll_sketch(float_col), 2, 4, 10) |
+----------------------------------------------------------+
| 0.2,0.401644,1,1                                         |
+----------------------------------------------------------+

Change-Id: I77e6afc4556ad05a295b89f6d06c2e4a6bb2cf82
Reviewed-on: http://gerrit.cloudera.org:8080/16359
Reviewed-by: Gabor Kaszab <gaborkaszab@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
diff --git a/be/src/exprs/datasketches-functions-ir.cc b/be/src/exprs/datasketches-functions-ir.cc
index a6c0b98..1c2cb68 100644
--- a/be/src/exprs/datasketches-functions-ir.cc
+++ b/be/src/exprs/datasketches-functions-ir.cc
@@ -104,8 +104,9 @@
   }
 }
 
-StringVal DataSketchesFunctions::DsKllPMFAsString(FunctionContext* ctx,
-    const StringVal& serialized_sketch, int num_args, const FloatVal* args) {
+StringVal DataSketchesFunctions::GetDsKllPMFOrCDF(FunctionContext* ctx,
+    const StringVal& serialized_sketch, int num_args, const FloatVal* args,
+    PMFCDF mode) {
   DCHECK(num_args > 0);
   if (args == nullptr || args->is_null) return StringVal::null();
   if (serialized_sketch.is_null || serialized_sketch.len == 0) return StringVal::null();
@@ -115,18 +116,29 @@
     LogSketchDeserializationError(ctx);
     return StringVal::null();
   }
-  float pmf_input[(unsigned int)num_args];
-  for (int i = 0; i < num_args; ++i) pmf_input[i] = args[i].val;
+  float input_ranges[(unsigned int)num_args];
+  for (int i = 0; i < num_args; ++i) input_ranges[i] = args[i].val;
   try {
-    std::vector<double> results = sketch.get_PMF(pmf_input, num_args);
+    std::vector<double> results = (mode == PMF) ?
+        sketch.get_PMF(input_ranges, num_args) : sketch.get_CDF(input_ranges, num_args);
     return DsKllVectorResultToStringVal(ctx, results);
   } catch(const std::exception& e) {
-    ctx->SetError(Substitute("Error while running PMF from DataSketches KLL. "
+    ctx->SetError(Substitute("Error while running DataSketches KLL function. "
         "Message: $0", e.what()).c_str());
     return StringVal::null();
   }
   return StringVal::null();
 }
 
+StringVal DataSketchesFunctions::DsKllPMFAsString(FunctionContext* ctx,
+    const StringVal& serialized_sketch, int num_args, const FloatVal* args) {
+  return GetDsKllPMFOrCDF(ctx, serialized_sketch, num_args, args, PMF);
+}
+
+StringVal DataSketchesFunctions::DsKllCDFAsString(FunctionContext* ctx,
+    const StringVal& serialized_sketch, int num_args, const FloatVal* args) {
+  return GetDsKllPMFOrCDF(ctx, serialized_sketch, num_args, args, CDF);
+}
+
 }
 
diff --git a/be/src/exprs/datasketches-functions.h b/be/src/exprs/datasketches-functions.h
index dbb1fdc..93f9815 100644
--- a/be/src/exprs/datasketches-functions.h
+++ b/be/src/exprs/datasketches-functions.h
@@ -76,6 +76,32 @@
   /// that we have to wait for the complex type support. Tracking Jira is IMPALA-9520.
   static StringVal DsKllPMFAsString(FunctionContext* ctx,
       const StringVal& serialized_sketch, int num_args, const FloatVal* args);
+
+
+  /// 'serialized_sketch' is expected as a serialized Apache DataSketches KLL sketch. If
+  /// it is not, then the query fails.
+  /// 'args' holds one or more numbers that will be used as ranges to divide the input
+  /// of the sketch. E.g. [1.0, 3.5, 10.1] will create the following ranges:
+  ///     (-inf, 1.0), (-inf, 3.5), (-inf, 10.1), (-inf, +inf)
+  /// This function returns a comma separated string that contains the probability of
+  /// having an item in each of the received ranges. E.g. a return value of 0.2 means
+  /// that approximately 20% of the items are in that given range.
+  /// Note, this function is meant to return an Array of doubles as the result but with
+  /// that we have to wait for the complex type support. Tracking Jira is IMPALA-9520.
+  static StringVal DsKllCDFAsString(FunctionContext* ctx,
+      const StringVal& serialized_sketch, int num_args, const FloatVal* args);
+
+private:
+  enum PMFCDF {
+    PMF,
+    CDF
+  };
+
+  /// Helper function for DsKllPMFAsString() and DsKllCDFAsString(). 'mode' indicates
+  /// whether get_PMF() or get_CDF() should be invoked on the KLL sketch.
+  static StringVal GetDsKllPMFOrCDF(FunctionContext* ctx,
+      const StringVal& serialized_sketch, int num_args, const FloatVal* args,
+      PMFCDF mode);
 };
 
 }
diff --git a/common/function-registry/impala_functions.py b/common/function-registry/impala_functions.py
index 3334eb0..7216a48 100644
--- a/common/function-registry/impala_functions.py
+++ b/common/function-registry/impala_functions.py
@@ -943,6 +943,8 @@
       '_ZN6impala21DataSketchesFunctions22DsKllQuantilesAsStringEPN10impala_udf15FunctionContextERKNS1_9StringValEiPKNS1_9DoubleValE'],
   [['ds_kll_pmf_as_string'], 'STRING', ['STRING', 'FLOAT', '...'],
       '_ZN6impala21DataSketchesFunctions16DsKllPMFAsStringEPN10impala_udf15FunctionContextERKNS1_9StringValEiPKNS1_8FloatValE'],
+  [['ds_kll_cdf_as_string'], 'STRING', ['STRING', 'FLOAT', '...'],
+      '_ZN6impala21DataSketchesFunctions16DsKllCDFAsStringEPN10impala_udf15FunctionContextERKNS1_9StringValEiPKNS1_8FloatValE'],
 ]
 
 invisible_functions = [
diff --git a/testdata/workloads/functional-query/queries/QueryTest/datasketches-kll.test b/testdata/workloads/functional-query/queries/QueryTest/datasketches-kll.test
index d1ac0c2..1a7d934 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/datasketches-kll.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/datasketches-kll.test
@@ -475,12 +475,96 @@
     ds_kll_pmf_as_string(ds_kll_sketch(float_col), 1, 5, 3)
 from functional_parquet.alltypessmall;
 ---- CATCH
-UDF ERROR: Error while running PMF from DataSketches KLL. Message: Values must be unique and monotonically increasing
+UDF ERROR: Error while running DataSketches KLL function. Message: Values must be unique and monotonically increasing
 ====
 ---- QUERY
 select
     ds_kll_pmf_as_string(ds_kll_sketch(float_col), 1, 5, 5)
 from functional_parquet.alltypessmall;
 ---- CATCH
-UDF ERROR: Error while running PMF from DataSketches KLL. Message: Values must be unique and monotonically increasing
+UDF ERROR: Error while running DataSketches KLL function. Message: Values must be unique and monotonically increasing
+====
+---- QUERY
+# Checks that ds_kll_cdf_as_string() produces NULL for an empty dataset.
+select
+    ds_kll_cdf_as_string(ds_kll_sketch(cast(f2 as float)), 0.5)
+from functional_parquet.emptytable;
+---- TYPES
+STRING
+---- RESULTS
+'NULL'
+====
+---- QUERY
+# Check that ds_kll_cdf_as_string() returns null for a null input.
+select ds_kll_cdf_as_string(c, 0.5) from functional_parquet.nulltable;
+---- RESULTS
+'NULL'
+---- TYPES
+STRING
+====
+---- QUERY
+# Check that ds_kll_cdf_as_string() returns error for strings that are not serialized
+# sketches.
+select ds_kll_cdf_as_string(date_string_col, 0.5) from functional_parquet.alltypestiny;
+---- CATCH
+UDF ERROR: Unable to deserialize sketch
+====
+---- QUERY
+select
+    ds_kll_cdf_as_string(ds_kll_sketch(float_col), 0, 0.5, cast('nan' as float), 1)
+from functional_parquet.alltypessmall;
+---- CATCH
+UDF ERROR: NULL or NaN provided in the input list.
+====
+---- QUERY
+select
+    ds_kll_cdf_as_string(ds_kll_sketch(float_col), 0, 0.5, 1, cast('nan' as float))
+from functional_parquet.alltypessmall;
+---- CATCH
+UDF ERROR: NULL or NaN provided in the input list.
+====
+---- QUERY
+select
+    ds_kll_cdf_as_string(ds_kll_sketch(float_col), cast('nan' as float))
+from functional_parquet.alltypessmall;
+---- CATCH
+UDF ERROR: NULL or NaN provided in the input list.
+====
+---- QUERY
+select
+    ds_kll_cdf_as_string(ds_kll_sketch(float_col), 0, 0.5, NULL, 1)
+from functional_parquet.alltypessmall;
+---- CATCH
+UDF ERROR: NULL or NaN provided in the input list.
+====
+---- QUERY
+select
+    ds_kll_cdf_as_string(ds_kll_sketch(float_col), 5)
+from functional_parquet.alltypessmall;
+---- RESULTS
+'0.6,1'
+---- TYPES
+STRING
+====
+---- QUERY
+select ds_kll_cdf_as_string(ds_kll_sketch(float_col), 1, 5, 8, 9)
+from functional_parquet.alltypessmall;
+---- RESULTS
+'0.12,0.6,0.84,0.92,1'
+---- TYPES
+STRING
+====
+---- QUERY
+select
+    ds_kll_cdf_as_string(ds_kll_sketch(float_col), 1, 5, 3)
+from functional_parquet.alltypessmall;
+---- CATCH
+UDF ERROR: Error while running DataSketches KLL function. Message: Values must be unique and monotonically increasing
+====
+---- QUERY
+select
+    ds_kll_cdf_as_string(ds_kll_sketch(float_col), 1, 5, 5)
+from functional_parquet.alltypessmall;
+---- CATCH
+UDF ERROR: Error while running DataSketches KLL function. Message: Values must be unique and monotonically increasing
 ====