IMPALA-10133: Implement ds_hll_stringify function.
This function receives a string that is a serialized Apache DataSketches
HLL sketch and returns its stringified format.
The stringified format contains the following data and looks like this:
select ds_hll_stringify(ds_hll_sketch(float_col)) from
functional_parquet.alltypestiny;
+--------------------------------------------+
| ds_hll_stringify(ds_hll_sketch(float_col)) |
+--------------------------------------------+
| ### HLL sketch summary: |
| Log Config K : 12 |
| Hll Target : HLL_4 |
| Current Mode : LIST |
| LB : 2 |
| Estimate : 2 |
| UB : 2.0001 |
| OutOfOrder flag: false |
| Coupon count : 2 |
| ### End HLL sketch summary |
| |
+--------------------------------------------+
Change-Id: I85dbf20b5114dd75c300eef0accabe90eac240a0
Reviewed-on: http://gerrit.cloudera.org:8080/16382
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
diff --git a/be/src/exprs/datasketches-functions-ir.cc b/be/src/exprs/datasketches-functions-ir.cc
index 4edb83f..1cef6c9 100644
--- a/be/src/exprs/datasketches-functions-ir.cc
+++ b/be/src/exprs/datasketches-functions-ir.cc
@@ -38,6 +38,20 @@
return sketch.get_estimate();
}
+// Deserializes 'serialized_sketch' into an HLL sketch and returns the text produced by
+// the sketch's to_string(). Returns NULL for NULL or empty input, and also returns NULL
+// after calling LogSketchDeserializationError(ctx) when deserialization fails.
+StringVal DataSketchesFunctions::DsHllStringify(FunctionContext* ctx,
+    const StringVal& serialized_sketch) {
+  if (serialized_sketch.is_null || serialized_sketch.len == 0) return StringVal::null();
+  datasketches::hll_sketch sketch(DS_SKETCH_CONFIG, DS_HLL_TYPE);
+  if (!DeserializeDsSketch(serialized_sketch, &sketch)) {
+    LogSketchDeserializationError(ctx);
+    return StringVal::null();
+  }
+  // NOTE(review): the flags presumably map to (summary, detail, aux_detail, all), i.e.
+  // only the summary section is requested - confirm against the DataSketches API.
+  const string str = sketch.to_string(true, false, false, false);
+  StringVal dst(ctx, str.size());
+  // The allocating StringVal constructor can fail (e.g. memory limit exceeded), leaving
+  // 'dst' NULL without a buffer; copying into it would write through a null pointer.
+  if (!dst.is_null) memcpy(dst.ptr, str.c_str(), str.size());
+  return dst;
+}
+
FloatVal DataSketchesFunctions::DsKllQuantile(FunctionContext* ctx,
const StringVal& serialized_sketch, const DoubleVal& rank) {
if (serialized_sketch.is_null || serialized_sketch.len == 0) return FloatVal::null();
diff --git a/be/src/exprs/datasketches-functions.h b/be/src/exprs/datasketches-functions.h
index c35c3f4..91d9313 100644
--- a/be/src/exprs/datasketches-functions.h
+++ b/be/src/exprs/datasketches-functions.h
@@ -35,6 +35,12 @@
static BigIntVal DsHllEstimate(FunctionContext* ctx,
const StringVal& serialized_sketch);
+ /// Returns the human-readable summary of 'serialized_sketch', a serialized Apache
+ /// DataSketches HLL sketch. Returns NULL if the input is NULL or empty; if the input
+ /// cannot be deserialized as an HLL sketch, an error is raised and the query fails.
+ static StringVal DsHllStringify(FunctionContext* ctx,
+ const StringVal& serialized_sketch);
+
/// 'serialized_sketch' is expected as a serialized Apache DataSketches KLL sketch. If
/// it is not, then the query fails. 'rank' is used to identify which item (estimate)
/// to return from the sketched dataset. E.g. 0.1 means the item where 10% of the
diff --git a/common/function-registry/impala_functions.py b/common/function-registry/impala_functions.py
index 93a2926..6a644fe 100644
--- a/common/function-registry/impala_functions.py
+++ b/common/function-registry/impala_functions.py
@@ -933,6 +933,8 @@
# Functions to use Apache DataSketches functionality
[['ds_hll_estimate'], 'BIGINT', ['STRING'],
'_ZN6impala21DataSketchesFunctions13DsHllEstimateEPN10impala_udf15FunctionContextERKNS1_9StringValE'],
+ [['ds_hll_stringify'], 'STRING', ['STRING'],
+ '_ZN6impala21DataSketchesFunctions14DsHllStringifyEPN10impala_udf15FunctionContextERKNS1_9StringValE'],
[['ds_kll_quantile'], 'FLOAT', ['STRING', 'DOUBLE'],
'_ZN6impala21DataSketchesFunctions13DsKllQuantileEPN10impala_udf15FunctionContextERKNS1_9StringValERKNS1_9DoubleValE'],
[['ds_kll_n'], 'BIGINT', ['STRING'],
diff --git a/testdata/workloads/functional-query/queries/QueryTest/datasketches-hll.test b/testdata/workloads/functional-query/queries/QueryTest/datasketches-hll.test
index b55e85f..3327ed3 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/datasketches-hll.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/datasketches-hll.test
@@ -249,3 +249,40 @@
---- TYPES
BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
====
+---- QUERY
+# Check that ds_hll_stringify() returns null for an empty sketch.
+# The input must come from ds_hll_sketch() so that the HLL code path is exercised.
+select ds_hll_stringify(ds_hll_sketch(cast(f2 as float))) from functional_parquet.emptytable;
+---- RESULTS
+'NULL'
+---- TYPES
+STRING
+====
+---- QUERY
+# Check that ds_hll_stringify() returns null for a null input.
+select ds_hll_stringify(c) from functional_parquet.nulltable;
+---- RESULTS
+'NULL'
+---- TYPES
+STRING
+====
+---- QUERY
+# Check that ds_hll_stringify() returns error for strings that are not serialized sketches.
+select ds_hll_stringify(date_string_col) from functional_parquet.alltypestiny;
+---- CATCH
+UDF ERROR: Unable to deserialize sketch
+====
+---- QUERY
+# Check that ds_hll_stringify() works on sketches created by Hive.
+select ds_hll_stringify(f) from hll_sketches_from_hive;
+---- RESULTS
+row_regex: .*### HLL sketch summary:.*Log Config K.*Hll Target.*Current Mode.*LB.*### End HLL sketch summary.*
+---- TYPES
+STRING
+====
+---- QUERY
+# Check that ds_hll_stringify() works on a sketch built by ds_hll_sketch().
+select ds_hll_stringify(ds_hll_sketch(float_col)) from functional_parquet.alltypestiny;
+---- RESULTS
+row_regex: .*### HLL sketch summary:.*Log Config K.*Hll Target.*Current Mode.*LB.*### End HLL sketch summary.*
+---- TYPES
+STRING
+====