| ==== |
| ---- QUERY |
| # Use a small table for testing Datasketches HLL functions through Impala to make sure |
| # that these approximate functions give the correct result. For testing Impala |
| # functionality no need to test how Datasketches HLL approximates count distint values |
| # so a small table is enough. |
| select |
| ds_hll_estimate(ds_hll_sketch(tinyint_col)), |
| ds_hll_estimate(ds_hll_sketch(int_col)), |
| ds_hll_estimate(ds_hll_sketch(bigint_col)), |
| ds_hll_estimate(ds_hll_sketch(float_col)), |
| ds_hll_estimate(ds_hll_sketch(double_col)), |
| ds_hll_estimate(ds_hll_sketch(string_col)) |
| from functional_parquet.alltypessmall |
| ---- RESULTS |
| 10,10,10,10,10,10 |
| ---- TYPES |
| BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT |
| ==== |
| ---- QUERY |
| select |
| ds_hll_sketch_and_estimate(tinyint_col), |
| ds_hll_sketch_and_estimate(int_col), |
| ds_hll_sketch_and_estimate(bigint_col), |
| ds_hll_sketch_and_estimate(float_col), |
| ds_hll_sketch_and_estimate(double_col), |
| ds_hll_sketch_and_estimate(string_col) |
| from functional_parquet.alltypessmall |
| ---- RESULTS |
| 10,10,10,10,10,10 |
| ---- TYPES |
| BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT |
| ==== |
| ---- QUERY |
| # Check that unsupported types give error with ds_hll_sketch(). |
| select ds_hll_sketch(bool_col) from functional_parquet.alltypessmall; |
| ---- CATCH |
| AnalysisException: No matching function with signature: ds_hll_sketch(BOOLEAN) |
| ==== |
| ---- QUERY |
| select ds_hll_sketch(smallint_col) from functional_parquet.alltypessmall; |
| ---- CATCH |
| AnalysisException: No matching function with signature: ds_hll_sketch(SMALLINT) |
| ==== |
| ---- QUERY |
| select ds_hll_sketch(cast(date_string_col as date format 'MM/DD/YYYY')) |
| from functional_parquet.alltypessmall; |
| ---- CATCH |
| AnalysisException: No matching function with signature: ds_hll_sketch(DATE) |
| ==== |
| ---- QUERY |
| select ds_hll_sketch(d1) from functional_parquet.decimal_tbl; |
| ---- CATCH |
| AnalysisException: No matching function with signature: ds_hll_sketch(DECIMAL(9,0)) |
| ==== |
| ---- QUERY |
| # Check that unsupported types give error with ds_hll_sketch_and_estimate(). |
| select ds_hll_sketch_and_estimate(bool_col) from functional_parquet.alltypessmall; |
| ---- CATCH |
| AnalysisException: No matching function with signature: ds_hll_sketch_and_estimate(BOOLEAN) |
| ==== |
| ---- QUERY |
| select ds_hll_sketch_and_estimate(smallint_col) from functional_parquet.alltypessmall; |
| ---- CATCH |
| AnalysisException: No matching function with signature: ds_hll_sketch_and_estimate(SMALLINT) |
| ==== |
| ---- QUERY |
| select ds_hll_sketch_and_estimate(cast(date_string_col as date format 'MM/DD/YYYY')) |
| from functional_parquet.alltypessmall; |
| ---- CATCH |
| AnalysisException: No matching function with signature: ds_hll_sketch_and_estimate(DATE) |
| ==== |
| ---- QUERY |
| select ds_hll_sketch_and_estimate(d1) from functional_parquet.decimal_tbl; |
| ---- CATCH |
| AnalysisException: No matching function with signature: ds_hll_sketch_and_estimate(DECIMAL(9,0)) |
| ==== |
| ---- QUERY |
| # Check if HLL works with null values. |
| select |
| ds_hll_estimate(ds_hll_sketch(null_str)), |
| ds_hll_estimate(ds_hll_sketch(null_int)), |
| ds_hll_estimate(ds_hll_sketch(null_double)), |
| ds_hll_estimate(ds_hll_sketch(some_nulls)), |
| ds_hll_sketch_and_estimate(null_str), |
| ds_hll_sketch_and_estimate(null_int), |
| ds_hll_sketch_and_estimate(null_double), |
| ds_hll_sketch_and_estimate(some_nulls) |
| from functional_parquet.nullrows |
| ---- RESULTS |
| NULL,NULL,NULL,6,NULL,NULL,NULL,6 |
| ---- TYPES |
| BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT |
| ==== |
| ---- QUERY |
| # Check if HLL works for empty datasets. |
| select |
| ds_hll_estimate(ds_hll_sketch(field)), |
| ds_hll_estimate(ds_hll_sketch(f2)), |
| ds_hll_sketch_and_estimate(field), |
| ds_hll_sketch_and_estimate(f2) |
| from functional_parquet.emptytable |
| ---- RESULTS |
| NULL,NULL,NULL,NULL |
| ---- TYPES |
| BIGINT,BIGINT,BIGINT,BIGINT |
| ==== |
| ---- QUERY |
| # Write sketches to a table as string and get an estimate from the written sketch. |
| # Note, the plan is to write sketches as binary instead of strings. For this we have to |
| # wait for the binary support (IMPALA-9482). |
| create table sketch_store |
| (year int, month int, date_sketch string, float_sketch string) |
| stored as parquet; |
| insert into sketch_store |
| select |
| year, |
| month, |
| ds_hll_sketch(date_string_col), |
| ds_hll_sketch(float_col) |
| from functional_parquet.alltypessmall |
| group by year, month; |
| select |
| year, |
| month, |
| ds_hll_estimate(date_sketch), |
| ds_hll_estimate(float_sketch) |
| from sketch_store; |
| ---- RESULTS |
| 2009,1,3,10 |
| 2009,2,3,10 |
| 2009,3,3,10 |
| 2009,4,3,10 |
| ---- TYPES |
| INT,INT,BIGINT,BIGINT |
| ==== |
| ---- QUERY |
| # Check that ds_hll_estimate returns error for strings that are not serialized sketches. |
| select ds_hll_estimate(date_string_col) from functional_parquet.alltypestiny; |
| ---- CATCH |
| UDF ERROR: Unable to deserialize sketch. |
| ==== |
| ---- QUERY |
| # Check that ds_hll_estimate returns null for null and empty string inputs. |
| select ds_hll_estimate(b), ds_hll_estimate(c) from functional_parquet.nulltable; |
| ---- RESULTS |
| NULL,NULL |
| ---- TYPES |
| BIGINT,BIGINT |
| ==== |
| ---- QUERY |
| # Check that sketches made by Hive can be read and used for estimating by Impala. |
| select |
| ds_hll_estimate(ti) as ti, |
| ds_hll_estimate(i) as i, |
| ds_hll_estimate(bi) as bi, |
| ds_hll_estimate(f) as f, |
| ds_hll_estimate(d) as d, |
| ds_hll_estimate(s) as s, |
| ds_hll_estimate(c) as c, |
| ds_hll_estimate(v) as v, |
| ds_hll_estimate(nc) as nc |
| from hll_sketches_from_hive; |
| ---- TYPES |
| BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT |
| ---- RESULTS |
| 5,7,6,5,6,4,3,3,NULL |
| ==== |
| ---- QUERY |
| # Unions the sketches from sketch_store and checks if the union produces the same result |
| # as if the whole data was sketched together into a single sketch. |
| select |
| ds_hll_estimate(ds_hll_union(date_sketch)), |
| ds_hll_estimate(ds_hll_union(float_sketch)) |
| from sketch_store; |
| ---- TYPES |
| BIGINT,BIGINT |
| ---- RESULTS |
| 12,10 |
| ==== |
| ---- QUERY |
| # Checks that ds_hll_union() produces NULL for an empty dataset. |
| select ds_hll_union(field) from functional_parquet.emptytable; |
| ---- TYPES |
| STRING |
| ---- RESULTS |
| 'NULL' |
| ==== |
| ---- QUERY |
| # Checks that ds_hll_union() produces NULL for NULL inputs. |
| select ds_hll_union(null_str) from functional_parquet.nullrows; |
| ---- TYPES |
| STRING |
| ---- RESULTS |
| 'NULL' |
| ==== |
| ---- QUERY |
| # ds_hll_union() returns an error if it receives an invalid serialized sketch. |
| select ds_hll_union(date_string_col) from functional_parquet.alltypestiny where id=1; |
| ---- CATCH |
| UDF ERROR: Unable to deserialize sketch. |
| ==== |
| ---- QUERY |
| # Get the same sketches from Impala and Hive and put them into the same table. When we |
| # get the estimates from the unions of these sketches the expectation is to get the same |
| # results as if these sketches were used separately to get the estimates. However, for |
| # string types (STRING, CHAR, VARCHAR) we see the numbers doubling up because of the |
| # difference between how Impala and Hive uses these types. See IMPALA-9939 for further |
| # details. |
| create table hll_sketches_impala_hive like hll_sketches_from_impala stored as parquet; |
| insert into hll_sketches_impala_hive select * from hll_sketches_from_hive; |
| insert into hll_sketches_impala_hive select * from hll_sketches_from_impala; |
| select |
| ds_hll_estimate(ds_hll_union(ti)) as ti, |
| ds_hll_estimate(ds_hll_union(i)) as i, |
| ds_hll_estimate(ds_hll_union(bi)) as bi, |
| ds_hll_estimate(ds_hll_union(f)) as f, |
| ds_hll_estimate(ds_hll_union(d)) as d, |
| ds_hll_estimate(ds_hll_union(s)) as s, |
| ds_hll_estimate(ds_hll_union(c)) as c, |
| ds_hll_estimate(ds_hll_union(v)) as v, |
| ds_hll_estimate(ds_hll_union(nc)) as nc |
| from hll_sketches_impala_hive; |
| ---- TYPES |
| BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT |
| ---- RESULTS |
| 5,7,6,5,6,8,6,6,NULL |
| ==== |
| ---- QUERY |
| # IMPALA-9942: DataSketches HLL shouldn't take empty strings as distinct values |
| create table empty_string (s string, v varchar(1), c char(1)); |
| insert into empty_string values |
| ("", cast("" as varchar(1)), cast("" as char(1))), |
| ("a", cast("a" as varchar(1)), cast("a" as char(1))), |
| ("", cast("" as varchar(1)), cast("" as char(1))), |
| ("b", cast("b" as varchar(1)), cast("b" as char(1))), |
| ("b", cast("b" as varchar(1)), cast("b" as char(1))); |
| # Check if HLL works with empty strings. |
| select |
| ds_hll_estimate(ds_hll_sketch(s)), |
| ds_hll_estimate(ds_hll_sketch(v)), |
| ds_hll_estimate(ds_hll_sketch(c)), |
| ds_hll_sketch_and_estimate(s), |
| ds_hll_sketch_and_estimate(v), |
| ds_hll_sketch_and_estimate(c) |
| from empty_string |
| ---- RESULTS |
| 2,2,3,2,2,3 |
| ---- TYPES |
| BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT |
| ==== |
| ---- QUERY |
| # Check that ds_hll_stringify() returns null for an empty sketch. |
| select ds_hll_stringify(ds_kll_sketch(cast(f2 as float))) from functional_parquet.emptytable; |
| ---- RESULTS |
| 'NULL' |
| ---- TYPES |
| STRING |
| ==== |
| ---- QUERY |
| # Check that ds_hll_stringify() returns null for a null input. |
| select ds_hll_stringify(c) from functional_parquet.nulltable; |
| ---- RESULTS |
| 'NULL' |
| ---- TYPES |
| STRING |
| ==== |
| ---- QUERY |
| # Check that ds_hll_stringify() returns error for strings that are not serialized sketches. |
| select ds_hll_stringify(date_string_col) from functional_parquet.alltypestiny; |
| ---- CATCH |
| UDF ERROR: Unable to deserialize sketch |
| ==== |
| ---- QUERY |
| # Check that ds_hll_stringify() works on sketches created by Hive. |
| select ds_hll_stringify(f) from hll_sketches_from_hive; |
| ---- RESULTS |
| row_regex: .*### HLL sketch summary:.*Log Config K.*Hll Target.*Current Mode.*LB.*### End HLL sketch summary.* |
| ---- TYPES |
| STRING |
| ==== |
| ---- QUERY |
| select ds_hll_stringify(ds_hll_sketch(float_col)) from functional_parquet.alltypestiny; |
| ---- RESULTS |
| row_regex: .*### HLL sketch summary:.*Log Config K.*Hll Target.*Current Mode.*LB.*### End HLL sketch summary.* |
| ---- TYPES |
| STRING |
| ==== |
| ---- QUERY |
| # Check that ds_hll_estimate_bounds_as_string() returns error for strings that are not |
| # serialized sketches. |
| select ds_hll_estimate_bounds_as_string(date_string_col) from functional_parquet.alltypestiny; |
| ---- CATCH |
| UDF ERROR: Unable to deserialize sketch. |
| ==== |
| ---- QUERY |
| # Check that ds_hll_estimate_bounds_as_string() returns error for kappa is not 1, 2, 3. |
| select ds_hll_estimate_bounds_as_string(date_string_col, 4) from functional_parquet.alltypestiny; |
| ---- CATCH |
| UDF ERROR: Kappa must be 1, 2 or 3 |
| ==== |
| ---- QUERY |
| select ds_hll_estimate_bounds_as_string(date_string_col, 1.5) from functional_parquet.alltypestiny; |
| ---- CATCH |
| AnalysisException: No matching function with signature: ds_hll_estimate_bounds_as_string(STRING, DECIMAL(2,1)). |
| ==== |
| ---- QUERY |
| # Check that ds_hll_estimate_bounds_as_string() works on sketches created by Hive. |
| select ds_hll_estimate_bounds_as_string(f) from hll_sketches_from_hive; |
| ---- RESULTS |
| '5,5,5.0005' |
| ---- TYPES |
| STRING |
| ==== |
| ---- QUERY |
| select ds_hll_estimate_bounds_as_string(f, 2) from hll_sketches_from_hive; |
| ---- RESULTS |
| '5,5,5.0005' |
| ---- TYPES |
| STRING |
| ==== |
| ---- QUERY |
| select ds_hll_estimate_bounds_as_string(f, 1) from hll_sketches_from_hive; |
| ---- RESULTS |
| '5,5,5.00025' |
| ---- TYPES |
| STRING |
| ==== |
| ---- QUERY |
| select ds_hll_estimate_bounds_as_string(ds_hll_sketch(float_col)) |
| from functional_parquet.alltypestiny; |
| ---- RESULTS |
| '2,2,2.0002' |
| ---- TYPES |
| STRING |
| ==== |
| ---- QUERY |
| select ds_hll_estimate_bounds_as_string(ds_hll_sketch(float_col), 3) |
| from functional_parquet.alltypestiny; |
| ---- RESULTS |
| '2,2,2.0003' |
| ---- TYPES |
| STRING |
| ==== |
| ---- QUERY |
| # Checks that unioning a valid sketch with a null value result the valid sketch being returned. |
| select |
| ds_hll_estimate(ds_hll_union_f(date_sketch, null)), |
| ds_hll_estimate(ds_hll_union_f(null, float_sketch)) |
| from sketch_store; |
| ---- TYPES |
| BIGINT,BIGINT |
| ---- RESULTS |
| 3,10 |
| 3,10 |
| 3,10 |
| 3,10 |
| ==== |
| ---- QUERY |
| # Check that ds_hll_union_f() returns an empty sketch for an empty sketch. |
| select ds_hll_estimate(ds_hll_union_f(ds_hll_sketch(cast(f2 as float)), null)) |
| from functional_parquet.emptytable; |
| ---- TYPES |
| BIGINT |
| ---- RESULTS |
| 0 |
| ==== |
| ---- QUERY |
| # Checks that ds_hll_union_f() returns an empty sketch for NULL inputs. |
| select ds_hll_estimate(ds_hll_union_f(null_str, some_nulls)) from functional_parquet.nullrows |
| where id='b'; |
| ---- TYPES |
| BIGINT |
| ---- RESULTS |
| 0 |
| ==== |
| ---- QUERY |
| # ds_hll_union_f() returns an error if it receives an invalid serialized sketch. |
| select ds_hll_union_f(null, date_string_col) from functional_parquet.alltypestiny |
| where id=1; |
| ---- CATCH |
| UDF ERROR: Unable to deserialize sketch. |
| ==== |
| ---- QUERY |
| # ds_hll_union_f() returns an error if it receives an invalid serialized sketch. |
| select ds_hll_union_f(date_string_col, null) from functional_parquet.alltypestiny |
| where id=1; |
| ---- CATCH |
| UDF ERROR: Unable to deserialize sketch. |
| ==== |
| ---- QUERY |
| # Get the same sketches from Impala and Hive and put them into the same table (different |
| # column prefix). When we get the estimates from the unions of these sketches the expectation |
| # is to get the same results as if these sketches were used separately to get the estimates. |
| # However, for string types (STRING, CHAR, VARCHAR) we see the numbers doubling up because |
| # of the difference between how Impala and Hive uses these types. See IMPALA-9939 for further |
| # details. |
| create table hll_sketches_impala_hive2 ( |
| i_ti string,i_i string,i_bi string,i_f string,i_d string,i_s string,i_c string,i_v string, |
| i_nc string, |
| h_ti string,h_i string,h_bi string,h_f string,h_d string,h_s string,h_c string,h_v string, |
| h_nc string) stored as parquet; |
| insert overwrite hll_sketches_impala_hive2 select |
| i.ti i_ti, i.i i_i, i.bi i_bi, i.f i_f, i.d i_d, i.s i_s, i.c i_c, i.v i_v,i.nc i_nc, |
| h.ti h_ti, h.i h_i, h.bi h_bi, h.f h_f, h.d h_d, h.s h_s, h.c h_c, h.v h_v,h.nc h_nc |
| from hll_sketches_from_impala i, hll_sketches_from_hive h; |
| select |
| ds_hll_estimate(ds_hll_union_f(i_ti, h_ti)) as ti, |
| ds_hll_estimate(ds_hll_union_f(i_i, h_i)) as i, |
| ds_hll_estimate(ds_hll_union_f(i_bi, h_bi)) as bi, |
| ds_hll_estimate(ds_hll_union_f(i_f, h_f)) as f, |
| ds_hll_estimate(ds_hll_union_f(i_d, h_d)) as d, |
| ds_hll_estimate(ds_hll_union_f(i_s, h_s)) as s, |
| ds_hll_estimate(ds_hll_union_f(i_c, h_c)) as c, |
| ds_hll_estimate(ds_hll_union_f(i_v, h_v)) as v, |
| ds_hll_estimate(ds_hll_union_f(i_nc, h_nc)) as nc |
| from hll_sketches_impala_hive2; |
| ---- TYPES |
| BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT |
| ---- RESULTS |
| 5,7,6,5,6,8,6,6,0 |
| ==== |