blob: 3327ed38f71d3d60cc7f3978b7ccf14c18027357 [file] [log] [blame]
====
---- QUERY
# Use a small table for testing Datasketches HLL functions through Impala to make sure
# that these approximate functions give the correct result. For testing Impala
# functionality no need to test how Datasketches HLL approximates count distint values
# so a small table is enough.
select
ds_hll_estimate(ds_hll_sketch(tinyint_col)),
ds_hll_estimate(ds_hll_sketch(int_col)),
ds_hll_estimate(ds_hll_sketch(bigint_col)),
ds_hll_estimate(ds_hll_sketch(float_col)),
ds_hll_estimate(ds_hll_sketch(double_col)),
ds_hll_estimate(ds_hll_sketch(string_col))
from functional_parquet.alltypessmall
---- RESULTS
10,10,10,10,10,10
---- TYPES
BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
====
---- QUERY
select
ds_hll_sketch_and_estimate(tinyint_col),
ds_hll_sketch_and_estimate(int_col),
ds_hll_sketch_and_estimate(bigint_col),
ds_hll_sketch_and_estimate(float_col),
ds_hll_sketch_and_estimate(double_col),
ds_hll_sketch_and_estimate(string_col)
from functional_parquet.alltypessmall
---- RESULTS
10,10,10,10,10,10
---- TYPES
BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
====
---- QUERY
# Check that unsupported types give error with ds_hll_sketch().
select ds_hll_sketch(bool_col) from functional_parquet.alltypessmall;
---- CATCH
AnalysisException: No matching function with signature: ds_hll_sketch(BOOLEAN)
====
---- QUERY
select ds_hll_sketch(smallint_col) from functional_parquet.alltypessmall;
---- CATCH
AnalysisException: No matching function with signature: ds_hll_sketch(SMALLINT)
====
---- QUERY
select ds_hll_sketch(cast(date_string_col as date format 'MM/DD/YYYY'))
from functional_parquet.alltypessmall;
---- CATCH
AnalysisException: No matching function with signature: ds_hll_sketch(DATE)
====
---- QUERY
select ds_hll_sketch(d1) from functional_parquet.decimal_tbl;
---- CATCH
AnalysisException: No matching function with signature: ds_hll_sketch(DECIMAL(9,0))
====
---- QUERY
# Check that unsupported types give error with ds_hll_sketch_and_estimate().
select ds_hll_sketch_and_estimate(bool_col) from functional_parquet.alltypessmall;
---- CATCH
AnalysisException: No matching function with signature: ds_hll_sketch_and_estimate(BOOLEAN)
====
---- QUERY
select ds_hll_sketch_and_estimate(smallint_col) from functional_parquet.alltypessmall;
---- CATCH
AnalysisException: No matching function with signature: ds_hll_sketch_and_estimate(SMALLINT)
====
---- QUERY
select ds_hll_sketch_and_estimate(cast(date_string_col as date format 'MM/DD/YYYY'))
from functional_parquet.alltypessmall;
---- CATCH
AnalysisException: No matching function with signature: ds_hll_sketch_and_estimate(DATE)
====
---- QUERY
select ds_hll_sketch_and_estimate(d1) from functional_parquet.decimal_tbl;
---- CATCH
AnalysisException: No matching function with signature: ds_hll_sketch_and_estimate(DECIMAL(9,0))
====
---- QUERY
# Check if HLL works with null values.
select
ds_hll_estimate(ds_hll_sketch(null_str)),
ds_hll_estimate(ds_hll_sketch(null_int)),
ds_hll_estimate(ds_hll_sketch(null_double)),
ds_hll_estimate(ds_hll_sketch(some_nulls)),
ds_hll_sketch_and_estimate(null_str),
ds_hll_sketch_and_estimate(null_int),
ds_hll_sketch_and_estimate(null_double),
ds_hll_sketch_and_estimate(some_nulls)
from functional_parquet.nullrows
---- RESULTS
NULL,NULL,NULL,6,NULL,NULL,NULL,6
---- TYPES
BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
====
---- QUERY
# Check if HLL works for empty datasets.
select
ds_hll_estimate(ds_hll_sketch(field)),
ds_hll_estimate(ds_hll_sketch(f2)),
ds_hll_sketch_and_estimate(field),
ds_hll_sketch_and_estimate(f2)
from functional_parquet.emptytable
---- RESULTS
NULL,NULL,NULL,NULL
---- TYPES
BIGINT,BIGINT,BIGINT,BIGINT
====
---- QUERY
# Write sketches to a table as string and get an estimate from the written sketch.
# Note, the plan is to write sketches as binary instead of strings. For this we have to
# wait for the binary support (IMPALA-9482).
create table sketch_store
(year int, month int, date_sketch string, float_sketch string)
stored as parquet;
insert into sketch_store
select
year,
month,
ds_hll_sketch(date_string_col),
ds_hll_sketch(float_col)
from functional_parquet.alltypessmall
group by year, month;
select
year,
month,
ds_hll_estimate(date_sketch),
ds_hll_estimate(float_sketch)
from sketch_store;
---- RESULTS
2009,1,3,10
2009,2,3,10
2009,3,3,10
2009,4,3,10
---- TYPES
INT,INT,BIGINT,BIGINT
====
---- QUERY
# Check that ds_hll_estimate returns error for strings that are not serialized sketches.
select ds_hll_estimate(date_string_col) from functional_parquet.alltypestiny;
---- CATCH
UDF ERROR: Unable to deserialize sketch.
====
---- QUERY
# Check that ds_hll_estimate returns null for null and empty string inputs.
select ds_hll_estimate(b), ds_hll_estimate(c) from functional_parquet.nulltable;
---- RESULTS
NULL,NULL
---- TYPES
BIGINT,BIGINT
====
---- QUERY
# Check that sketches made by Hive can be read and used for estimating by Impala.
select
ds_hll_estimate(ti) as ti,
ds_hll_estimate(i) as i,
ds_hll_estimate(bi) as bi,
ds_hll_estimate(f) as f,
ds_hll_estimate(d) as d,
ds_hll_estimate(s) as s,
ds_hll_estimate(c) as c,
ds_hll_estimate(v) as v,
ds_hll_estimate(nc) as nc
from hll_sketches_from_hive;
---- TYPES
BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
---- RESULTS
5,7,6,5,6,4,3,3,NULL
====
---- QUERY
# Unions the sketches from sketch_store and checks if the union produces the same result
# as if the whole data was sketched together into a single sketch.
select
ds_hll_estimate(ds_hll_union(date_sketch)),
ds_hll_estimate(ds_hll_union(float_sketch))
from sketch_store;
---- TYPES
BIGINT,BIGINT
---- RESULTS
12,10
====
---- QUERY
# Checks that ds_hll_union() produces NULL for an empty dataset.
select ds_hll_union(field) from functional_parquet.emptytable;
---- TYPES
STRING
---- RESULTS
'NULL'
====
---- QUERY
# Checks that ds_hll_union() produces NULL for NULL inputs.
select ds_hll_union(null_str) from functional_parquet.nullrows;
---- TYPES
STRING
---- RESULTS
'NULL'
====
---- QUERY
# ds_hll_union() returns an error if it receives an invalid serialized sketch.
select ds_hll_union(date_string_col) from functional_parquet.alltypestiny where id=1;
---- CATCH
UDF ERROR: Unable to deserialize sketch.
====
---- QUERY
# Get the same sketches from Impala and Hive and put them into the same table. When we
# get the estimates from the unions of these sketches the expectation is to get the same
# results as if these sketches were used separately to get the estimates. However, for
# string types (STRING, CHAR, VARCHAR) we see the numbers doubling up because of the
# difference between how Impala and Hive uses these types. See IMPALA-9939 for further
# details.
create table hll_sketches_impala_hive like hll_sketches_from_impala stored as parquet;
insert into hll_sketches_impala_hive select * from hll_sketches_from_hive;
insert into hll_sketches_impala_hive select * from hll_sketches_from_impala;
select
ds_hll_estimate(ds_hll_union(ti)) as ti,
ds_hll_estimate(ds_hll_union(i)) as i,
ds_hll_estimate(ds_hll_union(bi)) as bi,
ds_hll_estimate(ds_hll_union(f)) as f,
ds_hll_estimate(ds_hll_union(d)) as d,
ds_hll_estimate(ds_hll_union(s)) as s,
ds_hll_estimate(ds_hll_union(c)) as c,
ds_hll_estimate(ds_hll_union(v)) as v,
ds_hll_estimate(ds_hll_union(nc)) as nc
from hll_sketches_impala_hive;
---- TYPES
BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
---- RESULTS
5,7,6,5,6,8,6,6,NULL
====
---- QUERY
# IMPALA-9942: DataSketches HLL shouldn't take empty strings as distinct values
create table empty_string (s string, v varchar(1), c char(1));
insert into empty_string values
("", cast("" as varchar(1)), cast("" as char(1))),
("a", cast("a" as varchar(1)), cast("a" as char(1))),
("", cast("" as varchar(1)), cast("" as char(1))),
("b", cast("b" as varchar(1)), cast("b" as char(1))),
("b", cast("b" as varchar(1)), cast("b" as char(1)));
# Check if HLL works with empty strings.
select
ds_hll_estimate(ds_hll_sketch(s)),
ds_hll_estimate(ds_hll_sketch(v)),
ds_hll_estimate(ds_hll_sketch(c)),
ds_hll_sketch_and_estimate(s),
ds_hll_sketch_and_estimate(v),
ds_hll_sketch_and_estimate(c)
from empty_string
---- RESULTS
2,2,3,2,2,3
---- TYPES
BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
====
---- QUERY
# Check that ds_hll_stringify() returns null for an empty sketch.
select ds_hll_stringify(ds_kll_sketch(cast(f2 as float))) from functional_parquet.emptytable;
---- RESULTS
'NULL'
---- TYPES
STRING
====
---- QUERY
# Check that ds_hll_stringify() returns null for a null input.
select ds_hll_stringify(c) from functional_parquet.nulltable;
---- RESULTS
'NULL'
---- TYPES
STRING
====
---- QUERY
# Check that ds_hll_stringify() returns error for strings that are not serialized sketches.
select ds_hll_stringify(date_string_col) from functional_parquet.alltypestiny;
---- CATCH
UDF ERROR: Unable to deserialize sketch
====
---- QUERY
# Check that ds_hll_stringify() works on sketches created by Hive.
select ds_hll_stringify(f) from hll_sketches_from_hive;
---- RESULTS
row_regex: .*### HLL sketch summary:.*Log Config K.*Hll Target.*Current Mode.*LB.*### End HLL sketch summary.*
---- TYPES
STRING
====
---- QUERY
select ds_hll_stringify(ds_hll_sketch(float_col)) from functional_parquet.alltypestiny;
---- RESULTS
row_regex: .*### HLL sketch summary:.*Log Config K.*Hll Target.*Current Mode.*LB.*### End HLL sketch summary.*
---- TYPES
STRING
====