blob: 757e352b81743e911244316112c31640b2ac4190 [file] [log] [blame]
====
---- QUERY
# Use a small table for testing Datasketches CPC functions through Impala to make sure
# that these approximate functions give the correct result. For testing Impala
# functionality no need to test how Datasketches CPC approximates count distint values
# so a small table is enough.
select
ds_cpc_estimate(ds_cpc_sketch(tinyint_col)),
ds_cpc_estimate(ds_cpc_sketch(int_col)),
ds_cpc_estimate(ds_cpc_sketch(bigint_col)),
ds_cpc_estimate(ds_cpc_sketch(float_col)),
ds_cpc_estimate(ds_cpc_sketch(double_col)),
ds_cpc_estimate(ds_cpc_sketch(string_col)),
ds_cpc_estimate(ds_cpc_sketch(date_string_col))
from functional_parquet.alltypessmall
---- RESULTS
10,10,10,10,10,10,12
---- TYPES
BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
====
---- QUERY
select
ds_cpc_sketch_and_estimate(tinyint_col),
ds_cpc_sketch_and_estimate(int_col),
ds_cpc_sketch_and_estimate(bigint_col),
ds_cpc_sketch_and_estimate(float_col),
ds_cpc_sketch_and_estimate(double_col),
ds_cpc_sketch_and_estimate(string_col),
ds_cpc_sketch_and_estimate(date_string_col)
from functional_parquet.alltypessmall
---- RESULTS
10,10,10,10,10,10,12
---- TYPES
BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
====
---- QUERY
# Check that unsupported types give error with ds_cpc_sketch().
select ds_cpc_sketch(bool_col) from functional_parquet.alltypessmall;
---- CATCH
AnalysisException: No matching function with signature: ds_cpc_sketch(BOOLEAN)
====
---- QUERY
select ds_cpc_sketch(smallint_col) from functional_parquet.alltypessmall;
---- CATCH
AnalysisException: No matching function with signature: ds_cpc_sketch(SMALLINT)
====
---- QUERY
select ds_cpc_sketch(cast(date_string_col as date format 'MM/DD/YYYY'))
from functional_parquet.alltypessmall;
---- CATCH
AnalysisException: No matching function with signature: ds_cpc_sketch(DATE)
====
---- QUERY
select ds_cpc_sketch(d1) from functional_parquet.decimal_tbl;
---- CATCH
AnalysisException: No matching function with signature: ds_cpc_sketch(DECIMAL(9,0))
====
---- QUERY
# Check that unsupported types give error with ds_cpc_sketch_and_estimate().
select ds_cpc_sketch_and_estimate(bool_col) from functional_parquet.alltypessmall;
---- CATCH
AnalysisException: No matching function with signature: ds_cpc_sketch_and_estimate(BOOLEAN)
====
---- QUERY
select ds_cpc_sketch_and_estimate(smallint_col) from functional_parquet.alltypessmall;
---- CATCH
AnalysisException: No matching function with signature: ds_cpc_sketch_and_estimate(SMALLINT)
====
---- QUERY
select ds_cpc_sketch_and_estimate(cast(date_string_col as date format 'MM/DD/YYYY'))
from functional_parquet.alltypessmall;
---- CATCH
AnalysisException: No matching function with signature: ds_cpc_sketch_and_estimate(DATE)
====
---- QUERY
select ds_cpc_sketch_and_estimate(d1) from functional_parquet.decimal_tbl;
---- CATCH
AnalysisException: No matching function with signature: ds_cpc_sketch_and_estimate(DECIMAL(9,0))
====
---- QUERY
# Check if CPC works with null values.
select
ds_cpc_estimate(ds_cpc_sketch(null_str)),
ds_cpc_estimate(ds_cpc_sketch(null_int)),
ds_cpc_estimate(ds_cpc_sketch(null_double)),
ds_cpc_estimate(ds_cpc_sketch(some_nulls)),
ds_cpc_sketch_and_estimate(null_str),
ds_cpc_sketch_and_estimate(null_int),
ds_cpc_sketch_and_estimate(null_double),
ds_cpc_sketch_and_estimate(some_nulls)
from functional_parquet.nullrows;
---- RESULTS
NULL,NULL,NULL,6,NULL,NULL,NULL,6
---- TYPES
BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
====
---- QUERY
# Check if CPC works for empty datasets.
select
ds_cpc_estimate(ds_cpc_sketch(field)),
ds_cpc_estimate(ds_cpc_sketch(f2)),
ds_cpc_sketch_and_estimate(field),
ds_cpc_sketch_and_estimate(f2)
from functional_parquet.emptytable;
---- RESULTS
NULL,NULL,NULL,NULL
---- TYPES
BIGINT,BIGINT,BIGINT,BIGINT
====
---- QUERY
# Write sketches to a table as string and get an estimate from the written sketch.
# Note, the plan is to write sketches as binary instead of strings. For this we have to
# wait for the binary support (IMPALA-9482).
create table cpc_sketch_store
(year int, month int, date_sketch string, float_sketch string)
stored as parquet;
insert into cpc_sketch_store
select
year,
month,
ds_cpc_sketch(date_string_col),
ds_cpc_sketch(float_col)
from functional_parquet.alltypessmall
group by year, month;
select
year,
month,
ds_cpc_estimate(date_sketch),
ds_cpc_estimate(float_sketch)
from cpc_sketch_store order by month;
---- RESULTS
2009,1,3,10
2009,2,3,10
2009,3,3,10
2009,4,3,10
---- TYPES
INT,INT,BIGINT,BIGINT
====
---- QUERY
# Check that ds_cpc_estimate returns error for strings that are not serialized sketches.
select ds_cpc_estimate(date_string_col) from functional_parquet.alltypestiny;
---- CATCH
UDF ERROR: Unable to deserialize sketch: Insufficient buffer size detected: bytes available 8, minimum needed 192
====
---- QUERY
# Check that ds_cpc_estimate returns null for null and empty string inputs.
select ds_cpc_estimate(b), ds_cpc_estimate(c) from functional_parquet.nulltable;
---- RESULTS
NULL,NULL
---- TYPES
BIGINT,BIGINT
====
---- QUERY
# Check that sketches made by Hive can be read and used for estimating by Impala.
select
ds_cpc_estimate(ti) as ti,
ds_cpc_estimate(i) as i,
ds_cpc_estimate(bi) as bi,
ds_cpc_estimate(f) as f,
ds_cpc_estimate(d) as d,
ds_cpc_estimate(s) as s,
ds_cpc_estimate(c) as c,
ds_cpc_estimate(v) as v,
ds_cpc_estimate(nc) as nc
from cpc_sketches_from_hive;
---- TYPES
BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
---- RESULTS
5,7,6,6,7,4,4,3,NULL
====
---- QUERY
# Check if CPC works with empty strings.
create table empty_string (s string, v varchar(1), c char(1));
insert into empty_string values
("", cast("" as varchar(1)), cast("" as char(1))),
("a", cast("a" as varchar(1)), cast("a" as char(1))),
("", cast("" as varchar(1)), cast("" as char(1))),
("b", cast("b" as varchar(1)), cast("b" as char(1))),
("b", cast("b" as varchar(1)), cast("b" as char(1)));
select
ds_cpc_estimate(ds_cpc_sketch(s)),
ds_cpc_estimate(ds_cpc_sketch(v)),
ds_cpc_estimate(ds_cpc_sketch(c))
from empty_string
---- RESULTS
2,2,3
---- TYPES
BIGINT,BIGINT,BIGINT
====
---- QUERY
# Unions the sketches from cpc_sketch_store and checks if the union produces the same
# result as if the whole data was sketched together into a single sketch.
select
ds_cpc_estimate(ds_cpc_union(date_sketch)),
ds_cpc_estimate(ds_cpc_union(float_sketch))
from cpc_sketch_store;
---- TYPES
BIGINT,BIGINT
---- RESULTS
12,10
====
---- QUERY
# Checks that ds_cpc_union() produces NULL for an empty dataset.
select ds_cpc_union(field) from functional_parquet.emptytable;
---- TYPES
STRING
---- RESULTS
'NULL'
====
---- QUERY
# Checks that ds_cpc_union() produces NULL for NULL inputs.
select ds_cpc_union(null_str) from functional_parquet.nullrows;
---- TYPES
STRING
---- RESULTS
'NULL'
====
---- QUERY
# ds_cpc_union() returns an error if it receives an invalid serialized sketch.
select ds_cpc_union(date_string_col) from functional_parquet.alltypestiny where id=1;
---- CATCH
UDF ERROR: Unable to deserialize sketch: Insufficient buffer size detected: bytes available 8, minimum needed 192
====
---- QUERY
# Get the same sketches from Impala and Hive and put them into the same table. When we
# get the estimates from the unions of these sketches the expectation is to get the same
# results as if these sketches were used separately to get the estimates. However, for
# string types (STRING, CHAR, VARCHAR) we see the numbers doubling up because of the
# difference between how Impala and Hive uses these types. Like IMPALA-9939.
create table cpc_sketches_impala_hive like cpc_sketches_from_impala stored as parquet;
insert into cpc_sketches_impala_hive select * from cpc_sketches_from_hive;
insert into cpc_sketches_impala_hive select * from cpc_sketches_from_impala;
select
ds_cpc_estimate(ds_cpc_union(ti)) as ti,
ds_cpc_estimate(ds_cpc_union(i)) as i,
ds_cpc_estimate(ds_cpc_union(bi)) as bi,
ds_cpc_estimate(ds_cpc_union(f)) as f,
ds_cpc_estimate(ds_cpc_union(d)) as d,
ds_cpc_estimate(ds_cpc_union(s)) as s,
ds_cpc_estimate(ds_cpc_union(c)) as c,
ds_cpc_estimate(ds_cpc_union(v)) as v,
ds_cpc_estimate(ds_cpc_union(nc)) as nc
from cpc_sketches_impala_hive;
---- TYPES
BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
---- RESULTS
5,7,6,6,7,8,8,6,NULL
====
---- QUERY
# Check that ds_cpc_stringify() returns null for an empty sketch.
select ds_cpc_stringify(ds_cpc_sketch(cast(f2 as float))) from functional_parquet.emptytable;
---- RESULTS
'NULL'
---- TYPES
STRING
====
---- QUERY
# Check that ds_cpc_stringify() returns null for a null input.
select ds_cpc_stringify(c) from functional_parquet.nulltable;
---- RESULTS
'NULL'
---- TYPES
STRING
====
---- QUERY
# Check that ds_cpc_stringify() returns error for strings that are not serialized sketches.
select ds_cpc_stringify(date_string_col) from functional_parquet.alltypestiny;
---- CATCH
UDF ERROR: Unable to deserialize sketch: Insufficient buffer size detected: bytes available 8, minimum needed 192
====
---- QUERY
# Check that ds_cpc_stringify() works on sketches created by Hive.
select ds_cpc_stringify(f) from cpc_sketches_from_hive;
---- RESULTS
row_regex: .*### CPC sketch summary:.*lg_k.*seed hash.*C.*flavor.*### End sketch summary.*
---- TYPES
STRING
====
---- QUERY
select ds_cpc_stringify(ds_cpc_sketch(float_col)) from functional_parquet.alltypestiny;
---- RESULTS
row_regex: .*### CPC sketch summary:.*lg_k.*seed hash.*C.*flavor.*### End sketch summary.*
---- TYPES
STRING
====
---- QUERY
# Checks that unioning a valid sketch with a null value result the valid sketch being returned.
select
ds_cpc_estimate(ds_cpc_union_f(date_sketch, null)),
ds_cpc_estimate(ds_cpc_union_f(null, float_sketch))
from cpc_sketch_store;
---- TYPES
BIGINT,BIGINT
---- RESULTS
3,10
3,10
3,10
3,10
====
---- QUERY
# Check that ds_cpc_union_f() returns an empty sketch for an empty sketch.
select ds_cpc_estimate(ds_cpc_union_f(ds_cpc_sketch(cast(f2 as float)), null))
from functional_parquet.emptytable;
---- TYPES
BIGINT
---- RESULTS
0
====
---- QUERY
# Checks that ds_cpc_union_f() returns an empty sketch for NULL inputs.
select ds_cpc_estimate(ds_cpc_union_f(null_str, some_nulls)) from functional_parquet.nullrows
where id='b';
---- TYPES
BIGINT
---- RESULTS
0
====
---- QUERY
# ds_cpc_union_f() returns an error if it receives an invalid serialized sketch.
select ds_cpc_union_f(null, date_string_col) from functional_parquet.alltypestiny
where id=1;
---- CATCH
UDF ERROR: Unable to deserialize sketch: Insufficient buffer size detected: bytes available 8, minimum needed 192
====
---- QUERY
# ds_cpc_union_f() returns an error if it receives an invalid serialized sketch.
select ds_cpc_union_f(date_string_col, null) from functional_parquet.alltypestiny
where id=1;
---- CATCH
UDF ERROR: Unable to deserialize sketch: Insufficient buffer size detected: bytes available 8, minimum needed 192
====
---- QUERY
# Get the same sketches from Impala and Hive and put them into the same table (different
# column prefix). When we get the estimates from the unions of these sketches the expectation
# is to get the same results as if these sketches were used separately to get the estimates.
# However, for string types (STRING, CHAR, VARCHAR) we see the numbers doubling up because
# of the difference between how Impala and Hive uses these types. Like IMPALA-9939.
create table cpc_sketches_impala_hive2 (
i_ti string,i_i string,i_bi string,i_f string,i_d string,i_s string,i_c string,i_v string,
i_nc string,
h_ti string,h_i string,h_bi string,h_f string,h_d string,h_s string,h_c string,h_v string,
h_nc string) stored as parquet;
insert overwrite cpc_sketches_impala_hive2 select
i.ti i_ti, i.i i_i, i.bi i_bi, i.f i_f, i.d i_d, i.s i_s, i.c i_c, i.v i_v,i.nc i_nc,
h.ti h_ti, h.i h_i, h.bi h_bi, h.f h_f, h.d h_d, h.s h_s, h.c h_c, h.v h_v,h.nc h_nc
from cpc_sketches_from_impala i, cpc_sketches_from_hive h;
select
ds_cpc_estimate(ds_cpc_union_f(i_ti, h_ti)) as ti,
ds_cpc_estimate(ds_cpc_union_f(i_i, h_i)) as i,
ds_cpc_estimate(ds_cpc_union_f(i_bi, h_bi)) as bi,
ds_cpc_estimate(ds_cpc_union_f(i_f, h_f)) as f,
ds_cpc_estimate(ds_cpc_union_f(i_d, h_d)) as d,
ds_cpc_estimate(ds_cpc_union_f(i_s, h_s)) as s,
ds_cpc_estimate(ds_cpc_union_f(i_c, h_c)) as c,
ds_cpc_estimate(ds_cpc_union_f(i_v, h_v)) as v,
ds_cpc_estimate(ds_cpc_union_f(i_nc, h_nc)) as nc
from cpc_sketches_impala_hive2;
---- TYPES
BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
---- RESULTS
5,7,6,6,7,8,8,6,0
====
---- QUERY
# Union two sketches from different columns of sketch_intermediate.
create table sketch_input (id1 int, id2 int);
insert into table sketch_input values
(1, 2), (2, 4), (3, 6), (4, 8), (5, 10), (6, 12), (7, 14), (8, 16), (9, 18), (10, 20);
create table sketch_intermediate (sketch1 string, sketch2 string) stored as parquet;
insert overwrite sketch_intermediate
select ds_cpc_sketch(id1), ds_cpc_sketch(id2) from sketch_input;
select ds_cpc_estimate(ds_cpc_union_f(sketch1, sketch2)) from sketch_intermediate;
---- TYPES
BIGINT
---- RESULTS
15
====