blob: bd6e7163678c869ba96e8c5bf5b728246f45c155 [file] [log] [blame]
set hive.strict.checks.cartesian.product=false;
CREATE TABLE `customer_removal_n0`(
`c_custkey` bigint,
`c_name` string,
`c_address` string,
`c_city` string,
`c_nation` string,
`c_region` string,
`c_phone` string,
`c_mktsegment` string,
primary key (`c_custkey`) disable rely);
CREATE TABLE `dates_removal_n0`(
`d_datekey` bigint,
`d_id` bigint,
`d_date` string,
`d_dayofweek` string,
`d_month` string,
`d_year` int,
`d_yearmonthnum` int,
`d_yearmonth` string,
`d_daynuminweek` int,
`d_daynuminmonth` int,
`d_daynuminyear` int,
`d_monthnuminyear` int,
`d_weeknuminyear` int,
`d_sellingseason` string,
`d_lastdayinweekfl` int,
`d_lastdayinmonthfl` int,
`d_holidayfl` int ,
`d_weekdayfl`int,
primary key (`d_datekey`, `d_id`) disable rely);
-- group by key has single primary key
EXPLAIN SELECT c_custkey from customer_removal_n0 where c_nation IN ('USA', 'INDIA') group by c_custkey;
-- mix of primary + non-primary keys
EXPLAIN SELECT c_custkey from customer_removal_n0 where c_nation IN ('USA', 'INDIA') group by c_custkey, c_nation;
-- multiple keys
EXPLAIN SELECT d_datekey from dates_removal_n0 where d_year IN (1985, 2004) group by d_datekey, d_id;
-- multiple keys + non-keys + different order
EXPLAIN SELECT d_datekey from dates_removal_n0 where d_year IN (1985, 2004) group by d_id, d_datekey, d_sellingseason
order by d_datekey limit 10;
-- multiple keys in different order and mixed with non-keys
EXPLAIN SELECT d_datekey from dates_removal_n0 where d_year IN (1985, 2004) group by d_id, d_daynuminmonth, d_datekey,
d_sellingseason order by d_datekey limit 10;
-- same as above but with aggregate
EXPLAIN SELECT count(d_datekey) from dates_removal_n0 where d_year IN (1985, 2004) group by d_id, d_daynuminmonth, d_datekey,
d_sellingseason order by d_datekey limit 10;
-- join
insert into dates_removal_n0(d_datekey, d_id) values(3, 0);
insert into dates_removal_n0(d_datekey, d_id) values(3, 1);
insert into customer_removal_n0 (c_custkey) values(3);
EXPLAIN SELECT d_datekey from dates_removal_n0 join customer_removal_n0 on d_datekey = c_custkey group by d_datekey, d_id;
SELECT d_datekey from dates_removal_n0 join customer_removal_n0 on d_datekey = c_custkey group by d_datekey, d_id;
-- group by keys are not primary keys
EXPLAIN SELECT d_datekey from dates_removal_n0 where d_year IN (1985, 2004) group by d_datekey, d_sellingseason
order by d_datekey limit 10;
-- negative
-- with aggregate function
EXPLAIN SELECT count(c_custkey) from customer_removal_n0 where c_nation IN ('USA', 'INDIA')
group by c_custkey, c_nation;
DROP TABLE customer_removal_n0;
DROP TABLE dates_removal_n0;
-- group by reduction optimization
create table dest_g21 (key1 int, value1 double, primary key(key1) disable rely);
insert into dest_g21 values(1, 2), (2,2), (3, 1), (4,4), (5, null), (6, null);
-- value1 will removed because it is unused, then whole group by will be removed because key1 is unique
explain select key1 from dest_g21 group by key1, value1;
select key1 from dest_g21 group by key1, value1;
-- same query but with filter
explain select key1 from dest_g21 where value1 > 1 group by key1, value1;
select key1 from dest_g21 where value1 > 1 group by key1, value1;
explain select key1 from dest_g21 where key1 > 1 group by key1, value1;
select key1 from dest_g21 where key1 > 1 group by key1, value1;
-- only value1 will be removed because there is aggregate call
explain select count(key1) from dest_g21 group by key1, value1;
select count(key1) from dest_g21 group by key1, value1;
explain select count(key1) from dest_g21 where value1 > 1 group by key1, value1;
select count(key1) from dest_g21 where value1 > 1 group by key1, value1;
-- t1.key is unique even after join therefore group by = group by (t1.key)
explain select t1.key1 from dest_g21 t1 join dest_g21 t2 on t1.key1 = t2.key1 where t2.value1 > 2 group by t1.key1, t1.value1;
select t1.key1 from dest_g21 t1 join dest_g21 t2 on t1.key1 = t2.key1 where t2.value1 > 2 group by t1.key1, t1.value1;
explain select count(t1.key1) from dest_g21 t1 join dest_g21 t2 on t1.key1 = t2.key1 where t2.value1 > 2 group by t1.key1, t1.value1;
select count(t1.key1) from dest_g21 t1 join dest_g21 t2 on t1.key1 = t2.key1 where t2.value1 > 2 group by t1.key1, t1.value1;
-- both aggregate and one of the key1 should be removed
explain select key1 from (select key1, count(key1) from dest_g21 where value1 < 4.5 group by key1, value1) sub;
select key1 from (select key1, count(key1) from dest_g21 where value1 < 4.5 group by key1, value1) sub;
-- one of the aggregate will be removed and one of the key1 will be removed
explain select key1, sm from (select key1, count(key1), sum(key1) as sm from dest_g21 where value1 < 4.5 group by key1, value1) sub;
select key1, sm from (select key1, count(key1), sum(key1) as sm from dest_g21 where value1 < 4.5 group by key1, value1) sub;
DROP table dest_g21;
CREATE TABLE tconst(i int NOT NULL disable rely, j INT NOT NULL disable norely, d_year string);
INSERT INTO tconst values(1, 1, '2001'), (2, null, '2002'), (3, 3, '2010');
-- explicit NOT NULL filter
explain select i, j from tconst where i is not null group by i,j, d_year;
select i, j from tconst where i is not null group by i,j, d_year;
-- filter on i should be removed
explain select i, j from tconst where i IS NOT NULL and j IS NOT NULL group by i,j, d_year;
select i, j from tconst where i IS NOT NULL and j IS NOT NULL group by i,j, d_year;
-- where will be removed since i is not null is always true
explain select i,j from tconst where i is not null OR j IS NOT NULL group by i, j, d_year;
select i,j from tconst where i is not null OR j IS NOT NULL group by i, j, d_year;
-- should not have implicit filter on join keys
explain select sum(t1.i) from tconst t1 join tconst t2 on t1.i=t2.j group by t1.i, t1.d_year;
select sum(t1.i) from tconst t1 join tconst t2 on t1.i=t2.j group by t1.i, t1.d_year;
-- both join keys have NOT NULL
explain select sum(t1.i) from tconst t1 join tconst t2 on t1.i=t2.i group by t1.i, t1.d_year;
select sum(t1.i) from tconst t1 join tconst t2 on t1.i=t2.i group by t1.i, t1.d_year;
DROP TABLE tconst;
-- UNIQUE + NOT NULL (same as primary key)
create table dest_g21 (key1 int NOT NULL disable rely, value1 double, UNIQUE(key1) disable rely);
explain select key1 from dest_g21 group by key1, value1;
-- UNIQUE with nullabiity
create table dest_g24 (key1 int , value1 double, UNIQUE(key1) disable rely);
explain select key1 from dest_g24 group by key1, value1;
DROP TABLE dest_g21;
DROP TABLE dest_g24;
CREATE TABLE `customer`(
`c_customer_sk` int,
`c_customer_id` string,
`c_current_cdemo_sk` int,
`c_current_hdemo_sk` int,
`c_current_addr_sk` int,
`c_first_shipto_date_sk` int,
`c_first_sales_date_sk` int,
`c_salutation` string,
`c_first_name` string,
`c_last_name` string,
`c_preferred_cust_flag` string,
`c_birth_day` int,
`c_birth_month` int,
`c_birth_year` int,
`c_birth_country` string,
`c_login` string,
`c_email_address` string,
`c_last_review_date` string);
CREATE TABLE `store_sales`(
`ss_sold_date_sk` int,
`ss_sold_time_sk` int,
`ss_item_sk` int,
`ss_customer_sk` int,
`ss_cdemo_sk` int,
`ss_hdemo_sk` int,
`ss_addr_sk` int,
`ss_store_sk` int,
`ss_promo_sk` int,
`ss_ticket_number` int,
`ss_quantity` int,
`ss_wholesale_cost` decimal(7,2),
`ss_list_price` decimal(7,2),
`ss_sales_price` decimal(7,2),
`ss_ext_discount_amt` decimal(7,2),
`ss_ext_sales_price` decimal(7,2),
`ss_ext_wholesale_cost` decimal(7,2),
`ss_ext_list_price` decimal(7,2),
`ss_ext_tax` decimal(7,2),
`ss_coupon_amt` decimal(7,2),
`ss_net_paid` decimal(7,2),
`ss_net_paid_inc_tax` decimal(7,2),
`ss_net_profit` decimal(7,2));
alter table customer add constraint pk_c primary key (c_customer_sk) disable novalidate rely;
alter table customer change column c_customer_id c_customer_id string constraint cid_nn not null disable novalidate rely;
alter table customer add constraint uk1 UNIQUE(c_customer_id) disable novalidate rely;
alter table store_sales add constraint pk_ss primary key (ss_item_sk, ss_ticket_number) disable novalidate rely;
alter table store_sales add constraint ss_c foreign key (ss_customer_sk) references customer (c_customer_sk) disable novalidate rely;
explain cbo
select c_customer_id
from customer
,store_sales
where c_customer_sk = ss_customer_sk
group by c_customer_id
,c_first_name
,c_last_name
,c_preferred_cust_flag
,c_birth_country
,c_login
,c_email_address;
explain cbo
select c_customer_id
from store_sales
,customer
where c_customer_sk = ss_customer_sk
group by c_customer_id
,c_first_name
,c_last_name
,c_preferred_cust_flag
,c_birth_country
,c_login
,c_email_address;
explain cbo
with year_total as (
select c_customer_id customer_id
,c_first_name customer_first_name
,c_last_name customer_last_name
,c_preferred_cust_flag customer_preferred_cust_flag
,c_birth_country customer_birth_country
,c_login customer_login
,c_email_address customer_email_address
,sum(((ss_ext_list_price-ss_ext_wholesale_cost-ss_ext_discount_amt)+ss_ext_sales_price)/2) year_total
,'s' sale_type
from customer
,store_sales
where c_customer_sk = ss_customer_sk
group by c_customer_id
,c_first_name
,c_last_name
,c_preferred_cust_flag
,c_birth_country
,c_login
,c_email_address
)
select t_s_secyear.customer_preferred_cust_flag
from
year_total t_s_secyear
where t_s_secyear.sale_type = 's'
order by t_s_secyear.customer_preferred_cust_flag
limit 100;
explain cbo
with year_total as (
select c_customer_id customer_id
,c_first_name customer_first_name
,c_last_name customer_last_name
,c_preferred_cust_flag customer_preferred_cust_flag
,c_birth_country customer_birth_country
,c_login customer_login
,c_email_address customer_email_address
,sum(((ss_ext_list_price-ss_ext_wholesale_cost-ss_ext_discount_amt)+ss_ext_sales_price)/2) year_total
,'s' sale_type
from store_sales
,customer
where c_customer_sk = ss_customer_sk
group by c_customer_id
,c_first_name
,c_last_name
,c_preferred_cust_flag
,c_birth_country
,c_login
,c_email_address
)
select t_s_secyear.customer_preferred_cust_flag
from
year_total t_s_secyear
where t_s_secyear.sale_type = 's'
order by t_s_secyear.customer_preferred_cust_flag
limit 100;
CREATE TABLE `date_dim`(
`d_date_sk` int,
`d_date_id` string,
`d_date` string,
`d_month_seq` int,
`d_week_seq` int,
`d_quarter_seq` int,
`d_year` int,
`d_dow` int,
`d_moy` int,
`d_dom` int,
`d_qoy` int,
`d_fy_year` int,
`d_fy_quarter_seq` int,
`d_fy_week_seq` int,
`d_day_name` string,
`d_quarter_name` string,
`d_holiday` string,
`d_weekend` string,
`d_following_holiday` string,
`d_first_dom` int,
`d_last_dom` int,
`d_same_day_ly` int,
`d_same_day_lq` int,
`d_current_day` string,
`d_current_week` string,
`d_current_month` string,
`d_current_quarter` string,
`d_current_year` string);
explain cbo
with year_total as (
select c_customer_id customer_id
,c_first_name customer_first_name
,c_last_name customer_last_name
,c_preferred_cust_flag customer_preferred_cust_flag
,c_birth_country customer_birth_country
,c_login customer_login
,c_email_address customer_email_address
,d_year dyear
,sum(((ss_ext_list_price-ss_ext_wholesale_cost-ss_ext_discount_amt)+ss_ext_sales_price)/2) year_total
,'s' sale_type
from customer
,store_sales
,date_dim
where c_customer_sk = ss_customer_sk
and ss_sold_date_sk = d_date_sk
group by c_customer_id
,c_first_name
,c_last_name
,c_preferred_cust_flag
,c_birth_country
,c_login
,c_email_address
,d_year
)
select t_s_secyear.customer_preferred_cust_flag
from year_total t_s_firstyear
,year_total t_s_secyear
where t_s_secyear.customer_id = t_s_firstyear.customer_id
and t_s_firstyear.sale_type = 's'
and t_s_secyear.sale_type = 's'
and t_s_firstyear.dyear = 2001
and t_s_secyear.dyear = 2001+1
and t_s_firstyear.year_total > 0
order by t_s_secyear.customer_preferred_cust_flag
limit 100;
-- group by is both on unique as well pk but pk is used
EXPLAIN CBO
SELECT
C_CUSTOMER_SK
FROM
CUSTOMER
, STORE_SALES
WHERE
C_CUSTOMER_SK = SS_CUSTOMER_SK
GROUP BY
C_CUSTOMER_SK
, C_CUSTOMER_ID
, C_FIRST_NAME
, C_LAST_NAME
, C_PREFERRED_CUST_FLAG
, C_BIRTH_COUNTRY
, C_LOGIN
, C_EMAIL_ADDRESS
;
-- group by is both on unique as well pk but unique is used
EXPLAIN CBO
SELECT
C_CUSTOMER_ID
FROM
CUSTOMER
, STORE_SALES
WHERE
C_CUSTOMER_SK = SS_CUSTOMER_SK
GROUP BY
C_CUSTOMER_SK
, C_CUSTOMER_ID
, C_FIRST_NAME
, C_LAST_NAME
, C_PREFERRED_CUST_FLAG
, C_BIRTH_COUNTRY
, C_LOGIN
, C_EMAIL_ADDRESS
;
-- should keep the unique + one column c_first_name in gby
EXPLAIN CBO
SELECT
C_FIRST_NAME
FROM
CUSTOMER
, STORE_SALES
WHERE
C_CUSTOMER_SK = SS_CUSTOMER_SK
GROUP BY
C_CUSTOMER_SK
, C_FIRST_NAME
, C_LAST_NAME
, C_PREFERRED_CUST_FLAG
, C_BIRTH_COUNTRY
, C_LOGIN
, C_EMAIL_ADDRESS
;
-- group by keys order is different than than the source
EXPLAIN CBO
SELECT
C_CUSTOMER_ID
FROM
CUSTOMER
, STORE_SALES
WHERE
C_CUSTOMER_SK = SS_CUSTOMER_SK
GROUP BY
C_EMAIL_ADDRESS
, C_LAST_NAME
, C_FIRST_NAME
, C_CUSTOMER_ID
, C_PREFERRED_CUST_FLAG
, C_BIRTH_COUNTRY
, C_LOGIN
;
-- group by keys with columns from multiple table
explain cbo select c_customer_sk from
(select c_first_name, c_customer_sk ,d_date solddate,count(*) cnt
from store_sales
,date_dim
,customer
where ss_sold_date_sk = d_date_sk
and ss_item_sk = c_customer_sk
group by c_first_name,c_customer_sk,d_date
having count(*) >4) subq;
-- group by keys from multiple table with expression
explain cbo select c_customer_sk from
(select substr(c_first_name, 1,30), c_customer_sk ,d_date solddate,count(*) cnt
from store_sales
,date_dim
,customer
where ss_sold_date_sk = d_date_sk
and ss_item_sk = c_customer_sk
group by substr(c_first_name, 1, 30),c_customer_sk,d_date
having count(*) >4) subq;
-- group by keys from same table with expression
explain cbo select c_customer_sk from
(select substr(c_first_name, 1,30), c_customer_sk ,count(*) cnt
from store_sales
,date_dim
,customer
where ss_sold_date_sk = d_date_sk
and ss_item_sk = c_customer_sk
group by substr(c_first_name, 1, 30),c_customer_sk
having count(*) >4) subq;
-- group by keys from multiple table with non-deterministic expression
explain cbo select c_customer_sk from
(select rand(), c_customer_sk ,d_date solddate,count(*) cnt
from store_sales
,date_dim
,customer
where ss_sold_date_sk = d_date_sk
and ss_item_sk = c_customer_sk
group by rand(),c_customer_sk,d_date
having count(*) >4) subq;
-- group by keys from multiple table with expression on pk itself, group by shouldn't be reduced
explain cbo select * from
(select substr(c_first_name, 1,30), log2(c_customer_sk),d_date solddate,count(*) cnt
from store_sales
,date_dim
,customer
where ss_sold_date_sk = d_date_sk
and ss_item_sk = c_customer_sk
group by substr(c_first_name, 1, 30),log2(c_customer_sk),d_date
having count(*) >4) subq;
-- group by with keys consisting of pk from multiple tables with extra columns from both side
explain cbo select c_customer_sk from
(select substr(c_first_name, 1,30), c_customer_sk ,d_date solddate, d_date_sk, count(*) cnt
from store_sales
,date_dim
,customer
where ss_sold_date_sk = d_date_sk
and ss_item_sk = c_customer_sk
group by substr(c_first_name, 1, 30),c_customer_sk,d_date, d_date_sk
having count(*) >4) subq;
-- group by with keys consisting of pk from multiple tables with extra expressions from both side
explain cbo select c_customer_sk from
(select substr(c_first_name, 1,30), c_customer_sk ,log2(d_date) solddate, d_date_sk, count(*) cnt
from store_sales
,date_dim
,customer
where ss_sold_date_sk = d_date_sk
and ss_item_sk = c_customer_sk
group by substr(c_first_name, 1, 30),c_customer_sk, log2(d_date), d_date_sk
having count(*) >4) subq;
create table web_sales(ws_order_number int, ws_item_sk int, ws_price float,
constraint pk1 primary key(ws_order_number, ws_item_sk) disable rely);
insert into web_sales values(1, 1, 1.2);
insert into web_sales values(1, 1, 1.2);
explain cbo select count(distinct ws_order_number) from web_sales;
select count(distinct ws_order_number) from web_sales;
drop table web_sales;
create table t1(i int primary key disable rely, j int);
insert into t1 values(1,100),(2,200);
create table t2(i int primary key disable rely, j int);
insert into t2 values(2,1000),(4,500);
-- UNION
explain cbo select i from (select i, j from t1 union all select i,j from t2) subq group by i,j;
select i from (select i, j from t1 union all select i,j from t2) subq group by i,j;
-- INTERSECT
explain cbo select i from (select i, j from t1 intersect select i,j from t2) subq group by i,j;
select i from (select i, j from t1 intersect select i,j from t2) subq group by i,j;
drop table t1;
drop table t2;