ql/src/test/queries/clientpositive/subquery_in_having.q - hive - Git at Google

 --! qt:dataset:src
 set hive.mapred.mode=nonstrict;
 set hive.auto.convert.anti.join=true;
 -- SORT_QUERY_RESULTS

 -- data setup
 DROP TABLE IF EXISTS part_subq;

 CREATE TABLE part_subq(
     p_partkey INT,
     p_name STRING,
     p_mfgr STRING,
     p_brand STRING,
     p_type STRING,
     p_size INT,
     p_container STRING,
     p_retailprice DOUBLE,
     p_comment STRING
 );

 LOAD DATA LOCAL INPATH '../../data/files/tpch/tiny/part.tbl.bz2' overwrite into table part_subq;

 -- non agg, non corr
 explain
  select key, count(*)
 from src
 group by key
 having count(*) in (select count(*) from src s1 where s1.key > '9' group by s1.key )
 ;


 select s1.key, count(*) from src s1 where s1.key > '9' group by s1.key;

 select key, count(*)
 from src
 group by key
 having count(*) in (select count(*) from src s1 where s1.key = '90' group by s1.key )
 ;

 set hive.optimize.correlation=false;

 -- agg, non corr
 explain
 select p_mfgr, avg(p_size)
 from part_subq b
 group by b.p_mfgr
 having b.p_mfgr in
    (select p_mfgr
     from part_subq
     group by p_mfgr
     having max(p_size) - min(p_size) < 20
    )
 ;

 set hive.optimize.correlation=true;

 -- agg, non corr
 explain
 select p_mfgr, avg(p_size)
 from part_subq b
 group by b.p_mfgr
 having b.p_mfgr in
    (select p_mfgr
     from part_subq
     group by p_mfgr
     having max(p_size) - min(p_size) < 20
    )
 ;

 -- join on agg
 select b.key, min(b.value)
 from src b
 group by b.key
 having b.key in ( select a.key
                 from src a
                 where a.value > 'val_9' and a.value = min(b.value)
                 )
 ;

 -- where and having
 -- Plan is:
 -- Stage 1: b semijoin sq1:src (subquery in where)
 -- Stage 2: group by Stage 1 o/p
 -- Stage 5: group by on sq2:src (subquery in having)
 -- Stage 6: Stage 2 o/p semijoin Stage 5
 explain
 select key, value, count(*)
 from src b
 where b.key in (select key from src where src.key > '8')
 group by key, value
 having count(*) in (select count(*) from src s1 where s1.key > '9' group by s1.key )
 ;
 select key, value, count(*)
 from src b
 where b.key in (select key from src where src.key > '8')
 group by key, value
 having count(*) in (select count(*) from src s1 where s1.key > '9' group by s1.key )
 ;

 set hive.auto.convert.join=true;
 -- Plan is:
 -- Stage  5: group by on sq2:src (subquery in having)
 -- Stage 10: hashtable for sq1:src (subquery in where)
 -- Stage  2: b map-side semijoin Stage 10 o/p
 -- Stage  3: Stage 2 semijoin Stage 5
 -- Stage  9: construct hastable for Stage 5 o/p
 -- Stage  6: Stage 2 map-side semijoin Stage 9
 explain
 select key, value, count(*)
 from src b
 where b.key in (select key from src where src.key > '8')
 group by key, value
 having count(*) in (select count(*) from src s1 where s1.key > '9' group by s1.key )
 ;

 -- both having and where corr
 explain
 select key, value, count(*)
 from src b
 where b.key in (select key from src where src.value = b.value)
 group by key, value
 having count(*) in (select count(*) from src s1 where s1.key > '9' group by s1.key )
 ;
 select key, value, count(*)
 from src b
 where b.key in (select key from src where src.value = b.value)
 group by key, value
 having count(*) in (select count(*) from src s1 where s1.key > '9' group by s1.key )
 ;

 -- non agg, non corr, windowing
 explain
 select p_mfgr, p_name, avg(p_size)
 from part_subq
 group by p_mfgr, p_name
 having p_name in
   (select first_value(p_name) over(partition by p_mfgr order by p_size) from part_subq)
 ;

 CREATE TABLE src_null_n4 (key STRING COMMENT 'default', value STRING COMMENT 'default') STORED AS TEXTFILE;
 LOAD DATA LOCAL INPATH "../../data/files/kv1.txt" INTO TABLE src_null_n4;
 INSERT INTO src_null_n4 values('5444', null);

 explain
 select key, value, count(*)
 from src_null_n4 b
 where NOT EXISTS (select key from src_null_n4 where src_null_n4.value <> b.value)
 group by key, value
 having count(*) not in (select count(*) from src_null_n4 s1 where s1.key > '9' and s1.value <> b.value group by s1.key );

 set hive.auto.convert.anti.join=false;

 select key, value, count(*)
 from src_null_n4 b
 where NOT EXISTS (select key from src_null_n4 where src_null_n4.value <> b.value)
 group by key, value
 having count(*) not in (select count(*) from src_null_n4 s1 where s1.key > '9' and s1.value <> b.value group by s1.key );


 explain
 select key, value, count(*)
 from src_null_n4 b
 where NOT EXISTS (select key from src_null_n4 where src_null_n4.value <> b.value)
 group by key, value
 having count(*) not in (select count(*) from src_null_n4 s1 where s1.key > '9' and s1.value <> b.value group by s1.key );

 select key, value, count(*)
 from src_null_n4 b
 where NOT EXISTS (select key from src_null_n4 where src_null_n4.value <> b.value)
 group by key, value
 having count(*) not in (select count(*) from src_null_n4 s1 where s1.key > '9' and s1.value <> b.value group by s1.key );

 DROP TABLE src_null_n4;
 DROP TABLE part_subq;
	--! qt:dataset:src
	set hive.mapred.mode=nonstrict;
	set hive.auto.convert.anti.join=true;
	-- SORT_QUERY_RESULTS

	-- data setup
	DROP TABLE IF EXISTS part_subq;

	CREATE TABLE part_subq(
	p_partkey INT,
	p_name STRING,
	p_mfgr STRING,
	p_brand STRING,
	p_type STRING,
	p_size INT,
	p_container STRING,
	p_retailprice DOUBLE,
	p_comment STRING
	);

	LOAD DATA LOCAL INPATH '../../data/files/tpch/tiny/part.tbl.bz2' overwrite into table part_subq;

	-- non agg, non corr
	explain
	select key, count(*)
	from src
	group by key
	having count() in (select count() from src s1 where s1.key > '9' group by s1.key )
	;


	select s1.key, count(*) from src s1 where s1.key > '9' group by s1.key;

	select key, count(*)
	from src
	group by key
	having count() in (select count() from src s1 where s1.key = '90' group by s1.key )
	;

	set hive.optimize.correlation=false;

	-- agg, non corr
	explain
	select p_mfgr, avg(p_size)
	from part_subq b
	group by b.p_mfgr
	having b.p_mfgr in
	(select p_mfgr
	from part_subq
	group by p_mfgr
	having max(p_size) - min(p_size) < 20
	)
	;

	set hive.optimize.correlation=true;

	-- agg, non corr
	explain
	select p_mfgr, avg(p_size)
	from part_subq b
	group by b.p_mfgr
	having b.p_mfgr in
	(select p_mfgr
	from part_subq
	group by p_mfgr
	having max(p_size) - min(p_size) < 20
	)
	;

	-- join on agg
	select b.key, min(b.value)
	from src b
	group by b.key
	having b.key in ( select a.key
	from src a
	where a.value > 'val_9' and a.value = min(b.value)
	)
	;

	-- where and having
	-- Plan is:
	-- Stage 1: b semijoin sq1:src (subquery in where)
	-- Stage 2: group by Stage 1 o/p
	-- Stage 5: group by on sq2:src (subquery in having)
	-- Stage 6: Stage 2 o/p semijoin Stage 5
	explain
	select key, value, count(*)
	from src b
	where b.key in (select key from src where src.key > '8')
	group by key, value
	having count() in (select count() from src s1 where s1.key > '9' group by s1.key )
	;
	select key, value, count(*)
	from src b
	where b.key in (select key from src where src.key > '8')
	group by key, value
	having count() in (select count() from src s1 where s1.key > '9' group by s1.key )
	;

	set hive.auto.convert.join=true;
	-- Plan is:
	-- Stage 5: group by on sq2:src (subquery in having)
	-- Stage 10: hashtable for sq1:src (subquery in where)
	-- Stage 2: b map-side semijoin Stage 10 o/p
	-- Stage 3: Stage 2 semijoin Stage 5
	-- Stage 9: construct hastable for Stage 5 o/p
	-- Stage 6: Stage 2 map-side semijoin Stage 9
	explain
	select key, value, count(*)
	from src b
	where b.key in (select key from src where src.key > '8')
	group by key, value
	having count() in (select count() from src s1 where s1.key > '9' group by s1.key )
	;

	-- both having and where corr
	explain
	select key, value, count(*)
	from src b
	where b.key in (select key from src where src.value = b.value)
	group by key, value
	having count() in (select count() from src s1 where s1.key > '9' group by s1.key )
	;
	select key, value, count(*)
	from src b
	where b.key in (select key from src where src.value = b.value)
	group by key, value
	having count() in (select count() from src s1 where s1.key > '9' group by s1.key )
	;

	-- non agg, non corr, windowing
	explain
	select p_mfgr, p_name, avg(p_size)
	from part_subq
	group by p_mfgr, p_name
	having p_name in
	(select first_value(p_name) over(partition by p_mfgr order by p_size) from part_subq)
	;

	CREATE TABLE src_null_n4 (key STRING COMMENT 'default', value STRING COMMENT 'default') STORED AS TEXTFILE;
	LOAD DATA LOCAL INPATH "../../data/files/kv1.txt" INTO TABLE src_null_n4;
	INSERT INTO src_null_n4 values('5444', null);

	explain
	select key, value, count(*)
	from src_null_n4 b
	where NOT EXISTS (select key from src_null_n4 where src_null_n4.value <> b.value)
	group by key, value
	having count() not in (select count() from src_null_n4 s1 where s1.key > '9' and s1.value <> b.value group by s1.key );

	set hive.auto.convert.anti.join=false;

	select key, value, count(*)
	from src_null_n4 b
	where NOT EXISTS (select key from src_null_n4 where src_null_n4.value <> b.value)
	group by key, value
	having count() not in (select count() from src_null_n4 s1 where s1.key > '9' and s1.value <> b.value group by s1.key );


	explain
	select key, value, count(*)
	from src_null_n4 b
	where NOT EXISTS (select key from src_null_n4 where src_null_n4.value <> b.value)
	group by key, value
	having count() not in (select count() from src_null_n4 s1 where s1.key > '9' and s1.value <> b.value group by s1.key );

	select key, value, count(*)
	from src_null_n4 b
	where NOT EXISTS (select key from src_null_n4 where src_null_n4.value <> b.value)
	group by key, value
	having count() not in (select count() from src_null_n4 s1 where s1.key > '9' and s1.value <> b.value group by s1.key );

	DROP TABLE src_null_n4;
	DROP TABLE part_subq;