ql/src/test/queries/clientpositive/reduce_deduplicate_stat_based.q - hive - Git at Google

 -- HIVE-23365: Put RS deduplication optimization under cost based decision
 -- In every query in this file reduce deduplication is applicable. Nevertheless, depending on the statistics the
 -- optimizer may decide to apply it or not.
 set hive.optimize.reducededuplication=true;

 create table person (id integer, name string, sex string, country int); -- id: 4B, name:20B, sex:1B, country:4B
 -- Based on the stats below the deduplication should be applied when the remaining partition columns contain id or name,
 -- Respectively when the remaining partition columns are sex and country the deduplication shouldn't apply.
 alter table person update statistics set('numRows'='741000000', 'rawDataSize'='21489000000');
 alter table person update statistics for column id set('numDVs'='741000000','numNulls'='0', 'highValue'='100000000','lowValue'='0');
 alter table person update statistics for column name set('numDVs'='7410000','numNulls'='10000', 'maxColLen'='20', 'avgColLen'='10');
 alter table person update statistics for column sex set('numDVs'='3','numNulls'='1000000', 'maxColLen'='1', 'avgColLen'='1');
 alter table person update statistics for column country set('numDVs'='44','numNulls'='1000', 'highValue'='43', 'lowValue'='0');

 -- The deduplication changes the parallelism when the partition columns of the two ReduceSinkOperators since the usual
 -- merge replaces the partitioning columns of the child RS with the columns of the parent.
 -- Currently it takes place in two cases:
 -- Case I: Parent RS columns are more specific than those of the child RS, and child columns are assigned;

 explain select name, count(id) from (select * from person distribute by name, id) Q0 group by name;
 explain select country, count(id) from (select * from person distribute by country, id) Q1 group by country;
 explain select sex, count(id) from (select * from person distribute by sex, id) Q2 group by sex;
 explain select name, count(id) from (select id, name from person group by name, id) Q3 group by name;
 explain select country, count(id) from (select id, country from person group by id, country) Q4 group by country;
 explain select sex, count(id) from (select id, sex from person group by id, sex) Q5 group by sex;

 -- Case II: Child RS columns are more specific than those of the parent RS, and parent columns are not assigned.

 set hive.remove.orderby.in.subquery=false;
 explain select name, count(id) from (select * from person sort by name, id) Q7 group by name;
 explain select country, count(id) from (select * from person sort by country, id) Q8 group by country;
 explain select sex, count(id) from (select * from person sort by sex, id) Q9 group by sex;
	-- HIVE-23365: Put RS deduplication optimization under cost based decision
	-- In every query in this file reduce deduplication is applicable. Nevertheless, depending on the statistics the
	-- optimizer may decide to apply it or not.
	set hive.optimize.reducededuplication=true;

	create table person (id integer, name string, sex string, country int); -- id: 4B, name:20B, sex:1B, country:4B
	-- Based on the stats below the deduplication should be applied when the remaining partition columns contain id or name,
	-- Respectively when the remaining partition columns are sex and country the deduplication shouldn't apply.
	alter table person update statistics set('numRows'='741000000', 'rawDataSize'='21489000000');
	alter table person update statistics for column id set('numDVs'='741000000','numNulls'='0', 'highValue'='100000000','lowValue'='0');
	alter table person update statistics for column name set('numDVs'='7410000','numNulls'='10000', 'maxColLen'='20', 'avgColLen'='10');
	alter table person update statistics for column sex set('numDVs'='3','numNulls'='1000000', 'maxColLen'='1', 'avgColLen'='1');
	alter table person update statistics for column country set('numDVs'='44','numNulls'='1000', 'highValue'='43', 'lowValue'='0');

	-- The deduplication changes the parallelism when the partition columns of the two ReduceSinkOperators since the usual
	-- merge replaces the partitioning columns of the child RS with the columns of the parent.
	-- Currently it takes place in two cases:
	-- Case I: Parent RS columns are more specific than those of the child RS, and child columns are assigned;

	explain select name, count(id) from (select * from person distribute by name, id) Q0 group by name;
	explain select country, count(id) from (select * from person distribute by country, id) Q1 group by country;
	explain select sex, count(id) from (select * from person distribute by sex, id) Q2 group by sex;
	explain select name, count(id) from (select id, name from person group by name, id) Q3 group by name;
	explain select country, count(id) from (select id, country from person group by id, country) Q4 group by country;
	explain select sex, count(id) from (select id, sex from person group by id, sex) Q5 group by sex;

	-- Case II: Child RS columns are more specific than those of the parent RS, and parent columns are not assigned.

	set hive.remove.orderby.in.subquery=false;
	explain select name, count(id) from (select * from person sort by name, id) Q7 group by name;
	explain select country, count(id) from (select * from person sort by country, id) Q8 group by country;
	explain select sex, count(id) from (select * from person sort by sex, id) Q9 group by sex;