blob: c727671ee0f29cfd9ac8e4d3ac9bfd15c2ea7914 [file] [log] [blame]
set hive.stats.fetch.column.stats=true;
set hive.map.aggr.hash.percentmemory=0.0f;
-- hash aggregation is disabled
-- There are different cases for Group By depending on map/reduce side, hash aggregation,
-- grouping sets and column stats. If we don't have column stats, we just assume hash
-- aggregation is disabled. Following are the possible cases and rule for cardinality
-- estimation
-- MAP SIDE:
-- Case 1: NO column stats, NO hash aggregation, NO grouping sets — numRows
-- Case 2: NO column stats, NO hash aggregation, grouping sets — numRows * sizeOfGroupingSet
-- Case 3: column stats, hash aggregation, NO grouping sets — Min(numRows / 2, ndvProduct * parallelism)
-- Case 4: column stats, hash aggregation, grouping sets — Min((numRows * sizeOfGroupingSet) / 2, ndvProduct * parallelism * sizeOfGroupingSet)
-- Case 5: column stats, NO hash aggregation, NO grouping sets — numRows
-- Case 6: column stats, NO hash aggregation, grouping sets — numRows * sizeOfGroupingSet
-- REDUCE SIDE:
-- Case 7: NO column stats — numRows / 2
-- Case 8: column stats, grouping sets — Min(numRows, ndvProduct * sizeOfGroupingSet)
-- Case 9: column stats, NO grouping sets - Min(numRows, ndvProduct)
create table if not exists loc_staging_n2 (
state string,
locid int,
zip bigint,
year int
) row format delimited fields terminated by '|' stored as textfile;
create table loc_orc_n2 like loc_staging_n2;
alter table loc_orc_n2 set fileformat orc;
load data local inpath '../../data/files/loc.txt' overwrite into table loc_staging_n2;
insert overwrite table loc_orc_n2 select * from loc_staging_n2;
-- numRows: 8 rawDataSize: 796
explain select * from loc_orc_n2;
-- partial column stats
analyze table loc_orc_n2 compute statistics for columns state;
-- inner group by: map - numRows: 8 reduce - numRows: 4
-- outer group by: map - numRows: 4 reduce numRows: 2
explain select a, c, min(b)
from ( select state as a, locid as b, count(*) as c
from loc_orc_n2
group by state,locid
) sq1
group by a,c;
analyze table loc_orc_n2 compute statistics for columns state,locid,year;
-- Case 5: column stats, NO hash aggregation, NO grouping sets - cardinality = 8
-- Case 9: column stats, NO grouping sets - caridnality = 2
explain select year from loc_orc_n2 group by year;
-- Case 5: column stats, NO hash aggregation, NO grouping sets - cardinality = 8
-- Case 9: column stats, NO grouping sets - caridnality = 8
explain select state,locid from loc_orc_n2 group by state,locid;
-- Case 6: column stats, NO hash aggregation, grouping sets - cardinality = 32
-- Case 8: column stats, grouping sets - cardinality = 32
explain select state,locid from loc_orc_n2 group by state,locid with cube;
-- Case 6: column stats, NO hash aggregation, grouping sets - cardinality = 24
-- Case 8: column stats, grouping sets - cardinality = 24
explain select state,locid from loc_orc_n2 group by state,locid with rollup;
explain select state,locid from loc_orc_n2 group by rollup( state,locid );
-- Case 6: column stats, NO hash aggregation, grouping sets - cardinality = 8
-- Case 8: column stats, grouping sets - cardinality = 8
explain select state,locid from loc_orc_n2 group by state,locid grouping sets((state));
-- Case 6: column stats, NO hash aggregation, grouping sets - cardinality = 16
-- Case 8: column stats, grouping sets - cardinality = 16
explain select state,locid from loc_orc_n2 group by state,locid grouping sets((state),(locid));
-- Case 6: column stats, NO hash aggregation, grouping sets - cardinality = 24
-- Case 8: column stats, grouping sets - cardinality = 24
explain select state,locid from loc_orc_n2 group by state,locid grouping sets((state),(locid),());
-- Case 6: column stats, NO hash aggregation, grouping sets - cardinality = 32
-- Case 8: column stats, grouping sets - cardinality = 32
explain select state,locid from loc_orc_n2 group by state,locid grouping sets((state,locid),(state),(locid),());
set hive.map.aggr.hash.percentmemory=0.5f;
set mapred.max.split.size=80;
-- map-side parallelism will be 10
-- Case 3: column stats, hash aggregation, NO grouping sets - cardinality = 4
-- Case 9: column stats, NO grouping sets - caridnality = 2
explain select year from loc_orc_n2 group by year;
-- Case 4: column stats, hash aggregation, grouping sets - cardinality = 16
-- Case 8: column stats, grouping sets - cardinality = 16
explain select state,locid from loc_orc_n2 group by state,locid with cube;
-- ndvProduct becomes 0 as zip does not have column stats
-- Case 3: column stats, hash aggregation, NO grouping sets - cardinality = 4
-- Case 9: column stats, NO grouping sets - caridnality = 2
explain select state,zip from loc_orc_n2 group by state,zip;
set mapred.max.split.size=1000;
set hive.stats.fetch.column.stats=false;
-- Case 2: NO column stats, NO hash aggregation, NO grouping sets - cardinality = 32
-- Case 7: NO column stats - cardinality = 16
explain select state,locid from loc_orc_n2 group by state,locid with cube;
-- Case 2: NO column stats, NO hash aggregation, NO grouping sets - cardinality = 24
-- Case 7: NO column stats - cardinality = 12
explain select state,locid from loc_orc_n2 group by state,locid with rollup;
explain select state,locid from loc_orc_n2 group by rollup (state,locid);
-- Case 2: NO column stats, NO hash aggregation, NO grouping sets - cardinality = 8
-- Case 7: NO column stats - cardinality = 4
explain select state,locid from loc_orc_n2 group by state,locid grouping sets((state));
-- Case 2: NO column stats, NO hash aggregation, NO grouping sets - cardinality = 16
-- Case 7: NO column stats - cardinality = 8
explain select state,locid from loc_orc_n2 group by state,locid grouping sets((state),(locid));
-- Case 2: NO column stats, NO hash aggregation, NO grouping sets - cardinality = 24
-- Case 7: NO column stats - cardinality = 12
explain select state,locid from loc_orc_n2 group by state,locid grouping sets((state),(locid),());
-- Case 2: NO column stats, NO hash aggregation, NO grouping sets - cardinality = 32
-- Case 7: NO column stats - cardinality = 16
explain select state,locid from loc_orc_n2 group by state,locid grouping sets((state,locid),(state),(locid),());
set mapred.max.split.size=80;
-- Case 1: NO column stats, NO hash aggregation, NO grouping sets - cardinality = 8
-- Case 7: NO column stats - cardinality = 4
explain select year from loc_orc_n2 group by year;
-- Case 2: NO column stats, NO hash aggregation, NO grouping sets - cardinality = 32
-- Case 7: NO column stats - cardinality = 16
explain select state,locid from loc_orc_n2 group by state,locid with cube;
set hive.stats.fetch.column.stats=true;
create table t1_uq12(i int, j int);
alter table t1_uq12 update statistics set('numRows'='10000', 'rawDataSize'='18000');
alter table t1_uq12 update statistics for column i set('numDVs'='2500','numNulls'='50','highValue'='1000','lowValue'='0');
alter table t1_uq12 update statistics for column j set('numDVs'='500','numNulls'='30','highValue'='100','lowValue'='50');
create table t2_uq12(i2 int, j2 int);
alter table t2_uq12 update statistics set('numRows'='100000000', 'rawDataSize'='10000');
alter table t2_uq12 update statistics for column i2 set('numDVs'='10000000','numNulls'='0','highValue'='8000','lowValue'='0');
alter table t2_uq12 update statistics for column j2 set('numDVs'='10','numNulls'='0','highValue'='800','lowValue'='-1');
explain select count (1) from t1_uq12,t2_uq12 where t1_uq12.j=t2_uq12.i2 group by t1_uq12.i, t1_uq12.j;
drop table t1_uq12;
drop table t2_uq12;