| drop table location; |
| |
| -- There are different cases for Group By depending on map/reduce side, hash aggregation, |
| -- grouping sets and column stats. If we don't have column stats, we just assume hash |
| -- aggregation is disabled. Following are the possible cases and rule for cardinality |
| -- estimation |
| |
| -- MAP SIDE: |
| -- Case 1: NO column stats, NO hash aggregation, NO grouping sets — numRows |
| -- Case 2: NO column stats, NO hash aggregation, grouping sets — numRows * sizeOfGroupingSet |
| -- Case 3: column stats, hash aggregation, NO grouping sets — Min(numRows / 2, ndvProduct * parallelism) |
| -- Case 4: column stats, hash aggregation, grouping sets — Min((numRows * sizeOfGroupingSet) / 2, ndvProduct * parallelism * sizeOfGroupingSet) |
| -- Case 5: column stats, NO hash aggregation, NO grouping sets — numRows |
| -- Case 6: column stats, NO hash aggregation, grouping sets — numRows * sizeOfGroupingSet |
| |
| -- REDUCE SIDE: |
| -- Case 7: NO column stats — numRows / 2 |
| -- Case 8: column stats, grouping sets — Min(numRows, ndvProduct * sizeOfGroupingSet) |
| -- Case 9: column stats, NO grouping sets - Min(numRows, ndvProduct) |
| |
| create table location (state string, country string, votes bigint); |
| load data local inpath "../../data/files/location.txt" overwrite into table location; |
| |
| analyze table location compute statistics; |
| analyze table location compute statistics for columns state, country; |
| |
| set mapred.max.split.size=50; |
| set hive.map.aggr.hash.percentmemory=0.5f; |
| set hive.stats.fetch.column.stats=false; |
| |
| -- Case 1: NO column stats, NO hash aggregation, NO grouping sets - cardinality = 20 |
| -- Case 7: NO column stats - cardinality = 10 |
| explain select state, country from location group by state, country; |
| |
| -- Case 2: NO column stats, NO hash aggregation, NO grouping sets - cardinality = 80 |
| -- Case 7: NO column stats - cardinality = 40 |
| explain select state, country from location group by state, country with cube; |
| |
| set hive.stats.fetch.column.stats=true; |
| -- parallelism = 4 |
| |
| -- Case 3: column stats, hash aggregation, NO grouping sets - cardinality = 8 |
| -- Case 9: column stats, NO grouping sets - caridnality = 2 |
| explain select state, country from location group by state, country; |
| |
| -- column stats for votes is missing, so ndvProduct becomes 0 and will be set to numRows / 2 |
| -- Case 3: column stats, hash aggregation, NO grouping sets - cardinality = 10 |
| -- Case 9: column stats, NO grouping sets - caridnality = 5 |
| explain select state, votes from location group by state, votes; |
| |
| -- Case 4: column stats, hash aggregation, grouping sets - cardinality = 32 |
| -- Case 8: column stats, grouping sets - cardinality = 8 |
| explain select state, country from location group by state, country with cube; |
| |
| set hive.map.aggr.hash.percentmemory=0.0f; |
| -- Case 5: column stats, NO hash aggregation, NO grouping sets - cardinality = 20 |
| -- Case 9: column stats, NO grouping sets - caridnality = 2 |
| explain select state, country from location group by state, country; |
| |
| -- Case 6: column stats, NO hash aggregation, grouping sets - cardinality = 80 |
| -- Case 8: column stats, grouping sets - cardinality = 8 |
| explain select state, country from location group by state, country with cube; |
| |
| drop table location; |