blob: 78c5a4870783482585279939757f95aaef1eb371 [file] [log] [blame]
# Distinct agg without a grouping expr
select count(distinct int_col) from functional.alltypes;
---- QUERYOPTIONS
SHUFFLE_DISTINCT_EXPRS=false
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
06:AGGREGATE [FINALIZE]
| output: count:merge(int_col)
| row-size=8B cardinality=1
|
05:EXCHANGE [UNPARTITIONED]
|
02:AGGREGATE
| output: count(int_col)
| row-size=8B cardinality=1
|
04:AGGREGATE
| group by: int_col
| row-size=4B cardinality=10
|
03:EXCHANGE [HASH(int_col)]
|
01:AGGREGATE [STREAMING]
| group by: int_col
| row-size=4B cardinality=10
|
00:SCAN HDFS [functional.alltypes]
HDFS partitions=24/24 files=24 size=478.45KB
row-size=4B cardinality=7.30K
====
select count(distinct int_col) from functional.alltypes;
---- QUERYOPTIONS
# Distinct exprs in a aggregation without grouping is always shuffled by. Setting it to
# true doesn't affect the plan.
SHUFFLE_DISTINCT_EXPRS=true
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
06:AGGREGATE [FINALIZE]
| output: count:merge(int_col)
| row-size=8B cardinality=1
|
05:EXCHANGE [UNPARTITIONED]
|
02:AGGREGATE
| output: count(int_col)
| row-size=8B cardinality=1
|
04:AGGREGATE
| group by: int_col
| row-size=4B cardinality=10
|
03:EXCHANGE [HASH(int_col)]
|
01:AGGREGATE [STREAMING]
| group by: int_col
| row-size=4B cardinality=10
|
00:SCAN HDFS [functional.alltypes]
HDFS partitions=24/24 files=24 size=478.45KB
row-size=4B cardinality=7.30K
====
# Distinct agg with a grouping expr
select count(distinct int_col) from functional.alltypes group by year;
---- QUERYOPTIONS
SHUFFLE_DISTINCT_EXPRS=false
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
05:EXCHANGE [UNPARTITIONED]
|
02:AGGREGATE [FINALIZE]
| output: count(int_col)
| group by: `year`
| row-size=12B cardinality=2
|
04:AGGREGATE
| group by: `year`, int_col
| row-size=8B cardinality=20
|
03:EXCHANGE [HASH(`year`)]
|
01:AGGREGATE [STREAMING]
| group by: `year`, int_col
| row-size=8B cardinality=20
|
00:SCAN HDFS [functional.alltypes]
HDFS partitions=24/24 files=24 size=478.45KB
row-size=8B cardinality=7.30K
====
select count(distinct int_col) from functional.alltypes group by year;
---- QUERYOPTIONS
# Shuffling by distinct exprs will create 1 more exchange node and 1 more agg node.
SHUFFLE_DISTINCT_EXPRS=true
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
07:EXCHANGE [UNPARTITIONED]
|
06:AGGREGATE [FINALIZE]
| output: count:merge(int_col)
| group by: `year`
| row-size=12B cardinality=2
|
05:EXCHANGE [HASH(`year`)]
|
02:AGGREGATE [STREAMING]
| output: count(int_col)
| group by: `year`
| row-size=12B cardinality=2
|
04:AGGREGATE
| group by: `year`, int_col
| row-size=8B cardinality=20
|
03:EXCHANGE [HASH(`year`,int_col)]
|
01:AGGREGATE [STREAMING]
| group by: `year`, int_col
| row-size=8B cardinality=20
|
00:SCAN HDFS [functional.alltypes]
HDFS partitions=24/24 files=24 size=478.45KB
row-size=8B cardinality=7.30K
====
# Distinct agg without a grouping expr and with a compatible child partition
select count(distinct a.int_col) from functional.alltypes a inner join [shuffle]
functional.alltypes b on a.int_col = b.int_col;
---- QUERYOPTIONS
SHUFFLE_DISTINCT_EXPRS=false
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
08:AGGREGATE [FINALIZE]
| output: count:merge(a.int_col)
| row-size=8B cardinality=1
|
07:EXCHANGE [UNPARTITIONED]
|
04:AGGREGATE
| output: count(a.int_col)
| row-size=8B cardinality=1
|
03:AGGREGATE
| group by: a.int_col
| row-size=4B cardinality=10
|
02:HASH JOIN [INNER JOIN, PARTITIONED]
| hash predicates: a.int_col = b.int_col
| runtime filters: RF000 <- b.int_col
| row-size=8B cardinality=5.33M
|
|--06:EXCHANGE [HASH(b.int_col)]
| |
| 01:SCAN HDFS [functional.alltypes b]
| HDFS partitions=24/24 files=24 size=478.45KB
| row-size=4B cardinality=7.30K
|
05:EXCHANGE [HASH(a.int_col)]
|
00:SCAN HDFS [functional.alltypes a]
HDFS partitions=24/24 files=24 size=478.45KB
runtime filters: RF000 -> a.int_col
row-size=4B cardinality=7.30K
====
select count(distinct a.int_col) from functional.alltypes a inner join [shuffle]
functional.alltypes b on a.int_col = b.int_col;
---- QUERYOPTIONS
# Distinct exprs in a aggregation without grouping is always shuffled by. Setting it to
# true doesn't affect the plan.
SHUFFLE_DISTINCT_EXPRS=true
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
08:AGGREGATE [FINALIZE]
| output: count:merge(a.int_col)
| row-size=8B cardinality=1
|
07:EXCHANGE [UNPARTITIONED]
|
04:AGGREGATE
| output: count(a.int_col)
| row-size=8B cardinality=1
|
03:AGGREGATE
| group by: a.int_col
| row-size=4B cardinality=10
|
02:HASH JOIN [INNER JOIN, PARTITIONED]
| hash predicates: a.int_col = b.int_col
| runtime filters: RF000 <- b.int_col
| row-size=8B cardinality=5.33M
|
|--06:EXCHANGE [HASH(b.int_col)]
| |
| 01:SCAN HDFS [functional.alltypes b]
| HDFS partitions=24/24 files=24 size=478.45KB
| row-size=4B cardinality=7.30K
|
05:EXCHANGE [HASH(a.int_col)]
|
00:SCAN HDFS [functional.alltypes a]
HDFS partitions=24/24 files=24 size=478.45KB
runtime filters: RF000 -> a.int_col
row-size=4B cardinality=7.30K
====
# Distinct agg with a grouping expr and a compatible child partition
select count(distinct a.int_col) from functional.alltypes a inner join [shuffle]
functional.alltypes b on a.year = b.year group by a.year;
---- QUERYOPTIONS
SHUFFLE_DISTINCT_EXPRS=false
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
07:EXCHANGE [UNPARTITIONED]
|
04:AGGREGATE [FINALIZE]
| output: count(a.int_col)
| group by: a.`year`
| row-size=12B cardinality=2
|
03:AGGREGATE
| group by: a.`year`, a.int_col
| row-size=8B cardinality=20
|
02:HASH JOIN [INNER JOIN, PARTITIONED]
| hash predicates: a.`year` = b.`year`
| runtime filters: RF000 <- b.`year`
| row-size=12B cardinality=87.60K
|
|--06:EXCHANGE [HASH(b.`year`)]
| |
| 01:SCAN HDFS [functional.alltypes b]
| HDFS partitions=24/24 files=24 size=478.45KB
| partition key scan
| row-size=4B cardinality=24
|
05:EXCHANGE [HASH(a.`year`)]
|
00:SCAN HDFS [functional.alltypes a]
HDFS partitions=24/24 files=24 size=478.45KB
runtime filters: RF000 -> a.`year`
row-size=8B cardinality=7.30K
====
select count(distinct a.int_col) from functional.alltypes a inner join [shuffle]
functional.alltypes b on a.year = b.year group by a.year;
---- QUERYOPTIONS
# Shuffling by distinct exprs will create 2 more exchange nodes and 2 more agg nodes.
SHUFFLE_DISTINCT_EXPRS=true
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
11:EXCHANGE [UNPARTITIONED]
|
10:AGGREGATE [FINALIZE]
| output: count:merge(a.int_col)
| group by: a.`year`
| row-size=12B cardinality=2
|
09:EXCHANGE [HASH(a.`year`)]
|
04:AGGREGATE [STREAMING]
| output: count(a.int_col)
| group by: a.`year`
| row-size=12B cardinality=2
|
08:AGGREGATE
| group by: a.`year`, a.int_col
| row-size=8B cardinality=20
|
07:EXCHANGE [HASH(a.`year`,a.int_col)]
|
03:AGGREGATE [STREAMING]
| group by: a.`year`, a.int_col
| row-size=8B cardinality=20
|
02:HASH JOIN [INNER JOIN, PARTITIONED]
| hash predicates: a.`year` = b.`year`
| runtime filters: RF000 <- b.`year`
| row-size=12B cardinality=87.60K
|
|--06:EXCHANGE [HASH(b.`year`)]
| |
| 01:SCAN HDFS [functional.alltypes b]
| HDFS partitions=24/24 files=24 size=478.45KB
| partition key scan
| row-size=4B cardinality=24
|
05:EXCHANGE [HASH(a.`year`)]
|
00:SCAN HDFS [functional.alltypes a]
HDFS partitions=24/24 files=24 size=478.45KB
runtime filters: RF000 -> a.`year`
row-size=8B cardinality=7.30K
====
# The input is partitioned by distinct exprs + grouping exprs
select count(distinct a.int_col) from functional.alltypes a inner join [shuffle]
functional.alltypes b on a.year = b.year and a.int_col = b.int_col group by a.year;
---- QUERYOPTIONS
# The input partition is compatible with grouping exprs + distinct exprs. Phase-1 merge
# aggregation is skipped.
SHUFFLE_DISTINCT_EXPRS=true
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
09:EXCHANGE [UNPARTITIONED]
|
08:AGGREGATE [FINALIZE]
| output: count:merge(a.int_col)
| group by: a.`year`
| row-size=12B cardinality=2
|
07:EXCHANGE [HASH(a.`year`)]
|
04:AGGREGATE [STREAMING]
| output: count(a.int_col)
| group by: a.`year`
| row-size=12B cardinality=2
|
03:AGGREGATE
| group by: a.`year`, a.int_col
| row-size=8B cardinality=20
|
02:HASH JOIN [INNER JOIN, PARTITIONED]
| hash predicates: a.`year` = b.`year`, a.int_col = b.int_col
| runtime filters: RF000 <- b.`year`, RF001 <- b.int_col
| row-size=16B cardinality=5.33M
|
|--06:EXCHANGE [HASH(b.`year`,b.int_col)]
| |
| 01:SCAN HDFS [functional.alltypes b]
| HDFS partitions=24/24 files=24 size=478.45KB
| row-size=8B cardinality=7.30K
|
05:EXCHANGE [HASH(a.`year`,a.int_col)]
|
00:SCAN HDFS [functional.alltypes a]
HDFS partitions=24/24 files=24 size=478.45KB
runtime filters: RF000 -> a.`year`, RF001 -> a.int_col
row-size=8B cardinality=7.30K
====
select count(distinct a.int_col) from functional.alltypes a inner join [shuffle]
functional.alltypes b on a.year = b.year and a.int_col = b.int_col group by a.year;
---- QUERYOPTIONS
# The input partition is not compatible with grouping exprs. Phase-1 merge aggregation is
# executed.
SHUFFLE_DISTINCT_EXPRS=false
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
09:EXCHANGE [UNPARTITIONED]
|
04:AGGREGATE [FINALIZE]
| output: count(a.int_col)
| group by: a.`year`
| row-size=12B cardinality=2
|
08:AGGREGATE
| group by: a.`year`, a.int_col
| row-size=8B cardinality=20
|
07:EXCHANGE [HASH(a.`year`)]
|
03:AGGREGATE [STREAMING]
| group by: a.`year`, a.int_col
| row-size=8B cardinality=20
|
02:HASH JOIN [INNER JOIN, PARTITIONED]
| hash predicates: a.`year` = b.`year`, a.int_col = b.int_col
| runtime filters: RF000 <- b.`year`, RF001 <- b.int_col
| row-size=16B cardinality=5.33M
|
|--06:EXCHANGE [HASH(b.`year`,b.int_col)]
| |
| 01:SCAN HDFS [functional.alltypes b]
| HDFS partitions=24/24 files=24 size=478.45KB
| row-size=8B cardinality=7.30K
|
05:EXCHANGE [HASH(a.`year`,a.int_col)]
|
00:SCAN HDFS [functional.alltypes a]
HDFS partitions=24/24 files=24 size=478.45KB
runtime filters: RF000 -> a.`year`, RF001 -> a.int_col
row-size=8B cardinality=7.30K
====