blob: f08f40eff9df7fb43a5c9d2f1cc4dec99f49cead [file] [log] [blame]
# Verify that that the parquet count(*) optimization is applied in all count(*) or
# count(<literal>) cases when scanning a Parquet table. In the last case, we are scanning
# a text table, so the optimization is not applied.
select count(*) from functional_parquet.alltypes
union all
select count(1) from functional_parquet.alltypes
union all
select count(123) from functional_parquet.alltypes
union all
select count(*) from functional.alltypes
---- PLAN
PLAN-ROOT SINK
|
00:UNION
| pass-through-operands: all
| row-size=8B cardinality=4
|
|--08:AGGREGATE [FINALIZE]
| | output: count(*)
| | row-size=8B cardinality=1
| |
| 07:SCAN HDFS [functional.alltypes]
| partitions=24/24 files=24 size=478.45KB
| row-size=0B cardinality=7.30K
|
|--06:AGGREGATE [FINALIZE]
| | output: sum_init_zero(functional_parquet.alltypes.stats: num_rows)
| | row-size=8B cardinality=1
| |
| 05:SCAN HDFS [functional_parquet.alltypes]
| partitions=24/24 files=24 size=189.28KB
| row-size=8B cardinality=unavailable
|
|--04:AGGREGATE [FINALIZE]
| | output: sum_init_zero(functional_parquet.alltypes.stats: num_rows)
| | row-size=8B cardinality=1
| |
| 03:SCAN HDFS [functional_parquet.alltypes]
| partitions=24/24 files=24 size=189.28KB
| row-size=8B cardinality=unavailable
|
02:AGGREGATE [FINALIZE]
| output: sum_init_zero(functional_parquet.alltypes.stats: num_rows)
| row-size=8B cardinality=1
|
01:SCAN HDFS [functional_parquet.alltypes]
partitions=24/24 files=24 size=189.28KB
row-size=8B cardinality=unavailable
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
00:UNION
| pass-through-operands: all
| row-size=8B cardinality=4
|
|--16:AGGREGATE [FINALIZE]
| | output: count:merge(*)
| | row-size=8B cardinality=1
| |
| 15:EXCHANGE [UNPARTITIONED]
| |
| 08:AGGREGATE
| | output: count(*)
| | row-size=8B cardinality=1
| |
| 07:SCAN HDFS [functional.alltypes]
| partitions=24/24 files=24 size=478.45KB
| row-size=0B cardinality=7.30K
|
|--14:AGGREGATE [FINALIZE]
| | output: count:merge(*)
| | row-size=8B cardinality=1
| |
| 13:EXCHANGE [UNPARTITIONED]
| |
| 06:AGGREGATE
| | output: sum_init_zero(functional_parquet.alltypes.stats: num_rows)
| | row-size=8B cardinality=1
| |
| 05:SCAN HDFS [functional_parquet.alltypes]
| partitions=24/24 files=24 size=189.28KB
| row-size=8B cardinality=unavailable
|
|--12:AGGREGATE [FINALIZE]
| | output: count:merge(*)
| | row-size=8B cardinality=1
| |
| 11:EXCHANGE [UNPARTITIONED]
| |
| 04:AGGREGATE
| | output: sum_init_zero(functional_parquet.alltypes.stats: num_rows)
| | row-size=8B cardinality=1
| |
| 03:SCAN HDFS [functional_parquet.alltypes]
| partitions=24/24 files=24 size=189.28KB
| row-size=8B cardinality=unavailable
|
10:AGGREGATE [FINALIZE]
| output: count:merge(*)
| row-size=8B cardinality=1
|
09:EXCHANGE [UNPARTITIONED]
|
02:AGGREGATE
| output: sum_init_zero(functional_parquet.alltypes.stats: num_rows)
| row-size=8B cardinality=1
|
01:SCAN HDFS [functional_parquet.alltypes]
partitions=24/24 files=24 size=189.28KB
row-size=8B cardinality=unavailable
====
# Verify that the parquet count(*) optimization is applied even if there is more than
# one item in the select list.
select count(*), count(1), count(123) from functional_parquet.alltypes
---- PLAN
PLAN-ROOT SINK
|
01:AGGREGATE [FINALIZE]
| output: sum_init_zero(functional_parquet.alltypes.stats: num_rows)
| row-size=8B cardinality=1
|
00:SCAN HDFS [functional_parquet.alltypes]
partitions=24/24 files=24 size=189.28KB
row-size=8B cardinality=unavailable
====
# Select count(<partition col>) - the optimization should be disabled because it's not a
# count(<literal>) or count(*) aggregate function.
select count(year) from functional_parquet.alltypes
---- PLAN
PLAN-ROOT SINK
|
01:AGGREGATE [FINALIZE]
| output: count(`year`)
| row-size=8B cardinality=1
|
00:SCAN HDFS [functional_parquet.alltypes]
partitions=24/24 files=24 size=189.28KB
row-size=4B cardinality=unavailable
====
# Group by partition columns.
select month, count(*) from functional_parquet.alltypes group by month, year
---- PLAN
PLAN-ROOT SINK
|
01:AGGREGATE [FINALIZE]
| output: sum_init_zero(functional_parquet.alltypes.stats: num_rows)
| group by: `month`, `year`
| row-size=16B cardinality=24
|
00:SCAN HDFS [functional_parquet.alltypes]
partitions=24/24 files=24 size=189.28KB
row-size=16B cardinality=unavailable
====
# The optimization is disabled because tinyint_col is not a partition col.
select tinyint_col, count(*) from functional_parquet.alltypes group by tinyint_col, year
---- PLAN
PLAN-ROOT SINK
|
01:AGGREGATE [FINALIZE]
| output: count(*)
| group by: tinyint_col, `year`
| row-size=13B cardinality=unavailable
|
00:SCAN HDFS [functional_parquet.alltypes]
partitions=24/24 files=24 size=189.28KB
row-size=5B cardinality=unavailable
====
# The optimization is disabled because there are two aggregate functions.
select avg(year), count(*) from functional_parquet.alltypes
---- PLAN
PLAN-ROOT SINK
|
01:AGGREGATE [FINALIZE]
| output: avg(`year`), count(*)
| row-size=16B cardinality=1
|
00:SCAN HDFS [functional_parquet.alltypes]
partitions=24/24 files=24 size=189.28KB
row-size=4B cardinality=unavailable
====
# Optimization is not applied because the inner count(*) is not materialized. The outer
# count(*) does not reference a base table.
select count(*) from (select count(*) from functional_parquet.alltypes) t
---- PLAN
PLAN-ROOT SINK
|
02:AGGREGATE [FINALIZE]
| output: count(*)
| row-size=8B cardinality=1
|
01:AGGREGATE [FINALIZE]
| row-size=0B cardinality=1
|
00:SCAN HDFS [functional_parquet.alltypes]
partitions=24/24 files=24 size=189.28KB
row-size=0B cardinality=unavailable
====
# The optimization is applied if count(*) is in the having clause.
select 1 from functional_parquet.alltypes having count(*) > 1
---- PLAN
PLAN-ROOT SINK
|
01:AGGREGATE [FINALIZE]
| output: sum_init_zero(functional_parquet.alltypes.stats: num_rows)
| having: count(*) > 1
| row-size=8B cardinality=0
|
00:SCAN HDFS [functional_parquet.alltypes]
partitions=24/24 files=24 size=189.28KB
row-size=8B cardinality=unavailable
====
# The count(*) optimization is applied in the inline view.
select count(*), count(a) from (select count(1) as a from functional_parquet.alltypes) t
---- PLAN
PLAN-ROOT SINK
|
02:AGGREGATE [FINALIZE]
| output: count(*), count(count(*))
| row-size=16B cardinality=1
|
01:AGGREGATE [FINALIZE]
| output: sum_init_zero(functional_parquet.alltypes.stats: num_rows)
| row-size=8B cardinality=1
|
00:SCAN HDFS [functional_parquet.alltypes]
partitions=24/24 files=24 size=189.28KB
row-size=8B cardinality=unavailable
====
# The count(*) optimization is applied to the inline view even if there is a join.
select *
from functional.alltypes x inner join (
select count(1) as a from functional_parquet.alltypes group by year
) t on x.id = t.a;
---- PLAN
PLAN-ROOT SINK
|
03:HASH JOIN [INNER JOIN]
| hash predicates: x.id = count(*)
| runtime filters: RF000 <- count(*)
| row-size=101B cardinality=7.30K
|
|--02:AGGREGATE [FINALIZE]
| | output: sum_init_zero(functional_parquet.alltypes.stats: num_rows)
| | group by: `year`
| | row-size=12B cardinality=2
| |
| 01:SCAN HDFS [functional_parquet.alltypes]
| partitions=24/24 files=24 size=189.28KB
| row-size=12B cardinality=unavailable
|
00:SCAN HDFS [functional.alltypes x]
partitions=24/24 files=24 size=478.45KB
runtime filters: RF000 -> x.id
row-size=89B cardinality=7.30K
====
# The count(*) optimization is not applied if there is more than 1 table ref.
select count(*) from functional_parquet.alltypes a, functional_parquet.alltypes b
---- PLAN
PLAN-ROOT SINK
|
03:AGGREGATE [FINALIZE]
| output: count(*)
| row-size=8B cardinality=1
|
02:NESTED LOOP JOIN [CROSS JOIN]
| row-size=0B cardinality=unavailable
|
|--01:SCAN HDFS [functional_parquet.alltypes b]
| partitions=24/24 files=24 size=189.28KB
| row-size=0B cardinality=unavailable
|
00:SCAN HDFS [functional_parquet.alltypes a]
partitions=24/24 files=24 size=189.28KB
row-size=0B cardinality=unavailable
====
# The count(*) optimization is applied if there are predicates on partition columns.
select count(1) from functional_parquet.alltypes where year < 2010 and month > 8;
---- PLAN
PLAN-ROOT SINK
|
01:AGGREGATE [FINALIZE]
| output: sum_init_zero(functional_parquet.alltypes.stats: num_rows)
| row-size=8B cardinality=1
|
00:SCAN HDFS [functional_parquet.alltypes]
partition predicates: `year` < 2010, `month` > 8
partitions=4/24 files=4 size=31.40KB
row-size=8B cardinality=unavailable
====
# tinyint_col is not a partition column so the optimization is disabled.
select count(1) from functional_parquet.alltypes where year < 2010 and tinyint_col > 8;
---- PLAN
PLAN-ROOT SINK
|
01:AGGREGATE [FINALIZE]
| output: count(*)
| row-size=8B cardinality=1
|
00:SCAN HDFS [functional_parquet.alltypes]
partition predicates: `year` < 2010
partitions=12/24 files=12 size=94.74KB
predicates: tinyint_col > 8
row-size=1B cardinality=unavailable
====
# Optimization is applied after constant folding.
select count(1 + 2 + 3) from functional_parquet.alltypes
---- PLAN
PLAN-ROOT SINK
|
01:AGGREGATE [FINALIZE]
| output: sum_init_zero(functional_parquet.alltypes.stats: num_rows)
| row-size=8B cardinality=1
|
00:SCAN HDFS [functional_parquet.alltypes]
partitions=24/24 files=24 size=189.28KB
row-size=8B cardinality=unavailable
====
# Optimization is not applied to count(null).
select count(1 + null + 3) from functional_parquet.alltypes
union all
select count(null) from functional_parquet.alltypes
---- PLAN
PLAN-ROOT SINK
|
00:UNION
| pass-through-operands: all
| row-size=8B cardinality=2
|
|--04:AGGREGATE [FINALIZE]
| | output: count(NULL)
| | row-size=8B cardinality=1
| |
| 03:SCAN HDFS [functional_parquet.alltypes]
| partitions=24/24 files=24 size=189.28KB
| row-size=0B cardinality=unavailable
|
02:AGGREGATE [FINALIZE]
| output: count(NULL + 3)
| row-size=8B cardinality=1
|
01:SCAN HDFS [functional_parquet.alltypes]
partitions=24/24 files=24 size=189.28KB
row-size=0B cardinality=unavailable
====
# Optimization is not applied when selecting from an empty table.
select count(*) from functional_parquet.emptytable
---- PLAN
PLAN-ROOT SINK
|
01:AGGREGATE [FINALIZE]
| output: count(*)
| row-size=8B cardinality=0
|
00:SCAN HDFS [functional_parquet.emptytable]
partitions=0/0 files=0 size=0B
row-size=0B cardinality=0
====
# Optimization is not applied when all partitions are pruned.
select count(1) from functional_parquet.alltypes where year = -1
---- PLAN
PLAN-ROOT SINK
|
01:AGGREGATE [FINALIZE]
| output: count(*)
| row-size=8B cardinality=0
|
00:SCAN HDFS [functional_parquet.alltypes]
partition predicates: `year` = -1
partitions=0/24 files=0 size=0B
row-size=0B cardinality=0
====
# Optimization is not applied across query blocks, even though it would be correct here.
select count(*) from (select int_col from functional_parquet.alltypes) t
---- PLAN
PLAN-ROOT SINK
|
01:AGGREGATE [FINALIZE]
| output: count(*)
| row-size=8B cardinality=1
|
00:SCAN HDFS [functional_parquet.alltypes]
partitions=24/24 files=24 size=189.28KB
row-size=0B cardinality=unavailable
====
# Optimization is not applied when there is a distinct agg.
select count(*), count(distinct 1) from functional_parquet.alltypes
---- PLAN
PLAN-ROOT SINK
|
02:AGGREGATE [FINALIZE]
| output: count(1), count:merge(*)
| row-size=16B cardinality=1
|
01:AGGREGATE
| output: count(*)
| group by: 1
| row-size=9B cardinality=1
|
00:SCAN HDFS [functional_parquet.alltypes]
partitions=24/24 files=24 size=189.28KB
row-size=0B cardinality=unavailable
====
# The optimization is applied here because only the count(*) and a partition column are
# materialized. Non-materialized agg exprs are ignored.
select year, cnt from (
select year, count(bigint_col), count(*) cnt, avg(int_col)
from functional_parquet.alltypes
where month=1
group by year
) t
---- PLAN
PLAN-ROOT SINK
|
01:AGGREGATE [FINALIZE]
| output: sum_init_zero(functional_parquet.alltypes.stats: num_rows)
| group by: `year`
| row-size=12B cardinality=2
|
00:SCAN HDFS [functional_parquet.alltypes]
partition predicates: `month` = 1
partitions=2/24 files=2 size=16.06KB
row-size=12B cardinality=unavailable
====