# All queries in this test are run with PARQUET_DICTIONARY_FILTERING and
# PARQUET_READ_STATISTICS disabled. The expected behavior is for the planner to skip
# assigning statistics and dictionary conjuncts while querying Parquet files.
# Several conjuncts on int_col of a flat Parquet table.
# Parquet predicates to be skipped:
#   parquet statistics predicates on int_col
#   parquet dictionary predicates on int_col
# The expected scan node lists all four conjuncts under the plain "predicates:" line only.
select count(*) from functional_parquet.alltypes
where int_col > 1 and int_col * rand() > 50 and int_col is null
and int_col > tinyint_col;
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
| Per-Host Resources: mem-estimate=42.00MB mem-reservation=16.00KB thread-reservation=2
PLAN-ROOT SINK
| output exprs: count(*)
| mem-estimate=0B mem-reservation=0B thread-reservation=0
|
01:AGGREGATE [FINALIZE]
| output: count(*)
| mem-estimate=10.00MB mem-reservation=0B spill-buffer=2.00MB thread-reservation=0
| tuple-ids=1 row-size=8B cardinality=1
| in pipelines: 01(GETNEXT), 00(OPEN)
|
00:SCAN HDFS [functional_parquet.alltypes]
HDFS partitions=24/24 files=24 size=200.33KB
predicates: int_col IS NULL, int_col > CAST(1 AS INT), int_col > CAST(tinyint_col AS INT), CAST(int_col AS DOUBLE) * rand() > CAST(50 AS DOUBLE)
stored statistics:
table: rows=unavailable size=unavailable
partitions: 0/24 rows=12.84K
columns: unavailable
extrapolated-rows=disabled max-scan-range-rows=unavailable
mem-estimate=32.00MB mem-reservation=16.00KB thread-reservation=1
tuple-ids=0 row-size=5B cardinality=1.27K
in pipelines: 00(GETNEXT)
====
# Conjuncts over a variety of column types, plus partition predicates on year and month.
# Parquet predicates to be skipped:
#   parquet statistics predicates on bigint_col, double_col, float_col, id, tinyint_col,
#     string_col, smallint_col & date_string_col
#   parquet dictionary predicates on bool_col, bigint_col, double_col, float_col, id,
#     tinyint_col, string_col, smallint_col, int_col,
#     timestamp_col & date_string_col
select count(*) from functional_parquet.alltypes
where id = 1 and bool_col and tinyint_col < 50 and smallint_col in (1,2,3,4,5)
and mod(int_col,2) = 1 and bigint_col < 5000 and float_col > 50.00
and double_col > 100.00 and date_string_col > '1993-10-01'
and string_col in ('aaaa', 'bbbb', 'cccc')
and timestamp_cmp(timestamp_col, '2016-11-20 00:00:00') = 1
and year > 2000 and month < 12;
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
| Per-Host Resources: mem-estimate=138.00MB mem-reservation=88.00KB thread-reservation=2
PLAN-ROOT SINK
| output exprs: count(*)
| mem-estimate=0B mem-reservation=0B thread-reservation=0
|
01:AGGREGATE [FINALIZE]
| output: count(*)
| mem-estimate=10.00MB mem-reservation=0B spill-buffer=2.00MB thread-reservation=0
| tuple-ids=1 row-size=8B cardinality=1
| in pipelines: 01(GETNEXT), 00(OPEN)
|
00:SCAN HDFS [functional_parquet.alltypes]
partition predicates: `year` > CAST(2000 AS INT), `month` < CAST(12 AS INT)
HDFS partitions=22/24 files=22 size=183.48KB
predicates: bool_col, bigint_col < CAST(5000 AS BIGINT), double_col > CAST(100.00 AS DOUBLE), float_col > CAST(50.00 AS FLOAT), id = CAST(1 AS INT), tinyint_col < CAST(50 AS TINYINT), int_col % CAST(2 AS INT) = CAST(1 AS INT), string_col IN ('aaaa', 'bbbb', 'cccc'), smallint_col IN (CAST(1 AS SMALLINT), CAST(2 AS SMALLINT), CAST(3 AS SMALLINT), CAST(4 AS SMALLINT), CAST(5 AS SMALLINT)), timestamp_cmp(timestamp_col, TIMESTAMP '2016-11-20 00:00:00') = CAST(1 AS INT), date_string_col > '1993-10-01'
stored statistics:
table: rows=unavailable size=unavailable
partitions: 0/22 rows=11.74K
columns missing stats: id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col
extrapolated-rows=disabled max-scan-range-rows=unavailable
mem-estimate=128.00MB mem-reservation=88.00KB thread-reservation=1
tuple-ids=0 row-size=72B cardinality=1.17K
in pipelines: 00(GETNEXT)
====
# IN / NOT IN conjuncts, including an IN list with a NULL element, an IN over an
# expression (mod(int_col,50)) and an IN whose only list element is another column.
# Parquet predicates to be skipped:
#   parquet dictionary predicates on id, string_col & int_col
select count(*) from functional_parquet.alltypes
where id NOT IN (0,1,2) and string_col IN ('aaaa', 'bbbb', 'cccc', NULL)
and mod(int_col,50) IN (0,1)
and id IN (int_col);
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
| Per-Host Resources: mem-estimate=58.00MB mem-reservation=24.00KB thread-reservation=2
PLAN-ROOT SINK
| output exprs: count(*)
| mem-estimate=0B mem-reservation=0B thread-reservation=0
|
01:AGGREGATE [FINALIZE]
| output: count(*)
| mem-estimate=10.00MB mem-reservation=0B spill-buffer=2.00MB thread-reservation=0
| tuple-ids=1 row-size=8B cardinality=1
| in pipelines: 01(GETNEXT), 00(OPEN)
|
00:SCAN HDFS [functional_parquet.alltypes]
HDFS partitions=24/24 files=24 size=200.33KB
predicates: id IN (int_col), id NOT IN (CAST(0 AS INT), CAST(1 AS INT), CAST(2 AS INT)), int_col % CAST(50 AS INT) IN (CAST(0 AS INT), CAST(1 AS INT)), string_col IN ('aaaa', 'bbbb', 'cccc', NULL)
stored statistics:
table: rows=unavailable size=unavailable
partitions: 0/24 rows=12.84K
columns: unavailable
extrapolated-rows=disabled max-scan-range-rows=unavailable
mem-estimate=48.00MB mem-reservation=24.00KB thread-reservation=1
tuple-ids=0 row-size=20B cardinality=1.27K
in pipelines: 00(GETNEXT)
====
# Conjunct on a field of a doubly nested collection (a.item.e), reached through the
# relative table references c.nested_struct.c.d cn and cn.item a.
# Nested parquet predicates to be skipped:
#   parquet statistics predicates on a.item.e
#   parquet dictionary predicates on a.item.e
select id from functional_parquet.complextypestbl c, c.nested_struct.c.d cn, cn.item a
where a.item.e < -10;
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
| Per-Host Resources: mem-estimate=48.00MB mem-reservation=24.00KB thread-reservation=2
PLAN-ROOT SINK
| output exprs: id
| mem-estimate=0B mem-reservation=0B thread-reservation=0
|
01:SUBPLAN
| mem-estimate=0B mem-reservation=0B thread-reservation=0
| tuple-ids=2,1,0 row-size=36B cardinality=440.00K
| in pipelines: 00(GETNEXT)
|
|--08:NESTED LOOP JOIN [CROSS JOIN]
| | mem-estimate=20B mem-reservation=0B thread-reservation=0
| | tuple-ids=2,1,0 row-size=36B cardinality=100
| | in pipelines: 00(GETNEXT)
| |
| |--02:SINGULAR ROW SRC
| | parent-subplan=01
| | mem-estimate=0B mem-reservation=0B thread-reservation=0
| | tuple-ids=0 row-size=20B cardinality=1
| | in pipelines: 00(GETNEXT)
| |
| 04:SUBPLAN
| | mem-estimate=0B mem-reservation=0B thread-reservation=0
| | tuple-ids=2,1 row-size=16B cardinality=100
| | in pipelines: 00(GETNEXT)
| |
| |--07:NESTED LOOP JOIN [CROSS JOIN]
| | | mem-estimate=12B mem-reservation=0B thread-reservation=0
| | | tuple-ids=2,1 row-size=16B cardinality=10
| | | in pipelines: 00(GETNEXT)
| | |
| | |--05:SINGULAR ROW SRC
| | | parent-subplan=04
| | | mem-estimate=0B mem-reservation=0B thread-reservation=0
| | | tuple-ids=1 row-size=12B cardinality=1
| | | in pipelines: 00(GETNEXT)
| | |
| | 06:UNNEST [cn.item a]
| | parent-subplan=04
| | mem-estimate=0B mem-reservation=0B thread-reservation=0
| | tuple-ids=2 row-size=0B cardinality=10
| | in pipelines: 00(GETNEXT)
| |
| 03:UNNEST [c.nested_struct.c.d cn]
| parent-subplan=01
| mem-estimate=0B mem-reservation=0B thread-reservation=0
| tuple-ids=1 row-size=0B cardinality=10
| in pipelines: 00(GETNEXT)
|
00:SCAN HDFS [functional_parquet.complextypestbl c]
HDFS partitions=1/1 files=2 size=6.92KB
predicates: !empty(c.nested_struct.c.d)
predicates on cn: !empty(cn.item)
predicates on a: a.item.e < CAST(-10 AS INT)
stored statistics:
table: rows=unavailable size=unavailable
columns missing stats: id
extrapolated-rows=disabled max-scan-range-rows=unavailable
mem-estimate=48.00MB mem-reservation=24.00KB thread-reservation=1
tuple-ids=0 row-size=20B cardinality=4.40K
in pipelines: 00(GETNEXT)
====
# One conjunct at each nesting level: the top-level customer row (c_custkey), its
# c_orders collection (o.o_orderkey) and the nested o_lineitems collection (l.l_partkey).
# Parquet predicates to be skipped at each level:
#   parquet statistics predicates on c_custkey, o.o_orderkey & l.l_partkey
#   parquet dictionary predicates on c_custkey, o.o_orderkey & l.l_partkey
select c_custkey from tpch_nested_parquet.customer c, c.c_orders o,
o.o_lineitems l where c_custkey > 0 and o.o_orderkey > 0 and l.l_partkey > 0;
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
| Per-Host Resources: mem-estimate=264.00MB mem-reservation=16.00MB thread-reservation=2
PLAN-ROOT SINK
| output exprs: c_custkey
| mem-estimate=0B mem-reservation=0B thread-reservation=0
|
01:SUBPLAN
| mem-estimate=0B mem-reservation=0B thread-reservation=0
| tuple-ids=2,1,0 row-size=48B cardinality=1.50M
| in pipelines: 00(GETNEXT)
|
|--08:NESTED LOOP JOIN [CROSS JOIN]
| | mem-estimate=20B mem-reservation=0B thread-reservation=0
| | tuple-ids=2,1,0 row-size=48B cardinality=100
| | in pipelines: 00(GETNEXT)
| |
| |--02:SINGULAR ROW SRC
| | parent-subplan=01
| | mem-estimate=0B mem-reservation=0B thread-reservation=0
| | tuple-ids=0 row-size=20B cardinality=1
| | in pipelines: 00(GETNEXT)
| |
| 04:SUBPLAN
| | mem-estimate=0B mem-reservation=0B thread-reservation=0
| | tuple-ids=2,1 row-size=28B cardinality=100
| | in pipelines: 00(GETNEXT)
| |
| |--07:NESTED LOOP JOIN [CROSS JOIN]
| | | mem-estimate=20B mem-reservation=0B thread-reservation=0
| | | tuple-ids=2,1 row-size=28B cardinality=10
| | | in pipelines: 00(GETNEXT)
| | |
| | |--05:SINGULAR ROW SRC
| | | parent-subplan=04
| | | mem-estimate=0B mem-reservation=0B thread-reservation=0
| | | tuple-ids=1 row-size=20B cardinality=1
| | | in pipelines: 00(GETNEXT)
| | |
| | 06:UNNEST [o.o_lineitems l]
| | parent-subplan=04
| | mem-estimate=0B mem-reservation=0B thread-reservation=0
| | tuple-ids=2 row-size=0B cardinality=10
| | in pipelines: 00(GETNEXT)
| |
| 03:UNNEST [c.c_orders o]
| parent-subplan=01
| mem-estimate=0B mem-reservation=0B thread-reservation=0
| tuple-ids=1 row-size=0B cardinality=10
| in pipelines: 00(GETNEXT)
|
00:SCAN HDFS [tpch_nested_parquet.customer c]
HDFS partitions=1/1 files=4 size=288.99MB
predicates: c_custkey > CAST(0 AS BIGINT), !empty(c.c_orders)
predicates on o: !empty(o.o_lineitems), o.o_orderkey > CAST(0 AS BIGINT)
predicates on l: l.l_partkey > CAST(0 AS BIGINT)
stored statistics:
table: rows=150.00K size=288.99MB
columns missing stats: c_orders
extrapolated-rows=disabled max-scan-range-rows=50.12K
mem-estimate=264.00MB mem-reservation=16.00MB thread-reservation=1
tuple-ids=0 row-size=20B cardinality=15.00K
in pipelines: 00(GETNEXT)
====
# Parquet filtering to be skipped on multiple collections at the same nested level:
#   parquet statistics filtering on l.l_shipdate, l.l_receiptdate, l.l_shipmode
#     & l.l_returnflag
#   parquet dictionary filtering on l.l_shipdate, l.l_receiptdate, l.l_shipmode
#     & l.l_returnflag
# All five conjuncts in the query (the four above plus l.l_comment is null) are on
# fields of the o_lineitems collection l.
select c_name, o.o_clerk from tpch_nested_parquet.customer c,
c.c_orders o, o.o_lineitems l
where l.l_shipdate = '1994-08-19' and
l.l_receiptdate = '1994-08-24' and l.l_shipmode = 'RAIL' and l.l_returnflag = 'R' and
l.l_comment is null;
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
| Per-Host Resources: mem-estimate=616.00MB mem-reservation=32.00MB thread-reservation=2
PLAN-ROOT SINK
| output exprs: c_name, o.o_clerk
| mem-estimate=0B mem-reservation=0B thread-reservation=0
|
01:SUBPLAN
| mem-estimate=0B mem-reservation=0B thread-reservation=0
| tuple-ids=2,1,0 row-size=126B cardinality=15.00M
| in pipelines: 00(GETNEXT)
|
|--08:NESTED LOOP JOIN [CROSS JOIN]
| | mem-estimate=42B mem-reservation=0B thread-reservation=0
| | tuple-ids=2,1,0 row-size=126B cardinality=100
| | in pipelines: 00(GETNEXT)
| |
| |--02:SINGULAR ROW SRC
| | parent-subplan=01
| | mem-estimate=0B mem-reservation=0B thread-reservation=0
| | tuple-ids=0 row-size=42B cardinality=1
| | in pipelines: 00(GETNEXT)
| |
| 04:SUBPLAN
| | mem-estimate=0B mem-reservation=0B thread-reservation=0
| | tuple-ids=2,1 row-size=84B cardinality=100
| | in pipelines: 00(GETNEXT)
| |
| |--07:NESTED LOOP JOIN [CROSS JOIN]
| | | mem-estimate=24B mem-reservation=0B thread-reservation=0
| | | tuple-ids=2,1 row-size=84B cardinality=10
| | | in pipelines: 00(GETNEXT)
| | |
| | |--05:SINGULAR ROW SRC
| | | parent-subplan=04
| | | mem-estimate=0B mem-reservation=0B thread-reservation=0
| | | tuple-ids=1 row-size=24B cardinality=1
| | | in pipelines: 00(GETNEXT)
| | |
| | 06:UNNEST [o.o_lineitems l]
| | parent-subplan=04
| | mem-estimate=0B mem-reservation=0B thread-reservation=0
| | tuple-ids=2 row-size=0B cardinality=10
| | in pipelines: 00(GETNEXT)
| |
| 03:UNNEST [c.c_orders o]
| parent-subplan=01
| mem-estimate=0B mem-reservation=0B thread-reservation=0
| tuple-ids=1 row-size=0B cardinality=10
| in pipelines: 00(GETNEXT)
|
00:SCAN HDFS [tpch_nested_parquet.customer c]
HDFS partitions=1/1 files=4 size=288.99MB
predicates: !empty(c.c_orders)
predicates on o: !empty(o.o_lineitems)
predicates on l: l.l_shipdate = '1994-08-19', l.l_receiptdate = '1994-08-24', l.l_shipmode = 'RAIL', l.l_returnflag = 'R', l.l_comment IS NULL
stored statistics:
table: rows=150.00K size=288.99MB
columns missing stats: c_orders
extrapolated-rows=disabled max-scan-range-rows=50.12K
mem-estimate=616.00MB mem-reservation=32.00MB thread-reservation=1
tuple-ids=0 row-size=42B cardinality=150.00K
in pipelines: 00(GETNEXT)
====
# Parquet filtering to be skipped on a mixed file format table. The WHERE clause is the
# same as in the functional_parquet.alltypes multi-type test in this file.
#   parquet statistics predicates on bigint_col, double_col, float_col, id, tinyint_col,
#     string_col, smallint_col & date_string_col
#   parquet dictionary predicates on bool_col, bigint_col, double_col, float_col, id,
#     tinyint_col, string_col, smallint_col, int_col,
#     timestamp_col & date_string_col
select count(*) from functional.alltypesmixedformat
where id = 1 and bool_col and tinyint_col < 50 and smallint_col in (1,2,3,4,5)
and mod(int_col,2) = 1 and bigint_col < 5000 and float_col > 50.00
and double_col > 100.00 and date_string_col > '1993-10-01'
and string_col in ('aaaa', 'bbbb', 'cccc')
and timestamp_cmp(timestamp_col, '2016-11-20 00:00:00') = 1
and year > 2000 and month < 12;
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
| Per-Host Resources: mem-estimate=138.00MB mem-reservation=88.00KB thread-reservation=2
PLAN-ROOT SINK
| output exprs: count(*)
| mem-estimate=0B mem-reservation=0B thread-reservation=0
|
01:AGGREGATE [FINALIZE]
| output: count(*)
| mem-estimate=10.00MB mem-reservation=0B spill-buffer=2.00MB thread-reservation=0
| tuple-ids=1 row-size=8B cardinality=1
| in pipelines: 01(GETNEXT), 00(OPEN)
|
00:SCAN HDFS [functional.alltypesmixedformat]
partition predicates: `year` > CAST(2000 AS INT), `month` < CAST(12 AS INT)
HDFS partitions=4/4 files=4 size=66.33KB
predicates: bool_col, bigint_col < CAST(5000 AS BIGINT), double_col > CAST(100.00 AS DOUBLE), float_col > CAST(50.00 AS FLOAT), id = CAST(1 AS INT), tinyint_col < CAST(50 AS TINYINT), int_col % CAST(2 AS INT) = CAST(1 AS INT), string_col IN ('aaaa', 'bbbb', 'cccc'), smallint_col IN (CAST(1 AS SMALLINT), CAST(2 AS SMALLINT), CAST(3 AS SMALLINT), CAST(4 AS SMALLINT), CAST(5 AS SMALLINT)), timestamp_cmp(timestamp_col, TIMESTAMP '2016-11-20 00:00:00') = CAST(1 AS INT), date_string_col > '1993-10-01'
stored statistics:
table: rows=unavailable size=unavailable
partitions: 0/4 rows=2.55K
columns missing stats: id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col
extrapolated-rows=disabled max-scan-range-rows=unavailable
mem-estimate=128.00MB mem-reservation=88.00KB thread-reservation=1
tuple-ids=0 row-size=72B cardinality=254
in pipelines: 00(GETNEXT)
====