blob: 2da3d1a37be4e4512718ae434ca6dec92b2f2977 [file] [log] [blame]
# Join cardinality tests
# Each has a qualitative description of the quantitative math
#
# Simplest join: Cartesian product
# card = |T1| * |T2|
select m.id, d.id
from functional.alltypestiny m,
functional.alltypessmall d
---- PLAN
PLAN-ROOT SINK
|
02:NESTED LOOP JOIN [CROSS JOIN]
| row-size=8B cardinality=800
|
|--00:SCAN HDFS [functional.alltypestiny m]
| partitions=4/4 files=4 size=460B
| row-size=4B cardinality=8
|
01:SCAN HDFS [functional.alltypessmall d]
partitions=4/4 files=4 size=6.32KB
row-size=4B cardinality=100
====
# Cartesian product between table and a no-stats, 0-row table
select a.id, b.id
from functional.alltypestiny a,
functional.alltypesnopart b
---- PLAN
PLAN-ROOT SINK
|
02:NESTED LOOP JOIN [CROSS JOIN]
| row-size=8B cardinality=0
|
|--01:SCAN HDFS [functional.alltypesnopart b]
| partitions=1/1 files=0 size=0B
| row-size=4B cardinality=0
|
00:SCAN HDFS [functional.alltypestiny a]
partitions=4/4 files=4 size=460B
row-size=4B cardinality=8
====
# Cartesian product between two empty tables
select a.id, b.id
from functional.alltypesnopart a,
functional.alltypesnopart b
---- PLAN
PLAN-ROOT SINK
|
02:NESTED LOOP JOIN [CROSS JOIN]
| row-size=8B cardinality=0
|
|--01:SCAN HDFS [functional.alltypesnopart b]
| partitions=1/1 files=0 size=0B
| row-size=4B cardinality=0
|
00:SCAN HDFS [functional.alltypesnopart a]
partitions=1/1 files=0 size=0B
row-size=4B cardinality=0
====
# Simplest M:1 join, join between a master table (M) and a detail table (D)
# (small):(tiny)
# |join| = |D|
select m.id, d.id
from functional.alltypestiny m,
functional.alltypessmall d
where m.id = d.id
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [INNER JOIN]
| hash predicates: d.id = m.id
| runtime filters: RF000 <- m.id
| row-size=8B cardinality=9
|
|--00:SCAN HDFS [functional.alltypestiny m]
| partitions=4/4 files=4 size=460B
| row-size=4B cardinality=8
|
01:SCAN HDFS [functional.alltypessmall d]
partitions=4/4 files=4 size=6.32KB
runtime filters: RF000 -> d.id
row-size=4B cardinality=100
====
# Same, but reverse WHERE order
select m.id, d.id
from functional.alltypestiny m,
functional.alltypessmall d
where d.id = m.id
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [INNER JOIN]
| hash predicates: d.id = m.id
| runtime filters: RF000 <- m.id
| row-size=8B cardinality=9
|
|--00:SCAN HDFS [functional.alltypestiny m]
| partitions=4/4 files=4 size=460B
| row-size=4B cardinality=8
|
01:SCAN HDFS [functional.alltypessmall d]
partitions=4/4 files=4 size=6.32KB
runtime filters: RF000 -> d.id
row-size=4B cardinality=100
====
# Same, but use JOIN ... ON
select m.id, d.id
from functional.alltypestiny m
join functional.alltypessmall d on m.id = d.id
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [INNER JOIN]
| hash predicates: d.id = m.id
| runtime filters: RF000 <- m.id
| row-size=8B cardinality=9
|
|--00:SCAN HDFS [functional.alltypestiny m]
| partitions=4/4 files=4 size=460B
| row-size=4B cardinality=8
|
01:SCAN HDFS [functional.alltypessmall d]
partitions=4/4 files=4 size=6.32KB
runtime filters: RF000 -> d.id
row-size=4B cardinality=100
====
# Same, reverse ON order
select m.id, d.id
from functional.alltypestiny m
join functional.alltypessmall d on d.id = m.id
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [INNER JOIN]
| hash predicates: d.id = m.id
| runtime filters: RF000 <- m.id
| row-size=8B cardinality=9
|
|--00:SCAN HDFS [functional.alltypestiny m]
| partitions=4/4 files=4 size=460B
| row-size=4B cardinality=8
|
01:SCAN HDFS [functional.alltypessmall d]
partitions=4/4 files=4 size=6.32KB
runtime filters: RF000 -> d.id
row-size=4B cardinality=100
====
# Same, but reverse FROM order
select m.id, d.id
from functional.alltypessmall d,
functional.alltypestiny m
where m.id = d.id
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [INNER JOIN]
| hash predicates: d.id = m.id
| runtime filters: RF000 <- m.id
| row-size=8B cardinality=9
|
|--01:SCAN HDFS [functional.alltypestiny m]
| partitions=4/4 files=4 size=460B
| row-size=4B cardinality=8
|
00:SCAN HDFS [functional.alltypessmall d]
partitions=4/4 files=4 size=6.32KB
runtime filters: RF000 -> d.id
row-size=4B cardinality=100
====
# Same, but use JOIN ... ON
select m.id, d.id
from functional.alltypessmall m
join functional.alltypestiny d on m.id = d.id
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [INNER JOIN]
| hash predicates: m.id = d.id
| runtime filters: RF000 <- d.id
| row-size=8B cardinality=9
|
|--01:SCAN HDFS [functional.alltypestiny d]
| partitions=4/4 files=4 size=460B
| row-size=4B cardinality=8
|
00:SCAN HDFS [functional.alltypessmall m]
partitions=4/4 files=4 size=6.32KB
runtime filters: RF000 -> m.id
row-size=4B cardinality=100
====
# Similar, but with big tables
# |M| = |customer| = 150K
# |M.pk| = |customer.c_custkey| = |customer| = 150K
# |D| = |orders| = 1.5M
# |D.fk| = |orders.o_custkey| = 100K
# |join| = |D| = |orders|
select m.c_custkey, d.o_orderkey
from tpch.customer m,
tpch.orders d
where m.c_custkey = d.o_custkey
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [INNER JOIN]
| hash predicates: d.o_custkey = m.c_custkey
| runtime filters: RF000 <- m.c_custkey
| row-size=24B cardinality=1.50M
|
|--00:SCAN HDFS [tpch.customer m]
| partitions=1/1 files=1 size=23.08MB
| row-size=8B cardinality=150.00K
|
01:SCAN HDFS [tpch.orders d]
partitions=1/1 files=1 size=162.56MB
runtime filters: RF000 -> d.o_custkey
row-size=16B cardinality=1.50M
====
# Simple join between table and a no-stats, 0-row table
# Bug: expected cardinality = 0
select a.id, b.id
from functional.alltypestiny a,
functional.alltypesnopart b
WHERE a.id = b.id
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [INNER JOIN]
| hash predicates: a.id = b.id
| runtime filters: RF000 <- b.id
| row-size=8B cardinality=8
|
|--01:SCAN HDFS [functional.alltypesnopart b]
| partitions=1/1 files=0 size=0B
| row-size=4B cardinality=0
|
00:SCAN HDFS [functional.alltypestiny a]
partitions=4/4 files=4 size=460B
runtime filters: RF000 -> a.id
row-size=4B cardinality=8
====
# Cartesian product between two empty tables
select a.id, b.id
from functional.alltypesnopart a,
functional.alltypesnopart b
WHERE a.id = b.id
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [INNER JOIN]
| hash predicates: a.id = b.id
| runtime filters: RF000 <- b.id
| row-size=8B cardinality=0
|
|--01:SCAN HDFS [functional.alltypesnopart b]
| partitions=1/1 files=0 size=0B
| row-size=4B cardinality=0
|
00:SCAN HDFS [functional.alltypesnopart a]
partitions=1/1 files=0 size=0B
runtime filters: RF000 -> a.id
row-size=4B cardinality=0
====
# Selective master filtering
# |join| = |D|/|D.fk| = 15
select m.c_custkey, d.o_orderkey
from tpch.customer m,
tpch.orders d
where m.c_custkey = d.o_custkey
and m.c_name = 'foo'
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [INNER JOIN]
| hash predicates: d.o_custkey = m.c_custkey
| runtime filters: RF000 <- m.c_custkey
| row-size=54B cardinality=16
|
|--00:SCAN HDFS [tpch.customer m]
| partitions=1/1 files=1 size=23.08MB
| predicates: m.c_name = 'foo'
| row-size=38B cardinality=1
|
01:SCAN HDFS [tpch.orders d]
partitions=1/1 files=1 size=162.56MB
runtime filters: RF000 -> d.o_custkey
row-size=16B cardinality=1.50M
====
# Selective master filtering, column common with detail
# |join| = |D|/|D.fk| = 15
# Bug: IMPALA-8014, expected cardinality ~15
select m.c_custkey, d.o_orderkey
from tpch.customer m,
tpch.orders d
where m.c_custkey = d.o_custkey
and m.c_custkey = 10
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [INNER JOIN]
| hash predicates: d.o_custkey = m.c_custkey
| runtime filters: RF000 <- m.c_custkey
| row-size=24B cardinality=1
|
|--00:SCAN HDFS [tpch.customer m]
| partitions=1/1 files=1 size=23.08MB
| predicates: m.c_custkey = 10
| row-size=8B cardinality=1
|
01:SCAN HDFS [tpch.orders d]
partitions=1/1 files=1 size=162.56MB
predicates: d.o_custkey = 10
runtime filters: RF000 -> d.o_custkey
row-size=16B cardinality=15
====
# Selective detail filtering
# Card = 1: one detail record finds its master record
select m.c_custkey, d.o_orderkey
from tpch.customer m,
tpch.orders d
where m.c_custkey = d.o_custkey
and d.o_orderkey = 10
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [INNER JOIN]
| hash predicates: m.c_custkey = d.o_custkey
| runtime filters: RF000 <- d.o_custkey
| row-size=24B cardinality=1
|
|--01:SCAN HDFS [tpch.orders d]
| partitions=1/1 files=1 size=162.56MB
| predicates: d.o_orderkey = 10
| row-size=16B cardinality=1
|
00:SCAN HDFS [tpch.customer m]
partitions=1/1 files=1 size=23.08MB
runtime filters: RF000 -> m.c_custkey
row-size=8B cardinality=150.00K
====
# Broad master filtering
# |M'| = |M| * 0.33 = 50K
# Each master finds all its details
# |join|= |M'| * |D|/|D.fk| = 50K * 15 = 750K
# Though |M.pk| > |D.fk|, we assume that filtering eliminated the unmatched keys
# Bug: Several, expected cardinality ~750K
select m.c_custkey, d.o_orderkey
from tpch.customer m,
tpch.orders d
where m.c_custkey = d.o_custkey
and m.c_name < 'foo'
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [INNER JOIN]
| hash predicates: d.o_custkey = m.c_custkey
| runtime filters: RF000 <- m.c_custkey
| row-size=54B cardinality=228.68K
|
|--00:SCAN HDFS [tpch.customer m]
| partitions=1/1 files=1 size=23.08MB
| predicates: m.c_name < 'foo'
| row-size=38B cardinality=15.00K
|
01:SCAN HDFS [tpch.orders d]
partitions=1/1 files=1 size=162.56MB
runtime filters: RF000 -> d.o_custkey
row-size=16B cardinality=1.50M
====
# Broad master filtering, column common with detail
# |D'| = |D| * 0.33 = 500K
# Each detail finds its master
# |join| = |D'| = 500K
# Bug: Expected cardinality ~500K
select m.c_custkey, d.o_orderkey
from tpch.customer m,
tpch.orders d
where m.c_custkey = d.o_custkey
and m.c_custkey < 1234
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [INNER JOIN]
| hash predicates: d.o_custkey = m.c_custkey
| runtime filters: RF000 <- m.c_custkey
| row-size=24B cardinality=22.87K
|
|--00:SCAN HDFS [tpch.customer m]
| partitions=1/1 files=1 size=23.08MB
| predicates: m.c_custkey < 1234
| row-size=8B cardinality=15.00K
|
01:SCAN HDFS [tpch.orders d]
partitions=1/1 files=1 size=162.56MB
predicates: d.o_custkey < 1234
runtime filters: RF000 -> d.o_custkey
row-size=16B cardinality=150.00K
====
# Broad detail filtering
# |D'| = |D| * 0.33 = 500K
# Every detail finds its master
# Card = 500K
# Bug: Expected cardinlity ~500K
select m.c_custkey, d.o_orderkey
from tpch.customer m,
tpch.orders d
where m.c_custkey = d.o_custkey
and d.o_orderkey < 1234
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [INNER JOIN]
| hash predicates: d.o_custkey = m.c_custkey
| runtime filters: RF000 <- m.c_custkey
| row-size=24B cardinality=150.00K
|
|--00:SCAN HDFS [tpch.customer m]
| partitions=1/1 files=1 size=23.08MB
| row-size=8B cardinality=150.00K
|
01:SCAN HDFS [tpch.orders d]
partitions=1/1 files=1 size=162.56MB
predicates: d.o_orderkey < 1234
runtime filters: RF000 -> d.o_custkey
row-size=16B cardinality=150.00K
====
# Filtering on join between table and a no-stats, 0-row table
# Inner join with a zero-sized table produces 0 rows.
select a.id, b.id
from functional.alltypestiny a,
functional.alltypesnopart b
WHERE a.id = b.id
AND a.id = 10
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [INNER JOIN]
| hash predicates: a.id = b.id
| runtime filters: RF000 <- b.id
| row-size=8B cardinality=1
|
|--01:SCAN HDFS [functional.alltypesnopart b]
| partitions=1/1 files=0 size=0B
| predicates: b.id = 10
| row-size=4B cardinality=0
|
00:SCAN HDFS [functional.alltypestiny a]
partitions=4/4 files=4 size=460B
predicates: a.id = 10
runtime filters: RF000 -> a.id
row-size=4B cardinality=1
====
# Filtering on join between table and a no-stats, 0-row table
# Bug: Expected cardinality ~1
select a.id, b.id
from functional.alltypestiny a,
functional.alltypesnopart b
WHERE a.id = b.id
AND b.id = 10
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [INNER JOIN]
| hash predicates: a.id = b.id
| runtime filters: RF000 <- b.id
| row-size=8B cardinality=1
|
|--01:SCAN HDFS [functional.alltypesnopart b]
| partitions=1/1 files=0 size=0B
| predicates: b.id = 10
| row-size=4B cardinality=0
|
00:SCAN HDFS [functional.alltypestiny a]
partitions=4/4 files=4 size=460B
predicates: a.id = 10
runtime filters: RF000 -> a.id
row-size=4B cardinality=1
====
# Cartesian product between two empty tables
select a.id, b.id
from functional.alltypesnopart a,
functional.alltypesnopart b
WHERE a.id = b.id
AND a.id = 10
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [INNER JOIN]
| hash predicates: a.id = b.id
| runtime filters: RF000 <- b.id
| row-size=8B cardinality=0
|
|--01:SCAN HDFS [functional.alltypesnopart b]
| partitions=1/1 files=0 size=0B
| predicates: b.id = 10
| row-size=4B cardinality=0
|
00:SCAN HDFS [functional.alltypesnopart a]
partitions=1/1 files=0 size=0B
predicates: a.id = 10
runtime filters: RF000 -> a.id
row-size=4B cardinality=0
====
# Selective filtering on both sides
# |D'| = 1
# |C'| = 1
# |D' >< C'| = 0 or 1, depending whether we're lucky
select m.c_custkey, d.o_orderkey
from tpch.customer m,
tpch.orders d
where m.c_custkey = d.o_custkey
and m.c_name = 'foo'
and d.o_orderkey = 10
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [INNER JOIN]
| hash predicates: m.c_custkey = d.o_custkey
| runtime filters: RF000 <- d.o_custkey
| row-size=54B cardinality=1
|
|--01:SCAN HDFS [tpch.orders d]
| partitions=1/1 files=1 size=162.56MB
| predicates: d.o_orderkey = 10
| row-size=16B cardinality=1
|
00:SCAN HDFS [tpch.customer m]
partitions=1/1 files=1 size=23.08MB
predicates: m.c_name = 'foo'
runtime filters: RF000 -> m.c_custkey
row-size=38B cardinality=1
====
# Correlated filtering on master
# |join| = |D| / |D.fk|
# Bug: Expected cardinality 15
select m.c_custkey, d.o_orderkey
from tpch.customer m,
tpch.orders d
where m.c_custkey = d.o_custkey
and m.c_custkey = 10
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [INNER JOIN]
| hash predicates: d.o_custkey = m.c_custkey
| runtime filters: RF000 <- m.c_custkey
| row-size=24B cardinality=1
|
|--00:SCAN HDFS [tpch.customer m]
| partitions=1/1 files=1 size=23.08MB
| predicates: m.c_custkey = 10
| row-size=8B cardinality=1
|
01:SCAN HDFS [tpch.orders d]
partitions=1/1 files=1 size=162.56MB
predicates: d.o_custkey = 10
runtime filters: RF000 -> d.o_custkey
row-size=16B cardinality=15
====
# Correlated filtering on detail
# |join| = |D| / |D.fk|
# Bug: Expected cardinality 15
select m.c_custkey, d.o_orderkey
from tpch.customer m,
tpch.orders d
where m.c_custkey = d.o_custkey
and d.o_custkey = 10
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [INNER JOIN]
| hash predicates: d.o_custkey = m.c_custkey
| runtime filters: RF000 <- m.c_custkey
| row-size=24B cardinality=1
|
|--00:SCAN HDFS [tpch.customer m]
| partitions=1/1 files=1 size=23.08MB
| predicates: m.c_custkey = 10
| row-size=8B cardinality=1
|
01:SCAN HDFS [tpch.orders d]
partitions=1/1 files=1 size=162.56MB
predicates: d.o_custkey = 10
runtime filters: RF000 -> d.o_custkey
row-size=16B cardinality=15
====
# Redundant correlated filtering on both sides
# Same as above case internally
# Bug: Expected cardinality ~1
select m.c_custkey, d.o_orderkey
from tpch.customer m,
tpch.orders d
where m.c_custkey = d.o_custkey
and m.c_name = 'foo'
and m.c_custkey = 10
and d.o_custkey = 10
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [INNER JOIN]
| hash predicates: d.o_custkey = m.c_custkey
| runtime filters: RF000 <- m.c_custkey
| row-size=54B cardinality=1
|
|--00:SCAN HDFS [tpch.customer m]
| partitions=1/1 files=1 size=23.08MB
| predicates: m.c_custkey = 10, m.c_name = 'foo'
| row-size=38B cardinality=1
|
01:SCAN HDFS [tpch.orders d]
partitions=1/1 files=1 size=162.56MB
predicates: d.o_custkey = 10
runtime filters: RF000 -> d.o_custkey
row-size=16B cardinality=15
====
# Selective filtering on master, broad on detail
# |M'| = 1
# |D'| = |D| * 0.33, uncorrelated with above
# |D.fk'| = |D.fk| * 0.33
# Since we assume containment: that the smaller key set
# is a subset of the larger one.
# Card = |D'| / max(1, |D.fk'|) = 5
# Bug: Expected cardinality ~5
select m.c_custkey, d.o_orderkey
from tpch.customer m,
tpch.orders d
where m.c_custkey = d.o_custkey
and m.c_name = 'foo'
and d.o_orderkey < 1234
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [INNER JOIN]
| hash predicates: d.o_custkey = m.c_custkey
| runtime filters: RF000 <- m.c_custkey
| row-size=54B cardinality=2
|
|--00:SCAN HDFS [tpch.customer m]
| partitions=1/1 files=1 size=23.08MB
| predicates: m.c_name = 'foo'
| row-size=38B cardinality=1
|
01:SCAN HDFS [tpch.orders d]
partitions=1/1 files=1 size=162.56MB
predicates: d.o_orderkey < 1234
runtime filters: RF000 -> d.o_custkey
row-size=16B cardinality=150.00K
====
# Selective filtering on detail, broad on master
# |M'| = |M| * 0.33
# |D'| = 1
# 1/3 chance that the detail finds its master, so card = 0 or 1
select m.c_custkey, d.o_orderkey
from tpch.customer m,
tpch.orders d
where m.c_custkey = d.o_custkey
and m.c_name < 'foo'
and d.o_orderkey = 10
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [INNER JOIN]
| hash predicates: m.c_custkey = d.o_custkey
| runtime filters: RF000 <- d.o_custkey
| row-size=54B cardinality=1
|
|--01:SCAN HDFS [tpch.orders d]
| partitions=1/1 files=1 size=162.56MB
| predicates: d.o_orderkey = 10
| row-size=16B cardinality=1
|
00:SCAN HDFS [tpch.customer m]
partitions=1/1 files=1 size=23.08MB
predicates: m.c_name < 'foo'
runtime filters: RF000 -> m.c_custkey
row-size=38B cardinality=15.00K
====
# M:N join
# |partsupp| = 800K
# |lineitem| = 6M
# NDV(ps_suppkey) = NDV(l_suppkey) = |suppkey| = 9.7K
# |lineitem >< partsupp| = (|lineitem| * |partsupp|) / |suppkey|
# = 800K * 6M / 10K = 80 * 600 = 480M
select t1.ps_suppkey, t2.l_suppkey
from tpch.partsupp t1,
tpch.lineitem t2
where t1.ps_suppkey = t2.l_suppkey
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [INNER JOIN]
| hash predicates: t2.l_suppkey = t1.ps_suppkey
| runtime filters: RF000 <- t1.ps_suppkey
| row-size=16B cardinality=494.33M
|
|--00:SCAN HDFS [tpch.partsupp t1]
| partitions=1/1 files=1 size=112.71MB
| row-size=8B cardinality=800.00K
|
01:SCAN HDFS [tpch.lineitem t2]
partitions=1/1 files=1 size=718.94MB
runtime filters: RF000 -> t2.l_suppkey
row-size=8B cardinality=6.00M
====
# M:N join with filtering on smaller table
# NDV(availqty) = = |availqty| = 10K
# |partsupp'| = |partsupp| / |availqty|
# = 800K / 10K = 80
# |lineitem >< partsupp'| = |lineitem| * |partsupp'| / |suppkey|
# = 6M * 80 / 10K = 48K
select t1.ps_suppkey, t2.l_suppkey
from tpch.partsupp t1,
tpch.lineitem t2
where t1.ps_suppkey = t2.l_suppkey
and t1.ps_availqty = 10
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [INNER JOIN]
| hash predicates: t2.l_suppkey = t1.ps_suppkey
| runtime filters: RF000 <- t1.ps_suppkey
| row-size=20B cardinality=48.20K
|
|--00:SCAN HDFS [tpch.partsupp t1]
| partitions=1/1 files=1 size=112.71MB
| predicates: t1.ps_availqty = 10
| row-size=12B cardinality=78
|
01:SCAN HDFS [tpch.lineitem t2]
partitions=1/1 files=1 size=718.94MB
runtime filters: RF000 -> t2.l_suppkey
row-size=8B cardinality=6.00M
====
# M:N join with filtering on larger table
# NDV(l_partkey) = |l_partkey| = 200K
# |lineitem'| = |lineitem| / |l_partkey| = 6M / 200K = 30
# |lineitem' >< partsupp| = |lineitem'| * |partsupp| / |suppkey|
# = 30 * 800K / 10K = 2400
select t1.ps_suppkey, t2.l_suppkey
from tpch.partsupp t1,
tpch.lineitem t2
where t1.ps_suppkey = t2.l_suppkey
and t2.l_partkey = 10
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [INNER JOIN]
| hash predicates: t1.ps_suppkey = t2.l_suppkey
| runtime filters: RF000 <- t2.l_suppkey
| row-size=24B cardinality=2.47K
|
|--01:SCAN HDFS [tpch.lineitem t2]
| partitions=1/1 files=1 size=718.94MB
| predicates: t2.l_partkey = 10
| row-size=16B cardinality=30
|
00:SCAN HDFS [tpch.partsupp t1]
partitions=1/1 files=1 size=112.71MB
runtime filters: RF000 -> t1.ps_suppkey
row-size=8B cardinality=800.00K
====
# M:N join with selective filtering on shared column
# |partsupp'| = |partsupp| / |suppkey| = 800K / 10K = 80
# But, filtering is done on the join column, so we want:
# |lineitem >< partsupp'| = |lineitem| * |partsupp'|
# = (80 * 6M) / 10K = 80 * 600 = 48K
select t1.ps_suppkey, t2.l_suppkey
from tpch.partsupp t1,
tpch.lineitem t2
where t1.ps_suppkey = t2.l_suppkey
and t1.ps_suppkey = 10
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [INNER JOIN]
| hash predicates: t2.l_suppkey = t1.ps_suppkey
| runtime filters: RF000 <- t1.ps_suppkey
| row-size=16B cardinality=50.67K
|
|--00:SCAN HDFS [tpch.partsupp t1]
| partitions=1/1 files=1 size=112.71MB
| predicates: t1.ps_suppkey = 10
| row-size=8B cardinality=82
|
01:SCAN HDFS [tpch.lineitem t2]
partitions=1/1 files=1 size=718.94MB
predicates: t2.l_suppkey = 10
runtime filters: RF000 -> t2.l_suppkey
row-size=8B cardinality=618
====
# M:N join with broad filtering on smaller table
# Most general join case
# |partsupp'| = |partsupp| * 0.33
# = 800K / 3 = 267K
# |lineietem >< partsupp'| = |lineitem| * |partsupp'| / |suppkey|
# = 267K * 6M / 10K = 160M
# Bug: Expected cardinality ~160M
select t1.ps_suppkey, t2.l_suppkey
from tpch.partsupp t1,
tpch.lineitem t2
where t1.ps_suppkey = t2.l_suppkey
and t1.ps_availqty < 10
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [INNER JOIN]
| hash predicates: t2.l_suppkey = t1.ps_suppkey
| runtime filters: RF000 <- t1.ps_suppkey
| row-size=20B cardinality=49.43M
|
|--00:SCAN HDFS [tpch.partsupp t1]
| partitions=1/1 files=1 size=112.71MB
| predicates: t1.ps_availqty < 10
| row-size=12B cardinality=80.00K
|
01:SCAN HDFS [tpch.lineitem t2]
partitions=1/1 files=1 size=718.94MB
runtime filters: RF000 -> t2.l_suppkey
row-size=8B cardinality=6.00M
====
# M:N join with broad filtering on larger table
# |lineitem'| = |lineitem| * 0.33 = 6M / 3 = 2M
# |lineitem' >< partsupp| = |lineitem'| * |partsupp| / |suppkey|
# = 2M * 800K / 10K = 160M
# Bug: Expected cardinality ~161M
select t1.ps_suppkey, t2.l_suppkey
from tpch.partsupp t1,
tpch.lineitem t2
where t1.ps_suppkey = t2.l_suppkey
and t2.l_partkey < 10
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [INNER JOIN]
| hash predicates: t2.l_suppkey = t1.ps_suppkey
| runtime filters: RF000 <- t1.ps_suppkey
| row-size=24B cardinality=49.43M
|
|--00:SCAN HDFS [tpch.partsupp t1]
| partitions=1/1 files=1 size=112.71MB
| row-size=8B cardinality=800.00K
|
01:SCAN HDFS [tpch.lineitem t2]
partitions=1/1 files=1 size=718.94MB
predicates: t2.l_partkey < 10
runtime filters: RF000 -> t2.l_suppkey
row-size=16B cardinality=600.12K
====
# M:N join with filtering on both sides
# Most general join case
# |lineitem'| = |lineitem| * 0.33
# |partsup'| = |partsupp| * 0.33
# |suppkey'| = |suppkey| * 0.33
# |lineitem' >< partsupp'| = |lineitem'| * |partsupp'| / |suppkey'|
# = 800K/3 * 6M/3 / 3.3K = 161M
# (Recall the containment assumption above.)
# Bug: Expected cardinality ~161M
select t1.ps_suppkey, t2.l_suppkey
from tpch.partsupp t1,
tpch.lineitem t2
where t1.ps_suppkey = t2.l_suppkey
and t1.ps_availqty < 10
and t2.l_partkey < 10
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [INNER JOIN]
| hash predicates: t2.l_suppkey = t1.ps_suppkey
| runtime filters: RF000 <- t1.ps_suppkey
| row-size=28B cardinality=49.43M
|
|--00:SCAN HDFS [tpch.partsupp t1]
| partitions=1/1 files=1 size=112.71MB
| predicates: t1.ps_availqty < 10
| row-size=12B cardinality=80.00K
|
01:SCAN HDFS [tpch.lineitem t2]
partitions=1/1 files=1 size=718.94MB
predicates: t2.l_partkey < 10
runtime filters: RF000 -> t2.l_suppkey
row-size=16B cardinality=600.12K
====
# Join with join-level filter predicates
# |join| = |orders| * sel(c.nationkey + o.o_shippriority = 10)
# |orders| = 1.5M
# sel(c.nationkey + o.o_shippriority = 10) = 0.1 (can't use NDV)
# |join| = 1.5M * .1 = 150K
# Bug: Several, expected cardinality ~1.5M
select c.c_custkey, o.o_orderkey
from tpch.customer c,
tpch.orders o
where c.c_custkey = o.o_custkey
and c.c_nationkey + o.o_shippriority = 10
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [INNER JOIN]
| hash predicates: o.o_custkey = c.c_custkey
| other predicates: c.c_nationkey + o.o_shippriority = 10
| runtime filters: RF000 <- c.c_custkey
| row-size=30B cardinality=1.50M
|
|--00:SCAN HDFS [tpch.customer c]
| partitions=1/1 files=1 size=23.08MB
| row-size=10B cardinality=150.00K
|
01:SCAN HDFS [tpch.orders o]
partitions=1/1 files=1 size=162.56MB
runtime filters: RF000 -> o.o_custkey
row-size=20B cardinality=1.50M
====
# Join with a table that has no stats and zero rows.
# The zero rows trumps the stats.
# Zero-row table is on the build side, |join| = 0
# Bug: Expected cardinality = 0
select a.int_col, b.int_col
from functional.alltypesagg a, functional.alltypesnopart b
where a.id = b.id
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [INNER JOIN]
| hash predicates: a.id = b.id
| runtime filters: RF000 <- b.id
| row-size=16B cardinality=11.00K
|
|--01:SCAN HDFS [functional.alltypesnopart b]
| partitions=1/1 files=0 size=0B
| row-size=8B cardinality=0
|
00:SCAN HDFS [functional.alltypesagg a]
partitions=11/11 files=11 size=814.73KB
runtime filters: RF000 -> a.id
row-size=8B cardinality=11.00K
====
# Adding table and join filters should not affect the result.
# Bug: Expected cardinality = 0
select a.int_col, b.int_col
from functional.alltypesagg a, functional.alltypesnopart b
where a.id = b.id
and a.smallint_col = 10
and b.smallint_col = 20
and a.int_col + b.int_col > 30
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [INNER JOIN]
| hash predicates: a.id = b.id
| other predicates: a.int_col + b.int_col > 30
| runtime filters: RF000 <- b.id
| row-size=20B cardinality=113
|
|--01:SCAN HDFS [functional.alltypesnopart b]
| partitions=1/1 files=0 size=0B
| predicates: b.smallint_col = 20
| row-size=10B cardinality=0
|
00:SCAN HDFS [functional.alltypesagg a]
partitions=11/11 files=11 size=814.73KB
predicates: a.smallint_col = 10
runtime filters: RF000 -> a.id
row-size=10B cardinality=113
====
# Join on a computed column
# Assumes Cartesian product * 0.1
# |join| = 11K * 7K * 0.1 = 7M
# Bug: Expected cardinality ~7M
select a.id, b.id
from functional.alltypes a, functional.alltypesagg b
where a.id = b.id + b.int_col
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [INNER JOIN]
| hash predicates: b.id + b.int_col = a.id
| runtime filters: RF000 <- a.id
| row-size=12B cardinality=11.00K
|
|--00:SCAN HDFS [functional.alltypes a]
| partitions=24/24 files=24 size=478.45KB
| row-size=4B cardinality=7.30K
|
01:SCAN HDFS [functional.alltypesagg b]
partitions=11/11 files=11 size=814.73KB
runtime filters: RF000 -> b.id + b.int_col
row-size=8B cardinality=11.00K
====