-- A classic motion hazard / deadlock is between the inner and outer motions,
-- but it can also happen between the outer and the joinqual / subplan
-- motions. A sample plan is as below:
--
-- Gather Motion 3:1  (slice4; segments: 3)
--   ->  Hash Left Join
--         Hash Cond: (t_outer.c2 = t_inner.c2)
--         Join Filter: (NOT (SubPlan))
--         ->  Redistribute Motion 3:3  (slice1; segments: 3)
--               Hash Key: t_outer.c2
--               ->  Seq Scan on t_outer
--         ->  Hash
--               ->  Redistribute Motion 3:3  (slice2; segments: 3)
--                     Hash Key: t_inner.c2
--                     ->  Seq Scan on t_inner
--         SubPlan 1  (slice4; segments: 3)
--           ->  Result
--                 Filter: (t_subplan.c2 = t_outer.c1)
--                 ->  Materialize
--                       ->  Broadcast Motion 3:3  (slice3; segments: 3)
--                             ->  Seq Scan on t_subplan
-- Suppose :x0 is distributed to seg0; it does not matter if it is not, as
-- this assumption is only to simplify the explanation.
\set x0 1
\set scale 10000
drop schema if exists deadlock2 cascade;
NOTICE: schema "deadlock2" does not exist, skipping
create schema deadlock2;
set search_path = deadlock2;
create table t_inner (c1 int, c2 int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
create table t_outer (c1 int, c2 int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
create table t_subplan (c1 int, c2 int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
create table t_subplan2 (c1 int, c2 int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
-- First load enough data into all the relations so that the planner
-- generates a redistribute-both motion instead of a broadcast-one motion.
insert into t_inner select i, i from generate_series(1,:scale) i;
insert into t_outer select i, i from generate_series(1,:scale) i;
insert into t_subplan select i, i from generate_series(1,:scale) i;
insert into t_subplan2 select i, i from generate_series(1,:scale) i;
analyze t_inner;
analyze t_outer;
analyze t_subplan;
analyze t_subplan2;
-- Then delete all the rows and load the real data; do not use TRUNCATE as it
-- would clear the analyze information.
delete from t_inner;
delete from t_outer;
delete from t_subplan;
delete from t_subplan2;
-- t_inner is the inner relation; it does not need much data as long as it is
-- not empty.
insert into t_inner values (:x0, :x0);
-- t_outer is the outer relation of the hash join; all its data is on seg0
-- and is redistributed to seg0, so outer@slice1@seg0 sends data to
-- hashjoin@slice4@seg0 and waits for the ACK from it. Only after all the
-- data is sent will it send EOS to all the segments of hashjoin@slice4. So
-- once hashjoin@slice4@seg1 reads from outer@slice1 it has to wait for that
-- EOS.
insert into t_outer select :x0, :x0 from generate_series(1,:scale) i;
-- t_subplan is the subplan relation of the hash join; all its data is on
-- seg0 and is broadcast to seg0 and seg1. When subplan@slice3@seg0 sends
-- data to hashjoin@slice4@seg1 it has to wait for the ACK from it, and this
-- can happen before hashjoin@slice4@seg1 reads from outer@slice1.
insert into t_subplan select :x0, :x0 from generate_series(1,:scale) i;
-- t_subplan2 is the same as t_subplan; it is used for the case of multiple
-- quals.
insert into t_subplan2 select :x0, :x0 from generate_series(1,:scale) i;
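--
-- To make the ACK / EOS behaviour above concrete, a motion sender works
-- roughly like the sketch below. This is simplified pseudocode under assumed
-- names, not the actual interconnect code: next_tuple, hash_to_segment,
-- send_window_full, wait_for_ack and send_eos are all illustrative.
--
--     /* sender side of a redistribute motion, running on one segment */
--     while ((tuple = next_tuple()) != NULL)
--     {
--         int target = hash_to_segment(tuple);  /* pick receiving segment */
--
--         send(target, tuple);
--         if (send_window_full(target))
--             wait_for_ack(target);             /* may block indefinitely */
--     }
--     /* EOS goes out only after ALL tuples have been sent */
--     for (int seg = 0; seg < num_receivers; seg++)
--         send_eos(seg);
--
-- A receiver that gets no tuples (hashjoin@slice4@seg1 above) therefore
-- blocks until the sender reaches the EOS loop, while the sender itself may
-- be blocked in wait_for_ack() on a different receiver.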
-- In the past hash join did the job like this:
--
-- 10. read all the inner tuples and build the hash table;
-- 20. read an outer tuple;
-- 30. pass the outer tuple to the join qual, which reads through the motion
--     in the subplan;
-- 40. if there are more outer tuples goto 20.
--
-- So it is possible that hashjoin@slice4@seg0 is reading from
-- subplan@slice3@seg0 while outer@slice1@seg0 is waiting for its ACK, which
-- forms a deadlock:
--
--   outer@slice1@seg0 --[waits for ACK]-> hashjoin@slice4@seg0
--           ^                                        |
--           |                                        |
-- [waits for outer tuple]              [waits for subplan tuple]
--           |                                        |
--           |                                        v
--   hashjoin@slice4@seg1 <-[wait for ACK]-- subplan@slice3@seg0
--
-- This deadlock is prevented by prefetching the subplan, as sketched below.
-- In theory this deadlock exists in hash join, merge join and nestloop join
-- alike, but so far we have only constructed a reproducer for hash join.
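--
-- A minimal sketch of that fix, assuming the simplified step model above
-- (illustrative pseudocode, not the actual executor code; the real entry
-- point is ExecPrefetchJoinQual, and prefetch_joinqual / null_slot are
-- stand-in names):
--
--     /* once per executor run, before any outer tuple is read (step 20) */
--     if (prefetch_joinqual)
--     {
--         econtext->ecxt_innertuple = null_slot;  /* fake inner tuple */
--         econtext->ecxt_outertuple = null_slot;  /* fake outer tuple */
--         ExecQual(joinqual, econtext);  /* reads the subplan motion now */
--         prefetch_joinqual = false;
--     }
--     /* ...steps 10 to 40 as before... */
--
-- Because the subplan motion is read (and its tuples are cached by the
-- Materialize node) before any outer tuple is requested, hashjoin@slice4
-- never waits for subplan@slice3 while outer@slice1 waits for its ACK, so
-- the cycle above cannot form.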
set enable_hashjoin to on;
set enable_mergejoin to off;
set enable_nestloop to off;
set Test_print_prefetch_joinqual = on;
select count(*) from t_inner right join t_outer on t_inner.c2=t_outer.c2
and not exists (select 0 from t_subplan where t_subplan.c2=t_outer.c1);
NOTICE: prefetch join qual in slice 0 of plannode 3
NOTICE: prefetch join qual in slice 1 of plannode 3 (seg0 slice1 127.0.1.1:7002 pid=25303)
NOTICE: prefetch join qual in slice 1 of plannode 3 (seg1 slice1 127.0.1.1:7003 pid=25304)
NOTICE: prefetch join qual in slice 1 of plannode 3 (seg2 slice1 127.0.1.1:7004 pid=25305)
count
-------
10000
(1 row)
-- The logic of ExecPrefetchJoinQual is to use two null tuples to fake the
-- inner and outer tuples and then run ExecQual. ExecQual may short-circuit
-- if some earlier qual is a null-test expression, so ExecPrefetchJoinQual
-- has to force ExecQual for each qual expr in the joinqual list. See the
-- GitHub issue https://github.com/greenplum-db/gpdb/issues/8677 for
-- details.
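--
-- As a concrete illustration with the query below: under the faked null
-- tuples "t_inner.c1 is null" evaluates to true, so a single
-- short-circuiting ExecQual over the whole OR expression would return
-- without ever evaluating the "not exists (...)" arm, and the motion inside
-- the subplan would stay untouched, leaving the deadlock window open.
-- Forcing ExecQual on every qual expr closes that window.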
select count(*) from t_inner right join t_outer on t_inner.c2=t_outer.c2
and (t_inner.c1 is null or not exists (select 0 from t_subplan where t_subplan.c2=t_outer.c1));
NOTICE: prefetch join qual in slice 0 of plannode 3
NOTICE: prefetch join qual in slice 1 of plannode 3 (seg0 slice1 127.0.1.1:7002 pid=74425)
NOTICE: prefetch join qual in slice 1 of plannode 3 (seg1 slice1 127.0.1.1:7003 pid=74426)
NOTICE: prefetch join qual in slice 1 of plannode 3 (seg2 slice1 127.0.1.1:7004 pid=74427)
count
-------
10000
(1 row)
select count(*) from t_inner right join t_outer on t_inner.c2=t_outer.c2
and not exists (select 0 from t_subplan where t_subplan.c2=t_outer.c1)
and not exists (select 1 from t_subplan where t_subplan.c2=t_outer.c1);
NOTICE: prefetch join qual in slice 0 of plannode 3
NOTICE: prefetch join qual in slice 1 of plannode 3 (seg0 slice1 127.0.1.1:7002 pid=74425)
NOTICE: prefetch join qual in slice 1 of plannode 3 (seg1 slice1 127.0.1.1:7003 pid=74426)
NOTICE: prefetch join qual in slice 1 of plannode 3 (seg2 slice1 127.0.1.1:7004 pid=74427)
count
-------
10000
(1 row)
-- Besides the JoinQual, the NonJoinQual has a similar deadlock issue.
-- Test that the NonJoinQual is also prefetched.
select count(*) from t_inner right join t_outer on t_inner.c2=t_outer.c2
where (t_inner.c1 is null or not exists (select 0 from t_subplan where t_subplan.c2=t_outer.c1));
count
-------
0
(1 row)
-- Test a NonJoinQual that includes multiple motion nodes.
select count(*) from t_inner right join t_outer on t_inner.c2=t_outer.c2
where (t_inner.c1 is null or not exists (select 0 from t_subplan where t_subplan.c2=t_outer.c1)
or not exists (select 0 from t_subplan2 where t_subplan2.c2=t_outer.c1));
count
-------
0
(1 row)