-- A classic motion hazard / deadlock is between the inner and outer motions,
-- but it can also happen between the outer and the joinqual / subplan
-- motions. A sample plan is as below:
--
-- Gather Motion 3:1  (slice4; segments: 3)
--   ->  Hash Left Join
--         Hash Cond: (t_outer.c2 = t_inner.c2)
--         Join Filter: (NOT (SubPlan))
--         ->  Redistribute Motion 3:3  (slice1; segments: 3)
--               Hash Key: t_outer.c2
--               ->  Seq Scan on t_outer
--         ->  Hash
--               ->  Redistribute Motion 3:3  (slice2; segments: 3)
--                     Hash Key: t_inner.c2
--                     ->  Seq Scan on t_inner
--         SubPlan 1  (slice4; segments: 3)
--           ->  Result
--                 Filter: (t_subplan.c2 = t_outer.c1)
--                 ->  Materialize
--                       ->  Broadcast Motion 3:3  (slice3; segments: 3)
--                             ->  Seq Scan on t_subplan
-- Suppose :x0 is distributed to seg0; it does not matter if it is not, as
-- this assumption is only to simplify the explanation.
\set x0 1
\set scale 10000
drop schema if exists deadlock2 cascade;
NOTICE: schema "deadlock2" does not exist, skipping
create schema deadlock2;
set search_path = deadlock2;
create table t_inner (c1 int, c2 int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
create table t_outer (c1 int, c2 int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
create table t_subplan (c1 int, c2 int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
create table t_subplan2 (c1 int, c2 int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
-- First load enough data into all the relations so that the planner
-- generates a redistribute-both motion instead of a broadcast-one motion.
insert into t_inner select i, i from generate_series(1,:scale) i;
insert into t_outer select i, i from generate_series(1,:scale) i;
insert into t_subplan select i, i from generate_series(1,:scale) i;
insert into t_subplan2 select i, i from generate_series(1,:scale) i;
analyze t_inner;
analyze t_outer;
analyze t_subplan;
analyze t_subplan2;
-- Then delete all the rows and load the real data; do not use TRUNCATE as it
-- would clear the analyze information.
delete from t_inner;
delete from t_outer;
delete from t_subplan;
delete from t_subplan2;
-- t_inner is the inner relation; it does not need much data as long as it is
-- not empty.
insert into t_inner values (:x0, :x0);
-- t_outer is the outer relation of the hash join; all its data is on seg0
-- and is redistributed to seg0, so outer@slice1@seg0 sends data to
-- hashjoin@slice4@seg0 and waits for the ACK from it. Only after all the
-- data is sent will it send EOS to all the segments of hashjoin@slice4. So
-- once hashjoin@slice4@seg1 reads from outer@slice1 it has to wait for that
-- EOS.
insert into t_outer select :x0, :x0 from generate_series(1,:scale) i;
-- t_subplan is the subplan relation of the hash join; all its data is on
-- seg0 and is broadcast to seg0 and seg1. When subplan@slice3@seg0 sends
-- data to hashjoin@slice4@seg1 it has to wait for the ACK from it, and this
-- can happen before hashjoin@slice4@seg1 reads from outer@slice1.
insert into t_subplan select :x0, :x0 from generate_series(1,:scale) i;
-- t_subplan2 is the same as t_subplan; it is used for the case of multiple
-- quals.
insert into t_subplan2 select :x0, :x0 from generate_series(1,:scale) i;
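--
-- To make the ACK / EOS behaviour above concrete, a motion sender works
-- roughly like the sketch below. This is simplified pseudocode under assumed
-- names, not the actual interconnect code: next_tuple, hash_to_segment,
-- send_window_full, wait_for_ack and send_eos are all illustrative.
--
--     /* sender side of a redistribute motion, running on one segment */
--     while ((tuple = next_tuple()) != NULL)
--     {
--         int target = hash_to_segment(tuple);  /* pick receiving segment */
--
--         send(target, tuple);
--         if (send_window_full(target))
--             wait_for_ack(target);             /* may block indefinitely */
--     }
--     /* EOS goes out only after ALL tuples have been sent */
--     for (int seg = 0; seg < num_receivers; seg++)
--         send_eos(seg);
--
-- A receiver that gets no tuples (hashjoin@slice4@seg1 above) therefore
-- blocks until the sender reaches the EOS loop, while the sender itself may
-- be blocked in wait_for_ack() on a different receiver.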
-- In the past hash join did the job like this:
--
-- 10. read all the inner tuples and build the hash table;
-- 20. read an outer tuple;
-- 30. pass the outer tuple to the join qual, which reads through the motion
--     in the subplan;
-- 40. if there are more outer tuples goto 20.
--
-- So it is possible that hashjoin@slice4@seg0 is reading from
-- subplan@slice3@seg0 while outer@slice1@seg0 is waiting for its ACK, which
-- forms a deadlock:
--
--   outer@slice1@seg0 --[waits for ACK]-> hashjoin@slice4@seg0
--           ^                                        |
--           |                                        |
-- [waits for outer tuple]              [waits for subplan tuple]
--           |                                        |
--           |                                        v
--   hashjoin@slice4@seg1 <-[wait for ACK]-- subplan@slice3@seg0
--
-- This deadlock is prevented by prefetching the subplan, as sketched below.
-- In theory this deadlock exists in hash join, merge join and nestloop join
-- alike, but so far we have only constructed a reproducer for hash join.
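--
-- A minimal sketch of that fix, assuming the simplified step model above
-- (illustrative pseudocode, not the actual executor code; the real entry
-- point is ExecPrefetchJoinQual, and prefetch_joinqual / null_slot are
-- stand-in names):
--
--     /* once per executor run, before any outer tuple is read (step 20) */
--     if (prefetch_joinqual)
--     {
--         econtext->ecxt_innertuple = null_slot;  /* fake inner tuple */
--         econtext->ecxt_outertuple = null_slot;  /* fake outer tuple */
--         ExecQual(joinqual, econtext);  /* reads the subplan motion now */
--         prefetch_joinqual = false;
--     }
--     /* ...steps 10 to 40 as before... */
--
-- Because the subplan motion is read (and its tuples are cached by the
-- Materialize node) before any outer tuple is requested, hashjoin@slice4
-- never waits for subplan@slice3 while outer@slice1 waits for its ACK, so
-- the cycle above cannot form.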
set enable_hashjoin to on;
set enable_mergejoin to off;
set enable_nestloop to off;
set Test_print_prefetch_joinqual = on;
select count(*) from t_inner right join t_outer on t_inner.c2=t_outer.c2
and not exists (select 0 from t_subplan where t_subplan.c2=t_outer.c1);
NOTICE: prefetch join qual in slice 0 of plannode 3
NOTICE: prefetch join qual in slice 1 of plannode 3 (seg0 slice1 127.0.1.1:7002 pid=25303)
NOTICE: prefetch join qual in slice 1 of plannode 3 (seg1 slice1 127.0.1.1:7003 pid=25304)
NOTICE: prefetch join qual in slice 1 of plannode 3 (seg2 slice1 127.0.1.1:7004 pid=25305)
count
-------
10000
(1 row)
-- The logic of ExecPrefetchJoinQual is to use two null tuples to fake the
-- inner and outer tuples and then run ExecQual. ExecQual may short-circuit
-- if some earlier qual is a null-test expression, so ExecPrefetchJoinQual
-- has to force ExecQual for each qual expr in the joinqual list. See the
-- GitHub issue https://github.com/greenplum-db/gpdb/issues/8677 for
-- details.
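--
-- As a concrete illustration with the query below: under the faked null
-- tuples "t_inner.c1 is null" evaluates to true, so a single
-- short-circuiting ExecQual over the whole OR expression would return
-- without ever evaluating the "not exists (...)" arm, and the motion inside
-- the subplan would stay untouched, leaving the deadlock window open.
-- Forcing ExecQual on every qual expr closes that window.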
select count(*) from t_inner right join t_outer on t_inner.c2=t_outer.c2
and (t_inner.c1 is null or not exists (select 0 from t_subplan where t_subplan.c2=t_outer.c1));
NOTICE: prefetch join qual in slice 0 of plannode 3
NOTICE: prefetch join qual in slice 1 of plannode 3 (seg0 slice1 127.0.1.1:7002 pid=74425)
NOTICE: prefetch join qual in slice 1 of plannode 3 (seg1 slice1 127.0.1.1:7003 pid=74426)
NOTICE: prefetch join qual in slice 1 of plannode 3 (seg2 slice1 127.0.1.1:7004 pid=74427)
count
-------
10000
(1 row)
select count(*) from t_inner right join t_outer on t_inner.c2=t_outer.c2
and not exists (select 0 from t_subplan where t_subplan.c2=t_outer.c1)
and not exists (select 1 from t_subplan where t_subplan.c2=t_outer.c1);
NOTICE: prefetch join qual in slice 0 of plannode 3
NOTICE: prefetch join qual in slice 1 of plannode 3 (seg0 slice1 127.0.1.1:7002 pid=74425)
NOTICE: prefetch join qual in slice 1 of plannode 3 (seg1 slice1 127.0.1.1:7003 pid=74426)
NOTICE: prefetch join qual in slice 1 of plannode 3 (seg2 slice1 127.0.1.1:7004 pid=74427)
count
-------
10000
(1 row)
-- Besides the JoinQual, the NonJoinQual has a similar deadlock issue.
-- Test that the NonJoinQual is also prefetched.
select count(*) from t_inner right join t_outer on t_inner.c2=t_outer.c2
where (t_inner.c1 is null or not exists (select 0 from t_subplan where t_subplan.c2=t_outer.c1));
count
-------
0
(1 row)
-- Test a NonJoinQual that includes multiple motion nodes.
select count(*) from t_inner right join t_outer on t_inner.c2=t_outer.c2
where (t_inner.c1 is null or not exists (select 0 from t_subplan where t_subplan.c2=t_outer.c1)
or not exists (select 0 from t_subplan2 where t_subplan2.c2=t_outer.c1));
count
-------
0
(1 row)