src/test/regress/expected/qp_skew_optimizer.out - cloudberry - Git at Google

 create schema orca_skew;
 set optimizer_skew_factor = 1;
 -- Verify ORCA chooses broadcast over redistribution for skewed join
 -- t1 skewed, randomly distributed
 -- t2 uniform, randomly distributed
 -- t1, t2 have ~25 rows
 -- start_ignore
 DROP TABLE t1;
 ERROR:  table "t1" does not exist
 DROP TABLE t2;
 ERROR:  table "t2" does not exist
 -- end_ignore
 CREATE TABLE t1 (
     c11 integer,
     c12 integer
 )
  DISTRIBUTED RANDOMLY;
 CREATE TABLE t2 (
     c21 integer,
     c22 integer
 )
  DISTRIBUTED RANDOMLY;
 -- t1: 6 distinct values
 -- (0,1) 20 rows (skewed)
 -- (1,2) 1 row
 -- (2,3) 1 row
 -- (3,4) 1 row
 -- (5,6) 1 row
 -- (6,7) 1 row
 insert into t1 select 0,1 from generate_series(1,20);
 insert into t1 select 1,2;
 insert into t1 select 2,3;
 insert into t1 select 3,4;
 insert into t1 select 5,6;
 insert into t1 select 6,7;
 -- t2: 9 distinct values
 -- (7,8) 3 rows
 -- (8,9) 3 rows
 -- (9,10) 3 rows
 -- (10,11) 3 rows
 -- (12,13) 3 rows
 -- (15,16) 3 rows
 -- (16,1) 3 rows (match)
 -- (17,2) 3 rows (match)
 -- (20,3) 3 rows (match)
 insert into t2 select 7,8 from generate_series(1,3);
 insert into t2 select 8,9 from generate_series(1,3);
 insert into t2 select 9,10 from generate_series(1,3);
 insert into t2 select 10,11 from generate_series(1,3);
 insert into t2 select 12,13 from generate_series(1,3);
 insert into t2 select 15,16 from generate_series(1,3);
 insert into t2 select 16,1 from generate_series(1,3);
 insert into t2 select 17,2 from generate_series(1,3);
 insert into t2 select 20,3 from generate_series(1,3);
 ANALYZE t1;
 ANALYZE t2;
 EXPLAIN SELECT
   c12, c22
   FROM
   t1 INNER JOIN t2
     ON c12 = c22;
                                              QUERY PLAN
 ----------------------------------------------------------------------------------------------------
  Gather Motion 3:1  (slice1; segments: 3)  (cost=0.00..862.01 rows=75 width=8)
    ->  Hash Join  (cost=0.00..862.01 rows=25 width=8)
          Hash Cond: (t2.c22 = t1.c12)
          ->  Seq Scan on t2  (cost=0.00..431.00 rows=9 width=4)
          ->  Hash  (cost=431.00..431.00 rows=25 width=4)
                ->  Broadcast Motion 3:3  (slice2; segments: 3)  (cost=0.00..431.00 rows=25 width=4)
                      ->  Seq Scan on t1  (cost=0.00..431.00 rows=9 width=4)
  Optimizer: Pivotal Optimizer (GPORCA)
 (8 rows)

 -- Verify ORCA chooses broadcast over redistribution for skewed multiple joins
 -- t1 has a skewed value (0,1) matching tuples (16,1) in t2
 -- the result of t1 ⋈ t2 also tends to skew
 -- start_ignore
 DROP TABLE t3;
 ERROR:  table "t3" does not exist
 -- end_ignore
 CREATE TABLE t3 (
     c31 integer,
     c32 integer
 )
  DISTRIBUTED RANDOMLY;
 -- t3: 3 distinct values
 -- (0,1) 7 rows
 -- (1,2) 7 rows
 -- (2,3) 7 rows
 insert into t3 select 0,1 from generate_series(1,7);
 insert into t3 select 1,2 from generate_series(1,7);
 insert into t3 select 2,3 from generate_series(1,7);
 ANALYZE t3;
 -- Force ORCA to execute t1 ⋈ t2 first
 set optimizer_join_order = query;
 EXPLAIN SELECT
   c12, c22, c32
   FROM
   t1 INNER JOIN t2
     ON c12 = c22
   INNER JOIN t3
     ON c22 = c32;
                                                 QUERY PLAN
 ----------------------------------------------------------------------------------------------------------
  Gather Motion 3:1  (slice1; segments: 3)  (cost=0.00..1293.05 rows=525 width=12)
    ->  Hash Join  (cost=0.00..1293.03 rows=175 width=12)
          Hash Cond: (t2.c22 = t3.c32)
          ->  Hash Join  (cost=0.00..862.01 rows=25 width=8)
                Hash Cond: (t1.c12 = t2.c22)
                ->  Seq Scan on t1  (cost=0.00..431.00 rows=9 width=4)
                ->  Hash  (cost=431.00..431.00 rows=27 width=4)
                      ->  Broadcast Motion 3:3  (slice2; segments: 3)  (cost=0.00..431.00 rows=27 width=4)
                            ->  Seq Scan on t2  (cost=0.00..431.00 rows=9 width=4)
          ->  Hash  (cost=431.00..431.00 rows=21 width=4)
                ->  Broadcast Motion 3:3  (slice3; segments: 3)  (cost=0.00..431.00 rows=21 width=4)
                      ->  Seq Scan on t3  (cost=0.00..431.00 rows=7 width=4)
  Optimizer: Pivotal Optimizer (GPORCA)
 (13 rows)

 -- Reset join order
 set optimizer_join_order = exhaustive2;
 -- Verify ORCA chooses redistribution over broadcast
 -- when the data skew isn't pronuounced
 -- start_ignore
 DROP TABLE t4;
 ERROR:  table "t4" does not exist
 -- end_ignore
 CREATE TABLE t4 (
     c41 integer,
     c42 integer
 )
  DISTRIBUTED RANDOMLY;
 -- t4: 3 distinct values
 -- (3,2) 8 rows
 -- (4,3) 6 rows
 -- (5,4) 4 rows
 insert into t4 select 3,2 from generate_series(1,8);
 insert into t4 select 4,3 from generate_series(1,6);
 insert into t4 select 5,4 from generate_series(1,4);
 ANALYZE t4;
 EXPLAIN SELECT
   c32, c42
   FROM
   t3 INNER JOIN t4
     ON c32 = c42;
                                               QUERY PLAN
 ------------------------------------------------------------------------------------------------------
  Gather Motion 3:1  (slice1; segments: 3)  (cost=0.00..862.01 rows=126 width=8)
    ->  Hash Join  (cost=0.00..862.01 rows=42 width=8)
          Hash Cond: (t3.c32 = t4.c42)
          ->  Redistribute Motion 3:3  (slice2; segments: 3)  (cost=0.00..431.00 rows=7 width=4)
                Hash Key: t3.c32
                ->  Seq Scan on t3  (cost=0.00..431.00 rows=7 width=4)
          ->  Hash  (cost=431.00..431.00 rows=6 width=4)
                ->  Redistribute Motion 3:3  (slice3; segments: 3)  (cost=0.00..431.00 rows=6 width=4)
                      Hash Key: t4.c42
                      ->  Seq Scan on t4  (cost=0.00..431.00 rows=6 width=4)
  Optimizer: Pivotal Optimizer (GPORCA)
 (11 rows)

 -- user option to emphasize data skew by multiplying
 -- the skew ratio with a larger-than 1 skew factor
 set optimizer_skew_factor = 25;
 EXPLAIN SELECT
   c32, c42
   FROM
   t3 INNER JOIN t4
     ON c32 = c42;
                                              QUERY PLAN
 ----------------------------------------------------------------------------------------------------
  Gather Motion 3:1  (slice1; segments: 3)  (cost=0.00..862.01 rows=126 width=8)
    ->  Hash Join  (cost=0.00..862.01 rows=42 width=8)
          Hash Cond: (t3.c32 = t4.c42)
          ->  Seq Scan on t3  (cost=0.00..431.00 rows=7 width=4)
          ->  Hash  (cost=431.00..431.00 rows=18 width=4)
                ->  Broadcast Motion 3:3  (slice2; segments: 3)  (cost=0.00..431.00 rows=18 width=4)
                      ->  Seq Scan on t4  (cost=0.00..431.00 rows=6 width=4)
  Optimizer: Pivotal Optimizer (GPORCA)
 (8 rows)

 -- reset skew factor
 set optimizer_skew_factor = 1;
 -- Currently ORCA doesn't introduce a shuffle motion
 -- before joining a table distributed by a skewed column
 -- To demonstrate that, we recreate t1 and t2 with the same
 -- data but distribute them by the c*1 column
 -- t1 skewed, distributed by c11
 -- t2 uniform, distributed by c21
 -- t1, t2 have ~25 rows
 -- start_ignore
 DROP TABLE t1;
 DROP TABLE t2;
 -- end_ignore
 CREATE TABLE t1 (
     c11 integer,
     c12 integer
 )
  DISTRIBUTED BY (c11);
 CREATE TABLE t2 (
     c21 integer,
     c22 integer
 )
  DISTRIBUTED BY (c21);
 -- t1: 6 distinct values
 -- segment 1:
 -- (0,1) 20 rows
 -- (1,2) 1 rows
 -- segment 2:
 -- (2,3) 1 rows
 -- (3,4) 1 rows
 -- segment 3:
 -- (5,6) 1 rows
 -- (6,7) 1 rows
 insert into t1 select 0,1 from generate_series(1,20);
 insert into t1 select 1,2;
 insert into t1 select 2,3;
 insert into t1 select 3,4;
 insert into t1 select 5,6;
 insert into t1 select 6,7;
 -- t2: 6 distinct values
 -- segment 1:
 -- (7,8) 3 rows
 -- (8,9) 3 rows
 -- (16,1) 3 rows
 -- segment 2:
 -- (9,10) 3 rows
 -- (10,11) 3 rows
 -- (17,2) 3 rows
 -- segment 3:
 -- (12,13) 3 rows
 -- (15,16) 3 rows
 -- (20,3) 3 rows
 insert into t2 select 7,8 from generate_series(1,3);
 insert into t2 select 8,9 from generate_series(1,3);
 insert into t2 select 9,10 from generate_series(1,3);
 insert into t2 select 10,11 from generate_series(1,3);
 insert into t2 select 12,13 from generate_series(1,3);
 insert into t2 select 15,16 from generate_series(1,3);
 insert into t2 select 16,1 from generate_series(1,3);
 insert into t2 select 17,2 from generate_series(1,3);
 insert into t2 select 20,3 from generate_series(1,3);
 ANALYZE t1;
 ANALYZE t2;
 EXPLAIN SELECT
   c12, c22
   FROM
   t1 INNER JOIN t2
     ON c12 = c22;
                                              QUERY PLAN
 ----------------------------------------------------------------------------------------------------
  Gather Motion 3:1  (slice1; segments: 3)  (cost=0.00..862.01 rows=75 width=8)
    ->  Hash Join  (cost=0.00..862.01 rows=25 width=8)
          Hash Cond: (t2.c22 = t1.c12)
          ->  Seq Scan on t2  (cost=0.00..431.00 rows=9 width=4)
          ->  Hash  (cost=431.00..431.00 rows=25 width=4)
                ->  Broadcast Motion 3:3  (slice2; segments: 3)  (cost=0.00..431.00 rows=25 width=4)
                      ->  Seq Scan on t1  (cost=0.00..431.00 rows=9 width=4)
  Optimizer: Pivotal Optimizer (GPORCA)
 (8 rows)

 reset optimizer_skew_factor;
	create schema orca_skew;
	set optimizer_skew_factor = 1;
	-- Verify ORCA chooses broadcast over redistribution for skewed join
	-- t1 skewed, randomly distributed
	-- t2 uniform, randomly distributed
	-- t1, t2 have ~25 rows
	-- start_ignore
	DROP TABLE t1;
	ERROR: table "t1" does not exist
	DROP TABLE t2;
	ERROR: table "t2" does not exist
	-- end_ignore
	CREATE TABLE t1 (
	c11 integer,
	c12 integer
	)
	DISTRIBUTED RANDOMLY;
	CREATE TABLE t2 (
	c21 integer,
	c22 integer
	)
	DISTRIBUTED RANDOMLY;
	-- t1: 6 distinct values
	-- (0,1) 20 rows (skewed)
	-- (1,2) 1 row
	-- (2,3) 1 row
	-- (3,4) 1 row
	-- (5,6) 1 row
	-- (6,7) 1 row
	insert into t1 select 0,1 from generate_series(1,20);
	insert into t1 select 1,2;
	insert into t1 select 2,3;
	insert into t1 select 3,4;
	insert into t1 select 5,6;
	insert into t1 select 6,7;
	-- t2: 9 distinct values
	-- (7,8) 3 rows
	-- (8,9) 3 rows
	-- (9,10) 3 rows
	-- (10,11) 3 rows
	-- (12,13) 3 rows
	-- (15,16) 3 rows
	-- (16,1) 3 rows (match)
	-- (17,2) 3 rows (match)
	-- (20,3) 3 rows (match)
	insert into t2 select 7,8 from generate_series(1,3);
	insert into t2 select 8,9 from generate_series(1,3);
	insert into t2 select 9,10 from generate_series(1,3);
	insert into t2 select 10,11 from generate_series(1,3);
	insert into t2 select 12,13 from generate_series(1,3);
	insert into t2 select 15,16 from generate_series(1,3);
	insert into t2 select 16,1 from generate_series(1,3);
	insert into t2 select 17,2 from generate_series(1,3);
	insert into t2 select 20,3 from generate_series(1,3);
	ANALYZE t1;
	ANALYZE t2;
	EXPLAIN SELECT
	c12, c22
	FROM
	t1 INNER JOIN t2
	ON c12 = c22;
	QUERY PLAN
	----------------------------------------------------------------------------------------------------
	Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..862.01 rows=75 width=8)
	-> Hash Join (cost=0.00..862.01 rows=25 width=8)
	Hash Cond: (t2.c22 = t1.c12)
	-> Seq Scan on t2 (cost=0.00..431.00 rows=9 width=4)
	-> Hash (cost=431.00..431.00 rows=25 width=4)
	-> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..431.00 rows=25 width=4)
	-> Seq Scan on t1 (cost=0.00..431.00 rows=9 width=4)
	Optimizer: Pivotal Optimizer (GPORCA)
	(8 rows)

	-- Verify ORCA chooses broadcast over redistribution for skewed multiple joins
	-- t1 has a skewed value (0,1) matching tuples (16,1) in t2
	-- the result of t1 ⋈ t2 also tends to skew
	-- start_ignore
	DROP TABLE t3;
	ERROR: table "t3" does not exist
	-- end_ignore
	CREATE TABLE t3 (
	c31 integer,
	c32 integer
	)
	DISTRIBUTED RANDOMLY;
	-- t3: 3 distinct values
	-- (0,1) 7 rows
	-- (1,2) 7 rows
	-- (2,3) 7 rows
	insert into t3 select 0,1 from generate_series(1,7);
	insert into t3 select 1,2 from generate_series(1,7);
	insert into t3 select 2,3 from generate_series(1,7);
	ANALYZE t3;
	-- Force ORCA to execute t1 ⋈ t2 first
	set optimizer_join_order = query;
	EXPLAIN SELECT
	c12, c22, c32
	FROM
	t1 INNER JOIN t2
	ON c12 = c22
	INNER JOIN t3
	ON c22 = c32;
	QUERY PLAN
	----------------------------------------------------------------------------------------------------------
	Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1293.05 rows=525 width=12)
	-> Hash Join (cost=0.00..1293.03 rows=175 width=12)
	Hash Cond: (t2.c22 = t3.c32)
	-> Hash Join (cost=0.00..862.01 rows=25 width=8)
	Hash Cond: (t1.c12 = t2.c22)
	-> Seq Scan on t1 (cost=0.00..431.00 rows=9 width=4)
	-> Hash (cost=431.00..431.00 rows=27 width=4)
	-> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..431.00 rows=27 width=4)
	-> Seq Scan on t2 (cost=0.00..431.00 rows=9 width=4)
	-> Hash (cost=431.00..431.00 rows=21 width=4)
	-> Broadcast Motion 3:3 (slice3; segments: 3) (cost=0.00..431.00 rows=21 width=4)
	-> Seq Scan on t3 (cost=0.00..431.00 rows=7 width=4)
	Optimizer: Pivotal Optimizer (GPORCA)
	(13 rows)

	-- Reset join order
	set optimizer_join_order = exhaustive2;
	-- Verify ORCA chooses redistribution over broadcast
	-- when the data skew isn't pronuounced
	-- start_ignore
	DROP TABLE t4;
	ERROR: table "t4" does not exist
	-- end_ignore
	CREATE TABLE t4 (
	c41 integer,
	c42 integer
	)
	DISTRIBUTED RANDOMLY;
	-- t4: 3 distinct values
	-- (3,2) 8 rows
	-- (4,3) 6 rows
	-- (5,4) 4 rows
	insert into t4 select 3,2 from generate_series(1,8);
	insert into t4 select 4,3 from generate_series(1,6);
	insert into t4 select 5,4 from generate_series(1,4);
	ANALYZE t4;
	EXPLAIN SELECT
	c32, c42
	FROM
	t3 INNER JOIN t4
	ON c32 = c42;
	QUERY PLAN
	------------------------------------------------------------------------------------------------------
	Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..862.01 rows=126 width=8)
	-> Hash Join (cost=0.00..862.01 rows=42 width=8)
	Hash Cond: (t3.c32 = t4.c42)
	-> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..431.00 rows=7 width=4)
	Hash Key: t3.c32
	-> Seq Scan on t3 (cost=0.00..431.00 rows=7 width=4)
	-> Hash (cost=431.00..431.00 rows=6 width=4)
	-> Redistribute Motion 3:3 (slice3; segments: 3) (cost=0.00..431.00 rows=6 width=4)
	Hash Key: t4.c42
	-> Seq Scan on t4 (cost=0.00..431.00 rows=6 width=4)
	Optimizer: Pivotal Optimizer (GPORCA)
	(11 rows)

	-- user option to emphasize data skew by multiplying
	-- the skew ratio with a larger-than 1 skew factor
	set optimizer_skew_factor = 25;
	EXPLAIN SELECT
	c32, c42
	FROM
	t3 INNER JOIN t4
	ON c32 = c42;
	QUERY PLAN
	----------------------------------------------------------------------------------------------------
	Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..862.01 rows=126 width=8)
	-> Hash Join (cost=0.00..862.01 rows=42 width=8)
	Hash Cond: (t3.c32 = t4.c42)
	-> Seq Scan on t3 (cost=0.00..431.00 rows=7 width=4)
	-> Hash (cost=431.00..431.00 rows=18 width=4)
	-> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..431.00 rows=18 width=4)
	-> Seq Scan on t4 (cost=0.00..431.00 rows=6 width=4)
	Optimizer: Pivotal Optimizer (GPORCA)
	(8 rows)

	-- reset skew factor
	set optimizer_skew_factor = 1;
	-- Currently ORCA doesn't introduce a shuffle motion
	-- before joining a table distributed by a skewed column
	-- To demonstrate that, we recreate t1 and t2 with the same
	-- data but distribute them by the c*1 column
	-- t1 skewed, distributed by c11
	-- t2 uniform, distributed by c21
	-- t1, t2 have ~25 rows
	-- start_ignore
	DROP TABLE t1;
	DROP TABLE t2;
	-- end_ignore
	CREATE TABLE t1 (
	c11 integer,
	c12 integer
	)
	DISTRIBUTED BY (c11);
	CREATE TABLE t2 (
	c21 integer,
	c22 integer
	)
	DISTRIBUTED BY (c21);
	-- t1: 6 distinct values
	-- segment 1:
	-- (0,1) 20 rows
	-- (1,2) 1 rows
	-- segment 2:
	-- (2,3) 1 rows
	-- (3,4) 1 rows
	-- segment 3:
	-- (5,6) 1 rows
	-- (6,7) 1 rows
	insert into t1 select 0,1 from generate_series(1,20);
	insert into t1 select 1,2;
	insert into t1 select 2,3;
	insert into t1 select 3,4;
	insert into t1 select 5,6;
	insert into t1 select 6,7;
	-- t2: 6 distinct values
	-- segment 1:
	-- (7,8) 3 rows
	-- (8,9) 3 rows
	-- (16,1) 3 rows
	-- segment 2:
	-- (9,10) 3 rows
	-- (10,11) 3 rows
	-- (17,2) 3 rows
	-- segment 3:
	-- (12,13) 3 rows
	-- (15,16) 3 rows
	-- (20,3) 3 rows
	insert into t2 select 7,8 from generate_series(1,3);
	insert into t2 select 8,9 from generate_series(1,3);
	insert into t2 select 9,10 from generate_series(1,3);
	insert into t2 select 10,11 from generate_series(1,3);
	insert into t2 select 12,13 from generate_series(1,3);
	insert into t2 select 15,16 from generate_series(1,3);
	insert into t2 select 16,1 from generate_series(1,3);
	insert into t2 select 17,2 from generate_series(1,3);
	insert into t2 select 20,3 from generate_series(1,3);
	ANALYZE t1;
	ANALYZE t2;
	EXPLAIN SELECT
	c12, c22
	FROM
	t1 INNER JOIN t2
	ON c12 = c22;
	QUERY PLAN
	----------------------------------------------------------------------------------------------------
	Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..862.01 rows=75 width=8)
	-> Hash Join (cost=0.00..862.01 rows=25 width=8)
	Hash Cond: (t2.c22 = t1.c12)
	-> Seq Scan on t2 (cost=0.00..431.00 rows=9 width=4)
	-> Hash (cost=431.00..431.00 rows=25 width=4)
	-> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..431.00 rows=25 width=4)
	-> Seq Scan on t1 (cost=0.00..431.00 rows=9 width=4)
	Optimizer: Pivotal Optimizer (GPORCA)
	(8 rows)

	reset optimizer_skew_factor;