blob: 45806b46f4e87ec768981ed7393556d6a17593ef [file] [log] [blame]
create schema orca_skew;
set optimizer_skew_factor = 1;
-- Verify ORCA chooses broadcast over redistribution for skewed join
-- t1 skewed, randomly distributed
-- t2 uniform, randomly distributed
-- t1, t2 have ~25 rows
-- start_ignore
DROP TABLE t1;
ERROR: table "t1" does not exist
DROP TABLE t2;
ERROR: table "t2" does not exist
-- end_ignore
CREATE TABLE t1 (
c11 integer,
c12 integer
)
DISTRIBUTED RANDOMLY;
CREATE TABLE t2 (
c21 integer,
c22 integer
)
DISTRIBUTED RANDOMLY;
-- t1: 6 distinct values
-- (0,1) 20 rows (skewed)
-- (1,2) 1 row
-- (2,3) 1 row
-- (3,4) 1 row
-- (5,6) 1 row
-- (6,7) 1 row
insert into t1 select 0,1 from generate_series(1,20);
insert into t1 select 1,2;
insert into t1 select 2,3;
insert into t1 select 3,4;
insert into t1 select 5,6;
insert into t1 select 6,7;
-- t2: 9 distinct values
-- (7,8) 3 rows
-- (8,9) 3 rows
-- (9,10) 3 rows
-- (10,11) 3 rows
-- (12,13) 3 rows
-- (15,16) 3 rows
-- (16,1) 3 rows (match)
-- (17,2) 3 rows (match)
-- (20,3) 3 rows (match)
insert into t2 select 7,8 from generate_series(1,3);
insert into t2 select 8,9 from generate_series(1,3);
insert into t2 select 9,10 from generate_series(1,3);
insert into t2 select 10,11 from generate_series(1,3);
insert into t2 select 12,13 from generate_series(1,3);
insert into t2 select 15,16 from generate_series(1,3);
insert into t2 select 16,1 from generate_series(1,3);
insert into t2 select 17,2 from generate_series(1,3);
insert into t2 select 20,3 from generate_series(1,3);
ANALYZE t1;
ANALYZE t2;
EXPLAIN SELECT
c12, c22
FROM
t1 INNER JOIN t2
ON c12 = c22;
QUERY PLAN
----------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..862.01 rows=75 width=8)
-> Hash Join (cost=0.00..862.01 rows=25 width=8)
Hash Cond: (t2.c22 = t1.c12)
-> Seq Scan on t2 (cost=0.00..431.00 rows=9 width=4)
-> Hash (cost=431.00..431.00 rows=25 width=4)
-> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..431.00 rows=25 width=4)
-> Seq Scan on t1 (cost=0.00..431.00 rows=9 width=4)
Optimizer: Pivotal Optimizer (GPORCA)
(8 rows)
-- Verify ORCA chooses broadcast over redistribution for skewed multiple joins
-- t1 has a skewed value (0,1) matching tuples (16,1) in t2
-- the result of t1 t2 also tends to skew
-- start_ignore
DROP TABLE t3;
ERROR: table "t3" does not exist
-- end_ignore
CREATE TABLE t3 (
c31 integer,
c32 integer
)
DISTRIBUTED RANDOMLY;
-- t3: 3 distinct values
-- (0,1) 7 rows
-- (1,2) 7 rows
-- (2,3) 7 rows
insert into t3 select 0,1 from generate_series(1,7);
insert into t3 select 1,2 from generate_series(1,7);
insert into t3 select 2,3 from generate_series(1,7);
ANALYZE t3;
-- Force ORCA to execute t1 t2 first
set optimizer_join_order = query;
EXPLAIN SELECT
c12, c22, c32
FROM
t1 INNER JOIN t2
ON c12 = c22
INNER JOIN t3
ON c22 = c32;
QUERY PLAN
----------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1293.05 rows=525 width=12)
-> Hash Join (cost=0.00..1293.03 rows=175 width=12)
Hash Cond: (t2.c22 = t3.c32)
-> Hash Join (cost=0.00..862.01 rows=25 width=8)
Hash Cond: (t1.c12 = t2.c22)
-> Seq Scan on t1 (cost=0.00..431.00 rows=9 width=4)
-> Hash (cost=431.00..431.00 rows=27 width=4)
-> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..431.00 rows=27 width=4)
-> Seq Scan on t2 (cost=0.00..431.00 rows=9 width=4)
-> Hash (cost=431.00..431.00 rows=21 width=4)
-> Broadcast Motion 3:3 (slice3; segments: 3) (cost=0.00..431.00 rows=21 width=4)
-> Seq Scan on t3 (cost=0.00..431.00 rows=7 width=4)
Optimizer: Pivotal Optimizer (GPORCA)
(13 rows)
-- Reset join order
set optimizer_join_order = exhaustive2;
-- Verify ORCA chooses redistribution over broadcast
-- when the data skew isn't pronuounced
-- start_ignore
DROP TABLE t4;
ERROR: table "t4" does not exist
-- end_ignore
CREATE TABLE t4 (
c41 integer,
c42 integer
)
DISTRIBUTED RANDOMLY;
-- t4: 3 distinct values
-- (3,2) 8 rows
-- (4,3) 6 rows
-- (5,4) 4 rows
insert into t4 select 3,2 from generate_series(1,8);
insert into t4 select 4,3 from generate_series(1,6);
insert into t4 select 5,4 from generate_series(1,4);
ANALYZE t4;
EXPLAIN SELECT
c32, c42
FROM
t3 INNER JOIN t4
ON c32 = c42;
QUERY PLAN
------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..862.01 rows=126 width=8)
-> Hash Join (cost=0.00..862.01 rows=42 width=8)
Hash Cond: (t3.c32 = t4.c42)
-> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..431.00 rows=7 width=4)
Hash Key: t3.c32
-> Seq Scan on t3 (cost=0.00..431.00 rows=7 width=4)
-> Hash (cost=431.00..431.00 rows=6 width=4)
-> Redistribute Motion 3:3 (slice3; segments: 3) (cost=0.00..431.00 rows=6 width=4)
Hash Key: t4.c42
-> Seq Scan on t4 (cost=0.00..431.00 rows=6 width=4)
Optimizer: Pivotal Optimizer (GPORCA)
(11 rows)
-- user option to emphasize data skew by multiplying
-- the skew ratio with a larger-than 1 skew factor
set optimizer_skew_factor = 25;
EXPLAIN SELECT
c32, c42
FROM
t3 INNER JOIN t4
ON c32 = c42;
QUERY PLAN
----------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..862.01 rows=126 width=8)
-> Hash Join (cost=0.00..862.01 rows=42 width=8)
Hash Cond: (t3.c32 = t4.c42)
-> Seq Scan on t3 (cost=0.00..431.00 rows=7 width=4)
-> Hash (cost=431.00..431.00 rows=18 width=4)
-> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..431.00 rows=18 width=4)
-> Seq Scan on t4 (cost=0.00..431.00 rows=6 width=4)
Optimizer: Pivotal Optimizer (GPORCA)
(8 rows)
-- reset skew factor
set optimizer_skew_factor = 1;
-- Currently ORCA doesn't introduce a shuffle motion
-- before joining a table distributed by a skewed column
-- To demonstrate that, we recreate t1 and t2 with the same
-- data but distribute them by the c*1 column
-- t1 skewed, distributed by c11
-- t2 uniform, distributed by c21
-- t1, t2 have ~25 rows
-- start_ignore
DROP TABLE t1;
DROP TABLE t2;
-- end_ignore
CREATE TABLE t1 (
c11 integer,
c12 integer
)
DISTRIBUTED BY (c11);
CREATE TABLE t2 (
c21 integer,
c22 integer
)
DISTRIBUTED BY (c21);
-- t1: 6 distinct values
-- segment 1:
-- (0,1) 20 rows
-- (1,2) 1 rows
-- segment 2:
-- (2,3) 1 rows
-- (3,4) 1 rows
-- segment 3:
-- (5,6) 1 rows
-- (6,7) 1 rows
insert into t1 select 0,1 from generate_series(1,20);
insert into t1 select 1,2;
insert into t1 select 2,3;
insert into t1 select 3,4;
insert into t1 select 5,6;
insert into t1 select 6,7;
-- t2: 6 distinct values
-- segment 1:
-- (7,8) 3 rows
-- (8,9) 3 rows
-- (16,1) 3 rows
-- segment 2:
-- (9,10) 3 rows
-- (10,11) 3 rows
-- (17,2) 3 rows
-- segment 3:
-- (12,13) 3 rows
-- (15,16) 3 rows
-- (20,3) 3 rows
insert into t2 select 7,8 from generate_series(1,3);
insert into t2 select 8,9 from generate_series(1,3);
insert into t2 select 9,10 from generate_series(1,3);
insert into t2 select 10,11 from generate_series(1,3);
insert into t2 select 12,13 from generate_series(1,3);
insert into t2 select 15,16 from generate_series(1,3);
insert into t2 select 16,1 from generate_series(1,3);
insert into t2 select 17,2 from generate_series(1,3);
insert into t2 select 20,3 from generate_series(1,3);
ANALYZE t1;
ANALYZE t2;
EXPLAIN SELECT
c12, c22
FROM
t1 INNER JOIN t2
ON c12 = c22;
QUERY PLAN
----------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..862.01 rows=75 width=8)
-> Hash Join (cost=0.00..862.01 rows=25 width=8)
Hash Cond: (t2.c22 = t1.c12)
-> Seq Scan on t2 (cost=0.00..431.00 rows=9 width=4)
-> Hash (cost=431.00..431.00 rows=25 width=4)
-> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..431.00 rows=25 width=4)
-> Seq Scan on t1 (cost=0.00..431.00 rows=9 width=4)
Optimizer: Pivotal Optimizer (GPORCA)
(8 rows)
reset optimizer_skew_factor;