blob: d0228951730b7340578eb7117ac74fbde036b04e [file] [log] [blame]
--
-- Test correlated subquery in subplan with motion chooses correct scan type
--
-- Given I have two distributed tables
create table choose_seqscan_t1(id1 int,id2 int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'id1' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
create table choose_seqscan_t2(id1 int,id2 int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'id1' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
-- and they have some data
insert into choose_seqscan_t1 select i+1,i from generate_series(1,50)i;
insert into choose_seqscan_t2 select i+1,i from generate_series(1,50)i;
-- and one of the tables has an index on a column which is not the distribution column
create index bidx2 on choose_seqscan_t2(id2);
-- and the statistics reflect the newly inserted data
analyze choose_seqscan_t1; analyze choose_seqscan_t2;
-- making an indexscan cheaper with this GUC is only necessary with this small dataset
-- if you insert more data, you can still ensure an indexscan is considered
set random_page_cost=1;
set seq_page_cost=5;
-- and I query the table with the index from inside a subquery which will be pulled up inside of a subquery that will stay a subplan
select (select id1 from (select * from choose_seqscan_t2) foo where id2=choose_seqscan_t1.id2) from choose_seqscan_t1 order by id1;
id1
-----
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
(50 rows)
explain select (select id1 from (select * from choose_seqscan_t2) foo where id2=choose_seqscan_t1.id2) from choose_seqscan_t1;
QUERY PLAN
---------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice2; segments: 3) (cost=0.00..184.75 rows=50 width=4)
-> Seq Scan on choose_seqscan_t1 (cost=0.00..184.75 rows=17 width=4)
SubPlan 1
-> Result (cost=0.00..3.63 rows=1 width=4)
Filter: (choose_seqscan_t2.id2 = choose_seqscan_t1.id2)
-> Materialize (cost=0.00..3.63 rows=1 width=4)
-> Broadcast Motion 3:3 (slice1; segments: 3) (cost=0.00..3.62 rows=1 width=4)
-> Seq Scan on choose_seqscan_t2 (cost=0.00..3.62 rows=1 width=4)
Optimizer: Postgres query optimizer
(9 rows)
-- then, a sequential scan is chosen because I need a motion to move choose_seqscan_t2
-- Index Scan can be used on quals that don't depend on the correlation vars, however.
select t1.id1, (select count(*) from choose_seqscan_t2 t2 where t2.id1 = t1.id1 and t2.id2 = 1) from choose_seqscan_t1 t1 where t1.id1 < 10;
id1 | count
-----+-------
5 | 0
6 | 0
9 | 0
2 | 1
3 | 0
4 | 0
7 | 0
8 | 0
(8 rows)
explain select t1.id1, (select count(*) from choose_seqscan_t2 t2 where t2.id1 = t1.id1 and t2.id2 = 1) from choose_seqscan_t1 t1 where t1.id1 < 10;
QUERY PLAN
------------------------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..21.62 rows=9 width=12)
-> Seq Scan on choose_seqscan_t1 t1 (cost=0.00..21.62 rows=3 width=12)
Filter: (id1 < 10)
SubPlan 1
-> Aggregate (cost=2.19..2.20 rows=1 width=8)
-> Result (cost=0.14..2.19 rows=1 width=0)
Filter: (t2.id1 = t1.id1)
-> Materialize (cost=0.14..2.18 rows=1 width=4)
-> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.14..2.18 rows=1 width=4)
-> Index Scan using bidx2 on choose_seqscan_t2 t2 (cost=0.14..2.16 rows=1 width=4)
Index Cond: (id2 = 1)
Optimizer: Postgres query optimizer
(12 rows)
-- Test using a join within the subplan. It could perhaps use an Nested Loop Join +
-- Index Scan to do the join, but at the moment, the planner doesn't consider distributing
-- the Function Scan.
select t1.id1, (select count(*) from generate_series(1,5) g, choose_seqscan_t2 t2 where t1.id1 = t2.id1 and t2.id2 = g) from choose_seqscan_t1 t1 where t1.id1 < 10;
id1 | count
-----+-------
2 | 1
3 | 1
4 | 1
7 | 0
8 | 0
5 | 1
6 | 1
9 | 0
(8 rows)
explain select t1.id1, (select count(*) from generate_series(1,5) g, choose_seqscan_t2 t2 where t1.id1 = t2.id1 and t2.id2 = g) from choose_seqscan_t1 t1 where t1.id1 < 10;
QUERY PLAN
----------------------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..43.72 rows=8 width=12)
-> Seq Scan on choose_seqscan_t1 t1 (cost=0.00..43.57 rows=3 width=12)
Filter: (id1 < 10)
SubPlan 1
-> Aggregate (cost=5.57..5.58 rows=1 width=8)
-> Hash Join (cost=0.12..5.55 rows=2 width=0)
Hash Cond: (t2.id2 = g.g)
-> Result (cost=0.00..5.25 rows=17 width=8)
Filter: (t1.id1 = t2.id1)
-> Materialize (cost=0.00..4.75 rows=17 width=8)
-> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..4.50 rows=17 width=8)
-> Seq Scan on choose_seqscan_t2 t2 (cost=0.00..3.50 rows=17 width=8)
-> Hash (cost=0.05..0.05 rows=2 width=4)
-> Function Scan on generate_series g (cost=0.00..0.05 rows=2 width=4)
Optimizer: Postgres query optimizer
(15 rows)
-- Similar, but use a real table. One possible plan for the subplan here would be to do the join
-- first, and then filter the join result based on the correlation qual "t1.id1 = t2.id1". But
-- the planner isn't smart enough to generate that plan, currently.
create table choose_seqscan_t3(id1 int,id2 int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'id1' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
create index bidx3 on choose_seqscan_t3(id1);
insert into choose_seqscan_t3 select i+1,i from generate_series(1,50)i;
analyze choose_seqscan_t3;
select t1.id1, (select count(*) from choose_seqscan_t3 t3, choose_seqscan_t2 t2 where t1.id1 = t2.id1 and t3.id1 = t2.id1) from choose_seqscan_t1 t1 where t1.id1 < 10;
id1 | count
-----+-------
5 | 1
6 | 1
9 | 1
2 | 1
3 | 1
4 | 1
7 | 1
8 | 1
(8 rows)
explain select t1.id1, (select count(*) from choose_seqscan_t3 t3, choose_seqscan_t2 t2 where t1.id1 = t2.id1 and t3.id1 = t2.id1) from choose_seqscan_t1 t1 where t1.id1 < 10;
QUERY PLAN
----------------------------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..81632653449.62 rows=9 width=8)
-> Seq Scan on choose_seqscan_t1 t1 (cost=0.00..81632653449.62 rows=3 width=8)
Filter: (id1 < 10)
SubPlan 1
-> Aggregate (cost=10000000047.12..10000000047.14 rows=1 width=8)
-> Nested Loop (cost=10000000000.00..10000000040.88 rows=2500 width=0)
-> Result (cost=0.00..4.75 rows=50 width=4)
Filter: (t3.id1 = t1.id1)
-> Materialize (cost=0.00..4.75 rows=50 width=4)
-> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..4.50 rows=17 width=4)
-> Seq Scan on choose_seqscan_t3 t3 (cost=0.00..3.50 rows=17 width=4)
-> Materialize (cost=0.00..5.00 rows=50 width=4)
-> Result (cost=0.00..4.75 rows=50 width=4)
Filter: (t2.id1 = t1.id1)
-> Materialize (cost=0.00..4.75 rows=50 width=4)
-> Broadcast Motion 3:3 (slice3; segments: 3) (cost=0.00..4.50 rows=17 width=4)
-> Seq Scan on choose_seqscan_t2 t2 (cost=0.00..3.50 rows=17 width=4)
Optimizer: Postgres query optimizer
(18 rows)
-- start_ignore
drop table if exists choose_seqscan_t1;
drop table if exists choose_seqscan_t2;
-- end_ignore
-- Given I have one replicated table
create table choose_indexscan_t1(id1 int, id2 int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'id1' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
create table choose_indexscan_t2(id1 int, id2 int) distributed replicated;
-- and it has data
insert into choose_indexscan_t1 select i+1, i from generate_series(1,20)i;
insert into choose_indexscan_t2 select i+1, i from generate_series(1,100)i;
-- and the replicated table has an index on a column which is not the distribution key
create index choose_indexscan_t2_idx on choose_indexscan_t2(id2);
-- and the statistics reflect the newly inserted data
analyze choose_indexscan_t1; analyze choose_indexscan_t2;
-- making an indexscan cheaper with this GUC is only necessary with this small dataset
-- if you insert more data, you can still ensure an indexscan is considered
set random_page_cost=1;
-- and I query the table with the index from inside a subquery which will be pulled up inside of a subquery that will stay a subplan
select (select id1 from (select * from choose_indexscan_t2) foo where id2=choose_indexscan_t1.id2) from choose_indexscan_t1 order by id1;
id1
-----
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
(20 rows)
-- start_ignore
-- GPDB_96_MERGE_FIXME: we no longer choose an index scan here, because the planner cannot
-- easily determine that it's safe. See https://github.com/greenplum-db/gpdb/issues/8648.
-- I hope we can get that back with the 9.6 upper planner pathification.
-- end_ignore
explain select (select id1 from (select * from choose_indexscan_t2) foo where id2=choose_indexscan_t1.id2) from choose_indexscan_t1;
QUERY PLAN
-----------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..93.20 rows=20 width=4)
-> Seq Scan on choose_indexscan_t1 (cost=0.00..93.20 rows=7 width=4)
SubPlan 1
-> Result (cost=0.00..4.50 rows=100 width=8)
Filter: (choose_indexscan_t2.id2 = choose_indexscan_t1.id2)
-> Materialize (cost=0.00..4.50 rows=100 width=8)
-> Broadcast Motion 1:3 (slice2; segments: 1) (cost=0.00..4.00 rows=100 width=8)
-> Seq Scan on choose_indexscan_t2 (cost=0.00..2.00 rows=100 width=8)
Optimizer: Postgres query optimizer
(9 rows)
-- then an indexscan is chosen because it is correct to do this on a replicated table since no motion is required
-- Test that Motions are added when you mix replicated tables and catalog
-- tables in the same query. A replicated table is available on all segments,
-- but *not* on the QD node, so we need a motion for this, because the catalog
-- table is scanned in the QD. (Catalog tables are present with same contents
-- on all segments, too, so we could alternatively perform scan the catalog
-- table oon one of the segments.)
-- https://github.com/greenplum-db/gpdb/issues/8648
create table mytables (tablename text, explanation text) distributed replicated;
insert into mytables values ('pg_class', 'contains all relations');
create index on mytables(tablename);
select c.relname, (select explanation from mytables mt where mt.tablename=c.relname ) from pg_class c where relname = 'pg_class';
relname | explanation
----------+------------------------
pg_class | contains all relations
(1 row)
set enable_seqscan=off;
explain select c.relname, (select explanation from mytables mt where mt.tablename=c.relname ) from pg_class c where relname = 'pg_class';
QUERY PLAN
---------------------------------------------------------------------------------------------------------------------
Index Only Scan using pg_class_relname_nsp_index on pg_class c (cost=0.15..10000000002.20 rows=1 width=64)
Index Cond: (relname = 'pg_class'::name)
SubPlan 1
-> Result (cost=10000000000.00..10000000001.03 rows=1 width=32)
Filter: (mt.tablename = (c.relname)::text)
-> Materialize (cost=10000000000.00..10000000001.03 rows=1 width=32)
-> Gather Motion 1:1 (slice1; segments: 1) (cost=10000000000.00..10000000001.03 rows=1 width=32)
-> Seq Scan on mytables mt (cost=10000000000.00..10000000001.01 rows=1 width=32)
Optimizer: Postgres query optimizer
(9 rows)
select c.relname, (select explanation from mytables mt where mt.tablename=c.relname ) from pg_class c where relname = 'pg_class';
relname | explanation
----------+------------------------
pg_class | contains all relations
(1 row)
reset enable_seqscan;
-- start_ignore
drop table if exists choose_indexscan_t1;
drop table if exists choose_indexscan_t2;
-- end_ignore