| -- |
| -- Test correlated subquery in subplan with motion chooses correct scan type |
| -- |
| -- Given I have two distributed tables |
| create table choose_seqscan_t1(id1 int,id2 int); |
| NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'id1' as the Apache Cloudberry data distribution key for this table. |
| HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. |
| create table choose_seqscan_t2(id1 int,id2 int); |
| NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'id1' as the Apache Cloudberry data distribution key for this table. |
| HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. |
| -- and they have some data |
| insert into choose_seqscan_t1 select i+1,i from generate_series(1,50)i; |
| insert into choose_seqscan_t2 select i+1,i from generate_series(1,50)i; |
| -- and one of the tables has an index on a column which is not the distribution column |
| create index bidx2 on choose_seqscan_t2(id2); |
| -- and the statistics reflect the newly inserted data |
| analyze choose_seqscan_t1; analyze choose_seqscan_t2; |
| -- making an indexscan cheaper with this GUC is only necessary with this small dataset |
| -- if you insert more data, you can still ensure an indexscan is considered |
| set random_page_cost=1; |
| set seq_page_cost=5; |
| -- and I query the table with the index from inside a subquery which will be pulled up inside of a subquery that will stay a subplan |
| select (select id1 from (select * from choose_seqscan_t2) foo where id2=choose_seqscan_t1.id2) from choose_seqscan_t1 order by id1; |
| id1 |
| ----- |
| 2 |
| 3 |
| 4 |
| 5 |
| 6 |
| 7 |
| 8 |
| 9 |
| 10 |
| 11 |
| 12 |
| 13 |
| 14 |
| 15 |
| 16 |
| 17 |
| 18 |
| 19 |
| 20 |
| 21 |
| 22 |
| 23 |
| 24 |
| 25 |
| 26 |
| 27 |
| 28 |
| 29 |
| 30 |
| 31 |
| 32 |
| 33 |
| 34 |
| 35 |
| 36 |
| 37 |
| 38 |
| 39 |
| 40 |
| 41 |
| 42 |
| 43 |
| 44 |
| 45 |
| 46 |
| 47 |
| 48 |
| 49 |
| 50 |
| 51 |
| (50 rows) |
| |
| explain select (select id1 from (select * from choose_seqscan_t2) foo where id2=choose_seqscan_t1.id2) from choose_seqscan_t1; |
| QUERY PLAN |
| --------------------------------------------------------------------------------------------------------- |
| Gather Motion 3:1 (slice2; segments: 3) (cost=0.00..184.75 rows=50 width=4) |
| -> Seq Scan on choose_seqscan_t1 (cost=0.00..184.75 rows=17 width=4) |
| SubPlan 1 |
| -> Result (cost=0.00..3.63 rows=1 width=4) |
| Filter: (choose_seqscan_t2.id2 = choose_seqscan_t1.id2) |
| -> Materialize (cost=0.00..3.63 rows=1 width=4) |
| -> Broadcast Motion 3:3 (slice1; segments: 3) (cost=0.00..3.62 rows=1 width=4) |
| -> Seq Scan on choose_seqscan_t2 (cost=0.00..3.62 rows=1 width=4) |
| Optimizer: Postgres query optimizer |
| (9 rows) |
| |
| -- then, a sequential scan is chosen because I need a motion to move choose_seqscan_t2 |
| -- Index Scan can be used on quals that don't depend on the correlation vars, however. |
| select t1.id1, (select count(*) from choose_seqscan_t2 t2 where t2.id1 = t1.id1 and t2.id2 = 1) from choose_seqscan_t1 t1 where t1.id1 < 10; |
| id1 | count |
| -----+------- |
| 5 | 0 |
| 6 | 0 |
| 9 | 0 |
| 2 | 1 |
| 3 | 0 |
| 4 | 0 |
| 7 | 0 |
| 8 | 0 |
| (8 rows) |
| |
| explain select t1.id1, (select count(*) from choose_seqscan_t2 t2 where t2.id1 = t1.id1 and t2.id2 = 1) from choose_seqscan_t1 t1 where t1.id1 < 10; |
| QUERY PLAN |
| ------------------------------------------------------------------------------------------------------------------------ |
| Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..21.62 rows=9 width=12) |
| -> Seq Scan on choose_seqscan_t1 t1 (cost=0.00..21.62 rows=3 width=12) |
| Filter: (id1 < 10) |
| SubPlan 1 |
| -> Aggregate (cost=2.19..2.20 rows=1 width=8) |
| -> Result (cost=0.14..2.19 rows=1 width=0) |
| Filter: (t2.id1 = t1.id1) |
| -> Materialize (cost=0.14..2.18 rows=1 width=4) |
| -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.14..2.18 rows=1 width=4) |
| -> Index Scan using bidx2 on choose_seqscan_t2 t2 (cost=0.14..2.16 rows=1 width=4) |
| Index Cond: (id2 = 1) |
| Optimizer: Postgres query optimizer |
| (12 rows) |
| |
| -- Test using a join within the subplan. It could perhaps use an Nested Loop Join + |
| -- Index Scan to do the join, but at the moment, the planner doesn't consider distributing |
| -- the Function Scan. |
| select t1.id1, (select count(*) from generate_series(1,5) g, choose_seqscan_t2 t2 where t1.id1 = t2.id1 and t2.id2 = g) from choose_seqscan_t1 t1 where t1.id1 < 10; |
| id1 | count |
| -----+------- |
| 2 | 1 |
| 3 | 1 |
| 4 | 1 |
| 7 | 0 |
| 8 | 0 |
| 5 | 1 |
| 6 | 1 |
| 9 | 0 |
| (8 rows) |
| |
| explain select t1.id1, (select count(*) from generate_series(1,5) g, choose_seqscan_t2 t2 where t1.id1 = t2.id1 and t2.id2 = g) from choose_seqscan_t1 t1 where t1.id1 < 10; |
| QUERY PLAN |
| ---------------------------------------------------------------------------------------------------------------------- |
| Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..43.72 rows=8 width=12) |
| -> Seq Scan on choose_seqscan_t1 t1 (cost=0.00..43.57 rows=3 width=12) |
| Filter: (id1 < 10) |
| SubPlan 1 |
| -> Aggregate (cost=5.57..5.58 rows=1 width=8) |
| -> Hash Join (cost=0.12..5.55 rows=2 width=0) |
| Hash Cond: (t2.id2 = g.g) |
| -> Result (cost=0.00..5.25 rows=17 width=8) |
| Filter: (t1.id1 = t2.id1) |
| -> Materialize (cost=0.00..4.75 rows=17 width=8) |
| -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..4.50 rows=17 width=8) |
| -> Seq Scan on choose_seqscan_t2 t2 (cost=0.00..3.50 rows=17 width=8) |
| -> Hash (cost=0.05..0.05 rows=2 width=4) |
| -> Function Scan on generate_series g (cost=0.00..0.05 rows=2 width=4) |
| Optimizer: Postgres query optimizer |
| (15 rows) |
| |
| -- Similar, but use a real table. One possible plan for the subplan here would be to do the join |
| -- first, and then filter the join result based on the correlation qual "t1.id1 = t2.id1". But |
| -- the planner isn't smart enough to generate that plan, currently. |
| create table choose_seqscan_t3(id1 int,id2 int); |
| NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'id1' as the Apache Cloudberry data distribution key for this table. |
| HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. |
| create index bidx3 on choose_seqscan_t3(id1); |
| insert into choose_seqscan_t3 select i+1,i from generate_series(1,50)i; |
| analyze choose_seqscan_t3; |
| select t1.id1, (select count(*) from choose_seqscan_t3 t3, choose_seqscan_t2 t2 where t1.id1 = t2.id1 and t3.id1 = t2.id1) from choose_seqscan_t1 t1 where t1.id1 < 10; |
| id1 | count |
| -----+------- |
| 5 | 1 |
| 6 | 1 |
| 9 | 1 |
| 2 | 1 |
| 3 | 1 |
| 4 | 1 |
| 7 | 1 |
| 8 | 1 |
| (8 rows) |
| |
| explain select t1.id1, (select count(*) from choose_seqscan_t3 t3, choose_seqscan_t2 t2 where t1.id1 = t2.id1 and t3.id1 = t2.id1) from choose_seqscan_t1 t1 where t1.id1 < 10; |
| QUERY PLAN |
| ---------------------------------------------------------------------------------------------------------------------------- |
| Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..81632653449.62 rows=9 width=8) |
| -> Seq Scan on choose_seqscan_t1 t1 (cost=0.00..81632653449.62 rows=3 width=8) |
| Filter: (id1 < 10) |
| SubPlan 1 |
| -> Aggregate (cost=10000000047.12..10000000047.14 rows=1 width=8) |
| -> Nested Loop (cost=10000000000.00..10000000040.88 rows=2500 width=0) |
| -> Result (cost=0.00..4.75 rows=50 width=4) |
| Filter: (t3.id1 = t1.id1) |
| -> Materialize (cost=0.00..4.75 rows=50 width=4) |
| -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..4.50 rows=17 width=4) |
| -> Seq Scan on choose_seqscan_t3 t3 (cost=0.00..3.50 rows=17 width=4) |
| -> Materialize (cost=0.00..5.00 rows=50 width=4) |
| -> Result (cost=0.00..4.75 rows=50 width=4) |
| Filter: (t2.id1 = t1.id1) |
| -> Materialize (cost=0.00..4.75 rows=50 width=4) |
| -> Broadcast Motion 3:3 (slice3; segments: 3) (cost=0.00..4.50 rows=17 width=4) |
| -> Seq Scan on choose_seqscan_t2 t2 (cost=0.00..3.50 rows=17 width=4) |
| Optimizer: Postgres query optimizer |
| (18 rows) |
| |
| -- start_ignore |
| drop table if exists choose_seqscan_t1; |
| drop table if exists choose_seqscan_t2; |
| -- end_ignore |
| -- Given I have one replicated table |
| create table choose_indexscan_t1(id1 int, id2 int); |
| NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'id1' as the Apache Cloudberry data distribution key for this table. |
| HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. |
| create table choose_indexscan_t2(id1 int, id2 int) distributed replicated; |
| -- and it has data |
| insert into choose_indexscan_t1 select i+1, i from generate_series(1,20)i; |
| insert into choose_indexscan_t2 select i+1, i from generate_series(1,100)i; |
| -- and the replicated table has an index on a column which is not the distribution key |
| create index choose_indexscan_t2_idx on choose_indexscan_t2(id2); |
| -- and the statistics reflect the newly inserted data |
| analyze choose_indexscan_t1; analyze choose_indexscan_t2; |
| -- making an indexscan cheaper with this GUC is only necessary with this small dataset |
| -- if you insert more data, you can still ensure an indexscan is considered |
| set random_page_cost=1; |
| -- and I query the table with the index from inside a subquery which will be pulled up inside of a subquery that will stay a subplan |
| select (select id1 from (select * from choose_indexscan_t2) foo where id2=choose_indexscan_t1.id2) from choose_indexscan_t1 order by id1; |
| id1 |
| ----- |
| 2 |
| 3 |
| 4 |
| 5 |
| 6 |
| 7 |
| 8 |
| 9 |
| 10 |
| 11 |
| 12 |
| 13 |
| 14 |
| 15 |
| 16 |
| 17 |
| 18 |
| 19 |
| 20 |
| 21 |
| (20 rows) |
| |
| -- start_ignore |
| -- GPDB_96_MERGE_FIXME: we no longer choose an index scan here, because the planner cannot |
| -- easily determine that it's safe. See https://github.com/greenplum-db/gpdb/issues/8648. |
| -- I hope we can get that back with the 9.6 upper planner pathification. |
| -- end_ignore |
| explain select (select id1 from (select * from choose_indexscan_t2) foo where id2=choose_indexscan_t1.id2) from choose_indexscan_t1; |
| QUERY PLAN |
| ----------------------------------------------------------------------------------------------------------- |
| Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..93.20 rows=20 width=4) |
| -> Seq Scan on choose_indexscan_t1 (cost=0.00..93.20 rows=7 width=4) |
| SubPlan 1 |
| -> Result (cost=0.00..4.50 rows=100 width=8) |
| Filter: (choose_indexscan_t2.id2 = choose_indexscan_t1.id2) |
| -> Materialize (cost=0.00..4.50 rows=100 width=8) |
| -> Broadcast Motion 1:3 (slice2; segments: 1) (cost=0.00..4.00 rows=100 width=8) |
| -> Seq Scan on choose_indexscan_t2 (cost=0.00..2.00 rows=100 width=8) |
| Optimizer: Postgres query optimizer |
| (9 rows) |
| |
| -- then an indexscan is chosen because it is correct to do this on a replicated table since no motion is required |
| -- Test that Motions are added when you mix replicated tables and catalog |
| -- tables in the same query. A replicated table is available on all segments, |
| -- but *not* on the QD node, so we need a motion for this, because the catalog |
| -- table is scanned in the QD. (Catalog tables are present with same contents |
| -- on all segments, too, so we could alternatively perform scan the catalog |
| -- table oon one of the segments.) |
| -- https://github.com/greenplum-db/gpdb/issues/8648 |
| create table mytables (tablename text, explanation text) distributed replicated; |
| insert into mytables values ('pg_class', 'contains all relations'); |
| create index on mytables(tablename); |
| select c.relname, (select explanation from mytables mt where mt.tablename=c.relname ) from pg_class c where relname = 'pg_class'; |
| relname | explanation |
| ----------+------------------------ |
| pg_class | contains all relations |
| (1 row) |
| |
| set enable_seqscan=off; |
| explain select c.relname, (select explanation from mytables mt where mt.tablename=c.relname ) from pg_class c where relname = 'pg_class'; |
| QUERY PLAN |
| --------------------------------------------------------------------------------------------------------------------- |
| Index Only Scan using pg_class_relname_nsp_index on pg_class c (cost=0.15..10000000002.20 rows=1 width=64) |
| Index Cond: (relname = 'pg_class'::name) |
| SubPlan 1 |
| -> Result (cost=10000000000.00..10000000001.03 rows=1 width=32) |
| Filter: (mt.tablename = (c.relname)::text) |
| -> Materialize (cost=10000000000.00..10000000001.03 rows=1 width=32) |
| -> Gather Motion 1:1 (slice1; segments: 1) (cost=10000000000.00..10000000001.03 rows=1 width=32) |
| -> Seq Scan on mytables mt (cost=10000000000.00..10000000001.01 rows=1 width=32) |
| Optimizer: Postgres query optimizer |
| (9 rows) |
| |
| select c.relname, (select explanation from mytables mt where mt.tablename=c.relname ) from pg_class c where relname = 'pg_class'; |
| relname | explanation |
| ----------+------------------------ |
| pg_class | contains all relations |
| (1 row) |
| |
| reset enable_seqscan; |
| -- start_ignore |
| drop table if exists choose_indexscan_t1; |
| drop table if exists choose_indexscan_t2; |
| -- end_ignore |