| -- |
| -- Queries that lead to hanging (not dead lock) when we don't handle synchronization properly in shared scan |
| -- Queries that lead to wrong result when we don't finish executing the subtree below the shared scan being squelched. |
| -- |
| CREATE SCHEMA shared_scan; |
| SET search_path = shared_scan; |
| CREATE TABLE foo (a int, b int); |
| NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Apache Cloudberry data distribution key for this table. |
| HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. |
| CREATE TABLE bar (c int, d int); |
| NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c' as the Apache Cloudberry data distribution key for this table. |
| HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. |
| CREATE TABLE jazz(e int, f int); |
| NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'e' as the Apache Cloudberry data distribution key for this table. |
| HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. |
| INSERT INTO foo values (1, 2); |
| INSERT INTO bar SELECT i, i from generate_series(1, 100)i; |
| INSERT INTO jazz VALUES (2, 2), (3, 3); |
| ANALYZE foo; |
| ANALYZE bar; |
| ANALYZE jazz; |
| SELECT $query$ |
| SELECT * FROM |
| ( |
| WITH cte AS (SELECT * FROM foo) |
| SELECT * FROM (SELECT * FROM cte UNION ALL SELECT * FROM cte) |
| AS X |
| JOIN bar ON b = c |
| ) AS XY |
| JOIN jazz on c = e AND b = f; |
| $query$ AS qry \gset |
| -- We are very particular about this plan shape and data distribution with ORCA: |
| -- 1. `jazz` has to be the inner table of the outer HASH JOIN, so that on a |
| -- segment which has zero tuples in `jazz`, the Sequence node that contains the |
| -- Shared Scan will be squelched on that segment. If `jazz` is not on the inner |
| -- side, the above mentioned "hang" scenario will not be covered. |
| -- 2. The Shared Scan producer has to be on a different slice from consumers, |
| -- and some tuples coming out of the Share Scan producer on one segments are |
| -- redistributed to a different segment over Motion. If not, the above mentioned |
| -- "wrong result" scenario will not be covered. |
| EXPLAIN (COSTS OFF) |
| :qry ; |
| QUERY PLAN |
| ------------------------------------------------------------------------ |
| Gather Motion 3:1 (slice1; segments: 3) |
| -> Hash Join |
| Hash Cond: (bar.c = jazz.e) |
| -> Sequence |
| -> Shared Scan (share slice:id 1:0) |
| -> Redistribute Motion 3:3 (slice2; segments: 3) |
| Hash Key: foo.b |
| -> Seq Scan on foo |
| -> Hash Join |
| Hash Cond: (bar.c = share0_ref2.b) |
| -> Seq Scan on bar |
| -> Hash |
| -> Append |
| -> Shared Scan (share slice:id 1:0) |
| -> Shared Scan (share slice:id 1:0) |
| -> Hash |
| -> Seq Scan on jazz |
| Filter: (e = f) |
| Optimizer: Pivotal Optimizer (GPORCA) |
| (19 rows) |
| |
| SET statement_timeout = '15s'; |
| :qry ; |
| a | b | c | d | e | f |
| ---+---+---+---+---+--- |
| 1 | 2 | 2 | 2 | 2 | 2 |
| 1 | 2 | 2 | 2 | 2 | 2 |
| (2 rows) |
| |
| RESET statement_timeout; |
| SELECT COUNT(*) |
| FROM (SELECT *, |
| ( |
| WITH cte AS (SELECT * FROM jazz WHERE jazz.e = bar.c) |
| SELECT 1 FROM cte c1, cte c2 |
| ) |
| FROM bar) as s; |
| count |
| ------- |
| 100 |
| (1 row) |
| |
| CREATE TABLE t1 (a int, b int); |
| NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Apache Cloudberry data distribution key for this table. |
| HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. |
| CREATE TABLE t2 (a int); |
| NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Apache Cloudberry data distribution key for this table. |
| HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. |
| -- ORCA plan contains a Shared Scan producer with a unsorted Motion below it |
| EXPLAIN (COSTS OFF) |
| WITH cte AS (SELECT * FROM t1 WHERE random() < 0.1 LIMIT 10) SELECT a, 1, 1 FROM cte JOIN t2 USING (a); |
| QUERY PLAN |
| ------------------------------------------------------------------------ |
| Sequence |
| -> Shared Scan (share slice:id 0:0) |
| -> Limit |
| -> Gather Motion 3:1 (slice1; segments: 3) |
| -> Seq Scan on t1 |
| Filter: (random() < '0.1'::double precision) |
| -> Result |
| -> Gather Motion 3:1 (slice2; segments: 3) |
| -> Hash Join |
| Hash Cond: (share0_ref2.a = t2.a) |
| -> Redistribute Motion 1:3 (slice3) |
| Hash Key: share0_ref2.a |
| -> Result |
| -> Shared Scan (share slice:id 3:0) |
| -> Hash |
| -> Seq Scan on t2 |
| Optimizer: Pivotal Optimizer (GPORCA) |
| (17 rows) |
| |
| -- This functions returns one more column than expected. |
| CREATE OR REPLACE FUNCTION col_mismatch_func1() RETURNS TABLE (field1 int, field2 int) |
| LANGUAGE 'plpgsql' VOLATILE STRICT AS |
| $$ |
| DECLARE |
| v_qry text; |
| BEGIN |
| v_qry := 'WITH cte AS (SELECT * FROM t1 WHERE random() < 0.1 LIMIT 10) SELECT a, 1 , 1 FROM cte JOIN t2 USING (a)'; |
| RETURN QUERY EXECUTE v_qry; |
| END |
| $$; |
| -- This should only ERROR and should not SIGSEGV |
| SELECT col_mismatch_func1(); |
| ERROR: structure of query does not match function result type |
| DETAIL: Number of returned columns (3) does not match expected column count (2). |
| CONTEXT: SQL statement "WITH cte AS (SELECT * FROM t1 WHERE random() < 0.1 LIMIT 10) SELECT a, 1 , 1 FROM cte JOIN t2 USING (a)" |
| PL/pgSQL function col_mismatch_func1() line 6 at RETURN QUERY |
| -- ORCA plan contains a Shared Scan producer with a sorted Motion below it |
| EXPLAIN (COSTS OFF) |
| WITH cte AS (SELECT * FROM t1 WHERE random() < 0.1 ORDER BY b LIMIT 10) SELECT a, 1, 1 FROM cte JOIN t2 USING (a); |
| QUERY PLAN |
| ------------------------------------------------------------------------------ |
| Sequence |
| -> Shared Scan (share slice:id 0:0) |
| -> Limit |
| -> Gather Motion 3:1 (slice1; segments: 3) |
| Merge Key: t1.b |
| -> Sort |
| Sort Key: t1.b |
| -> Seq Scan on t1 |
| Filter: (random() < '0.1'::double precision) |
| -> Result |
| -> Gather Motion 3:1 (slice2; segments: 3) |
| -> Hash Join |
| Hash Cond: (share0_ref2.a = t2.a) |
| -> Redistribute Motion 1:3 (slice3) |
| Hash Key: share0_ref2.a |
| -> Result |
| -> Shared Scan (share slice:id 3:0) |
| -> Hash |
| -> Seq Scan on t2 |
| Optimizer: Pivotal Optimizer (GPORCA) |
| (20 rows) |
| |
| --- This functions returns one more column than expected. |
| CREATE OR REPLACE FUNCTION col_mismatch_func2() RETURNS TABLE (field1 int, field2 int) |
| LANGUAGE 'plpgsql' VOLATILE STRICT AS |
| $$ |
| DECLARE |
| v_qry text; |
| BEGIN |
| v_qry := 'WITH cte AS (SELECT * FROM t1 WHERE random() < 0.1 ORDER BY b LIMIT 10) SELECT a, 1 , 1 FROM cte JOIN t2 USING (a)'; |
| RETURN QUERY EXECUTE v_qry; |
| END |
| $$; |
| -- This should only ERROR and should not SIGSEGV |
| SELECT col_mismatch_func2(); |
| ERROR: structure of query does not match function result type |
| DETAIL: Number of returned columns (3) does not match expected column count (2). |
| CONTEXT: SQL statement "WITH cte AS (SELECT * FROM t1 WHERE random() < 0.1 ORDER BY b LIMIT 10) SELECT a, 1 , 1 FROM cte JOIN t2 USING (a)" |
| PL/pgSQL function col_mismatch_func2() line 6 at RETURN QUERY |
| -- https://github.com/greenplum-db/gpdb/issues/12701 |
| -- Disable cte sharing in subquery |
| drop table if exists pk_list; |
| NOTICE: table "pk_list" does not exist, skipping |
| create table pk_list (id int, schema_name varchar, table_name varchar) distributed by (id); |
| drop table if exists calender; |
| NOTICE: table "calender" does not exist, skipping |
| create table calender (id int, data_hour timestamp) distributed by (id); |
| explain (costs off) |
| with |
| tbls as (select distinct schema_name, table_name as table_nm from pk_list), |
| tbls_daily_report_23 as (select unnest(string_to_array('mart_cm.card' ,',')) as table_nm_23), |
| tbls_w_onl_actl_data as (select unnest(string_to_array('mart_cm.cont_resp,mart_cm.card', ',')) as table_nm_onl_act) |
| select data_hour, stat.schema_name as schema_nm, dt.table_nm |
| from ( |
| select * from calender c |
| cross join tbls |
| ) dt |
| inner join ( |
| select tbls.schema_name, tbls.table_nm as table_name |
| from tbls tbls |
| ) stat on dt.table_nm = stat.table_name |
| where |
| (data_hour = date_trunc('day',data_hour) and stat.schema_name || '.' ||stat.table_name not in (select table_nm_23 from tbls_daily_report_23)) |
| and (stat.schema_name || '.' ||stat.table_name not in (select table_nm_onl_act from tbls_w_onl_actl_data)) |
| or (stat.schema_name || '.' ||stat.table_name in (select table_nm_onl_act from tbls_w_onl_actl_data)); |
| QUERY PLAN |
| --------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Gather Motion 3:1 (slice1; segments: 3) |
| -> Hash Join |
| Hash Cond: ((tbls.table_nm)::text = (stat.table_name)::text) |
| Join Filter: (((c.data_hour = date_trunc('day'::text, c.data_hour)) AND (NOT (hashed SubPlan 1)) AND (NOT (hashed SubPlan 2))) OR (hashed SubPlan 3)) |
| -> Nested Loop |
| -> Broadcast Motion 3:3 (slice2; segments: 3) |
| -> Subquery Scan on tbls |
| -> HashAggregate |
| Group Key: pk_list.schema_name, pk_list.table_name |
| -> Redistribute Motion 3:3 (slice3; segments: 3) |
| Hash Key: pk_list.schema_name, pk_list.table_name |
| -> HashAggregate |
| Group Key: pk_list.schema_name, pk_list.table_name |
| -> Seq Scan on pk_list |
| -> Materialize |
| -> Seq Scan on calender c |
| -> Hash |
| -> Broadcast Motion 3:3 (slice4; segments: 3) |
| -> Subquery Scan on stat |
| Filter: (((NOT (hashed SubPlan 1)) AND (NOT (hashed SubPlan 2))) OR (hashed SubPlan 3)) |
| -> HashAggregate |
| Group Key: pk_list_1.schema_name, pk_list_1.table_name |
| -> Redistribute Motion 3:3 (slice5; segments: 3) |
| Hash Key: pk_list_1.schema_name, pk_list_1.table_name |
| -> HashAggregate |
| Group Key: pk_list_1.schema_name, pk_list_1.table_name |
| -> Seq Scan on pk_list pk_list_1 |
| SubPlan 1 |
| -> ProjectSet |
| -> Result |
| SubPlan 2 |
| -> ProjectSet |
| -> Result |
| SubPlan 3 |
| -> ProjectSet |
| -> Result |
| Optimizer: Postgres query optimizer |
| (37 rows) |
| |