blob: a8ab8bd0c23f73be887f6c23a27c47af2656c3c1 [file] [log] [blame]
--
-- Queries that lead to hanging (not dead lock) when we don't handle synchronization properly in shared scan
-- Queries that lead to wrong result when we don't finish executing the subtree below the shared scan being squelched.
--
CREATE SCHEMA shared_scan;
SET search_path = shared_scan;
CREATE TABLE foo (a int, b int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
CREATE TABLE bar (c int, d int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
CREATE TABLE jazz(e int, f int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'e' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
INSERT INTO foo values (1, 2);
INSERT INTO bar SELECT i, i from generate_series(1, 100)i;
INSERT INTO jazz VALUES (2, 2), (3, 3);
ANALYZE foo;
ANALYZE bar;
ANALYZE jazz;
SELECT $query$
SELECT * FROM
(
WITH cte AS (SELECT * FROM foo)
SELECT * FROM (SELECT * FROM cte UNION ALL SELECT * FROM cte)
AS X
JOIN bar ON b = c
) AS XY
JOIN jazz on c = e AND b = f;
$query$ AS qry \gset
-- We are very particular about this plan shape and data distribution with ORCA:
-- 1. `jazz` has to be the inner table of the outer HASH JOIN, so that on a
-- segment which has zero tuples in `jazz`, the Sequence node that contains the
-- Shared Scan will be squelched on that segment. If `jazz` is not on the inner
-- side, the above mentioned "hang" scenario will not be covered.
-- 2. The Shared Scan producer has to be on a different slice from consumers,
-- and some tuples coming out of the Share Scan producer on one segments are
-- redistributed to a different segment over Motion. If not, the above mentioned
-- "wrong result" scenario will not be covered.
EXPLAIN (COSTS OFF)
:qry ;
QUERY PLAN
------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
-> Hash Join
Hash Cond: (bar.c = jazz.e)
-> Sequence
-> Shared Scan (share slice:id 1:0)
-> Redistribute Motion 3:3 (slice2; segments: 3)
Hash Key: foo.b
-> Seq Scan on foo
-> Hash Join
Hash Cond: (bar.c = share0_ref2.b)
-> Seq Scan on bar
-> Hash
-> Append
-> Shared Scan (share slice:id 1:0)
-> Shared Scan (share slice:id 1:0)
-> Hash
-> Seq Scan on jazz
Filter: (e = f)
Optimizer: Pivotal Optimizer (GPORCA)
(19 rows)
SET statement_timeout = '15s';
:qry ;
a | b | c | d | e | f
---+---+---+---+---+---
1 | 2 | 2 | 2 | 2 | 2
1 | 2 | 2 | 2 | 2 | 2
(2 rows)
RESET statement_timeout;
SELECT COUNT(*)
FROM (SELECT *,
(
WITH cte AS (SELECT * FROM jazz WHERE jazz.e = bar.c)
SELECT 1 FROM cte c1, cte c2
)
FROM bar) as s;
count
-------
100
(1 row)
CREATE TABLE t1 (a int, b int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
CREATE TABLE t2 (a int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
-- ORCA plan contains a Shared Scan producer with a unsorted Motion below it
EXPLAIN (COSTS OFF)
WITH cte AS (SELECT * FROM t1 WHERE random() < 0.1 LIMIT 10) SELECT a, 1, 1 FROM cte JOIN t2 USING (a);
QUERY PLAN
------------------------------------------------------------------------
Sequence
-> Shared Scan (share slice:id 0:0)
-> Limit
-> Gather Motion 3:1 (slice1; segments: 3)
-> Seq Scan on t1
Filter: (random() < '0.1'::double precision)
-> Result
-> Gather Motion 3:1 (slice2; segments: 3)
-> Hash Join
Hash Cond: (share0_ref2.a = t2.a)
-> Redistribute Motion 1:3 (slice3)
Hash Key: share0_ref2.a
-> Result
-> Shared Scan (share slice:id 3:0)
-> Hash
-> Seq Scan on t2
Optimizer: Pivotal Optimizer (GPORCA)
(17 rows)
-- This functions returns one more column than expected.
CREATE OR REPLACE FUNCTION col_mismatch_func1() RETURNS TABLE (field1 int, field2 int)
LANGUAGE 'plpgsql' VOLATILE STRICT AS
$$
DECLARE
v_qry text;
BEGIN
v_qry := 'WITH cte AS (SELECT * FROM t1 WHERE random() < 0.1 LIMIT 10) SELECT a, 1 , 1 FROM cte JOIN t2 USING (a)';
RETURN QUERY EXECUTE v_qry;
END
$$;
-- This should only ERROR and should not SIGSEGV
SELECT col_mismatch_func1();
ERROR: structure of query does not match function result type
DETAIL: Number of returned columns (3) does not match expected column count (2).
CONTEXT: SQL statement "WITH cte AS (SELECT * FROM t1 WHERE random() < 0.1 LIMIT 10) SELECT a, 1 , 1 FROM cte JOIN t2 USING (a)"
PL/pgSQL function col_mismatch_func1() line 6 at RETURN QUERY
-- ORCA plan contains a Shared Scan producer with a sorted Motion below it
EXPLAIN (COSTS OFF)
WITH cte AS (SELECT * FROM t1 WHERE random() < 0.1 ORDER BY b LIMIT 10) SELECT a, 1, 1 FROM cte JOIN t2 USING (a);
QUERY PLAN
------------------------------------------------------------------------------
Sequence
-> Shared Scan (share slice:id 0:0)
-> Limit
-> Gather Motion 3:1 (slice1; segments: 3)
Merge Key: t1.b
-> Sort
Sort Key: t1.b
-> Seq Scan on t1
Filter: (random() < '0.1'::double precision)
-> Result
-> Gather Motion 3:1 (slice2; segments: 3)
-> Hash Join
Hash Cond: (share0_ref2.a = t2.a)
-> Redistribute Motion 1:3 (slice3)
Hash Key: share0_ref2.a
-> Result
-> Shared Scan (share slice:id 3:0)
-> Hash
-> Seq Scan on t2
Optimizer: Pivotal Optimizer (GPORCA)
(20 rows)
--- This functions returns one more column than expected.
CREATE OR REPLACE FUNCTION col_mismatch_func2() RETURNS TABLE (field1 int, field2 int)
LANGUAGE 'plpgsql' VOLATILE STRICT AS
$$
DECLARE
v_qry text;
BEGIN
v_qry := 'WITH cte AS (SELECT * FROM t1 WHERE random() < 0.1 ORDER BY b LIMIT 10) SELECT a, 1 , 1 FROM cte JOIN t2 USING (a)';
RETURN QUERY EXECUTE v_qry;
END
$$;
-- This should only ERROR and should not SIGSEGV
SELECT col_mismatch_func2();
ERROR: structure of query does not match function result type
DETAIL: Number of returned columns (3) does not match expected column count (2).
CONTEXT: SQL statement "WITH cte AS (SELECT * FROM t1 WHERE random() < 0.1 ORDER BY b LIMIT 10) SELECT a, 1 , 1 FROM cte JOIN t2 USING (a)"
PL/pgSQL function col_mismatch_func2() line 6 at RETURN QUERY
-- https://github.com/greenplum-db/gpdb/issues/12701
-- Disable cte sharing in subquery
drop table if exists pk_list;
NOTICE: table "pk_list" does not exist, skipping
create table pk_list (id int, schema_name varchar, table_name varchar) distributed by (id);
drop table if exists calender;
NOTICE: table "calender" does not exist, skipping
create table calender (id int, data_hour timestamp) distributed by (id);
explain (costs off)
with
tbls as (select distinct schema_name, table_name as table_nm from pk_list),
tbls_daily_report_23 as (select unnest(string_to_array('mart_cm.card' ,',')) as table_nm_23),
tbls_w_onl_actl_data as (select unnest(string_to_array('mart_cm.cont_resp,mart_cm.card', ',')) as table_nm_onl_act)
select data_hour, stat.schema_name as schema_nm, dt.table_nm
from (
select * from calender c
cross join tbls
) dt
inner join (
select tbls.schema_name, tbls.table_nm as table_name
from tbls tbls
) stat on dt.table_nm = stat.table_name
where
(data_hour = date_trunc('day',data_hour) and stat.schema_name || '.' ||stat.table_name not in (select table_nm_23 from tbls_daily_report_23))
and (stat.schema_name || '.' ||stat.table_name not in (select table_nm_onl_act from tbls_w_onl_actl_data))
or (stat.schema_name || '.' ||stat.table_name in (select table_nm_onl_act from tbls_w_onl_actl_data));
QUERY PLAN
---------------------------------------------------------------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
-> Hash Join
Hash Cond: ((tbls.table_nm)::text = (stat.table_name)::text)
Join Filter: (((c.data_hour = date_trunc('day'::text, c.data_hour)) AND (NOT (hashed SubPlan 1)) AND (NOT (hashed SubPlan 2))) OR (hashed SubPlan 3))
-> Nested Loop
-> Broadcast Motion 3:3 (slice2; segments: 3)
-> Subquery Scan on tbls
-> HashAggregate
Group Key: pk_list.schema_name, pk_list.table_name
-> Redistribute Motion 3:3 (slice3; segments: 3)
Hash Key: pk_list.schema_name, pk_list.table_name
-> HashAggregate
Group Key: pk_list.schema_name, pk_list.table_name
-> Seq Scan on pk_list
-> Materialize
-> Seq Scan on calender c
-> Hash
-> Broadcast Motion 3:3 (slice4; segments: 3)
-> Subquery Scan on stat
Filter: (((NOT (hashed SubPlan 1)) AND (NOT (hashed SubPlan 2))) OR (hashed SubPlan 3))
-> HashAggregate
Group Key: pk_list_1.schema_name, pk_list_1.table_name
-> Redistribute Motion 3:3 (slice5; segments: 3)
Hash Key: pk_list_1.schema_name, pk_list_1.table_name
-> HashAggregate
Group Key: pk_list_1.schema_name, pk_list_1.table_name
-> Seq Scan on pk_list pk_list_1
SubPlan 1
-> ProjectSet
-> Result
SubPlan 2
-> ProjectSet
-> Result
SubPlan 3
-> ProjectSet
-> Result
Optimizer: Postgres query optimizer
(37 rows)