-- src/test/isolation2/sql/terminate_in_gang_creation.sql - cloudberry - Git at Google

 -- Output normalization rules used when the test output is compared:
 --   * a segment identity such as "seg2 127.0.0.1:7004" is masked as "segN IP:PORT"
 --   * lock tags of the form "lock [n,n]" are removed
 --   * whatever precedes the FATAL reader/writer message on a DETAIL line is dropped
 -- start_matchsubs
 -- m/seg[0-9] [0-9.]+:\d+/
 -- s/seg[0-9] [0-9.]+:\d+/segN IP:PORT/
 -- m/lock \[\d+,\d+\]/
 -- s/lock \[\d+,\d+\]//
 -- m/DETAIL: .* FATAL:  reader could not find writer proc entry/
 -- s/DETAIL: .* FATAL:  reader could not find writer proc entry/DETAIL: FATAL:  reader could not find writer proc entry/
 -- end_matchsubs

 -- Skip the dtx recovery check to avoid hitting the fault create_gang_in_progress.
 -- The 'skip' fault is injected only on the primary with content = -1 (the coordinator).
 SELECT gp_inject_fault_infinite('before_orphaned_check', 'skip', dbid)
     FROM gp_segment_configuration WHERE role='p' AND content=-1;
 -- Set gp_dtx_recovery_interval to 5 and reload the configuration, then wait
 -- until the skip fault above has been triggered once.
 -- NOTE(review): presumably the interval is lowered so that the wait below
 -- returns quickly. TODO confirm.
 ALTER SYSTEM SET gp_dtx_recovery_interval to 5;
 SELECT pg_reload_conf();
 SELECT gp_wait_until_triggered_fault('before_orphaned_check', 1, dbid)
     FROM gp_segment_configuration WHERE role='p' AND content=-1;

 -- SIGSEGV issue when freeing gangs
 --
 -- When SIGTERM is handled during gang creation we used to trigger
 -- a wild pointer access like below backtrace:
 --
 -- #0  raise
 -- #1  StandardHandlerForSigillSigsegvSigbus_OnMainThread
 -- #2  <signal handler called>
 -- #3  MemoryContextFreeImpl
 -- #4  cdbconn_termSegmentDescriptor
 -- #5  DisconnectAndDestroyGang
 -- #6  freeGangsForPortal
 -- #7  AbortTransaction
 -- ...
 -- #14 ProcessInterrupts
 -- #15 createGang_async
 -- #16 createGang
 -- #17 AllocateWriterGang

 -- Start from a fresh table that the join below can scan.
 DROP TABLE IF EXISTS foo;
 CREATE TABLE foo (c1 int, c2 int) DISTRIBUTED BY (c1);

 -- Session 10 opens the transaction in which the query to be terminated will run.
 10: BEGIN;

 -- Clear any earlier state of the fault, then arm it as 'suspend'.
 -- NOTE(review): dbid is hard-coded to 1 here, unlike the other faults in this
 -- file which look dbid up in gp_segment_configuration. Presumably 1 is the
 -- coordinator. TODO confirm.
 SELECT gp_inject_fault('create_gang_in_progress', 'reset', 1);
 SELECT gp_inject_fault('create_gang_in_progress', 'suspend', 1);

 -- Issue the join from session 10 in the background (the "&" form) so that the
 -- default session can keep driving the test while session 10 is suspended.
 10&: SELECT * FROM foo a JOIN foo b USING (c2);

 -- Wait until the suspend fault has been triggered once.
 SELECT gp_wait_until_triggered_fault('create_gang_in_progress', 1, 1);

 -- Terminate the backend running the join. The backend is looked up by exact
 -- query text, so the literal below must stay identical to the statement that
 -- session 10 issued above.
 SELECT pg_terminate_backend(pid) FROM pg_stat_activity
  WHERE query = 'SELECT * FROM foo a JOIN foo b USING (c2);';

 -- Release the suspended backend. Per the backtrace above, the termination is
 -- then expected to be processed inside gang creation (ProcessInterrupts under
 -- createGang_async), which is the path that used to crash.
 SELECT gp_inject_fault('create_gang_in_progress', 'resume', 1);

 -- Collect the outcome of the backgrounded statement, then close session 10.
 10<:
 10q:

 DROP TABLE foo;

 -- Test a bug where, if a cached idle primary QE is gone (e.g. after kill -9,
 -- pg_ctl restart, etc.), a new query that needs a newly created reader gang
 -- might fail with an error like this:
 --
 -- ERROR:  failed to acquire resources on one or more segments
 -- DETAIL:  FATAL:  reader could not find writer proc entry
 -- DETAIL:  lock [0,1260] AccessShareLock 0. Probably because writer gang is gone somehow. Maybe try rerunning.
 -- (seg2 127.0.0.1:7004)
 --
 -- This is ok since the writer gang is gone, but previously the QD code did
 -- not reset all gangs (it just retried creating the new reader gang), so
 -- re-running the query would always fail with the same error because the
 -- reader gang would always fail to create. The test below verifies the fix.

 -- Skip FTS probes so that the segments are not marked down during the restart below.
 -- NOTE(review): the probe scan requested right after presumably makes sure the
 -- skip fault is already in effect. TODO confirm.
 SELECT gp_inject_fault_infinite('fts_probe', 'skip', dbid)
     FROM gp_segment_configuration WHERE role='p' AND content=-1;
 SELECT gp_request_fts_probe_scan();
 -- Checkpoint first to keep the pg_ctl restart below from timing out, although
 -- the timeout should already be enough.
 CHECKPOINT;

 -- Do not recycle idle QEs, to avoid flakiness when restarting the primaries takes a long time.
 11: SET gp_vmem_idle_resource_timeout TO 0;
 11: CREATE TABLE foo (c1 int, c2 int) DISTRIBUTED BY (c1);
 -- ORCA optimizes the value scan so that there is no additional reader gang in
 -- the INSERT below, hence ORCA is turned off for session 11.
 11: SET optimizer = off;
 -- The value scan (reader gang) might be on any segment, so restart all primary segments.
 SELECT pg_ctl(datadir, 'restart', 'immediate')
 	FROM gp_segment_configuration WHERE role='p' AND content != -1;
 -- First attempt after the restart: may fail with the error quoted above,
 -- presumably because the QEs cached by session 11 are gone.
 11: INSERT INTO foo values(2),(1);
 -- Re-run of the same statement: with the fix described above it is expected to succeed.
 11: INSERT INTO foo values(2),(1);
 11: DROP TABLE foo;
 11: RESET gp_vmem_idle_resource_timeout;

 -- Cleanup: remove the faults injected by this test on the coordinator and
 -- restore gp_dtx_recovery_interval to its configured default.
 SELECT gp_inject_fault('fts_probe', 'reset', dbid)
 	FROM gp_segment_configuration WHERE role='p' AND content=-1;
 SELECT gp_inject_fault_infinite('before_orphaned_check', 'reset', dbid)
     FROM gp_segment_configuration WHERE role='p' AND content=-1;
 -- NOTE(review): these two statements run in session 2 rather than in the
 -- default session. Presumably a fresh session is wanted after the segment
 -- restart above. TODO confirm.
 2: ALTER SYSTEM RESET gp_dtx_recovery_interval;
 2: SELECT pg_reload_conf();
	-- NOTE(review): from here to the end of the file the script above is repeated,
	-- tab-indented and with runs of spaces collapsed. This looks like an extraction
	-- artifact. Confirm whether this second copy should be kept at all.
	--
	-- Output normalization rules used when the test output is compared. The FATAL
	-- rule is written with two spaces after "FATAL:", as the server emits it and as
	-- the first copy of this rule has it. With a single space it can never match.
	-- start_matchsubs
	-- m/seg[0-9] [0-9.]+:\d+/
	-- s/seg[0-9] [0-9.]+:\d+/segN IP:PORT/
	-- m/lock \[\d+,\d+\]/
	-- s/lock \[\d+,\d+\]//
	-- m/DETAIL: .* FATAL:  reader could not find writer proc entry/
	-- s/DETAIL: .* FATAL:  reader could not find writer proc entry/DETAIL: FATAL:  reader could not find writer proc entry/
	-- end_matchsubs

	-- Skip the dtx recovery check to avoid hitting the fault create_gang_in_progress.
	-- The 'skip' fault is injected only on the primary with content = -1 (the coordinator).
	SELECT gp_inject_fault_infinite('before_orphaned_check', 'skip', dbid)
	FROM gp_segment_configuration WHERE role='p' AND content=-1;
	-- Set gp_dtx_recovery_interval to 5 and reload the configuration, then wait
	-- until the skip fault above has been triggered once.
	-- NOTE(review): presumably the interval is lowered so that the wait below
	-- returns quickly. TODO confirm.
	ALTER SYSTEM SET gp_dtx_recovery_interval to 5;
	SELECT pg_reload_conf();
	SELECT gp_wait_until_triggered_fault('before_orphaned_check', 1, dbid)
	FROM gp_segment_configuration WHERE role='p' AND content=-1;

	-- SIGSEGV issue when freeing gangs
	--
	-- When SIGTERM is handled during gang creation we used to trigger
	-- a wild pointer access like below backtrace:
	--
	-- #0 raise
	-- #1 StandardHandlerForSigillSigsegvSigbus_OnMainThread
	-- #2 <signal handler called>
	-- #3 MemoryContextFreeImpl
	-- #4 cdbconn_termSegmentDescriptor
	-- #5 DisconnectAndDestroyGang
	-- #6 freeGangsForPortal
	-- #7 AbortTransaction
	-- ...
	-- #14 ProcessInterrupts
	-- #15 createGang_async
	-- #16 createGang
	-- #17 AllocateWriterGang

	-- Start from a fresh table that the join below can scan.
	DROP TABLE IF EXISTS foo;
	CREATE TABLE foo (c1 int, c2 int) DISTRIBUTED BY (c1);

	-- Session 10 opens the transaction in which the query to be terminated will run.
	10: BEGIN;

	-- Clear any earlier state of the fault, then arm it as 'suspend'.
	-- NOTE(review): dbid is hard-coded to 1 here, unlike the other faults in this
	-- file which look dbid up in gp_segment_configuration. Presumably 1 is the
	-- coordinator. TODO confirm.
	SELECT gp_inject_fault('create_gang_in_progress', 'reset', 1);
	SELECT gp_inject_fault('create_gang_in_progress', 'suspend', 1);

	-- Issue the join from session 10 in the background (the "&" form) so that the
	-- default session can keep driving the test while session 10 is suspended.
	10&: SELECT * FROM foo a JOIN foo b USING (c2);

	-- Wait until the suspend fault has been triggered once.
	SELECT gp_wait_until_triggered_fault('create_gang_in_progress', 1, 1);

	-- Terminate the backend running the join. The backend is looked up by exact
	-- query text, so the literal below must stay identical to the statement that
	-- session 10 issued above.
	SELECT pg_terminate_backend(pid) FROM pg_stat_activity
	WHERE query = 'SELECT * FROM foo a JOIN foo b USING (c2);';

	-- Release the suspended backend. Per the backtrace above, the termination is
	-- then expected to be processed inside gang creation (ProcessInterrupts under
	-- createGang_async), which is the path that used to crash.
	SELECT gp_inject_fault('create_gang_in_progress', 'resume', 1);

	-- Collect the outcome of the backgrounded statement, then close session 10.
	10<:
	10q:

	DROP TABLE foo;

	-- Test a bug where, if a cached idle primary QE is gone (e.g. after kill -9,
	-- pg_ctl restart, etc.), a new query that needs a newly created reader gang
	-- might fail with an error like this:
	--
	-- ERROR:  failed to acquire resources on one or more segments
	-- DETAIL:  FATAL:  reader could not find writer proc entry
	-- DETAIL:  lock [0,1260] AccessShareLock 0. Probably because writer gang is gone somehow. Maybe try rerunning.
	-- (seg2 127.0.0.1:7004)
	--
	-- This is ok since the writer gang is gone, but previously the QD code did
	-- not reset all gangs (it just retried creating the new reader gang), so
	-- re-running the query would always fail with the same error because the
	-- reader gang would always fail to create. The test below verifies the fix.

	-- Skip FTS probes so that the segments are not marked down during the restart below.
	-- NOTE(review): the probe scan requested right after presumably makes sure the
	-- skip fault is already in effect. TODO confirm.
	SELECT gp_inject_fault_infinite('fts_probe', 'skip', dbid)
	FROM gp_segment_configuration WHERE role='p' AND content=-1;
	SELECT gp_request_fts_probe_scan();
	-- Checkpoint first to keep the pg_ctl restart below from timing out, although
	-- the timeout should already be enough.
	CHECKPOINT;

	-- Do not recycle idle QEs, to avoid flakiness when restarting the primaries takes a long time.
	11: SET gp_vmem_idle_resource_timeout TO 0;
	11: CREATE TABLE foo (c1 int, c2 int) DISTRIBUTED BY (c1);
	-- ORCA optimizes the value scan so that there is no additional reader gang in
	-- the INSERT below, hence ORCA is turned off for session 11.
	11: SET optimizer = off;
	-- The value scan (reader gang) might be on any segment, so restart all primary segments.
	SELECT pg_ctl(datadir, 'restart', 'immediate')
	FROM gp_segment_configuration WHERE role='p' AND content != -1;
	-- First attempt after the restart: may fail with the error quoted above,
	-- presumably because the QEs cached by session 11 are gone.
	11: INSERT INTO foo values(2),(1);
	-- Re-run of the same statement: with the fix described above it is expected to succeed.
	11: INSERT INTO foo values(2),(1);
	11: DROP TABLE foo;
	11: RESET gp_vmem_idle_resource_timeout;

	-- Cleanup: remove the faults injected by this test on the coordinator and
	-- restore gp_dtx_recovery_interval to its configured default.
	SELECT gp_inject_fault('fts_probe', 'reset', dbid)
	FROM gp_segment_configuration WHERE role='p' AND content=-1;
	SELECT gp_inject_fault_infinite('before_orphaned_check', 'reset', dbid)
	FROM gp_segment_configuration WHERE role='p' AND content=-1;
	-- NOTE(review): these two statements run in session 2 rather than in the
	-- default session. Presumably a fresh session is wanted after the segment
	-- restart above. TODO confirm.
	2: ALTER SYSTEM RESET gp_dtx_recovery_interval;
	2: SELECT pg_reload_conf();