-- src/test/isolation2/sql/fts_segment_reset.sql - cloudberry - Git at Google

 -- This test verifies that FTS shouldn't issue failover to mirror when
 -- the primary is taking long in the RESET state.

 -- start_matchsubs
 -- m/seg0 [0-9.]+:\d+/
 -- s/seg0 [0-9.]+:\d+/seg0 IP:PORT/
 -- end_matchsubs

 -- Tune FTS for this test: let FTS detect/declare failure sooner.
 -- Both settings are restored near the end of the script.
 -- start_ignore
 alter system set gp_fts_probe_interval to 10;
 -- After RESET the primary still needs a little while to restart. During
 -- that window FTS may think it is in the "recovery not in progress" stage
 -- and promote the mirror, so we need FTS to make that decision a bit less
 -- frequently.
 alter system set gp_fts_probe_retries to 15;
 -- Reload the configuration so the two ALTER SYSTEM settings take effect
 select pg_reload_conf();
 -- end_ignore

 -- Let the background writer of the content 0 primary sleep 17 seconds at
 -- the 'fault_in_background_writer_quickdie' fault point to delay the reset.
 -- This number is selected to be larger than the 15-second retry window,
 -- which makes a meaningful test, while reducing as much as possible the
 -- chance that FTS sees a "recovery not in progress" primary.
 -- NOTE(review): the two 1s before 17 look like the occurrence range of the
 -- fault -- confirm against the gp_inject_fault signature.
 select gp_inject_fault('fault_in_background_writer_quickdie', 'sleep', '', '', '', 1, 1, 17, dbid)
 from gp_segment_configuration where role = 'p' and content = 0;

 -- Do not let the postmaster send SIGKILL to the bgwriter, presumably so
 -- that the injected sleep is not cut short (confirm). The fault is
 -- injected with the "infinite" variant and is reset later in this script.
 select gp_inject_fault_infinite('postmaster_server_loop_no_sigkill', 'skip', dbid)
 from gp_segment_configuration where role = 'p' and content = 0;

 -- Now bring down the primary of seg0. There are many ways to do that. In
 -- order to better emulate a real-world scenario we inject a PANIC.
 -- Session 1 arms the 'panic' fault at the 'start_prepare' fault point on
 -- the content 0 primary.
 1:select gp_inject_fault('start_prepare', 'panic', dbid)
 from gp_segment_configuration where role = 'p' AND content = 0;
 -- Issued in the background ("&") by session 1 and joined later in the
 -- script. Presumably this is the statement that hits the injected PANIC.
 1&:create table fts_reset_t(a int);

 -- This should fail due to seg0 being in reset mode. It is also issued in
 -- the background and joined later in the script.
 2&:create table fts_reset_t2(a int);

 -- Try another one but let the gang creation retry for longer.
 -- Default is 2000ms (gp_gang_creation_retry_timer) * 5 (gp_gang_creation_retry_count) = 10s.
 -- Raising the timer to 10000ms makes it 50s, which is well longer than the
 -- 17-second delay we inserted before, so it can succeed.
 3:set gp_gang_creation_retry_timer = 10000;
 -- Not a background step, so the script waits here for session 3 to finish
 3:create table fts_reset_t3(a int);

 -- Join the background statements of sessions 1 and 2 and show their results
 1<:
 2<:

 -- We shouldn't see failover to mirror: request an FTS probe scan, then
 -- show role, preferred_role and status of the content 0 segments.
 select gp_request_fts_probe_scan();
 select dbid, role, preferred_role, status from gp_segment_configuration where content = 0;

 -- Reset both faults that were injected earlier on the content 0 primary
 select gp_inject_fault('postmaster_server_loop_no_sigkill', 'reset', dbid) from gp_segment_configuration where role = 'p' and content = 0;
 select gp_inject_fault('fault_in_background_writer_quickdie', 'reset', dbid) from gp_segment_configuration where role = 'p' and content = 0;

 -- Pause for 30 seconds before restoring the settings and cleaning up.
 -- NOTE(review): presumably this lets the segment settle after the reset -- confirm.
 select pg_sleep(30);

 -- start_ignore
 -- Restore the two FTS parameters changed at the top of this script, then
 -- reload the configuration so that the reset takes effect
 alter system reset gp_fts_probe_interval;
 alter system reset gp_fts_probe_retries;
 select pg_reload_conf();
 -- end_ignore

 -- fts_reset_t3 is the only table that should have been created
 -- successfully, so it is the only one dropped here
 drop table fts_reset_t3;

 -- In case anything goes wrong, we don't want to affect other tests. So
 -- recover and rebalance the cluster anyway.
 -- NOTE(review): -aF looks like a no-prompt full recovery and -ar a
 -- no-prompt rebalance -- confirm against the gprecoverseg documentation.
 !\retcode gprecoverseg -aF
 !\retcode gprecoverseg -ar
	-- This test verifies that FTS shouldn't issue failover to mirror when
	-- the primary is taking long in the RESET state.

	-- start_matchsubs
	-- m/seg0 [0-9.]+:\d+/
	-- s/seg0 [0-9.]+:\d+/seg0 IP:PORT/
	-- end_matchsubs

	-- Tune FTS for this test: let FTS detect/declare failure sooner.
	-- Both settings are restored near the end of the script.
	-- start_ignore
	alter system set gp_fts_probe_interval to 10;
	-- After RESET the primary still needs a little while to restart. During
	-- that window FTS may think it is in the "recovery not in progress" stage
	-- and promote the mirror, so we need FTS to make that decision a bit less
	-- frequently.
	alter system set gp_fts_probe_retries to 15;
	-- Reload the configuration so the two ALTER SYSTEM settings take effect
	select pg_reload_conf();
	-- end_ignore

	-- Let the background writer of the content 0 primary sleep 17 seconds at
	-- the 'fault_in_background_writer_quickdie' fault point to delay the reset.
	-- This number is selected to be larger than the 15-second retry window,
	-- which makes a meaningful test, while reducing as much as possible the
	-- chance that FTS sees a "recovery not in progress" primary.
	-- NOTE(review): the two 1s before 17 look like the occurrence range of the
	-- fault -- confirm against the gp_inject_fault signature.
	select gp_inject_fault('fault_in_background_writer_quickdie', 'sleep', '', '', '', 1, 1, 17, dbid)
	from gp_segment_configuration where role = 'p' and content = 0;

	-- Do not let the postmaster send SIGKILL to the bgwriter, presumably so
	-- that the injected sleep is not cut short (confirm). The fault is
	-- injected with the "infinite" variant and is reset later in this script.
	select gp_inject_fault_infinite('postmaster_server_loop_no_sigkill', 'skip', dbid)
	from gp_segment_configuration where role = 'p' and content = 0;

	-- Now bring down the primary of seg0. There are many ways to do that. In
	-- order to better emulate a real-world scenario we inject a PANIC.
	-- Session 1 arms the 'panic' fault at the 'start_prepare' fault point on
	-- the content 0 primary.
	1:select gp_inject_fault('start_prepare', 'panic', dbid)
	from gp_segment_configuration where role = 'p' AND content = 0;
	-- Issued in the background ("&") by session 1 and joined later in the
	-- script. Presumably this is the statement that hits the injected PANIC.
	1&:create table fts_reset_t(a int);

	-- This should fail due to seg0 being in reset mode. It is also issued in
	-- the background and joined later in the script.
	2&:create table fts_reset_t2(a int);

	-- Try another one but let the gang creation retry for longer.
	-- Default is 2000ms (gp_gang_creation_retry_timer) * 5 (gp_gang_creation_retry_count) = 10s.
	-- Raising the timer to 10000ms makes it 50s, which is well longer than the
	-- 17-second delay we inserted before, so it can succeed.
	3:set gp_gang_creation_retry_timer = 10000;
	-- Not a background step, so the script waits here for session 3 to finish
	3:create table fts_reset_t3(a int);

	-- Join the background statements of sessions 1 and 2 and show their results
	1<:
	2<:

	-- We shouldn't see failover to mirror: request an FTS probe scan, then
	-- show role, preferred_role and status of the content 0 segments.
	select gp_request_fts_probe_scan();
	select dbid, role, preferred_role, status from gp_segment_configuration where content = 0;

	-- Reset both faults that were injected earlier on the content 0 primary
	select gp_inject_fault('postmaster_server_loop_no_sigkill', 'reset', dbid) from gp_segment_configuration where role = 'p' and content = 0;
	select gp_inject_fault('fault_in_background_writer_quickdie', 'reset', dbid) from gp_segment_configuration where role = 'p' and content = 0;

	-- Pause for 30 seconds before restoring the settings and cleaning up.
	-- NOTE(review): presumably this lets the segment settle after the reset -- confirm.
	select pg_sleep(30);

	-- start_ignore
	-- Restore the two FTS parameters changed at the top of this script, then
	-- reload the configuration so that the reset takes effect
	alter system reset gp_fts_probe_interval;
	alter system reset gp_fts_probe_retries;
	select pg_reload_conf();
	-- end_ignore

	-- fts_reset_t3 is the only table that should have been created
	-- successfully, so it is the only one dropped here
	drop table fts_reset_t3;

	-- In case anything goes wrong, we don't want to affect other tests. So
	-- recover and rebalance the cluster anyway.
	-- NOTE(review): -aF looks like a no-prompt full recovery and -ar a
	-- no-prompt rebalance -- confirm against the gprecoverseg documentation.
	!\retcode gprecoverseg -aF
	!\retcode gprecoverseg -ar