-- blob: 2480f97dfee65f67b1907e083ad5e5fb9d2b0302 [file] [log] [blame]
-- This test verifies that FTS shouldn't issue failover to mirror when
-- the primary is taking long in the RESET state.
-- start_matchsubs
-- m/seg0 [0-9.]+:\d+/
-- s/seg0 [0-9.]+:\d+/seg0 IP:PORT/
-- end_matchsubs
-- Setup: tune the two FTS probe settings used by this test. Both are reset
-- again near the end of this file.
-- Let FTS detect/declare failure sooner
-- start_ignore
alter system set gp_fts_probe_interval to 10;
-- After a RESET it still takes a little while for the primary to restart.
-- During that window FTS could think the primary is in the "recovery not
-- in progress" stage and promote the mirror, so we need FTS to make that
-- decision a bit less frequently.
alter system set gp_fts_probe_retries to 15;
-- Reload the configuration so that the two settings above are picked up.
select pg_reload_conf();
-- end_ignore
-- Arm two faults on the primary of content 0 (role = 'p' and content = 0).
--
-- Fault 1: let the background writer sleep 17 seconds to delay the reset.
-- 17 is chosen to be larger than the 15-second retry window, which is what
-- makes the test meaningful, while still keeping the chance that FTS sees
-- a "recovery not in progress" primary as low as possible.
select gp_inject_fault('fault_in_background_writer_quickdie', 'sleep', '', '', '', 1, 1, 17, dbid)
from gp_segment_configuration where role = 'p' and content = 0;
-- Fault 2: do not let the postmaster send SIGKILL to the bgwriter.
-- NOTE(review): presumably this keeps the sleeping bgwriter from being
-- killed early, so the 17-second delay above is preserved -- confirm.
select gp_inject_fault_infinite('postmaster_server_loop_no_sigkill', 'skip', dbid)
from gp_segment_configuration where role = 'p' and content = 0;
-- Now bring down the primary of seg0. There are a lot of ways to do that. To
-- better emulate a real-world scenario, we inject a PANIC.
-- Session prefixes below follow the isolation2 test syntax: "N:" runs the
-- statement in session N, "N&:" runs it in the background, and "N<:" waits
-- for that session's background statement and reports its result.
1:select gp_inject_fault('start_prepare', 'panic', dbid)
from gp_segment_configuration where role = 'p' AND content = 0;
-- NOTE(review): presumably this CREATE TABLE is what reaches the
-- 'start_prepare' fault point and triggers the PANIC -- confirm.
1&:create table fts_reset_t(a int);
-- This should fail due to the seg0 in reset mode
2&:create table fts_reset_t2(a int);
-- Try another one but let the gang creation retry for longer.
-- Default is 2000ms (gp_gang_creation_retry_timer) * 5 (gp_gang_creation_retry_count) = 10s.
-- Now make it 10000ms * 5 = 50s, which is well longer than the 17-second
-- delay we inserted before, so it can succeed.
3:set gp_gang_creation_retry_timer = 10000;
3:create table fts_reset_t3(a int);
-- Collect the results of the two background statements started above.
1<:
2<:
-- We shouldn't see failover to mirror.
-- Request an FTS probe scan, then list the content 0 segments so that the
-- expected output pins their role, preferred_role and status.
select gp_request_fts_probe_scan();
select dbid, role, preferred_role, status from gp_segment_configuration where content = 0;
-- Clear both faults that were armed on the content 0 primary earlier.
select gp_inject_fault('postmaster_server_loop_no_sigkill', 'reset', dbid) from gp_segment_configuration where role = 'p' and content = 0;
select gp_inject_fault('fault_in_background_writer_quickdie', 'reset', dbid) from gp_segment_configuration where role = 'p' and content = 0;
-- NOTE(review): the reason for this fixed 30-second wait is not stated here.
-- Presumably it gives the segment time to settle before the settings are
-- restored and the cluster is rebalanced -- confirm.
select pg_sleep(30);
-- start_ignore
-- restore parameters
-- Undo the two FTS probe settings changed at the top of this test and
-- reload the configuration so the reset values are picked up.
alter system reset gp_fts_probe_interval;
alter system reset gp_fts_probe_retries;
select pg_reload_conf();
-- end_ignore
-- The only table that should have been created successfully
-- (fts_reset_t and fts_reset_t2 are expected to have failed above).
drop table fts_reset_t3;
-- In case anything goes wrong, we don't want to affect other tests. So rebalance the cluster anyway.
-- Per the gprecoverseg options: -a skips the confirmation prompt, -F does a
-- full recovery, and -r rebalances segments to their preferred roles.
-- NOTE(review): "!\retcode" presumably runs the shell command and reports
-- only its return code -- confirm against the isolation2 framework.
!\retcode gprecoverseg -aF
!\retcode gprecoverseg -ar