blob: b1be240916a4fb1383bb148b87b7397e6f61f793 [file] [log] [blame]
-- Test system faults scenarios
-- start_matchsubs
--
-- m/Is the server running on host.*/
-- s/Is the server running on host "\d+.\d+.\d+.\d+" and accepting/Is the server running on host <IP> and accepting/
-- m/(seg\d+ \d+.\d+.\d+.\d+:\d+)/
-- s/(.*)/(seg<ID> IP:PORT)/
-- m/ERROR: connection to dbid 1 .*:7000 failed .*/
-- s/ERROR: connection to dbid 1 .*:7000 failed .*/ERROR: connection to dbid 1 <host>:7000 failed/
--
-- end_matchsubs
-- Let FTS detect/declare failure sooner
!\retcode gpconfig -c gp_fts_probe_interval -v 10 --coordinatoronly;
!\retcode gpstop -u;
create table hs_failover(a int);
insert into hs_failover select * from generate_series(1,10);
-1S: select * from hs_failover;
----------------------------------------------------------------
-- Mirror segment fails
----------------------------------------------------------------
select pg_ctl(datadir, 'stop', 'immediate') from gp_segment_configuration where content=1 and role = 'm';
-- make sure mirror is detected down
create temp table hs_tt(a int);
select gp_request_fts_probe_scan();
-- will not succeed
-1S: select * from hs_failover;
-1Sq:
-- recovery
!\retcode gprecoverseg -aF;
-- sync-up
select wait_until_all_segments_synchronized();
-- works now
-1S: select * from hs_failover;
----------------------------------------------------------------
-- Primary segment fails
----------------------------------------------------------------
-- inject a fault where the mirror gets out of recovery
select gp_inject_fault('out_of_recovery_in_startupxlog', 'skip', dbid) from gp_segment_configuration where content = 1 and role = 'm';
select pg_ctl(datadir, 'stop', 'immediate') from gp_segment_configuration where content=1 and role = 'p';
select gp_request_fts_probe_scan();
-- make sure failover happens
select dbid, content, role, preferred_role, mode, status from gp_segment_configuration where content = 1;
select gp_wait_until_triggered_fault('out_of_recovery_in_startupxlog', 1, dbid) from gp_segment_configuration where content = 1 and role = 'p';
select gp_inject_fault('out_of_recovery_in_startupxlog', 'reset', dbid) from gp_segment_configuration where content = 1 and role = 'p';
-- On an existing standby connection, query will run but it is dispatched to the previous mirror
-- in an existing gang. That mirror is now a primary, so it will complain and the query fails.
-1S: select * from hs_failover;
-1Sq:
-- start_ignore
-- will fail due to downed mirror (previous primary)
-1S: select * from hs_failover;
-1Sq:
-- end_ignore
-- bring the downed mirror up
!\retcode gprecoverseg -aF;
select wait_until_all_segments_synchronized();
-- mirror is up
-1S: select dbid, content, role, preferred_role, mode, status from gp_segment_configuration where content = 1;
-- now the query will succeed
-1S: select * from hs_failover;
-1Sq:
-- re-balance, bring the segments to their preferred roles
!\retcode gprecoverseg -ar;
select wait_until_all_segments_synchronized();
-1S: select dbid, content, role, preferred_role, mode, status from gp_segment_configuration where content = 1;
-- query runs fine still
-1S: select * from hs_failover;
----------------------------------------------------------------
-- DTX recovery
----------------------------------------------------------------
-- skip FTS probe to prevent unexpected mirror promotion
1: select gp_inject_fault_infinite('fts_probe', 'skip', dbid) from gp_segment_configuration where role='p' and content=-1;
1: create table tt_hs_dtx(a int);
-- inject fault to repeatedly fail the COMMIT PREPARE phase of 2PC, which ensures that the dtx cannot finish even by the dtx recovery process.
select gp_inject_fault_infinite('finish_commit_prepared', 'error', dbid) from gp_segment_configuration where content=1 and role='p';
-- session 1 on primary QD tries to commit a DTX, but cannot finish due to the fault on a QE
1&: insert into tt_hs_dtx select * from generate_series(1,10);
-- inject a panic on primary QD, essentially restarts the primary QD
2: select gp_inject_fault('before_read_command', 'panic', dbid) from gp_segment_configuration where content=-1 and role='p';
2: select 1;
1<:
1q:
2q:
-- standby QD can still run query
-1S: select * from hs_failover;
-- it cannot see rows from the in-doubt DTX
-1S: select * from tt_hs_dtx;
-- let the failed dtx be recovered, also make sure the standby replays the forget record which signals the completion of the dtx
-1S: select gp_inject_fault('redoDistributedForgetCommitRecord', 'skip', dbid) from gp_segment_configuration where content=-1 and role='m';
-1S: select gp_inject_fault_infinite('finish_commit_prepared', 'reset', dbid) from gp_segment_configuration where content=1 and role='p';
-1S: select gp_wait_until_triggered_fault('redoDistributedForgetCommitRecord', 1, dbid) from gp_segment_configuration where content=-1 and role='m';
-1S: select gp_inject_fault('redoDistributedForgetCommitRecord', 'reset', dbid) from gp_segment_configuration where content=-1 and role='m';
-- standby should see the rows from the in-doubt DTX now
-1S: select * from tt_hs_dtx;
-1S: select wait_until_all_segments_synchronized();
1: select gp_inject_fault('before_read_command', 'reset', dbid) from gp_segment_configuration where content=-1 and role='p';
1: select gp_inject_fault('fts_probe', 'reset', dbid) from gp_segment_configuration where role='p' and content=-1;