blob: 8cd7666e236f06d012d36162fc01c0c785cb7457 [file] [log] [blame]
-- Test this scenario:
-- mirror has latency replaying the WAL from the primary, the master is reset
-- from PANIC, master will start the DTX recovery process to recover the
-- in-progress two-phase transactions.
-- The FTS process should be able to continue probe and 'sync off' the mirror
-- while the 'dtx recovery' process is hanging recovering distributed transactions.
1: create table t_wait_lsn(a int);
5: create table t_wait_lsn2(a int);
-- suspend segment 0 before performing 'COMMIT PREPARED'
2: select gp_inject_fault_infinite('finish_prepared_start_of_function', 'suspend', dbid) from gp_segment_configuration where content=0 and role='p';
1&: insert into t_wait_lsn values(2),(1);
5&: insert into t_wait_lsn2 values(2),(1);
2: select gp_wait_until_triggered_fault('finish_prepared_start_of_function', 2, dbid) from gp_segment_configuration where content=0 and role='p';
-- let walreceiver on mirror 0 skip WAL flush
2: select gp_inject_fault_infinite('walrecv_skip_flush', 'skip', dbid) from gp_segment_configuration where content=0 and role='m';
-- resume 'COMMIT PREPARED', session 1 will hang on 'SyncRepWaitForLSN'
2: select gp_inject_fault_infinite('finish_prepared_start_of_function', 'reset', dbid) from gp_segment_configuration where content=0 and role='p';
0U: select count(*) from pg_prepared_xacts;
-- stop mirror
3: SELECT pg_ctl(datadir, 'stop', 'immediate') FROM gp_segment_configuration WHERE content=0 AND role = 'm';
!\retcode gpfts -R 1 -A -D;
-- trigger master reset
3: select gp_inject_fault('exec_simple_query_start', 'panic', current_setting('gp_dbid')::smallint);
-- verify master panic happens. The PANIC message does not emit sometimes so
-- mask it.
-- start_matchsubs
-- m/PANIC: fault triggered, fault name:'exec_simple_query_start' fault type:'panic'\n/
-- s/PANIC: fault triggered, fault name:'exec_simple_query_start' fault type:'panic'\n//
-- end_matchsubs
3: select 1;
-- potential flakiness: there is a chance where the coordinator
-- recovers fast enough (from the panic above) that we end up fault injecting too late.
-1U: select gp_inject_fault_infinite('post_progress_recovery_comitted', 'suspend', dbid) FROM gp_segment_configuration WHERE content=-1 AND role='p';
-1U: select gp_wait_until_triggered_fault('post_progress_recovery_comitted', 1, dbid) from gp_segment_configuration where content=-1 and role='p';
-1U: select * from gp_stat_progress_dtx_recovery;
-1U: select gp_inject_fault_infinite('post_progress_recovery_comitted', 'reset', dbid) from gp_segment_configuration where content=-1 and role='p';
-- wait for coordinator finish crash recovery
-1U: select wait_until_standby_in_state('streaming');
-- wait for FTS to 'sync off' the mirror, meanwhile, dtx recovery process will
-- restart repeatedly.
-- the query should succeed finally since dtx recovery process is able to quit.
-- this's what we want to test.
4: select count(*) from t_wait_lsn;
1<:
5<:
!\retcode gpfts -R 1 -A -D;
!\retcode gprecoverseg -a;
!\retcode gpfts -R 1 -A -D;
-- loop while segments come in sync
4: select wait_until_all_segments_synchronized();
4: select pg_sleep(10);
4: select count(*) from t_wait_lsn;
4: drop table t_wait_lsn;
4: drop table t_wait_lsn2;
4: select gp_inject_fault('walrecv_skip_flush', 'reset', dbid) from gp_segment_configuration where content=0;