blob: b946b961c142751a017ed3778e0b97a97399a2c2 [file] [log] [blame]
-- Test scenarios covering crash recovery for some specific DTM point.
--
-- start_matchsubs
--
-- # create a match/subs expression
--
-- s/\s+\(.*\.[ch]:\d+\)/ (SOMEFILE:SOMEFUNC)/
-- m/(PANIC):.*unable to complete*/
--
-- m/^ERROR: Error on receive from seg0.*: server closed the connection unexpectedly/
-- s/^ERROR: Error on receive from seg0.*: server closed the connection unexpectedly/ERROR: server closed the connection unexpectedly/
--
-- end_matchsubs
-- Make the test faster and also make some queries fail as expected after
-- 2pc retry PANIC (do not finish earlier before PANIC happens).
alter system set dtx_phase2_retry_second to 5;
select pg_reload_conf();
1:SELECT role, preferred_role, content, status FROM gp_segment_configuration;
-- Scenario 1: Test to fail broadcasting of COMMIT PREPARED to one
-- segment and hence trigger PANIC in master while after completing
-- phase 2 of 2PC. Master's recovery cycle should correctly broadcast
-- COMMIT PREPARED again because master should find distributed commit
-- record in its xlog during recovery. Verify that the transaction is
-- committed after recovery. This scenario used to create cluster
-- inconsistency due to bug fixed now, as transaction used to get
-- committed on all segments except one where COMMIT PREPARED
-- broadcast failed before recovery. Master used to miss sending the
-- COMMIT PREPARED across restart and instead abort the transaction
-- after querying in-doubt prepared transactions from segments.
-- Inject fault to fail the COMMIT PREPARED on one segment.
-- Temporarily disable retry finish prepared in this session, because
-- we are not interested in testing the retry logic. This makes it
-- suffice to trigger the fault only once. Otherwise, the fault may
-- continue to trigger even after PANIC on coordinator, impacting
-- finish prepared operation during crash recovery.
1: SET dtx_phase2_retry_second = 0;
1: SELECT gp_inject_fault('finish_prepared_start_of_function', 'error', dbid)
from gp_segment_configuration where content=0 and role='p';
-- Start looping in background, till master panics and closes the session
3&: SELECT wait_till_master_shutsdown();
-- Start transaction which should hit PANIC as COMMIT PREPARED will fail to one segment
1: CREATE TABLE commit_phase1_panic(a int, b int);
-- Reset the fault in utility mode because normal mode connection will
-- not be accepted until DTX recovery is finished.
-1U: SELECT gp_inject_fault('finish_prepared_start_of_function', 'reset', dbid)
from gp_segment_configuration where content=0 and role='p';
-1Uq:
-- Join back to know master has completed postmaster reset.
3<:
-- Start a session on master which would complete the DTM recovery and hence COMMIT PREPARED
4: SELECT * from commit_phase1_panic;
4: INSERT INTO commit_phase1_panic select i,i from generate_series(1, 10)i;
4: SELECT count(*) from commit_phase1_panic;
-- Scenario 2: Inject FATAL on master after recording commit but
-- before broadcasting COMMIT_PREPARED to segments. FATAL must convert
-- to PANIC and make sure to complete the 2PC processing and not leave
-- dangling prepared transaction. There used to bug as a result the
-- master backend process would just die, leaving dangling prepared
-- transaction on segment but commited on master.
-- Start looping in background, till master panics and closes the
-- session
5&: SELECT wait_till_master_shutsdown();
6: SELECT gp_inject_fault('dtm_broadcast_commit_prepared', 'fatal', dbid)
from gp_segment_configuration where role='p' and content=-1;
6: CREATE TABLE commit_fatal_fault_test_table(a int, b int);
5<:
-- Start a session on master which would complete the DTM recovery and hence COMMIT PREPARED
7: SELECT count(*) from commit_fatal_fault_test_table;
7: SELECT * FROM gp_dist_random('pg_prepared_xacts');
7: SELECT gp_inject_fault('dtm_broadcast_commit_prepared', 'reset', dbid)
from gp_segment_configuration where role='p' and content=-1;
-- Scenario 3: Inject ERROR after prepare phase has completed to
-- trigger abort. Then on abort inject FATAL on master before sending
-- ABORT_PREPARED. FATAL must convert to PANIC and make sure to
-- complete the 2PC processing and not leave dangling prepared
-- transaction. There used to bug as a result the master backend
-- process would just die, leaving dangling prepared transaction on
-- segment but aborted on master.
-- Start looping in background, till master panics and closes the
-- session
8&: SELECT wait_till_master_shutsdown();
9: SELECT gp_inject_fault('transaction_abort_after_distributed_prepared', 'error', dbid)
from gp_segment_configuration where role='p' and content=-1;
9: SELECT gp_inject_fault('dtm_broadcast_abort_prepared', 'fatal', dbid)
from gp_segment_configuration where role='p' and content=-1;
9: CREATE TABLE abort_fatal_fault_test_table(a int, b int);
8<:
10: SELECT count(*) from abort_fatal_fault_test_table;
10: SELECT * FROM gp_dist_random('pg_prepared_xacts');
10: SELECT gp_inject_fault('transaction_abort_after_distributed_prepared', 'reset', dbid)
from gp_segment_configuration where role='p' and content=-1;
10: SELECT gp_inject_fault('dtm_broadcast_abort_prepared', 'reset', dbid)
from gp_segment_configuration where role='p' and content=-1;
-- Scenario 4: QE panics after writing prepare xlog record. This
-- should cause master to broadcast abort and QEs handle the abort in
-- DTX_CONTEXT_LOCAL_ONLY context.
11: CREATE TABLE QE_panic_test_table(a int, b int);
11: INSERT INTO QE_panic_test_table SELECT * from generate_series(0, 9);
-- To help speedy recovery
11: CHECKPOINT;
-- Increase 2PC retry timeout to avoid any failures. Alter
-- system is required to set the GUC and can't be set on session level
-- as session reset happens for every abort retry.
11: alter system set dtx_phase2_retry_second to 600;
11: select pg_reload_conf();
-- skip FTS probes always
11: SELECT gp_inject_fault_infinite('fts_probe', 'skip', dbid)
from gp_segment_configuration where role='p' and content=-1;
11: SELECT gp_request_fts_probe_scan();
11: SELECT gp_inject_fault('end_prepare_two_phase', 'infinite_loop', dbid)
from gp_segment_configuration where role='p' and content=0;
-- statement to trigger fault after writing prepare record
12&: DELETE FROM QE_panic_test_table;
11: SELECT gp_wait_until_triggered_fault('end_prepare_two_phase', 1, dbid)
from gp_segment_configuration where role='p' and content=0;
11: SELECT pg_ctl(datadir, 'restart') from gp_segment_configuration where role = 'p' and content = 0;
12<:
13: SELECT count(*) from QE_panic_test_table;
13: SELECT * FROM gp_dist_random('pg_prepared_xacts');
13: SELECT gp_inject_fault('fts_probe', 'reset', dbid)
from gp_segment_configuration where role='p' and content=-1;
13: alter system reset dtx_phase2_retry_second;
13: select pg_reload_conf();
-- Scenario 5: QD panics when a QE process is doing prepare but not yet finished.
-- This should cause dtx recovery finally aborts the orphaned prepared transaction.
15: CREATE TABLE master_reset(a int);
15: SELECT gp_inject_fault_infinite('before_xlog_xact_prepare', 'suspend', dbid)
from gp_segment_configuration where role = 'p' and content = 1;
15: SELECT gp_inject_fault_infinite('after_xlog_xact_prepare_flushed', 'skip', dbid)
from gp_segment_configuration where role = 'p' and content = 1;
16&: INSERT INTO master_reset SELECT a from generate_series(1, 10) a;
15: SELECT gp_wait_until_triggered_fault('before_xlog_xact_prepare', 1, dbid)
from gp_segment_configuration where role = 'p' and content = 1;
-- set gucs to speed up testing
15: ALTER SYSTEM SET gp_dtx_recovery_prepared_period to 0;
15: ALTER SYSTEM SET gp_dtx_recovery_interval to 5;
15: SELECT pg_reload_conf();
-- trigger master panic and wait until master down before running any new query.
17&: SELECT wait_till_master_shutsdown();
18: SELECT gp_inject_fault('exec_simple_query_start', 'panic', current_setting('gp_dbid')::smallint);
18: SELECT 1;
16<:
17<:
-- wait until master is up for querying.
19: SELECT 1;
-- master suspends before running periodical checking of orphaned prepared transactions.
19: SELECT gp_inject_fault_infinite('before_orphaned_check', 'suspend', dbid)
from gp_segment_configuration where role = 'p' and content = -1;
19: SELECT gp_wait_until_triggered_fault('before_orphaned_check', 1, dbid)
from gp_segment_configuration where role = 'p' and content = -1;
-- let prepare finish else dtx recovery can not abort the prepared transaction.
19: SELECT gp_inject_fault_infinite('before_xlog_xact_prepare', 'reset', dbid)
from gp_segment_configuration where role = 'p' and content = 1;
19: SELECT gp_wait_until_triggered_fault('after_xlog_xact_prepare_flushed', 1, dbid)
from gp_segment_configuration where role = 'p' and content = 1;
19: SELECT gp_inject_fault_infinite('after_xlog_xact_prepare_flushed', 'reset', dbid)
from gp_segment_configuration where role = 'p' and content = 1;
-- should exist an orphaned prepared transaction.
1U: SELECT count(*) from pg_prepared_xacts;
-- if there is orphaned prepared transaction, drop would hang since
-- the orphaned prepared transaction holds lock of the table that conflicts
-- with required lock of the drop operation.
19: SELECT gp_inject_fault_infinite('before_orphaned_check', 'reset', dbid)
from gp_segment_configuration where role = 'p' and content = -1;
19: DROP TABLE master_reset;
19: ALTER SYSTEM RESET gp_dtx_recovery_interval;
19: ALTER SYSTEM RESET gp_dtx_recovery_prepared_period;
19: SELECT pg_reload_conf();
-- ensure the orphaned prepared transaction is gone.
1U: SELECT * from pg_prepared_xacts;
-- Scenario 6: retry Abort Prepared on QD fails but won't cause panic. The dtx
-- recovery process finally aborts it.
-- speed up testing by setting some gucs.
20: ALTER SYSTEM SET gp_dtx_recovery_prepared_period to 0;
20: ALTER SYSTEM SET gp_dtx_recovery_interval to 5;
20: ALTER SYSTEM SET dtx_phase2_retry_second to 5;
20: SELECT pg_reload_conf();
20: CREATE TABLE test_retry_abort(a int);
-- master: set fault to trigger abort prepare
-- primary 0: set fault so that retry prepared abort fails.
20: SELECT gp_inject_fault('dtm_broadcast_prepare', 'error', dbid)
from gp_segment_configuration where role = 'p' and content = -1;
20: SELECT gp_inject_fault_infinite('finish_prepared_start_of_function', 'error', dbid)
from gp_segment_configuration where role = 'p' and content = 0;
-- run two phase query.
21: INSERT INTO test_retry_abort SELECT generate_series(1,10);
-- verify the transaction was aborted and there is one orphaned prepared
-- transaction on seg0.
20: SELECT * from test_retry_abort;
0U: SELECT count(*) from pg_prepared_xacts;
-- dtx recovery ready to handle the orphaned prepared transaction.
20: SELECT gp_inject_fault_infinite('before_orphaned_check', 'suspend', dbid)
from gp_segment_configuration where role = 'p' and content = -1;
20: SELECT gp_wait_until_triggered_fault('before_orphaned_check', 1, dbid)
from gp_segment_configuration where role = 'p' and content = -1;
20: SELECT gp_inject_fault_infinite('after_orphaned_check', 'skip', dbid)
from gp_segment_configuration where role = 'p' and content = -1;
-- kick off abort prepared on seg0 and then dtx recovery will abort that one.
20: SELECT gp_inject_fault_infinite('finish_prepared_start_of_function', 'reset', dbid)
from gp_segment_configuration where role = 'p' and content = 0;
20: SELECT gp_inject_fault_infinite('before_orphaned_check', 'reset', dbid)
from gp_segment_configuration where role = 'p' and content = -1;
-- verify there is no orphaned prepared transaction on seg0.
20: SELECT gp_wait_until_triggered_fault('after_orphaned_check', 1, dbid)
from gp_segment_configuration where role = 'p' and content = -1;
0U: SELECT * from pg_prepared_xacts;
-- cleanup
20: ALTER SYSTEM RESET gp_dtx_recovery_interval;
20: ALTER SYSTEM RESET gp_dtx_recovery_prepared_period;
20: ALTER SYSTEM RESET dtx_phase2_retry_second;
20: SELECT pg_reload_conf();
20: SELECT gp_inject_fault('dtm_broadcast_prepare', 'reset', dbid)
from gp_segment_configuration where role = 'p' and content = -1;
20: SELECT gp_inject_fault_infinite('after_orphaned_check', 'reset', dbid)
from gp_segment_configuration where role = 'p' and content = -1;
20: DROP TABLE test_retry_abort;