-- src/test/singlenode_isolation2/sql/pg_rewind_fail_missing_xlog.sql - cloudberry - Git at Google

 -- Test the bug that if the checkpoint.redo wal file before the oldest
 -- replication slot LSN is removed/recycled by the checkpointer, gprecoverseg
 -- (based on pg_rewind) would fail.

 -- Setup: create the test table and seed it with three rows.
 CREATE TABLE tst_missing_tbl (a int);
 INSERT INTO tst_missing_tbl values(2),(1),(5);

 -- Make the test faster: set wal_keep_size to 128 with gpconfig, then run
 -- gpstop -ari (presumably a restart so the new value takes effect -- TODO confirm).
 !\retcode gpconfig -c wal_keep_size -v 128;
 !\retcode gpstop -ari;

 -- Test 1: the primary was marked down by the master but actually it keeps
 -- running. Previously, checkpoints could recycle/remove the checkpoint.redo wal
 -- file before the oldest replication slot LSN and thus make pg_rewind fail due
 -- to a missing xlog file.

 -- Run a checkpoint so that the below sqls won't cause a checkpoint
 -- until an explicit checkpoint command is issued by the test.
 -- checkpoint_timeout is 300 by default, and the test below should be able to
 -- finish within 300 seconds.
 1: CHECKPOINT;

 -- Alternate pg_switch_wal() calls on session 0U with inserts on session 1.
 -- NOTE(review): 0U is presumably a utility-mode connection to the content 0
 -- primary -- confirm against the isolation2 framework documentation.
 0U: SELECT pg_switch_wal is not null FROM pg_switch_wal();
 1: INSERT INTO tst_missing_tbl values(2),(1),(5);
 0U: SELECT pg_switch_wal is not null FROM pg_switch_wal();
 1: INSERT INTO tst_missing_tbl values(2),(1),(5);
 0U: SELECT pg_switch_wal is not null FROM pg_switch_wal();
 1: INSERT INTO tst_missing_tbl values(2),(1),(5);
 -- Should be not needed mostly but let's 100% ensure since pg_switch_wal()
 -- won't switch if it is already on the boundary (seldom though).
 0U: SELECT pg_switch_wal is not null FROM pg_switch_wal();
 1: INSERT INTO tst_missing_tbl values(2),(1),(5);

 -- Mark down the primary with content 0 via fts fault injection: an infinite
 -- error fault on fts_handle_message is injected on the content 0 primary.
 1: SELECT gp_inject_fault_infinite('fts_handle_message', 'error', dbid) FROM gp_segment_configuration WHERE content = 0 AND role = 'p';

 -- Trigger failover and double check the roles recorded for content 0.
 1: SELECT gp_request_fts_probe_scan();
 1: SELECT role, preferred_role from gp_segment_configuration where content = 0;

 -- Run two more checkpoints. Previously this caused the checkpoint.redo wal
 -- file before the oldest replication slot LSN to be recycled/removed.
 -- NOTE(review): 0M presumably connects to the content 0 segment now recorded
 -- as mirror, i.e. the old primary that is still running -- TODO confirm.
 0M: CHECKPOINT;
 0M: CHECKPOINT;

 -- Wait some seconds until the promotion is done. When the query comes too early,
 -- the promoted primary is still hot-standby, but we don't support hot-standby now.
 2: select pg_sleep(2);

 -- Write something. Promotion adds an 'End Of Recovery' xlog record that causes
 -- the divergence between primary and mirror, but a write is added here so that
 -- we know that a wal divergence is explicitly triggered and 100% completed.
 -- Also sanity check the tuple distribution (an assumption of the test).
 2: INSERT INTO tst_missing_tbl values(2),(1),(5);
 2: SELECT gp_segment_id, count(*) from tst_missing_tbl group by gp_segment_id;

 -- Ensure that pg_rewind succeeds. Previously it could fail since the divergence
 -- LSN wal file is missing.
 !\retcode gprecoverseg -av;
 -- In case the recovery above fails, run gprecoverseg -aF (presumably a full
 -- recovery -- TODO confirm) so that it does not affect subsequent testing.
 !\retcode gprecoverseg -aF;
 2: SELECT wait_until_all_segments_synchronized();

 -- Test 2
 -- The primary is abnormally shut down, but pg_rewind would call single mode
 -- postgres to ensure a clean shutdown, and that causes two checkpoints.

 -- See the comment before the first CHECKPOINT in Test 1 for why.
 3: CHECKPOINT;

 -- Alternate pg_switch_wal() calls on session 1U with inserts on session 3.
 -- NOTE(review): 1U is presumably a utility-mode connection to the content 1
 -- primary -- confirm against the isolation2 framework documentation.
 1U: SELECT pg_switch_wal is not null FROM pg_switch_wal();
 3: INSERT INTO tst_missing_tbl values(2),(1),(5);
 1U: SELECT pg_switch_wal is not null FROM pg_switch_wal();
 3: INSERT INTO tst_missing_tbl values(2),(1),(5);
 1U: SELECT pg_switch_wal is not null FROM pg_switch_wal();
 3: INSERT INTO tst_missing_tbl values(2),(1),(5);
 -- Should be not needed mostly but let's 100% ensure since pg_switch_wal()
 -- won't switch if it is on the boundary already (seldom though).
 1U: SELECT pg_switch_wal is not null FROM pg_switch_wal();
 3: INSERT INTO tst_missing_tbl values(2),(1),(5);

 -- Hang at checkpointer before writing checkpoint xlog: inject a suspend fault
 -- at checkpoint_after_redo_calculated on the content 1 primary, issue a
 -- CHECKPOINT on 1U (the & presumably runs it in the background -- TODO
 -- confirm), then wait until the fault has been triggered once.
 3: SELECT gp_inject_fault('checkpoint_after_redo_calculated', 'suspend', dbid) FROM gp_segment_configuration WHERE role='p' AND content = 1;
 1U&: CHECKPOINT;
 3: SELECT gp_wait_until_triggered_fault('checkpoint_after_redo_calculated', 1, dbid) FROM gp_segment_configuration WHERE role='p' AND content = 1;

 -- Stop the primary immediately and promote the mirror.
 3: SELECT pg_ctl(datadir, 'stop', 'immediate') FROM gp_segment_configuration WHERE role='p' AND content = 1;
 3: SELECT gp_request_fts_probe_scan();
 -- Wait until the end-of-recovery CHECKPOINT has completed after the mirror was
 -- promoted: inject a skip fault at the same point, wait for it to trigger once,
 -- then reset it. NOTE(review): role = 'p' for content 1 presumably resolves to
 -- the promoted mirror at this point -- confirm.
 3: SELECT gp_inject_fault('checkpoint_after_redo_calculated', 'skip', dbid) FROM gp_segment_configuration WHERE role='p' AND content = 1;
 3: SELECT gp_wait_until_triggered_fault('checkpoint_after_redo_calculated', 1, dbid) FROM gp_segment_configuration WHERE role = 'p' AND content = 1;
 3: SELECT gp_inject_fault('checkpoint_after_redo_calculated', 'reset', dbid) FROM gp_segment_configuration WHERE role = 'p' AND content = 1;
 3: SELECT role, preferred_role from gp_segment_configuration where content = 1;

 -- Write after the failover and sanity check the tuple distribution.
 4: INSERT INTO tst_missing_tbl values(2),(1),(5);
 4: SELECT gp_segment_id, count(*) from tst_missing_tbl group by gp_segment_id;

 -- CHECKPOINT should fail now.
 -- NOTE(review): 1U< presumably collects the result of the CHECKPOINT issued
 -- with 1U& above and 1Uq closes that session -- TODO confirm.
 1U<:
 1Uq:

 -- Ensure that pg_rewind succeeds. For an unclean shutdown, two checkpoints
 -- are introduced in pg_rewind when running single-mode postgres (one is the
 -- checkpoint after crash recovery and another is the shutdown checkpoint).
 -- Previously those checkpoints cleaned up the wal files that include the
 -- previous checkpoint (before the divergence LSN) needed by pg_rewind, and
 -- thus made gprecoverseg (pg_rewind) fail.
 !\retcode gprecoverseg -av;
 -- In case the recovery above fails, run gprecoverseg -aF (presumably a full
 -- recovery -- TODO confirm) so that it does not affect subsequent testing.
 !\retcode gprecoverseg -aF;
 4: SELECT wait_until_all_segments_synchronized();

 -- Cleanup: drop the test table, run gprecoverseg -ar (presumably rebalancing
 -- segments back to their preferred roles -- TODO confirm), wait until all
 -- segments are synchronized, then remove the wal_keep_size override and run
 -- gpstop -ari again.
 5: DROP TABLE tst_missing_tbl;
 !\retcode gprecoverseg -ar;
 5: SELECT wait_until_all_segments_synchronized();
 !\retcode gpconfig -r wal_keep_size;
 !\retcode gpstop -ari;
	-- Test the bug that if the checkpoint.redo wal file before the oldest
	-- replication slot LSN is removed/recycled by the checkpointer, gprecoverseg
	-- (based on pg_rewind) would fail.

	-- NOTE(review): from here on the script repeats the statements found earlier
	-- in this file, indented with tabs -- possibly an extraction artifact. Confirm
	-- whether the repetition is intended.
	-- Setup: create the test table and seed it with three rows.
	CREATE TABLE tst_missing_tbl (a int);
	INSERT INTO tst_missing_tbl values(2),(1),(5);

	-- Make the test faster: set wal_keep_size to 128 with gpconfig, then run
	-- gpstop -ari (presumably a restart so the new value takes effect -- TODO confirm).
	!\retcode gpconfig -c wal_keep_size -v 128;
	!\retcode gpstop -ari;

	-- Test 1: the primary was marked down by the master but actually it keeps
	-- running. Previously, checkpoints could recycle/remove the checkpoint.redo wal
	-- file before the oldest replication slot LSN and thus make pg_rewind fail due
	-- to a missing xlog file.

	-- Run a checkpoint so that the below sqls won't cause a checkpoint
	-- until an explicit checkpoint command is issued by the test.
	-- checkpoint_timeout is 300 by default, and the test below should be able to
	-- finish within 300 seconds.
	1: CHECKPOINT;

	-- Alternate pg_switch_wal() calls on session 0U with inserts on session 1.
	-- NOTE(review): 0U is presumably a utility-mode connection to the content 0
	-- primary -- confirm against the isolation2 framework documentation.
	0U: SELECT pg_switch_wal is not null FROM pg_switch_wal();
	1: INSERT INTO tst_missing_tbl values(2),(1),(5);
	0U: SELECT pg_switch_wal is not null FROM pg_switch_wal();
	1: INSERT INTO tst_missing_tbl values(2),(1),(5);
	0U: SELECT pg_switch_wal is not null FROM pg_switch_wal();
	1: INSERT INTO tst_missing_tbl values(2),(1),(5);
	-- Should be not needed mostly but let's 100% ensure since pg_switch_wal()
	-- won't switch if it is already on the boundary (seldom though).
	0U: SELECT pg_switch_wal is not null FROM pg_switch_wal();
	1: INSERT INTO tst_missing_tbl values(2),(1),(5);

	-- Mark down the primary with content 0 via fts fault injection: an infinite
	-- error fault on fts_handle_message is injected on the content 0 primary.
	1: SELECT gp_inject_fault_infinite('fts_handle_message', 'error', dbid) FROM gp_segment_configuration WHERE content = 0 AND role = 'p';

	-- Trigger failover and double check the roles recorded for content 0.
	1: SELECT gp_request_fts_probe_scan();
	1: SELECT role, preferred_role from gp_segment_configuration where content = 0;

	-- Run two more checkpoints. Previously this caused the checkpoint.redo wal
	-- file before the oldest replication slot LSN to be recycled/removed.
	-- NOTE(review): 0M presumably connects to the content 0 segment now recorded
	-- as mirror, i.e. the old primary that is still running -- TODO confirm.
	0M: CHECKPOINT;
	0M: CHECKPOINT;

	-- Wait some seconds until the promotion is done. When the query comes too early,
	-- the promoted primary is still hot-standby, but we don't support hot-standby now.
	2: select pg_sleep(2);

	-- Write something. Promotion adds an 'End Of Recovery' xlog record that causes
	-- the divergence between primary and mirror, but a write is added here so that
	-- we know that a wal divergence is explicitly triggered and 100% completed.
	-- Also sanity check the tuple distribution (an assumption of the test).
	2: INSERT INTO tst_missing_tbl values(2),(1),(5);
	2: SELECT gp_segment_id, count(*) from tst_missing_tbl group by gp_segment_id;

	-- Ensure that pg_rewind succeeds. Previously it could fail since the divergence
	-- LSN wal file is missing.
	!\retcode gprecoverseg -av;
	-- In case the recovery above fails, run gprecoverseg -aF (presumably a full
	-- recovery -- TODO confirm) so that it does not affect subsequent testing.
	!\retcode gprecoverseg -aF;
	2: SELECT wait_until_all_segments_synchronized();

	-- Test 2
	-- The primary is abnormally shut down, but pg_rewind would call single mode
	-- postgres to ensure a clean shutdown, and that causes two checkpoints.

	-- See the comment before the first CHECKPOINT in Test 1 for why.
	3: CHECKPOINT;

	-- Alternate pg_switch_wal() calls on session 1U with inserts on session 3.
	-- NOTE(review): 1U is presumably a utility-mode connection to the content 1
	-- primary -- confirm against the isolation2 framework documentation.
	1U: SELECT pg_switch_wal is not null FROM pg_switch_wal();
	3: INSERT INTO tst_missing_tbl values(2),(1),(5);
	1U: SELECT pg_switch_wal is not null FROM pg_switch_wal();
	3: INSERT INTO tst_missing_tbl values(2),(1),(5);
	1U: SELECT pg_switch_wal is not null FROM pg_switch_wal();
	3: INSERT INTO tst_missing_tbl values(2),(1),(5);
	-- Should be not needed mostly but let's 100% ensure since pg_switch_wal()
	-- won't switch if it is on the boundary already (seldom though).
	1U: SELECT pg_switch_wal is not null FROM pg_switch_wal();
	3: INSERT INTO tst_missing_tbl values(2),(1),(5);

	-- Hang at checkpointer before writing checkpoint xlog: inject a suspend fault
	-- at checkpoint_after_redo_calculated on the content 1 primary, issue a
	-- CHECKPOINT on 1U (the & presumably runs it in the background -- TODO
	-- confirm), then wait until the fault has been triggered once.
	3: SELECT gp_inject_fault('checkpoint_after_redo_calculated', 'suspend', dbid) FROM gp_segment_configuration WHERE role='p' AND content = 1;
	1U&: CHECKPOINT;
	3: SELECT gp_wait_until_triggered_fault('checkpoint_after_redo_calculated', 1, dbid) FROM gp_segment_configuration WHERE role='p' AND content = 1;

	-- Stop the primary immediately and promote the mirror.
	3: SELECT pg_ctl(datadir, 'stop', 'immediate') FROM gp_segment_configuration WHERE role='p' AND content = 1;
	3: SELECT gp_request_fts_probe_scan();
	-- Wait until the end-of-recovery CHECKPOINT has completed after the mirror was
	-- promoted: inject a skip fault at the same point, wait for it to trigger once,
	-- then reset it. NOTE(review): role = 'p' for content 1 presumably resolves to
	-- the promoted mirror at this point -- confirm.
	3: SELECT gp_inject_fault('checkpoint_after_redo_calculated', 'skip', dbid) FROM gp_segment_configuration WHERE role='p' AND content = 1;
	3: SELECT gp_wait_until_triggered_fault('checkpoint_after_redo_calculated', 1, dbid) FROM gp_segment_configuration WHERE role = 'p' AND content = 1;
	3: SELECT gp_inject_fault('checkpoint_after_redo_calculated', 'reset', dbid) FROM gp_segment_configuration WHERE role = 'p' AND content = 1;
	3: SELECT role, preferred_role from gp_segment_configuration where content = 1;

	-- Write after the failover and sanity check the tuple distribution.
	4: INSERT INTO tst_missing_tbl values(2),(1),(5);
	4: SELECT gp_segment_id, count(*) from tst_missing_tbl group by gp_segment_id;

	-- CHECKPOINT should fail now.
	-- NOTE(review): 1U< presumably collects the result of the CHECKPOINT issued
	-- with 1U& above and 1Uq closes that session -- TODO confirm.
	1U<:
	1Uq:

	-- Ensure that pg_rewind succeeds. For an unclean shutdown, two checkpoints
	-- are introduced in pg_rewind when running single-mode postgres (one is the
	-- checkpoint after crash recovery and another is the shutdown checkpoint).
	-- Previously those checkpoints cleaned up the wal files that include the
	-- previous checkpoint (before the divergence LSN) needed by pg_rewind, and
	-- thus made gprecoverseg (pg_rewind) fail.
	!\retcode gprecoverseg -av;
	-- In case the recovery above fails, run gprecoverseg -aF (presumably a full
	-- recovery -- TODO confirm) so that it does not affect subsequent testing.
	!\retcode gprecoverseg -aF;
	4: SELECT wait_until_all_segments_synchronized();

	-- Cleanup: drop the test table, run gprecoverseg -ar (presumably rebalancing
	-- segments back to their preferred roles -- TODO confirm), wait until all
	-- segments are synchronized, then remove the wal_keep_size override and run
	-- gpstop -ari again.
	5: DROP TABLE tst_missing_tbl;
	!\retcode gprecoverseg -ar;
	5: SELECT wait_until_all_segments_synchronized();
	!\retcode gpconfig -r wal_keep_size;
	!\retcode gpstop -ari;