blob: 9a1ffa2781a892308d9be6afc2808e077e85c444 [file] [log] [blame]
-- Ensure that the wal files up until the one which has the last CHECKPOINT wal
-- record BEFORE the oldest replication slot restart_lsn are not recycled even
-- if more than one CHECKPOINTs are performed after that restart_lsn. Otherwise
-- gprecoverseg (based on pg_rewind) would fail due to missing wal file
CREATE OR REPLACE LANGUAGE plpython3u;
CREATE OR REPLACE FUNCTION connectSeg(n int, port int, hostname text) RETURNS bool AS $$
import os
import subprocess
import time
for i in range(n):
try:
subprocess.run(["psql", "-h", str(hostname), "-p", str(port), "postgres", "-Xc", "select 1;"],
env={"PGOPTIONS": "-c gp_role=utility", "PATH": os.getenv("PATH")},
check=True)
return True
except Exception as e:
time.sleep(1)
raise Exception("wait connection timeout")
$$
LANGUAGE plpython3u;
CREATE TABLE tst_missing_tbl (a int);
INSERT INTO tst_missing_tbl values(2),(1),(5);
-- Make the test faster by not preserving any extra wal segment files
!\retcode gpconfig -c wal_keep_size -v 0;
!\retcode gpstop -ari;
-- Test 1: Ensure that pg_rewind doesn't fail due to checkpoints inadvertently
-- recycling WAL when a former primary is marked down in configuration, while it
-- actually continues to run. The subsequent CHECKPOINT which is performed after the
-- failover on the segment being marked down should not recycle the wal file
-- which has the last common checkpoint of the target and source segment of
-- pg_rewind.
-- Run a checkpoint so that the below sqls won't cause a checkpoint
-- until an explicit checkpoint command is issued by the test.
-- checkpoint_timeout is by default 300 but the below test should be able to
-- finish in 300 seconds.
1: CHECKPOINT;
0U: SELECT pg_switch_wal is not null FROM pg_switch_wal();
1: INSERT INTO tst_missing_tbl values(2),(1),(5);
-- Should be not needed mostly but let's 100% ensure since pg_switch_wal()
-- won't switch if it has been on the boundary (seldom though).
0U: SELECT pg_switch_wal is not null FROM pg_switch_wal();
1: INSERT INTO tst_missing_tbl values(2),(1),(5);
0Uq:
-- Make sure primary/mirror pair is in sync, otherwise FTS can't promote mirror
1: SELECT wait_until_all_segments_synchronized();
-- Mark down the primary with content 0 via fts fault injection.
1: SELECT gp_inject_fault_infinite('fts_handle_message', 'error', dbid) FROM gp_segment_configuration WHERE content = 0 AND role = 'p';
-- Trigger failover and double check.
1: SELECT gp_request_fts_probe_scan();
1: SELECT role, preferred_role from gp_segment_configuration where content = 0;
-- Run two more checkpoints. Previously this causes the checkpoint.redo wal
-- file before the oldest replication slot LSN is recycled/removed.
0M: CHECKPOINT;
0M: BEGIN;
0M: DROP TABLE tst_missing_tbl;
0M: ABORT;
0M: CHECKPOINT;
0Mq:
-- Write something (promote adds a 'End Of Recovery' xlog that causes the
-- divergence between primary and mirror, but I add a write here so that we
-- know that a wal divergence is explicitly triggered and 100% completed. Also
-- sanity check the tuple distribution (assumption of the test).
2: INSERT INTO tst_missing_tbl values(2),(1),(5);
2: SELECT gp_segment_id, count(*) from tst_missing_tbl group by gp_segment_id;
-- Ensure that pg_rewind succeeds. Previously it could fail since the divergence
-- LSN wal file is missing.
!\retcode gprecoverseg -av;
-- In case it fails it should not affect subsequent testing.
!\retcode gprecoverseg -aF;
2: SELECT wait_until_all_segments_synchronized();
-- Test 2: Ensure that pg_rewind doesn't fail due to checkpoints inadvertently
-- recycling WAL when a former primary was abnormally shutdown. In the case the
-- target segment was abnormally shutdown, pg_rewind starts and then stops
-- the target segment as single-user mode postgres to ensure clean shutdown which
-- causes two checkpoints. The two newer checkpoints introduced by pg_rewind
-- should not recycle the wal file that has the last common checkpoint of the
-- target and source segment of pg_rewind.
-- See previous comment for why.
3: CHECKPOINT;
1U: SELECT pg_switch_wal is not null FROM pg_switch_wal();
3: INSERT INTO tst_missing_tbl values(2),(1),(5);
1U: SELECT pg_switch_wal is not null FROM pg_switch_wal();
3: INSERT INTO tst_missing_tbl values(2),(1),(5);
1U: SELECT pg_switch_wal is not null FROM pg_switch_wal();
3: INSERT INTO tst_missing_tbl values(2),(1),(5);
-- Should be not needed mostly but let's 100% ensure since pg_switch_wal()
-- won't switch if it is on the boundary already (seldom though).
1U: SELECT pg_switch_wal is not null FROM pg_switch_wal();
3: INSERT INTO tst_missing_tbl values(2),(1),(5);
-- Hang at checkpointer before writing checkpoint xlog.
3: SELECT gp_inject_fault('checkpoint_after_redo_calculated', 'suspend', dbid) FROM gp_segment_configuration WHERE role='p' AND content = 1;
1U&: CHECKPOINT;
3: SELECT gp_wait_until_triggered_fault('checkpoint_after_redo_calculated', 1, dbid) FROM gp_segment_configuration WHERE role='p' AND content = 1;
-- Stop the primary immediately and promote the mirror.
3: SELECT pg_ctl(datadir, 'stop', 'immediate') FROM gp_segment_configuration WHERE role='p' AND content = 1;
3: SELECT gp_request_fts_probe_scan();
-- Wait for the segment promotion finished and accept the connection
3: select connectSeg(600,port,hostname) from gp_segment_configuration where content = 1 and role = 'p';
-- Wait for the end of recovery CHECKPOINT completed after the mirror was promoted
3: SELECT gp_inject_fault('checkpoint_after_redo_calculated', 'skip', dbid) FROM gp_segment_configuration WHERE role='p' AND content = 1;
3: SELECT gp_wait_until_triggered_fault('checkpoint_after_redo_calculated', 1, dbid) FROM gp_segment_configuration WHERE role = 'p' AND content = 1;
3: SELECT gp_inject_fault('checkpoint_after_redo_calculated', 'reset', dbid) FROM gp_segment_configuration WHERE role = 'p' AND content = 1;
3: SELECT role, preferred_role from gp_segment_configuration where content = 1;
4: INSERT INTO tst_missing_tbl values(2),(1),(5);
4: SELECT gp_segment_id, count(*) from tst_missing_tbl group by gp_segment_id;
-- CHECKPOINT should fail now.
1U<:
1Uq:
-- Ensure that pg_rewind succeeds. For unclean shutdown, there are two
-- checkpoints are introduced in pg_rewind when running single-user mode postgres
-- (one is the checkpoint after crash recovery and another is the shutdown
-- checkpoint) and previously the checkpoints clean up the wal files that
-- include the previous checkpoint (before divergence LSN) for pg_rewind and
-- thus makes gprecoverseg (pg_rewind) fail.
!\retcode gprecoverseg -av;
-- In case it fails it should not affect subsequent testing.
!\retcode gprecoverseg -aF;
4: SELECT wait_until_all_segments_synchronized();
-- Test 3: Ensure that pg_rewind doesn't fail due to checkpoints inadvertently
-- recycling WAL when a former primary is marked down in configuration, while it
-- actually continued running for a while before it was cleanly shutdown. The wal
-- file on the target segment which has the last common checkpoint of the target
-- and source segment of pg_rewind should survive the subsequent CHECKPOINTs
-- performed on the target segment even if the segment was cleanly shutdown
-- and started again after the failover.
-- Run a checkpoint so that the below sqls won't cause a checkpoint
-- until an explicit checkpoint command is issued by the test.
-- checkpoint_timeout is by default 300 but the below test should be able to
-- finish in 300 seconds.
1: CHECKPOINT;
0U: SELECT pg_switch_wal is not null FROM pg_switch_wal();
1: INSERT INTO tst_missing_tbl values(2),(1),(5);
-- Should be not needed mostly but let's 100% ensure since pg_switch_wal()
-- won't switch if it has been on the boundary (seldom though).
0U: SELECT pg_switch_wal is not null FROM pg_switch_wal();
1: INSERT INTO tst_missing_tbl values(2),(1),(5);
0Uq:
-- Make sure primary/mirror pair is in sync, otherwise FTS can't promote mirror
1: SELECT wait_until_all_segments_synchronized();
-- Mark down the primary with content 0 via fts fault injection.
1: SELECT gp_inject_fault_infinite('fts_handle_message', 'error', dbid) FROM gp_segment_configuration WHERE content = 0 AND role = 'p';
-- Trigger failover and double check.
1: SELECT gp_request_fts_probe_scan();
1: SELECT role, preferred_role from gp_segment_configuration where content = 0;
-- Run two more checkpoints. Previously this causes the checkpoint.redo wal
-- file before the oldest replication slot LSN is recycled/removed.
0M: BEGIN;
0M: DROP TABLE tst_missing_tbl;
0M: ABORT;
0M: CHECKPOINT;
0M: BEGIN;
0M: DROP TABLE tst_missing_tbl;
0M: ABORT;
0M: CHECKPOINT;
-- Clean shutdown. Clean shutdown performs CHECKPOINT
2: SELECT pg_ctl(datadir, 'stop', 'fast') from gp_segment_configuration where role = 'm' and content = 0;
0Mq:
-- Start again. Start from a clean shutdown does not perform CHECKPOINT
2: SELECT pg_ctl_start(datadir, port) from gp_segment_configuration where role = 'm' and content = 0;
-- Perform CHECKPOINT. Previously this causes the checkpoint.redo wal
-- file before the oldest replication slot LSN is recycled/removed.
0M: CHECKPOINT;
-- Write something (promote adds a 'End Of Recovery' xlog that causes the
-- divergence between primary and mirror, but I add a write here so that we
-- know that a wal divergence is explicitly triggered and 100% completed. Also
-- sanity check the tuple distribution (assumption of the test).
2: INSERT INTO tst_missing_tbl values(2),(1),(5);
2: SELECT gp_segment_id, count(*) from tst_missing_tbl group by gp_segment_id;
-- Ensure that pg_rewind succeeds. Previously it could fail since the divergence
-- LSN wal file is missing.
!\retcode gprecoverseg -av;
-- In case it fails it should not affect subsequent testing.
!\retcode gprecoverseg -aF;
2: SELECT wait_until_all_segments_synchronized();
-- Test 4: Ensure that pg_rewind doesn't fail due to checkpoints inadvertently
-- recycling WAL when a former primary was abnormally shutdown and it wrote
-- a CHECKPOINT record locally but the record did not make to the wal receiver.
-- In the case the target segment was abnormally shutdown, pg_rewind starts and
-- then stops the target segment as single-user mode postgres to ensure clean shutdown
-- which causes two checkpoints. The two newer checkpoints introduced by pg_rewind
-- plus the checkpoint during the unclean shutdown should not recycle the wal
-- file that has the last common checkpoint of the target and source segment of
-- pg_rewind.
-- See previous comment for why.
3: CHECKPOINT;
1U: SELECT pg_switch_wal is not null FROM pg_switch_wal();
3: INSERT INTO tst_missing_tbl values(2),(1),(5);
-- Should be not needed mostly but let's 100% ensure since pg_switch_wal()
-- won't switch if it is on the boundary already (seldom though).
1U: SELECT pg_switch_wal is not null FROM pg_switch_wal();
3: INSERT INTO tst_missing_tbl values(2),(1),(5);
-- Have primary/mirror pair in sync before suspending the wal sender.
3: SELECT wait_until_all_segments_synchronized();
-- Hang the wal sender before writing checkpoint wal record.
3: SELECT gp_inject_fault('wal_sender_loop', 'suspend', dbid) FROM gp_segment_configuration WHERE role='p' AND content = 1;
3: SELECT gp_wait_until_triggered_fault('wal_sender_loop', 1, dbid) FROM gp_segment_configuration WHERE role='p' AND content = 1;
-- Checkpoint and make sure the CHECKPOINT record is written on disk
3: SELECT gp_inject_fault('checkpoint_control_file_updated', 'suspend', dbid) FROM gp_segment_configuration WHERE role='p' AND content = 1;
1U&: CHECKPOINT;
3: SELECT gp_wait_until_triggered_fault('checkpoint_control_file_updated', 1, dbid) FROM gp_segment_configuration WHERE role='p' AND content = 1;
-- Stop the primary immediately and promote the mirror.
3: SELECT pg_ctl(datadir, 'stop', 'immediate') FROM gp_segment_configuration WHERE role='p' AND content = 1;
3: SELECT gp_request_fts_probe_scan();
-- Wait for the segment promotion finished and accept the connection
3: select connectSeg(600,port,hostname) from gp_segment_configuration where content = 1 and role = 'p';
-- Reset faults and confirm FTS configuration
3: SELECT gp_inject_fault('wal_sender_loop', 'reset', dbid) FROM gp_segment_configuration WHERE role='p' AND content = 1;
3: SELECT gp_inject_fault('checkpoint_control_file_updated', 'reset', dbid) FROM gp_segment_configuration WHERE role = 'p' AND content = 1;
3: SELECT role, preferred_role from gp_segment_configuration where content = 1;
-- Write something on the current primary
4: INSERT INTO tst_missing_tbl values(2),(1),(5);
4: SELECT gp_segment_id, count(*) from tst_missing_tbl group by gp_segment_id;
-- CHECKPOINT should fail now.
1U<:
1Uq:
-- Ensure that pg_rewind succeeds. For unclean shutdown, there are two
-- checkpoints are introduced in pg_rewind when running single-user mode postgres
-- (one is the checkpoint after crash recovery and another is the shutdown
-- checkpoint) and previously the checkpoints clean up the wal files that
-- include the previous checkpoint (before divergence LSN) for pg_rewind and
-- thus makes gprecoverseg (pg_rewind) fail.
!\retcode gprecoverseg -av;
-- In case it fails it should not affect subsequent testing.
!\retcode gprecoverseg -aF;
4: SELECT wait_until_all_segments_synchronized();
-- Test 5: Ensure that replication slot's restart_lsn advances and WAL files
-- that are older than the minimum restart_lsn get removed/recycled.
-- Turn off wal_recycle to make this test easier to write.
!\retcode gpconfig -c wal_recycle -v false;
!\retcode gpstop -ari;
-- Create an unlogged table on the primary that remembers replication slot's last restart_lsn and number of WAL files.
1U: CREATE UNLOGGED TABLE unlogged_wal_retention_test(restart_lsn_before pg_lsn, wal_count_before int);
1U: INSERT INTO unlogged_wal_retention_test SELECT (select restart_lsn FROM pg_replication_slots WHERE slot_name = 'internal_wal_replication_slot') as restart_lsn_before, (select count(*) from pg_ls_waldir()) as wal_count_before;
5: CHECKPOINT;
-- Replication slot's restart_lsn should advance to the checkpoint's redo location.
1U: SELECT pg_wal_lsn_diff(restart_lsn, restart_lsn_before) > 0 from pg_replication_slots, unlogged_wal_retention_test WHERE slot_name = 'internal_wal_replication_slot';
1U: SELECT pg_wal_lsn_diff(restart_lsn, redo_lsn) = 0 from pg_replication_slots, pg_control_checkpoint() WHERE slot_name = 'internal_wal_replication_slot';
-- Some old WALs should be removed
1U: select ((select count(*)::int from pg_ls_waldir()) - wal_count_before) < 0 FROM unlogged_wal_retention_test;
-- Record the restart_lsn and the WAL file count.
1U: UPDATE unlogged_wal_retention_test SET restart_lsn_before = (SELECT restart_lsn from pg_replication_slots WHERE slot_name = 'internal_wal_replication_slot'), wal_count_before = (select count(*) from pg_ls_waldir());
-- Write some records to a newer WAL file.
1U: SELECT pg_switch_wal is not null FROM pg_switch_wal();
5: INSERT INTO tst_missing_tbl values(2),(1),(5);
-- Replication slot's restart_lsn should NOT change regardless mirror has received more wals.
1U: select pg_wal_lsn_diff(restart_lsn, restart_lsn_before) = 0 FROM pg_replication_slots, unlogged_wal_retention_test WHERE slot_name = 'internal_wal_replication_slot';
-- Record the WAL file count.
1U: UPDATE unlogged_wal_retention_test SET wal_count_before = (select count(*) from pg_ls_waldir());
-- Hang the wal sender before writing more wals.
5: SELECT gp_inject_fault('wal_sender_loop', 'suspend', dbid) FROM gp_segment_configuration WHERE role='p' AND content = 1;
-- Perform checkpoint. This is to mimic walreciver's confirmed received lsn is behind the redo lsn.
1U: CHECKPOINT;
-- Replication slot's restart_lsn should NOT change regardless a new checkpoints was performed.
1U: select pg_wal_lsn_diff(restart_lsn, restart_lsn_before) = 0 FROM pg_replication_slots, unlogged_wal_retention_test WHERE slot_name = 'internal_wal_replication_slot';
-- Replication slot's restart_lsn should be smaller than the checkpoint's redo location.
1U: SELECT pg_wal_lsn_diff(restart_lsn, redo_lsn) < 0 from pg_replication_slots, pg_control_checkpoint() WHERE slot_name = 'internal_wal_replication_slot';
-- Resume walsender.
5: SELECT gp_inject_fault('wal_sender_loop', 'reset', dbid) FROM gp_segment_configuration WHERE role='p' AND content = 1;
-- Perform transaction to make sure wals are in sync.
5: INSERT INTO tst_missing_tbl values(2),(1),(5);
-- Replication slot's restart_lsn should now advance to the checkpoint's redo location.
1U: SELECT pg_wal_lsn_diff(restart_lsn, restart_lsn_before) > 0 from pg_replication_slots, unlogged_wal_retention_test WHERE slot_name = 'internal_wal_replication_slot';
1U: SELECT pg_wal_lsn_diff(restart_lsn, redo_lsn) = 0 from pg_replication_slots, pg_control_checkpoint() WHERE slot_name = 'internal_wal_replication_slot';
-- Trigger old WAL removal.
1U: CHECKPOINT;
-- And the WAL file(s) older than the redo lsn that we previously kept should now be removed.
1U: select ((select count(*)::int from pg_ls_waldir()) - wal_count_before) < 0 FROM unlogged_wal_retention_test;
-- Cleanup
1U: DROP TABLE unlogged_wal_retention_test;
1Uq:
5: DROP TABLE tst_missing_tbl;
5: DROP FUNCTION connectSeg;
!\retcode gpconfig -r wal_keep_size;
!\retcode gpconfig -r wal_recycle;
!\retcode gpstop -ari;