src/test/isolation2/sql/segwalrep/mirror_promotion.sql - cloudberry - Git at Google

 -- Tests mirror promotion triggered by FTS in 2 different scenarios.
 --
 -- 1st: Shut-down of primary and hence unavailability of primary
 -- leading to mirror promotion. In this case the connection between
 -- primary and mirror is disconnected prior to promotion and
 -- walreceiver doesn't exist.
 --
 -- 2nd: Primary is alive but, using the fault injector, is simulated to
 -- not respond to FTS. This helps to validate the FTS time-out logic for
 -- probes. It also covers mirror promotion triggered while the connection
 -- between primary and mirror is still alive, and hence the walreceiver
 -- still exists during promotion.

 -- Scenario 1: the content 0 primary is shut down so that FTS has to promote
 -- its mirror. Start by recording the role and status of every segment.
 SELECT role, preferred_role, content, status FROM gp_segment_configuration;
 -- stop the content 0 primary (located through its datadir) in order to
 -- trigger a mirror promotion
 select pg_ctl((select datadir from gp_segment_configuration c
 where c.role='p' and c.content=0), 'stop');

 -- trigger failover by asking FTS for a probe scan
 select gp_request_fts_probe_scan();
 -- run the gpfts command as a shell step
 -- NOTE(review): presumably this is what drives the probe and promotion to
 -- completion - confirm the meaning of the -A and -D flags
 !\retcode gpfts -A -D;

 -- expect for content 0: the preferred primary now has the mirror role and is
 -- down, while the preferred mirror now has the primary role and is up and
 -- not-in-sync
 select content, preferred_role, role, status, mode
 from gp_segment_configuration
 where content = 0;

 -- give the promotion a couple of seconds to complete
 select pg_sleep(2);
 -- connect in utility mode to content 0 (earlier mirror, now primary) and run
 -- a trivial query to wait for it to finish the promotion
 0U: select 1;
 -- quit this utility mode session, as a fresh one needs to be started below
 0Uq:

 -- fully recover the failed primary as the new mirror
 !\retcode gprecoverseg -aF --no-progress;

 -- loop until the segments are back in sync
 select wait_until_all_segments_synchronized();

 -- expect: roles are flipped relative to preferred_role, and both segments
 -- are in sync
 select content, preferred_role, role, status, mode
 from gp_segment_configuration
 where content = 0;

 -- Scenario 2: the content 0 primary stays alive but is made unresponsive to
 -- FTS through a fault injector.
 -- start_ignore
 -- set the FTS probe retry and timeout GUCs to speed up the test
 alter system set gp_fts_probe_retries to 2;
 alter system set gp_fts_probe_timeout to 5;
 select pg_reload_conf();

 -- show the dbid of the current content 0 primary, which is the segment the
 -- fault below is injected on
 select dbid from gp_segment_configuration where content = 0 and role = 'p';
 -- end_ignore

 -- inject an infinite_loop fault at fts_handle_message on the content 0
 -- primary so that it does not respond to FTS
 select gp_inject_fault_infinite('fts_handle_message', 'infinite_loop', dbid)
 from gp_segment_configuration
 where content = 0 and role = 'p';

 -- trigger failover
 select gp_request_fts_probe_scan();
 -- trigger one more probe right away, which most likely results in the
 -- promotion request being sent to the mirror again while it's going through
 -- promotion, which is a nice condition to test as well
 select gp_request_fts_probe_scan();
 -- run the gpfts command as a shell step
 -- NOTE(review): presumably this is what drives the probe and promotion to
 -- completion - confirm the meaning of the -A and -D flags
 !\retcode gpfts -A -D;

 -- expect the segments to be restored to their preferred roles, but with the
 -- mirror down
 select content, preferred_role, role, status, mode
 from gp_segment_configuration
 where content = 0;

 -- give the promotion a couple of seconds to complete
 select pg_sleep(2);
 -- start_ignore
 -- reset the FTS probe GUCs changed for this scenario back to their defaults
 alter system set gp_fts_probe_retries to default;
 alter system set gp_fts_probe_timeout to default;
 select pg_reload_conf();
 -- end_ignore
 -- wait a couple more seconds for the promotion to be done
 select pg_sleep(2);

 -- connect in utility mode to content 0 (earlier mirror, now primary) and run
 -- a trivial query to wait for it to finish the promotion
 0U: select 1;

 -- create a tablespace holding a heap table to test that it works with
 -- gprecoverseg -F (pg_basebackup)
 !\retcode mkdir -p /tmp/mirror_promotion_tablespace_loc;
 create tablespace mirror_promotion_tablespace location '/tmp/mirror_promotion_tablespace_loc';
 create table mirror_promotion_tblspc_heap_table (a int) tablespace mirror_promotion_tablespace;

 -- now, let's fully recover the mirror
 !\retcode gprecoverseg -aF  --no-progress;

 -- clean up the tablespace test objects once the recovery command has run
 drop table mirror_promotion_tblspc_heap_table;
 drop tablespace mirror_promotion_tablespace;

 -- loop until the segments are back in sync
 select wait_until_all_segments_synchronized();

 -- now, the content 0 primary and mirror should be at their preferred roles,
 -- up and in-sync
 select content, preferred_role, role, status, mode
 from gp_segment_configuration
 where content = 0;
	-- Tests mirror promotion triggered by FTS in 2 different scenarios.
	--
	-- 1st: Shut-down of primary and hence unavailability of primary
	-- leading to mirror promotion. In this case the connection between
	-- primary and mirror is disconnected prior to promotion and
	-- walreceiver doesn't exist.
	--
	-- 2nd: Primary is alive but, using the fault injector, is simulated to
	-- not respond to FTS. This helps to validate the FTS time-out logic for
	-- probes. It also covers mirror promotion triggered while the connection
	-- between primary and mirror is still alive, and hence the walreceiver
	-- still exists during promotion.

	-- NOTE(review): this statement and everything after it repeat the earlier
	-- part of this file verbatim apart from whitespace - confirm whether the
	-- second pass is intentional or an accidental duplication.
	-- Scenario 1: the content 0 primary is shut down so that FTS has to promote
	-- its mirror. Start by recording the role and status of every segment.
	SELECT role, preferred_role, content, status FROM gp_segment_configuration;
	-- stop the content 0 primary (located through its datadir) in order to
	-- trigger a mirror promotion
	select pg_ctl((select datadir from gp_segment_configuration c
	where c.role='p' and c.content=0), 'stop');

	-- trigger failover by asking FTS for a probe scan
	select gp_request_fts_probe_scan();
	-- run the gpfts command as a shell step
	-- NOTE(review): presumably this is what drives the probe and promotion to
	-- completion - confirm the meaning of the -A and -D flags
	!\retcode gpfts -A -D;

	-- expect for content 0: the preferred primary now has the mirror role and is
	-- down, while the preferred mirror now has the primary role and is up and
	-- not-in-sync
	select content, preferred_role, role, status, mode
	from gp_segment_configuration
	where content = 0;

	-- give the promotion a couple of seconds to complete
	select pg_sleep(2);
	-- connect in utility mode to content 0 (earlier mirror, now primary) and run
	-- a trivial query to wait for it to finish the promotion
	0U: select 1;
	-- quit this utility mode session, as a fresh one needs to be started below
	0Uq:

	-- fully recover the failed primary as the new mirror
	!\retcode gprecoverseg -aF --no-progress;

	-- loop until the segments are back in sync
	select wait_until_all_segments_synchronized();

	-- expect: roles are flipped relative to preferred_role, and both segments
	-- are in sync
	select content, preferred_role, role, status, mode
	from gp_segment_configuration
	where content = 0;

	-- Scenario 2: the content 0 primary stays alive but is made unresponsive to
	-- FTS through a fault injector.
	-- start_ignore
	-- set the FTS probe retry and timeout GUCs to speed up the test
	alter system set gp_fts_probe_retries to 2;
	alter system set gp_fts_probe_timeout to 5;
	select pg_reload_conf();

	-- show the dbid of the current content 0 primary, which is the segment the
	-- fault below is injected on
	select dbid from gp_segment_configuration where content = 0 and role = 'p';
	-- end_ignore

	-- inject an infinite_loop fault at fts_handle_message on the content 0
	-- primary so that it does not respond to FTS
	select gp_inject_fault_infinite('fts_handle_message', 'infinite_loop', dbid)
	from gp_segment_configuration
	where content = 0 and role = 'p';

	-- trigger failover
	select gp_request_fts_probe_scan();
	-- trigger one more probe right away, which most likely results in the
	-- promotion request being sent to the mirror again while it's going through
	-- promotion, which is a nice condition to test as well
	select gp_request_fts_probe_scan();
	-- run the gpfts command as a shell step
	-- NOTE(review): presumably this is what drives the probe and promotion to
	-- completion - confirm the meaning of the -A and -D flags
	!\retcode gpfts -A -D;

	-- expect the segments to be restored to their preferred roles, but with the
	-- mirror down
	select content, preferred_role, role, status, mode
	from gp_segment_configuration
	where content = 0;

	-- give the promotion a couple of seconds to complete
	select pg_sleep(2);
	-- start_ignore
	-- reset the FTS probe GUCs changed for this scenario back to their defaults
	alter system set gp_fts_probe_retries to default;
	alter system set gp_fts_probe_timeout to default;
	select pg_reload_conf();
	-- end_ignore
	-- wait a couple more seconds for the promotion to be done
	select pg_sleep(2);

	-- connect in utility mode to content 0 (earlier mirror, now primary) and run
	-- a trivial query to wait for it to finish the promotion
	0U: select 1;

	-- create a tablespace holding a heap table to test that it works with
	-- gprecoverseg -F (pg_basebackup)
	!\retcode mkdir -p /tmp/mirror_promotion_tablespace_loc;
	create tablespace mirror_promotion_tablespace location '/tmp/mirror_promotion_tablespace_loc';
	create table mirror_promotion_tblspc_heap_table (a int) tablespace mirror_promotion_tablespace;

	-- now, let's fully recover the mirror
	!\retcode gprecoverseg -aF --no-progress;

	-- clean up the tablespace test objects once the recovery command has run
	drop table mirror_promotion_tblspc_heap_table;
	drop tablespace mirror_promotion_tablespace;

	-- loop until the segments are back in sync
	select wait_until_all_segments_synchronized();

	-- now, the content 0 primary and mirror should be at their preferred roles,
	-- up and in-sync
	select content, preferred_role, role, status, mode
	from gp_segment_configuration
	where content = 0;