src/test/isolation2/sql/fts_errors.sql - cloudberry - Git at Google

 -- This test triggers failover of content 0 and content 1
 -- Content 0 is used to test if FTS can handle DNS errors
 -- Content 1 is used to test the gang interaction in various
 -- sessions when a failover is triggered and mirror is promoted
 -- to primary

 -- start_matchsubs
 -- m/^ERROR:  Error on receive from .*: server closed the connection unexpectedly/
 -- s/^ERROR:  Error on receive from .*: server closed the connection unexpectedly/ERROR: server closed the connection unexpectedly/
 -- end_matchsubs

 -- Allow extra time for mirror promotion to complete recovery to avoid
 -- gprecoverseg BEGIN failures due to gang creation failure as some primaries
 -- are not up. Setting these increases the number of gang creation retries in
 -- case a segment is in recovery. We want to wait approximately 120 seconds.
 -- start_ignore
 set statement_timeout='720s';
 -- end_ignore
 !\retcode gpconfig -c gp_gang_creation_retry_count -v 120 --skipvalidation --masteronly;
 !\retcode gpconfig -c gp_gang_creation_retry_timer -v 1000 --skipvalidation --masteronly;
 !\retcode gpstop -u;

 -- Helper function
 -- wait_until_segments_are_down(num_segs): polls gp_segment_configuration
 -- until exactly num_segs rows have status = 'd'.
 -- Returns true as soon as the count matches, and false once the 120
 -- retries are used up. It sleeps 1 second between checks, so the wait is
 -- bounded at roughly 120 seconds.
 -- NOTE(review): the trailing "in func" block comments presumably stop the
 -- isolation2 driver from ending the statement at a line-final semicolon.
 -- Confirm before removing them.
 CREATE or REPLACE FUNCTION wait_until_segments_are_down(num_segs int)
 RETURNS bool AS
 $$
 declare
 retries int; /* in func */
 begin /* in func */
   retries := 120; /* in func */
   loop /* in func */
     if (select count(*) = num_segs from gp_segment_configuration where status = 'd') then /* in func */
       return true; /* in func */
     end if; /* in func */
     if retries <= 0 then /* in func */
       return false; /* in func */
     end if; /* in func */
     perform pg_sleep(1); /* in func */
     retries := retries - 1; /* in func */
   end loop; /* in func */
 end; /* in func */
 $$ language plpgsql;

 -- no segment down.
 select count(*) from gp_segment_configuration where status = 'd';

 -- Scratch table that session 2 writes to inside an open transaction.
 drop table if exists fts_errors_test;
 create table fts_errors_test(a int);

 -- Put five sessions into different states before the failover is triggered.
 -- session 1: runs an empty transaction, so nothing is left open.
 1:BEGIN;
 1:END;
 -- session 2: leaves a transaction open after inserting into fts_errors_test.
 2:BEGIN;
 2:INSERT INTO fts_errors_test SELECT * FROM generate_series(1,100);
 -- session 3: leaves a transaction open with a cursor declared over a temp table.
 3:BEGIN;
 3:CREATE TEMP TABLE tmp3 (c1 int, c2 int);
 3:DECLARE c1 CURSOR for select * from tmp3;
 -- session 4: creates a temp table outside of any explicit transaction.
 4:CREATE TEMP TABLE tmp4 (c1 int, c2 int);
 -- session 5: leaves a transaction open with savepoint s1 and a second temp
 --            table created after that savepoint.
 5:BEGIN;
 5:CREATE TEMP TABLE tmp5 (c1 int, c2 int);
 5:SAVEPOINT s1;
 5:CREATE TEMP TABLE tmp51 (c1 int, c2 int);

 -- probe to make sure when we call gp_request_fts_probe_scan() next
 -- time below, don't overlap with auto-trigger of FTS scans by FTS
 -- process. As if that happens, due to race condition will not trigger
 -- the fault and fail the test.
 select gp_request_fts_probe_scan();
 !\retcode gpfts -A -D;

 -- stop a primary in order to trigger a mirror promotion for content 1
 select pg_ctl((select datadir from gp_segment_configuration c
 where c.role='p' and c.content=1), 'stop');

 -- trigger a DNS error. This fault internally gets triggered for content 0
 select gp_inject_fault_infinite('get_dns_cached_address', 'skip', 1);

 -- trigger failover
 select gp_request_fts_probe_scan();
 !\retcode gpfts -A -D;
 select pg_sleep(5);

 -- Since both gp_request_fts_probe_scan() and gp_inject_fault() will
 -- call the cdbcomponent_updateCdbComponents(), there is a plausible
 -- race condition between the fts_probes and the reset of the fault
 -- injector; if the reset triggers the fault before the fts probe
 -- completes, the primary will be taken down without removing the fault
 -- To avoid the race condition, the test waits until both the segments
 -- go down before removing the fault.
 -- The test expect the following 2 segments to go down:
 -- 1. pg_ctl stop for dbid=3(content 1, primary)
 -- 2. get_dns_cached_address fault injected for dbid=2(content 0, primary)

 -- get_dns_cached_address will make FTS update failed
 -- should check no segment is down
 -- start_ignore
 -1U: select wait_until_segments_are_down(0);
 -- end_ignore
 select gp_inject_fault('get_dns_cached_address', 'reset', 1);

 -- session 1: in no transaction and no temp table created, it's safe to
 --            update cdb_component_dbs and use the new promoted primary
 1:BEGIN;
 1:END;
 -- session 2: in transaction, gxid is dispatched to writer gang, cann't
 --            update cdb_component_dbs, following query should fail
 -- start_ignore
 2:END;
 -- end_ignore
 -- session 3: in transaction and has a cursor, cann't update
 --            cdb_component_dbs, following query should fail
 3:FETCH ALL FROM c1;
 3:END;
 -- session 4: not in transaction but has temp table, cann't update
 --            cdb_component_dbs, following query should fail and session
 --            is reset
 4:select * from tmp4;
 4:select * from tmp4;
 -- session 5: has a subtransaction, cann't update cdb_component_dbs,
 --            following query should fail
 5:select * from tmp51;
 5:ROLLBACK TO SAVEPOINT s1;
 5:END;
 1q:
 2q:
 3q:
 4q:
 5q:

 -- immediate stop mirror for content 0. This is just to speed up the test, next
 -- step gprecoverseg will do the same but it uses gpstop fast mode and not
 -- immediate, which adds time to tests.
 select pg_ctl((select datadir from gp_segment_configuration c
 where c.role='m' and c.content=0), 'stop');
 select pg_sleep(60);

 -- fully recover the failed primary as new mirror
 !\retcode gprecoverseg -aF --no-progress;
 !\retcode gpfts -A -D;

 -- loop while segments come in sync
 select wait_until_all_segments_synchronized();

 !\retcode gprecoverseg -ar;
 !\retcode gpfts -A -D;

 -- loop while segments come in sync
 select wait_until_all_segments_synchronized();

 -- verify no segment is down after recovery
 select count(*) from gp_segment_configuration where status = 'd';

 !\retcode gpconfig -r gp_gang_creation_retry_count --skipvalidation --masteronly;
 !\retcode gpconfig -r gp_gang_creation_retry_timer --skipvalidation --masteronly;
 !\retcode gpstop -u;
 -- start_ignore
 reset statement_timeout;
 -- end_ignore
	-- This test triggers failover of content 0 and content 1
	-- Content 0 is used to test if FTS can handle DNS errors
	-- Content 1 is used to test the gang interaction in various
	-- sessions when a failover is triggered and mirror is promoted
	-- to primary

	-- start_matchsubs
	-- m/^ERROR:  Error on receive from .*: server closed the connection unexpectedly/
	-- s/^ERROR:  Error on receive from .*: server closed the connection unexpectedly/ERROR: server closed the connection unexpectedly/
	-- end_matchsubs

	-- Allow extra time for mirror promotion to complete recovery to avoid
	-- gprecoverseg BEGIN failures due to gang creation failure as some primaries
	-- are not up. Setting these increase the number of retries in gang creation in
	-- case segment is in recovery. Approximately we want to wait 120 seconds.
	-- start_ignore
	set statement_timeout='720s';
	-- end_ignore
	!\retcode gpconfig -c gp_gang_creation_retry_count -v 120 --skipvalidation --masteronly;
	!\retcode gpconfig -c gp_gang_creation_retry_timer -v 1000 --skipvalidation --masteronly;
	!\retcode gpstop -u;

	-- Helper function
	-- wait_until_segments_are_down(num_segs): polls gp_segment_configuration
	-- until exactly num_segs rows have status = 'd'.
	-- Returns true as soon as the count matches, and false once the 120
	-- retries are used up. It sleeps 1 second between checks, so the wait is
	-- bounded at roughly 120 seconds.
	-- NOTE(review): the trailing "in func" block comments presumably stop the
	-- isolation2 driver from ending the statement at a line-final semicolon.
	-- Confirm before removing them.
	CREATE or REPLACE FUNCTION wait_until_segments_are_down(num_segs int)
	RETURNS bool AS
	$$
	declare
	retries int; /* in func */
	begin /* in func */
	  retries := 120; /* in func */
	  loop /* in func */
	    if (select count(*) = num_segs from gp_segment_configuration where status = 'd') then /* in func */
	      return true; /* in func */
	    end if; /* in func */
	    if retries <= 0 then /* in func */
	      return false; /* in func */
	    end if; /* in func */
	    perform pg_sleep(1); /* in func */
	    retries := retries - 1; /* in func */
	  end loop; /* in func */
	end; /* in func */
	$$ language plpgsql;

	-- no segment down.
	select count(*) from gp_segment_configuration where status = 'd';

	-- Scratch table that session 2 writes to inside an open transaction.
	drop table if exists fts_errors_test;
	create table fts_errors_test(a int);

	-- Put five sessions into different states before the failover is triggered.
	-- session 1: runs an empty transaction, so nothing is left open.
	1:BEGIN;
	1:END;
	-- session 2: leaves a transaction open after inserting into fts_errors_test.
	2:BEGIN;
	2:INSERT INTO fts_errors_test SELECT * FROM generate_series(1,100);
	-- session 3: leaves a transaction open with a cursor declared over a temp table.
	3:BEGIN;
	3:CREATE TEMP TABLE tmp3 (c1 int, c2 int);
	3:DECLARE c1 CURSOR for select * from tmp3;
	-- session 4: creates a temp table outside of any explicit transaction.
	4:CREATE TEMP TABLE tmp4 (c1 int, c2 int);
	-- session 5: leaves a transaction open with savepoint s1 and a second temp
	-- table created after that savepoint.
	5:BEGIN;
	5:CREATE TEMP TABLE tmp5 (c1 int, c2 int);
	5:SAVEPOINT s1;
	5:CREATE TEMP TABLE tmp51 (c1 int, c2 int);

	-- probe to make sure when we call gp_request_fts_probe_scan() next
	-- time below, don't overlap with auto-trigger of FTS scans by FTS
	-- process. As if that happens, due to race condition will not trigger
	-- the fault and fail the test.
	select gp_request_fts_probe_scan();
	!\retcode gpfts -A -D;

	-- stop a primary in order to trigger a mirror promotion for content 1
	select pg_ctl((select datadir from gp_segment_configuration c
	where c.role='p' and c.content=1), 'stop');

	-- trigger a DNS error. This fault internally gets triggered for content 0
	select gp_inject_fault_infinite('get_dns_cached_address', 'skip', 1);

	-- trigger failover
	select gp_request_fts_probe_scan();
	!\retcode gpfts -A -D;
	select pg_sleep(5);

	-- Since both gp_request_fts_probe_scan() and gp_inject_fault() will
	-- call the cdbcomponent_updateCdbComponents(), there is a plausible
	-- race condition between the fts_probes and the reset of the fault
	-- injector; if the reset triggers the fault before the fts probe
	-- completes, the primary will be taken down without removing the fault
	-- To avoid the race condition, the test waits until both the segments
	-- go down before removing the fault.
	-- The test expect the following 2 segments to go down:
	-- 1. pg_ctl stop for dbid=3(content 1, primary)
	-- 2. get_dns_cached_address fault injected for dbid=2(content 0, primary)

	-- get_dns_cached_address will make FTS update failed
	-- should check no segment is down
	-- start_ignore
	-1U: select wait_until_segments_are_down(0);
	-- end_ignore
	select gp_inject_fault('get_dns_cached_address', 'reset', 1);

	-- session 1: in no transaction and no temp table created, it's safe to
	-- update cdb_component_dbs and use the new promoted primary
	1:BEGIN;
	1:END;
	-- session 2: in transaction, gxid is dispatched to writer gang, cann't
	-- update cdb_component_dbs, following query should fail
	-- start_ignore
	2:END;
	-- end_ignore
	-- session 3: in transaction and has a cursor, cann't update
	-- cdb_component_dbs, following query should fail
	3:FETCH ALL FROM c1;
	3:END;
	-- session 4: not in transaction but has temp table, cann't update
	-- cdb_component_dbs, following query should fail and session
	-- is reset
	4:select * from tmp4;
	4:select * from tmp4;
	-- session 5: has a subtransaction, cann't update cdb_component_dbs,
	-- following query should fail
	5:select * from tmp51;
	5:ROLLBACK TO SAVEPOINT s1;
	5:END;
	1q:
	2q:
	3q:
	4q:
	5q:

	-- immediate stop mirror for content 0. This is just to speed up the test, next
	-- step gprecoverseg will do the same but it uses gpstop fast mode and not
	-- immediate, which adds time to tests.
	select pg_ctl((select datadir from gp_segment_configuration c
	where c.role='m' and c.content=0), 'stop');
	select pg_sleep(60);

	-- fully recover the failed primary as new mirror
	!\retcode gprecoverseg -aF --no-progress;
	!\retcode gpfts -A -D;

	-- loop while segments come in sync
	select wait_until_all_segments_synchronized();

	!\retcode gprecoverseg -ar;
	!\retcode gpfts -A -D;

	-- loop while segments come in sync
	select wait_until_all_segments_synchronized();

	-- verify no segment is down after recovery
	select count(*) from gp_segment_configuration where status = 'd';

	!\retcode gpconfig -r gp_gang_creation_retry_count --skipvalidation --masteronly;
	!\retcode gpconfig -r gp_gang_creation_retry_timer --skipvalidation --masteronly;
	!\retcode gpstop -u;
	-- start_ignore
	reset statement_timeout;
	-- end_ignore