src/test/isolation2/expected/fts_errors.out - cloudberry - Git at Google

 -- This test triggers failover of content 0 and content 1
 -- Content 0 is used to test if FTS can handle DNS errors
 -- Content 1 is used to test the gang interaction in various
 -- sessions when a failover is triggered and mirror is promoted
 -- to primary

 -- start_matchsubs
 -- m/^ERROR:  Error on receive from .*: server closed the connection unexpectedly/
 -- s/^ERROR:  Error on receive from .*: server closed the connection unexpectedly/ERROR: server closed the connection unexpectedly/
 -- end_matchsubs

 -- Allow extra time for mirror promotion to complete recovery to avoid
 -- gprecoverseg BEGIN failures due to gang creation failure as some primaries
 -- are not up. Setting these increases the number of retries in gang creation
 -- in case a segment is in recovery. Approximately we want to wait 120 seconds.
 !\retcode gpconfig -c gp_gang_creation_retry_count -v 120 --skipvalidation --masteronly;
 -- start_ignore
 -- end_ignore
 (exited with code 0)
 !\retcode gpconfig -c gp_gang_creation_retry_timer -v 1000 --skipvalidation --masteronly;
 -- start_ignore
 -- end_ignore
 (exited with code 0)
 !\retcode gpstop -u;
 -- start_ignore
 -- end_ignore
 (exited with code 0)

 -- Helper function: polls gp_segment_configuration until exactly num_segs
 -- rows have status = 'd'. Returns true as soon as the count matches.
 -- Between checks it calls pg_sleep(1) and decrements a retry counter that
 -- starts at 120; once the counter reaches 0 without a match it returns false.
 CREATE or REPLACE FUNCTION wait_until_segments_are_down(num_segs int) RETURNS bool AS $$ declare retries int; /* in func */ begin /* in func */ retries := 120; /* in func */ loop /* in func */ if (select count(*) = num_segs from gp_segment_configuration where status = 'd') then /* in func */ return true; /* in func */ end if; /* in func */ if retries <= 0 then /* in func */ return false; /* in func */ end if; /* in func */ perform pg_sleep(1); /* in func */ retries := retries - 1; /* in func */ end loop; /* in func */ end; /* in func */ $$ language plpgsql;
 CREATE

 -- no segment down.
 select count(*) from gp_segment_configuration where status = 'd';
  count
 -------
  0
 (1 row)

 drop table if exists fts_errors_test;
 DROP
 create table fts_errors_test(a int);
 CREATE

 1:BEGIN;
 BEGIN
 1:END;
 END
 2:BEGIN;
 BEGIN
 2:INSERT INTO fts_errors_test SELECT * FROM generate_series(1,100);
 INSERT 100
 3:BEGIN;
 BEGIN
 3:CREATE TEMP TABLE tmp3 (c1 int, c2 int);
 CREATE
 3:DECLARE c1 CURSOR for select * from tmp3;
 DECLARE
 4:CREATE TEMP TABLE tmp4 (c1 int, c2 int);
 CREATE
 5:BEGIN;
 BEGIN
 5:CREATE TEMP TABLE tmp5 (c1 int, c2 int);
 CREATE
 5:SAVEPOINT s1;
 SAVEPOINT
 5:CREATE TEMP TABLE tmp51 (c1 int, c2 int);
 CREATE

 -- Probe now so that the next gp_request_fts_probe_scan() call below does
 -- not overlap with a scan auto-triggered by the FTS process. If the two
 -- overlapped, a race condition could keep the fault from being triggered
 -- and the test would fail.
 select gp_request_fts_probe_scan();
  gp_request_fts_probe_scan
 ---------------------------
  t
 (1 row)
 !\retcode gpfts -A -D;
 -- start_ignore

 -- end_ignore
 (exited with code 0)

 -- stop a primary in order to trigger a mirror promotion for content 1
 select pg_ctl((select datadir from gp_segment_configuration c where c.role='p' and c.content=1), 'stop');
  pg_ctl
 --------
  OK
 (1 row)

 -- trigger a DNS error. This fault internally gets triggered for content 0
 select gp_inject_fault_infinite('get_dns_cached_address', 'skip', 1);
  gp_inject_fault_infinite
 --------------------------
  Success:
 (1 row)

 -- trigger failover
 select gp_request_fts_probe_scan();
  gp_request_fts_probe_scan
 ---------------------------
  t
 (1 row)
 !\retcode gpfts -A -D;
 -- start_ignore

 -- end_ignore
 (exited with code 0)
 select pg_sleep(5);
  pg_sleep
 ----------

 (1 row)

 -- Since both gp_request_fts_probe_scan() and gp_inject_fault() will
 -- call the cdbcomponent_updateCdbComponents(), there is a plausible
 -- race condition between the fts_probes and the reset of the fault
 -- injector; if the reset triggers the fault before the fts probe
 -- completes, the primary will be taken down without removing the fault.
 -- To avoid the race condition, the test waits until both the segments
 -- go down before removing the fault.
 -- The test expects the following 2 segments to go down:
 -- 1. pg_ctl stop for dbid=3(content 1, primary)
 -- 2. get_dns_cached_address fault injected for dbid=2(content 0, primary)

 -- The get_dns_cached_address fault makes the FTS update fail, so
 -- check that no segment is marked down.
 -- start_ignore
 -1U: select wait_until_segments_are_down(0);
  wait_until_segments_are_down
 ------------------------------
  t
 (1 row)
 -- end_ignore
 select gp_inject_fault('get_dns_cached_address', 'reset', 1);
  gp_inject_fault
 -----------------
  Success:
 (1 row)

 -- session 1: in no transaction and no temp table created, it's safe to
 --            update cdb_component_dbs and use the new promoted primary
 1:BEGIN;
 BEGIN
 1:END;
 END
 -- session 2: in transaction, gxid is dispatched to writer gang, can't
 --            update cdb_component_dbs, following query should fail
 -- start_ignore
 2:END;
 ERROR:  Error on receive from seg1 127.0.1.1:7003 pid=19840: server closed the connection unexpectedly
 	This probably means the server terminated abnormally
 	before or while processing the request.
 -- end_ignore
 -- session 3: in transaction and has a cursor, can't update
 --            cdb_component_dbs, following query should fail
 3:FETCH ALL FROM c1;
  c1 | c2
 ----+----
 (0 rows)
 3:END;
 ERROR:  gang was lost due to cluster reconfiguration (cdbgang_async.c:98)
 -- session 4: not in transaction but has temp table, can't update
 --            cdb_component_dbs, following query should fail and session
 --            is reset
 4:select * from tmp4;
 ERROR:  gang was lost due to cluster reconfiguration (cdbgang_async.c:98)
 4:select * from tmp4;
 ERROR:  relation "tmp4" does not exist
 LINE 1: select * from tmp4;
                       ^
 -- session 5: has a subtransaction, can't update cdb_component_dbs,
 --            following query should fail
 5:select * from tmp51;
 ERROR:  gang was lost due to cluster reconfiguration (cdbgang_async.c:98)
 5:ROLLBACK TO SAVEPOINT s1;
 ERROR:  Could not rollback to savepoint (ROLLBACK TO SAVEPOINT s1)
 5:END;
 END
 1q: ... <quitting>
 2q: ... <quitting>
 3q: ... <quitting>
 4q: ... <quitting>
 5q: ... <quitting>

 -- immediate stop mirror for content 0. This is just to speed up the test; the
 -- next step, gprecoverseg, will do the same but it uses gpstop fast mode and
 -- not immediate, which adds time to tests.
 select pg_ctl((select datadir from gp_segment_configuration c where c.role='m' and c.content=0), 'stop');
  pg_ctl
 --------
  OK
 (1 row)
 select pg_sleep(60);
  pg_sleep
 ----------

 (1 row)

 -- fully recover the failed primary as new mirror
 !\retcode gprecoverseg -aF --no-progress;
 -- start_ignore
 -- end_ignore
 (exited with code 0)
 !\retcode gpfts -A -D;
 -- start_ignore

 -- end_ignore
 (exited with code 0)

 -- loop while segments come in sync
 select wait_until_all_segments_synchronized();
  wait_until_all_segments_synchronized
 --------------------------------------
  OK
 (1 row)

 !\retcode gprecoverseg -ar;
 -- start_ignore
 -- end_ignore
 (exited with code 0)
 !\retcode gpfts -A -D;
 -- start_ignore

 -- end_ignore
 (exited with code 0)

 -- loop while segments come in sync
 select wait_until_all_segments_synchronized();
  wait_until_all_segments_synchronized
 --------------------------------------
  OK
 (1 row)

 -- verify no segment is down after recovery
 select count(*) from gp_segment_configuration where status = 'd';
  count
 -------
  0
 (1 row)

 !\retcode gpconfig -r gp_gang_creation_retry_count --skipvalidation --masteronly;
 -- start_ignore
 -- end_ignore
 (exited with code 0)
 !\retcode gpconfig -r gp_gang_creation_retry_timer --skipvalidation --masteronly;
 -- start_ignore
 -- end_ignore
 (exited with code 0)
 !\retcode gpstop -u;
 -- start_ignore
 -- end_ignore
 (exited with code 0)
	-- This test triggers failover of content 0 and content 1
	-- Content 0 is used to test if FTS can handle DNS errors
	-- Content 1 is used to test the gang interaction in various
	-- sessions when a failover is triggered and mirror is promoted
	-- to primary

	-- start_matchsubs
	-- m/^ERROR: Error on receive from .*: server closed the connection unexpectedly/
	-- s/^ERROR: Error on receive from .*: server closed the connection unexpectedly/ERROR: server closed the connection unexpectedly/
	-- end_matchsubs

	-- Allow extra time for mirror promotion to complete recovery to avoid
	-- gprecoverseg BEGIN failures due to gang creation failure as some primaries
	-- are not up. Setting these increase the number of retries in gang creation in
	-- case segment is in recovery. Approximately we want to wait 120 seconds.
	!\retcode gpconfig -c gp_gang_creation_retry_count -v 120 --skipvalidation --masteronly;
	-- start_ignore
	-- end_ignore
	(exited with code 0)
	!\retcode gpconfig -c gp_gang_creation_retry_timer -v 1000 --skipvalidation --masteronly;
	-- start_ignore
	-- end_ignore
	(exited with code 0)
	!\retcode gpstop -u;
	-- start_ignore
	-- end_ignore
	(exited with code 0)

	-- Helper function: polls gp_segment_configuration until exactly num_segs
	-- rows have status = 'd'. Returns true as soon as the count matches.
	-- Between checks it calls pg_sleep(1) and decrements a retry counter that
	-- starts at 120; once the counter reaches 0 without a match it returns false.
	-- Each "/* in func */" marker must be a complete, closed block comment and
	-- the aggregate must be count(*); otherwise the function body does not parse.
	CREATE or REPLACE FUNCTION wait_until_segments_are_down(num_segs int) RETURNS bool AS $$ declare retries int; /* in func */ begin /* in func */ retries := 120; /* in func */ loop /* in func */ if (select count(*) = num_segs from gp_segment_configuration where status = 'd') then /* in func */ return true; /* in func */ end if; /* in func */ if retries <= 0 then /* in func */ return false; /* in func */ end if; /* in func */ perform pg_sleep(1); /* in func */ retries := retries - 1; /* in func */ end loop; /* in func */ end; /* in func */ $$ language plpgsql;
	CREATE

	-- no segment down.
	select count(*) from gp_segment_configuration where status = 'd';
	count
	-------
	0
	(1 row)

	drop table if exists fts_errors_test;
	DROP
	create table fts_errors_test(a int);
	CREATE

	1:BEGIN;
	BEGIN
	1:END;
	END
	2:BEGIN;
	BEGIN
	2:INSERT INTO fts_errors_test SELECT * FROM generate_series(1,100);
	INSERT 100
	3:BEGIN;
	BEGIN
	3:CREATE TEMP TABLE tmp3 (c1 int, c2 int);
	CREATE
	3:DECLARE c1 CURSOR for select * from tmp3;
	DECLARE
	4:CREATE TEMP TABLE tmp4 (c1 int, c2 int);
	CREATE
	5:BEGIN;
	BEGIN
	5:CREATE TEMP TABLE tmp5 (c1 int, c2 int);
	CREATE
	5:SAVEPOINT s1;
	SAVEPOINT
	5:CREATE TEMP TABLE tmp51 (c1 int, c2 int);
	CREATE

	-- probe to make sure when we call gp_request_fts_probe_scan() next
	-- time below, don't overlap with auto-trigger of FTS scans by FTS
	-- process. As if that happens, due to race condition will not trigger
	-- the fault and fail the test.
	select gp_request_fts_probe_scan();
	gp_request_fts_probe_scan
	---------------------------
	t
	(1 row)
	!\retcode gpfts -A -D;
	-- start_ignore

	-- end_ignore
	(exited with code 0)

	-- stop a primary in order to trigger a mirror promotion for content 1
	select pg_ctl((select datadir from gp_segment_configuration c where c.role='p' and c.content=1), 'stop');
	pg_ctl
	--------
	OK
	(1 row)

	-- trigger a DNS error. This fault internally gets triggered for content 0
	select gp_inject_fault_infinite('get_dns_cached_address', 'skip', 1);
	gp_inject_fault_infinite
	--------------------------
	Success:
	(1 row)

	-- trigger failover
	select gp_request_fts_probe_scan();
	gp_request_fts_probe_scan
	---------------------------
	t
	(1 row)
	!\retcode gpfts -A -D;
	-- start_ignore

	-- end_ignore
	(exited with code 0)
	select pg_sleep(5);
	pg_sleep
	----------

	(1 row)

	-- Since both gp_request_fts_probe_scan() and gp_inject_fault() will
	-- call the cdbcomponent_updateCdbComponents(), there is a plausible
	-- race condition between the fts_probes and the reset of the fault
	-- injector; if the reset triggers the fault before the fts probe
	-- completes, the primary will be taken down without removing the fault
	-- To avoid the race condition, the test waits until both the segments
	-- go down before removing the fault.
	-- The test expect the following 2 segments to go down:
	-- 1. pg_ctl stop for dbid=3(content 1, primary)
	-- 2. get_dns_cached_address fault injected for dbid=2(content 0, primary)

	-- get_dns_cached_address will make FTS update failed
	-- should check no segment is down
	-- start_ignore
	-1U: select wait_until_segments_are_down(0);
	wait_until_segments_are_down
	------------------------------
	t
	(1 row)
	-- end_ignore
	select gp_inject_fault('get_dns_cached_address', 'reset', 1);
	gp_inject_fault
	-----------------
	Success:
	(1 row)

	-- session 1: in no transaction and no temp table created, it's safe to
	-- update cdb_component_dbs and use the new promoted primary
	1:BEGIN;
	BEGIN
	1:END;
	END
	-- session 2: in transaction, gxid is dispatched to writer gang, cann't
	-- update cdb_component_dbs, following query should fail
	-- start_ignore
	2:END;
	ERROR: Error on receive from seg1 127.0.1.1:7003 pid=19840: server closed the connection unexpectedly
	This probably means the server terminated abnormally
	before or while processing the request.
	-- end_ignore
	-- session 3: in transaction and has a cursor, cann't update
	-- cdb_component_dbs, following query should fail
	3:FETCH ALL FROM c1;
	c1 | c2
	----+----
	(0 rows)
	3:END;
	ERROR: gang was lost due to cluster reconfiguration (cdbgang_async.c:98)
	-- session 4: not in transaction but has temp table, cann't update
	-- cdb_component_dbs, following query should fail and session
	-- is reset
	4:select * from tmp4;
	ERROR: gang was lost due to cluster reconfiguration (cdbgang_async.c:98)
	4:select * from tmp4;
	ERROR: relation "tmp4" does not exist
	LINE 1: select * from tmp4;
	^
	-- session 5: has a subtransaction, cann't update cdb_component_dbs,
	-- following query should fail
	5:select * from tmp51;
	ERROR: gang was lost due to cluster reconfiguration (cdbgang_async.c:98)
	5:ROLLBACK TO SAVEPOINT s1;
	ERROR: Could not rollback to savepoint (ROLLBACK TO SAVEPOINT s1)
	5:END;
	END
	1q: ... <quitting>
	2q: ... <quitting>
	3q: ... <quitting>
	4q: ... <quitting>
	5q: ... <quitting>

	-- immediate stop mirror for content 0. This is just to speed up the test; the
	-- next step, gprecoverseg, will do the same but it uses gpstop fast mode and
	-- not immediate, which adds time to tests.
	select pg_ctl((select datadir from gp_segment_configuration c where c.role='m' and c.content=0), 'stop');
	pg_ctl
	--------
	OK
	(1 row)
	select pg_sleep(60);
	pg_sleep
	----------

	(1 row)

	-- fully recover the failed primary as new mirror
	!\retcode gprecoverseg -aF --no-progress;
	-- start_ignore
	-- end_ignore
	(exited with code 0)
	!\retcode gpfts -A -D;
	-- start_ignore

	-- end_ignore
	(exited with code 0)

	-- loop while segments come in sync
	select wait_until_all_segments_synchronized();
	wait_until_all_segments_synchronized
	--------------------------------------
	OK
	(1 row)

	!\retcode gprecoverseg -ar;
	-- start_ignore
	-- end_ignore
	(exited with code 0)
	!\retcode gpfts -A -D;
	-- start_ignore

	-- end_ignore
	(exited with code 0)

	-- loop while segments come in sync
	select wait_until_all_segments_synchronized();
	wait_until_all_segments_synchronized
	--------------------------------------
	OK
	(1 row)

	-- verify no segment is down after recovery
	select count(*) from gp_segment_configuration where status = 'd';
	count
	-------
	0
	(1 row)

	!\retcode gpconfig -r gp_gang_creation_retry_count --skipvalidation --masteronly;
	-- start_ignore
	-- end_ignore
	(exited with code 0)
	!\retcode gpconfig -r gp_gang_creation_retry_timer --skipvalidation --masteronly;
	-- start_ignore
	-- end_ignore
	(exited with code 0)
	!\retcode gpstop -u;
	-- start_ignore
	-- end_ignore
	(exited with code 0)