blob: 01781cc3ec2c7d0b89e1e765e252900e051d7a59 [file] [log] [blame]
-- This test triggers failover of content 0 and content 1
-- Content 0 is used to test if FTS can handle DNS errors
-- Content 1 is used to test the gang interaction in various
-- sessions when a failover is triggered and mirror is promoted
-- to primary
-- start_matchsubs
-- m/^ERROR: Error on receive from .*: server closed the connection unexpectedly/
-- s/^ERROR: Error on receive from .*: server closed the connection unexpectedly/ERROR: server closed the connection unexpectedly/
-- end_matchsubs
-- Allow extra time for mirror promotion to complete recovery to avoid
-- gprecoverseg BEGIN failures due to gang creation failure as some primaries
-- are not up. Setting these increase the number of retries in gang creation in
-- case segment is in recovery. Approximately we want to wait 120 seconds.
-- start_ignore
set statement_timeout='720s';
-- end_ignore
!\retcode gpconfig -c gp_gang_creation_retry_count -v 120 --skipvalidation --masteronly;
!\retcode gpconfig -c gp_gang_creation_retry_timer -v 1000 --skipvalidation --masteronly;
!\retcode gpstop -u;
-- Helper function
CREATE or REPLACE FUNCTION wait_until_segments_are_down(num_segs int)
RETURNS bool AS
$$
declare
retries int; /* in func */
begin /* in func */
retries := 120; /* in func */
loop /* in func */
if (select count(*) = num_segs from gp_segment_configuration where status = 'd') then /* in func */
return true; /* in func */
end if; /* in func */
if retries <= 0 then /* in func */
return false; /* in func */
end if; /* in func */
perform pg_sleep(1); /* in func */
retries := retries - 1; /* in func */
end loop; /* in func */
end; /* in func */
$$ language plpgsql;
-- no segment down.
select count(*) from gp_segment_configuration where status = 'd';
drop table if exists fts_errors_test;
create table fts_errors_test(a int);
1:BEGIN;
1:END;
2:BEGIN;
2:INSERT INTO fts_errors_test SELECT * FROM generate_series(1,100);
3:BEGIN;
3:CREATE TEMP TABLE tmp3 (c1 int, c2 int);
3:DECLARE c1 CURSOR for select * from tmp3;
4:CREATE TEMP TABLE tmp4 (c1 int, c2 int);
5:BEGIN;
5:CREATE TEMP TABLE tmp5 (c1 int, c2 int);
5:SAVEPOINT s1;
5:CREATE TEMP TABLE tmp51 (c1 int, c2 int);
-- probe to make sure when we call gp_request_fts_probe_scan() next
-- time below, don't overlap with auto-trigger of FTS scans by FTS
-- process. As if that happens, due to race condition will not trigger
-- the fault and fail the test.
select gp_request_fts_probe_scan();
!\retcode gpfts -A -D;
-- stop a primary in order to trigger a mirror promotion for content 1
select pg_ctl((select datadir from gp_segment_configuration c
where c.role='p' and c.content=1), 'stop');
-- trigger a DNS error. This fault internally gets trigerred for content 0
select gp_inject_fault_infinite('get_dns_cached_address', 'skip', 1);
-- trigger failover
select gp_request_fts_probe_scan();
!\retcode gpfts -A -D;
select pg_sleep(5);
-- Since both gp_request_fts_probe_scan() and gp_inject_fault() will
-- call the cdbcomponent_updateCdbComponents(), there is a plausible
-- race condition between the fts_probes and the reset of the fault
-- injector; if the reset triggers the fault before the fts probe
-- completes, the primary will be taken down without removing the fault
-- To avoid the race condition, the test waits until both the segments
-- go down before removing the fault.
-- The test expect the following 2 segments to go down:
-- 1. pg_ctl stop for dbid=3(content 1, primary)
-- 2. get_dns_cached_address fault injected for dbid=2(content 0, primary)
-- get_dns_cached_address will make FTS update failed
-- should check no segment is down
-- start_ignore
-1U: select wait_until_segments_are_down(0);
-- end_ignore
select gp_inject_fault('get_dns_cached_address', 'reset', 1);
-- session 1: in no transaction and no temp table created, it's safe to
-- update cdb_component_dbs and use the new promoted primary
1:BEGIN;
1:END;
-- session 2: in transaction, gxid is dispatched to writer gang, cann't
-- update cdb_component_dbs, following query should fail
-- start_ignore
2:END;
-- end_ignore
-- session 3: in transaction and has a cursor, cann't update
-- cdb_component_dbs, following query should fail
3:FETCH ALL FROM c1;
3:END;
-- session 4: not in transaction but has temp table, cann't update
-- cdb_component_dbs, following query should fail and session
-- is reset
4:select * from tmp4;
4:select * from tmp4;
-- session 5: has a subtransaction, cann't update cdb_component_dbs,
-- following query should fail
5:select * from tmp51;
5:ROLLBACK TO SAVEPOINT s1;
5:END;
1q:
2q:
3q:
4q:
5q:
-- immediate stop mirror for content 0. This is just to speed up the test, next
-- step gprecovertseg will do the same but it uses gpstop fast mode and not
-- immediate, which add time to tests.
select pg_ctl((select datadir from gp_segment_configuration c
where c.role='m' and c.content=0), 'stop');
select pg_sleep(60);
-- fully recover the failed primary as new mirror
!\retcode gprecoverseg -aF --no-progress;
!\retcode gpfts -A -D;
-- loop while segments come in sync
select wait_until_all_segments_synchronized();
!\retcode gprecoverseg -ar;
!\retcode gpfts -A -D;
-- loop while segments come in sync
select wait_until_all_segments_synchronized();
-- verify no segment is down after recovery
select count(*) from gp_segment_configuration where status = 'd';
!\retcode gpconfig -r gp_gang_creation_retry_count --skipvalidation --masteronly;
!\retcode gpconfig -r gp_gang_creation_retry_timer --skipvalidation --masteronly;
!\retcode gpstop -u;
-- start_ignore
reset statement_timeout;
-- end_ignore