blob: 69cb7a2a38ce7104daaedd90fd7297c38c80b51e [file] [log] [blame]
-- This test triggers failover of content 0 and content 1
-- Content 0 is used to test if FTS can handle DNS errors
-- Content 1 is used to test the gang interaction in various
-- sessions when a failover is triggered and mirror is promoted
-- to primary
-- start_matchsubs
-- m/^ERROR: Error on receive from .*: server closed the connection unexpectedly/
-- s/^ERROR: Error on receive from .*: server closed the connection unexpectedly/ERROR: server closed the connection unexpectedly/
-- end_matchsubs
-- Allow extra time for mirror promotion to complete recovery to avoid
-- gprecoverseg BEGIN failures due to gang creation failure as some primaries
-- are not up. Setting these increases the number of retries in gang creation
-- in case a segment is in recovery. We want to wait approximately 120 seconds.
!\retcode gpconfig -c gp_gang_creation_retry_count -v 120 --skipvalidation --masteronly;
-- start_ignore
-- end_ignore
(exited with code 0)
!\retcode gpconfig -c gp_gang_creation_retry_timer -v 1000 --skipvalidation --masteronly;
-- start_ignore
-- end_ignore
(exited with code 0)
!\retcode gpstop -u;
-- start_ignore
-- end_ignore
(exited with code 0)
-- Helper function
-- Poll gp_segment_configuration until exactly num_segs segments have
-- status 'd'. The check is made up to 121 times with a one-second sleep
-- between attempts. Returns true as soon as the count matches, false once
-- the retries are exhausted.
CREATE or REPLACE FUNCTION wait_until_segments_are_down(num_segs int) RETURNS bool AS $$
declare /* in func */
    attempts_left int := 120; /* in func */
    down_count bigint; /* in func */
begin /* in func */
    loop /* in func */
        /* count the segments currently marked down */ /* in func */
        select count(*) into down_count /* in func */
        from gp_segment_configuration /* in func */
        where status = 'd'; /* in func */
        if down_count = num_segs then /* in func */
            return true; /* in func */
        end if; /* in func */
        /* give up only after the final check has also failed */ /* in func */
        exit when attempts_left <= 0; /* in func */
        perform pg_sleep(1); /* in func */
        attempts_left := attempts_left - 1; /* in func */
    end loop; /* in func */
    return false; /* in func */
end; /* in func */
$$ language plpgsql;
CREATE
-- no segment down.
select count(*) from gp_segment_configuration where status = 'd';
count
-------
0
(1 row)
-- Test table that session 2 inserts into inside its open transaction.
drop table if exists fts_errors_test;
DROP
create table fts_errors_test(a int);
CREATE
-- Put five sessions into different states before the failover is
-- triggered; each one is exercised again after the failover.
-- session 1: begins and immediately ends a transaction, so it is left with
-- no open transaction and no temp table.
1:BEGIN;
BEGIN
1:END;
END
-- session 2: left inside an open transaction that has inserted rows.
2:BEGIN;
BEGIN
2:INSERT INTO fts_errors_test SELECT * FROM generate_series(1,100);
INSERT 100
-- session 3: left inside an open transaction that holds a temp table and a
-- cursor declared on it.
3:BEGIN;
BEGIN
3:CREATE TEMP TABLE tmp3 (c1 int, c2 int);
CREATE
3:DECLARE c1 CURSOR for select * from tmp3;
DECLARE
-- session 4: no explicit transaction, but it has created a temp table.
4:CREATE TEMP TABLE tmp4 (c1 int, c2 int);
CREATE
-- session 5: left inside an open transaction with a temp table, a
-- savepoint, and a second temp table created after the savepoint.
5:BEGIN;
BEGIN
5:CREATE TEMP TABLE tmp5 (c1 int, c2 int);
CREATE
5:SAVEPOINT s1;
SAVEPOINT
5:CREATE TEMP TABLE tmp51 (c1 int, c2 int);
CREATE
-- Probe now so that the next call to gp_request_fts_probe_scan() below
-- does not overlap with an FTS scan auto-triggered by the FTS process.
-- If the two overlapped, a race condition could keep the fault from being
-- triggered and fail the test.
select gp_request_fts_probe_scan();
gp_request_fts_probe_scan
---------------------------
t
(1 row)
!\retcode gpfts -A -D;
-- start_ignore
-- end_ignore
(exited with code 0)
-- stop a primary in order to trigger a mirror promotion for content 1
select pg_ctl((select datadir from gp_segment_configuration c where c.role='p' and c.content=1), 'stop');
pg_ctl
--------
OK
(1 row)
-- trigger a DNS error. This fault internally gets triggered for content 0
select gp_inject_fault_infinite('get_dns_cached_address', 'skip', 1);
gp_inject_fault_infinite
--------------------------
Success:
(1 row)
-- trigger failover
select gp_request_fts_probe_scan();
gp_request_fts_probe_scan
---------------------------
t
(1 row)
!\retcode gpfts -A -D;
-- start_ignore
-- end_ignore
(exited with code 0)
select pg_sleep(5);
pg_sleep
----------
(1 row)
-- Since both gp_request_fts_probe_scan() and gp_inject_fault() will
-- call the cdbcomponent_updateCdbComponents(), there is a plausible
-- race condition between the fts_probes and the reset of the fault
-- injector; if the reset triggers the fault before the fts probe
-- completes, the primary will be taken down without removing the fault.
-- To avoid the race condition, the test waits until both the segments
-- go down before removing the fault.
-- The test expects the following 2 segments to go down:
-- 1. pg_ctl stop for dbid=3(content 1, primary)
-- 2. get_dns_cached_address fault injected for dbid=2(content 0, primary)
-- get_dns_cached_address will make the FTS update fail, so we
-- should check that no segment is down
-- start_ignore
-1U: select wait_until_segments_are_down(0);
wait_until_segments_are_down
------------------------------
t
(1 row)
-- end_ignore
select gp_inject_fault('get_dns_cached_address', 'reset', 1);
gp_inject_fault
-----------------
Success:
(1 row)
-- session 1: in no transaction and no temp table created, it's safe to
-- update cdb_component_dbs and use the new promoted primary
1:BEGIN;
BEGIN
1:END;
END
-- session 2: in transaction, gxid is dispatched to writer gang, cannot
-- update cdb_component_dbs, following query should fail
-- start_ignore
2:END;
ERROR: Error on receive from seg1 127.0.1.1:7003 pid=19840: server closed the connection unexpectedly
This probably means the server terminated abnormally
before or while processing the request.
-- end_ignore
-- session 3: in transaction and has a cursor, cannot update
-- cdb_component_dbs, following query should fail
3:FETCH ALL FROM c1;
c1 | c2
----+----
(0 rows)
3:END;
ERROR: gang was lost due to cluster reconfiguration (cdbgang_async.c:98)
-- session 4: not in transaction but has temp table, cannot update
-- cdb_component_dbs, following query should fail and session
-- is reset
4:select * from tmp4;
ERROR: gang was lost due to cluster reconfiguration (cdbgang_async.c:98)
4:select * from tmp4;
ERROR: relation "tmp4" does not exist
LINE 1: select * from tmp4;
^
-- session 5: has a subtransaction, cannot update cdb_component_dbs,
-- following query should fail
5:select * from tmp51;
ERROR: gang was lost due to cluster reconfiguration (cdbgang_async.c:98)
5:ROLLBACK TO SAVEPOINT s1;
ERROR: Could not rollback to savepoint (ROLLBACK TO SAVEPOINT s1)
5:END;
END
1q: ... <quitting>
2q: ... <quitting>
3q: ... <quitting>
4q: ... <quitting>
5q: ... <quitting>
-- immediate stop mirror for content 0. This is just to speed up the test, next
-- step gprecoverseg will do the same but it uses gpstop fast mode and not
-- immediate, which adds time to tests.
select pg_ctl((select datadir from gp_segment_configuration c where c.role='m' and c.content=0), 'stop');
pg_ctl
--------
OK
(1 row)
select pg_sleep(60);
pg_sleep
----------
(1 row)
-- fully recover the failed primary as new mirror
!\retcode gprecoverseg -aF --no-progress;
-- start_ignore
-- end_ignore
(exited with code 0)
!\retcode gpfts -A -D;
-- start_ignore
-- end_ignore
(exited with code 0)
-- loop while segments come in sync
select wait_until_all_segments_synchronized();
wait_until_all_segments_synchronized
--------------------------------------
OK
(1 row)
!\retcode gprecoverseg -ar;
-- start_ignore
-- end_ignore
(exited with code 0)
!\retcode gpfts -A -D;
-- start_ignore
-- end_ignore
(exited with code 0)
-- loop while segments come in sync
select wait_until_all_segments_synchronized();
wait_until_all_segments_synchronized
--------------------------------------
OK
(1 row)
-- verify no segment is down after recovery
select count(*) from gp_segment_configuration where status = 'd';
count
-------
0
(1 row)
!\retcode gpconfig -r gp_gang_creation_retry_count --skipvalidation --masteronly;
-- start_ignore
-- end_ignore
(exited with code 0)
!\retcode gpconfig -r gp_gang_creation_retry_timer --skipvalidation --masteronly;
-- start_ignore
-- end_ignore
(exited with code 0)
!\retcode gpstop -u;
-- start_ignore
-- end_ignore
(exited with code 0)