| -- This test triggers failover of content 0 and content 1 |
| -- Content 0 is used to test if FTS can handle DNS errors |
| -- Content 1 is used to test the gang interaction in various |
| -- sessions when a failover is triggered and mirror is promoted |
| -- to primary |
| |
| -- start_matchsubs |
| -- m/^ERROR: Error on receive from .*: server closed the connection unexpectedly/ |
| -- s/^ERROR: Error on receive from .*: server closed the connection unexpectedly/ERROR: server closed the connection unexpectedly/ |
| -- end_matchsubs |
| |
| -- Allow extra time for mirror promotion to complete recovery to avoid |
| -- gprecoverseg BEGIN failures due to gang creation failure as some primaries |
| -- are not up. Setting these increases the number of retries in gang creation |
| -- in case a segment is in recovery. We want to wait approximately 120 seconds. |
| !\retcode gpconfig -c gp_gang_creation_retry_count -v 120 --skipvalidation --masteronly; |
| -- start_ignore |
| -- end_ignore |
| (exited with code 0) |
| !\retcode gpconfig -c gp_gang_creation_retry_timer -v 1000 --skipvalidation --masteronly; |
| -- start_ignore |
| -- end_ignore |
| (exited with code 0) |
| !\retcode gpstop -u; |
| -- start_ignore |
| -- end_ignore |
| (exited with code 0) |
| |
| -- Helper function: polls gp_segment_configuration until exactly num_segs segments have status d (down). Checks once per second and gives up after 120 retries (about 120 seconds). Returns true as soon as the count matches, false if the retries run out. |
| CREATE or REPLACE FUNCTION wait_until_segments_are_down(num_segs int) RETURNS bool AS $$ declare retries int; /* in func */ begin /* in func */ retries := 120; /* in func */ loop /* in func */ if (select count(*) = num_segs from gp_segment_configuration where status = 'd') then /* in func */ return true; /* in func */ end if; /* in func */ if retries <= 0 then /* in func */ return false; /* in func */ end if; /* in func */ perform pg_sleep(1); /* in func */ retries := retries - 1; /* in func */ end loop; /* in func */ end; /* in func */ $$ language plpgsql; |
| CREATE |
| |
| -- no segment down. |
| select count(*) from gp_segment_configuration where status = 'd'; |
| count |
| ------- |
| 0 |
| (1 row) |
| |
| drop table if exists fts_errors_test; |
| DROP |
| create table fts_errors_test(a int); |
| CREATE |
| |
| 1:BEGIN; |
| BEGIN |
| 1:END; |
| END |
| 2:BEGIN; |
| BEGIN |
| 2:INSERT INTO fts_errors_test SELECT * FROM generate_series(1,100); |
| INSERT 100 |
| 3:BEGIN; |
| BEGIN |
| 3:CREATE TEMP TABLE tmp3 (c1 int, c2 int); |
| CREATE |
| 3:DECLARE c1 CURSOR for select * from tmp3; |
| DECLARE |
| 4:CREATE TEMP TABLE tmp4 (c1 int, c2 int); |
| CREATE |
| 5:BEGIN; |
| BEGIN |
| 5:CREATE TEMP TABLE tmp5 (c1 int, c2 int); |
| CREATE |
| 5:SAVEPOINT s1; |
| SAVEPOINT |
| 5:CREATE TEMP TABLE tmp51 (c1 int, c2 int); |
| CREATE |
| |
| -- Probe now so that the next gp_request_fts_probe_scan() call below does |
| -- not overlap with a scan auto-triggered by the FTS process. If the two |
| -- overlap, a race condition can keep the fault from triggering and fail |
| -- the test. |
| select gp_request_fts_probe_scan(); |
| gp_request_fts_probe_scan |
| --------------------------- |
| t |
| (1 row) |
| !\retcode gpfts -A -D; |
| -- start_ignore |
| |
| -- end_ignore |
| (exited with code 0) |
| |
| -- stop a primary in order to trigger a mirror promotion for content 1 |
| select pg_ctl((select datadir from gp_segment_configuration c where c.role='p' and c.content=1), 'stop'); |
| pg_ctl |
| -------- |
| OK |
| (1 row) |
| |
| -- trigger a DNS error. This fault internally gets triggered for content 0 |
| select gp_inject_fault_infinite('get_dns_cached_address', 'skip', 1); |
| gp_inject_fault_infinite |
| -------------------------- |
| Success: |
| (1 row) |
| |
| -- trigger failover |
| select gp_request_fts_probe_scan(); |
| gp_request_fts_probe_scan |
| --------------------------- |
| t |
| (1 row) |
| !\retcode gpfts -A -D; |
| -- start_ignore |
| |
| -- end_ignore |
| (exited with code 0) |
| select pg_sleep(5); |
| pg_sleep |
| ---------- |
| |
| (1 row) |
| |
| -- Since both gp_request_fts_probe_scan() and gp_inject_fault() will |
| -- call the cdbcomponent_updateCdbComponents(), there is a plausible |
| -- race condition between the fts_probes and the reset of the fault |
| -- injector; if the reset triggers the fault before the fts probe |
| -- completes, the primary will be taken down without removing the fault. |
| -- To avoid the race condition, the test waits until both the segments |
| -- go down before removing the fault. |
| -- The test expects the following 2 segments to go down: |
| -- 1. pg_ctl stop for dbid=3(content 1, primary) |
| -- 2. get_dns_cached_address fault injected for dbid=2(content 0, primary) |
| |
| -- The get_dns_cached_address fault makes the FTS update fail, so |
| -- check that no segment is marked down. |
| -- start_ignore |
| -1U: select wait_until_segments_are_down(0); |
| wait_until_segments_are_down |
| ------------------------------ |
| t |
| (1 row) |
| -- end_ignore |
| select gp_inject_fault('get_dns_cached_address', 'reset', 1); |
| gp_inject_fault |
| ----------------- |
| Success: |
| (1 row) |
| |
| -- session 1: in no transaction and no temp table created, it's safe to |
| -- update cdb_component_dbs and use the new promoted primary |
| 1:BEGIN; |
| BEGIN |
| 1:END; |
| END |
| -- session 2: in transaction, gxid is dispatched to writer gang, cannot |
| -- update cdb_component_dbs, following query should fail |
| -- start_ignore |
| 2:END; |
| ERROR: Error on receive from seg1 127.0.1.1:7003 pid=19840: server closed the connection unexpectedly |
| This probably means the server terminated abnormally |
| before or while processing the request. |
| -- end_ignore |
| -- session 3: in transaction and has a cursor, cannot update |
| -- cdb_component_dbs, following query should fail |
| 3:FETCH ALL FROM c1; |
| c1 | c2 |
| ----+---- |
| (0 rows) |
| 3:END; |
| ERROR: gang was lost due to cluster reconfiguration (cdbgang_async.c:98) |
| -- session 4: not in transaction but has temp table, cannot update |
| -- cdb_component_dbs, following query should fail and session |
| -- is reset |
| 4:select * from tmp4; |
| ERROR: gang was lost due to cluster reconfiguration (cdbgang_async.c:98) |
| 4:select * from tmp4; |
| ERROR: relation "tmp4" does not exist |
| LINE 1: select * from tmp4; |
| ^ |
| -- session 5: has a subtransaction, cannot update cdb_component_dbs, |
| -- following query should fail |
| 5:select * from tmp51; |
| ERROR: gang was lost due to cluster reconfiguration (cdbgang_async.c:98) |
| 5:ROLLBACK TO SAVEPOINT s1; |
| ERROR: Could not rollback to savepoint (ROLLBACK TO SAVEPOINT s1) |
| 5:END; |
| END |
| 1q: ... <quitting> |
| 2q: ... <quitting> |
| 3q: ... <quitting> |
| 4q: ... <quitting> |
| 5q: ... <quitting> |
| |
| -- immediate stop mirror for content 0. This is just to speed up the test, next |
| -- step gprecoverseg will do the same but it uses gpstop fast mode and not |
| -- immediate, which adds time to tests. |
| select pg_ctl((select datadir from gp_segment_configuration c where c.role='m' and c.content=0), 'stop'); |
| pg_ctl |
| -------- |
| OK |
| (1 row) |
| select pg_sleep(60); |
| pg_sleep |
| ---------- |
| |
| (1 row) |
| |
| -- fully recover the failed primary as new mirror |
| !\retcode gprecoverseg -aF --no-progress; |
| -- start_ignore |
| -- end_ignore |
| (exited with code 0) |
| !\retcode gpfts -A -D; |
| -- start_ignore |
| |
| -- end_ignore |
| (exited with code 0) |
| |
| -- loop while segments come in sync |
| select wait_until_all_segments_synchronized(); |
| wait_until_all_segments_synchronized |
| -------------------------------------- |
| OK |
| (1 row) |
| |
| !\retcode gprecoverseg -ar; |
| -- start_ignore |
| -- end_ignore |
| (exited with code 0) |
| !\retcode gpfts -A -D; |
| -- start_ignore |
| |
| -- end_ignore |
| (exited with code 0) |
| |
| -- loop while segments come in sync |
| select wait_until_all_segments_synchronized(); |
| wait_until_all_segments_synchronized |
| -------------------------------------- |
| OK |
| (1 row) |
| |
| -- verify no segment is down after recovery |
| select count(*) from gp_segment_configuration where status = 'd'; |
| count |
| ------- |
| 0 |
| (1 row) |
| |
| !\retcode gpconfig -r gp_gang_creation_retry_count --skipvalidation --masteronly; |
| -- start_ignore |
| -- end_ignore |
| (exited with code 0) |
| !\retcode gpconfig -r gp_gang_creation_retry_timer --skipvalidation --masteronly; |
| -- start_ignore |
| -- end_ignore |
| (exited with code 0) |
| !\retcode gpstop -u; |
| -- start_ignore |
| -- end_ignore |
| (exited with code 0) |
| |
| |