| -- Test to make sure FTS doesn't mark the primary down while it is recovering. |
| -- The 'fts_conn_startup_packet' fault is used to simulate the primary |
| -- responding in-recovery to FTS; the primary is not actually going through |
| -- crash recovery in this test. |
| select role, preferred_role, mode, status from gp_segment_configuration where content = 0; |
| role | preferred_role | mode | status |
| ------+----------------+------+-------- |
| p | p | s | u |
| m | m | s | u |
| (2 rows) |
| |
| select gp_inject_fault_infinite('fts_conn_startup_packet', 'skip', dbid) |
| from gp_segment_configuration where content = 0 and role = 'p'; |
| gp_inject_fault_infinite |
| -------------------------- |
| Success: |
| (1 row) |
| |
| -- Set the FTS probe retry count to 2 to keep the test deterministic and fast. |
| -- start_ignore |
| \!gpconfig -c gp_fts_probe_retries -v 2 --masteronly |
| -- end_ignore |
| -- Allow extra time for mirror promotion to complete recovery, to avoid |
| -- gprecoverseg BEGIN failures caused by gang creation failing while some |
| -- primaries are not up. These settings increase the number of gang creation |
| -- retries while a segment is in recovery; we want to wait about 2 minutes at most. |
| -- start_ignore |
| \!gpconfig -c gp_gang_creation_retry_count -v 127 --skipvalidation --masteronly |
| \!gpconfig -c gp_gang_creation_retry_timer -v 1000 --skipvalidation --masteronly |
| \!gpstop -u |
| -- end_ignore |
| -- Wait 5 seconds so that the config changes can take effect. |
| select pg_sleep(5); |
| pg_sleep |
| ---------- |
| |
| (1 row) |
| |
| select gp_request_fts_probe_scan(); |
| gp_request_fts_probe_scan |
| --------------------------- |
| t |
| (1 row) |
| |
| select gp_wait_until_triggered_fault('fts_conn_startup_packet', 3, dbid) |
| from gp_segment_configuration where content = 0 and role = 'p'; |
| gp_wait_until_triggered_fault |
| ------------------------------- |
| Success: |
| (1 row) |
| |
| select role, preferred_role, mode, status from gp_segment_configuration where content = 0; |
| role | preferred_role | mode | status |
| ------+----------------+------+-------- |
| p | p | s | u |
| m | m | s | u |
| (2 rows) |
| |
| -- Test the other scenario, where recovery on the primary is hung and hence FTS |
| -- marks the primary down and promotes the mirror. Setting the |
| -- 'fts_recovery_in_progress' fault to skip mimics a hung recovery on the primary. |
| select gp_inject_fault_infinite('fts_recovery_in_progress', 'skip', dbid) |
| from gp_segment_configuration where content = 0 and role = 'p'; |
| gp_inject_fault_infinite |
| -------------------------- |
| Success: |
| (1 row) |
| |
| -- Call gp_request_fts_probe_scan twice to guarantee that a scan happens after |
| -- the fts_recovery_in_progress fault has been injected. If a periodic FTS probe |
| -- is already running when the first scan is requested, that scan may not |
| -- observe the effect of the fault. |
| select gp_request_fts_probe_scan(); |
| gp_request_fts_probe_scan |
| --------------------------- |
| t |
| (1 row) |
| |
| select gp_request_fts_probe_scan(); |
| gp_request_fts_probe_scan |
| --------------------------- |
| t |
| (1 row) |
| |
| select role, preferred_role, mode, status from gp_segment_configuration where content = 0; |
| role | preferred_role | mode | status |
| ------+----------------+------+-------- |
| m | p | n | d |
| p | m | n | u |
| (2 rows) |
| |
| -- The remaining steps bring the cluster back to its original state. |
| -- start_ignore |
| -- Wait until the content 0 mirror is promoted (up to 120 attempts, sleeping |
| -- 0.5s after each failure); otherwise the gprecoverseg that runs next will fail. |
| do $$ |
| declare |
| y int; |
| begin |
| for i in 1..120 loop |
| begin |
| select count(*) into y from gp_dist_random('gp_id'); |
| raise notice 'got % results, mirror must have been promoted', y; |
| return; |
| exception |
| when others then |
| raise notice 'mirror may not be promoted yet: %', sqlerrm; |
| perform pg_sleep(0.5); |
| end; |
| end loop; |
| end; |
| $$; |
| NOTICE: got 3 results, mirror must have been promoted |
| \! gprecoverseg -av |
| -- end_ignore |
| -- Loop, requesting up to 120 FTS probe scans, until no content 0 segment has mode != 's'. |
| do $$ |
| begin |
| for i in 1..120 loop |
| if (select count(*) = 0 from gp_segment_configuration where content = 0 and mode != 's') then |
| return; |
| end if; |
| perform gp_request_fts_probe_scan(); |
| end loop; |
| end; |
| $$; |
| select role, preferred_role, mode, status from gp_segment_configuration where content = 0; |
| role | preferred_role | mode | status |
| ------+----------------+------+-------- |
| p | m | s | u |
| m | p | s | u |
| (2 rows) |
| |
| -- start_ignore |
| \! gprecoverseg -arv |
| -- end_ignore |
| -- Loop, requesting up to 120 FTS probe scans, until no content 0 segment has mode != 's'. |
| do $$ |
| begin |
| for i in 1..120 loop |
| if (select count(*) = 0 from gp_segment_configuration where content = 0 and mode != 's') then |
| return; |
| end if; |
| perform gp_request_fts_probe_scan(); |
| end loop; |
| end; |
| $$; |
| select role, preferred_role, mode, status from gp_segment_configuration where content = 0; |
| role | preferred_role | mode | status |
| ------+----------------+------+-------- |
| p | p | s | u |
| m | m | s | u |
| (2 rows) |
| |
| -- start_ignore |
| -- Remove every GUC override this test configured. gp_fts_probe_retries was set |
| -- to 2 earlier in this test and must be removed too, so that it does not leak |
| -- into tests that run afterwards. |
| \!gpconfig -r gp_fts_probe_retries --masteronly |
| \!gpconfig -r gp_gang_creation_retry_count --skipvalidation --masteronly |
| \!gpconfig -r gp_gang_creation_retry_timer --skipvalidation --masteronly |
| \!gpstop -u |
| -- end_ignore |
| -- Cleanup: reset all injected faults on the content 0 primary. |
| select gp_inject_fault('all', 'reset', dbid) |
| from gp_segment_configuration where content = 0 and role = 'p'; |
| gp_inject_fault |
| ----------------- |
| Success: |
| (1 row) |
| |