Fix prometheus to survive mem3_sync termination
Currently, if `mem3_sync` is terminated, `couch_prometheus_server` crashes
every time it tries to read the internal replication jobs backlog:
```
[error] 2023-05-31T15:52:13.989437Z node1@127.0.0.1 <0.1065.0> -------- gen_server couch_prometheus_server terminated with reason: no such process or port in call to gen_server:call(mem3_sync, get_backlog) at gen_server:call/2(line:370) <= couch_prometheus_server:get_internal_replication_jobs_stat/0(line:131) <= couch_prometheus_server:get_system_stats/0(line:118) <= couch_prometheus_server:refresh_metrics/0(line:90) <= couch_prometheus_server:handle_info/2(line:76) <= gen_server:try_dispatch/4(line:1123) <= gen_server:handle_msg/6(line:1200) <= proc_lib:init_p_do_apply/3(line:240)
```
and eventually the `couch_prometheus` application is shut down, which takes the BEAM down with it:
```
[os_mon] cpu supervisor port (cpu_sup): Erlang has closed
{"Kernel pid terminated",application_controller,"{application_terminated,couch_prometheus,shutdown}"}
```
This wraps the `mem3_sync:get_backlog()` call in a try/catch that logs a warning and reports a backlog of 0, which prevents the crash.
diff --git a/src/couch_prometheus/src/couch_prometheus_server.erl b/src/couch_prometheus/src/couch_prometheus_server.erl
index d40efc7..1649898 100644
--- a/src/couch_prometheus/src/couch_prometheus_server.erl
+++ b/src/couch_prometheus/src/couch_prometheus_server.erl
@@ -36,6 +36,12 @@
terminate/2
]).
+-ifdef(TEST).
+-export([
+ get_internal_replication_jobs_stat/0
+]).
+-endif.
+
-include("couch_prometheus.hrl").
start_link() ->
@@ -128,7 +134,13 @@
internal_replication_jobs,
gauge,
"count of internal replication changes to process",
- mem3_sync:get_backlog()
+ try
+ mem3_sync:get_backlog()
+ catch
+ _:_ ->
+ couch_log:warning("~p mem3_sync down", [?MODULE]),
+ 0
+ end
).
get_membership_stat() ->
diff --git a/src/couch_prometheus/test/eunit/couch_prometheus_e2e_tests.erl b/src/couch_prometheus/test/eunit/couch_prometheus_e2e_tests.erl
index 2a10160..d24a01b 100644
--- a/src/couch_prometheus/test/eunit/couch_prometheus_e2e_tests.erl
+++ b/src/couch_prometheus/test/eunit/couch_prometheus_e2e_tests.erl
@@ -41,7 +41,8 @@
?TDEF_FE(t_prometheus_port),
?TDEF_FE(t_metric_updated),
?TDEF_FE(t_no_duplicate_metrics),
- ?TDEF_FE(t_starts_with_couchdb)
+ ?TDEF_FE(t_starts_with_couchdb),
+ ?TDEF_FE(t_survives_mem3_sync_termination)
]
}
}
@@ -173,6 +174,19 @@
Lines
).
+t_survives_mem3_sync_termination(_) -> % the stat must stay readable while mem3_sync is stopped
+ ServerPid = whereis(couch_prometheus_server), % kept to compare with the pid at the end
+ ?assertNotEqual(undefined, ServerPid),
+ ?assertNotEqual(undefined, whereis(mem3_sync)), % precondition: mem3_sync is registered
+ ok = supervisor:terminate_child(mem3_sup, mem3_sync), % stop mem3_sync through mem3_sup
+ ?assertEqual(undefined, whereis(mem3_sync)), % its registered name must be gone now
+ ?assertMatch(
+ [[_, _], <<"couchdb_internal_replication_jobs 0">>], % last element: the metric line with value 0
+ couch_prometheus_server:get_internal_replication_jobs_stat() % invoked directly from the test process
+ ),
+ {ok, _} = supervisor:restart_child(mem3_sup, mem3_sync), % bring mem3_sync back
+ ?assertEqual(ServerPid, whereis(couch_prometheus_server)). % same pid: the server was not restarted
+
node_local_url(Port) ->
Addr = config:get("chttpd", "bind_address", "127.0.0.1"),
lists:concat(["http://", Addr, ":", Port, "/_node/_local/_prometheus"]).