Add jittered delay during replication error handling
For one-to-many replications, a failure of the source can create a stampede
effect. A jittered delay is used to avoid that. The delay is random, in a range
proportional to the current number of replications, with a maximum of 1 minute.
Seed the random number generator within each replication process with a
non-deterministic value; otherwise the same sequence of delays is generated
for all replications.
Jira: COUCHDB-3006
diff --git a/src/couch_replicator.erl b/src/couch_replicator.erl
index 4e25e14..7f0c7ee 100644
--- a/src/couch_replicator.erl
+++ b/src/couch_replicator.erl
@@ -256,6 +256,8 @@
do_init(#rep{options = Options, id = {BaseId, Ext}, user_ctx=UserCtx} = Rep) ->
process_flag(trap_exit, true),
+ random:seed(os:timestamp()),
+
#rep_state{
source = Source,
target = Target,
diff --git a/src/couch_replicator_manager.erl b/src/couch_replicator_manager.erl
index 0811796..342dffb 100644
--- a/src/couch_replicator_manager.erl
+++ b/src/couch_replicator_manager.erl
@@ -45,6 +45,8 @@
-define(REP_TO_STATE, couch_rep_id_to_rep_state).
-define(INITIAL_WAIT, 2.5). % seconds
-define(MAX_WAIT, 600). % seconds
+-define(AVG_ERROR_DELAY_MSEC, 100).
+-define(MAX_ERROR_DELAY_MSEC, 60000).
-define(OWNER, <<"owner">>).
-define(DB_TO_SEQ, db_to_seq).
@@ -124,6 +126,7 @@
nil ->
ok;
#rep_state{rep = #rep{db_name = DbName, doc_id = DocId}} ->
+ ok = add_error_jitter(),
update_rep_doc(DbName, DocId, [
{<<"_replication_state">>, <<"error">>},
{<<"_replication_state_reason">>, to_binary(error_reason(Error))},
@@ -131,6 +134,15 @@
ok = gen_server:call(?MODULE, {rep_error, RepId, Error}, infinity)
end.
+% Sleep a random time, in a range proportional to the number of replications
+% on this node (capped at ?MAX_ERROR_DELAY_MSEC), to prevent a stampede when a
+% shared source fails. Range is kept >= 1: random:uniform(0) would crash.
+add_error_jitter() ->
+    RepCount = ets:info(?REP_TO_STATE, size),
+    Range = min(2 * RepCount * ?AVG_ERROR_DELAY_MSEC, ?MAX_ERROR_DELAY_MSEC),
+    timer:sleep(random:uniform(max(1, Range))).
+
+
continue(#rep{doc_id = null}) ->
{true, no_owner};
continue(#rep{id = RepId}) ->