Add jittered delay during replication error handling

For one-to-many replications, a source failure can create a stampede
effect. A jittered delay is used to avoid that. The delay is random, in a
range proportional to the current number of replications, with a maximum
of 1 minute.

Seed the random number generator within each replication process with a
non-deterministic value; otherwise the same sequence of delays is generated
for all replications.

Jira: COUCHDB-3006
diff --git a/src/couch_replicator.erl b/src/couch_replicator.erl
index 4e25e14..7f0c7ee 100644
--- a/src/couch_replicator.erl
+++ b/src/couch_replicator.erl
@@ -256,6 +256,8 @@
 do_init(#rep{options = Options, id = {BaseId, Ext}, user_ctx=UserCtx} = Rep) ->
     process_flag(trap_exit, true),
 
+    random:seed(os:timestamp()),
+
     #rep_state{
         source = Source,
         target = Target,
diff --git a/src/couch_replicator_manager.erl b/src/couch_replicator_manager.erl
index 0811796..342dffb 100644
--- a/src/couch_replicator_manager.erl
+++ b/src/couch_replicator_manager.erl
@@ -45,6 +45,8 @@
 -define(REP_TO_STATE, couch_rep_id_to_rep_state).
 -define(INITIAL_WAIT, 2.5). % seconds
 -define(MAX_WAIT, 600).     % seconds
+-define(AVG_ERROR_DELAY_MSEC, 100).
+-define(MAX_ERROR_DELAY_MSEC, 60000).
 -define(OWNER, <<"owner">>).
 
 -define(DB_TO_SEQ, db_to_seq).
@@ -124,6 +126,7 @@
     nil ->
         ok;
     #rep_state{rep = #rep{db_name = DbName, doc_id = DocId}} ->
+        ok = add_error_jitter(),
         update_rep_doc(DbName, DocId, [
             {<<"_replication_state">>, <<"error">>},
             {<<"_replication_state_reason">>, to_binary(error_reason(Error))},
@@ -131,6 +134,15 @@
         ok = gen_server:call(?MODULE, {rep_error, RepId, Error}, infinity)
     end.
 
+% Sleep for a random 1..Range msec so that replications sharing a failed
+% source do not all report the error at once. Range grows with the number
+% of replications on this node, capped at ?MAX_ERROR_DELAY_MSEC.
+add_error_jitter() ->
+    RepCount = ets:info(?REP_TO_STATE, size),
+    Range = min(2 * RepCount * ?AVG_ERROR_DELAY_MSEC, ?MAX_ERROR_DELAY_MSEC),
+    timer:sleep(random:uniform(max(1, Range))). % uniform/1 rejects N < 1
+
+
 continue(#rep{doc_id = null}) ->
     {true, no_owner};
 continue(#rep{id = RepId}) ->