KUDU-2149: avoid election stacking by restoring failure monitor semantics
Prior to commit 21b0f3d, the dedicated failure monitor thread invoked
RaftConsensus::StartElection() synchronously, thus preventing it from
surfacing additional failures during that time. This patch attempts to
restore these semantics by short-circuiting and ignoring any failures
detected while a Raft thread is in StartElection().
This is a narrowly targeted fix geared towards a point release; a more
correct fix would be to disable failure detection entirely while an
election is running, but that will require more work.
Originally I had written a test that injects latency into
ConsensusMetadata::Flush(), toggles the fix, and compares the number of vote
request RPCs. I couldn't get it to be totally robust, and the "feature flag"
used in the toggle is likely to become obsolete quickly. So in the end I
decided to drop the test from the patch.
Change-Id: Ifeaf99ce57f7d5cd01a6c786c178567a98438ced
Reviewed-on: http://gerrit.cloudera.org:8080/8107
Reviewed-by: Mike Percy <mpercy@apache.org>
Tested-by: Kudu Jenkins
(cherry picked from commit edd41cb40fbad206e2c356983baba8fbc57199b5)
Reviewed-on: http://gerrit.cloudera.org:8080/10987
Reviewed-by: Adar Dembo <adar@cloudera.com>
Tested-by: Adar Dembo <adar@cloudera.com>
diff --git a/src/kudu/consensus/raft_consensus.cc b/src/kudu/consensus/raft_consensus.cc
index 8a251a1..863a46b 100644
--- a/src/kudu/consensus/raft_consensus.cc
+++ b/src/kudu/consensus/raft_consensus.cc
@@ -521,9 +521,13 @@
}
void RaftConsensus::ReportFailureDetectedTask() {
- WARN_NOT_OK(StartElection(FLAGS_raft_enable_pre_election ?
- PRE_ELECTION : NORMAL_ELECTION, ELECTION_TIMEOUT_EXPIRED),
- LogPrefixThreadSafe() + "failed to trigger leader election");
+ std::unique_lock<simple_spinlock> try_lock(failure_detector_election_lock_,
+ std::try_to_lock);
+ if (try_lock.owns_lock()) {
+ WARN_NOT_OK(StartElection(FLAGS_raft_enable_pre_election ?
+ PRE_ELECTION : NORMAL_ELECTION, ELECTION_TIMEOUT_EXPIRED),
+ LogPrefixThreadSafe() + "failed to trigger leader election");
+ }
}
void RaftConsensus::ReportFailureDetected() {
diff --git a/src/kudu/consensus/raft_consensus.h b/src/kudu/consensus/raft_consensus.h
index 3a661d7..bf2b9da 100644
--- a/src/kudu/consensus/raft_consensus.h
+++ b/src/kudu/consensus/raft_consensus.h
@@ -772,6 +772,22 @@
std::shared_ptr<rpc::PeriodicTimer> failure_detector_;
+ // Lock held while starting a failure-triggered election.
+ //
+ // After reporting a failure and asynchronously starting an election, the
+ // failure detector immediately rearms. If the election starts slowly (i.e.
+ // there's a lot of contention on the consensus lock, or persisting votes is
+ // really slow due to other I/O), more elections may start and "stack" on
+ // top of the first. Forcing the starting of elections to serialize on this
+ // lock prevents that from happening. See KUDU-2149 for more details.
+ //
+ // Note: the lock is only ever acquired via try_lock(); if it cannot be
+ // acquired, a StartElection() is in progress so the next one is skipped.
+ //
+ // TODO(KUDU-2155): should be replaced with explicit disabling/enabling of
+ // the failure detector during elections.
+ simple_spinlock failure_detector_election_lock_;
+
// If any RequestVote() RPC arrives before this timestamp,
// the request will be ignored. This prevents abandoned or partitioned
// nodes from disturbing the healthy leader.