// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <cstddef>
#include <cstdint>
#include <functional>
#include <memory>
#include <numeric>
#include <ostream>
#include <string>
#include <thread>
#include <unordered_map>
#include <utility>
#include <vector>
#include <boost/optional/optional.hpp>
#include <gflags/gflags_declare.h>
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "kudu/client/client.h"
#include "kudu/client/replica_controller-internal.h"
#include "kudu/client/shared_ptr.h" // IWYU pragma: keep
#include "kudu/consensus/consensus.pb.h"
#include "kudu/consensus/metadata.pb.h"
#include "kudu/consensus/quorum_util.h"
#include "kudu/gutil/map-util.h"
#include "kudu/gutil/stl_util.h"
#include "kudu/gutil/strings/substitute.h"
#include "kudu/gutil/strings/util.h"
#include "kudu/integration-tests/cluster_itest_util.h"
#include "kudu/integration-tests/cluster_verifier.h"
#include "kudu/integration-tests/mini_cluster_fs_inspector.h"
#include "kudu/integration-tests/raft_consensus-itest-base.h"
#include "kudu/integration-tests/test_workload.h"
#include "kudu/master/master.pb.h"
#include "kudu/mini-cluster/external_mini_cluster.h"
#include "kudu/tablet/metadata.pb.h"
#include "kudu/tserver/tablet_server-test-base.h"
#include "kudu/tserver/tserver.pb.h"
#include "kudu/util/metrics.h"
#include "kudu/util/monotime.h"
#include "kudu/util/net/sockaddr.h"
#include "kudu/util/pb_util.h"
#include "kudu/util/scoped_cleanup.h"
#include "kudu/util/status.h"
#include "kudu/util/test_macros.h"
#include "kudu/util/test_util.h"
DECLARE_int32(heartbeat_interval_ms);
DECLARE_int32(num_replicas);
DECLARE_int32(num_tablet_servers);
DECLARE_double(leader_failure_max_missed_heartbeat_periods);
METRIC_DECLARE_gauge_int32(tablet_copy_open_client_sessions);
METRIC_DECLARE_gauge_int32(tablet_copy_open_source_sessions);
using boost::none;
using kudu::client::sp::shared_ptr;
using kudu::client::KuduClient;
using kudu::client::KuduClientBuilder;
using kudu::client::KuduScanToken;
using kudu::client::KuduScanTokenBuilder;
using kudu::client::KuduTable;
using kudu::client::internal::ReplicaController;
using kudu::cluster::ExternalDaemon;
using kudu::cluster::ExternalMaster;
using kudu::cluster::ExternalTabletServer;
using kudu::consensus::EXCLUDE_HEALTH_REPORT;
using kudu::consensus::IsRaftConfigMember;
using kudu::consensus::IsRaftConfigVoter;
using kudu::consensus::RaftPeerPB;
using kudu::itest::AddServer;
using kudu::itest::GetConsensusState;
using kudu::itest::GetInt64Metric;
using kudu::itest::GetTableLocations;
using kudu::itest::GetTabletLocations;
using kudu::itest::LeaderStepDown;
using kudu::itest::RemoveServer;
using kudu::itest::StartElection;
using kudu::itest::TServerDetails;
using kudu::itest::TabletServerMap;
using kudu::itest::WAIT_FOR_LEADER;
using kudu::itest::WaitForNumTabletsOnTS;
using kudu::itest::WaitForReplicasReportedToMaster;
using kudu::master::ANY_REPLICA;
using kudu::master::GetTableLocationsResponsePB;
using kudu::master::GetTabletLocationsResponsePB;
using kudu::master::TabletLocationsPB;
using kudu::master::VOTER_REPLICA;
using kudu::tablet::TABLET_DATA_COPYING;
using kudu::tablet::TABLET_DATA_TOMBSTONED;
using std::string;
using std::vector;
using strings::Substitute;
namespace kudu {
namespace tserver {
// Integration test for the raft consensus implementation.
// Uses the whole tablet server stack with ExternalMiniCluster.
class RaftConsensusNonVoterITest : public RaftConsensusITestBase {
public:
RaftConsensusNonVoterITest() = default;
protected:
// Get number of source tablet copy sessions at the specified server.
Status GetTabletCopySourceSessionsCount(const ExternalDaemon& server,
int64_t* count);
// Get number of target/client tablet copy sessions at the specified server.
Status GetTabletCopyTargetSessionsCount(const ExternalDaemon& server,
int64_t* count);
// Add replica of the specified type for the specified tablet.
Status AddReplica(const string& tablet_id,
const TServerDetails* replica,
RaftPeerPB::MemberType replica_type,
const MonoDelta& timeout);
// Remove replica of the specified tablet.
Status RemoveReplica(const string& tablet_id,
const TServerDetails* replica,
const MonoDelta& timeout);
// Change replica membership to the specified type, i.e. promote a replica
// in case of RaftPeerPB::VOTER member_type, and demote a replica in case of
// RaftPeerPB::NON_VOTER member_type.
Status ChangeReplicaMembership(RaftPeerPB::MemberType member_type,
const string& tablet_id,
const TServerDetails* replica,
const MonoDelta& timeout);
};
Status RaftConsensusNonVoterITest::GetTabletCopySourceSessionsCount(
const ExternalDaemon& server, int64_t* count) {
return GetInt64Metric(server.bound_http_hostport(),
&METRIC_ENTITY_server, "kudu.tabletserver",
&METRIC_tablet_copy_open_source_sessions, "value", count);
}
Status RaftConsensusNonVoterITest::GetTabletCopyTargetSessionsCount(
const ExternalDaemon& server, int64_t* count) {
return GetInt64Metric(server.bound_http_hostport(),
&METRIC_ENTITY_server, "kudu.tabletserver",
&METRIC_tablet_copy_open_client_sessions, "value", count);
}
static bool IsConfigurationLeaderError(const Status& s) {
static const string kPattern = "*Replica * is not leader of this config*";
return s.IsIllegalState() && MatchPattern(s.ToString(), kPattern);
}
Status RaftConsensusNonVoterITest::AddReplica(const string& tablet_id,
const TServerDetails* replica,
RaftPeerPB::MemberType replica_type,
const MonoDelta& timeout) {
const MonoTime deadline = MonoTime::Now() + timeout;
while (MonoTime::Now() < deadline) {
TServerDetails* leader = nullptr;
RETURN_NOT_OK(WaitForLeaderWithCommittedOp(tablet_id, timeout, &leader));
Status s = AddServer(leader, tablet_id, replica, replica_type, timeout);
if (IsConfigurationLeaderError(s)) {
// The leader has changed, retry.
continue;
}
return s;
}
return Status::TimedOut(Substitute("timeout adding replica $0 for tablet $1",
replica->uuid(), tablet_id));
}
Status RaftConsensusNonVoterITest::RemoveReplica(const string& tablet_id,
const TServerDetails* replica,
const MonoDelta& timeout) {
const MonoTime deadline = MonoTime::Now() + timeout;
while (MonoTime::Now() < deadline) {
TServerDetails* leader = nullptr;
RETURN_NOT_OK(WaitForLeaderWithCommittedOp(tablet_id, timeout, &leader));
Status s = RemoveServer(leader, tablet_id, replica, timeout);
if (IsConfigurationLeaderError(s)) {
// The leader has changed, retry.
continue;
}
return s;
}
return Status::TimedOut(Substitute("timeout removing replica $0 for tablet $1",
replica->uuid(), tablet_id));
}
Status RaftConsensusNonVoterITest::ChangeReplicaMembership(
RaftPeerPB::MemberType member_type,
const string& tablet_id,
const TServerDetails* replica,
const MonoDelta& timeout) {
const MonoTime deadline = MonoTime::Now() + timeout;
while (MonoTime::Now() < deadline) {
TServerDetails* leader = nullptr;
RETURN_NOT_OK(WaitForLeaderWithCommittedOp(tablet_id, timeout, &leader));
Status s = ChangeReplicaType(leader, tablet_id, replica, member_type, timeout);
if (IsConfigurationLeaderError(s)) {
// The leader has changed, retry.
continue;
}
return s;
}
return Status::TimedOut(
Substitute("tablet $0: timeout changing replica $0 membership type",
tablet_id, replica->uuid()));
}
// Verify that GetTableLocations() and GetTabletLocations() return the expected
// set of replicas for the specified replica type filter.
TEST_F(RaftConsensusNonVoterITest, GetTableAndTabletLocations) {
const MonoDelta kTimeout = MonoDelta::FromSeconds(60);
const int kOriginalReplicasNum = 3;
const vector<string> kMasterFlags = {
// In the 3-4-3 replication scheme, the catalog manager removes excess
// replicas, so it's necessary to disable that default behavior since we
// want the newly added non-voter replica to stay.
"--catalog_manager_evict_excess_replicas=false",
};
FLAGS_num_tablet_servers = kOriginalReplicasNum + 1;
FLAGS_num_replicas = kOriginalReplicasNum;
NO_FATALS(BuildAndStart({}, kMasterFlags));
ASSERT_EQ(FLAGS_num_tablet_servers, tablet_servers_.size());
ASSERT_EQ(kOriginalReplicasNum, tablet_replicas_.size());
const string& tablet_id = tablet_id_;
TabletServerMap replica_servers;
for (const auto& e : tablet_replicas_) {
if (e.first == tablet_id) {
replica_servers.emplace(e.second->uuid(), e.second);
}
}
ASSERT_EQ(FLAGS_num_replicas, replica_servers.size());
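// Find a tablet server that does not yet host a replica of the tablet;
// the new NON_VOTER replica will be placed there.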
TServerDetails* new_replica = nullptr;
for (const auto& ts : tablet_servers_) {
if (replica_servers.find(ts.first) == replica_servers.end()) {
new_replica = ts.second;
break;
}
}
ASSERT_NE(nullptr, new_replica);
ASSERT_OK(AddReplica(tablet_id, new_replica, RaftPeerPB::NON_VOTER, kTimeout));
// Wait for the newly added replica to start.
ASSERT_OK(WaitForNumTabletsOnTS(
new_replica, 1, kTimeout, nullptr, tablet::RUNNING));
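// A helper to tally the replica roles reported in the given tablet locations.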
const auto count_roles = [](const TabletLocationsPB& tablet_locations,
int* num_leaders, int* num_followers, int* num_learners) {
*num_leaders = 0;
*num_followers = 0;
*num_learners = 0;
for (const auto& r : tablet_locations.interned_replicas()) {
*num_leaders += (r.role() == RaftPeerPB::LEADER) ? 1 : 0;
*num_followers += (r.role() == RaftPeerPB::FOLLOWER) ? 1 : 0;
*num_learners += (r.role() == RaftPeerPB::LEARNER) ? 1 : 0;
}
};
// Verify that replica type filter yields appropriate results for
// GetTableLocations() RPC.
{
GetTableLocationsResponsePB table_locations;
ASSERT_OK(GetTableLocations(cluster_->master_proxy(), table_->name(),
kTimeout, VOTER_REPLICA, /*table_id=*/none,
&table_locations));
ASSERT_EQ(1, table_locations.tablet_locations().size());
const TabletLocationsPB& locations = table_locations.tablet_locations(0);
ASSERT_EQ(tablet_id_, locations.tablet_id());
ASSERT_EQ(kOriginalReplicasNum, locations.interned_replicas_size());
int num_leaders = 0, num_followers = 0, num_learners = 0;
count_roles(locations, &num_leaders, &num_followers, &num_learners);
ASSERT_EQ(kOriginalReplicasNum, num_leaders + num_followers);
ASSERT_EQ(0, num_learners);
}
{
GetTableLocationsResponsePB table_locations;
ASSERT_OK(GetTableLocations(cluster_->master_proxy(), table_->name(),
kTimeout, ANY_REPLICA, /*table_id=*/none,
&table_locations));
ASSERT_EQ(1, table_locations.tablet_locations().size());
const TabletLocationsPB& locations = table_locations.tablet_locations(0);
ASSERT_EQ(tablet_id_, locations.tablet_id());
ASSERT_EQ(kOriginalReplicasNum + 1, locations.interned_replicas_size());
int num_leaders = 0, num_followers = 0, num_learners = 0;
count_roles(locations, &num_leaders, &num_followers, &num_learners);
ASSERT_EQ(kOriginalReplicasNum, num_leaders + num_followers);
ASSERT_EQ(1, num_learners);
}
// Verify that replica type filter yields appropriate results for
// GetTabletLocations() RPC.
{
GetTabletLocationsResponsePB tablet_locations;
ASSERT_OK(GetTabletLocations(cluster_->master_proxy(), tablet_id_,
kTimeout, VOTER_REPLICA, &tablet_locations));
ASSERT_EQ(kOriginalReplicasNum,
tablet_locations.tablet_locations(0).interned_replicas_size());
int num_leaders = 0, num_followers = 0, num_learners = 0;
count_roles(tablet_locations.tablet_locations(0), &num_leaders, &num_followers, &num_learners);
ASSERT_EQ(kOriginalReplicasNum, num_leaders + num_followers);
ASSERT_EQ(0, num_learners);
}
{
GetTabletLocationsResponsePB tablet_locations;
ASSERT_OK(GetTabletLocations(cluster_->master_proxy(), tablet_id_,
kTimeout, ANY_REPLICA, &tablet_locations));
ASSERT_EQ(kOriginalReplicasNum + 1,
tablet_locations.tablet_locations(0).interned_replicas_size());
int num_leaders = 0, num_followers = 0, num_learners = 0;
count_roles(tablet_locations.tablet_locations(0), &num_leaders, &num_followers, &num_learners);
ASSERT_EQ(kOriginalReplicasNum, num_leaders + num_followers);
ASSERT_EQ(1, num_learners);
}
}
// Verify that non-voter replicas are not exposed to a regular Kudu client.
// However, a client with its replica visibility set to ALL can see them.
TEST_F(RaftConsensusNonVoterITest, ReplicaMatchPolicy) {
const MonoDelta kTimeout = MonoDelta::FromSeconds(60);
const int kOriginalReplicasNum = 3;
FLAGS_num_tablet_servers = kOriginalReplicasNum + 1;
FLAGS_num_replicas = kOriginalReplicasNum;
const vector<string> kMasterFlags = {
// In the 3-4-3 replication scheme, the catalog manager removes excess
// replicas, so it's necessary to disable that default behavior since we
// want the newly added non-voter replica to stay.
"--catalog_manager_evict_excess_replicas=false",
};
NO_FATALS(BuildAndStart({}, kMasterFlags));
ASSERT_EQ(FLAGS_num_tablet_servers, tablet_servers_.size());
ASSERT_EQ(kOriginalReplicasNum, tablet_replicas_.size());
const string& tablet_id = tablet_id_;
TabletServerMap replica_servers;
for (const auto& e : tablet_replicas_) {
if (e.first == tablet_id) {
replica_servers.emplace(e.second->uuid(), e.second);
}
}
ASSERT_EQ(FLAGS_num_replicas, replica_servers.size());
TServerDetails* new_replica = nullptr;
for (const auto& ts : tablet_servers_) {
if (replica_servers.find(ts.first) == replica_servers.end()) {
new_replica = ts.second;
break;
}
}
ASSERT_NE(nullptr, new_replica);
ASSERT_OK(AddReplica(tablet_id, new_replica, RaftPeerPB::NON_VOTER, kTimeout));
// Wait for the newly added replica to start.
ASSERT_OK(WaitForNumTabletsOnTS(
new_replica, 1, kTimeout, nullptr, tablet::RUNNING));
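// A helper that counts the tablet replicas visible through scan tokens built
// from the given table handle.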
auto count_replicas = [](KuduTable* table, size_t* count) {
vector<KuduScanToken*> tokens;
ElementDeleter deleter(&tokens);
KuduScanTokenBuilder builder(table);
RETURN_NOT_OK(builder.Build(&tokens));
size_t replicas_count = std::accumulate(
tokens.begin(), tokens.end(), 0,
[](size_t sum, const KuduScanToken* token) {
return sum + token->tablet().replicas().size();
});
*count = replicas_count;
return Status::OK();
};
// The case of a regular client: the non-voter replica should not be seen.
{
size_t count = 0;
ASSERT_OK(count_replicas(table_.get(), &count));
EXPECT_EQ(kOriginalReplicasNum, count);
}
// The case of a special client used for internal tools, etc.: non-voter
// replicas should be visible.
{
KuduClientBuilder builder;
ReplicaController::SetVisibility(&builder, ReplicaController::Visibility::ALL);
shared_ptr<KuduClient> client;
ASSERT_OK(builder
.add_master_server_addr(cluster_->master()->bound_rpc_addr().ToString())
.Build(&client));
ASSERT_NE(nullptr, client.get());
shared_ptr<KuduTable> t;
ASSERT_OK(client->OpenTable(table_->name(), &t));
size_t count = 0;
ASSERT_OK(count_replicas(t.get(), &count));
EXPECT_EQ(kOriginalReplicasNum + 1, count);
}
}
// Ensure that adding a NON_VOTER replica is properly handled by the system:
//
// * Updating Raft configuration for tablet by adding a NON_VOTER replica
// succeeds, no errors reported.
//
// * After adding a replica, the system should start tablet copying
// to the newly added replica: both the source and the target copy sessions
// should be active for some time.
//
// * By the time the newly added replica changes its state to RUNNING,
// the tablet copy session should end at both sides.
//
// * The tablet leader reports the newly added replica to the master.
//
// * If the leader steps down, a new one can be elected and it's possible
// to insert data into the table which contains the tablet.
//
// * The tablet stays consistent: ksck verification reports no error,
// replicated operation indices match across all replicas,
// tablet row count matches the expected number.
//
TEST_F(RaftConsensusNonVoterITest, AddNonVoterReplica) {
const MonoDelta kTimeout = MonoDelta::FromSeconds(60);
const vector<string> kMasterFlags = {
// Allow replication factor of 2.
"--allow_unsafe_replication_factor=true",
// In the 3-4-3 replication scheme, the catalog manager removes excess
// replicas, so it's necessary to disable that default behavior since we
// want the newly added non-voter replica to stay.
"--catalog_manager_evict_excess_replicas=false",
};
const vector<string> kTserverFlags = {
// Slow down tablet copy to observe active source and target sessions.
"--tablet_copy_download_file_inject_latency_ms=1000",
};
const int kOriginalReplicasNum = 2;
FLAGS_num_tablet_servers = 3;
FLAGS_num_replicas = kOriginalReplicasNum;
NO_FATALS(BuildAndStart(kTserverFlags, kMasterFlags));
ASSERT_EQ(3, tablet_servers_.size());
ASSERT_EQ(kOriginalReplicasNum, tablet_replicas_.size());
const string& tablet_id = tablet_id_;
TabletServerMap replica_servers;
for (const auto& e : tablet_replicas_) {
if (e.first == tablet_id) {
replica_servers.emplace(e.second->uuid(), e.second);
}
}
ASSERT_EQ(FLAGS_num_replicas, replica_servers.size());
// Create a test table and insert some data into it, so the special flag
// --tablet_copy_download_file_inject_latency_ms can take effect while
// tablet copy happens down the road.
TestWorkload workload(cluster_.get());
workload.set_table_name(kTableId);
workload.Setup();
workload.Start();
while (workload.rows_inserted() < 100) {
SleepFor(MonoDelta::FromMilliseconds(10));
}
workload.StopAndJoin();
ASSERT_OK(WaitForServersToAgree(kTimeout, replica_servers, tablet_id, 1));
TServerDetails* new_replica = nullptr;
for (const auto& ts : tablet_servers_) {
if (replica_servers.find(ts.first) == replica_servers.end()) {
new_replica = ts.second;
break;
}
}
ASSERT_NE(nullptr, new_replica);
ASSERT_OK(AddReplica(tablet_id, new_replica, RaftPeerPB::NON_VOTER, kTimeout));
const int new_replica_idx =
cluster_->tablet_server_index_by_uuid(new_replica->uuid());
// Wait for the tablet copying to start.
ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(
new_replica_idx, tablet_id, { TABLET_DATA_COPYING }, kTimeout));
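// Identify the copy source (the leader) and the copy target (the new replica)
// to check their tablet copy session metrics.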
TServerDetails* leader = nullptr;
ASSERT_OK(GetLeaderReplicaWithRetries(tablet_id, &leader));
const ExternalDaemon& ed_leader =
*cluster_->tablet_server_by_uuid(leader->uuid());
const ExternalDaemon& ed_new_replica =
*cluster_->tablet_server_by_uuid(new_replica->uuid());
{
int64_t num_sessions;
ASSERT_OK(GetTabletCopySourceSessionsCount(ed_leader, &num_sessions));
EXPECT_EQ(1, num_sessions);
}
{
int64_t num_sessions;
ASSERT_OK(GetTabletCopyTargetSessionsCount(ed_new_replica, &num_sessions));
EXPECT_EQ(1, num_sessions);
}
// The newly copied replica should be able to start.
ASSERT_OK(WaitForNumTabletsOnTS(
new_replica, 1, kTimeout, nullptr, tablet::RUNNING));
// The tablet copying should complete shortly after the tablet state becomes
// RUNNING. Sampling the counters right after seeing the RUNNING tablet status
// is a little racy: it takes some time to end the tablet copy sessions on both
// sides. So, use ASSERT_EVENTUALLY here to avoid flakiness.
ASSERT_EVENTUALLY([&]() {
int64_t num_sessions;
ASSERT_OK(GetTabletCopySourceSessionsCount(ed_leader, &num_sessions));
EXPECT_EQ(0, num_sessions);
ASSERT_OK(GetTabletCopyTargetSessionsCount(ed_new_replica, &num_sessions));
EXPECT_EQ(0, num_sessions);
});
// The tablet leader should report the newly added NON_VOTER replica to the
// master: the master's view should contain the new replica along with an
// established leader.
bool has_leader;
GetTabletLocationsResponsePB tablet_locations;
ASSERT_OK(WaitForReplicasReportedToMaster(
cluster_->master_proxy(), kOriginalReplicasNum + 1, tablet_id, kTimeout,
WAIT_FOR_LEADER, ANY_REPLICA, &has_leader, &tablet_locations));
ASSERT_TRUE(has_leader);
// Check the updated cluster is able to elect a leader.
// The leader role can fluctuate among tablet replicas, so the block below
// is wrapped into ASSERT_EVENTUALLY.
ASSERT_EVENTUALLY([&]() {
TServerDetails* leader = nullptr;
ASSERT_OK(GetLeaderReplicaWithRetries(tablet_id, &leader));
ASSERT_OK(LeaderStepDown(leader, tablet_id, kTimeout));
});
// Make sure it's possible to insert more data into the table once it's backed
// by one more (NON_VOTER) replica.
const int64_t prev_inserted = workload.rows_inserted();
workload.Start();
while (workload.rows_inserted() < 2 * prev_inserted) {
SleepFor(MonoDelta::FromMilliseconds(10));
}
workload.StopAndJoin();
NO_FATALS(cluster_->AssertNoCrashes());
// Ensure that the replicas converge. Along with other verification steps,
// ClusterVerifier employs VerifyCommittedOpIdsMatch() to verify that
// all OpIds match in local files under all tablet servers of the cluster,
// so NON_VOTER replicas are covered by this check as well.
ClusterVerifier v(cluster_.get());
v.SetOperationsTimeout(kTimeout);
v.SetVerificationTimeout(kTimeout);
NO_FATALS(v.CheckCluster());
NO_FATALS(v.CheckRowCount(workload.table_name(),
ClusterVerifier::EXACTLY,
workload.rows_inserted()));
}
// Test how the system reacts to removing a NON_VOTER replica from
// the tablet's Raft configuration:
//
// * First, add a NON_VOTER member into the cluster (covered by other test).
//
// * Make sure that changing Raft configuration by removing a NON_VOTER
// replica does not return errors.
//
// * After removing such a non-voter replica, the system should not try
// to add a new replica in place of the removed one.
//
// * A tablet leader is established and it reports the removed replica
// to the master.
//
// * The updated tablet is still available: it's possible to insert data
// into the table to which the tablet belongs.
//
// * The tablet stays consistent: ksck verification reports no error,
// replicated operation indices match across all remaining replicas,
// tablet row count matches the expected number.
//
TEST_F(RaftConsensusNonVoterITest, AddThenRemoveNonVoterReplica) {
const MonoDelta kTimeout = MonoDelta::FromSeconds(60);
const vector<string> kMasterFlags = {
// Allow replication factor of 2.
"--allow_unsafe_replication_factor=true",
// In the 3-4-3 replication scheme, the catalog manager removes excess
// replicas, so it's necessary to disable that default behavior since we
// want the newly added non-voter replica to stay.
"--catalog_manager_evict_excess_replicas=false",
};
const int kOriginalReplicasNum = 2;
FLAGS_num_tablet_servers = 3;
FLAGS_num_replicas = kOriginalReplicasNum;
NO_FATALS(BuildAndStart({}, kMasterFlags));
ASSERT_EQ(3, tablet_servers_.size());
ASSERT_EQ(kOriginalReplicasNum, tablet_replicas_.size());
const string& tablet_id = tablet_id_;
TabletServerMap replica_servers;
for (const auto& e : tablet_replicas_) {
if (e.first == tablet_id) {
replica_servers.emplace(e.second->uuid(), e.second);
}
}
ASSERT_EQ(FLAGS_num_replicas, replica_servers.size());
TServerDetails* new_replica = nullptr;
for (const auto& ts : tablet_servers_) {
if (replica_servers.find(ts.first) == replica_servers.end()) {
new_replica = ts.second;
break;
}
}
ASSERT_NE(nullptr, new_replica);
ASSERT_OK(AddReplica(tablet_id, new_replica, RaftPeerPB::NON_VOTER, kTimeout));
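// Run a light workload so the tablet has some data for the new replica to copy
// and for the row count verification below.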
TestWorkload workload(cluster_.get());
workload.set_table_name(kTableId);
workload.Setup();
workload.Start();
while (workload.rows_inserted() < 100) {
SleepFor(MonoDelta::FromMilliseconds(10));
}
workload.StopAndJoin();
ASSERT_OK(WaitForServersToAgree(kTimeout, replica_servers, tablet_id, 1));
// The newly copied replica should be able to start.
ASSERT_OK(WaitForNumTabletsOnTS(
new_replica, 1, kTimeout, nullptr, tablet::RUNNING));
// Ensure that nothing crashes and the replicas converge.
NO_FATALS(cluster_->AssertNoCrashes());
ClusterVerifier v(cluster_.get());
v.SetOperationsTimeout(kTimeout);
v.SetVerificationTimeout(kTimeout);
NO_FATALS(v.CheckCluster());
NO_FATALS(v.CheckRowCount(workload.table_name(),
ClusterVerifier::EXACTLY,
workload.rows_inserted()));
// Remove the newly added replica.
ASSERT_OK(RemoveReplica(tablet_id, new_replica, kTimeout));
ASSERT_OK(WaitForServersToAgree(kTimeout, replica_servers, tablet_id, 1));
// Verify the removed replica gets tombstoned.
const int new_replica_idx =
cluster_->tablet_server_index_by_uuid(new_replica->uuid());
ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(
new_replica_idx, tablet_id, { TABLET_DATA_TOMBSTONED }, kTimeout));
// The added and then removed tablet replica should be gone, and the master
// should report the appropriate replica count at this point. The tablet leader
// should be established.
bool has_leader;
GetTabletLocationsResponsePB tablet_locations;
ASSERT_OK(WaitForReplicasReportedToMaster(
cluster_->master_proxy(), kOriginalReplicasNum, tablet_id, kTimeout,
WAIT_FOR_LEADER, ANY_REPLICA, &has_leader, &tablet_locations));
ASSERT_TRUE(has_leader);
// Make sure it's possible to insert data into the tablet once the NON_VOTER
// replica is gone.
const int64_t prev_inserted = workload.rows_inserted();
workload.Start();
while (workload.rows_inserted() < 2 * prev_inserted) {
SleepFor(MonoDelta::FromMilliseconds(10));
}
workload.StopAndJoin();
// Ensure that nothing crashed and the replicas converge.
NO_FATALS(cluster_->AssertNoCrashes());
NO_FATALS(v.CheckCluster());
NO_FATALS(v.CheckRowCount(workload.table_name(),
ClusterVerifier::EXACTLY,
workload.rows_inserted()));
}
// Test to ensure that a non-voter replica:
// * does not vote
// * does not start leader elections
// * returns an error on RunLeaderElection() RPC call
TEST_F(RaftConsensusNonVoterITest, NonVoterReplicasDoNotVote) {
if (!AllowSlowTests()) {
LOG(WARNING) << "test is skipped; set KUDU_ALLOW_SLOW_TESTS=1 to run";
return;
}
const MonoDelta kTimeout = MonoDelta::FromSeconds(60);
const int kOriginalReplicasNum = 2;
const int kHbIntervalMs = 64;
const int kHbLeaderMissedNum = 1;
const vector<string> kMasterFlags = {
// Allow replication factor of 2.
"--allow_unsafe_replication_factor=true",
// In the 3-4-3 replication scheme, the catalog manager removes excess
// replicas, so it's necessary to disable that default behavior since we
// want the newly added non-voter replica to stay.
"--catalog_manager_evict_excess_replicas=false",
};
const vector<string> kTserverFlags = {
Substitute("--raft_heartbeat_interval_ms=$0", kHbIntervalMs),
Substitute("--leader_failure_max_missed_heartbeat_periods=$0",
kHbLeaderMissedNum),
};
FLAGS_num_tablet_servers = 3;
FLAGS_num_replicas = kOriginalReplicasNum;
NO_FATALS(BuildAndStart(kTserverFlags, kMasterFlags));
ASSERT_EQ(3, tablet_servers_.size());
ASSERT_EQ(kOriginalReplicasNum, tablet_replicas_.size());
const string& tablet_id = tablet_id_;
TabletServerMap replica_servers;
for (const auto& e : tablet_replicas_) {
if (e.first == tablet_id) {
replica_servers.emplace(e.second->uuid(), e.second);
}
}
ASSERT_EQ(FLAGS_num_replicas, replica_servers.size());
ASSERT_OK(WaitForServersToAgree(kTimeout, replica_servers, tablet_id, 1));
TServerDetails* new_replica = nullptr;
for (const auto& ts : tablet_servers_) {
if (replica_servers.find(ts.first) == replica_servers.end()) {
new_replica = ts.second;
break;
}
}
ASSERT_NE(nullptr, new_replica);
ASSERT_OK(AddReplica(tablet_id, new_replica, RaftPeerPB::NON_VOTER, kTimeout));
// The newly copied replica should be able to start.
ASSERT_OK(WaitForNumTabletsOnTS(
new_replica, 1, kTimeout, nullptr, tablet::RUNNING));
// Ensure that nothing crashes: all tservers must be alive for the next step
// of the scenario.
NO_FATALS(cluster_->AssertNoCrashes());
// Make sure a NON_VOTER replica doesn't vote.
{
// Pause the current leader and make sure the majority is not achievable.
// It would not be the case if the non-voter replica could vote in the
// election initiated after the failure of the current leader was detected.
TServerDetails* leader;
ASSERT_OK(GetLeaderReplicaWithRetries(tablet_id, &leader));
ExternalTabletServer* leader_ts =
cluster_->tablet_server_by_uuid(leader->uuid());
int64_t term_leader;
ASSERT_OK(GetTermMetricValue(leader_ts, &term_leader));
ASSERT_OK(leader_ts->Pause());
SCOPED_CLEANUP({
ASSERT_OK(leader_ts->Resume());
});
TServerDetails* new_leader;
const Status s = GetLeaderReplicaWithRetries(tablet_id, &new_leader, 10);
ASSERT_TRUE(s.IsNotFound()) << s.ToString();
ASSERT_OK(leader_ts->Resume());
// The majority should be achievable once the leader replica is resumed.
ASSERT_OK(GetLeaderReplicaWithRetries(tablet_id, &new_leader));
NO_FATALS(cluster_->AssertNoCrashes());
}
// Make sure a NON_VOTER replica does not start leader election on start-up.
{
// Disable failure detection for all replicas.
for (int i = 0; i < cluster_->num_tablet_servers(); ++i) {
ExternalTabletServer* ts = cluster_->tablet_server(i);
ASSERT_OK(cluster_->SetFlag(ts,
"enable_leader_failure_detection", "false"));
}
ExternalTabletServer* new_replica_ts =
cluster_->tablet_server_by_uuid(new_replica->uuid());
// Get the current Raft term for the tablet.
int64_t term_before_restart = 0;
ASSERT_OK(GetTermMetricValue(new_replica_ts, &term_before_restart));
new_replica_ts->Shutdown();
ASSERT_OK(new_replica_ts->Restart());
// Wait for the tablet server to start up.
ASSERT_OK(cluster_->WaitForTabletsRunning(new_replica_ts, 1, kTimeout));
// Once restarted, the tablet server will have the default disposition
// for the enable_leader_failure_detection flag, i.e. leader failure detection
// will be enabled for our new NON_VOTER replica. A VOTER replica in that
// situation could trigger a leader election; however, this replica is a
// NON_VOTER, so no election should be started and the term must stay the same
// as before restarting the replica. So, give an election a chance to happen
// and compare the terms.
SleepFor(MonoDelta::FromMilliseconds(
3L * kHbLeaderMissedNum * kHbIntervalMs));
int64_t term_after_restart = 0;
ASSERT_OK(GetTermMetricValue(new_replica_ts, &term_after_restart));
EXPECT_EQ(term_before_restart, term_after_restart);
}
// Make sure a non-voter replica returns an error on RunLeaderElection()
// RPC call.
{
const Status s = StartElection(new_replica, tablet_id_, kTimeout);
ASSERT_TRUE(s.IsIllegalState()) << s.ToString();
}
}
// Test that it's possible to promote a NON_VOTER replica into a VOTER
// and demote a VOTER replica into NON_VOTER (if not a leader).
// Promote and demote a replica under active workload.
// Promote a replica and remove it, making sure it gets tombstoned.
TEST_F(RaftConsensusNonVoterITest, PromoteAndDemote) {
if (!AllowSlowTests()) {
LOG(WARNING) << "test is skipped; set KUDU_ALLOW_SLOW_TESTS=1 to run";
return;
}
const MonoDelta kTimeout = MonoDelta::FromSeconds(120);
const int kInitialReplicasNum = 3;
const int kHbIntervalMs = 50;
const vector<string> kTserverFlags = {
// Run the test faster: shorten the default interval for Raft heartbeats.
Substitute("--raft_heartbeat_interval_ms=$0", kHbIntervalMs),
// The scenario does not assume replicas might be evicted while running.
"--evict_failed_followers=false",
// To avoid flakiness, this scenario disables leader re-elections.
"--enable_leader_failure_detection=false",
};
const vector<string> kMasterFlags = {
// The election is run manually after BuildAndStart().
"--catalog_manager_wait_for_new_tablets_to_elect_leader=false",
// In the 3-4-3 replication scheme, the catalog manager removes excess
// replicas, so it's necessary to disable that default behavior since we
// want the newly added non-voter replica to stay.
"--catalog_manager_evict_excess_replicas=false",
};
FLAGS_num_tablet_servers = 4;
FLAGS_num_replicas = kInitialReplicasNum;
NO_FATALS(BuildAndStart(kTserverFlags, kMasterFlags));
ASSERT_FALSE(tablet_replicas_.empty());
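// Since leader failure detection is disabled, start the election manually
// and wait for a leader to be established.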
ASSERT_EVENTUALLY([&]() {
const MonoDelta kElectionTimeout = MonoDelta::FromSeconds(3);
ASSERT_OK(StartElection(
tablet_replicas_.begin()->second, tablet_id_, kElectionTimeout));
ASSERT_OK(WaitUntilLeader(
tablet_replicas_.begin()->second, tablet_id_, kElectionTimeout));
});
ASSERT_EQ(4, tablet_servers_.size());
ASSERT_EQ(kInitialReplicasNum, tablet_replicas_.size());
const string& tablet_id = tablet_id_;
TabletServerMap replica_servers;
for (const auto& e : tablet_replicas_) {
if (e.first == tablet_id) {
replica_servers.emplace(e.second->uuid(), e.second);
}
}
ASSERT_EQ(FLAGS_num_replicas, replica_servers.size());
ASSERT_OK(WaitForServersToAgree(kTimeout, replica_servers, tablet_id, 1));
TServerDetails* replica = nullptr;
for (const auto& ts : tablet_servers_) {
if (replica_servers.find(ts.first) == replica_servers.end()) {
replica = ts.second;
break;
}
}
ASSERT_NE(nullptr, replica);
ASSERT_OK(AddReplica(tablet_id, replica, RaftPeerPB::NON_VOTER, kTimeout));
// Wait for tablet copy to complete: the scenario assumes the newly added
// replica is up-to-date.
ASSERT_OK(WaitForNumTabletsOnTS(
replica, 1, kTimeout, nullptr, tablet::RUNNING));
// It should be impossible to demote a non-voter.
{
Status s = ChangeReplicaMembership(RaftPeerPB::NON_VOTER,
tablet_id, replica, kTimeout);
const auto s_str = s.ToString();
ASSERT_TRUE(s.IsInvalidArgument()) << s_str;
ASSERT_STR_CONTAINS(s_str, "must modify a field when calling MODIFY_PEER");
}
// It should be possible to promote a non-voter to voter.
ASSERT_OK(ChangeReplicaMembership(RaftPeerPB::VOTER, tablet_id,
replica, kTimeout));
// It should be impossible to promote a voter since it's a voter already.
{
Status s = ChangeReplicaMembership(RaftPeerPB::VOTER,
tablet_id, replica, kTimeout);
const auto s_str = s.ToString();
ASSERT_TRUE(s.IsInvalidArgument()) << s_str;
ASSERT_STR_CONTAINS(s_str, "must modify a field when calling MODIFY_PEER");
}
{
// It should be impossible to demote the tablet leader.
TServerDetails* leader;
ASSERT_OK(GetLeaderReplicaWithRetries(tablet_id, &leader));
const Status s_demote = ChangeReplicaMembership(RaftPeerPB::NON_VOTER,
tablet_id, leader, kTimeout);
const auto s_demote_str = s_demote.ToString();
ASSERT_TRUE(s_demote.IsInvalidArgument()) << s_demote_str;
ASSERT_STR_MATCHES(s_demote_str,
"Cannot modify member type of peer .* because it is the leader");
// It should be impossible to promote a leader replica since it's
// already a voter.
const Status s_promote = ChangeReplicaMembership(RaftPeerPB::VOTER,
tablet_id, leader, kTimeout);
const auto s_promote_str = s_promote.ToString();
ASSERT_TRUE(s_promote.IsInvalidArgument()) << s_promote_str;
ASSERT_STR_CONTAINS(s_promote_str, "must modify a field when calling MODIFY_PEER");
}
// Demote the replica back.
ASSERT_OK(ChangeReplicaMembership(RaftPeerPB::NON_VOTER,
tablet_id, replica, kTimeout));
// Promote/demote a replica under active workload.
{
TestWorkload workload(cluster_.get());
workload.set_table_name(kTableId);
workload.Setup();
workload.Start();
while (workload.rows_inserted() < 10) {
SleepFor(MonoDelta::FromMilliseconds(10));
}
ASSERT_OK(ChangeReplicaMembership(RaftPeerPB::VOTER,
tablet_id, replica, kTimeout));
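// Let the row count double while the replica is a voter before demoting it back.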
auto rows_inserted = workload.rows_inserted();
while (workload.rows_inserted() < rows_inserted * 2) {
SleepFor(MonoDelta::FromMilliseconds(10));
}
ASSERT_OK(ChangeReplicaMembership(RaftPeerPB::NON_VOTER,
tablet_id, replica, kTimeout));
rows_inserted = workload.rows_inserted();
while (workload.rows_inserted() < rows_inserted * 2) {
SleepFor(MonoDelta::FromMilliseconds(10));
}
workload.StopAndJoin();
ASSERT_OK(WaitForServersToAgree(kTimeout, replica_servers, tablet_id, 1));
ClusterVerifier v(cluster_.get());
v.SetOperationsTimeout(kTimeout);
v.SetVerificationTimeout(kTimeout);
NO_FATALS(v.CheckCluster());
NO_FATALS(v.CheckRowCount(workload.table_name(),
ClusterVerifier::EXACTLY,
workload.rows_inserted()));
}
// Promote the non-voter replica again and then remove it when it's a voter,
// making sure it gets tombstoned.
{
ASSERT_OK(ChangeReplicaMembership(RaftPeerPB::VOTER,
tablet_id, replica, kTimeout));
ASSERT_OK(WaitForServersToAgree(kTimeout, replica_servers, tablet_id, 1));
ASSERT_OK(RemoveReplica(tablet_id, replica, kTimeout));
// Verify the removed replica gets tombstoned.
const int replica_idx =
cluster_->tablet_server_index_by_uuid(replica->uuid());
ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(
replica_idx, tablet_id, { TABLET_DATA_TOMBSTONED }, kTimeout));
// The removed tablet replica should be gone, and the master should report
// the appropriate replica count at this point.
bool has_leader;
GetTabletLocationsResponsePB tablet_locations;
ASSERT_OK(WaitForReplicasReportedToMaster(
cluster_->master_proxy(), kInitialReplicasNum, tablet_id, kTimeout,
WAIT_FOR_LEADER, ANY_REPLICA, &has_leader, &tablet_locations));
ASSERT_TRUE(has_leader);
}
NO_FATALS(cluster_->AssertNoCrashes());
}
// Verify that a newly promoted replica is a full-fledged voting member:
//
// a) The newly promoted replica should be able to participate in elections,
// and its vote is counted.
//
// b) The newly promoted replica can become a leader. This is to verify that
// once the non-voter replica is promoted to voter membership,
// its failure detection mechanism works as expected.
//
TEST_F(RaftConsensusNonVoterITest, PromotedReplicaCanVote) {
if (!AllowSlowTests()) {
LOG(WARNING) << "test is skipped; set KUDU_ALLOW_SLOW_TESTS=1 to run";
return;
}
const MonoDelta kTimeout = MonoDelta::FromSeconds(60);
const int kInitialReplicasNum = 3;
const int kHbIntervalMs = 50;
const vector<string> kMasterFlags = {
// In the 3-4-3 replication scheme, the catalog manager removes excess
// replicas, so it's necessary to disable that default behavior since we
// want the newly added non-voter replica to stay.
"--catalog_manager_evict_excess_replicas=false",
};
const vector<string> kTserverFlags = {
// Run the test faster: shorten the default interval for Raft heartbeats.
Substitute("--raft_heartbeat_interval_ms=$0", kHbIntervalMs),
// The scenario does not assume replicas might be evicted while running.
"--evict_failed_followers=false",
};
FLAGS_num_tablet_servers = 4;
FLAGS_num_replicas = kInitialReplicasNum;
NO_FATALS(BuildAndStart(kTserverFlags, kMasterFlags));
ASSERT_EQ(4, tablet_servers_.size());
ASSERT_EQ(kInitialReplicasNum, tablet_replicas_.size());
const string& tablet_id = tablet_id_;
TabletServerMap replica_servers;
for (const auto& e : tablet_replicas_) {
if (e.first == tablet_id) {
replica_servers.emplace(e.second->uuid(), e.second);
}
}
ASSERT_EQ(FLAGS_num_replicas, replica_servers.size());
ASSERT_OK(WaitForServersToAgree(kTimeout, replica_servers, tablet_id, 1));
// Find the tablet server without a replica of the tablet (yet).
TServerDetails* replica = nullptr;
for (const auto& ts : tablet_servers_) {
if (!ContainsKey(replica_servers, ts.first)) {
replica = ts.second;
break;
}
}
ASSERT_NE(nullptr, replica);
const auto& new_replica_uuid = replica->uuid();
ASSERT_OK(AddReplica(tablet_id, replica, RaftPeerPB::NON_VOTER, kTimeout));
// Wait for tablet copy to complete. Otherwise, in the sequence below,
// the leader may be paused before the copying is complete.
ASSERT_OK(WaitForNumTabletsOnTS(
replica, 1, kTimeout, nullptr, tablet::RUNNING));
ASSERT_OK(ChangeReplicaMembership(RaftPeerPB::VOTER, tablet_id,
replica, kTimeout));
// The newly added voter needs to be registered in the internal replica map
// tablet_replicas_: this is necessary for future calls like
// GetLeaderReplicaWithRetries() when the replica becomes a leader.
NO_FATALS(WaitForReplicasAndUpdateLocations(table_->name()));
ASSERT_EQ(kInitialReplicasNum + 1, tablet_replicas_.size());
// Verify that the newly promoted replica's vote counts toward achieving
// the 'strict majority' of the voters.
{
// First, make sure the newly added replica is not the tablet leader.
TServerDetails* leader;
while (true) {
ASSERT_OK(GetLeaderReplicaWithRetries(tablet_id, &leader));
if (leader->uuid() != new_replica_uuid) {
break;
}
// In rare cases, the leader replica can change its role right before the
// step-down request is received.
TabletServerErrorPB error;
auto s = LeaderStepDown(leader, tablet_id, kTimeout, &error);
if (s.IsIllegalState() &&
error.code() == TabletServerErrorPB::NOT_THE_LEADER) {
continue;
}
ASSERT_OK(s);
}
// Pause the newly added replica and the leader.
ExternalTabletServer* replica_ts =
cluster_->tablet_server_by_uuid(replica->uuid());
ASSERT_OK(replica_ts->Pause());
auto cleanup_replica = MakeScopedCleanup([&]() {
ASSERT_OK(replica_ts->Resume());
});
ExternalTabletServer* leader_ts =
cluster_->tablet_server_by_uuid(leader->uuid());
ASSERT_OK(leader_ts->Pause());
auto cleanup_leader = MakeScopedCleanup([&]() {
ASSERT_OK(leader_ts->Resume());
});
// With both the current leader and the new voter paused, no new leader
// can be elected since the majority is now 3 out of 4 voters, while
// just 2 voters are alive.
Status s = GetLeaderReplicaWithRetries(tablet_id, &leader, 10);
const auto s_str = s.ToString();
ASSERT_TRUE(s.IsNotFound()) << s_str;
ASSERT_STR_CONTAINS(s_str, "leader replica not found");
// Resume the newly added voter: with the paused leader, the rest of the
// replicas should be able to elect a new leader, since the majority is 3
// out of 4 replicas.
ASSERT_OK(replica_ts->Resume());
ASSERT_OK(GetLeaderReplicaWithRetries(tablet_id, &leader));
}
// Make sure the promoted replica can become a leader using the mechanism
// of leader failure detection and subsequent re-election.
TServerDetails* leader = nullptr;
for (auto i = 0; i < 250; ++i) {
ASSERT_OK(GetLeaderReplicaWithRetries(tablet_id, &leader));
if (leader->uuid() == new_replica_uuid) {
// Success: the newly added replica has become a leader.
break;
}
ExternalTabletServer* leader_ts =
cluster_->tablet_server_by_uuid(leader->uuid());
// Pause the current leader to induce leader re-election. The rest of the
// replicas should be able to elect a new leader, since the majority is 3
// out of 4 replicas.
ASSERT_OK(leader_ts->Pause());
auto cleanup_leader = MakeScopedCleanup([&]() {
ASSERT_OK(leader_ts->Resume());
});
SleepFor(MonoDelta::FromMilliseconds(kHbIntervalMs *
static_cast<int64_t>(FLAGS_leader_failure_max_missed_heartbeat_periods + 1.0)));
}
ASSERT_NE(nullptr, leader);
ASSERT_EQ(new_replica_uuid, leader->uuid());
}
// Add an extra non-voter replica to the tablet and make sure it's evicted
// by the catalog manager once the catalog manager sees its state updated.
TEST_F(RaftConsensusNonVoterITest, CatalogManagerEvictsExcessNonVoter) {
if (!AllowSlowTests()) {
LOG(WARNING) << "test is skipped; set KUDU_ALLOW_SLOW_TESTS=1 to run";
return;
}
const int kReplicaUnavailableSec = 5;
const MonoDelta kTimeout = MonoDelta::FromSeconds(60);
const int kReplicasNum = 3;
FLAGS_num_replicas = kReplicasNum;
// Need one extra tserver for a new non-voter replica.
FLAGS_num_tablet_servers = kReplicasNum + 1;
const vector<string> kMasterFlags = {
// The scenario runs with the new replica management scheme.
"--raft_prepare_replacement_before_eviction=true",
// Don't evict excess replicas to avoid races in the scenario.
"--catalog_manager_evict_excess_replicas=false",
};
const vector<string> kTserverFlags = {
// The scenario runs with the new replica management scheme.
"--raft_prepare_replacement_before_eviction=true",
Substitute("--consensus_rpc_timeout_ms=$0", 1000 * kReplicaUnavailableSec),
Substitute("--follower_unavailable_considered_failed_sec=$0",
kReplicaUnavailableSec),
};
NO_FATALS(BuildAndStart(kTserverFlags, kMasterFlags));
ASSERT_EQ(kReplicasNum + 1, tablet_servers_.size());
ASSERT_EQ(kReplicasNum, tablet_replicas_.size());
TabletServerMap replica_servers;
for (const auto& e : tablet_replicas_) {
if (e.first == tablet_id_) {
replica_servers.emplace(e.second->uuid(), e.second);
}
}
ASSERT_EQ(FLAGS_num_replicas, replica_servers.size());
TServerDetails* new_replica = nullptr;
for (const auto& ts : tablet_servers_) {
if (replica_servers.find(ts.first) == replica_servers.end()) {
new_replica = ts.second;
break;
}
}
ASSERT_NE(nullptr, new_replica);
ASSERT_OK(AddReplica(tablet_id_, new_replica, RaftPeerPB::NON_VOTER, kTimeout));
bool has_leader = false;
GetTabletLocationsResponsePB tablet_locations;
// Make sure the extra replica is seen by the master.
ASSERT_OK(WaitForReplicasReportedToMaster(cluster_->master_proxy(),
kReplicasNum + 1,
tablet_id_,
kTimeout,
WAIT_FOR_LEADER,
ANY_REPLICA,
&has_leader,
&tablet_locations));
// Switch the catalog manager to start evicting excess replicas
// (that's how it runs by default in the new replica management scheme).
// Prior to this point, that might lead to a race, since the newly added
// non-voter replica might be evicted before it's spotted by the
// WaitForReplicasReportedToMaster() call above.
for (auto i = 0; i < cluster_->num_masters(); ++i) {
ExternalMaster* m = cluster_->master(i);
ASSERT_OK(cluster_->SetFlag(m,
"catalog_manager_evict_excess_replicas", "true"));
}
ExternalTabletServer* new_replica_ts =
cluster_->tablet_server_by_uuid(new_replica->uuid());
ASSERT_NE(nullptr, new_replica_ts);
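// Pause the tablet server hosting the excess non-voter replica long enough for
// the leader to consider it failed, so the catalog manager can evict it.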
ASSERT_OK(new_replica_ts->Pause());
SCOPED_CLEANUP({
ASSERT_OK(new_replica_ts->Resume());
});
SleepFor(MonoDelta::FromSeconds(2 * kReplicaUnavailableSec));
ASSERT_OK(new_replica_ts->Resume());
// Make sure the excess non-voter replica is gone.
ASSERT_OK(WaitForReplicasReportedToMaster(cluster_->master_proxy(),
kReplicasNum,
tablet_id_,
kTimeout,
WAIT_FOR_LEADER,
ANY_REPLICA,
&has_leader,
&tablet_locations));
NO_FATALS(cluster_->AssertNoCrashes());
}
// Check that the catalog manager adds a non-voter replica to replace a failed
// voter replica in a tablet.
//
// TODO(aserbin): and make it run for 5 tablet servers.
TEST_F(RaftConsensusNonVoterITest, CatalogManagerAddsNonVoter) {
if (!AllowSlowTests()) {
LOG(WARNING) << "test is skipped; set KUDU_ALLOW_SLOW_TESTS=1 to run";
return;
}
const int kReplicaUnavailableSec = 10;
const MonoDelta kTimeout = MonoDelta::FromSeconds(6 * kReplicaUnavailableSec);
const int kReplicasNum = 3;
FLAGS_num_replicas = kReplicasNum;
// Will need one extra tserver after the tserver with an existing voter replica
// is stopped. Otherwise, the catalog manager would not be able to spawn
// a new non-voter replacement replica.
FLAGS_num_tablet_servers = kReplicasNum + 1;
const vector<string> kMasterFlags = {
// The scenario runs with the new replica management scheme.
"--raft_prepare_replacement_before_eviction=true",
// Don't evict excess replicas to avoid races in the scenario.
"--catalog_manager_evict_excess_replicas=false",
};
const vector<string> kTserverFlags = {
// The scenario runs with the new replica management scheme.
"--raft_prepare_replacement_before_eviction=true",
Substitute("--follower_unavailable_considered_failed_sec=$0",
kReplicaUnavailableSec),
};
NO_FATALS(BuildAndStart(kTserverFlags, kMasterFlags));
ASSERT_EQ(kReplicasNum + 1, tablet_servers_.size());
ASSERT_EQ(kReplicasNum, tablet_replicas_.size());
const auto server_uuids_with_replica = GetServersWithReplica(tablet_id_);
ASSERT_FALSE(server_uuids_with_replica.empty());
ExternalTabletServer* ts_with_replica =
cluster_->tablet_server_by_uuid(server_uuids_with_replica.front());
ASSERT_NE(nullptr, ts_with_replica);
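// Stop one of the tablet servers hosting a voter replica to simulate its failure.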
ts_with_replica->Shutdown();
// Wait for a new non-voter replica added by the catalog manager to
// replace the failed one.
bool has_leader = false;
GetTabletLocationsResponsePB tablet_locations;
ASSERT_OK(WaitForReplicasReportedToMaster(cluster_->master_proxy(),
kReplicasNum + 1,
tablet_id_,
kTimeout,
WAIT_FOR_LEADER,
ANY_REPLICA,
&has_leader,
&tablet_locations));
NO_FATALS(cluster_->AssertNoCrashes());
}
// Verify the behavior of the catalog manager for the gone-and-back tablet
// server in --raft_prepare_replacement_before_eviction=true case. This scenario
// addresses the situation when a tablet server hosting tablet replicas has not
// been running for some time (e.g., a bit over the time interval specified by
// the 'follower_unavailable_considered_failed_sec' flag), and then it comes
// back before the newly added non-voter replicas are promoted. As a result, the
// original voter replicas from the tablet server should stay, but the newly
// added non-voter replicas should be evicted.
TEST_F(RaftConsensusNonVoterITest, TabletServerIsGoneAndBack) {
if (!AllowSlowTests()) {
LOG(WARNING) << "test is skipped; set KUDU_ALLOW_SLOW_TESTS=1 to run";
return;
}
const auto kReplicasNum = 3;
const auto kReplicaUnavailableSec = 5;
const auto kTimeoutSec = 60;
const auto kTimeout = MonoDelta::FromSeconds(kTimeoutSec);
FLAGS_num_replicas = kReplicasNum;
// Need one extra tserver after the tserver hosting one of the replicas is
// stopped. Otherwise, the catalog manager would not be able to spawn a new
// non-voter replacement replica elsewhere.
FLAGS_num_tablet_servers = kReplicasNum + 1;
const vector<string> kMasterFlags = {
// The scenario runs with the 3-4-3 replica management scheme.
"--raft_prepare_replacement_before_eviction=true",
};
const vector<string> kTserverFlags = {
// The scenario runs with the 3-4-3 replica management scheme.
"--raft_prepare_replacement_before_eviction=true",
Substitute("--follower_unavailable_considered_failed_sec=$0",
kReplicaUnavailableSec),
// Slow down tablet copy to avoid new non-voter replicas catching up with
// the leader replicas, otherwise they might be promoted to voters before
// the replicas from the 'failed' tablet server are back.
Substitute("--tablet_copy_download_file_inject_latency_ms=$0",
MonoDelta::FromSeconds(3 * kTimeoutSec).ToMilliseconds()),
// Don't wait for the RPC timeout for too long.
Substitute("--consensus_rpc_timeout_ms=$0", 1000 * kReplicaUnavailableSec),
};
NO_FATALS(BuildAndStart(kTserverFlags, kMasterFlags));
ASSERT_EQ(kReplicasNum + 1, tablet_servers_.size());
ASSERT_EQ(kReplicasNum, tablet_replicas_.size());
// Create a test table and insert some data into it, so the special flag
// --tablet_copy_download_file_inject_latency_ms can take effect while
// tablet copy happens down the road.
TestWorkload workload(cluster_.get());
workload.set_table_name(kTableId);
workload.Setup();
workload.Start();
while (workload.rows_inserted() < 10) {
SleepFor(MonoDelta::FromMilliseconds(10));
}
workload.StopAndJoin();
const auto server_uuids_with_replica = GetServersWithReplica(tablet_id_);
ASSERT_FALSE(server_uuids_with_replica.empty());
ExternalTabletServer* ts_with_replica =
cluster_->tablet_server_by_uuid(server_uuids_with_replica.front());
ASSERT_NE(nullptr, ts_with_replica);
ts_with_replica->Shutdown();
// The leader replica marks the non-responsive replica as failed after
// FLAGS_follower_unavailable_considered_failed_sec time interval. The
// catalog manager should spot that and add a new non-voter replica as a
// replacement.
bool has_leader = false;
GetTabletLocationsResponsePB tablet_locations;
ASSERT_OK(WaitForReplicasReportedToMaster(cluster_->master_proxy(),
kReplicasNum + 1,
tablet_id_,
kTimeout,
WAIT_FOR_LEADER,
ANY_REPLICA,
&has_leader,
&tablet_locations));
// Restart the tablet server with the replica which has been marked as failed.
ASSERT_OK(ts_with_replica->Restart());
// Since the new non-voter replica is not yet ready for promotion (its tablet
// copy is still in progress) and all the original voter replicas are in
// good health, the catalog manager should evict the excess non-voter replica.
ASSERT_OK(WaitForReplicasReportedToMaster(cluster_->master_proxy(),
kReplicasNum,
tablet_id_,
kTimeout,
WAIT_FOR_LEADER,
ANY_REPLICA,
&has_leader,
&tablet_locations));
// Make sure the replica from the gone-and-back server is part of the config.
consensus::ConsensusStatePB cstate;
ASSERT_EVENTUALLY([&] {
TServerDetails* leader = nullptr;
ASSERT_OK(GetLeaderReplicaWithRetries(tablet_id_, &leader));
// The reason for the outer ASSERT_EVENTUALLY is that the leader might
// have changed in between these two calls.
ASSERT_OK(GetConsensusState(leader, tablet_id_, kTimeout, EXCLUDE_HEALTH_REPORT, &cstate));
});
ASSERT_TRUE(IsRaftConfigMember(ts_with_replica->uuid(), cstate.committed_config()));
NO_FATALS(cluster_->AssertNoCrashes());
}
// A two-step scenario: first, an existing tablet replica fails because it
// falls behind the WAL segment GC threshold. The catalog manager should notice
// that and evict it right away. Then it should add a new non-voter replica in
// an attempt to replace the evicted one. That newly added non-voter replica
// becomes unavailable before completing the tablet copy phase. The catalog
// manager should then add another non-voter replica to make it possible to
// replace the failed voter replica, so eventually the tablet has the
// appropriate number of functional replicas to guarantee the tablet's
// replication factor.
TEST_F(RaftConsensusNonVoterITest, FailedTabletCopy) {
if (!AllowSlowTests()) {
LOG(WARNING) << "test is skipped; set KUDU_ALLOW_SLOW_TESTS=1 to run";
return;
}
const auto kReplicasNum = 3;
const auto kConsensusRpcTimeout = MonoDelta::FromSeconds(5);
const auto kTimeoutSec = 60;
const auto kTimeout = MonoDelta::FromSeconds(kTimeoutSec);
FLAGS_num_replicas = kReplicasNum;
// Need two extra tablet servers for the scenario.
FLAGS_num_tablet_servers = kReplicasNum + 2;
const vector<string> kMasterFlags = {
// The scenario runs with the 3-4-3 replica management scheme.
"--raft_prepare_replacement_before_eviction=true",
// Detect unavailable tablet servers faster.
Substitute("--tserver_unresponsive_timeout_ms=$0",
kConsensusRpcTimeout.ToMilliseconds() / 2),
};
vector<string> tserver_flags = {
// The scenario runs with the 3-4-3 replica management scheme.
"--raft_prepare_replacement_before_eviction=true",
Substitute("--follower_unavailable_considered_failed_sec=$0",
2 * static_cast<int>(kConsensusRpcTimeout.ToSeconds())),
// Don't wait for the RPC timeout for too long.
Substitute("--consensus_rpc_timeout_ms=$0", kConsensusRpcTimeout.ToMilliseconds()),
};
AddFlagsForLogRolls(&tserver_flags); // For CauseFollowerToFallBehindLogGC().
NO_FATALS(BuildAndStart(tserver_flags, kMasterFlags));
ASSERT_EQ(kReplicasNum + 2, tablet_servers_.size());
ASSERT_EQ(kReplicasNum, tablet_replicas_.size());
// Make sure the tablet copy will fail on one of the tablet servers without
// tablet replicas.
const auto server_uuids_no_replica = GetServersWithoutReplica(tablet_id_);
ASSERT_EQ(2, server_uuids_no_replica.size());
ExternalTabletServer* ts0 =
cluster_->tablet_server_by_uuid(server_uuids_no_replica.front());
ASSERT_NE(nullptr, ts0);
ASSERT_OK(cluster_->SetFlag(ts0,
"tablet_copy_fault_crash_on_fetch_all", "1.0"));
// Make sure the second tablet server is not available as a candidate for
// the new non-voter replica.
ExternalTabletServer* ts1 =
cluster_->tablet_server_by_uuid(server_uuids_no_replica.back());
ASSERT_NE(nullptr, ts1);
ts1->Shutdown();
SleepFor(kConsensusRpcTimeout);
// Cause follower 'follower_uuid' to fail.
string follower_uuid;
{
TabletServerMap replica_servers;
for (const auto& e : tablet_replicas_) {
if (e.first == tablet_id_) {
replica_servers.emplace(e.second->uuid(), e.second);
}
}
string leader_uuid;
int64_t orig_term;
NO_FATALS(CauseFollowerToFallBehindLogGC(
replica_servers, &leader_uuid, &orig_term, &follower_uuid));
// Make sure this tablet server will not host a functional tablet replica,
// even if the system tries to place one there after evicting the current one.
ExternalTabletServer* ts =
cluster_->tablet_server_by_uuid(follower_uuid);
ASSERT_NE(nullptr, ts);
ASSERT_OK(cluster_->SetFlag(ts,
"tablet_copy_fault_crash_on_fetch_all", "1.0"));
}
// The leader replica marks the non-responsive replica as failed after it
// realizes the replica would not be able to catch up, and the catalog
// manager evicts the failed replica right away since it failed in an
// unrecoverable way.
bool has_leader = false;
GetTabletLocationsResponsePB tablet_locations;
ASSERT_OK(WaitForReplicasReportedToMaster(cluster_->master_proxy(),
kReplicasNum - 1,
tablet_id_,
kTimeout,
WAIT_FOR_LEADER,
VOTER_REPLICA,
&has_leader,
&tablet_locations));
// The system should add a new non-voter as a replacement.
// However, the tablet server with the new non-voter replica crashes during
// the tablet copy phase. Give the catalog manager some time to detect that
// and purge the failed non-voter from the configuration. Also, the TS manager
// should detect that the crashed server is not a candidate for new replicas
// anymore.
ASSERT_OK(WaitForReplicasReportedToMaster(cluster_->master_proxy(),
kReplicasNum,
tablet_id_,
kTimeout,
WAIT_FOR_LEADER,
ANY_REPLICA,
&has_leader,
&tablet_locations));
consensus::ConsensusStatePB cstate;
ASSERT_EVENTUALLY([&] {
TServerDetails* leader = nullptr;
ASSERT_OK(GetLeaderReplicaWithRetries(tablet_id_, &leader));
// The reason for the outer ASSERT_EVENTUALLY is that the leader might
// have changed in between these two calls.
ASSERT_OK(GetConsensusState(leader, tablet_id_, kTimeout, EXCLUDE_HEALTH_REPORT, &cstate));
});
// The original voter replica that fell behind the WAL catchup threshold should
// not be there; it should have been evicted.
EXPECT_FALSE(IsRaftConfigMember(follower_uuid, cstate.committed_config()))
<< pb_util::SecureDebugString(cstate.committed_config())
<< "fell behind WAL replica UUID: " << follower_uuid;
// The first non-voter replica, which failed during tablet copy, cannot be
// evicted because no replacement replica is available at this point yet.
EXPECT_TRUE(IsRaftConfigMember(ts0->uuid(), cstate.committed_config()))
<< pb_util::SecureDebugString(cstate.committed_config())
<< "failed tablet copy replica UUID: " << ts0->uuid();
// No replicas from the second tablet server should be in the config yet.
EXPECT_FALSE(IsRaftConfigMember(ts1->uuid(), cstate.committed_config()))
<< pb_util::SecureDebugString(cstate.committed_config())
<< "new replacement replica UUID: " << ts1->uuid();
  // Make the second server available for placing a non-voter replica. Tablet
  // copy for replicas placed on this server should succeed.
ASSERT_OK(ts1->Restart());
  // The system should be able to recover, replacing the failed replica with
  // the replica on ts1. Since it's hard to predict when the replacement takes
  // place, the easiest approach is to wait until the system converges to the
  // required state.
ASSERT_EVENTUALLY([&] {
bool has_leader = false;
GetTabletLocationsResponsePB tablet_locations;
ASSERT_OK(WaitForReplicasReportedToMaster(cluster_->master_proxy(),
kReplicasNum,
tablet_id_,
kTimeout,
WAIT_FOR_LEADER,
ANY_REPLICA,
&has_leader,
&tablet_locations));
TServerDetails* leader = nullptr;
ASSERT_OK(GetLeaderReplicaWithRetries(tablet_id_, &leader));
ASSERT_OK(GetConsensusState(leader, tablet_id_, kTimeout, EXCLUDE_HEALTH_REPORT, &cstate));
// The original voter replica that fell behind the WAL catchup threshold
// should be evicted.
ASSERT_FALSE(IsRaftConfigMember(follower_uuid, cstate.committed_config()))
<< pb_util::SecureDebugString(cstate.committed_config())
<< "fell behind WAL replica UUID: " << follower_uuid;
    // The first non-voter replica, which failed during the tablet copy phase,
    // should not be present.
ASSERT_FALSE(IsRaftConfigMember(ts0->uuid(), cstate.committed_config()))
<< pb_util::SecureDebugString(cstate.committed_config())
<< "failed tablet copy replica UUID: " << ts0->uuid();
// The tablet copy on the restarted server should succeed and this replica
// should replace the original failed replica.
ASSERT_TRUE(IsRaftConfigMember(ts1->uuid(), cstate.committed_config()))
<< pb_util::SecureDebugString(cstate.committed_config())
<< "new replacement replica UUID: " << ts1->uuid();
});
}
// This test verifies that the replica replacement process continues after
// restarting tablet servers: non-voter replicas which completed tablet copy
// before the cluster restart stay in the configuration and are then promoted
// to voter replicas.
//
// The following scenario is used: a voter replica fails, and the catalog
// manager adds a new non-voter replica to replace the failed one. Before the
// new replica completes its tablet copy, the majority of tablet servers hosting
// the tablet replicas (except for the leader replica) are shut down. The leader
// replica and the new non-voter are shut down after the tablet copy completes.
// After that, all tablet servers except for the former leader replica's server
// are started again.
TEST_F(RaftConsensusNonVoterITest, RestartClusterWithNonVoter) {
if (!AllowSlowTests()) {
LOG(WARNING) << "test is skipped; set KUDU_ALLOW_SLOW_TESTS=1 to run";
return;
}
const auto kReplicasNum = 3;
const auto kConsensusRpcTimeout = MonoDelta::FromSeconds(5);
const auto kTimeoutSec = 30;
const auto kTimeout = MonoDelta::FromSeconds(kTimeoutSec);
FLAGS_num_replicas = kReplicasNum;
// Need some extra tablet servers to ensure the catalog manager does not
// replace almost all the replicas, even if it could.
FLAGS_num_tablet_servers = 2 * kReplicasNum + 1;
const vector<string> kMasterFlags = {
// The scenario runs with the 3-4-3 replica management scheme.
"--raft_prepare_replacement_before_eviction=true",
};
vector<string> tserver_flags = {
// The scenario runs with the 3-4-3 replica management scheme.
"--raft_prepare_replacement_before_eviction=true",
    // Inject delay into tablet copying: this allows for shutting down all
    // the replicas but the leader and the new non-voter before the tablet
    // copy completes.
Substitute("--tablet_copy_download_file_inject_latency_ms=$0",
kConsensusRpcTimeout.ToMilliseconds() * 2),
Substitute("--follower_unavailable_considered_failed_sec=$0",
static_cast<int>(kConsensusRpcTimeout.ToSeconds() / 2)),
// Don't wait for the RPC timeout for too long.
Substitute("--consensus_rpc_timeout_ms=$0",
kConsensusRpcTimeout.ToMilliseconds()),
};
NO_FATALS(BuildAndStart(tserver_flags, kMasterFlags));
ASSERT_EQ(kReplicasNum + 4, tablet_servers_.size());
ASSERT_EQ(kReplicasNum, tablet_replicas_.size());
  // Create a test table and insert some data into it, so the special flag
  // --tablet_copy_download_file_inject_latency_ms can take effect when tablet
  // copy happens down the road.
TestWorkload workload(cluster_.get());
workload.set_table_name(kTableId);
workload.Setup();
workload.Start();
while (workload.rows_inserted() < 100) {
SleepFor(MonoDelta::FromMilliseconds(10));
}
workload.StopAndJoin();
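  // Pick one of the tablet servers hosting a replica of the tablet and shut
  // it down to emulate a failed voter replica.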
const auto server_uuids_with_replica = GetServersWithReplica(tablet_id_);
ASSERT_FALSE(server_uuids_with_replica.empty());
const string& failed_replica_uuid = server_uuids_with_replica.front();
ExternalTabletServer* ts_with_replica =
cluster_->tablet_server_by_uuid(failed_replica_uuid);
ASSERT_NE(nullptr, ts_with_replica);
ts_with_replica->Shutdown();
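  // With the 3-4-3 replica management scheme the catalog manager adds a new
  // non-voter replica as a replacement, so the master should report one extra
  // replica for the tablet.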
bool has_leader = false;
GetTabletLocationsResponsePB tablet_locations;
ASSERT_OK(WaitForReplicasReportedToMaster(cluster_->master_proxy(),
kReplicasNum + 1,
tablet_id_,
kTimeout,
WAIT_FOR_LEADER,
ANY_REPLICA,
&has_leader,
&tablet_locations));
// Find the location of the new non-voter replica.
string new_replica_uuid;
for (const auto& r : tablet_locations.tablet_locations(0).interned_replicas()) {
if (r.role() != RaftPeerPB::LEARNER) {
continue;
}
new_replica_uuid = tablet_locations.ts_infos(r.ts_info_idx()).permanent_uuid();
break;
}
ASSERT_FALSE(new_replica_uuid.empty());
TServerDetails* ts_new_replica = FindOrDie(tablet_servers_, new_replica_uuid);
ASSERT_NE(nullptr, ts_new_replica);
  // Shut down all servers hosting tablet replicas except for the leader and
  // the new non-voter.
TServerDetails* leader = nullptr;
ASSERT_OK(GetLeaderReplicaWithRetries(tablet_id_, &leader));
const ExternalDaemon* ts_leader = cluster_->tablet_server_by_uuid(leader->uuid());
// There should be only one tablet.
ASSERT_EQ(tablet_replicas_.count(tablet_id_), tablet_replicas_.size());
for (const auto& e : tablet_replicas_) {
const auto& uuid = e.second->uuid();
if (uuid != ts_leader->uuid() && uuid != new_replica_uuid) {
cluster_->tablet_server_by_uuid(uuid)->Shutdown();
}
}
// Wait for the newly copied replica to start.
ASSERT_OK(WaitForNumTabletsOnTS(
ts_new_replica, 1, kTimeout, nullptr, tablet::RUNNING));
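  // Shut down the rest of the cluster (including the masters) to emulate
  // a full cluster restart.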
cluster_->Shutdown();
// Start all tablet servers except for the server with the 'failed' replica.
for (auto i = 0; i < cluster_->num_tablet_servers(); ++i) {
auto* server = cluster_->tablet_server(i);
if (server->uuid() == failed_replica_uuid) {
continue;
}
ASSERT_OK(server->Restart());
}
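  // Restart the masters as well.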
for (auto i = 0; i < cluster_->num_masters(); ++i) {
ASSERT_OK(cluster_->master(i)->Restart());
}
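  // All tablet servers except the one hosting the 'failed' replica should
  // register back with the master.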
ASSERT_OK(cluster_->WaitForTabletServerCount(cluster_->num_tablet_servers() - 1,
kTimeout));
ASSERT_OK(WaitForReplicasReportedToMaster(cluster_->master_proxy(),
kReplicasNum,
tablet_id_,
kTimeout,
WAIT_FOR_LEADER,
ANY_REPLICA,
&has_leader,
&tablet_locations));
// The newly added replica needs to be registered in the internal
// tablet_replicas_: this is necessary for the future calls like
// GetLeaderReplicaWithRetries() when the replica becomes a leader.
NO_FATALS(WaitForReplicasAndUpdateLocations(table_->name()));
consensus::ConsensusStatePB cstate;
ASSERT_EVENTUALLY([&] {
TServerDetails* leader = nullptr;
ASSERT_OK(GetLeaderReplicaWithRetries(tablet_id_, &leader));
    // The outer ASSERT_EVENTUALLY is needed because the leader might have
    // changed between these two calls.
ASSERT_OK(GetConsensusState(leader, tablet_id_, kTimeout, EXCLUDE_HEALTH_REPORT, &cstate));
});
// The failed voter replica should be evicted.
EXPECT_FALSE(IsRaftConfigMember(failed_replica_uuid, cstate.committed_config()))
<< pb_util::SecureDebugString(cstate.committed_config())
<< "failed replica UUID: " << failed_replica_uuid;
// The replacement replica should become a voter.
EXPECT_TRUE(IsRaftConfigVoter(new_replica_uuid, cstate.committed_config()))
<< pb_util::SecureDebugString(cstate.committed_config())
<< "replacement replica UUID: " << new_replica_uuid;
}
// This test verifies that the consensus queue correctly distinguishes between
// voter and non-voter acknowledgements of the Raft messages. Essentially, the
// leader replica's consensus queue should not count an ack message from
// a non-voter replica as if it were sent by a voter replica.
//
// The test scenario is simple: try to make a configuration change in a 3-voter
// Raft cluster by adding a new non-voter replica while a majority of the voters
// is offline. Make sure the configuration change is not committed.
TEST_F(RaftConsensusNonVoterITest, NonVoterReplicasInConsensusQueue) {
if (!AllowSlowTests()) {
LOG(WARNING) << "test is skipped; set KUDU_ALLOW_SLOW_TESTS=1 to run";
return;
}
const MonoDelta kTimeout = MonoDelta::FromSeconds(60);
const int kOriginalReplicasNum = 3;
const int kHbIntervalMs = 50;
const vector<string> kMasterFlags = {
// Don't evict excess replicas to avoid races in the scenario.
"--catalog_manager_evict_excess_replicas=false",
};
const vector<string> kTserverFlags = {
Substitute("--raft_heartbeat_interval_ms=$0", kHbIntervalMs),
};
FLAGS_num_replicas = kOriginalReplicasNum;
FLAGS_num_tablet_servers = kOriginalReplicasNum + 1;
NO_FATALS(BuildAndStart(kTserverFlags, kMasterFlags));
ASSERT_EQ(kOriginalReplicasNum + 1, tablet_servers_.size());
const string& tablet_id = tablet_id_;
TabletServerMap replica_servers;
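  // Collect the tablet servers hosting replicas of the test tablet.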
for (const auto& e : tablet_replicas_) {
if (e.first == tablet_id) {
replica_servers.emplace(e.second->uuid(), e.second);
}
}
ASSERT_EQ(FLAGS_num_replicas, replica_servers.size());
ASSERT_OK(WaitForServersToAgree(kTimeout, replica_servers, tablet_id, 1));
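  // Find a tablet server which does not host a replica of the tablet yet:
  // it is the target server for the new non-voter replica.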
TServerDetails* new_replica = nullptr;
for (const auto& ts : tablet_servers_) {
if (replica_servers.find(ts.first) == replica_servers.end()) {
new_replica = ts.second;
break;
}
}
ASSERT_NE(nullptr, new_replica);
// Disable failure detection for all replicas.
for (int i = 0; i < cluster_->num_tablet_servers(); ++i) {
ExternalTabletServer* ts = cluster_->tablet_server(i);
ASSERT_OK(cluster_->SetFlag(ts,
"enable_leader_failure_detection", "false"));
}
// Pause all but the leader replica and try to add a new non-voter into the
// configuration. It should not pass.
LOG(INFO) << "Getting leader replica...";
TServerDetails* leader;
ASSERT_OK(GetLeaderReplicaWithRetries(tablet_id, &leader));
LOG(INFO) << "Shutting down non-leader replicas...";
for (auto& e : replica_servers) {
const auto& uuid = e.first;
if (uuid == leader->uuid()) continue;
cluster_->tablet_server_by_uuid(uuid)->Shutdown();
}
LOG(INFO) << "Adding NON_VOTER replica...";
std::thread t([&] {
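    // This config change cannot be committed while the majority of voter
    // replicas is offline, so the call may block; its result is not checked.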
AddServer(leader, tablet_id, new_replica, RaftPeerPB::NON_VOTER, kTimeout);
});
SCOPED_CLEANUP({ t.join(); });
  // Verify that the committed configuration hasn't changed: the attempted
  // change should show up only as a pending config.
LOG(INFO) << "Waiting for pending config...";
consensus::ConsensusStatePB cstate;
ASSERT_EVENTUALLY([&] {
ASSERT_OK(GetConsensusState(leader, tablet_id, kTimeout, EXCLUDE_HEALTH_REPORT, &cstate));
ASSERT_TRUE(cstate.has_pending_config());
});
// Ensure it does not commit.
SleepFor(MonoDelta::FromSeconds(5));
ASSERT_OK(GetConsensusState(leader, tablet_id, kTimeout, EXCLUDE_HEALTH_REPORT, &cstate));
ASSERT_TRUE(cstate.has_pending_config());
const auto& new_replica_uuid = new_replica->uuid();
ASSERT_FALSE(IsRaftConfigMember(new_replica_uuid, cstate.committed_config()))
<< pb_util::SecureDebugString(cstate.committed_config())
<< "new non-voter replica UUID: " << new_replica_uuid;
ASSERT_EQ(kOriginalReplicasNum, cstate.committed_config().peers_size());
// Restart the tablet servers.
for (auto& e : replica_servers) {
const auto& uuid = e.first;
if (uuid == leader->uuid()) continue;
ASSERT_OK(cluster_->tablet_server_by_uuid(uuid)->Restart());
}
  // Once the restarted replicas come back online, the pending configuration
  // change should be committed.
ASSERT_EVENTUALLY([&] {
ASSERT_OK(GetConsensusState(leader, tablet_id, kTimeout, EXCLUDE_HEALTH_REPORT, &cstate));
ASSERT_FALSE(cstate.has_pending_config());
});
NO_FATALS(cluster_->AssertNoCrashes());
}
// This test runs the master and the tablet server with different replica
// replacement schemes and makes sure the tablet server is not registered with
// the master in that case. When configured to treat the incompatibility as
// fatal, the test also makes sure the tablet server crashes to signal the
// misconfiguration.
class IncompatibleReplicaReplacementSchemesITest :
public RaftConsensusNonVoterITest,
public ::testing::WithParamInterface<std::tuple<bool, bool>> {
};
INSTANTIATE_TEST_CASE_P(, IncompatibleReplicaReplacementSchemesITest,
::testing::Combine(::testing::Bool(),
::testing::Bool()));
TEST_P(IncompatibleReplicaReplacementSchemesITest, MasterAndTserverMisconfig) {
FLAGS_num_tablet_servers = 1;
FLAGS_num_replicas = 1;
const MonoDelta kTimeout = MonoDelta::FromSeconds(60);
const int64_t heartbeat_interval_ms = 500;
const bool is_incompatible_replica_management_fatal = std::get<0>(GetParam());
const bool is_3_4_3 = std::get<1>(GetParam());
  // The easiest way to get everything set up is to start the cluster with
  // compatible parameters.
const vector<string> kMasterFlags = {
Substitute("--raft_prepare_replacement_before_eviction=$0", is_3_4_3),
};
const vector<string> kTsFlags = {
Substitute("--heartbeat_incompatible_replica_management_is_fatal=$0",
is_incompatible_replica_management_fatal),
Substitute("--heartbeat_interval_ms=$0", heartbeat_interval_ms),
Substitute("--raft_prepare_replacement_before_eviction=$0", is_3_4_3),
};
NO_FATALS(BuildAndStart(kTsFlags, kMasterFlags));
vector<master::ListTabletServersResponsePB_Entry> tservers;
ASSERT_OK(itest::ListTabletServers(cluster_->master_proxy(), kTimeout, &tservers));
ASSERT_EQ(1, tservers.size());
ASSERT_EQ(1, cluster_->num_tablet_servers());
auto* ts = cluster_->tablet_server(0);
ASSERT_NE(nullptr, ts);
ts->Shutdown();
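  // Restart the master: tablet server registrations are kept in the master's
  // memory only, so after the restart the list of registered tablet servers
  // starts out empty.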
auto* master = cluster_->master(0);
master->Shutdown();
ASSERT_OK(master->Restart());
  // Update the corresponding flag to induce a misconfiguration between the
  // master and the tablet server.
ts->mutable_flags()->emplace_back(
Substitute("--raft_prepare_replacement_before_eviction=$0", !is_3_4_3));
ASSERT_OK(ts->Restart());
if (is_incompatible_replica_management_fatal) {
ASSERT_OK(ts->WaitForFatal(kTimeout));
}
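  // Give the tablet server a few heartbeat intervals to attempt registering
  // with the master.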
SleepFor(MonoDelta::FromMilliseconds(heartbeat_interval_ms * 3));
// The tablet server should not be registered with the master.
ASSERT_OK(itest::ListTabletServers(cluster_->master_proxy(), kTimeout, &tservers));
ASSERT_EQ(0, tservers.size());
  // Inject a feature flag not supported by the master and make sure the tablet
  // server is not registered with the incompatible master.
ts->mutable_flags()->pop_back();
ts->mutable_flags()->emplace_back("--heartbeat_inject_required_feature_flag=999");
ts->Shutdown();
ASSERT_OK(ts->Restart());
if (is_incompatible_replica_management_fatal) {
ASSERT_OK(ts->WaitForFatal(kTimeout));
}
SleepFor(MonoDelta::FromMilliseconds(heartbeat_interval_ms * 3));
// The tablet server should not be registered with the master.
ASSERT_OK(itest::ListTabletServers(cluster_->master_proxy(), kTimeout, &tservers));
ASSERT_EQ(0, tservers.size());
if (!is_incompatible_replica_management_fatal) {
NO_FATALS(cluster_->AssertNoCrashes());
}
}
// This test scenario runs the system with the 3-4-3 replica management scheme
// having the total number of tablet servers equal to the replication factor
// of the tablet being tested. The scenario makes one follower replica
// fall behind the WAL segment GC threshold. The system should be able to
// replace the failed replica 'in-place', i.e. no additional tablet server is
// needed for the cluster to recover in such situations. The scenario verifies
// that the re-replication works as expected when the tablet server hosting the
// failed replica is:
// ** paused and then resumed, emulating a lagging tablet server
// ** shut down and then started back up
class ReplicaBehindWalGcThresholdITest :
public RaftConsensusNonVoterITest,
public ::testing::WithParamInterface<
std::tuple<RaftConsensusITestBase::BehindWalGcBehavior, bool>> {
};
INSTANTIATE_TEST_CASE_P(,
ReplicaBehindWalGcThresholdITest,
::testing::Combine(
::testing::Values(RaftConsensusITestBase::BehindWalGcBehavior::STOP_CONTINUE,
RaftConsensusITestBase::BehindWalGcBehavior::SHUTDOWN_RESTART,
RaftConsensusITestBase::BehindWalGcBehavior::SHUTDOWN),
::testing::Bool()));
TEST_P(ReplicaBehindWalGcThresholdITest, ReplicaReplacement) {
if (!AllowSlowTests()) {
LOG(WARNING) << "test is skipped; set KUDU_ALLOW_SLOW_TESTS=1 to run";
return;
}
const auto kReplicasNum = 3;
const auto kTimeoutSec = 60;
const auto kTimeout = MonoDelta::FromSeconds(kTimeoutSec);
  const auto kUnavailableFailedSec = 5;
FLAGS_num_replicas = kReplicasNum;
FLAGS_num_tablet_servers = kReplicasNum;
const auto tserver_behavior = std::get<0>(GetParam());
const bool do_delay_workload = std::get<1>(GetParam());
vector<string> master_flags = {
// This scenario runs with the 3-4-3 replica management scheme.
"--raft_prepare_replacement_before_eviction=true",
};
if (tserver_behavior != BehindWalGcBehavior::SHUTDOWN) {
    // This scenario verifies that the system evicts the replica that's falling
    // behind the WAL segment GC threshold. When the tablet server hosting the
    // affected replica is not shut down, it's necessary to avoid races with the
    // catalog manager as it replaces the replica that has just been evicted.
master_flags.emplace_back("--master_add_server_when_underreplicated=false");
}
vector<string> tserver_flags = {
// This scenario is specific for the 3-4-3 replica management scheme.
"--raft_prepare_replacement_before_eviction=true",
// Detect unavailable replicas faster.
Substitute("--follower_unavailable_considered_failed_sec=$0", kUnavaiableFailedSec),
};
AddFlagsForLogRolls(&tserver_flags); // For CauseFollowerToFallBehindLogGC().
NO_FATALS(BuildAndStart(tserver_flags, master_flags));
string follower_uuid;
string leader_uuid;
int64_t orig_term;
MonoDelta delay;
if (do_delay_workload) {
    // This is to make the leader replica report the state of the tablet as
    // FAILED first. Later on, when the replica falls behind the WAL segment GC
// threshold, the leader replica should report the follower's health status
// as FAILED_UNRECOVERABLE.
    delay = MonoDelta::FromSeconds(3 * kUnavailableFailedSec);
}
NO_FATALS(CauseFollowerToFallBehindLogGC(
tablet_servers_, &leader_uuid, &orig_term, &follower_uuid,
tserver_behavior, delay));
  // The catalog manager should right away evict the replica which fell behind
  // the WAL segment GC threshold.
bool has_leader = false;
GetTabletLocationsResponsePB tablet_locations;
const auto num_replicas = (tserver_behavior == BehindWalGcBehavior::SHUTDOWN)
? kReplicasNum : kReplicasNum - 1;
ASSERT_OK(WaitForReplicasReportedToMaster(cluster_->master_proxy(),
num_replicas,
tablet_id_,
kTimeout,
WAIT_FOR_LEADER,
ANY_REPLICA,
&has_leader,
&tablet_locations));
consensus::ConsensusStatePB cstate;
ASSERT_EVENTUALLY([&] {
TServerDetails* leader = nullptr;
ASSERT_OK(GetLeaderReplicaWithRetries(tablet_id_, &leader));
    // The outer ASSERT_EVENTUALLY is needed because the leader might have
    // changed between these two calls.
ASSERT_OK(GetConsensusState(leader, tablet_id_, kTimeout, EXCLUDE_HEALTH_REPORT, &cstate));
});
if (tserver_behavior == BehindWalGcBehavior::SHUTDOWN) {
// The original voter replica that fell behind the WAL catchup threshold
// should be evicted and replaced with a non-voter replica. Since its
    // tablet server is shut down, the replica is not able to catch up with
// the leader yet (otherwise, it would be promoted to a voter replica).
EXPECT_TRUE(IsRaftConfigMember(follower_uuid, cstate.committed_config()))
<< pb_util::SecureDebugString(cstate.committed_config())
<< "fell behind WAL replica UUID: " << follower_uuid;
EXPECT_FALSE(IsRaftConfigVoter(follower_uuid, cstate.committed_config()))
<< pb_util::SecureDebugString(cstate.committed_config())
<< "fell behind WAL replica UUID: " << follower_uuid;
// Bring the tablet server with the affected replica back.
ASSERT_OK(cluster_->tablet_server_by_uuid(follower_uuid)->Restart());
} else {
    // The replica that fell behind the WAL catchup threshold should be
    // evicted; since the catalog manager is not yet adding replacement
    // replicas, no replacement replica should be present yet.
EXPECT_FALSE(IsRaftConfigMember(follower_uuid, cstate.committed_config()))
<< pb_util::SecureDebugString(cstate.committed_config())
<< "fell behind WAL replica UUID: " << follower_uuid;
    // Restore the default behavior of the catalog manager, so it will add
    // a replacement replica.
for (auto idx = 0; idx < cluster_->num_masters(); ++idx) {
auto* master = cluster_->master(idx);
master->mutable_flags()->emplace_back(
"--master_add_server_when_underreplicated=true");
master->Shutdown();
ASSERT_OK(master->Restart());
ASSERT_OK(master->WaitForCatalogManager(ExternalMaster::WAIT_FOR_LEADERSHIP));
}
}
// The system should be able to recover, replacing the failed replica.
ASSERT_OK(WaitForReplicasReportedToMaster(cluster_->master_proxy(),
kReplicasNum,
tablet_id_,
kTimeout,
WAIT_FOR_LEADER,
VOTER_REPLICA,
&has_leader,
&tablet_locations));
NO_FATALS(cluster_->AssertNoCrashes());
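  // Run the cluster verifier to make sure the cluster ends up in a healthy,
  // consistent state.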
ClusterVerifier v(cluster_.get());
v.SetOperationsTimeout(kTimeout);
v.SetVerificationTimeout(kTimeout);
NO_FATALS(v.CheckCluster());
}
} // namespace tserver
} // namespace kudu