// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "kudu/consensus/quorum_util.h"
#include <map>
#include <ostream>
#include <queue>
#include <set>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>
#include <glog/logging.h>
#include "kudu/common/common.pb.h"
#include "kudu/gutil/macros.h"
#include "kudu/gutil/map-util.h"
#include "kudu/gutil/port.h"
#include "kudu/gutil/strings/join.h"
#include "kudu/gutil/strings/substitute.h"
#include "kudu/util/pb_util.h"
#include "kudu/util/status.h"
using google::protobuf::RepeatedPtrField;
using kudu::pb_util::SecureShortDebugString;
using kudu::pb_util::SecureDebugString;
using std::map;
using std::pair;
using std::priority_queue;
using std::set;
using std::string;
using std::unordered_set;
using std::vector;
using strings::Substitute;
namespace kudu {
namespace consensus {
bool IsRaftConfigMember(const std::string& uuid, const RaftConfigPB& config) {
for (const RaftPeerPB& peer : config.peers()) {
if (peer.permanent_uuid() == uuid) {
return true;
}
}
return false;
}
bool IsRaftConfigVoter(const std::string& uuid, const RaftConfigPB& config) {
for (const RaftPeerPB& peer : config.peers()) {
if (peer.permanent_uuid() == uuid) {
return peer.member_type() == RaftPeerPB::VOTER;
}
}
return false;
}
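// Whether the given role corresponds to a voting member of the Raft config,
// i.e. the role is LEADER or FOLLOWER.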
bool IsVoterRole(RaftPeerPB::Role role) {
return role == RaftPeerPB::LEADER || role == RaftPeerPB::FOLLOWER;
}
Status GetRaftConfigMember(RaftConfigPB* config,
const std::string& uuid,
RaftPeerPB** peer_pb) {
for (RaftPeerPB& peer : *config->mutable_peers()) {
if (peer.permanent_uuid() == uuid) {
*peer_pb = &peer;
return Status::OK();
}
}
return Status::NotFound(Substitute("Peer with uuid $0 not found in consensus config", uuid));
}
Status GetRaftConfigLeader(ConsensusStatePB* cstate, RaftPeerPB** peer_pb) {
if (cstate->leader_uuid().empty()) {
return Status::NotFound("Consensus config has no leader");
}
return GetRaftConfigMember(cstate->mutable_committed_config(), cstate->leader_uuid(), peer_pb);
}
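// Removes the peer with the specified UUID from 'config', if present.
// Returns true if the peer was found and removed, false otherwise.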
bool RemoveFromRaftConfig(RaftConfigPB* config, const string& uuid) {
RepeatedPtrField<RaftPeerPB> modified_peers;
bool removed = false;
for (const RaftPeerPB& peer : config->peers()) {
if (peer.permanent_uuid() == uuid) {
removed = true;
continue;
}
*modified_peers.Add() = peer;
}
if (!removed) return false;
config->mutable_peers()->Swap(&modified_peers);
return true;
}
bool ReplicaTypesEqual(const RaftPeerPB& peer1, const RaftPeerPB& peer2) {
// TODO(mpercy): Include comparison of replica intentions once they are
// implemented.
return peer1.member_type() == peer2.member_type();
}
int CountVoters(const RaftConfigPB& config) {
int voters = 0;
for (const RaftPeerPB& peer : config.peers()) {
if (peer.member_type() == RaftPeerPB::VOTER) {
voters++;
}
}
return voters;
}
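// For example, MajoritySize(3) == 2, MajoritySize(4) == 3 and
// MajoritySize(5) == 3.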
int MajoritySize(int num_voters) {
DCHECK_GE(num_voters, 1);
return (num_voters / 2) + 1;
}
RaftPeerPB::Role GetConsensusRole(const std::string& peer_uuid,
const std::string& leader_uuid,
const RaftConfigPB& config) {
if (peer_uuid.empty()) {
return RaftPeerPB::NON_PARTICIPANT;
}
for (const RaftPeerPB& peer : config.peers()) {
if (peer.permanent_uuid() == peer_uuid) {
switch (peer.member_type()) {
case RaftPeerPB::VOTER:
if (peer_uuid == leader_uuid) {
return RaftPeerPB::LEADER;
}
return RaftPeerPB::FOLLOWER;
default:
return RaftPeerPB::LEARNER;
}
}
}
return RaftPeerPB::NON_PARTICIPANT;
}
RaftPeerPB::Role GetConsensusRole(const std::string& peer_uuid,
const ConsensusStatePB& cstate) {
// The active config is the pending config if there is one, else it's the committed config.
const RaftConfigPB& config = cstate.has_pending_config() ?
cstate.pending_config() :
cstate.committed_config();
return GetConsensusRole(peer_uuid, cstate.leader_uuid(), config);
}
RaftPeerPB::Role GetParticipantRole(const RaftPeerPB& peer,
const ConsensusStatePB& cstate) {
const auto& peer_uuid = peer.permanent_uuid();
// Sanity check: make sure the peer is in the committed config.
DCHECK_NE(RaftPeerPB::NON_PARTICIPANT,
GetConsensusRole(peer_uuid,
cstate.leader_uuid(),
cstate.committed_config()))
<< Substitute("peer $0 is not commited config $1",
peer_uuid, cstate.ShortDebugString());
switch (peer.member_type()) {
case RaftPeerPB::VOTER:
if (peer_uuid == cstate.leader_uuid()) {
return RaftPeerPB::LEADER;
}
return RaftPeerPB::FOLLOWER;
default:
return RaftPeerPB::LEARNER;
}
}
Status VerifyRaftConfig(const RaftConfigPB& config) {
std::set<string> uuids;
if (config.peers().empty()) {
return Status::IllegalState(
Substitute("RaftConfig must have at least one peer. RaftConfig: $0",
SecureShortDebugString(config)));
}
// All configurations must have 'opid_index' populated.
if (!config.has_opid_index()) {
return Status::IllegalState(
Substitute("Configs must have opid_index set. RaftConfig: $0",
SecureShortDebugString(config)));
}
for (const RaftPeerPB& peer : config.peers()) {
if (!peer.has_permanent_uuid() || peer.permanent_uuid().empty()) {
return Status::IllegalState(Substitute("One peer didn't have an uuid or had the empty"
" string. RaftConfig: $0", SecureShortDebugString(config)));
}
if (ContainsKey(uuids, peer.permanent_uuid())) {
return Status::IllegalState(
Substitute("Found multiple peers with uuid: $0. RaftConfig: $1",
peer.permanent_uuid(), SecureShortDebugString(config)));
}
uuids.insert(peer.permanent_uuid());
if (config.peers_size() > 1 && !peer.has_last_known_addr()) {
return Status::IllegalState(
Substitute("Peer: $0 has no address. RaftConfig: $1",
peer.permanent_uuid(), SecureShortDebugString(config)));
}
if (!peer.has_member_type()) {
return Status::IllegalState(
Substitute("Peer: $0 has no member type set. RaftConfig: $1", peer.permanent_uuid(),
SecureShortDebugString(config)));
}
}
return Status::OK();
}
Status VerifyConsensusState(const ConsensusStatePB& cstate) {
if (!cstate.has_current_term()) {
return Status::IllegalState("ConsensusStatePB missing current_term",
SecureShortDebugString(cstate));
}
if (!cstate.has_committed_config()) {
return Status::IllegalState("ConsensusStatePB missing config", SecureShortDebugString(cstate));
}
RETURN_NOT_OK(VerifyRaftConfig(cstate.committed_config()));
if (cstate.has_pending_config()) {
RETURN_NOT_OK(VerifyRaftConfig(cstate.pending_config()));
}
if (!cstate.leader_uuid().empty()) {
if (!IsRaftConfigVoter(cstate.leader_uuid(), cstate.committed_config())
&& cstate.has_pending_config()
&& !IsRaftConfigVoter(cstate.leader_uuid(), cstate.pending_config())) {
return Status::IllegalState(
Substitute("Leader with UUID $0 is not a VOTER in the committed or pending config! "
"Consensus state: $1",
cstate.leader_uuid(), SecureShortDebugString(cstate)));
}
}
return Status::OK();
}
std::string DiffRaftConfigs(const RaftConfigPB& old_config,
const RaftConfigPB& new_config) {
// Create dummy ConsensusState objects so we can reuse the code
// in DiffConsensusStates() below.
ConsensusStatePB old_state;
old_state.mutable_committed_config()->CopyFrom(old_config);
ConsensusStatePB new_state;
new_state.mutable_committed_config()->CopyFrom(new_config);
return DiffConsensusStates(old_state, new_state);
}
namespace {
// A mapping from peer UUID to <old peer, new peer> pairs.
typedef map<string, pair<RaftPeerPB, RaftPeerPB>> PeerInfoMap;
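// Appends to 'change_strs' a human-readable description of each difference
// between the old and new peers in 'peer_infos'. Returns true if any
// difference was detected.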
bool DiffPeers(const PeerInfoMap& peer_infos,
vector<string>* change_strs) {
bool changes = false;
for (const auto& e : peer_infos) {
const auto& old_peer = e.second.first;
const auto& new_peer = e.second.second;
if (old_peer.has_permanent_uuid() && !new_peer.has_permanent_uuid()) {
changes = true;
change_strs->push_back(
Substitute("$0 $1 ($2) evicted",
RaftPeerPB_MemberType_Name(old_peer.member_type()),
old_peer.permanent_uuid(),
old_peer.last_known_addr().host()));
} else if (!old_peer.has_permanent_uuid() && new_peer.has_permanent_uuid()) {
changes = true;
change_strs->push_back(
Substitute("$0 $1 ($2) added",
RaftPeerPB_MemberType_Name(new_peer.member_type()),
new_peer.permanent_uuid(),
new_peer.last_known_addr().host()));
} else if (old_peer.has_permanent_uuid() && new_peer.has_permanent_uuid()) {
if (old_peer.member_type() != new_peer.member_type()) {
changes = true;
change_strs->push_back(
Substitute("$0 ($1) changed from $2 to $3",
old_peer.permanent_uuid(),
old_peer.last_known_addr().host(),
RaftPeerPB_MemberType_Name(old_peer.member_type()),
RaftPeerPB_MemberType_Name(new_peer.member_type())));
}
}
}
return changes;
}
string PeersString(const RaftConfigPB& config) {
vector<string> strs;
for (const auto& p : config.peers()) {
strs.push_back(Substitute("$0 $1 ($2)",
RaftPeerPB_MemberType_Name(p.member_type()),
p.permanent_uuid(),
p.last_known_addr().host()));
}
return JoinStrings(strs, ", ");
}
} // anonymous namespace
string DiffConsensusStates(const ConsensusStatePB& old_state,
const ConsensusStatePB& new_state) {
bool leader_changed = old_state.leader_uuid() != new_state.leader_uuid();
bool term_changed = old_state.current_term() != new_state.current_term();
bool config_changed =
old_state.committed_config().opid_index() != new_state.committed_config().opid_index();
bool pending_config_gained = !old_state.has_pending_config() && new_state.has_pending_config();
bool pending_config_lost = old_state.has_pending_config() && !new_state.has_pending_config();
// Construct a map from peer UUID to '<old peer, new peer>' pairs.
// Because std::map and std::pair default-construct their values, if a peer
// is present in one configuration but not the other, the corresponding
// element of the pair will be an empty protobuf.
PeerInfoMap committed_peer_infos;
for (const auto& p : old_state.committed_config().peers()) {
committed_peer_infos[p.permanent_uuid()].first = p;
}
for (const auto& p : new_state.committed_config().peers()) {
committed_peer_infos[p.permanent_uuid()].second = p;
}
// Now collect strings representing the changes.
vector<string> change_strs;
if (config_changed) {
change_strs.push_back(
Substitute("config changed from index $0 to $1",
old_state.committed_config().opid_index(),
new_state.committed_config().opid_index()));
}
if (term_changed) {
change_strs.push_back(
Substitute("term changed from $0 to $1",
old_state.current_term(),
new_state.current_term()));
}
if (leader_changed) {
string old_leader = "<none>";
string new_leader = "<none>";
if (!old_state.leader_uuid().empty()) {
old_leader = Substitute("$0 ($1)",
old_state.leader_uuid(),
committed_peer_infos[old_state.leader_uuid()].first
.last_known_addr().host());
}
if (!new_state.leader_uuid().empty()) {
new_leader = Substitute("$0 ($1)",
new_state.leader_uuid(),
committed_peer_infos[new_state.leader_uuid()].second
.last_known_addr().host());
}
change_strs.push_back(Substitute("leader changed from $0 to $1",
old_leader, new_leader));
}
DiffPeers(committed_peer_infos, &change_strs);
if (pending_config_gained) {
change_strs.push_back(Substitute("now has a pending config: $0",
PeersString(new_state.pending_config())));
}
if (pending_config_lost) {
change_strs.push_back(Substitute("no longer has a pending config: $0",
PeersString(old_state.pending_config())));
}
// A pending config doesn't have a committed opid_index yet, so we determine if there's a change
// by computing the peer differences.
if (old_state.has_pending_config() && new_state.has_pending_config()) {
PeerInfoMap pending_peer_infos;
for (const auto &p : old_state.pending_config().peers()) {
pending_peer_infos[p.permanent_uuid()].first = p;
}
for (const auto &p : new_state.pending_config().peers()) {
pending_peer_infos[p.permanent_uuid()].second = p;
}
vector<string> pending_change_strs;
if (DiffPeers(pending_peer_infos, &pending_change_strs)) {
change_strs.emplace_back("pending config changed");
change_strs.insert(change_strs.end(), pending_change_strs.cbegin(),
pending_change_strs.cend());
}
}
// We expect to have detected some differences above, but in case
// someone forgets to update this function when adding a new field,
// it's still useful to report some change unless the protobufs are identical.
// So, we fall back to just dumping the before/after debug strings.
if (change_strs.empty()) {
if (SecureShortDebugString(old_state) == SecureShortDebugString(new_state)) {
return "no change";
}
return Substitute("change from {$0} to {$1}",
SecureShortDebugString(old_state),
SecureShortDebugString(new_state));
}
return JoinStrings(change_strs, ", ");
}
// The decision is based on:
//
// * the number of voter replicas that aren't ignored (i.e. that aren't
// allowed to be in bad shape), that are definitively in bad shape or are
// marked with the REPLACE attribute.
//
// * the number of non-voter replicas marked with the PROMOTE=true attribute
// in good or possibly good state.
//
// This is because a replica with UNKNOWN reported health state might actually
// be in good shape. If so, then adding a new replica would lead to
// over-provisioning. This logic assumes that a non-voter replica does not
// stay in unknown state for eternity -- the leader replica should take care of
// that and eventually update the health status either to 'HEALTHY' or 'FAILED'.
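//
// For example (an illustrative walk-through, not part of the original
// comments): with a replication factor of 3, three VOTER replicas of which
// one is FAILED (and not ignored) and none carries the REPLACE attribute,
// and no NON_VOTER replicas marked with PROMOTE, the projected number of
// viable replicas is 3 - 1 + 0 = 2 < 3, so the config is under-replicated;
// the two remaining HEALTHY voters still form a majority
// (MajoritySize(3) == 2), so a new non-voter replica should be added.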
//
// TODO(aserbin): add a test scenario for the leader replica's logic to cover
// the latter case.
bool ShouldAddReplica(const RaftConfigPB& config,
int replication_factor,
const unordered_set<string>& uuids_ignored_for_underreplication) {
int num_voters_total = 0;
int num_voters_healthy = 0;
int num_voters_need_replacement = 0;
int num_non_voters_to_promote = 0;
// The has_...() accessors are not called on the optional per-replica health
// status and attribute fields because those fields have appropriate default
// values.
VLOG(2) << "config to evaluate: " << SecureDebugString(config);
for (const RaftPeerPB& peer : config.peers()) {
const auto overall_health = peer.health_report().overall_health();
const auto& peer_uuid = peer.permanent_uuid();
bool ignore_failure_for_underreplication = ContainsKey(uuids_ignored_for_underreplication,
peer_uuid);
if (VLOG_IS_ON(1) && ignore_failure_for_underreplication) {
VLOG(1) << Substitute("ignoring $0 if failed", peer_uuid);
}
switch (peer.member_type()) {
case RaftPeerPB::VOTER:
++num_voters_total;
if (peer.attrs().replace()) {
++num_voters_need_replacement;
} else if (overall_health == HealthReportPB::FAILED ||
overall_health == HealthReportPB::FAILED_UNRECOVERABLE) {
// If the failed peer should be ignored, e.g. the server is in
// maintenance mode, don't count it towards under-replication.
if (ignore_failure_for_underreplication) {
continue;
}
++num_voters_need_replacement;
}
if (overall_health == HealthReportPB::HEALTHY) {
++num_voters_healthy;
}
break;
case RaftPeerPB::NON_VOTER:
if (peer.attrs().promote()) {
// A replica with HEALTHY or UNKNOWN overall health status is
// considered a candidate for promotion: a new non-voter replica is
// added with UNKNOWN health status. If such a replica stays
// unresponsive for long enough, its status eventually changes to
// HealthReportPB::FAILED and it is evicted; until then, the code
// below treats it as a candidate for promotion.
if ((overall_health != HealthReportPB::FAILED &&
overall_health != HealthReportPB::FAILED_UNRECOVERABLE) ||
ignore_failure_for_underreplication) {
++num_non_voters_to_promote;
}
}
break;
default:
LOG(DFATAL) << peer.member_type() << ": unsupported member type";
break;
}
}
// Whether the configuration is under-replicated: the projected number of
// viable replicas is less than the required replication factor.
const bool is_under_replicated = replication_factor >
num_voters_total - num_voters_need_replacement + num_non_voters_to_promote;
// Whether it's time to add a new replica: the tablet's Raft configuration
// might be under-replicated, but it does not make much sense to try adding a
// new replica if the configuration change cannot be committed.
const bool should_add_replica = is_under_replicated &&
num_voters_healthy >= MajoritySize(num_voters_total);
VLOG(2) << "decision: the config is" << (is_under_replicated ? " " : " not ")
<< "under-replicated; should" << (should_add_replica ? " " : " not ")
<< "add a non-voter replica";
return should_add_replica;
}
// Whether there is an excess replica to evict.
bool ShouldEvictReplica(const RaftConfigPB& config,
const string& leader_uuid,
int replication_factor,
string* uuid_to_evict) {
if (leader_uuid.empty()) {
// If there is no leader, we can't evict anybody.
return false;
}
typedef pair<string, int> Elem;
static const auto kCmp = [](const Elem& lhs, const Elem& rhs) {
// Elements of higher priority should pop up to the top of the queue.
return lhs.second < rhs.second;
};
typedef priority_queue<Elem, vector<Elem>, decltype(kCmp)> PeerPriorityQueue;
PeerPriorityQueue pq_non_voters(kCmp);
PeerPriorityQueue pq_voters(kCmp);
const auto peer_to_elem = [](const RaftPeerPB& peer) {
const string& peer_uuid = peer.permanent_uuid();
const auto overall_health = peer.health_report().overall_health();
// Non-voter candidates for eviction (in decreasing priority):
// * failed unrecoverably
// * failed
// * in unknown health state
// * any other
//
// Voter candidates for eviction (in decreasing priority):
// * failed unrecoverably and having the attribute REPLACE set
// * failed unrecoverably
// * failed and having the attribute REPLACE set
// * failed
// * having the attribute REPLACE set
// * in unknown health state
// * any other
int priority = 0;
switch (overall_health) {
case HealthReportPB::FAILED_UNRECOVERABLE:
priority = 8;
break;
case HealthReportPB::FAILED:
priority = 4;
break;
case HealthReportPB::HEALTHY:
priority = 0;
break;
case HealthReportPB::UNKNOWN: FALLTHROUGH_INTENDED;
default:
priority = 1;
break;
}
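// Voters marked with the 'replace' attribute get a small boost so that,
// within the same health state, they are preferred for eviction (e.g.
// FAILED + 'replace' maps to priority 6, outranking plain FAILED at 4).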
if (peer.member_type() == RaftPeerPB::VOTER && peer.attrs().replace()) {
priority += 2;
}
return Elem(peer_uuid, priority);
};
int num_non_voters_total = 0;
int num_voters_healthy = 0;
int num_voters_total = 0;
int num_voters_with_replace = 0;
int num_voters_viable = 0;
bool leader_with_replace = false;
bool has_non_voter_failed = false;
bool has_non_voter_failed_unrecoverable = false;
bool has_voter_failed = false;
bool has_voter_failed_unrecoverable = false;
bool has_voter_unknown_health = false;
// The has_...() accessors are not called on the optional per-replica health
// status and attribute fields because those fields have appropriate default
// values.
VLOG(2) << "config to evaluate: " << SecureDebugString(config);
for (const RaftPeerPB& peer : config.peers()) {
DCHECK(peer.has_permanent_uuid() && !peer.permanent_uuid().empty());
const string& peer_uuid = peer.permanent_uuid();
const auto overall_health = peer.health_report().overall_health();
const bool failed = overall_health == HealthReportPB::FAILED;
const bool failed_unrecoverable = overall_health == HealthReportPB::FAILED_UNRECOVERABLE;
const bool healthy = overall_health == HealthReportPB::HEALTHY;
const bool unknown = !peer.has_health_report() ||
!peer.health_report().has_overall_health() ||
overall_health == HealthReportPB::UNKNOWN;
const bool has_replace = peer.attrs().replace();
switch (peer.member_type()) {
case RaftPeerPB::VOTER:
// A leader should always report itself as being healthy.
if (PREDICT_FALSE(peer_uuid == leader_uuid && !healthy)) {
LOG(WARNING) << Substitute("leader peer $0 reported health as $1; config: $2",
peer_uuid,
HealthReportPB_HealthStatus_Name(
peer.health_report().overall_health()),
SecureShortDebugString(config));
DCHECK(false) << "Found non-HEALTHY LEADER"; // Crash in DEBUG builds.
// TODO(KUDU-2335): We have seen this assertion in rare circumstances
// in pre-commit builds, so until we fix this lifecycle issue we
// simply do not evict any nodes when the leader is not HEALTHY.
return false;
}
++num_voters_total;
if (healthy) {
++num_voters_healthy;
if (!has_replace) {
++num_voters_viable;
}
}
if (has_replace) {
++num_voters_with_replace;
if (peer_uuid == leader_uuid) {
leader_with_replace = true;
}
}
if (peer_uuid == leader_uuid) {
// Everything below is to keep track of replicas to evict; the leader
// replica is not to be evicted.
break;
}
pq_voters.emplace(peer_to_elem(peer));
has_voter_failed |= failed;
has_voter_failed_unrecoverable |= failed_unrecoverable;
has_voter_unknown_health |= unknown;
break;
case RaftPeerPB::NON_VOTER:
DCHECK_NE(peer_uuid, leader_uuid) << peer_uuid
<< ": non-voter as a leader; " << SecureShortDebugString(config);
pq_non_voters.emplace(peer_to_elem(peer));
++num_non_voters_total;
has_non_voter_failed |= failed;
has_non_voter_failed_unrecoverable |= failed_unrecoverable;
break;
default:
LOG(DFATAL) << peer.member_type() << ": unsupported member type";
break;
}
}
// Sanity check: the leader replica UUID should not be among those to evict.
DCHECK(pq_voters.empty() || pq_voters.top().first != leader_uuid);
DCHECK(pq_non_voters.empty() || pq_non_voters.top().first != leader_uuid);
// A conservative approach is used when evicting replicas. In short, the
// removal of replicas from the tablet without exact knowledge of their health
// status could lead to removing the healthy ones and keeping the failed
// ones, or attempting a config change operation that cannot be committed.
// On the other hand, if the number of voter replicas in good health is
// greater than or equal to the required replication factor, a replica with any
// health status can be safely evicted without compromising the availability
// of the tablet. Also, the eviction policy is more liberal when dealing with
// failed replicas: if the total number of voter replicas is greater than or
// equal to the required replication factor, the failed replicas are evicted
// aggressively. The latter is to avoid polluting tablet servers with failed
// replicas, reducing the number of possible locations for new non-voter
// replicas created to replace the failed ones. See below for more details.
//
// * A non-voter replica may be evicted regardless of its health status
// if the number of voter replicas in good health without the 'replace'
// attribute is greater than or equal to the required replication factor.
// The idea is to not evict non-voter replicas that might be needed to reach
// the required replication factor, while a present non-voter replica could
// be a good fit to replace a voter replica, if needed.
//
// * A non-voter replica with FAILED or FAILED_UNRECOVERABLE health status
// may be evicted if the number of voter replicas in good health without
// the 'replace' attribute is greater than or equal to a strict majority
// of voter replicas. The idea is to avoid polluting available tablet
// servers with failed non-voter replicas, while replacing failed non-voters
// with healthy non-voters as aggressively as possible. Also, we want to be
// sure that an eviction can succeed before initiating it.
//
// * A voter replica may be evicted regardless of its health status
// if after the eviction the number of voter replicas in good health will be
// greater than or equal to the required replication factor and the leader
// replica itself is not marked with the 'replace' attribute. The latter
// part of the condition emerges from the following observations:
// ** By definition, a voter replica marked with the 'replace' attribute
// should be eventually evicted from the Raft group.
// ** If all voter replicas are in good health and their total count is
// greater than the target replication and only a single one is marked
// with the 'replace' attribute, that's the replica to be evicted.
// ** The Kudu Raft implementation does not support evicting the leader of
// a Raft group.
// So, the removal of a leader replica marked with the 'replace' attribute
// is postponed until the leader replica steps down and becomes a follower.
//
// * A voter replica with FAILED health may be evicted only if the total
// number of voter replicas is greater than the required replication factor
// and the number of *other* voter replicas in good health without the
// 'replace' attribute is greater than or equal to a strict majority of
// voter replicas.
//
// * A voter replica with FAILED_UNRECOVERABLE health may be evicted when
// the number of *other* voter replicas in good health without the 'replace'
// attribute is greater than or equal to a strict majority of voter replicas.
//
// * A voter replica in good health marked with the 'replace' attribute may be
// evicted when the number of replicas in good health after the eviction
// is greater than or equal to the required replication factor.
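//
// For example (an illustrative walk-through, not part of the original
// comments): with replication_factor = 3, four VOTER replicas (a HEALTHY
// leader plus three followers, one of which is FAILED, none marked with the
// 'replace' attribute) and no NON_VOTER replicas, a voter eviction is
// warranted because of the failed follower; the three HEALTHY voters satisfy
// the majority requirement for the post-eviction config
// (MajoritySize(4 - 1) == 2), so this function returns true and reports the
// FAILED follower as the replica to evict.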
bool need_to_evict_non_voter = false;
// Check if there is any excess non-voter replica. We add non-voter replicas
// to replace non-viable (i.e. failed or explicitly marked for eviction) ones.
need_to_evict_non_voter |=
num_voters_viable >= replication_factor &&
num_non_voters_total > 0;
// Some non-voter replica has failed: we want to remove those aggressively
// to avoid polluting tablet servers with failed replicas. Otherwise, we may
// end up in a situation where it's impossible to add a new non-voter
// replica to replace the failed ones.
need_to_evict_non_voter |=
has_non_voter_failed ||
has_non_voter_failed_unrecoverable;
// All the non-voter-related sub-cases are applicable only when there is at
// least one non-voter replica and a majority of voter replicas are on-line
// to commit the Raft configuration change.
const bool should_evict_non_voter = need_to_evict_non_voter &&
num_voters_healthy >= MajoritySize(num_voters_total);
bool need_to_evict_voter = false;
// The abundant case: can evict any voter replica. The code below will select
// the most appropriate candidate.
need_to_evict_voter |= num_voters_viable > replication_factor;
// Some voter replica has failed: we want to remove those aggressively
// to avoid polluting tablet servers with failed replicas. Otherwise, we may
// end up in a situation where it's impossible to add a new non-voter
// replica to replace the failed ones.
need_to_evict_voter |= (has_voter_failed || has_voter_failed_unrecoverable);
// If we already have enough healthy replicas running, it's safe to
// get rid of replicas in unknown health state.
need_to_evict_voter |=
num_voters_viable >= replication_factor &&
has_voter_unknown_health;
// Working with the replicas marked with the 'replace' attribute:
// the case when too many replicas are marked with the 'replace' attribute
// while all required replicas are healthy.
//
// In the special case when the leader replica is the only one marked with the
// 'replace' attribute, the leader replica cannot be evicted.
need_to_evict_voter |= (num_voters_healthy >= replication_factor) &&
!(num_voters_with_replace == 1 && leader_with_replace) &&
((num_voters_with_replace > replication_factor) ||
(num_voters_with_replace >= replication_factor && num_voters_viable > 0));
// Working with the replicas marked with the 'replace' attribute:
// the case where a few replicas are marked with the 'replace' attribute
// while all required replicas are healthy.
//
// In the special case when the leader replica is the only one marked with the
// 'replace' attribute, the leader replica cannot be evicted.
need_to_evict_voter |=
!(num_voters_with_replace == 1 && leader_with_replace) &&
(num_voters_with_replace > 0 && num_voters_healthy > replication_factor);
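// For example (illustrative, not part of the original comments): with
// replication_factor = 3 and four HEALTHY voters where a single follower is
// marked with the 'replace' attribute, the condition above is satisfied and
// that follower, whose eviction priority is boosted by the attribute, ends
// up as the top candidate in the voters' priority queue.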
// The voter-related sub-cases are applicable only when the total number of
// voter replicas is greater than the target replication factor or there is
// a non-recoverable failure; in addition, a majority of voter replicas must
// be online to commit the Raft configuration change.
const bool should_evict_voter = need_to_evict_voter &&
(num_voters_total > replication_factor ||
has_voter_failed_unrecoverable) &&
num_voters_healthy >= MajoritySize(num_voters_total - 1);
const bool should_evict = should_evict_non_voter || should_evict_voter;
// When voters and non-voters have the same type of failure, we evict
// non-voters first, but if there is an irreversibly failed voter and no
// irreversibly failed non-voter, then we evict that voter first. That's
// because a transiently failed non-voter might come back in good shape a few
// moments later. Also, getting rid of an irreversibly failed voter may be
// beneficial in configurations with an even number of voters: the majority
// has a better chance of staying actionable if another replica fails.
//
// So, the eviction priority order is:
// (1) unrecoverable non_voters
// (2) unrecoverable voters
// (3) evictable non_voters
// (4) evictable voters
string to_evict;
if (should_evict_non_voter && has_non_voter_failed_unrecoverable) {
CHECK(!pq_non_voters.empty());
to_evict = pq_non_voters.top().first;
} else if (should_evict_voter && has_voter_failed_unrecoverable) {
CHECK(!pq_voters.empty());
to_evict = pq_voters.top().first;
} else if (should_evict_non_voter) {
CHECK(!pq_non_voters.empty());
to_evict = pq_non_voters.top().first;
} else if (should_evict_voter) {
CHECK(!pq_voters.empty());
to_evict = pq_voters.top().first;
}
DCHECK((!should_evict && to_evict.empty()) ||
(should_evict && !to_evict.empty()));
if (should_evict) {
DCHECK(!to_evict.empty());
DCHECK_NE(leader_uuid, to_evict);
if (uuid_to_evict) {
*uuid_to_evict = to_evict;
}
}
VLOG(2) << "decision: should"
<< (should_evict ? "" : "not") << " evict replica "
<< (should_evict ? to_evict : "");
return should_evict;
}
} // namespace consensus
} // namespace kudu