blob: c7fe8f83717d6f66e9fb4dcec4c38025fa0dc0c5 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <functional>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include "kudu/consensus/consensus.pb.h"
#include "kudu/consensus/consensus_peers.h"
#include "kudu/consensus/metadata.pb.h"
#include "kudu/consensus/raft_consensus.h"
#include "kudu/gutil/macros.h"
#include "kudu/gutil/ref_counted.h"
#include "kudu/rpc/rpc_controller.h"
#include "kudu/util/locks.h"
#include "kudu/util/monotime.h"
#include "kudu/util/status.h"
namespace kudu {
namespace consensus {
// The vote a peer has given.
enum ElectionVote {
VOTE_DENIED = 0,
VOTE_GRANTED = 1,
};
// Simple class to count votes (in-memory, not persisted to disk).
// This class is not thread safe and requires external synchronization.
class VoteCounter {
public:
// Create new VoteCounter with the given majority size.
VoteCounter(int num_voters, int majority_size);
VoteCounter(VoteCounter&&) = default;
// Register a peer's vote.
//
// If the voter already has a vote recorded, but it has a different value than
// the vote specified, returns Status::IllegalArgument.
//
// If the same vote is duplicated, 'is_duplicate' is set to true.
// Otherwise, it is set to false.
// If an OK status is not returned, the value in 'is_duplicate' is undefined.
Status RegisterVote(const std::string& voter_uuid, ElectionVote vote, bool* is_duplicate);
// Return whether the vote is decided yet.
bool IsDecided() const;
// Return decision iff IsDecided() returns true.
// If vote is not yet decided, returns Status::IllegalState().
Status GetDecision(ElectionVote* decision) const;
// Return the total of "Yes" and "No" votes.
int GetTotalVotesCounted() const;
// Return total number of expected votes.
int GetTotalExpectedVotes() const { return num_voters_; }
// Return true iff GetTotalVotesCounted() == num_voters_;
bool AreAllVotesIn() const;
// Return a summary of the election so far, suitable for logging.
std::string GetElectionSummary() const;
private:
friend class VoteCounterTest;
typedef std::map<std::string, ElectionVote> VoteMap;
const int num_voters_;
const int majority_size_;
VoteMap votes_; // Voting record.
int yes_votes_; // Accumulated yes votes, for quick counting.
int no_votes_; // Accumulated no votes.
DISALLOW_COPY_AND_ASSIGN(VoteCounter);
};
// The result of a leader election.
struct ElectionResult {
public:
ElectionResult(VoteRequestPB request, ElectionVote election_decision,
ConsensusTerm highest_term, std::string msg);
// The vote request that was sent to the voters for this election.
const VoteRequestPB vote_request;
// The overall election GRANTED/DENIED decision of the configuration.
const ElectionVote decision;
// The highest term seen from any voter.
const ConsensusTerm highest_voter_term;
// Human-readable explanation of the vote result, if any.
const std::string message;
};
// Driver class to run a leader election.
//
// The caller must pass a callback to the driver, which will be called exactly
// once when a Yes/No decision has been made, except in case of Shutdown()
// on the Messenger or test ThreadPool, in which case no guarantee of a
// callback is provided. In that case, we should not care about the election
// result, because the server is ostensibly shutting down.
//
// For a "Yes" decision, a majority of voters must grant their vote.
//
// A "No" decision may be caused by either one of the following:
// - One of the peers replies with a higher term before a decision is made.
// - A majority of the peers votes "No".
//
// Any votes that come in after a decision has been made and the callback has
// been invoked are logged but ignored. Note that this somewhat strays from the
// letter of the Raft paper, in that replies that come after a "Yes" decision
// do not immediately cause the candidate/leader to step down, but this keeps
// our implementation and API simple, and the newly-minted leader will soon
// discover that it must step down when it attempts to replicate its first
// message to the peers.
//
// This class is thread-safe.
class LeaderElection : public RefCountedThreadSafe<LeaderElection> {
public:
typedef std::function<void(const ElectionResult&)> ElectionDecisionCallback;
// Set up a new leader election driver.
//
// 'proxy_factory' must not go out of scope while LeaderElection is alive.
//
// The 'vote_counter' must be initialized with the candidate's own yes vote.
LeaderElection(RaftConfigPB config,
PeerProxyFactory* proxy_factory,
VoteRequestPB request,
VoteCounter vote_counter,
MonoDelta timeout,
ElectionDecisionCallback decision_callback);
// Run the election: send the vote request to followers.
void Run();
private:
friend class RefCountedThreadSafe<LeaderElection>;
struct VoterState {
std::string peer_uuid;
std::unique_ptr<PeerProxy> proxy;
// If constructing the proxy failed (e.g. due to a DNS resolution issue)
// then 'proxy' will be NULL, and 'proxy_status' will contain the error.
Status proxy_status;
rpc::RpcController rpc;
VoteRequestPB request;
VoteResponsePB response;
std::string PeerInfo() const;
};
typedef std::unordered_map<std::string, std::unique_ptr<VoterState>>
VoterStateMap;
typedef simple_spinlock Lock;
// This class is refcounted.
~LeaderElection();
// Check to see if a decision has been made. If so, invoke decision callback.
// Calls the callback outside of holding a lock.
void CheckForDecision();
// Callback called when the RPC responds.
void VoteResponseRpcCallback(const std::string& voter_uuid);
// Record vote from specified peer.
void RecordVoteUnlocked(const VoterState& state, ElectionVote vote);
// Handle a peer that reponded with a term greater than the election term.
void HandleHigherTermUnlocked(const VoterState& state);
// Log and record a granted vote.
void HandleVoteGrantedUnlocked(const VoterState& state);
// Log the reason for a denied vote and record it.
void HandleVoteDeniedUnlocked(const VoterState& state);
// Returns a string to be prefixed to all log entries.
// This method accesses const members and is thread safe.
std::string LogPrefix() const;
// Helper to reference the term we are running the election for.
ConsensusTerm election_term() const { return request_.candidate_term(); }
// All non-const fields are protected by 'lock_'.
Lock lock_;
// The result returned by the ElectionDecisionCallback.
// NULL if not yet known.
std::unique_ptr<ElectionResult> result_;
// Whether we have responded via the callback yet.
bool has_responded_;
// Active Raft configuration at election start time.
const RaftConfigPB config_;
// Factory used in the creation of new proxies.
PeerProxyFactory* proxy_factory_;
// Election request to send to voters.
const VoteRequestPB request_;
// Object to count the votes.
VoteCounter vote_counter_;
// Timeout for sending RPCs.
const MonoDelta timeout_;
// Callback invoked to notify the caller of an election decision.
const ElectionDecisionCallback decision_callback_;
// Map of UUID -> VoterState.
VoterStateMap voter_state_;
// The highest term seen from a voter so far (or 0 if no votes).
int64_t highest_voter_term_;
};
} // namespace consensus
} // namespace kudu