// blob: 611ef3faec2fc6c1704737931d299b37d20c75bf [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
syntax = "proto2";
package kudu.consensus;
option java_package = "org.apache.kudu.consensus";
import "kudu/common/common.proto";
import "kudu/common/wire_protocol.proto";
import "kudu/consensus/metadata.proto";
import "kudu/consensus/opid.proto";
import "kudu/consensus/replica_management.proto";
import "kudu/rpc/rpc_header.proto";
import "kudu/tablet/metadata.proto";
import "kudu/tablet/tablet.proto";
import "kudu/tserver/tserver_admin.proto";
import "kudu/tserver/tserver.proto";
// Error payload for failures that are specific to consensus.
message ConsensusErrorPB {
  // Codes carried by consensus responses. One of these is set in the status
  // when an internal consensus error occurs that the caller has to handle
  // specially. No generic code is defined on purpose: generic errors should
  // be reported with tserver.TabletServerErrorPB.
  enum Code {
    UNKNOWN = 0;

    // Invalid term.
    // Returned by peers answering leader RPCs when the term of any message
    // sent in a batch is lower than the term the peer is expecting.
    INVALID_TERM = 2;

    // Leader election: the candidate's last logged OpId is older than the
    // last OpId logged by the local peer.
    LAST_OPID_TOO_OLD = 3;

    // Leader election: the local replica has already voted for another
    // candidate in this term.
    ALREADY_VOTED = 4;

    // The replica does not recognize the caller's request as coming from a
    // member of the configuration.
    NOT_IN_QUORUM = 5;

    // The responder's last entry did not match the caller's preceding entry.
    PRECEDING_ENTRY_DIDNT_MATCH = 6;

    // The local replica is either a leader itself, or has heard from a valid
    // leader more recently than the election timeout, and therefore believes
    // the leader to be alive.
    LEADER_IS_ALIVE = 7;

    // The local replica is in the middle of servicing another vote or an
    // update from a valid leader.
    CONSENSUS_BUSY = 8;

    // The local replica was unable to prepare a single op.
    CANNOT_PREPARE = 9;
  }

  // The consensus-specific error code.
  required Code code = 1;

  // The Status object for the error. It carries a textual message that may
  // be more useful in log messages and the like, although its own error code
  // is less specific than 'code'.
  required AppStatusPB status = 2;
}
// ===========================================================================
// External Consensus Messages
// ===========================================================================
// Operation types that need a commit message, i.e. operations that require
// at least one round of the consensus algorithm.
enum OperationType {
  UNKNOWN_OP = 0;
  NO_OP = 1;
  WRITE_OP = 3;
  ALTER_SCHEMA_OP = 4;
  CHANGE_CONFIG_OP = 5;
  PARTICIPANT_OP = 6;
}
// The op driver type: tells whether an op is being executed on a leader or
// on a replica.
enum DriverType {
  UNKNOWN_DRIVER = 0;
  LEADER = 1;
  REPLICA = 2;
}
// A configuration change record for the tablet with 'tablet_id'.
// The leader generates this message dynamically when AddServer() or
// RemoveServer() is called; it is the form that gets replicated to the log.
message ChangeConfigRecordPB {
  required bytes tablet_id = 1;

  // The old committed configuration, included for verification purposes.
  required RaftConfigPB old_config = 2;

  // The new configuration that should replace the old one.
  required RaftConfigPB new_config = 3;
}
// The kind of change requested by a config change request (see the 'type'
// fields of ChangeConfigRequestPB and BulkChangeConfigRequestPB).
enum ChangeConfigType {
  UNKNOWN_CHANGE = 0;
  ADD_PEER = 1;
  REMOVE_PEER = 2;
  MODIFY_PEER = 3;
}
// A configuration change request for the tablet with 'tablet_id'.
// Requests are restricted to one-by-one operations, as specified in Diego
// Ongaro's Raft PhD thesis.
// This is the RPC request; it does not end up in the log.
// See also ChangeConfigRecordPB.
message ChangeConfigRequestPB {
  // UUID of the server this request is addressed to.
  optional bytes dest_uuid = 4;

  required bytes tablet_id = 1;

  // The type of config change requested.
  // This field must be specified; it is declared optional only because it is
  // an enum.
  optional ChangeConfigType type = 2;

  // The peer to add or remove.
  // When 'type' == ADD_PEER, both the permanent_uuid and last_known_addr
  // fields must be set. Otherwise only the permanent_uuid field is required.
  optional RaftPeerPB server = 3;

  // The OpId index of the committed config to replace.
  // This optional parameter provides an atomic (compare-and-swap)
  // ChangeConfig operation: ChangeConfig() fails if the parameter is
  // specified and the committed config does not have a matching opid_index.
  // See also the definition of RaftConfigPB.
  optional int64 cas_config_opid_index = 5;
}
// A config change request that specifies several changes at once.
message BulkChangeConfigRequestPB {
  // One sub-request made as part of this bulk change config request.
  message ConfigChangeItemPB {
    // The type of config change.
    optional ChangeConfigType type = 1;

    // The peer to add, remove, or modify.
    // The same rules apply here as for ChangeConfigRequestPB.
    optional RaftPeerPB peer = 2;
  }

  // UUID of the server this request is addressed to.
  optional bytes dest_uuid = 1;

  required bytes tablet_id = 2;

  // The individual changes that make up this request.
  repeated ConfigChangeItemPB config_changes = 3;

  // The OpId index of the committed config to replace.
  // Serves the same purpose as the field of the same name in
  // ChangeConfigRequestPB.
  optional int64 cas_config_opid_index = 4;
}
// The configuration change response. If an immediate error occurred, the
// 'error' field is set with it; otherwise 'new_config' is set.
message ChangeConfigResponsePB {
  optional tserver.TabletServerErrorPB error = 1;

  // Updated configuration after changing the config.
  // NOTE(review): this field is typed RaftPeerPB although it is described as
  // a configuration; confirm the intended type before relying on it.
  optional RaftPeerPB new_config = 2;

  // The timestamp chosen by the server for this change config operation.
  // TODO: At the time of writing, this field is never set in the response.
  // TODO: Propagate signed timestamps. See KUDU-611.
  optional fixed64 timestamp = 3;
}
// A Replicate message. The leader sends it to replicas to indicate that the
// operation must be stored in the WAL/SM log, as part of the first phase of
// the two phase commit.
message ReplicateMsg {
  // The Raft operation ID (term and index) being replicated.
  required OpId id = 1;

  // The (hybrid or logical) timestamp assigned to this message.
  required fixed64 timestamp = 2;

  // Field number 3 appears only in the commented-out declaration below.
  // optional ExternalConsistencyMode external_consistency_mode = 3 [default = NO_CONSISTENCY];

  // The type of the operation being replicated.
  required OperationType op_type = 4;

  // Operation payloads.
  // NOTE(review): presumably only the payload matching 'op_type' is set;
  // confirm against the code that builds these messages.
  optional tserver.WriteRequestPB write_request = 5;
  optional tserver.AlterSchemaRequestPB alter_schema_request = 6;
  optional ChangeConfigRecordPB change_config_record = 7;
  optional tserver.ParticipantRequestPB participant_request = 9;

  // The client's request id for this message, if it is set.
  optional rpc.RequestIdPB request_id = 8;

  optional NoOpRequestPB noop_request = 999;
}
// A commit message for a previous operation.
// "Commit" is meant in the consensus sense: the message may abort or commit
// any operation that required a consensus round.
message CommitMsg {
  required OperationType op_type = 1;

  // The id of the message this commit pertains to.
  // (The field name's spelling is part of the generated API and is kept.)
  optional OpId commited_op_id = 2;

  // The operations that were applied and/or failed in this op.
  optional tablet.TxResultPB result = 3;
}
// ===========================================================================
// Internal Consensus Messages and State
// ===========================================================================
// NO_OP requests are replicated by a peer after it has been elected leader.
message NoOpRequestPB {
  // Lets tests attach a dummy payload.
  optional bytes payload_for_tests = 1;

  // True if the op id for this request is expected to increase monotonically
  // with the assigned timestamp. This is the case for no-ops sent by a leader
  // to mark a successful Raft election. When the field is not set, it is
  // assumed to be true.
  optional bool timestamp_in_opid_order = 2;
}
// Status message received in the peer responses.
message ConsensusStatusPB {
  // The last message received (and replicated) by the peer.
  required OpId last_received = 1;

  // The id of the last op that was replicated by the current leader.
  // The term of this op does not necessarily equal the current term, since
  // the current leader may be replicating ops from a prior term.
  // Unset if none has been received so far.
  //
  // When there is a log matching property error
  // (PRECEDING_ENTRY_DIDNT_MATCH), this field is important and may still be
  // set: the leader queue uses it in conjunction with last_received to decide
  // on the next id to send to the follower.
  //
  // NOTE: it might seem that the leader itself could track this from knowing
  // which batches were sent successfully. However, the follower is free to
  // truncate the batch if an operation in the middle of the batch fails to
  // prepare (e.g. due to memory limiting). In that case the leader gets a
  // success response but still needs to re-send some operations.
  optional OpId last_received_current_leader = 4;

  // The last committed index that is known to the peer.
  optional int64 last_committed_idx = 2;

  // Set when the last request failed for some consensus-related (internal)
  // reason. In some cases the error has a specific code that the caller has
  // to handle in a particular way.
  optional ConsensusErrorPB error = 3;
}
// A request from a candidate peer that wishes to become leader of the
// configuration serving the tablet with 'tablet_id'.
// See RAFT sec. 5.2.
message VoteRequestPB {
  // UUID of the server this request is addressed to.
  optional bytes dest_uuid = 6;

  // Identifies the tablet configuration the vote is being requested for.
  required string tablet_id = 1;

  // The uuid of the sending peer.
  required bytes candidate_uuid = 2;

  // The term a vote is being requested for.
  // If this term is higher than the callee's term, the callee updates its own
  // term to match and, if it is the current leader, steps down.
  required int64 candidate_term = 3;

  // The candidate node's status, so that the voter node can decide whether
  // to vote for it as LEADER.
  //
  // In particular, this includes the last OpId persisted in the candidate's
  // log, which corresponds to the lastLogIndex and lastLogTerm fields in
  // Raft. A replica must vote no for a candidate whose OpId is lower than
  // its own.
  required ConsensusStatusPB candidate_status = 4;

  // Normally, replicas deny a vote with a LEADER_IS_ALIVE error if they are
  // a leader or have recently heard from one. This prevents partitioned
  // nodes from disturbing liveness. If this flag is true, peers vote even if
  // they think a leader is alive. This can be used, for example, to force a
  // faster leader hand-off instead of waiting for the election timer to
  // expire.
  optional bool ignore_live_leader = 5 [default = false];

  // In a "pre-election", voters should respond how they _would_ have voted
  // without actually recording the vote.
  optional bool is_pre_election = 7 [default = false];
}
// A response from a replica to a leader election request.
message VoteResponsePB {
  // The uuid of the node sending the reply.
  optional bytes responder_uuid = 1;

  // The term of the node sending the reply.
  // Lets the candidate update itself if it is behind.
  optional int64 responder_term = 2;

  // True if this peer voted for the caller, false otherwise.
  optional bool vote_granted = 3;

  // TODO: Migrate ConsensusService to the AppStatusPB RPC style and merge
  // these errors.

  // Error message from the consensus implementation.
  optional ConsensusErrorPB consensus_error = 998;

  // A generic error message (such as tablet not found).
  optional tserver.TabletServerErrorPB error = 999;
}
// A consensus request message, the basic unit of a consensus round.
message ConsensusRequestPB {
  // UUID of the server this request is addressed to.
  optional bytes dest_uuid = 7;

  required string tablet_id = 1;

  // The uuid of the peer making the call.
  required bytes caller_uuid = 2;

  // The caller's term. Since only leaders can send messages, replicas accept
  // all messages as long as the term is equal to or higher than the last
  // term they know about. If a leader receives a request with a term higher
  // than its own, it steps down and enters FOLLOWER state (see Raft
  // sec. 5.1).
  required int64 caller_term = 3;

  // The id of the operation immediately preceding the first operation in
  // 'ops'. If the replica is receiving 'ops' for the first time,
  // 'preceding_id' must match the replica's last operation.
  //
  // This must be set if 'ops' is non-empty.
  optional OpId preceding_id = 4;

  // The index of the last committed operation in the configuration. This is
  // the index of the last operation the leader deemed committed from a
  // consensus standpoint (not the last operation the leader applied).
  //
  // Raft calls this field 'leaderCommit'.
  optional int64 committed_index = 8;

  // Deprecated field used in Kudu 0.10.0 and earlier. It remains here to
  // prevent accidental reuse and to provide a nicer error message if the
  // user attempts a rolling upgrade.
  optional OpId DEPRECATED_committed_index = 5;

  // Sequence of operations to be replicated by this peer.
  // They are committed once committed_index advances above their respective
  // OpIds. In some cases committed_index may indicate that these operations
  // are already committed, in which case they are committed during the same
  // request.
  repeated ReplicateMsg ops = 6;

  // The highest index that is known to be replicated by all members of the
  // configuration.
  //
  // NOTE: this is not necessarily monotonically increasing. For example, if
  // a node is in the process of being added to the configuration but has not
  // yet copied a snapshot, this value may drop to 0.
  optional int64 all_replicated_index = 9;

  // The safe timestamp on the leader.
  // Only set if the leader has no messages to send to the peer, or if the
  // last sent message is already (raft) committed. By setting it, the leader
  // allows followers to advance the "safe time" past the timestamp of the
  // last committed message and answer snapshot scans in the present in the
  // absence of writes.
  optional fixed64 safe_timestamp = 10;

  // The index of the most recent operation appended to the leader.
  // Followers can use this to determine roughly how far behind the leader
  // they are.
  optional int64 last_idx_appended_to_leader = 11;
}
// The response to a ConsensusRequestPB (returned by the UpdateConsensus RPC).
message ConsensusResponsePB {
  // The uuid of the peer making the response.
  optional bytes responder_uuid = 1;

  // The current term of the peer making the response.
  // Used to update the caller (and make it step down if it is out of date).
  optional int64 responder_term = 2;

  // The current consensus status of the receiver peer.
  optional ConsensusStatusPB status = 3;

  // Whether the server that hosts the peer is quiescing. This does not
  // necessarily have any bearing on the state of the Raft peer itself, but it
  // does indicate that the peer should not be a candidate for leadership.
  optional bool server_quiescing = 4;

  // A generic error message (such as tablet not found). Per-operation error
  // messages are sent along with the consensus status.
  optional tserver.TabletServerErrorPB error = 999;
}
// A message reflecting the status of an in-flight op.
message OpStatusPB {
  required OpId op_id = 1;
  required OperationType op_type = 2;

  // How long the op has been in flight, in microseconds.
  required int64 running_for_micros = 3;

  // Quick human-readable description (e.g., ToString() output).
  required string description = 4;

  // When tracing is enabled while viewing the op, the trace buffer is copied
  // into this field.
  optional string trace_buffer = 6;
}
// Request for the GetNodeInstance RPC. It currently declares no fields.
message GetNodeInstanceRequestPB {}
// Response for the GetNodeInstance RPC.
message GetNodeInstanceResponsePB {
  // The node instance being reported.
  required NodeInstancePB node_instance = 1;
}
// Asks the local peer to run a leader election so that it may be elected
// leader. Assumes that a tablet with 'tablet_id' exists.
message RunLeaderElectionRequestPB {
  // UUID of the server this request is addressed to.
  optional bytes dest_uuid = 2;

  // The id of the tablet.
  required bytes tablet_id = 1;
}
// Response for the RunLeaderElection RPC.
message RunLeaderElectionResponsePB {
  // A generic error message (such as tablet not found).
  optional tserver.TabletServerErrorPB error = 1;
}
// How a leader should relinquish its leadership when asked to step down.
enum LeaderStepDownMode {
  // The leader steps down immediately.
  ABRUPT = 1;

  // The leader attempts to arrange for a successor to be elected ASAP.
  // If it cannot do so, it remains leader.
  GRACEFUL = 2;
}
// Request for the LeaderStepDown RPC.
message LeaderStepDownRequestPB {
  // UUID of the server this request is addressed to.
  optional bytes dest_uuid = 2;

  // The id of the tablet.
  required bytes tablet_id = 1;

  // How the leader will attempt to relinquish its leadership.
  // No explicit default is declared, so under proto2 rules an unset field
  // reads as the first value listed in LeaderStepDownMode.
  optional LeaderStepDownMode mode = 3;

  // The UUID of the peer that should be promoted to leader in GRACEFUL mode.
  // If unset, the leader selects a successor.
  // Setting this field in ABRUPT mode is illegal.
  optional bytes new_leader_uuid = 4;
}
// Response for the LeaderStepDown RPC.
message LeaderStepDownResponsePB {
  // A generic error message (such as tablet not found).
  optional tserver.TabletServerErrorPB error = 1;
}
// Selects which OpId a GetLastOpIdRequestPB asks for: the last-received or
// the last-committed one.
enum OpIdType {
  UNKNOWN_OPID_TYPE = 0;
  RECEIVED_OPID = 1;
  COMMITTED_OPID = 2;
}
// Request for the GetLastOpId RPC.
message GetLastOpIdRequestPB {
  // UUID of the server this request is addressed to.
  optional bytes dest_uuid = 2;

  // The id of the tablet.
  required bytes tablet_id = 1;

  // Whether to return the last-received or the last-committed OpId.
  // Defaults to the last-received OpId.
  optional OpIdType opid_type = 3 [default = RECEIVED_OPID];
}
// Response for the GetLastOpId RPC.
message GetLastOpIdResponsePB {
  // The requested OpId.
  optional OpId opid = 1;

  // A generic error message (such as tablet not found).
  optional tserver.TabletServerErrorPB error = 2;
}
// Tells whether a health report should be included in the consensus state;
// see GetConsensusStateRequestPB.report_health.
enum IncludeHealthReport {
  UNSPECIFIED_HEALTH_REPORT = 0;
  EXCLUDE_HEALTH_REPORT = 1;
  INCLUDE_HEALTH_REPORT = 2;
}
// Request for the GetConsensusState RPC.
message GetConsensusStateRequestPB {
  // UUID of the server this request is addressed to.
  optional bytes dest_uuid = 1;

  // The ids of the tablets.
  // An empty list means: return info for all tablets known to the tablet
  // server.
  repeated bytes tablet_ids = 2;

  // If set to INCLUDE_HEALTH_REPORT, a health report is included inline in
  // the consensus state PB. Even then, only the leader replica returns a
  // health report for the members of the config.
  optional IncludeHealthReport report_health = 3;
}
// Response for the GetConsensusState RPC.
message GetConsensusStateResponsePB {
  // Consensus information for a single tablet.
  message TabletConsensusInfoPB {
    // The id of the tablet this entry describes.
    required bytes tablet_id = 1;

    // The consensus state reported for the tablet.
    optional ConsensusStatePB cstate = 2;
  }

  // One entry per tablet reported on.
  repeated TabletConsensusInfoPB tablets = 1;

  optional ReplicaManagementInfoPB replica_management_info = 3;

  // NOTE(review): presumably set when the request as a whole fails; confirm
  // against the service implementation.
  optional tserver.TabletServerErrorPB error = 2;
}
// Request for the StartTabletCopy RPC.
message StartTabletCopyRequestPB {
  // UUID of the server this request is addressed to.
  optional bytes dest_uuid = 5;

  required bytes tablet_id = 1;

  // Identification of the host being copied from.
  // TODO: Consider renaming these to copy_source_*.
  required bytes copy_peer_uuid = 2;
  required HostPortPB copy_peer_addr = 3;

  // The caller's term. If the target of this request has a TOMBSTONED
  // replica with a term higher than this one, the request fails.
  optional int64 caller_term = 4 [default = -1];
}
// Response for the StartTabletCopy RPC.
message StartTabletCopyResponsePB {
  optional tserver.TabletServerErrorPB error = 1;
}
// An unsafe change configuration request for the tablet with 'tablet_id'.
message UnsafeChangeConfigRequestPB {
  // UUID of the server this request is addressed to.
  optional bytes dest_uuid = 1;

  required bytes tablet_id = 2;

  // Sender identification; it could be a static string as well.
  required bytes caller_id = 3;

  // The raft config sent to the destination server.
  // Only the 'permanent_uuid' of each peer in the config is required
  // (address-related information is ignored by the server).
  // The peers specified in 'new_config' must be a subset of (or equal to)
  // the peers in the committed config on the destination replica.
  required RaftConfigPB new_config = 4;
}
// The unsafe change configuration response. The 'error' field is set if the
// operation failed.
message UnsafeChangeConfigResponsePB {
  optional tserver.TabletServerErrorPB error = 1;
}
// A Raft implementation.
service ConsensusService {
  option (kudu.rpc.default_authz_method) = "AuthorizeServiceUser";

  // Analogous to AppendEntries in Raft, but only used for followers.
  rpc UpdateConsensus(ConsensusRequestPB) returns (ConsensusResponsePB);

  // RequestVote() from Raft.
  rpc RequestConsensusVote(VoteRequestPB) returns (VoteResponsePB);

  // Implements all of the one-by-one config change operations, including
  // AddServer() and RemoveServer() from the Raft specification, as well as
  // an operation to change the role of a server between VOTER and NON_VOTER.
  // An OK response means the operation was successful.
  rpc ChangeConfig(ChangeConfigRequestPB) returns (ChangeConfigResponsePB);

  // Implements a one-by-one config change interface that allows multiple
  // peers to be modified at once, including setting attributes and adding or
  // removing various peers, as long as no more than one voter is added,
  // removed, or demoted in a single operation.
  rpc BulkChangeConfig(BulkChangeConfigRequestPB) returns (ChangeConfigResponsePB);

  // Implements the unsafe config change operation for manual recovery use
  // cases.
  rpc UnsafeChangeConfig(UnsafeChangeConfigRequestPB) returns (UnsafeChangeConfigResponsePB);

  // Returns the node instance in a GetNodeInstanceResponsePB.
  rpc GetNodeInstance(GetNodeInstanceRequestPB) returns (GetNodeInstanceResponsePB);

  // Force this node to run a leader election.
  rpc RunLeaderElection(RunLeaderElectionRequestPB) returns (RunLeaderElectionResponsePB);

  // Force this node to step down as leader.
  rpc LeaderStepDown(LeaderStepDownRequestPB) returns (LeaderStepDownResponsePB);

  // Returns the last-received or last-committed OpId of a tablet, as selected
  // by the request.
  rpc GetLastOpId(GetLastOpIdRequestPB) returns (GetLastOpIdResponsePB);

  // Returns the consensus state for a set of tablets.
  // Does not return information for tombstoned tablets.
  rpc GetConsensusState(GetConsensusStateRequestPB) returns (GetConsensusStateResponsePB);

  // Instruct this server to copy a tablet from another host.
  rpc StartTabletCopy(StartTabletCopyRequestPB) returns (StartTabletCopyResponsePB);
}