| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| syntax = "proto2"; |
| package kudu.consensus; |
| |
| option java_package = "org.apache.kudu.consensus"; |
| |
| import "kudu/common/common.proto"; |
| import "kudu/common/wire_protocol.proto"; |
| import "kudu/consensus/metadata.proto"; |
| import "kudu/consensus/opid.proto"; |
| import "kudu/consensus/replica_management.proto"; |
| import "kudu/rpc/rpc_header.proto"; |
| import "kudu/tablet/metadata.proto"; |
| import "kudu/tablet/tablet.proto"; |
| import "kudu/tserver/tserver_admin.proto"; |
| import "kudu/tserver/tserver.proto"; |
| |
// Error payload used to report consensus-specific failures.
message ConsensusErrorPB {
  // Codes placed in the status when a consensus-internal error occurs that
  // the caller has to handle specially. There is deliberately no generic
  // code: generic errors should use tserver.TabletServerErrorPB instead.
  //
  // NOTE(review): value 1 is not assigned in this enum; confirm it is retired
  // before reusing it.
  enum Code {
    UNKNOWN = 0;

    // Invalid term. Sent by peers in response to leader RPCs whenever the
    // term of one of the messages in a batch is lower than the term the peer
    // is expecting.
    INVALID_TERM = 2;

    // Leader election: the last OpId logged by the candidate is older than
    // the last OpId logged by the local peer.
    LAST_OPID_TOO_OLD = 3;

    // Leader election: the local replica has already voted for another
    // candidate in this term.
    ALREADY_VOTED = 4;

    // The replica does not recognize the caller's request as coming from a
    // member of the configuration.
    NOT_IN_QUORUM = 5;

    // The responder's last entry did not match the caller's preceding entry.
    PRECEDING_ENTRY_DIDNT_MATCH = 6;

    // The local replica is either a leader itself or has heard from a valid
    // leader more recently than the election timeout, and therefore believes
    // the leader to be alive.
    LEADER_IS_ALIVE = 7;

    // The local replica is busy servicing either another vote or an update
    // from a valid leader.
    CONSENSUS_BUSY = 8;

    // The local replica was unable to prepare a single op.
    CANNOT_PREPARE = 9;
  }

  // The consensus error code.
  required Code code = 1;

  // Status object for the error. It carries a textual message that may be
  // more useful in log messages and the like, even though its own error code
  // is less specific than 'code'.
  required AppStatusPB status = 2;
}
| |
| // =========================================================================== |
| // External Consensus Messages |
| // =========================================================================== |
| |
// Operation types that need a commit message, i.e. those that go through at
// least one round of the consensus algorithm.
//
// NOTE(review): value 2 is not assigned in this enum; confirm it is retired
// before reusing it.
enum OperationType {
  UNKNOWN_OP = 0;
  NO_OP = 1;
  WRITE_OP = 3;
  ALTER_SCHEMA_OP = 4;
  CHANGE_CONFIG_OP = 5;
  PARTICIPANT_OP = 6;
}
| |
// Op driver type: tells whether an op is being executed on a leader replica
// or on a follower replica.
enum DriverType {
  UNKNOWN_DRIVER = 0;
  LEADER = 1;
  FOLLOWER = 2;
}
| |
// Replicated record of a configuration change for the tablet with
// 'tablet_id'. The leader generates this message dynamically when AddServer()
// or RemoveServer() is called, and this is the form that gets replicated to
// the log.
message ChangeConfigRecordPB {
  required bytes tablet_id = 1;

  // The old committed configuration, kept for verification purposes.
  required RaftConfigPB old_config = 2;

  // The new configuration to switch to.
  required RaftConfigPB new_config = 3;
}
| |
// Kind of change carried by a config change request. Used by
// ChangeConfigRequestPB.type and by
// BulkChangeConfigRequestPB.ConfigChangeItemPB.type.
enum ChangeConfigType {
  UNKNOWN_CHANGE = 0;
  ADD_PEER = 1;
  REMOVE_PEER = 2;
  MODIFY_PEER = 3;
}
| |
// RPC request asking for a configuration change on the tablet with
// 'tablet_id'. Requests are restricted to one-by-one operations, as specified
// in Diego Ongaro's Raft PhD thesis. This message is the RPC request only; it
// does not end up in the log. See also ChangeConfigRecordPB.
message ChangeConfigRequestPB {
  // UUID of the server this request is addressed to.
  optional bytes dest_uuid = 4;

  required bytes tablet_id = 1;

  // The kind of config change requested. Callers must specify this field; it
  // is declared optional only because it is an enum.
  optional ChangeConfigType type = 2;

  // The peer to add or remove. When 'type' == ADD_PEER, both the
  // permanent_uuid and last_known_addr fields must be set. Otherwise only
  // permanent_uuid is required.
  optional RaftPeerPB server = 3;

  // OpId index of the committed config to replace. This optional parameter
  // provides an atomic (compare-and-swap) ChangeConfig operation: if it is
  // specified and the committed config does not have a matching opid_index,
  // ChangeConfig() fails. See also the definition of RaftConfigPB.
  optional int64 cas_config_opid_index = 5;
}
| |
// A config change request that bundles several changes together.
message BulkChangeConfigRequestPB {
  // One sub-request made as part of the bulk change config request.
  message ConfigChangeItemPB {
    // The kind of config change.
    optional ChangeConfigType type = 1;
    // The peer to add, remove, or modify. The rules described for
    // ChangeConfigRequestPB.server apply here as well.
    optional RaftPeerPB peer = 2;
  }

  // UUID of the server this request is addressed to.
  optional bytes dest_uuid = 1;

  required bytes tablet_id = 2;

  // The individual changes that make up this request.
  repeated ConfigChangeItemPB config_changes = 3;

  // OpId index of the committed config to replace. Serves the same purpose as
  // ChangeConfigRequestPB.cas_config_opid_index.
  optional int64 cas_config_opid_index = 4;
}
| |
// Response to a configuration change. If an immediate error occurred, 'error'
// is set; otherwise 'new_config' is set.
message ChangeConfigResponsePB {
  optional tserver.TabletServerErrorPB error = 1;

  // Updated configuration after changing the config.
  // NOTE(review): the field is declared as RaftPeerPB although its
  // description talks about a configuration -- confirm the intended type
  // before relying on its contents.
  optional RaftPeerPB new_config = 2;

  // Timestamp chosen by the server for this change config operation.
  // TODO: At the time of writing, this field is never set in the response.
  // TODO: Propagate signed timestamps. See KUDU-611.
  optional fixed64 timestamp = 3;
}
| |
// A Replicate message, sent by the leader to replicas to indicate that this
// operation must be stored in the WAL/SM log, as part of the first phase of
// the two-phase commit.
message ReplicateMsg {
  // Tag 3 belongs to the disabled 'external_consistency_mode' field shown in
  // the commented-out declaration below. Reserving the number and the name
  // makes protoc reject any new field that would reuse them with a different
  // meaning; it changes neither the wire format nor the generated API.
  reserved 3;
  reserved "external_consistency_mode";

  // The Raft operation ID (term and index) being replicated.
  required OpId id = 1;
  // The (hybrid or logical) timestamp assigned to this message.
  required fixed64 timestamp = 2;
  // optional ExternalConsistencyMode external_consistency_mode = 3 [default = NO_CONSISTENCY];
  required OperationType op_type = 4;

  // Operation payloads.
  // NOTE(review): presumably the payload that is set matches 'op_type' --
  // confirm against the code that builds and consumes these messages.
  optional tserver.WriteRequestPB write_request = 5;
  optional tserver.AlterSchemaRequestPB alter_schema_request = 6;
  optional ChangeConfigRecordPB change_config_record = 7;
  optional tserver.ParticipantRequestPB participant_request = 9;

  // The client's request id for this message, if it is set.
  optional rpc.RequestIdPB request_id = 8;

  optional NoOpRequestPB noop_request = 999;
}
| |
// Commit message for a previous operation. "Commit" is meant in the consensus
// sense: the message may abort or commit any operation that required a
// consensus round.
message CommitMsg {
  required OperationType op_type = 1;
  // The id of the message this commit pertains to.
  // The misspelled name ("commited") is part of the generated API and of the
  // text format; do not rename it.
  optional OpId commited_op_id = 2;
  // The operations that were applied and/or failed in this op.
  optional tablet.TxResultPB result = 3;
}
| |
| // =========================================================================== |
| // Internal Consensus Messages and State |
| // =========================================================================== |
| |
// Payload of a NO_OP request. NO_OP requests are replicated by a peer after
// it has been elected leader.
message NoOpRequestPB {
  // Lets tests attach a dummy payload.
  optional bytes payload_for_tests = 1;

  // True if the op id for this request is expected to increase monotonically
  // with the assigned timestamp. This holds for no-ops sent by a leader to
  // mark a successful Raft election. When the field is not set, it is assumed
  // to be true.
  optional bool timestamp_in_opid_order = 2;
}
| |
// Status message carried in peer responses.
message ConsensusStatusPB {

  // The last message received (and replicated) by the peer.
  required OpId last_received = 1;

  // The id of the last op that was replicated by the current leader. The term
  // of this op does not necessarily equal the current term, because the
  // current leader may be replicating ops from a prior term. Unset if nothing
  // has been received yet.
  //
  // When there is a log matching property error
  // (PRECEDING_ENTRY_DIDNT_MATCH), this field is important and may still be
  // set: the leader queue uses it in conjunction with last_received to decide
  // on the next id to send to the follower.
  //
  // NOTE: it might seem that the leader could track this by itself, based on
  // which batches were sent successfully. However, the follower is free to
  // truncate a batch if an operation in the middle of it fails to prepare
  // (e.g. due to memory limiting). In that case the leader gets a success
  // response but still needs to re-send some operations.
  optional OpId last_received_current_leader = 4;

  // The last committed index that is known to the peer.
  optional int64 last_committed_idx = 2;

  // Set when the last request failed for some consensus-related (internal)
  // reason. In some cases the error has a specific code that the caller has
  // to handle in certain ways.
  optional ConsensusErrorPB error = 3;
}
| |
// Request from a candidate peer that wishes to become leader of the
// configuration serving the tablet with 'tablet_id'.
// See RAFT sec. 5.2.
message VoteRequestPB {
  // UUID of the server this request is addressed to.
  optional bytes dest_uuid = 6;

  // Identifies the tablet configuration the vote is being requested for.
  required string tablet_id = 1;

  // The uuid of the sending peer.
  required bytes candidate_uuid = 2;

  // The term a vote is being requested for. If this term is higher than the
  // callee's term, the callee updates its own term to match and, if it is the
  // current leader, steps down.
  required int64 candidate_term = 3;

  // Status of the candidate node, so that the voter can decide whether to
  // vote for it as LEADER.
  //
  // In particular, this includes the last OpId persisted in the candidate's
  // log, which corresponds to the lastLogIndex and lastLogTerm fields in
  // Raft. A replica must vote no for a candidate whose OpId is lower than its
  // own.
  required ConsensusStatusPB candidate_status = 4;

  // Normally, replicas deny a vote with a LEADER_IS_ALIVE error if they are a
  // leader or recently heard from one; this keeps partitioned nodes from
  // disturbing liveness. When this flag is true, peers vote even if they
  // think a leader is alive. This can be used, for example, to force a faster
  // leader hand-off instead of waiting for the election timer to expire.
  optional bool ignore_live_leader = 5 [default = false];

  // In a "pre-election", voters should respond how they _would_ have voted
  // without actually recording the vote.
  optional bool is_pre_election = 7 [default = false];
}
| |
// Response from a replica to a leader election request.
message VoteResponsePB {
  // The uuid of the node sending the reply.
  optional bytes responder_uuid = 1;

  // The term of the node sending the reply. Lets the candidate update itself
  // if it is behind.
  optional int64 responder_term = 2;

  // True if this peer voted for the caller, false otherwise.
  optional bool vote_granted = 3;

  // TODO: Migrate ConsensusService to the AppStatusPB RPC style and merge
  // these errors.
  // Error message from the consensus implementation.
  optional ConsensusErrorPB consensus_error = 998;

  // A generic error message (such as tablet not found).
  optional tserver.TabletServerErrorPB error = 999;
}
| |
// A consensus request message: the basic unit of a consensus round.
message ConsensusRequestPB {
  // UUID of the server this request is addressed to.
  optional bytes dest_uuid = 7;

  required string tablet_id = 1;

  // The uuid of the peer making the call.
  required bytes caller_uuid = 2;

  // The caller's term. Since only leaders can send messages, replicas accept
  // all messages as long as the term is equal to or higher than the last term
  // they know about. If a leader receives a request with a term higher than
  // its own, it steps down and enters FOLLOWER state (see Raft sec. 5.1).
  required int64 caller_term = 3;

  // The id of the operation immediately preceding the first operation in
  // 'ops'. If the replica is receiving 'ops' for the first time,
  // 'preceding_id' must match the replica's last operation.
  //
  // This must be set if 'ops' is non-empty.
  optional OpId preceding_id = 4;

  // The index of the last committed operation in the configuration. This is
  // the index of the last operation the leader deemed committed from a
  // consensus standpoint (not the last operation the leader applied).
  //
  // Raft calls this field 'leaderCommit'.
  optional int64 committed_index = 8;

  // Deprecated field used in Kudu 0.10.0 and earlier. It remains here to
  // prevent accidental reuse and to provide a nicer error message if the user
  // attempts a rolling upgrade.
  optional OpId DEPRECATED_committed_index = 5;

  // Sequence of operations to be replicated by this peer. They are committed
  // when committed_index advances above their respective OpIds. In some cases
  // committed_index can indicate that these operations are already committed,
  // in which case they are committed during the same request.
  repeated ReplicateMsg ops = 6;

  // The highest index that is known to be replicated by all members of the
  // configuration.
  //
  // NOTE: this is not necessarily monotonically increasing. For example, if a
  // node is in the process of being added to the configuration but has not
  // yet copied a snapshot, this value may drop to 0.
  optional int64 all_replicated_index = 9;

  // The safe timestamp on the leader. It is only set if the leader has no
  // messages to send to the peer, or if the last sent message is already
  // (raft) committed. By setting it, the leader allows followers to advance
  // the "safe time" past the timestamp of the last committed message and to
  // answer snapshot scans in the present in the absence of writes.
  optional fixed64 safe_timestamp = 10;

  // The index of the most recent operation appended to the leader. Followers
  // can use it to determine roughly how far behind the leader they are.
  optional int64 last_idx_appended_to_leader = 11;
}
| |
// Response to a ConsensusRequestPB (returned by
// ConsensusService.UpdateConsensus).
message ConsensusResponsePB {
  // The uuid of the peer making the response.
  optional bytes responder_uuid = 1;

  // The current term of the peer making the response. Used to update the
  // caller, and to make it step down if it is out of date.
  optional int64 responder_term = 2;

  // The current consensus status of the receiver peer.
  optional ConsensusStatusPB status = 3;

  // Whether the server that hosts the peer is quiescing. This does not
  // necessarily have any bearing on the state of the Raft peer itself, but it
  // does indicate that the peer should not be a candidate for leadership.
  optional bool server_quiescing = 4;

  // A generic error message (such as tablet not found). Per-operation error
  // messages are sent along with the consensus status.
  optional tserver.TabletServerErrorPB error = 999;
}
| |
// Describes the status of an in-flight op.
//
// NOTE(review): tag 5 is not assigned in this message; confirm it is retired
// before reusing it.
message OpStatusPB {
  required OpId op_id = 1;
  required OperationType op_type = 2;
  // How long the op has been in flight, in microseconds.
  required int64 running_for_micros = 3;
  // Quick human-readable description (e.g., ToString() output).
  required string description = 4;

  // If tracing is enabled when the op is viewed, the trace buffer is copied
  // here.
  optional string trace_buffer = 6;
}
| |
// Request for ConsensusService.GetNodeInstance. Carries no fields.
message GetNodeInstanceRequestPB {
}
| |
// Response for ConsensusService.GetNodeInstance.
message GetNodeInstanceResponsePB {
  // The node instance being reported.
  required NodeInstancePB node_instance = 1;
}
| |
// Asks the local peer to run a leader election in order to be elected leader.
// Assumes that a tablet with 'tablet_id' exists.
message RunLeaderElectionRequestPB {
  // UUID of the server this request is addressed to.
  optional bytes dest_uuid = 2;

  // The id of the tablet.
  required bytes tablet_id = 1;
}
| |
// Response for ConsensusService.RunLeaderElection.
message RunLeaderElectionResponsePB {
  // A generic error message (such as tablet not found).
  optional tserver.TabletServerErrorPB error = 1;
}
| |
// How a leader should relinquish its leadership; see
// LeaderStepDownRequestPB.mode.
//
// This enum has no zero value. In proto2 an unset enum field without an
// explicit default reads as the first value listed, so ABRUPT must stay first
// to keep the current default.
enum LeaderStepDownMode {
  // The leader steps down immediately.
  ABRUPT = 1;
  // The leader attempts to arrange for a successor to be elected ASAP. If it
  // cannot do so, it remains leader.
  GRACEFUL = 2;
}
| |
// Request for ConsensusService.LeaderStepDown.
message LeaderStepDownRequestPB {
  // UUID of the server this request is addressed to.
  optional bytes dest_uuid = 2;

  // The id of the tablet.
  required bytes tablet_id = 1;

  // How the leader will attempt to relinquish its leadership.
  optional LeaderStepDownMode mode = 3;

  // UUID of the peer that should be promoted to leader in GRACEFUL mode. If
  // unset, the leader selects a successor. Setting this field in ABRUPT mode
  // is illegal.
  optional bytes new_leader_uuid = 4;
}
| |
// Response for ConsensusService.LeaderStepDown.
message LeaderStepDownResponsePB {
  // A generic error message (such as tablet not found).
  optional tserver.TabletServerErrorPB error = 1;
}
| |
// Selects which OpId a GetLastOpIdRequestPB asks for: the last-received one
// or the last-committed one.
enum OpIdType {
  UNKNOWN_OPID_TYPE = 0;
  RECEIVED_OPID = 1;
  COMMITTED_OPID = 2;
}
| |
// Request for ConsensusService.GetLastOpId.
message GetLastOpIdRequestPB {
  // UUID of the server this request is addressed to.
  optional bytes dest_uuid = 2;

  // The id of the tablet.
  required bytes tablet_id = 1;

  // Whether to return the last-received or the last-committed OpId.
  optional OpIdType opid_type = 3 [default = RECEIVED_OPID];
}
| |
// Response for ConsensusService.GetLastOpId.
message GetLastOpIdResponsePB {
  // The requested OpId.
  optional OpId opid = 1;
  // A generic error message (such as tablet not found).
  optional tserver.TabletServerErrorPB error = 2;
}
| |
// Tells whether a health report should be included inline in the consensus
// state; see GetConsensusStateRequestPB.report_health.
enum IncludeHealthReport {
  UNSPECIFIED_HEALTH_REPORT = 0;
  EXCLUDE_HEALTH_REPORT = 1;
  INCLUDE_HEALTH_REPORT = 2;
}
| |
// Request for ConsensusService.GetConsensusState.
message GetConsensusStateRequestPB {
  // UUID of the server this request is addressed to.
  optional bytes dest_uuid = 1;

  // The ids of the tablets. An empty list means: return info for all tablets
  // known to the tablet server.
  repeated bytes tablet_ids = 2;

  // When set to INCLUDE_HEALTH_REPORT, a health report is included inline in
  // the consensus state PB. Even then, only the leader replica returns a
  // health report for the members of the config.
  optional IncludeHealthReport report_health = 3;
}
| |
// Response for ConsensusService.GetConsensusState.
message GetConsensusStateResponsePB {
  // Consensus state reported for a single tablet.
  message TabletConsensusInfoPB {
    required bytes tablet_id = 1;
    optional ConsensusStatePB cstate = 2;
  }
  // One entry per reported tablet.
  repeated TabletConsensusInfoPB tablets = 1;

  optional ReplicaManagementInfoPB replica_management_info = 3;

  optional tserver.TabletServerErrorPB error = 2;
}
| |
// Request for ConsensusService.StartTabletCopy.
message StartTabletCopyRequestPB {
  // UUID of the server this request is addressed to.
  optional bytes dest_uuid = 5;

  required bytes tablet_id = 1;

  // Identification of the host to copy from.
  // TODO: Consider renaming these to copy_source_*.
  required bytes copy_peer_uuid = 2;
  required HostPortPB copy_peer_addr = 3;

  // The caller's term. If the target of this request has a TOMBSTONED replica
  // with a term higher than this one, the request fails.
  optional int64 caller_term = 4 [default = -1];
}
| |
// Response for ConsensusService.StartTabletCopy.
message StartTabletCopyResponsePB {
  optional tserver.TabletServerErrorPB error = 1;
}
| |
// Unsafe change configuration request for the tablet with 'tablet_id'.
message UnsafeChangeConfigRequestPB {
  // UUID of the server this request is addressed to.
  optional bytes dest_uuid = 1;
  required bytes tablet_id = 2;

  // Identifies the sender; it could be a static string as well.
  required bytes caller_id = 3;

  // The raft config sent to the destination server. Only the 'permanent_uuid'
  // of each peer in the config is required (address-related information is
  // ignored by the server). The peers specified in 'new_config' must be a
  // subset of (or equal to) the peers in the committed config on the
  // destination replica.
  required RaftConfigPB new_config = 4;
}
| |
// Response to an unsafe change configuration request. 'error' is set if the
// operation failed.
message UnsafeChangeConfigResponsePB {
  optional tserver.TabletServerErrorPB error = 1;
}
| |
// RPC service exposing a Raft implementation.
service ConsensusService {
  option (kudu.rpc.default_authz_method) = "AuthorizeServiceUser";

  // Analogous to AppendEntries in Raft, but only used for followers.
  rpc UpdateConsensus(ConsensusRequestPB)
      returns (ConsensusResponsePB);

  // RequestVote() from Raft.
  rpc RequestConsensusVote(VoteRequestPB)
      returns (VoteResponsePB);

  // Implements all of the one-by-one config change operations, including
  // AddServer() and RemoveServer() from the Raft specification, as well as an
  // operation to change the role of a server between VOTER and NON_VOTER.
  // An OK response means the operation was successful.
  rpc ChangeConfig(ChangeConfigRequestPB)
      returns (ChangeConfigResponsePB);

  // Implements a one-by-one config change interface that allows multiple
  // peers to be modified at once, including setting attributes and adding or
  // removing various peers, as long as not more than one voter is added,
  // removed, or demoted in a single operation.
  rpc BulkChangeConfig(BulkChangeConfigRequestPB)
      returns (ChangeConfigResponsePB);

  // Implements the unsafe config change operation for manual recovery use
  // cases.
  rpc UnsafeChangeConfig(UnsafeChangeConfigRequestPB)
      returns (UnsafeChangeConfigResponsePB);

  // Returns the node instance in a GetNodeInstanceResponsePB.
  rpc GetNodeInstance(GetNodeInstanceRequestPB)
      returns (GetNodeInstanceResponsePB);

  // Force this node to run a leader election.
  rpc RunLeaderElection(RunLeaderElectionRequestPB)
      returns (RunLeaderElectionResponsePB);

  // Force this node to step down as leader.
  rpc LeaderStepDown(LeaderStepDownRequestPB)
      returns (LeaderStepDownResponsePB);

  // Returns the last-received or last-committed OpId of a tablet, as selected
  // by the request's 'opid_type'.
  rpc GetLastOpId(GetLastOpIdRequestPB)
      returns (GetLastOpIdResponsePB);

  // Returns the consensus state for a set of tablets.
  // Does not return information for tombstoned tablets.
  rpc GetConsensusState(GetConsensusStateRequestPB)
      returns (GetConsensusStateResponsePB);

  // Instruct this server to copy a tablet from another host.
  rpc StartTabletCopy(StartTabletCopyRequestPB)
      returns (StartTabletCopyResponsePB);
}