| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
// This file relies on proto2-only constructs ('required' labels and explicit
// field defaults), so the syntax is declared explicitly rather than left to
// the compiler's implicit default.
syntax = "proto2";

package kudu.consensus;

option java_package = "org.apache.kudu.consensus";
| |
| import "kudu/common/common.proto"; |
| import "kudu/common/wire_protocol.proto"; |
| import "kudu/consensus/metadata.proto"; |
| import "kudu/consensus/opid.proto"; |
| import "kudu/rpc/rpc_header.proto"; |
| import "kudu/tablet/metadata.proto"; |
| import "kudu/tablet/tablet.proto"; |
| import "kudu/tserver/tserver_admin.proto"; |
| import "kudu/tserver/tserver.proto"; |
| |
// Carries errors that are specific to the consensus implementation.
message ConsensusErrorPB {
  // Codes attached to consensus responses. A code is placed in the status
  // when an internal consensus error occurs that the caller has to handle
  // specially. There is deliberately no generic code: generic errors should
  // be reported through tserver.TabletServerErrorPB instead.
  // Value 1 is not assigned in this enum.
  enum Code {
    UNKNOWN = 0;

    // Invalid term.
    // Returned by peers answering leader RPCs whenever the term of one of
    // the messages sent in a batch is lower than the term the peer expects.
    INVALID_TERM = 2;

    // Leader election only.
    // The candidate's last logged OpId is older than the last OpId logged by
    // the local peer.
    LAST_OPID_TOO_OLD = 3;

    // Leader election only.
    // The local replica already voted for a different candidate in this term.
    ALREADY_VOTED = 4;

    // The replica does not recognize the caller's request as coming from a
    // member of the configuration.
    NOT_IN_QUORUM = 5;

    // The responder's last entry did not match the caller's preceding entry.
    PRECEDING_ENTRY_DIDNT_MATCH = 6;

    // The local replica is itself a leader, or has heard from a valid leader
    // more recently than the election timeout, and therefore believes the
    // leader to be alive.
    LEADER_IS_ALIVE = 7;

    // The local replica is currently servicing either another vote or an
    // update from a valid leader.
    CONSENSUS_BUSY = 8;

    // The local replica could not prepare a single transaction.
    CANNOT_PREPARE = 9;
  }

  // The error code.
  required Code code = 1;

  // Status object for the error. It includes a textual message that may be
  // more useful for log output and similar purposes, although its own error
  // code is less specific than 'code'.
  required AppStatusPB status = 2;
}
| |
| // =========================================================================== |
| // External Consensus Messages |
| // =========================================================================== |
| |
// Operation types that need a commit message, that is, operations that go
// through at least one round of the consensus algorithm.
// Value 2 is not assigned in this enum.
enum OperationType {
  UNKNOWN_OP       = 0;
  NO_OP            = 1;
  WRITE_OP         = 3;
  ALTER_SCHEMA_OP  = 4;
  CHANGE_CONFIG_OP = 5;
}
| |
// Transaction driver type: tells whether a transaction is being executed on
// a leader or on a replica.
enum DriverType {
  UNKNOWN_DRIVER = 0;
  LEADER         = 1;
  REPLICA        = 2;
}
| |
// Configuration change record for the tablet identified by 'tablet_id'.
// The leader generates this message dynamically when AddServer() or
// RemoveServer() is called, and this is the form that is replicated to the
// log.
message ChangeConfigRecordPB {
  required bytes tablet_id = 1;

  // Previously committed configuration, included for verification purposes.
  required RaftConfigPB old_config = 2;

  // Configuration that is to become the new configuration.
  required RaftConfigPB new_config = 3;
}
| |
// Kinds of configuration change; used as the type of the 'type' field of
// ChangeConfigRequestPB.
enum ChangeConfigType {
  UNKNOWN_CHANGE = 0;
  ADD_SERVER     = 1;
  REMOVE_SERVER  = 2;
  CHANGE_ROLE    = 3;
}
| |
// RPC request asking for a configuration change on the tablet identified by
// 'tablet_id'. Requests are limited to one-by-one operations, as specified
// in Diego Ongaro's Raft PhD thesis.
// This message is only the RPC request and does not end up in the log; see
// also ChangeConfigRecordPB.
message ChangeConfigRequestPB {
  // UUID of the server that this request is addressed to.
  optional bytes dest_uuid = 4;

  required bytes tablet_id = 1;

  // Kind of config change being requested.
  // Callers must specify this field; it is declared optional only because it
  // is an enum.
  optional ChangeConfigType type = 2;

  // Peer to add or remove.
  // For 'type' == ADD_SERVER both the permanent_uuid and last_known_addr
  // fields must be set; otherwise only permanent_uuid is required.
  optional RaftPeerPB server = 3;

  // OpId index of the committed config that is to be replaced.
  // This optional parameter provides an atomic (compare-and-swap)
  // ChangeConfig operation: ChangeConfig() fails when the parameter is
  // specified and the committed config does not have a matching opid_index.
  // See also the definition of RaftConfigPB.
  optional int64 cas_config_opid_index = 5;
}
| |
// Response to a configuration change request. When an immediate error
// occurred it is reported in 'error'; otherwise 'new_config' is set.
message ChangeConfigResponsePB {
  optional tserver.TabletServerErrorPB error = 1;

  // Updated configuration after changing the config.
  // NOTE(review): the declared type is RaftPeerPB (a single peer) although
  // the description speaks of a configuration -- confirm this is intended.
  optional RaftPeerPB new_config = 2;

  // Timestamp chosen by the server for this change config operation.
  // TODO: At the time of writing, this field is never set in the response.
  // TODO: Propagate signed timestamps. See KUDU-611.
  optional fixed64 timestamp = 3;
}
| |
// Replicate message. The leader sends it to replicas to indicate that the
// operation must be stored in the WAL/SM log, as part of the first phase of
// the two phase commit.
message ReplicateMsg {
  // Raft operation ID (term and index) being replicated.
  required OpId id = 1;

  // Timestamp (hybrid or logical) assigned to this message.
  required fixed64 timestamp = 2;

  // Tag 3 is not in use; only this commented-out declaration refers to it:
  // optional ExternalConsistencyMode external_consistency_mode = 3 [default = NO_CONSISTENCY];

  required OperationType op_type = 4;

  // Per-operation payloads.
  // NOTE(review): presumably only the one matching 'op_type' is populated --
  // confirm against the code that builds these messages.
  optional tserver.WriteRequestPB write_request = 5;
  optional tserver.AlterSchemaRequestPB alter_schema_request = 6;
  optional ChangeConfigRecordPB change_config_record = 7;

  // Request id supplied by the client for this message, if it is set.
  optional rpc.RequestIdPB request_id = 8;

  optional NoOpRequestPB noop_request = 999;
}
| |
// Commit message for an earlier operation.
// "Commit" is meant in the consensus sense: the message may abort or commit
// any operation that required a consensus round.
message CommitMsg {
  required OperationType op_type = 1;

  // Id of the message that this commit pertains to.
  // The identifier is spelled "commited" (single 't'); it is left unchanged
  // because the name is part of the generated API and text format.
  optional OpId commited_op_id = 2;

  // Operations that were applied and/or failed in this transaction.
  optional tablet.TxResultPB result = 3;
}
| |
| // =========================================================================== |
| // Internal Consensus Messages and State |
| // =========================================================================== |
| |
// No-op request; used mostly by tests.
message NoOpRequestPB {
  // Lets tests attach a dummy payload.
  optional bytes payload_for_tests = 1;
}
| |
// No-op response; used mostly by tests.
message NoOpResponsePB {
  // Lets tests attach a dummy payload.
  optional bytes payload_for_tests = 1;
}
| |
// Describes the failure of a single operation on another peer.
message PerOpErrorPB {
  // Id of the operation that failed in the other peer.
  required OpId id = 1;

  // Status explaining why the operation failed.
  required AppStatusPB status = 2;
}
| |
// Status message carried in peer responses.
message ConsensusStatusPB {
  // Last message received (and replicated) by the peer.
  required OpId last_received = 1;

  // Id of the last op that was replicated by the current leader.
  // The term of this op is not necessarily the current term, because the
  // current leader may be replicating ops from a prior term.
  // Unset if none has been received so far.
  //
  // When there is a log matching property error
  // (PRECEDING_ENTRY_DIDNT_MATCH) this field is important and may still be
  // set: the leader queue uses it in conjunction with last_received to
  // decide on the next id to send to the follower.
  //
  // NOTE: it might seem that the leader could track this on its own from
  // which batches were sent successfully. However, the follower is free to
  // truncate a batch if an operation in the middle of it fails to prepare
  // (e.g. due to memory limiting). The leader then gets a success response
  // but still needs to re-send some operations.
  optional OpId last_received_current_leader = 4;

  // Last committed index known to the peer.
  optional int64 last_committed_idx = 2;

  // Set when the last request failed for a consensus-related (internal)
  // reason. In some cases the error has a specific code that the caller must
  // handle in a certain way.
  optional ConsensusErrorPB error = 3;
}
| |
// Request sent by a candidate peer that wishes to become leader of the
// configuration serving the tablet with 'tablet_id'.
// See RAFT sec. 5.2.
message VoteRequestPB {
  // UUID of the server that this request is addressed to.
  optional bytes dest_uuid = 6;

  // Identifies the tablet configuration that the vote is being requested
  // for.
  required string tablet_id = 1;

  // UUID of the sending peer.
  required bytes candidate_uuid = 2;

  // Term for which a vote is requested.
  // If it is higher than the callee's term, the callee updates its own term
  // to match and, if it is the current leader, steps down.
  required int64 candidate_term = 3;

  // Status of the candidate node, which lets the voter decide whether to
  // vote for it as LEADER.
  //
  // In particular it includes the last OpId persisted in the candidate's
  // log, corresponding to the lastLogIndex and lastLogTerm fields in Raft.
  // A replica must vote no for a candidate whose OpId is lower than its own.
  required ConsensusStatusPB candidate_status = 4;

  // Normally a replica denies a vote with a LEADER_IS_ALIVE error if it is a
  // leader or has recently heard from one; this prevents partitioned nodes
  // from disturbing liveness. When this flag is true, peers vote even if
  // they think a leader is alive. That can be used, for example, to force a
  // faster leader hand-off instead of waiting for the election timer to
  // expire.
  optional bool ignore_live_leader = 5 [default = false];
}
| |
// Response sent by a replica to a leader election request.
message VoteResponsePB {
  // UUID of the node sending the reply.
  optional bytes responder_uuid = 1;

  // Term of the node sending the reply.
  // Lets the candidate update itself if it is behind.
  optional int64 responder_term = 2;

  // True if this peer voted for the caller, false otherwise.
  optional bool vote_granted = 3;

  // TODO: Migrate ConsensusService to the AppStatusPB RPC style and merge
  // these errors.
  // Error reported by the consensus implementation.
  optional ConsensusErrorPB consensus_error = 998;

  // Generic error (such as tablet not found).
  optional tserver.TabletServerErrorPB error = 999;
}
| |
// Consensus request message: the basic unit of a consensus round.
message ConsensusRequestPB {
  // UUID of the server that this request is addressed to.
  optional bytes dest_uuid = 7;

  required string tablet_id = 1;

  // UUID of the peer making the call.
  required bytes caller_uuid = 2;

  // Term of the caller. Since only leaders can send messages, replicas
  // accept every message whose term is equal to or higher than the last term
  // they know about. A leader that receives a request with a term higher
  // than its own steps down and enters FOLLOWER state (see Raft sec. 5.1).
  required int64 caller_term = 3;

  // Id of the operation immediately preceding the first operation in 'ops'.
  // If the replica is receiving 'ops' for the first time, 'preceding_id'
  // must match the replica's last operation.
  //
  // Must be set whenever 'ops' is non-empty.
  optional OpId preceding_id = 4;

  // Id of the last committed operation in the configuration: the last
  // operation the leader deemed committed from a consensus standpoint (not
  // the last operation the leader applied).
  //
  // Raft calls this field 'leaderCommit'.
  required OpId committed_index = 5;

  // Sequence of operations to be replicated by this peer.
  // They are committed once committed_index advances above their respective
  // OpIds. In some cases committed_index indicates that these operations are
  // already committed; they are then committed during the same request.
  repeated ReplicateMsg ops = 6;
}
| |
// Response to a ConsensusRequestPB.
message ConsensusResponsePB {
  // UUID of the peer making the response.
  optional bytes responder_uuid = 1;

  // Current term of the peer making the response.
  // Used to update the caller (and to make it step down if it is out of
  // date).
  optional int64 responder_term = 2;

  // Current consensus status of the receiver peer.
  optional ConsensusStatusPB status = 3;

  // Generic error (such as tablet not found). Per-operation error messages
  // are sent along with the consensus status instead.
  optional tserver.TabletServerErrorPB error = 999;
}
| |
// Reflects the status of an in-flight transaction.
// Tag 5 is not assigned in this message.
message TransactionStatusPB {
  required OpId op_id = 1;
  required OperationType tx_type = 2;

  // How long the transaction has been in flight.
  required int64 running_for_micros = 3;

  // Short human-readable description (e.g., ToString() output).
  required string description = 4;

  // Copy of the trace buffer, filled in if tracing is enabled when the
  // transaction is viewed.
  optional string trace_buffer = 6;
}
| |
// Request of the GetNodeInstance RPC; it carries no fields.
message GetNodeInstanceRequestPB {}
| |
// Response of the GetNodeInstance RPC.
message GetNodeInstanceResponsePB {
  // NOTE(review): presumably the instance of the responding server --
  // confirm against the service implementation.
  required NodeInstancePB node_instance = 1;
}
| |
// Makes the local peer run a leader election in order to be elected leader.
// Assumes that a tablet with 'tablet_id' exists.
message RunLeaderElectionRequestPB {
  // UUID of the server that this request is addressed to.
  optional bytes dest_uuid = 2;

  // Id of the tablet.
  required bytes tablet_id = 1;
}
| |
// Response of the RunLeaderElection RPC.
message RunLeaderElectionResponsePB {
  // Generic error (such as tablet not found).
  optional tserver.TabletServerErrorPB error = 1;
}
| |
// Request of the LeaderStepDown RPC.
message LeaderStepDownRequestPB {
  // UUID of the server that this request is addressed to.
  optional bytes dest_uuid = 2;

  // Id of the tablet.
  required bytes tablet_id = 1;
}
| |
// Response of the LeaderStepDown RPC.
message LeaderStepDownResponsePB {
  // Generic error (such as tablet not found).
  optional tserver.TabletServerErrorPB error = 1;
}
| |
// Selects which OpId a GetLastOpIdRequestPB asks for: the last-received or
// the last-committed one.
enum OpIdType {
  UNKNOWN_OPID_TYPE = 0;
  RECEIVED_OPID     = 1;
  COMMITTED_OPID    = 2;
}
| |
// Request of the GetLastOpId RPC.
message GetLastOpIdRequestPB {
  // UUID of the server that this request is addressed to.
  optional bytes dest_uuid = 2;

  // Id of the tablet.
  required bytes tablet_id = 1;

  // Chooses between the last-received and the last-committed OpId; when the
  // field is absent the declared default, RECEIVED_OPID, applies.
  optional OpIdType opid_type = 3 [default = RECEIVED_OPID];
}
| |
// Response of the GetLastOpId RPC.
message GetLastOpIdResponsePB {
  optional OpId opid = 1;

  // Generic error (such as tablet not found).
  optional tserver.TabletServerErrorPB error = 2;
}
| |
// Request of the GetConsensusState RPC.
message GetConsensusStateRequestPB {
  // UUID of the server that this request is addressed to.
  optional bytes dest_uuid = 2;

  // Id of the tablet.
  required bytes tablet_id = 1;

  // Chooses between the committed and the active consensus state; when the
  // field is absent the declared default, CONSENSUS_CONFIG_COMMITTED,
  // applies.
  optional ConsensusConfigType type = 3 [default = CONSENSUS_CONFIG_COMMITTED];
}
| |
// Response of the GetConsensusState RPC.
message GetConsensusStateResponsePB {
  optional ConsensusStatePB cstate = 1;

  // Generic error (such as tablet not found).
  optional tserver.TabletServerErrorPB error = 2;
}
| |
// Request of the StartTabletCopy RPC.
message StartTabletCopyRequestPB {
  // UUID of the server that this request is addressed to.
  optional bytes dest_uuid = 5;

  required bytes tablet_id = 1;

  // Identify the host that is being copied from.
  // TODO: Consider renaming these to copy_source_*.
  required bytes copy_peer_uuid = 2;
  required HostPortPB copy_peer_addr = 3;

  // Term of the caller. If the target of this request holds a TOMBSTONED
  // replica whose term is higher than this one, the request fails.
  optional int64 caller_term = 4 [default = -1];
}
| |
// Response of the StartTabletCopy RPC.
message StartTabletCopyResponsePB {
  optional tserver.TabletServerErrorPB error = 1;
}
| |
// A Raft implementation.
service ConsensusService {
  // Analogous to AppendEntries in Raft, but only used for followers.
  rpc UpdateConsensus(ConsensusRequestPB)
      returns (ConsensusResponsePB);

  // RequestVote() from Raft.
  rpc RequestConsensusVote(VoteRequestPB)
      returns (VoteResponsePB);

  // Implements every one-by-one config change operation: AddServer() and
  // RemoveServer() from the Raft specification, plus an operation that
  // changes the role of a server between VOTER and NON_VOTER.
  // An OK response means the operation was successful.
  rpc ChangeConfig(ChangeConfigRequestPB)
      returns (ChangeConfigResponsePB);

  rpc GetNodeInstance(GetNodeInstanceRequestPB)
      returns (GetNodeInstanceResponsePB);

  // Forces this node to run a leader election.
  rpc RunLeaderElection(RunLeaderElectionRequestPB)
      returns (RunLeaderElectionResponsePB);

  // Forces this node to step down as leader.
  rpc LeaderStepDown(LeaderStepDownRequestPB)
      returns (LeaderStepDownResponsePB);

  rpc GetLastOpId(GetLastOpIdRequestPB)
      returns (GetLastOpIdResponsePB);

  // Returns the consensus state. The request's 'type' field selects the
  // committed or the active state and defaults to the committed one.
  rpc GetConsensusState(GetConsensusStateRequestPB)
      returns (GetConsensusStateResponsePB);

  // Instructs this server to copy a tablet from another host.
  rpc StartTabletCopy(StartTabletCopyRequestPB)
      returns (StartTabletCopyResponsePB);
}