| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| syntax = "proto2"; |
| package kudu.tablet; |
| |
| option java_package = "org.apache.kudu.tablet"; |
| |
| import "kudu/common/common.proto"; |
| import "kudu/consensus/opid.proto"; |
| import "kudu/fs/fs.proto"; |
| |
| // ============================================================================ |
| // Tablet Metadata |
| // ============================================================================ |
| |
| // Identifies the block that stores one column of a rowset. |
| message ColumnDataPB { |
|   // Tag 3 used to carry 'optional ColumnSchemaPB OBSOLETE_schema'. Reserving |
|   // both the number and the name makes protoc reject any accidental reuse, |
|   // which would otherwise cause previously written data to be misparsed. |
|   reserved 3; |
|   reserved "OBSOLETE_schema"; |
| |
|   // NOTE(review): tag 1 is not assigned in this message; confirm whether it |
|   // was ever in use before handing it out to a new field. |
| |
|   // Block holding the column's data. |
|   required BlockIdPB block = 2; |
| |
|   // ID of the column that 'block' belongs to (presumably the column ID from |
|   // the tablet schema -- confirm against the reader). |
|   optional int32 column_id = 4; |
| } |
| |
| // Reference to one block of delta data belonging to a rowset. |
| message DeltaDataPB { |
|   // Block holding the deltas. Numbering starts at 2 here; tag 1 is not |
|   // assigned (NOTE(review): confirm it was never used before reusing it). |
|   required BlockIdPB block = 2; |
| } |
| |
| // Metadata for a single rowset: the blocks that make it up, plus a few |
| // summary values. |
| message RowSetDataPB { |
|   // Identifier of this rowset. |
|   required uint64 id = 1; |
| |
|   // ID of the last durable "dms" of this rowset (presumably the delta |
|   // counterpart of 'last_durable_mrs_id' in TabletSuperBlockPB -- confirm). |
|   required int64 last_durable_dms_id = 2; |
| |
|   // Column blocks of the rowset, one entry per column. |
|   repeated ColumnDataPB columns = 3; |
| |
|   // Delta blocks, kept as separate REDO and UNDO lists. |
|   repeated DeltaDataPB redo_deltas = 4; |
|   repeated DeltaDataPB undo_deltas = 5; |
| |
|   // Block containing the bloom filter, if there is one. |
|   optional BlockIdPB bloom_block = 6; |
| |
|   // Ad hoc index block, if there is one (presumably the block containing |
|   // the compound keys -- confirm against the rowset writer). |
|   optional BlockIdPB adhoc_index_block = 7; |
| |
|   // Encoded minimum and maximum keys (presumably the key bounds of the rows |
|   // stored in this rowset -- confirm). |
|   optional bytes min_encoded_key = 8; |
|   optional bytes max_encoded_key = 9; |
| |
|   // Count of live rows that have been persisted. |
|   optional int64 live_row_count = 10; |
| } |
| |
| // Persistent state of a tablet's data. It records whether the data is |
| // usable, is still being copied from another peer (and therefore cannot be |
| // brought up), or has been deleted. |
| enum TabletDataState { |
|   // Declared first, so this is the implicit proto2 default for any field of |
|   // this type that does not specify one. Keep it in first position. |
|   TABLET_DATA_UNKNOWN = 999; |
| |
|   // Set while data files are being copied from a remote peer. If a tablet |
|   // server crashes with a tablet in this state, the tablet must be deleted |
|   // and the Tablet Copy process restarted for it. |
|   TABLET_DATA_COPYING = 0; |
| |
|   // State of fresh empty tablets and of tablets that were copied |
|   // successfully. |
|   TABLET_DATA_READY = 1; |
| |
|   // Deletion of this tablet is in progress. During boot the tablet server |
|   // should "roll forward" the deletion instead of trying to load the tablet. |
|   TABLET_DATA_DELETED = 2; |
| |
|   // The tablet has been deleted and only a "tombstone" remains. |
|   TABLET_DATA_TOMBSTONED = 3; |
| } |
| |
| // Persisted state of a single transaction. |
| message TxnMetadataPB { |
|   // True if the transaction was aborted. Mutually exclusive with |
|   // 'commit_timestamp': when this is true, 'commit_timestamp' must be unset. |
|   optional bool aborted = 1; |
| |
|   // Commit timestamp of the transaction. Mutually exclusive with 'aborted': |
|   // when this is set, 'aborted' must be unset. |
|   optional int64 commit_timestamp = 2; |
| |
|   // Timestamp of the MVCC op that tracks the commit of this transaction. It |
|   // is returned to the transaction status manager in response to a |
|   // BEGIN_COMMIT request, so that the commit timestamp that gets assigned is |
|   // higher than the commit MVCC op timestamps of all participants. |
|   // |
|   // How readers use it depends on the snapshot being read: |
|   // |
|   // - At a specific clean snapshot (READ_AT_SNAPSHOT or a diff scan), a |
|   //   mutation counts as committed in that snapshot only if both this MVCC |
|   //   op timestamp and the commit timestamp are applied. |
|   // |
|   // - At the latest snapshot (READ_LATEST or during compactions), a mutation |
|   //   counts as committed only if this MVCC op timestamp is applied and a |
|   //   commit timestamp exists. This avoids reading dirty, uncommitted rows. |
|   optional int64 commit_mvcc_op_timestamp = 3; |
| |
|   // True once the transaction has flushed its MRS after committing. When |
|   // true, Kudu should not create an MRS for this transaction during |
|   // bootstrap. |
|   optional bool flushed_committed_mrs = 4; |
| |
|   // TODO(awong): add an owner field to this for uncommitted transactions. |
| } |
| |
| // The superblock records which data blocks belong to a tablet. |
| // |
| // A tablet consists of one or more RowSets. Each RowSet has a set of column |
| // blocks (one per column), a set of delta blocks, and optionally a bloom |
| // filter block and a block containing the compound keys. |
| message TabletSuperBlockPB { |
|   // ID of the table that this tablet is part of. |
|   required bytes table_id = 1; |
| |
|   // ID of this tablet. |
|   required bytes tablet_id = 2; |
| |
|   // Type of the table this tablet belongs to. When unset, the table is |
|   // assumed to be user-defined rather than a Kudu-internal system table. |
|   optional TableTypePB table_type = 19; |
| |
|   // ID of the latest durable MemRowSet. |
|   required int64 last_durable_mrs_id = 3; |
| |
|   // DEPRECATED. |
|   optional bytes start_key = 4; |
| |
|   // DEPRECATED. |
|   optional bytes end_key = 5; |
| |
|   // Partition of the table covered by this tablet. |
|   optional PartitionPB partition = 13; |
| |
|   // RowSets that make up the tablet. |
|   repeated RowSetDataPB rowsets = 6; |
| |
|   // Name of the table. |
|   required string table_name = 7; |
| |
|   // The latest schema, together with its version. |
|   // TODO: maybe this should be TableSchemaPB? Need to actually put those |
|   // attributes into use throughout the code. Using the simpler one for now. |
|   required SchemaPB schema = 8; |
|   required uint32 schema_version = 9; |
| |
|   // Partition schema of the table. |
|   optional PartitionSchemaPB partition_schema = 14; |
| |
|   // Current state of the tablet's data. |
|   optional TabletDataState tablet_data_state = 10 [default = TABLET_DATA_UNKNOWN]; |
| |
|   // Blocks orphaned by flushing this superblock, i.e. the set difference |
|   // between the blocks of the previous superblock and the blocks of this |
|   // one. |
|   // |
|   // Deleting the blocks listed here is always safe. |
|   repeated BlockIdPB orphaned_blocks = 11; |
| |
|   // Last OpId that was stored in the WAL before the tablet was tombstoned. |
|   // Only relevant for TOMBSTONED tablets. |
|   optional consensus.OpId tombstone_last_logged_opid = 12; |
| |
|   // Data directory group across which the tablet's data is spread. If this |
|   // is unset while the tablet state is TABLET_DATA_READY, the data is |
|   // assumed to come from a Kudu version older than 1.5.0, and a new group |
|   // spanning all data directories is created. |
|   optional DataDirGroupPB data_dir_group = 15; |
| |
|   // Whether the tablet supports counting live rows. If false, |
|   // 'live_row_count' may be inaccurate and should be ignored. Only newly |
|   // created tablets support it; ancient ones do not. |
|   optional bool supports_live_row_count = 16; |
| |
|   // Extra configuration of the table. |
|   optional TableExtraConfigPB extra_config = 17; |
| |
|   // Dimension label of the tablet. The master uses it to determine load |
|   // when creating new tablet replicas based on dimension. |
|   optional string dimension_label = 18; |
| |
|   // Transaction metadata keyed by txn ID. Every metadata flush rewrites |
|   // this to reflect the current in-memory state of transactions. Between |
|   // an in-memory state update and the next flush, participant ops should be |
|   // anchored so the updates are replayed after a restart. |
|   // TODO(awong): consider storing these separately from the superblock. |
|   map<int64, TxnMetadataPB> txn_metadata = 20; |
| } |
| |
| // Tablet states represent stages of a TabletReplica's object lifecycle and are |
| // reported to the master via tablet reports. |
| // |
| // Legal state transitions for a single TabletReplica object: |
| // |
| //   NOT_INITIALIZED -> INITIALIZED -> BOOTSTRAPPING -> RUNNING -> STOPPING -> STOPPED -> SHUTDOWN |
| //              |              |                |                  ^ ^ ^ |
| //              |              |                |                  | | | |
| //              |              |                +------------------+ | | |
| //              |              +-------------------------------------+ | |
| //              +------------------------------------------------------+ |
| // |
| // Since a TabletReplica instance is replaced when a Tablet Copy operation |
| // occurs, from a remote perspective it is possible for a tablet replica to |
| // appear to transition from SHUTDOWN back to NOT_INITIALIZED. |
| enum TabletStatePB { |
|   // Declared first, so this is the implicit proto2 default for any field of |
|   // this type that does not specify one. Keep it in first position. |
|   UNKNOWN = 999; |
| |
|   // The tablet has not been initialized yet. |
|   NOT_INITIALIZED = 6; |
| |
|   // The tablet has been initialized, but has not been started yet. |
|   INITIALIZED = 5; |
| |
|   // The tablet is bootstrapping, which means it is not available for RPC. |
|   BOOTSTRAPPING = 0; |
| |
|   // Peers enter the RUNNING state once the configuration phase is over. |
|   // While RUNNING, peers are available for client RPCs. |
|   RUNNING = 1; |
| |
|   // The tablet failed for some reason; TabletReplica::error() returns the |
|   // reason for the failure. |
|   FAILED = 2; |
| |
|   // The tablet is shutting down and will not accept further requests. |
|   STOPPING = 3; |
| |
|   // The tablet has been stopped, possibly because it has been tombstoned. |
|   STOPPED = 7; |
| |
|   // The tablet has been shut down completely. |
|   SHUTDOWN = 4; |
| } |
| |
| // Statistics reported for a tablet replica. |
| message ReportedTabletStatsPB { |
|   // On-disk size of the replica (presumably in bytes -- confirm against the |
|   // code that fills this in). |
|   optional uint64 on_disk_size = 1; |
| |
|   // Number of live rows in the replica. |
|   optional uint64 live_row_count = 2; |
| } |