// Copyright 2012 Cloudera, Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.
// Protobufs which are common throughout Kudu.
// This file may contain protobufs which are persisted on disk
// as well as sent on the wire. If a particular protobuf is only
// used as part of the client-server wire protocol, it should go
// in common/wire_protocol.proto instead. If it is only used within
// the server(s), it should go in cfile/cfile.proto, server/metadata.proto,
// etc, as appropriate.
// Proto package that scopes every definition in this file.
package kudu;
// Java package for the classes generated from this file.
option java_package = "org.kududb";
// If you add a new type, keep in mind to add it to the end,
// or update any AddMapping() functions
// that have a vector that maps the protobuf tag to the index.
// Data types that a column can have (see ColumnSchemaPB.type).
// Protobufs in this file may be persisted on disk as well as sent on the
// wire, so existing tag numbers must never be renumbered or reused.
enum DataType {
// Integer types, unsigned and signed, from 8 to 64 bits wide.
UINT8 = 0;
INT8 = 1;
UINT16 = 2;
INT16 = 3;
UINT32 = 4;
INT32 = 5;
UINT64 = 6;
INT64 = 7;
BOOL = 9;
// Floating-point types.
FLOAT = 10;
DOUBLE = 11;
BINARY = 12;
// Encodings that can be selected for a column's on-disk storage
// (see ColumnSchemaPB.encoding).
enum EncodingType {
// Presumably run-length encoding -- TODO confirm.
RLE = 4;
// Compression codecs that can be selected for a column's on-disk storage
// (see ColumnSchemaPB.compression).
enum CompressionType {
LZ4 = 3;
ZLIB = 4;
// TODO: Differentiate between the schema attributes
// that are only relevant to the server (e.g.,
// encoding and compression) and those that also
// matter to the client.
// Describes a single column: its identity, data type, key/nullability flags,
// default values and on-disk storage attributes.
message ColumnSchemaPB {
// Numeric column identifier. Optional, so it may be absent in some contexts.
optional uint32 id = 1;
// Column name. Required.
required string name = 2;
// Data type of the column. Required.
required DataType type = 3;
// Whether the column is a key column. Defaults to false.
optional bool is_key = 4 [default = false];
// Whether the column is nullable. Defaults to false.
optional bool is_nullable = 5 [default = false];
// Default values for the column, carried as raw bytes.
// NOTE(review): presumably the read default is applied when reading rows that
// lack this column and the write default when a row is written without a
// value for it -- confirm against the code that consumes these fields.
optional bytes read_default_value = 6;
optional bytes write_default_value = 7;
// The following attributes refer to the on-disk storage of the column.
// They won't always be set, depending on context.
optional EncodingType encoding = 8 [default=AUTO_ENCODING];
optional CompressionType compression = 9 [default=DEFAULT_COMPRESSION];
// NOTE(review): presumably a block size in bytes, with the default of 0
// meaning "no explicit size configured" -- confirm.
optional int32 cfile_block_size = 10 [default=0];
// A schema, expressed as a list of column definitions.
message SchemaPB {
// The columns of the schema. As a repeated field, element order is preserved.
repeated ColumnSchemaPB columns = 1;
// A host/port pair. Both fields are required.
message HostPortPB {
// Host part of the address.
required string host = 1;
// Port number.
required uint32 port = 2;
// The external consistency mode for client requests.
// This defines how transactions and/or sequences of operations that touch
// several TabletServers, in different machines, can be observed by external
// clients.
// Note that ExternalConsistencyMode makes no guarantee on atomicity, i.e.
// no sequence of operations is made atomic (or transactional) just because
// an external consistency mode is set.
// Note also that ExternalConsistencyMode has no implication on the
// consistency between replicas of the same tablet.
enum ExternalConsistencyMode {
// Mode based on client-side timestamp propagation (presumably the
// CLIENT_PROPAGATED value -- TODO confirm):
//
// The response to any write will contain a timestamp.
// Any further calls from the same client to other servers will update
// those servers with that timestamp. The user will make sure that the
// timestamp is propagated through back-channels to other
// KuduClients.
//
// WARNING: Failure to propagate timestamp information through
// back-channels will negate any external consistency guarantee under this
// mode.
//
// Example:
// 1 - Client A executes operation X in Tablet A
// 2 - Afterwards, Client A executes operation Y in Tablet B
//
// Client B may observe the following operation sequences:
// {}, {X}, {X Y}
//
// This is the default mode.
//
// COMMIT_WAIT mode:
//
// The server will guarantee that each transaction is externally
// consistent by making sure that none of its results are visible
// until every Kudu server agrees that the transaction is in the past.
// The client is not obligated to forward timestamp information
// through back-channels.
//
// WARNING: Depending on the clock synchronization state of TabletServers
// this may imply considerable latency. Moreover, operations with
// COMMIT_WAIT requested external consistency will outright fail if
// TabletServer clocks are either unsynchronized or synchronized but
// with a maximum error which surpasses a pre-configured one.
//
// Example:
// 1 - Client A executes operation X in Tablet A
// 2 - Afterwards, Client A executes operation Y in Tablet B
//
// Client B may observe the following operation sequences:
// {}, {X}, {X Y}
// The possible read modes for clients.
// Clients set these in Scan requests.
// The server keeps 2 snapshot boundaries:
// - The earliest snapshot: this corresponds to the earliest kept undo records
// in the tablet, meaning the current state (Base) can be undone up to
// this snapshot.
// - The latest snapshot: This corresponds to the instant beyond which
// no transaction will have an earlier timestamp. Usually this corresponds
// to whatever clock->Now() returns, but can be higher if the client propagates
// a timestamp (see below).
enum ReadMode {
// When READ_LATEST is specified the server will execute the read independently
// of the clock and will always return all visible writes at the time the request
// was received. This type of read does not return a snapshot timestamp since
// it might not be repeatable, i.e. a later read executed at the same snapshot
// timestamp might yield rows that were committed by in-flight transactions.
//
// This is the default mode.
//
// When READ_AT_SNAPSHOT is specified the server will attempt to perform a read
// at the required snapshot. If no snapshot is defined the server will take the
// current time as the snapshot timestamp. Snapshot reads are repeatable, i.e.
// all future reads at the same timestamp will yield the same rows. This is
// performed at the expense of waiting for in-flight transactions whose timestamp
// is lower than the snapshot's timestamp to complete.
//
// When mixing reads and writes, clients that specify COMMIT_WAIT as their
// external consistency mode and then use the returned write_timestamp
// to perform snapshot reads are guaranteed that the snapshot time is
// considered in the past by all servers and no additional action is
// necessary. Clients using CLIENT_PROPAGATED however must forcibly propagate
// the timestamps even at read time, so that the server will not generate
// any more transactions before the snapshot requested by the client.
//
// The latter option is implemented by allowing the client to specify one or
// two timestamps, the first one obtained from the previous CLIENT_PROPAGATED
// write, directly or through back-channels, must be signed and will be
// checked by the server. The second one, if defined, is the actual snapshot
// read time. When selecting both the latter must be lower than or equal to
// the former.
//
// TODO: implement actually signing the propagated timestamp.
// The possible order modes for clients.
// Clients specify these in new scan requests.
// Ordered scans are fault-tolerant, and can be retried elsewhere in the case
// of tablet server failure. However, ordered scans impose additional overhead
// since the tablet server needs to sort the result rows.
// Selects whether a scan is ordered, i.e. whether the tablet server sorts the
// result rows.
enum OrderMode {
// This is the default order mode.
// The serialized format of a Kudu table partition schema.
message PartitionSchemaPB {
// A column identifier for partition schemas. In general, the name will be
// used when a client creates the table since column IDs are assigned by the
// master. All other uses of partition schemas will use the numeric column ID.
message ColumnIdentifierPB {
// At most one of the numeric ID or the name is set; setting one member of
// the oneof clears the other.
// NOTE(review): 'id' is int32 here but uint32 in ColumnSchemaPB.id -- confirm
// that the difference is intentional.
oneof identifier {
int32 id = 1;
string name = 2;
// The range component of a partition schema.
message RangeSchemaPB {
// Column identifiers of columns included in the range. All columns must be
// a component of the primary key.
repeated ColumnIdentifierPB columns = 1;
// A hash bucket component of a partition schema.
message HashBucketSchemaPB {
// Column identifiers of columns included in the hash. Every column must be
// a component of the primary key.
repeated ColumnIdentifierPB columns = 1;
// Number of buckets into which columns will be hashed. Must be at least 2.
required int32 num_buckets = 2;
// Seed value for hash calculation. Administrators may set a seed value
// on a per-table basis in order to randomize the mapping of rows to
// buckets. Setting a seed provides some amount of protection against denial
// of service attacks when the hash bucket columns contain user provided
// input.
optional uint32 seed = 3;
// The hash algorithms that can be selected via 'hash_algorithm'.
enum HashAlgorithm {
// The hash algorithm to use for calculating the hash bucket.
optional HashAlgorithm hash_algorithm = 4;
// The hash bucket components of the partition schema. Repeated, so there may
// be zero or more.
repeated HashBucketSchemaPB hash_bucket_schemas = 1;
// The range component of the partition schema. Optional.
optional RangeSchemaPB range_schema = 2;
// The serialized format of a Kudu table partition.
// A single partition: its hash bucket assignments plus the half-open
// [start, end) range of encoded partition keys it covers.
message PartitionPB {
// The hash buckets of the partition. The number of hash buckets must match
// the number of hash bucket components in the partition's schema.
// Serialized using the packed repeated-field encoding.
repeated int32 hash_buckets = 1 [packed = true];
// The encoded start partition key (inclusive).
optional bytes partition_key_start = 2;
// The encoded end partition key (exclusive).
optional bytes partition_key_end = 3;