blob: 2d43328c0513192e9585a8c49201629da3abf5e2 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//
// Protobufs which are common throughout Kudu.
//
// This file may contain protobufs which are persisted on disk
// as well as sent on the wire. If a particular protobuf is only
// used as part of the client-server wire protocol, it should go
// in common/wire_protocol.proto instead. If it is only used within
// the server(s), it should go in cfile/cfile.proto, server/metadata.proto,
// etc, as appropriate.
syntax = "proto2";
package kudu;
option java_package = "org.apache.kudu";
import "kudu/common/row_operations.proto";
import "kudu/util/block_bloom_filter.proto";
import "kudu/util/compression/compression.proto";
import "kudu/util/hash.proto";
import "kudu/util/pb_util.proto";
// If you add a new type, remember to either add it at the end or update the
// AddMapping() functions (like the one in key_encoder.cc) that use a vector
// to map the protobuf tag to an index.
enum DataType {
  // Declared first, so under proto2 rules this is the value reported by an
  // unset DataType field that declares no explicit default. Keep it first.
  UNKNOWN_DATA = 999;

  UINT8 = 0;
  INT8 = 1;
  UINT16 = 2;
  INT16 = 3;
  UINT32 = 4;
  INT32 = 5;
  UINT64 = 6;
  INT64 = 7;
  STRING = 8;
  BOOL = 9;
  FLOAT = 10;
  DOUBLE = 11;
  BINARY = 12;
  UNIXTIME_MICROS = 13;
  INT128 = 14;
  DECIMAL32 = 15;
  DECIMAL64 = 16;
  DECIMAL128 = 17;

  // Virtual column; this is not a real data type.
  IS_DELETED = 18;

  VARCHAR = 19;
  DATE = 20;
}
enum EncodingType {
  // Declared first, so under proto2 rules this is the value reported by an
  // unset EncodingType field that declares no explicit default.
  UNKNOWN_ENCODING = 999;

  AUTO_ENCODING = 0;
  PLAIN_ENCODING = 1;
  PREFIX_ENCODING = 2;

  // Deprecated: GROUP_VARINT encoding is no longer implemented.
  GROUP_VARINT = 3;

  RLE = 4;
  DICT_ENCODING = 5;
  BIT_SHUFFLE = 6;
}
// Enum that specifies the HMS-related configuration for a Kudu mini-cluster.
enum HmsMode {
  // Do not start an HMS at all.
  NONE = 0;

  // Start the HMS, but do not configure it to use the Kudu plugin.
  DISABLE_HIVE_METASTORE = 3;

  // Start the HMS and configure it to use the Kudu plugin; the Kudu
  // mini-cluster itself is not configured to synchronize with the HMS.
  ENABLE_HIVE_METASTORE = 1;

  // Start the HMS and configure it to use the Kudu plugin, and additionally
  // configure the Kudu mini-cluster to synchronize with the HMS.
  ENABLE_METASTORE_INTEGRATION = 2;
}
// Holds detailed attributes for the column. Only certain fields will be set,
// depending on the type of the column.
message ColumnTypeAttributesPB {
  // Set for decimal columns.
  optional int32 precision = 1;
  optional int32 scale = 2;

  // Set for varchar columns.
  optional int32 length = 3;
}
// TODO: Differentiate between the schema attributes
// that are only relevant to the server (e.g.,
// encoding and compression) and those that also
// matter to the client.
message ColumnSchemaPB {
  optional uint32 id = 1;
  required string name = 2;
  required DataType type = 3;
  optional bool is_key = 4 [default = false];
  optional bool is_nullable = 5 [default = false];

  // Default values.
  //
  // NOTE: from a client's point of view a column has exactly one "default
  // value". The separate read/write defaults are used internally and should
  // not be exposed by any public client API.
  //
  //  - Create/alter table: when passing a schema to the master, specify the
  //    default in 'read_default_value'.
  //  - Open table: the client receives both the read and the write default.
  //    In contrast to the case above, it is the *write* default that should
  //    be exposed as the "current" default.
  optional bytes read_default_value = 6;
  optional bytes write_default_value = 7;

  // Attributes describing the on-disk storage of the column. Depending on
  // context, they will not always be set.
  optional EncodingType encoding = 8 [default = AUTO_ENCODING];
  optional CompressionType compression = 9 [default = DEFAULT_COMPRESSION];
  optional int32 cfile_block_size = 10 [default = 0];

  // Type-specific attributes; see ColumnTypeAttributesPB.
  optional ColumnTypeAttributesPB type_attributes = 11;

  // The comment for the column.
  optional string comment = 12;

  // NOTE(review): not documented in this file; presumably marks a column
  // whose values may not be changed once written -- confirm.
  optional bool immutable = 13 [default = false];
}
// A set of optional changes to a single column's schema attributes.
//
// NOTE(review): the individual fields are not documented in this file. Their
// names suggest a rename ('name' -> 'new_name') plus new default, storage and
// comment settings -- confirm against the code that consumes this message.
message ColumnSchemaDeltaPB {
  optional string name = 1;
  optional string new_name = 2;

  // Field number 3 is skipped by the fields of this message. Reserve it so a
  // future field cannot claim it by accident.
  // NOTE(review): presumably 3 belonged to a field that was removed; confirm
  // against the file history.
  reserved 3;

  optional bytes default_value = 4;
  optional bool remove_default = 5;

  optional EncodingType encoding = 6;
  optional CompressionType compression = 7;
  optional int32 block_size = 8;

  optional string new_comment = 9;
}
// A schema, expressed as a list of column schemas.
message SchemaPB {
  repeated ColumnSchemaPB columns = 1;
}
// A (host, port) pair. Both fields are required.
message HostPortPB {
  required string host = 1;
  required uint32 port = 2;
}
// The external consistency mode for client requests.
// This defines how ops and/or sequences of operations that touch
// several TabletServers, in different machines, can be observed by external
// clients.
//
// Note that ExternalConsistencyMode makes no guarantee on atomicity, i.e.
// no sequence of operations is made atomic (or transactional) just because
// an external consistency mode is set.
// Note also that ExternalConsistencyMode has no implication on the
// consistency between replicas of the same tablet.
enum ExternalConsistencyMode {
  UNKNOWN_EXTERNAL_CONSISTENCY_MODE = 0;

  // Every write response carries a timestamp. Subsequent calls made by the
  // same client to other servers update those servers with that timestamp.
  // It is the user's responsibility to propagate the timestamp to other
  // KuduClient instances through back-channels.
  //
  // WARNING: if timestamp information is not propagated through
  // back-channels, any external consistency guarantee under this mode is
  // negated.
  //
  // Example:
  //   1 - Client A executes operation X in Tablet A
  //   2 - Afterwards, Client A executes operation Y in Tablet B
  //
  // Client B may observe the following operation sequences:
  //   {}, {X}, {X Y}
  //
  // This is the default mode.
  CLIENT_PROPAGATED = 1;

  // The server guarantees that each op is externally consistent: none of the
  // op's results become visible until every Kudu server agrees that the op
  // is in the past. The client is not obligated to forward timestamp
  // information through back-channels.
  //
  // WARNING: depending on the clock synchronization state of the
  // TabletServers, this may imply considerable latency. Moreover, operations
  // that request COMMIT_WAIT external consistency fail outright if
  // TabletServer clocks are either unsynchronized, or synchronized but with
  // a maximum error that surpasses a pre-configured one.
  //
  // Example:
  //   - Client A executes operation X in Tablet A
  //   - Afterwards, Client A executes operation Y in Tablet B
  //
  // Client B may observe the following operation sequences:
  //   {}, {X}, {X Y}
  COMMIT_WAIT = 2;
}
// The possible read modes for clients.
// Clients set these in Scan requests.
// The server keeps 2 snapshot boundaries:
// - The earliest snapshot: this corresponds to the earliest kept undo records
// in the tablet, meaning the current state (Base) can be undone up to
// this snapshot.
// - The latest snapshot: This corresponds to the instant beyond which no op
// will have an earlier timestamp. Usually this corresponds to whatever
// clock->Now() returns, but can be higher if the client propagates a
// timestamp (see below).
enum ReadMode {
  UNKNOWN_READ_MODE = 0;

  // READ_LATEST: the server executes the read independently of the clock and
  // always returns all writes that were visible at the time the request was
  // received. No snapshot timestamp is returned for this type of read,
  // because the read might not be repeatable: a later read executed at the
  // same snapshot timestamp might yield rows that were committed by
  // in-flight ops.
  //
  // This is the default mode.
  READ_LATEST = 1;

  // READ_AT_SNAPSHOT: the server attempts to perform the read at the
  // required snapshot. If no snapshot is defined, the server takes the
  // current time as the snapshot timestamp. Snapshot reads are repeatable,
  // i.e. all future reads at the same timestamp yield the same rows. The
  // price is waiting for in-flight ops whose timestamp is lower than the
  // snapshot's timestamp to complete.
  //
  // Mixing reads and writes:
  //  - Clients that specify COMMIT_WAIT as their external consistency mode
  //    and then use the returned write_timestamp to perform snapshot reads
  //    are guaranteed that the snapshot time is considered in the past by
  //    all servers; no additional action is necessary.
  //  - Clients using CLIENT_PROPAGATED, however, must forcibly propagate the
  //    timestamps even at read time, so that the server will not generate
  //    any more ops before the snapshot requested by the client. This is
  //    implemented by allowing the client to specify one or two timestamps.
  //    The first one, obtained from the previous CLIENT_PROPAGATED write
  //    (directly or through back-channels), must be signed and will be
  //    checked by the server. The second one, if defined, is the actual
  //    snapshot read time. When both are specified, the latter must be lower
  //    than or equal to the former.
  //
  // TODO implement actually signing the propagated timestamp.
  READ_AT_SNAPSHOT = 2;

  // READ_YOUR_WRITES: the server picks the timestamp to use for a
  // server-local snapshot scan, subject to the following criteria:
  //  (1) It will be higher than the propagated timestamp,
  //  (2) It will attempt to minimize latency caused by waiting for
  //      outstanding write ops to complete.
  // More specifically, the server chooses the latest timestamp higher than
  // the provided propagated timestamp bound that allows the reads to execute
  // without being blocked by in-flight ops (the read can still be blocked if
  // the propagated timestamp is higher than some in-flight ops). If no
  // propagated timestamp is provided, the server chooses a timestamp such
  // that all ops before it are committed. The chosen timestamp is returned
  // to the client as 'snapshot timestamp'. The Kudu client library uses it
  // as the propagated timestamp for subsequent reads to avoid unnecessary
  // waiting.
  //
  // Reads in this mode are not repeatable: two READ_YOUR_WRITES reads, even
  // when given the same propagated timestamp bound, can execute at different
  // timestamps and thus return different results. The mode does, however,
  // allow read-your-writes and read-your-reads for each client, since the
  // chosen timestamp must be higher than that of the last write or read,
  // which is known from the propagated timestamp.
  READ_YOUR_WRITES = 3;
}
// The possible order modes for clients.
// Clients specify these in new scan requests.
// Ordered scans are fault-tolerant, and can be retried elsewhere in the case
// of tablet server failure. However, ordered scans impose additional overhead
// since the tablet server needs to sort the result rows.
enum OrderMode {
  UNKNOWN_ORDER_MODE = 0;

  // The default order mode.
  UNORDERED = 1;

  ORDERED = 2;
}
// Policy with which to choose among multiple replicas.
enum ReplicaSelection {
  UNKNOWN_REPLICA_SELECTION = 0;

  // Select the LEADER replica.
  LEADER_ONLY = 1;

  // Select the replica closest to the client. From closest to furthest,
  // replicas are classified as:
  //   1. Local replicas
  //   2. Replicas whose tablet server has the same location as the client
  //   3. All other replicas
  CLOSEST_REPLICA = 2;
}
// The serialized format of a Kudu table partition schema.
message PartitionSchemaPB {
  // Identifies a column within a partition schema. In general the name is
  // used when a client creates the table, because column IDs are assigned by
  // the master; every other use of a partition schema uses the numeric
  // column ID.
  message ColumnIdentifierPB {
    oneof identifier {
      int32 id = 1;
      string name = 2;
    }
  }

  message RangeSchemaPB {
    // Identifiers of the columns included in the range. Every column must be
    // a component of the primary key.
    repeated ColumnIdentifierPB columns = 1;
  }

  message HashBucketSchemaPB {
    // Identifiers of the columns included in the hash. Every column must be
    // a component of the primary key.
    repeated ColumnIdentifierPB columns = 1;

    // Number of buckets the columns are hashed into. Must be at least 2.
    required int32 num_buckets = 2;

    // Seed for the hash calculation. Administrators may set a seed on a
    // per-table basis to randomize the mapping of rows to buckets. A seed
    // gives some amount of protection against denial of service attacks when
    // the hash bucket columns contain user provided input.
    optional uint32 seed = 3;

    // Hash algorithm used to calculate the hash bucket.
    // NOTE: this is not used yet -- don't expect setting it to have any
    // effect.
    optional HashAlgorithm hash_algorithm = 4;
  }

  // A range partition that carries its own, custom hash schema.
  message RangeWithHashSchemaPB {
    // Row operations containing the lower and upper bound of the range.
    optional RowOperationsPB range_bounds = 1;

    // Hash schema for the range.
    repeated HashBucketSchemaPB hash_schema = 2;
  }

  // Table-wide hash schema. The hash schema of a particular range may be
  // overridden by the corresponding element of 'custom_hash_schema_ranges'.
  repeated HashBucketSchemaPB hash_schema = 1;

  // Range schema used to partition the key space into ranges.
  optional RangeSchemaPB range_schema = 2;

  // Two fields were deprecated in favor of 'custom_hash_schema_ranges'.
  reserved 3, 4;

  // When this field is empty, the table-wide hash schema given by
  // 'hash_schema' applies to all ranges of the table. Otherwise, particular
  // ranges have their hash schema as specified by the corresponding elements
  // of this field.
  repeated RangeWithHashSchemaPB custom_hash_schema_ranges = 5;
}
// The serialized format of a Kudu table partition.
message PartitionPB {
  // Hash buckets of the partition. Their number must match the number of
  // hash dimensions in the partition's schema.
  repeated int32 hash_buckets = 1 [packed = true];

  // Encoded start partition key (inclusive).
  optional bytes partition_key_start = 2;

  // Encoded end partition key (exclusive).
  optional bytes partition_key_end = 3;
}
// A predicate that can be applied on a Kudu column.
message ColumnPredicatePB {
  // Name of the column the predicate applies to.
  optional string column = 1;

  message Range {
    // Encoding of the bounds:
    //  - STRING/BINARY values: simply the exact string value for the bound.
    //  - other types: the canonical x86 in-memory representation -- e.g. for
    //    uint32s, a little-endian value.
    //
    // This predicate type should not be used for NULL data: NULL is defined
    // to be neither greater than nor less than other values for the
    // comparison operator.

    // Lower bound (inclusive).
    optional bytes lower = 1 [(kudu.REDACT) = true];

    // Upper bound (exclusive).
    optional bytes upper = 2 [(kudu.REDACT) = true];
  }

  message Equality {
    // The value the column must equal. See the comment in Range for notes on
    // the encoding.
    optional bytes value = 1 [(kudu.REDACT) = true];
  }

  message InList {
    // The list of values for the field. See the comment in Range for notes
    // on the encoding.
    repeated bytes values = 1 [(kudu.REDACT) = true];
  }

  message IsNotNull {}

  message IsNull {}

  message InBloomFilter {
    // The list of bloom filters for the field.
    repeated BlockBloomFilterPB bloom_filters = 1;

    // 'lower' and 'upper' are optional for InBloomFilter. When both an
    // InBloomFilter and a Range predicate are used for the same column, the
    // merged result can be an InBloomFilter within the specified range.

    // Lower bound (inclusive).
    optional bytes lower = 2 [(kudu.REDACT) = true];

    // Upper bound (exclusive).
    optional bytes upper = 3 [(kudu.REDACT) = true];
  }

  // The kind of predicate; at most one member is set at a time.
  oneof predicate {
    Range range = 2;
    Equality equality = 3;
    IsNotNull is_not_null = 4;
    InList in_list = 5;
    IsNull is_null = 6;
    InBloomFilter in_bloom_filter = 7;
  }
}
// The primary key range of a Kudu tablet.
message KeyRangePB {
  // Encoded primary key at which scanning begins (inclusive).
  optional bytes start_primary_key = 1 [(kudu.REDACT) = true];

  // Encoded primary key at which scanning stops (exclusive).
  optional bytes stop_primary_key = 2 [(kudu.REDACT) = true];

  // Number of bytes in the chunk.
  required uint64 size_bytes_estimates = 3;
}
// Optional per-table configuration settings.
message TableExtraConfigPB {
  // How many seconds of history to retain for tablets in this table,
  // including the history required to perform diff scans and incremental
  // backups. Reads initiated at a snapshot older than this age are rejected.
  // Equivalent to --tablet_history_max_age_sec.
  optional int32 history_max_age_sec = 1;

  // Maintenance priority level of the table. It is clamped into the range
  // [-FLAGS_max_priority_range, FLAGS_max_priority_range] when calculating
  // the maintenance priority score.
  optional int32 maintenance_priority = 2;

  // When set to true, the table's data on disk is not compacted.
  optional bool disable_compaction = 3;
}
// The type of a given table. This is useful in determining whether a
// table/tablet stores user-specified data, as opposed to being a Kudu-internal
// system table.
enum TableTypePB {
  // A table that stores user data.
  DEFAULT_TABLE = 0;

  // A table that stores transaction status management metadata.
  TXN_STATUS_TABLE = 1;
}