| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| // |
| // Protobufs which are common throughout Kudu. |
| // |
| // This file may contain protobufs which are persisted on disk |
| // as well as sent on the wire. If a particular protobuf is only |
| // used as part of the client-server wire protocol, it should go |
| // in common/wire_protocol.proto instead. If it is only used within |
| // the server(s), it should go in cfile/cfile.proto, server/metadata.proto, |
| // etc, as appropriate. |
| syntax = "proto2"; |
| package kudu; |
| |
| option java_package = "org.apache.kudu"; |
| |
| import "kudu/common/row_operations.proto"; |
| import "kudu/util/block_bloom_filter.proto"; |
| import "kudu/util/compression/compression.proto"; |
| import "kudu/util/hash.proto"; |
| import "kudu/util/pb_util.proto"; |
| |
| // Data types of column values (referenced by ColumnSchemaPB.type). |
| // |
| // If you add a new type keep in mind to add it to the end |
| // or update AddMapping() functions like the one in key_encoder.cc |
| // that have a vector that maps the protobuf tag with the index. |
| enum DataType { |
| // Placeholder for an unknown type. It is declared first, which in proto2 |
| // makes it the default for an optional DataType field that has no explicit |
| // default. |
| UNKNOWN_DATA = 999; |
| UINT8 = 0; |
| INT8 = 1; |
| UINT16 = 2; |
| INT16 = 3; |
| UINT32 = 4; |
| INT32 = 5; |
| UINT64 = 6; |
| INT64 = 7; |
| STRING = 8; |
| BOOL = 9; |
| FLOAT = 10; |
| DOUBLE = 11; |
| BINARY = 12; |
| // NOTE(review): presumably microseconds since the Unix epoch -- confirm. |
| UNIXTIME_MICROS = 13; |
| INT128 = 14; |
| // Decimal types; ColumnTypeAttributesPB carries the precision and scale |
| // attributes for decimal columns. |
| DECIMAL32 = 15; |
| DECIMAL64 = 16; |
| DECIMAL128 = 17; |
| IS_DELETED = 18; // virtual column; not a real data type |
| // ColumnTypeAttributesPB carries the length attribute for varchar columns. |
| VARCHAR = 19; |
| DATE = 20; |
| } |
| |
| // Encodings for the on-disk storage of a column (see |
| // ColumnSchemaPB.encoding, whose declared default is AUTO_ENCODING). |
| enum EncodingType { |
| // Placeholder for an unknown encoding. It is declared first, which in proto2 |
| // makes it the default for an optional EncodingType field that has no |
| // explicit default (e.g. ColumnSchemaDeltaPB.encoding). |
| UNKNOWN_ENCODING = 999; |
| AUTO_ENCODING = 0; |
| PLAIN_ENCODING = 1; |
| PREFIX_ENCODING = 2; |
| // GROUP_VARINT encoding is deprecated and no longer implemented. |
| // The value is still declared, so its number (3) stays occupied. |
| GROUP_VARINT = 3; |
| RLE = 4; |
| DICT_ENCODING = 5; |
| BIT_SHUFFLE = 6; |
| } |
| |
| // Enums that specify the HMS-related configurations for a Kudu mini-cluster. |
| // |
| // The values are not declared in numeric order (DISABLE_HIVE_METASTORE is 3 |
| // but is listed second). The numbers identify the values on the wire, so |
| // they must not be renumbered to match the declaration order. |
| enum HmsMode { |
| // No HMS will be started. This is the first declared value, so it is the |
| // proto2 default for an optional HmsMode field with no explicit default. |
| NONE = 0; |
| |
| // The HMS will be started, but will not be configured to use the Kudu |
| // plugin. |
| DISABLE_HIVE_METASTORE = 3; |
| |
| // The HMS will be started and configured to use the Kudu plugin, but the |
| // Kudu mini-cluster will not be configured to synchronize with it. |
| ENABLE_HIVE_METASTORE = 1; |
| |
| // The HMS will be started and configured to use the Kudu plugin, and the |
| // Kudu mini-cluster will be configured to synchronize with it. |
| ENABLE_METASTORE_INTEGRATION = 2; |
| } |
| |
| // Holds detailed attributes for the column. Only certain fields will be set, |
| // depending on the type of the column. |
| // |
| // Referenced from ColumnSchemaPB.type_attributes. |
| message ColumnTypeAttributesPB { |
| // For decimal columns: the precision and scale of the decimal value. |
| optional int32 precision = 1; |
| optional int32 scale = 2; |
| // For varchar columns: the length attribute. |
| // NOTE(review): the unit (bytes vs. characters) and whether this is a |
| // maximum are not stated here -- confirm. |
| optional int32 length = 3; |
| } |
| |
| // Schema of a single column. |
| // |
| // TODO: Differentiate between the schema attributes |
| // that are only relevant to the server (e.g., |
| // encoding and compression) and those that also |
| // matter to the client. |
| message ColumnSchemaPB { |
| // Numeric ID of the column. |
| optional uint32 id = 1; |
| // Name and data type of the column; both are mandatory. |
| required string name = 2; |
| required DataType type = 3; |
| // Whether the column is a key column; false unless set. |
| optional bool is_key = 4 [default = false]; |
| // Whether the column is nullable; false unless set. |
| optional bool is_nullable = 5 [default = false]; |
| |
| // Default values. |
| // |
| // NOTE: clients see a single "default value" per column. The separate |
| // read and write defaults are used internally and should not be exposed |
| // by any public client APIs. |
| // |
| // - Create/alter table: when passing a schema to the master, specify the |
| //   default in 'read_default_value'. |
| // - Open table: the client receives both the read and write defaults, and |
| //   the *write* default is what should be exposed as the "current" default. |
| optional bytes read_default_value = 6; |
| optional bytes write_default_value = 7; |
| |
| // On-disk storage attributes of the column. Depending on context they are |
| // not always set; when unset, the declared defaults below apply. |
| optional EncodingType encoding = 8 [default = AUTO_ENCODING]; |
| optional CompressionType compression = 9 [default = DEFAULT_COMPRESSION]; |
| optional int32 cfile_block_size = 10 [default = 0]; |
| |
| // Type-specific attributes (see ColumnTypeAttributesPB). |
| optional ColumnTypeAttributesPB type_attributes = 11; |
| |
| // The comment for the column. |
| optional string comment = 12; |
| |
| // NOTE(review): presumably marks a column whose values cannot be changed |
| // once written -- confirm. False unless set. |
| optional bool immutable = 13 [default = false]; |
| } |
| |
| // A set of changes to a column's schema. Every field is optional. |
| // NOTE(review): presumably only the fields that are set are applied, and |
| // this is used when altering a table -- confirm against the callers. |
| // |
| // Field number 3 is not used by any field in this message. |
| // NOTE(review): if it belonged to a removed field, consider adding |
| // 'reserved 3;' so that the number cannot be reused by accident. |
| message ColumnSchemaDeltaPB { |
| // Name of the column the changes apply to, and its new name if it is being |
| // renamed. |
| optional string name = 1; |
| optional string new_name = 2; |
| |
| // New default value for the column, or a flag requesting that the default |
| // be removed. |
| optional bytes default_value = 4; |
| optional bool remove_default = 5; |
| |
| // New storage attributes. Neither enum field declares an explicit default. |
| optional EncodingType encoding = 6; |
| optional CompressionType compression = 7; |
| optional int32 block_size = 8; |
| |
| // New comment for the column. |
| optional string new_comment = 9; |
| } |
| |
| // A schema, expressed as the list of its column schemas. |
| message SchemaPB { |
| // One entry per column; see ColumnSchemaPB. |
| repeated ColumnSchemaPB columns = 1; |
| } |
| |
| // A host and port pair. Both fields are required. |
| message HostPortPB { |
| // The host part of the pair. |
| required string host = 1; |
| // The port part of the pair. |
| required uint32 port = 2; |
| } |
| |
| // The external consistency mode for client requests. |
| // This defines how ops and/or sequences of operations that touch |
| // several TabletServers, in different machines, can be observed by external |
| // clients. |
| // |
| // Note that ExternalConsistencyMode makes no guarantee on atomicity, i.e. |
| // no sequence of operations is made atomic (or transactional) just because |
| // an external consistency mode is set. |
| // Note also that ExternalConsistencyMode has no implication on the |
| // consistency between replicas of the same tablet. |
| enum ExternalConsistencyMode { |
| // Placeholder zero value; the mode documented as the default is |
| // CLIENT_PROPAGATED below. |
| UNKNOWN_EXTERNAL_CONSISTENCY_MODE = 0; |
| |
| // The response to any write will contain a timestamp. |
| // Any further calls from the same client to other servers will update |
| // those servers with that timestamp. The user will make sure that the |
| // timestamp is propagated through back-channels to other |
| // KuduClient's. |
| // |
| // WARNING: Failure to propagate timestamp information through |
| // back-channels will negate any external consistency guarantee under this |
| // mode. |
| // |
| // Example: |
| // 1 - Client A executes operation X in Tablet A |
| // 2 - Afterwards, Client A executes operation Y in Tablet B |
| // |
| // |
| // Client B may observe the following operation sequences: |
| // {}, {X}, {X Y} |
| // |
| // This is the default mode. |
| CLIENT_PROPAGATED = 1; |
| |
| // The server will guarantee that each op is externally consistent by making |
| // sure that none of its results are visible until every Kudu server agrees |
| // that the op is in the past. The client is not obligated to forward |
| // timestamp information through back-channels. |
| // |
| // WARNING: Depending on the clock synchronization state of TabletServers |
| // this may imply considerable latency. Moreover operations with |
| // COMMIT_WAIT requested external consistency will outright fail if |
| // TabletServer clocks are either unsynchronized or synchronized but |
| // with a maximum error which surpasses a pre-configured one. |
| // |
| // Example: |
| // - Client A executes operation X in Tablet A |
| // - Afterwards, Client A executes operation Y in Tablet B |
| // |
| // |
| // Client B may observe the following operation sequences: |
| // {}, {X}, {X Y} |
| COMMIT_WAIT = 2; |
| } |
| |
| // The possible read modes for clients. |
| // Clients set these in Scan requests. |
| // The server keeps 2 snapshot boundaries: |
| // - The earliest snapshot: this corresponds to the earliest kept undo records |
| // in the tablet, meaning the current state (Base) can be undone up to |
| // this snapshot. |
| // - The latest snapshot: This corresponds to the instant beyond which no op |
| // will have an earlier timestamp. Usually this corresponds to whatever |
| // clock->Now() returns, but can be higher if the client propagates a |
| // timestamp (see below). |
| enum ReadMode { |
| // Placeholder zero value; the mode documented as the default is |
| // READ_LATEST below. |
| UNKNOWN_READ_MODE = 0; |
| |
| // When READ_LATEST is specified the server will execute the read independently |
| // of the clock and will always return all visible writes at the time the request |
| // was received. This type of read does not return a snapshot timestamp since |
| // it might not be repeatable, i.e. a later read executed at the same snapshot |
| // timestamp might yield rows that were committed by in-flight ops. |
| // |
| // This is the default mode. |
| READ_LATEST = 1; |
| |
| // When READ_AT_SNAPSHOT is specified the server will attempt to perform a read |
| // at the required snapshot. If no snapshot is defined the server will take the |
| // current time as the snapshot timestamp. Snapshot reads are repeatable, i.e. |
| // all future reads at the same timestamp will yield the same rows. This is |
| // performed at the expense of waiting for in-flight ops whose timestamp |
| // is lower than the snapshot's timestamp to complete. |
| // |
| // When mixing reads and writes clients that specify COMMIT_WAIT as their |
| // external consistency mode and then use the returned write_timestamp |
| // to perform snapshot reads are guaranteed that that snapshot time is |
| // considered in the past by all servers and no additional action is |
| // necessary. Clients using CLIENT_PROPAGATED however must forcibly propagate |
| // the timestamps even at read time, so that the server will not generate |
| // any more ops before the snapshot requested by the client. |
| // The latter option is implemented by allowing the client to specify one or |
| // two timestamps, the first one obtained from the previous CLIENT_PROPAGATED |
| // write, directly or through back-channels, must be signed and will be |
| // checked by the server. The second one, if defined, is the actual snapshot |
| // read time. When selecting both the latter must be lower than or equal to |
| // the former. |
| // TODO implement actually signing the propagated timestamp. |
| READ_AT_SNAPSHOT = 2; |
| |
| // When READ_YOUR_WRITES is specified, the server will pick a timestamp to use |
| // for a server-local snapshot scan subject to the following criteria: |
| // (1) It will be higher than the propagated timestamp, |
| // (2) It will attempt to minimize latency caused by waiting for outstanding |
| // write ops to complete. |
| // More specifically, the server will choose the latest timestamp higher than |
| // the provided propagated timestamp bound that allows execution of the |
| // reads without being blocked by the in-flight ops (however the |
| // read can be blocked if the propagated timestamp is higher than some in-flight |
| // ops). If no propagated timestamp is provided the server will choose |
| // a timestamp such that all ops before it are committed. The chosen |
| // timestamp will be returned back to the client as 'snapshot timestamp'. The Kudu |
| // client library will use it as the propagated timestamp for subsequent reads |
| // to avoid unnecessarily waiting. |
| // |
| // Reads in this mode are not repeatable: two READ_YOUR_WRITES reads, even if |
| // they provide the same propagated timestamp bound, can execute at different |
| // timestamps and thus return different results. However, it allows |
| // read-your-writes and read-your-reads for each client, as the chosen |
| // timestamp must be higher than the one of the last write or read, |
| // known from the propagated timestamp. |
| READ_YOUR_WRITES = 3; |
| } |
| |
| // The possible order modes for clients. |
| // Clients specify these in new scan requests. |
| // Ordered scans are fault-tolerant, and can be retried elsewhere in the case |
| // of tablet server failure. However, ordered scans impose additional overhead |
| // since the tablet server needs to sort the result rows. |
| enum OrderMode { |
| // Placeholder zero value; the mode documented as the default is UNORDERED. |
| UNKNOWN_ORDER_MODE = 0; |
| // This is the default order mode. |
| UNORDERED = 1; |
| // Ordered scan: fault-tolerant, at the cost of the tablet server sorting |
| // the result rows (see the enum comment). |
| ORDERED = 2; |
| } |
| |
| // Policy with which to choose among multiple replicas. |
| enum ReplicaSelection { |
| // Placeholder zero value; it does not name a selection policy. |
| UNKNOWN_REPLICA_SELECTION = 0; |
| // Select the LEADER replica. |
| LEADER_ONLY = 1; |
| // Select the closest replica to the client. Replicas are classified from |
| // closest to furthest as follows: |
| // - Local replicas |
| // - Replicas whose tablet server has the same location as the client |
| // - All other replicas |
| CLOSEST_REPLICA = 2; |
| } |
| |
| // The serialized format of a Kudu table partition schema. |
| message PartitionSchemaPB { |
| |
| // Identifies a column within a partition schema, either by numeric ID or by |
| // name. The name is generally used when a client creates the table, because |
| // column IDs are assigned by the master; every other use of a partition |
| // schema refers to columns by numeric ID. |
| message ColumnIdentifierPB { |
| oneof identifier { |
| int32 id = 1; |
| string name = 2; |
| } |
| } |
| |
| // The range component of a partition schema. |
| message RangeSchemaPB { |
| // Identifiers of the columns that make up the range. Each one must be a |
| // component of the primary key. |
| repeated ColumnIdentifierPB columns = 1; |
| } |
| |
| // One hash dimension of a partition schema. |
| message HashBucketSchemaPB { |
| // Identifiers of the columns that are hashed. Each one must be a component |
| // of the primary key. |
| repeated ColumnIdentifierPB columns = 1; |
| |
| // How many buckets the columns are hashed into. Must be at least 2. |
| required int32 num_buckets = 2; |
| |
| // Seed for the hash calculation. Administrators may set a seed on a |
| // per-table basis to randomize the mapping of rows to buckets, which gives |
| // some amount of protection against denial of service attacks when the |
| // hash bucket columns contain user provided input. |
| optional uint32 seed = 3; |
| |
| // Hash algorithm for calculating the hash bucket. |
| // NOTE: this is not used yet -- don't expect setting it to have any effect |
| optional HashAlgorithm hash_algorithm = 4; |
| } |
| |
| // A range partition that carries its own (custom) hash schema. |
| message RangeWithHashSchemaPB { |
| // Row operations holding the lower and upper range bound of the range. |
| optional RowOperationsPB range_bounds = 1; |
| // The hash schema that applies to this range. |
| repeated HashBucketSchemaPB hash_schema = 2; |
| } |
| |
| // The table-wide hash schema. For a particular range it may be overridden |
| // by the corresponding element of 'custom_hash_schema_ranges'. |
| repeated HashBucketSchemaPB hash_schema = 1; |
| |
| // The range schema used to partition the key space into ranges. |
| optional RangeSchemaPB range_schema = 2; |
| |
| // Field numbers 3 and 4 belonged to two fields that were deprecated in favor |
| // of 'custom_hash_schema_ranges'; they stay reserved so they are not reused. |
| reserved 3, 4; |
| |
| // When this field is empty, the table-wide hash schema in 'hash_schema' |
| // applies to every range of the table. Otherwise, particular ranges have |
| // their hash schema as specified by the corresponding elements here. |
| repeated RangeWithHashSchemaPB custom_hash_schema_ranges = 5; |
| } |
| |
| // The serialized format of a Kudu table partition. |
| message PartitionPB { |
| // The hash buckets of the partition. The number of hash buckets must match |
| // the number of hash dimensions in the partition's schema. |
| // Packed encoding is requested explicitly because proto2 does not pack |
| // repeated scalar fields by default. |
| repeated int32 hash_buckets = 1 [packed = true]; |
| // The encoded start partition key (inclusive). |
| optional bytes partition_key_start = 2; |
| // The encoded end partition key (exclusive). |
| optional bytes partition_key_end = 3; |
| } |
| |
| // A predicate that can be applied on a Kudu column. |
| // |
| // The kind of predicate is selected by setting exactly one member of the |
| // 'predicate' oneof at the bottom of this message. |
| message ColumnPredicatePB { |
| // The predicate column name. |
| optional string column = 1; |
| |
| // A range predicate with an optional lower and an optional upper bound. |
| message Range { |
| |
| // Bounds should be encoded as follows: |
| // - STRING/BINARY values: simply the exact string value for the bound. |
| // - other type: the canonical x86 in-memory representation -- eg for |
| // uint32s, a little-endian value. |
| // |
| // Note that this predicate type should not be used for NULL data -- |
| // NULL is defined to neither be greater than or less than other values |
| // for the comparison operator. |
| |
| // The inclusive lower bound. |
| optional bytes lower = 1 [(kudu.REDACT) = true]; |
| |
| // The exclusive upper bound. |
| optional bytes upper = 2 [(kudu.REDACT) = true]; |
| } |
| |
| // An equality predicate against a single value. |
| message Equality { |
| // The value to compare against. See comment in Range for notes on the |
| // encoding. |
| optional bytes value = 1 [(kudu.REDACT) = true]; |
| } |
| |
| // A predicate against a list of values. |
| message InList { |
| // A list of values for the field. See comment in Range for notes on |
| // the encoding. |
| repeated bytes values = 1 [(kudu.REDACT) = true]; |
| } |
| |
| // IS NOT NULL predicate; it carries no parameters. |
| message IsNotNull {} |
| |
| // IS NULL predicate; it carries no parameters. |
| message IsNull {} |
| |
| // A predicate against one or more bloom filters, optionally bounded by a |
| // range. |
| message InBloomFilter { |
| // A list of bloom filters for the field. |
| repeated BlockBloomFilterPB bloom_filters = 1; |
| |
| // lower and upper are optional for InBloomFilter. |
| // When using both InBloomFilter and Range predicate for the same column the |
| // merged result can be InBloomFilter within specified range. |
| // |
| // The inclusive lower bound. |
| optional bytes lower = 2 [(kudu.REDACT) = true]; |
| |
| // The exclusive upper bound. |
| optional bytes upper = 3 [(kudu.REDACT) = true]; |
| } |
| |
| // The predicate itself. Field numbers start at 2 because number 1 is taken |
| // by 'column' above, and they do not follow the declaration order of the |
| // nested messages. |
| oneof predicate { |
| Range range = 2; |
| Equality equality = 3; |
| IsNotNull is_not_null = 4; |
| InList in_list = 5; |
| IsNull is_null = 6; |
| InBloomFilter in_bloom_filter = 7; |
| } |
| } |
| |
| // The primary key range of a Kudu tablet. |
| message KeyRangePB { |
| // Encoded primary key to begin scanning at (inclusive). |
| optional bytes start_primary_key = 1 [(kudu.REDACT) = true]; |
| // Encoded primary key to stop scanning at (exclusive). |
| optional bytes stop_primary_key = 2 [(kudu.REDACT) = true]; |
| // Number of bytes in chunk. As the field name indicates, this is an |
| // estimate. It is the only required field of this message. |
| required uint64 size_bytes_estimates = 3; |
| } |
| |
| // Extra per-table configuration. Every field is optional. |
| message TableExtraConfigPB { |
| // Number of seconds to retain history for tablets in this table, |
| // including history required to perform diff scans and incremental |
| // backups. Reads initiated at a snapshot that is older than this |
| // age will be rejected. Equivalent to --tablet_history_max_age_sec. |
| optional int32 history_max_age_sec = 1; |
| |
| // Priority level of a table for maintenance. It will be clamped into the |
| // range [-FLAGS_max_priority_range, FLAGS_max_priority_range] when |
| // calculating the maintenance priority score. |
| optional int32 maintenance_priority = 2; |
| |
| // If set to true, the table's data on disk is not compacted. |
| optional bool disable_compaction = 3; |
| } |
| |
| // The type of a given table. This is useful in determining whether a |
| // table/tablet stores user-specified data, as opposed to being a Kudu-internal |
| // system table. |
| enum TableTypePB { |
| // The table stores user data. This is the first declared value, so it is |
| // the proto2 default for an optional TableTypePB field with no explicit |
| // default. |
| DEFAULT_TABLE = 0; |
| |
| // The table stores transaction status management metadata. |
| TXN_STATUS_TABLE = 1; |
| } |