// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
//
// Protobufs which are common throughout Kudu.
//
// This file may contain protobufs which are persisted on disk
// as well as sent on the wire. If a particular protobuf is only
// used as part of the client-server wire protocol, it should go
// in common/wire_protocol.proto instead. If it is only used within
// the server(s), it should go in cfile/cfile.proto, server/metadata.proto,
// etc, as appropriate.
syntax = "proto2";
package kudu;

option java_package = "org.apache.kudu";

import "kudu/common/row_operations.proto";
import "kudu/util/block_bloom_filter.proto";
import "kudu/util/compression/compression.proto";
import "kudu/util/hash.proto";
import "kudu/util/pb_util.proto";

// Type tags for Kudu column values. ColumnSchemaPB.type carries one of
// these.
//
// If you add a new type keep in mind to add it to the end
// or update AddMapping() functions like the one in key_encoder.cc
// that have a vector that maps the protobuf tag with the index.
enum DataType {
  // Listed first, so under proto2 this is the value an unset DataType field
  // reports when the field declares no explicit default.
  UNKNOWN_DATA = 999;
  UINT8 = 0;
  INT8 = 1;
  UINT16 = 2;
  INT16 = 3;
  UINT32 = 4;
  INT32 = 5;
  UINT64 = 6;
  INT64 = 7;
  STRING = 8;
  BOOL = 9;
  FLOAT = 10;
  DOUBLE = 11;
  BINARY = 12;
  UNIXTIME_MICROS = 13;
  INT128 = 14;
  // Decimal types. ColumnTypeAttributesPB carries 'precision' and 'scale'
  // for decimal columns.
  DECIMAL32 = 15;
  DECIMAL64 = 16;
  DECIMAL128 = 17;
  IS_DELETED = 18; // virtual column; not a real data type
  // ColumnTypeAttributesPB carries 'length' for varchar columns.
  VARCHAR = 19;
  DATE = 20;
}

// Encodings that can be named for a column's on-disk storage (see
// ColumnSchemaPB.encoding and ColumnSchemaDeltaPB.encoding).
enum EncodingType {
  // Listed first, so under proto2 this is the value an unset EncodingType
  // field reports when the field declares no explicit default (as in
  // ColumnSchemaDeltaPB.encoding). ColumnSchemaPB.encoding overrides this
  // with [default=AUTO_ENCODING].
  UNKNOWN_ENCODING = 999;
  AUTO_ENCODING = 0;
  PLAIN_ENCODING = 1;
  PREFIX_ENCODING = 2;
  // GROUP_VARINT encoding is deprecated and no longer implemented.
  // The value stays declared so that number 3 is not reused.
  GROUP_VARINT = 3;
  RLE = 4;
  DICT_ENCODING = 5;
  BIT_SHUFFLE = 6;
}

// Enums that specify the HMS-related configurations for a Kudu mini-cluster.
enum HmsMode {
  // No HMS will be started.
  // Listed first, so under proto2 this is the value an unset HmsMode field
  // reports when the field declares no explicit default.
  NONE = 0;

  // The HMS will be started, but will not be configured to use the Kudu
  // plugin.
  // NOTE: the numbers below are intentionally not in declaration order
  // (3, 1, 2). They identify the values on the wire and must not be
  // renumbered to "tidy up".
  DISABLE_HIVE_METASTORE = 3;

  // The HMS will be started and configured to use the Kudu plugin, but the
  // Kudu mini-cluster will not be configured to synchronize with it.
  ENABLE_HIVE_METASTORE = 1;

  // The HMS will be started and configured to use the Kudu plugin, and the
  // Kudu mini-cluster will be configured to synchronize with it.
  ENABLE_METASTORE_INTEGRATION = 2;
}

// Holds detailed attributes for the column. Only certain fields will be set,
// depending on the type of the column.
message ColumnTypeAttributesPB {
  // For decimal columns (DECIMAL32, DECIMAL64 and DECIMAL128 in DataType).
  optional int32 precision = 1;
  optional int32 scale = 2;
  // For varchar columns (VARCHAR in DataType).
  // NOTE(review): presumably the maximum length of a value; the unit (bytes
  // vs. characters) is not stated in this file -- confirm against the schema
  // validation code.
  optional int32 length = 3;
}

// TODO: Differentiate between the schema attributes
// that are only relevant to the server (e.g.,
// encoding and compression) and those that also
// matter to the client.
message ColumnSchemaPB {
  // Numeric ID of the column. Optional, so it is not present in every
  // message.
  optional uint32 id = 1;

  // Name and data type of the column. Both are 'required': a message that
  // lacks either one fails the proto2 initialization check.
  required string name = 2;
  required DataType type = 3;

  // Whether the column is a key column, and whether it admits NULLs.
  // Each reads as false when left unset.
  optional bool is_key = 4 [default = false];
  optional bool is_nullable = 5 [default = false];

  // Column defaults.
  //
  // From a client's point of view a column has exactly one "default value".
  // The split into a read default and a write default is an internal detail
  // and should not be exposed by any public client API.
  //
  //  - Sending a schema to the master (create/alter table): put the default
  //    in 'read_default_value'.
  //  - Receiving a schema when opening a table: both the read and the write
  //    default are present, and the *write* default is the one to expose as
  //    the "current" default.
  optional bytes read_default_value = 6;
  optional bytes write_default_value = 7;

  // Attributes describing the on-disk storage of the column. Depending on
  // context they are not always set; when unset, each reads as the default
  // declared on the field.
  optional EncodingType encoding = 8 [default = AUTO_ENCODING];
  optional CompressionType compression = 9 [default = DEFAULT_COMPRESSION];
  optional int32 cfile_block_size = 10 [default = 0];

  // Type-specific attributes; see ColumnTypeAttributesPB for which fields
  // apply to which column types.
  optional ColumnTypeAttributesPB type_attributes = 11;

  // The comment for the column.
  optional string comment = 12;
}

// Describes modifications to one column's schema.
// NOTE(review): purpose inferred from the message and field names; this file
// does not show where the message is used -- confirm against the alter-table
// request definitions.
message ColumnSchemaDeltaPB {
  // NOTE(review): presumably 'name' identifies the column being changed and
  // 'new_name', when set, is the name it should be renamed to -- confirm.
  optional string name = 1;
  optional string new_name = 2;

  // NOTE(review): field number 3 is neither used nor marked 'reserved' in
  // this message. If it once belonged to a removed field it should be
  // reserved, the way PartitionSchemaPB reserves its deprecated numbers.

  // NOTE(review): presumably a new default value for the column, and a flag
  // asking for the existing default to be dropped -- confirm, including what
  // happens when both are set.
  optional bytes default_value = 4;
  optional bool remove_default = 5;

  // These have the same types as ColumnSchemaPB's storage attributes
  // ('encoding', 'compression', 'cfile_block_size'), but are declared
  // without explicit defaults here, so has_*() distinguishes "unchanged"
  // from a requested value only if the consumer checks presence.
  optional EncodingType encoding = 6;
  optional CompressionType compression = 7;
  optional int32 block_size = 8;

  // NOTE(review): presumably the replacement for ColumnSchemaPB.comment --
  // confirm.
  optional string new_comment = 9;
}

// A schema, expressed as a list of column schemas.
message SchemaPB {
  // The columns that make up the schema, one ColumnSchemaPB per column.
  repeated ColumnSchemaPB columns = 1;
}

// A host and port pair.
message HostPortPB {
  // Both fields are 'required': a message that lacks either one fails the
  // proto2 initialization check.
  required string host = 1;
  // Declared as uint32, so the schema itself does not restrict the value to
  // the 16-bit port range.
  required uint32 port = 2;
}

// The external consistency mode for client requests.
// This defines how ops and/or sequences of operations that touch
// several TabletServers, in different machines, can be observed by external
// clients.
//
// Note that ExternalConsistencyMode makes no guarantee on atomicity, i.e.
// no sequence of operations is made atomic (or transactional) just because
// an external consistency mode is set.
// Note also that ExternalConsistencyMode has no implication on the
// consistency between replicas of the same tablet.
enum ExternalConsistencyMode {
  // Listed first, so under proto2 this is the value an unset
  // ExternalConsistencyMode field reports when the field declares no
  // explicit default.
  UNKNOWN_EXTERNAL_CONSISTENCY_MODE = 0;

  // The response to any write will contain a timestamp.
  // Any further calls from the same client to other servers will update
  // those servers with that timestamp. The user will make sure that the
  // timestamp is propagated through back-channels to other
  // KuduClient's.
  //
  // WARNING: Failure to propagate timestamp information through
  // back-channels will negate any external consistency guarantee under this
  // mode.
  //
  // Example:
  // 1 - Client A executes operation X in Tablet A
  // 2 - Afterwards, Client A executes operation Y in Tablet B
  //
  //
  // Client B, to which Client A's timestamp has not been propagated, may
  // observe the following operation sequences:
  // {}, {X}, {Y}, {X Y}
  //
  // i.e. Client B may see Y without X, which is the outcome that COMMIT_WAIT
  // rules out.
  //
  // This is the default mode.
  CLIENT_PROPAGATED = 1;

  // The server will guarantee that each op is externally consistent by making
  // sure that none of its results are visible until every Kudu server agrees
  // that the op is in the past.  The client is not obligated to forward
  // timestamp information through back-channels.
  //
  // WARNING: Depending on the clock synchronization state of TabletServers
  // this may imply considerable latency. Moreover operations with
  // COMMIT_WAIT requested external consistency will outright fail if
  // TabletServer clocks are either unsynchronized or synchronized but
  // with a maximum error which surpasses a pre-configured one.
  //
  // Example:
  // - Client A executes operation X in Tablet A
  // - Afterwards, Client A executes operation Y in Tablet B
  //
  //
  // Client B may observe the following operation sequences:
  // {}, {X}, {X Y}
  COMMIT_WAIT = 2;
}

// The possible read modes for clients.
// Clients set these in Scan requests.
// The server keeps 2 snapshot boundaries:
// - The earliest snapshot: this corresponds to the earliest kept undo records
//   in the tablet, meaning the current state (Base) can be undone up to
//   this snapshot.
// - The latest snapshot: This corresponds to the instant beyond which no op
//   will have an earlier timestamp. Usually this corresponds to whatever
//   clock->Now() returns, but can be higher if the client propagates a
//   timestamp (see below).
enum ReadMode {
  // Listed first, so under proto2 this is the value an unset ReadMode field
  // reports when the field declares no explicit default.
  // NOTE(review): the "default mode" mentioned on READ_LATEST is therefore
  // presumably applied by the declaring field's [default = ...] or by the
  // code reading the field -- confirm.
  UNKNOWN_READ_MODE = 0;

  // When READ_LATEST is specified the server will execute the read independently
  // of the clock and will always return all visible writes at the time the request
  // was received. This type of read does not return a snapshot timestamp since
  // it might not be repeatable, i.e. a later read executed at the same snapshot
  // timestamp might yield rows that were committed by in-flight ops.
  //
  // This is the default mode.
  READ_LATEST = 1;

  // When READ_AT_SNAPSHOT is specified the server will attempt to perform a read
  // at the required snapshot. If no snapshot is defined the server will take the
  // current time as the snapshot timestamp. Snapshot reads are repeatable, i.e.
  // all future reads at the same timestamp will yield the same rows. This is
  // performed at the expense of waiting for in-flight ops whose timestamp
  // is lower than the snapshot's timestamp to complete.
  //
  // When mixing reads and writes, clients that specify COMMIT_WAIT as their
  // external consistency mode and then use the returned write_timestamp
  // to perform snapshot reads are guaranteed that the snapshot time is
  // considered in the past by all servers and no additional action is
  // necessary. Clients using CLIENT_PROPAGATED however must forcibly propagate
  // the timestamps even at read time, so that the server will not generate
  // any more ops before the snapshot requested by the client.
  // The latter option is implemented by allowing the client to specify one or
  // two timestamps, the first one obtained from the previous CLIENT_PROPAGATED
  // write, directly or through back-channels, must be signed and will be
  // checked by the server. The second one, if defined, is the actual snapshot
  // read time. When selecting both the latter must be lower than or equal to
  // the former.
  // TODO implement actually signing the propagated timestamp.
  READ_AT_SNAPSHOT = 2;

  // When READ_YOUR_WRITES is specified, the server will pick a timestamp to use
  // for a server-local snapshot scan subject to the following criteria:
  // (1) It will be higher than the propagated timestamp,
  // (2) It will attempt to minimize latency caused by waiting for outstanding
  //     write ops to complete.
  // More specifically, the server will choose the latest timestamp higher than
  // the provided propagated timestamp bound that allows execution of the
  // reads without being blocked by the in-flight ops (however the
  // read can be blocked if the propagated timestamp is higher than some in-flight
  // ops). If no propagated timestamp is provided the server will choose
  // a timestamp such that all ops before it are committed. The chosen
  // timestamp will be returned back to the client as 'snapshot timestamp'. The Kudu
  // client library will use it as the propagated timestamp for subsequent reads
  // to avoid unnecessarily waiting.
  //
  // Reads in this mode are not repeatable: two READ_YOUR_WRITES reads, even if
  // they provide the same propagated timestamp bound, can execute at different
  // timestamps and thus return different results. However, it allows
  // read-your-writes and read-your-reads for each client, as the chosen
  // timestamp must be higher than the one of the last write or read,
  // known from the propagated timestamp.
  READ_YOUR_WRITES = 3;
}

// The possible order modes for clients.
// Clients specify these in new scan requests.
// Ordered scans are fault-tolerant, and can be retried elsewhere in the case
// of tablet server failure. However, ordered scans impose additional overhead
// since the tablet server needs to sort the result rows.
enum OrderMode {
  // Listed first, so under proto2 this is the value an unset OrderMode field
  // reports when the field declares no explicit default.
  UNKNOWN_ORDER_MODE = 0;
  // This is the default order mode.
  UNORDERED = 1;
  // Ordered scan: fault-tolerant and retriable on another server, at the
  // cost of the tablet server sorting the result rows.
  ORDERED = 2;
}

// Policy with which to choose among multiple replicas.
enum ReplicaSelection {
  // Listed first, so under proto2 this is the value an unset
  // ReplicaSelection field reports when the field declares no explicit
  // default.
  UNKNOWN_REPLICA_SELECTION = 0;
  // Select the LEADER replica.
  LEADER_ONLY = 1;
  // Select the closest replica to the client. Replicas are classified from
  // closest to furthest as follows:
  //   - Local replicas
  //   - Replicas whose tablet server has the same location as the client
  //   - All other replicas
  CLOSEST_REPLICA = 2;
}

// The serialized format of a Kudu table partition schema.
message PartitionSchemaPB {

  // A column identifier for partition schemas. In general, the name will be
  // used when a client creates the table since column IDs are assigned by the
  // master. All other uses of partition schemas will use the numeric column ID.
  message ColumnIdentifierPB {
    // At most one of the two is set at a time: setting one member of a oneof
    // clears the other.
    oneof identifier {
      int32 id = 1;
      string name = 2;
    }
  }

  // The columns that make up the range component of the partition schema.
  message RangeSchemaPB {
    // Column identifiers of columns included in the range. All columns must be
    // a component of the primary key.
    repeated ColumnIdentifierPB columns = 1;
  }

  // One level of hash bucketing: a set of columns that is hashed into
  // 'num_buckets' buckets.
  message HashBucketSchemaPB {
    // Column identifiers of columns included in the hash. Every column must be
    // a component of the primary key.
    repeated ColumnIdentifierPB columns = 1;

    // Number of buckets into which columns will be hashed. Must be at least 2.
    // The schema itself only enforces presence ('required'); the lower bound
    // has to be checked by the code consuming this message.
    required int32 num_buckets = 2;

    // Seed value for hash calculation. Administrators may set a seed value
    // on a per-table basis in order to randomize the mapping of rows to
    // buckets. Setting a seed provides some amount of protection against denial
    // of service attacks when the hash bucket columns contain user provided
    // input.
    optional uint32 seed = 3;

    // The hash algorithm to use for calculating the hash bucket.
    // NOTE: this is not used yet -- don't expect setting it to have any effect
    optional HashAlgorithm hash_algorithm = 4;
  }

  // This data structure represents a range partition with a custom hash schema.
  message RangeWithHashSchemaPB {
    // Row operations containing the lower and upper range bound for the range.
    optional RowOperationsPB range_bounds = 1;
    // Hash schema for the range.
    repeated HashBucketSchemaPB hash_schema = 2;
  }

  // Table-wide hash schema. Hash schema for a particular range may be
  // overridden by the corresponding element in 'custom_hash_schema_ranges'.
  repeated HashBucketSchemaPB hash_schema = 1;

  // Range schema to partition the key space into ranges.
  optional RangeSchemaPB range_schema = 2;

  // Two fields were deprecated in favor of using 'custom_hash_schema_ranges'.
  // Their numbers stay reserved so that they cannot be reused by new fields.
  reserved 3;
  reserved 4;

  // If the 'custom_hash_schema_ranges' field is empty, the table-wide hash
  // schema specified by the 'hash_schema' field is used for all the ranges
  // of the table. Otherwise, particular ranges have their hash schema
  // as specified by corresponding elements in 'custom_hash_schema_ranges'.
  repeated RangeWithHashSchemaPB custom_hash_schema_ranges = 5;
}

// The serialized format of a Kudu table partition.
message PartitionPB {
  // The hash buckets of the partition. The number of hash buckets must match
  // the number of hash dimensions in the partition's schema.
  // Packed encoding is requested explicitly because proto2 does not pack
  // repeated scalar fields by default.
  repeated int32 hash_buckets = 1 [packed = true];
  // The encoded start partition key (inclusive).
  optional bytes partition_key_start = 2;
  // The encoded end partition key (exclusive).
  optional bytes partition_key_end = 3;
}

// A predicate that can be applied on a Kudu column.
message ColumnPredicatePB {
  // The predicate column name.
  optional string column = 1;

  // Matches values within a range; either bound may be left unset.
  message Range {

    // Bounds should be encoded as follows:
    // - STRING/BINARY values: simply the exact string value for the bound.
    // - other type: the canonical x86 in-memory representation -- eg for
    //   uint32s, a little-endian value.
    //
    // Note that this predicate type should not be used for NULL data --
    // NULL is defined to neither be greater than or less than other values
    // for the comparison operator.

    // The inclusive lower bound.
    optional bytes lower = 1 [(kudu.REDACT) = true];

    // The exclusive upper bound.
    optional bytes upper = 2 [(kudu.REDACT) = true];
  }

  // Matches a single value.
  message Equality {
    // The value to compare the column against for equality. See comment in
    // Range for notes on the encoding.
    optional bytes value = 1 [(kudu.REDACT) = true];
  }

  // Matches any value from a list.
  message InList {
    // A list of values for the field. See comment in Range for notes on
    // the encoding.
    repeated bytes values = 1 [(kudu.REDACT) = true];
  }

  // Marker message with no fields: selecting it in the 'predicate' oneof is
  // the whole predicate.
  message IsNotNull {}

  // Marker message with no fields: selecting it in the 'predicate' oneof is
  // the whole predicate.
  message IsNull {}

  message InBloomFilter {
    // A list of bloom filters for the field.
    repeated BlockBloomFilterPB bloom_filters = 1;

    // lower and upper are optional for InBloomFilter.
    // When using both InBloomFilter and Range predicate for the same column the
    // merged result can be InBloomFilter within specified range.
    //
    // The inclusive lower bound.
    optional bytes lower = 2 [(kudu.REDACT) = true];

    // The exclusive upper bound.
    optional bytes upper = 3 [(kudu.REDACT) = true];
  }

  // The kind of predicate. At most one member is set at a time: setting one
  // member of a oneof clears the others. Field number 1 is taken by 'column',
  // so the members start at 2.
  oneof predicate {
    Range range = 2;
    Equality equality = 3;
    IsNotNull is_not_null = 4;
    InList in_list = 5;
    IsNull is_null = 6;
    InBloomFilter in_bloom_filter = 7;
  }
}

// The primary key range of a Kudu tablet.
message KeyRangePB {
  // Encoded primary key to begin scanning at (inclusive).
  optional bytes start_primary_key = 1 [(kudu.REDACT) = true];
  // Encoded primary key to stop scanning at (exclusive).
  optional bytes stop_primary_key = 2 [(kudu.REDACT) = true];
  // Number of bytes in chunk.
  // NOTE(review): judging by the field name this is an estimate rather than
  // an exact size -- confirm with the code that fills it in.
  required uint64 size_bytes_estimates = 3;
}

// Additional per-table configuration. Every field is optional.
message TableExtraConfigPB {
  // Number of seconds to retain history for tablets in this table,
  // including history required to perform diff scans and incremental
  // backups. Reads initiated at a snapshot that is older than this
  // age will be rejected. Equivalent to --tablet_history_max_age_sec.
  optional int32 history_max_age_sec = 1;

  // Priority level of a table for maintenance. It will be clamped into the
  // range [-FLAGS_max_priority_range, FLAGS_max_priority_range] when
  // calculating the maintenance priority score.
  optional int32 maintenance_priority = 2;

  // If set true, the table's data on disk is not compacted.
  optional bool disable_compaction = 3;
}

// The type of a given table. This is useful in determining whether a
// table/tablet stores user-specified data, as opposed to being a Kudu-internal
// system table.
enum TableTypePB {
  // The table stores user data.
  // Listed first, so under proto2 this is the value an unset TableTypePB
  // field reports when the field declares no explicit default.
  DEFAULT_TABLE = 0;

  // The table stores transaction status management metadata.
  TXN_STATUS_TABLE = 1;
}
