| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| #pragma once |
| |
| #include <cstdint> |
| #include <string> |
| #include <utility> |
| #include <vector> |
| |
| #include <gtest/gtest_prod.h> |
| |
| #include "kudu/common/schema.h" |
| #include "kudu/gutil/port.h" |
| #include "kudu/util/slice.h" |
| #include "kudu/util/status.h" |
| |
| namespace google { |
| namespace protobuf { |
| template <typename T> class RepeatedPtrField; |
| } // namespace protobuf |
| } // namespace google |
| |
| namespace kudu { |
| |
| class Arena; |
| class ConstContiguousRow; |
| class KuduPartialRow; |
| class PartitionPB; |
| class PartitionSchemaPB; |
| class PartitionSchemaPB_HashBucketSchemaPB; |
| template <typename Buffer> class KeyEncoder; |
| |
| // A Partition describes the set of rows that a Tablet is responsible for |
| // serving. Each tablet is assigned a single Partition. |
| // |
| // Partitions consist primarily of a start and end partition key. Every row with |
| // a partition key that falls in a Tablet's Partition will be served by that |
| // tablet. |
| // |
| // In addition to the start and end partition keys, a Partition holds metadata |
| // to determine if a scan can prune, or skip, a partition based on the scan's |
| // start and end primary keys, and predicates. |
| class Partition { |
| public: |
| |
| const std::vector<int32_t>& hash_buckets() const { |
| return hash_buckets_; |
| } |
| |
| Slice range_key_start() const; |
| |
| Slice range_key_end() const; |
| |
| const std::string& partition_key_start() const { |
| return partition_key_start_; |
| } |
| |
| const std::string& partition_key_end() const { |
| return partition_key_end_; |
| } |
| |
| // Returns true if the other partition is equivalent to this one. |
| bool Equals(const Partition& other) const; |
| |
| // Serializes a partition into a protobuf message. |
| void ToPB(PartitionPB* pb) const; |
| |
| // Deserializes a protobuf message into a partition. |
| // |
| // The protobuf message is not validated, since partitions are only expected |
| // to be created by the master process. |
| static void FromPB(const PartitionPB& pb, Partition* partition); |
| |
| private: |
| friend class PartitionSchema; |
| |
| // Helper function for accessing the range key portion of a partition key. |
| Slice range_key(const std::string& partition_key) const; |
| |
| std::vector<int32_t> hash_buckets_; |
| |
| std::string partition_key_start_; |
| std::string partition_key_end_; |
| }; |
| |
| // A partition schema describes how the rows of a table are distributed among |
| // tablets. |
| // |
| // Primarily, a table's partition schema is responsible for translating the |
| // primary key column values of a row into a partition key that can be used to |
| // determine the tablet containing the key. |
| // |
| // The partition schema is made up of zero or more hash bucket components, |
| // followed by a single range component. |
| // |
| // Each hash bucket component includes one or more columns from the primary key |
| // column set, with the restriction that an individual primary key column may |
| // only be included in a single hash component. |
| // |
| // To determine the hash bucket of an individual row, the values of the columns |
| // of the hash component are encoded into bytes (in PK or lexicographic |
| // preserving encoding), then hashed into a u64, then modded into an i32. When |
| // constructing a partition key from a row, the buckets of the row are simply |
| // encoded into the partition key in order (again in PK or lexicographic |
| // preserving encoding). |
| // |
| // The range component contains a (possibly full or empty) subset of the primary |
| // key columns. When encoding the partition key, the columns of the partition |
| // component are encoded in order. |
| // |
| // The above is true of the relationship between rows and partition keys. It |
| // gets trickier with partitions (tablet partition key boundaries), because the |
| // boundaries of tablets do not necessarily align to rows. For instance, |
| // currently the absolute-start and absolute-end primary keys of a table |
| // represented as an empty key, but do not have a corresponding row. Partitions |
| // are similar, but instead of having just one absolute-start and absolute-end, |
| // each component of a partition schema has an absolute-start and absolute-end. |
| // When creating the initial set of partitions during table creation, we deal |
| // with this by "carrying through" absolute-start or absolute-ends into lower |
| // significance components. |
| // |
| // Notes on redaction: |
| // |
| // For the purposes of redaction, Kudu considers partitions and partition |
| // schemas to be metadata - not sensitive data which needs to be redacted from |
| // log files. However, the partition keys of individual rows _are_ considered |
| // sensitive, so we redact them from log messages and error messages. Thus, |
| // methods which format partitions and partition schemas will never redact, but |
| // the methods which format individual partition keys do redact. |
| class PartitionSchema { |
| public: |
| struct RangeSchema { |
| std::vector<ColumnId> column_ids; |
| }; |
| |
| struct HashBucketSchema { |
| std::vector<ColumnId> column_ids; |
| int32_t num_buckets; |
| uint32_t seed; |
| }; |
| |
| typedef std::vector<HashBucketSchema> HashBucketSchemas; |
| // Holds each bound's HashBucketSchemas. |
| typedef std::vector<HashBucketSchemas> RangeHashSchema; |
| |
| struct RangeWithHashSchemas { |
| std::string lower; |
| std::string upper; |
| HashBucketSchemas hash_schemas; |
| }; |
| |
| // Extracts HashBucketSchemas from a protobuf repeated field of hash buckets. |
| static Status ExtractHashBucketSchemasFromPB( |
| const Schema& schema, |
| const google::protobuf::RepeatedPtrField<PartitionSchemaPB_HashBucketSchemaPB>& |
| hash_buckets_pb, |
| HashBucketSchemas* hash_bucket_schemas); |
| |
| // Deserializes a protobuf message into a partition schema. |
| static Status FromPB(const PartitionSchemaPB& pb, |
| const Schema& schema, |
| PartitionSchema* partition_schema) WARN_UNUSED_RESULT; |
| |
| // Serializes a partition schema into a protobuf message. |
| void ToPB(PartitionSchemaPB* pb) const; |
| |
| // Appends the row's encoded partition key into the provided buffer. |
| // On failure, the buffer may have data partially appended. |
| Status EncodeKey(const KuduPartialRow& row, std::string* buf) const WARN_UNUSED_RESULT; |
| Status EncodeKey(const ConstContiguousRow& row, std::string* buf) const WARN_UNUSED_RESULT; |
| |
| // Creates the set of table partitions for a partition schema and collection |
| // of split rows and split bounds. |
| // |
| // Split bounds define disjoint ranges for which tablets will be created. If |
| // empty, then Kudu assumes a single unbounded range. Each split key must fall |
| // into one of the ranges, and results in the range being split. The number |
| // of resulting partitions is the product of the number of hash buckets for |
| // each hash bucket component, multiplied by |
| // (split_rows.size() + max(1, range_bounds.size())). |
| // 'range_hash_schema' contains each range's HashBucketSchemas, |
| // its order corresponds to the bounds in 'range_bounds'. |
| // If 'range_hash_schemas' is empty, the table wide hash schema is used per range. |
| // Size of 'range_hash_schemas' and 'range_bounds' are equal if 'range_hash_schema' isn't empty. |
| Status CreatePartitions(const std::vector<KuduPartialRow>& split_rows, |
| const std::vector<std::pair<KuduPartialRow, |
| KuduPartialRow>>& range_bounds, |
| const RangeHashSchema& range_hash_schemas, |
| const Schema& schema, |
| std::vector<Partition>* partitions) const WARN_UNUSED_RESULT; |
| |
| // Tests if the partition contains the row. |
| Status PartitionContainsRow(const Partition& partition, |
| const KuduPartialRow& row, |
| bool* contains) const WARN_UNUSED_RESULT; |
| Status PartitionContainsRow(const Partition& partition, |
| const ConstContiguousRow& row, |
| bool* contains) const WARN_UNUSED_RESULT; |
| |
| // Tests if the hash partition contains the row with given hash_idx. |
| Status HashPartitionContainsRow(const Partition& partition, |
| const KuduPartialRow& row, |
| int hash_idx, |
| bool* contains) const WARN_UNUSED_RESULT; |
| Status HashPartitionContainsRow(const Partition& partition, |
| const ConstContiguousRow& row, |
| int hash_idx, |
| bool* contains) const WARN_UNUSED_RESULT; |
| |
| // Tests if the range partition contains the row. |
| Status RangePartitionContainsRow(const Partition& partition, |
| const KuduPartialRow& row, |
| bool* contains) const WARN_UNUSED_RESULT; |
| Status RangePartitionContainsRow(const Partition& partition, |
| const ConstContiguousRow& row, |
| bool* contains) const WARN_UNUSED_RESULT; |
| |
| // Returns a text description of the partition suitable for debug printing. |
| // |
| // Partitions are considered metadata, so no redaction will happen on the hash |
| // and range bound values. |
| std::string PartitionDebugString(const Partition& partition, const Schema& schema) const; |
| |
| // Returns a text description of a partition key suitable for debug printing. |
| std::string PartitionKeyDebugString(Slice key, const Schema& schema) const; |
| std::string PartitionKeyDebugString(const KuduPartialRow& row) const; |
| std::string PartitionKeyDebugString(const ConstContiguousRow& row) const; |
| |
| // Returns a text description of the range partition with the provided |
| // inclusive lower bound and exclusive upper bound. |
| // |
| // Range partitions are considered metadata, so no redaction will happen on |
| // the row values. |
| std::string RangePartitionDebugString(const KuduPartialRow& lower_bound, |
| const KuduPartialRow& upper_bound) const; |
| std::string RangePartitionDebugString(Slice lower_bound, |
| Slice upper_bound, |
| const Schema& schema) const; |
| |
| // Returns a text description of this partition schema suitable for debug printing. |
| // |
| // The partition schema is considered metadata, so partition bound information |
| // is not redacted from the returned string. |
| std::string DebugString(const Schema& schema) const; |
| |
| // Returns a text description of this partition schema suitable for display in the web UI. |
| // The format of this string is not guaranteed to be identical cross-version. |
| // |
| // 'range_partitions' should include the set of range partitions in the table, |
| // as formatted by 'RangePartitionDebugString'. |
| std::string DisplayString(const Schema& schema, |
| const std::vector<std::string>& range_partitions) const; |
| |
| // Returns header and entry HTML cells for the partition schema for the master |
| // table web UI. This is an abstraction leak, but it's better than leaking the |
| // internals of partitions to the master path handlers. |
| // |
| // Partitions are considered metadata, so no redaction will be done. |
| std::string PartitionTableHeader(const Schema& schema) const; |
| std::string PartitionTableEntry(const Schema& schema, const Partition& partition) const; |
| |
| // Returns true if the other partition schema is equivalent to this one. |
| bool Equals(const PartitionSchema& other) const; |
| |
| // Transforms an exclusive lower bound range partition key into an inclusive |
| // lower bound range partition key. |
| // |
| // The provided partial row is considered metadata, so error messages may |
| // contain unredacted row data. |
| Status MakeLowerBoundRangePartitionKeyInclusive(KuduPartialRow* row) const; |
| |
| // Transforms an inclusive upper bound range partition key into an exclusive |
| // upper bound range partition key. |
| // |
| // The provided partial row is considered metadata, so error messages may |
| // contain unredacted row data. |
| Status MakeUpperBoundRangePartitionKeyExclusive(KuduPartialRow* row) const; |
| |
| // Decodes a range partition key into a partial row, with variable-length |
| // fields stored in the arena. |
| Status DecodeRangeKey(Slice* encoded_key, |
| KuduPartialRow* partial_row, |
| Arena* arena) const; |
| |
| const RangeSchema& range_partition_schema() const { |
| return range_schema_; |
| } |
| |
| const HashBucketSchemas& hash_partition_schemas() const { |
| return hash_bucket_schemas_; |
| } |
| |
| // Gets the vector containing the column indexes of the range partition keys. |
| // If any of the columns is not in the key range columns then an |
| // InvalidArgument status is returned. |
| Status GetRangeSchemaColumnIndexes(const Schema& schema, |
| std::vector<int>* range_column_idxs) const; |
| |
| // Returns index of given column idx, if it is one of hash key and this hash schema |
| // contains only one column, otherwise returns -1. |
| int32_t TryGetSingleColumnHashPartitionIndex(const Schema& schema, int32_t col_idx) const; |
| |
| // Given a column idx, verify that it is the only column of the range partition. |
| bool IsColumnSingleRangeSchema(const Schema& schema, int32_t col_idx) const; |
| |
| private: |
| friend class PartitionPruner; |
| FRIEND_TEST(PartitionTest, TestIncrementRangePartitionBounds); |
| FRIEND_TEST(PartitionTest, TestIncrementRangePartitionStringBounds); |
| FRIEND_TEST(PartitionTest, TestVarcharRangePartitions); |
| |
| // Returns a text description of the encoded range key suitable for debug printing. |
| std::string RangeKeyDebugString(Slice range_key, const Schema& schema) const; |
| std::string RangeKeyDebugString(const KuduPartialRow& key) const; |
| std::string RangeKeyDebugString(const ConstContiguousRow& key) const; |
| |
| // Encodes the specified columns of a row into lexicographic sort-order |
| // preserving format. |
| static Status EncodeColumns(const KuduPartialRow& row, |
| const std::vector<ColumnId>& column_ids, |
| std::string* buf); |
| |
| // Encodes the specified columns of a row into lexicographic sort-order |
| // preserving format. |
| static Status EncodeColumns(const ConstContiguousRow& row, |
| const std::vector<ColumnId>& column_ids, |
| std::string* buf); |
| |
| // Returns the hash bucket of the encoded hash column. The encoded columns must match the |
| // columns of the hash bucket schema. |
| static int32_t BucketForEncodedColumns(const std::string& encoded_hash_columns, |
| const HashBucketSchema& hash_bucket_schema); |
| |
| // Assigns the row to a hash bucket according to the hash schema. |
| template<typename Row> |
| static Status BucketForRow(const Row& row, |
| const HashBucketSchema& hash_bucket_schema, |
| int32_t* bucket); |
| |
| // PartitionKeyDebugString implementation for row types. |
| template<typename Row> |
| std::string PartitionKeyDebugStringImpl(const Row& row) const; |
| |
| // Private templated helper for PartitionContainsRow. |
| template<typename Row> |
| Status PartitionContainsRowImpl(const Partition& partition, |
| const Row& row, |
| bool* contains) const; |
| |
| // Private templated helper for HashPartitionContainsRow. |
| template<typename Row> |
| Status HashPartitionContainsRowImpl(const Partition& partition, |
| const Row& row, |
| int hash_idx, |
| bool* contains) const; |
| |
| // Private templated helper for RangePartitionContainsRow. |
| template<typename Row> |
| Status RangePartitionContainsRowImpl(const Partition& partition, |
| const Row& row, |
| bool* contains) const; |
| |
| // Private templated helper for EncodeKey. |
| template<typename Row> |
| Status EncodeKeyImpl(const Row& row, std::string* buf) const; |
| |
| // Returns true if all of the columns in the range partition key are unset in |
| // the row. |
| bool IsRangePartitionKeyEmpty(const KuduPartialRow& row) const; |
| |
| // Appends the stringified range partition components of a partial row to a |
| // vector. |
| // |
| // If any columns of the range partition do not exist in the partial row, the |
| // logical minimum value for that column will be used instead. |
| void AppendRangeDebugStringComponentsOrMin(const KuduPartialRow& row, |
| std::vector<std::string>* components) const; |
| |
| // Returns the stringified hash and range schema components of the partition |
| // schema. |
| // |
| // Partition schemas are considered metadata, so no redaction will happen on |
| // the hash and range bound values. |
| std::vector<std::string> DebugStringComponents(const Schema& schema) const; |
| |
| // Encode the provided row into a range key. The row must not include values |
| // for any columns not in the range key. Missing range values will be filled |
| // with the logical minimum value for the column. A row without any values |
| // will encode to an empty string. |
| // |
| // This method is useful used for encoding splits and bounds. |
| Status EncodeRangeKey(const KuduPartialRow& row, const Schema& schema, std::string* key) const; |
| |
| // Decodes the hash bucket component of a partition key into its buckets. |
| // |
| // This should only be called with partition keys created from a row, not with |
| // partition keys from a partition. |
| Status DecodeHashBuckets(Slice* encoded_key, std::vector<int32_t>* buckets) const; |
| |
| // Clears the state of this partition schema. |
| void Clear(); |
| |
| // Validates that this partition schema is valid. Returns OK, or an |
| // appropriate error code for an invalid partition schema. |
| Status Validate(const Schema& schema) const; |
| |
| // Generates hash partitions for each combination of hash buckets in hash_schemas. |
| static std::vector<Partition> GenerateHashPartitions(const HashBucketSchemas& hash_schemas, |
| const KeyEncoder<std::string>& hash_encoder); |
| |
| // Validates the split rows, converts them to partition key form, and inserts |
| // them into splits in sorted order. |
| Status EncodeRangeSplits(const std::vector<KuduPartialRow>& split_rows, |
| const Schema& schema, |
| std::vector<std::string>* splits) const; |
| |
| // Validates the range bounds, converts them to partition key form, and |
| // inserts them into 'bounds_with_hash_schemas' in sorted order. The hash schemas |
| // per range are stored within 'range_hash_schemas'. If 'range_hash_schemas' is empty, |
| // it indicates that the table wide hash schema will be used per range. |
| Status EncodeRangeBounds(const std::vector<std::pair<KuduPartialRow, |
| KuduPartialRow>>& range_bounds, |
| const RangeHashSchema& range_hash_schemas, |
| const Schema& schema, |
| std::vector<RangeWithHashSchemas>* bounds_with_hash_schemas) const; |
| |
| // Splits the encoded range bounds by the split points. The splits and bounds within |
| // 'bounds_with_hash_schemas' must be sorted. If `bounds_with_hash_schemas` is empty, |
| // then a single unbounded range is assumed. If any of the splits falls outside |
| // of the bounds, then an InvalidArgument status is returned. |
| Status SplitRangeBounds(const Schema& schema, |
| std::vector<std::string> splits, |
| std::vector<RangeWithHashSchemas>* bounds_with_hash_schemas) const; |
| |
| // Increments a range partition key, setting 'increment' to true if the |
| // increment succeeds, or false if all range partition columns are already the |
| // maximum value. Unset columns will be incremented to increment(min_value). |
| Status IncrementRangePartitionKey(KuduPartialRow* row, bool* increment) const; |
| |
| HashBucketSchemas hash_bucket_schemas_; |
| RangeSchema range_schema_; |
| }; |
| |
| } // namespace kudu |