blob: 85c58e937680f53fe614578916b17b9b500f2737 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <string>
#include <utility>
#include <vector>
#include <gtest/gtest_prod.h>
#include "kudu/common/schema.h"
#include "kudu/gutil/port.h"
#include "kudu/util/slice.h"
#include "kudu/util/status.h"
namespace google {
namespace protobuf {
template <typename T> class RepeatedPtrField;
} // namespace protobuf
} // namespace google
namespace kudu {
class Arena;
class ConstContiguousRow;
class KuduPartialRow;
class PartitionPB;
class PartitionSchemaPB;
class PartitionSchemaPB_HashBucketSchemaPB;
template <typename Buffer> class KeyEncoder;
// A Partition describes the set of rows that a Tablet is responsible for
// serving. Each tablet is assigned a single Partition.
//
// Partitions consist primarily of a start and end partition key. Every row with
// a partition key that falls in a Tablet's Partition will be served by that
// tablet.
//
// In addition to the start and end partition keys, a Partition holds metadata
// to determine if a scan can prune, or skip, a partition based on the scan's
// start and end primary keys, and predicates.
class Partition {
public:
const std::vector<int32_t>& hash_buckets() const {
return hash_buckets_;
}
Slice range_key_start() const;
Slice range_key_end() const;
const std::string& partition_key_start() const {
return partition_key_start_;
}
const std::string& partition_key_end() const {
return partition_key_end_;
}
// Returns true if the other partition is equivalent to this one.
bool Equals(const Partition& other) const;
// Serializes a partition into a protobuf message.
void ToPB(PartitionPB* pb) const;
// Deserializes a protobuf message into a partition.
//
// The protobuf message is not validated, since partitions are only expected
// to be created by the master process.
static void FromPB(const PartitionPB& pb, Partition* partition);
private:
friend class PartitionSchema;
// Helper function for accessing the range key portion of a partition key.
Slice range_key(const std::string& partition_key) const;
std::vector<int32_t> hash_buckets_;
std::string partition_key_start_;
std::string partition_key_end_;
};
// A partition schema describes how the rows of a table are distributed among
// tablets.
//
// Primarily, a table's partition schema is responsible for translating the
// primary key column values of a row into a partition key that can be used to
// determine the tablet containing the key.
//
// The partition schema is made up of zero or more hash bucket components,
// followed by a single range component.
//
// Each hash bucket component includes one or more columns from the primary key
// column set, with the restriction that an individual primary key column may
// only be included in a single hash component.
//
// To determine the hash bucket of an individual row, the values of the columns
// of the hash component are encoded into bytes (in PK or lexicographic
// preserving encoding), then hashed into a u64, then modded into an i32. When
// constructing a partition key from a row, the buckets of the row are simply
// encoded into the partition key in order (again in PK or lexicographic
// preserving encoding).
//
// The range component contains a (possibly full or empty) subset of the primary
// key columns. When encoding the partition key, the columns of the partition
// component are encoded in order.
//
// The above is true of the relationship between rows and partition keys. It
// gets trickier with partitions (tablet partition key boundaries), because the
// boundaries of tablets do not necessarily align to rows. For instance,
// currently the absolute-start and absolute-end primary keys of a table
// represented as an empty key, but do not have a corresponding row. Partitions
// are similar, but instead of having just one absolute-start and absolute-end,
// each component of a partition schema has an absolute-start and absolute-end.
// When creating the initial set of partitions during table creation, we deal
// with this by "carrying through" absolute-start or absolute-ends into lower
// significance components.
//
// Notes on redaction:
//
// For the purposes of redaction, Kudu considers partitions and partition
// schemas to be metadata - not sensitive data which needs to be redacted from
// log files. However, the partition keys of individual rows _are_ considered
// sensitive, so we redact them from log messages and error messages. Thus,
// methods which format partitions and partition schemas will never redact, but
// the methods which format individual partition keys do redact.
class PartitionSchema {
public:
struct RangeSchema {
std::vector<ColumnId> column_ids;
};
struct HashBucketSchema {
std::vector<ColumnId> column_ids;
int32_t num_buckets;
uint32_t seed;
};
typedef std::vector<HashBucketSchema> HashBucketSchemas;
// Holds each bound's HashBucketSchemas.
typedef std::vector<HashBucketSchemas> RangeHashSchema;
struct RangeWithHashSchemas {
std::string lower;
std::string upper;
HashBucketSchemas hash_schemas;
};
// Extracts HashBucketSchemas from a protobuf repeated field of hash buckets.
static Status ExtractHashBucketSchemasFromPB(
const Schema& schema,
const google::protobuf::RepeatedPtrField<PartitionSchemaPB_HashBucketSchemaPB>&
hash_buckets_pb,
HashBucketSchemas* hash_bucket_schemas);
// Deserializes a protobuf message into a partition schema.
static Status FromPB(const PartitionSchemaPB& pb,
const Schema& schema,
PartitionSchema* partition_schema) WARN_UNUSED_RESULT;
// Serializes a partition schema into a protobuf message.
void ToPB(PartitionSchemaPB* pb) const;
// Appends the row's encoded partition key into the provided buffer.
// On failure, the buffer may have data partially appended.
Status EncodeKey(const KuduPartialRow& row, std::string* buf) const WARN_UNUSED_RESULT;
Status EncodeKey(const ConstContiguousRow& row, std::string* buf) const WARN_UNUSED_RESULT;
// Creates the set of table partitions for a partition schema and collection
// of split rows and split bounds.
//
// Split bounds define disjoint ranges for which tablets will be created. If
// empty, then Kudu assumes a single unbounded range. Each split key must fall
// into one of the ranges, and results in the range being split. The number
// of resulting partitions is the product of the number of hash buckets for
// each hash bucket component, multiplied by
// (split_rows.size() + max(1, range_bounds.size())).
// 'range_hash_schema' contains each range's HashBucketSchemas,
// its order corresponds to the bounds in 'range_bounds'.
// If 'range_hash_schemas' is empty, the table wide hash schema is used per range.
// Size of 'range_hash_schemas' and 'range_bounds' are equal if 'range_hash_schema' isn't empty.
Status CreatePartitions(const std::vector<KuduPartialRow>& split_rows,
const std::vector<std::pair<KuduPartialRow,
KuduPartialRow>>& range_bounds,
const RangeHashSchema& range_hash_schemas,
const Schema& schema,
std::vector<Partition>* partitions) const WARN_UNUSED_RESULT;
// Tests if the partition contains the row.
Status PartitionContainsRow(const Partition& partition,
const KuduPartialRow& row,
bool* contains) const WARN_UNUSED_RESULT;
Status PartitionContainsRow(const Partition& partition,
const ConstContiguousRow& row,
bool* contains) const WARN_UNUSED_RESULT;
// Tests if the hash partition contains the row with given hash_idx.
Status HashPartitionContainsRow(const Partition& partition,
const KuduPartialRow& row,
int hash_idx,
bool* contains) const WARN_UNUSED_RESULT;
Status HashPartitionContainsRow(const Partition& partition,
const ConstContiguousRow& row,
int hash_idx,
bool* contains) const WARN_UNUSED_RESULT;
// Tests if the range partition contains the row.
Status RangePartitionContainsRow(const Partition& partition,
const KuduPartialRow& row,
bool* contains) const WARN_UNUSED_RESULT;
Status RangePartitionContainsRow(const Partition& partition,
const ConstContiguousRow& row,
bool* contains) const WARN_UNUSED_RESULT;
// Returns a text description of the partition suitable for debug printing.
//
// Partitions are considered metadata, so no redaction will happen on the hash
// and range bound values.
std::string PartitionDebugString(const Partition& partition, const Schema& schema) const;
// Returns a text description of a partition key suitable for debug printing.
std::string PartitionKeyDebugString(Slice key, const Schema& schema) const;
std::string PartitionKeyDebugString(const KuduPartialRow& row) const;
std::string PartitionKeyDebugString(const ConstContiguousRow& row) const;
// Returns a text description of the range partition with the provided
// inclusive lower bound and exclusive upper bound.
//
// Range partitions are considered metadata, so no redaction will happen on
// the row values.
std::string RangePartitionDebugString(const KuduPartialRow& lower_bound,
const KuduPartialRow& upper_bound) const;
std::string RangePartitionDebugString(Slice lower_bound,
Slice upper_bound,
const Schema& schema) const;
// Returns a text description of this partition schema suitable for debug printing.
//
// The partition schema is considered metadata, so partition bound information
// is not redacted from the returned string.
std::string DebugString(const Schema& schema) const;
// Returns a text description of this partition schema suitable for display in the web UI.
// The format of this string is not guaranteed to be identical cross-version.
//
// 'range_partitions' should include the set of range partitions in the table,
// as formatted by 'RangePartitionDebugString'.
std::string DisplayString(const Schema& schema,
const std::vector<std::string>& range_partitions) const;
// Returns header and entry HTML cells for the partition schema for the master
// table web UI. This is an abstraction leak, but it's better than leaking the
// internals of partitions to the master path handlers.
//
// Partitions are considered metadata, so no redaction will be done.
std::string PartitionTableHeader(const Schema& schema) const;
std::string PartitionTableEntry(const Schema& schema, const Partition& partition) const;
// Returns true if the other partition schema is equivalent to this one.
bool Equals(const PartitionSchema& other) const;
// Transforms an exclusive lower bound range partition key into an inclusive
// lower bound range partition key.
//
// The provided partial row is considered metadata, so error messages may
// contain unredacted row data.
Status MakeLowerBoundRangePartitionKeyInclusive(KuduPartialRow* row) const;
// Transforms an inclusive upper bound range partition key into an exclusive
// upper bound range partition key.
//
// The provided partial row is considered metadata, so error messages may
// contain unredacted row data.
Status MakeUpperBoundRangePartitionKeyExclusive(KuduPartialRow* row) const;
// Decodes a range partition key into a partial row, with variable-length
// fields stored in the arena.
Status DecodeRangeKey(Slice* encoded_key,
KuduPartialRow* partial_row,
Arena* arena) const;
const RangeSchema& range_partition_schema() const {
return range_schema_;
}
const HashBucketSchemas& hash_partition_schemas() const {
return hash_bucket_schemas_;
}
// Gets the vector containing the column indexes of the range partition keys.
// If any of the columns is not in the key range columns then an
// InvalidArgument status is returned.
Status GetRangeSchemaColumnIndexes(const Schema& schema,
std::vector<int>* range_column_idxs) const;
// Returns index of given column idx, if it is one of hash key and this hash schema
// contains only one column, otherwise returns -1.
int32_t TryGetSingleColumnHashPartitionIndex(const Schema& schema, int32_t col_idx) const;
// Given a column idx, verify that it is the only column of the range partition.
bool IsColumnSingleRangeSchema(const Schema& schema, int32_t col_idx) const;
private:
friend class PartitionPruner;
FRIEND_TEST(PartitionTest, TestIncrementRangePartitionBounds);
FRIEND_TEST(PartitionTest, TestIncrementRangePartitionStringBounds);
FRIEND_TEST(PartitionTest, TestVarcharRangePartitions);
// Returns a text description of the encoded range key suitable for debug printing.
std::string RangeKeyDebugString(Slice range_key, const Schema& schema) const;
std::string RangeKeyDebugString(const KuduPartialRow& key) const;
std::string RangeKeyDebugString(const ConstContiguousRow& key) const;
// Encodes the specified columns of a row into lexicographic sort-order
// preserving format.
static Status EncodeColumns(const KuduPartialRow& row,
const std::vector<ColumnId>& column_ids,
std::string* buf);
// Encodes the specified columns of a row into lexicographic sort-order
// preserving format.
static Status EncodeColumns(const ConstContiguousRow& row,
const std::vector<ColumnId>& column_ids,
std::string* buf);
// Returns the hash bucket of the encoded hash column. The encoded columns must match the
// columns of the hash bucket schema.
static int32_t BucketForEncodedColumns(const std::string& encoded_hash_columns,
const HashBucketSchema& hash_bucket_schema);
// Assigns the row to a hash bucket according to the hash schema.
template<typename Row>
static Status BucketForRow(const Row& row,
const HashBucketSchema& hash_bucket_schema,
int32_t* bucket);
// PartitionKeyDebugString implementation for row types.
template<typename Row>
std::string PartitionKeyDebugStringImpl(const Row& row) const;
// Private templated helper for PartitionContainsRow.
template<typename Row>
Status PartitionContainsRowImpl(const Partition& partition,
const Row& row,
bool* contains) const;
// Private templated helper for HashPartitionContainsRow.
template<typename Row>
Status HashPartitionContainsRowImpl(const Partition& partition,
const Row& row,
int hash_idx,
bool* contains) const;
// Private templated helper for RangePartitionContainsRow.
template<typename Row>
Status RangePartitionContainsRowImpl(const Partition& partition,
const Row& row,
bool* contains) const;
// Private templated helper for EncodeKey.
template<typename Row>
Status EncodeKeyImpl(const Row& row, std::string* buf) const;
// Returns true if all of the columns in the range partition key are unset in
// the row.
bool IsRangePartitionKeyEmpty(const KuduPartialRow& row) const;
// Appends the stringified range partition components of a partial row to a
// vector.
//
// If any columns of the range partition do not exist in the partial row, the
// logical minimum value for that column will be used instead.
void AppendRangeDebugStringComponentsOrMin(const KuduPartialRow& row,
std::vector<std::string>* components) const;
// Returns the stringified hash and range schema components of the partition
// schema.
//
// Partition schemas are considered metadata, so no redaction will happen on
// the hash and range bound values.
std::vector<std::string> DebugStringComponents(const Schema& schema) const;
// Encode the provided row into a range key. The row must not include values
// for any columns not in the range key. Missing range values will be filled
// with the logical minimum value for the column. A row without any values
// will encode to an empty string.
//
// This method is useful used for encoding splits and bounds.
Status EncodeRangeKey(const KuduPartialRow& row, const Schema& schema, std::string* key) const;
// Decodes the hash bucket component of a partition key into its buckets.
//
// This should only be called with partition keys created from a row, not with
// partition keys from a partition.
Status DecodeHashBuckets(Slice* encoded_key, std::vector<int32_t>* buckets) const;
// Clears the state of this partition schema.
void Clear();
// Validates that this partition schema is valid. Returns OK, or an
// appropriate error code for an invalid partition schema.
Status Validate(const Schema& schema) const;
// Generates hash partitions for each combination of hash buckets in hash_schemas.
static std::vector<Partition> GenerateHashPartitions(const HashBucketSchemas& hash_schemas,
const KeyEncoder<std::string>& hash_encoder);
// Validates the split rows, converts them to partition key form, and inserts
// them into splits in sorted order.
Status EncodeRangeSplits(const std::vector<KuduPartialRow>& split_rows,
const Schema& schema,
std::vector<std::string>* splits) const;
// Validates the range bounds, converts them to partition key form, and
// inserts them into 'bounds_with_hash_schemas' in sorted order. The hash schemas
// per range are stored within 'range_hash_schemas'. If 'range_hash_schemas' is empty,
// it indicates that the table wide hash schema will be used per range.
Status EncodeRangeBounds(const std::vector<std::pair<KuduPartialRow,
KuduPartialRow>>& range_bounds,
const RangeHashSchema& range_hash_schemas,
const Schema& schema,
std::vector<RangeWithHashSchemas>* bounds_with_hash_schemas) const;
// Splits the encoded range bounds by the split points. The splits and bounds within
// 'bounds_with_hash_schemas' must be sorted. If `bounds_with_hash_schemas` is empty,
// then a single unbounded range is assumed. If any of the splits falls outside
// of the bounds, then an InvalidArgument status is returned.
Status SplitRangeBounds(const Schema& schema,
std::vector<std::string> splits,
std::vector<RangeWithHashSchemas>* bounds_with_hash_schemas) const;
// Increments a range partition key, setting 'increment' to true if the
// increment succeeds, or false if all range partition columns are already the
// maximum value. Unset columns will be incremented to increment(min_value).
Status IncrementRangePartitionKey(KuduPartialRow* row, bool* increment) const;
HashBucketSchemas hash_bucket_schemas_;
RangeSchema range_schema_;
};
} // namespace kudu