blob: 06527dc209087bd0178d53df0e785d298abe503e [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#ifndef KUDU_COMMON_PARTITION_H
#define KUDU_COMMON_PARTITION_H
#include <algorithm>
#include <string>
#include <vector>
#include "kudu/common/common.pb.h"
#include "kudu/common/key_encoder.h"
#include "kudu/common/partial_row.h"
#include "kudu/common/row.h"
#include "kudu/common/schema.h"
#include "kudu/gutil/ref_counted.h"
#include "kudu/util/status.h"
namespace kudu {
class ColumnRangePredicate;
class ConstContiguousRow;
class KuduPartialRow;
class PartitionSchemaPB;
class TypeInfo;
// A Partition describes the set of rows that a Tablet is responsible for
// serving. Each tablet is assigned a single Partition.
//
// Partitions consist primarily of a start and end partition key. Every row with
// a partition key that falls in a Tablet's Partition will be served by that
// tablet.
//
// In addition to the start and end partition keys, a Partition holds metadata
// to determine if a scan can prune, or skip, a partition based on the scan's
// start and end primary keys, and predicates.
class Partition {
public:
const std::vector<int32_t>& hash_buckets() const {
return hash_buckets_;
}
Slice range_key_start() const;
Slice range_key_end() const;
const std::string& partition_key_start() const {
return partition_key_start_;
}
const std::string& partition_key_end() const {
return partition_key_end_;
}
// Serializes a partition into a protobuf message.
void ToPB(PartitionPB* pb) const;
// Deserializes a protobuf message into a partition.
//
// The protobuf message is not validated, since partitions are only expected
// to be created by the master process.
static void FromPB(const PartitionPB& pb, Partition* partition);
private:
friend class PartitionSchema;
// Helper function for accessing the range key portion of a partition key.
Slice range_key(const std::string& partition_key) const;
std::vector<int32_t> hash_buckets_;
std::string partition_key_start_;
std::string partition_key_end_;
};
// A partition schema describes how the rows of a table are distributed among
// tablets.
//
// Primarily, a table's partition schema is responsible for translating the
// primary key column values of a row into a partition key that can be used to
// determine the tablet containing the key.
//
// The partition schema is made up of zero or more hash bucket components,
// followed by a single range component.
//
// Each hash bucket component includes one or more columns from the primary key
// column set, with the restriction that an individual primary key column may
// only be included in a single hash component.
//
// To determine the hash bucket of an individual row, the values of the columns
// of the hash component are encoded into bytes (in PK or lexicographic
// preserving encoding), then hashed into a u64, then modded into an i32. When
// constructing a partition key from a row, the buckets of the row are simply
// encoded into the partition key in order (again in PK or lexicographic
// preserving encoding).
//
// The range component contains a (possibly full or empty) subset of the primary
// key columns. When encoding the partition key, the columns of the partition
// component are encoded in order.
//
// The above is true of the relationship between rows and partition keys. It
// gets trickier with partitions (tablet partition key boundaries), because the
// boundaries of tablets do not necessarily align to rows. For instance,
// currently the absolute-start and absolute-end primary keys of a table
// represented as an empty key, but do not have a corresponding row. Partitions
// are similar, but instead of having just one absolute-start and absolute-end,
// each component of a partition schema has an absolute-start and absolute-end.
// When creating the initial set of partitions during table creation, we deal
// with this by "carrying through" absolute-start or absolute-ends into lower
// significance components.
class PartitionSchema {
public:
// Deserializes a protobuf message into a partition schema.
static Status FromPB(const PartitionSchemaPB& pb,
const Schema& schema,
PartitionSchema* partition_schema) WARN_UNUSED_RESULT;
// Serializes a partition schema into a protobuf message.
void ToPB(PartitionSchemaPB* pb) const;
// Appends the row's encoded partition key into the provided buffer.
// On failure, the buffer may have data partially appended.
Status EncodeKey(const KuduPartialRow& row, std::string* buf) const WARN_UNUSED_RESULT;
// Appends the row's encoded partition key into the provided buffer.
// On failure, the buffer may have data partially appended.
Status EncodeKey(const ConstContiguousRow& row, std::string* buf) const WARN_UNUSED_RESULT;
// Creates the set of table partitions for a partition schema and collection
// of split rows.
//
// The number of resulting partitions is the product of the number of hash
// buckets for each hash bucket component, multiplied by
// (split_rows.size() + 1).
Status CreatePartitions(const std::vector<KuduPartialRow>& split_rows,
const Schema& schema,
std::vector<Partition>* partitions) const WARN_UNUSED_RESULT;
// Tests if the partition contains the row.
Status PartitionContainsRow(const Partition& partition,
const KuduPartialRow& row,
bool* contains) const WARN_UNUSED_RESULT;
// Tests if the partition contains the row.
Status PartitionContainsRow(const Partition& partition,
const ConstContiguousRow& row,
bool* contains) const WARN_UNUSED_RESULT;
// Returns a text description of the partition suitable for debug printing.
std::string PartitionDebugString(const Partition& partition, const Schema& schema) const;
// Returns a text description of the partial row's partition key suitable for debug printing.
std::string RowDebugString(const KuduPartialRow& row) const;
// Returns a text description of the row's partition key suitable for debug printing.
std::string RowDebugString(const ConstContiguousRow& row) const;
// Returns a text description of the encoded partition key suitable for debug printing.
std::string PartitionKeyDebugString(const std::string& key, const Schema& schema) const;
// Returns a text description of this partition schema suitable for debug printing.
std::string DebugString(const Schema& schema) const;
// Returns true if the other partition schema is equivalent to this one.
bool Equals(const PartitionSchema& other) const;
private:
friend class PartitionPruner;
struct RangeSchema {
std::vector<ColumnId> column_ids;
};
struct HashBucketSchema {
std::vector<ColumnId> column_ids;
int32_t num_buckets;
uint32_t seed;
};
// Encodes the specified columns of a row into lexicographic sort-order
// preserving format.
static Status EncodeColumns(const KuduPartialRow& row,
const std::vector<ColumnId>& column_ids,
std::string* buf);
// Encodes the specified columns of a row into lexicographic sort-order
// preserving format.
static Status EncodeColumns(const ConstContiguousRow& row,
const std::vector<ColumnId>& column_ids,
std::string* buf);
// Returns the hash bucket of the encoded hash column. The encoded columns must match the
// columns of the hash bucket schema.
static int32_t BucketForEncodedColumns(const std::string& encoded_hash_columns,
const HashBucketSchema& hash_bucket_schema);
// Assigns the row to a hash bucket according to the hash schema.
template<typename Row>
static Status BucketForRow(const Row& row,
const HashBucketSchema& hash_bucket_schema,
int32_t* bucket);
// Private templated helper for PartitionContainsRow.
template<typename Row>
Status PartitionContainsRowImpl(const Partition& partition,
const Row& row,
bool* contains) const;
// Private templated helper for EncodeKey.
template<typename Row>
Status EncodeKeyImpl(const Row& row, string* buf) const;
// Appends the stringified range partition components of a partial row to a
// vector.
//
// If any columns of the range partition do not exist in the partial row,
// processing stops and the provided default string piece is appended to the vector.
void AppendRangeDebugStringComponentsOrString(const KuduPartialRow& row,
StringPiece default_string,
std::vector<std::string>* components) const;
// Appends the stringified range partition components of a partial row to a
// vector.
//
// If any columns of the range partition do not exist in the partial row, the
// logical minimum value for that column will be used instead.
void AppendRangeDebugStringComponentsOrMin(const KuduPartialRow& row,
std::vector<std::string>* components) const;
// Decodes a range partition key into a partial row, with variable-length
// fields stored in the arena.
Status DecodeRangeKey(Slice* encode_key,
KuduPartialRow* partial_row,
Arena* arena) const;
// Decodes the hash bucket component of a partition key into its buckets.
//
// This should only be called with partition keys created from a row, not with
// partition keys from a partition.
Status DecodeHashBuckets(Slice* partition_key, std::vector<int32_t>* buckets) const;
// Clears the state of this partition schema.
void Clear();
// Validates that this partition schema is valid. Returns OK, or an
// appropriate error code for an invalid partition schema.
Status Validate(const Schema& schema) const;
std::vector<HashBucketSchema> hash_bucket_schemas_;
RangeSchema range_schema_;
};
} // namespace kudu
#endif