blob: 8996fe29b98a44da8f86227854af876045a50930 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstddef>
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <boost/optional/optional.hpp>
#include <gtest/gtest_prod.h>
#include "kudu/master/ts_descriptor.h"
#include "kudu/util/status.h"
namespace kudu {
class ThreadSafeRandom;
namespace master {
// Utility class to help place tablet replicas on tablet servers according
// to the location awareness policy. Currently, this class implements the logic
// specific to location awareness as described in [1], but it could enforce
// other placement policies in the future.
//
// In essence (for details see [1]), the location awareness placement policy
// is about:
// * in case of N locations, N > 2, not placing the majority of replicas
// in one location
// * spreading replicas evenly among available locations
// * within a location, spreading load evenly among tablet servers
//
// [1] https://s.apache.org/location-awareness-design
//
// TODO(aserbin): add a link to the doc once it appears in the upstream repo.
//
// NOTE: in the implementation of this class, it's OK to use an empty string
// in place of boost::none for a location because valid location strings begin
// with '/' and therefore are nonempty.
class PlacementPolicy {
public:
// The 'descs' vector contains information on all available tablet servers
// in the cluster, the 'rng' parameter points to an instance of a random
// generator that the object references to. The random generator instance
// must not be nullptr and must outlive the PlacementPolicy object.
PlacementPolicy(TSDescriptorVector descs,
ThreadSafeRandom* rng);
virtual ~PlacementPolicy() = default;
size_t ts_num() const { return ts_num_; }
// Select tablet servers to host the given number of replicas for a tablet.
//
// Parameters:
// 'nreplicas' The 'nreplicas' parameter specifies the desired replication factor.
// 'dimension' The 'dimension' parameter specifies the dimension information of the tablet.
// If not none, place tablet replicas based on the number of tablets in a
// dimension. Otherwise, based on the number of tablets at a tablet server.
// 'ts_descs' The result set of tablet server descriptors is output into the 'ts_descs'
// placeholder (must not be null).
Status PlaceTabletReplicas(int nreplicas,
const boost::optional<std::string>& dimension,
TSDescriptorVector* ts_descs) const;
// Select tablet server to host an additional tablet replica.
//
// Parameters:
// 'existing' The 'existing' parameter lists current members of the tablet's
// Raft configuration.
// 'dimension' The 'dimension' parameter specifies the dimension information of the tablet.
// If not none, place tablet replicas based on the number of tablets in a
// dimension. Otherwise, based on the number of tablets at a tablet server.
// 'ts_desc' The new member is output into 'ts_desc' placeholer (must not be null).
Status PlaceExtraTabletReplica(TSDescriptorVector existing,
const boost::optional<std::string>& dimension,
std::shared_ptr<TSDescriptor>* ts_desc) const;
private:
// Tablet server descriptors per location. This is the most comprehensive
// information on how tablet servers are placed among locations. Inherently,
// the locations have a sense of proximity and a hierarchy, so '/mega/turbo0'
// is closer to '/mega/turbo1' than '/giga/awesome0' and both '/mega/turbo0'
// and '/mega/turbo1' are affected by conditions currently affecting '/mega'.
// Number of locations is not supposed to be high: at the order of magnitude
// scale, that's about tens.
//
// NOTE: this dictionary container is made unordered since currently no code
// is taking advantage of the order of the keys.
typedef std::unordered_map<std::string, TSDescriptorVector>
LocationToDescriptorsMap;
// Number of tablet replicas per location.
typedef std::unordered_map<std::string, int> ReplicaLocationsInfo;
friend class PlacementPolicyTest;
FRIEND_TEST(PlacementPolicyTest, SelectLocationRandomnessForExtraReplica);
// Get the load of the location: a location with N tablet servers and
// R replicas has load R/N.
//
// Parameters:
// 'location' The location in question.
// 'locations_info' Information on tablet replicas slated for placement,
// but not created yet. That's the placement information
// on to-be-replicas in the context of optimizing tablet
// replica distribution in the cluster.
double GetLocationLoad(const std::string& location,
const ReplicaLocationsInfo& locations_info) const;
// Select locations to place the given number of replicas ('nreplicas') for
// a new tablet. The locations are chosen according to the placement
// policies.
//
// TODO (aserbin): add the reference to the document once it's in the repo.
Status SelectReplicaLocations(int nreplicas,
ReplicaLocationsInfo* locations_info) const;
// Select the given number ('nreplicas') from the set of specified tablet
// servers to place tablet replicas.
Status SelectReplicas(const TSDescriptorVector& source_ts_descs,
int nreplicas,
const boost::optional<std::string>& dimension,
TSDescriptorVector* result_ts_descs) const;
// Given the tablet servers in 'ts_descs', pick a tablet server to host
// a tablet replica, excluding tablet servers in 'excluded'. If there are no
// servers in 'ts_descs' that are not in 'existing', return nullptr.
std::shared_ptr<TSDescriptor> SelectReplica(
const TSDescriptorVector& ts_descs,
const boost::optional<std::string>& dimension,
const std::set<std::shared_ptr<TSDescriptor>>& excluded) const;
// Select location for next replica of a tablet with the specified replication
// factor. In essence, the algorithm picks the least loaded location,
// making sure no location contains the majority of replicas of the tablet,
// if possible.
//
// Parameters:
// 'num_replicas' The total number of tablet replicas to place.
// 'locations_info' Information on tablet replicas slated for placement,
// but not created yet. That's the placement information
// on to-be-replicas in the context of optimizing tablet
// replica distribution in the cluster.
// 'location' The result location pointer, must not be null.
Status SelectLocation(int num_replicas,
const ReplicaLocationsInfo& locations_info,
std::string* location) const;
// Number of available tablet servers.
const size_t ts_num_;
// Random number generator used for selecting replica locations.
// The object that rng_ points at is supposed to be available during the whole
// lifetime of a PlacementPolicy object.
mutable ThreadSafeRandom* rng_;
// Location to TSDescriptorVector map: the distribution of all already
// existing tablet replicas among available tablet servers in the cluster,
// grouped by location.
LocationToDescriptorsMap ltd_;
// A set of known tablet server identifiers (derived from ltd_).
std::unordered_set<std::string> known_ts_ids_;
};
} // namespace master
} // namespace kudu