// blob: 2b2fc75671e434c098d44a0c677872893720c423 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <memory>
#include <mutex>
#include <set>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include <gtest/gtest_prod.h>
#include "kudu/gutil/callback.h"
#include "kudu/gutil/macros.h"
#include "kudu/gutil/ref_counted.h"
#include "kudu/util/locks.h"
#include "kudu/util/metrics.h"
#include "kudu/util/monotime.h"
#include "kudu/util/random.h"
#include "kudu/util/status.h"
namespace kudu {
class DataDirGroupPB;
class Env;
class ThreadPool;
// Pairs a filesystem root with the outcome of canonicalizing it. These pairs
// are handed to the DataDirManager so that it can tell which roots, if any,
// failed to canonicalize.
//
// TODO(awong): move the canonicalization of directories into the
// DataDirManager so we can avoid this extra plumbing.
struct CanonicalizedRootAndStatus {
  // The root path this entry describes.
  // NOTE(review): presumably the canonicalized form when 'status' is OK and
  // the original path otherwise -- confirm against the code that builds it.
  std::string path;

  // Result of canonicalizing 'path'.
  Status status;
};
// Ordered collection of roots together with their canonicalization results.
using CanonicalizedRootsList = std::vector<CanonicalizedRootAndStatus>;
namespace fs {
// Lookup table from a UUID index to the UUID string it stands for.
using UuidByUuidIndexMap = std::unordered_map<int, std::string>;

// Reverse lookup table from a UUID string to its UUID index.
using UuidIndexByUuidMap = std::unordered_map<std::string, int>;
class PathInstanceMetadataFile;
struct CreateBlockOptions;
// File name used for the block manager instance metadata file.
constexpr char kInstanceMetadataFileName[] = "block_manager_instance";

// Directory name used for data.
constexpr char kDataDirName[] = "data";
namespace internal {
// A DataDirGroup is the set of directories that one entity uses for block
// placement.
//
// In memory, a group is a list of indices into the list of all UUIDs found in
// a PathSetPB. On disk, a group is a list of full UUIDs, so reading or writing
// a group requires a mapping that translates between index and UUID.
//
// A given directory may belong to more than one DataDirGroup.
class DataDirGroup {
 public:
  DataDirGroup();
  explicit DataDirGroup(std::vector<int> uuid_indices);

  // Repopulates this group from 'pb', resolving every UUID listed in 'pb' to
  // its UUID index through 'uuid_idx_by_uuid'.
  //
  // Returns an error if a UUID cannot be found.
  Status LoadFromPB(const UuidIndexByUuidMap& uuid_idx_by_uuid,
                    const DataDirGroupPB& pb);

  // Serializes this group into 'pb', resolving every UUID index held by the
  // group to its UUID through 'uuid_by_uuid_idx'.
  //
  // Returns an error if an index cannot be found.
  Status CopyToPB(const UuidByUuidIndexMap& uuid_by_uuid_idx,
                  DataDirGroupPB* pb) const;

  // Read-only view of the UUID indices that make up this group.
  const std::vector<int>& uuid_indices() const { return uuid_indices_; }

 private:
  // One UUID index per data directory in the group.
  std::vector<int> uuid_indices_;
};
} // namespace internal
// The kind of filesystem detected underneath a data directory.
enum class DataDirFsType {
  // Any of ext2, ext3, or ext4.
  EXT,

  // SGI xfs.
  XFS,

  // A filesystem that is neither of the above.
  OTHER,
};
// Selects what the directory manager does, when it is opened, if the provided
// data directories disagree with the on-disk path sets.
enum class ConsistencyCheckBehavior {
  // Treat a mismatch as a failure.
  ENFORCE_CONSISTENCY,

  // Rewrite the on-disk data so that it matches the provided data
  // directories. The directory manager must not be read-only.
  UPDATE_ON_DISK,

  // Carry on despite the mismatch and leave the on-disk data untouched.
  IGNORE_INCONSISTENCY,
};
// Bundle of gauges describing data directory health, constructed from the
// given metric entity.
struct DataDirMetrics {
  explicit DataDirMetrics(const scoped_refptr<MetricEntity>& entity);

  // Gauge for failed data directories.
  // NOTE(review): presumably a count of failed directories -- confirm against
  // the metric definition in the implementation file.
  scoped_refptr<AtomicGauge<uint64_t>> data_dirs_failed;

  // Gauge for full data directories.
  // NOTE(review): presumably a count of full directories -- confirm against
  // the metric definition in the implementation file.
  scoped_refptr<AtomicGauge<uint64_t>> data_dirs_full;
};
// One data directory that is in use by the block manager.
class DataDir {
 public:
  DataDir(Env* env,
          DataDirMetrics* metrics,
          DataDirFsType fs_type,
          std::string dir,
          std::unique_ptr<PathInstanceMetadataFile> metadata_file,
          std::unique_ptr<ThreadPool> pool);
  ~DataDir();

  // Stops this directory's thread pool. Closures already handed to
  // ExecClosure() are waited on before the pool is shut down.
  void Shutdown();

  // Schedules 'task' on this directory's thread pool.
  //
  // The task normally executes asynchronously. If the pool refuses the
  // submission, the task is instead run synchronously on the calling thread.
  void ExecClosure(const Closure& task);

  // Blocks until all closures submitted through ExecClosure() have finished.
  void WaitOnClosures();

  // Chooses when RefreshIsFull() actually performs its fullness test.
  enum class RefreshMode {
    // Test only if the directory was last found to be full some time ago.
    EXPIRED_ONLY,

    // Test unconditionally.
    ALWAYS,
  };

  // Checks whether this data directory is full, by comparing the free space
  // of the underlying filesystem against a predefined "reserved" space value.
  // 'mode' selects whether the check is always made or only made once an
  // earlier "full" determination has aged (see RefreshMode).
  //
  // A bad Status is returned only for a real error; the fullness result
  // itself is exposed through is_full().
  Status RefreshIsFull(RefreshMode mode);

  // Filesystem type that this directory was constructed with.
  DataDirFsType fs_type() const { return fs_type_; }

  // Path of this data directory.
  const std::string& dir() const { return dir_; }

  // The instance metadata file of this directory; ownership stays with the
  // DataDir.
  const PathInstanceMetadataFile* instance() const {
    return metadata_file_.get();
  }

  // Reports the most recently recorded fullness state, read under 'lock_'.
  bool is_full() const {
    std::lock_guard<simple_spinlock> guard(lock_);
    return is_full_;
  }

 private:
  Env* env_;
  DataDirMetrics* metrics_;
  const DataDirFsType fs_type_;
  const std::string dir_;
  const std::unique_ptr<PathInstanceMetadataFile> metadata_file_;
  const std::unique_ptr<ThreadPool> pool_;

  // NOTE(review): presumably records that Shutdown() has run -- confirm in
  // the implementation file.
  bool is_shutdown_;

  // Guards both 'last_check_is_full_' and 'is_full_'.
  mutable simple_spinlock lock_;
  MonoTime last_check_is_full_;
  bool is_full_;

  DISALLOW_COPY_AND_ASSIGN(DataDir);
};
// Options that configure how a directory manager is created.
struct DataDirManagerOptions {
  DataDirManagerOptions();

  // Block manager type that the directory manager has to support; the only
  // accepted values are "file" and "log".
  //
  // Defaults to the value of FLAGS_block_manager.
  std::string block_manager_type;

  // Entity that all metrics are grouped under. When null, no metrics are
  // produced.
  //
  // Defaults to null.
  scoped_refptr<MetricEntity> metric_entity;

  // When true, the directory manager only permits reading.
  //
  // Defaults to false.
  bool read_only;

  // How to react when the provided data directories differ from the on-disk
  // path sets.
  //
  // Defaults to ENFORCE_CONSISTENCY.
  ConsistencyCheckBehavior consistency_check;
};
// Encapsulates knowledge of data directory management on behalf of block
// managers.
class DataDirManager {
public:
enum class LockMode {
MANDATORY,
OPTIONAL,
NONE,
};
enum class DirDistributionMode {
ACROSS_ALL_DIRS,
USE_FLAG_SPEC,
};
// Public static initializers for use in tests. When used, data_fs_roots is
// expected to be the successfully canonicalized directories.
static Status CreateNewForTests(Env* env, std::vector<std::string> data_fs_roots,
DataDirManagerOptions opts,
std::unique_ptr<DataDirManager>* dd_manager);
static Status OpenExistingForTests(Env* env, std::vector<std::string> data_fs_roots,
DataDirManagerOptions opts,
std::unique_ptr<DataDirManager>* dd_manager);
// Constructs a directory manager and creates its necessary files on-disk.
//
// Returns an error if any of the directories already exist.
static Status CreateNew(Env* env, CanonicalizedRootsList data_fs_roots,
DataDirManagerOptions opts,
std::unique_ptr<DataDirManager>* dd_manager);
// Constructs a directory manager and indexes the files found on-disk.
//
// Returns an error if the number of on-disk directories found exceeds the
// max allowed, or if locks need to be acquired and cannot be.
static Status OpenExisting(Env* env, CanonicalizedRootsList data_fs_roots,
DataDirManagerOptions opts,
std::unique_ptr<DataDirManager>* dd_manager);
// Returns the root names from the input 'root_list'.
static std::vector<std::string> GetRootNames(const CanonicalizedRootsList& root_list);
~DataDirManager();
// Shuts down all directories' thread pools.
void Shutdown();
// Waits on all directories' thread pools.
void WaitOnClosures();
// Returns a list of all data dirs.
const std::vector<std::unique_ptr<DataDir>>& data_dirs() const {
return data_dirs_;
}
// ==========================================================================
// Tablet Placement
// ==========================================================================
// Deserializes a DataDirGroupPB and associates the resulting DataDirGroup
// with a tablet_id.
//
// Returns an error if the tablet already exists or if a data dir in the
// group is missing.
Status LoadDataDirGroupFromPB(const std::string& tablet_id,
const DataDirGroupPB& pb);
// Serializes the DataDirGroupPB associated with the given tablet_id.
//
// Returns an error if the tablet was not already registered or if a data dir
// is missing.
Status GetDataDirGroupPB(const std::string& tablet_id, DataDirGroupPB* pb) const;
// Creates a new data dir group for the specified tablet. Adds data
// directories to this new group until the limit specified by
// fs_target_data_dirs_per_tablet, or until there is no more space.
//
// If 'mode' is ACROSS_ALL_DIRS, ignores the above flag and stripes across
// all disks. This behavior is only used when loading a superblock with no
// DataDirGroup, allowing for backwards compatability with data from older
// version of Kudu.
//
// Results in an error if all disks are full or if the tablet already has a
// data dir group associated with it. If returning with an error, the
// DataDirManager will be unchanged.
Status CreateDataDirGroup(const std::string& tablet_id,
DirDistributionMode mode = DirDistributionMode::USE_FLAG_SPEC);
// Deletes the group for the specified tablet. Maps from tablet_id to group
// and data dir to tablet set are cleared of all references to the tablet.
void DeleteDataDirGroup(const std::string& tablet_id);
// Returns a random directory from the specfied option's data dir group. If
// there is no room in the group, returns an error.
Status GetNextDataDir(const CreateBlockOptions& opts, DataDir** dir);
// Finds the set of tablet_ids in the data dir specified by 'uuid_idx' and
// returns a copy, returning an empty set if none are found.
std::set<std::string> FindTabletsByDataDirUuidIdx(int uuid_idx) const;
// ==========================================================================
// Directory Health
// ==========================================================================
// Adds 'uuid_idx' to the set of failed data directories. This directory will
// no longer be used. Logs an error message prefixed with 'error_message'
// describing what directories are affected.
//
// Returns an error if all directories have failed.
Status MarkDataDirFailed(int uuid_idx, const std::string& error_message = "");
// Fails the directory specified by 'uuid' and logs a warning if all
// directories have failed.
void MarkDataDirFailedByUuid(const std::string& uuid);
// Returns whether or not the 'uuid_idx' refers to a failed directory.
bool IsDataDirFailed(int uuid_idx) const;
// Returns whether the tablet's data is spread across a failed directory.
bool IsTabletInFailedDir(const std::string& tablet_id) const;
const std::set<int> GetFailedDataDirs() const {
shared_lock<rw_spinlock> group_lock(dir_group_lock_.get_lock());
return failed_data_dirs_;
}
// ==========================================================================
// Directory Paths
// ==========================================================================
// Return a list of the canonicalized root directory names.
std::vector<std::string> GetDataRoots() const;
// Return a list of the canonicalized data directory names.
std::vector<std::string> GetDataDirs() const;
// ==========================================================================
// Representation Conversion
// ==========================================================================
// Finds a data directory by uuid index, returning null if it can't be found.
//
// More information on uuid indexes and their relation to data directories
// can be found next to PathSetPB in fs.proto.
DataDir* FindDataDirByUuidIndex(int uuid_idx) const;
// Finds a uuid index by data directory, returning false if it can't be found.
bool FindUuidIndexByDataDir(DataDir* dir, int* uuid_idx) const;
// Finds a uuid index by root path, returning false if it can't be found.
bool FindUuidIndexByRoot(const std::string& root, int* uuid_idx) const;
// Finds a uuid index by UUID, returning false if it can't be found.
bool FindUuidIndexByUuid(const std::string& uuid, int* uuid_idx) const;
// Finds a UUID by canonicalized root name, returning false if it can't be found.
bool FindUuidByRoot(const std::string& root, std::string* uuid) const;
private:
FRIEND_TEST(DataDirsTest, TestCreateGroup);
FRIEND_TEST(DataDirsTest, TestLoadFromPB);
FRIEND_TEST(DataDirsTest, TestLoadBalancingBias);
FRIEND_TEST(DataDirsTest, TestLoadBalancingDistribution);
FRIEND_TEST(DataDirsTest, TestFailedDirNotAddedToGroup);
// Constructs a directory manager.
DataDirManager(Env* env,
DataDirManagerOptions opts,
CanonicalizedRootsList canonicalized_data_roots);
// Initializes the data directories on disk.
//
// Returns an error if initialized directories already exist, or if any of
// the directories experience a disk failure.
Status Create();
// Opens existing data roots from disk and indexes the files found.
//
// Returns an error if the number of on-disk directories found exceeds the
// max allowed, if locks need to be acquired and cannot be, or if the
// metadata directory (i.e. the first one) fails to load.
Status Open();
// Loads the instance files for each data directory.
//
// On success, 'loaded_instances' contains loaded instance objects. It also
// includes instance files that failed to load because they were missing or
// because of a disk failure; they are still considered "loaded" and are
// labeled unhealthy internally.
//
// Returns an error if an instance file fails in an irreconcileable way (e.g.
// the file is locked), or if none of the instance files are healthy.
Status LoadInstances(
std::vector<std::unique_ptr<PathInstanceMetadataFile>>* loaded_instances);
// Initializes new data directories specified by 'root_uuid_pairs_to_create'
// and updates the on-disk instance files of data directories specified by
// 'instances_to_update' using the contents of 'all_uuids', skipping any
// unhealthy instance files.
//
// Returns an error if any disk operations fail.
Status CreateNewDataDirectoriesAndUpdateInstances(
std::vector<std::pair<std::string, std::string>> root_uuid_pairs_to_create,
std::vector<std::unique_ptr<PathInstanceMetadataFile>> instances_to_update,
std::vector<std::string> all_uuids);
// Updates the on-disk instance files specified by 'instances_to_update'
// using the contents of 'new_all_uuids', skipping any unhealthy instance
// files.
//
// Returns an error if any disk operations fail.
Status UpdateInstances(
std::vector<std::unique_ptr<PathInstanceMetadataFile>> instances_to_update,
std::vector<std::string> new_all_uuids);
// Repeatedly selects directories from those available to put into a new
// DataDirGroup until 'group_indices' reaches 'target_size' elements.
// Selection is based on "The Power of Two Choices in Randomized Load
// Balancing", selecting two directories randomly and choosing the one with
// less load, quantified as the number of unique tablets in the directory.
// The resulting behavior fills directories that have fewer tablets stored on
// them while not completely neglecting those with more tablets.
//
// 'group_indices' is an output that stores the list of uuid_indices to be
// added. Although this function does not itself change DataDirManager state,
// its expected usage warrants that it is called within the scope of a
// lock_guard of dir_group_lock_.
void GetDirsForGroupUnlocked(int target_size, std::vector<int>* group_indices);
// Goes through the data dirs in 'uuid_indices' and populates
// 'healthy_indices' with those that haven't failed.
void RemoveUnhealthyDataDirsUnlocked(const std::vector<int>& uuid_indices,
std::vector<int>* healthy_indices) const;
// The environment to be used for all data directory operations.
Env* env_;
// The options that the DataDirManager was created with.
const DataDirManagerOptions opts_;
// The canonicalized roots provided to the constructor, taken verbatim.
//
// - The first data root is used as the metadata root.
// - Common roots in the collections have been deduplicated.
const CanonicalizedRootsList canonicalized_data_fs_roots_;
std::unique_ptr<DataDirMetrics> metrics_;
std::vector<std::unique_ptr<DataDir>> data_dirs_;
typedef std::unordered_map<std::string, std::string> UuidByRootMap;
UuidByRootMap uuid_by_root_;
typedef std::unordered_map<int, DataDir*> UuidIndexMap;
UuidIndexMap data_dir_by_uuid_idx_;
typedef std::unordered_map<DataDir*, int> ReverseUuidIndexMap;
ReverseUuidIndexMap uuid_idx_by_data_dir_;
typedef std::unordered_map<std::string, internal::DataDirGroup> TabletDataDirGroupMap;
TabletDataDirGroupMap group_by_tablet_map_;
typedef std::unordered_map<int, std::set<std::string>> TabletsByUuidIndexMap;
TabletsByUuidIndexMap tablets_by_uuid_idx_map_;
UuidByUuidIndexMap uuid_by_idx_;
UuidIndexByUuidMap idx_by_uuid_;
typedef std::set<int> FailedDataDirSet;
FailedDataDirSet failed_data_dirs_;
// Lock protecting access to the dir group maps and to failed_data_dirs_.
// A percpu_rwlock is used so threads attempting to read (e.g. to get the
// next data directory for a Flush()) do not block each other, while threads
// attempting to write (e.g. to create a new tablet, thereby creating a new
// data directory group) block all threads.
mutable percpu_rwlock dir_group_lock_;
// RNG used to select directories.
ThreadSafeRandom rng_;
DISALLOW_COPY_AND_ASSIGN(DataDirManager);
};
} // namespace fs
} // namespace kudu