blob: 88a17bdd216c0212fe10a5fe9f0ddae4da5d7335 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <stdint.h>
#include <functional>
#include <memory>
#include <mutex>
#include <set>
#include <string>
#include <unordered_map>
#include <vector>
#include "kudu/gutil/macros.h"
#include "kudu/gutil/ref_counted.h"
#include "kudu/util/locks.h"
#include "kudu/util/metrics.h"
#include "kudu/util/monotime.h"
#include "kudu/util/random.h"
#include "kudu/util/status.h"
namespace kudu {
class Env;
class ThreadPool;
// We pass around the results of canonicalization to indicate to the
// directory manager which, if any, failed to canonicalize.
//
// TODO(awong): move the canonicalization of directories into the
// directory manager so we can avoid this extra plumbing.
struct CanonicalizedRootAndStatus {
// The path of the root.
// NOTE(review): presumably the canonicalized form when 'status' is OK --
// confirm against the code that populates this struct.
std::string path;
// The outcome of canonicalizing this root; lets consumers see which roots,
// if any, failed to canonicalize.
Status status;
};
// A list of roots, each paired with the status of its canonicalization.
using CanonicalizedRootsList = std::vector<CanonicalizedRootAndStatus>;
namespace fs {
// Lookup tables between a UUID (string) and its integer UUID index, one for
// each direction.
using UuidByUuidIndexMap = std::unordered_map<int, std::string>;
using UuidIndexByUuidMap = std::unordered_map<std::string, int>;
class DirInstanceMetadataFile;
// Policy applied when a directory manager is opened and its set of instance
// files turns out to be inconsistent or incomplete.
enum UpdateInstanceBehavior {
  // When the directories disagree with the on-disk dir sets, rewrite the
  // on-disk data so that it matches, unless in read-only mode. Failures of
  // individual updates are ignored.
  UPDATE_AND_IGNORE_FAILURES = 0,

  // Same as UPDATE_AND_IGNORE_FAILURES, except that a failure to update any
  // of the on-disk files is returned as an error.
  UPDATE_AND_ERROR_ON_FAILURE = 1,

  // When the directories disagree with the on-disk dir sets, carry on
  // without touching the on-disk data.
  DONT_UPDATE = 2,
};
// Gauges tracked for a set of directories.
// NOTE(review): the meaning of each gauge below is inferred from its name --
// confirm against the code that updates them.
struct DirMetrics {
// Presumably the number of directories currently marked as failed.
scoped_refptr<AtomicGauge<uint64_t>> dirs_failed;
// Presumably the number of directories currently considered full.
scoped_refptr<AtomicGauge<uint64_t>> dirs_full;
};
// The type of filesystem that was detected.
enum class FsType {
  EXT,    // ext2, ext3, or ext4.
  XFS,    // SGI xfs.
  OTHER,  // None of the above.
};
// Representation of a directory (e.g. a data directory).
//
// A Dir bundles the directory's path, its instance metadata file, a thread
// pool on which closures can be run, and a cached view of the free space on
// the underlying filesystem. Concrete subclasses supply the space-related
// tunables (available_space_cache_secs() and reserved_bytes()).
class Dir {
 public:
  // Takes ownership of 'metadata_file' and 'pool'. 'env' and 'metrics' are
  // stored as raw pointers.
  // NOTE(review): presumably 'env' and 'metrics' must outlive this Dir --
  // confirm with the owner of these objects.
  Dir(Env* env,
      DirMetrics* metrics,
      FsType fs_type,
      std::string dir,
      std::unique_ptr<DirInstanceMetadataFile> metadata_file,
      std::unique_ptr<ThreadPool> pool);
  virtual ~Dir();

  // Shuts down this dir's thread pool, waiting for any closures submitted via
  // ExecClosure() to finish first.
  void Shutdown();

  // Run a task on this dir's thread pool.
  //
  // Normally the task is performed asynchronously. However, if submission to
  // the pool fails, it runs synchronously on the current thread.
  void ExecClosure(const std::function<void()>& task);

  // Waits for any outstanding closures submitted via ExecClosure() to finish.
  void WaitOnClosures();

  // Controls when RefreshAvailableSpace() actually performs its check.
  enum class RefreshMode {
    // Check only if the dir was last determined to be full some time ago.
    EXPIRED_ONLY,
    // Check unconditionally.
    ALWAYS,
  };

  // Tests whether the directory is full by comparing the free space of its
  // underlying filesystem with a predefined "reserved" space value.
  //
  // If 'mode' is EXPIRED_ONLY, performs the test only if the dir was last
  // determined to be full some time ago. If 'mode' is ALWAYS, the test is
  // performed regardless.
  //
  // Only returns a bad Status in the event of a real error; fullness is
  // reflected via is_full().
  Status RefreshAvailableSpace(RefreshMode mode);

  // Return the filesystem type this Dir was constructed with.
  FsType fs_type() const { return fs_type_; }

  // Return the full path of this directory.
  const std::string& dir() const { return dir_; }

  // Return the instance metadata file of this directory. The returned
  // pointer is owned by this Dir.
  const DirInstanceMetadataFile* instance() const {
    return metadata_file_.get();
  }

  // Return the cached fullness state, read under 'lock_'.
  bool is_full() const {
    std::lock_guard<simple_spinlock> l(lock_);
    return is_full_;
  }

  // Return the cached number of available bytes, read under 'lock_'.
  //
  // This is a read-only accessor and 'lock_' is mutable, so it is declared
  // const like is_full(); this allows it to be called through a const Dir.
  int64_t available_bytes() const {
    std::lock_guard<simple_spinlock> l(lock_);
    return available_bytes_;
  }

  // The amount of time to cache the amount of available space in this
  // directory.
  virtual int available_space_cache_secs() const = 0;

  // The number of bytes to reserve in each directory for non-Kudu usage. A
  // value of -1 means 1% of the disk space in a directory will be reserved.
  virtual int reserved_bytes() const = 0;

 private:
  // Not owned.
  Env* env_;
  // Not owned.
  DirMetrics* metrics_;

  const FsType fs_type_;
  const std::string dir_;
  const std::unique_ptr<DirInstanceMetadataFile> metadata_file_;
  const std::unique_ptr<ThreadPool> pool_;

  // NOTE(review): presumably set once Shutdown() has run -- confirm in the
  // implementation file.
  bool is_shutdown_;

  // Protects 'last_space_check_', 'is_full_' and 'available_bytes_'.
  mutable simple_spinlock lock_;
  // NOTE(review): presumably the time of the most recent space check made by
  // RefreshAvailableSpace() -- confirm in the implementation file.
  MonoTime last_space_check_;
  bool is_full_;
  // The available bytes of this dir, updated by RefreshAvailableSpace.
  int64_t available_bytes_;

  DISALLOW_COPY_AND_ASSIGN(Dir);
};
// Options shared by directory managers. The constructor is protected, so it
// can only be invoked from derived option types.
struct DirManagerOptions {
  // Which type of directory the directory manager should support.
  //
  // Must not be empty.
  std::string dir_type;

  // Entity under which every metric is grouped. When null, no metrics are
  // produced.
  //
  // Defaults to null.
  scoped_refptr<MetricEntity> metric_entity;

  // True if the directory manager should permit reads only.
  //
  // Defaults to false.
  bool read_only;

  // Whether on-disk instances are updated while opening directories when
  // inconsistencies are detected.
  //
  // Defaults to UPDATE_AND_IGNORE_FAILURES.
  UpdateInstanceBehavior update_instances;

 protected:
  explicit DirManagerOptions(const std::string& dir_type);
};
// Base class for managing a set of directories.
//
// A DirManager tracks the Dir objects it manages, indexes them by UUID and
// by UUID index, and records which of them have failed. Subclasses provide
// the directory and instance file naming as well as the syncing, locking and
// sizing policy through the protected pure virtual methods below, and build
// concrete Dir objects via CreateNewDir().
class DirManager {
public:
// Directory locking modes.
// NOTE(review): this enum is not referenced anywhere else in this header
// (lock_dirs() returns a bool) -- confirm where and how it is used.
enum class LockMode {
MANDATORY,
OPTIONAL,
NONE,
};
// Returns the root names from the input 'root_list'.
static std::vector<std::string> GetRootNames(const CanonicalizedRootsList& root_list);
virtual ~DirManager();
// Shuts down all directories' thread pools.
void Shutdown();
// Waits on all directories' thread pools.
void WaitOnClosures();
// Returns a list of all dirs.
const std::vector<std::unique_ptr<Dir>>& dirs() const {
return dirs_;
}
// Adds 'uuid_idx' to the set of failed directories. This directory will no
// longer be used. Logs an error message prefixed with 'error_message'
// describing what directories are affected.
//
// Returns an error if all directories have failed.
Status MarkDirFailed(int uuid_idx, const std::string& error_message = "");
// Fails the directory specified by 'uuid' and logs a warning if all
// directories have failed.
void MarkDirFailedByUuid(const std::string& uuid);
// Returns whether or not the 'uuid_idx' refers to a failed directory.
bool IsDirFailed(int uuid_idx) const;
// Returns whether the given tablet exists in a failed directory.
bool IsTabletInFailedDir(const std::string& tablet_id) const;
// Returns a copy of the set of UUID indexes of the failed directories. The
// copy is taken while holding 'dir_group_lock_' in shared mode.
std::set<int> GetFailedDirs() const {
shared_lock<rw_spinlock> group_lock(dir_group_lock_.get_lock());
return failed_dirs_;
}
// Returns true if the number of failed directories equals the number of
// directories tracked by this manager. The comparison is made while holding
// 'dir_group_lock_' in shared mode.
bool AreAllDirsFailed() const {
shared_lock<rw_spinlock> group_lock(dir_group_lock_.get_lock());
return failed_dirs_.size() == dirs_.size();
}
// Return a list of the canonicalized root directory names.
std::vector<std::string> GetRoots() const;
// Return a list of the canonicalized directory names.
std::vector<std::string> GetDirs() const;
// Finds a directory by uuid index, returning null if it can't be found.
//
// More information on uuid indexes and their relation to directories
// can be found next to DirSetPB in fs.proto.
Dir* FindDirByUuidIndex(int uuid_idx) const;
// Finds a uuid index by directory, returning false if it can't be found.
bool FindUuidIndexByDir(Dir* dir, int* uuid_idx) const;
// Finds a uuid index by root path, returning false if it can't be found.
bool FindUuidIndexByRoot(const std::string& root, int* uuid_idx) const;
// Finds a uuid index by UUID, returning false if it can't be found.
bool FindUuidIndexByUuid(const std::string& uuid, int* uuid_idx) const;
// Finds a UUID by canonicalized root name, returning false if it can't be found.
bool FindUuidByRoot(const std::string& root, std::string* uuid) const;
// Finds the set of tablet IDs that are registered to use the directory with
// the given UUID index.
std::set<std::string> FindTabletsByDirUuidIdx(int uuid_idx) const;
// Create a new directory using the appropriate directory implementation.
//
// The arguments mirror those of the Dir constructor; the returned Dir is
// owned by the caller.
virtual std::unique_ptr<Dir> CreateNewDir(Env* env,
DirMetrics* metrics,
FsType fs_type,
std::string dir,
std::unique_ptr<DirInstanceMetadataFile>,
std::unique_ptr<ThreadPool> pool) = 0;
protected:
// The name to be used by this directory manager for each sub-directory of
// each directory root.
virtual const char* dir_name() const = 0;
// The name to be used by this directory manager for each instance file
// corresponding to this directory manager.
virtual const char* instance_metadata_filename() const = 0;
// Whether to sync the directories when updating this manager's directories.
virtual bool sync_dirs() const = 0;
// Whether to lock the directories to prevent concurrent usage. Note:
// read-only concurrent usage is still allowed.
virtual bool lock_dirs() const = 0;
// The max number of directories to be managed.
virtual int max_dirs() const = 0;
// Protected: only subclasses construct a DirManager. Takes ownership of
// 'dir_metrics'.
// NOTE(review): 'env' is stored as a raw pointer and presumably must outlive
// this manager -- confirm with callers.
DirManager(Env* env,
std::unique_ptr<DirMetrics> dir_metrics,
int num_threads_per_dir,
const DirManagerOptions& opts,
CanonicalizedRootsList canonicalized_data_roots);
// Initializes the data directories on disk. Returns an error if initialized
// directories already exist.
//
// Note: this doesn't initialize any in-memory state for the directory
// manager.
virtual Status Create();
// Opens existing instance files from disk and indexes the files found.
//
// Returns an error if the number of on-disk directories found exceeds the
// max allowed, if locks need to be acquired and cannot be, or if there are
// no healthy directories.
//
// If appropriate, this will create any missing directories and rewrite
// existing instance files to be consistent with each other.
virtual Status Open();
// Populates the maps to index the given directories.
virtual Status PopulateDirectoryMaps(const std::vector<std::unique_ptr<Dir>>& dirs);
// Helper function to add a directory to the internal maps. Assumes that the
// UUID, UUID index, and directory name have not already been inserted.
void InsertToMaps(const std::string& uuid, int idx, Dir* dir);
// Loads the instance files for each directory root.
//
// On success, 'instance_files' contains instance objects, including those
// that failed to load because they were missing or because of a disk
// error; they are still considered "loaded" and are labeled unhealthy
// internally. 'has_healthy_instances' is set to true if any of the instance
// files are healthy.
//
// Returns an error if an instance file fails in an irreconcilable way (e.g.
// the file is locked).
Status LoadInstances(
std::vector<std::unique_ptr<DirInstanceMetadataFile>>* instance_files,
bool* has_healthy_instances);
// Takes the set of instance files, does some basic verification on them,
// creates any that don't exist on disk, and updates any that have a
// different set of UUIDs stored than the expected set.
//
// Returns an error if there is a configuration error, e.g. if the existing
// instances believe there should be a different block size.
//
// If in UPDATE_AND_IGNORE_FAILURES mode, an error is not returned in the
// event of a disk error. Instead, it is up to the caller to reload the
// instance files and proceed if healthy enough.
//
// If in UPDATE_AND_ERROR_ON_FAILURE mode, a failure to update instances will
// surface as an error.
Status CreateNewDirectoriesAndUpdateInstances(
std::vector<std::unique_ptr<DirInstanceMetadataFile>> instances);
// Updates the on-disk instance files specified by 'instances_to_update'
// (presumably those whose 'all_uuids' field doesn't match 'new_all_uuids')
// using the contents of 'new_all_uuids', skipping any unhealthy instance
// files.
//
// If in UPDATE_AND_IGNORE_FAILURES mode, this is best effort. If any of the
// instance updates fail (e.g. due to a disk error) in this mode, this will
// log a warning about the failed updates and return OK.
//
// If in UPDATE_AND_ERROR_ON_FAILURE mode, any failure will immediately
// attempt to clean up any altered state and return with an error.
Status UpdateHealthyInstances(
const std::vector<std::unique_ptr<DirInstanceMetadataFile>>& instances_to_update,
const std::set<std::string>& new_all_uuids);
// The environment to be used for all directory operations.
Env* env_;
// The number of threads to allocate per directory threadpool.
const int num_threads_per_dir_;
// The options that the DirManager was created with.
const DirManagerOptions opts_;
// The canonicalized roots provided to the constructor, taken verbatim.
// Common roots in the collections have been deduplicated.
const CanonicalizedRootsList canonicalized_fs_roots_;
// Directories tracked by this manager.
std::vector<std::unique_ptr<Dir>> dirs_;
// Set of metrics relating to the health of the directories that this manager
// is tracking.
std::unique_ptr<DirMetrics> metrics_;
// Lock protecting access to the directory group maps and to failed_dirs_. A
// percpu_rwlock is used so threads attempting to read (e.g. to get the next
// directory for an operation) do not block each other, while threads
// attempting to write (e.g. to create a new tablet, thereby registering
// directories per tablet) block all threads.
mutable percpu_rwlock dir_group_lock_;
// RNG used to select directories.
mutable ThreadSafeRandom rng_;
// Index maps over the managed directories.
// NOTE(review): the key/value meanings below are inferred from the type and
// member names and from the Find*() lookups declared above -- confirm
// against the implementation file.
//
// Root path -> UUID.
typedef std::unordered_map<std::string, std::string> UuidByRootMap;
UuidByRootMap uuid_by_root_;
// UUID index -> directory. The Dir pointers are not owned here.
typedef std::unordered_map<int, Dir*> UuidIndexMap;
UuidIndexMap dir_by_uuid_idx_;
// Directory -> UUID index.
typedef std::unordered_map<Dir*, int> ReverseUuidIndexMap;
ReverseUuidIndexMap uuid_idx_by_dir_;
// UUID index -> IDs of the tablets registered to use that directory.
typedef std::unordered_map<int, std::set<std::string>> TabletsByUuidIndexMap;
TabletsByUuidIndexMap tablets_by_uuid_idx_map_;
// UUID index -> UUID, and UUID -> UUID index.
UuidByUuidIndexMap uuid_by_idx_;
UuidIndexByUuidMap idx_by_uuid_;
// UUID indexes of the directories that have been marked as failed.
typedef std::set<int> FailedDirSet;
FailedDirSet failed_dirs_;
DISALLOW_COPY_AND_ASSIGN(DirManager);
};
} // namespace fs
} // namespace kudu