blob: 68df1e3cab6ac29d691191bbae6300195e82f56c [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "kudu/fs/fs_manager.h"
#include <cinttypes>
#include <ctime>
#include <functional>
#include <initializer_list>
#include <iostream>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <gflags/gflags.h>
#include <glog/logging.h>
#include "kudu/fs/block_manager.h"
#include "kudu/fs/data_dirs.h"
#include "kudu/fs/default_key_provider.h"
#include "kudu/fs/error_manager.h"
#include "kudu/fs/file_block_manager.h"
#include "kudu/fs/fs.pb.h"
#include "kudu/fs/fs_report.h"
#include "kudu/fs/log_block_manager.h"
#include "kudu/fs/ranger_kms_key_provider.h"
#include "kudu/fs/key_provider.h"
#include "kudu/gutil/map-util.h"
#include "kudu/gutil/port.h"
#include "kudu/gutil/stringprintf.h"
#include "kudu/gutil/strings/escaping.h"
#include "kudu/gutil/strings/join.h"
#include "kudu/gutil/strings/split.h"
#include "kudu/gutil/strings/strcat.h"
#include "kudu/gutil/strings/strip.h"
#include "kudu/gutil/strings/substitute.h"
#include "kudu/gutil/strings/util.h"
#include "kudu/gutil/walltime.h"
#include "kudu/util/env_util.h"
#include "kudu/util/flag_tags.h"
#include "kudu/util/flag_validators.h"
#include "kudu/util/metrics.h"
#include "kudu/util/monotime.h"
#include "kudu/util/net/net_util.h"
#include "kudu/util/oid_generator.h"
#include "kudu/util/path_util.h"
#include "kudu/util/pb_util.h"
#include "kudu/util/scoped_cleanup.h"
#include "kudu/util/slice.h"
#include "kudu/util/stopwatch.h"
#include "kudu/util/timer.h"
DEFINE_bool(cmeta_fsync_override_on_xfs, true,
"Whether to ignore --cmeta_force_fsync and instead always flush if Kudu detects "
"the server is on XFS. This can prevent consensus metadata corruption in the "
"event of sudden server failure. Disabling this flag may cause data loss in "
"the event of a system crash. See KUDU-2195 for more details.");
TAG_FLAG(cmeta_fsync_override_on_xfs, experimental);
TAG_FLAG(cmeta_fsync_override_on_xfs, advanced);
DEFINE_bool(enable_data_block_fsync, true,
"Whether to enable fsync() of data blocks, metadata, and their parent directories. "
"Disabling this flag may cause data loss in the event of a system crash.");
TAG_FLAG(enable_data_block_fsync, unsafe);
#if defined(__linux__)
DEFINE_string(block_manager, "log", "Which block manager to use for storage. "
"Valid options are 'file' and 'log'. The file block manager is not suitable for "
"production use due to scaling limitations.");
#else
DEFINE_string(block_manager, "file", "Which block manager to use for storage. "
"Only the file block manager is supported for non-Linux systems.");
#endif
static bool ValidateBlockManagerType(const char* /*flagname*/, const std::string& value) {
for (const std::string& type : kudu::fs::BlockManager::block_manager_types()) {
if (type == value) return true;
}
return false;
}
DEFINE_validator(block_manager, &ValidateBlockManagerType);
TAG_FLAG(block_manager, advanced);
DEFINE_string(fs_wal_dir, "",
"Directory with write-ahead logs. If this is not specified, the "
"program will not start. May be the same as fs_data_dirs");
TAG_FLAG(fs_wal_dir, stable);
DEFINE_string(fs_data_dirs, "",
"Comma-separated list of directories with data blocks. If this "
"is not specified, fs_wal_dir will be used as the sole data "
"block directory.");
TAG_FLAG(fs_data_dirs, stable);
DEFINE_string(fs_metadata_dir, "",
"Directory with metadata. If this is not specified, for "
"compatibility with Kudu 1.6 and below, Kudu will check the "
"first entry of fs_data_dirs for metadata and use it as the "
"metadata directory if any exists. If none exists, fs_wal_dir "
"will be used as the metadata directory.");
TAG_FLAG(fs_metadata_dir, stable);
DEFINE_int64(fs_wal_dir_reserved_bytes, -1,
"Number of bytes to reserve on the log directory filesystem for "
"non-Kudu usage. The default, which is represented by -1, is that "
"1% of the disk space on each disk will be reserved. Any other "
"value specified represents the number of bytes reserved and must "
"be greater than or equal to 0. Explicit percentages to reserve "
"are not currently supported");
DEFINE_validator(fs_wal_dir_reserved_bytes, [](const char* /*n*/, int64_t v) { return v >= -1; });
TAG_FLAG(fs_wal_dir_reserved_bytes, runtime);
METRIC_DEFINE_gauge_int64(server, log_block_manager_containers_processing_time_startup,
"Time taken to open all log block containers during server startup",
kudu::MetricUnit::kMilliseconds,
"The total time taken by the server to open all the container"
"files during the startup",
kudu::MetricLevel::kDebug);
DEFINE_string(encryption_key_provider, "default",
"Key provider implementation to generate and decrypt server keys. "
"Valid values are: 'default' (not for production usage), and 'ranger-kms'.");
DEFINE_validator(encryption_key_provider, [](const char* /*n*/, const std::string& value) {
return value == "default" || value == "ranger-kms";
});
DEFINE_string(ranger_kms_url, "",
"URL of the Ranger KMS server. Must be set when 'encryption_key_provider' "
"is set to 'ranger-kms'.");
DEFINE_string(encryption_cluster_key_name, "kudu_cluster_key",
"Name of the cluster key that is used to encrypt server encryption keys as "
"stored in Ranger KMS.");
bool ValidateRangerKMSFlags() {
if (FLAGS_encryption_key_provider == "ranger-kms") {
if (FLAGS_ranger_kms_url.empty() || FLAGS_encryption_cluster_key_name.empty()) {
LOG(ERROR) << "If 'encryption_key_provider' is set to 'ranger-kms', then "
"'ranger_kms_url' and 'encryption_cluster_key_name' must also be set.";
return false;
}
}
return true;
}
GROUP_FLAG_VALIDATOR(validate_ranger_kms_flags, ValidateRangerKMSFlags);
DECLARE_bool(encrypt_data_at_rest);
DECLARE_int32(encryption_key_length);
using kudu::fs::BlockManagerOptions;
using kudu::fs::CreateBlockOptions;
using kudu::fs::DataDirManager;
using kudu::fs::DataDirManagerOptions;
using kudu::fs::ErrorHandlerType;
using kudu::fs::ErrorNotificationCb;
using kudu::fs::FsErrorManager;
using kudu::fs::FileBlockManager;
using kudu::fs::FsReport;
using kudu::fs::LogBlockManager;
using kudu::fs::ReadableBlock;
using kudu::fs::UpdateInstanceBehavior;
using kudu::fs::WritableBlock;
using kudu::pb_util::SecureDebugString;
using kudu::security::DefaultKeyProvider;
using kudu::security::RangerKMSKeyProvider;
using std::optional;
using std::ostream;
using std::string;
using std::unique_ptr;
using std::unordered_map;
using std::unordered_set;
using std::vector;
using strings::a2b_hex;
using strings::Substitute;
namespace kudu {
// ==========================================================================
// FS Paths
// ==========================================================================
const char *FsManager::kWalDirName = "wals";
const char *FsManager::kWalFileNamePrefix = "wal";
const char *FsManager::kWalsRecoveryDirSuffix = ".recovery";
const char *FsManager::kTabletMetadataDirName = "tablet-meta";
const char *FsManager::kDataDirName = "data";
const char *FsManager::kInstanceMetadataFileName = "instance";
const char *FsManager::kConsensusMetadataDirName = "consensus-meta";
FsManagerOpts::FsManagerOpts()
: wal_root(FLAGS_fs_wal_dir),
metadata_root(FLAGS_fs_metadata_dir),
block_manager_type(FLAGS_block_manager),
read_only(false),
update_instances(UpdateInstanceBehavior::UPDATE_AND_IGNORE_FAILURES),
file_cache(nullptr),
skip_block_manager(false) {
data_roots = strings::Split(FLAGS_fs_data_dirs, ",", strings::SkipEmpty());
}
FsManagerOpts::FsManagerOpts(const string& root)
: wal_root(root),
data_roots({ root }),
block_manager_type(FLAGS_block_manager),
read_only(false),
update_instances(UpdateInstanceBehavior::UPDATE_AND_IGNORE_FAILURES),
file_cache(nullptr),
skip_block_manager(false) {}
FsManager::FsManager(Env* env, FsManagerOpts opts)
: env_(DCHECK_NOTNULL(env)),
opts_(std::move(opts)),
error_manager_(new FsErrorManager()),
initted_(false),
meta_on_xfs_(false) {
DCHECK(opts_.update_instances == UpdateInstanceBehavior::DONT_UPDATE ||
!opts_.read_only) << "FsManager can only be for updated if not in read-only mode";
if (FLAGS_encrypt_data_at_rest) {
if (FLAGS_encryption_key_provider == "ranger-kms") {
key_provider_.reset(new RangerKMSKeyProvider(FLAGS_ranger_kms_url,
FLAGS_encryption_cluster_key_name));
} else {
key_provider_.reset(new DefaultKeyProvider());
}
}
}
FsManager::~FsManager() {}
void FsManager::SetErrorNotificationCb(ErrorHandlerType e, ErrorNotificationCb cb) {
error_manager_->SetErrorNotificationCb(e, std::move(cb));
}
void FsManager::UnsetErrorNotificationCb(ErrorHandlerType e) {
error_manager_->UnsetErrorNotificationCb(e);
}
Status FsManager::Init() {
if (initted_) {
return Status::OK();
}
// The wal root must be set.
if (opts_.wal_root.empty()) {
return Status::IOError("Write-ahead log directory (fs_wal_dir) not provided");
}
// Deduplicate all of the roots.
unordered_set<string> all_roots = { opts_.wal_root };
all_roots.insert(opts_.data_roots.begin(), opts_.data_roots.end());
// If the metadata root not set, Kudu will either use the wal root or the
// first data root, in which case we needn't canonicalize additional roots.
if (!opts_.metadata_root.empty()) {
all_roots.insert(opts_.metadata_root);
}
// Build a map of original root --> canonicalized root, sanitizing each
// root as we go and storing the canonicalization status.
typedef unordered_map<string, CanonicalizedRootAndStatus> RootMap;
RootMap canonicalized_roots;
for (const string& root : all_roots) {
if (root.empty()) {
return Status::IOError("Empty string provided for path");
}
if (root[0] != '/') {
return Status::IOError(
Substitute("Relative path $0 provided", root));
}
string root_copy = root;
StripWhiteSpace(&root_copy);
if (root != root_copy) {
return Status::IOError(
Substitute("Path $0 contains illegal whitespace", root));
}
// Strip the basename when canonicalizing, as it may not exist. The
// dirname, however, must exist.
string canonicalized;
Status s = env_->Canonicalize(DirName(root), &canonicalized);
if (PREDICT_FALSE(!s.ok())) {
if (s.IsNotFound() || s.IsDiskFailure()) {
// If the directory fails to canonicalize due to disk failure, store
// the non-canonicalized form and the returned error.
canonicalized = DirName(root);
} else {
return s.CloneAndPrepend(Substitute("Failed to canonicalize $0", root));
}
}
canonicalized = JoinPathSegments(canonicalized, BaseName(root));
InsertOrDie(&canonicalized_roots, root, { canonicalized, s });
}
// All done, use the map to set the canonicalized state.
canonicalized_wal_fs_root_ = FindOrDie(canonicalized_roots, opts_.wal_root);
unordered_set<string> unique_roots;
if (!opts_.data_roots.empty()) {
for (const string& data_fs_root : opts_.data_roots) {
const auto& root = FindOrDie(canonicalized_roots, data_fs_root);
if (InsertIfNotPresent(&unique_roots, root.path)) {
canonicalized_data_fs_roots_.emplace_back(root);
canonicalized_all_fs_roots_.emplace_back(root);
}
}
} else {
LOG(INFO) << "Data directories (fs_data_dirs) not provided";
LOG(INFO) << "Using write-ahead log directory (fs_wal_dir) as data directory";
canonicalized_data_fs_roots_.emplace_back(canonicalized_wal_fs_root_);
}
if (InsertIfNotPresent(&unique_roots, canonicalized_wal_fs_root_.path)) {
canonicalized_all_fs_roots_.emplace_back(canonicalized_wal_fs_root_);
}
// Decide on a metadata root to use.
if (opts_.metadata_root.empty()) {
// Check the first data root for metadata.
const string meta_dir_in_data_root = JoinPathSegments(canonicalized_data_fs_roots_[0].path,
kTabletMetadataDirName);
// If there is already metadata in the first data root, use it. Otherwise,
// use the WAL root.
LOG(INFO) << "Metadata directory not provided";
if (env_->FileExists(meta_dir_in_data_root)) {
canonicalized_metadata_fs_root_ = canonicalized_data_fs_roots_[0];
LOG(INFO) << "Using existing metadata directory in first data directory";
} else {
canonicalized_metadata_fs_root_ = canonicalized_wal_fs_root_;
LOG(INFO) << "Using write-ahead log directory (fs_wal_dir) as metadata directory";
}
} else {
// Keep track of the explicitly-defined metadata root.
canonicalized_metadata_fs_root_ = FindOrDie(canonicalized_roots, opts_.metadata_root);
if (InsertIfNotPresent(&unique_roots, canonicalized_metadata_fs_root_.path)) {
canonicalized_all_fs_roots_.emplace_back(canonicalized_metadata_fs_root_);
}
}
// The server cannot start if the WAL root or metadata root failed to
// canonicalize.
const string& wal_root = canonicalized_wal_fs_root_.path;
RETURN_NOT_OK_PREPEND(canonicalized_wal_fs_root_.status,
Substitute("Write-ahead log directory $0 failed to canonicalize", wal_root));
const string& meta_root = canonicalized_metadata_fs_root_.path;
RETURN_NOT_OK_PREPEND(canonicalized_metadata_fs_root_.status,
Substitute("Metadata directory $0 failed to canonicalize", meta_root));
if (VLOG_IS_ON(1)) {
VLOG(1) << "WAL root: " << canonicalized_wal_fs_root_.path;
VLOG(1) << "Metadata root: " << canonicalized_metadata_fs_root_.path;
VLOG(1) << "Data roots: " <<
JoinStrings(DataDirManager::GetRootNames(canonicalized_data_fs_roots_), ",");
VLOG(1) << "All roots: " <<
JoinStrings(DataDirManager::GetRootNames(canonicalized_all_fs_roots_), ",");
}
initted_ = true;
return Status::OK();
}
void FsManager::InitBlockManager() {
BlockManagerOptions bm_opts;
bm_opts.metric_entity = opts_.metric_entity;
bm_opts.parent_mem_tracker = opts_.parent_mem_tracker;
bm_opts.read_only = opts_.read_only;
if (opts_.block_manager_type == "file") {
block_manager_.reset(new FileBlockManager(
env_, dd_manager_.get(), error_manager_.get(), opts_.file_cache, std::move(bm_opts)));
} else {
block_manager_.reset(new LogBlockManager(
env_, dd_manager_.get(), error_manager_.get(), opts_.file_cache, std::move(bm_opts)));
}
}
Status FsManager::PartialOpen(CanonicalizedRootsList* missing_roots) {
RETURN_NOT_OK(Init());
string reference_instance_path;
for (auto& root : canonicalized_all_fs_roots_) {
if (!root.status.ok()) {
continue;
}
unique_ptr<InstanceMetadataPB> pb(new InstanceMetadataPB);
Status s = pb_util::ReadPBContainerFromPath(env_, GetInstanceMetadataPath(root.path),
pb.get(), pb_util::NOT_SENSITIVE);
if (PREDICT_FALSE(!s.ok())) {
if (s.IsNotFound()) {
if (missing_roots) {
missing_roots->emplace_back(root);
}
continue;
}
if (s.IsDiskFailure()) {
root.status = s.CloneAndPrepend("Failed to open instance file");
continue;
}
return s;
}
if (!metadata_) {
metadata_.reset(pb.release());
reference_instance_path = root.path;
} else if (pb->uuid() != metadata_->uuid()) {
return Status::Corruption(Substitute(
"Mismatched UUIDs across filesystem roots: The path $0 contains UUID $1 vs. "
"the path $2 contains UUID $3; configuring multiple Kudu processes with the same "
"directory is not supported",
reference_instance_path, metadata_->uuid() , root.path, pb->uuid()));
}
}
if (!metadata_) {
return Status::NotFound("could not find a healthy instance file");
}
const auto& meta_root_path = canonicalized_metadata_fs_root_.path;
const auto bad_meta_fs_msg =
Substitute("Could not determine file system of metadata directory $0",
meta_root_path);
Status s = env_->IsOnXfsFilesystem(meta_root_path, &meta_on_xfs_);
if (FLAGS_cmeta_fsync_override_on_xfs) {
// Err on the side of visibility if we expect a behavior change based on
// the file system.
RETURN_NOT_OK_PREPEND(s, bad_meta_fs_msg);
} else {
WARN_NOT_OK(s, bad_meta_fs_msg);
}
VLOG(1) << Substitute("Detected metadata directory is $0on an XFS mount: $1",
meta_on_xfs_ ? "" : "not ", meta_root_path);
return Status::OK();
}
Status FsManager::Open(FsReport* report, Timer* read_instance_metadata_files,
Timer* read_data_directories,
std::atomic<int>* containers_processed,
std::atomic<int>* containers_total) {
if (read_instance_metadata_files) {
read_instance_metadata_files->Start();
}
// Load and verify the instance metadata files.
//
// Done first to minimize side effects in the case that the configured roots
// are not yet initialized on disk.
CanonicalizedRootsList missing_roots;
RETURN_NOT_OK(PartialOpen(&missing_roots));
// Ensure all of the ancillary directories exist.
vector<string> ancillary_dirs = { GetWalsRootDir(),
GetTabletMetadataDir(),
GetConsensusMetadataDir() };
for (const auto& d : ancillary_dirs) {
bool is_dir;
RETURN_NOT_OK_PREPEND(env_->IsDirectory(d, &is_dir),
Substitute("could not verify required directory $0", d));
if (!is_dir) {
return Status::Corruption(
Substitute("Required directory $0 exists but is not a directory", d));
}
}
// In the event of failure, delete everything we created.
vector<string> created_dirs;
vector<string> created_files;
auto deleter = MakeScopedCleanup([&]() {
// Delete files first so that the directories will be empty when deleted.
for (const auto& f : created_files) {
WARN_NOT_OK(env_->DeleteFile(f), "Could not delete file " + f);
}
// Delete directories in reverse order since parent directories will have
// been added before child directories.
for (auto it = created_dirs.rbegin(); it != created_dirs.rend(); it++) {
WARN_NOT_OK(env_->DeleteDir(*it), "Could not delete dir " + *it);
}
});
// Create any missing roots, if desired.
if (!opts_.read_only &&
opts_.update_instances != UpdateInstanceBehavior::DONT_UPDATE) {
Status s = CreateFileSystemRoots(
missing_roots, *metadata_, &created_dirs, &created_files);
static const string kUnableToCreateMsg = "unable to create missing filesystem roots";
if (opts_.update_instances == UpdateInstanceBehavior::UPDATE_AND_IGNORE_FAILURES) {
// We only warn on error here -- regardless of errors, we might be in
// good enough shape to open the DataDirManager.
WARN_NOT_OK(s, kUnableToCreateMsg);
} else if (opts_.update_instances == UpdateInstanceBehavior::UPDATE_AND_ERROR_ON_FAILURE) {
RETURN_NOT_OK_PREPEND(s, kUnableToCreateMsg);
}
}
if (read_instance_metadata_files) {
read_instance_metadata_files->Stop();
}
if (!server_key().empty() && key_provider_) {
string server_key;
RETURN_NOT_OK(key_provider_->DecryptServerKey(this->server_key(),
this->server_key_iv(),
this->server_key_version(),
&server_key));
// 'server_key' is a hexadecimal string and SetEncryptionKey expects bits
// (hex / 2 = bytes * 8 = bits).
env_->SetEncryptionKey(reinterpret_cast<const uint8_t*>(
a2b_hex(server_key).c_str()),
server_key.length() * 4);
}
// Open the directory manager if it has not been opened already.
if (!dd_manager_) {
DataDirManagerOptions dm_opts;
dm_opts.metric_entity = opts_.metric_entity;
dm_opts.read_only = opts_.read_only;
dm_opts.dir_type = opts_.block_manager_type;
dm_opts.update_instances = opts_.update_instances;
LOG_TIMING(INFO, "opening directory manager") {
RETURN_NOT_OK(DataDirManager::OpenExisting(env_,
canonicalized_data_fs_roots_, dm_opts, &dd_manager_));
}
}
// Only clean temporary files after the data dir manager successfully opened.
// This ensures that we were able to obtain the exclusive directory locks
// on the data directories before we start deleting files.
if (!opts_.read_only) {
CleanTmpFiles();
CheckAndFixPermissions();
}
// Set an initial error handler to mark data directories as failed.
error_manager_->SetErrorNotificationCb(
ErrorHandlerType::DISK_ERROR, [this](const string& uuid) {
this->dd_manager_->MarkDirFailedByUuid(uuid);
});
// Finally, initialize and open the block manager if needed.
if (!opts_.skip_block_manager) {
if (read_data_directories) {
read_data_directories->Start();
}
InitBlockManager();
LOG_TIMING(INFO, "opening block manager") {
if (opts_.block_manager_type == "file") {
RETURN_NOT_OK(block_manager_->Open(report));
} else {
RETURN_NOT_OK(block_manager_->Open(report, containers_processed, containers_total));
}
}
if (read_data_directories) {
read_data_directories->Stop();
if (opts_.metric_entity && opts_.block_manager_type == "log") {
METRIC_log_block_manager_containers_processing_time_startup.Instantiate(opts_.metric_entity,
(read_data_directories->TimeElapsed()).ToMilliseconds());
}
}
}
// Report wal and metadata directories.
if (report) {
report->wal_dir = canonicalized_wal_fs_root_.path;
report->metadata_dir = canonicalized_metadata_fs_root_.path;
}
if (FLAGS_enable_data_block_fsync) {
// Files/directories created by the directory manager in the fs roots have
// been synchronized, so now is a good time to sync the roots themselves.
WARN_NOT_OK(env_util::SyncAllParentDirs(env_, created_dirs, created_files),
"could not sync newly created fs roots");
}
LOG(INFO) << "Opened local filesystem: " <<
JoinStrings(DataDirManager::GetRootNames(canonicalized_all_fs_roots_), ",")
<< std::endl << SecureDebugString(*metadata_);
if (!created_dirs.empty()) {
LOG(INFO) << "New directories created while opening local filesystem: " <<
JoinStrings(created_dirs, ", ");
}
if (!created_files.empty()) {
LOG(INFO) << "New files created while opening local filesystem: " <<
JoinStrings(created_files, ", ");
}
// Success: do not delete any missing roots created.
deleter.cancel();
return Status::OK();
}
Status FsManager::CreateInitialFileSystemLayout(optional<string> uuid,
optional<string> server_key,
optional<string> server_key_iv,
optional<string> server_key_version) {
CHECK(!opts_.read_only);
RETURN_NOT_OK(Init());
// In the event of failure, delete everything we created.
vector<string> created_dirs;
vector<string> created_files;
auto deleter = MakeScopedCleanup([&]() {
// Delete files first so that the directories will be empty when deleted.
for (const auto& f : created_files) {
WARN_NOT_OK(env_->DeleteFile(f), "Could not delete file " + f);
}
// Delete directories in reverse order since parent directories will have
// been added before child directories.
for (auto it = created_dirs.rbegin(); it != created_dirs.rend(); it++) {
WARN_NOT_OK(env_->DeleteDir(*it), "Could not delete dir " + *it);
}
});
// Create the filesystem roots.
//
// Files/directories created will NOT be synchronized to disk.
InstanceMetadataPB metadata;
RETURN_NOT_OK_PREPEND(CreateInstanceMetadata(std::move(uuid),
std::move(server_key),
std::move(server_key_iv),
std::move(server_key_version),
&metadata),
"unable to create instance metadata");
RETURN_NOT_OK_PREPEND(FsManager::CreateFileSystemRoots(
canonicalized_all_fs_roots_, metadata, &created_dirs, &created_files),
"unable to create file system roots");
// Create ancillary directories.
vector<string> ancillary_dirs = { GetWalsRootDir(),
GetTabletMetadataDir(),
GetConsensusMetadataDir() };
for (const string& dir : ancillary_dirs) {
bool created;
RETURN_NOT_OK_PREPEND(env_util::CreateDirIfMissing(env_, dir, &created),
Substitute("Unable to create directory $0", dir));
if (created) {
created_dirs.emplace_back(dir);
}
}
// Create the directory manager.
//
// All files/directories created will be synchronized to disk.
DataDirManagerOptions dm_opts;
dm_opts.metric_entity = opts_.metric_entity;
dm_opts.read_only = opts_.read_only;
LOG_TIMING(INFO, "creating directory manager") {
RETURN_NOT_OK_PREPEND(DataDirManager::CreateNew(
env_, canonicalized_data_fs_roots_, dm_opts, &dd_manager_),
"Unable to create directory manager");
}
if (FLAGS_enable_data_block_fsync) {
// Files/directories created by the directory manager in the fs roots have
// been synchronized, so now is a good time to sync the roots themselves.
WARN_NOT_OK(env_util::SyncAllParentDirs(env_, created_dirs, created_files),
"could not sync newly created fs roots");
}
// Success: don't delete any files.
deleter.cancel();
return Status::OK();
}
Status FsManager::CreateFileSystemRoots(
CanonicalizedRootsList canonicalized_roots,
const InstanceMetadataPB& metadata,
vector<string>* created_dirs,
vector<string>* created_files) {
CHECK(!opts_.read_only);
// It's OK if a root already exists as long as there's nothing in it.
vector<string> non_empty_roots;
for (const auto& root : canonicalized_roots) {
if (!root.status.ok()) {
return Status::IOError("cannot create FS layout; at least one directory "
"failed to canonicalize", root.path);
}
if (!env_->FileExists(root.path)) {
// We'll create the directory below.
continue;
}
bool is_empty;
RETURN_NOT_OK_PREPEND(env_util::IsDirectoryEmpty(env_, root.path, &is_empty),
"unable to check if FSManager root is empty");
if (!is_empty) {
non_empty_roots.emplace_back(root.path);
}
}
if (!non_empty_roots.empty()) {
return Status::AlreadyPresent(
Substitute("FSManager roots already exist: $0", JoinStrings(non_empty_roots, ",")));
}
// All roots are either empty or non-existent. Create missing roots and all
// subdirectories.
for (const auto& root : canonicalized_roots) {
if (!root.status.ok()) {
continue;
}
string root_name = root.path;
bool created;
RETURN_NOT_OK_PREPEND(env_util::CreateDirIfMissing(env_, root_name, &created),
"unable to create FSManager root");
if (created) {
created_dirs->emplace_back(root_name);
}
RETURN_NOT_OK_PREPEND(WriteInstanceMetadata(metadata, root_name),
"unable to write instance metadata");
created_files->emplace_back(GetInstanceMetadataPath(root_name));
}
return Status::OK();
}
Status FsManager::CreateInstanceMetadata(optional<string> uuid,
optional<string> server_key,
optional<string> server_key_iv,
optional<string> server_key_version,
InstanceMetadataPB* metadata) {
if (uuid) {
string canonicalized_uuid;
RETURN_NOT_OK(oid_generator_.Canonicalize(*uuid, &canonicalized_uuid));
metadata->set_uuid(canonicalized_uuid);
} else {
metadata->set_uuid(oid_generator_.Next());
}
if (server_key && server_key_iv && server_key_version) {
metadata->set_server_key(*server_key);
metadata->set_server_key_iv(*server_key_iv);
metadata->set_server_key_version(*server_key_version);
} else if (server_key || server_key_iv || server_key_version) {
return Status::InvalidArgument(
"'server_key', 'server_key_iv', and 'server_key_version' must be specified "
"together (either all of them must be specified, or none of them).");
} else if (FLAGS_encrypt_data_at_rest) {
string key_version;
RETURN_NOT_OK_PREPEND(
key_provider_->GenerateEncryptedServerKey(metadata->mutable_server_key(),
metadata->mutable_server_key_iv(),
&key_version),
"failed to generate encrypted server key");
metadata->set_server_key_version(key_version);
}
string time_str;
StringAppendStrftime(&time_str, "%Y-%m-%d %H:%M:%S", time(nullptr), false);
string hostname;
if (!GetHostname(&hostname).ok()) {
hostname = "<unknown host>";
}
metadata->set_format_stamp(Substitute("Formatted at $0 on $1", time_str, hostname));
return Status::OK();
}
Status FsManager::WriteInstanceMetadata(const InstanceMetadataPB& metadata,
const string& root) {
const string path = GetInstanceMetadataPath(root);
// The instance metadata is written effectively once per TS, so the
// durability cost is negligible.
RETURN_NOT_OK(pb_util::WritePBContainerToPath(env_, path,
metadata,
pb_util::NO_OVERWRITE,
pb_util::SYNC,
pb_util::NOT_SENSITIVE));
LOG(INFO) << "Generated new instance metadata in path " << path << ":\n"
<< SecureDebugString(metadata);
return Status::OK();
}
const string& FsManager::uuid() const {
return CHECK_NOTNULL(metadata_.get())->uuid();
}
const string& FsManager::server_key() const {
return CHECK_NOTNULL(metadata_.get())->server_key();
}
const string& FsManager::server_key_iv() const {
return CHECK_NOTNULL(metadata_.get())->server_key_iv();
}
const string& FsManager::server_key_version() const {
return CHECK_NOTNULL(metadata_.get())->server_key_version();
}
vector<string> FsManager::GetDataRootDirs() const {
// Get the data subdirectory for each data root.
return dd_manager_->GetDirs();
}
string FsManager::GetTabletMetadataDir() const {
DCHECK(initted_);
return JoinPathSegments(canonicalized_metadata_fs_root_.path, kTabletMetadataDirName);
}
string FsManager::GetTabletMetadataPath(const string& tablet_id) const {
return JoinPathSegments(GetTabletMetadataDir(), tablet_id);
}
bool FsManager::IsValidTabletId(const string& fname) {
// Prevent warning logs for hidden files or ./..
if (PREDICT_FALSE(HasPrefixString(fname, "."))) {
VLOG(1) << "Ignoring hidden file in tablet metadata dir: " << fname;
return false;
}
string canonicalized_uuid;
Status s = oid_generator_.Canonicalize(fname, &canonicalized_uuid);
if (PREDICT_FALSE(!s.ok())) {
LOG(WARNING) << "Ignoring file in tablet metadata dir: " << fname << ": " <<
s.message().ToString();
return false;
}
if (PREDICT_FALSE(fname != canonicalized_uuid)) {
LOG(WARNING) << "Ignoring file in tablet metadata dir: " << fname << ": " <<
Substitute("canonicalized uuid $0 does not match file name",
canonicalized_uuid);
return false;
}
return true;
}
Status FsManager::ListTabletIds(vector<string>* tablet_ids) {
string dir = GetTabletMetadataDir();
vector<string> children;
RETURN_NOT_OK_PREPEND(ListDir(dir, &children),
Substitute("Couldn't list tablets in metadata directory $0", dir));
// Suppose all children are valid tablet metadata.
tablet_ids->reserve(children.size());
for (auto& child : children) {
if (PREDICT_FALSE(!IsValidTabletId(child))) {
continue;
}
tablet_ids->emplace_back(std::move(child));
}
return Status::OK();
}
string FsManager::GetInstanceMetadataPath(const string& root) const {
return JoinPathSegments(root, kInstanceMetadataFileName);
}
string FsManager::GetTabletWalRecoveryDir(const string& tablet_id) const {
string path = JoinPathSegments(GetWalsRootDir(), tablet_id);
StrAppend(&path, kWalsRecoveryDirSuffix);
return path;
}
string FsManager::GetWalSegmentFileName(const string& tablet_id,
uint64_t sequence_number) const {
return JoinPathSegments(GetTabletWalDir(tablet_id),
strings::Substitute("$0-$1",
kWalFileNamePrefix,
StringPrintf("%09" PRIu64, sequence_number)));
}
void FsManager::CleanTmpFiles() {
DCHECK(!opts_.read_only);
// Temporary files in the Block Manager directories are cleaned during
// Block Manager startup.
for (const auto& s : { GetWalsRootDir(),
GetTabletMetadataDir(),
GetConsensusMetadataDir() }) {
WARN_NOT_OK(env_util::DeleteTmpFilesRecursively(env_, s),
Substitute("Error deleting tmp files in $0", s));
}
}
void FsManager::CheckAndFixPermissions() {
for (const auto& root : canonicalized_all_fs_roots_) {
if (!root.status.ok()) {
continue;
}
WARN_NOT_OK(env_->EnsureFileModeAdheresToUmask(root.path),
Substitute("could not check and fix permissions for path: $0",
root.path));
}
}
// ==========================================================================
// Dump/Debug utils
// ==========================================================================
void FsManager::DumpFileSystemTree(ostream& out) {
DCHECK(initted_);
for (const auto& root : canonicalized_all_fs_roots_) {
if (!root.status.ok()) {
continue;
}
out << "File-System Root: " << root.path << std::endl;
vector<string> objects;
Status s = env_->GetChildren(root.path, &objects);
if (!s.ok()) {
LOG(ERROR) << "Unable to list the fs-tree: " << s.ToString();
return;
}
DumpFileSystemTree(out, "|-", root.path, objects);
}
}
void FsManager::DumpFileSystemTree(ostream& out, const string& prefix,
const string& path, const vector<string>& objects) {
for (const string& name : objects) {
if (name == "." || name == "..") continue;
vector<string> sub_objects;
string sub_path = JoinPathSegments(path, name);
Status s = env_->GetChildren(sub_path, &sub_objects);
if (s.ok()) {
out << prefix << name << "/" << std::endl;
DumpFileSystemTree(out, prefix + "---", sub_path, sub_objects);
} else {
out << prefix << name << std::endl;
}
}
}
// ==========================================================================
// Data read/write interfaces
// ==========================================================================
Status FsManager::CreateNewBlock(const CreateBlockOptions& opts, unique_ptr<WritableBlock>* block) {
CHECK(!opts_.read_only);
DCHECK(block_manager_);
return block_manager_->CreateBlock(opts, block);
}
Status FsManager::OpenBlock(const BlockId& block_id, unique_ptr<ReadableBlock>* block) {
DCHECK(block_manager_);
return block_manager_->OpenBlock(block_id, block);
}
bool FsManager::BlockExists(const BlockId& block_id) const {
DCHECK(block_manager_);
unique_ptr<ReadableBlock> block;
return block_manager_->OpenBlock(block_id, &block).ok();
}
} // namespace kudu