blob: 75aa76486730c00a4e4f1e2ea98ca144cccea0fa [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "kudu/fs/dir_util.h"
#include <cstdint>
#include <ostream>
#include <set>
#include <utility>
#include <gflags/gflags_declare.h>
#include <glog/logging.h>
#include "kudu/fs/fs.pb.h"
#include "kudu/gutil/map-util.h"
#include "kudu/gutil/port.h"
#include "kudu/gutil/strings/substitute.h"
#include "kudu/util/env.h"
#include "kudu/util/env_util.h"
#include "kudu/util/path_util.h"
#include "kudu/util/pb_util.h"
#include "kudu/util/scoped_cleanup.h"
DECLARE_bool(enable_data_block_fsync);
using kudu::pb_util::CreateMode;
using std::set;
using std::string;
using std::unique_ptr;
using strings::Substitute;
namespace kudu {
namespace fs {
namespace {
// Explanatory prefix attached to the raw status when the hole punch check
// performed by DirInstanceMetadataFile::Create() fails for a "log" directory.
// It tells the operator what the log block manager requires and what the
// fallback is; the underlying error text is expected to follow it.
constexpr char kHolePunchErrorMsg[] =
    "Error during hole punch test. The log block manager requires a "
    "filesystem with hole punching support such as ext4 or xfs. On el6, "
    "kernel version 2.6.32-358 or newer is required. To run without hole "
    "punching (at the cost of some efficiency and scalability), reconfigure "
    "Kudu to use the file block manager. Refer to the Kudu documentation for "
    "more details. WARNING: the file block manager is not suitable for "
    "production use and should be used only for small-scale evaluation and "
    "development on systems where hole-punching is not available. It's "
    "impossible to switch between block managers after data is written to the "
    "server. Raw error message follows";
// Checks whether files created under 'path' support hole punching.
//
// A scratch file named "hole_punch_test_file" is created inside 'path' via
// 'env', preallocated to a fixed size, and then has a hole punched into it.
// The on-disk size reported by 'env' is compared against the expected value
// both before and after the punch.
//
// Returns:
//  - Status::OK() if both size checks pass.
//  - Status::IOError describing the mismatch if either on-disk size differs
//    from what was expected.
//  - Otherwise, whatever non-OK status the failing 'env' or file operation
//    produced (propagated through RETURN_NOT_OK).
//
// Once the scratch file has been created, it is deleted on every exit path;
// a failed deletion is only reported through WARN_NOT_OK and does not change
// the returned status.
Status CheckHolePunch(Env* env, const string& path) {
  // Arbitrary sizes for the test. These are true compile-time constants, so
  // they are declared constexpr rather than as mutable function-local statics.
  constexpr uint64_t kFileSize = 4096 * 4;
  constexpr uint64_t kHoleOffset = 4096;
  constexpr uint64_t kHoleSize = 8192;
  constexpr uint64_t kPunchedFileSize = kFileSize - kHoleSize;

  // Open the test file.
  const string filename = JoinPathSegments(path, "hole_punch_test_file");
  unique_ptr<RWFile> file;
  RWFileOptions opts;
  // Encrypted files are larger due to the header size, which would throw off
  // the size comparisons below. The file does not need to be encrypted for
  // the hole punch test.
  opts.is_sensitive = false;
  RETURN_NOT_OK(env->NewRWFile(opts, filename, &file));

  // The file has been created; delete it on exit no matter what happens.
  auto file_deleter = MakeScopedCleanup([&]() {
    WARN_NOT_OK(env->DeleteFile(filename),
                "Could not delete file " + filename);
  });

  // Preallocate it, making sure the file's size is what we'd expect.
  uint64_t sz = 0;
  RETURN_NOT_OK(file->PreAllocate(0, kFileSize, RWFile::CHANGE_FILE_SIZE));
  RETURN_NOT_OK(env->GetFileSizeOnDisk(filename, &sz));
  if (sz != kFileSize) {
    return Status::IOError(Substitute(
        "Unexpected pre-punch file size for $0: expected $1 but got $2",
        filename, kFileSize, sz));
  }

  // Punch the hole, testing the file's size again.
  RETURN_NOT_OK(file->PunchHole(kHoleOffset, kHoleSize));
  RETURN_NOT_OK(env->GetFileSizeOnDisk(filename, &sz));
  if (sz != kPunchedFileSize) {
    return Status::IOError(Substitute(
        "Unexpected post-punch file size for $0: expected $1 but got $2",
        filename, kPunchedFileSize, sz));
  }
  return Status::OK();
}
} // anonymous namespace
// Evaluates 'status_expr' and if it results in a disk-failure error, logs a
// message and marks the instance as unhealthy, returning with no error.
//
// In detail, 'status_expr' is evaluated exactly once and:
//  - If the status is OK, nothing happens and execution continues after the
//    macro.
//  - If the status is NotFound or a disk failure, a copy of it with 'msg'
//    prepended is stored in 'health_status_', the same text is logged at
//    VLOG(1), and the enclosing function returns Status::OK().
//  - For any other error, the enclosing function returns the status with
//    'msg' prepended, and 'health_status_' is left untouched.
//
// 'msg' is only evaluated when the status is not OK.
//
// Because the expansion assigns to 'health_status_' and contains 'return'
// statements yielding a Status, it may only be used inside functions that
// return Status and have 'health_status_' in scope.
//
// Note: A disk failure may thwart attempts to read directory entries at the OS
// level, leading to NotFound errors when reading the instance files. As such,
// we treat missing instances the same way we treat those that yield more
// blatant disk failure POSIX codes.
//
// Note: if a non-disk-failure error is produced, the instance will remain
// healthy. These errors should be handled externally.
#define RETURN_NOT_OK_FAIL_INSTANCE_PREPEND(status_expr, msg) do { \
const Status& _s = (status_expr); \
if (PREDICT_FALSE(!_s.ok())) { \
const Status _s_prepended = _s.CloneAndPrepend(msg); \
if (_s.IsNotFound() || _s.IsDiskFailure()) { \
health_status_ = _s_prepended; \
VLOG(1) << "Directory instance has status: " << _s_prepended.ToString(); \
return Status::OK(); \
} \
return _s_prepended; \
} \
} while (0)
DirInstanceMetadataFile::DirInstanceMetadataFile(Env* env,
string uuid,
string dir_type,
string filename)
: env_(env),
uuid_(std::move(uuid)),
dir_type_(std::move(dir_type)),
filename_(std::move(filename)) {}
// If a file lock is still held at destruction time, attempts to release it.
// A failed unlock cannot be returned from a destructor, so it is only
// reported through WARN_NOT_OK.
DirInstanceMetadataFile::~DirInstanceMetadataFile() {
  if (!lock_) {
    // Nothing is locked; there is nothing to release.
    return;
  }
  WARN_NOT_OK(Unlock(), Substitute("Failed to unlock file $0", filename_));
}
// Creates this instance's directory if it is missing and writes a new
// instance metadata file into it.
//
// 'all_uuids' is the full set of UUIDs to record in the metadata; it must
// contain this instance's own UUID (checked with DCHECK). The instance must
// not currently hold its file lock (also checked with DCHECK).
//
// If 'created_dir' is non-null and every step below completes, it is set to
// whether this call newly created the directory.
//
// Steps, in order:
//  1. Create the directory if it does not exist yet.
//  2. For the "log" directory type, verify hole punching works there.
//  3. Create a temporary file in the directory and ask the environment for
//     its block size.
//  4. Write the metadata (UUID, all UUIDs, directory type, block size) to the
//     instance file, refusing to overwrite an existing one.
//
// Every fallible step is wrapped in RETURN_NOT_OK_FAIL_INSTANCE_PREPEND:
// NotFound and disk-failure errors are recorded in 'health_status_' and make
// this function return OK early, while other errors are returned to the
// caller. On any early return after step 1, a directory that step 1 created
// is removed again, and 'created_dir' is left unmodified.
Status DirInstanceMetadataFile::Create(const set<string>& all_uuids,
                                       bool* created_dir) {
  DCHECK(!lock_);
  DCHECK(ContainsKey(all_uuids, uuid_));

  // Step 1: make sure the directory exists, remembering whether we made it.
  const string instance_dir = dir();
  bool dir_was_created;
  RETURN_NOT_OK_FAIL_INSTANCE_PREPEND(
      env_util::CreateDirIfMissing(env_, instance_dir, &dir_was_created),
      Substitute("Could not create directory $0", instance_dir));

  // Until success is certain, undo the directory creation on the way out.
  auto remove_new_dir = MakeScopedCleanup([&] {
    if (!dir_was_created) {
      return;
    }
    WARN_NOT_OK(env_->DeleteDir(instance_dir), "Could not remove newly-created directory");
  });

  // Step 2: the log block manager depends on hole punching, so check for it
  // before committing anything to disk.
  const bool is_log_dir = dir_type_ == "log";
  if (is_log_dir) {
    RETURN_NOT_OK_FAIL_INSTANCE_PREPEND(CheckHolePunch(env_, instance_dir),
                                        kHolePunchErrorMsg);
  }

  // Step 3: determine the filesystem block size from a throwaway regular file.
  //
  // Some filesystems report a different block size for directories than for
  // files, and the value may feed into intra-file layout decisions, so a file
  // inside the directory is the more accurate thing to measure.
  const string tmp_name_template = JoinPathSegments(
      instance_dir, Substitute("getblocksize$0.XXXXXX", kTmpInfix));
  WritableFileOptions tmp_opts;
  tmp_opts.is_sensitive = false;
  string tmp_path;
  unique_ptr<WritableFile> tmp_file;
  RETURN_NOT_OK_FAIL_INSTANCE_PREPEND(
      env_->NewTempWritableFile(tmp_opts, tmp_name_template, &tmp_path, &tmp_file),
      "failed to create temp file while checking block size");

  // The temporary file is only needed for the measurement; always delete it.
  auto remove_tmp_file = MakeScopedCleanup([&] {
    WARN_NOT_OK(env_->DeleteFile(tmp_path),
                "could not delete temporary file");
  });

  uint64_t fs_block_size;
  RETURN_NOT_OK_FAIL_INSTANCE_PREPEND(env_->GetBlockSize(tmp_path, &fs_block_size),
                                      "failed to check block size");

  // Step 4: assemble the metadata record...
  DirInstanceMetadataPB instance_pb;
  instance_pb.set_dir_type(dir_type_);
  instance_pb.set_filesystem_block_size_bytes(fs_block_size);

  DirSetPB* dir_set_pb = instance_pb.mutable_dir_set();
  dir_set_pb->set_uuid(uuid_);
  dir_set_pb->mutable_all_uuids()->Reserve(all_uuids.size());
  for (const string& member_uuid : all_uuids) {
    dir_set_pb->add_all_uuids(member_uuid);
  }

  // ...and persist it, syncing only when data block fsync is enabled.
  const auto sync_mode =
      FLAGS_enable_data_block_fsync ? pb_util::SYNC : pb_util::NO_SYNC;
  RETURN_NOT_OK_FAIL_INSTANCE_PREPEND(
      pb_util::WritePBContainerToPath(env_, filename_, instance_pb,
                                      pb_util::NO_OVERWRITE,
                                      sync_mode,
                                      pb_util::NOT_SENSITIVE),
      "failed to write PB");

  // Success: keep the directory, and tell the caller (if interested) whether
  // it is a new one that they may need to clean up later.
  remove_new_dir.cancel();
  if (created_dir != nullptr) {
    *created_dir = dir_was_created;
  }
  return Status::OK();
}
// Reads the instance metadata file from disk and validates it against this
// object's expectations.
//
// The stored directory type must equal 'dir_type_', and the stored filesystem
// block size must equal the block size the environment currently reports for
// the instance file; either mismatch yields an IOError. When both checks
// pass, 'uuid_' is replaced with the UUID found in the file and the parsed
// metadata is retained in 'metadata_'.
//
// The read and the block size lookup go through
// RETURN_NOT_OK_FAIL_INSTANCE_PREPEND, so NotFound and disk-failure errors
// are recorded in 'health_status_' and this function returns OK without
// loading anything.
//
// Must not be called while the file lock is held (checked with DCHECK).
Status DirInstanceMetadataFile::LoadFromDisk() {
  DCHECK(!lock_) <<
      "Opening a metadata file that's already locked would release the lock";

  // Parse into a scratch object first; members are updated only once every
  // validation below has passed.
  unique_ptr<DirInstanceMetadataPB> loaded(new DirInstanceMetadataPB());
  RETURN_NOT_OK_FAIL_INSTANCE_PREPEND(
      pb_util::ReadPBContainerFromPath(env_, filename_, loaded.get(), pb_util::NOT_SENSITIVE),
      Substitute("Failed to read metadata file from $0", filename_));

  // The on-disk format type has to match the one we were configured with.
  const bool same_dir_type = loaded->dir_type() == dir_type_;
  if (!same_dir_type) {
    return Status::IOError(Substitute(
        "existing instance was written using the '$0' format; cannot restart "
        "with a different format type '$1'",
        loaded->dir_type(), dir_type_));
  }

  // The block size recorded at creation time has to match the current one.
  uint64_t current_block_size;
  RETURN_NOT_OK_FAIL_INSTANCE_PREPEND(
      env_->GetBlockSize(filename_, &current_block_size),
      Substitute("Failed to load metadata file. Could not get block size of $0", filename_));
  const auto recorded_block_size = loaded->filesystem_block_size_bytes();
  if (recorded_block_size != current_block_size) {
    return Status::IOError("Wrong filesystem block size", Substitute(
        "Expected $0 but was $1", recorded_block_size, current_block_size));
  }

  // Everything checks out; adopt what was read.
  uuid_ = loaded->dir_set().uuid();
  metadata_ = std::move(loaded);
  return Status::OK();
}
// Acquires a file lock on the instance file and stores it in 'lock_'.
//
// Must not be called while a lock is already held (checked with DCHECK).
// The lock attempt goes through RETURN_NOT_OK_FAIL_INSTANCE_PREPEND: a
// NotFound or disk-failure error is recorded in 'health_status_' and OK is
// returned with 'lock_' still empty; any other error is returned to the
// caller with an explanatory prefix.
Status DirInstanceMetadataFile::Lock() {
  DCHECK(!lock_);

  FileLock* acquired_lock;
  RETURN_NOT_OK_FAIL_INSTANCE_PREPEND(
      env_->LockFile(filename_, &acquired_lock),
      "Could not lock instance file. Make sure that "
      "Kudu is not already running and you are not trying to run "
      "Kudu with a different user than before");

  // Only reached when LockFile() succeeded; take ownership of the lock.
  lock_.reset(acquired_lock);
  return Status::OK();
}
// Releases the file lock held on the instance file.
//
// A lock must currently be held (checked with DCHECK). Ownership of the lock
// object is given up by 'lock_' before the environment is asked to unlock it,
// so 'lock_' is empty afterwards whether or not the unlock succeeds.
// NOTE(review): this presumes Env::UnlockFile() takes over and frees the
// pointer it is handed -- confirm against the Env interface.
//
// The unlock goes through RETURN_NOT_OK_FAIL_INSTANCE_PREPEND: a NotFound or
// disk-failure error is recorded in 'health_status_' and OK is returned; any
// other error is returned with the file name prepended.
Status DirInstanceMetadataFile::Unlock() {
  DCHECK(lock_);

  FileLock* const held_lock = lock_.release();
  RETURN_NOT_OK_FAIL_INSTANCE_PREPEND(
      env_->UnlockFile(held_lock),
      Substitute("Could not unlock $0", filename_));
  return Status::OK();
}
} // namespace fs
} // namespace kudu