| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| #include "kudu/fs/block_manager_util.h" |
| |
| #include <cstdint> |
| #include <ostream> |
| #include <set> |
| #include <unordered_map> |
| #include <utility> |
| |
| #include <gflags/gflags_declare.h> |
| #include <glog/logging.h> |
| |
| #include "kudu/fs/fs.pb.h" |
| #include "kudu/gutil/map-util.h" |
| #include "kudu/gutil/port.h" |
| #include "kudu/gutil/strings/join.h" |
| #include "kudu/gutil/strings/substitute.h" |
| #include "kudu/util/env.h" |
| #include "kudu/util/path_util.h" |
| #include "kudu/util/pb_util.h" |
| #include "kudu/util/scoped_cleanup.h" |
| #include "kudu/util/test_util_prod.h" |
| |
| DECLARE_bool(enable_data_block_fsync); |
| |
| namespace kudu { |
| namespace fs { |
| |
| using pb_util::CreateMode; |
| using std::set; |
| using std::string; |
| using std::unique_ptr; |
| using std::unordered_map; |
| using std::vector; |
| using strings::Substitute; |
| |
// Evaluates 'status_expr' exactly once and acts on the resulting Status:
//
//  - OK: nothing happens and control continues after the macro.
//  - NotFound or DiskFailure: the error, prefixed with 'msg', is stored in
//    'health_status_' (marking the instance as unhealthy) and logged, and the
//    enclosing function returns Status::OK().
//  - Any other error: the enclosing function returns the error prefixed with
//    'msg'. 'health_status_' is left untouched, so the instance stays healthy
//    and the error must be handled by the caller.
//
// NotFound is deliberately lumped in with disk failures: a failed disk may
// prevent the OS from reading directory entries, which surfaces as a missing
// instance file rather than as a more blatant disk-failure POSIX code.
//
// 'msg' is only evaluated on the failure path. The macro must be expanded in
// a function that returns Status and has 'health_status_' in scope.
#define RETURN_NOT_OK_FAIL_INSTANCE_PREPEND(status_expr, msg) do { \
  const Status& _inst_s = (status_expr); \
  if (PREDICT_FALSE(!_inst_s.ok())) { \
    const Status _inst_s_with_msg = _inst_s.CloneAndPrepend(msg); \
    if (!_inst_s.IsNotFound() && !_inst_s.IsDiskFailure()) { \
      return _inst_s_with_msg; \
    } \
    health_status_ = _inst_s_with_msg; \
    LOG(INFO) << "Instance is unhealthy: " << _inst_s_with_msg.ToString(); \
    return Status::OK(); \
  } \
} while (0)
| |
| PathInstanceMetadataFile::PathInstanceMetadataFile(Env* env, |
| string block_manager_type, |
| string filename) |
| : env_(env), |
| block_manager_type_(std::move(block_manager_type)), |
| filename_(std::move(filename)) {} |
| |
| PathInstanceMetadataFile::~PathInstanceMetadataFile() { |
| if (lock_) { |
| WARN_NOT_OK(Unlock(), Substitute("Failed to unlock file $0", filename_)); |
| } |
| } |
| |
| Status PathInstanceMetadataFile::Create(const string& uuid, const vector<string>& all_uuids) { |
| DCHECK(!lock_) << |
| "Creating a metadata file that's already locked would release the lock"; |
| DCHECK(ContainsKey(set<string>(all_uuids.begin(), all_uuids.end()), uuid)); |
| |
| // Create a temporary file with which to fetch the filesystem's block size. |
| // |
| // This is a safer bet than using the parent directory as some filesystems |
| // advertise different block sizes for directories than for files. On top of |
| // that, the value may inform intra-file layout decisions made by Kudu, so |
| // it's more correct to derive it from a file in any case. |
| string created_filename; |
| string tmp_template = JoinPathSegments( |
| DirName(filename_), Substitute("getblocksize$0.XXXXXX", kTmpInfix)); |
| unique_ptr<WritableFile> tmp_file; |
| RETURN_NOT_OK(env_->NewTempWritableFile(WritableFileOptions(), |
| tmp_template, |
| &created_filename, &tmp_file)); |
| SCOPED_CLEANUP({ |
| WARN_NOT_OK(env_->DeleteFile(created_filename), |
| "could not delete temporary file"); |
| }); |
| uint64_t block_size; |
| RETURN_NOT_OK(env_->GetBlockSize(created_filename, &block_size)); |
| |
| PathInstanceMetadataPB new_instance; |
| |
| // Set up the path set. |
| PathSetPB* new_path_set = new_instance.mutable_path_set(); |
| new_path_set->set_uuid(uuid); |
| new_path_set->mutable_all_uuids()->Reserve(all_uuids.size()); |
| for (const string& u : all_uuids) { |
| new_path_set->add_all_uuids(u); |
| } |
| |
| // And the rest of the metadata. |
| new_instance.set_block_manager_type(block_manager_type_); |
| new_instance.set_filesystem_block_size_bytes(block_size); |
| |
| return pb_util::WritePBContainerToPath( |
| env_, filename_, new_instance, |
| pb_util::NO_OVERWRITE, |
| FLAGS_enable_data_block_fsync ? pb_util::SYNC : pb_util::NO_SYNC); |
| } |
| |
| Status PathInstanceMetadataFile::LoadFromDisk() { |
| DCHECK(!lock_) << |
| "Opening a metadata file that's already locked would release the lock"; |
| |
| unique_ptr<PathInstanceMetadataPB> pb(new PathInstanceMetadataPB()); |
| RETURN_NOT_OK_FAIL_INSTANCE_PREPEND(pb_util::ReadPBContainerFromPath(env_, filename_, pb.get()), |
| Substitute("Failed to read metadata file from $0", filename_)); |
| |
| if (pb->block_manager_type() != block_manager_type_) { |
| return Status::IOError(Substitute( |
| "existing data was written using the '$0' block manager; cannot restart " |
| "with a different block manager '$1' without reformatting", |
| pb->block_manager_type(), block_manager_type_)); |
| } |
| |
| uint64_t block_size; |
| RETURN_NOT_OK_FAIL_INSTANCE_PREPEND(env_->GetBlockSize(filename_, &block_size), |
| Substitute("Failed to load metadata file. Could not get block size of $0", filename_)); |
| if (pb->filesystem_block_size_bytes() != block_size) { |
| return Status::IOError("Wrong filesystem block size", Substitute( |
| "Expected $0 but was $1", pb->filesystem_block_size_bytes(), block_size)); |
| } |
| |
| metadata_.swap(pb); |
| return Status::OK(); |
| } |
| |
| Status PathInstanceMetadataFile::Lock() { |
| DCHECK(!lock_); |
| |
| FileLock* lock; |
| RETURN_NOT_OK_FAIL_INSTANCE_PREPEND(env_->LockFile(filename_, &lock), |
| "Could not lock block_manager_instance file. Make sure that " |
| "Kudu is not already running and you are not trying to run " |
| "Kudu with a different user than before"); |
| lock_.reset(lock); |
| return Status::OK(); |
| } |
| |
| Status PathInstanceMetadataFile::Unlock() { |
| DCHECK(lock_); |
| |
| RETURN_NOT_OK_FAIL_INSTANCE_PREPEND(env_->UnlockFile(lock_.release()), |
| Substitute("Could not unlock $0", filename_)); |
| return Status::OK(); |
| } |
| |
| void PathInstanceMetadataFile::SetMetadataForTests( |
| unique_ptr<PathInstanceMetadataPB> metadata) { |
| DCHECK(IsGTest()); |
| metadata_ = std::move(metadata); |
| } |
| |
| Status PathInstanceMetadataFile::CheckIntegrity( |
| const vector<unique_ptr<PathInstanceMetadataFile>>& instances) { |
| CHECK(!instances.empty()); |
| |
| // Note: although this verification works at the level of UUIDs and instance |
| // files, the (user-facing) error messages are reported in terms of data |
| // directories, because UUIDs and instance files are internal details. |
| |
| int first_healthy = -1; |
| for (int i = 0; i < instances.size(); i++) { |
| if (instances[i]->healthy()) { |
| first_healthy = i; |
| break; |
| } |
| } |
| if (first_healthy == -1) { |
| return Status::NotFound("no healthy data directories found"); |
| } |
| |
| // Map of instance UUID to path instance structure. Tracks duplicate UUIDs. |
| unordered_map<string, PathInstanceMetadataFile*> uuids; |
| |
| // Set of UUIDs specified in the path set of the first healthy instance. All |
| // instances will be compared against it to make sure all path sets match. |
| set<string> all_uuids(instances[first_healthy]->metadata()->path_set().all_uuids().begin(), |
| instances[first_healthy]->metadata()->path_set().all_uuids().end()); |
| |
| if (all_uuids.size() != instances.size()) { |
| return Status::IOError( |
| Substitute("$0 data directories provided, but expected $1", |
| instances.size(), all_uuids.size())); |
| } |
| |
| for (const auto& instance : instances) { |
| // If the instance has failed (e.g. due to disk failure), there's no |
| // telling what its metadata looks like. Ignore it, and continue checking |
| // integrity across the healthy instances. |
| if (!instance->healthy()) { |
| continue; |
| } |
| const PathSetPB& path_set = instance->metadata()->path_set(); |
| |
| // Check that the instance's UUID has not been claimed by another instance. |
| PathInstanceMetadataFile** other = InsertOrReturnExisting( |
| &uuids, path_set.uuid(), instance.get()); |
| if (other) { |
| return Status::IOError( |
| Substitute("Data directories $0 and $1 have duplicate instance metadata UUIDs", |
| (*other)->dir(), instance->dir()), |
| path_set.uuid()); |
| } |
| |
| // Check that the instance's UUID is a member of all_uuids. |
| if (!ContainsKey(all_uuids, path_set.uuid())) { |
| return Status::IOError( |
| Substitute("Data directory $0 instance metadata contains unexpected UUID", |
| instance->dir()), |
| path_set.uuid()); |
| } |
| |
| // Check that the instance's UUID set does not contain duplicates. |
| set<string> deduplicated_uuids(path_set.all_uuids().begin(), |
| path_set.all_uuids().end()); |
| string all_uuids_str = JoinStrings(path_set.all_uuids(), ","); |
| if (deduplicated_uuids.size() != path_set.all_uuids_size()) { |
| return Status::IOError( |
| Substitute("Data directory $0 instance metadata path set contains duplicate UUIDs", |
| instance->dir()), |
| JoinStrings(path_set.all_uuids(), ",")); |
| } |
| |
| // Check that the instance's UUID set matches the expected set. |
| if (deduplicated_uuids != all_uuids) { |
| return Status::IOError( |
| Substitute("Data directories $0 and $1 have different instance metadata UUID sets", |
| instances[0]->dir(), instance->dir()), |
| Substitute("$0 vs $1", JoinStrings(all_uuids, ","), all_uuids_str)); |
| } |
| } |
| |
| return Status::OK(); |
| } |
| |
| } // namespace fs |
| } // namespace kudu |