// blob: 8a32d5714e5ee9338c0f967145cd029f0939685d [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <functional>
#include <string>
#include <unordered_map>
#include <utility>
#include <glog/logging.h>
#include "kudu/fs/dir_manager.h"
#include "kudu/fs/dir_util.h"
#include "kudu/gutil/port.h"
#include "kudu/util/mutex.h"
namespace kudu {
namespace fs {
// Signature of a callback into error-handling code. The string argument is the
// UUID of the component that failed.
//
// e.g. the ErrorNotificationCb for disk failure handling takes the UUID of a
// directory, marks it failed, and shuts down the tablets in that directory.
using ErrorNotificationCb = std::function<void(const std::string&)>;
// Evaluates 'status_expr' once. If the resulting status is not OK, passes it
// to HandleError() (which must be callable where the macro is expanded) and
// then returns it from the enclosing function. An OK status falls through.
#define RETURN_NOT_OK_HANDLE_ERROR(status_expr) do { \
  const Status& _s = (status_expr); \
  if (PREDICT_FALSE(!_s.ok())) { \
    HandleError(_s); \
    return _s; \
  } \
} while (0)
// Evaluates 'status_expr' once. On any non-OK status the enclosing function
// returns that status; if the status is additionally a disk failure,
// 'err_handler' is evaluated first. An OK status falls through.
#define RETURN_NOT_OK_HANDLE_DISK_FAILURE(status_expr, err_handler) do { \
  const Status& _s = (status_expr); \
  if (PREDICT_FALSE(!_s.ok())) { \
    if (_s.IsDiskFailure()) { \
      (err_handler); \
    } \
    return _s; \
  } \
} while (0)
// Evaluates 'status_expr' once. On any non-OK status the enclosing function
// returns that status; if the status is additionally a corruption,
// 'err_handler' is evaluated first. An OK status falls through.
#define RETURN_NOT_OK_HANDLE_CORRUPTION(status_expr, err_handler) do { \
  const Status& _s = (status_expr); \
  if (PREDICT_FALSE(!_s.ok())) { \
    if (_s.IsCorruption()) { \
      (err_handler); \
    } \
    return _s; \
  } \
} while (0)
// Evaluates 'status_expr' once and, only if the resulting status is a disk
// failure, evaluates 'err_handler'. Never returns from the enclosing function.
#define HANDLE_DISK_FAILURE(status_expr, err_handler) do { \
  const Status& _s = (status_expr); \
  if (PREDICT_TRUE(!_s.IsDiskFailure())) { \
    break; \
  } \
  (err_handler); \
} while (0)
// Kinds of errors for which an error-handling callback can be registered.
enum ErrorHandlerType {
  // Disk failures.
  DISK_ERROR = 0,

  // Errors caused by no data dirs being available (e.g. if all disks are full
  // or failed when creating a block).
  //
  // TODO(awong): Register an actual error-handling callback for
  // NO_AVAILABLE_DISKS. Some errors may surface indirectly due to disk errors,
  // but may not have touched disk, and thus may not have called the DISK_ERROR
  // error handler.
  //
  // For example, if every disk in a tablet's directory group has already
  // failed due to disk errors, the tablet cannot create a new block and
  // CreateNewBlock() returns an error, even though it never touched disk and
  // so never triggered error handling itself. Callers of CreateNewBlock()
  // expect that a returned error has already been handled, and may hit a CHECK
  // failure otherwise. As such, before returning an error, CreateNewBlock()
  // must wait for any in-flight error handling to finish.
  //
  // While this currently runs a no-op, it serves to enforce that any
  // error-handling caused by ERROR1 that may have indirectly caused ERROR2
  // (e.g. if ERROR1 is a disk error of the only disk on the server, and ERROR2
  // is the subsequent failure to create a block because all disks have been
  // marked as failed) must complete before ERROR2 can be returned to its
  // caller.
  NO_AVAILABLE_DISKS = 1,

  // CFile corruptions.
  CFILE_CORRUPTION = 2,

  // Broken invariants caused by KUDU-2233.
  KUDU_2233_CORRUPTION = 3,
};
// The side effects of a failed operation can span multiple layers, many of
// which we prefer to keep separate. The FsErrorManager registers and runs
// error handlers without adding cross-layer dependencies. Additionally, it
// enforces that one callback is run at a time, and that each callback fully
// completes before returning.
//
// e.g. the TSTabletManager registers a callback to handle disk failure.
// Blocks and other entities that may hit disk failures can call it without
// knowing about the TSTabletManager.
class FsErrorManager {
 public:
  FsErrorManager();

  // Sets the error notification callback associated with 'e'.
  //
  // This should be called when the callback's callee is initialized.
  void SetErrorNotificationCb(ErrorHandlerType e, ErrorNotificationCb cb);

  // Resets the error notification callback associated with 'e'.
  //
  // This must be called before the callback's callee is destroyed.
  void UnsetErrorNotificationCb(ErrorHandlerType e);

  // Runs the error notification callback associated with 'e'.
  //
  // 'uuid' is the full UUID of the component that failed.
  void RunErrorNotificationCb(ErrorHandlerType e, const std::string& uuid) const;

  // Convenience overload that runs the error notification callback with the
  // UUID taken from the instance of 'dir'. Debug builds check that 'e' is
  // DISK_ERROR.
  void RunErrorNotificationCb(ErrorHandlerType e, const Dir* dir) const {
    DCHECK_EQ(e, ErrorHandlerType::DISK_ERROR);
    const std::string& failed_dir_uuid = dir->instance()->uuid();
    RunErrorNotificationCb(e, failed_dir_uuid);
  }

 private:
  // Callbacks to be run when an error occurs, keyed by error type.
  std::unordered_map<ErrorHandlerType, ErrorNotificationCb, std::hash<int>> callbacks_;

  // Protects calls to notifications, enforcing that a single callback runs at
  // a time. Declared mutable so that it can be used from const methods.
  mutable Mutex lock_;
};
} // namespace fs
} // namespace kudu