// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdint.h>
#include <algorithm>
#include <cctype>
#include <fstream>
#include <functional>
#include <iomanip>
#include <list>
#include <memory>
#include <set>
#include <sstream>
#include <tuple>
#include <utility>
#include <mesos/module.hpp>
#include <mesos/roles.hpp>
#include <mesos/authentication/authenticator.hpp>
#include <mesos/authorizer/authorizer.hpp>
#include <mesos/allocator/allocator.hpp>
#include <mesos/master/contender.hpp>
#include <mesos/master/detector.hpp>
#include <mesos/module/authenticator.hpp>
#include <mesos/scheduler/scheduler.hpp>
#include <process/check.hpp>
#include <process/collect.hpp>
#include <process/defer.hpp>
#include <process/delay.hpp>
#include <process/http.hpp>
#include <process/id.hpp>
#include <process/limiter.hpp>
#include <process/owned.hpp>
#include <process/run.hpp>
#include <process/shared.hpp>
#include <process/metrics/metrics.hpp>
#include <stout/check.hpp>
#include <stout/duration.hpp>
#include <stout/error.hpp>
#include <stout/foreach.hpp>
#include <stout/ip.hpp>
#include <stout/lambda.hpp>
#include <stout/multihashmap.hpp>
#include <stout/net.hpp>
#include <stout/nothing.hpp>
#include <stout/numify.hpp>
#include <stout/option.hpp>
#include <stout/path.hpp>
#include <stout/stringify.hpp>
#include <stout/unreachable.hpp>
#include <stout/utils.hpp>
#include <stout/uuid.hpp>
#include "authentication/cram_md5/authenticator.hpp"
#include "common/build.hpp"
#include "common/http.hpp"
#include "common/protobuf_utils.hpp"
#include "common/resource_quantities.hpp"
#include "common/status_utils.hpp"
#include "credentials/credentials.hpp"
#include "hook/manager.hpp"
#include "logging/flags.hpp"
#include "logging/logging.hpp"
#include "master/flags.hpp"
#include "master/master.hpp"
#include "master/registry_operations.hpp"
#include "master/weights.hpp"
#include "module/manager.hpp"
#include "watcher/whitelist_watcher.hpp"
using google::protobuf::RepeatedPtrField;
using std::list;
using std::reference_wrapper;
using std::set;
using std::shared_ptr;
using std::string;
using std::tie;
using std::tuple;
using std::vector;
using process::await;
using process::wait; // Necessary on some OS's to disambiguate.
using process::Clock;
using process::ExitedEvent;
using process::Failure;
using process::Future;
using process::MessageEvent;
using process::Owned;
using process::PID;
using process::Process;
using process::Promise;
using process::RateLimiter;
using process::Shared;
using process::Time;
using process::Timer;
using process::UPID;
using process::http::Pipe;
using process::http::authentication::Principal;
using process::metrics::Counter;
namespace mesos {
namespace internal {
namespace master {
using mesos::allocator::Allocator;
using mesos::authorization::createSubject;
using mesos::authorization::VIEW_ROLE;
using mesos::authorization::VIEW_FRAMEWORK;
using mesos::authorization::VIEW_TASK;
using mesos::authorization::VIEW_EXECUTOR;
using mesos::master::contender::MasterContender;
using mesos::master::detector::MasterDetector;
using mesos::internal::ResourceQuantities;
static bool isValidFailoverTimeout(const FrameworkInfo& frameworkInfo);
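// Performs periodic "ping-pong" health checking of a registered agent:
// a ping is sent every `slavePingTimeout`, and if `maxSlavePingTimeouts`
// consecutive pings go unanswered the observer asks the master (subject
// to the optional removal rate limiter) to mark the agent unreachable.
// A pong that arrives while the transition is pending cancels it.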
class SlaveObserver : public ProtobufProcess<SlaveObserver>
{
public:
SlaveObserver(const UPID& _slave,
const SlaveInfo& _slaveInfo,
const SlaveID& _slaveId,
const PID<Master>& _master,
const Option<shared_ptr<RateLimiter>>& _limiter,
const shared_ptr<Metrics>& _metrics,
const Duration& _slavePingTimeout,
const size_t _maxSlavePingTimeouts)
: ProcessBase(process::ID::generate("slave-observer")),
slave(_slave),
slaveInfo(_slaveInfo),
slaveId(_slaveId),
master(_master),
limiter(_limiter),
metrics(_metrics),
slavePingTimeout(_slavePingTimeout),
maxSlavePingTimeouts(_maxSlavePingTimeouts),
timeouts(0),
pinged(false),
connected(true)
{
install<PongSlaveMessage>(&SlaveObserver::pong);
}
void reconnect()
{
connected = true;
}
void disconnect()
{
connected = false;
}
protected:
void initialize() override
{
ping();
}
void ping()
{
PingSlaveMessage message;
message.set_connected(connected);
send(slave, message);
pinged = true;
delay(slavePingTimeout, self(), &SlaveObserver::timeout);
}
void pong()
{
timeouts = 0;
pinged = false;
// Cancel any pending unreachable transitions.
if (markingUnreachable.isSome()) {
// Need a copy for non-const access.
Future<Nothing> future = markingUnreachable.get();
future.discard();
}
}
void timeout()
{
if (pinged) {
timeouts++; // No pong has been received before the timeout.
if (timeouts >= maxSlavePingTimeouts) {
// No pong has been received for the last
// 'maxSlavePingTimeouts' pings.
markUnreachable();
}
}
// NOTE: We keep pinging even if we schedule a transition to
// UNREACHABLE. This is because if the slave eventually responds
// to a ping, we can cancel the UNREACHABLE transition.
ping();
}
// Marking slaves unreachable is rate-limited and can be canceled if
// a pong is received before `_markUnreachable` is called.
//
// TODO(neilc): Using a rate-limit when marking slaves unreachable
// is only necessary for frameworks that are not PARTITION_AWARE.
  // For such frameworks, we shut down their tasks when an unreachable
// agent reregisters, so a rate-limit is a useful safety
// precaution. Once all frameworks are PARTITION_AWARE, we can
// likely remove the rate-limit (MESOS-5948).
void markUnreachable()
{
if (markingUnreachable.isSome()) {
return; // Unreachable transition is already in progress.
}
Future<Nothing> acquire = Nothing();
if (limiter.isSome()) {
LOG(INFO) << "Scheduling transition of agent " << slaveId
<< " to UNREACHABLE because of health check timeout";
acquire = limiter.get()->acquire();
}
markingUnreachable = acquire.onAny(defer(self(), &Self::_markUnreachable));
++metrics->slave_unreachable_scheduled;
}
void _markUnreachable()
{
CHECK_SOME(markingUnreachable);
const Future<Nothing>& future = markingUnreachable.get();
CHECK(!future.isFailed());
if (future.isReady()) {
++metrics->slave_unreachable_completed;
dispatch(master,
&Master::markUnreachable,
slaveInfo,
false,
"health check timed out");
} else if (future.isDiscarded()) {
LOG(INFO) << "Canceling transition of agent " << slaveId
<< " to UNREACHABLE because a pong was received!";
++metrics->slave_unreachable_canceled;
}
markingUnreachable = None();
}
private:
const UPID slave;
const SlaveInfo slaveInfo;
const SlaveID slaveId;
const PID<Master> master;
const Option<shared_ptr<RateLimiter>> limiter;
shared_ptr<Metrics> metrics;
Option<Future<Nothing>> markingUnreachable;
const Duration slavePingTimeout;
const size_t maxSlavePingTimeouts;
uint32_t timeouts;
bool pinged;
bool connected;
};
Master::Master(
Allocator* _allocator,
Registrar* _registrar,
Files* _files,
MasterContender* _contender,
MasterDetector* _detector,
const Option<Authorizer*>& _authorizer,
const Option<shared_ptr<RateLimiter>>& _slaveRemovalLimiter,
const Flags& _flags)
: ProcessBase("master"),
flags(_flags),
http(this),
allocator(_allocator),
registrar(_registrar),
files(_files),
contender(_contender),
detector(_detector),
authorizer(_authorizer),
frameworks(flags),
subscribers(this),
authenticator(None()),
metrics(new Metrics(*this)),
electedTime(None())
{
slaves.limiter = _slaveRemovalLimiter;
// NOTE: We populate 'info_' here instead of inside 'initialize()'
// because 'StandaloneMasterDetector' needs access to the info.
// Master ID is generated randomly based on UUID.
info_.set_id(id::UUID::random().toString());
// NOTE: Currently, we store ip in MasterInfo in network order,
// which should be fixed. See MESOS-1201 for details.
// TODO(marco): The ip, port, hostname fields above are
// being deprecated; the code should be removed once
// the deprecation cycle is complete.
info_.set_ip(self().address.ip.in()->s_addr);
info_.set_port(self().address.port);
info_.set_pid(self());
info_.set_version(MESOS_VERSION);
for (const MasterInfo::Capability& capability : MASTER_CAPABILITIES()) {
info_.add_capabilities()->CopyFrom(capability);
}
// Determine our hostname or use the hostname provided.
string hostname;
if (flags.hostname.isNone()) {
if (flags.hostname_lookup) {
Try<string> result = net::getHostname(self().address.ip);
if (result.isError()) {
LOG(FATAL) << "Failed to get hostname: " << result.error();
}
hostname = result.get();
} else {
// We use the IP address for hostname if the user requested us
// NOT to look it up, and it wasn't explicitly set via --hostname:
hostname = stringify(self().address.ip);
}
} else {
hostname = flags.hostname.get();
}
info_.set_hostname(hostname);
// This uses the new `Address` message in `MasterInfo`.
info_.mutable_address()->set_ip(stringify(self().address.ip));
info_.mutable_address()->set_port(self().address.port);
info_.mutable_address()->set_hostname(hostname);
if (flags.domain.isSome()) {
info_.mutable_domain()->CopyFrom(flags.domain.get());
}
}
Master::~Master() {}
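// Returns the minimum capabilities recorded in the registry that this
// master does not advertise. For example, if the registry lists a minimum
// capability "X" (a hypothetical name) that is absent from
// MASTER_CAPABILITIES(), the result is {"X"}, and recovery will refuse to
// proceed (see `Master::_recover()`).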
hashset<string> Master::missingMinimumCapabilities(
const MasterInfo& masterInfo, const Registry& registry)
{
if (registry.minimum_capabilities().size() == 0) {
return hashset<string>();
}
hashset<string> minimumCapabilities, masterCapabilities;
foreach (
const Registry::MinimumCapability& minimumCapability,
registry.minimum_capabilities()) {
minimumCapabilities.insert(minimumCapability.capability());
}
foreach (
const MasterInfo::Capability& masterCapability,
masterInfo.capabilities()) {
masterCapabilities.insert(
MasterInfo::Capability::Type_Name(masterCapability.type()));
}
return minimumCapabilities - masterCapabilities;
}
// TODO(vinod): Update this interface to return failed futures when
// capacity is reached.
struct BoundedRateLimiter
{
BoundedRateLimiter(double qps, Option<uint64_t> _capacity)
: limiter(new process::RateLimiter(qps)),
capacity(_capacity),
messages(0) {}
process::Owned<process::RateLimiter> limiter;
const Option<uint64_t> capacity;
// Number of outstanding messages for this RateLimiter.
// NOTE: ExitedEvents are throttled but not counted towards
// the capacity here.
uint64_t messages;
};
void Master::initialize()
{
LOG(INFO) << "Master " << info_.id() << " (" << info_.hostname() << ")"
<< " started on " << string(self()).substr(7);
LOG(INFO) << "Flags at startup: " << flags;
if (process::address().ip.isLoopback()) {
LOG(WARNING) << "\n**************************************************\n"
<< "Master bound to loopback interface!"
<< " Cannot communicate with remote schedulers or agents."
<< " You might want to set '--ip' flag to a routable"
<< " IP address.\n"
<< "**************************************************";
}
// NOTE: We enforce a minimum slave reregister timeout because the
// slave bounds its (re-)registration retries based on the minimum.
if (flags.agent_reregister_timeout < MIN_AGENT_REREGISTER_TIMEOUT) {
EXIT(EXIT_FAILURE)
<< "Invalid value '" << flags.agent_reregister_timeout << "'"
<< " for --agent_reregister_timeout:"
<< " Must be at least " << MIN_AGENT_REREGISTER_TIMEOUT;
}
// Parse the percentage for the slave removal limit.
// TODO(bmahler): Add a 'Percentage' abstraction.
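  // An illustrative setting: `--recovery_agent_removal_limit=80%` allows up
  // to 80% of the agents recovered from the registry to fail to reregister
  // after a master failover; beyond that, the master aborts (see
  // `recoveredSlavesTimeout()`).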
if (!strings::endsWith(flags.recovery_agent_removal_limit, "%")) {
EXIT(EXIT_FAILURE)
<< "Invalid value '" << flags.recovery_agent_removal_limit << "'"
<< " for --recovery_agent_removal_percent_limit: " << "missing '%'";
}
Try<double> limit = numify<double>(
strings::remove(
flags.recovery_agent_removal_limit,
"%",
strings::SUFFIX));
if (limit.isError()) {
EXIT(EXIT_FAILURE)
<< "Invalid value '" << flags.recovery_agent_removal_limit << "'"
<< " for --recovery_agent_removal_percent_limit: " << limit.error();
}
if (limit.get() < 0.0 || limit.get() > 100.0) {
EXIT(EXIT_FAILURE)
<< "Invalid value '" << flags.recovery_agent_removal_limit << "'"
<< " for --recovery_agent_removal_percent_limit:"
<< " Must be within [0%-100%]";
}
// Log authentication state.
if (flags.authenticate_frameworks) {
LOG(INFO) << "Master only allowing authenticated frameworks to register";
} else {
LOG(INFO) << "Master allowing unauthenticated frameworks to register";
}
if (flags.authenticate_agents) {
LOG(INFO) << "Master only allowing authenticated agents to register";
} else {
LOG(INFO) << "Master allowing unauthenticated agents to register";
}
if (flags.authenticate_http_frameworks) {
LOG(INFO) << "Master only allowing authenticated HTTP frameworks to "
<< "register";
} else {
LOG(INFO) << "Master allowing HTTP frameworks to register without "
<< "authentication";
}
// Load credentials.
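  // The file referenced by `--credentials` holds principal/secret pairs.
  // Its JSON form looks roughly like the following (illustrative values):
  //
  //   {"credentials": [{"principal": "framework-foo", "secret": "s3cr3t"}]}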
Option<Credentials> credentials;
if (flags.credentials.isSome()) {
Result<Credentials> _credentials =
credentials::read(flags.credentials.get());
if (_credentials.isError()) {
EXIT(EXIT_FAILURE) << _credentials.error() << " (see --credentials flag)";
} else if (_credentials.isNone()) {
EXIT(EXIT_FAILURE)
<< "Credentials file must contain at least one credential"
<< " (see --credentials flag)";
}
// Store credentials in master to use them in routes.
credentials = _credentials.get();
}
// Extract authenticator names and validate them.
authenticatorNames = strings::split(flags.authenticators, ",");
if (authenticatorNames.empty()) {
EXIT(EXIT_FAILURE) << "No authenticator specified";
}
if (authenticatorNames.size() > 1) {
EXIT(EXIT_FAILURE) << "Multiple authenticators not supported";
}
if (authenticatorNames[0] != DEFAULT_AUTHENTICATOR &&
!modules::ModuleManager::contains<Authenticator>(
authenticatorNames[0])) {
EXIT(EXIT_FAILURE)
<< "Authenticator '" << authenticatorNames[0] << "' not found."
<< " Check the spelling (compare to '" << DEFAULT_AUTHENTICATOR << "')"
<< " or verify that the authenticator was loaded successfully"
<< " (see --modules)";
}
// TODO(tillt): Allow multiple authenticators to be loaded and enable
// the authenticatee to select the appropriate one. See MESOS-1939.
if (authenticatorNames[0] == DEFAULT_AUTHENTICATOR) {
LOG(INFO) << "Using default '" << DEFAULT_AUTHENTICATOR
<< "' authenticator";
authenticator = new cram_md5::CRAMMD5Authenticator();
} else {
Try<Authenticator*> module =
modules::ModuleManager::create<Authenticator>(authenticatorNames[0]);
if (module.isError()) {
EXIT(EXIT_FAILURE)
<< "Could not create authenticator module '"
<< authenticatorNames[0] << "': " << module.error();
}
LOG(INFO) << "Using '" << authenticatorNames[0] << "' authenticator";
authenticator = module.get();
}
// Give Authenticator access to credentials when needed.
CHECK_SOME(authenticator);
Try<Nothing> initialize = authenticator.get()->initialize(credentials);
if (initialize.isError()) {
const string error =
"Failed to initialize authenticator '" + authenticatorNames[0] +
"': " + initialize.error();
if (flags.authenticate_frameworks || flags.authenticate_agents) {
EXIT(EXIT_FAILURE)
<< "Failed to start master with authentication enabled: " << error;
} else {
// A failure to initialize the authenticator does lead to
      // unusable authentication but still allows non-authenticating
// frameworks and slaves to connect.
LOG(WARNING) << "Only non-authenticating frameworks and agents are "
<< "allowed to connect. "
<< "Authentication is disabled: " << error;
delete authenticator.get();
authenticator = None();
}
}
if (flags.authenticate_http_readonly) {
Try<Nothing> result = initializeHttpAuthenticators(
READONLY_HTTP_AUTHENTICATION_REALM,
strings::split(flags.http_authenticators, ","),
credentials);
if (result.isError()) {
EXIT(EXIT_FAILURE) << result.error();
}
}
if (flags.authenticate_http_readwrite) {
Try<Nothing> result = initializeHttpAuthenticators(
READWRITE_HTTP_AUTHENTICATION_REALM,
strings::split(flags.http_authenticators, ","),
credentials);
if (result.isError()) {
EXIT(EXIT_FAILURE) << result.error();
}
}
if (flags.authenticate_http_frameworks) {
// The `--http_framework_authenticators` flag should always be set when HTTP
// framework authentication is enabled.
if (flags.http_framework_authenticators.isNone()) {
EXIT(EXIT_FAILURE)
<< "Missing `--http_framework_authenticators` flag. This must be used "
<< "in conjunction with `--authenticate_http_frameworks`";
}
Try<Nothing> result = initializeHttpAuthenticators(
DEFAULT_HTTP_FRAMEWORK_AUTHENTICATION_REALM,
strings::split(flags.http_framework_authenticators.get(), ","),
credentials);
if (result.isError()) {
EXIT(EXIT_FAILURE) << result.error();
}
}
if (authorizer.isSome()) {
LOG(INFO) << "Authorization enabled";
}
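  // The `--rate_limits` flag carries a JSON-encoded `RateLimits` message.
  // An illustrative value (principals and numbers are made up):
  //
  //   {
  //     "limits": [
  //       {"principal": "framework-a", "qps": 55.5, "capacity": 100000},
  //       {"principal": "framework-b"}
  //     ],
  //     "aggregate_default_qps": 33.3
  //   }
  //
  // A principal listed without `qps` is exempt from throttling, as handled
  // below.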
if (flags.rate_limits.isSome()) {
// Add framework rate limiters.
foreach (const RateLimit& limit_, flags.rate_limits->limits()) {
if (frameworks.limiters.contains(limit_.principal())) {
EXIT(EXIT_FAILURE)
<< "Duplicate principal " << limit_.principal()
<< " found in RateLimits configuration";
}
if (limit_.has_qps() && limit_.qps() <= 0) {
EXIT(EXIT_FAILURE)
<< "Invalid qps: " << limit_.qps()
<< ". It must be a positive number";
}
if (limit_.has_qps()) {
Option<uint64_t> capacity;
if (limit_.has_capacity()) {
capacity = limit_.capacity();
}
frameworks.limiters.put(
limit_.principal(),
Owned<BoundedRateLimiter>(
new BoundedRateLimiter(limit_.qps(), capacity)));
} else {
frameworks.limiters.put(limit_.principal(), None());
}
}
if (flags.rate_limits->has_aggregate_default_qps() &&
flags.rate_limits->aggregate_default_qps() <= 0) {
EXIT(EXIT_FAILURE)
<< "Invalid aggregate_default_qps: "
<< flags.rate_limits->aggregate_default_qps()
<< ". It must be a positive number";
}
if (flags.rate_limits->has_aggregate_default_qps()) {
Option<uint64_t> capacity;
if (flags.rate_limits->has_aggregate_default_capacity()) {
capacity = flags.rate_limits->aggregate_default_capacity();
}
frameworks.defaultLimiter =
Owned<BoundedRateLimiter>(new BoundedRateLimiter(
flags.rate_limits->aggregate_default_qps(), capacity));
}
LOG(INFO) << "Framework rate limiting enabled";
}
// If the rate limiter is injected for testing,
// the flag may not be set.
if (slaves.limiter.isSome() && flags.agent_removal_rate_limit.isSome()) {
LOG(INFO) << "Agent removal is rate limited to "
<< flags.agent_removal_rate_limit.get();
}
// If "--roles" is set, configure the role whitelist.
// TODO(neilc): Remove support for explicit roles in ~Mesos 0.32.
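  // An illustrative setting: `--roles="engineering,qa"` whitelists exactly
  // those two roles, plus the default role "*" which is always added below.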
if (flags.roles.isSome()) {
LOG(WARNING) << "The '--roles' flag is deprecated. This flag will be "
<< "removed in the future. See the Mesos 0.27 upgrade "
<< "notes for more information";
Try<vector<string>> roles = roles::parse(flags.roles.get());
if (roles.isError()) {
EXIT(EXIT_FAILURE) << "Failed to parse roles: " << roles.error();
}
roleWhitelist = hashset<string>();
foreach (const string& role, roles.get()) {
roleWhitelist->insert(role);
}
if (roleWhitelist->size() < roles->size()) {
LOG(WARNING) << "Duplicate values in '--roles': " << flags.roles.get();
}
// The default role is always allowed.
roleWhitelist->insert("*");
}
// Add role weights.
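  // An illustrative setting: `--weights="engineering=2.5,qa=1"` gives the
  // 'engineering' role 2.5 times the allocation weight of 'qa'.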
if (flags.weights.isSome()) {
vector<string> tokens = strings::tokenize(flags.weights.get(), ",");
foreach (const string& token, tokens) {
vector<string> pair = strings::tokenize(token, "=");
if (pair.size() != 2) {
EXIT(EXIT_FAILURE)
<< "Invalid weight: '" << token << "'. --weights should"
<< " be of the form 'role=weight,role=weight'";
} else if (!isWhitelistedRole(pair[0])) {
EXIT(EXIT_FAILURE)
<< "Invalid weight: '" << token << "'. " << pair[0]
<< " is not a valid role";
}
double weight = atof(pair[1].c_str());
if (weight <= 0) {
EXIT(EXIT_FAILURE)
<< "Invalid weight: '" << token << "'. Weights must be positive";
}
weights[pair[0]] = weight;
}
}
// Verify the timeout is greater than zero.
if (flags.offer_timeout.isSome() &&
flags.offer_timeout.get() <= Duration::zero()) {
EXIT(EXIT_FAILURE)
<< "Invalid value '" << flags.offer_timeout.get() << "'"
<< " for --offer_timeout: Must be greater than zero";
}
// Parse min_allocatable_resources.
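  // An illustrative setting: `--min_allocatable_resources="cpus:0.01|mem:32"`
  // treats resources as allocatable if they satisfy at least one of the
  // "|"-separated quantity sets (here, at least 0.01 cpus or at least 32 MB
  // of memory).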
vector<ResourceQuantities> minAllocatableResources;
foreach (
const string& token,
strings::tokenize(flags.min_allocatable_resources, "|")) {
Try<ResourceQuantities> resourceQuantities =
ResourceQuantities::fromString(token);
if (resourceQuantities.isError()) {
EXIT(EXIT_FAILURE) << "Error parsing min_allocatable_resources '"
<< flags.min_allocatable_resources
<< "': " << resourceQuantities.error();
}
// We check the configuration against first-class resources and warn
    // about possible misconfiguration (e.g., a typo).
set<string> firstClassResources = {"cpus", "mem", "disk", "ports", "gpus"};
for (auto it = resourceQuantities->begin(); it != resourceQuantities->end();
++it) {
if (firstClassResources.count(it->first) == 0) {
LOG(WARNING) << "Non-first-class resource '" << it->first
<< "' is configured as part of min_allocatable_resources";
}
}
minAllocatableResources.push_back(resourceQuantities.get());
}
// Initialize the allocator.
allocator->initialize(
flags.allocation_interval,
defer(self(), &Master::offer, lambda::_1, lambda::_2),
defer(self(), &Master::inverseOffer, lambda::_1, lambda::_2),
flags.fair_sharing_excluded_resource_names,
flags.filter_gpu_resources,
flags.domain,
minAllocatableResources,
flags.max_completed_frameworks);
  // Parse the whitelist. Passing the Allocator::updateWhitelist()
  // callback is safe because we shut down the whitelistWatcher in
  // Master::finalize(), while the allocator's lifetime is greater than
  // the master's. Therefore there is no risk of calling into an allocator
// that has been cleaned up.
whitelistWatcher = new WhitelistWatcher(
flags.whitelist,
WHITELIST_WATCH_INTERVAL,
[this](const Option<hashset<string>>& whitelist) {
return allocator->updateWhitelist(whitelist);
});
spawn(whitelistWatcher);
nextFrameworkId = 0;
nextSlaveId = 0;
nextOfferId = 0;
startTime = Clock::now();
install<scheduler::Call>(&Master::receive);
// Install handler functions for certain messages.
install<SubmitSchedulerRequest>(
&Master::submitScheduler,
&SubmitSchedulerRequest::name);
install<RegisterFrameworkMessage>(
&Master::registerFramework);
install<ReregisterFrameworkMessage>(
&Master::reregisterFramework);
install<UnregisterFrameworkMessage>(
&Master::unregisterFramework,
&UnregisterFrameworkMessage::framework_id);
install<DeactivateFrameworkMessage>(
&Master::deactivateFramework,
&DeactivateFrameworkMessage::framework_id);
install<ResourceRequestMessage>(
&Master::resourceRequest,
&ResourceRequestMessage::framework_id,
&ResourceRequestMessage::requests);
install<LaunchTasksMessage>(
&Master::launchTasks);
install<ReviveOffersMessage>(
&Master::reviveOffers,
&ReviveOffersMessage::framework_id,
&ReviveOffersMessage::roles);
install<KillTaskMessage>(
&Master::killTask,
&KillTaskMessage::framework_id,
&KillTaskMessage::task_id);
install<StatusUpdateAcknowledgementMessage>(
&Master::statusUpdateAcknowledgement);
install<FrameworkToExecutorMessage>(
&Master::schedulerMessage);
install<RegisterSlaveMessage>(
&Master::registerSlave);
install<ReregisterSlaveMessage>(
&Master::reregisterSlave);
install<UnregisterSlaveMessage>(
&Master::unregisterSlave,
&UnregisterSlaveMessage::slave_id);
install<StatusUpdateMessage>(
&Master::statusUpdate);
// Added in 0.24.0 to support HTTP schedulers. Since
// these do not have a pid, the slave must forward
// messages through the master.
install<ExecutorToFrameworkMessage>(
&Master::executorMessage);
install<ReconcileTasksMessage>(
&Master::reconcileTasks);
install<UpdateOperationStatusMessage>(
&Master::updateOperationStatus);
install<ExitedExecutorMessage>(
&Master::exitedExecutor,
&ExitedExecutorMessage::slave_id,
&ExitedExecutorMessage::framework_id,
&ExitedExecutorMessage::executor_id,
&ExitedExecutorMessage::status);
install<UpdateSlaveMessage>(&Master::updateSlave);
install<AuthenticateMessage>(
&Master::authenticate,
&AuthenticateMessage::pid);
  // Set up HTTP routes.
route("/api/v1",
// TODO(benh): Is this authentication realm sufficient or do
// we need some kind of hybrid if we expect both schedulers
// and operators/tooling to use this endpoint?
READWRITE_HTTP_AUTHENTICATION_REALM,
Http::API_HELP(),
[this](const process::http::Request& request,
const Option<Principal>& principal) {
logRequest(request);
return http.api(request, principal);
});
route("/api/v1/scheduler",
DEFAULT_HTTP_FRAMEWORK_AUTHENTICATION_REALM,
Http::SCHEDULER_HELP(),
[this](const process::http::Request& request,
const Option<Principal>& principal) {
logRequest(request);
return http.scheduler(request, principal);
});
route("/create-volumes",
READWRITE_HTTP_AUTHENTICATION_REALM,
Http::CREATE_VOLUMES_HELP(),
[this](const process::http::Request& request,
const Option<Principal>& principal) {
logRequest(request);
return http.createVolumes(request, principal);
});
route("/destroy-volumes",
READWRITE_HTTP_AUTHENTICATION_REALM,
Http::DESTROY_VOLUMES_HELP(),
[this](const process::http::Request& request,
const Option<Principal>& principal) {
logRequest(request);
return http.destroyVolumes(request, principal);
});
route("/frameworks",
READONLY_HTTP_AUTHENTICATION_REALM,
Http::FRAMEWORKS_HELP(),
[this](const process::http::Request& request,
const Option<Principal>& principal) {
logRequest(request);
return http.frameworks(request, principal);
});
route("/flags",
READONLY_HTTP_AUTHENTICATION_REALM,
Http::FLAGS_HELP(),
[this](const process::http::Request& request,
const Option<Principal>& principal) {
logRequest(request);
return http.flags(request, principal);
});
route("/health",
Http::HEALTH_HELP(),
[this](const process::http::Request& request) {
return http.health(request);
});
route("/redirect",
Http::REDIRECT_HELP(),
[this](const process::http::Request& request) {
return http.redirect(request);
});
route("/reserve",
READWRITE_HTTP_AUTHENTICATION_REALM,
Http::RESERVE_HELP(),
[this](const process::http::Request& request,
const Option<Principal>& principal) {
logRequest(request);
return http.reserve(request, principal);
});
// TODO(ijimenez): Remove this endpoint at the end of the
// deprecation cycle on 0.26.
route("/roles.json",
READONLY_HTTP_AUTHENTICATION_REALM,
Http::ROLES_HELP(),
[this](const process::http::Request& request,
const Option<Principal>& principal) {
logRequest(request);
return http.roles(request, principal);
});
route("/roles",
READONLY_HTTP_AUTHENTICATION_REALM,
Http::ROLES_HELP(),
[this](const process::http::Request& request,
const Option<Principal>& principal) {
logRequest(request);
return http.roles(request, principal);
});
route("/teardown",
READWRITE_HTTP_AUTHENTICATION_REALM,
Http::TEARDOWN_HELP(),
[this](const process::http::Request& request,
const Option<Principal>& principal) {
logRequest(request);
return http.teardown(request, principal);
});
route("/slaves",
READONLY_HTTP_AUTHENTICATION_REALM,
Http::SLAVES_HELP(),
[this](const process::http::Request& request,
const Option<Principal>& principal) {
logRequest(request);
return http.slaves(request, principal);
});
// TODO(ijimenez): Remove this endpoint at the end of the
// deprecation cycle on 0.26.
route("/state.json",
READONLY_HTTP_AUTHENTICATION_REALM,
Http::STATE_HELP(),
[this](const process::http::Request& request,
const Option<Principal>& principal) {
logRequest(request);
return http.state(request, principal);
});
route("/state",
READONLY_HTTP_AUTHENTICATION_REALM,
Http::STATE_HELP(),
[this](const process::http::Request& request,
const Option<Principal>& principal) {
logRequest(request);
return http.state(request, principal);
});
route("/state-summary",
READONLY_HTTP_AUTHENTICATION_REALM,
Http::STATESUMMARY_HELP(),
[this](const process::http::Request& request,
const Option<Principal>& principal) {
logRequest(request);
return http.stateSummary(request, principal);
});
// TODO(ijimenez): Remove this endpoint at the end of the
// deprecation cycle.
route("/tasks.json",
READONLY_HTTP_AUTHENTICATION_REALM,
Http::TASKS_HELP(),
[this](const process::http::Request& request,
const Option<Principal>& principal) {
logRequest(request);
return http.tasks(request, principal);
});
route("/tasks",
READONLY_HTTP_AUTHENTICATION_REALM,
Http::TASKS_HELP(),
[this](const process::http::Request& request,
const Option<Principal>& principal) {
logRequest(request);
return http.tasks(request, principal);
});
route("/maintenance/schedule",
READWRITE_HTTP_AUTHENTICATION_REALM,
Http::MAINTENANCE_SCHEDULE_HELP(),
[this](const process::http::Request& request,
const Option<Principal>& principal) {
logRequest(request);
return http.maintenanceSchedule(request, principal);
});
route("/maintenance/status",
READONLY_HTTP_AUTHENTICATION_REALM,
Http::MAINTENANCE_STATUS_HELP(),
[this](const process::http::Request& request,
const Option<Principal>& principal) {
logRequest(request);
return http.maintenanceStatus(request, principal);
});
route("/machine/down",
READWRITE_HTTP_AUTHENTICATION_REALM,
Http::MACHINE_DOWN_HELP(),
[this](const process::http::Request& request,
const Option<Principal>& principal) {
logRequest(request);
return http.machineDown(request, principal);
});
route("/machine/up",
READWRITE_HTTP_AUTHENTICATION_REALM,
Http::MACHINE_UP_HELP(),
[this](const process::http::Request& request,
const Option<Principal>& principal) {
logRequest(request);
return http.machineUp(request, principal);
});
route("/unreserve",
READWRITE_HTTP_AUTHENTICATION_REALM,
Http::UNRESERVE_HELP(),
[this](const process::http::Request& request,
const Option<Principal>& principal) {
logRequest(request);
return http.unreserve(request, principal);
});
route("/quota",
READWRITE_HTTP_AUTHENTICATION_REALM,
Http::QUOTA_HELP(),
[this](const process::http::Request& request,
const Option<Principal>& principal) {
logRequest(request);
return http.quota(request, principal);
});
route("/weights",
READWRITE_HTTP_AUTHENTICATION_REALM,
Http::WEIGHTS_HELP(),
[this](const process::http::Request& request,
const Option<Principal>& principal) {
logRequest(request);
return http.weights(request, principal);
});
// Provide HTTP assets from a "webui" directory. This is either
// specified via flags (which is necessary for running out of the
// build directory before 'make install') or determined at build
// time via the preprocessor macro '-DMESOS_WEBUI_DIR' set in the
// Makefile.
provide("", path::join(flags.webui_dir, "index.html"));
provide("app", path::join(flags.webui_dir, "app"));
provide("assets", path::join(flags.webui_dir, "assets"));
const PID<Master> masterPid = self();
auto authorize = [masterPid](const Option<Principal>& principal) {
return dispatch(masterPid, &Master::authorizeLogAccess, principal);
};
// Expose the log file for the webui. Fall back to 'log_dir' if
// an explicit file was not specified.
if (flags.external_log_file.isSome()) {
files->attach(flags.external_log_file.get(), "/master/log", authorize)
.onAny(defer(self(),
&Self::fileAttached,
lambda::_1,
flags.external_log_file.get()));
} else if (flags.log_dir.isSome()) {
Try<string> log = logging::getLogFile(
logging::getLogSeverity(flags.logging_level));
if (log.isError()) {
LOG(ERROR) << "Master log file cannot be found: " << log.error();
} else {
files->attach(log.get(), "/master/log", authorize)
.onAny(defer(self(), &Self::fileAttached, lambda::_1, log.get()));
}
}
contender->initialize(info_);
// Start contending to be a leading master and detecting the current
// leader.
contender->contend()
.onAny(defer(self(), &Master::contended, lambda::_1));
detector->detect()
.onAny(defer(self(), &Master::detected, lambda::_1));
}
void Master::finalize()
{
LOG(INFO) << "Master terminating";
// NOTE: Even though we remove the slave and framework from the
// allocator, it is possible that offers are already dispatched to
// this master. In tests, if a new master (with the same PID) is
// started, it might process the offers from the old master's
// allocator.
// TODO(vinod): Fix the above race by changing the allocator
// interface to return a stream of offer events.
// Remove the slaves.
foreachvalue (Slave* slave, slaves.registered) {
// We first remove the slave from the allocator so that any
// recovered resources below are not reoffered.
allocator->removeSlave(slave->id);
foreachkey (const FrameworkID& frameworkId, utils::copy(slave->tasks)) {
foreachvalue (Task* task, utils::copy(slave->tasks[frameworkId])) {
removeTask(task);
}
}
// Remove executors.
foreachkey (const FrameworkID& frameworkId, utils::copy(slave->executors)) {
foreachkey (const ExecutorID& executorId,
utils::copy(slave->executors[frameworkId])) {
removeExecutor(slave, frameworkId, executorId);
}
}
// Remove offers.
foreach (Offer* offer, utils::copy(slave->offers)) {
removeOffer(offer);
}
// Remove inverse offers.
foreach (InverseOffer* inverseOffer, utils::copy(slave->inverseOffers)) {
// We don't need to update the allocator because the slave has already
// been removed.
removeInverseOffer(inverseOffer);
}
// Remove pending tasks from the slave. Don't bother
// recovering the resources in the allocator.
slave->pendingTasks.clear();
// Terminate the slave observer.
terminate(slave->observer);
wait(slave->observer);
delete slave->observer;
delete slave;
}
slaves.registered.clear();
// Remove the frameworks.
// Note we are not deleting the pointers to the frameworks from the
// roles because it is unnecessary bookkeeping at this point since
// we are shutting down.
foreachvalue (Framework* framework, frameworks.registered) {
allocator->removeFramework(framework->id());
// Remove pending tasks from the framework. Don't bother
// recovering the resources in the allocator.
framework->pendingTasks.clear();
// No tasks/executors/offers should remain since the slaves
// have been removed.
CHECK(framework->tasks.empty());
CHECK(framework->executors.empty());
CHECK(framework->offers.empty());
CHECK(framework->inverseOffers.empty());
delete framework;
}
frameworks.registered.clear();
CHECK(offers.empty());
CHECK(inverseOffers.empty());
foreachvalue (Future<Option<string>> future, authenticating) {
// NOTE: This is necessary during tests because a copy of
// this future is used to setup authentication timeout. If a
// test doesn't discard this future, authentication timeout might
// fire in a different test and any associated callbacks
// (e.g., '_authenticate()') would be called. This is because the
// master pid doesn't change across the tests.
// TODO(vinod): This seems to be a bug in libprocess or the
// testing infrastructure.
future.discard();
}
foreachvalue (Role* role, roles) {
delete role;
}
roles.clear();
// NOTE: This is necessary during tests because we don't want the
// timer to fire in a different test and invoke the callback.
// The callback would be invoked because the master pid doesn't
// change across the tests.
// TODO(vinod): This seems to be a bug in libprocess or the
// testing infrastructure.
if (slaves.recoveredTimer.isSome()) {
Clock::cancel(slaves.recoveredTimer.get());
}
if (registryGcTimer.isSome()) {
Clock::cancel(registryGcTimer.get());
}
terminate(whitelistWatcher);
wait(whitelistWatcher);
delete whitelistWatcher;
if (authenticator.isSome()) {
delete authenticator.get();
}
}
void Master::exited(const FrameworkID& frameworkId, const HttpConnection& http)
{
foreachvalue (Framework* framework, frameworks.registered) {
if (framework->http.isSome() && framework->http->writer == http.writer) {
CHECK_EQ(frameworkId, framework->id());
_exited(framework);
return;
}
// If the framework has reconnected, the writer will not match
// above, and we will have a framework with a matching id.
if (frameworkId == framework->id()) {
LOG(INFO) << "Ignoring disconnection for framework "
<< *framework << " as it has already reconnected";
return;
}
}
}
void Master::exited(const UPID& pid)
{
foreachvalue (Framework* framework, frameworks.registered) {
if (framework->pid == pid) {
// See comments in `receive()` on why we send an error message
// to the framework upon detecting a disconnection.
FrameworkErrorMessage message;
message.set_message("Framework disconnected");
framework->send(message);
_exited(framework);
return;
}
}
if (Slave* slave = slaves.registered.get(pid)) {
LOG(INFO) << "Agent " << *slave << " disconnected";
if (slave->connected) {
disconnect(slave);
// The semantics when a registered slave gets disconnected are as
// follows for each framework running on that slave:
//
// 1) If the framework is checkpointing: No immediate action is
// taken. The slave is given a chance to reconnect until the
// slave observer times out (75s) and removes the slave.
//
// 2) If the framework is not-checkpointing: The slave is not
// removed but the framework is removed from the slave's
// structs, its tasks transitioned to LOST and resources
// recovered.
hashset<FrameworkID> frameworkIds =
slave->tasks.keys() | slave->executors.keys();
foreach (const FrameworkID& frameworkId, frameworkIds) {
Framework* framework = getFramework(frameworkId);
CHECK_NOTNULL(framework);
if (!framework->info.checkpoint()) {
LOG(INFO) << "Removing framework " << *framework
<< " from disconnected agent " << *slave
<< " because the framework is not checkpointing";
removeFramework(slave, framework);
}
}
// If the master -> agent socket breaks, we expect that either
// (a) the agent will fail to respond to pings and be marked
// unreachable, or (b) the agent will receive a ping, notice the
// master thinks it is disconnected, and then reregister. There
// is a third possibility: if the agent restarts but hangs
// during agent recovery, it will respond to pings but never
// attempt to reregister (MESOS-6286).
//
// To handle this case, we expect that an agent whose socket has
// broken will reregister within `agent_reregister_timeout`. If
// the agent doesn't reregister, it is marked unreachable.
slave->reregistrationTimer =
delay(flags.agent_reregister_timeout,
self(),
&Master::agentReregisterTimeout,
slave->id);
} else {
// NOTE: A duplicate exited() event is possible for a slave
// because its PID doesn't change on restart. See MESOS-675
// for details.
LOG(WARNING) << "Ignoring duplicate exited() notification for "
<< "agent " << *slave;
}
}
}
void Master::agentReregisterTimeout(const SlaveID& slaveId)
{
Slave* slave = slaves.registered.get(slaveId);
// The slave might have been removed or reregistered concurrently
// with the timeout expiring.
if (slave == nullptr || slave->connected) {
return;
}
// Remove the slave in a rate limited manner, similar to how the
// SlaveObserver removes slaves.
Future<Nothing> acquire = Nothing();
if (slaves.limiter.isSome()) {
LOG(INFO) << "Scheduling removal of agent "
<< *slave
<< "; did not reregister within "
<< flags.agent_reregister_timeout << " after disconnecting";
acquire = slaves.limiter.get()->acquire();
}
acquire
.then(defer(self(), &Self::_agentReregisterTimeout, slaveId));
++metrics->slave_unreachable_scheduled;
}
Nothing Master::_agentReregisterTimeout(const SlaveID& slaveId)
{
Slave* slave = slaves.registered.get(slaveId);
// The slave might have been removed or reregistered while we were
// waiting to acquire the rate limit.
if (slave == nullptr || slave->connected) {
++metrics->slave_unreachable_canceled;
return Nothing();
}
++metrics->slave_unreachable_completed;
markUnreachable(
slave->info,
false,
"agent did not reregister within " +
stringify(flags.agent_reregister_timeout) +
" after disconnecting");
return Nothing();
}
void Master::_exited(Framework* framework)
{
LOG(INFO) << "Framework " << *framework << " disconnected";
// Disconnect the framework.
if (framework->connected()) {
disconnect(framework);
}
  // We can assume the framework's failover_timeout is valid
  // because it has been validated during framework subscription.
Try<Duration> failoverTimeout_ =
Duration::create(framework->info.failover_timeout());
CHECK_SOME(failoverTimeout_);
Duration failoverTimeout = failoverTimeout_.get();
LOG(INFO) << "Giving framework " << *framework << " "
<< failoverTimeout << " to failover";
// Delay dispatching a message to ourselves for the timeout.
delay(failoverTimeout,
self(),
&Master::frameworkFailoverTimeout,
framework->id(),
framework->reregisteredTime);
}
Future<bool> Master::authorizeLogAccess(const Option<Principal>& principal)
{
if (authorizer.isNone()) {
return true;
}
authorization::Request request;
request.set_action(authorization::ACCESS_MESOS_LOG);
Option<authorization::Subject> subject = createSubject(principal);
if (subject.isSome()) {
request.mutable_subject()->CopyFrom(subject.get());
}
return authorizer.get()->authorized(request);
}
void Master::consume(MessageEvent&& event)
{
// There are three cases about the message's UPID with respect to
// 'frameworks.principals':
// 1) if a <UPID, principal> pair exists and the principal is Some,
// it's a framework with its principal specified.
// 2) if a <UPID, principal> pair exists and the principal is None,
// it's a framework without a principal.
// 3) if a <UPID, principal> pair does not exist in the map, it's
// either an unregistered framework or not a framework.
// The logic for framework message counters and rate limiting
  // is mainly concerned with whether the UPID is a *registered*
  // framework and whether the framework has a principal, so we use
  // these two temporary variables to simplify the condition checks below.
bool isRegisteredFramework =
frameworks.principals.contains(event.message.from);
const Option<string> principal = isRegisteredFramework
? frameworks.principals[event.message.from]
: Option<string>::none();
// Increment the "message_received" counter if the message is from
// a framework and such a counter is configured for it.
// See comments for 'Master::Metrics::Frameworks' and
// 'Master::Frameworks::principals' for details.
if (principal.isSome()) {
// If the framework has a principal, the counter must exist.
CHECK(metrics->frameworks.contains(principal.get()));
Counter messages_received =
metrics->frameworks.get(principal.get()).get()->messages_received;
++messages_received;
}
// All messages are filtered when non-leading.
if (!elected()) {
VLOG(1) << "Dropping '" << event.message.name << "' message since "
<< "not elected yet";
++metrics->dropped_messages;
return;
}
CHECK_SOME(recovered);
// All messages are filtered while recovering.
// TODO(bmahler): Consider instead re-enqueing *all* messages
// through recover(). What are the performance implications of
// the additional queueing delay and the accumulated backlog
// of messages post-recovery?
if (!recovered->isReady()) {
VLOG(1) << "Dropping '" << event.message.name << "' message since "
<< "not recovered yet";
++metrics->dropped_messages;
return;
}
// Throttle the message if it's a framework message and a
// RateLimiter is configured for the framework's principal.
// The framework is throttled by the default RateLimiter if:
// 1) the default RateLimiter is configured (and)
// 2) the framework doesn't have a principal or its principal is
// not specified in 'flags.rate_limits'.
// The framework is not throttled if:
// 1) the default RateLimiter is not configured to handle case 2)
// above. (or)
// 2) the principal exists in RateLimits but 'qps' is not set.
if (principal.isSome() &&
frameworks.limiters.contains(principal.get()) &&
frameworks.limiters[principal.get()].isSome()) {
const Owned<BoundedRateLimiter>& limiter =
frameworks.limiters[principal.get()].get();
if (limiter->capacity.isNone() ||
limiter->messages < limiter->capacity.get()) {
limiter->messages++;
limiter->limiter->acquire()
.onReady(defer(self(), &Self::throttled, std::move(event), principal));
} else {
exceededCapacity(
event,
principal,
limiter->capacity.get());
}
} else if ((principal.isNone() ||
!frameworks.limiters.contains(principal.get())) &&
isRegisteredFramework &&
frameworks.defaultLimiter.isSome()) {
if (frameworks.defaultLimiter.get()->capacity.isNone() ||
frameworks.defaultLimiter.get()->messages <
frameworks.defaultLimiter.get()->capacity.get()) {
frameworks.defaultLimiter.get()->messages++;
frameworks.defaultLimiter.get()->limiter->acquire()
.onReady(defer(self(), &Self::throttled, std::move(event), None()));
} else {
exceededCapacity(
event,
principal,
frameworks.defaultLimiter.get()->capacity.get());
}
} else {
_consume(std::move(event));
}
}
void Master::consume(ExitedEvent&& event)
{
// See comments in 'consume(MessageEvent&& event)' for which
// RateLimiter is used to throttle this UPID and when it is not
// throttled.
// Note that throttling ExitedEvent is necessary so the order
// between MessageEvents and ExitedEvents from the same PID is
// maintained. Also ExitedEvents are not subject to the capacity.
bool isRegisteredFramework = frameworks.principals.contains(event.pid);
const Option<string> principal = isRegisteredFramework
? frameworks.principals[event.pid]
: Option<string>::none();
// Necessary to disambiguate below.
typedef void(Self::*F)(ExitedEvent&&);
if (principal.isSome() &&
frameworks.limiters.contains(principal.get()) &&
frameworks.limiters[principal.get()].isSome()) {
frameworks.limiters[principal.get()].get()->limiter->acquire().onReady(
defer(self(), static_cast<F>(&Self::_consume), std::move(event)));
} else if ((principal.isNone() ||
!frameworks.limiters.contains(principal.get())) &&
isRegisteredFramework &&
frameworks.defaultLimiter.isSome()) {
frameworks.defaultLimiter.get()->limiter->acquire().onReady(
defer(self(), static_cast<F>(&Self::_consume), std::move(event)));
} else {
_consume(std::move(event));
}
}
// TODO(greggomann): Change this to accept an `Option<Principal>`
// when MESOS-7202 is resolved.
void Master::throttled(
MessageEvent&& event,
const Option<string>& principal)
{
// We already know a RateLimiter is used to throttle this event so
// here we only need to determine which.
if (principal.isSome()) {
CHECK_SOME(frameworks.limiters[principal.get()]);
frameworks.limiters[principal.get()].get()->messages--;
} else {
CHECK_SOME(frameworks.defaultLimiter);
frameworks.defaultLimiter.get()->messages--;
}
_consume(std::move(event));
}
void Master::_consume(MessageEvent&& event)
{
// Obtain the principal before processing the Message because the
// mapping may be deleted in handling 'UnregisterFrameworkMessage'
// but its counter still needs to be incremented for this message.
const Option<string> principal =
frameworks.principals.contains(event.message.from)
? frameworks.principals[event.message.from]
: Option<string>::none();
ProtobufProcess<Master>::consume(std::move(event));
// Increment 'messages_processed' counter if it still exists.
// Note that it could be removed in handling
// 'UnregisterFrameworkMessage' if it's the last framework with
// this principal.
if (principal.isSome() && metrics->frameworks.contains(principal.get())) {
Counter messages_processed =
metrics->frameworks.get(principal.get()).get()->messages_processed;
++messages_processed;
}
}
// TODO(greggomann): Change this to accept an `Option<Principal>`
// when MESOS-7202 is resolved.
void Master::exceededCapacity(
const MessageEvent& event,
const Option<string>& principal,
uint64_t capacity)
{
LOG(WARNING) << "Dropping message " << event.message.name << " from "
<< event.message.from
<< (principal.isSome() ? "(" + principal.get() + ")" : "")
<< ": capacity(" << capacity << ") exceeded";
// Send an error to the framework which will abort the scheduler
// driver.
// NOTE: The scheduler driver will send back a
// DeactivateFrameworkMessage which may be dropped as well but this
// should be fine because the scheduler is already informed of an
// unrecoverable error and should take action to recover.
FrameworkErrorMessage message;
message.set_message(
"Message " + event.message.name +
" dropped: capacity(" + stringify(capacity) + ") exceeded");
send(event.message.from, message);
}
void Master::_consume(ExitedEvent&& event)
{
Process<Master>::consume(std::move(event));
}
void fail(const string& message, const string& failure)
{
LOG(FATAL) << message << ": " << failure;
}
Future<Nothing> Master::recover()
{
if (!elected()) {
return Failure("Not elected as leading master");
}
if (recovered.isNone()) {
LOG(INFO) << "Recovering from registrar";
recovered = registrar->recover(info_)
.then(defer(self(), &Self::_recover, lambda::_1));
}
return recovered.get();
}
Future<Nothing> Master::_recover(const Registry& registry)
{
hashset<string> missingCapabilities =
    missingMinimumCapabilities(info_, registry);
if (!missingCapabilities.empty()) {
LOG(ERROR) << "Master is missing the following minimum capabilities: "
<< strings::join<hashset<string>>(", ", missingCapabilities)
<< ". See the following documentation for steps to safely "
<< "recover from this state: "
<< "http://mesos.apache.org/documentation/latest/downgrades";
EXIT(EXIT_FAILURE);
}
foreach (const Registry::Slave& slave, registry.slaves().slaves()) {
SlaveInfo slaveInfo = slave.info();
    // We store the `SlaveInfo`'s resources in the `pre-reservation-refinement`
    // format in order to support downgrades. We convert them back to `post-` format
// here so that we can keep our invariant of working with `post-` format
// resources within master memory.
upgradeResources(&slaveInfo);
slaves.recovered.put(slaveInfo.id(), slaveInfo);
}
foreach (const Registry::UnreachableSlave& unreachable,
registry.unreachable().slaves()) {
CHECK(!slaves.unreachable.contains(unreachable.id()));
slaves.unreachable[unreachable.id()] = unreachable.timestamp();
}
foreach (const Registry::GoneSlave& gone,
registry.gone().slaves()) {
slaves.gone[gone.id()] = gone.timestamp();
}
// Set up a timer for age-based registry GC.
scheduleRegistryGc();
// Set up a timeout for slaves to reregister.
slaves.recoveredTimer =
delay(flags.agent_reregister_timeout,
self(),
&Self::recoveredSlavesTimeout,
registry);
// Save the maintenance schedule.
foreach (const mesos::maintenance::Schedule& schedule, registry.schedules()) {
maintenance.schedules.push_back(schedule);
}
// Save the machine info for each machine.
foreach (const Registry::Machine& machine, registry.machines().machines()) {
machines[machine.info().id()] = Machine(machine.info());
}
// Save the quotas for each role.
foreach (const Registry::Quota& quota, registry.quotas()) {
quotas[quota.info().role()] = Quota{quota.info()};
}
// We notify the allocator via the `recover()` call. This has to be
// done before the first agent reregisters and makes its resources
// available for allocation. This is necessary because at this point
// the allocator is already initialized and ready to perform
// allocations. An allocator may decide to hold off with allocation
// until after it restores a view of the cluster state.
int expectedAgentCount = registry.slaves().slaves().size();
allocator->recover(expectedAgentCount, quotas);
// TODO(alexr): Consider adding a sanity check: whether quotas are
// satisfiable given all recovering agents reregister. We may want
// to notify operators early if total quota cannot be met.
// Recover weights, and update the allocator accordingly. If we
// recovered weights from the registry, any weights specified on the
// command-line are ignored. If no weights were recovered from the
// registry, any weights specified on the command-line are used and
// then stored in the registry.
vector<WeightInfo> weightInfos;
if (registry.weights_size() != 0) {
// TODO(Yongqiao Wang): After the Mesos master quorum is achieved,
    // an operator can send an update weights request to do a batch
// configuration for weights, so the `--weights` flag can be
// deprecated and this check can eventually be removed.
if (!weights.empty()) {
LOG(WARNING) << "Ignoring --weights flag '" << flags.weights.get()
<< "' and recovering the weights from registry";
weights.clear();
}
foreach (const Registry::Weight& weight, registry.weights()) {
WeightInfo weightInfo;
weightInfo.set_role(weight.info().role());
weightInfo.set_weight(weight.info().weight());
weightInfos.push_back(weightInfo);
weights[weight.info().role()] = weight.info().weight();
}
} else if (!weights.empty()) {
foreachpair (const string& role, double weight, weights) {
WeightInfo weightInfo;
weightInfo.set_role(role);
weightInfo.set_weight(weight);
weightInfos.push_back(weightInfo);
}
registrar->apply(Owned<RegistryOperation>(
new weights::UpdateWeights(weightInfos)));
}
allocator->updateWeights(weightInfos);
// Recovery is now complete!
LOG(INFO) << "Recovered " << registry.slaves().slaves().size() << " agents"
<< " from the registry (" << Bytes(registry.ByteSize()) << ")"
<< "; allowing " << flags.agent_reregister_timeout
<< " for agents to reregister";
return Nothing();
}
void Master::scheduleRegistryGc()
{
registryGcTimer = delay(flags.registry_gc_interval,
self(),
&Self::doRegistryGc);
}
void Master::doRegistryGc()
{
// Schedule next periodic GC.
scheduleRegistryGc();
// Determine which unreachable agents to GC from the registry, if
// any. We do this by examining the master's in-memory copy of the
// unreachable list and checking two criteria, "age" and "count". To
// check the "count" criteria, we remove elements from the beginning
// of the list until it contains at most "registry_max_agent_count"
// elements (note that `slaves.unreachable` is a `LinkedHashMap`,
// which provides iteration over keys in insertion-order). To check
// the "age" criteria, we remove any element in the list whose age
// is more than "registry_max_agent_age". Note that for the latter,
// we check the entire list, not just the beginning: this avoids
// requiring that the list be kept sorted by timestamp.
//
// We build a candidate list of SlaveIDs to remove. We then try to
// remove this list from the registry. Note that all the slaveIDs we
// want to remove might not be found in the registrar's copy of the
// unreachable list; this can occur if there is a concurrent write
// (e.g., an unreachable agent we want to GC reregisters
// concurrently). In this situation, we skip removing any elements
// we don't find.
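  // For example (hypothetical flag values): with `--registry_max_agent_count=1000`
  // and `--registry_max_agent_age=2weeks`, an unreachable agent is pruned once
  // it falls outside the 1000 most recently added entries or has been on the
  // list for more than two weeks.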
auto prune = [this](const LinkedHashMap<SlaveID, TimeInfo>& slaves) {
size_t count = slaves.size();
TimeInfo currentTime = protobuf::getCurrentTime();
hashset<SlaveID> toRemove;
foreachpair (const SlaveID& slaveId,
const TimeInfo& removalTime,
slaves) {
// Count-based GC.
CHECK(toRemove.size() <= count);
size_t liveCount = count - toRemove.size();
if (liveCount > flags.registry_max_agent_count) {
toRemove.insert(slaveId);
continue;
}
// Age-based GC.
Duration age = Nanoseconds(
currentTime.nanoseconds() - removalTime.nanoseconds());
if (age > flags.registry_max_agent_age) {
toRemove.insert(slaveId);
}
}
return toRemove;
};
hashset<SlaveID> toRemoveUnreachable = prune(slaves.unreachable);
hashset<SlaveID> toRemoveGone = prune(slaves.gone);
if (toRemoveUnreachable.empty() && toRemoveGone.empty()) {
VLOG(1) << "Skipping periodic registry garbage collection: "
<< "no agents qualify for removal";
return;
}
VLOG(1) << "Attempting to remove " << toRemoveUnreachable.size()
<< " unreachable and " << toRemoveGone.size()
<< " gone agents from the registry";
registrar->apply(Owned<RegistryOperation>(
new Prune(toRemoveUnreachable, toRemoveGone)))
.onAny(defer(self(),
&Self::_doRegistryGc,
toRemoveUnreachable,
toRemoveGone,
lambda::_1));
}
void Master::_doRegistryGc(
const hashset<SlaveID>& toRemoveUnreachable,
const hashset<SlaveID>& toRemoveGone,
const Future<bool>& registrarResult)
{
CHECK(!registrarResult.isDiscarded());
CHECK(!registrarResult.isFailed());
// `Prune` registry operation should never fail.
CHECK(registrarResult.get());
// Update in-memory state to be consistent with registry changes. If
// there was a concurrent registry operation that also modified the
// unreachable/gone list (e.g., an agent in `toRemoveXXX` concurrently
// reregistered), entries in `toRemove` might not appear in
// `slaves.unreachable` or `slaves.gone`.
//
// TODO(neilc): It would be nice to verify that the effect of these
// in-memory updates is equivalent to the changes made by the registry
// operation, but there isn't an easy way to do that.
size_t numRemovedUnreachable = 0;
foreach (const SlaveID& slaveId, toRemoveUnreachable) {
if (!slaves.unreachable.contains(slaveId)) {
LOG(WARNING) << "Failed to garbage collect " << slaveId
<< " from the unreachable list";
continue;
}
slaves.unreachable.erase(slaveId);
// TODO(vinod): Consider moving these tasks into `completedTasks` by
// transitioning them to a terminal state and sending status updates.
// But it's not clear what this state should be. If a framework
    // reconciles these tasks after this point it would get `TASK_UNKNOWN`,
    // which seems appropriate, but we don't keep tasks in this state in memory.
if (slaves.unreachableTasks.contains(slaveId)) {
foreachkey (const FrameworkID& frameworkId,
slaves.unreachableTasks.at(slaveId)) {
Framework* framework = getFramework(frameworkId);
if (framework != nullptr) {
foreach (const TaskID& taskId,
slaves.unreachableTasks.at(slaveId).at(frameworkId)) {
framework->unreachableTasks.erase(taskId);
}
}
}
}
slaves.unreachableTasks.erase(slaveId);
numRemovedUnreachable++;
}
size_t numRemovedGone = 0;
foreach (const SlaveID& slaveId, toRemoveGone) {
if (!slaves.gone.contains(slaveId)) {
LOG(WARNING) << "Failed to garbage collect " << slaveId
<< " from the gone list";
continue;
}
slaves.gone.erase(slaveId);
numRemovedGone++;
}
// TODO(neilc): Add a metric for # of agents discarded from the registry?
LOG(INFO) << "Garbage collected " << numRemovedUnreachable
<< " unreachable and " << numRemovedGone
<< " gone agents from the registry";
}
void Master::recoveredSlavesTimeout(const Registry& registry)
{
CHECK(elected());
// TODO(bmahler): Add a 'Percentage' abstraction.
Try<double> limit_ = numify<double>(
strings::remove(
flags.recovery_agent_removal_limit,
"%",
strings::SUFFIX));
CHECK_SOME(limit_);
double limit = limit_.get() / 100.0;
// Compute the percentage of slaves to be removed; if it exceeds the
// safety-net limit, bail!
double removalPercentage =
(1.0 * slaves.recovered.size()) /
(1.0 * registry.slaves().slaves().size());
if (removalPercentage > limit) {
EXIT(EXIT_FAILURE)
<< "Post-recovery agent removal limit exceeded! After "
<< flags.agent_reregister_timeout
<< " there were " << slaves.recovered.size()
<< " (" << removalPercentage * 100 << "%) agents recovered from the"
<< " registry that did not reregister: \n"
<< stringify(slaves.recovered.keys()) << "\n "
<< " The configured removal limit is " << limit * 100 << "%. Please"
<< " investigate or increase this limit to proceed further";
}
// Remove the slaves in a rate limited manner, similar to how the
// SlaveObserver removes slaves.
foreach (const Registry::Slave& slave, registry.slaves().slaves()) {
// The slave is removed from `recovered` when it completes the
// re-registration process. If the slave is in `reregistering`, it
// has started but not yet finished reregistering. In either
// case, we don't want to try to remove it.
if (!slaves.recovered.contains(slave.info().id()) ||
slaves.reregistering.contains(slave.info().id())) {
continue;
}
Future<Nothing> acquire = Nothing();
if (slaves.limiter.isSome()) {
LOG(INFO) << "Scheduling removal of agent "
<< slave.info().id() << " (" << slave.info().hostname() << ")"
<< "; did not reregister within "
<< flags.agent_reregister_timeout << " after master failover";
acquire = slaves.limiter.get()->acquire();
}
const string failure = "Agent removal rate limit acquisition failed";
// TODO(bmahler): Cancellation currently occurs by returning
// early from `markUnreachable` *without* the "discarder" having
// discarded the rate limit token. This approach means that if
// agents reregister while many of the marking unreachable
// operations are in progress, the rate that we mark unreachable
// will "slow down" rather than stay constant. We should instead
// discard the rate limit token when the agent reregisters and
// handle the discard here. See MESOS-8386.
acquire
.onFailed(lambda::bind(fail, failure, lambda::_1))
.onDiscarded(lambda::bind(fail, failure, "discarded"))
.then(defer(self(),
&Self::markUnreachable,
slave.info(),
true,
"did not reregister within"
" " + stringify(flags.agent_reregister_timeout) +
" after master failover"))
.then(defer(self(), [=](bool marked) {
if (marked) {
++metrics->slave_unreachable_completed;
} else {
++metrics->slave_unreachable_canceled;
}
return Nothing();
}));
++metrics->slave_unreachable_scheduled;
}
}
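// An illustrative example (hypothetical numbers, not part of the build) of
// the safety-net arithmetic above:
//
//   // With `--recovery_agent_removal_limit=40%`, `limit` is 0.4. If the
//   // registry recovered 100 agents and 40 of them are still in
//   // `slaves.recovered` when the reregistration timeout fires:
//   //
//   //   removalPercentage = 40 / 100 = 0.4
//   //
//   // 0.4 is not greater than the limit, so removal proceeds; with 41
//   // non-reregistered agents the master would exit instead.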
void Master::sendSlaveLost(const SlaveInfo& slaveInfo)
{
foreachvalue (Framework* framework, frameworks.registered) {
if (!framework->connected()) {
continue;
}
LOG(INFO) << "Notifying framework " << *framework << " of lost agent "
<< slaveInfo.id() << " (" << slaveInfo.hostname() << ")";
LostSlaveMessage message;
message.mutable_slave_id()->MergeFrom(slaveInfo.id());
framework->send(message);
}
if (HookManager::hooksAvailable()) {
HookManager::masterSlaveLostHook(slaveInfo);
}
}
void Master::fileAttached(const Future<Nothing>& result, const string& path)
{
if (result.isReady()) {
LOG(INFO) << "Successfully attached file '" << path << "'";
} else {
LOG(ERROR) << "Failed to attach file '" << path << "': "
<< (result.isFailed() ? result.failure() : "discarded");
}
}
void Master::submitScheduler(const string& name)
{
LOG(INFO) << "Scheduler submit request for " << name;
SubmitSchedulerResponse response;
response.set_okay(false);
reply(response);
}
void Master::contended(const Future<Future<Nothing>>& candidacy)
{
CHECK(!candidacy.isDiscarded());
if (candidacy.isFailed()) {
EXIT(EXIT_FAILURE) << "Failed to contend: " << candidacy.failure();
}
// Watch for candidacy change.
candidacy
->onAny(defer(self(), &Master::lostCandidacy, lambda::_1));
}
void Master::lostCandidacy(const Future<Nothing>& lost)
{
CHECK(!lost.isDiscarded());
if (lost.isFailed()) {
EXIT(EXIT_FAILURE) << "Failed to watch for candidacy: " << lost.failure();
}
if (elected()) {
EXIT(EXIT_FAILURE) << "Lost candidacy as a leader... committing suicide!";
}
LOG(INFO) << "Lost candidacy as a follower... Contend again";
contender->contend()
.onAny(defer(self(), &Master::contended, lambda::_1));
}
void Master::detected(const Future<Option<MasterInfo>>& _leader)
{
CHECK(!_leader.isDiscarded());
if (_leader.isFailed()) {
EXIT(EXIT_FAILURE)
<< "Failed to detect the leading master: " << _leader.failure()
<< "; committing suicide!";
}
bool wasElected = elected();
leader = _leader.get();
if (elected()) {
electedTime = Clock::now();
if (!wasElected) {
LOG(INFO) << "Elected as the leading master!";
// Begin the recovery process; bail if it fails or is discarded.
recover()
.onFailed(lambda::bind(fail, "Recovery failed", lambda::_1))
.onDiscarded(lambda::bind(fail, "Recovery failed", "discarded"));
} else {
// This happens if there is a ZK blip that causes a re-election
// but the same leading master is elected as leader.
LOG(INFO) << "Re-elected as the leading master";
}
} else if (leader.isSome()) {
// A different node has been elected as the leading master.
LOG(INFO) << "The newly elected leader is " << leader->pid()
<< " with id " << leader->id();
if (wasElected) {
EXIT(EXIT_FAILURE) << "Conceded leadership to another master..."
<< " committing suicide!";
}
// If this master and the current leader both have a configured
// domain and the current leader is located in a different region,
// exit with an error message: this indicates a configuration
// error, since all masters must be in the same region.
if (leader->has_domain() && info_.has_domain()) {
const DomainInfo& leaderDomain = leader->domain();
const DomainInfo& selfDomain = info_.domain();
// We currently reject configured domains without fault domains,
// but that might change in the future. For compatibility with
// future versions of Mesos, we treat a master with a configured
// domain but no fault domain as equivalent to a master with no
// configured domain.
if (leaderDomain.has_fault_domain() && selfDomain.has_fault_domain()) {
const DomainInfo::FaultDomain::RegionInfo& leaderRegion =
leaderDomain.fault_domain().region();
const DomainInfo::FaultDomain::RegionInfo& selfRegion =
selfDomain.fault_domain().region();
if (leaderRegion != selfRegion) {
EXIT(EXIT_FAILURE) << "Leading master uses domain "
<< leaderDomain << "; this master is "
<< "configured to use domain "
<< selfDomain << "; all masters in the "
<< "same cluster must use the same region";
}
}
}
} else {
// If an election occurred and no leader was elected, `None` is returned.
LOG(INFO) << "No master was elected.";
if (wasElected) {
EXIT(EXIT_FAILURE) << "Lost leadership after indecisive election..."
<< " committing suicide!";
}
}
// Keep detecting.
detector->detect(leader)
.onAny(defer(self(), &Master::detected, lambda::_1));
}
Future<bool> Master::authorizeFramework(
const FrameworkInfo& frameworkInfo)
{
if (authorizer.isNone()) {
return true; // Authorization is disabled.
}
LOG(INFO) << "Authorizing framework principal '" << frameworkInfo.principal()
<< "' to receive offers for roles '"
<< stringify(protobuf::framework::getRoles(frameworkInfo)) << "'";
authorization::Request request;
request.set_action(authorization::REGISTER_FRAMEWORK);
if (frameworkInfo.has_principal()) {
request.mutable_subject()->set_value(frameworkInfo.principal());
}
request.mutable_object()->mutable_framework_info()->CopyFrom(frameworkInfo);
// For a non-`MULTI_ROLE` framework, also propagate its single role
// via the request's `value` field. This is purely for backwards
// compatibility as the `value` field is deprecated. Note that this
// means that authorizers relying on the deprecated field will see
// an empty string in `value` for `MULTI_ROLE` frameworks.
//
// TODO(bbannier): Remove this at the end of `value`'s deprecation
// cycle, see MESOS-7073.
if (!protobuf::frameworkHasCapability(
frameworkInfo, FrameworkInfo::Capability::MULTI_ROLE)) {
request.mutable_object()->set_value(frameworkInfo.role());
}
return authorizer.get()->authorized(request);
}
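// An illustrative sketch (hypothetical framework, not part of the build)
// of the request constructed above for a legacy single-role framework:
//
//   // A FrameworkInfo with principal "alice", role "analytics" and no
//   // MULTI_ROLE capability yields:
//   //
//   //   request.action                = REGISTER_FRAMEWORK
//   //   request.subject.value         = "alice"
//   //   request.object.framework_info = <the FrameworkInfo>
//   //   request.object.value          = "analytics"  (deprecated field)
//   //
//   // A MULTI_ROLE framework leaves `object.value` unset, so authorizers
//   // that still rely on it see an empty string.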
Option<Error> Master::validateFrameworkAuthentication(
const FrameworkInfo& frameworkInfo,
const UPID& from)
{
if (authenticating.contains(from)) {
return Error("Re-authentication in progress");
}
if (flags.authenticate_frameworks && !authenticated.contains(from)) {
// This could happen if another authentication request came
// through before we got here or if a framework tried to
// (re-)register without authentication.
return Error("Framework at " + stringify(from) + " is not authenticated");
}
// TODO(bmahler): Currently the scheduler driver does not
// set 'principal', so we allow frameworks to omit it.
if (frameworkInfo.has_principal() &&
authenticated.contains(from) &&
frameworkInfo.principal() != authenticated[from]) {
return Error("Framework principal '" + frameworkInfo.principal() + "'"
" does not match authenticated principal"
" '" + authenticated[from] + "'");
}
return None();
}
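// An illustrative summary (hypothetical values, not part of the build) of
// the checks above, with `--authenticate_frameworks=true` and a framework
// at `from` authenticated as principal "alice":
//
//   //   FrameworkInfo.principal unset      -> None()  (driver may omit it)
//   //   FrameworkInfo.principal = "alice"  -> None()
//   //   FrameworkInfo.principal = "bob"    -> Error("... does not match
//   //                                          authenticated principal ...")
//   //   `from` not authenticated           -> Error("... is not authenticated")
//   //   `from` still authenticating        -> Error("Re-authentication in progress")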
void Master::drop(
const UPID& from,
const scheduler::Call& call,
const string& message)
{
// TODO(bmahler): Increment a metric.
LOG(WARNING) << "Dropping " << call.type() << " call"
<< " from framework " << call.framework_id()
<< " at " << from << ": " << message;
}
void Master::drop(
Framework* framework,
const Offer::Operation& operation,
const string& message)
{
CHECK_NOTNULL(framework);
// TODO(jieyu): Increment a metric.
LOG(WARNING) << "Dropping " << Offer::Operation::Type_Name(operation.type())
<< " operation from framework " << *framework
<< ": " << message;
// NOTE: The operation validation code should be refactored. Due to the order
// of validation, it's possible that this function will be called before the
// master validates that operations from v0 frameworks should not have their
// ID set.
if (operation.has_id() && framework->http.isSome()) {
scheduler::Event update;
update.set_type(scheduler::Event::UPDATE_OPERATION_STATUS);
// NOTE: We do not attempt to set the agent or resource provider IDs for
// dropped operations as we cannot guarantee to always know their values.
//
// TODO(bbannier): Set agent or resource provider ID if we know
// for certain that the operation was valid.
*update.mutable_update_operation_status()->mutable_status() =
protobuf::createOperationStatus(
OperationState::OPERATION_ERROR,
operation.id(),
message);
framework->send(update);
}
}
void Master::drop(
Framework* framework,
const scheduler::Call& call,
const string& message)
{
CHECK_NOTNULL(framework);
// TODO(gyliu513): Increment a metric.
LOG(WARNING) << "Dropping " << call.type() << " call"
<< " from framework " << *framework
<< ": " << message;
}
void Master::drop(
Framework* framework,
const scheduler::Call::Suppress& suppress,
const string& message)
{
scheduler::Call call;
call.set_type(scheduler::Call::SUPPRESS);
call.mutable_suppress()->CopyFrom(suppress);
drop(framework, call, message);
}
void Master::drop(
Framework* framework,
const scheduler::Call::Revive& revive,
const string& message)
{
scheduler::Call call;
call.set_type(scheduler::Call::REVIVE);
call.mutable_revive()->CopyFrom(revive);
drop(framework, call, message);
}
void Master::receive(
const UPID& from,
scheduler::Call&& call)
{
// TODO(vinod): Add metrics for calls.
Option<Error> error = validation::scheduler::call::validate(call);
if (error.isSome()) {
metrics->incrementInvalidSchedulerCalls(call);
drop(from, call, error->message);
return;
}
if (call.type() == scheduler::Call::SUBSCRIBE) {
subscribe(from, call.subscribe());
return;
}
// We consolidate the framework lookup and pid validation logic here
// because they are common for all the call handlers.
Framework* framework = getFramework(call.framework_id());
if (framework == nullptr) {
drop(from, call, "Framework cannot be found");
return;
}
if (framework->pid != from) {
drop(from, call, "Call is not from registered framework");
return;
}
framework->metrics.incrementCall(call.type());
// This is possible when the master --> framework link is broken (i.e., a
// one-way network partition) and the framework is not aware of it. There
// is no way for driver-based frameworks to detect this in the absence
// of periodic heartbeat events. We send an error message to the framework
// causing the scheduler driver to abort when this happens.
if (!framework->connected()) {
const string error = "Framework disconnected";
LOG(INFO) << "Refusing " << call.type() << " call from framework "
<< *framework << ": " << error;
FrameworkErrorMessage message;
message.set_message(error);
send(from, message);
return;
}
switch (call.type()) {
case scheduler::Call::SUBSCRIBE:
// SUBSCRIBE call should have been handled above.
LOG(FATAL) << "Unexpected 'SUBSCRIBE' call";
case scheduler::Call::TEARDOWN:
teardown(framework);
break;
case scheduler::Call::ACCEPT:
accept(framework, std::move(*call.mutable_accept()));
break;
case scheduler::Call::DECLINE:
decline(framework, std::move(*call.mutable_decline()));
break;
case scheduler::Call::ACCEPT_INVERSE_OFFERS:
acceptInverseOffers(framework, call.accept_inverse_offers());
break;
case scheduler::Call::DECLINE_INVERSE_OFFERS:
declineInverseOffers(framework, call.decline_inverse_offers());
break;
case scheduler::Call::REVIVE:
revive(framework, call.revive());
break;
case scheduler::Call::KILL:
kill(framework, call.kill());
break;
case scheduler::Call::SHUTDOWN:
shutdown(framework, call.shutdown());
break;
case scheduler::Call::ACKNOWLEDGE: {
acknowledge(framework, std::move(*call.mutable_acknowledge()));
break;
}
case scheduler::Call::ACKNOWLEDGE_OPERATION_STATUS: {
drop(
from,
call,
"'ACKNOWLEDGE_OPERATION_STATUS' is not supported by the v0 API");
break;
}
case scheduler::Call::RECONCILE:
reconcile(framework, std::move(*call.mutable_reconcile()));
break;
case scheduler::Call::RECONCILE_OPERATIONS:
drop(
from,
call,
"'RECONCILE_OPERATIONS' is not supported by the v0 API");
break;
case scheduler::Call::MESSAGE:
message(framework, std::move(*call.mutable_message()));
break;
case scheduler::Call::REQUEST:
request(framework, call.request());
break;
case scheduler::Call::SUPPRESS:
suppress(framework, call.suppress());
break;
case scheduler::Call::UNKNOWN:
LOG(WARNING) << "'UNKNOWN' call";
break;
}
}
void Master::registerFramework(
const UPID& from,
RegisterFrameworkMessage&& registerFrameworkMessage)
{
FrameworkInfo frameworkInfo =
std::move(*registerFrameworkMessage.mutable_framework());
if (frameworkInfo.has_id() && !frameworkInfo.id().value().empty()) {
const string error = "Registering with 'id' already set";
LOG(INFO) << "Refusing registration request of framework"
<< " '" << frameworkInfo.name() << "' at " << from
<< ": " << error;
FrameworkErrorMessage message;
message.set_message(error);
send(from, message);
return;
}
scheduler::Call::Subscribe call;
*call.mutable_framework_info() = std::move(frameworkInfo);
subscribe(from, call);
}
void Master::reregisterFramework(
const UPID& from,
ReregisterFrameworkMessage&& reregisterFrameworkMessage)
{
FrameworkInfo frameworkInfo =
std::move(*reregisterFrameworkMessage.mutable_framework());
if (!frameworkInfo.has_id() || frameworkInfo.id().value().empty()) {
const string error = "Re-registering without an 'id'";
LOG(INFO) << "Refusing re-registration request of framework"
<< " '" << frameworkInfo.name() << "' at " << from
<< ": " << error;
FrameworkErrorMessage message;
message.set_message(error);
send(from, message);
return;
}
scheduler::Call::Subscribe call;
*call.mutable_framework_info() = std::move(frameworkInfo);
call.set_force(reregisterFrameworkMessage.failover());
subscribe(from, call);
}
void Master::subscribe(
HttpConnection http,
const scheduler::Call::Subscribe& subscribe)
{
// TODO(anand): Authenticate the framework.
const FrameworkInfo& frameworkInfo = subscribe.framework_info();
// Update messages_{re}register_framework accordingly.
if (!frameworkInfo.has_id() || frameworkInfo.id() == "") {
++metrics->messages_register_framework;
} else {
++metrics->messages_reregister_framework;
}
LOG(INFO) << "Received subscription request for"
<< " HTTP framework '" << frameworkInfo.name() << "'";
Option<Error> validationError =
validation::framework::validate(frameworkInfo);
if (validationError.isNone()) {
// Check the framework's role(s) against the whitelist.
set<string> invalidRoles;
if (protobuf::frameworkHasCapability(
frameworkInfo,
FrameworkInfo::Capability::MULTI_ROLE)) {
foreach (const string& role, frameworkInfo.roles()) {
if (!isWhitelistedRole(role)) {
invalidRoles.insert(role);
}
}
} else {
if (!isWhitelistedRole(frameworkInfo.role())) {
invalidRoles.insert(frameworkInfo.role());
}
}
if (!invalidRoles.empty()) {
validationError = Error("Roles " + stringify(invalidRoles) +
" are not present in master's --roles");
}
}
// Ensure each of the suppressed roles is contained in the list of roles.
set<string> frameworkRoles = protobuf::framework::getRoles(frameworkInfo);
set<string> suppressedRoles = set<string>(
subscribe.suppressed_roles().begin(), subscribe.suppressed_roles().end());
if (validationError.isNone()) {
// The suppressed roles must be contained within the list of all
// roles for the framework.
foreach (const string& role, suppressedRoles) {
if (!frameworkRoles.count(role)) {
validationError = Error("Suppressed role '" + role +
"' is not contained in the list of roles");
break;
}
}
}
// TODO(vinod): Deprecate this in favor of authorization.
if (validationError.isNone() &&
frameworkInfo.user() == "root" && !flags.root_submissions) {
validationError = Error("User 'root' is not allowed to run frameworks"
" without --root_submissions set");
}
if (validationError.isNone() && frameworkInfo.has_id() &&
isCompletedFramework(frameworkInfo.id())) {
// This could happen if a framework tries to subscribe after its failover
// timeout has elapsed, or it has been torn down via the operator API.
//
// TODO(vinod): Master should persist admitted frameworks to the
// registry and remove them from it after failover timeout.
validationError = Error("Framework has been removed");
}
if (validationError.isNone() && !isValidFailoverTimeout(frameworkInfo)) {
validationError = Error("The framework failover_timeout (" +
stringify(frameworkInfo.failover_timeout()) +
") is invalid");
}
if (validationError.isSome()) {
LOG(INFO) << "Refusing subscription of framework"
<< " '" << frameworkInfo.name() << "': "
<< validationError->message;
FrameworkErrorMessage message;
message.set_message(validationError->message);
http.send(message);
http.close();
return;
}
// Need to disambiguate for the compiler.
void (Master::*_subscribe)(
HttpConnection,
const FrameworkInfo&,
bool,
const set<string>&,
const Future<bool>&) = &Self::_subscribe;
authorizeFramework(frameworkInfo)
.onAny(defer(self(),
_subscribe,
http,
frameworkInfo,
subscribe.force(),
suppressedRoles,
lambda::_1));
}
void Master::_subscribe(
HttpConnection http,
const FrameworkInfo& frameworkInfo,
bool force,
const set<string>& suppressedRoles,
const Future<bool>& authorized)
{
CHECK(!authorized.isDiscarded());
Option<Error> authorizationError = None();
if (authorized.isFailed()) {
authorizationError =
Error("Authorization failure: " + authorized.failure());
} else if (!authorized.get()) {
authorizationError = Error(
"Not authorized to use roles '" +
stringify(protobuf::framework::getRoles(frameworkInfo)) + "'");
}
if (authorizationError.isSome()) {
LOG(INFO) << "Refusing subscription of framework"
<< " '" << frameworkInfo.name() << "'"
<< ": " << authorizationError->message;
FrameworkErrorMessage message;
message.set_message(authorizationError->message);
http.send(message);
http.close();
return;
}
LOG(INFO) << "Subscribing framework '" << frameworkInfo.name()
<< "' with checkpointing "
<< (frameworkInfo.checkpoint() ? "enabled" : "disabled")
<< " and capabilities " << frameworkInfo.capabilities();
if (!frameworkInfo.has_id() || frameworkInfo.id() == "") {
// If we are here the framework is subscribing for the first time.
// Assign a new FrameworkID.
FrameworkInfo frameworkInfo_ = frameworkInfo;
frameworkInfo_.mutable_id()->CopyFrom(newFrameworkId());
Framework* framework = new Framework(this, flags, frameworkInfo_, http);
addFramework(framework, suppressedRoles);
framework->metrics.incrementCall(scheduler::Call::SUBSCRIBE);
FrameworkRegisteredMessage message;
message.mutable_framework_id()->MergeFrom(framework->id());
message.mutable_master_info()->MergeFrom(info_);
framework->send(message);
// Start the heartbeat after sending SUBSCRIBED event.
framework->heartbeat();
if (!subscribers.subscribed.empty()) {
subscribers.send(
protobuf::master::event::createFrameworkAdded(*framework));
}
return;
}
// If we are here the framework has already been assigned an id.
CHECK(!frameworkInfo.id().value().empty());
Framework* framework = getFramework(frameworkInfo.id());
if (framework == nullptr) {
// The framework has not yet reregistered after master failover.
// Furthermore, no agents have reregistered running one of this
// framework's tasks. Reconstruct a `Framework` object from the
// supplied `FrameworkInfo`.
recoverFramework(frameworkInfo, suppressedRoles);
framework = getFramework(frameworkInfo.id());
}
CHECK_NOTNULL(framework);
framework->metrics.incrementCall(scheduler::Call::SUBSCRIBE);
if (!framework->recovered()) {
// The framework has previously been registered with this master;
// it may or may not currently be connected.
updateFramework(framework, frameworkInfo, suppressedRoles);
framework->reregisteredTime = Clock::now();
// Always failover the old framework connection. See MESOS-4712 for details.
failoverFramework(framework, http);
} else {
// The framework has not yet reregistered after master failover.
Try<Nothing> activate = activateRecoveredFramework(
framework, frameworkInfo, None(), http, suppressedRoles);
if (activate.isError()) {
LOG(INFO) << "Could not update FrameworkInfo of framework '"
<< frameworkInfo.name() << "': " << activate.error();
FrameworkErrorMessage message;
message.set_message(activate.error());
http.send(message);
http.close();
return;
}
}
if (!subscribers.subscribed.empty()) {
subscribers.send(
protobuf::master::event::createFrameworkUpdated(*framework));
}
// Broadcast the new framework pid to all the slaves. We have to
// broadcast because an executor might be running on a slave but
// it currently isn't running any tasks.
foreachvalue (Slave* slave, slaves.registered) {
UpdateFrameworkMessage message;
message.mutable_framework_id()->CopyFrom(frameworkInfo.id());
// TODO(anand): We set 'pid' to UPID() for http frameworks
// as 'pid' was made optional in 0.24.0. In 0.25.0, we
// no longer have to set pid here for http frameworks.
message.set_pid(UPID());
message.mutable_framework_info()->CopyFrom(frameworkInfo);
send(slave->pid, message);
}
}
void Master::subscribe(
const UPID& from,
const scheduler::Call::Subscribe& subscribe)
{
FrameworkInfo frameworkInfo = subscribe.framework_info();
// Update messages_{re}register_framework accordingly.
if (!frameworkInfo.has_id() || frameworkInfo.id() == "") {
++metrics->messages_register_framework;
} else {
++metrics->messages_reregister_framework;
}
if (authenticating.contains(from)) {
// TODO(vinod): Consider dropping this request and fixing the tests
// to deal with the drop. Currently there is a race between the master
// realizing the framework is authenticated and the framework sending
// a subscribe call. Dropping this message will cause the
// framework to retry, slowing down the tests.
LOG(INFO) << "Queuing up SUBSCRIBE call for"
<< " framework '" << frameworkInfo.name() << "' at " << from
<< " because authentication is still in progress";
// Need to disambiguate for the compiler.
void (Master::*f)(const UPID&, const scheduler::Call::Subscribe&)
= &Self::subscribe;
authenticating[from]
.onReady(defer(self(), f, from, subscribe));
return;
}
Option<Error> validationError =
validation::framework::validate(frameworkInfo);
if (validationError.isNone()) {
// Check the framework's role(s) against the whitelist.
set<string> invalidRoles;
if (protobuf::frameworkHasCapability(
frameworkInfo,
FrameworkInfo::Capability::MULTI_ROLE)) {
foreach (const string& role, frameworkInfo.roles()) {
if (!isWhitelistedRole(role)) {
invalidRoles.insert(role);
}
}
} else {
if (!isWhitelistedRole(frameworkInfo.role())) {
invalidRoles.insert(frameworkInfo.role());
}
}
if (!invalidRoles.empty()) {
validationError = Error("Roles " + stringify(invalidRoles) +
" are not present in the master's --roles");
}
}
// Ensure each of the suppressed roles is contained in the list of roles.
set<string> frameworkRoles = protobuf::framework::getRoles(frameworkInfo);
set<string> suppressedRoles = set<string>(
subscribe.suppressed_roles().begin(), subscribe.suppressed_roles().end());
if (validationError.isNone()) {
// The suppressed roles must be contained within the list of all
// roles for the framework.
foreach (const string& role, suppressedRoles) {
if (!frameworkRoles.count(role)) {
validationError = Error("Suppressed role '" + role +
"' is not contained in the list of roles");
break;
}
}
}
// TODO(vinod): Deprecate this in favor of authorization.
if (validationError.isNone() &&
frameworkInfo.user() == "root" && !flags.root_submissions) {
validationError = Error("User 'root' is not allowed to run frameworks"
" without --root_submissions set");
}
if (validationError.isNone() && frameworkInfo.has_id() &&
isCompletedFramework(frameworkInfo.id())) {
// This could happen if a framework tries to subscribe after its
// failover timeout has elapsed or it unregistered itself by
// calling 'stop()' on the scheduler driver.
//
// TODO(vinod): Master should persist admitted frameworks to the
// registry and remove them from it after failover timeout.
validationError = Error("Framework has been removed");
}
if (validationError.isNone() && !isValidFailoverTimeout(frameworkInfo)) {
validationError = Error("The framework failover_timeout (" +
stringify(frameworkInfo.failover_timeout()) +
") is invalid");
}
// Note that re-authentication errors are already handled above.
if (validationError.isNone()) {
validationError = validateFrameworkAuthentication(frameworkInfo, from);
}
if (validationError.isSome()) {
LOG(INFO) << "Refusing subscription of framework"
<< " '" << frameworkInfo.name() << "' at " << from << ": "
<< validationError->message;
FrameworkErrorMessage message;
message.set_message(validationError->message);
send(from, message);
return;
}
LOG(INFO) << "Received SUBSCRIBE call for"
<< " framework '" << frameworkInfo.name() << "' at " << from;
// We allow an authenticated framework to not specify a principal
// in `FrameworkInfo`, but we log a WARNING here when that happens. We also
// set `FrameworkInfo.principal` to the value of the authenticated principal
// and use it for authorization later.
if (!frameworkInfo.has_principal() && authenticated.contains(from)) {
LOG(WARNING)
<< "Setting 'principal' in FrameworkInfo to '" << authenticated[from]
<< "' because the framework authenticated with that principal but did "
<< "not set it in FrameworkInfo";
frameworkInfo.set_principal(authenticated[from]);
}
// Need to disambiguate for the compiler.
void (Master::*_subscribe)(
const UPID&,
const FrameworkInfo&,
bool,
const set<string>&,
const Future<bool>&) = &Self::_subscribe;
authorizeFramework(frameworkInfo)
.onAny(defer(self(),
_subscribe,
from,
frameworkInfo,
subscribe.force(),
suppressedRoles,
lambda::_1));
}
void Master::_subscribe(
const UPID& from,
const FrameworkInfo& frameworkInfo,
bool force,
const set<string>& suppressedRoles,
const Future<bool>& authorized)
{
CHECK(!authorized.isDiscarded());
Option<Error> authorizationError = None();
if (authorized.isFailed()) {
authorizationError =
Error("Authorization failure: " + authorized.failure());
} else if (!authorized.get()) {
authorizationError = Error(
"Not authorized to use roles '" +
stringify(protobuf::framework::getRoles(frameworkInfo)) + "'");
}
if (authorizationError.isSome()) {
LOG(INFO) << "Refusing subscription of framework"
<< " '" << frameworkInfo.name() << "' at " << from
<< ": " << authorizationError->message;
FrameworkErrorMessage message;
message.set_message(authorizationError->message);
send(from, message);
return;
}
// At this point, authentication errors will be due to
// re-authentication during the authorization process,
// so we drop the subscription.
Option<Error> authenticationError =
validateFrameworkAuthentication(frameworkInfo, from);
if (authenticationError.isSome()) {
LOG(INFO) << "Dropping SUBSCRIBE call for framework"
<< " '" << frameworkInfo.name() << "' at " << from
<< ": " << authenticationError->message;
return;
}
LOG(INFO) << "Subscribing framework " << frameworkInfo.name()
<< " with checkpointing "
<< (frameworkInfo.checkpoint() ? "enabled" : "disabled")
<< " and capabilities " << frameworkInfo.capabilities();
if (!frameworkInfo.has_id() || frameworkInfo.id().value().empty()) {
// If we are here the framework is subscribing for the first time.
// Check if this framework is already subscribed (because it retries).
foreachvalue (Framework* framework, frameworks.registered) {
if (framework->pid == from) {
LOG(INFO) << "Framework " << *framework
<< " already subscribed, resending acknowledgement";
FrameworkRegisteredMessage message;
message.mutable_framework_id()->MergeFrom(framework->id());
message.mutable_master_info()->MergeFrom(info_);
framework->send(message);
return;
}
}
CHECK(!frameworks.principals.contains(from));
// Assign a new FrameworkID.
FrameworkInfo frameworkInfo_ = frameworkInfo;
frameworkInfo_.mutable_id()->CopyFrom(newFrameworkId());
Framework* framework = new Framework(this, flags, frameworkInfo_, from);
addFramework(framework, suppressedRoles);
FrameworkRegisteredMessage message;
message.mutable_framework_id()->MergeFrom(framework->id());
message.mutable_master_info()->MergeFrom(info_);
framework->send(message);
if (!subscribers.subscribed.empty()) {
subscribers.send(
protobuf::master::event::createFrameworkAdded(*framework));
}
return;
}
// If we are here the framework has already been assigned an id.
CHECK(!frameworkInfo.id().value().empty());
// Check whether we got a subscribe from a framework whose UPID duplicates
// a framework that is already connected. Note that we don't send an error
// response because that would go to the framework that is already connected.
if (frameworks.principals.contains(from)) {
foreachvalue (Framework* framework, frameworks.registered) {
if (framework->pid == from && framework->id() != frameworkInfo.id()) {
LOG(ERROR) << "Dropping SUBSCRIBE call for framework '"
<< frameworkInfo.name() << "': " << *framework
<< " already connected at " << from;
return;
}
}
}
Framework* framework = getFramework(frameworkInfo.id());
if (framework == nullptr) {
// The framework has not yet reregistered after master failover.
// Furthermore, no agents have reregistered running one of this
// framework's tasks. Reconstruct a `Framework` object from the
// supplied `FrameworkInfo`.
recoverFramework(frameworkInfo, suppressedRoles);
framework = getFramework(frameworkInfo.id());
}
CHECK_NOTNULL(framework);
if (!framework->recovered()) {
// The framework has previously been registered with this master;
// it may or may not currently be connected.
//
// Using the "force" field of the scheduler allows us to keep a
// scheduler that got partitioned but didn't die (in ZooKeeper
// speak this means didn't lose their session) and then
// eventually tried to connect to this master even though
// another instance of their scheduler has reconnected.
// Test for the error case first.
if ((framework->pid != from) && !force) {
LOG(ERROR) << "Disallowing subscription attempt of"
<< " framework " << *framework
<< " because it is not expected from " << from;
FrameworkErrorMessage message;
message.set_message("Framework failed over");
send(from, message);
return;
}
// It is now safe to update the framework fields since the request is
// guaranteed to be successful. We use the fields passed in during
// re-registration.
updateFramework(framework, frameworkInfo, suppressedRoles);
framework->reregisteredTime = Clock::now();
if (force) {
// TODO(vinod): Now that the scheduler pid is unique we don't
// need to call 'failoverFramework()' if the pid hasn't changed
// (i.e., duplicate message). Instead we can just send the
// FrameworkReregisteredMessage back and activate the framework
// if necessary.
LOG(INFO) << "Framework " << *framework << " failed over";
failoverFramework(framework, from);
if (!subscribers.subscribed.empty()) {
subscribers.send(
protobuf::master::event::createFrameworkUpdated(*framework));
}
} else {
LOG(INFO) << "Allowing framework " << *framework
<< " to subscribe with an already used id";
// Remove any offers sent to this framework.
// NOTE: We need to do this because the scheduler might have
// replied to the offers but the driver might have dropped
// those messages since it wasn't connected to the master.
foreach (Offer* offer, utils::copy(framework->offers)) {
allocator->recoverResources(
offer->framework_id(),
offer->slave_id(),
offer->resources(),
None());
removeOffer(offer, true); // Rescind.
}
// Also remove inverse offers.
foreach (InverseOffer* inverseOffer,
utils::copy(framework->inverseOffers)) {
allocator->updateInverseOffer(
inverseOffer->slave_id(),
inverseOffer->framework_id(),
UnavailableResources{
inverseOffer->resources(),
inverseOffer->unavailability()},
None());
removeInverseOffer(inverseOffer, true); // Rescind.
}
// Relink to the framework. This might be necessary if the
// framework link previously broke.
link(framework->pid.get());
// Reactivate the framework.
// NOTE: We do this after recovering resources (above) so that
// the allocator has the correct view of the framework's share.
if (!framework->active()) {
framework->setFrameworkState(Framework::State::ACTIVE);
allocator->activateFramework(framework->id());
}
FrameworkReregisteredMessage message;
message.mutable_framework_id()->MergeFrom(frameworkInfo.id());
message.mutable_master_info()->MergeFrom(info_);
framework->send(message);
if (!subscribers.subscribed.empty()) {
subscribers.send(
protobuf::master::event::createFrameworkUpdated(*framework));
}
return;
}
} else {
// The framework has not yet reregistered after master failover.
Try<Nothing> activate = activateRecoveredFramework(
framework, frameworkInfo, from, None(), suppressedRoles);
if (activate.isError()) {
LOG(INFO) << "Could not update FrameworkInfo of framework '"
<< frameworkInfo.name() << "': " << activate.error();
FrameworkErrorMessage message;
message.set_message(activate.error());
send(from, message);
return;
}
if (!subscribers.subscribed.empty()) {
subscribers.send(
protobuf::master::event::createFrameworkUpdated(*framework));
}
}
// Broadcast the new framework pid to all the slaves. We have to
// broadcast because an executor might be running on a slave but
// it currently isn't running any tasks.
foreachvalue (Slave* slave, slaves.registered) {
UpdateFrameworkMessage message;
message.mutable_framework_id()->CopyFrom(frameworkInfo.id());
message.set_pid(from);
message.mutable_framework_info()->CopyFrom(frameworkInfo);
send(slave->pid, message);
}
}
void Master::unregisterFramework(
const UPID& from,
const FrameworkID& frameworkId)
{
LOG(INFO) << "Asked to unregister framework " << frameworkId;
Framework* framework = getFramework(frameworkId);
if (framework != nullptr) {
if (framework->pid == from) {
teardown(framework);
} else {
LOG(WARNING)
<< "Ignoring unregister framework message for framework " << *framework
<< " because it is not expected from " << from;
}
}
}
void Master::deactivateFramework(
const UPID& from,
const FrameworkID& frameworkId)
{
++metrics->messages_deactivate_framework;
Framework* framework = getFramework(frameworkId);
if (framework == nullptr) {
LOG(WARNING)
<< "Ignoring deactivate framework message for framework " << frameworkId
<< " because the framework cannot be found";
return;
}
if (framework->pid != from) {
LOG(WARNING)
<< "Ignoring deactivate framework message for framework " << *framework
<< " because it is not expected from " << from;
return;
}
if (!framework->connected()) {
LOG(INFO)
<< "Ignoring deactivate framework message for framework" << *framework
<< " because it is disconnected";
return;
}
if (framework->active()) {
deactivate(framework, true);
}
}
void Master::disconnect(Framework* framework)
{
CHECK_NOTNULL(framework);
CHECK(framework->connected());
if (framework->active()) {
deactivate(framework, true);
}
LOG(INFO) << "Disconnecting framework " << *framework;
framework->setFrameworkState(Framework::State::DISCONNECTED);
if (framework->pid.isSome()) {
// Remove the framework from authenticated. This is safe because
// a framework will always reauthenticate before (re-)registering.
authenticated.erase(framework->pid.get());
} else {
CHECK_SOME(framework->http);
// Close the HTTP connection, which may already have
// been closed due to scheduler disconnection.
framework->http->close();
}
}
void Master::deactivate(Framework* framework, bool rescind)
{
CHECK_NOTNULL(framework);
CHECK(framework->active());
LOG(INFO) << "Deactivating framework " << *framework;
framework->setFrameworkState(Framework::State::INACTIVE);
// Tell the allocator to stop allocating resources to this framework.
allocator->deactivateFramework(framework->id());
// Remove the framework's offers.
foreach (Offer* offer, utils::copy(framework->offers)) {
allocator->recoverResources(
offer->framework_id(),
offer->slave_id(),
offer->resources(),
None());
removeOffer(offer, rescind);
}
// Remove the framework's inverse offers.
foreach (InverseOffer* inverseOffer, utils::copy(framework->inverseOffers)) {
allocator->updateInverseOffer(
inverseOffer->slave_id(),
inverseOffer->framework_id(),
UnavailableResources{
inverseOffer->resources(),
inverseOffer->unavailability()},
None());
removeInverseOffer(inverseOffer, rescind);
}
}
void Master::disconnect(Slave* slave)
{
CHECK_NOTNULL(slave);
LOG(INFO) << "Disconnecting agent " << *slave;
slave->connected = false;
// Inform the slave observer.
dispatch(slave->observer, &SlaveObserver::disconnect);
// Remove the slave from authenticated. This is safe because
// a slave will always reauthenticate before (re-)registering.
authenticated.erase(slave->pid);
deactivate(slave);
}
void Master::deactivate(Slave* slave)
{
CHECK_NOTNULL(slave);
LOG(INFO) << "Deactivating agent " << *slave;
slave->active = false;
allocator->deactivateSlave(slave->id);
// Remove and rescind offers.
foreach (Offer* offer, utils::copy(slave->offers)) {
allocator->recoverResources(
offer->framework_id(),
slave->id,
offer->resources(),
None());
removeOffer(offer, true); // Rescind!
}
// Remove and rescind inverse offers.
foreach (InverseOffer* inverseOffer, utils::copy(slave->inverseOffers)) {
allocator->updateInverseOffer(
slave->id,
inverseOffer->framework_id(),
UnavailableResources{
inverseOffer->resources(),
inverseOffer->unavailability()},
None());
removeInverseOffer(inverseOffer, true); // Rescind!
}
}
void Master::resourceRequest(
const UPID& from,
const FrameworkID& frameworkId,
const vector<Request>& requests)
{
Framework* framework = getFramework(frameworkId);
if (framework == nullptr) {
LOG(WARNING)
<< "Ignoring resource request message from framework " << frameworkId
<< " because the framework cannot be found";
return;
}
if (framework->pid != from) {
LOG(WARNING)
<< "Ignoring resource request message from framework " << *framework
<< " because it is not expected from " << from;
return;
}
scheduler::Call::Request call;
foreach (const Request& request, requests) {
call.add_requests()->CopyFrom(request);
}
request(framework, call);
}
void Master::request(
Framework* framework,
const scheduler::Call::Request& request)
{
CHECK_NOTNULL(framework);
LOG(INFO) << "Processing REQUEST call for framework " << *framework;
++metrics->messages_resource_request;
allocator->requestResources(
framework->id(),
google::protobuf::convert(request.requests()));
}
void Master::suppress(
Framework* framework,
const scheduler::Call::Suppress& suppress)
{
CHECK_NOTNULL(framework);
LOG(INFO) << "Processing SUPPRESS call for framework " << *framework;
++metrics->messages_suppress_offers;
set<string> roles;
// Validate the roles, if provided. We need to make sure the
// roles are valid and also contained within the framework's roles.
// Note that if a single role is invalid, we drop the entire
// call and do not suppress the valid roles.
foreach (const string& role, suppress.roles()) {
Option<Error> roleError = roles::validate(role);
if (roleError.isSome()) {
drop(framework,
suppress,
"suppression role '" + role + "' is invalid: " + roleError->message);
return;
}
if (framework->roles.count(role) == 0) {
drop(framework,
suppress,
"suppression role '" + role + "' is not one"
" of the frameworks's subscribed roles");
return;
}
roles.insert(role);
}
allocator->suppressOffers(framework->id(), roles);
}
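// An illustrative sketch (hypothetical roles, not part of the build) of
// the all-or-nothing validation above, for a framework subscribed to
// roles {"web", "batch"}:
//
//   //   SUPPRESS roles: ["web"]          -> suppressOffers() with {"web"}.
//   //   SUPPRESS roles: []               -> suppressOffers() with an empty
//   //                                       set (no roles were provided).
//   //   SUPPRESS roles: ["web", "gpu"]   -> the whole call is dropped;
//   //                                       nothing is suppressed.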
bool Master::isWhitelistedRole(const string& name) const
{
if (roleWhitelist.isNone()) {
return true;
}
return roleWhitelist->contains(name);
}
void Master::launchTasks(
const UPID& from,
LaunchTasksMessage&& launchTasksMessage)
{
Framework* framework = getFramework(launchTasksMessage.framework_id());
if (framework == nullptr) {
LOG(WARNING)
<< "Ignoring launch tasks message for offers "
<< stringify(launchTasksMessage.offer_ids())
<< " of framework " << launchTasksMessage.framework_id()
<< " because the framework cannot be found";
return;
}
if (framework->pid != from) {
LOG(WARNING)
<< "Ignoring launch tasks message for offers "
<< stringify(launchTasksMessage.offer_ids())
<< " from '" << from << "' because it is not from the"
<< " registered framework " << *framework;
return;
}
// Currently when no tasks are specified in the launchTasks message
// it is implicitly considered a decline of the offers.
if (!launchTasksMessage.tasks().empty()) {
scheduler::Call::Accept message;
*message.mutable_filters() =
std::move(*launchTasksMessage.mutable_filters());
*message.mutable_offer_ids() =
std::move(*launchTasksMessage.mutable_offer_ids());
Offer::Operation* operation = message.add_operations();
operation->set_type(Offer::Operation::LAUNCH);
*operation->mutable_launch()->mutable_task_infos() =
std::move(*launchTasksMessage.mutable_tasks());
accept(framework, std::move(message));
} else {
scheduler::Call::Decline message;
*message.mutable_filters() =
std::move(*launchTasksMessage.mutable_filters());
*message.mutable_offer_ids() =
std::move(*launchTasksMessage.mutable_offer_ids());
decline(framework, std::move(message));
}
}
Future<bool> Master::authorizeTask(
const TaskInfo& task,
Framework* framework)
{
CHECK_NOTNULL(framework);
if (authorizer.isNone()) {
return true; // Authorization is disabled.
}
// Authorize the task.
authorization::Request request;
if (framework->info.has_principal()) {
request.mutable_subject()->set_value(framework->info.principal());
}
request.set_action(authorization::RUN_TASK);
authorization::Object* object = request.mutable_object();
object->mutable_task_info()->CopyFrom(task);
object->mutable_framework_info()->CopyFrom(framework->info);
LOG(INFO)
<< "Authorizing framework principal '"
<< (framework->info.has_principal() ? framework->info.principal() : "ANY")
<< "' to launch task " << task.task_id();
return authorizer.get()->authorized(request);
}
Future<bool> Master::authorizeReserveResources(
const Offer::Operation::Reserve& reserve,
const Option<Principal>& principal)
{
// Authorizing the reserve operation is equivalent to authorizing
// the resources specified in the operation.
return authorizeReserveResources(reserve.resources(), principal);
}
Future<bool> Master::authorizeReserveResources(
const Resources& resources,
const Option<Principal>& principal)
{
if (authorizer.isNone()) {
return true; // Authorization is disabled.
}
authorization::Request request;
request.set_action(authorization::RESERVE_RESOURCES);
Option<authorization::Subject> subject = createSubject(principal);
if (subject.isSome()) {
request.mutable_subject()->CopyFrom(subject.get());
}
// The operation will be authorized if the entity is allowed to make
// reservations for all roles included in `reserve.resources`.
// Issue one authorization request for each unique role in the resources.
hashset<string> roles;
vector<Future<bool>> authorizations;
foreach (const Resource& resource, resources) {
// NOTE: Since authorization happens __before__ validation and resource
// format conversion, we must look for roles that may appear in both
// "pre" and "post" reservation-refinement formats. This may not even be
// valid, but we rely on validation being performed afterwards.
string role;
if (resource.reservations_size() > 0) {
// Check for the role in the "post-reservation-refinement" format.
//
// If there is a stack of reservations, we only perform authorization
// for the most refined reservation, since we only support "pushing"
// one reservation at a time. That is, all of the previous reservations
// must have already been authorized.
role = resource.reservations().rbegin()->role();
} else {
// Check for the role in the "pre-reservation-refinement" format.
role = resource.role();
}
if (!roles.contains(role)) {
roles.insert(role);
request.mutable_object()->mutable_resource()->CopyFrom(resource);
request.mutable_object()->set_value(role);
authorizations.push_back(authorizer.get()->authorized(request));
}
}
LOG(INFO) << "Authorizing principal '"
<< (principal.isSome() ? stringify(principal.get()) : "ANY")
<< "' to reserve resources '" << resources << "'";
// NOTE: Empty authorizations are not valid and are checked by a validator.
// However, under certain circumstances, this method can be called before
// the validation occurs, and that case must be considered non-erroneous.
// TODO(arojas): Consider ensuring that `validate()` is called before
// `authorizeReserveResources` so a `CHECK(!roles.empty())` can be added.
if (authorizations.empty()) {
return authorizer.get()->authorized(request);
}
return collectAuthorizations(authorizations);
}
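// An illustrative sketch (hypothetical resources, not part of the build)
// of the role extraction above:
//
//   // Pre-reservation-refinement format: the role lives on the resource
//   // itself, e.g. `cpus(prod):2` -> role "prod" via `resource.role()`.
//   //
//   // Post-reservation-refinement format: the role lives on the last
//   // (most refined) entry of the `reservations` stack, e.g.
//   //   reservations: [{role: "eng"}, {role: "eng/build"}] -> "eng/build".
//   //
//   // One authorization request is issued per unique role encountered.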
Future<bool> Master::authorizeUnreserveResources(
const Offer::Operation::Unreserve& unreserve,
const Option<Principal>& principal)
{
if (authorizer.isNone()) {
return true; // Authorization is disabled.
}
authorization::Request request;
request.set_action(authorization::UNRESERVE_RESOURCES);
Option<authorization::Subject> subject = createSubject(principal);
if (subject.isSome()) {
request.mutable_subject()->CopyFrom(subject.get());
}
vector<Future<bool>> authorizations;
foreach (const Resource& resource, unreserve.resources()) {
// NOTE: Since authorization happens __before__ validation and resource
// format conversion, we must look for the principal that may appear in
// both "pre" and "post" reservation-refinement formats. This may not be
// valid, but we rely on validation being performed later.
Option<string> principal;
if (resource.reservations_size() > 0 &&
resource.reservations().rbegin()->has_principal()) {
// Check for roles in the "post-reservation-refinement" format.
principal = resource.reservations().rbegin()->principal();
} else if (
resource.has_reservation() && resource.reservation().has_principal()) {
// Check for roles in the "pre-reservation-refinement" format.
principal = resource.reservation().principal();
}
if (principal.isSome()) {
request.mutable_object()->mutable_resource()->CopyFrom(resource);
request.mutable_object()->set_value(principal.get());
authorizations.push_back(authorizer.get()->authorized(request));
}
}
LOG(INFO) << "Authorizing principal '"
<< (principal.isSome() ? stringify(principal.get()) : "ANY")
<< "' to unreserve resources '" << unreserve.resources() << "'";
if (authorizations.empty()) {
return authorizer.get()->authorized(request);
}
return collectAuthorizations(authorizations);
}
Future<bool> Master::authorizeCreateVolume(
const Offer::Operation::Create& create,
const Option<Principal>& principal)
{
if (authorizer.isNone()) {
return true; // Authorization is disabled.
}
authorization::Request request;
request.set_action(authorization::CREATE_VOLUME);
Option<authorization::Subject> subject = createSubject(principal);
if (subject.isSome()) {
request.mutable_subject()->CopyFrom(subject.get());
}
// The operation will be authorized if the entity is allowed to create
// volumes for all roles included in `create.volumes`.
// Issue one authorization request for each unique role in the volumes.
hashset<string> roles;
vector<Future<bool>> authorizations;
foreach (const Resource& volume, create.volumes()) {
string role;
if (volume.reservations_size() > 0) {
// Check for role in the "post-reservation-refinement" format.
//
// If there is a stack of reservations, we only perform authorization
// for the most refined reservation, since we only support "pushing"
// one reservation at a time. That is, all of the previous reservations
// must have already been authorized.
role = volume.reservations().rbegin()->role();
} else {
// Check for role in the "pre-reservation-refinement" format.
role = volume.role();
}
if (!roles.contains(role)) {
roles.insert(role);
request.mutable_object()->mutable_resource()->CopyFrom(volume);
request.mutable_object()->set_value(role);
authorizations.push_back(authorizer.get()->authorized(request));
}
}
LOG(INFO) << "Authorizing principal '"
<< (principal.isSome() ? stringify(principal.get()) : "ANY")
<< "' to create volumes '" << create.volumes() << "'";
if (authorizations.empty()) {
return authorizer.get()->authorized(request);
}
return collectAuthorizations(authorizations);
}
Future<bool> Master::authorizeDestroyVolume(
const Offer::Operation::Destroy& destroy,
const Option<Principal>& principal)
{
if (authorizer.isNone()) {
return true; // Authorization is disabled.
}
authorization::Request request;
request.set_action(authorization::DESTROY_VOLUME);
Option<authorization::Subject> subject = createSubject(principal);
if (subject.isSome()) {
request.mutable_subject()->CopyFrom(subject.get());
}
vector<Future<bool>> authorizations;
foreach (const Resource& volume, destroy.volumes()) {
// NOTE: Since validation of this operation may be performed after
// authorization, we must check here that this resource is a persistent
// volume. If it isn't, the error will be caught during validation.
if (volume.has_disk() && volume.disk().has_persistence()) {
request.mutable_object()->mutable_resource()->CopyFrom(volume);
request.mutable_object()->set_value(
volume.disk().persistence().principal());
authorizations.push_back(authorizer.get()->authorized(request));
}
}
LOG(INFO) << "Authorizing principal '"
<< (principal.isSome() ? stringify(principal.get()) : "ANY")
<< "' to destroy volumes '" << destroy.volumes() << "'";
if (authorizations.empty()) {
return authorizer.get()->authorized(request);
}
return collectAuthorizations(authorizations);
}
Future<bool> Master::authorizeResizeVolume(
const Resource& volume,
const Option<Principal>& principal)
{
if (authorizer.isNone()) {
return true; // Authorization is disabled.
}
authorization::Request request;
request.set_action(authorization::RESIZE_VOLUME);
Option<authorization::Subject> subject = createSubject(principal);
if (subject.isSome()) {
request.mutable_subject()->CopyFrom(subject.get());
}
request.mutable_object()->mutable_resource()->CopyFrom(volume);
string role;
if (volume.reservations_size() > 0) {
// Check for role in the "post-reservation-refinement" format.
role = volume.reservations().rbegin()->role();
} else {
// Check for role in the "pre-reservation-refinement" format.
role = volume.role();
}
request.mutable_object()->set_value(role);
LOG(INFO) << "Authorizing principal '"
<< (principal.isSome() ? stringify(principal.get()) : "ANY")
<< "' to resize volume '" << volume << "'";
return authorizer.get()->authorized(request);
}
Future<bool> Master::authorizeCreateDisk(
const Offer::Operation::CreateDisk& createDisk,
const Option<Principal>& principal)
{
if (authorizer.isNone()) {
return true; // Authorization is disabled.
}
const Resource& resource = createDisk.source();
Option<authorization::Action> action;
switch (createDisk.target_type()) {
case Resource::DiskInfo::Source::MOUNT: {
action = authorization::CREATE_MOUNT_DISK;
break;
}
case Resource::DiskInfo::Source::BLOCK: {
action = authorization::CREATE_BLOCK_DISK;
break;
}
case Resource::DiskInfo::Source::UNKNOWN:
case Resource::DiskInfo::Source::PATH:
case Resource::DiskInfo::Source::RAW: {
return Failure(
"Failed to authorize principal '" +
(principal.isSome() ? stringify(principal.get()) : "ANY") +
"' to create a " + stringify(createDisk.target_type()) +
" disk from '" + stringify(resource) + "': Unsupported disk type");
}
}
authorization::Request request;
request.set_action(CHECK_NOTNONE(action));
Option<authorization::Subject> subject = createSubject(principal);
if (subject.isSome()) {
request.mutable_subject()->CopyFrom(subject.get());
}
request.mutable_object()->mutable_resource()->CopyFrom(resource);
// We set `object.value` in addition to `object.resource` to support legacy
// authorizers that only make use of this deprecated field.
//
// NOTE: We rely on the master to ensure that the resource is in the
// post-reservation-refinement format and set the value to the most refined
// role, or default to '*' for consistency if there is no reservation.
CHECK(!resource.has_role()) << resource;
CHECK(!resource.has_reservation()) << resource;
request.mutable_object()->set_value(
resource.reservations().empty()
? "*"
: resource.reservations().rbegin()->role());
LOG(INFO) << "Authorizing principal '"
<< (principal.isSome() ? stringify(principal.get()) : "ANY")
<< "' to create a " << createDisk.target_type() << " disk from '"
<< createDisk.source() << "'";
return authorizer.get()->authorized(request);
}
Future<bool> Master::authorizeDestroyDisk(
const Offer::Operation::DestroyDisk& destroyDisk,
const Option<Principal>& principal)
{
if (authorizer.isNone()) {
return true; // Authorization is disabled.
}
const Resource& resource = destroyDisk.source();
Option<authorization::Action> action;
switch (resource.disk().source().type()) {
case Resource::DiskInfo::Source::MOUNT: {
action = authorization::DESTROY_MOUNT_DISK;
break;
}
case Resource::DiskInfo::Source::BLOCK: {
action = authorization::DESTROY_BLOCK_DISK;
break;
}
case Resource::DiskInfo::Source::RAW: {
action = authorization::DESTROY_RAW_DISK;
break;
}
case Resource::DiskInfo::Source::UNKNOWN:
case Resource::DiskInfo::Source::PATH: {
return Failure(
"Failed to authorize principal '" +
(principal.isSome() ? stringify(principal.get()) : "ANY") +
"' to destroy disk '" + stringify(resource) +
"': Unsupported disk type");
}
}
authorization::Request request;
request.set_action(CHECK_NOTNONE(action));
Option<authorization::Subject> subject = createSubject(principal);
if (subject.isSome()) {
request.mutable_subject()->CopyFrom(subject.get());
}
request.mutable_object()->mutable_resource()->CopyFrom(resource);
// We set `object.value` in addition to `object.resource` to support legacy
// authorizers that only make use of this deprecated field.
//
// NOTE: We rely on the master to ensure that the resource is in the
// post-reservation-refinement format and set the value to the most refined
// role, or default to '*' for consistency if there is no reservation.
CHECK(!resource.has_role()) << resource;
CHECK(!resource.has_reservation()) << resource;
request.mutable_object()->set_value(
resource.reservations().empty()
? "*"
: resource.reservations().rbegin()->role());
LOG(INFO) << "Authorizing principal '"
<< (principal.isSome() ? stringify(principal.get()) : "ANY")
<< "' to destroy disk '" << destroyDisk.source() << "'";
return authorizer.get()->authorized(request);
}
Future<bool> Master::authorizeSlave(
const SlaveInfo& slaveInfo,
const Option<Principal>& principal)
{
if (authorizer.isNone()) {
return true;
}
vector<Future<bool>> authorizations;
// First authorize whether the agent can register.
LOG(INFO) << "Authorizing agent providing resources "
<< "'" << stringify(Resources(slaveInfo.resources())) << "' "
<< (principal.isSome()
? "with principal '" + stringify(principal.get()) + "'"
: "without a principal");
authorization::Request request;
request.set_action(authorization::REGISTER_AGENT);
Option<authorization::Subject> subject = createSubject(principal);
if (subject.isSome()) {
request.mutable_subject()->CopyFrom(subject.get());
}
// No need to set the request's object as it is implicitly set to
// ANY by the authorizer.
authorizations.push_back(authorizer.get()->authorized(request));
// Next, if static reservations exist, also authorize them.
//
// NOTE: We don't look at dynamic reservations in checkpointed
// resources because they should have gone through authorization
// against the framework / operator's principal when they were
// created. In contrast, static reservations are initiated by the
// agent's principal and authorizing them helps prevent agents from
// advertising reserved resources of arbitrary roles.
if (!Resources(slaveInfo.resources()).reserved().empty()) {
authorizations.push_back(
authorizeReserveResources(slaveInfo.resources(), principal));
}
return collectAuthorizations(authorizations);
}


bool Master::isLaunchExecutor(
const ExecutorID& executorId,
Framework* framework,
Slave* slave) const
{
CHECK_NOTNULL(framework);
CHECK_NOTNULL(slave);
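// A new executor must be launched if the agent does not already know
// about it; in that case the framework must not know about it either.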
if (!slave->hasExecutor(framework->id(), executorId)) {
CHECK(!framework->hasExecutor(slave->id, executorId))
<< "Executor '" << executorId
<< "' known to the framework " << *framework
<< " but unknown to the agent " << *slave;
return true;
}
return false;
}


void Master::addExecutor(
const ExecutorInfo& executorInfo,
Framework* framework,
Slave* slave)
{
CHECK_NOTNULL(framework);
CHECK_NOTNULL(slave);
CHECK(slave->connected) << "Adding executor " << executorInfo.executor_id()
<< " to disconnected agent " << *slave;
slave->addExecutor(framework->id(), executorInfo);
framework->addExecutor(slave->id, executorInfo);
}


void Master::addTask(
const TaskInfo& task,
Framework* framework,
Slave* slave)
{
CHECK_NOTNULL(framework);
CHECK_NOTNULL(slave);
CHECK(slave->connected) << "Adding task " << task.task_id()
<< " to disconnected agent " << *slave;
// Add the task to the framework and slave.
Task* t = new Task(protobuf::createTask(task, TASK_STAGING, framework->id()));
slave->addTask(t);
framework->addTask(t);
}


void Master::accept(
Framework* framework,
scheduler::Call::Accept&& accept)
{
CHECK_NOTNULL(framework);
// Bump metrics.
foreach (const Offer::Operation& operation, accept.operations()) {
if (operation.type() == Offer::Operation::LAUNCH) {
if (operation.launch().task_infos().size() > 0) {
++metrics->messages_launch_tasks;
} else {
++metrics->messages_decline_offers;
LOG(WARNING) << "Implicitly declining offers: " << accept.offer_ids()
<< " in ACCEPT call for framework " << framework->id()
<< " as the launch operation specified no tasks";
}
}
// TODO(mpark): Add metrics for LAUNCH_GROUP operation.
// TODO(jieyu): Add metrics for non launch operations.
}
// TODO(bmahler): We currently only support using multiple offers
// for a single slave.
Resources offeredResources;
Option<SlaveID> slaveId = None();
Option<Error> error = None();
Option<Resource::AllocationInfo> allocationInfo = None();
if (accept.offer_ids().size() == 0) {
error = Error("No offers specified");
} else {
// Validate the offers.
error = validation::offer::validate(accept.offer_ids(), this, framework);
size_t offersAccepted = 0;
// Compute offered resources and remove the offers. If the
// validation failed, return resources to the allocator.
foreach (const OfferID& offerId, accept.offer_ids()) {
Offer* offer = getOffer(offerId);
if (offer != nullptr) {
// Don't bother adding resources to `offeredResources` in case
// validation failed; just recover them.
if (error.isSome()) {
allocator->recoverResources(
offer->framework_id(),
offer->slave_id(),
offer->resources(),
None());
} else {
slaveId = offer->slave_id();
allocationInfo = offer->allocation_info();
offeredResources += offer->resources();
offersAccepted++;
}
removeOffer(offer);
continue;
}
// If the offer was not in our offer set, then this offer is no
// longer valid.
LOG(WARNING) << "Ignoring accept of offer " << offerId
<< " since it is no longer valid";
}
framework->metrics.offers_accepted += offersAccepted;
}
// If invalid, send TASK_DROPPED for the launch attempts. If the
// framework is not partition-aware, send TASK_LOST instead. If
// other operations have their `id` field set, then send
// OPERATION_DROPPED updates for them.
//
// TODO(jieyu): Consider adding a 'drop' overload for the ACCEPT call to
// consistently handle message dropping. It would be ideal if the
// 'drop' overload could handle both resource recovery and lost task
// notifications.
if (error.isSome()) {
LOG(WARNING) << "ACCEPT call used invalid offers '" << accept.offer_ids()
<< "': " << error->message;
TaskState newTaskState = TASK_DROPPED;
if (!framework->capabilities.partitionAware) {
newTaskState = TASK_LOST;
}
foreach (const Offer::Operation& operation, accept.operations()) {
if (operation.type() != Offer::Operation::LAUNCH &&
operation.type() != Offer::Operation::LAUNCH_GROUP) {
drop(framework,
operation,
"Operation attempted with invalid offers: " + error->message);
continue;
}
const RepeatedPtrField<TaskInfo>& tasks = [&]() {
if (operation.type() == Offer::Operation::LAUNCH) {
return operation.launch().task_infos();
} else if (operation.type() == Offer::Operation::LAUNCH_GROUP) {
return operation.launch_group().task_group().tasks();
}
UNREACHABLE();
}();
foreach (const TaskInfo& task, tasks) {
const StatusUpdate& update = protobuf::createStatusUpdate(
framework->id(),
task.slave_id(),
task.task_id(),
newTaskState,
TaskStatus::SOURCE_MASTER,
None(),
"Task launched with invalid offers: " + error->message,
TaskStatus::REASON_INVALID_OFFERS);
if (framework->capabilities.partitionAware) {
metrics->tasks_dropped++;
} else {
metrics->tasks_lost++;
}
metrics->incrementTasksStates(
newTaskState,
TaskStatus::SOURCE_MASTER,
TaskStatus::REASON_INVALID_OFFERS);
forward(update, UPID(), framework);
}
}
return;
}
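// Since validation succeeded, all offers were known and (per the TODO
// above) belong to a single agent, so `slaveId` must be set.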
CHECK_SOME(slaveId);
Slave* slave = slaves.registered.get(slaveId.get());
CHECK_NOTNULL(slave);
// Validate and upgrade all of the resources in `accept.operations`:
//
// For any operation other than LAUNCH and LAUNCH_GROUP that contains
// invalid resources,
// - if the framework has elected to receive feedback by setting the `id`
// field, then we send an offer operation status update with a state of
// OPERATION_ERROR.
// - if the framework has not set the `id` field, then we simply drop the
// operation.
//
// If a LAUNCH or LAUNCH_GROUP operation contains invalid resources, we send
// a TASK_ERROR status update per task.
//
// If the framework is requesting offer operation status updates by setting
// the `id` field in an operation, then also verify that the relevant agent
// has the RESOURCE_PROVIDER capability. If it does not, then send an offer
// operation status update with a state of OPERATION_ERROR.
//
// LAUNCH and LAUNCH_GROUP operations cannot receive offer operation status
// updates, so we send a TASK_ERROR status update per task when these
// operations set the `id` field.
{
// Used to send TASK_ERROR status updates for tasks in invalid LAUNCH
// and LAUNCH_GROUP operations. Note that we don't need to recover
// the resources here because we always continue onto `_accept`
// which recovers the unused resources at the end.
//
// TODO(mpark): Consider pulling this out in a more reusable manner.
auto sendStatusUpdates = [&](
const RepeatedPtrField<TaskInfo>& tasks,
TaskStatus::Reason reason,
const string& message) {
foreach (const TaskInfo& task, tasks) {
const StatusUpdate& update = protobuf::createStatusUpdate(
framework->id(),
task.slave_id(),
task.task_id(),
TASK_ERROR,
TaskStatus::SOURCE_MASTER,
None(),
message,
reason);
metrics->tasks_error++;
metrics->incrementTasksStates(
TASK_ERROR, TaskStatus::SOURCE_MASTER, reason);
forward(update, UPID(), framework);
}
};
// We move `accept.operations` out and re-insert the operations
// with their resources validated and upgraded.
RepeatedPtrField<Offer::Operation> operations = accept.operations();
accept.clear_operations();
foreach (Offer::Operation& operation, operations) {
Option<Error> error = validateAndUpgradeResources(&operation);
if (error.isSome()) {
switch (operation.type()) {
case Offer::Operation::RESERVE:
case Offer::Operation::UNRESERVE:
case Offer::Operation::CREATE:
case Offer::Operation::DESTROY:
case Offer::Operation::GROW_VOLUME:
case Offer::Operation::SHRINK_VOLUME:
case Offer::Operation::CREATE_DISK:
case Offer::Operation::DESTROY_DISK: {
drop(framework,
operation,
"Operation attempted with invalid resources: " +
error->message);
break;
}
case Offer::Operation::LAUNCH: {
sendStatusUpdates(
operation.launch().task_infos(),
TaskStatus::REASON_TASK_INVALID,
error->message);
break;
}
case Offer::Operation::LAUNCH_GROUP: {
sendStatusUpdates(
operation.launch_group().task_group().tasks(),
TaskStatus::REASON_TASK_GROUP_INVALID,
error->message);
break;
}
case Offer::Operation::UNKNOWN: {
LOG(WARNING) << "Ignoring unknown operation";
break;
}
}
} else if (operation.has_id()) {
// The `id` field is set, which means operation feedback is requested.
//
// Operation feedback is not supported for LAUNCH or LAUNCH_GROUP
// operations, so we drop them and send TASK_ERROR status updates.
//
// For other operations, verify that they have been sent by an HTTP
// framework and that they are destined for an agent with the
// RESOURCE_PROVIDER capability.
switch (operation.type()) {
case Offer::Operation::LAUNCH: {
sendStatusUpdates(
operation.launch().task_infos(),
TaskStatus::REASON_TASK_INVALID,
"The `id` field cannot be set on LAUNCH operations");
break;
}
case Offer::Operation::LAUNCH_GROUP: {
sendStatusUpdates(
operation.launch_group().task_group().tasks(),
TaskStatus::REASON_TASK_GROUP_INVALID,
"The `id` field cannot be set on LAUNCH_GROUP operations");
break;
}
case Offer::Operation::RESERVE:
case Offer::Operation::UNRESERVE:
case Offer::Operation::CREATE:
case Offer::Operation::DESTROY:
case Offer::Operation::GROW_VOLUME:
case Offer::Operation::SHRINK_VOLUME:
case Offer::Operation::CREATE_DISK:
case Offer::Operation::DESTROY_DISK: {
if (framework->http.isNone()) {
const string message =
"The 'id' field was set in an offer operation, but operation"
" feedback is not supported for the SchedulerDriver API";
LOG(WARNING) << "Dropping "
<< Offer::Operation::Type_Name(operation.type())
<< " operation from framework " << *framework << ": "
<< message;
// Send an error which will cause the scheduler driver to abort.
FrameworkErrorMessage frameworkError;
frameworkError.set_message(
message +
"; please use the HTTP scheduler API for this feature");
framework->send(frameworkError);
break;
}
if (getResourceProviderId(operation).isNone()) {
drop(framework,
operation,
"Operation requested feedback, but it affects resources not"
" managed by a resource provider");
break;
}
if (!slave->capabilities.resourceProvider) {
drop(framework,
operation,
"Operation requested feedback, but agent " +
stringify(slaveId.get()) +
" does not have the required RESOURCE_PROVIDER capability");
break;
}
accept.add_operations()->CopyFrom(operation);
break;
}
case Offer::Operation::UNKNOWN: {
LOG(WARNING) << "Ignoring unknown operation";
break;
}
}
} else {
// Resource validation succeeded and feedback is not requested,
// so add the operation.
accept.add_operations()->CopyFrom(operation);
}
}
}
// We make various adjustments to the `Offer::Operation`s,
// typically for backward/forward compatibility.
// TODO(mpark): Pull this out to a master normalization utility.
foreach (Offer::Operation& operation, *accept.mutable_operations()) {
// With the addition of the MULTI_ROLE capability, the resources
// within an offer now contain an `AllocationInfo`. We therefore
// inject the offer's allocation info into the operation's
// resources if the scheduler has not done so already.
CHECK_SOME(allocationInfo);
protobuf::injectAllocationInfo(&operation, allocationInfo.get());
switch (operation.type()) {
case Offer::Operation::RESERVE:
case Offer::Operation::UNRESERVE:
case Offer::Operation::CREATE:
case Offer::Operation::DESTROY:
case Offer::Operation::GROW_VOLUME:
case Offer::Operation::SHRINK_VOLUME:
case Offer::Operation::CREATE_DISK:
case Offer::Operation::DESTROY_DISK: {
// No-op.
break;
}
case Offer::Operation::LAUNCH: {
foreach (
TaskInfo& task, *operation.mutable_launch()->mutable_task_infos()) {
// TODO(haosdent): Once we have internal `TaskInfo` separate from
// the v0 `TaskInfo` (see MESOS-6268), consider extracting the
// following adaptation code into devolve methods from v0 and v1
// `TaskInfo` to internal `TaskInfo`.
//
// Fill in the missing `framework_id` in `ExecutorInfo` if needed
// (the task is mutated in place). This field was added to the API
// later and thus was made optional.
if (task.has_executor() && !task.executor().has_framework_id()) {
task.mutable_executor()->mutable_framework_id()->CopyFrom(
framework->id());
}
// For backwards compatibility with the v0 and v1 API, when
// the type of the health check is not specified, determine
// its type from the `http` and `command` fields.
//
// TODO(haosdent): Remove this after the deprecation cycle which
// starts in 2.0.
if (task.has_health_check() && !task.health_check().has_type()) {
LOG(WARNING) << "The type of health check is not set; use of "
<< "'HealthCheck' without specifying 'type' will be "
<< "deprecated in Mesos 2.0";
const HealthCheck& healthCheck = task.health_check();
if (healthCheck.has_command() && !healthCheck.has_http()) {
task.mutable_health_check()->set_type(HealthCheck::COMMAND);
} else if (healthCheck.has_http() && !healthCheck.has_command()) {
task.mutable_health_check()->set_type(HealthCheck::HTTP);
}
}
}
break;
}
case Offer::Operation::LAUNCH_GROUP: {
const ExecutorInfo& executor = operation.launch_group().executor();
TaskGroupInfo* taskGroup =
operation.mutable_launch_group()->mutable_task_group();
// Mutate `TaskInfo` to include `ExecutorInfo`, so that the operator
// API and WebUI can easily access the corresponding executor for
// tasks in the task group.
foreach (TaskInfo& task, *taskGroup->mutable_tasks()) {
if (!task.has_executor()) {
task.mutable_executor()->CopyFrom(executor);
}
}
break;
}
case Offer::Operation::UNKNOWN: {
// No-op.
break;
}
}
}
LOG(INFO) << "Processing ACCEPT call for offers: " << accept.offer_ids()
<< " on agent " << *slave << " for framework " << *framework;
vector<Future<bool>> futures;
foreach (const Offer::Operation& operation, accept.operations()) {
switch (operation.type()) {
case Offer::Operation::LAUNCH:
case Offer::Operation::LAUNCH_GROUP: {
const RepeatedPtrField<TaskInfo>& tasks = [&]() {
if (operation.type() == Offer::Operation::LAUNCH) {
return operation.launch().task_infos();
} else if (operation.type() == Offer::Operation::LAUNCH_GROUP) {
return operation.launch_group().task_group().tasks();
}
UNREACHABLE();
}();
// Authorize the tasks. A task is in 'framework->pendingTasks'
// and 'slave->pendingTasks' before it is authorized.
foreach (const TaskInfo& task, tasks) {
futures.push_back(authorizeTask(task, framework));
// Add to the framework's list of pending tasks.
//
// NOTE: If two tasks have the same ID, the second one will
// not be put into 'framework->pendingTasks', therefore
// will not be launched (and TASK_ERROR will be sent).
// Unfortunately, we can't tell the difference between a
// duplicate TaskID and getting killed while pending
// (removed from the map). So it's possible that we send
// a TASK_ERROR after a TASK_KILLED (see _accept())!
if (!framework->pendingTasks.contains(task.task_id())) {
framework->pendingTasks[task.task_id()] = task;
}
// Add to the slave's list of pending tasks.
if (!slave->pendingTasks.contains(framework->id()) ||
!slave->pendingTasks[framework->id()].contains(task.task_id())) {
slave->pendingTasks[framework->id()][task.task_id()] = task;
}
}
break;
}
// NOTE: When handling RESERVE and UNRESERVE operations, authorization
// will proceed even if no principal is specified, although currently
// resources cannot be reserved or unreserved unless a principal is
// provided. Any RESERVE/UNRESERVE operation with no associated principal
// will be found invalid when `validate()` is called in `_accept()` below.
// The RESERVE operation allows a principal to reserve resources.
case Offer::Operation::RESERVE: {
Option<Principal> principal = framework->info.has_principal()
? Principal(framework->info.principal())
: Option<Principal>::none();
futures.push_back(
authorizeReserveResources(
operation.reserve(), principal));
break;
}
// The UNRESERVE operation allows a principal to unreserve resources.
case Offer::Operation::UNRESERVE: {
Option<Principal> principal = framework->info.has_principal()
? Principal(framework->info.principal())
: Option<Principal>::none();
futures.push_back(
authorizeUnreserveResources(
operation.unreserve(), principal));
break;
}
// The CREATE operation allows the creation of a persistent volume.
case Offer::Operation::CREATE: {
Option<Principal> principal = framework->info.has_principal()
? Principal(framework->info.principal())
: Option<Principal>::none();
futures.push_back(
authorizeCreateVolume(
operation.create(), principal));
break;
}
// The DESTROY operation allows the destruction of a persistent volume.
case Offer::Operation::DESTROY: {
Option<Principal> principal = framework->info.has_principal()
? Principal(framework->info.principal())
: Option<Principal>::none();
futures.push_back(
authorizeDestroyVolume(
operation.destroy(), principal));
break;
}
case Offer::Operation::GROW_VOLUME: {
Option<Principal> principal = framework->info.has_principal()
? Principal(framework->info.principal())
: Option<Principal>::none();
futures.push_back(
authorizeResizeVolume(
operation.grow_volume().volume(), principal));
break;
}
case Offer::Operation::SHRINK_VOLUME: {
Option<Principal> principal = framework->info.has_principal()
? Principal(framework->info.principal())
: Option<Principal>::none();
futures.push_back(
authorizeResizeVolume(
operation.shrink_volume().volume(), principal));
break;
}
case Offer::Operation::CREATE_DISK: {
Option<Principal> principal = framework->info.has_principal()
? Principal(framework->info.principal())
: Option<Principal>::none();
futures.push_back(
authorizeCreateDisk(
operation.create_disk(), principal));
break;
}
case Offer::Operation::DESTROY_DISK: {
Option<Principal> principal = framework->info.has_principal()
? Principal(framework->info.principal())
: Option<Principal>::none();
futures.push_back(
authorizeDestroyDisk(
operation.destroy_disk(), principal));
break;
}
case Offer::Operation::UNKNOWN: {
// TODO(vinod): Send an error event to the scheduler?
LOG(WARNING) << "Ignoring unknown operation";
break;
}
}
}
// Wait for all the tasks to be authorized.
await(futures)
.onAny(defer(self(),
&Master::_accept,
framework->id(),
slaveId.get(),
offeredResources,
std::move(accept),
lambda::_1));
}


void Master::_accept(
const FrameworkID& frameworkId,
const SlaveID& slaveId,
const Resources& offeredResources,
scheduler::Call::Accept&& accept,
const Future<vector<Future<bool>>>& _authorizations)
{
Framework* framework = getFramework(frameworkId);
// TODO(jieyu): Consider using the 'drop' overload mentioned in
// 'accept' to consistently handle dropping ACCEPT calls.
if (framework == nullptr) {
LOG(WARNING)
<< "Ignoring ACCEPT call for framework " << frameworkId
<< " because the framework cannot be found";
// Tell the allocator about the recovered resources.
allocator->recoverResources(
frameworkId,
slaveId,
offeredResources,
None());
return;
}
Slave* slave = slaves.registered.get(slaveId);
if (slave == nullptr || !slave->connected) {
TaskState newTaskState = TASK_DROPPED;
if (!framework->capabilities.partitionAware) {
newTaskState = TASK_LOST;
}
foreach (const Offer::Operation& operation, accept.operations()) {
if (operation.type() != Offer::Operation::LAUNCH &&
operation.type() != Offer::Operation::LAUNCH_GROUP) {
continue;
}
const RepeatedPtrField<TaskInfo>& tasks = [&]() {
if (operation.type() == Offer::Operation::LAUNCH) {
return operation.launch().task_infos();
} else {
CHECK_EQ(Offer::Operation::LAUNCH_GROUP, operation.type());
return operation.launch_group().task_group().tasks();
}
}();
foreach (const TaskInfo& task, tasks) {
// Remove the task from being pending.
framework->pendingTasks.erase(task.task_id());
if (slave != nullptr) {
slave->pendingTasks[framework->id()].erase(task.task_id());
if (slave->pendingTasks[framework->id()].empty()) {
slave->pendingTasks.erase(framework->id());
}
}
const TaskStatus::Reason reason =
slave == nullptr ? TaskStatus::REASON_SLAVE_REMOVED
: TaskStatus::REASON_SLAVE_DISCONNECTED;
const StatusUpdate& update = protobuf::createStatusUpdate(
framework->id(),
task.slave_id(),
task.task_id(),
newTaskState,
TaskStatus::SOURCE_MASTER,
None(),
slave == nullptr ? "Agent removed" : "Agent disconnected",
reason);
if (framework->capabilities.partitionAware) {
metrics->tasks_dropped++;
} else {
metrics->tasks_lost++;
}
metrics->incrementTasksStates(
newTaskState,
TaskStatus::SOURCE_MASTER,
reason);
forward(update, UPID(), framework);
}
}
// Tell the allocator about the recovered resources.
allocator->recoverResources(
frameworkId,
slaveId,
offeredResources,
None());
return;
}
// Some operations update the offered resources. We keep the
// updated offered resources here. When a task is successfully
// launched, we remove its resources from the offered resources.
Resources _offeredResources = offeredResources;
// Converted resources from volume resizes. These converted resources are not
// put into `_offeredResources`, so no other operations can consume them.
// TODO(zhitao): This will be unnecessary once `GROW_VOLUME` and
// `SHRINK_VOLUME` become non-speculative.
Resources resizedResources;
// We keep track of the shared resources from the offers separately.
// `offeredSharedResources` can be modified by CREATE/DESTROY but we
// don't remove from it when a task is successfully launched so this
// variable always tracks the *total* amount. We do this to support
// validation of tasks involving shared resources. See comments in
// the LAUNCH case below.
Resources offeredSharedResources = offeredResources.shared();
// Maintain a list of resource conversions to pass to the allocator
// as a result of operations. Note that:
// 1) We drop invalid operations.
// 2) For LAUNCH operations, we drop invalid tasks. A LAUNCH
// operation can result in resource conversions because of shared
// resources.
// 3) Currently, LAUNCH_GROUP won't result in resource conversions,
// because shared resources are not yet supported for frameworks
// using the LAUNCH_GROUP operation.
//
// The order of the conversions is important and preserved.
vector<ResourceConversion> conversions;
// The order of `authorizations` must match the order of the operations and/or
// tasks in `accept.operations()` as they are iterated through simultaneously.
CHECK_READY(_authorizations);
std::deque<Future<bool>> authorizations(
_authorizations->begin(), _authorizations->end());
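// For example, an ACCEPT call containing [RESERVE, LAUNCH(task1, task2)]
// yields three futures here: one for the RESERVE operation, followed by
// one per task, in the same order in which `accept()` pushed them.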
foreach (const Offer::Operation& operation, accept.operations()) {
switch (operation.type()) {
// The RESERVE operation allows a principal to reserve resources.
case Offer::Operation::RESERVE: {
CHECK(!authorizations.empty());
Future<bool> authorization = authorizations.front();
authorizations.pop_front();
CHECK(!authorization.isDiscarded());
if (authorization.isFailed()) {
// TODO(greggomann): We may want to retry this failed authorization
// request rather than dropping it immediately.
drop(framework,
operation,
"Authorization of principal '" + framework->info.principal() +
"' to reserve resources failed: " + authorization.failure());
continue;
} else if (!authorization.get()) {
drop(framework,
operation,
"Not authorized to reserve resources as '" +
framework->info.principal() + "'");
continue;
}
Option<Principal> principal = framework->info.has_principal()
? Principal(framework->info.principal())
: Option<Principal>::none();
// Make sure this reserve operation is valid.
Option<Error> error = validation::operation::validate(
operation.reserve(),
principal,
slave->capabilities,
framework->info);
if (error.isSome()) {
drop(
framework,
operation,
error->message + "; on agent " + stringify(*slave));
continue;
}
// Test the given operation on the included resources.
Try<vector<ResourceConversion>> _conversions =
getResourceConversions(operation);
if (_conversions.isError()) {
drop(framework, operation, _conversions.error());
continue;
}
Try<Resources> resources = _offeredResources.apply(_conversions.get());
if (resources.isError()) {
drop(framework, operation, resources.error());
continue;
}
_offeredResources = resources.get();
LOG(INFO) << "Applying RESERVE operation for resources "
<< operation.reserve().resources() << " from framework "
<< *framework << " to agent " << *slave;
_apply(slave, framework, operation);
conversions.insert(
conversions.end(),
_conversions->begin(),
_conversions->end());
break;
}
// The UNRESERVE operation allows a principal to unreserve resources.
case Offer::Operation::UNRESERVE: {
CHECK(!authorizations.empty());
Future<bool> authorization = authorizations.front();
authorizations.pop_front();
CHECK(!authorization.isDiscarded());
if (authorization.isFailed()) {
// TODO(greggomann): We may want to retry this failed authorization
// request rather than dropping it immediately.
drop(framework,
operation,
"Authorization of principal '" + framework->info.principal() +
"' to unreserve resources failed: " +
authorization.failure());
continue;
} else if (!authorization.get()) {
drop(framework,
operation,
"Not authorized to unreserve resources as '" +
framework->info.principal() + "'");
continue;
}
// Make sure this unreserve operation is valid.
Option<Error> error =
validation::operation::validate(operation.unreserve());
if (error.isSome()) {
drop(framework, operation, error->message);
continue;
}
// Test the given operation on the included resources.
Try<vector<ResourceConversion>> _conversions =
getResourceConversions(operation);
if (_conversions.isError()) {
drop(framework, operation, _conversions.error());
continue;
}
Try<Resources> resources = _offeredResources.apply(_conversions.get());
if (resources.isError()) {
drop(framework, operation, resources.error());
continue;
}
_offeredResources = resources.get();
LOG(INFO) << "Applying UNRESERVE operation for resources "
<< operation.unreserve().resources() << " from framework "
<< *framework << " to agent " << *slave;
_apply(slave, framework, operation);
conversions.insert(
conversions.end(),
_conversions->begin(),
_conversions->end());
break;
}
case Offer::Operation::CREATE: {
CHECK(!authorizations.empty());
Future<bool> authorization = authorizations.front();
authorizations.pop_front();
CHECK(!authorization.isDiscarded());
if (authorization.isFailed()) {
// TODO(greggomann): We may want to retry this failed authorization
// request rather than dropping it immediately.
drop(framework,
operation,
"Authorization of principal '" + framework->info.principal() +
"' to create persistent volumes failed: " +
authorization.failure());
continue;
} else if (!authorization.get()) {
drop(framework,
operation,
"Not authorized to create persistent volumes as '" +
framework->info.principal() + "'");
continue;
}
Option<Principal> principal = framework->info.has_principal()
? Principal(framework->info.principal())
: Option<Principal>::none();
// Make sure this create operation is valid.
Option<Error> error = validation::operation::validate(
operation.create(),
slave->checkpointedResources,
principal,
slave->capabilities,
framework->info);
if (error.isSome()) {
drop(
framework,
operation,
error->message + "; on agent " + stringify(*slave));
continue;
}
// Test the given operation on the included resources.
Try<vector<ResourceConversion>> _conversions =
getResourceConversions(operation);
if (_conversions.isError()) {
drop(framework, operation, _conversions.error());
continue;
}
Try<Resources> resources = _offeredResources.apply(_conversions.get());
if (resources.isError()) {
drop(framework, operation, resources.error());
continue;
}
_offeredResources = resources.get();
offeredSharedResources = _offeredResources.shared();
LOG(INFO) << "Applying CREATE operation for volumes "
<< operation.create().volumes() << " from framework "
<< *framework << " to agent " << *slave;
_apply(slave, framework, operation);
conversions.insert(
conversions.end(),
_conversions->begin(),
_conversions->end());
break;
}
case Offer::Operation::DESTROY: {
CHECK(!authorizations.empty());
Future<bool> authorization = authorizations.front();
authorizations.pop_front();
CHECK(!authorization.isDiscarded());
if (authorization.isFailed()) {
// TODO(greggomann): We may want to retry this failed authorization
// request rather than dropping it immediately.
drop(framework,
operation,
"Authorization of principal '" + framework->info.principal() +
"' to destroy persistent volumes failed: " +
authorization.failure());
continue;
} else if (!authorization.get()) {
drop(framework,
operation,
"Not authorized to destroy persistent volumes as '" +
framework->info.principal() + "'");
continue;
}
// Make sure this destroy operation is valid.
Option<Error> error = validation::operation::validate(
operation.destroy(),
slave->checkpointedResources,
slave->usedResources,
slave->pendingTasks);
if (error.isSome()) {
drop(framework, operation, error->message);
continue;
}
// If any offer from this slave contains a volume that needs
// to be destroyed, we should process it, but we should also
// rescind those offers.
foreach (Offer* offer, utils::copy(slave->offers)) {
const Resources& offered = offer->resources();
foreach (const Resource& volume, operation.destroy().volumes()) {
if (offered.contains(volume)) {
allocator->recoverResources(
offer->framework_id(),
offer->slave_id(),
offered,
None());
removeOffer(offer, true);
// This offer may contain other volumes that are being destroyed.
// However, we have already rescinded it, so we should move on
// to the next offer.
break;
}
}
}
// Test the given operation on the included resources.
Try<vector<ResourceConversion>> _conversions =
getResourceConversions(operation);
if (_conversions.isError()) {
drop(framework, operation, _conversions.error());
continue;
}
Try<Resources> resources = _offeredResources.apply(_conversions.get());
if (resources.isError()) {
drop(framework, operation, resources.error());
continue;
}
_offeredResources = resources.get();
offeredSharedResources = _offeredResources.shared();
LOG(INFO) << "Applying DESTROY operation for volumes "
<< operation.destroy().volumes() << " from framework "
<< *framework << " to agent " << *slave;
_apply(slave, framework, operation);
conversions.insert(
conversions.end(),
_conversions->begin(),
_conversions->end());
break;
}
case Offer::Operation::GROW_VOLUME: {
CHECK(!authorizations.empty());
Future<bool> authorization = authorizations.front();
authorizations.pop_front();
CHECK(!authorization.isDiscarded());
if (authorization.isFailed()) {
// TODO(greggomann): We may want to retry this failed authorization
// request rather than dropping it immediately.
drop(framework,
operation,
"Authorization of principal '" + framework->info.principal() +
"' to grow a volume failed: " +
authorization.failure());
continue;
} else if (!authorization.get()) {
drop(framework,
operation,
"Not authorized to grow a volume as '" +
framework->info.principal() + "'");
continue;
}
// Make sure this grow volume operation is valid.
Option<Error> error = validation::operation::validate(
operation.grow_volume(), slave->capabilities);
if (error.isSome()) {
drop(
framework,
operation,
error->message + "; on agent " + stringify(*slave));
continue;
}
// TODO(zhitao): Convert this operation to non-speculative once we can
// support that in the operator API.
Try<vector<ResourceConversion>> _conversions =
getResourceConversions(operation);
if (_conversions.isError()) {
drop(framework, operation, _conversions.error());
continue;
}
CHECK_EQ(1u, _conversions->size());
const Resources& consumed = _conversions->at(0).consumed;
const Resources& converted = _conversions->at(0).converted;
if (!_offeredResources.contains(consumed)) {
drop(
framework,
operation,
"Invalid GROW_VOLUME operation: " +
stringify(_offeredResources) + " does not contain " +
stringify(consumed));
continue;
}
_offeredResources -= consumed;
resizedResources += converted;
LOG(INFO) << "Processing GROW_VOLUME operation for volume "
<< operation.grow_volume().volume()
<< " with additional resource "
<< operation.grow_volume().addition()
<< " from framework "
<< *framework << " on agent " << *slave;
_apply(slave, framework, operation);
conversions.insert(
conversions.end(),
_conversions->begin(),
_conversions->end());
break;
}
case Offer::Operation::SHRINK_VOLUME: {
CHECK(!authorizations.empty());
Future<bool> authorization = authorizations.front();
authorizations.pop_front();
CHECK(!authorization.isDiscarded());
if (authorization.isFailed()) {
// TODO(greggomann): We may want to retry this failed authorization
// request rather than dropping it immediately.
drop(framework,
operation,
"Authorization of principal '" + framework->info.principal() +
"' to shrink a volume failed: " +
authorization.failure());
continue;
} else if (!authorization.get()) {
drop(framework,
operation,
"Not authorized to shrink a volume as '" +
framework->info.principal() + "'");
continue;
}
// Make sure this shrink volume operation is valid.
Option<Error> error = validation::operation::validate(
operation.shrink_volume(), slave->capabilities);
if (error.isSome()) {
drop(
framework,
operation,
error->message + "; on agent " + stringify(*slave));
continue;
}
// TODO(zhitao): Convert this operation to non-speculative once we can
// support that in the operator API.
Try<vector<ResourceConversion>> _conversions =
getResourceConversions(operation);
if (_conversions.isError()) {
drop(framework, operation, _conversions.error());
continue;
}
CHECK_EQ(1u, _conversions->size());
const Resources& consumed = _conversions->at(0).consumed;
const Resources& converted = _conversions->at(0).converted;
if (!_offeredResources.contains(consumed)) {
drop(
framework,
operation,
"Invalid SHRINK_VOLUME operation: " +
stringify(_offeredResources) + " does not contain " +
stringify(consumed));
continue;
}
_offeredResources -= consumed;
resizedResources += converted;
LOG(INFO) << "Processing SHRINK_VOLUME operation for volume "
<< operation.shrink_volume().volume()
<< " subtracting scalar value "
<< operation.shrink_volume().subtract()
<< " from framework "
<< *framework << " on agent " << *slave;
_apply(slave, framework, operation);
conversions.insert(
conversions.end(),
_conversions->begin(),
_conversions->end());
break;
}
case Offer::Operation::LAUNCH: {
foreach (const TaskInfo& task, operation.launch().task_infos()) {
CHECK(!authorizations.empty());
Future<bool> authorization = authorizations.front();
authorizations.pop_front();
// The task will not be in `pendingTasks` if it has been
// killed in the interim. No need to send TASK_KILLED in
// this case as it has already been sent. Note however that
// we cannot currently distinguish between the task being
// killed and the task having a duplicate TaskID within
// `pendingTasks`. Therefore we must still validate the task
// to ensure we send the TASK_ERROR in the case that it has a
// duplicate TaskID.
//
// TODO(bmahler): We may send TASK_ERROR after a TASK_KILLED
// if a task was killed (removed from `pendingTasks`) *and*
// the task is invalid or unauthorized here.
bool pending = framework->pendingTasks.contains(task.task_id());
framework->pendingTasks.erase(task.task_id());
slave->pendingTasks[framework->id()].erase(task.task_id());
if (slave->pendingTasks[framework->id()].empty()) {
slave->pendingTasks.erase(framework->id());
}
CHECK(!authorization.isDiscarded());
if (authorization.isFailed() || !authorization.get()) {
string user = framework->info.user(); // Default user.
if (task.has_command() && task.command().has_user()) {
user = task.command().user();
} else if (task.has_executor() &&
task.executor().command().has_user()) {
user = task.executor().command().user();
}
const StatusUpdate& update = protobuf::createStatusUpdate(
framework->id(),
task.slave_id(),
task.task_id(),
TASK_ERROR,
TaskStatus::SOURCE_MASTER,
None(),
authorization.isFailed() ?
"Authorization failure: " + authorization.failure() :
"Not authorized to launch as user '" + user + "'",
TaskStatus::REASON_TASK_UNAUTHORIZED);
metrics->tasks_error++;
metrics->incrementTasksStates(
TASK_ERROR,
TaskStatus::SOURCE_MASTER,
TaskStatus::REASON_TASK_UNAUTHORIZED);
forward(update, UPID(), framework);
continue; // Continue to the next task.
}
// Validate the task.
// We add back offered shared resources for validation even if they
// are already consumed by other tasks in the same ACCEPT call. This
// allows these tasks to use more copies of the same shared resource
// than those being offered. e.g., 2 tasks can be launched on 1 copy
// of a shared persistent volume from the offer; 3 tasks can be
// launched on 2 copies of a shared persistent volume from 2 offers.
Resources available =
_offeredResources.nonShared() + offeredSharedResources;
Option<Error> error =
validation::task::validate(task, framework, slave, available);
if (error.isSome()) {
const StatusUpdate& update = protobuf::createStatusUpdate(
framework->id(),
task.slave_id(),
task.task_id(),
TASK_ERROR,
TaskStatus::SOURCE_MASTER,
None(),
error->message,
TaskStatus::REASON_TASK_INVALID);
metrics->tasks_error++;
metrics->incrementTasksStates(
TASK_ERROR,
TaskStatus::SOURCE_MASTER,
TaskStatus::REASON_TASK_INVALID);
forward(update, UPID(), framework);
continue; // Continue to the next task.
}
// Add task.
if (pending) {
Resources consumed;
bool launchExecutor = true;
if (task.has_executor()) {
launchExecutor = isLaunchExecutor(
task.executor().executor_id(), framework, slave);
// The master tracks the new executor only if the task is not a
// command task.
if (launchExecutor) {
addExecutor(task.executor(), framework, slave);
consumed += task.executor().resources();
}
}
addTask(task, framework, slave);
consumed += task.resources();
CHECK(available.contains(consumed))
<< available << " does not contain " << consumed;
// Determine the additional instances of shared resources
// that need to be added to the allocations, since we support
// tasks requesting more instances of shared resources
// than those being offered.
const Resources& consumedShared = consumed.shared();
// Check that offered resources contain at least one copy
// of each consumed shared resource (guaranteed by master
// validation).
foreach (const Resource& resource, consumedShared) {
CHECK(offeredSharedResources.contains(resource));
}
Resources additional = consumedShared - _offeredResources.shared();
if (!additional.empty()) {
LOG(INFO) << "Allocating additional resources " << additional
<< " for task " << task.task_id()
<< " of framework " << *framework
<< " on agent " << *slave;
conversions.emplace_back(Resources(), additional);
}
_offeredResources -= consumed;
RunTaskMessage message;
message.mutable_framework()->MergeFrom(framework->info);
hashmap<Option<ResourceProviderID>, UUID> resourceVersions;
if (slave->resourceVersion.isSome()) {
resourceVersions.put(None(), slave->resourceVersion.get());
}
foreachpair (
const ResourceProviderID& resourceProviderId,
const Slave::ResourceProvider& resourceProvider,
slave->resourceProviders) {
resourceVersions.put(
resourceProviderId, resourceProvider.resourceVersion);
}
message.mutable_resource_version_uuids()->CopyFrom(
protobuf::createResourceVersions(resourceVersions));
// TODO(anand): We set 'pid' to UPID() for http frameworks
// as 'pid' was made optional in 0.24.0. In 0.25.0, we
// no longer have to set pid here for http frameworks.
message.set_pid(framework->pid.getOrElse(UPID()));
message.mutable_task()->MergeFrom(task);
message.set_launch_executor(launchExecutor);
if (HookManager::hooksAvailable()) {
// Set labels retrieved from label-decorator hooks.
message.mutable_task()->mutable_labels()->CopyFrom(
HookManager::masterLaunchTaskLabelDecorator(
task,
framework->info,
slave->info));
}
// If the agent does not support reservation refinement, downgrade
// the task / executor resources to the "pre-reservation-refinement"
// format. This cannot contain any refined reservations since
// the master rejects attempts to create refined reservations
// on non-capable agents.
if (!slave->capabilities.reservationRefinement) {
CHECK_SOME(downgradeResources(&message));
}
LOG(INFO) << "Launching task " << task.task_id() << " of framework "
<< *framework << " with resources " << task.resources()
<< " on agent " << *slave << " on "
<< (launchExecutor ?
"new executor" : "existing executor");
// Increment this metric here for LAUNCH since it
// does not make use of the `_apply()` function.
framework->metrics.incrementOperation(operation);
send(slave->pid, message);
}
}
break;
}
case Offer::Operation::LAUNCH_GROUP: {
// We must ensure that the entire group can be launched. This
// means all tasks in the group must be authorized and valid.
// If any tasks in the group have been killed in the interim
// we must kill the entire group.
const ExecutorInfo& executor = operation.launch_group().executor();
const TaskGroupInfo& taskGroup = operation.launch_group().task_group();
// Remove all the tasks from being pending.
hashset<TaskID> killed;
foreach (const TaskInfo& task, taskGroup.tasks()) {
bool pending = framework->pendingTasks.contains(task.task_id());
framework->pendingTasks.erase(task.task_id());
slave->pendingTasks[framework->id()].erase(task.task_id());
if (slave->pendingTasks[framework->id()].empty()) {
slave->pendingTasks.erase(framework->id());
}
if (!pending) {
killed.insert(task.task_id());
}
}
// Note that we do not fill in the `ExecutorInfo.framework_id`
// since we do not have to support backwards compatibility like
// in the `Launch` operation case.
// TODO(bmahler): Consider injecting some default (cpus, mem, disk)
// resources when the framework omits the executor resources.
// See if there are any authorization or validation errors.
// Note that we only report a single error
// for the group.
//
// TODO(anindya_sinha): If task group uses shared resources, this
// validation needs to be enhanced to accommodate multiple copies
// of shared resources across tasks within the task group.
Option<Error> error;
Option<TaskStatus::Reason> reason;
// NOTE: We check for the authorization errors first and never break the
// loop to ensure that all authorization futures for this task group are
// iterated through.
foreach (const TaskInfo& task, taskGroup.tasks()) {
CHECK(!authorizations.empty());
Future<bool> authorization = authorizations.front();
authorizations.pop_front();
CHECK(!authorization.isDiscarded());
if (authorization.isFailed()) {
error = Error("Failed to authorize task"
" '" + stringify(task.task_id()) + "'"
": " + authorization.failure());
} else if (!authorization.get()) {
string user = framework->info.user(); // Default user.
if (task.has_command() && task.command().has_user()) {
user = task.command().user();
}
error = Error("Task '" + stringify(task.task_id()) + "'"
" is not authorized to launch as"
" user '" + user + "'");
}
}
if (error.isSome()) {
reason = TaskStatus::REASON_TASK_GROUP_UNAUTHORIZED;
} else {
error = validation::task::group::validate(
taskGroup, executor, framework, slave, _offeredResources);
if (error.isSome()) {
reason = TaskStatus::REASON_TASK_GROUP_INVALID;
}
}
if (error.isSome()) {
CHECK_SOME(reason);
// NOTE: If some of these invalid or unauthorized tasks were
// killed already, here we end up sending a TASK_ERROR after
// having already sent TASK_KILLED.
foreach (const TaskInfo& task, taskGroup.tasks()) {
const StatusUpdate& update = protobuf::createStatusUpdate(
framework->id(),
task.slave_id(),
task.task_id(),
TASK_ERROR,
TaskStatus::SOURCE_MASTER,
None(),
error->message,
reason.get());
metrics->tasks_error++;
metrics->incrementTasksStates(
TASK_ERROR, TaskStatus::SOURCE_MASTER, reason.get());
forward(update, UPID(), framework);
}
continue;
}
// If task(s) were killed, send TASK_KILLED for
// all of the remaining tasks, since a TaskGroup must
// be delivered in its entirety.
//
// TODO(bmahler): Do this killing when processing
// the `Kill` call, rather than doing it here.
if (!killed.empty()) {
foreach (const TaskInfo& task, taskGroup.tasks()) {
if (!killed.contains(task.task_id())) {
const StatusUpdate& update = protobuf::createStatusUpdate(
framework->id(),
task.slave_id(),
task.task_id(),
TASK_KILLED,
TaskStatus::SOURCE_MASTER,
None(),
"A task within the task group was killed before"
" delivery to the agent",
TaskStatus::REASON_TASK_KILLED_DURING_LAUNCH);
metrics->tasks_killed++;
// TODO(bmahler): Increment the task state source metric;
// we currently cannot because each source requires a reason.
forward(update, UPID(), framework);
}
}
continue;
}
// Now launch the task group!
RunTaskGroupMessage message;
message.mutable_framework()->CopyFrom(framework->info);
message.mutable_executor()->CopyFrom(executor);
message.mutable_task_group()->CopyFrom(taskGroup);
hashmap<Option<ResourceProviderID>, UUID> resourceVersions;
if (slave->resourceVersion.isSome()) {
resourceVersions.put(None(), slave->resourceVersion.get());
}
foreachpair (
const ResourceProviderID& resourceProviderId,
const Slave::ResourceProvider& resourceProvider,
slave->resourceProviders) {
resourceVersions.put(
resourceProviderId, resourceProvider.resourceVersion);
}
message.mutable_resource_version_uuids()->CopyFrom(
protobuf::createResourceVersions(resourceVersions));
set<TaskID> taskIds;
Resources totalResources;
Resources executorResources;
bool launchExecutor =
isLaunchExecutor(executor.executor_id(), framework, slave);
if (launchExecutor) {
addExecutor(executor, framework, slave);
executorResources = executor.resources();
totalResources += executorResources;
}
message.set_launch_executor(launchExecutor);
foreach (
TaskInfo& task, *message.mutable_task_group()->mutable_tasks()) {
taskIds.insert(task.task_id());
totalResources += task.resources();
addTask(task, framework, slave);
if (HookManager::hooksAvailable()) {
// Set labels retrieved from label-decorator hooks.
task.mutable_labels()->CopyFrom(
HookManager::masterLaunchTaskLabelDecorator(
task,
framework->info,
slave->info));
}
}
CHECK(_offeredResources.contains(totalResources))
<< _offeredResources << " does not contain " << totalResources;
_offeredResources -= totalResources;
// If the agent does not support reservation refinement, downgrade
// the task and executor resources to the "pre-reservation-refinement"
// format. This cannot contain any refined reservations since
// the master rejects attempts to create refined reservations
// on non-capable agents.
if (!slave->capabilities.reservationRefinement) {
CHECK_SOME(downgradeResources(&message));
}
LOG(INFO) << "Launching task group " << stringify(taskIds)
<< " of framework " << *framework << " with resources "
<< totalResources - executorResources << " on agent "
<< *slave << " on "
<< (launchExecutor ? " new executor" : " existing executor");
// Increment this metric here for LAUNCH_GROUP since it
// does not make use of the `_apply()` function.
framework->metrics.incrementOperation(operation);
send(slave->pid, message);
break;
}
case Offer::Operation::CREATE_DISK: {
const Resource::DiskInfo::Source::Type diskType =
operation.create_disk().target_type();
CHECK(!authorizations.empty());
Future<bool> authorization = authorizations.front();
authorizations.pop_front();
CHECK(!authorization.isDiscarded());
if (authorization.isFailed()) {
// TODO(greggomann): We may want to retry this failed authorization
// request rather than dropping it immediately.
drop(framework,
operation,
"Authorization of principal '" + framework->info.principal() +
"' to create a " + stringify(diskType) + " disk failed: " +
authorization.failure());
continue;
} else if (!authorization.get()) {
drop(framework,
operation,
"Not authorized to create a " + stringify(diskType) +
" disk as '" + framework->info.principal() + "'");
continue;
}
if (!slave->capabilities.resourceProvider) {
drop(framework,
operation,
"Not supported on agent " + stringify(*slave) +
" because it does not have RESOURCE_PROVIDER capability");
continue;
}
Option<Error> error = validation::operation::validate(
operation.create_disk());
if (error.isSome()) {
drop(framework, operation, error->message);
continue;
}
const Resource& consumed = operation.create_disk().source();
if (!_offeredResources.contains(consumed)) {
drop(framework,
operation,
"Invalid CREATE_DISK Operation: " +
stringify(_offeredResources) + " does not contain " +
stringify(consumed));
continue;
}
_offeredResources -= consumed;
LOG(INFO) << "Processing CREATE_DISK operation with source "
<< operation.create_disk().source() << " from framework "
<< *framework << " to agent " << *slave;
_apply(slave, framework, operation);
break;
}
case Offer::Operation::DESTROY_DISK: {
const Resource::DiskInfo::Source::Type diskType =
operation.destroy_disk().source().disk().source().type();
CHECK(!authorizations.empty());
Future<bool> authorization = authorizations.front();
authorizations.pop_front();
CHECK(!authorization.isDiscarded());
if (authorization.isFailed()) {
// TODO(greggomann): We may want to retry this failed authorization
// request rather than dropping it immediately.
drop(framework,
operation,
"Authorization of principal '" + framework->info.principal() +
"' to destroy a " + stringify(diskType) + " disk failed: " +
authorization.failure());
continue;
} else if (!authorization.get()) {
drop(framework,
operation,
"Not authorized to destroy a " + stringify(diskType) +
" disk as '" + framework->info.principal() + "'");
continue;
}
if (!slave->capabilities.resourceProvider) {
drop(framework,
operation,
"Not supported on agent " + stringify(*slave) +
" because it does not have RESOURCE_PROVIDER capability");
continue;
}
Option<Error> error = validation::operation::validate(
operation.destroy_disk());
if (error.isSome()) {
drop(framework, operation, error->message);
continue;
}
const Resource& consumed = operation.destroy_disk().source();
if (!_offeredResources.contains(consumed)) {
drop(framework,
operation,
"Invalid DESTROY_DISK Operation: " +
stringify(_offeredResources) + " does not contain " +
stringify(consumed));
continue;
}
_offeredResources -= consumed;
LOG(INFO) << "Processing DESTROY_DISK operation for volume "
<< operation.destroy_disk().source() << " from framework "
<< *framework << " to agent " << *slave;
_apply(slave, framework, operation);
break;
}
case Offer::Operation::UNKNOWN: {
LOG(WARNING) << "Ignoring unknown operation";
break;
}
}
}
CHECK(authorizations.empty())
<< "Authorization results not processed: "
<< stringify(
vector<Future<bool>>(authorizations.begin(), authorizations.end()));
// Update the allocator based on the operations.
if (!conversions.empty()) {
allocator->updateAllocation(
frameworkId,
slaveId,
offeredResources,
conversions);
}
// We now need to compute the amounts of remaining (1) speculatively converted
// resources to recover without a filter and (2) resources that are implicitly
// declined with the filter:
//
// Speculatively converted resources
// = (offered resources).apply(speculative operations)
// - resources consumed by non-speculative operations
// - offered resources not consumed by any operation
// = `_offeredResources` - offered resources not consumed by any operation
// = `_offeredResources` - offered resources
//
// (The last equality holds because resource subtraction yields no negatives.)
//
// Implicitly declined resources
// = (offered resources).apply(speculative operations)
// - resources consumed by non-speculative operations
// - speculatively converted resources
// = `_offeredResources` - speculatively converted resources
//
// TODO(zhitao): Right now `GROW_VOLUME` and `SHRINK_VOLUME` are implemented
// as speculative operations. Since the plan is to make them non-speculative
// in the future, their results are not in `_offeredResources`, so we add them
// back here. Remove this once the operations become non-speculative.
Resources speculativelyConverted =
_offeredResources + resizedResources - offeredResources;
Resources implicitlyDeclined = _offeredResources - speculativelyConverted;
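// For example, suppose cpus:10 (unreserved) were offered, a RESERVE
// speculatively converted 4 of them into reserved CPUs, and a LAUNCH
// consumed 2 unreserved CPUs. Then `_offeredResources` is 4 unreserved +
// 4 reserved CPUs; `speculativelyConverted` is the 4 reserved CPUs (the
// unreserved portion subtracts away to zero), which are recovered below
// without a filter; and `implicitlyDeclined` is the remaining 4 unreserved
// CPUs, which are recovered with the decline filter.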
// Tell the allocator about the net speculatively converted resources. These
// resources should not be implicitly declined.
if (!speculativelyConverted.empty()) {
allocator->recoverResources(
frameworkId, slaveId, speculativelyConverted, None());
}
// Tell the allocator about the implicitly declined resources.
if (!implicitlyDeclined.empty()) {
allocator->recoverResources(
frameworkId, slaveId, implicitlyDeclined, accept.filters());
}
}


void Master::acceptInverseOffers(
Framework* framework,
const scheduler::Call::AcceptInverseOffers& accept)
{
CHECK_NOTNULL(framework);
Option<Error> error;
if (accept.inverse_offer_ids().size() == 0) {
error = Error("No inverse offers specified");
} else {
LOG(INFO) << "Processing ACCEPT_INVERSE_OFFERS call for inverse offers: "
<< accept.inverse_offer_ids() << " for framework " << *framework;
// Validate the inverse offers.
error = validation::offer::validateInverseOffers(
accept.inverse_offer_ids(),
this,
framework);
// Update each inverse offer in the allocator with the accept and
// filter.
// TODO(anand): Notify the framework if some of the offers were invalid.
foreach (const OfferID& offerId, accept.inverse_offer_ids()) {
InverseOffer* inverseOffer = getInverseOffer(offerId);
if (inverseOffer != nullptr) {
mesos::allocator::InverseOfferStatus status;
status.set_status(mesos::allocator::InverseOfferStatus::ACCEPT);
status.mutable_framework_id()->CopyFrom(inverseOffer->framework_id());
status.mutable_timestamp()->CopyFrom(protobuf::getCurrentTime());
allocator->updateInverseOffer(
inverseOffer->slave_id(),
inverseOffer->framework_id(),
UnavailableResources{
inverseOffer->resources(),
inverseOffer->unavailability()},
status,
accept.filters());
removeInverseOffer(inverseOffer);
continue;
}
// If the offer was not in our inverse offer set, then this
// offer is no longer valid.
LOG(WARNING) << "Ignoring accept of inverse offer " << offerId
<< " since it is no longer valid";
}
}
if (error.isSome()) {
LOG(WARNING) << "ACCEPT_INVERSE_OFFERS call used invalid offers '"
<< accept.inverse_offer_ids() << "': " << error->message;
}
}


void Master::decline(
Framework* framework,
scheduler::Call::Decline&& decline)
{
CHECK_NOTNULL(framework);
LOG(INFO) << "Processing DECLINE call for offers: " << decline.offer_ids()
<< " for framework " << *framework;
++metrics->messages_decline_offers;
size_t offersDeclined = 0;
// Return resources to the allocator.
foreach (const OfferID& offerId, decline.offer_ids()) {
Offer* offer = getOffer(offerId);
if (offer != nullptr) {
allocator->recoverResources(
offer->framework_id(),
offer->slave_id(),
offer->resources(),
decline.filters());
removeOffer(offer);
offersDeclined++;
continue;
}
// If the offer was not in our offer set, then this offer is no
// longer valid.
LOG(WARNING) << "Ignoring decline of offer " << offerId
<< " since it is no longer valid";
}
framework->metrics.offers_declined += offersDeclined;
}


void Master::declineInverseOffers(
Framework* framework,
const scheduler::Call::DeclineInverseOffers& decline)
{
CHECK_NOTNULL(framework);
LOG(INFO) << "Processing DECLINE_INVERSE_OFFERS call for inverse offers: "
<< decline.inverse_offer_ids() << " for framework " << *framework;
// Update each inverse offer in the allocator with the decline and
// filter.
foreach (const OfferID& offerId, decline.inverse_offer_ids()) {
// Try it as an inverse offer. If this fails then the offer is no
// longer valid.
InverseOffer* inverseOffer = getInverseOffer(offerId);
if (inverseOffer != nullptr) { // If this is an inverse offer.
mesos::allocator::InverseOfferStatus status;
status.set_status(mesos::allocator::InverseOfferStatus::DECLINE);
status.mutable_framework_id()->CopyFrom(inverseOffer->framework_id());
status.mutable_timestamp()->CopyFrom(protobuf::getCurrentTime());
allocator->updateInverseOffer(
inverseOffer->slave_id(),
inverseOffer->framework_id(),
UnavailableResources{
inverseOffer->resources(),
inverseOffer->unavailability()},
status,
decline.filters());
removeInverseOffer(inverseOffer);
continue;
}
// If the offer was not in our inverse offer set, then this
// offer is no longer valid.
LOG(WARNING) << "Ignoring decline of inverse offer " << offerId
<< " since it is no longer valid";
}
}
void Master::reviveOffers(
const UPID& from,
const FrameworkID& frameworkId,
const vector<string>& roles)
{
Framework* framework = getFramework(frameworkId);
if (framework == nullptr) {
LOG(WARNING)
<< "Ignoring revive offers message for framework " << frameworkId
<< " because the framework cannot be found";
return;
}
if (framework->pid != from) {
LOG(WARNING)
<< "Ignoring revive offers message for framework " << *framework
<< " because it is not expected from " << from;
return;
}
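// Wrap the requested roles in a REVIVE call so this driver-based
// path funnels into the same revive() handler used below.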
scheduler::Call::Revive call;
foreach (const string& role, roles) {
call.add_roles(role);
}
revive(framework, call);
}
void Master::revive(
Framework* framework,
const scheduler::Call::Revive& revive)
{
CHECK_NOTNULL(framework);
LOG(INFO) << "Processing REVIVE call for framework " << *framework;
++metrics->messages_revive_offers;
set<string> roles;
// Validate the roles, if provided. We need to make sure the
// roles are valid and contained within the framework's
// subscribed roles. Note that if a single role is invalid, we
// drop the entire call and do not revive the valid roles.
foreach (const string& role, revive.roles()) {
Option<Error> roleError = roles::validate(role);
if (roleError.isSome()) {
drop(framework,
revive,
"revive role '" + role + "' is invalid: " + roleError->message);
return;
}
if (framework->roles.count(role) == 0) {
drop(framework,
revive,
"revive role '" + role + "' is not one"
" of the frameworks's subscribed roles");
return;
}
roles.insert(role);
}
allocator->reviveOffers(framework->id(), roles);
}
void Master::killTask(
const UPID& from,
const FrameworkID& frameworkId,
const TaskID& taskId)
{
Framework* framework = getFramework(frameworkId);
if (framework == nullptr) {
LOG(WARNING)
<< "Ignoring kill task message for task " << taskId << " of framework "
<< frameworkId << " because the framework cannot be found";
return;
}
if (framework->pid != from) {
LOG(WARNING)
<< "Ignoring kill task message for task " << taskId << " of framework "
<< *framework << " because it is not expected from " << from;
return;
}
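// Wrap the task ID in a KILL call and reuse the common kill() handler.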
scheduler::Call::Kill call;
call.mutable_task_id()->CopyFrom(taskId);
// NOTE: Kill policy in kill task is not supported for schedulers
// sending `KillTaskMessage` instead of `scheduler::Call::Kill`.
kill(framework, call);
}
void Master::kill(Framework* framework, const scheduler::Call::Kill& kill)
{
CHECK_NOTNULL(framework);
const TaskID& taskId = kill.task_id();
const Option<SlaveID> slaveId =
kill.has_slave_id() ? Option<SlaveID>(kill.slave_id()) : None();
LOG(INFO) << "Processing KILL call for task '" << taskId << "'"
<< " of framework " << *framework;
++metrics->messages_kill_task;
if (framework->pendingTasks.contains(taskId)) {
// Remove from pending tasks.
framework->pendingTasks.erase(taskId);
if (slaveId.isSome()) {
Slave* slave = slaves.registered.get(slaveId.get());
if (slave != nullptr) {
slave->pendingTasks[framework->id()].erase(taskId);
if (slave->pendingTasks[framework->id()].empty()) {
slave->pendingTasks.erase(framework->id());
}
}
}
const StatusUpdate& update = protobuf::createStatusUpdate(
framework->id(),
slaveId,
taskId,
TASK_KILLED,
TaskStatus::SOURCE_MASTER,
None(),
"Killed before delivery to the agent",
TaskStatus::REASON_TASK_KILLED_DURING_LAUNCH);
forward(update, UPID(), framework);
return;
}
Task* task = framework->getTask(taskId);
if (task == nullptr) {
LOG(WARNING) << "Cannot kill task " << taskId
<< " of framework " << *framework
<< " because it is unknown; performing reconciliation";
scheduler::Call::Reconcile message;
scheduler::Call::Reconcile::Task* t = message.add_tasks();
*t->mutable_task_id() = taskId;
if (slaveId.isSome()) {
*t->mutable_slave_id() = slaveId.get();
}
reconcile(framework, std::move(message));
return;
}
if (slaveId.isSome() && slaveId.get() != task->slave_id()) {
LOG(WARNING) << "Cannot kill task " << taskId << " of agent "
<< slaveId.get() << " of framework " << *framework
<< " because it belongs to different agent "
<< task->slave_id();
// TODO(vinod): Return a "Bad Request" when using HTTP API.
return;
}
Slave* slave = slaves.registered.get(task->slave_id());
CHECK(slave != nullptr) << "Unknown agent " << task->slave_id();
// We add the task to 'killedTasks' here because the slave
// might be partitioned or disconnected but the master
// doesn't know it yet.
slave->killedTasks.put(framework->id(), taskId);
// NOTE: This task will be properly reconciled when the disconnected slave
// reregisters with the master.
// We send the KillTaskMessage even if we have already sent one, just in case
// the previous one was dropped by the network but it didn't trigger a slave
// re-registration (and hence reconciliation).
if (slave->connected) {
LOG(INFO) << "Telling agent " << *slave
<< " to kill task " << taskId
<< " of framework " << *framework;
KillTaskMessage message;
message.mutable_framework_id()->MergeFrom(framework->id());
message.mutable_task_id()->MergeFrom(taskId);
if (kill.has_kill_policy()) {
message.mutable_kill_policy()->MergeFrom(kill.kill_policy());
}
send(slave->pid, message);
} else {
LOG(WARNING) << "Cannot kill task " << taskId
<< " of framework " << *framework
<< " because the agent " << *slave << " is disconnected."
<< " Kill will be retried if the agent reregisters";
}
}
void Master::statusUpdateAcknowledgement(
const UPID& from,
StatusUpdateAcknowledgementMessage&& statusUpdateAcknowledgementMessage)
{
const SlaveID& slaveId =
statusUpdateAcknowledgementMessage.slave_id();
const FrameworkID& frameworkId =
statusUpdateAcknowledgementMessage.framework_id();
const TaskID& taskId =
statusUpdateAcknowledgementMessage.task_id();
const string& uuid =
statusUpdateAcknowledgementMessage.uuid();
// TODO(bmahler): Consider adding a message validator abstraction
// for the master that takes care of all this boilerplate. Ideally
// by the time we process messages in the critical master code, we
// can assume that they are valid. This will become especially
// important as validation logic is moved out of the scheduler
// driver and into the master.
Try<id::UUID> uuid_ = id::UUID::fromBytes(uuid);
if (uuid_.isError()) {
LOG(WARNING)
<< "Ignoring status update acknowledgement "
<< " for task " << taskId << " of framework " << frameworkId
<< " on agent " << slaveId << " due to: " << uuid_.error();
metrics->invalid_status_update_acknowledgements++;
return;
}
Framework* framework = getFramework(frameworkId);
if (framework == nullptr) {
LOG(WARNING)
<< "Ignoring status update acknowledgement for status "
<< uuid_.get() << " of task " << taskId << " of framework "
<< frameworkId << " on agent " << slaveId << " because the framework "
<< "cannot be found";
metrics->invalid_status_update_acknowledgements++;
return;
}
if (framework->pid != from) {
LOG(WARNING)
<< "Ignoring status update acknowledgement for status "
<< uuid_.get() << " of task " << taskId << " of framework "
<< *framework << " on agent " << slaveId << " because it is not "
<< "expected from " << from;
metrics->invalid_status_update_acknowledgements++;
return;
}
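// Translate the acknowledgement message into an ACKNOWLEDGE call and
// hand it to the common acknowledge() handler; fields are moved to
// avoid copying.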
scheduler::Call::Acknowledge message;
*message.mutable_slave_id() =
std::move(*statusUpdateAcknowledgementMessage.mutable_slave_id());
*message.mutable_task_id() =
std::move(*statusUpdateAcknowledgementMessage.mutable_task_id());
*message.mutable_uuid() =
std::move(*statusUpdateAcknowledgementMessage.mutable_uuid());
acknowledge(framework, std::move(message));
}
void Master::acknowledge(
Framework* framework,
scheduler::Call::Acknowledge&& acknowledge)
{
CHECK_NOTNULL(framework);
metrics->messages_status_update_acknowledgement++;
const SlaveID& slaveId = acknowledge.slave_id();
const TaskID& taskId = acknowledge.task_id();
Try<id::UUID> uuid_ = id::UUID::fromBytes(acknowledge.uuid());
CHECK_SOME(uuid_);
const id::UUID uuid = uuid_.get();
Slave* slave = slaves.registered.get(slaveId);
if (slave == nullptr) {
LOG(WARNING)
<< "Cannot send status update acknowledgement for status " << uuid
<< " of task " << taskId << " of framework " << *framework
<< " to agent " << slaveId << " because agent is not registered";
metrics->invalid_status_update_acknowledgements++;
return;
}
if (!slave->connected) {
LOG(WARNING)
<< "Cannot send status update acknowledgement for status " << uuid
<< " of task " << taskId << " of framework " << *framework
<< " to agent " << *slave << " because agent is disconnected";
metrics->invalid_status_update_acknowledgements++;
return;
}
LOG(INFO)
<< "Processing ACKNOWLEDGE call for status " << uuid
<< " for task " << taskId
<< " of framework " << *framework
<< " on agent " << slaveId;
Task* task = slave->getTask(framework->id(), taskId);
if (task != nullptr) {
// Status update state and uuid should be either set or unset
// together.
CHECK_EQ(task->has_status_update_uuid(), task->has_status_update_state());
if (!task->has_status_update_state()) {
// Task should have status update state set because it must have
// been set when the update corresponding to this
// acknowledgement was processed by the master. But in case this
// acknowledgement was intended for the old run of the master
// and the task belongs to a 0.20.0 slave, we could be here.
// Dropping the acknowledgement is safe because the slave will
// retry the update, at which point the master will set the
// status update state.
LOG(WARNING)
<< "Ignoring status update acknowledgement for status " << uuid
<< " of task " << taskId << " of framework " << *framework
<< " to agent " << *slave << " because the update was not"
<< " sent by this master";
metrics->invalid_status_update_acknowledgements++;
return;
}
// Remove the task once the terminal update is acknowledged.
if (protobuf::isTerminalState(task->status_update_state()) &&
id::UUID::fromBytes(task->status_update_uuid()).get() == uuid) {
removeTask(task);
}
}
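// Forward the acknowledgement to the agent; the agent stops retrying
// the corresponding status update once it receives this.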
StatusUpdateAcknowledgementMessage message;
*message.mutable_slave_id() = std::move(*acknowledge.mutable_slave_id());
*message.mutable_framework_id() = framework->id();
*message.mutable_task_id() = std::move(*acknowledge.mutable_task_id());
*message.mutable_uuid() = std::move(*acknowledge.mutable_uuid());
send(slave->pid, message);
metrics->valid_status_update_acknowledgements++;
}
void Master::acknowledgeOperationStatus(
Framework* framework,
scheduler::Call::AcknowledgeOperationStatus&& acknowledge)
{
CHECK_NOTNULL(framework);
metrics->messages_operation_status_update_acknowledgement++;
const OperationID& operationId = acknowledge.operation_id();
Try<id::UUID> statusUuid_ = id::UUID::fromBytes(acknowledge.uuid());
CHECK_SOME(statusUuid_);
const id::UUID statusUuid = statusUuid_.get();
CHECK(acknowledge.has_slave_id());
const SlaveID& slaveId = acknowledge.slave_id();
CHECK(acknowledge.has_resource_provider_id());
Slave* slave = slaves.registered.get(slaveId);
if (slave == nullptr) {
LOG(WARNING)
<< "Cannot send operation status update acknowledgement for status "
<< statusUuid << " of operation '" << operationId << "'"
<< " of framework " << *framework << " to agent " << slaveId
<< " because agent is not registered";
metrics->invalid_operation_status_update_acknowledgements++;
return;
}
if (!slave->connected) {
LOG(WARNING)
<< "Cannot send operation status update acknowledgement for status "
<< statusUuid << " of operation '" << operationId << "'"
<< " of framework " << *framework << " to agent " << slaveId
<< " because agent is disconnected";
metrics->invalid_operation_status_update_acknowledgements++;
return;
}
if (!slave->capabilities.resourceProvider) {
LOG(WARNING)
<< "Cannot send operation status update acknowledgement for status "
<< statusUuid << " of operation '" << operationId << "'"
<< " of framework " << *framework << " to agent " << slaveId
<< " because the agent does not support resource providers";
metrics->invalid_operation_status_update_acknowledgements++;
return;
}
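// Frameworks address operations by operation ID, while the master and
// agents track them by UUID, so look up the UUID recorded for this
// framework's operation.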
const Option<UUID> operationUuid_ =
framework->operationUUIDs.get(operationId);
if (operationUuid_.isNone()) {
LOG(WARNING)
<< "Cannot send operation status update acknowledgement for status "
<< statusUuid << " of operation '" << operationId << "'"
<< " of framework" << *framework << " to agent " << slaveId
<< " because the operation is unknown";
metrics->invalid_operation_status_update_acknowledgements++;
return;
}
const UUID operationUuid = operationUuid_.get();
Operation* operation = slave->getOperation(operationUuid);
CHECK_NOTNULL(operation);
auto it = std::find_if(
operation->statuses().begin(),
operation->statuses().end(),
[&statusUuid](const OperationStatus& operationStatus) {
return operationStatus.has_uuid() &&
operationStatus.uuid().value() == statusUuid.toBytes();
});
if (it == operation->statuses().end()) {
LOG(WARNING)
<< "Ignoring operation status acknowledgement for status " << statusUuid
<< " of operation '" << operationId << "'"
<< " (uuid " << operationUuid << ")"
<< " of framework" << *framework
<< " because the operation status is unknown";
metrics->invalid_status_update_acknowledgements++;
return;
}
const OperationStatus& acknowledgedStatus = *it;
LOG(INFO) << "Processing ACKNOWLEDGE_OPERATION_STATUS call for status "
<< statusUuid << " of operation '" << operationId << "'"
<< " (uuid " << operationUuid << ")"
<< " of framework " << *framework << " on agent " << slaveId;
// If the acknowledged status update is terminal, remove the operation.
if (protobuf::isTerminalState(acknowledgedStatus.state())) {
removeOperation(operation);
}
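// Forward the acknowledgement to the agent, addressed by the
// operation's UUID and the resource provider that reported the status.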
AcknowledgeOperationStatusMessage message;
message.mutable_status_uuid()->set_value(statusUuid.toBytes());
*message.mutable_operation_uuid() = std::move(operationUuid);
*message.mutable_resource_provider_id() =
std::move(*acknowledge.mutable_resource_provider_id());
send(slave->pid, message);
metrics->valid_operation_status_update_acknowledgements++;
}
void Master::schedulerMessage(
const UPID& from,
FrameworkToExecutorMessage&& frameworkToExecutorMessage)
{
const FrameworkID& frameworkId = frameworkToExecutorMessage.framework_id();
const ExecutorID& executorId = frameworkToExecutorMessage.executor_id();
Framework* framework = getFramework(frameworkId);
if (framework == nullptr) {
LOG(WARNING) << "Ignoring framework message"
<< " for executor '" << executorId << "'"
<< " of framework " << frameworkId
<< " because the framework cannot be found";
metrics->invalid_framework_to_executor_messages++;
return;
}
if (framework->pid != from) {
LOG(WARNING)
<< "Ignoring framework message for executor '" << executorId
<< "' of framework " << *framework
<< " because it is not expected from " << from;
metrics->invalid_framework_to_executor_messages++;
return;
}
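// Repackage the driver-style message as a MESSAGE call and reuse the
// common message() handler; fields are moved to avoid copying.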
scheduler::Call::Message message_;
*message_.mutable_slave_id() =
std::move(*frameworkToExecutorMessage.mutable_slave_id());
*message_.mutable_executor_id() =
std::move(*frameworkToExecutorMessage.mutable_executor_id());
*message_.mutable_data() =
std::move(*frameworkToExecutorMessage.mutable_data());
message(framework, std::move(message_));
}
void Master::executorMessage(
const UPID& from,
ExecutorToFrameworkMessage&& executorToFrameworkMessage)
{
const SlaveID& slaveId = executorToFrameworkMessage.slave_id();
const FrameworkID& frameworkId = executorToFrameworkMessage.framework_id();
const ExecutorID& executorId = executorToFrameworkMessage.executor_id();
metrics->messages_executor_to_framework++;
if (slaves.removed.get(slaveId).isSome()) {
// If the slave has been removed, drop the executor message. The
// master is no longer trying to health check this slave; when the
// slave realizes it hasn't received any pings from the master, it
// will eventually try to reregister.
LOG(WARNING) << "Ignoring executor message"
<< " from executor" << " '" << executorId << "'"
<< " of framework " << frameworkId
<< " on removed agent " << slaveId;
metrics->invalid_executor_to_framework_messages++;
return;
}
// The slave should (re-)register with the master before
// forwarding executor messages.
Slave* slave = slaves.registered.get(slaveId);
if (slave == nullptr) {
LOG(WARNING) << "Ignoring executor message"
<< " from executor '" << executorId << "'"
<< " of framework " << frameworkId
<< " on unknown agent " << slaveId;
metrics->invalid_executor_to_framework_messages++;
return;
}
Framework* framework = getFramework(frameworkId);
if (framework == nullptr) {
LOG(WARNING) << "Not forwarding executor message"
<< " for executor '" << executorId << "'"
<< " of framework " << frameworkId
<< " on agent " << *slave
<< " because the framework is unknown";
metrics->invalid_executor_to_framework_messages++;
return;
}
if (!framework->connected()) {
LOG(WARNING) << "Not forwarding executor message for executor '"
<< executorId << "' of framework " << frameworkId
<< " on agent " << *slave
<< " because the framework is disconnected";
metrics->invalid_executor_to_framework_messages++;
return;
}
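// Relay the executor's message to the framework unchanged; the data
// payload is opaque to the master.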
ExecutorToFrameworkMessage message;
*message.mutable_slave_id() =
std::move(*executorToFrameworkMessage.mutable_slave_id());
*message.mutable_framework_id() =
std::move(*executorToFrameworkMessage.mutable_framework_id());
*message.mutable_executor_id() =
std::move(*executorToFrameworkMessage.mutable_executor_id());
*message.mutable_data() =
std::move(*executorToFrameworkMessage.mutable_data());
framework->send(message);
metrics->valid_executor_to_framework_messages++;
}
void Master::message(
Framework* framework,
scheduler::Call::Message&& message)
{
CHECK_NOTNULL(framework);
metrics->messages_framework_to_executor++;
Slave* slave = slaves.registered.get(message.slave_id());
if (slave == nullptr) {
LOG(WARNING) << "Cannot send framework message for framework "
<< *framework << " to agent " << message.slave_id()
<< " because agent is not registered";
metrics->invalid_framework_to_executor_messages++;
return;
}
if (!slave->connected) {
LOG(WARNING) << "Cannot send framework message for framework "
<< *framework << " to agent " << *slave
<< " because agent is disconnected";
metrics->invalid_framework_to_executor_messages++;
return;
}
LOG(INFO) << "Processing MESSAGE call from framework "
<< *framework << " to agent " << *slave;
FrameworkToExecutorMessage message_;
*message_.mutable_slave_id() = std::move(*message.mutable_slave_id());
*message_.mutable_framework_id() = framework->id();
*message_.mutable_executor_id() = std::move(*message.mutable_executor_id());
*message_.mutable_data() = std::move(*message.mutable_data());
send(slave->pid, message_);
metrics->valid_framework_to_executor_messages++;
}
void Master::registerSlave(
const UPID& from,
RegisterSlaveMessage&& registerSlaveMessage)
{
++metrics->messages_register_slave;
if (authenticating.contains(from)) {
LOG(INFO) << "Queuing up registration request from " << from
<< " because authentication is still in progress";
authenticating[from]
.onReady(defer(self(),
&Self::registerSlave,
from,
std::move(registerSlaveMessage)));
return;
}
if (flags.authenticate_agents && !authenticated.contains(from)) {
// This could happen if another authentication request came
// through before we got here, or if a slave tried to register
// without authentication.
LOG(WARNING) << "Refusing registration of agent at " << from
<< " because it is not authenticated";
return;
}
Option<Error> error =
validation::master::message::registerSlave(registerSlaveMessage);
if (error.isSome()) {
LOG(WARNING) << "Dropping registration of agent at " << from
<< " because it sent an invalid registration: "
<< error->message;
return;
}
if (slaves.registering.contains(from)) {
LOG(INFO) << "Ignoring register agent message from " << from
<< " (" << registerSlaveMessage.slave().hostname()
<< ") as registration is already in progress";
return;
}
LOG(INFO) << "Received register agent message from " << from
<< " (" << registerSlaveMessage.slave().hostname() << ")";
slaves.registering.insert(from);
// Update all resources passed by the agent to `POST_RESERVATION_REFINEMENT`
// format. We do this as early as possible so that we only use a single
// format inside master, and downgrade again if necessary when they leave the
// master (e.g. when writing to the registry).
upgradeResources(&registerSlaveMessage);
// Note that the principal may be empty if authentication is not
// required. Also it is passed along because it may be removed from
// `authenticated` while the authorization is pending.
Option<Principal> principal = authenticated.contains(from)
? Principal(authenticated.at(from))
: Option<Principal>::none();
// Calling the `onAny` continuation below separately so we can move
// `registerSlaveMessage` without it being evaluated before it's used
// by `authorizeSlave`.
Future<bool> authorization =
authorizeSlave(registerSlaveMessage.slave(), principal);
authorization
.onAny(defer(self(),
&Self::_registerSlave,
from,
std::move(registerSlaveMessage),
principal,
lambda::_1));
}
void Master::_registerSlave(
const UPID& pid,
RegisterSlaveMessage&& registerSlaveMessage,
const Option<Principal>& principal,
const Future<bool>& authorized)
{
CHECK(!authorized.isDiscarded());
CHECK(slaves.registering.contains(pid));
const SlaveInfo& slaveInfo = registerSlaveMessage.slave();
Option<string> authorizationError = None();
if (authorized.isFailed()) {
authorizationError = "Authorization failure: " + authorized.failure();
} else if (!authorized.get()) {
authorizationError =
"Not authorized to register agent providing resources "
"'" + stringify(Resources(slaveInfo.resources())) + "' " +
(principal.isSome()
? "with principal '" + stringify(principal.get()) + "'"
: "without a principal");
}
if (authorizationError.isSome()) {
LOG(WARNING) << "Refusing registration of agent at " << pid
<< " (" << slaveInfo.hostname() << ")"
<< ": " << authorizationError.get();
slaves.registering.erase(pid);
return;
}
VLOG(1) << "Authorized registration of agent at " << pid
<< " (" << slaveInfo.hostname() << ")";
MachineID machineId;
machineId.set_hostname(slaveInfo.hostname());
machineId.set_ip(stringify(pid.address.ip));
// Slaves are not allowed to register while the machine they are on is in
// `DOWN` mode.
if (machines.contains(machineId) &&
machines[machineId].info.mode() == MachineInfo::DOWN) {
LOG(WARNING) << "Refusing registration of agent at " << pid
<< " because the machine '" << machineId << "' that it is "
<< "running on is `DOWN`";
ShutdownMessage message;
message.set_message("Machine is `DOWN`");
send(pid, message);
slaves.registering.erase(pid);
return;
}
// Ignore registration attempts by agents running old Mesos versions.
// We expect that the agent's version is in SemVer format; if the
// version cannot be parsed, the registration attempt is ignored.
const string& version = registerSlaveMessage.version();
Try<Version> parsedVersion = Version::parse(version);
if (parsedVersion.isError()) {
LOG(WARNING) << "Failed to parse version '" << version << "'"
<< " of agent at " << pid << ": " << parsedVersion.error()
<< "; ignoring agent registration attempt";
slaves.registering.erase(pid);
return;
} else if (parsedVersion.get() < MINIMUM_AGENT_VERSION) {
LOG(WARNING) << "Ignoring registration attempt from old agent at "
<< pid << ": agent version is " << parsedVersion.get()
<< ", minimum supported agent version is "
<< MINIMUM_AGENT_VERSION;
slaves.registering.erase(pid);
return;
}
// If the agent is configured with a domain but the master is not,
// we can't determine whether the agent is remote. To be safe, we
// don't allow the agent to register. We don't shutdown the agent so
// that any tasks on the agent can continue to run.
//
// TODO(neilc): Consider sending a warning to agent (MESOS-7615).
if (slaveInfo.has_domain() && !info_.has_domain()) {
LOG(WARNING) << "Agent at " << pid << " is configured with "
<< "domain " << slaveInfo.domain() << " "
<< "but the master has no configured domain. "
<< "Ignoring agent registration attempt";
slaves.registering.erase(pid);
return;
}
// Don't allow agents without domain if domains are required.
// We don't shutdown the agent to allow it to restart itself with
// the correct domain and without losing tasks.
if (flags.require_agent_domain && !slaveInfo.has_domain()) {
LOG(WARNING) << "Agent at " << pid << " attempted to register without "
<< "a domain, but this master is configured to require agent "
<< "domains. Ignoring agent registration attempt";
slaves.registering.erase(pid);
return;
}
// Check if this slave is already registered (because it retries).
if (Slave* slave = slaves.registered.get(pid)) {
if (!slave->connected) {
// The slave was previously disconnected but it is now trying
// to register as a new slave.
// There are several possible reasons for this to happen:
// - If the slave failed recovery and hence registered as a new
// slave before the master removed the old slave from its map.
// - If the slave was shutting down while it had a registration
// retry scheduled. See MESOS-8463.
LOG(INFO) << "Removing old disconnected agent " << *slave
<< " because a registration attempt occurred";
removeSlave(slave,
"a new agent registered at the same address",
metrics->slave_removals_reason_registered);
} else {
CHECK(slave->active)
<< "Unexpected connected but deactivated agent " << *slave;
LOG(INFO) << "Agent " << *slave << " already registered,"
<< " resending acknowledgement";
Duration pingTimeout =
flags.agent_ping_timeout * flags.max_agent_ping_timeouts;
MasterSlaveConnection connection;
connection.set_total_ping_timeout_seconds(pingTimeout.secs());
SlaveRegisteredMessage message;
message.mutable_slave_id()->CopyFrom(slave->id);
message.mutable_connection()->CopyFrom(connection);
send(pid, message);
slaves.registering.erase(pid);
return;
}
}
// Create and add the slave id.
SlaveID slaveId = newSlaveId();
LOG(INFO) << "Registering agent at " << pid << " ("
<< slaveInfo.hostname() << ") with id " << slaveId;
SlaveInfo slaveInfo_ = slaveInfo;
slaveInfo_.mutable_id()->CopyFrom(slaveId);
registerSlaveMessage.mutable_slave()->mutable_id()->CopyFrom(slaveId);
registrar->apply(Owned<RegistryOperation>(new AdmitSlave(slaveInfo_)))
.onAny(defer(self(),
&Self::__registerSlave,
pid,
std::move(registerSlaveMessage),
lambda::_1));
}
void Master::__registerSlave(
const UPID& pid,
RegisterSlaveMessage&& registerSlaveMessage,
const Future<bool>& admit)
{
CHECK(slaves.registering.contains(pid));
CHECK(!admit.isDiscarded());
const SlaveInfo& slaveInfo = registerSlaveMessage.slave();
if (admit.isFailed()) {
LOG(FATAL) << "Failed to admit agent " << slaveInfo.id() << " at " << pid
<< " (" << slaveInfo.hostname() << "): " << admit.failure();
}
if (!admit.get()) {
// This should only happen if there is a slaveID collision, but that
// is extremely unlikely in practice: slaveIDs are prefixed with the
// master ID, which is a randomly generated UUID. In this situation,
// we ignore the registration attempt. The slave will eventually try
// to register again and be assigned a new slave ID.
LOG(WARNING) << "Agent " << slaveInfo.id() << " at " << pid
<< " (" << slaveInfo.hostname() << ") was assigned"
<< " an agent ID that already appears in the registry;"
<< " ignoring registration attempt";
slaves.registering.erase(pid);
return;
}
VLOG(1) << "Admitted agent " << slaveInfo.id() << " at " << pid
<< " (" << slaveInfo.hostname() << ")";
MachineID machineId;
machineId.set_hostname(slaveInfo.hostname());
machineId.set_ip(stringify(pid.address.ip));
vector<SlaveInfo::Capability> agentCapabilities = google::protobuf::convert(
std::move(*registerSlaveMessage.mutable_agent_capabilities()));
vector<Resource> checkpointedResources = google::protobuf::convert(
std::move(*registerSlaveMessage.mutable_checkpointed_resources()));
Option<UUID> resourceVersion;
if (registerSlaveMessage.has_resource_version_uuid()) {
resourceVersion = registerSlaveMessage.resource_version_uuid();
}
Slave* slave = new Slave(
this,
slaveInfo,
pid,
machineId,
registerSlaveMessage.version(),
std::move(agentCapabilities),
Clock::now(),
std::move(checkpointedResources),
resourceVersion);
++metrics->slave_registrations;
addSlave(slave, {});
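// Acknowledge the registration, including the total ping timeout the
// agent should use to detect an unresponsive master.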
Duration pingTimeout =
flags.agent_ping_timeout * flags.max_agent_ping_timeouts;
MasterSlaveConnection connection;
connection.set_total_ping_timeout_seconds(pingTimeout.secs());
SlaveRegisteredMessage message;
message.mutable_slave_id()->CopyFrom(slave->id);
message.mutable_connection()->CopyFrom(connection);
send(slave->pid, message);
// Note that we convert to `Resources` for output as it's faster than
// logging raw protobuf data. Conversion is safe, as resources have
// already passed validation.
LOG(INFO) << "Registered agent " << *slave
<< " with " << Resources(slave->info.resources());
slaves.registering.erase(pid);
}
void Master::reregisterSlave(
const UPID& from,
ReregisterSlaveMessage&& reregisterSlaveMessage)
{
++metrics->messages_reregister_slave;
if (authenticating.contains(from)) {
LOG(INFO) << "Queuing up re-registration request from " << from
<< " because authentication is still in progress";
authenticating[from]
.onReady(defer(self(),
&Self::reregisterSlave,
from,
std::move(reregisterSlaveMessage)));
return;
}
if (flags.authenticate_agents && !authenticated.contains(from)) {
// This could happen if another authentication request came
// through before we got here, or if a slave tried to
// reregister without authentication.
LOG(WARNING) << "Refusing re-registration of agent at " << from
<< " because it is not authenticated";
return;
}
// TODO(bevers): Technically this behaviour seems to be incorrect, since we
// discard the newer re-registration attempt, which might have additional
// capabilities or a higher version (or a changed SlaveInfo, after Mesos 1.5).
// However, this should very rarely happen in practice, and nobody seems to
// have complained about it so far.
const SlaveInfo& slaveInfo = reregisterSlaveMessage.slave();
if (slaves.reregistering.contains(slaveInfo.id())) {
LOG(INFO)
<< "Ignoring reregister agent message from agent "
<< slaveInfo.id() << " at " << from << " ("
<< slaveInfo.hostname() << ") as re-registration is already in progress";
return;
}
if (slaves.markingGone.contains(slaveInfo.id())) {
LOG(INFO)
<< "Ignoring reregister agent message from agent "
<< slaveInfo.id() << " at " << from << " ("
<< slaveInfo.hostname() << ") as a gone operation is already in progress";
return;
}
if (slaves.gone.contains(slaveInfo.id())) {
LOG(WARNING) << "Refusing re-registration of agent at " << from
<< " because it is already marked gone";
ShutdownMessage message;
message.set_message("Agent has been marked gone");
send(from, message);
return;
}
Option<Error> error =
validation::master::message::reregisterSlave(reregisterSlaveMessage);
if (error.isSome()) {
LOG(WARNING) << "Dropping re-registration of agent at " << from
<< " because it sent an invalid re-registration: "
<< error->message;
return;
}
LOG(INFO) << "Received reregister agent message from agent "
<< slaveInfo.id() << " at " << from << " ("
<< slaveInfo.hostname() << ")";
// TODO(bevers): Create a guard object calling `insert()` in its constructor
// and `erase()` in its destructor, to avoid the manual bookkeeping.
slaves.reregistering.insert(slaveInfo.id());
// Update all resources passed by the agent to `POST_RESERVATION_REFINEMENT`
// format. We do this as early as possible so that we only use a single
// format inside master, and downgrade again if necessary when they leave the
// master (e.g. when writing to the registry).
upgradeResources(&reregisterSlaveMessage);
// Note that the principal may be empty if authentication is not
// required. Also it is passed along because it may be removed from
// `authenticated` while the authorization is pending.
Option<Principal> principal = authenticated.contains(from)
? Principal(authenticated.at(from))
: Option<Principal>::none();
// Calling the `onAny` continuation below separately so we can move
// `reregisterSlaveMessage` without it being evaluated before it's used
// by `authorizeSlave`.
Future<bool> authorization =
authorizeSlave(reregisterSlaveMessage.slave(), principal);
authorization
.onAny(defer(self(),
&Self::_reregisterSlave,
from,
std::move(reregisterSlaveMessage),
principal,
lambda::_1));
}
void Master::_reregisterSlave(
const UPID& pid,
ReregisterSlaveMessage&& reregisterSlaveMessage,
const Option<Principal>& principal,
const Future<bool>& authorized)
{
CHECK(!authorized.isDiscarded());
const SlaveInfo& slaveInfo = reregisterSlaveMessage.slave();
CHECK(slaves.reregistering.contains(slaveInfo.id()));
Option<string> authorizationError = None();
if (authorized.isFailed()) {
authorizationError = "Authorization failure: " + authorized.failure();
} else if (!authorized.get()) {
authorizationError =
"Not authorized to reregister agent providing resources "
"'" + stringify(Resources(slaveInfo.resources())) + "' " +
(principal.isSome()
? "with principal '" + stringify(principal.get()) + "'"
: "without a principal");
}
if (authorizationError.isSome()) {
LOG(WARNING) << "Refusing re-registration of agent " << slaveInfo.id()
<< " at " << pid << " (" << slaveInfo.hostname() << ")"
<< ": " << authorizationError.get();
slaves.reregistering.erase(slaveInfo.id());
return;
}
if (slaves.markingGone.contains(slaveInfo.id())) {
LOG(INFO)
<< "Ignoring reregister agent message from agent "
<< slaveInfo.id() << " at " << pid << " ("
<< slaveInfo.hostname() << ") as a gone operation is already in progress";
slaves.reregistering.erase(slaveInfo.id());
return;
}
if (slaves.gone.contains(slaveInfo.id())) {
LOG(WARNING) << "Refusing re-registration of agent at " << pid
<< " because it is already marked gone";
ShutdownMessage message;
message.set_message("Agent has been marked gone");
send(pid, message);
slaves.reregistering.erase(slaveInfo.id());
return;
}
VLOG(1) << "Authorized re-registration of agent " << slaveInfo.id()
<< " at " << pid << " (" << slaveInfo.hostname() << ")";
MachineID machineId;
machineId.set_hostname(slaveInfo.hostname());
machineId.set_ip(stringify(pid.address.ip));
// Slaves are not allowed to reregister while the machine they are on is in
// `DOWN` mode.
if (machines.contains(machineId) &&
machines[machineId].info.mode() == MachineInfo::DOWN) {
LOG(WARNING) << "Refusing re-registration of agent at " << pid
<< " because the machine '" << machineId << "' that it is "
<< "running on is `DOWN`";
ShutdownMessage message;
message.set_message("Machine is `DOWN`");
send(pid, message);
slaves.reregistering.erase(slaveInfo.id());
return;
}
// Ignore re-registration attempts by agents running old Mesos versions.
// We expect that the agent's version is in SemVer format; if the
// version cannot be parsed, the re-registration attempt is ignored.
const string& version = reregisterSlaveMessage.version();
Try<Version> parsedVersion = Version::parse(version);
if (parsedVersion.isError()) {
LOG(WARNING) << "Failed to parse version '" << version << "'"
<< " of agent at " << pid << ": " << parsedVersion.error()
<< "; ignoring agent re-registration attempt";
slaves.reregistering.erase(slaveInfo.id());
return;
} else if (parsedVersion.get() < MINIMUM_AGENT_VERSION) {
LOG(WARNING) << "Ignoring re-registration attempt from old agent at "
<< pid << ": agent version is " << parsedVersion.get()
<< ", minimum supported agent version is "
<< MINIMUM_AGENT_VERSION;
slaves.reregistering.erase(slaveInfo.id());
return;
}
// If the agent is configured with a domain but the master is not,
// we can't determine whether the agent is remote. To be safe, we
// don't allow the agent to reregister. We don't shutdown the agent
// so that any tasks on the agent can continue to run.
//
// TODO(neilc): Consider sending a warning to agent (MESOS-7615).
if (slaveInfo.has_domain() && !info_.has_domain()) {
LOG(WARNING) << "Agent at " << pid << " is configured with "
<< "domain " << slaveInfo.domain() << " "
<< "but the master has no configured domain."
<< "Ignoring agent re-registration attempt";
slaves.reregistering.erase(slaveInfo.id());
return;
}
// Don't allow agents without domain if domains are required.
// We don't shutdown the agent to allow it to restart itself with
// the correct domain and without losing tasks.
if (flags.require_agent_domain && !slaveInfo.has_domain()) {
LOG(WARNING) << "Agent at " << pid << " attempted to register without "
<< "a domain, but this master is configured to require agent "
<< "domains. Ignoring agent re-registration attempt";
slaves.reregistering.erase(slaveInfo.id());
return;
}
if (Slave* slave = slaves.registered.get(slaveInfo.id())) {
CHECK(!slaves.recovered.contains(slaveInfo.id()));
// NOTE: This handles the case where a slave tries to
// reregister with an existing master (e.g. because of a
// spurious Zookeeper session expiration or after the slave
// recovers after a restart).
// For now, we assume this slave is not nefarious (eventually
// this will be handled by orthogonal security measures like key
// based authentication).
VLOG(1) << "Agent is already marked as registered: " << slaveInfo.id()
<< " at " << pid << " (" << slaveInfo.hostname() << ")";
// We don't allow reregistering this way with a different IP or
// hostname. This is because maintenance is scheduled at the
// machine level; so we would need to re-validate the slave's
// unavailability if the machine it is running on changed.
if (slave->pid.address.ip != pid.address.ip ||
slave->info.hostname() != slaveInfo.hostname()) {
LOG(WARNING) << "Agent " << slaveInfo.id() << " at " << pid
<< " (" << slaveInfo.hostname() << ") attempted to "
<< "reregister with different IP / hostname; expected "
<< slave->pid.address.ip << " (" << slave->info.hostname()
<< ") shutting it down";
ShutdownMessage message;
message.set_message(
"Agent attempted to reregister with different IP / hostname");
send(pid, message);
slaves.reregistering.erase(slaveInfo.id());
return;
}
// Skip updating the registry if `slaveInfo` did not change from its
// previously known state.
if (slaveInfo == slave->info) {
___reregisterSlave(
pid,
std::move(reregisterSlaveMessage),
true);
} else {
registrar->apply(Owned<RegistryOperation>(new UpdateSlave(slaveInfo)))
.onAny(defer(self(),
&Self::___reregisterSlave,
pid,
std::move(reregisterSlaveMessage),
lambda::_1));
}
} else if (slaves.recovered.contains(slaveInfo.id())) {
// The agent likely is reregistering after a master failover as it
// is in the list recovered from the registry.
VLOG(1) << "Re-admitting recovered agent " << slaveInfo.id()
<< " at " << pid << "(" << slaveInfo.hostname() << ")";
SlaveInfo recoveredInfo = slaves.recovered.at(slaveInfo.id());
// Skip updating the registry if `slaveInfo` did not change from its
// previously known state (see also MESOS-7711).
if (slaveInfo == recoveredInfo) {
__reregisterSlave(
pid,
std::move(reregisterSlaveMessage),
true);
} else {
registrar->apply(Owned<RegistryOperation>(new UpdateSlave(slaveInfo)))
.onAny(defer(self(),
&Self::__reregisterSlave,
pid,
std::move(reregisterSlaveMessage),
lambda::_1));
}
} else {
// In the common case, the slave has been marked unreachable
// by the master, so we move the slave to the reachable list and
// readmit it. If the slave isn't in the unreachable list (which
// might occur if the slave's entry in the unreachable list is
// GC'd), we admit the slave anyway.
VLOG(1) << "Consulting registry about agent " << slaveInfo.id()
<< " at " << pid << "(" << slaveInfo.hostname() << ")";
registrar->apply(Owned<RegistryOperation>(
new MarkSlaveReachable(slaveInfo)))
.onAny(defer(self(),
&Self::__reregisterSlave,
pid,
std::move(reregisterSlaveMessage),
lambda::_1));
}
}
void Master::__reregisterSlave(
const UPID& pid,
ReregisterSlaveMessage&& reregisterSlaveMessage,
const Future<bool>& future)
{
const SlaveInfo& slaveInfo = reregisterSlaveMessage.slave();
CHECK(slaves.reregistering.contains(slaveInfo.id()));
if (future.isFailed()) {
LOG(FATAL) << "Failed to update registry for agent " << slaveInfo.id()
<< " at " << pid << " (" << slaveInfo.hostname() << "): "
<< future.failure();
}
CHECK(!future.isDiscarded());
// Neither the `UpdateSlave` nor `MarkSlaveReachable` registry operations
// should ever fail.
CHECK(future.get());
if (slaves.markingGone.contains(slaveInfo.id())) {
LOG(INFO)
<< "Ignoring reregister agent message from agent "
<< slaveInfo.id() << " at " << pid << " ("
<< slaveInfo.hostname() << ") as a gone operation is already in progress";
slaves.reregistering.erase(slaveInfo.id());
return;
}
if (slaves.gone.contains(slaveInfo.id())) {
LOG(WARNING) << "Refusing re-registration of agent at " << pid
<< " because it is already marked gone";
ShutdownMessage message;
message.set_message("Agent has been marked gone");
send(pid, message);
slaves.reregistering.erase(slaveInfo.id());
return;
}
VLOG(1) << "Re-admitted agent " << slaveInfo.id() << " at " << pid
<< " (" << slaveInfo.hostname() << ")";
// For agents without the MULTI_ROLE capability,
// we need to inject the allocation role inside
// the task and executor resources.
auto injectAllocationInfo = [](
RepeatedPtrField<Resource>* resources,
const FrameworkInfo& frameworkInfo)
{
set<string> roles = protobuf::framework::getRoles(frameworkInfo);
foreach (Resource& resource, *resources) {
if (!resource.has_allocation_info()) {
if (roles.size() != 1) {
LOG(FATAL) << "Missing 'Resource.AllocationInfo' for resources"
<< " allocated to MULTI_ROLE framework"
<< " '" << frameworkInfo.name() << "'";
}
resource.mutable_allocation_info()->set_role(*roles.begin());
}
}
};
vector<SlaveInfo::Capability> agentCapabilities =
google::protobuf::convert(reregisterSlaveMessage.agent_capabilities());
// Adjust the agent's task and executor infos to ensure
// compatibility with old agents without certain capabilities.
protobuf::slave::Capabilities slaveCapabilities(agentCapabilities);
// If the agent is not multi-role capable, inject allocation info.
if (!slaveCapabilities.multiRole) {
hashmap<FrameworkID, reference_wrapper<const FrameworkInfo>> frameworks;
foreach (const FrameworkInfo& framework,
reregisterSlaveMessage.frameworks()) {
frameworks.emplace(framework.id(), framework);
}
foreach (Task& task, *reregisterSlaveMessage.mutable_tasks()) {
CHECK(frameworks.contains(task.framework_id()));
injectAllocationInfo(
task.mutable_resources(),
frameworks.at(task.framework_id()));
}
foreach (ExecutorInfo& executor,
*reregisterSlaveMessage.mutable_executor_infos()) {
CHECK(frameworks.contains(executor.framework_id()));
injectAllocationInfo(
executor.mutable_resources(),
frameworks.at(executor.framework_id()));
}
}
MachineID machineId;
machineId.set_hostname(slaveInfo.hostname());
machineId.set_ip(stringify(pid.address.ip));
// For easy lookup, first determine the set of FrameworkIDs on the
// reregistering agent that are partition-aware.
hashset<FrameworkID> partitionAwareFrameworks;
foreach (const FrameworkInfo& framework,
reregisterSlaveMessage.frameworks()) {
if (protobuf::frameworkHasCapability(
framework, FrameworkInfo::Capability::PARTITION_AWARE)) {
partitionAwareFrameworks.insert(framework.id());
}
}
// All tasks except the ones from completed frameworks are re-added to the
// master (those tasks were previously marked "unreachable", so they
// should be removed from that collection).
vector<Task> recoveredTasks;
foreach (Task& task, *reregisterSlaveMessage.mutable_tasks()) {
const FrameworkID& frameworkId = task.framework_id();
// Don't re-add tasks whose framework has been shutdown at the
// master. Such frameworks will be shutdown on the agent below.
if (isCompletedFramework(frameworkId)) {
continue;
}
if (!slaves.recovered.contains(slaveInfo.id())) {
Framework* framework = getFramework(frameworkId);
if (framework != nullptr) {
framework->unreachableTasks.erase(task.task_id());
// The master transitions a task to a terminal state on its own in certain
// scenarios (e.g., framework or agent teardown) before instructing the
// agent to remove it. However, we are not guaranteed that the message
// reaches the agent and is processed by it. If the agent fails to act
// on the message, tasks the master has declared terminal might reappear
// from the agent as non-terminal, see e.g., MESOS-9940.
//
// Avoid tracking a task as both terminal and non-terminal by
// garbage-collecting completed tasks which come back as running.
framework->completedTasks.erase(
std::remove_if(
framework->completedTasks.begin(),
framework->completedTasks.end(),
[&](const Owned<Task>& task_) {
return task_.get() && task_->task_id() == task.task_id();
}),
framework->completedTasks.end());
}
const string message = slaves.unreachable.contains(slaveInfo.id())
? "Unreachable agent re-reregistered"
: "Unknown agent reregistered";
const StatusUpdate& update = protobuf::createStatusUpdate(
task.framework_id(),
task.slave_id(),
task.task_id(),
task.state(),
TaskStatus::SOURCE_MASTER,
None(),
message,
TaskStatus::REASON_SLAVE_REREGISTERED,
(task.has_executor_id()
? Option<ExecutorID>(task.executor_id()) : None()),
protobuf::getTaskHealth(task),
protobuf::getTaskCheckStatus(task),
None(),
protobuf::getTaskContainerStatus(task));
if (framework == nullptr || !framework->connected()) {
LOG(WARNING) << "Dropping update " << update
<< (update.status().has_message()
? " '" + update.status().message() + "'"
: "")
<< " for "
<< (framework == nullptr ? "unknown" : "disconnected")
<< " framework " << frameworkId;
} else {
forward(update, UPID(), framework);
}
}
recoveredTasks.push_back(std::move(task));
}
// All tasks from this agent are now reachable so clean them up from
// the master's unreachable task records.
if (slaves.unreachableTasks.contains(slaveInfo.id())) {
foreachkey (const FrameworkID& frameworkId,
slaves.unreachableTasks.at(slaveInfo.id())) {
Framework* framework = getFramework(frameworkId);
if (framework != nullptr) {
foreach (const TaskID& taskId,
slaves.unreachableTasks.at(slaveInfo.id()).at(frameworkId)) {
framework->unreachableTasks.erase(taskId);
}
}
}
}
slaves.unreachableTasks.erase(slaveInfo.id());
vector<Resource> checkpointedResources = google::protobuf::convert(
std::move(*reregisterSlaveMessage.mutable_checkpointed_resources()));
vector<ExecutorInfo> executorInfos = google::protobuf::convert(
std::move(*reregisterSlaveMessage.mutable_executor_infos()));
Option<UUID> resourceVersion;
if (reregisterSlaveMessage.has_resource_version_uuid()) {
resourceVersion = reregisterSlaveMessage.resource_version_uuid();
}
slaves.recovered.erase(slaveInfo.id());
Slave* slave = new Slave(
this,
slaveInfo,
pid,
machineId,
reregisterSlaveMessage.version(),
std::move(agentCapabilities),
Clock::now(),
std::move(checkpointedResources),
resourceVersion,
std::move(executorInfos),
std::move(recoveredTasks));
slave->reregisteredTime = Clock::now();
++metrics->slave_reregistrations;
slaves.removed.erase(slave->id);
slaves.unreachable.erase(slave->id);
vector<Archive::Framework> completedFrameworks = google::protobuf::convert(
std::move(*reregisterSlaveMessage.mutable_completed_frameworks()));
addSlave(slave, std::move(completedFrameworks));
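// Acknowledge the re-registration with the same connection parameters
// (total ping timeout) that are sent on initial registration.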
Duration pingTimeout =
flags.agent_ping_timeout * flags.max_agent_ping_timeouts;
MasterSlaveConnection connection;
connection.set_total_ping_timeout_seconds(pingTimeout.secs());
SlaveReregisteredMessage message;
message.mutable_slave_id()->CopyFrom(slave->id);
message.mutable_connection()->CopyFrom(connection);
send(slave->pid, message);
// Note that we convert to `Resources` for output as it's faster than
// logging raw protobuf data. Conversion is safe, as resources have
// already passed validation.
LOG(INFO) << "Re-registered agent " << *slave
<< " with " << Resources(slave->info.resources());
// Any framework that is completed at the master but still running
// at the slave is shutdown. This can occur if the framework was
// removed when the slave was partitioned. NOTE: This is just a
// short-term hack because information about completed frameworks is
// lost when the master fails over. Also, we only store a limited
// number of completed frameworks. A proper fix likely involves
// storing framework information in the registry (MESOS-1719).
foreach (const FrameworkInfo& framework,
reregisterSlaveMessage.frameworks()) {
if (isCompletedFramework(framework.id())) {
LOG(INFO) << "Shutting down framework " << framework.id()
<< " at reregistered agent " << *slave
<< " because the framework has been shutdown at the master";
ShutdownFrameworkMessage message;
message.mutable_framework_id()->MergeFrom(framework.id());
send(slave->pid, message);
}
}
// TODO(bmahler): Consider moving this into `updateSlaveFrameworks`;
// it would be helpful when there is a large total number of frameworks
// in the cluster.
const vector<FrameworkInfo> frameworks = google::protobuf::convert(
std::move(*reregisterSlaveMessage.mutable_frameworks()));
updateSlaveFrameworks(slave, frameworks);
slaves.reregistering.erase(slaveInfo.id());
}
void Master::___reregisterSlave(
const process::UPID& pid,
ReregisterSlaveMessage&& reregisterSlaveMessage,
const process::Future<bool>& updated)
{
const SlaveInfo& slaveInfo = reregisterSlaveMessage.slave();
CHECK(slaves.reregistering.contains(slaveInfo.id()));
CHECK_READY(updated);
CHECK(updated.get());
VLOG(1) << "Registry updated for slave " << slaveInfo.id() << " at " << pid
<< "(" << slaveInfo.hostname() << ")";
if (slaves.markingGone.contains(slaveInfo.id())) {
LOG(INFO)
<< "Ignoring reregister agent message from agent "
<< slaveInfo.id() << " at " << pid << " ("
<< slaveInfo.hostname() << ") as a gone operation is already in progress";
slaves.reregistering.erase(slaveInfo.id());
return;
}
if (slaves.gone.contains(slaveInfo.id())) {
LOG(WARNING) << "Refusing re-registration of agent at " << pid
<< " because it is already marked gone";
ShutdownMessage message;
message.set_message("Agent has been marked gone");
send(pid, message);
slaves.reregistering.erase(slaveInfo.id());
return;
}
if (!slaves.registered.contains(slaveInfo.id())) {
LOG(WARNING)
<< "Dropping ongoing re-registration attempt of slave " << slaveInfo.id()
<< " at " << pid << "(" << slaveInfo.hostname() << ") "
<< "because the re-registration timeout was reached.";
slaves.reregistering.erase(slaveInfo.id());
// Don't send a ShutdownMessage here because tasks from partition-aware
// frameworks running on this host might still be recovered when the slave
// retries the re-registration.
return;
}
Slave* slave = slaves.registered.get(slaveInfo.id());
// Update the slave pid and relink to it.
// NOTE: Re-linking the slave here always rather than only when
// the slave is disconnected can lead to multiple exited events
// in succession for a disconnected slave. As a result, we
// ignore duplicate exited events for disconnected slaves.
// See: https://issues.apache.org/jira/browse/MESOS-675
slave->pid = pid;
link(slave->pid);
const string& version = reregisterSlaveMessage.version();
const vector<SlaveInfo::Capability> agentCapabilities =
google::protobuf::convert(reregisterSlaveMessage.agent_capabilities());
Option<UUID> resourceVersion;
if (reregisterSlaveMessage.has_resource_version_uuid()) {
resourceVersion = reregisterSlaveMessage.resource_version_uuid();
}
// Update our view of checkpointed agent resources for resource
// provider-capable agents; for other agents the master will resend
// checkpointed resources after reregistration.
const Resources checkpointedResources =
slave->capabilities.resourceProvider
? Resources(reregisterSlaveMessage.checkpointed_resources())
: slave->checkpointedResources;
Try<Nothing> stateUpdated = slave->update(
slaveInfo,
version,
agentCapabilities,
checkpointedResources,
resourceVersion);
// As of now, the only way `slave->update()` can fail is if the agent sent
// different checkpointed resources than it had before. A well-behaving
// agent shouldn't do this, so this one is either malicious or buggy. Either
// way, we refuse the re-registration attempt.
if (stateUpdated.isError()) {
LOG(WARNING) << "Refusing re-registration of agent " << slaveInfo.id()
<< " at " << pid << " (" << slaveInfo.hostname() << ")"
<< " because state update failed: " << stateUpdated.error();
ShutdownMessage message;
message.set_message(stateUpdated.error());
send(pid, message);
slaves.reregistering.erase(slaveInfo.id());
return;
}
slave->reregisteredTime = Clock::now();
allocator->updateSlave(
slave->id,
slave->info,
slave->totalResources,
agentCapabilities);
const vector<ExecutorInfo> executorInfos =
google::protobuf::convert(reregisterSlaveMessage.executor_infos());
const vector<Task> tasks =
google::protobuf::convert(reregisterSlaveMessage.tasks());
const vector<FrameworkInfo> frameworks =
google::protobuf::convert(reregisterSlaveMessage.frameworks());
// Reconcile tasks between master and slave, and send the
// `SlaveReregisteredMessage`.
reconcileKnownSlave(slave, executorInfos, tasks);
// If this is a disconnected slave, add it back to the allocator.
// This is done after reconciliation to ensure the allocator's
// offers include the recovered resources initially on this
// slave.
if (!slave->connected) {
CHECK(slave->reregistrationTimer.isSome());
Clock::cancel(slave->reregistrationTimer.get());
slave->connected = true;
dispatch(slave->observer, &SlaveObserver::reconnect);
slave->active = true;
allocator->activateSlave(slave->id);
}
CHECK(slave->active)
<< "Unexpected connected but deactivated agent " << *slave;
// Inform the agent of the new framework pids for its tasks, and
// recover any unknown frameworks from the slave info.
updateSlaveFrameworks(slave, frameworks);
slaves.reregistering.erase(slaveInfo.id());
// If the agent is not resource provider capable (legacy agent),
// send checkpointed resources to the agent. This is important for
// the cases where the master didn't fail over. In that case, the
// master might have already applied an operation that the agent
// didn't see (e.g., due to a breaking connection). This message
// will sync the state between the master and the agent about
// checkpointed resources.
//
// New agents that are resource provider capable will always
// update the master with total resources during re-registration.
// Therefore, no need to send checkpointed resources to the new
// agent in this case.
if (!slave->capabilities.resourceProvider) {
CheckpointResourcesMessage message;
message.mutable_resources()->CopyFrom(slave->checkpointedResources);
if (!slave->capabilities.reservationRefinement) {
// If the agent is not refinement-capable, don't send it
// checkpointed resources that contain refined reservations. This
// might occur if a reservation refinement is created but never
// reaches the agent (e.g., due to network partition), and then
// the agent is downgraded before the partition heals.
//
// TODO(neilc): It would probably be better to prevent the agent
// from reregistering in this scenario.
Try<Nothing> result = downgradeResources(&message);
if (result.isError()) {
LOG(WARNING) << "Not sending updated checkpointed resources "
<< slave->checkpointedResources
<< " with refined reservations, since agent " << *slave
<< " is not RESERVATION_REFINEMENT-capable.";
return;
}
}
LOG(INFO) << "Sending updated checkpointed resources "
<< slave->checkpointedResources
<< " to agent " << *slave;
send(slave->pid, message);
}
}
void Master::updateSlaveFrameworks(
Slave* slave,
const vector<FrameworkInfo>& frameworks)
{
CHECK_NOTNULL(slave);
// Send the latest framework pids to the slave.
foreach (const FrameworkInfo& frameworkInfo, frameworks) {
CHECK(frameworkInfo.has_id());
Framework* framework = getFramework(frameworkInfo.id());
if (framework != nullptr) {
// TODO(bmahler): Copying the framework info here can be
// expensive, consider only sending this message when
// there has been a change vs what the agent reported.
UpdateFrameworkMessage message;
message.mutable_framework_id()->CopyFrom(framework->id());
message.mutable_framework_info()->CopyFrom(framework->info);
// TODO(anand): We set 'pid' to UPID() for http frameworks
// as 'pid' was made optional in 0.24.0. In 0.25.0, we
// no longer have to set pid here for http frameworks.
message.set_pid(framework->pid.getOrElse(UPID()));
send(slave->pid, message);
} else {
// The agent is running a framework that the master doesn't know
// about. Recover the framework using the `FrameworkInfo`
// supplied by the agent.
// We skip recovering the framework if it has already been
// marked completed at the master. In this situation, the master
// has already told the agent to shutdown the framework in
// `__reregisterSlave`.
if (isCompletedFramework(frameworkInfo.id())) {
continue;
}
LOG(INFO) << "Recovering framework " << frameworkInfo.id()
<< " from reregistering agent " << *slave;
recoverFramework(frameworkInfo, {});
}
}
}
void Master::unregisterSlave(const UPID& from, const SlaveID& slaveId)
{
++metrics->messages_unregister_slave;
Slave* slave = slaves.registered.get(slaveId);
if (slave == nullptr) {
LOG(WARNING) << "Ignoring unregister agent message from " << from
<< " for unknown agent";
return;
}
if (slave->pid != from) {
LOG(WARNING) << "Ignoring unregister agent message from " << from
<< " because it is not the agent " << slave->pid;
return;
}
removeSlave(slave,
"the agent unregistered",
metrics->slave_removals_reason_unregistered);
}
void Master::updateFramework(
Framework* framework,
const FrameworkInfo& frameworkInfo,
const set<string>& suppressedRoles)
{
LOG(INFO) << "Updating framework " << *framework << " with roles "
<< stringify(suppressedRoles) << " suppressed";
// NOTE: The allocator takes care of activating/deactivating
// the frameworks from the added/removed roles, respectively.
allocator->updateFramework(framework->id(), frameworkInfo, suppressedRoles);
// First, remove the offers allocated to roles being removed.
foreach (Offer* offer, utils::copy(framework->offers)) {
set<string> newRoles = protobuf::framework::getRoles(frameworkInfo);
if (newRoles.count(offer->allocation_info().role()) > 0) {
continue;
}
allocator->recoverResources(
offer->framework_id(),
offer->slave_id(),
offer->resources(),
None());
removeOffer(offer, true); // Rescind!
}
framework->update(frameworkInfo);
}
void Master::updateSlave(UpdateSlaveMessage&& message)
{
++metrics->messages_update_slave;
upgradeResources(&message);
const SlaveID& slaveId = message.slave_id();
if (slaves.removed.get(slaveId).isSome()) {
// If the slave has been removed, drop the status update. The
// master is no longer trying to health check this slave; when the
// slave realizes it hasn't received any pings from the master, it
// will eventually try to reregister.
LOG(WARNING) << "Ignoring update on removed agent " << slaveId;
return;
}
Slave* slave = slaves.registered.get(slaveId);
if (slave == nullptr) {
LOG(WARNING) << "Ignoring update on removed agent " << slaveId;
return;
}
// NOTE: We must *first* update the agent's resources before we
// recover the resources. If we recovered the resources first,
// an allocation could trigger between recovering resources and
// updating the agent in the allocator. This would lead us to
// re-send out the stale oversubscribed resources!
// If the agent does not specify the `update_oversubscribed_resources`
// field, we assume the oversubscribed resources should be updated,
// for backwards compatibility with older agents (version < 1.5).
const bool hasOversubscribed =
!message.has_update_oversubscribed_resources() ||
message.update_oversubscribed_resources();
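// For illustration: how `hasOversubscribed` resolves. The field was
// introduced in 1.5, so its absence implies an older agent that
// always sends its oversubscribed total:
//
//   field absent           -> true  (agent < 1.5)
//   field present, true    -> true
//   field present, false   -> false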
Option<Resources> newOversubscribed;
if (hasOversubscribed) {
const Resources& oversubscribedResources =
message.oversubscribed_resources();
LOG(INFO) << "Received update of agent " << *slave << " with total"
<< " oversubscribed resources " << oversubscribedResources;
newOversubscribed = oversubscribedResources;
}
Resources newResourceProviderResources;
if (message.has_resource_providers()) {
foreach (
const UpdateSlaveMessage::ResourceProvider& resourceProvider,
message.resource_providers().providers()) {
newResourceProviderResources += resourceProvider.total_resources();
}
}
auto agentResources = [](const Resource& resource) {
return !resource.has_provider_id();
};
const Resources newSlaveResources =
slave->totalResources.nonRevocable().filter(agentResources) +
newOversubscribed.getOrElse(
slave->totalResources.revocable().filter(agentResources)) +
newResourceProviderResources;
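// For illustration, with hypothetical quantities: if the agent's own
// non-revocable resources are cpus:8, this update carries
// oversubscribed cpus:3 (replacing previously reported revocable
// cpus:2), and a resource provider reports disk:100, then:
//
//   newSlaveResources = cpus:8    (agent, non-revocable)
//                     + cpus:3    (revocable, from this update)
//                     + disk:100  (resource providers)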
// TODO(bbannier): We only need to update if any changes from
// resource providers are reported.
bool updated = slave->totalResources != newSlaveResources;
// Check if the agent's resource version changed.
if (!updated && message.has_resource_version_uuid() &&
(slave->resourceVersion.isNone() ||
(slave->resourceVersion.isSome() &&
message.resource_version_uuid() != slave->resourceVersion.get()))) {
updated = true;
}
// Check if the known operations for this agent changed.
if (!updated) {
// Below we loop over all received operations and check whether
// they are known to the master; operations can be unknown to the
// master after a master failover. To handle dropped operations on
// agent failover we explicitly track the received operations and
// compare them against the operations known to the master.
hashset<UUID> receivedOperations;
foreach (const Operation& operation, message.operations().operations()) {
if (!slave->operations.contains(operation.uuid())) {
updated = true;
break;
}
if (*slave->operations.at(operation.uuid()) != operation) {
updated = true;
break;
}
receivedOperations.insert(operation.uuid());
}
if (receivedOperations.size() != slave->operations.size()) {
updated = true;
}
}
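// For illustration, with hypothetical UUIDs: if the agent reports
// operations {a, b} and the master knows {a}, the loop above flags an
// update when it reaches `b`; conversely, if the agent reports {a}
// and the master knows {a, b}, the containment checks all pass and it
// is the size comparison that catches the operation missing from the
// agent's report.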
// Check if resource provider information changed.
if (!updated && message.has_resource_providers()) {
foreach (
const UpdateSlaveMessage::ResourceProvider& receivedProvider,
message.resource_providers().providers()) {
CHECK(receivedProvider.has_info());
CHECK(receivedProvider.info().has_id());
const ResourceProviderID& resourceProviderId =
receivedProvider.info().id();
if (!slave->resourceProviders.contains(resourceProviderId)) {
updated = true;
break;
}
const Slave::ResourceProvider& storedProvider =
slave->resourceProviders.at(resourceProviderId);
if (storedProvider.info != receivedProvider.info() ||
storedProvider.totalResources != receivedProvider.total_resources() ||
storedProvider.resourceVersion !=
receivedProvider.resource_version_uuid()) {
updated = true;
break;
}
foreach (
const Operation& operation,
receivedProvider.operations().operations()) {
if (!storedProvider.operations.contains(operation.uuid())) {
updated = true;
break;
}
if (*storedProvider.operations.at(operation.uuid()) != operation) {
updated = true;
break;
}
}
}
}
if (!updated) {
LOG(INFO) << "Ignoring update on agent " << *slave
<< " as it reports no changes";
return;
}
// Check invariants of the received update.
{
foreach (
const UpdateSlaveMessage::ResourceProvider& resourceProvider,
message.resource_providers().providers()) {
CHECK(resourceProvider.has_info());
CHECK(resourceProvider.info().has_id());
const ResourceProviderID& providerId = resourceProvider.info().id();
const Option<Slave::ResourceProvider>& oldProvider =
slave->resourceProviders.get(providerId);
if (oldProvider.isSome()) {
// For known resource providers the master should always know at
// least as many non-terminal operations as the agent. While an
// operation might get lost on the way to the agent or resource
// provider, or become terminal inside the agent, the master never
// transitions an operation known to the agent to a terminal state
// before the agent does.
//
// NOTE: We only consider non-terminal operations here as there is an
// edge case where the master removes a terminal operation from
// its own state when it passes on an acknowledgement from a framework
// to the agent, but the agent fails over before it can process the
// acknowledgement, or the agent initiates an unrelated
// `UpdateSlaveMessage`.
foreach (
const Operation& operation,
resourceProvider.operations().operations()) {
if (!protobuf::isTerminalState(operation.latest_status().state())) {
CHECK(oldProvider->operations.contains(operation.uuid()))
<< "Agent tried to reconcile unknown non-terminal operation "
<< operation.uuid();
}
}
}
}
}
// Update master and allocator state.
if (hasOversubscribed) {
slave->totalResources -= slave->totalResources.revocable();
slave->totalResources += message.oversubscribed_resources();
// TODO(bbannier): Track oversubscribed resources for resource
// providers as well.
}
ReconcileOperationsMessage reconcile;
// Reconcile operations on agent-default resources.
hashset<UUID> newOperations;
foreach (const Operation& operation, message.operations().operations()) {
newOperations.insert(operation.uuid());
}
foreachkey (const UUID& uuid, slave->operations) {
if (!message.has_operations() || !newOperations.contains(uuid)) {
LOG(WARNING) << "Performing explicit reconciliation with agent for"
<< " known operation " << uuid
<< " since it was not present in original"
<< " reconciliation message from agent";
ReconcileOperationsMessage::Operation* reconcileOperation =
reconcile.add_operations();
reconcileOperation->mutable_operation_uuid()->CopyFrom(uuid);
}
}
foreach (
const UpdateSlaveMessage::ResourceProvider& resourceProvider,
message.resource_providers().providers()) {
CHECK(resourceProvider.has_info());
CHECK(resourceProvider.info().has_id());
const ResourceProviderID& providerId = resourceProvider.info().id();
// Below we only add operations to our state from resource
// providers which are unknown, or possibly remove them for known
// resource providers. This works since the master should always
// know at least as many operations of a known resource provider
// as the resource provider itself.
//
// NOTE: We do not mutate operation statuses here; that is the
// responsibility of the `updateOperationStatus` handler.
//
// There still exists an edge case where the master might remove a
// terminal operation from its state when passing an
// acknowledgement from a framework on to the agent, with the
// agent failing over before the acknowledgement can be processed.
// In that case the agent would track an operation unknown to the
// master.
//
// TODO(bbannier): We might want to consider to also learn about
// new (terminal) operations when observing messages from status
// update managers to frameworks.
if (!slave->resourceProviders.contains(providerId)) {
// If this resource provider was not previously seen, the master
// must have failed over. Add its resources and operations to our
// state.
CHECK(
resourceProvider.total_resources().empty() ||
!slave->totalResources.contains(resourceProvider.total_resources()));
// We add the resource provider to the master first so
// that it can be found when e.g., adding operations.
slave->resourceProviders.put(
providerId,
{resourceProvider.info(),
resourceProvider.total_resources(),
resourceProvider.resource_version_uuid(),
{}});
hashmap<FrameworkID, Resources> usedByOperations;
foreach (
const Operation& operation,
resourceProvider.operations().operations()) {
// Update the master's bookkeeping of operations.
Framework* framework = nullptr;
if (operation.has_framework_id()) {
framework = getFramework(operation.framework_id());
}
addOperation(framework, slave, new Operation(operation));
if (!protobuf::isTerminalState(operation.latest_status().state()) &&
operation.has_framework_id()) {
// If we do not yet know the `FrameworkInfo` of the framework the
// operation originated from, we cannot properly track the operation
// at this point.
//
// TODO(bbannier): Consider introducing ways of making sure an agent
// always knows the `FrameworkInfo` of operations triggered on its
// resources, e.g., by adding an explicit `FrameworkInfo` to
// operations like is already done for `RunTaskMessage`, see
// MESOS-8582.
if (framework == nullptr) {
LOG(WARNING)
<< "Cannot properly account for operation " << operation.uuid()
<< " learnt in reconciliation of agent " << slaveId
<< " since framework " << operation.framework_id()
<< " is unknown; this can lead to assertion failures after the"
" operation terminates, see MESOS-8536";
continue;
}
Try<Resources> consumedResources =
protobuf::getConsumedResources(operation.info());
CHECK_SOME(consumedResources)
<< "Could not determine resources consumed by operation "
<< operation.uuid();
usedByOperations[operation.framework_id()] +=
consumedResources.get();
}
}
slave->totalResources += resourceProvider.total_resources();
allocator->addResourceProvider(
slaveId, resourceProvider.total_resources(), usedByOperations);
} else {
// If this is a known resource provider, its total capacity cannot
// have changed, and it would not know about any non-terminal
// operations not already known to the master. However, it might
// not have received an operation, for a couple of different
// reasons:
//
// - The resource provider or agent could have failed over
// before the operation's `ApplyOperationMessage` could be
// received.
// - The operation's `ApplyOperationMessage` could have raced
// with this `UpdateSlaveMessage`.
//
// In both of these cases, we need to reconcile such operations explicitly
// with the agent. For operations which the agent or resource provider
// does not recognize, an OPERATION_DROPPED status update will be
// generated and the master will remove the operation from its state upon
// receipt of that update.
CHECK(slave->resourceProviders.contains(providerId));
Slave::ResourceProvider& oldProvider =
slave->resourceProviders.at(providerId);
hashmap<UUID, const Operation*> newOperations;
foreach (
const Operation& operation,
resourceProvider.operations().operations()) {
newOperations.put(operation.uuid(), &operation);
}
foreachpair (
const UUID& uuid, Operation* oldOperation, oldProvider.operations) {
if (!newOperations.contains(uuid)) {
LOG(WARNING) << "Performing explicit reconciliation with agent for"
<< " known operation " << uuid
<< " since it was not present in original"
<< " reconciliation message from agent";
ReconcileOperationsMessage::Operation* reconcileOperation =
reconcile.add_operations();
reconcileOperation->mutable_operation_uuid()->CopyFrom(uuid);
reconcileOperation->mutable_resource_provider_id()->CopyFrom(
providerId);
} else {
// If a known operation became terminal between any previous offer
// operation status update and this `UpdateSlaveMessage`, the total
// resources we were sent already had the operation applied. We need
// to update the state of the operation to terminal here so that any
// update sent by the agent later does not cause us to apply the
// operation again.
const Operation* newOperation = newOperations.at(uuid);
if (!protobuf::isTerminalState(
oldOperation->latest_status().state()) &&
protobuf::isTerminalState(
newOperation->latest_status().state())) {
Operation* operation = CHECK_NOTNULL(slave->getOperation(uuid));
UpdateOperationStatusMessage update =
protobuf::createUpdateOperationStatusMessage(
uuid,
newOperation->latest_status(),
newOperation->latest_status(),
operation->framework_id(),
operation->slave_id());
updateOperation(
operation, update, false); // Do not update resources.
}
}
}
// Reconcile the total resources. This includes undoing
// speculated operations which are only visible in the total,
// but never in the used resources. We explicitly allow for
// resource providers to change from or to zero capacity.
const Resources oldResources =
slave->totalResources.filter([&providerId](const Resource& resource) {
return resource.provider_id() == providerId;
});
slave->totalResources -= oldResources;
slave->totalResources += resourceProvider.total_resources();
oldProvider.totalResources = resourceProvider.total_resources();
// Reconcile resource versions.
oldProvider.resourceVersion = resourceProvider.resource_version_uuid();
}
}
if (reconcile.operations_size() > 0) {
send(slave->pid, reconcile);
}
// Now update the agent's state and total resources in the allocator.
allocator->updateSlave(slaveId, slave->info, slave->totalResources);
// Then rescind outstanding offers affected by the update.
// NOTE: Need a copy of offers because the offers are removed inside the loop.
foreach (Offer* offer, utils::copy(slave->offers)) {
bool rescind = false;
const Resources& offered = offer->resources();
// Since updates of the agent's oversubscribed resources are sent at regular
// intervals, we only rescind offers containing revocable resources to
// reduce churn.
if (hasOversubscribed && !offered.revocable().empty()) {
LOG(INFO) << "Removing offer " << offer->id()
<< " with revocable resources " << offered << " on agent "
<< *slave;
rescind = true;
}
// Updates on resource providers can change the agent total
// resources, so we rescind all offers.
//
// TODO(bbannier): Only rescind offers possibly containing
// affected resources.
const Resources offeredResourceProviderResources = offered.filter(
[](const Resource& resource) { return resource.has_provider_id(); });
if (message.has_resource_providers() &&
!offeredResourceProviderResources.empty()) {
LOG(INFO)
<< "Removing offer " << offer->id()
<< " with resources " << offered << " on agent " << *slave;
rescind = true;
}
if (!rescind) {
continue;
}
allocator->recoverResources(
offer->framework_id(),
offer->slave_id(),
offered,
None());
removeOffer(offer, true); // Rescind.
}
// NOTE: We don't need to rescind inverse offers here as they are unrelated to
// oversubscription.
}
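// For illustration only (hypothetical IDs and quantities): a minimal
// `UpdateSlaveMessage`, in protobuf text format, of the shape that
// `updateSlave()` above processes. Field names follow the accessors
// used above; the values are made up:
//
//   slave_id { value: "agent-1" }
//   update_oversubscribed_resources: true
//   oversubscribed_resources {
//     name: "cpus" type: SCALAR scalar { value: 2 } revocable {}
//   }
//   resource_providers {
//     providers {
//       info { id { value: "rp-1" } }
//       total_resources {
//         name: "disk" type: SCALAR scalar { value: 100 }
//         provider_id { value: "rp-1" }
//       }
//     }
//   }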
void Master::updateUnavailability(
const MachineID& machineId,
const Option<Unavailability>& unavailability)
{
if (unavailability.isSome()) {
machines[machineId].info.mutable_unavailability()->CopyFrom(
unavailability.get());
} else {
machines[machineId].info.clear_unavailability();
}
// TODO(jmlvanre): Only update allocator and rescind offers if the
// unavailability has actually changed.
if (machines.contains(machineId)) {
// For every slave on this machine, update the allocator.
foreach (const SlaveID& slaveId, machines[machineId].slaves) {
// The slave should not be in the machines mapping if it is removed.
CHECK(slaves.removed.get(slaveId).isNone());
// The slave should be registered if it is in the machines mapping.
CHECK(slaves.registered.contains(slaveId));
Slave* slave = slaves.registered.get(slaveId);
if (unavailability.isSome()) {
// TODO(jmlvanre): Add stream operator for unavailability.
LOG(INFO) << "Updating unavailability of agent " << *slave
<< ", starting at "
<< Nanoseconds(unavailability->start().nanoseconds());
} else {
LOG(INFO) << "Removing unavailability of agent " << *slave;
}
// Remove and rescind offers since we want to inform frameworks of the
// unavailability change as soon as possible.
foreach (Offer* offer, utils::copy(slave->offers)) {
allocator->recoverResources(
offer->framework_id(), slave->id, offer->resources(), None());
removeOffer(offer, true); // Rescind!
}
// Remove and rescind inverse offers since the allocator will send new
// inverse offers for the updated unavailability.
foreach (InverseOffer* inverseOffer, utils::copy(slave->inverseOffers)) {
allocator->updateInverseOffer(
slave->id,
inverseOffer->framework_id(),
UnavailableResources{
inverseOffer->resources(),
inverseOffer->unavailability()},
None());
removeInverseOffer(inverseOffer, true); // Rescind!
}
// We remove / rescind all the offers first so that any calls to the
// allocator to modify its internal state are queued before the update of
// the unavailability in the allocator. We do this so that the allocator's
// state can start from a "clean slate" for the new unavailability.
// NOTE: Any calls from the Allocator back into the master, for example
// `offer()`, are guaranteed to happen after this function exits due to
// the Actor pattern.
allocator->updateUnavailability(slaveId, unavailability);
}
}
}
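// For illustration only (hypothetical values): constructing an
// `Unavailability` of the kind handled by `updateUnavailability()`
// above, announcing a maintenance window that starts one hour from
// now and lasts 30 minutes:
//
//   Unavailability unavailability;
//   unavailability.mutable_start()->set_nanoseconds(
//       (Clock::now() + Hours(1)).duration().ns());
//   unavailability.mutable_duration()->set_nanoseconds(
//       Minutes(30).ns());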
// TODO(vinod): Since 0.22.0, we can use 'from' instead of 'pid'
// because the status updates will be sent by the slave.
//
// TODO(vinod): Add a benchmark test for status update handling.
void Master::statusUpdate(StatusUpdateMessage&& statusUpdateMessage)
{
const StatusUpdate& update = statusUpdateMessage.update();
const UPID& pid = statusUpdateMessage.pid();
CHECK_NE(pid, UPID());
++metrics->messages_status_update;
if (slaves.removed.get(update.slave_id()).isSome()) {
// If the slave has been removed, drop the status update. The
// master is no longer trying to health check this slave; when the
// slave realizes it hasn't received any pings from the master, it
// will eventually try to reregister.
LOG(WARNING) << "Ignoring status update " << update
<< " from removed agent " << pid
<< " with id " << update.slave_id();
metrics->invalid_status_updates++;
return;
}
Slave* slave = slaves.registered.get(update.slave_id());
if (slave == nullptr) {
LOG(WARNING) << "Ignoring status update " << update
<< " from unknown agent " << pid
<< " with id " << update.slave_id();
metrics->invalid_status_updates++;
return;
}
Try<id::UUID> uuid = id::UUID::fromBytes(update.uuid());
if (uuid.isError()) {
LOG(WARNING) << "Ignoring status update "
<< " from agent " << *slave
<< ": " << uuid.error();
++metrics->invalid_status_updates;
return;
}
LOG(INFO) << "Status update " << update << " from agent " << *slave;
// Agents >= 0.26 should always correctly set task status uuid.
CHECK(update.status().has_uuid());
bool validStatusUpdate = true;
Framework* framework = getFramework(update.framework_id());
// A framework might not have reregistered after a master
// failover, or it might have disconnected.
if (framework != nullptr && framework->connected()) {
forward(update, pid, framework);
} else {
validStatusUpdate = false;
LOG(WARNING) << "Received status update " << update << " from agent "
<< *slave << " for "
<< (framework == nullptr ? "an unknown " : "a disconnected ")
<< "framework";
}
// Lookup the task and see if we need to update anything locally.
Task* task = slave->getTask(update.framework_id(), update.status().task_id());
if (task == nullptr) {
// TODO(neilc): We might see status updates for non-partition
// aware tasks running on a partitioned agent that has
// reregistered with the master. The master marks such tasks
// completed when the agent partitions; it will shut down the
// framework when the agent reregisters, but we may see a number
// of status updates before the framework is shut down.
LOG(WARNING) << "Could not lookup task for status update " << update
<< " from agent " << *slave;
metrics->invalid_status_updates++;
return;
}
updateTask(task, update);
validStatusUpdate
? metrics->valid_status_updates++ : metrics->invalid_status_updates++;
}
void Master::forward(
const StatusUpdate& update,
const UPID& acknowledgee,
Framework* framework)
{
CHECK_NOTNULL(framework);
if (!acknowledgee) {
LOG(INFO) << "Sending status update " << update
<< (update.status().has_message()
? " '" + update.status().message() + "'"
: "");
} else {
LOG(INFO) << "Forwarding status update " << update;
}
// The task might not exist in master's memory (e.g., failed task validation).
Task* task = framework->getTask(update.status().task_id());
if (task != nullptr) {
// Set the status update state and uuid for the task. Note that
// master-generated updates are terminal and do not have a uuid
// (in which case the master also calls `removeTask()`).
if (update.has_uuid()) {
task->set_status_update_state(update.status().state());
task->set_status_update_uuid(update.status().uuid());
}
}
StatusUpdateMessage message;
message.mutable_update()->MergeFrom(update);
message.set_pid(acknowledgee);
framework->send(message);
}
void Master::updateOperationStatus(UpdateOperationStatusMessage&& update)
{
CHECK(update.has_slave_id())
<< "External resource provider is not supported yet";
const SlaveID& slaveId = update.slave_id();
// The status update for the operation might be for an
// operator API call, thus the framework ID here is optional.
Option<FrameworkID> frameworkId = update.has_framework_id()
? update.framework_id()
: Option<FrameworkID>::none();
Slave* slave = slaves.registered.get(slaveId);
const UUID& uuid = update.operation_uuid();
// This is possible if the agent is marked as unreachable or gone,
// or has initiated a graceful shutdown. In any of those cases,
// ignore the operation status update.
//
// TODO(jieyu): If the agent is unreachable or has initiated a
// graceful shutdown, we can still forward the update to the
// framework so that the framework can get notified about the offer
// operation early. However, the acknowledgement of the update won't
// be able to reach the agent in those cases. If the agent is gone,
// we cannot forward the update because the master might already
// tell the framework that the operation is gone.
if (slave == nullptr) {
LOG(WARNING) << "Ignoring status update for operation '"
<< update.status().operation_id()
<< "' (uuid: " << uuid << ") for "
<< (frameworkId.isSome()
? "framework " + stringify(frameworkId.get())
: "an operator API call")
<< ": Agent " << slaveId << " is not registered";
return;
}
Operation* operation = slave->getOperation(update.operation_uuid());
if (operation == nullptr) {
LOG(ERROR) << "Failed to find the operation '"
<< update.status().operation_id() << "' (uuid: " << uuid << ")"
<< " for " << (frameworkId.isSome()
? "framework " + stringify(frameworkId.get())
: "an operator API call")
<< " on agent " << slaveId;
return;
}
if (operation->info().has_id()) {
// Agents don't include the framework and operation IDs when sending
// operation status updates for dropped operations in response to a
// `ReconcileOperationsMessage`, but they can be deduced from the operation
// info kept on the master.
// Only operations done via the scheduler API can have an ID.
CHECK(operation->has_framework_id());
frameworkId = operation->framework_id();
update.mutable_status()->mutable_operation_id()->CopyFrom(
operation->info().id());
}
updateOperation(operation, update);
CHECK(operation->statuses_size() > 0);
const OperationStatus& latestStatus = *operation->statuses().rbegin();
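// For illustration: how the branches below resolve a *terminal*
// update, depending on whether the operation has an ID (i.e., it came
// through the scheduler API) and whether the update carries a status
// UUID (i.e., it was sent reliably and needs acknowledgement):
//
//   has ID, no UUID -> forward to the framework; remove the operation
//   has ID, UUID    -> forward to the framework, which must acknowledge
//   no ID,  UUID    -> master acknowledges; remove the operation
//   no ID,  no UUID -> remove the operation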
if (operation->info().has_id()) {
// Forward the status update to the framework.
Framework* framework = getFramework(frameworkId.get());
if (framework == nullptr || !framework->connected()) {
LOG(WARNING) << "Received operation status update " << update
<< ", but the framework is "
<< (framework == nullptr ? "unknown" : "disconnected");
} else {
LOG(INFO) << "Forwarding operation status update " << update;
framework->send(update);
}
if (protobuf::isTerminalState(latestStatus.state()) &&
!latestStatus.has_uuid()) {
// Remove the operation if the update is terminal and it is not
// reliably sent.
removeOperation(operation);
}
} else {
if (latestStatus.has_uuid()) {
// This update is being sent reliably, and it doesn't have an operation
// ID, so the master has to send an acknowledgement.
Result<ResourceProviderID> resourceProviderId =
getResourceProviderId(operation->info());
// TODO(greggomann): Remove this CHECK once the agent is sending reliable
// updates for operations on its default resources. See MESOS-8194.
CHECK_SOME(resourceProviderId);
AcknowledgeOperationStatusMessage acknowledgement;
acknowledgement.mutable_status_uuid()->CopyFrom(latestStatus.uuid());
acknowledgement.mutable_operation_uuid()->CopyFrom(operation->uuid());
acknowledgement.mutable_resource_provider_id()->CopyFrom(
resourceProviderId.get());
CHECK(slave->capabilities.resourceProvider);
send(slave->pid, acknowledgement);
}
if (protobuf::isTerminalState(latestStatus.state())) {
removeOperation(operation);
}
}
}
void Master::exitedExecutor(
const UPID& from,
const SlaveID& slaveId,
const FrameworkID& frameworkId,
const ExecutorID& executorId,
int32_t status)
{
++metrics->messages_exited_executor;
if (slaves.removed.get(slaveId).isSome()) {
// If the slave has been removed, drop the executor message. The
// master is no longer trying to health check this slave; when the
// slave realizes it hasn't received any pings from the master, it
// will eventually try to reregister.
LOG(WARNING) << "Ignoring exited executor '" << executorId
<< "' of framework " << frameworkId
<< " on removed agent " << slaveId;
return;
}
Slave* slave = slaves.registered.get(slaveId);
if (slave == nullptr) {
LOG(WARNING) << "Ignoring exited executor '" << executorId
<< "' of framework " << frameworkId
<< " on unknown agent " << slaveId;
return;
}
// Only update master's internal data structures here for proper
// accounting. The TASK_LOST updates are handled by the slave.
if (!slave->hasExecutor(frameworkId, executorId)) {
LOG(WARNING) << "Ignoring unknown exited executor '" << executorId
<< "' of framework " << frameworkId
<< " on agent " << *slave;
return;
}
LOG(INFO) << "Executor '" << executorId
<< "' of framework " << frameworkId
<< " on agent " << *slave << ": "
<< WSTRINGIFY(status);
removeExecutor(slave, frameworkId, executorId);
// TODO(vinod): Reliably forward this message to the scheduler.
Framework* framework = getFramework(frameworkId);
if (framework == nullptr || !framework->connected()) {
// NOTE: A distinct name is used here to avoid shadowing the
// `status` parameter (the executor's exit status) of this function.
const string frameworkState =
(framework == nullptr ? "unknown" : "disconnected");
LOG(WARNING)
<< "Not forwarding exited executor message for executor '" << executorId
<< "' of framework " << frameworkId << " on agent " << *slave
<< " because the framework is " << frameworkState;
return;
}
ExitedExecutorMessage message;
message.mutable_executor_id()->CopyFrom(executorId);
message.mutable_framework_id()->CopyFrom(frameworkId);
message.mutable_slave_id()->CopyFrom(slaveId);
message.set_status(status);
framework->send(message);
}
void Master::shutdown(
Framework* framework,
const scheduler::Call::Shutdown& shutdown)
{
CHECK_NOTNULL(framework);
// TODO(vinod): Add a metric for executor shutdowns.
const SlaveID& slaveId = shutdown.slave_id();
const ExecutorID& executorId = shutdown.executor_id();
const FrameworkID& frameworkId = framework->id();
Slave* slave = slaves.registered.get(slaveId);
if (slave == nullptr) {
LOG(WARNING) << "Unable to shutdown executor '" << executorId
<< "' of framework " << frameworkId
<< " of unknown agent " << slaveId;
return;
}
LOG(INFO) << "Processing SHUTDOWN call for executor '" << executorId
<< "' of framework " << *framework << " on agent " << slaveId;
ShutdownExecutorMessage message;
message.mutable_executor_id()->CopyFrom(executorId);
message.mutable_framework_id()->CopyFrom(frameworkId);
send(slave->pid, message);
}
Future<bool> Master::markUnreachable(
const SlaveInfo& slave,
bool duringMasterFailover,
const string& message)
{
if (duringMasterFailover && !slaves.recovered.contains(slave.id())) {
LOG(INFO) << "Skipping transition of agent"
<< " " << slave.id() << " (" << slave.hostname() << ")"
<< " to unreachable because it reregistered in the interim";
return false;
}
if (!duringMasterFailover && !slaves.registered.contains(slave.id())) {
// Possible when the `SlaveObserver` dispatches a message to
// mark an unhealthy slave as unreachable, but the slave is
// concurrently removed for another reason (e.g.,
// `UnregisterSlaveMessage` is received).
LOG(WARNING) << "Skipping transition of agent"
<< " " << slave.id() << " (" << slave.hostname() << ")"
<< " to unreachable because it has already been removed"
<< " or marked unreachable";
return false;
}
// The slave might be in the process of reregistering, in which
// case we skip the transition to unreachable.
if (slaves.reregistering.contains(slave.id())) {
LOG(INFO) << "Skipping transition of agent"
<< " " << slave.id() << " (" << slave.hostname() << ")"
<< " to unreachable because it is reregistering";
return false;
}
if (slaves.markingUnreachable.contains(slave.id())) {
// We might already be marking this slave unreachable. This is
// possible if marking the slave unreachable in the registry takes
// a long time. While the registry operation is in progress, the
// `SlaveObserver` will continue to ping the slave; if the slave
// fails another health check, the `SlaveObserver` will trigger
// another attempt to mark it unreachable. Also possible if
// `agentReregisterTimeout` marks the slave unreachable
// concurrently with the slave observer doing so.
LOG(WARNING) << "Skipping transition of agent"
<< " " << slave.id() << " (" << slave.hostname() << ")"
<< " to unreachable because another unreachable"
<< " transition is already in progress";
return false;
}
if (slaves.removing.contains(slave.id())) {
LOG(WARNING) << "Skipping transition of agent"
<< " " << slave.id() << " (" << slave.hostname() << ")"
<< " to unreachable because it is being removed";
return false;
}
if (slaves.removed.get(slave.id()).isSome()) {
LOG(WARNING) << "Skipping transition of agent"
<< " " << slave.id() << " (" << slave.hostname() << ")"
<< " to unreachable because it has been removed";
return false;
}
if (slaves.markingGone.contains(slave.id())) {
LOG(WARNING) << "Skipping transition of agent"
<< " " << slave.id() << " (" << slave.hostname() << ")"
<< " to unreachable because it is being marked as gone";
return false;
}
if (slaves.gone.contains(slave.id())) {
LOG(WARNING) << "Skipping transition of agent"
<< " " << slave.id() << " (" << slave.hostname() << ")"
<< " to unreachable because it has been marked as gone";
return false;
}
LOG(INFO) << "Marking agent " << slave.id() << " (" << slave.hostname() << ")"
<< " unreachable: " << message;
CHECK(!slaves.unreachable.contains(slave.id()));
slaves.markingUnreachable.insert(slave.id());
// Use the same timestamp for all status updates sent below;
// we also use this timestamp when updating the registry.
TimeInfo unreachableTime = protobuf::getCurrentTime();
const string failure = "Failed to mark agent " + stringify(slave.id()) +
" (" + slave.hostname() + ") as unreachable in the registry";
// Update the registry to move this slave from the list of admitted
// slaves to the list of unreachable slaves. After this is complete,
// we can remove the slave from the master's in-memory state and
// send TASK_UNREACHABLE / TASK_LOST updates to the frameworks.
return undiscardable(
registrar->apply(Owned<RegistryOperation>(
new MarkSlaveUnreachable(slave, unreachableTime)))
.onFailed(lambda::bind(fail, failure, lambda::_1))
.onDiscarded(lambda::bind(fail, failure, "discarded"))
.then(defer(self(), [=](bool result) {
_markUnreachable(
slave, unreachableTime, duringMasterFailover, message, result);
return true;
})));
}
void Master::_markUnreachable(
const SlaveInfo& slave,
const TimeInfo& unreachableTime,
bool duringMasterFailover,
const string& message,
bool registrarResult)
{
// `MarkSlaveUnreachable` registry operation should never fail.
CHECK(registrarResult);
CHECK(slaves.markingUnreachable.contains(slave.id()));
slaves.markingUnreachable.erase(slave.id());
LOG(INFO) << "Marked agent"
<< " " << slave.id() << " (" << slave.hostname() << ")"
<< " unreachable: " << message;
++metrics->slave_removals;
++metrics->slave_removals_reason_unhealthy;
CHECK(!slaves.unreachable.contains(slave.id()));
slaves.unreachable[slave.id()] = unreachableTime;
if (duringMasterFailover) {
CHECK(slaves.recovered.contains(slave.id()));
slaves.recovered.erase(slave.id());
++metrics->recovery_slave_removals;
// TODO(bmahler): Tell partition aware frameworks that the
// agent is unreachable rather than lost. This requires a
// new capability.
sendSlaveLost(slave);
} else {
CHECK(slaves.registered.contains(slave.id()));
__removeSlave(slaves.registered.get(slave.id()), message, unreachableTime);
}
}
void Master::markGone(const SlaveID& slaveId, const TimeInfo& goneTime)
{
CHECK(slaves.markingGone.contains(slaveId));
slaves.markingGone.erase(slaveId);
slaves.gone[slaveId] = goneTime;
const string message = "Agent has been marked gone";
Slave* slave = slaves.registered.get(slaveId);
// If the `Slave` struct does not exist, then the agent
// must be either recovered or unreachable.
if (slave == nullptr) {
CHECK(slaves.recovered.contains(slaveId) ||
slaves.unreachable.contains(slaveId));
// When a recovered agent is marked gone, we have no task metadata to use in
// order to send task status updates. We could retain this agent ID and send
// updates upon reregistration but do not currently do this. See MESOS-9739.
if (slaves.recovered.contains(slaveId)) {
return;
}
slaves.unreachable.erase(slaveId);
// TODO(vinod): Consider moving these tasks into `completedTasks` by
// transitioning them to a terminal state and sending status updates.
// But it's not clear what this state should be. If a framework
// reconciles these tasks after this point it would get `TASK_UNKNOWN`
// which seems appropriate but we don't keep tasks in this state in-memory.
if (slaves.unreachableTasks.contains(slaveId)) {
foreachkey (const FrameworkID& frameworkId,
slaves.unreachableTasks.at(slaveId)) {
Framework* framework = getFramework(frameworkId);
if (framework == nullptr) {
continue;
}
TaskState newTaskState = TASK_GONE_BY_OPERATOR;
TaskStatus::Reason newTaskReason =
TaskStatus::REASON_SLAVE_REMOVED_BY_OPERATOR;
if (!framework->capabilities.partitionAware) {
newTaskState = TASK_LOST;
newTaskReason = TaskStatus::REASON_SLAVE_REMOVED;
}
foreach (const TaskID& taskId,
slaves.unreachableTasks.at(slaveId).at(frameworkId)) {
if (framework->unreachableTasks.contains(taskId)) {
const Owned<Task>& task = framework->unreachableTasks.at(taskId);
const StatusUpdate& update = protobuf::createStatusUpdate(
task->framework_id(),
task->slave_id(),
task->task_id(),
newTaskState,
TaskStatus::SOURCE_MASTER,
None(),
message,
newTaskReason,
(task->has_executor_id()
? Option<ExecutorID>(task->executor_id())
: None()));
updateTask(task.get(), update);
if (!framework->connected()) {
LOG(WARNING) << "Dropping update " << update
<< " for disconnected "
<< " framework " << frameworkId;
} else {
forward(update, UPID(), framework);
}
// Move task from unreachable map to completed map.
framework->addCompletedTask(std::move(*task));
framework->unreachableTasks.erase(taskId);
}
}
}
slaves.unreachableTasks.erase(slaveId);
}
return;
}
// Shutdown the agent if it transitioned to gone.
ShutdownMessage shutdownMessage;
shutdownMessage.set_message(message);
send(slave->pid, shutdownMessage);
__removeSlave(slave, message, None());
}
void Master::reconcileTasks(
const UPID& from,
ReconcileTasksMessage&& reconcileTasksMessage)
{
const FrameworkID& frameworkId = reconcileTasksMessage.framework_id();
Framework* framework = getFramework(frameworkId);
if (framework == nullptr) {
LOG(WARNING) << "Unknown framework " << frameworkId << " at " << from
<< " attempted to reconcile tasks";
return;
}
if (framework->pid != from) {
LOG(WARNING)
<< "Ignoring reconcile tasks message for framework " << *framework
<< " because it is not expected from " << from;
return;
}
scheduler::Call::Reconcile message;
message.mutable_tasks()->Reserve(reconcileTasksMessage.statuses_size());
foreach (TaskStatus& status, *reconcileTasksMessage.mutable_statuses()) {
scheduler::Call::Reconcile::Task* t = message.add_tasks();
*t->mutable_task_id() = std::move(status.task_id());
if (status.has_slave_id()) {
*t->mutable_slave_id() = std::move(status.slave_id());
}
}
reconcile(framework, std::move(message));
}
void Master::reconcile(
Framework* framework,
scheduler::Call::Reconcile&& reconcile)
{
CHECK_NOTNULL(framework);
++metrics->messages_reconcile_tasks;
if (reconcile.tasks().empty()) {
// Implicit reconciliation.
LOG(INFO) << "Performing implicit task state reconciliation"
" for framework " << *framework;
foreachvalue (const TaskInfo& task, framework->pendingTasks) {
StatusUpdate update = protobuf::createStatusUpdate(
framework->id(),
task.slave_id(),
task.task_id(),
TASK_STAGING,
TaskStatus::SOURCE_MASTER,
None(),
"Reconciliation: Latest task state",
TaskStatus::REASON_RECONCILIATION);
VLOG(1) << "Sending implicit reconciliation state "
<< update.status().state()
<< " for task " << update.status().task_id()
<< " of framework " << *framework;
// TODO(bmahler): Consider using forward(); might lead to too
// much logging.
StatusUpdateMessage message;
*message.mutable_update() = std::move(update);
framework->send(message);
}
foreachvalue (Task* task, framework->tasks) {
const TaskState& state = task->has_status_update_state()
? task->status_update_state()
: task->state();
const Option<ExecutorID>& executorId = task->has_executor_id()
? Option<ExecutorID>(task->executor_id())
: None();
StatusUpdate update = protobuf::createStatusUpdate(
framework->id(),
task->slave_id(),
task->task_id(),
state,
TaskStatus::SOURCE_MASTER,
None(),
"Reconciliation: Latest task state",
TaskStatus::REASON_RECONCILIATION,
executorId,
protobuf::getTaskHealth(*task),
protobuf::getTaskCheckStatus(*task),
None(),
protobuf::getTaskContainerStatus(*task));
VLOG(1) << "Sending implicit reconciliation state "
<< update.status().state()
<< " for task " << update.status().task_id()
<< " of framework " << *framework;
// TODO(bmahler): Consider using forward(); might lead to too
// much logging.
StatusUpdateMessage message;
*message.mutable_update() = std::move(update);
framework->send(message);
}
return;
}
// Explicit reconciliation.
LOG(INFO) << "Performing explicit task state reconciliation"
<< " for " << reconcile.tasks().size() << " tasks"
<< " of framework " << *framework;
// Explicit reconciliation occurs for the following cases:
// (1) Task is known, but pending: TASK_STAGING.
// (2) Task is known: send the latest state.
// (3) Task is unknown, slave is recovered: no-op.
// (4) Task is unknown, slave is registered: TASK_GONE.
// (5) Task is unknown, slave is unreachable: TASK_UNREACHABLE.
// (6) Task is unknown, slave is gone: TASK_GONE_BY_OPERATOR.
// (7) Task is unknown, slave is unknown: TASK_UNKNOWN.
//
// For case (3), if the slave ID is not provided, we err on the
// side of caution and do not reply if there are *any* recovered
// slaves that haven't reregistered, since the task could reside
// on one of these slaves.
//
// For cases (4), (5), (6) and (7), TASK_LOST is sent instead if the
// framework has not opted in to the PARTITION_AWARE capability.
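//
// For illustration only (hypothetical IDs): a minimal framework-side
// sketch of an explicit reconcile call that, if the task is known to
// the master, exercises case (2) above:
//
//   scheduler::Call call;
//   call.set_type(scheduler::Call::RECONCILE);
//   call.mutable_framework_id()->set_value("fw-1");
//
//   scheduler::Call::Reconcile::Task* t =
//     call.mutable_reconcile()->add_tasks();
//   t->mutable_task_id()->set_value("task-1");
//   t->mutable_slave_id()->set_value("agent-1");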
foreach (const scheduler::Call::Reconcile::Task& t, reconcile.tasks()) {
Option<SlaveID> slaveId = None();
if (t.has_slave_id()) {
slaveId = t.slave_id();
}
Option<StatusUpdate> update = None();
Task* task = framework->getTask(t.task_id());
if (framework->pendingTasks.contains(t.task_id())) {
// (1) Task is known, but pending: TASK_STAGING.
const TaskInfo& task_ = framework->pendingTasks[t.task_id()];
update = protobuf::createStatusUpdate(
framework->id(),
task_.slave_id(),
task_.task_id(),
TASK_STAGING,
TaskStatus::SOURCE_MASTER,
None(),
"Reconciliation: Latest task state",
TaskStatus::REASON_RECONCILIATION);
} else if (task != nullptr) {
// (2) Task is known: send the latest status update state.
const TaskState& state = task->has_status_update_state()
? task->status_update_state()
: task->state();
const Option<ExecutorID> executorId = task->has_executor_id()
? Option<ExecutorID>(task->executor_id())
: None();
update = protobuf::createStatusUpdate(
framework->id(),
task->slave_id(),
task->task_id(),
state,
TaskStatus::SOURCE_MASTER,
None(),
"Reconciliation: Latest task state",
TaskStatus::REASON_RECONCILIATION,
executorId,
protobuf::getTaskHealth(*task),
protobuf::getTaskCheckStatus(*task),
None(),
protobuf::getTaskContainerStatus(*task));
} else if ((slaveId.isSome() && slaves.recovered.contains(slaveId.get())) ||
(slaveId.isNone() && !slaves.recovered.empty())) {
// (3) Task is unknown, slave is recovered: no-op. The framework
// will have to retry this and will not receive a response until
// the agent either registers, or is marked unreachable after the
// timeout.
LOG(INFO) << "Dropping reconciliation of task " << t.task_id()
<< " for framework " << *framework << " because "
<< (slaveId.isSome() ?
"agent " + stringify(slaveId.get()) + " has" :
"some agents have")
<< " not yet reregistered with the master";
} else if (slaveId.isSome() && slaves.registered.contains(slaveId.get())) {
// (4) Task is unknown, slave is registered: TASK_GONE. If the
// framework does not have the PARTITION_AWARE capability, send
// TASK_LOST for backward compatibility.
TaskState taskState = TASK_GONE;
if (!framework->capabilities.partitionAware) {
taskState = TASK_LOST;
}
update = protobuf::createStatusUpdate(
framework->id(),
slaveId.get(),
t.task_id(),
taskState,
TaskStatus::SOURCE_MASTER,
None(),
"Reconciliation: Task is unknown to the agent",
TaskStatus::REASON_RECONCILIATION);
} else if (slaveId.isSome() && slaves.unreachable.contains(slaveId.get())) {
// (5) Slave is unreachable: TASK_UNREACHABLE. If the framework
// does not have the PARTITION_AWARE capability, send TASK_LOST
// for backward compatibility. In either case, the status update
// also includes the time when the slave was marked unreachable.
const TimeInfo& unreachableTime = slaves.unreachable.at(slaveId.get());
TaskState taskState = TASK_UNREACHABLE;
if (!framework->capabilities.partitionAware) {
taskState = TASK_LOST;
}
update = protobuf::createStatusUpdate(
framework->id(),
slaveId.get(),
t.task_id(),
taskState,
TaskStatus::SOURCE_MASTER,
None(),
"Reconciliation: Task is unreachable",
TaskStatus::REASON_RECONCILIATION,
None(),
None(),
None(),
None(),
None(),
unreachableTime);
} else if (slaveId.isSome() && slaves.gone.contains(slaveId.get())) {
// (6) Slave is gone: TASK_GONE_BY_OPERATOR. If the framework
// does not have the PARTITION_AWARE capability, send TASK_LOST
// for backward compatibility.
TaskState taskState = TASK_GONE_BY_OPERATOR;
if (!framework->capabilities.partitionAware) {
taskState = TASK_LOST;
}
update = protobuf::createStatusUpdate(
framework->id(),
slaveId.get(),
t.task_id(),
taskState,
TaskStatus::SOURCE_MASTER,
None(),
"Reconciliation: Task is gone",
TaskStatus::REASON_RECONCILIATION);
} else {
// (7) Task is unknown, slave is unknown: TASK_UNKNOWN. If the
// framework does not have the PARTITION_AWARE capability, send
// TASK_LOST for backward compatibility.
TaskState taskState = TASK_UNKNOWN;
if (!framework->capabilities.partitionAware) {
taskState = TASK_LOST;
}
update = protobuf::createStatusUpdate(
framework->id(),
slaveId,
t.task_id(),
taskState,
TaskStatus::SOURCE_MASTER,
None(),
"Reconciliation: Task is unknown",
TaskStatus::REASON_RECONCILIATION);
}
if (update.isSome()) {
VLOG(1) << "Sending explicit reconciliation state "
<< update->status().state()
<< " for task " << update->status().task_id()
<< " of framework " << *framework;
// TODO(bmahler): Consider using forward(); might lead to too
// much logging.
StatusUpdateMessage message;
*message.mutable_update() = std::move(update.get());
framework->send(message);
}
}
}
scheduler::Response::ReconcileOperations Master::reconcileOperations(
Framework* framework,
const scheduler::Call::ReconcileOperations& reconcile)
{
CHECK_NOTNULL(framework);
++metrics->messages_reconcile_operations;
scheduler::Response::ReconcileOperations response;
if (reconcile.operations_size() == 0) {
// Implicit reconciliation.
LOG(INFO) << "Performing implicit operation state reconciliation"
" for framework " << *framework;
response.mutable_operation_statuses()->Reserve(
framework->operations.size());
foreachvalue (Operation* operation, framework->operations) {
if (operation->statuses().empty()) {
// This can happen if the operation is pending.
response.add_operation_statuses()->CopyFrom(operation->latest_status());
} else {
response.add_operation_statuses()->CopyFrom(
*operation->statuses().rbegin());
}
}
return response;
}
// Explicit reconciliation.
LOG(INFO) << "Performing explicit operation state reconciliation for "
<< reconcile.operations_size() << " operations of framework "
<< *framework;
// Explicit reconciliation occurs for the following cases:
// (1) Operation is known: resend the latest status sent to the framework.
// (2) Operation is unknown, slave is recovered: OPERATION_RECOVERING.
// (3) Operation is unknown, slave is registered: OPERATION_UNKNOWN.
// (4) Operation is unknown, slave is unreachable: OPERATION_UNREACHABLE.
// (5) Operation is unknown, slave is gone: OPERATION_GONE_BY_OPERATOR.
// (6) Operation is unknown, slave is unknown: OPERATION_UNKNOWN.
// (7) Operation is unknown, slave ID is not specified: OPERATION_UNKNOWN.
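//
// For illustration only (hypothetical IDs): a minimal sketch of one
// entry in an explicit operation reconciliation request as handled
// below; without the optional `slave_id`, an unknown operation falls
// through to case (7):
//
//   scheduler::Call::ReconcileOperations::Operation op;
//   op.mutable_operation_id()->set_value("op-1");
//   op.mutable_slave_id()->set_value("agent-1");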
foreach (const scheduler::Call::ReconcileOperations::Operation& operation,
reconcile.operations()) {
Option<SlaveID> slaveId = None();
if (operation.has_slave_id()) {
slaveId = operation.slave_id();
}
Option<ResourceProviderID> resourceProviderId = None();
if (operation.has_resource_provider_id()) {
resourceProviderId = operation.resource_provider_id();
}
Option<Operation*> frameworkOperation =
framework->getOperation(operation.operation_id());
OperationStatus* status = response.add_operation_statuses();
if (frameworkOperation.isSome()) {
// (1) Operation is known: resend the latest status sent to the framework.
if (frameworkOperation.get()->statuses().empty()) {
// This can happen if the operation is pending.
*status = frameworkOperation.get()->latest_status();
} else {
*status = *frameworkOperation.get()->statuses().rbegin();
}
} else if (slaveId.isSome() && slaves.recovered.contains(slaveId.get())) {
// (2) Operation is unknown, slave is recovered: OPERATION_RECOVERING.
*status = protobuf::createOperationStatus(
OperationState::OPERATION_RECOVERING,
operation.operation_id(),
"Reconciliation: Agent is recovered but has not re-registered",
None(),
None(),
slaveId,
resourceProviderId);
} else if (slaveId.isSome() && slaves.registered.contains(slaveId.get())) {
// (3) Operation is unknown, slave is registered: OPERATION_UNKNOWN.
*status = protobuf::createOperationStatus(
OperationState::OPERATION_UNKNOWN,
operation.operation_id(),
"Reconciliation: Operation is unknown",
None(),
None(),
slaveId,
resourceProviderId);
} else if (slaveId.isSome() && slaves.unreachable.contains(slaveId.get())) {
// (4) Operation is unknown, slave is unreachable: OPERATION_UNREACHABLE.
*status = protobuf::createOperationStatus(
OperationState::OPERATION_UNREACHABLE,
operation.operation_id(),
"Reconciliation: Agent is unreachable",
None(),
None(),
slaveId,
resourceProviderId);
} else if (slaveId.isSome() && slaves.gone.contains(slaveId.get())) {
// (5) Operation is unknown, slave is gone: OPERATION_GONE_BY_OPERATOR.
*status = protobuf::createOperationStatus(
OperationState::OPERATION_GONE_BY_OPERATOR,
operation.operation_id(),
"Reconciliation: Agent marked gone by operator",
None(),
None(),
slaveId,
resourceProviderId);
} else if (slaveId.isSome()) {
// (6) Operation is unknown, slave is unknown: OPERATION_UNKNOWN.
*status = protobuf::createOperationStatus(
OperationState::OPERATION_UNKNOWN,
operation.operation_id(),
"Reconciliation: Both operation and agent are unknown",
None(),
None(),
slaveId,
resourceProviderId);
} else {
// (7) Operation is unknown, slave ID is not specified: OPERATION_UNKNOWN.
*status = protobuf::createOperationStatus(
OperationState::OPERATION_UNKNOWN,
operation.operation_id(),
"Reconciliation: Operation is unknown and no 'agent_id' was"
" provided",
None(),
None(),
slaveId,
resourceProviderId);
}
}
return response;
}
void Master::frameworkFailoverTimeout(const FrameworkID& frameworkId,
const Time& reregisteredTime)
{
Framework* framework = getFramework(frameworkId);
if (framework != nullptr && !framework->connected()) {
// If the reregistration time has not changed, then the framework
// has not reregistered within the failover timeout.
if (framework->reregisteredTime == reregisteredTime) {
LOG(INFO) << "Framework failover timeout, removing framework "
<< *framework;
removeFramework(framework);
}
}
}
void Master::offer(
const FrameworkID& frameworkId,
const hashmap<string, hashmap<SlaveID, Resources>>& resources)
{
if (!frameworks.registered.contains(frameworkId) ||
!frameworks.registered[frameworkId]->active()) {
LOG(WARNING) << "Master returning resources offered to framework "
<< frameworkId << " because the framework"
<< " has terminated or is inactive";
foreachkey (const string& role, resources) {
foreachpair (const SlaveID& slaveId,
const Resources& offered,
resources.at(role)) {
allocator->recoverResources(frameworkId, slaveId, offered, None());
}
}
return;
}
Framework* framework = CHECK_NOTNULL(frameworks.registered.at(frameworkId));
// Each offer we create is tied to a single agent
// and a single allocation role.
ResourceOffersMessage message;
// We keep track of the offer IDs so that we can log them.
vector<OfferID> offerIds;
foreachkey (const string& role, resources) {
foreachpair (const SlaveID& slaveId,
const Resources& offered,
resources.at(role)) {
Slave* slave = slaves.registered.get(slaveId);
if (slave == nullptr) {
LOG(WARNING)
<< "Master returning resources offered to framework " << *framework
<< " because agent " << slaveId << " is not valid";
allocator->recoverResources(frameworkId, slaveId, offered, None());
continue;
}
// This could happen if the allocator dispatched 'Master::offer' before
// the slave was deactivated in the allocator.
if (!slave->active) {
LOG(WARNING)
<< "Master returning resources offered because agent " << *slave
<< " is " << (slave->connected ? "deactivated" : "disconnected");
allocator->recoverResources(frameworkId, slaveId, offered, None());
continue;
}
#ifdef ENABLE_PORT_MAPPING_ISOLATOR
// TODO(dhamon): This flag is required as the static allocation of
// ephemeral ports leads to a maximum number of containers that can
// be created on each slave. Once MESOS-1654 is fixed and ephemeral
// ports are a first class resource, this can be removed.
if (flags.max_executors_per_agent.isSome()) {
// Check that we haven't hit the executor limit.
size_t numExecutors = 0;
foreachkey (const FrameworkID& frameworkId, slave->executors) {
numExecutors += slave->executors[frameworkId].keys().size();
}
if (numExecutors >= flags.max_executors_per_agent.get()) {
LOG(WARNING) << "Master returning resources offered because agent "
<< *slave << " has reached the maximum number of "
<< "executors";
// Pass a default filter to avoid getting this same offer immediately
// from the allocator.
allocator->recoverResources(frameworkId, slaveId, offered, Filters());
continue;
}
}
#endif // ENABLE_PORT_MAPPING_ISOLATOR
// TODO(vinod): Split regular and revocable resources into
// separate offers, so that rescinding offers with revocable
// resources does not affect offers with regular resources.
// TODO(bmahler): Set "https" if only "https" is supported.
mesos::URL url;
url.set_scheme("http");
url.mutable_address()->set_hostname(slave->info.hostname());
url.mutable_address()->set_ip(stringify(slave->pid.address.ip));
url.mutable_address()->set_port(slave->pid.address.port);
url.set_path("/" + slave->pid.id);
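// For illustration (hypothetical address): the resulting URL looks
// like http://agent.example.com:5051/slave(1), where the path is the
// agent's libprocess process ID.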
Offer* offer = new Offer();
offer->mutable_id()->MergeFrom(newOfferId());
offer->mutable_framework_id()->MergeFrom(framework->id());
offer->mutable_slave_id()->MergeFrom(slave->id);
offer->set_hostname(slave->info.hostname());
offer->mutable_url()->MergeFrom(url);
offer->mutable_resources()->MergeFrom(offered);
offer->mutable_attributes()->MergeFrom(slave->info.attributes());
offer->mutable_allocation_info()->set_role(role);
if (slave->info.has_domain()) {
offer->mutable_domain()->MergeFrom(slave->info.domain());
}
// Add all framework's executors running on this slave.
if (slave->executors.contains(framework->id())) {
const hashmap<ExecutorID, ExecutorInfo>& executors =
slave->executors[framework->id()];
foreachkey (const ExecutorID& executorId, executors) {
offer->add_executor_ids()->MergeFrom(executorId);
}
}
// If the slave in this offer is planned to be unavailable due to
// maintenance in the future, then set the Unavailability.
CHECK(machines.contains(slave->machineId));
if (machines[slave->machineId].info.has_unavailability()) {
offer->mutable_unavailability()->CopyFrom(
machines[slave->machineId].info.unavailability());
}
offers[offer->id()] = offer;
framework->addOffer(offer);
slave->addOffer(offer);
if (flags.offer_timeout.isSome()) {
// Rescind the offer after the timeout elapses.
offerTimers[offer->id()] =
delay(flags.offer_timeout.get(),
self(),
&Self::offerTimeout,
offer->id());
}
// TODO(jieyu): For now, we strip 'ephemeral_ports' resource from
// offers so that frameworks do not see this resource. This is a
// short term workaround. Revisit this once we resolve MESOS-1654.
Offer offer_ = *offer;
offer_.clear_resources();
foreach (const Resource& resource, offered) {
if (resource.name() != "ephemeral_ports") {
offer_.add_resources()->CopyFrom(resource);
}
}
// Per MESOS-8237, it is problematic to show the
// `Resource.allocation_info` for pre-MULTI_ROLE schedulers.
// Pre-MULTI_ROLE schedulers are not `AllocationInfo` aware,
// and since they may be performing operations that
// implicitly use all of a `Resource`'s state (e.g., equality
// comparison), we strip the `AllocationInfo` from `Resource`,
// as well as Offer. The idea here is that since the
// information doesn't provide any value to a pre-MULTI_ROLE
// scheduler, we preserve the old `Offer` format for them.
if (!framework->capabilities.multiRole) {
offer_.clear_allocation_info();
foreach (Resource& resource, *offer_.mutable_resources()) {
resource.clear_allocation_info();
}
}
if (!framework->capabilities.reservationRefinement) {
convertResourceFormat(
offer_.mutable_resources(), PRE_RESERVATION_REFINEMENT);
}
// Add the offer *AND* the corresponding slave's PID.
message.add_offers()->MergeFrom(offer_);
message.add_pids(slave->pid);
offerIds.push_back(offer_.id());
VLOG(2) << "Sending offer " << offer_.id()
<< " containing resources " << offered
<< " on agent " << *slave
<< " to framework " << *framework;
}
}
if (message.offers().size() == 0) {
return;
}
LOG(INFO) << "Sending offers " << offerIds << " to framework " << *framework;
framework->metrics.offers_sent += message.offers().size();
framework->send(message);
}
void Master::inverseOffer(
const FrameworkID& frameworkId,
const hashmap<SlaveID, UnavailableResources>& resources)
{
if (!frameworks.registered.contains(frameworkId) ||
!frameworks.registered[frameworkId]->active()) {
LOG(INFO) << "Master ignoring inverse offers to framework " << frameworkId
<< " because the framework has terminated or is inactive";
return;
}
// Create an inverse offer for each slave and add it to the message.
InverseOffersMessage message;
Framework* framework = CHECK_NOTNULL(frameworks.registered[frameworkId]);
foreachpair (const SlaveID& slaveId,
const UnavailableResources& unavailableResources,
resources) {
Slave* slave = slaves.registered.get(slaveId);
if (slave == nullptr) {
LOG(INFO)
<< "Master ignoring inverse offers to framework " << *framework
<< " because agent " << slaveId << " is not valid";
continue;
}
// This could happen if the allocator dispatched 'Master::inverseOffer'
// before the slave was deactivated in the allocator.
if (!slave->active) {
LOG(INFO)
<< "Master ignoring inverse offers to framework " << *framework
<< " because agent " << *slave << " is "
<< (slave->connected ? "deactivated" : "disconnected");
continue;
}
// This could happen if the allocator dispatched `Master::inverseOffer`
// before the unavailability was removed in the master.
if (!machines.contains(slave->machineId) ||
!machines.at(slave->machineId).info.has_unavailability()) {
LOG(INFO)
<< "Master dropping inverse offers to framework " << *framework
<< " because agent " << *slave << " had its unavailability revoked.";
continue;
}
// TODO(bmahler): Set "https" if only "https" is supported.
mesos::URL url;
url.set_scheme("http");
url.mutable_address()->set_hostname(slave->info.hostname());
url.mutable_address()->set_ip(stringify(slave->pid.address.ip));
url.mutable_address()->set_port(slave->pid.address.port);
url.set_path("/" + slave->pid.id);
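// The URL points at the agent's HTTP endpoint; a hypothetical
// example would be http://agent.example.com:5051/slave(1), where
// "slave(1)" is the agent's libprocess process id.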
InverseOffer* inverseOffer = new InverseOffer();
// We use the same id generator as for regular offers so that ids
// are unique across both kinds. This way we can reuse some of the
// `OfferID`-only messages.
inverseOffer->mutable_id()->CopyFrom(newOfferId());
inverseOffer->mutable_framework_id()->CopyFrom(framework->id());
inverseOffer->mutable_slave_id()->CopyFrom(slave->id);
inverseOffer->mutable_url()->CopyFrom(url);
inverseOffer->mutable_unavailability()->CopyFrom(
unavailableResources.unavailability);
inverseOffers[inverseOffer->id()] = inverseOffer;
framework->addInverseOffer(inverseOffer);
slave->addInverseOffer(inverseOffer);
// TODO(jmlvanre): Do we want a separate flag for inverse offer
// timeout?
if (flags.offer_timeout.isSome()) {
// Rescind the inverse offer after the timeout elapses.
inverseOfferTimers[inverseOffer->id()] =
delay(flags.offer_timeout.get(),
self(),
&Self::inverseOfferTimeout,
inverseOffer->id());
}
// Add the inverse offer *AND* the corresponding slave's PID.
message.add_inverse_offers()->CopyFrom(*inverseOffer);
message.add_pids(slave->pid);
}
if (message.inverse_offers().size() == 0) {
return;
}
vector<OfferID> inverseOfferIds;
foreach (const InverseOffer& inverseOffer, message.inverse_offers()) {
inverseOfferIds.push_back(inverseOffer.id());
}
LOG(INFO) << "Sending inverse offers " << inverseOfferIds << " to framework "
<< *framework;
framework->send(message);
}
// TODO(vinod): If due to network partition there are two instances
// of the framework that think they are leaders and try to
// authenticate with master they would be stepping on each other's
// toes. Currently it is tricky to detect this case because the
// 'authenticate' message doesn't contain the 'FrameworkID'.
// 'from' is the authenticatee process with which to communicate.
// 'pid' is the framework/slave process being authenticated.
void Master::authenticate(const UPID& from, const UPID& pid)
{
++metrics->messages_authenticate;
// An authentication request is sent by a client (slave/framework)
// in the following cases:
//
// 1. First time the client is connecting.
// This is straightforward; just proceed with authentication.
//
// 2. Client retried because of ZK expiration / authentication timeout.
// If the client is already authenticated, it will be removed from
// the 'authenticated' map and authentication is retried.
//
// 3. Client restarted.
// 3.1. We are here after receiving 'exited()' from old client.
// This is safe because the client will be first marked as
// disconnected and then when it reregisters it will be
// marked as connected.
//
// 3.2. We are here before receiving 'exited()' from old client.
// This is tricky only if the PID of the client doesn't change
// after restart; true for slave but not for framework.
// If the PID doesn't change the master might mark the client
// disconnected *after* the client reregisters.
// This is safe because the client (slave) will be informed
// about this discrepancy via ping messages so that it can
// reregister.
bool erased = authenticated.erase(pid) > 0;
if (authenticator.isNone()) {
// The default authenticator is CRAM-MD5 rather than none. Since
// the default parameters specify the CRAM-MD5 authenticator with
// no required authentication and no credentials, the master must
// still be able to start successfully in that configuration.
// In this case, we allow non-authenticating frameworks / slaves
// to register without authentication, but we return an
// AuthenticationError if they actually try to authenticate.
// TODO(tillt): We need to make sure this does not cause retries.
// See MESOS-2379.
LOG(ERROR) << "Received authentication request from " << pid
<< " but authenticator is not loaded";
AuthenticationErrorMessage message;
message.set_error("No authenticator loaded");
send(from, message);
return;
}
// If a new authentication is occurring for a client that already
// has an authentication in progress, we discard the old one
// (since the client is no longer interested in it) and
// immediately proceed with the new authentication.
if (authenticating.contains(pid)) {
authenticating.at(pid).discard();
authenticating.erase(pid);
LOG(INFO) << "Re-authenticating " << pid << ";"
<< " discarding outstanding authentication";
} else {
LOG(INFO) << "Authenticating " << pid
<< (erased ? "; clearing previous authentication" : "");
}
// Start authentication.
const Future<Option<string>> future = authenticator.get()->authenticate(from);
// Save our state.
authenticating[pid] = future;
future.onAny(defer(self(), &Self::_authenticate, pid, future));
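// NOTE: `defer(self(), ...)` dispatches `_authenticate` onto this
// master process (actor), so the `authenticating` map is only
// mutated from the master's own execution context.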
// Don't wait for authentication to complete forever.
delay(flags.authentication_v0_timeout,
self(),
&Self::authenticationTimeout,
future);
}
void Master::_authenticate(
const UPID& pid,
const Future<Option<string>>& future)
{
// Ignore stale authentication results (if the authentication
// future has been overwritten).
if (authenticating.get(pid) != future) {
LOG(INFO) << "Ignoring stale authentication result of " << pid;
return;
}
if (future.isReady() && future->isSome()) {
LOG(INFO) << "Successfully authenticated principal '" << future->get()
<< "' at " << pid;
authenticated.put(pid, future->get());
} else if (future.isReady() && future->isNone()) {
LOG(INFO) << "Authentication of " << pid << " was unsuccessful:"
<< " Invalid credentials";
} else if (future.isFailed()) {
LOG(WARNING) << "An error ocurred while attempting to authenticate " << pid
<< ": " << future.failure();
} else {
LOG(INFO) << "Authentication of " << pid << " was discarded";
}
authenticating.erase(pid);
}
void Master::authenticationTimeout(Future<Option<string>> future)
{
// Note that a 'discard' here is safe even if another
// authenticator is in progress because this copy of the future
// corresponds to the original authenticator that started the timer.
if (future.discard()) { // This is a no-op if the future is already ready.
LOG(WARNING) << "Authentication timed out";
}
}
void Master::reconcileKnownSlave(
Slave* slave,
const vector<ExecutorInfo>& executors,
const vector<Task>& tasks)
{
CHECK_NOTNULL(slave);
// TODO(bmahler): There's an implicit assumption here that the
// slave cannot have tasks unknown to the master. This _should_
// be the case since the causal relationship is:
// slave removes task -> master removes task
// Add error logging for any violations of this assumption!
// We convert the 'tasks' into a map for easier lookup below.
multihashmap<FrameworkID, TaskID> slaveTasks;
foreach (const Task& task, tasks) {
slaveTasks.put(task.framework_id(), task.task_id());
}
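// `slaveTasks` now maps each framework to the tasks the agent
// reported, giving cheap (framework, task) membership checks below.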
// Look for tasks missing in the slave's re-registration message.
// This can occur when:
// (1) a launch message was dropped (e.g. slave failed over), or
// (2) the slave re-registration raced with a launch message, in
// which case the slave actually received the task.
// To resolve both cases correctly, we must reconcile through the
// slave. For slaves that do not support reconciliation, we keep
// the old semantics and cover only case (1) via TASK_LOST.
Duration pingTimeout =
flags.agent_ping_timeout * flags.max_agent_ping_timeouts;
MasterSlaveConnection connection;
connection.set_total_ping_timeout_seconds(pingTimeout.secs());
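// For example, assuming the default `--agent_ping_timeout=15secs`
// and `--max_agent_ping_timeouts=5`, the agent is told a total
// ping timeout of 75 seconds.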
SlaveReregisteredMessage reregistered;
reregistered.mutable_slave_id()->CopyFrom(slave->id);
reregistered.mutable_connection()->CopyFrom(connection);
foreachkey (const FrameworkID& frameworkId, slave->tasks) {
ReconcileTasksMessage reconcile;
foreachvalue (Task* task, slave->tasks[frameworkId]) {
if (!slaveTasks.contains(task->framework_id(), task->task_id())) {
LOG(WARNING) << "Task " << task->task_id()
<< " of framework " << task->framework_id()
<< " unknown to the agent " << *slave
<< " during re-registration: reconciling with the agent";
// NOTE: The slave doesn't look at the task state when it
// reconciles the task. We send the master's view of the
// current task state since it might be useful in the future.
const TaskState& state = task->has_status_update_state()
? task->status_update_state()
: task->state();
TaskStatus* status = reconcile.add_statuses();
status->mutable_task_id()->CopyFrom(task->task_id());
status->mutable_slave_id()->CopyFrom(slave->id);
status->set_state(state);
status->set_source(TaskStatus::SOURCE_MASTER);
status->set_message("Reconciliation request");
status->set_reason(TaskStatus::REASON_RECONCILIATION);
status->set_timestamp(Clock::now().secs());
}
}
if (reconcile.statuses_size() > 0) {
// NOTE: This function is only invoked when a slave reregisters
// with a master that previously knew about the slave and has
// not marked it unreachable. If the master has any tasks for
// the agent that are not known to the agent itself, it MUST
// have the FrameworkInfo for those tasks. This is because if a
// master has a task that the agent doesn't know about, the
// framework must have reregistered with this master since the
// last master failover.
Framework* framework = CHECK_NOTNULL(getFramework(frameworkId));
CHECK(!framework->recovered());
reconcile.mutable_framework_id()->CopyFrom(frameworkId);
reconcile.mutable_framework()->CopyFrom(framework->info);
reregistered.add_reconciliations()->CopyFrom(reconcile);
}
}
// Re-register the slave.
send(slave->pid, reregistered);
// Likewise, any executors that are present in the master but
// not present in the slave must be removed to correctly account
// for resources. First we index the executors for fast lookup below.
multihashmap<FrameworkID, ExecutorID> slaveExecutors;
foreach (const ExecutorInfo& executor, executors) {
// Master validates that `framework_id` is set during task launch.
CHECK(executor.has_framework_id());
slaveExecutors.put(executor.framework_id(), executor.executor_id());
}
// Now that we have the index for lookup, remove all the executors
// in the master that are not known to the slave.
//
// NOTE: A copy is needed because removeExecutor modifies
// slave->executors.
foreachkey (const FrameworkID& frameworkId, utils::copy(slave->executors)) {
foreachkey (const ExecutorID& executorId,
utils::copy(slave->executors[frameworkId])) {
if (!slaveExecutors.contains(frameworkId, executorId)) {
// TODO(bmahler): Reconcile executors correctly between the
// master and the slave, see:
// MESOS-1466, MESOS-1800, and MESOS-1720.
LOG(WARNING) << "Executor '" << executorId
<< "' of framework " << frameworkId
<< " possibly unknown to the agent " << *slave;
removeExecutor(slave, frameworkId, executorId);
}
}
}
// Send KillTaskMessages for tasks in 'killedTasks' that are
// still alive on the slave. This could happen if the slave
// did not receive KillTaskMessage because of a partition or
// disconnection.
foreach (const Task& task, tasks) {
if (!protobuf::isTerminalState(task.state()) &&
slave->killedTasks.contains(task.framework_id(), task.task_id())) {
LOG(WARNING) << " Agent " << *slave
<< " has non-terminal task " << task.task_id()
<< " that is supposed to be killed. Killing it now!";
KillTaskMessage message;
message.mutable_framework_id()->MergeFrom(task.framework_id());
message.mutable_task_id()->MergeFrom(task.task_id());
send(slave->pid, message);
}
}
// Send ShutdownFrameworkMessages for frameworks that are completed.
// This could happen if the message wasn't received by the slave
// (e.g., slave was down, partitioned).
//
// NOTE: This is a short-term hack because this information is lost
// when the master fails over. Also, we only store a limited number
// of completed frameworks.
//
// TODO(vinod): Revisit this when registrar is in place. It would
// likely involve storing this information in the registrar.
foreachvalue (const Owned<Framework>& framework,
frameworks.completed) {
if (slaveTasks.contains(framework->id())) {
LOG(WARNING) << "Agent " << *slave
<< " reregistered with completed framework " << *framework
<< ". Shutting down the framework on the agent";
ShutdownFrameworkMessage message;
message.mutable_framework_id()->MergeFrom(framework->id());
send(slave->pid, message);
}
}
}
void Master::addFramework(
Framework* framework,
const set<string>& suppressedRoles)
{
CHECK_NOTNULL(framework);
CHECK(!frameworks.registered.contains(framework->id()))
<< "Framework " << *framework << " already exists!";
LOG(INFO) << "Adding framework " << *framework << " with roles "
<< stringify(suppressedRoles) << " suppressed";
frameworks.registered[framework->id()] = framework;
if (framework->connected()) {
if (framework->pid.isSome()) {
link(framework->pid.get());
} else {
CHECK_SOME(framework->http);
const HttpConnection& http = framework->http.get();
http.closed()
.onAny(defer(self(), &Self::exited, framework->id(), http));
}
}
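// NOTE: `link` asks libprocess to deliver an `ExitedEvent` if the
// scheduler's process terminates or its connection breaks, which
// is how the master learns about PID-based framework disconnections.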
// There should be no offered resources yet!
CHECK_EQ(Resources(), framework->totalOfferedResources);
allocator->addFramework(
framework->id(),
framework->info,
framework->usedResources,
framework->active(),
suppressedRoles);
// Export framework metrics if a principal is specified in `FrameworkInfo`.
Option<string> principal = framework->info.has_principal()
? Option<string>(framework->info.principal())
: None();
if (framework->pid.isSome()) {
CHECK(!frameworks.principals.contains(framework->pid.get()));
frameworks.principals.put(framework->pid.get(), principal);
}
if (principal.isSome()) {
// Create new framework metrics if this framework is the first
// one of this principal. Otherwise existing metrics are reused.
if (!metrics->frameworks.contains(principal.get())) {
metrics->frameworks.put(
principal.get(),
Owned<Metrics::Frameworks>(
new Metrics::Frameworks(principal.get())));
}
}
}
void Master::recoverFramework(
const FrameworkInfo& info,
const set<string>& suppressedRoles)
{
CHECK(!frameworks.registered.contains(info.id()));
Framework* framework = new Framework(this, flags, info);
// Send a `FRAMEWORK_ADDED` event to subscribers before adding recovered
// tasks, so that the framework ID referred to by any subsequent
// `TASK_ADDED` event is already known to subscribers.
if (!subscribers.subscribed.empty()) {
subscribers.send(protobuf::master::event::createFrameworkAdded(*framework));
}
// Add active operations, tasks, and executors to the framework.
foreachvalue (Slave* slave, slaves.registered) {
if (slave->tasks.contains(framework->id())) {
foreachvalue (Task* task, slave->tasks.at(framework->id())) {
framework->addTask(task);
}
}
if (slave->executors.contains(framework->id())) {
foreachvalue (const ExecutorInfo& executor,
slave->executors.at(framework->id())) {
framework->addExecutor(slave->id, executor);
}
}
foreachvalue (Operation* operation, slave->operations) {
if (operation->has_framework_id() &&
operation->framework_id() == framework->id()) {
framework->addOperation(operation);
}
}
foreachvalue (const Slave::ResourceProvider& resourceProvider,
slave->resourceProviders) {
foreachvalue (Operation* operation, resourceProvider.operations) {
if (operation->has_framework_id() &&
operation->framework_id() == framework->id()) {
framework->addOperation(operation);
}
}
}
}
addFramework(framework, suppressedRoles);
}
Try<Nothing> Master::activateRecoveredFramework(
Framework* framework,
const FrameworkInfo& frameworkInfo,
const Option<UPID>& pid,
const Option<HttpConnection>& http,
const set<string>& suppressedRoles)
{
// Exactly one of `pid` or `http` must be provided.
CHECK(pid.isSome() != http.isSome());
CHECK_NOTNULL(framework);
CHECK(framework->recovered());
CHECK(framework->offers.empty());
CHECK(framework->inverseOffers.empty());
CHECK(framework->pid.isNone());
CHECK(framework->http.isNone());
updateFramework(framework, frameworkInfo, suppressedRoles);
// Updating `registeredTime` here is debatable: ideally,
// `registeredTime` would be the time at which the framework first
// registered with the master. However, we cannot determine this
// because the time at which a framework first registered is not
// persisted across master failover.
framework->registeredTime = Clock::now();
framework->reregisteredTime = Clock::now();
// Update the framework's connection state.
if (pid.isSome()) {
framework->updateConnection(pid.get());
link(pid.get());
} else {
framework->updateConnection(http.get());
http->closed()
.onAny(defer(self(), &Self::exited, framework->id(), http.get()));
}
// Activate the framework.
framework->setFrameworkState(Framework::State::ACTIVE);
allocator->activateFramework(framework->id());
// Export framework metrics if a principal is specified in `FrameworkInfo`.
Option<string> principal = framework->info.has_principal()
? Option<string>(framework->info.principal())
: None();
if (framework->pid.isSome()) {
CHECK(!frameworks.principals.contains(framework->pid.get()));
frameworks.principals.put(framework->pid.get(), principal);
}
// We expect the framework metrics for this principal to be created
// when the framework is recovered. This implies that the framework
// principal cannot change on re-registration, which is currently
// the case (MESOS-2842).
if (principal.isSome()) {
CHECK(metrics->frameworks.contains(principal.get()));
}
if (pid.isSome()) {
// TODO(bmahler): Per the API contract we have to send a
// registered (rather than reregistered) message here for the
// reregistering framework. Sending a reregistered message
// instead (per MESOS-786) requires a deprecation cycle, or it
// will break frameworks.
FrameworkRegisteredMessage message;
message.mutable_framework_id()->MergeFrom(framework->id());
message.mutable_master_info()->MergeFrom(info_);
framework->send(message);
} else {
FrameworkReregisteredMessage message;
message.mutable_framework_id()->MergeFrom(framework->id());
message.mutable_master_info()->MergeFrom(info_);
framework->send(message);
// Start the heartbeat after sending SUBSCRIBED event.
framework->heartbeat();
}
return Nothing();
}
void Master::failoverFramework(Framework* framework, const HttpConnection& http)
{
CHECK_NOTNULL(framework);
// Notify the old connected framework that it has failed over.
// This is safe to do even if it is a retry because the framework is expected
// to close the old connection (and hence not receive any more responses)
// before sending subscription request on a new connection.
if (framework->connected()) {
FrameworkErrorMessage message;
message.set_message("Framework failed over");
framework->send(message);
}
// If the framework is failing over from a PID-based to an
// HTTP-based scheduler, clear the authentication-related data.
if (framework->pid.isSome()) {
authenticated.erase(framework->pid.get());
CHECK(frameworks.principals.contains(framework->pid.get()));
Option<string> principal = frameworks.principals[framework->pid.get()];
frameworks.principals.erase(framework->pid.get());
}
framework->updateConnection(http);
http.closed()
.onAny(defer(self(), &Self::exited, framework->id(), http));
_failoverFramework(framework);
// Start the heartbeat after sending SUBSCRIBED event.
framework->heartbeat();
}
// Replace the scheduler for a framework with a new process ID, in the
// event of a scheduler failover.
void Master::failoverFramework(Framework* framework, const UPID& newPid)
{
CHECK_NOTNULL(framework);
const Option<UPID> oldPid = framework->pid;
// There are a few failover cases to consider:
// 1. The pid has changed, or it was previously an HTTP-based scheduler.
// In these cases we definitely want to send a FrameworkErrorMessage to
// shut down the older scheduler.
// 2. The pid has not changed.
// 2.1 The old scheduler on that pid failed over to a new
// instance on the same pid. No need to shut down the old
// scheduler as it is necessarily dead.
// 2.2 This is a duplicate message. In this case, the scheduler
// has not failed over, so we do not want to shut it down.
if (oldPid != newPid && framework->connected()) {
FrameworkErrorMessage message;
message.set_message("Framework failed over");
framework->send(message);
}
framework->updateConnection(newPid);
link(newPid);
_failoverFramework(framework);
CHECK_SOME(framework->pid);
// Update the principal mapping for this framework, which is
// needed to keep the per-principal framework metrics accurate.
if (oldPid.isSome() && frameworks.principals.contains(oldPid.get())) {
frameworks.principals.erase(oldPid.get());
}
frameworks.principals[newPid] = authenticated.get(newPid);
}
void Master::_failoverFramework(Framework* framework)
{
// Remove the framework's offers (if they weren't removed before).
foreach (Offer* offer, utils::copy(framework->offers)) {
allocator->recoverResources(
offer->framework_id(), offer->slave_id(), offer->resources(), None());
removeOffer(offer);
}
// Also remove the inverse offers.
foreach (InverseOffer* inverseOffer, utils::copy(framework->inverseOffers)) {
allocator->updateInverseOffer(
inverseOffer->slave_id(),
inverseOffer->framework_id(),
UnavailableResources{
inverseOffer->resources(),
inverseOffer->unavailability()},
None());
removeInverseOffer(inverseOffer);
}
CHECK(!framework->recovered());
// Reactivate the framework, if needed.
// NOTE: We do this after recovering resources (above) so that
// the allocator has the correct view of the framework's share.
if (!framework->active()) {
framework->setFrameworkState(Framework::State::ACTIVE);
allocator->activateFramework(framework->id());
}
// The scheduler driver safely ignores any duplicate registration
// messages, so we don't need to compare the old and new pids here.
FrameworkRegisteredMessage message;
message.mutable_framework_id()->MergeFrom(framework->id());
message.mutable_master_info()->MergeFrom(info_);
framework->send(message);
}
void Master::teardown(Framework* framework)
{
CHECK_NOTNULL(framework);
LOG(INFO) << "Processing TEARDOWN call for framework " << *framework;
++metrics->messages_unregister_framework;
removeFramework(framework);
}
void Master::removeFramework(Framework* framework)
{
CHECK_NOTNULL(framework);
LOG(INFO) << "Removing framework " << *framework;
if (framework->active()) {
// Deactivate framework, but don't bother rescinding offers
// because the framework is being removed.
deactivate(framework, false);
}
// The framework's offers should have been removed when the
// framework was deactivated.
CHECK(framework->offers.empty());
CHECK(framework->inverseOffers.empty());
foreachvalue (Slave* slave, slaves.registered) {
// Remove the pending tasks from the slave.
slave->pendingTasks.erase(framework->id());
// Tell slaves to shutdown the framework.
ShutdownFrameworkMessage message;
message.mutable_framework_id()->MergeFrom(framework->id());
send(slave->pid, message);
}
// Remove the pending tasks from the framework.
framework->pendingTasks.clear();
// Remove pointers to the framework's tasks in slaves and mark those
// tasks as completed.
foreachvalue (Task* task, utils::copy(framework->tasks)) {
Slave* slave = slaves.registered.get(task->slave_id());
// Since we only find out about tasks when the slave reregisters,
// it must be the case that the slave exists!
CHECK(slave != nullptr)
<< "Unknown agent " << task->slave_id()
<< " for task " << task->task_id();
// The task is implicitly killed, and TASK_KILLED is the closest
// state we have for it. We mark the task and remove it without
// sending the update. However, a task may finish during the
// executor's graceful shutdown period. By marking such a task as
// killed and moving it to completed, we lose the opportunity to
// collect its possible finished status. We tolerate this because
// we expect that if the framework has been asked to shut down,
// its user is no longer interested in results.
//
// TODO(alex): Consider a more descriptive state, e.g. TASK_ABANDONED.
//
// TODO(neilc): Marking the task KILLED before it has actually
// terminated is misleading. Instead, we should consider leaving
// the task in its current state at the master; if/when the agent
// shuts down the framework, we should arrange for a terminal
// status update to be delivered to the master and update the
// state of the task at that time (MESOS-6608).
const StatusUpdate& update = protobuf::createStatusUpdate(
task->framework_id(),
task->slave_id(),
task->task_id(),
TASK_KILLED,
TaskStatus::SOURCE_MASTER,
None(),
"Framework " + framework->id().value() + " removed",
TaskStatus::REASON_FRAMEWORK_REMOVED,
(task->has_executor_id()
? Option<ExecutorID>(task->executor_id())
: None()));
updateTask(task, update);
removeTask(task);
}
// Mark the framework's unreachable tasks as completed.
foreach (const TaskID& taskId, framework->unreachableTasks.keys()) {
const Owned<Task>& task = framework->unreachableTasks.at(taskId);
// TODO(neilc): Per comment above, using TASK_KILLED here is not
// ideal. It would be better to use TASK_UNREACHABLE here and only
// transition it to a terminal state when the agent reregisters
// and the task is shutdown (MESOS-6608).
const StatusUpdate& update = protobuf::createStatusUpdate(
task->framework_id(),
task->slave_id(),
task->task_id(),
TASK_KILLED,
TaskStatus::SOURCE_MASTER,
None(),
"Framework " + framework->id().value() + " removed",
TaskStatus::REASON_FRAMEWORK_REMOVED,
(task->has_executor_id()
? Option<ExecutorID>(task->executor_id())
: None()));
updateTask(task.get(), update);
// We don't need to remove the task from the slave, because the
// task was removed when the agent was marked unreachable.
CHECK(!slaves.registered.contains(task->slave_id()))
<< "Unreachable task " << task->task_id()
<< " of framework " << task->framework_id()
<< " was found on registered agent " << task->slave_id();
// Move task from unreachable map to completed map.
framework->addCompletedTask(std::move(*task));
framework->unreachableTasks.erase(taskId);
}
// Remove the framework's executors for correct resource accounting.
foreachkey (const SlaveID& slaveId, utils::copy(framework->executors)) {
Slave* slave = slaves.registered.get(slaveId);
if (slave != nullptr) {
foreachkey (const ExecutorID& executorId,
utils::copy(framework->executors[slaveId])) {
removeExecutor(slave, framework->id(), executorId);
}
}
}
foreachvalue (Operation* operation, utils::copy(framework->operations)) {
framework->removeOperation(operation);
}
// TODO(benh): Similar code between removeFramework and
// failoverFramework needs to be shared!
// TODO(benh): unlink(framework->pid);
// For http frameworks, close the connection.
if (framework->http.isSome()) {
framework->http->close();
}
framework->unregisteredTime = Clock::now();
foreach (const string& role, framework->roles) {
framework->untrackUnderRole(role);
}
// TODO(anand): This only works for pid based frameworks. We would
// need similar authentication logic for http frameworks.
if (framework->pid.isSome()) {
authenticated.erase(framework->pid.get());
CHECK(frameworks.principals.contains(framework->pid.get()));
Option<string> principal = frameworks.principals[framework->pid.get()];
frameworks.principals.erase(framework->pid.get());
// Remove the metrics for the principal if this framework is the
// last one with this principal.
if (principal.isSome() &&
!frameworks.principals.containsValue(principal.get())) {
CHECK(metrics->frameworks.contains(principal.get()));
metrics->frameworks.erase(principal.get());
}
}
// Remove the framework.
frameworks.registered.erase(framework->id());
allocator->removeFramework(framework->id());
// The framework pointer is now owned by `frameworks.completed`.
frameworks.completed.set(framework->id(), Owned<Framework>(framework));
if (!subscribers.subscribed.empty()) {
subscribers.send(
protobuf::master::event::createFrameworkRemoved(framework->info));
}
}
void Master::removeFramework(Slave* slave, Framework* framework)
{
CHECK_NOTNULL(slave);
CHECK_NOTNULL(framework);
LOG(INFO) << "Removing framework " << *framework
<< " from agent " << *slave;
// Remove pointers to framework's tasks in slaves, and send status
// updates.
// NOTE: A copy is needed because removeTask modifies slave->tasks.
foreachvalue (Task* task, utils::copy(slave->tasks[framework->id()])) {
// Remove tasks that belong to this framework.
if (task->framework_id() == framework->id()) {
// A framework might not actually exist because the master failed
// over and the framework hasn't reconnected yet. For more info
// please see the comments in 'removeFramework(Framework*)'.
const StatusUpdate& update = protobuf::createStatusUpdate(
task->framework_id(),
task->slave_id(),
task->task_id(),
TASK_LOST,
TaskStatus::SOURCE_MASTER,
None(),
"Agent " + slave->info.hostname() + " disconnected",
TaskStatus::REASON_SLAVE_DISCONNECTED,
(task->has_executor_id()
? Option<ExecutorID>(task->executor_id()) : None()));
updateTask(task, update);
removeTask(task);
if (framework->connected()) {
forward(update, UPID(), framework);
}
}
}
// Remove the framework's executors from the slave and framework
// for proper resource accounting.
if (slave->executors.contains(framework->id())) {
foreachkey (const ExecutorID& executorId,
utils::copy(slave->executors[framework->id()])) {
removeExecutor(slave, framework->id(), executorId);
}
}
}
void Master::addSlave(
Slave* slave,
vector<Archive::Framework>&& completedFrameworks)
{
CHECK_NOTNULL(slave);
CHECK(!slaves.registered.contains(slave->id));
CHECK(!slaves.unreachable.contains(slave->id));
CHECK(slaves.removed.get(slave->id).isNone());
slaves.registered.put(slave);
link(slave->pid);
// Map the slave to the machine it is running on.
CHECK(!machines[slave->machineId].slaves.contains(slave->id));
machines[slave->machineId].slaves.insert(slave->id);
// Set up an observer for the slave.
slave->observer = new SlaveObserver(
slave->pid,
slave->info,
slave->id,
self(),
slaves.limiter,
metrics,
flags.agent_ping_timeout,
flags.max_agent_ping_timeouts);
spawn(slave->observer);
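// The observer health-checks the agent via pings; roughly, if
// `max_agent_ping_timeouts` consecutive pings go unanswered it
// initiates marking the agent unreachable (subject to rate
// limiting via `slaves.limiter`).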
// Add the slave's executors to the frameworks.
foreachkey (const FrameworkID& frameworkId, slave->executors) {
Framework* framework = getFramework(frameworkId);
// If the framework has not reregistered yet and this is the
// first agent to reregister that is running the framework, we
// skip adding the framework's executors here. Instead, the
// framework will be recovered in `__reregisterSlave` and its
// executors will be added by `recoverFramework`.
if (framework == nullptr) {
continue;
}
foreachvalue (const ExecutorInfo& executorInfo,
slave->executors[frameworkId]) {
framework->addExecutor(slave->id, executorInfo);
}
}
// Add the slave's tasks to the frameworks.
foreachkey (const FrameworkID& frameworkId, slave->tasks) {
Framework* framework = getFramework(frameworkId);
// If the framework has not reregistered yet and this is the
// first agent to reregister that is running the framework, we
// skip adding the framework's tasks here. Instead, the framework
// will be recovered in `__reregisterSlave` and its tasks will be
// added by `recoverFramework`.
if (framework == nullptr) {
continue;
}
foreachvalue (Task* task, slave->tasks[frameworkId]) {
framework->addTask(task);
}
}
// Re-add completed tasks reported by the slave.
//
// Note that a slave considers a framework completed when it has no
// tasks/executors running for that framework. But a master
// considers a framework completed when the framework is removed
// after a failover timeout.
//
// TODO(vinod): Reconcile the notion of a completed framework across
// the master and slave.
foreach (Archive::Framework& completedFramework, completedFrameworks) {
Framework* framework = getFramework(
completedFramework.framework_info().id());
foreach (Task& task, *completedFramework.mutable_tasks()) {
if (framework != nullptr) {
VLOG(2) << "Re-adding completed task " << task.task_id()
<< " of framework " << *framework
<< " that ran on agent " << *slave;
framework->addCompletedTask(std::move(task));
} else {
// The framework might not be reregistered yet.
//
// TODO(vinod): Revisit these semantics when we store frameworks'
// information in the registrar.
LOG(WARNING) << "Possibly orphaned completed task " << task.task_id()
<< " of framework " << task.framework_id()
<< " that ran on agent " << *slave;
}
}
}
CHECK(machines.contains(slave->machineId));
// Only set unavailability if the protobuf has one set.
Option<Unavailability> unavailability = None();
if (machines[slave->machineId].info.has_unavailability()) {
unavailability = machines[slave->machineId].info.unavailability();
}
allocator->addSlave(
slave->id,
slave->info,
google::protobuf::convert(slave->capabilities.toRepeatedPtrField()),
unavailability,
slave->totalResources,
slave->usedResources);
if (!subscribers.subscribed.empty()) {
subscribers.send(protobuf::master::event::createAgentAdded(*slave));
}
}
void Master::removeSlave(
Slave* slave,
const string& message,
Option<Counter> reason)
{
CHECK_NOTNULL(slave);
// It would be better to remove the slave here instead of continuing
// to mark it unreachable, but probably not worth the complexity.
if (slaves.markingUnreachable.contains(slave->id)) {
LOG(WARNING) << "Ignoring removal of agent " << *slave
<< " that is in the process of being marked unreachable";
return;
}
if (slaves.markingGone.contains(slave->id)) {
LOG(WARNING) << "Ignoring removal of agent " << *slave
<< " that is in the process of being marked gone";
return;
}
// This should not be possible, but we protect against it anyway for
// the sake of paranoia.
if (slaves.removing.contains(slave->id)) {
LOG(WARNING) << "Ignoring removal of agent " << *slave
<< " that is in the process of being removed";
return;
}
slaves.removing.insert(slave->id);
LOG(INFO) << "Removing agent " << *slave << ": " << message;
// Remove this slave from the registrar. Note that we update the
// registry BEFORE we update the master's in-memory state; this
// means that until the registry operation has completed, the slave
// is not considered to be removed (so we might offer its resources
// to frameworks, etc.). Ensuring that the registry update succeeds
// before we modify in-memory state ensures that external clients
// see consistent behavior if the master fails over.
registrar->apply(Owned<RegistryOperation>(new RemoveSlave(slave->info)))
.onAny(defer(self(),
&Self::_removeSlave,
slave,
lambda::_1,
message,
reason));
}
void Master::_removeSlave(
Slave* slave,
const Future<bool>& registrarResult,
const string& removalCause,
Option<Counter> reason)
{
CHECK_NOTNULL(slave);
CHECK(slaves.removing.contains(slave->info.id()));
slaves.removing.erase(slave->info.id());
CHECK(!registrarResult.isDiscarded());
if (registrarResult.isFailed()) {
LOG(FATAL) << "Failed to remove agent " << *slave
<< " from the registrar: " << registrarResult.failure();
}
// Should not happen: the master will only try to remove agents that
// are currently admitted.
CHECK(registrarResult.get())
<< "Agent " << *slave
<< "already removed from the registrar";
LOG(INFO) << "Removed agent " << *slave << ": " << removalCause;
++metrics->slave_removals;
if (reason.isSome()) {
++utils::copy(reason.get()); // Copy to strip const; counters share state.
}
// We want to remove the slave first, to avoid the allocator
// re-allocating the recovered resources.
//
// NOTE: Removing the slave is not sufficient for recovering the
// resources in the allocator, because the "Sorters" are updated
// only within recoverResources() (see MESOS-621). The calls to
// recoverResources() below are therefore required, even though
// the slave is already removed.
allocator->removeSlave(slave->id);
// Transition the tasks to lost and remove them.
foreachkey (const FrameworkID& frameworkId, utils::copy(slave->tasks)) {
Framework* framework = getFramework(frameworkId);
foreachvalue (Task* task, utils::copy(slave->tasks[frameworkId])) {
// TODO(bmahler): Differentiate between agent removal reasons
// (e.g. unhealthy vs. unregistered for maintenance).
const StatusUpdate& update = protobuf::createStatusUpdate(
task->framework_id(),
task->slave_id(),
task->task_id(),
TASK_LOST,
TaskStatus::SOURCE_MASTER,
None(),
"Agent " + slave->info.hostname() + " removed: " + removalCause,
TaskStatus::REASON_SLAVE_REMOVED,
(task->has_executor_id() ?
Option<ExecutorID>(task->executor_id()) : None()));
updateTask(task, update);
removeTask(task);
if (framework == nullptr || !framework->connected()) {
LOG(WARNING) << "Dropping update " << update
<< " for unknown framework " << frameworkId;
} else {
forward(update, UPID(), framework);
}
}
}
// Remove executors from the slave for proper resource accounting.
foreachkey (const FrameworkID& frameworkId, utils::copy(slave->executors)) {
foreachkey (const ExecutorID& executorId,
utils::copy(slave->executors[frameworkId])) {
removeExecutor(slave, frameworkId, executorId);
}
}
foreach (Offer* offer, utils::copy(slave->offers)) {
// TODO(vinod): We don't need to call 'Allocator::recoverResources'
// once MESOS-621 is fixed.
allocator->recoverResources(
offer->framework_id(), slave->id, offer->resources(), None());
// Remove and rescind offers.
removeOffer(offer, true); // Rescind!
}
// Remove inverse offers because sending them for a slave that is
// gone doesn't make sense.
foreach (InverseOffer* inverseOffer, utils::copy(slave->inverseOffers)) {
// We don't need to update the allocator because we've already called
// `RemoveSlave()`.
// Remove and rescind inverse offers.
removeInverseOffer(inverseOffer, true); // Rescind!
}
// Remove the pending tasks from the slave.
slave->pendingTasks.clear();
// Mark the slave as being removed.
slaves.registered.remove(slave);
slaves.removed.put(slave->id, Nothing());
authenticated.erase(slave->pid);
// Remove the slave from the `machines` mapping.
CHECK(machines.contains(slave->machineId));
CHECK(machines[slave->machineId].slaves.contains(slave->id));
machines[slave->machineId].slaves.erase(slave->id);
// Kill the slave observer.
terminate(slave->observer);
wait(slave->observer);
delete slave->observer;
// TODO(benh): unlink(slave->pid);
sendSlaveLost(slave->info);
if (!subscribers.subscribed.empty()) {
subscribers.send(protobuf::master::event::createAgentRemoved(slave->id));
}
delete slave;
}
void Master::__removeSlave(
Slave* slave,
const string& message,
const Option<TimeInfo>& unreachableTime)
{
// We want to remove the slave first, to avoid the allocator
// re-allocating the recovered resources.
//
// NOTE: Removing the slave is not sufficient for recovering the
// resources in the allocator, because the "Sorters" are updated
// only within recoverResources() (see MESOS-621). The calls to
// recoverResources() below are therefore required, even though
// the slave is already removed.
allocator->removeSlave(slave->id);
// Transition tasks to TASK_UNREACHABLE/TASK_GONE_BY_OPERATOR/TASK_LOST
// and remove them. We only use TASK_UNREACHABLE/TASK_GONE_BY_OPERATOR if
// the framework has opted in to the PARTITION_AWARE capability.
foreachkey (const FrameworkID& frameworkId, utils::copy(slave->tasks)) {
Framework* framework = getFramework(frameworkId);
CHECK_NOTNULL(framework);
TaskState newTaskState = TASK_UNREACHABLE;
TaskStatus::Reason newTaskReason = TaskStatus::REASON_SLAVE_REMOVED;
// Needed to convey task unreachability because we lose this
// information from the task state if `TASK_LOST` is used.
bool unreachable = true;
if (!framework->capabilities.partitionAware) {
newTaskState = TASK_LOST;
} else if (unreachableTime.isNone()) {
unreachable = false;
newTaskState = TASK_GONE_BY_OPERATOR;
newTaskReason = TaskStatus::REASON_SLAVE_REMOVED_BY_OPERATOR;
}
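// To summarize the cases above:
//   non-PARTITION_AWARE framework        -> TASK_LOST
//   PARTITION_AWARE, agent unreachable   -> TASK_UNREACHABLE
//   PARTITION_AWARE, agent marked gone   -> TASK_GONE_BY_OPERATOR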
foreachvalue (Task* task, utils::copy(slave->tasks[frameworkId])) {
const StatusUpdate& update = protobuf::createStatusUpdate(
task->framework_id(),
task->slave_id(),
task->task_id(),
newTaskState,
TaskStatus::SOURCE_MASTER,
None(),
message,
newTaskReason,
(task->has_executor_id() ?
Option<ExecutorID>(task->executor_id()) : None()),
None(),
None(),
None(),
None(),
unreachableTime.isSome() ? unreachableTime : None());
updateTask(task, update);
removeTask(task, unreachable);
if (!framework->connected()) {
LOG(WARNING) << "Dropping update " << update
<< " for disconnected "
<< " framework " << frameworkId;
} else {
forward(update, UPID(), framework);
}
}
}
// Remove executors from the slave for proper resource accounting.
foreachkey (const FrameworkID& frameworkId, utils::copy(slave->executors)) {
foreachkey (const ExecutorID& executorId,
utils::copy(slave->executors[frameworkId])) {
removeExecutor(slave, frameworkId, executorId);
}
}
foreach (Offer* offer, utils::copy(slave->offers)) {
// TODO(vinod): We don't need to call 'Allocator::recoverResources'
// once MESOS-621 is fixed.
allocator->recoverResources(
offer->framework_id(), slave->id, offer->resources(), None());
// Remove and rescind offers.
removeOffer(offer, true); // Rescind!
}
// Remove inverse offers because sending them for a slave that is
// unreachable doesn't make sense.
foreach (InverseOffer* inverseOffer, utils::copy(slave->inverseOffers)) {
// We don't need to update the allocator because we've already called
// `RemoveSlave()`.
// Remove and rescind inverse offers.
removeInverseOffer(inverseOffer, true); // Rescind!
}
// Mark the slave as being removed.
slaves.registered.remove(slave);
slaves.removed.put(slave->id, Nothing());
authenticated.erase(slave->pid);
// Remove the slave from the `machines` mapping.
CHECK(machines.contains(slave->machineId));
CHECK(machines[slave->machineId].slaves.contains(slave->id));
machines[slave->machineId].slaves.erase(slave->id);
// Kill the slave observer.
terminate(slave->observer);
wait(slave->observer);
delete slave->observer;
// TODO(benh): unlink(slave->pid);
// TODO(bmahler): Tell partition aware frameworks that the
// agent is unreachable rather than lost, if applicable.
// This requires a new capability.
sendSlaveLost(slave->info);
delete slave;
}
void Master::updateTask(Task* task, const StatusUpdate& update)
{
CHECK_NOTNULL(task);
// Get the unacknowledged status.
const TaskStatus& status = update.status();
// NOTE: Refer to comments on `StatusUpdate` message in messages.proto for
// the difference between `update.latest_state()` and `status.state()`.
// Updates from the slave have 'latest_state' set.
Option<TaskState> latestState;
if (update.has_latest_state()) {
latestState = update.latest_state();
}
const TaskState updateState = latestState.getOrElse(status.state());
// Determine whether the task transitioned to terminal or
// unreachable prior to changing the task state.
auto isTerminalOrUnreachableState = [](const TaskState& state) {
return protobuf::isTerminalState(state) || state == TASK_UNREACHABLE;
};
bool transitionedToTerminalOrUnreachable =
!isTerminalOrUnreachableState(task->state()) &&
isTerminalOrUnreachableState(updateState);
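// For example, TASK_RUNNING -> TASK_FINISHED (or TASK_RUNNING ->
// TASK_UNREACHABLE) sets this to true, while a duplicate
// TASK_FINISHED -> TASK_FINISHED update leaves it false.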
// Indicates whether we should send a notification to subscribers,
// set if the task transitioned to a new state.
bool sendSubscribersUpdate = false;
Framework* framework = getFramework(task->framework_id());
// If the task has already transitioned to a terminal state,
// do not update its state. Note that we are being defensive
// here because this should not happen unless there is a bug
// in the master code.
//
// TODO(bmahler): Check that we're not transitioning from
// TASK_UNREACHABLE to another state.
if (!protobuf::isTerminalState(task->state())) {
if (task->state() != updateState && framework != nullptr) {
// When we observe a transition away from a non-terminal state,
// decrement the relevant metric.
framework->metrics.decrementActiveTaskState(task->state());
framework->metrics.incrementTaskState(updateState);
}
task->set_state(updateState);
}
// If this is a (health) check status update, always forward it to
// subscribers.
if (status.reason() == TaskStatus::REASON_TASK_CHECK_STATUS_UPDATED ||
status.reason() == TaskStatus::REASON_TASK_HEALTH_CHECK_STATUS_UPDATED) {
sendSubscribersUpdate = true;
}
// TODO(brenden): Consider wiping the `message` field?
if (task->statuses_size() > 0 &&
task->statuses(task->statuses_size() - 1).state() == status.state()) {
task->mutable_statuses()->RemoveLast();
} else {
// Send a `TASK_UPDATED` event for every new task state.
sendSubscribersUpdate = true;
}
task->add_statuses()->CopyFrom(status);
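// NOTE: The net effect of the logic above is that consecutive
// statuses with the same state are collapsed, keeping only the
// most recent status for each run of identical states.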
// Delete the `data` field of the new status (it may be very large,
// since it is set by the framework) because keeping it could OOM
// the master. For example: if mesos-master runs on a machine with
// 4GB of free memory and every task stores 10MB of data in its
// TaskStatus, the OOM killer would kill mesos-master after only
// 400 tasks have finished. See MESOS-1746.
task->mutable_statuses(task->statuses_size() - 1)->clear_data();
if (sendSubscribersUpdate && !subscribers.subscribed.empty()) {
// If the framework has been removed, the task would have already
// transitioned to `TASK_KILLED` by `removeFramework()`, thus
// `sendSubscribersUpdate` shouldn't have been set to true.
// TODO(chhsiao): This may be changed after MESOS-6608 is resolved.
CHECK_NOTNULL(framework);
subscribers.send(
protobuf::master::event::createTaskUpdated(
*task, task->state(), status),
framework->info,
*task);
}
LOG(INFO) << "Updating the state of task " << task->task_id()
<< " of framework " << task->framework_id()
<< " (latest state: " << task->state()
<< ", status update state: " << status.state() << ")";
// Once the task transitioned to terminal or unreachable,
// recover the resources.
if (transitionedToTerminalOrUnreachable) {
allocator->recoverResources(
task->framework_id(),
task->slave_id(),
task->resources(),
None());
// The slave owns the Task object and cannot be nullptr.
Slave* slave = slaves.registered.get(task->slave_id());
CHECK_NOTNULL(slave);
slave->recoverResources(task);
if (framework != nullptr) {
framework->recoverResources(task);
}
switch (status.state()) {
case TASK_FINISHED: ++metrics->tasks_finished; break;
case TASK_FAILED: ++metrics->tasks_failed; break;
case TASK_KILLED: ++metrics->tasks_killed; break;
case TASK_LOST: ++metrics->tasks_lost; break;
case TASK_ERROR: ++metrics->tasks_error; break;
case TASK_DROPPED: ++metrics->tasks_dropped; break;
case TASK_GONE: ++metrics->tasks_gone; break;
case TASK_GONE_BY_OPERATOR: ++metrics->tasks_gone_by_operator; break;
// The following are non-terminal and use gauge-based metrics.
case TASK_STARTING: break;
case TASK_STAGING: break;
case TASK_RUNNING: break;
case TASK_KILLING: break;
case TASK_UNREACHABLE: break;
// Should not happen.
case TASK_UNKNOWN:
LOG(FATAL) << "Unexpected TASK_UNKNOWN for in-memory task";
break;
}
if (status.has_reason()) {
metrics->incrementTasksStates(
status.state(),
status.source(),
status.reason());
}
}
}
void Master::removeTask(Task* task, bool unreachable)
{
CHECK_NOTNULL(task);
// The slave owns the Task object and cannot be nullptr.
Slave* slave = slaves.registered.get(task->slave_id());
CHECK_NOTNULL(slave);
// Note that we explicitly convert from protobuf to `Resources` once
// here and reuse the result below, to avoid the performance penalty
// of repeated conversion and validation.
// The conversion is safe, as the resources have already passed
// validation.
const Resources resources = task->resources();
// The invariant here is that the master will recover the resources
// prior to removing terminal or unreachable tasks. If the task is
// not terminal or unreachable, we must recover the resources here.
//
// TODO(bmahler): Currently, only `Master::finalize()` will call
// `removeTask()` with a non-terminal task. Consider fixing this
// and instead CHECKing here to simplify the logic.
if (!protobuf::isTerminalState(task->state()) &&
task->state() != TASK_UNREACHABLE) {
CHECK(!unreachable) << task->task_id();
// Note that we use `Resources` for output as it's faster than
// logging raw protobuf data.
LOG(WARNING) << "Removing task " << task->task_id()
<< " with resources " << resources
<< " of framework " << task->framework_id()
<< " on agent " << *slave
<< " in non-terminal state " << task->state();
allocator->recoverResources(
task->framework_id(),
task->slave_id(),
resources,
None());
} else {
// Note that we use `Resources` for output as it's faster than
// logging raw protobuf data.
LOG(INFO) << "Removing task " << task->task_id()
<< " with resources " << resources
<< " of framework " << task->framework_id()
<< " on agent " << *slave;
}
if (unreachable) {
slaves.unreachableTasks[slave->id][task->framework_id()]
.push_back(task->task_id());
}
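// These task IDs are kept so the master can transition the tasks
// later, e.g. if the unreachable agent is subsequently marked gone.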
// Remove from framework.
Framework* framework = getFramework(task->framework_id());
if (framework != nullptr) { // A framework might not be reregistered yet.
framework->removeTask(task, unreachable);
}
// Remove from slave.
slave->removeTask(task);
delete task;
}
void Master::removeExecutor(
Slave* slave,
const FrameworkID& frameworkId,
const ExecutorID& executorId)
{
CHECK_NOTNULL(slave);
CHECK(slave->hasExecutor(frameworkId, executorId));
ExecutorInfo executor = slave->executors[frameworkId][executorId];
LOG(INFO) << "Removing executor '" << executorId
<< "' with resources " << executor.resources()
<< " of framework " << frameworkId << " on agent " << *slave;
allocator->recoverResources(
frameworkId, slave->id, executor.resources(), None());
Framework* framework = getFramework(frameworkId);
if (framework != nullptr) { // The framework might not be reregistered yet.
framework->removeExecutor(slave->id, executorId);
}
slave->removeExecutor(frameworkId, executorId);
}
void Master::addOperation(
Framework* framework,
Slave* slave,
Operation* operation)
{
CHECK_NOTNULL(operation);
CHECK_NOTNULL(slave);
slave->addOperation(operation);
if (framework != nullptr) {
framework->addOperation(operation);
}
}
void Master::updateOperation(
Operation* operation,
const UpdateOperationStatusMessage& update,
bool convertResources)
{
CHECK_NOTNULL(operation);
const OperationStatus& status =
update.has_latest_status() ? update.latest_status() : update.status();
LOG(INFO) << "Updating the state of operation '" << operation->info().id()
<< "' (uuid: " << update.operation_uuid() << ") for"
<< (operation->has_framework_id()
? " framework " + stringify(operation->framework_id())
: " an operator API call")
<< " (latest state: " << operation->latest_status().state()
<< ", status update state: " << status.state() << ")";
// Whether the operation has just become terminated.
const bool terminated =
!protobuf::isTerminalState(operation->latest_status().state()) &&
protobuf::isTerminalState(status.state());
// If the operation has already transitioned to a terminal state,
// do not update its state.
if (!protobuf::isTerminalState(operation->latest_status().state())) {
operation->mutable_latest_status()->CopyFrom(status);
}
// TODO(gkleiman): Revisit the de-duplication logic (MESOS-8441) - if two
// different terminal statuses arrive, we could end up with different states
// in `latest_status` and the front of statuses list.
if (operation->statuses().empty() ||
*(operation->statuses().rbegin()) != status) {
operation->add_statuses()->CopyFrom(status);
}
if (!terminated) {
return;
}
// Update resource accounting in the master and in the allocator.
// NOTE: For the "old" operations (RESERVE, UNRESERVE, CREATE,
// DESTROY), the master speculatively assumes that the operation
// will be successful when it accepts the operations. Therefore, we
// don't need to update the resource accounting for those types of
// operations in the master and in the allocator states upon
// receiving a terminal status update.
if (protobuf::isSpeculativeOperation(operation->info())) {
return;
}
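// For example, a RESERVE operation was already applied to the
// allocator's bookkeeping when the master accepted it, so its
// terminal status update requires no further accounting here.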
// We currently do not support non-speculated operations not
// triggered by a framework (e.g., over the operator API).
CHECK(operation->has_framework_id());
Try<Resources> consumed = protobuf::getConsumedResources(operation->info());
CHECK_SOME(consumed);
CHECK(operation->has_slave_id())
<< "External resource provider is not supported yet";
// The slave owns the Operation object and cannot be nullptr.
// TODO(jieyu): Revisit this once we introduce support for external
// resource provider.
Slave* slave = slaves.registered.get(operation->slave_id());
CHECK_NOTNULL(slave);
switch (operation->latest_status().state()) {
// Terminal state, and the conversion is successful.
case OPERATION_FINISHED: {
const Resources converted =
operation->latest_status().converted_resources();
if (convertResources) {
allocator->updateAllocation(
operation->framework_id(),
operation->slave_id(),
consumed.get(),
{ResourceConversion(consumed.get(), converted)});
allocator->recoverResources(
operation->framework_id(),
operation->slave_id(),
converted,
None());
Resources consumedUnallocated = consumed.get();
consumedUnallocated.unallocate();
Resources convertedUnallocated = converted;
convertedUnallocated.unallocate();
slave->apply(
{ResourceConversion(consumedUnallocated, convertedUnallocated)});
} else {
allocator->recoverResources(
operation->framework_id(),
operation->slave_id(),
consumed.get(),
None());
}
break;
}
// Terminal state, and the conversion has failed.
case OPERATION_FAILED:
case OPERATION_ERROR:
case OPERATION_DROPPED: {
allocator->recoverResources(
operation->framework_id(),
operation->slave_id(),
consumed.get(),
None());
break;
}
// Non-terminal or not expected from an agent. This shouldn't happen.
case OPERATION_UNSUPPORTED:
case OPERATION_PENDING:
case OPERATION_UNREACHABLE:
case OPERATION_GONE_BY_OPERATOR:
case OPERATION_RECOVERING:
case OPERATION_UNKNOWN: {
LOG(FATAL) << "Unexpected operation state "
<< operation->latest_status().state();
break;
}
}
slave->recoverResources(operation);
Framework* framework = getFramework(operation->framework_id());
if (framework != nullptr) {
framework->recoverResources(operation);
}
}
void Master::removeOperation(Operation* operation)
{
CHECK_NOTNULL(operation);
// Remove from framework.
Framework* framework = operation->has_framework_id()
? getFramework(operation->framework_id())
: nullptr;
if (framework != nullptr) {
framework->removeOperation(operation);
}
// Remove from slave.
CHECK(operation->has_slave_id())
<< "External resource provider is not supported yet";
Slave* slave = slaves.registered.get(operation->slave_id());
CHECK_NOTNULL(slave);
slave->removeOperation(operation);
// If the operation was not speculated and is not terminal we
// need to also recover its used resources in the allocator.
if (!protobuf::isSpeculativeOperation(operation->info()) &&
!protobuf::isTerminalState(operation->latest_status().state())) {
Try<Resources> consumed = protobuf::getConsumedResources(operation->info());
CHECK_SOME(consumed);
allocator->recoverResources(
operation->framework_id(),
operation->slave_id(),
consumed.get(),
None());
}
delete operation;
}
Future<Nothing> Master::apply(Slave* slave, const Offer::Operation& operation)
{
CHECK_NOTNULL(slave);
return allocator->updateAvailable(slave->id, {operation})
.onReady(defer(self(), &Master::_apply, slave, nullptr, operation));
}
void Master::_apply(
Slave* slave,
Framework* framework,
const Offer::Operation& operationInfo)
{
CHECK_NOTNULL(slave);
if (slave->capabilities.resourceProvider) {
Result<ResourceProviderID> resourceProviderId =
getResourceProviderId(operationInfo);
// This must have been validated by the caller.
CHECK(!resourceProviderId.isError());
CHECK(
resourceProviderId.isNone() ||
slave->resourceProviders.contains(resourceProviderId.get()))
<< "Resource provider " + stringify(resourceProviderId.get()) +
" is unknown";
CHECK_SOME(slave->resourceVersion);
const UUID resourceVersion = resourceProviderId.isNone()
? slave->resourceVersion.get()
: slave->resourceProviders.get(resourceProviderId.get())->resourceVersion;
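// The resource version lets the agent detect that an operation
// raced with a change to its (or a resource provider's) resources
// and reject it rather than apply it to stale state.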
Operation* operation = new Operation(protobuf::createOperation(
operationInfo,
protobuf::createOperationStatus(
OPERATION_PENDING,
operationInfo.has_id()
? operationInfo.id()
: Option<OperationID>::none(),
None(),
None(),
None(),
slave->id,
resourceProviderId.isSome()
? Some(resourceProviderId.get())
: Option<ResourceProviderID>::none()),
framework != nullptr ? framework->id() : Option<FrameworkID>::none(),
slave->id));
addOperation(framework, slave, operation);
if (protobuf::isSpeculativeOperation(operation->info())) {
Offer::Operation strippedOperationInfo = operation->info();
protobuf::stripAllocationInfo(&strippedOperationInfo);
Try<vector<ResourceConversion>> conversions =
getResourceConversions(strippedOperationInfo);
CHECK_SOME(conversions);
slave->apply(conversions.get());
}
ApplyOperationMessage message;
if (framework != nullptr) {
message.mutable_framework_id()->CopyFrom(framework->id());
}
message.mutable_operation_info()->CopyFrom(operation->info());
message.mutable_operation_uuid()->CopyFrom(operation->uuid());
if (resourceProviderId.isSome()) {
message.mutable_resource_version_uuid()
->mutable_resource_provider_id()
->CopyFrom(resourceProviderId.get());
}
message.mutable_resource_version_uuid()->mutable_uuid()->CopyFrom(
resourceVersion);
LOG(INFO) << "Sending operation '" << operation->info().id()
<< "' (uuid: " << operation->uuid() << ") "
<< "to agent " << *slave;
send(slave->pid, message);
} else {
if (!protobuf::isSpeculativeOperation(operationInfo)) {
LOG(FATAL) << "Unexpected operation to apply on agent " << *slave;
}
// We need to strip the allocation info from the operation's
// resources in order to apply the operation successfully
// since the agent's total is stored as unallocated resources.
Offer::Operation strippedOperationInfo = operationInfo;
protobuf::stripAllocationInfo(&strippedOperationInfo);
Try<vector<ResourceConversion>> conversions =
getResourceConversions(strippedOperationInfo);
CHECK_SOME(conversions);
slave->apply(conversions.get());
CheckpointResourcesMessage message;
message.mutable_resources()->CopyFrom(slave->checkpointedResources);
if (!slave->capabilities.reservationRefinement) {
// If the agent is not refinement-capable, don't send it
// checkpointed resources that contain refined reservations. This
// might occur if a reservation refinement is created but never
// reaches the agent (e.g., due to network partition), and then
// the agent is downgraded before the partition heals.
//
// TODO(neilc): It would probably be better to prevent the agent
// from reregistering in this scenario.
Try<Nothing> result = downgradeResources(&message);
if (result.isError()) {
LOG(WARNING) << "Not sending updated checkpointed resources "
<< slave->checkpointedResources
<< " with refined reservations, since agent " << *slave
<< " is not RESERVATION_REFINEMENT-capable.";
return;
}
}
LOG(INFO) << "Sending updated checkpointed resources "
<< slave->checkpointedResources
<< " to agent " << *slave;
send(slave->pid, message);
}
if (framework != nullptr) {
// We increment per-framework operation metrics for all operations except
// LAUNCH and LAUNCH_GROUP here.
framework->metrics.incrementOperation(operationInfo);
}
}
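// Invoked when an offer has been outstanding for longer than the offer
// timeout: the offered resources are returned to the allocator and the
// offer is rescinded from the framework.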
void Master::offerTimeout(const OfferID& offerId)
{
Offer* offer = getOffer(offerId);
if (offer != nullptr) {
allocator->recoverResources(
offer->framework_id(), offer->slave_id(), offer->resources(), None());
removeOffer(offer, true);
}
}
// TODO(vinod): Instead of 'removeOffer()', consider implementing
// 'useOffer()', 'discardOffer()' and 'rescindOffer()' for clarity.
void Master::removeOffer(Offer* offer, bool rescind)
{
// Remove from framework.
Framework* framework = getFramework(offer->framework_id());
CHECK(framework != nullptr)
<< "Unknown framework " << offer->framework_id()
<< " in the offer " << offer->id();
framework->removeOffer(offer);
// Remove from slave.
Slave* slave = slaves.registered.get(offer->slave_id());
CHECK(slave != nullptr)
<< "Unknown agent " << offer->slave_id()
<< " in the offer " << offer->id();
slave->removeOffer(offer);
if (rescind) {
RescindResourceOfferMessage message;
message.mutable_offer_id()->MergeFrom(offer->id());
framework->metrics.offers_rescinded++;
framework->send(message);
}
// Remove and cancel offer removal timers. Canceling the Timers is
// only done to avoid having too many active Timers in libprocess.
if (offerTimers.contains(offer->id())) {
Clock::cancel(offerTimers[offer->id()]);
offerTimers.erase(offer->id());
}
// Delete it.
LOG(INFO) << "Removing offer " << offer->id();
offers.erase(offer->id());
delete offer;
}
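// Invoked when an inverse offer times out: the allocator is told that no
// response was received (a `None()` status) and the inverse offer is
// rescinded from the framework.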
void Master::inverseOfferTimeout(const OfferID& inverseOfferId)
{
InverseOffer* inverseOffer = getInverseOffer(inverseOfferId);
if (inverseOffer != nullptr) {
allocator->updateInverseOffer(
inverseOffer->slave_id(),
inverseOffer->framework_id(),
UnavailableResources{
inverseOffer->resources(),
inverseOffer->unavailability()},
None());
removeInverseOffer(inverseOffer, true);
}
}
void Master::removeInverseOffer(InverseOffer* inverseOffer, bool rescind)
{
// Remove from framework.
Framework* framework = getFramework(inverseOffer->framework_id());
CHECK(framework != nullptr)
<< "Unknown framework " << inverseOffer->framework_id()
<< " in the inverse offer " << inverseOffer->id();
framework->removeInverseOffer(inverseOffer);
// Remove from slave.
Slave* slave = slaves.registered.get(inverseOffer->slave_id());
CHECK(slave != nullptr)
<< "Unknown agent " << inverseOffer->slave_id()
<< " in the inverse offer " << inverseOffer->id();
slave->removeInverseOffer(inverseOffer);
if (rescind) {
RescindInverseOfferMessage message;
message.mutable_inverse_offer_id()->CopyFrom(inverseOffer->id());
framework->send(message);
}
// Remove and cancel inverse offer removal timers. Canceling the Timers is
// only done to avoid having too many active Timers in libprocess.
if (inverseOfferTimers.contains(inverseOffer->id())) {
Clock::cancel(inverseOfferTimers[inverseOffer->id()]);
inverseOfferTimers.erase(inverseOffer->id());
}
// Delete it.
inverseOffers.erase(inverseOffer->id());
delete inverseOffer;
}
bool Master::isCompletedFramework(const FrameworkID& frameworkId)
{
return frameworks.completed.contains(frameworkId);
}
// TODO(bmahler): Consider killing this.
Framework* Master::getFramework(const FrameworkID& frameworkId) const
{
return frameworks.registered.contains(frameworkId)
? frameworks.registered.at(frameworkId)
: nullptr;
}
// TODO(bmahler): Consider killing this.
Offer* Master::getOffer(const OfferID& offerId) const
{
return offers.contains(offerId) ? offers.at(offerId) : nullptr;
}
// TODO(bmahler): Consider killing this.
InverseOffer* Master::getInverseOffer(const OfferID& inverseOfferId) const
{
return inverseOffers.contains(inverseOfferId)
? inverseOffers.at(inverseOfferId)
: nullptr;
}
// Create a new framework ID. We format the ID as MASTERID-FWID, where
// MASTERID is the ID of the master (randomly generated UUID) and FWID
// is an increasing integer.
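// For example: "<master UUID>-0042"; the counter is zero-padded to at
// least four digits by the `setw`/`setfill` below.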
FrameworkID Master::newFrameworkId()
{
std::ostringstream out;
out << info_.id() << "-" << std::setw(4)
<< std::setfill('0') << nextFrameworkId++;
FrameworkID frameworkId;
frameworkId.set_value(out.str());
return frameworkId;
}
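// Offer IDs are formatted as MASTERID-O<N>, where N is an increasing
// integer (not zero-padded).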
OfferID Master::newOfferId()
{
OfferID offerId;
offerId.set_value(info_.id() + "-O" + stringify(nextOfferId++));
return offerId;
}
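// Agent IDs are formatted as MASTERID-S<N>, analogous to offer IDs.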
SlaveID Master::newSlaveId()
{
SlaveID slaveId;
slaveId.set_value(info_.id() + "-S" + stringify(nextSlaveId++));
return slaveId;
}
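// The `_slaves_*`, `_frameworks_*`, `_tasks_*` and `_resources_*` functions
// below compute point-in-time counts and totals, presumably exposed as
// gauge callbacks in the master's metrics; they return `double` because
// that is the value type gauges use.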
double Master::_slaves_connected()
{
double count = 0.0;
foreachvalue (Slave* slave, slaves.registered) {
if (slave->connected) {
count++;
}
}
return count;
}
double Master::_slaves_disconnected()
{
double count = 0.0;
foreachvalue (Slave* slave, slaves.registered) {
if (!slave->connected) {
count++;
}
}
return count;
}
double Master::_slaves_active()
{
double count = 0.0;
foreachvalue (Slave* slave, slaves.registered) {
if (slave->active) {
count++;
}
}
return count;
}
double Master::_slaves_inactive()
{
double count = 0.0;
foreachvalue (Slave* slave, slaves.registered) {
if (!slave->active) {
count++;
}
}
return count;
}
double Master::_slaves_unreachable()
{
return static_cast<double>(slaves.unreachable.size());
}
double Master::_frameworks_connected()
{
double count = 0.0;
foreachvalue (Framework* framework, frameworks.registered) {
if (framework->connected()) {
count++;
}
}
return count;
}
double Master::_frameworks_disconnected()
{
double count = 0.0;
foreachvalue (Framework* framework, frameworks.registered) {
if (!framework->connected()) {
count++;
}
}
return count;
}
double Master::_frameworks_active()
{
double count = 0.0;
foreachvalue (Framework* framework, frameworks.registered) {
if (framework->active()) {
count++;
}
}
return count;
}
double Master::_frameworks_inactive()
{
double count = 0.0;
foreachvalue (Framework* framework, frameworks.registered) {
if (!framework->active()) {
count++;
}
}
return count;
}
double Master::_tasks_staging()
{
double count = 0.0;
// Add the tasks pending validation / authorization.
foreachvalue (Framework* framework, frameworks.registered) {
count += framework->pendingTasks.size();
}
foreachvalue (Slave* slave, slaves.registered) {
typedef hashmap<TaskID, Task*> TaskMap;
foreachvalue (const TaskMap& tasks, slave->tasks) {
foreachvalue (const Task* task, tasks) {
if (task->state() == TASK_STAGING) {
count++;
}
}
}
}
return count;
}
double Master::_tasks_starting()
{
double count = 0.0;
foreachvalue (Slave* slave, slaves.registered) {
typedef hashmap<TaskID, Task*> TaskMap;
foreachvalue (const TaskMap& tasks, slave->tasks) {
foreachvalue (const Task* task, tasks) {
if (task->state() == TASK_STARTING) {
count++;
}
}
}
}
return count;
}
double Master::_tasks_running()
{
double count = 0.0;
foreachvalue (Slave* slave, slaves.registered) {
typedef hashmap<TaskID, Task*> TaskMap;
foreachvalue (const TaskMap& tasks, slave->tasks) {
foreachvalue (const Task* task, tasks) {
if (task->state() == TASK_RUNNING) {
count++;
}
}
}
}
return count;
}
double Master::_tasks_unreachable()
{
double count = 0.0;
foreachvalue (Framework* framework, frameworks.registered) {
foreachvalue (const Owned<Task>& task, framework->unreachableTasks) {
if (task->state() == TASK_UNREACHABLE) {
count++;
}
}
}
return count;
}
double Master::_tasks_killing()
{
double count = 0.0;
foreachvalue (Slave* slave, slaves.registered) {
typedef hashmap<TaskID, Task*> TaskMap;
foreachvalue (const TaskMap& tasks, slave->tasks) {
foreachvalue (const Task* task, tasks) {
if (task->state() == TASK_KILLING) {
count++;
}
}
}
}
return count;
}
double Master::_resources_total(const string& name)
{
double total = 0.0;
foreachvalue (Slave* slave, slaves.registered) {
foreach (const Resource& resource, slave->info.resources()) {
if (resource.name() == name && resource.type() == Value::SCALAR) {
total += resource.scalar().value();
}
}
}
return total;
}
double Master::_resources_used(const string& name)
{
double used = 0.0;
foreachvalue (Slave* slave, slaves.registered) {
// We use `Resources` arithmetic to accumulate the resources since the
// `+=` operator de-duplicates the same shared resources across frameworks.
Resources slaveUsed;
foreachvalue (const Resources& resources, slave->usedResources) {
slaveUsed += resources.nonRevocable();
}
used +=
slaveUsed.get<Value::Scalar>(name).getOrElse(Value::Scalar()).value();
}
return used;
}
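// Despite the name, this returns a utilization ratio (used / total) rather
// than a value scaled to 100; the same holds for
// `_resources_revocable_percent()` below.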
double Master::_resources_percent(const string& name)
{
double total = _resources_total(name);
if (total == 0.0) {
return 0.0;
}
return _resources_used(name) / total;
}
double Master::_resources_revocable_total(const string& name)
{
double total = 0.0;
foreachvalue (Slave* slave, slaves.registered) {
foreach (const Resource& resource, slave->totalResources.revocable()) {
if (resource.name() == name && resource.type() == Value::SCALAR) {
total += resource.scalar().value();
}
}
}
return total;
}
double Master::_resources_revocable_used(const string& name)
{
double used = 0.0;
foreachvalue (Slave* slave, slaves.registered) {
// We use `Resources` arithmetic to accumulate the resources since the
// `+=` operator de-duplicates the same shared resources across frameworks.
Resources slaveUsed;
foreachvalue (const Resources& resources, slave->usedResources) {
slaveUsed += resources.revocable();
}
used +=
slaveUsed.get<Value::Scalar>(name).getOrElse(Value::Scalar()).value();
}
return used;
}
double Master::_resources_revocable_percent(const string& name)
{
double total = _resources_revocable_total(name);
if (total == 0.0) {
return 0.0;
}
return _resources_revocable_used(name) / total;
}
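// A framework's failover timeout is valid iff the floating point value can
// be represented as a stout `Duration`, i.e. `Duration::create()` does not
// reject it.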
static bool isValidFailoverTimeout(const FrameworkInfo& frameworkInfo)
{
return Duration::create(frameworkInfo.failover_timeout()).isSome();
}
void Master::Subscribers::send(
mesos::master::Event&& event,
const Option<FrameworkInfo>& frameworkInfo,
const Option<Task>& task)
{
VLOG(1) << "Notifying all active subscribers about " << event.type()
<< " event";
// Create a single copy of the event for all subscribers to share.
Shared<mesos::master::Event> sharedEvent(
new mesos::master::Event(std::move(event)));
// Create a single copy of `FrameworkInfo` and `Task` for all
// subscribers to share.
Shared<FrameworkInfo> sharedFrameworkInfo(
frameworkInfo.isSome()
? new FrameworkInfo(frameworkInfo.get()) : nullptr);
Shared<Task> sharedTask(task.isSome() ? new Task(task.get()) : nullptr);
foreachvalue (const Owned<Subscriber>& subscriber, subscribed) {
subscriber->getApprovers(
master->authorizer,
{VIEW_ROLE, VIEW_FRAMEWORK, VIEW_TASK, VIEW_EXECUTOR})
.then(defer(
master->self(),
[=](const Owned<ObjectApprovers>& approvers) {
subscriber->send(
sharedEvent,
approvers,
sharedFrameworkInfo,
sharedTask);
return Nothing();
}));
}
}
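// Creates the `ObjectApprovers` for this subscriber through its sequence
// so that the approver futures, and therefore the events gated on them in
// `Subscribers::send()`, are processed in the order the events were
// generated.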
Future<Owned<ObjectApprovers>> Master::Subscribers::Subscriber::getApprovers(
const Option<Authorizer*>& authorizer,
std::initializer_list<authorization::Action> actions)
{
Future<Owned<ObjectApprovers>> approvers =
ObjectApprovers::create(authorizer, principal, actions);
return approversSequence.add<Owned<ObjectApprovers>>(
[approvers] { return approvers; });
}
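// Sends the event over the subscriber's HTTP connection, filtered by the
// subscriber's authorization: task and framework events require the
// corresponding VIEW approvals, and the resource lists embedded in
// FRAMEWORK_ADDED, FRAMEWORK_UPDATED and AGENT_ADDED events are pruned to
// the resources whose roles the subscriber may view.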
void Master::Subscribers::Subscriber::send(
const Shared<mesos::master::Event>& event,
const Owned<ObjectApprovers>& approvers,
const Shared<FrameworkInfo>& frameworkInfo,
const Shared<Task>& task)
{
switch (event->type()) {
case mesos::master::Event::TASK_ADDED: {
CHECK_NOTNULL(frameworkInfo.get());
if (approvers->approved<VIEW_TASK>(
event->task_added().task(), *frameworkInfo) &&
approvers->approved<VIEW_FRAMEWORK>(*frameworkInfo)) {
http.send<mesos::master::Event, v1::master::Event>(*event);
}
break;
}
case mesos::master::Event::TASK_UPDATED: {
CHECK_NOTNULL(frameworkInfo.get());
CHECK_NOTNULL(task.get());
if (approvers->approved<VIEW_TASK>(*task, *frameworkInfo) &&
approvers->approved<VIEW_FRAMEWORK>(*frameworkInfo)) {
http.send<mesos::master::Event, v1::master::Event>(*event);
}
break;
}
case mesos::master::Event::FRAMEWORK_ADDED: {
if (approvers->approved<VIEW_FRAMEWORK>(
event->framework_added().framework().framework_info())) {
mesos::master::Event event_(*event);
event_.mutable_framework_added()->mutable_framework()->
mutable_allocated_resources()->Clear();
event_.mutable_framework_added()->mutable_framework()->
mutable_offered_resources()->Clear();
foreach (
const Resource& resource,
event->framework_added().framework().allocated_resources()) {
if (approvers->approved<VIEW_ROLE>(resource)) {
event_.mutable_framework_added()->mutable_framework()->
add_allocated_resources()->CopyFrom(resource);
}
}
foreach (
const Resource& resource,
event->framework_added().framework().offered_resources()) {
if (approvers->approved<VIEW_ROLE>(resource)) {
event_.mutable_framework_added()->mutable_framework()->
add_offered_resources()->CopyFrom(resource);
}
}
http.send<mesos::master::Event, v1::master::Event>(event_);
}
break;
}
case mesos::master::Event::FRAMEWORK_UPDATED: {
if (approvers->approved<VIEW_FRAMEWORK>(
event->framework_updated().framework().framework_info())) {
mesos::master::Event event_(*event);
event_.mutable_framework_updated()->mutable_framework()->
mutable_allocated_resources()->Clear();
event_.mutable_framework_updated()->mutable_framework()->
mutable_offered_resources()->Clear();
foreach (
const Resource& resource,
event->framework_updated().framework().allocated_resources()) {
if (approvers->approved<VIEW_ROLE>(resource)) {
event_.mutable_framework_updated()->mutable_framework()->
add_allocated_resources()->CopyFrom(resource);
}
}
foreach (
const Resource& resource,
event->framework_updated().framework().offered_resources()) {
if (approvers->approved<VIEW_ROLE>(resource)) {
event_.mutable_framework_updated()->mutable_framework()->
add_offered_resources()->CopyFrom(resource);
}
}
http.send<mesos::master::Event, v1::master::Event>(event_);
}
break;
}
case mesos::master::Event::FRAMEWORK_REMOVED: {
if (approvers->approved<VIEW_FRAMEWORK>(
event->framework_removed().framework_info())) {
http.send<mesos::master::Event, v1::master::Event>(*event);
}
break;
}
case mesos::master::Event::AGENT_ADDED: {
mesos::master::Event event_(*event);
event_.mutable_agent_added()->mutable_agent()->
mutable_total_resources()->Clear();
foreach (
const Resource& resource,
event->agent_added().agent().total_resources()) {
if (approvers->approved<VIEW_ROLE>(resource)) {
event_.mutable_agent_added()->mutable_agent()->add_total_resources()
->CopyFrom(resource);
}
}
http.send<mesos::master::Event, v1::master::Event>(event_);
break;
}
case mesos::master::Event::AGENT_REMOVED:
case mesos::master::Event::SUBSCRIBED:
case mesos::master::Event::HEARTBEAT:
case mesos::master::Event::UNKNOWN:
http.send<mesos::master::Event, v1::master::Event>(*event);
break;
}
}
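// Invoked when an event-stream subscriber's connection closes (wired up in
// `Master::subscribe()` below); removes the subscriber from the set of
// active subscribers.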
void Master::exited(const id::UUID& id)
{
if (!subscribers.subscribed.contains(id)) {
LOG(WARNING) << "Unknown subscriber " << id << " disconnected";
return;
}
LOG(INFO) << "Removed subscriber " << id
<< " from the list of active subscribers";
subscribers.subscribed.erase(id);
}
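// Registers a new operator API event-stream subscriber and arranges for
// `exited()` to be invoked once its connection closes.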
void Master::subscribe(
const HttpConnection& http,
const Option<Principal>& principal)
{
LOG(INFO) << "Added subscriber " << http.streamId
<< " to the list of active subscribers";
http.closed()
.onAny(defer(self(),
[this, http](const Future<Nothing>&) {
exited(http.streamId);
}));
subscribers.subscribed.put(
http.streamId,
Owned<Subscribers::Subscriber>(
new Subscribers::Subscriber{http, principal}));
}
Slave::Slave(
Master* const _master,
SlaveInfo _info,
const UPID& _pid,
const MachineID& _machineId,
const string& _version,
vector<SlaveInfo::Capability> _capabilities,
const Time& _registeredTime,
vector<Resource> _checkpointedResources,
const Option<UUID>& _resourceVersion,
vector<ExecutorInfo> executorInfos,
vector<Task> tasks)
: master(_master),
id(_info.id()),
info(std::move(_info)),
machineId(_machineId),
pid(_pid),
version(_version),
capabilities(std::move(_capabilities)),
registeredTime(_registeredTime),
connected(true),
active(true),
checkpointedResources(std::move(_checkpointedResources)),
resourceVersion(_resourceVersion),
observer(nullptr)
{
CHECK(info.has_id());
Try<Resources> resources = applyCheckpointedResources(
info.resources(),
checkpointedResources);
// NOTE: This should be validated during slave recovery.
CHECK_SOME(resources);
totalResources = resources.get();
foreach (ExecutorInfo& executorInfo, executorInfos) {
CHECK(executorInfo.has_framework_id());
addExecutor(executorInfo.framework_id(), std::move(executorInfo));
}
foreach (Task& task, tasks) {
addTask(new Task(std::move(task)));
}
}
Slave::~Slave()
{
if (reregistrationTimer.isSome()) {
process::Clock::cancel(reregistrationTimer.get());
}
}
Task* Slave::getTask(const FrameworkID& frameworkId, const TaskID& taskId) const
{
if (tasks.contains(frameworkId) && tasks.at(frameworkId).contains(taskId)) {
return tasks.at(frameworkId).at(taskId);
}
return nullptr;
}
void Slave::addTask(Task* task)
{
const TaskID& taskId = task->task_id();
const FrameworkID& frameworkId = task->framework_id();
CHECK(!tasks[frameworkId].contains(taskId))
<< "Duplicate task " << taskId << " of framework " << frameworkId;
// Verify that Resource.AllocationInfo is set,
// this should be guaranteed by the master.
foreach (const Resource& resource, task->resources()) {
CHECK(resource.has_allocation_info());
}
tasks[frameworkId][taskId] = task;
// Note that we explicitly convert from protobuf to `Resources` once here
// and reuse the result below to avoid the performance penalty of repeated
// conversion and validation. The conversion is safe because the resources
// have already passed validation.
const Resources resources = task->resources();
CHECK(task->state() != TASK_UNREACHABLE)
<< "Task '" << taskId << "' of framework " << frameworkId
<< " added in TASK_UNREACHABLE state";
if (!protobuf::isTerminalState(task->state())) {
usedResources[frameworkId] += resources;
}
// Note that we use `Resources` for output as it's faster than
// logging raw protobuf data.
LOG(INFO) << "Adding task " << taskId
<< " with resources " << resources
<< " on agent " << *this;
}
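// Releases the resources of a terminal or unreachable task from the
// agent's per-framework used-resource accounting; the task itself is
// removed separately via `removeTask()`.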
void Slave::recoverResources(Task* task)
{
const TaskID& taskId = task->task_id();
const FrameworkID& frameworkId = task->framework_id();
CHECK(protobuf::isTerminalState(task->state()) ||
task->state() == TASK_UNREACHABLE)
<< "Task '" << taskId << "' of framework " << frameworkId
<< " is in unexpected state " << task->state();
CHECK(tasks.at(frameworkId).contains(taskId))
<< "Unknown task " << taskId << " of framework " << frameworkId;
usedResources[frameworkId] -= task->resources();
if (usedResources[frameworkId].empty()) {
usedResources.erase(frameworkId);
}
}
void Slave::removeTask(Task* task)
{
const TaskID& taskId = task->task_id();
const FrameworkID& frameworkId = task->framework_id();
CHECK(tasks.at(frameworkId).contains(taskId))
<< "Unknown task " << taskId << " of framework " << frameworkId;
// The invariant here is that the master will have already called
// `recoverResources()` prior to removing terminal or unreachable tasks.
//
// TODO(bmahler): The unreachable case could be avoided if
// we updated `removeSlave` in the allocator to recover the
// resources (see MESOS-621) so that the master could just
// remove the unreachable agent from the allocator.
if (!protobuf::isTerminalState(task->state()) &&
task->state() != TASK_UNREACHABLE) {
// We cannot call `Slave::recoverResources()` here because
// it expects the task to be terminal or unreachable.
usedResources[frameworkId] -= task->resources();
if (usedResources[frameworkId].empty()) {
usedResources.erase(frameworkId);
}
}
tasks[frameworkId].erase(taskId);
if (tasks[frameworkId].empty()) {
tasks.erase(frameworkId);
}
killedTasks.remove(frameworkId, taskId);
}
void Slave::addOperation(Operation* operation)
{
Result<ResourceProviderID> resourceProviderId =
getResourceProviderId(operation->info());
CHECK(!resourceProviderId.isError()) << resourceProviderId.error();
if (resourceProviderId.isNone()) {
operations.put(operation->uuid(), operation);
} else {
CHECK(resourceProviders.contains(resourceProviderId.get()));
ResourceProvider& resourceProvider =
resourceProviders.at(resourceProviderId.get());
resourceProvider.operations.put(operation->uuid(), operation);
}
if (!protobuf::isSpeculativeOperation(operation->info()) &&
!protobuf::isTerminalState(operation->latest_status().state())) {
Try<Resources> consumed = protobuf::getConsumedResources(operation->info());
CHECK_SOME(consumed);
// Non-speculative operations are not yet supported through the operator
// API, so we can assume that the framework ID has been set.
CHECK(operation->has_framework_id());
usedResources[operation->framework_id()] += consumed.get();
}
}
void Slave::recoverResources(Operation* operation)
{
// TODO(jieyu): Currently, we do not keep track of used resources
// for operations that are created by the operator through the
// operator API endpoint.
if (!operation->has_framework_id()) {
return;
}
const FrameworkID& frameworkId = operation->framework_id();
if (protobuf::isSpeculativeOperation(operation->info())) {
return;
}
Try<Resources> consumed = protobuf::getConsumedResources(operation->info());
CHECK_SOME(consumed);
CHECK(usedResources[frameworkId].contains(consumed.get()))
<< "Unknown resources " << consumed.get()
<< " of framework " << frameworkId;
usedResources[frameworkId] -= consumed.get();
if (usedResources[frameworkId].empty()) {
usedResources.erase(frameworkId);
}
}
void Slave::removeOperation(Operation* operation)
{
const UUID& uuid = operation->uuid();
Result<ResourceProviderID> resourceProviderId =
getResourceProviderId(operation->info());
CHECK(!resourceProviderId.isError()) << resourceProviderId.error();
// Recover the resource used by this operation.
if (!protobuf::isSpeculativeOperation(operation->info()) &&
!protobuf::isTerminalState(operation->latest_status().state())) {
recoverResources(operation);
}
// Remove the operation.
if (resourceProviderId.isNone()) {
CHECK(operations.contains(uuid))
<< "Unknown operation (uuid: " << uuid << ")"
<< " to agent " << *this;
operations.erase(operation->uuid());
} else {
CHECK(resourceProviders.contains(resourceProviderId.get()))
<< "resource provider " << resourceProviderId.get() << " is unknown";
ResourceProvider& resourceProvider =
resourceProviders.at(resourceProviderId.get());
CHECK(resourceProvider.operations.contains(uuid))
<< "Unknown operation (uuid: " << uuid << ")"
<< " to resource provider " << resourceProviderId.get()
<< " on agent " << *this;
resourceProvider.operations.erase(operation->uuid());
}
}
Operation* Slave::getOperation(const UUID& uuid) const
{
if (operations.contains(uuid)) {
return operations.at(uuid);
}
foreachvalue (const ResourceProvider& resourceProvider, resourceProviders) {
if (resourceProvider.operations.contains(uuid)) {
return resourceProvider.operations.at(uuid);
}
}
return nullptr;
}
void Slave::addOffer(Offer* offer)
{
CHECK(!offers.contains(offer)) << "Duplicate offer " << offer->id();
offers.insert(offer);
offeredResources += offer->resources();
}
void Slave::removeOffer(Offer* offer)
{
CHECK(offers.contains(offer)) << "Unknown offer " << offer->id();
offeredResources -= offer->resources();
offers.erase(offer);
}
void Slave::addInverseOffer(InverseOffer* inverseOffer)
{
CHECK(!inverseOffers.contains(inverseOffer))
<< "Duplicate inverse offer " << inverseOffer->id();
inverseOffers.insert(inverseOffer);
}
void Slave::removeInverseOffer(InverseOffer* inverseOffer)
{
CHECK(inverseOffers.contains(inverseOffer))
<< "Unknown inverse offer " << inverseOffer->id();
inverseOffers.erase(inverseOffer);
}
bool Slave::hasExecutor(const FrameworkID& frameworkId,
const ExecutorID& executorId) const
{
return executors.contains(frameworkId) &&
executors.at(frameworkId).contains(executorId);
}
void Slave::addExecutor(const FrameworkID& frameworkId,
const ExecutorInfo& executorInfo)
{
CHECK(!hasExecutor(frameworkId, executorInfo.executor_id()))
<< "Duplicate executor '" << executorInfo.executor_id()
<< "' of framework " << frameworkId;
// Verify that Resource.AllocationInfo is set,
// this should be guaranteed by the master.
foreach (const Resource& resource, executorInfo.resources()) {
CHECK(resource.has_allocation_info());
}
executors[frameworkId][executorInfo.executor_id()] = executorInfo;
usedResources[frameworkId] += executorInfo.resources();
}
void Slave::removeExecutor(const FrameworkID& frameworkId,
const ExecutorID& executorId)
{
CHECK(hasExecutor(frameworkId, executorId))
<< "Unknown executor '" << executorId << "' of framework " << frameworkId;
usedResources[frameworkId] -=
executors[frameworkId][executorId].resources();
if (usedResources[frameworkId].empty()) {
usedResources.erase(frameworkId);
}
executors[frameworkId].erase(executorId);
if (executors[frameworkId].empty()) {
executors.erase(frameworkId);
}
}
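// Applies the given resource conversions to the agent's total resources,
// recomputes which of them need to be checkpointed, and keeps the
// explicitly tracked per-resource-provider totals in sync.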
void Slave::apply(const vector<ResourceConversion>& conversions)
{
Try<Resources> resources = totalResources.apply(conversions);
CHECK_SOME(resources);
totalResources = resources.get();
checkpointedResources = totalResources.filter(needCheckpointing);
// Also apply the conversion to the explicitly maintained resource
// provider resources.
foreach (const ResourceConversion& conversion, conversions) {
Result<ResourceProviderID> providerId =
getResourceProviderId(conversion.consumed);
if (providerId.isNone()) {
continue;
}
CHECK_SOME(providerId);
CHECK(resourceProviders.contains(providerId.get()));
ResourceProvider& provider = resourceProviders.at(providerId.get());
CHECK(provider.totalResources.contains(conversion.consumed));
provider.totalResources -= conversion.consumed;
provider.totalResources += conversion.converted;
}
}
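// Refreshes the agent's info, version, capabilities, checkpointed
// resources and resource version when it reregisters; fails if the
// checkpointed resources cannot be applied to the new `SlaveInfo`
// resources.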
Try<Nothing> Slave::update(
const SlaveInfo& _info,
const string& _version,
const vector<SlaveInfo::Capability>& _capabilities,
const Resources& _checkpointedResources,
const Option<UUID>& _resourceVersion)
{
Try<Resources> resources = applyCheckpointedResources(
_info.resources(),
_checkpointedResources);
// This should be validated during slave recovery.
if (resources.isError()) {
return Error(resources.error());
}
version = _version;
capabilities = _capabilities;
info = _info;
checkpointedResources = _checkpointedResources;
// There is a window here where `totalResources` can hold a stale value,
// but it should be short because the agent sends an `UpdateSlaveMessage`
// with its new total resources immediately after reregistering in this
// case.
totalResources = resources.get();
resourceVersion = _resourceVersion;
return Nothing();
}
} // namespace master {
} // namespace internal {
} // namespace mesos {