| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| #include <stdint.h> |
| |
| #include <algorithm> |
| #include <cctype> |
| #include <fstream> |
| #include <functional> |
| #include <iomanip> |
| #include <list> |
| #include <memory> |
| #include <set> |
| #include <sstream> |
| #include <tuple> |
| #include <utility> |
| |
| #include <mesos/module.hpp> |
| #include <mesos/roles.hpp> |
| |
| #include <mesos/authentication/authenticator.hpp> |
| |
| #include <mesos/authorizer/authorizer.hpp> |
| |
| #include <mesos/allocator/allocator.hpp> |
| #include <mesos/master/contender.hpp> |
| #include <mesos/master/detector.hpp> |
| |
| #include <mesos/module/authenticator.hpp> |
| |
| #include <mesos/scheduler/scheduler.hpp> |
| |
| #include <process/check.hpp> |
| #include <process/collect.hpp> |
| #include <process/defer.hpp> |
| #include <process/delay.hpp> |
| #include <process/http.hpp> |
| #include <process/id.hpp> |
| #include <process/limiter.hpp> |
| #include <process/owned.hpp> |
| #include <process/run.hpp> |
| #include <process/shared.hpp> |
| |
| #include <process/metrics/metrics.hpp> |
| |
| #include <stout/check.hpp> |
| #include <stout/duration.hpp> |
| #include <stout/error.hpp> |
| #include <stout/foreach.hpp> |
| #include <stout/ip.hpp> |
| #include <stout/lambda.hpp> |
| #include <stout/multihashmap.hpp> |
| #include <stout/net.hpp> |
| #include <stout/nothing.hpp> |
| #include <stout/numify.hpp> |
| #include <stout/option.hpp> |
| #include <stout/path.hpp> |
| #include <stout/stringify.hpp> |
| #include <stout/unreachable.hpp> |
| #include <stout/utils.hpp> |
| #include <stout/uuid.hpp> |
| |
| #include "authentication/cram_md5/authenticator.hpp" |
| |
| #include "common/build.hpp" |
| #include "common/http.hpp" |
| #include "common/protobuf_utils.hpp" |
| #include "common/resource_quantities.hpp" |
| #include "common/status_utils.hpp" |
| |
| #include "credentials/credentials.hpp" |
| |
| #include "hook/manager.hpp" |
| |
| #include "logging/flags.hpp" |
| #include "logging/logging.hpp" |
| |
| #include "master/flags.hpp" |
| #include "master/master.hpp" |
| #include "master/registry_operations.hpp" |
| #include "master/weights.hpp" |
| |
| #include "module/manager.hpp" |
| |
| #include "watcher/whitelist_watcher.hpp" |
| |
| using google::protobuf::RepeatedPtrField; |
| |
| using std::list; |
| using std::reference_wrapper; |
| using std::set; |
| using std::shared_ptr; |
| using std::string; |
| using std::tie; |
| using std::tuple; |
| using std::vector; |
| |
| using process::await; |
| using process::wait; // Necessary on some OS's to disambiguate. |
| using process::Clock; |
| using process::ExitedEvent; |
| using process::Failure; |
| using process::Future; |
| using process::MessageEvent; |
| using process::Owned; |
| using process::PID; |
| using process::Process; |
| using process::Promise; |
| using process::RateLimiter; |
| using process::Shared; |
| using process::Time; |
| using process::Timer; |
| using process::UPID; |
| |
| using process::http::Pipe; |
| |
| using process::http::authentication::Principal; |
| |
| using process::metrics::Counter; |
| |
| using google::protobuf::RepeatedPtrField; |
| |
| namespace mesos { |
| namespace internal { |
| namespace master { |
| |
| using mesos::allocator::Allocator; |
| |
| using mesos::authorization::createSubject; |
| using mesos::authorization::VIEW_ROLE; |
| using mesos::authorization::VIEW_FRAMEWORK; |
| using mesos::authorization::VIEW_TASK; |
| using mesos::authorization::VIEW_EXECUTOR; |
| |
| using mesos::master::contender::MasterContender; |
| |
| using mesos::master::detector::MasterDetector; |
| |
| using mesos::internal::ResourceQuantities; |
| |
| static bool isValidFailoverTimeout(const FrameworkInfo& frameworkInfo); |
| |
| |
| class SlaveObserver : public ProtobufProcess<SlaveObserver> |
| { |
| public: |
| SlaveObserver(const UPID& _slave, |
| const SlaveInfo& _slaveInfo, |
| const SlaveID& _slaveId, |
| const PID<Master>& _master, |
| const Option<shared_ptr<RateLimiter>>& _limiter, |
| const shared_ptr<Metrics>& _metrics, |
| const Duration& _slavePingTimeout, |
| const size_t _maxSlavePingTimeouts) |
| : ProcessBase(process::ID::generate("slave-observer")), |
| slave(_slave), |
| slaveInfo(_slaveInfo), |
| slaveId(_slaveId), |
| master(_master), |
| limiter(_limiter), |
| metrics(_metrics), |
| slavePingTimeout(_slavePingTimeout), |
| maxSlavePingTimeouts(_maxSlavePingTimeouts), |
| timeouts(0), |
| pinged(false), |
| connected(true) |
| { |
| install<PongSlaveMessage>(&SlaveObserver::pong); |
| } |
| |
| void reconnect() |
| { |
| connected = true; |
| } |
| |
| void disconnect() |
| { |
| connected = false; |
| } |
| |
| protected: |
| void initialize() override |
| { |
| ping(); |
| } |
| |
| void ping() |
| { |
| PingSlaveMessage message; |
| message.set_connected(connected); |
| send(slave, message); |
| |
| pinged = true; |
| delay(slavePingTimeout, self(), &SlaveObserver::timeout); |
| } |
| |
| void pong() |
| { |
| timeouts = 0; |
| pinged = false; |
| |
| // Cancel any pending unreachable transitions. |
| if (markingUnreachable.isSome()) { |
| // Need a copy for non-const access. |
| Future<Nothing> future = markingUnreachable.get(); |
| future.discard(); |
| } |
| } |
| |
| void timeout() |
| { |
| if (pinged) { |
| timeouts++; // No pong has been received before the timeout. |
| if (timeouts >= maxSlavePingTimeouts) { |
| // No pong has been received for the last |
| // 'maxSlavePingTimeouts' pings. |
| markUnreachable(); |
| } |
| } |
| |
| // NOTE: We keep pinging even if we schedule a transition to |
| // UNREACHABLE. This is because if the slave eventually responds |
| // to a ping, we can cancel the UNREACHABLE transition. |
| ping(); |
| } |
| |
| // Marking slaves unreachable is rate-limited and can be canceled if |
| // a pong is received before `_markUnreachable` is called. |
| // |
| // TODO(neilc): Using a rate-limit when marking slaves unreachable |
| // is only necessary for frameworks that are not PARTITION_AWARE. |
| // For such frameworks, we shutdown their tasks when an unreachable |
| // agent reregisters, so a rate-limit is a useful safety |
| // precaution. Once all frameworks are PARTITION_AWARE, we can |
| // likely remove the rate-limit (MESOS-5948). |
| void markUnreachable() |
| { |
| if (markingUnreachable.isSome()) { |
| return; // Unreachable transition is already in progress. |
| } |
| |
| Future<Nothing> acquire = Nothing(); |
| |
| if (limiter.isSome()) { |
| LOG(INFO) << "Scheduling transition of agent " << slaveId |
| << " to UNREACHABLE because of health check timeout"; |
| |
| acquire = limiter.get()->acquire(); |
| } |
| |
| markingUnreachable = acquire.onAny(defer(self(), &Self::_markUnreachable)); |
| ++metrics->slave_unreachable_scheduled; |
| } |
| |
| void _markUnreachable() |
| { |
| CHECK_SOME(markingUnreachable); |
| |
| const Future<Nothing>& future = markingUnreachable.get(); |
| |
| CHECK(!future.isFailed()); |
| |
| if (future.isReady()) { |
| ++metrics->slave_unreachable_completed; |
| |
| dispatch(master, |
| &Master::markUnreachable, |
| slaveInfo, |
| false, |
| "health check timed out"); |
| } else if (future.isDiscarded()) { |
| LOG(INFO) << "Canceling transition of agent " << slaveId |
| << " to UNREACHABLE because a pong was received!"; |
| |
| ++metrics->slave_unreachable_canceled; |
| } |
| |
| markingUnreachable = None(); |
| } |
| |
| private: |
| const UPID slave; |
| const SlaveInfo slaveInfo; |
| const SlaveID slaveId; |
| const PID<Master> master; |
| const Option<shared_ptr<RateLimiter>> limiter; |
| shared_ptr<Metrics> metrics; |
| Option<Future<Nothing>> markingUnreachable; |
| const Duration slavePingTimeout; |
| const size_t maxSlavePingTimeouts; |
| uint32_t timeouts; |
| bool pinged; |
| bool connected; |
| }; |
| |
| |
| Master::Master( |
| Allocator* _allocator, |
| Registrar* _registrar, |
| Files* _files, |
| MasterContender* _contender, |
| MasterDetector* _detector, |
| const Option<Authorizer*>& _authorizer, |
| const Option<shared_ptr<RateLimiter>>& _slaveRemovalLimiter, |
| const Flags& _flags) |
| : ProcessBase("master"), |
| flags(_flags), |
| http(this), |
| allocator(_allocator), |
| registrar(_registrar), |
| files(_files), |
| contender(_contender), |
| detector(_detector), |
| authorizer(_authorizer), |
| frameworks(flags), |
| subscribers(this), |
| authenticator(None()), |
| metrics(new Metrics(*this)), |
| electedTime(None()) |
| { |
| slaves.limiter = _slaveRemovalLimiter; |
| |
| // NOTE: We populate 'info_' here instead of inside 'initialize()' |
| // because 'StandaloneMasterDetector' needs access to the info. |
| |
| // Master ID is generated randomly based on UUID. |
| info_.set_id(id::UUID::random().toString()); |
| |
| // NOTE: Currently, we store ip in MasterInfo in network order, |
| // which should be fixed. See MESOS-1201 for details. |
| // TODO(marco): The ip, port, hostname fields above are |
| // being deprecated; the code should be removed once |
| // the deprecation cycle is complete. |
| info_.set_ip(self().address.ip.in()->s_addr); |
| |
| info_.set_port(self().address.port); |
| info_.set_pid(self()); |
| info_.set_version(MESOS_VERSION); |
| |
| for (const MasterInfo::Capability& capability : MASTER_CAPABILITIES()) { |
| info_.add_capabilities()->CopyFrom(capability); |
| } |
| |
| // Determine our hostname or use the hostname provided. |
| string hostname; |
| |
| if (flags.hostname.isNone()) { |
| if (flags.hostname_lookup) { |
| Try<string> result = net::getHostname(self().address.ip); |
| |
| if (result.isError()) { |
| LOG(FATAL) << "Failed to get hostname: " << result.error(); |
| } |
| |
| hostname = result.get(); |
| } else { |
| // We use the IP address for hostname if the user requested us |
| // NOT to look it up, and it wasn't explicitly set via --hostname: |
| hostname = stringify(self().address.ip); |
| } |
| } else { |
| hostname = flags.hostname.get(); |
| } |
| |
| info_.set_hostname(hostname); |
| |
| // This uses the new `Address` message in `MasterInfo`. |
| info_.mutable_address()->set_ip(stringify(self().address.ip)); |
| info_.mutable_address()->set_port(self().address.port); |
| info_.mutable_address()->set_hostname(hostname); |
| |
| if (flags.domain.isSome()) { |
| info_.mutable_domain()->CopyFrom(flags.domain.get()); |
| } |
| } |
| |
| |
| Master::~Master() {} |
| |
| |
| hashset<string> Master::misingMinimumCapabilities( |
| const MasterInfo& masterInfo, const Registry& registry) |
| { |
| if (registry.minimum_capabilities().size() == 0) { |
| return hashset<string>(); |
| } |
| |
| hashset<string> minimumCapabilities, masterCapabilities; |
| |
| foreach ( |
| const Registry::MinimumCapability& minimumCapability, |
| registry.minimum_capabilities()) { |
| minimumCapabilities.insert(minimumCapability.capability()); |
| } |
| |
| foreach ( |
| const MasterInfo::Capability& masterCapability, |
| masterInfo.capabilities()) { |
| masterCapabilities.insert( |
| MasterInfo::Capability::Type_Name(masterCapability.type())); |
| } |
| |
| return minimumCapabilities - masterCapabilities; |
| } |
| |
| |
| // TODO(vinod): Update this interface to return failed futures when |
| // capacity is reached. |
| struct BoundedRateLimiter |
| { |
| BoundedRateLimiter(double qps, Option<uint64_t> _capacity) |
| : limiter(new process::RateLimiter(qps)), |
| capacity(_capacity), |
| messages(0) {} |
| |
| process::Owned<process::RateLimiter> limiter; |
| const Option<uint64_t> capacity; |
| |
| // Number of outstanding messages for this RateLimiter. |
| // NOTE: ExitedEvents are throttled but not counted towards |
| // the capacity here. |
| uint64_t messages; |
| }; |
| |
| |
| void Master::initialize() |
| { |
| LOG(INFO) << "Master " << info_.id() << " (" << info_.hostname() << ")" |
| << " started on " << string(self()).substr(7); |
| |
| LOG(INFO) << "Flags at startup: " << flags; |
| |
| if (process::address().ip.isLoopback()) { |
| LOG(WARNING) << "\n**************************************************\n" |
| << "Master bound to loopback interface!" |
| << " Cannot communicate with remote schedulers or agents." |
| << " You might want to set '--ip' flag to a routable" |
| << " IP address.\n" |
| << "**************************************************"; |
| } |
| |
| // NOTE: We enforce a minimum slave reregister timeout because the |
| // slave bounds its (re-)registration retries based on the minimum. |
| if (flags.agent_reregister_timeout < MIN_AGENT_REREGISTER_TIMEOUT) { |
| EXIT(EXIT_FAILURE) |
| << "Invalid value '" << flags.agent_reregister_timeout << "'" |
| << " for --agent_reregister_timeout:" |
| << " Must be at least " << MIN_AGENT_REREGISTER_TIMEOUT; |
| } |
| |
| // Parse the percentage for the slave removal limit. |
| // TODO(bmahler): Add a 'Percentage' abstraction. |
| if (!strings::endsWith(flags.recovery_agent_removal_limit, "%")) { |
| EXIT(EXIT_FAILURE) |
| << "Invalid value '" << flags.recovery_agent_removal_limit << "'" |
| << " for --recovery_agent_removal_percent_limit: " << "missing '%'"; |
| } |
| |
| Try<double> limit = numify<double>( |
| strings::remove( |
| flags.recovery_agent_removal_limit, |
| "%", |
| strings::SUFFIX)); |
| |
| if (limit.isError()) { |
| EXIT(EXIT_FAILURE) |
| << "Invalid value '" << flags.recovery_agent_removal_limit << "'" |
| << " for --recovery_agent_removal_percent_limit: " << limit.error(); |
| } |
| |
| if (limit.get() < 0.0 || limit.get() > 100.0) { |
| EXIT(EXIT_FAILURE) |
| << "Invalid value '" << flags.recovery_agent_removal_limit << "'" |
| << " for --recovery_agent_removal_percent_limit:" |
| << " Must be within [0%-100%]"; |
| } |
| |
| // Log authentication state. |
| if (flags.authenticate_frameworks) { |
| LOG(INFO) << "Master only allowing authenticated frameworks to register"; |
| } else { |
| LOG(INFO) << "Master allowing unauthenticated frameworks to register"; |
| } |
| |
| if (flags.authenticate_agents) { |
| LOG(INFO) << "Master only allowing authenticated agents to register"; |
| } else { |
| LOG(INFO) << "Master allowing unauthenticated agents to register"; |
| } |
| |
| if (flags.authenticate_http_frameworks) { |
| LOG(INFO) << "Master only allowing authenticated HTTP frameworks to " |
| << "register"; |
| } else { |
| LOG(INFO) << "Master allowing HTTP frameworks to register without " |
| << "authentication"; |
| } |
| |
| // Load credentials. |
| Option<Credentials> credentials; |
| if (flags.credentials.isSome()) { |
| Result<Credentials> _credentials = |
| credentials::read(flags.credentials.get()); |
| if (_credentials.isError()) { |
| EXIT(EXIT_FAILURE) << _credentials.error() << " (see --credentials flag)"; |
| } else if (_credentials.isNone()) { |
| EXIT(EXIT_FAILURE) |
| << "Credentials file must contain at least one credential" |
| << " (see --credentials flag)"; |
| } |
| // Store credentials in master to use them in routes. |
| credentials = _credentials.get(); |
| } |
| |
| // Extract authenticator names and validate them. |
| authenticatorNames = strings::split(flags.authenticators, ","); |
| if (authenticatorNames.empty()) { |
| EXIT(EXIT_FAILURE) << "No authenticator specified"; |
| } |
| if (authenticatorNames.size() > 1) { |
| EXIT(EXIT_FAILURE) << "Multiple authenticators not supported"; |
| } |
| if (authenticatorNames[0] != DEFAULT_AUTHENTICATOR && |
| !modules::ModuleManager::contains<Authenticator>( |
| authenticatorNames[0])) { |
| EXIT(EXIT_FAILURE) |
| << "Authenticator '" << authenticatorNames[0] << "' not found." |
| << " Check the spelling (compare to '" << DEFAULT_AUTHENTICATOR << "')" |
| << " or verify that the authenticator was loaded successfully" |
| << " (see --modules)"; |
| } |
| |
| // TODO(tillt): Allow multiple authenticators to be loaded and enable |
| // the authenticatee to select the appropriate one. See MESOS-1939. |
| if (authenticatorNames[0] == DEFAULT_AUTHENTICATOR) { |
| LOG(INFO) << "Using default '" << DEFAULT_AUTHENTICATOR |
| << "' authenticator"; |
| |
| authenticator = new cram_md5::CRAMMD5Authenticator(); |
| } else { |
| Try<Authenticator*> module = |
| modules::ModuleManager::create<Authenticator>(authenticatorNames[0]); |
| if (module.isError()) { |
| EXIT(EXIT_FAILURE) |
| << "Could not create authenticator module '" |
| << authenticatorNames[0] << "': " << module.error(); |
| } |
| LOG(INFO) << "Using '" << authenticatorNames[0] << "' authenticator"; |
| authenticator = module.get(); |
| } |
| |
| // Give Authenticator access to credentials when needed. |
| CHECK_SOME(authenticator); |
| Try<Nothing> initialize = authenticator.get()->initialize(credentials); |
| if (initialize.isError()) { |
| const string error = |
| "Failed to initialize authenticator '" + authenticatorNames[0] + |
| "': " + initialize.error(); |
| if (flags.authenticate_frameworks || flags.authenticate_agents) { |
| EXIT(EXIT_FAILURE) |
| << "Failed to start master with authentication enabled: " << error; |
| } else { |
| // A failure to initialize the authenticator does lead to |
| // unusable authentication but still allows non authenticating |
| // frameworks and slaves to connect. |
| LOG(WARNING) << "Only non-authenticating frameworks and agents are " |
| << "allowed to connect. " |
| << "Authentication is disabled: " << error; |
| |
| delete authenticator.get(); |
| authenticator = None(); |
| } |
| } |
| |
| if (flags.authenticate_http_readonly) { |
| Try<Nothing> result = initializeHttpAuthenticators( |
| READONLY_HTTP_AUTHENTICATION_REALM, |
| strings::split(flags.http_authenticators, ","), |
| credentials); |
| |
| if (result.isError()) { |
| EXIT(EXIT_FAILURE) << result.error(); |
| } |
| } |
| |
| if (flags.authenticate_http_readwrite) { |
| Try<Nothing> result = initializeHttpAuthenticators( |
| READWRITE_HTTP_AUTHENTICATION_REALM, |
| strings::split(flags.http_authenticators, ","), |
| credentials); |
| |
| if (result.isError()) { |
| EXIT(EXIT_FAILURE) << result.error(); |
| } |
| } |
| |
| if (flags.authenticate_http_frameworks) { |
| // The `--http_framework_authenticators` flag should always be set when HTTP |
| // framework authentication is enabled. |
| if (flags.http_framework_authenticators.isNone()) { |
| EXIT(EXIT_FAILURE) |
| << "Missing `--http_framework_authenticators` flag. This must be used " |
| << "in conjunction with `--authenticate_http_frameworks`"; |
| } |
| |
| Try<Nothing> result = initializeHttpAuthenticators( |
| DEFAULT_HTTP_FRAMEWORK_AUTHENTICATION_REALM, |
| strings::split(flags.http_framework_authenticators.get(), ","), |
| credentials); |
| |
| if (result.isError()) { |
| EXIT(EXIT_FAILURE) << result.error(); |
| } |
| } |
| |
| if (authorizer.isSome()) { |
| LOG(INFO) << "Authorization enabled"; |
| } |
| |
| if (flags.rate_limits.isSome()) { |
| // Add framework rate limiters. |
| foreach (const RateLimit& limit_, flags.rate_limits->limits()) { |
| if (frameworks.limiters.contains(limit_.principal())) { |
| EXIT(EXIT_FAILURE) |
| << "Duplicate principal " << limit_.principal() |
| << " found in RateLimits configuration"; |
| } |
| |
| if (limit_.has_qps() && limit_.qps() <= 0) { |
| EXIT(EXIT_FAILURE) |
| << "Invalid qps: " << limit_.qps() |
| << ". It must be a positive number"; |
| } |
| |
| if (limit_.has_qps()) { |
| Option<uint64_t> capacity; |
| if (limit_.has_capacity()) { |
| capacity = limit_.capacity(); |
| } |
| frameworks.limiters.put( |
| limit_.principal(), |
| Owned<BoundedRateLimiter>( |
| new BoundedRateLimiter(limit_.qps(), capacity))); |
| } else { |
| frameworks.limiters.put(limit_.principal(), None()); |
| } |
| } |
| |
| if (flags.rate_limits->has_aggregate_default_qps() && |
| flags.rate_limits->aggregate_default_qps() <= 0) { |
| EXIT(EXIT_FAILURE) |
| << "Invalid aggregate_default_qps: " |
| << flags.rate_limits->aggregate_default_qps() |
| << ". It must be a positive number"; |
| } |
| |
| if (flags.rate_limits->has_aggregate_default_qps()) { |
| Option<uint64_t> capacity; |
| if (flags.rate_limits->has_aggregate_default_capacity()) { |
| capacity = flags.rate_limits->aggregate_default_capacity(); |
| } |
| frameworks.defaultLimiter = |
| Owned<BoundedRateLimiter>(new BoundedRateLimiter( |
| flags.rate_limits->aggregate_default_qps(), capacity)); |
| } |
| |
| LOG(INFO) << "Framework rate limiting enabled"; |
| } |
| |
| // If the rate limiter is injected for testing, |
| // the flag may not be set. |
| if (slaves.limiter.isSome() && flags.agent_removal_rate_limit.isSome()) { |
| LOG(INFO) << "Agent removal is rate limited to " |
| << flags.agent_removal_rate_limit.get(); |
| } |
| |
| // If "--roles" is set, configure the role whitelist. |
| // TODO(neilc): Remove support for explicit roles in ~Mesos 0.32. |
| if (flags.roles.isSome()) { |
| LOG(WARNING) << "The '--roles' flag is deprecated. This flag will be " |
| << "removed in the future. See the Mesos 0.27 upgrade " |
| << "notes for more information"; |
| |
| Try<vector<string>> roles = roles::parse(flags.roles.get()); |
| if (roles.isError()) { |
| EXIT(EXIT_FAILURE) << "Failed to parse roles: " << roles.error(); |
| } |
| |
| roleWhitelist = hashset<string>(); |
| foreach (const string& role, roles.get()) { |
| roleWhitelist->insert(role); |
| } |
| |
| if (roleWhitelist->size() < roles->size()) { |
| LOG(WARNING) << "Duplicate values in '--roles': " << flags.roles.get(); |
| } |
| |
| // The default role is always allowed. |
| roleWhitelist->insert("*"); |
| } |
| |
| // Add role weights. |
| if (flags.weights.isSome()) { |
| vector<string> tokens = strings::tokenize(flags.weights.get(), ","); |
| |
| foreach (const string& token, tokens) { |
| vector<string> pair = strings::tokenize(token, "="); |
| if (pair.size() != 2) { |
| EXIT(EXIT_FAILURE) |
| << "Invalid weight: '" << token << "'. --weights should" |
| << " be of the form 'role=weight,role=weight'"; |
| } else if (!isWhitelistedRole(pair[0])) { |
| EXIT(EXIT_FAILURE) |
| << "Invalid weight: '" << token << "'. " << pair[0] |
| << " is not a valid role"; |
| } |
| |
| double weight = atof(pair[1].c_str()); |
| if (weight <= 0) { |
| EXIT(EXIT_FAILURE) |
| << "Invalid weight: '" << token << "'. Weights must be positive"; |
| } |
| |
| weights[pair[0]] = weight; |
| } |
| } |
| |
| // Verify the timeout is greater than zero. |
| if (flags.offer_timeout.isSome() && |
| flags.offer_timeout.get() <= Duration::zero()) { |
| EXIT(EXIT_FAILURE) |
| << "Invalid value '" << flags.offer_timeout.get() << "'" |
| << " for --offer_timeout: Must be greater than zero"; |
| } |
| |
| // Parse min_allocatable_resources. |
| vector<ResourceQuantities> minAllocatableResources; |
| foreach ( |
| const string& token, |
| strings::tokenize(flags.min_allocatable_resources, "|")) { |
| Try<ResourceQuantities> resourceQuantities = |
| ResourceQuantities::fromString(token); |
| |
| if (resourceQuantities.isError()) { |
| EXIT(EXIT_FAILURE) << "Error parsing min_allocatable_resources '" |
| << flags.min_allocatable_resources |
| << "': " << resourceQuantities.error(); |
| } |
| |
| // We check the configuration against first-class resources and warn |
| // against possible mis-configuration (e.g. typo). |
| set<string> firstClassResources = {"cpus", "mem", "disk", "ports", "gpus"}; |
| for (auto it = resourceQuantities->begin(); it != resourceQuantities->end(); |
| ++it) { |
| if (firstClassResources.count(it->first) == 0) { |
| LOG(WARNING) << "Non-first-class resource '" << it->first |
| << "' is configured as part of min_allocatable_resources"; |
| } |
| } |
| |
| minAllocatableResources.push_back(resourceQuantities.get()); |
| } |
| |
| // Initialize the allocator. |
| allocator->initialize( |
| flags.allocation_interval, |
| defer(self(), &Master::offer, lambda::_1, lambda::_2), |
| defer(self(), &Master::inverseOffer, lambda::_1, lambda::_2), |
| flags.fair_sharing_excluded_resource_names, |
| flags.filter_gpu_resources, |
| flags.domain, |
| minAllocatableResources, |
| flags.max_completed_frameworks); |
| |
| // Parse the whitelist. Passing Allocator::updateWhitelist() |
| // callback is safe because we shut down the whitelistWatcher in |
| // Master::finalize(), while allocator lifetime is greater than |
| // masters. Therefore there is no risk of calling into an allocator |
| // that has been cleaned up. |
| whitelistWatcher = new WhitelistWatcher( |
| flags.whitelist, |
| WHITELIST_WATCH_INTERVAL, |
| [this](const Option<hashset<string>>& whitelist) { |
| return allocator->updateWhitelist(whitelist); |
| }); |
| spawn(whitelistWatcher); |
| |
| nextFrameworkId = 0; |
| nextSlaveId = 0; |
| nextOfferId = 0; |
| |
| startTime = Clock::now(); |
| |
| install<scheduler::Call>(&Master::receive); |
| |
| // Install handler functions for certain messages. |
| install<SubmitSchedulerRequest>( |
| &Master::submitScheduler, |
| &SubmitSchedulerRequest::name); |
| |
| install<RegisterFrameworkMessage>( |
| &Master::registerFramework); |
| |
| install<ReregisterFrameworkMessage>( |
| &Master::reregisterFramework); |
| |
| install<UnregisterFrameworkMessage>( |
| &Master::unregisterFramework, |
| &UnregisterFrameworkMessage::framework_id); |
| |
| install<DeactivateFrameworkMessage>( |
| &Master::deactivateFramework, |
| &DeactivateFrameworkMessage::framework_id); |
| |
| install<ResourceRequestMessage>( |
| &Master::resourceRequest, |
| &ResourceRequestMessage::framework_id, |
| &ResourceRequestMessage::requests); |
| |
| install<LaunchTasksMessage>( |
| &Master::launchTasks); |
| |
| install<ReviveOffersMessage>( |
| &Master::reviveOffers, |
| &ReviveOffersMessage::framework_id, |
| &ReviveOffersMessage::roles); |
| |
| install<KillTaskMessage>( |
| &Master::killTask, |
| &KillTaskMessage::framework_id, |
| &KillTaskMessage::task_id); |
| |
| install<StatusUpdateAcknowledgementMessage>( |
| &Master::statusUpdateAcknowledgement); |
| |
| install<FrameworkToExecutorMessage>( |
| &Master::schedulerMessage); |
| |
| install<RegisterSlaveMessage>( |
| &Master::registerSlave); |
| |
| install<ReregisterSlaveMessage>( |
| &Master::reregisterSlave); |
| |
| install<UnregisterSlaveMessage>( |
| &Master::unregisterSlave, |
| &UnregisterSlaveMessage::slave_id); |
| |
| install<StatusUpdateMessage>( |
| &Master::statusUpdate); |
| |
| // Added in 0.24.0 to support HTTP schedulers. Since |
| // these do not have a pid, the slave must forward |
| // messages through the master. |
| install<ExecutorToFrameworkMessage>( |
| &Master::executorMessage); |
| |
| install<ReconcileTasksMessage>( |
| &Master::reconcileTasks); |
| |
| install<UpdateOperationStatusMessage>( |
| &Master::updateOperationStatus); |
| |
| install<ExitedExecutorMessage>( |
| &Master::exitedExecutor, |
| &ExitedExecutorMessage::slave_id, |
| &ExitedExecutorMessage::framework_id, |
| &ExitedExecutorMessage::executor_id, |
| &ExitedExecutorMessage::status); |
| |
| install<UpdateSlaveMessage>(&Master::updateSlave); |
| |
| install<AuthenticateMessage>( |
| &Master::authenticate, |
| &AuthenticateMessage::pid); |
| |
| // Setup HTTP routes. |
| route("/api/v1", |
| // TODO(benh): Is this authentication realm sufficient or do |
| // we need some kind of hybrid if we expect both schedulers |
| // and operators/tooling to use this endpoint? |
| READWRITE_HTTP_AUTHENTICATION_REALM, |
| Http::API_HELP(), |
| [this](const process::http::Request& request, |
| const Option<Principal>& principal) { |
| logRequest(request); |
| return http.api(request, principal); |
| }); |
| route("/api/v1/scheduler", |
| DEFAULT_HTTP_FRAMEWORK_AUTHENTICATION_REALM, |
| Http::SCHEDULER_HELP(), |
| [this](const process::http::Request& request, |
| const Option<Principal>& principal) { |
| logRequest(request); |
| return http.scheduler(request, principal); |
| }); |
| route("/create-volumes", |
| READWRITE_HTTP_AUTHENTICATION_REALM, |
| Http::CREATE_VOLUMES_HELP(), |
| [this](const process::http::Request& request, |
| const Option<Principal>& principal) { |
| logRequest(request); |
| return http.createVolumes(request, principal); |
| }); |
| route("/destroy-volumes", |
| READWRITE_HTTP_AUTHENTICATION_REALM, |
| Http::DESTROY_VOLUMES_HELP(), |
| [this](const process::http::Request& request, |
| const Option<Principal>& principal) { |
| logRequest(request); |
| return http.destroyVolumes(request, principal); |
| }); |
| route("/frameworks", |
| READONLY_HTTP_AUTHENTICATION_REALM, |
| Http::FRAMEWORKS_HELP(), |
| [this](const process::http::Request& request, |
| const Option<Principal>& principal) { |
| logRequest(request); |
| return http.frameworks(request, principal); |
| }); |
| route("/flags", |
| READONLY_HTTP_AUTHENTICATION_REALM, |
| Http::FLAGS_HELP(), |
| [this](const process::http::Request& request, |
| const Option<Principal>& principal) { |
| logRequest(request); |
| return http.flags(request, principal); |
| }); |
| route("/health", |
| Http::HEALTH_HELP(), |
| [this](const process::http::Request& request) { |
| return http.health(request); |
| }); |
| route("/redirect", |
| Http::REDIRECT_HELP(), |
| [this](const process::http::Request& request) { |
| return http.redirect(request); |
| }); |
| route("/reserve", |
| READWRITE_HTTP_AUTHENTICATION_REALM, |
| Http::RESERVE_HELP(), |
| [this](const process::http::Request& request, |
| const Option<Principal>& principal) { |
| logRequest(request); |
| return http.reserve(request, principal); |
| }); |
| // TODO(ijimenez): Remove this endpoint at the end of the |
| // deprecation cycle on 0.26. |
| route("/roles.json", |
| READONLY_HTTP_AUTHENTICATION_REALM, |
| Http::ROLES_HELP(), |
| [this](const process::http::Request& request, |
| const Option<Principal>& principal) { |
| logRequest(request); |
| return http.roles(request, principal); |
| }); |
| route("/roles", |
| READONLY_HTTP_AUTHENTICATION_REALM, |
| Http::ROLES_HELP(), |
| [this](const process::http::Request& request, |
| const Option<Principal>& principal) { |
| logRequest(request); |
| return http.roles(request, principal); |
| }); |
| route("/teardown", |
| READWRITE_HTTP_AUTHENTICATION_REALM, |
| Http::TEARDOWN_HELP(), |
| [this](const process::http::Request& request, |
| const Option<Principal>& principal) { |
| logRequest(request); |
| return http.teardown(request, principal); |
| }); |
| route("/slaves", |
| READONLY_HTTP_AUTHENTICATION_REALM, |
| Http::SLAVES_HELP(), |
| [this](const process::http::Request& request, |
| const Option<Principal>& principal) { |
| logRequest(request); |
| return http.slaves(request, principal); |
| }); |
| // TODO(ijimenez): Remove this endpoint at the end of the |
| // deprecation cycle on 0.26. |
| route("/state.json", |
| READONLY_HTTP_AUTHENTICATION_REALM, |
| Http::STATE_HELP(), |
| [this](const process::http::Request& request, |
| const Option<Principal>& principal) { |
| logRequest(request); |
| return http.state(request, principal); |
| }); |
| route("/state", |
| READONLY_HTTP_AUTHENTICATION_REALM, |
| Http::STATE_HELP(), |
| [this](const process::http::Request& request, |
| const Option<Principal>& principal) { |
| logRequest(request); |
| return http.state(request, principal); |
| }); |
| route("/state-summary", |
| READONLY_HTTP_AUTHENTICATION_REALM, |
| Http::STATESUMMARY_HELP(), |
| [this](const process::http::Request& request, |
| const Option<Principal>& principal) { |
| logRequest(request); |
| return http.stateSummary(request, principal); |
| }); |
| // TODO(ijimenez): Remove this endpoint at the end of the |
| // deprecation cycle. |
| route("/tasks.json", |
| READONLY_HTTP_AUTHENTICATION_REALM, |
| Http::TASKS_HELP(), |
| [this](const process::http::Request& request, |
| const Option<Principal>& principal) { |
| logRequest(request); |
| return http.tasks(request, principal); |
| }); |
| route("/tasks", |
| READONLY_HTTP_AUTHENTICATION_REALM, |
| Http::TASKS_HELP(), |
| [this](const process::http::Request& request, |
| const Option<Principal>& principal) { |
| logRequest(request); |
| return http.tasks(request, principal); |
| }); |
| route("/maintenance/schedule", |
| READWRITE_HTTP_AUTHENTICATION_REALM, |
| Http::MAINTENANCE_SCHEDULE_HELP(), |
| [this](const process::http::Request& request, |
| const Option<Principal>& principal) { |
| logRequest(request); |
| return http.maintenanceSchedule(request, principal); |
| }); |
| route("/maintenance/status", |
| READONLY_HTTP_AUTHENTICATION_REALM, |
| Http::MAINTENANCE_STATUS_HELP(), |
| [this](const process::http::Request& request, |
| const Option<Principal>& principal) { |
| logRequest(request); |
| return http.maintenanceStatus(request, principal); |
| }); |
| route("/machine/down", |
| READWRITE_HTTP_AUTHENTICATION_REALM, |
| Http::MACHINE_DOWN_HELP(), |
| [this](const process::http::Request& request, |
| const Option<Principal>& principal) { |
| logRequest(request); |
| return http.machineDown(request, principal); |
| }); |
| route("/machine/up", |
| READWRITE_HTTP_AUTHENTICATION_REALM, |
| Http::MACHINE_UP_HELP(), |
| [this](const process::http::Request& request, |
| const Option<Principal>& principal) { |
| logRequest(request); |
| return http.machineUp(request, principal); |
| }); |
| route("/unreserve", |
| READWRITE_HTTP_AUTHENTICATION_REALM, |
| Http::UNRESERVE_HELP(), |
| [this](const process::http::Request& request, |
| const Option<Principal>& principal) { |
| logRequest(request); |
| return http.unreserve(request, principal); |
| }); |
| route("/quota", |
| READWRITE_HTTP_AUTHENTICATION_REALM, |
| Http::QUOTA_HELP(), |
| [this](const process::http::Request& request, |
| const Option<Principal>& principal) { |
| logRequest(request); |
| return http.quota(request, principal); |
| }); |
| route("/weights", |
| READWRITE_HTTP_AUTHENTICATION_REALM, |
| Http::WEIGHTS_HELP(), |
| [this](const process::http::Request& request, |
| const Option<Principal>& principal) { |
| logRequest(request); |
| return http.weights(request, principal); |
| }); |
| |
| // Provide HTTP assets from a "webui" directory. This is either |
| // specified via flags (which is necessary for running out of the |
| // build directory before 'make install') or determined at build |
| // time via the preprocessor macro '-DMESOS_WEBUI_DIR' set in the |
| // Makefile. |
| provide("", path::join(flags.webui_dir, "index.html")); |
| provide("app", path::join(flags.webui_dir, "app")); |
| provide("assets", path::join(flags.webui_dir, "assets")); |
| |
| const PID<Master> masterPid = self(); |
| |
| auto authorize = [masterPid](const Option<Principal>& principal) { |
| return dispatch(masterPid, &Master::authorizeLogAccess, principal); |
| }; |
| |
| // Expose the log file for the webui. Fall back to 'log_dir' if |
| // an explicit file was not specified. |
| if (flags.external_log_file.isSome()) { |
| files->attach(flags.external_log_file.get(), "/master/log", authorize) |
| .onAny(defer(self(), |
| &Self::fileAttached, |
| lambda::_1, |
| flags.external_log_file.get())); |
| } else if (flags.log_dir.isSome()) { |
| Try<string> log = logging::getLogFile( |
| logging::getLogSeverity(flags.logging_level)); |
| |
| if (log.isError()) { |
| LOG(ERROR) << "Master log file cannot be found: " << log.error(); |
| } else { |
| files->attach(log.get(), "/master/log", authorize) |
| .onAny(defer(self(), &Self::fileAttached, lambda::_1, log.get())); |
| } |
| } |
| |
| contender->initialize(info_); |
| |
| // Start contending to be a leading master and detecting the current |
| // leader. |
| contender->contend() |
| .onAny(defer(self(), &Master::contended, lambda::_1)); |
| detector->detect() |
| .onAny(defer(self(), &Master::detected, lambda::_1)); |
| } |
| |
| |
| // Releases the state owned by the master when the process terminates: |
| // registered agents and frameworks, pending authentications, roles, |
| // outstanding timers, the whitelist watcher and the authenticator. |
| void Master::finalize() |
| { |
|   LOG(INFO) << "Master terminating"; |
| |
|   // NOTE: Removing the agents and frameworks from the allocator does |
|   // not prevent offers that were already dispatched from reaching |
|   // this master. In tests, a new master started with the same PID |
|   // might end up processing offers from the old master's allocator. |
|   // TODO(vinod): Fix the above race by changing the allocator |
|   // interface to return a stream of offer events. |
| |
|   // Tear down every registered agent. |
|   foreachvalue (Slave* agent, slaves.registered) { |
|     // Detach the agent from the allocator first, so that resources |
|     // recovered by the removals below are not reoffered. |
|     allocator->removeSlave(agent->id); |
| |
|     // Remove tasks, iterating over copies of the containers. |
|     foreachkey (const FrameworkID& taskFrameworkId, |
|                 utils::copy(agent->tasks)) { |
|       foreachvalue (Task* task, |
|                     utils::copy(agent->tasks[taskFrameworkId])) { |
|         removeTask(task); |
|       } |
|     } |
| |
|     // Remove executors, again iterating over copies. |
|     foreachkey (const FrameworkID& executorFrameworkId, |
|                 utils::copy(agent->executors)) { |
|       foreachkey (const ExecutorID& executorId, |
|                   utils::copy(agent->executors[executorFrameworkId])) { |
|         removeExecutor(agent, executorFrameworkId, executorId); |
|       } |
|     } |
| |
|     // Remove outstanding offers. |
|     foreach (Offer* outstandingOffer, utils::copy(agent->offers)) { |
|       removeOffer(outstandingOffer); |
|     } |
| |
|     // Remove outstanding inverse offers. The allocator needs no update |
|     // because the agent has already been removed from it. |
|     foreach (InverseOffer* outstandingInverseOffer, |
|              utils::copy(agent->inverseOffers)) { |
|       removeInverseOffer(outstandingInverseOffer); |
|     } |
| |
|     // Drop the agent's pending tasks without bothering to recover |
|     // their resources in the allocator. |
|     agent->pendingTasks.clear(); |
| |
|     // Terminate the agent's observer and wait for it before freeing it. |
|     terminate(agent->observer); |
|     wait(agent->observer); |
| |
|     delete agent->observer; |
|     delete agent; |
|   } |
|   slaves.registered.clear(); |
| |
|   // Tear down every registered framework. The framework pointers held |
|   // by the roles are intentionally not removed: that bookkeeping is |
|   // unnecessary at this point since we are shutting down. |
|   foreachvalue (Framework* registeredFramework, frameworks.registered) { |
|     allocator->removeFramework(registeredFramework->id()); |
| |
|     // Drop the framework's pending tasks without bothering to recover |
|     // their resources in the allocator. |
|     registeredFramework->pendingTasks.clear(); |
| |
|     // With all agents removed above, the framework must not have any |
|     // tasks, executors or (inverse) offers left. |
|     CHECK(registeredFramework->tasks.empty()); |
|     CHECK(registeredFramework->executors.empty()); |
|     CHECK(registeredFramework->offers.empty()); |
|     CHECK(registeredFramework->inverseOffers.empty()); |
| |
|     delete registeredFramework; |
|   } |
|   frameworks.registered.clear(); |
| |
|   CHECK(offers.empty()); |
|   CHECK(inverseOffers.empty()); |
| |
|   foreachvalue (Future<Option<string>> pendingAuthentication, authenticating) { |
|     // NOTE: Discarding is necessary during tests because a copy of |
|     // this future is used to set up the authentication timeout. If a |
|     // test doesn't discard this future, the timeout might fire in a |
|     // different test and any associated callbacks (e.g., |
|     // '_authenticate()') would be called, because the master pid |
|     // doesn't change across the tests. |
|     // TODO(vinod): This seems to be a bug in libprocess or the |
|     // testing infrastructure. |
|     pendingAuthentication.discard(); |
|   } |
| |
|   foreachvalue (Role* role, roles) { |
|     delete role; |
|   } |
|   roles.clear(); |
| |
|   // NOTE: Cancelling the timers is necessary during tests because we |
|   // don't want a timer to fire in a different test and invoke its |
|   // callback, which would happen because the master pid doesn't |
|   // change across the tests. |
|   // TODO(vinod): This seems to be a bug in libprocess or the |
|   // testing infrastructure. |
|   if (slaves.recoveredTimer.isSome()) { |
|     Clock::cancel(slaves.recoveredTimer.get()); |
|   } |
| |
|   if (registryGcTimer.isSome()) { |
|     Clock::cancel(registryGcTimer.get()); |
|   } |
| |
|   // Stop the whitelist watcher and wait for it before freeing it. |
|   terminate(whitelistWatcher); |
|   wait(whitelistWatcher); |
|   delete whitelistWatcher; |
| |
|   if (authenticator.isSome()) { |
|     delete authenticator.get(); |
|   } |
| } |
| |
| |
| // Handles the exit notification for a framework's HTTP connection. |
| // The framework is only treated as disconnected when it is still |
| // attached to the connection (`http`) that went away. |
| void Master::exited(const FrameworkID& frameworkId, const HttpConnection& http) |
| { |
|   foreachvalue (Framework* candidate, frameworks.registered) { |
|     const bool sameConnection = |
|       candidate->http.isSome() && candidate->http->writer == http.writer; |
| |
|     if (sameConnection) { |
|       CHECK_EQ(frameworkId, candidate->id()); |
|       _exited(candidate); |
|       return; |
|     } |
| |
|     // A framework with a matching id whose writer did not match above |
|     // has reconnected in the meantime, so there is nothing to do. |
|     if (frameworkId == candidate->id()) { |
|       LOG(INFO) << "Ignoring disconnection for framework " |
|                 << *candidate << " as it has already reconnected"; |
| |
|       return; |
|     } |
|   } |
| } |
| |
| |
| // Handles the exit notification for `pid`, which may belong to either |
| // a registered framework or a registered agent. Frameworks are |
| // checked first. |
| void Master::exited(const UPID& pid) |
| { |
|   foreachvalue (Framework* exitedFramework, frameworks.registered) { |
|     if (exitedFramework->pid == pid) { |
|       // See comments in `receive()` on why we send an error message |
|       // to the framework upon detecting a disconnection. |
|       FrameworkErrorMessage error; |
|       error.set_message("Framework disconnected"); |
|       exitedFramework->send(error); |
| |
|       _exited(exitedFramework); |
|       return; |
|     } |
|   } |
| |
|   Slave* slave = slaves.registered.get(pid); |
|   if (slave == nullptr) { |
|     return; |
|   } |
| |
|   LOG(INFO) << "Agent " << *slave << " disconnected"; |
| |
|   if (!slave->connected) { |
|     // NOTE: A duplicate exited() event is possible for an agent |
|     // because its PID doesn't change on restart. See MESOS-675 |
|     // for details. |
|     LOG(WARNING) << "Ignoring duplicate exited() notification for " |
|                  << "agent " << *slave; |
|     return; |
|   } |
| |
|   disconnect(slave); |
| |
|   // When a registered agent gets disconnected, each framework running |
|   // on that agent is handled as follows: |
|   // |
|   // 1) Checkpointing framework: no immediate action is taken. The |
|   //    agent is given a chance to reconnect until the slave observer |
|   //    times out (75s) and removes the agent. |
|   // |
|   // 2) Non-checkpointing framework: the agent is not removed, but the |
|   //    framework is removed from the agent's structs, its tasks are |
|   //    transitioned to LOST and its resources are recovered. |
|   hashset<FrameworkID> frameworkIds = |
|     slave->tasks.keys() | slave->executors.keys(); |
| |
|   foreach (const FrameworkID& frameworkId, frameworkIds) { |
|     Framework* framework = getFramework(frameworkId); |
|     CHECK_NOTNULL(framework); |
| |
|     if (!framework->info.checkpoint()) { |
|       LOG(INFO) << "Removing framework " << *framework |
|                 << " from disconnected agent " << *slave |
|                 << " because the framework is not checkpointing"; |
| |
|       removeFramework(slave, framework); |
|     } |
|   } |
| |
|   // If the master -> agent socket breaks, we expect that either |
|   // (a) the agent will fail to respond to pings and be marked |
|   // unreachable, or (b) the agent will receive a ping, notice the |
|   // master thinks it is disconnected, and then reregister. There is a |
|   // third possibility: if the agent restarts but hangs during agent |
|   // recovery, it will respond to pings but never attempt to |
|   // reregister (MESOS-6286). |
|   // |
|   // To handle this case, an agent whose socket has broken is expected |
|   // to reregister within `agent_reregister_timeout`. If it doesn't, |
|   // it is marked unreachable. |
|   slave->reregistrationTimer = |
|     delay(flags.agent_reregister_timeout, |
|           self(), |
|           &Master::agentReregisterTimeout, |
|           slave->id); |
| } |
| |
| |
| // Invoked once a disconnected agent has had |
| // `flags.agent_reregister_timeout` to reregister. If the agent is |
| // still registered and still disconnected, its removal is scheduled |
| // behind the agent removal rate limiter (when one is configured). |
| void Master::agentReregisterTimeout(const SlaveID& slaveId) |
| { |
|   Slave* agent = slaves.registered.get(slaveId); |
| |
|   // Nothing to do if the agent was removed or has reregistered |
|   // concurrently with the timeout expiring. |
|   if (agent == nullptr || agent->connected) { |
|     return; |
|   } |
| |
|   // Remove the agent in a rate limited manner, similar to how the |
|   // SlaveObserver removes agents. Without a limiter the continuation |
|   // below is attached to an already satisfied future. |
|   Future<Nothing> permit = Nothing(); |
| |
|   if (slaves.limiter.isSome()) { |
|     LOG(INFO) << "Scheduling removal of agent " |
|               << *agent |
|               << "; did not reregister within " |
|               << flags.agent_reregister_timeout << " after disconnecting"; |
| |
|     permit = slaves.limiter.get()->acquire(); |
|   } |
| |
|   permit |
|     .then(defer(self(), &Self::_agentReregisterTimeout, slaveId)); |
| |
|   ++metrics->slave_unreachable_scheduled; |
| } |
| |
| |
| // Continuation of `agentReregisterTimeout`: marks the agent |
| // unreachable unless it was removed or reregistered in the meantime. |
| // Updates the corresponding `slave_unreachable_*` metric either way. |
| Nothing Master::_agentReregisterTimeout(const SlaveID& slaveId) |
| { |
|   Slave* agent = slaves.registered.get(slaveId); |
| |
|   // While we were waiting to acquire the rate limit, the agent might |
|   // have been removed or might have reregistered. |
|   if (agent == nullptr || agent->connected) { |
|     ++metrics->slave_unreachable_canceled; |
|     return Nothing(); |
|   } |
| |
|   ++metrics->slave_unreachable_completed; |
| |
|   const string reason = |
|     "agent did not reregister within " + |
|     stringify(flags.agent_reregister_timeout) + |
|     " after disconnecting"; |
| |
|   markUnreachable(agent->info, false, reason); |
| |
|   return Nothing(); |
| } |
| |
| |
| // Common handling for a framework that has exited: disconnects it (if |
| // still connected) and schedules `frameworkFailoverTimeout` after the |
| // framework's failover timeout. |
| void Master::_exited(Framework* framework) |
| { |
|   LOG(INFO) << "Framework " << *framework << " disconnected"; |
| |
|   // Only a framework that is still connected needs disconnecting. |
|   if (framework->connected()) { |
|     disconnect(framework); |
|   } |
| |
|   // The framework's failover_timeout has been validated in framework |
|   // subscription, so it can be assumed to be valid here. |
|   Try<Duration> parsedTimeout = |
|     Duration::create(framework->info.failover_timeout()); |
| |
|   CHECK_SOME(parsedTimeout); |
|   Duration failoverTimeout = parsedTimeout.get(); |
| |
|   LOG(INFO) << "Giving framework " << *framework << " " |
|             << failoverTimeout << " to failover"; |
| |
|   // Dispatch a message to ourselves once the timeout has elapsed. |
|   delay(failoverTimeout, |
|         self(), |
|         &Master::frameworkFailoverTimeout, |
|         framework->id(), |
|         framework->reregisteredTime); |
| } |
| |
| |
| // Decides whether `principal` may access the master log. Returns |
| // `true` when no authorizer is configured; otherwise returns the |
| // authorizer's answer for an `ACCESS_MESOS_LOG` request. |
| Future<bool> Master::authorizeLogAccess(const Option<Principal>& principal) |
| { |
|   if (authorizer.isNone()) { |
|     return true; |
|   } |
| |
|   authorization::Request accessRequest; |
|   accessRequest.set_action(authorization::ACCESS_MESOS_LOG); |
| |
|   // Attach a subject only if one can be created from the principal. |
|   Option<authorization::Subject> subject = createSubject(principal); |
|   if (subject.isSome()) { |
|     accessRequest.mutable_subject()->CopyFrom(subject.get()); |
|   } |
| |
|   return authorizer.get()->authorized(accessRequest); |
| } |
| |
| |
| // Entry point for incoming messages. Updates the per-principal |
| // "messages_received" counter, drops the message while the master is |
| // not elected or not yet recovered, and otherwise either throttles |
| // the message through a RateLimiter or processes it right away. |
| void Master::consume(MessageEvent&& event) |
| { |
|   // The sender's UPID relates to 'frameworks.principals' in one of |
|   // three ways: |
|   //   1) a <UPID, principal> pair exists and the principal is Some: |
|   //      it's a framework with its principal specified. |
|   //   2) a <UPID, principal> pair exists and the principal is None: |
|   //      it's a framework without a principal. |
|   //   3) no <UPID, principal> pair exists: it's either an |
|   //      unregistered framework or not a framework. |
|   // The counter and rate limiting logic below mainly cares about |
|   // whether the UPID is a *registered* framework and whether that |
|   // framework has a principal, hence these two temporaries. |
|   const bool fromRegisteredFramework = |
|     frameworks.principals.contains(event.message.from); |
|   const Option<string> principal = fromRegisteredFramework |
|     ? frameworks.principals[event.message.from] |
|     : Option<string>::none(); |
| |
|   // Increment the "message_received" counter if the message is from |
|   // a framework and such a counter is configured for it. |
|   // See comments for 'Master::Metrics::Frameworks' and |
|   // 'Master::Frameworks::principals' for details. |
|   if (principal.isSome()) { |
|     // A framework with a principal must have a counter. |
|     CHECK(metrics->frameworks.contains(principal.get())); |
|     Counter received = |
|       metrics->frameworks.get(principal.get()).get()->messages_received; |
|     ++received; |
|   } |
| |
|   // A non-leading master filters all messages. |
|   if (!elected()) { |
|     VLOG(1) << "Dropping '" << event.message.name << "' message since " |
|             << "not elected yet"; |
| |
|     ++metrics->dropped_messages; |
|     return; |
|   } |
| |
|   CHECK_SOME(recovered); |
| |
|   // A recovering master filters all messages as well. |
|   // TODO(bmahler): Consider instead re-enqueing *all* messages |
|   // through recover(). What are the performance implications of |
|   // the additional queueing delay and the accumulated backlog |
|   // of messages post-recovery? |
|   if (!recovered->isReady()) { |
|     VLOG(1) << "Dropping '" << event.message.name << "' message since " |
|             << "not recovered yet"; |
| |
|     ++metrics->dropped_messages; |
|     return; |
|   } |
| |
|   // Throttle the message if it's a framework message and a |
|   // RateLimiter is configured for the framework's principal. |
|   // The default RateLimiter throttles the framework if: |
|   //   1) the default RateLimiter is configured (and) |
|   //   2) the framework doesn't have a principal or its principal is |
|   //      not specified in 'flags.rate_limits'. |
|   // The framework is not throttled if: |
|   //   1) the default RateLimiter is not configured to handle case 2) |
|   //      above. (or) |
|   //   2) the principal exists in RateLimits but 'qps' is not set. |
|   const bool hasOwnLimiter = |
|     principal.isSome() && |
|     frameworks.limiters.contains(principal.get()) && |
|     frameworks.limiters[principal.get()].isSome(); |
| |
|   const bool fallsBackToDefaultLimiter = |
|     (principal.isNone() || |
|      !frameworks.limiters.contains(principal.get())) && |
|     fromRegisteredFramework && |
|     frameworks.defaultLimiter.isSome(); |
| |
|   if (hasOwnLimiter) { |
|     const Owned<BoundedRateLimiter>& limiter = |
|       frameworks.limiters[principal.get()].get(); |
| |
|     const bool hasRoom = |
|       limiter->capacity.isNone() || |
|       limiter->messages < limiter->capacity.get(); |
| |
|     if (hasRoom) { |
|       limiter->messages++; |
|       limiter->limiter->acquire() |
|         .onReady(defer(self(), &Self::throttled, std::move(event), principal)); |
|     } else { |
|       exceededCapacity( |
|           event, |
|           principal, |
|           limiter->capacity.get()); |
|     } |
|   } else if (fallsBackToDefaultLimiter) { |
|     const auto& defaultLimiter = frameworks.defaultLimiter.get(); |
| |
|     const bool hasRoom = |
|       defaultLimiter->capacity.isNone() || |
|       defaultLimiter->messages < defaultLimiter->capacity.get(); |
| |
|     if (hasRoom) { |
|       defaultLimiter->messages++; |
|       defaultLimiter->limiter->acquire() |
|         .onReady(defer(self(), &Self::throttled, std::move(event), None())); |
|     } else { |
|       exceededCapacity( |
|           event, |
|           principal, |
|           defaultLimiter->capacity.get()); |
|     } |
|   } else { |
|     // Not throttled: process the message right away. |
|     _consume(std::move(event)); |
|   } |
| } |
| |
| |
| // Entry point for exited events. The event is routed through the same |
| // RateLimiter (if any) that throttles messages from the exited PID. |
| void Master::consume(ExitedEvent&& event) |
| { |
|   // See comments in 'consume(MessageEvent&& event)' for which |
|   // RateLimiter is used to throttle this UPID and when it is not |
|   // throttled. |
|   // Throttling ExitedEvents is necessary so that the order between |
|   // MessageEvents and ExitedEvents from the same PID is maintained. |
|   // Note that ExitedEvents are not subject to the capacity. |
|   const bool fromRegisteredFramework = |
|     frameworks.principals.contains(event.pid); |
|   const Option<string> principal = fromRegisteredFramework |
|     ? frameworks.principals[event.pid] |
|     : Option<string>::none(); |
| |
|   // Needed to pick the `ExitedEvent` overload of `_consume` below. |
|   using ExitedHandler = void (Self::*)(ExitedEvent&&); |
| |
|   const bool hasOwnLimiter = |
|     principal.isSome() && |
|     frameworks.limiters.contains(principal.get()) && |
|     frameworks.limiters[principal.get()].isSome(); |
| |
|   const bool fallsBackToDefaultLimiter = |
|     (principal.isNone() || |
|      !frameworks.limiters.contains(principal.get())) && |
|     fromRegisteredFramework && |
|     frameworks.defaultLimiter.isSome(); |
| |
|   if (hasOwnLimiter) { |
|     frameworks.limiters[principal.get()].get()->limiter->acquire().onReady( |
|         defer(self(), |
|               static_cast<ExitedHandler>(&Self::_consume), |
|               std::move(event))); |
|   } else if (fallsBackToDefaultLimiter) { |
|     frameworks.defaultLimiter.get()->limiter->acquire().onReady( |
|         defer(self(), |
|               static_cast<ExitedHandler>(&Self::_consume), |
|               std::move(event))); |
|   } else { |
|     _consume(std::move(event)); |
|   } |
| } |
| |
| |
| // Invoked once a RateLimiter has released a throttled message: |
| // decrements the outstanding message count of the limiter that was |
| // used and then processes the message. A `principal` of None selects |
| // the default limiter. |
| // |
| // TODO(greggomann): Change this to accept an `Option<Principal>` |
| // when MESOS-7202 is resolved. |
| void Master::throttled( |
|     MessageEvent&& event, |
|     const Option<string>& principal) |
| { |
|   // A RateLimiter is already known to have throttled this event, so |
|   // all that is left is to determine which one. |
|   if (principal.isNone()) { |
|     CHECK_SOME(frameworks.defaultLimiter); |
|     frameworks.defaultLimiter.get()->messages--; |
|   } else { |
|     CHECK_SOME(frameworks.limiters[principal.get()]); |
|     frameworks.limiters[principal.get()].get()->messages--; |
|   } |
| |
|   _consume(std::move(event)); |
| } |
| |
| |
| // Hands the message to `ProtobufProcess<Master>` for processing and |
| // then bumps the sender's "messages_processed" counter, if any. |
| void Master::_consume(MessageEvent&& event) |
| { |
|   // The principal is looked up before the message is processed: |
|   // handling 'UnregisterFrameworkMessage' may delete the mapping, yet |
|   // the counter still needs to be incremented for this message. |
|   Option<string> principal = Option<string>::none(); |
|   if (frameworks.principals.contains(event.message.from)) { |
|     principal = frameworks.principals[event.message.from]; |
|   } |
| |
|   ProtobufProcess<Master>::consume(std::move(event)); |
| |
|   // The counter itself may have been removed while handling |
|   // 'UnregisterFrameworkMessage' if this was the last framework with |
|   // this principal; only increment it if it still exists. |
|   if (principal.isNone() || !metrics->frameworks.contains(principal.get())) { |
|     return; |
|   } |
| |
|   Counter processed = |
|     metrics->frameworks.get(principal.get()).get()->messages_processed; |
|   ++processed; |
| } |
| |
| |
| // Drops a message whose RateLimiter is at `capacity`: logs a warning |
| // and replies to the sender with a `FrameworkErrorMessage`. |
| // |
| // TODO(greggomann): Change this to accept an `Option<Principal>` |
| // when MESOS-7202 is resolved. |
| void Master::exceededCapacity( |
|     const MessageEvent& event, |
|     const Option<string>& principal, |
|     uint64_t capacity) |
| { |
|   const string principalSuffix = |
|     principal.isSome() ? "(" + principal.get() + ")" : ""; |
| |
|   LOG(WARNING) << "Dropping message " << event.message.name << " from " |
|                << event.message.from |
|                << principalSuffix |
|                << ": capacity(" << capacity << ") exceeded"; |
| |
|   // The error sent to the framework will abort the scheduler driver. |
|   // NOTE: The scheduler driver will send back a |
|   // DeactivateFrameworkMessage which may be dropped as well, but this |
|   // should be fine because the scheduler is already informed of an |
|   // unrecoverable error and should take action to recover. |
|   FrameworkErrorMessage error; |
|   error.set_message( |
|       "Message " + event.message.name + |
|       " dropped: capacity(" + stringify(capacity) + ") exceeded"); |
|   send(event.message.from, error); |
| } |
| |
| |
| // Forwards the exited event to the base `Process<Master>` handler. |
| // `consume(ExitedEvent&&)` calls this either directly or deferred |
| // behind a RateLimiter. |
| void Master::_consume(ExitedEvent&& event) |
| { |
| Process<Master>::consume(std::move(event)); |
| } |
| |
| |
| // Logs `message` and `failure`, separated by ": ", at FATAL severity. |
| void fail(const string& message, const string& failure) |
| { |
| LOG(FATAL) << message << ": " << failure; |
| } |
| |
| |
| // Starts recovery from the registrar the first time it is called |
| // while elected; later calls return the same `recovered` future. |
| // Fails if this master is not the elected leader. |
| Future<Nothing> Master::recover() |
| { |
|   if (!elected()) { |
|     return Failure("Not elected as leading master"); |
|   } |
| |
|   // Recovery has already been kicked off: hand back its future. |
|   if (recovered.isSome()) { |
|     return recovered.get(); |
|   } |
| |
|   LOG(INFO) << "Recovering from registrar"; |
| |
|   recovered = registrar->recover(info_) |
|     .then(defer(self(), &Self::_recover, lambda::_1)); |
| |
|   return recovered.get(); |
| } |
| |
| |
| // Rebuilds the master's in-memory state from the recovered |
| // `registry`: agents (recovered, unreachable and gone), maintenance |
| // schedules, machines, quotas and weights. Also arms the registry GC |
| // and agent reregistration timers and notifies the allocator. |
| Future<Nothing> Master::_recover(const Registry& registry) |
| { |
|   // Exit if the registry requires capabilities this master lacks. |
|   hashset<string> missingCapabilities = |
|     misingMinimumCapabilities(info_, registry); |
| |
|   if (!missingCapabilities.empty()) { |
|     LOG(ERROR) << "Master is missing the following minimum capabilities: " |
|                << strings::join<hashset<string>>(", ", missingCapabilities) |
|                << ". See the following documentation for steps to safely " |
|                << "recover from this state: " |
|                << "http://mesos.apache.org/documentation/latest/downgrades"; |
|     EXIT(EXIT_FAILURE); |
|   } |
| |
|   const auto& recoveredAgents = registry.slaves().slaves(); |
| |
|   foreach (const Registry::Slave& entry, recoveredAgents) { |
|     SlaveInfo agentInfo = entry.info(); |
| |
|     // The `SlaveInfo`'s resources are stored in the |
|     // `pre-reservation-refinement` format in order to support |
|     // downgrades. Convert them back to the `post-` format here to keep |
|     // the invariant of working with `post-` format resources within |
|     // master memory. |
|     upgradeResources(&agentInfo); |
| |
|     slaves.recovered.put(agentInfo.id(), agentInfo); |
|   } |
| |
|   foreach (const Registry::UnreachableSlave& unreachableAgent, |
|            registry.unreachable().slaves()) { |
|     CHECK(!slaves.unreachable.contains(unreachableAgent.id())); |
|     slaves.unreachable[unreachableAgent.id()] = unreachableAgent.timestamp(); |
|   } |
| |
|   foreach (const Registry::GoneSlave& goneAgent, |
|            registry.gone().slaves()) { |
|     slaves.gone[goneAgent.id()] = goneAgent.timestamp(); |
|   } |
| |
|   // Arm the timer for age-based registry GC. |
|   scheduleRegistryGc(); |
| |
|   // Arm the timeout for agents to reregister. |
|   slaves.recoveredTimer = |
|     delay(flags.agent_reregister_timeout, |
|           self(), |
|           &Self::recoveredSlavesTimeout, |
|           registry); |
| |
|   // Restore the maintenance schedules. |
|   foreach (const mesos::maintenance::Schedule& schedule, registry.schedules()) { |
|     maintenance.schedules.push_back(schedule); |
|   } |
| |
|   // Restore the machine info of each machine. |
|   foreach (const Registry::Machine& entry, registry.machines().machines()) { |
|     machines[entry.info().id()] = Machine(entry.info()); |
|   } |
| |
|   // Restore the quota of each role. |
|   foreach (const Registry::Quota& entry, registry.quotas()) { |
|     quotas[entry.info().role()] = Quota{entry.info()}; |
|   } |
| |
|   // Notify the allocator via the `recover()` call. This has to be |
|   // done before the first agent reregisters and makes its resources |
|   // available for allocation, because at this point the allocator is |
|   // already initialized and ready to perform allocations. An |
|   // allocator may decide to hold off with allocation until after it |
|   // restores a view of the cluster state. |
|   int expectedAgentCount = recoveredAgents.size(); |
|   allocator->recover(expectedAgentCount, quotas); |
| |
|   // TODO(alexr): Consider adding a sanity check: whether quotas are |
|   // satisfiable given all recovering agents reregister. We may want |
|   // to notify operators early if total quota cannot be met. |
| |
|   // Recover weights, and update the allocator accordingly. Weights |
|   // recovered from the registry take precedence: any weights given on |
|   // the command-line are then ignored. If the registry holds no |
|   // weights, the command-line weights are used and stored in the |
|   // registry. |
|   vector<WeightInfo> weightUpdates; |
| |
|   if (registry.weights_size() != 0) { |
|     // TODO(Yongqiao Wang): After the Mesos master quorum is achieved, |
|     // operator can send an update weights request to do a batch |
|     // configuration for weights, so the `--weights` flag can be |
|     // deprecated and this check can eventually be removed. |
|     if (!weights.empty()) { |
|       LOG(WARNING) << "Ignoring --weights flag '" << flags.weights.get() |
|                    << "' and recovering the weights from registry"; |
| |
|       weights.clear(); |
|     } |
| |
|     foreach (const Registry::Weight& entry, registry.weights()) { |
|       WeightInfo update; |
|       update.set_role(entry.info().role()); |
|       update.set_weight(entry.info().weight()); |
|       weightUpdates.push_back(update); |
| |
|       weights[entry.info().role()] = entry.info().weight(); |
|     } |
|   } else if (!weights.empty()) { |
|     foreachpair (const string& role, double weight, weights) { |
|       WeightInfo update; |
|       update.set_role(role); |
|       update.set_weight(weight); |
|       weightUpdates.push_back(update); |
|     } |
|     registrar->apply(Owned<RegistryOperation>( |
|         new weights::UpdateWeights(weightUpdates))); |
|   } |
| |
|   allocator->updateWeights(weightUpdates); |
| |
|   // Recovery is now complete! |
|   LOG(INFO) << "Recovered " << recoveredAgents.size() << " agents" |
|             << " from the registry (" << Bytes(registry.ByteSize()) << ")" |
|             << "; allowing " << flags.agent_reregister_timeout |
|             << " for agents to reregister"; |
| |
|   return Nothing(); |
| } |
| |
| |
| // Arms `registryGcTimer` so that `doRegistryGc` is invoked on this |
| // process after `flags.registry_gc_interval`. |
| void Master::scheduleRegistryGc() |
| { |
| registryGcTimer = delay(flags.registry_gc_interval, |
| self(), |
| &Self::doRegistryGc); |
| } |
| |
| |
| // Periodic registry garbage collection: selects unreachable and gone |
| // agents that qualify for removal and asks the registrar to prune |
| // them. The in-memory state is updated by `_doRegistryGc` once the |
| // registry operation has completed. |
| void Master::doRegistryGc() |
| { |
|   // Schedule the next periodic GC. |
|   scheduleRegistryGc(); |
| |
|   // Agents to GC are determined by examining the master's in-memory |
|   // copy of a list against two criteria, "age" and "count". |
|   // |
|   // For the "count" criteria, elements are removed from the beginning |
|   // of the list until it contains at most "registry_max_agent_count" |
|   // elements (note that `slaves.unreachable` is a `LinkedHashMap`, |
|   // which provides iteration over keys in insertion-order). |
|   // |
|   // For the "age" criteria, any element whose age is more than |
|   // "registry_max_agent_age" is removed. The entire list is checked |
|   // for this, not just the beginning: this avoids requiring that the |
|   // list be kept sorted by timestamp. |
|   // |
|   // The result is a candidate list of SlaveIDs which we then try to |
|   // remove from the registry. Not all of these SlaveIDs might be |
|   // found in the registrar's copy of the list; this can occur if |
|   // there is a concurrent write (e.g., an unreachable agent we want |
|   // to GC reregisters concurrently). In this situation, we skip |
|   // removing any elements we don't find. |
| |
|   auto selectForRemoval = |
|     [this](const LinkedHashMap<SlaveID, TimeInfo>& agents) { |
|       size_t total = agents.size(); |
|       TimeInfo now = protobuf::getCurrentTime(); |
|       hashset<SlaveID> selected; |
| |
|       foreachpair (const SlaveID& agentId, |
|                    const TimeInfo& removalTime, |
|                    agents) { |
|         // Count-based GC. |
|         CHECK(selected.size() <= total); |
| |
|         size_t remaining = total - selected.size(); |
|         if (remaining > flags.registry_max_agent_count) { |
|           selected.insert(agentId); |
|           continue; |
|         } |
| |
|         // Age-based GC. |
|         Duration age = Nanoseconds( |
|             now.nanoseconds() - removalTime.nanoseconds()); |
| |
|         if (age > flags.registry_max_agent_age) { |
|           selected.insert(agentId); |
|         } |
|       } |
| |
|       return selected; |
|     }; |
| |
|   hashset<SlaveID> unreachableToRemove = selectForRemoval(slaves.unreachable); |
|   hashset<SlaveID> goneToRemove = selectForRemoval(slaves.gone); |
| |
|   if (unreachableToRemove.empty() && goneToRemove.empty()) { |
|     VLOG(1) << "Skipping periodic registry garbage collection: " |
|             << "no agents qualify for removal"; |
| |
|     return; |
|   } |
| |
|   VLOG(1) << "Attempting to remove " << unreachableToRemove.size() |
|           << " unreachable and " << goneToRemove.size() |
|           << " gone agents from the registry"; |
| |
|   registrar->apply(Owned<RegistryOperation>( |
|       new Prune(unreachableToRemove, goneToRemove))) |
|     .onAny(defer(self(), |
|                  &Self::_doRegistryGc, |
|                  unreachableToRemove, |
|                  goneToRemove, |
|                  lambda::_1)); |
| } |
| |
| |
| // Continuation of `doRegistryGc`: after the `Prune` registry |
| // operation has completed, removes the pruned agents from the |
| // in-memory unreachable and gone lists. |
| void Master::_doRegistryGc( |
|     const hashset<SlaveID>& toRemoveUnreachable, |
|     const hashset<SlaveID>& toRemoveGone, |
|     const Future<bool>& registrarResult) |
| { |
|   CHECK(!registrarResult.isDiscarded()); |
|   CHECK(!registrarResult.isFailed()); |
| |
|   // The `Prune` registry operation should never fail. |
|   CHECK(registrarResult.get()); |
| |
|   // Bring the in-memory state in line with the registry changes. If a |
|   // concurrent registry operation also modified the unreachable/gone |
|   // list (e.g., an agent in `toRemoveXXX` concurrently reregistered), |
|   // entries in `toRemove` might not appear in `slaves.unreachable` or |
|   // `slaves.gone`. |
|   // |
|   // TODO(neilc): It would be nice to verify that the effect of these |
|   // in-memory updates is equivalent to the changes made by the registry |
|   // operation, but there isn't an easy way to do that. |
| |
|   size_t removedUnreachable = 0; |
|   foreach (const SlaveID& agentId, toRemoveUnreachable) { |
|     if (!slaves.unreachable.contains(agentId)) { |
|       LOG(WARNING) << "Failed to garbage collect " << agentId |
|                    << " from the unreachable list"; |
| |
|       continue; |
|     } |
| |
|     slaves.unreachable.erase(agentId); |
| |
|     // TODO(vinod): Consider moving these tasks into `completedTasks` by |
|     // transitioning them to a terminal state and sending status updates. |
|     // But it's not clear what this state should be. If a framework |
|     // reconciles these tasks after this point it would get `TASK_UNKNOWN` |
|     // which seems appropriate but we don't keep tasks in this state |
|     // in-memory. |
|     if (slaves.unreachableTasks.contains(agentId)) { |
|       auto& tasksByFramework = slaves.unreachableTasks.at(agentId); |
| |
|       foreachkey (const FrameworkID& frameworkId, tasksByFramework) { |
|         Framework* framework = getFramework(frameworkId); |
|         if (framework != nullptr) { |
|           foreach (const TaskID& taskId, tasksByFramework.at(frameworkId)) { |
|             framework->unreachableTasks.erase(taskId); |
|           } |
|         } |
|       } |
|     } |
| |
|     slaves.unreachableTasks.erase(agentId); |
| |
|     ++removedUnreachable; |
|   } |
| |
|   size_t removedGone = 0; |
|   foreach (const SlaveID& agentId, toRemoveGone) { |
|     if (!slaves.gone.contains(agentId)) { |
|       LOG(WARNING) << "Failed to garbage collect " << agentId |
|                    << " from the gone list"; |
| |
|       continue; |
|     } |
| |
|     slaves.gone.erase(agentId); |
|     ++removedGone; |
|   } |
| |
|   // TODO(neilc): Add a metric for # of agents discarded from the registry? |
|   LOG(INFO) << "Garbage collected " << removedUnreachable |
|             << " unreachable and " << removedGone |
|             << " gone agents from the registry"; |
| } |
| |
| |
| void Master::recoveredSlavesTimeout(const Registry& registry) |
| { |
| CHECK(elected()); |
| |
| // TODO(bmahler): Add a 'Percentage' abstraction. |
| Try<double> limit_ = numify<double>( |
| strings::remove( |
| flags.recovery_agent_removal_limit, |
| "%", |
| strings::SUFFIX)); |
| |
| CHECK_SOME(limit_); |
| |
| double limit = limit_.get() / 100.0; |
| |
| // Compute the percentage of slaves to be removed, if it exceeds the |
| // safety-net limit, bail! |
| double removalPercentage = |
| (1.0 * slaves.recovered.size()) / |
| (1.0 * registry.slaves().slaves().size()); |
| |
| if (removalPercentage > limit) { |
| EXIT(EXIT_FAILURE) |
| << "Post-recovery agent removal limit exceeded! After " |
| << flags.agent_reregister_timeout |
| << " there were " << slaves.recovered.size() |
| << " (" << removalPercentage * 100 << "%) agents recovered from the" |
| << " registry that did not reregister: \n" |
| << stringify(slaves.recovered.keys()) << "\n " |
| << " The configured removal limit is " << limit * 100 << "%. Please" |
| << " investigate or increase this limit to proceed further"; |
| } |
| |
| // Remove the slaves in a rate limited manner, similar to how the |
| // SlaveObserver removes slaves. |
| foreach (const Registry::Slave& slave, registry.slaves().slaves()) { |
| // The slave is removed from `recovered` when it completes the |
| // re-registration process. If the slave is in `reregistering`, it |
| // has started but not yet finished reregistering. In either |
| // case, we don't want to try to remove it. |
| if (!slaves.recovered.contains(slave.info().id()) || |
| slaves.reregistering.contains(slave.info().id())) { |
| continue; |
| } |
| |
| Future<Nothing> acquire = Nothing(); |
| |
| if (slaves.limiter.isSome()) { |
| LOG(INFO) << "Scheduling removal of agent " |
| << slave.info().id() << " (" << slave.info().hostname() << ")" |
| << "; did not reregister within " |
| << flags.agent_reregister_timeout << " after master failover"; |
| |
| acquire = slaves.limiter.get()->acquire(); |
| } |
| |
| const string failure = "Agent removal rate limit acquisition failed"; |
| |
| // TODO(bmahler): Cancelation currently occurs within by returning |
| // early from `markUnreachable` *without* the "discarder" having |
| // discarded the rate limit token. This approach means that if |
| // agents reregister while many of the marking unreachable |
| // operations are in progress, the rate that we mark unreachable |
| // will "slow down" rather than stay constant. We should instead |
| // discard the rate limit token when the agent reregisters and |
| // handle the discard here. See MESOS-8386. |
| acquire |
| .onFailed(lambda::bind(fail, failure, lambda::_1)) |
| .onDiscarded(lambda::bind(fail, failure, "discarded")) |
| .then(defer(self(), |
| &Self::markUnreachable, |
| slave.info(), |
| true, |
| "did not reregister within" |
| " " + stringify(flags.agent_reregister_timeout) + |
| " after master failover")) |
| .then(defer(self(), [=](bool marked) { |
| if (marked) { |
| ++metrics->slave_unreachable_completed; |
| } else { |
| ++metrics->slave_unreachable_canceled; |
| } |
| |
| return Nothing(); |
| })); |
| |
| ++metrics->slave_unreachable_scheduled; |
| } |
| } |
| |
| |
| void Master::sendSlaveLost(const SlaveInfo& slaveInfo) |
| { |
| foreachvalue (Framework* framework, frameworks.registered) { |
| if (!framework->connected()) { |
| continue; |
| } |
| |
| LOG(INFO) << "Notifying framework " << *framework << " of lost agent " |
| << slaveInfo.id() << " (" << slaveInfo.hostname() << ")"; |
| |
| LostSlaveMessage message; |
| message.mutable_slave_id()->MergeFrom(slaveInfo.id()); |
| framework->send(message); |
| } |
| |
| if (HookManager::hooksAvailable()) { |
| HookManager::masterSlaveLostHook(slaveInfo); |
| } |
| } |
| |
| |
| void Master::fileAttached(const Future<Nothing>& result, const string& path) |
| { |
| if (result.isReady()) { |
| LOG(INFO) << "Successfully attached file '" << path << "'"; |
| } else { |
| LOG(ERROR) << "Failed to attach file '" << path << "': " |
| << (result.isFailed() ? result.failure() : "discarded"); |
| } |
| } |
| |
| |
| void Master::submitScheduler(const string& name) |
| { |
| LOG(INFO) << "Scheduler submit request for " << name; |
| SubmitSchedulerResponse response; |
| response.set_okay(false); |
| reply(response); |
| } |
| |
| |
| void Master::contended(const Future<Future<Nothing>>& candidacy) |
| { |
| CHECK(!candidacy.isDiscarded()); |
| |
| if (candidacy.isFailed()) { |
| EXIT(EXIT_FAILURE) << "Failed to contend: " << candidacy.failure(); |
| } |
| |
| // Watch for candidacy change. |
| candidacy |
| ->onAny(defer(self(), &Master::lostCandidacy, lambda::_1)); |
| } |
| |
| |
| void Master::lostCandidacy(const Future<Nothing>& lost) |
| { |
| CHECK(!lost.isDiscarded()); |
| |
| if (lost.isFailed()) { |
| EXIT(EXIT_FAILURE) << "Failed to watch for candidacy: " << lost.failure(); |
| } |
| |
| if (elected()) { |
| EXIT(EXIT_FAILURE) << "Lost candidacy as a leader... committing suicide!"; |
| } |
| |
| LOG(INFO) << "Lost candidacy as a follower... Contend again"; |
| contender->contend() |
| .onAny(defer(self(), &Master::contended, lambda::_1)); |
| } |
| |
| |
| void Master::detected(const Future<Option<MasterInfo>>& _leader) |
| { |
| CHECK(!_leader.isDiscarded()); |
| |
| if (_leader.isFailed()) { |
| EXIT(EXIT_FAILURE) |
| << "Failed to detect the leading master: " << _leader.failure() |
| << "; committing suicide!"; |
| } |
| |
| bool wasElected = elected(); |
| leader = _leader.get(); |
| |
| if (elected()) { |
| electedTime = Clock::now(); |
| |
| if (!wasElected) { |
| LOG(INFO) << "Elected as the leading master!"; |
| |
| // Begin the recovery process, bail if it fails or is discarded. |
| recover() |
| .onFailed(lambda::bind(fail, "Recovery failed", lambda::_1)) |
| .onDiscarded(lambda::bind(fail, "Recovery failed", "discarded")); |
| } else { |
| // This happens if there is a ZK blip that causes a re-election |
| // but the same leading master is elected as leader. |
| LOG(INFO) << "Re-elected as the leading master"; |
| } |
| } else if (leader.isSome()) { |
| // A different node has been elected as the leading master. |
| LOG(INFO) << "The newly elected leader is " << leader->pid() |
| << " with id " << leader->id(); |
| |
| if (wasElected) { |
| EXIT(EXIT_FAILURE) << "Conceded leadership to another master..." |
| << " committing suicide!"; |
| } |
| |
| // If this master and the current leader both have a configured |
| // domain and the current leader is located in a different region, |
| // exit with an error message: this indicates a configuration |
| // error, since all masters must be in the same region. |
| if (leader->has_domain() && info_.has_domain()) { |
| const DomainInfo& leaderDomain = leader->domain(); |
| const DomainInfo& selfDomain = info_.domain(); |
| |
| // We currently reject configured domains without fault domains, |
| // but that might change in the future. For compatibility with |
| // future versions of Mesos, we treat a master with a configured |
| // domain but no fault domain as equivalent to a master with no |
| // configured domain. |
| if (leaderDomain.has_fault_domain() && selfDomain.has_fault_domain()) { |
| const DomainInfo::FaultDomain::RegionInfo& leaderRegion = |
| leaderDomain.fault_domain().region(); |
| const DomainInfo::FaultDomain::RegionInfo& selfRegion = |
| selfDomain.fault_domain().region(); |
| |
| if (leaderRegion != selfRegion) { |
| EXIT(EXIT_FAILURE) << "Leading master uses domain " |
| << leaderDomain << "; this master is " |
| << "configured to use domain " |
| << selfDomain << "; all masters in the " |
| << "same cluster must use the same region"; |
| } |
| } |
| } |
| } else { |
| // If an election occured and no leader was elected, `None` is returned. |
| LOG(INFO) << "No master was elected."; |
| |
| if (wasElected) { |
| EXIT(EXIT_FAILURE) << "Lost leadership after indecisive election..." |
| << " committing suicide!"; |
| } |
| } |
| |
| // Keep detecting. |
| detector->detect(leader) |
| .onAny(defer(self(), &Master::detected, lambda::_1)); |
| } |
| |
| |
| Future<bool> Master::authorizeFramework( |
| const FrameworkInfo& frameworkInfo) |
| { |
| if (authorizer.isNone()) { |
| return true; // Authorization is disabled. |
| } |
| |
| LOG(INFO) << "Authorizing framework principal '" << frameworkInfo.principal() |
| << "' to receive offers for roles '" |
| << stringify(protobuf::framework::getRoles(frameworkInfo)) << "'"; |
| |
| authorization::Request request; |
| request.set_action(authorization::REGISTER_FRAMEWORK); |
| |
| if (frameworkInfo.has_principal()) { |
| request.mutable_subject()->set_value(frameworkInfo.principal()); |
| } |
| |
| request.mutable_object()->mutable_framework_info()->CopyFrom(frameworkInfo); |
| |
| // For non-`MULTI_ROLE` frameworks, also propagate its single role |
| // via the request's `value` field. This is purely for backwards |
| // compatibility as the `value` field is deprecated. Note that this |
| // means that authorizers relying on the deprecated field will see |
| // an empty string in `value` for `MULTI_ROLE` frameworks. |
| // |
| // TODO(bbannier): Remove this at the end of `value`'s deprecation |
| // cycle, see MESOS-7073. |
| if (!protobuf::frameworkHasCapability( |
| frameworkInfo, FrameworkInfo::Capability::MULTI_ROLE)) { |
| request.mutable_object()->set_value(frameworkInfo.role()); |
| } |
| |
| return authorizer.get()->authorized(request); |
| } |
| |
| |
| Option<Error> Master::validateFrameworkAuthentication( |
| const FrameworkInfo& frameworkInfo, |
| const UPID& from) |
| { |
| if (authenticating.contains(from)) { |
| return Error("Re-authentication in progress"); |
| } |
| |
| if (flags.authenticate_frameworks && !authenticated.contains(from)) { |
| // This could happen if another authentication request came |
| // through before we are here or if a framework tried to |
| // (re-)register without authentication. |
| return Error("Framework at " + stringify(from) + " is not authenticated"); |
| } |
| |
| // TODO(bmahler): Currently the scheduler driver does not |
| // set 'principal', so we allow frameworks to omit it. |
| if (frameworkInfo.has_principal() && |
| authenticated.contains(from) && |
| frameworkInfo.principal() != authenticated[from]) { |
| return Error("Framework principal '" + frameworkInfo.principal() + "'" |
| " does not match authenticated principal" |
| " '" + authenticated[from] + "'"); |
| } |
| |
| return None(); |
| } |
| |
| |
| void Master::drop( |
| const UPID& from, |
| const scheduler::Call& call, |
| const string& message) |
| { |
| // TODO(bmahler): Increment a metric. |
| |
| LOG(WARNING) << "Dropping " << call.type() << " call" |
| << " from framework " << call.framework_id() |
| << " at " << from << ": " << message; |
| } |
| |
| |
| void Master::drop( |
| Framework* framework, |
| const Offer::Operation& operation, |
| const string& message) |
| { |
| CHECK_NOTNULL(framework); |
| |
| // TODO(jieyu): Increment a metric. |
| |
| LOG(WARNING) << "Dropping " << Offer::Operation::Type_Name(operation.type()) |
| << " operation from framework " << *framework |
| << ": " << message; |
| |
| // NOTE: The operation validation code should be refactored. Due to the order |
| // of validation, it's possible that this function will be called before the |
| // master validates that operations from v0 frameworks should not have their |
| // ID set. |
| if (operation.has_id() && framework->http.isSome()) { |
| scheduler::Event update; |
| update.set_type(scheduler::Event::UPDATE_OPERATION_STATUS); |
| |
| // NOTE: We do not attempt to set the agent or resource provider IDs for |
| // dropped operations as we cannot guarantee to always know their values. |
| // |
| // TODO(bbannier): Set agent or resource provider ID if we know |
| // for certain that the operation was valid. |
| *update.mutable_update_operation_status()->mutable_status() = |
| protobuf::createOperationStatus( |
| OperationState::OPERATION_ERROR, |
| operation.id(), |
| message); |
| |
| framework->send(update); |
| } |
| } |
| |
| |
| void Master::drop( |
| Framework* framework, |
| const scheduler::Call& call, |
| const string& message) |
| { |
| CHECK_NOTNULL(framework); |
| |
| // TODO(gyliu513): Increment a metric. |
| |
| LOG(WARNING) << "Dropping " << call.type() << " call" |
| << " from framework " << *framework |
| << ": " << message; |
| } |
| |
| |
| void Master::drop( |
| Framework* framework, |
| const scheduler::Call::Suppress& suppress, |
| const string& message) |
| { |
| scheduler::Call call; |
| call.set_type(scheduler::Call::SUPPRESS); |
| call.mutable_suppress()->CopyFrom(suppress); |
| |
| drop(framework, call, message); |
| } |
| |
| |
| void Master::drop( |
| Framework* framework, |
| const scheduler::Call::Revive& revive, |
| const string& message) |
| { |
| scheduler::Call call; |
| call.set_type(scheduler::Call::REVIVE); |
| call.mutable_revive()->CopyFrom(revive); |
| |
| drop(framework, call, message); |
| } |
| |
| |
| void Master::receive( |
| const UPID& from, |
| scheduler::Call&& call) |
| { |
| // TODO(vinod): Add metrics for calls. |
| |
| Option<Error> error = validation::scheduler::call::validate(call); |
| |
| if (error.isSome()) { |
| metrics->incrementInvalidSchedulerCalls(call); |
| drop(from, call, error->message); |
| return; |
| } |
| |
| if (call.type() == scheduler::Call::SUBSCRIBE) { |
| subscribe(from, call.subscribe()); |
| return; |
| } |
| |
| // We consolidate the framework lookup and pid validation logic here |
| // because they are common for all the call handlers. |
| Framework* framework = getFramework(call.framework_id()); |
| |
| if (framework == nullptr) { |
| drop(from, call, "Framework cannot be found"); |
| return; |
| } |
| |
| if (framework->pid != from) { |
| drop(from, call, "Call is not from registered framework"); |
| return; |
| } |
| |
| framework->metrics.incrementCall(call.type()); |
| |
| // This is possible when master --> framework link is broken (i.e., one |
| // way network partition) and the framework is not aware of it. There |
| // is no way for driver based frameworks to detect this in the absence |
| // of periodic heartbeat events. We send an error message to the framework |
| // causing the scheduler driver to abort when this happens. |
| if (!framework->connected()) { |
| const string error = "Framework disconnected"; |
| |
| LOG(INFO) << "Refusing " << call.type() << " call from framework " |
| << *framework << ": " << error; |
| |
| FrameworkErrorMessage message; |
| message.set_message(error); |
| send(from, message); |
| return; |
| } |
| |
| switch (call.type()) { |
| case scheduler::Call::SUBSCRIBE: |
| // SUBSCRIBE call should have been handled above. |
| LOG(FATAL) << "Unexpected 'SUBSCRIBE' call"; |
| |
| case scheduler::Call::TEARDOWN: |
| teardown(framework); |
| break; |
| |
| case scheduler::Call::ACCEPT: |
| accept(framework, std::move(*call.mutable_accept())); |
| break; |
| |
| case scheduler::Call::DECLINE: |
| decline(framework, std::move(*call.mutable_decline())); |
| break; |
| |
| case scheduler::Call::ACCEPT_INVERSE_OFFERS: |
| acceptInverseOffers(framework, call.accept_inverse_offers()); |
| break; |
| |
| case scheduler::Call::DECLINE_INVERSE_OFFERS: |
| declineInverseOffers(framework, call.decline_inverse_offers()); |
| break; |
| |
| case scheduler::Call::REVIVE: |
| revive(framework, call.revive()); |
| break; |
| |
| case scheduler::Call::KILL: |
| kill(framework, call.kill()); |
| break; |
| |
| case scheduler::Call::SHUTDOWN: |
| shutdown(framework, call.shutdown()); |
| break; |
| |
| case scheduler::Call::ACKNOWLEDGE: { |
| acknowledge(framework, std::move(*call.mutable_acknowledge())); |
| break; |
| } |
| |
| case scheduler::Call::ACKNOWLEDGE_OPERATION_STATUS: { |
| drop( |
| from, |
| call, |
| "'ACKNOWLEDGE_OPERATION_STATUS' is not supported by the v0 API"); |
| break; |
| } |
| |
| case scheduler::Call::RECONCILE: |
| reconcile(framework, std::move(*call.mutable_reconcile())); |
| break; |
| |
| case scheduler::Call::RECONCILE_OPERATIONS: |
| drop( |
| from, |
| call, |
| "'RECONCILE_OPERATIONS' is not supported by the v0 API"); |
| break; |
| |
| case scheduler::Call::MESSAGE: |
| message(framework, std::move(*call.mutable_message())); |
| break; |
| |
| case scheduler::Call::REQUEST: |
| request(framework, call.request()); |
| break; |
| |
| case scheduler::Call::SUPPRESS: |
| suppress(framework, call.suppress()); |
| break; |
| |
| case scheduler::Call::UNKNOWN: |
| LOG(WARNING) << "'UNKNOWN' call"; |
| break; |
| } |
| } |
| |
| |
| void Master::registerFramework( |
| const UPID& from, |
| RegisterFrameworkMessage&& registerFrameworkMessage) |
| { |
| FrameworkInfo frameworkInfo = |
| std::move(*registerFrameworkMessage.mutable_framework()); |
| |
| if (frameworkInfo.has_id() && !frameworkInfo.id().value().empty()) { |
| const string error = "Registering with 'id' already set"; |
| |
| LOG(INFO) << "Refusing registration request of framework" |
| << " '" << frameworkInfo.name() << "' at " << from |
| << ": " << error; |
| |
| FrameworkErrorMessage message; |
| message.set_message(error); |
| send(from, message); |
| return; |
| } |
| |
| scheduler::Call::Subscribe call; |
| *call.mutable_framework_info() = std::move(frameworkInfo); |
| |
| subscribe(from, call); |
| } |
| |
| |
| void Master::reregisterFramework( |
| const UPID& from, |
| ReregisterFrameworkMessage&& reregisterFrameworkMessage) |
| { |
| FrameworkInfo frameworkInfo = |
| std::move(*reregisterFrameworkMessage.mutable_framework()); |
| |
| if (!frameworkInfo.has_id() || frameworkInfo.id().value().empty()) { |
| const string error = "Re-registering without an 'id'"; |
| |
| LOG(INFO) << "Refusing re-registration request of framework" |
| << " '" << frameworkInfo.name() << "' at " << from |
| << ": " << error; |
| |
| FrameworkErrorMessage message; |
| message.set_message(error); |
| send(from, message); |
| return; |
| } |
| |
| scheduler::Call::Subscribe call; |
| *call.mutable_framework_info() = std::move(frameworkInfo); |
| call.set_force(reregisterFrameworkMessage.failover()); |
| |
| subscribe(from, call); |
| } |
| |
| |
| void Master::subscribe( |
| HttpConnection http, |
| const scheduler::Call::Subscribe& subscribe) |
| { |
| // TODO(anand): Authenticate the framework. |
| |
| const FrameworkInfo& frameworkInfo = subscribe.framework_info(); |
| |
| // Update messages_{re}register_framework accordingly. |
| if (!frameworkInfo.has_id() || frameworkInfo.id() == "") { |
| ++metrics->messages_register_framework; |
| } else { |
| ++metrics->messages_reregister_framework; |
| } |
| |
| LOG(INFO) << "Received subscription request for" |
| << " HTTP framework '" << frameworkInfo.name() << "'"; |
| |
| Option<Error> validationError = |
| validation::framework::validate(frameworkInfo); |
| |
| if (validationError.isNone()) { |
| // Check the framework's role(s) against the whitelist. |
| set<string> invalidRoles; |
| |
| if (protobuf::frameworkHasCapability( |
| frameworkInfo, |
| FrameworkInfo::Capability::MULTI_ROLE)) { |
| foreach (const string& role, frameworkInfo.roles()) { |
| if (!isWhitelistedRole(role)) { |
| invalidRoles.insert(role); |
| } |
| } |
| } else { |
| if (!isWhitelistedRole(frameworkInfo.role())) { |
| invalidRoles.insert(frameworkInfo.role()); |
| } |
| } |
| |
| if (!invalidRoles.empty()) { |
| validationError = Error("Roles " + stringify(invalidRoles) + |
| " are not present in master's --roles"); |
| } |
| } |
| |
| // Ensure each of the suppressed role is contained in the list of roles. |
| set<string> frameworkRoles = protobuf::framework::getRoles(frameworkInfo); |
| set<string> suppressedRoles = set<string>( |
| subscribe.suppressed_roles().begin(), subscribe.suppressed_roles().end()); |
| |
| if (validationError.isNone()) { |
| // The suppressed roles must be contained within the list of all |
| // roles for the framwork. |
| foreach (const string& role, suppressedRoles) { |
| if (!frameworkRoles.count(role)) { |
| validationError = Error("Suppressed role '" + role + |
| "' is not contained in the list of roles"); |
| |
| break; |
| } |
| } |
| } |
| |
| // TODO(vinod): Deprecate this in favor of authorization. |
| if (validationError.isNone() && |
| frameworkInfo.user() == "root" && !flags.root_submissions) { |
| validationError = Error("User 'root' is not allowed to run frameworks" |
| " without --root_submissions set"); |
| } |
| |
| if (validationError.isNone() && frameworkInfo.has_id() && |
| isCompletedFramework(frameworkInfo.id())) { |
| // This could happen if a framework tries to subscribe after its failover |
| // timeout has elapsed, or it has been torn down via the operator API. |
| // |
| // TODO(vinod): Master should persist admitted frameworks to the |
| // registry and remove them from it after failover timeout. |
| validationError = Error("Framework has been removed"); |
| } |
| |
| if (validationError.isNone() && !isValidFailoverTimeout(frameworkInfo)) { |
| validationError = Error("The framework failover_timeout (" + |
| stringify(frameworkInfo.failover_timeout()) + |
| ") is invalid"); |
| } |
| |
| if (validationError.isSome()) { |
| LOG(INFO) << "Refusing subscription of framework" |
| << " '" << frameworkInfo.name() << "': " |
| << validationError->message; |
| |
| FrameworkErrorMessage message; |
| message.set_message(validationError->message); |
| |
| http.send(message); |
| http.close(); |
| return; |
| } |
| |
| // Need to disambiguate for the compiler. |
| void (Master::*_subscribe)( |
| HttpConnection, |
| const FrameworkInfo&, |
| bool, |
| const set<string>&, |
| const Future<bool>&) = &Self::_subscribe; |
| |
| authorizeFramework(frameworkInfo) |
| .onAny(defer(self(), |
| _subscribe, |
| http, |
| frameworkInfo, |
| subscribe.force(), |
| suppressedRoles, |
| lambda::_1)); |
| } |
| |
| |
| void Master::_subscribe( |
| HttpConnection http, |
| const FrameworkInfo& frameworkInfo, |
| bool force, |
| const set<string>& suppressedRoles, |
| const Future<bool>& authorized) |
| { |
| CHECK(!authorized.isDiscarded()); |
| |
| Option<Error> authorizationError = None(); |
| |
| if (authorized.isFailed()) { |
| authorizationError = |
| Error("Authorization failure: " + authorized.failure()); |
| } else if (!authorized.get()) { |
| authorizationError = Error( |
| "Not authorized to use roles '" + |
| stringify(protobuf::framework::getRoles(frameworkInfo)) + "'"); |
| } |
| |
| if (authorizationError.isSome()) { |
| LOG(INFO) << "Refusing subscription of framework" |
| << " '" << frameworkInfo.name() << "'" |
| << ": " << authorizationError->message; |
| |
| FrameworkErrorMessage message; |
| message.set_message(authorizationError->message); |
| http.send(message); |
| http.close(); |
| return; |
| } |
| |
| LOG(INFO) << "Subscribing framework '" << frameworkInfo.name() |
| << "' with checkpointing " |
| << (frameworkInfo.checkpoint() ? "enabled" : "disabled") |
| << " and capabilities " << frameworkInfo.capabilities(); |
| |
| if (!frameworkInfo.has_id() || frameworkInfo.id() == "") { |
| // If we are here the framework is subscribing for the first time. |
| // Assign a new FrameworkID. |
| FrameworkInfo frameworkInfo_ = frameworkInfo; |
| frameworkInfo_.mutable_id()->CopyFrom(newFrameworkId()); |
| |
| Framework* framework = new Framework(this, flags, frameworkInfo_, http); |
| |
| addFramework(framework, suppressedRoles); |
| |
| framework->metrics.incrementCall(scheduler::Call::SUBSCRIBE); |
| |
| FrameworkRegisteredMessage message; |
| message.mutable_framework_id()->MergeFrom(framework->id()); |
| message.mutable_master_info()->MergeFrom(info_); |
| framework->send(message); |
| |
| // Start the heartbeat after sending SUBSCRIBED event. |
| framework->heartbeat(); |
| |
| if (!subscribers.subscribed.empty()) { |
| subscribers.send( |
| protobuf::master::event::createFrameworkAdded(*framework)); |
| } |
| |
| return; |
| } |
| |
| // If we are here the framework has already been assigned an id. |
| CHECK(!frameworkInfo.id().value().empty()); |
| |
| Framework* framework = getFramework(frameworkInfo.id()); |
| |
| if (framework == nullptr) { |
| // The framework has not yet reregistered after master failover. |
| // Furthermore, no agents have reregistered running one of this |
| // framework's tasks. Reconstruct a `Framework` object from the |
| // supplied `FrameworkInfo`. |
| recoverFramework(frameworkInfo, suppressedRoles); |
| |
| framework = getFramework(frameworkInfo.id()); |
| } |
| |
| CHECK_NOTNULL(framework); |
| |
| framework->metrics.incrementCall(scheduler::Call::SUBSCRIBE); |
| |
| if (!framework->recovered()) { |
| // The framework has previously been registered with this master; |
| // it may or may not currently be connected. |
| |
| updateFramework(framework, frameworkInfo, suppressedRoles); |
| framework->reregisteredTime = Clock::now(); |
| |
| // Always failover the old framework connection. See MESOS-4712 for details. |
| failoverFramework(framework, http); |
| } else { |
| // The framework has not yet reregistered after master failover. |
| Try<Nothing> activate = activateRecoveredFramework( |
| framework, frameworkInfo, None(), http, suppressedRoles); |
| |
| if (activate.isError()) { |
| LOG(INFO) << "Could not update FrameworkInfo of framework '" |
| << frameworkInfo.name() << "': " << activate.error(); |
| |
| FrameworkErrorMessage message; |
| message.set_message(activate.error()); |
| http.send(message); |
| http.close(); |
| return; |
| } |
| } |
| |
| if (!subscribers.subscribed.empty()) { |
| subscribers.send( |
| protobuf::master::event::createFrameworkUpdated(*framework)); |
| } |
| |
| // Broadcast the new framework pid to all the slaves. We have to |
| // broadcast because an executor might be running on a slave but |
| // it currently isn't running any tasks. |
| foreachvalue (Slave* slave, slaves.registered) { |
| UpdateFrameworkMessage message; |
| message.mutable_framework_id()->CopyFrom(frameworkInfo.id()); |
| |
| // TODO(anand): We set 'pid' to UPID() for http frameworks |
| // as 'pid' was made optional in 0.24.0. In 0.25.0, we |
| // no longer have to set pid here for http frameworks. |
| message.set_pid(UPID()); |
| message.mutable_framework_info()->CopyFrom(frameworkInfo); |
| send(slave->pid, message); |
| } |
| } |
| |
| |
| void Master::subscribe( |
| const UPID& from, |
| const scheduler::Call::Subscribe& subscribe) |
| { |
| FrameworkInfo frameworkInfo = subscribe.framework_info(); |
| |
| // Update messages_{re}register_framework accordingly. |
| if (!frameworkInfo.has_id() || frameworkInfo.id() == "") { |
| ++metrics->messages_register_framework; |
| } else { |
| ++metrics->messages_reregister_framework; |
| } |
| |
| if (authenticating.contains(from)) { |
| // TODO(vinod): Consider dropping this request and fix the tests |
| // to deal with the drop. Currently there is a race between master |
| // realizing the framework is authenticated and framework sending |
| // a subscribe call. Dropping this message will cause the |
| // framework to retry slowing down the tests. |
| LOG(INFO) << "Queuing up SUBSCRIBE call for" |
| << " framework '" << frameworkInfo.name() << "' at " << from |
| << " because authentication is still in progress"; |
| |
| // Need to disambiguate for the compiler. |
| void (Master::*f)(const UPID&, const scheduler::Call::Subscribe&) |
| = &Self::subscribe; |
| |
| authenticating[from] |
| .onReady(defer(self(), f, from, subscribe)); |
| return; |
| } |
| |
| Option<Error> validationError = |
| validation::framework::validate(frameworkInfo); |
| |
| if (validationError.isNone()) { |
| // Check the framework's role(s) against the whitelist. |
| set<string> invalidRoles; |
| |
| if (protobuf::frameworkHasCapability( |
| frameworkInfo, |
| FrameworkInfo::Capability::MULTI_ROLE)) { |
| foreach (const string& role, frameworkInfo.roles()) { |
| if (!isWhitelistedRole(role)) { |
| invalidRoles.insert(role); |
| } |
| } |
| } else { |
| if (!isWhitelistedRole(frameworkInfo.role())) { |
| invalidRoles.insert(frameworkInfo.role()); |
| } |
| } |
| |
| if (!invalidRoles.empty()) { |
| validationError = Error("Roles " + stringify(invalidRoles) + |
| " are not present in the master's --roles"); |
| } |
| } |
| |
| // Ensure each of the suppressed role is contained in the list of roles. |
| set<string> frameworkRoles = protobuf::framework::getRoles(frameworkInfo); |
| set<string> suppressedRoles = set<string>( |
| subscribe.suppressed_roles().begin(), subscribe.suppressed_roles().end()); |
| |
| if (validationError.isNone()) { |
| // The suppressed roles must be contained within the list of all |
| // roles for the framwork. |
| foreach (const string& role, suppressedRoles) { |
| if (!frameworkRoles.count(role)) { |
| validationError = Error("Suppressed role '" + role + |
| "' is not contained in the list of roles"); |
| |
| break; |
| } |
| } |
| } |
| |
| // TODO(vinod): Deprecate this in favor of authorization. |
| if (validationError.isNone() && |
| frameworkInfo.user() == "root" && !flags.root_submissions) { |
| validationError = Error("User 'root' is not allowed to run frameworks" |
| " without --root_submissions set"); |
| } |
| |
| if (validationError.isNone() && frameworkInfo.has_id() && |
| isCompletedFramework(frameworkInfo.id())) { |
| // This could happen if a framework tries to subscribe after its |
| // failover timeout has elapsed or it unregistered itself by |
| // calling 'stop()' on the scheduler driver. |
| // |
| // TODO(vinod): Master should persist admitted frameworks to the |
| // registry and remove them from it after failover timeout. |
| validationError = Error("Framework has been removed"); |
| } |
| |
| if (validationError.isNone() && !isValidFailoverTimeout(frameworkInfo)) { |
| validationError = Error("The framework failover_timeout (" + |
| stringify(frameworkInfo.failover_timeout()) + |
| ") is invalid"); |
| } |
| |
| // Note that re-authentication errors are already handled above. |
| if (validationError.isNone()) { |
| validationError = validateFrameworkAuthentication(frameworkInfo, from); |
| } |
| |
| if (validationError.isSome()) { |
| LOG(INFO) << "Refusing subscription of framework" |
| << " '" << frameworkInfo.name() << "' at " << from << ": " |
| << validationError->message; |
| |
| FrameworkErrorMessage message; |
| message.set_message(validationError->message); |
| send(from, message); |
| return; |
| } |
| |
| LOG(INFO) << "Received SUBSCRIBE call for" |
| << " framework '" << frameworkInfo.name() << "' at " << from; |
| |
| // We allow an authenticated framework to not specify a principal |
| // in `FrameworkInfo` but we'd prefer to log a WARNING here. We also |
| // set `FrameworkInfo.principal` to the value of authenticated principal |
| // and use it for authorization later when it happens. |
| if (!frameworkInfo.has_principal() && authenticated.contains(from)) { |
| LOG(WARNING) |
| << "Setting 'principal' in FrameworkInfo to '" << authenticated[from] |
| << "' because the framework authenticated with that principal but did " |
| << "not set it in FrameworkInfo"; |
| |
| frameworkInfo.set_principal(authenticated[from]); |
| } |
| |
| // Need to disambiguate for the compiler. |
| void (Master::*_subscribe)( |
| const UPID&, |
| const FrameworkInfo&, |
| bool, |
| const set<string>&, |
| const Future<bool>&) = &Self::_subscribe; |
| |
| authorizeFramework(frameworkInfo) |
| .onAny(defer(self(), |
| _subscribe, |
| from, |
| frameworkInfo, |
| subscribe.force(), |
| suppressedRoles, |
| lambda::_1)); |
| } |
| |
| |
| void Master::_subscribe( |
| const UPID& from, |
| const FrameworkInfo& frameworkInfo, |
| bool force, |
| const set<string>& suppressedRoles, |
| const Future<bool>& authorized) |
| { |
| CHECK(!authorized.isDiscarded()); |
| |
| Option<Error> authorizationError = None(); |
| |
| if (authorized.isFailed()) { |
| authorizationError = |
| Error("Authorization failure: " + authorized.failure()); |
| } else if (!authorized.get()) { |
| authorizationError = Error( |
| "Not authorized to use roles '" + |
| stringify(protobuf::framework::getRoles(frameworkInfo)) + "'"); |
| } |
| |
| if (authorizationError.isSome()) { |
| LOG(INFO) << "Refusing subscription of framework" |
| << " '" << frameworkInfo.name() << "' at " << from |
| << ": " << authorizationError->message; |
| |
| FrameworkErrorMessage message; |
| message.set_message(authorizationError->message); |
| |
| send(from, message); |
| return; |
| } |
| |
| // At this point, authentications errors will be due to |
| // re-authentication during the authorization process, |
| // so we drop the subscription. |
| Option<Error> authenticationError = |
| validateFrameworkAuthentication(frameworkInfo, from); |
| |
| if (authenticationError.isSome()) { |
| LOG(INFO) << "Dropping SUBSCRIBE call for framework" |
| << " '" << frameworkInfo.name() << "' at " << from |
| << ": " << authenticationError->message; |
| |
| return; |
| } |
| |
| LOG(INFO) << "Subscribing framework " << frameworkInfo.name() |
| << " with checkpointing " |
| << (frameworkInfo.checkpoint() ? "enabled" : "disabled") |
| << " and capabilities " << frameworkInfo.capabilities(); |
| |
| if (!frameworkInfo.has_id() || frameworkInfo.id().value().empty()) { |
| // If we are here the framework is subscribing for the first time. |
| // Check if this framework is already subscribed (because it retries). |
| foreachvalue (Framework* framework, frameworks.registered) { |
| if (framework->pid == from) { |
| LOG(INFO) << "Framework " << *framework |
| << " already subscribed, resending acknowledgement"; |
| |
| FrameworkRegisteredMessage message; |
| message.mutable_framework_id()->MergeFrom(framework->id()); |
| message.mutable_master_info()->MergeFrom(info_); |
| framework->send(message); |
| return; |
| } |
| } |
| |
| CHECK(!frameworks.principals.contains(from)); |
| |
| // Assign a new FrameworkID. |
| FrameworkInfo frameworkInfo_ = frameworkInfo; |
| frameworkInfo_.mutable_id()->CopyFrom(newFrameworkId()); |
| |
| Framework* framework = new Framework(this, flags, frameworkInfo_, from); |
| |
| addFramework(framework, suppressedRoles); |
| |
| FrameworkRegisteredMessage message; |
| message.mutable_framework_id()->MergeFrom(framework->id()); |
| message.mutable_master_info()->MergeFrom(info_); |
| framework->send(message); |
| |
| if (!subscribers.subscribed.empty()) { |
| subscribers.send( |
| protobuf::master::event::createFrameworkAdded(*framework)); |
| } |
| |
| return; |
| } |
| |
| // If we are here the framework has already been assigned an id. |
| CHECK(!frameworkInfo.id().value().empty()); |
| |
| // Check whether we got a subscribe from a framework whose UPID duplicates |
| // a framework that is already connected. Note that we don't send an error |
| // response because that would go to the framework that is already connected. |
| if (frameworks.principals.contains(from)) { |
| foreachvalue (Framework* framework, frameworks.registered) { |
| if (framework->pid == from && framework->id() != frameworkInfo.id()) { |
| LOG(ERROR) << "Dropping SUBSCRIBE call for framework '" |
| << frameworkInfo.name() << "': " << *framework |
| << " already connected at " << from; |
| |
| return; |
| } |
| } |
| } |
| |
| Framework* framework = getFramework(frameworkInfo.id()); |
| |
| if (framework == nullptr) { |
| // The framework has not yet reregistered after master failover. |
| // Furthermore, no agents have reregistered running one of this |
| // framework's tasks. Reconstruct a `Framework` object from the |
| // supplied `FrameworkInfo`. |
| recoverFramework(frameworkInfo, suppressedRoles); |
| |
| framework = getFramework(frameworkInfo.id()); |
| } |
| |
| CHECK_NOTNULL(framework); |
| |
| if (!framework->recovered()) { |
| // The framework has previously been registered with this master; |
| // it may or may not currently be connected. |
| // |
| // Using the "force" field of the scheduler allows us to keep a |
| // scheduler that got partitioned but didn't die (in ZooKeeper |
| // speak this means didn't lose their session) and then |
| // eventually tried to connect to this master even though |
| // another instance of their scheduler has reconnected. |
| |
| // Test for the error case first. |
| if ((framework->pid != from) && !force) { |
| LOG(ERROR) << "Disallowing subscription attempt of" |
| << " framework " << *framework |
| << " because it is not expected from " << from; |
| |
| FrameworkErrorMessage message; |
| message.set_message("Framework failed over"); |
| send(from, message); |
| return; |
| } |
| |
| // It is now safe to update the framework fields since the request is now |
| // guaranteed to be successful. We use the fields passed in during |
| // re-registration. |
| updateFramework(framework, frameworkInfo, suppressedRoles); |
| |
| framework->reregisteredTime = Clock::now(); |
| |
| if (force) { |
| // TODO(vinod): Now that the scheduler pid is unique we don't |
| // need to call 'failoverFramework()' if the pid hasn't changed |
| // (i.e., duplicate message). Instead we can just send the |
| // FrameworkReregisteredMessage back and activate the framework |
| // if necesssary. |
| LOG(INFO) << "Framework " << *framework << " failed over"; |
| failoverFramework(framework, from); |
| |
| if (!subscribers.subscribed.empty()) { |
| subscribers.send( |
| protobuf::master::event::createFrameworkUpdated(*framework)); |
| } |
| } else { |
| LOG(INFO) << "Allowing framework " << *framework |
| << " to subscribe with an already used id"; |
| |
| // Remove any offers sent to this framework. |
| // NOTE: We need to do this because the scheduler might have |
| // replied to the offers but the driver might have dropped |
| // those messages since it wasn't connected to the master. |
| foreach (Offer* offer, utils::copy(framework->offers)) { |
| allocator->recoverResources( |
| offer->framework_id(), |
| offer->slave_id(), |
| offer->resources(), |
| None()); |
| removeOffer(offer, true); // Rescind. |
| } |
| |
| // Also remove inverse offers. |
| foreach (InverseOffer* inverseOffer, |
| utils::copy(framework->inverseOffers)) { |
| allocator->updateInverseOffer( |
| inverseOffer->slave_id(), |
| inverseOffer->framework_id(), |
| UnavailableResources{ |
| inverseOffer->resources(), |
| inverseOffer->unavailability()}, |
| None()); |
| |
| removeInverseOffer(inverseOffer, true); // Rescind. |
| } |
| |
| // Relink to the framework. This might be necessary if the |
| // framework link previously broke. |
| link(framework->pid.get()); |
| |
| // Reactivate the framework. |
| // NOTE: We do this after recovering resources (above) so that |
| // the allocator has the correct view of the framework's share. |
| if (!framework->active()) { |
| framework->setFrameworkState(Framework::State::ACTIVE); |
| allocator->activateFramework(framework->id()); |
| } |
| |
| FrameworkReregisteredMessage message; |
| message.mutable_framework_id()->MergeFrom(frameworkInfo.id()); |
| message.mutable_master_info()->MergeFrom(info_); |
| framework->send(message); |
| |
| if (!subscribers.subscribed.empty()) { |
| subscribers.send( |
| protobuf::master::event::createFrameworkUpdated(*framework)); |
| } |
| return; |
| } |
| } else { |
| // The framework has not yet reregistered after master failover. |
| Try<Nothing> activate = activateRecoveredFramework( |
| framework, frameworkInfo, from, None(), suppressedRoles); |
| |
| if (activate.isError()) { |
| LOG(INFO) << "Could not update FrameworkInfo of framework '" |
| << frameworkInfo.name() << "': " << activate.error(); |
| |
| FrameworkErrorMessage message; |
| message.set_message(activate.error()); |
| send(from, message); |
| return; |
| } |
| |
| if (!subscribers.subscribed.empty()) { |
| subscribers.send( |
| protobuf::master::event::createFrameworkUpdated(*framework)); |
| } |
| } |
| |
| // Broadcast the new framework pid to all the slaves. We have to |
| // broadcast because an executor might be running on a slave but |
| // it currently isn't running any tasks. |
| foreachvalue (Slave* slave, slaves.registered) { |
| UpdateFrameworkMessage message; |
| message.mutable_framework_id()->CopyFrom(frameworkInfo.id()); |
| message.set_pid(from); |
| message.mutable_framework_info()->CopyFrom(frameworkInfo); |
| send(slave->pid, message); |
| } |
| } |
| |
| |
| void Master::unregisterFramework( |
| const UPID& from, |
| const FrameworkID& frameworkId) |
| { |
| LOG(INFO) << "Asked to unregister framework " << frameworkId; |
| |
| Framework* framework = getFramework(frameworkId); |
| if (framework != nullptr) { |
| if (framework->pid == from) { |
| teardown(framework); |
| } else { |
| LOG(WARNING) |
| << "Ignoring unregister framework message for framework " << *framework |
| << " because it is not expected from " << from; |
| } |
| } |
| } |
| |
| |
| void Master::deactivateFramework( |
| const UPID& from, |
| const FrameworkID& frameworkId) |
| { |
| ++metrics->messages_deactivate_framework; |
| |
| Framework* framework = getFramework(frameworkId); |
| |
| if (framework == nullptr) { |
| LOG(WARNING) |
| << "Ignoring deactivate framework message for framework " << frameworkId |
| << " because the framework cannot be found"; |
| |
| return; |
| } |
| |
| if (framework->pid != from) { |
| LOG(WARNING) |
| << "Ignoring deactivate framework message for framework " << *framework |
| << " because it is not expected from " << from; |
| |
| return; |
| } |
| |
| if (!framework->connected()) { |
| LOG(INFO) |
| << "Ignoring deactivate framework message for framework" << *framework |
| << " because it is disconnected"; |
| |
| return; |
| } |
| |
| if (framework->active()) { |
| deactivate(framework, true); |
| } |
| } |
| |
| |
| void Master::disconnect(Framework* framework) |
| { |
| CHECK_NOTNULL(framework); |
| CHECK(framework->connected()); |
| |
| if (framework->active()) { |
| deactivate(framework, true); |
| } |
| |
| LOG(INFO) << "Disconnecting framework " << *framework; |
| |
| framework->setFrameworkState(Framework::State::DISCONNECTED); |
| |
| if (framework->pid.isSome()) { |
| // Remove the framework from authenticated. This is safe because |
| // a framework will always reauthenticate before (re-)registering. |
| authenticated.erase(framework->pid.get()); |
| } else { |
| CHECK_SOME(framework->http); |
| |
| // Close the HTTP connection, which may already have |
| // been closed due to scheduler disconnection. |
| framework->http->close(); |
| } |
| } |
| |
| |
| void Master::deactivate(Framework* framework, bool rescind) |
| { |
| CHECK_NOTNULL(framework); |
| CHECK(framework->active()); |
| |
| LOG(INFO) << "Deactivating framework " << *framework; |
| |
| framework->setFrameworkState(Framework::State::INACTIVE); |
| |
| // Tell the allocator to stop allocating resources to this framework. |
| allocator->deactivateFramework(framework->id()); |
| |
| // Remove the framework's offers. |
| foreach (Offer* offer, utils::copy(framework->offers)) { |
| allocator->recoverResources( |
| offer->framework_id(), |
| offer->slave_id(), |
| offer->resources(), |
| None()); |
| |
| removeOffer(offer, rescind); |
| } |
| |
| // Remove the framework's inverse offers. |
| foreach (InverseOffer* inverseOffer, utils::copy(framework->inverseOffers)) { |
| allocator->updateInverseOffer( |
| inverseOffer->slave_id(), |
| inverseOffer->framework_id(), |
| UnavailableResources{ |
| inverseOffer->resources(), |
| inverseOffer->unavailability()}, |
| None()); |
| |
| removeInverseOffer(inverseOffer, rescind); |
| } |
| } |
| |
| |
| void Master::disconnect(Slave* slave) |
| { |
| CHECK_NOTNULL(slave); |
| |
| LOG(INFO) << "Disconnecting agent " << *slave; |
| |
| slave->connected = false; |
| |
| // Inform the slave observer. |
| dispatch(slave->observer, &SlaveObserver::disconnect); |
| |
| // Remove the slave from authenticated. This is safe because |
| // a slave will always reauthenticate before (re-)registering. |
| authenticated.erase(slave->pid); |
| |
| deactivate(slave); |
| } |
| |
| |
| void Master::deactivate(Slave* slave) |
| { |
| CHECK_NOTNULL(slave); |
| |
| LOG(INFO) << "Deactivating agent " << *slave; |
| |
| slave->active = false; |
| |
| allocator->deactivateSlave(slave->id); |
| |
| // Remove and rescind offers. |
| foreach (Offer* offer, utils::copy(slave->offers)) { |
| allocator->recoverResources( |
| offer->framework_id(), |
| slave->id, |
| offer->resources(), |
| None()); |
| |
| removeOffer(offer, true); // Rescind! |
| } |
| |
| // Remove and rescind inverse offers. |
| foreach (InverseOffer* inverseOffer, utils::copy(slave->inverseOffers)) { |
| allocator->updateInverseOffer( |
| slave->id, |
| inverseOffer->framework_id(), |
| UnavailableResources{ |
| inverseOffer->resources(), |
| inverseOffer->unavailability()}, |
| None()); |
| |
| removeInverseOffer(inverseOffer, true); // Rescind! |
| } |
| } |
| |
| |
| void Master::resourceRequest( |
| const UPID& from, |
| const FrameworkID& frameworkId, |
| const vector<Request>& requests) |
| { |
| Framework* framework = getFramework(frameworkId); |
| |
| if (framework == nullptr) { |
| LOG(WARNING) |
| << "Ignoring resource request message from framework " << frameworkId |
| << " because the framework cannot be found"; |
| |
| return; |
| } |
| |
| if (framework->pid != from) { |
| LOG(WARNING) |
| << "Ignoring resource request message from framework " << *framework |
| << " because it is not expected from " << from; |
| |
| return; |
| } |
| |
| scheduler::Call::Request call; |
| foreach (const Request& request, requests) { |
| call.add_requests()->CopyFrom(request); |
| } |
| |
| request(framework, call); |
| } |
| |
| |
| void Master::request( |
| Framework* framework, |
| const scheduler::Call::Request& request) |
| { |
| CHECK_NOTNULL(framework); |
| |
| LOG(INFO) << "Processing REQUEST call for framework " << *framework; |
| |
| ++metrics->messages_resource_request; |
| |
| allocator->requestResources( |
| framework->id(), |
| google::protobuf::convert(request.requests())); |
| } |
| |
| |
| void Master::suppress( |
| Framework* framework, |
| const scheduler::Call::Suppress& suppress) |
| { |
| CHECK_NOTNULL(framework); |
| |
| LOG(INFO) << "Processing SUPPRESS call for framework " << *framework; |
| |
| ++metrics->messages_suppress_offers; |
| |
| set<string> roles; |
| |
| // Validate the roles, if provided. We need to make sure the |
| // roles is valid and also contained within the framework roles. |
| // Note that if a single role is invalid, we drop the entire |
| // call and do not suppress the valid roles. |
| foreach (const string& role, suppress.roles()) { |
| Option<Error> roleError = roles::validate(role); |
| if (roleError.isSome()) { |
| drop(framework, |
| suppress, |
| "suppression role '" + role + "' is invalid: " + roleError->message); |
| return; |
| } |
| |
| if (framework->roles.count(role) == 0) { |
| drop(framework, |
| suppress, |
| "suppression role '" + role + "' is not one" |
| " of the frameworks's subscribed roles"); |
| return; |
| } |
| |
| roles.insert(role); |
| } |
| |
| allocator->suppressOffers(framework->id(), roles); |
| } |
| |
| |
| bool Master::isWhitelistedRole(const string& name) const |
| { |
| if (roleWhitelist.isNone()) { |
| return true; |
| } |
| |
| return roleWhitelist->contains(name); |
| } |
| |
| |
| void Master::launchTasks( |
| const UPID& from, |
| LaunchTasksMessage&& launchTasksMessage) |
| { |
| Framework* framework = getFramework(launchTasksMessage.framework_id()); |
| |
| if (framework == nullptr) { |
| LOG(WARNING) |
| << "Ignoring launch tasks message for offers " |
| << stringify(launchTasksMessage.offer_ids()) |
| << " of framework " << launchTasksMessage.framework_id() |
| << " because the framework cannot be found"; |
| |
| return; |
| } |
| |
| if (framework->pid != from) { |
| LOG(WARNING) |
| << "Ignoring launch tasks message for offers " |
| << stringify(launchTasksMessage.offer_ids()) |
| << " from '" << from << "' because it is not from the" |
| << " registered framework " << *framework; |
| |
| return; |
| } |
| |
| // Currently when no tasks are specified in the launchTasks message |
| // it is implicitly considered a decline of the offers. |
| if (!launchTasksMessage.tasks().empty()) { |
| scheduler::Call::Accept message; |
| |
| *message.mutable_filters() = |
| std::move(*launchTasksMessage.mutable_filters()); |
| |
| *message.mutable_offer_ids() = |
| std::move(*launchTasksMessage.mutable_offer_ids()); |
| |
| Offer::Operation* operation = message.add_operations(); |
| operation->set_type(Offer::Operation::LAUNCH); |
| |
| *operation->mutable_launch()->mutable_task_infos() = |
| std::move(*launchTasksMessage.mutable_tasks()); |
| |
| accept(framework, std::move(message)); |
| } else { |
| scheduler::Call::Decline message; |
| |
| *message.mutable_filters() = |
| std::move(*launchTasksMessage.mutable_filters()); |
| |
| *message.mutable_offer_ids() = |
| std::move(*launchTasksMessage.mutable_offer_ids()); |
| |
| decline(framework, std::move(message)); |
| } |
| } |
| |
| |
| Future<bool> Master::authorizeTask( |
| const TaskInfo& task, |
| Framework* framework) |
| { |
| CHECK_NOTNULL(framework); |
| |
| if (authorizer.isNone()) { |
| return true; // Authorization is disabled. |
| } |
| |
| // Authorize the task. |
| authorization::Request request; |
| |
| if (framework->info.has_principal()) { |
| request.mutable_subject()->set_value(framework->info.principal()); |
| } |
| |
| request.set_action(authorization::RUN_TASK); |
| |
| authorization::Object* object = request.mutable_object(); |
| |
| object->mutable_task_info()->CopyFrom(task); |
| object->mutable_framework_info()->CopyFrom(framework->info); |
| |
| LOG(INFO) |
| << "Authorizing framework principal '" |
| << (framework->info.has_principal() ? framework->info.principal() : "ANY") |
| << "' to launch task " << task.task_id(); |
| |
| return authorizer.get()->authorized(request); |
| } |
| |
| |
| Future<bool> Master::authorizeReserveResources( |
| const Offer::Operation::Reserve& reserve, |
| const Option<Principal>& principal) |
| { |
| // Authorizing the reserve operation is equivalent to authorizing |
| // the resources specified in the operation. |
| return authorizeReserveResources(reserve.resources(), principal); |
| } |
| |
| |
| Future<bool> Master::authorizeReserveResources( |
| const Resources& resources, |
| const Option<Principal>& principal) |
| { |
| if (authorizer.isNone()) { |
| return true; // Authorization is disabled. |
| } |
| |
| authorization::Request request; |
| request.set_action(authorization::RESERVE_RESOURCES); |
| |
| Option<authorization::Subject> subject = createSubject(principal); |
| if (subject.isSome()) { |
| request.mutable_subject()->CopyFrom(subject.get()); |
| } |
| |
| // The operation will be authorized if the entity is allowed to make |
| // reservations for all roles included in `reserve.resources`. |
| // Add an element to `request.roles` for each unique role in the resources. |
| hashset<string> roles; |
| vector<Future<bool>> authorizations; |
| foreach (const Resource& resource, resources) { |
| // NOTE: Since authorization happens __before__ validation and resource |
| // format conversion, we must look for roles that may appear in both |
| // "pre" and "post" reservation-refinement formats. This may not even be |
| // valid, but we rely on validation being performed aftewards. |
| string role; |
| if (resource.reservations_size() > 0) { |
| // Check for the role in the "post-reservation-refinement" format. |
| // |
| // If there is a stack of reservations, we only perform authorization |
| // for the most refined reservation, since we only support "pushing" |
| // one reservation at a time. That is, all of the previous reservations |
| // must have already been authorized. |
| role = resource.reservations().rbegin()->role(); |
| } else { |
| // Check for the role in the "pre-reservation-refinement" format. |
| role = resource.role(); |
| } |
| |
| if (!roles.contains(role)) { |
| roles.insert(role); |
| |
| request.mutable_object()->mutable_resource()->CopyFrom(resource); |
| request.mutable_object()->set_value(role); |
| authorizations.push_back(authorizer.get()->authorized(request)); |
| } |
| } |
| |
| LOG(INFO) << "Authorizing principal '" |
| << (principal.isSome() ? stringify(principal.get()) : "ANY") |
| << "' to reserve resources '" << resources << "'"; |
| |
| // NOTE: Empty authorizations are not valid and are checked by a validator. |
| // However under certain circumstances, this method can be called before |
| // the validation occur and the case must be considered non erroneous. |
| // TODO(arojas): Consider ensuring that `validate()` is called before |
| // `authorizeReserveResources` so a `CHECK(!roles.empty())` can be added. |
| if (authorizations.empty()) { |
| return authorizer.get()->authorized(request); |
| } |
| |
| return collectAuthorizations(authorizations); |
| } |
| |
| |
| Future<bool> Master::authorizeUnreserveResources( |
| const Offer::Operation::Unreserve& unreserve, |
| const Option<Principal>& principal) |
| { |
| if (authorizer.isNone()) { |
| return true; // Authorization is disabled. |
| } |
| |
| authorization::Request request; |
| request.set_action(authorization::UNRESERVE_RESOURCES); |
| |
| Option<authorization::Subject> subject = createSubject(principal); |
| if (subject.isSome()) { |
| request.mutable_subject()->CopyFrom(subject.get()); |
| } |
| |
| vector<Future<bool>> authorizations; |
| foreach (const Resource& resource, unreserve.resources()) { |
| // NOTE: Since authorization happens __before__ validation and resource |
| // format conversion, we must look for the principal that may appear in |
| // both "pre" and "post" reservation-refinement formats. This may not be |
| // valid, but we rely on validation being performed later. |
| Option<string> principal; |
| if (resource.reservations_size() > 0 && |
| resource.reservations().rbegin()->has_principal()) { |
| // Check for roles in the "post-reservation-refinement" format. |
| principal = resource.reservations().rbegin()->principal(); |
| } else if ( |
| resource.has_reservation() && resource.reservation().has_principal()) { |
| // Check for roles in the "pre-reservation-refinement" format. |
| principal = resource.reservation().principal(); |
| } |
| |
| if (principal.isSome()) { |
| request.mutable_object()->mutable_resource()->CopyFrom(resource); |
| request.mutable_object()->set_value(principal.get()); |
| |
| authorizations.push_back(authorizer.get()->authorized(request)); |
| } |
| } |
| |
| LOG(INFO) << "Authorizing principal '" |
| << (principal.isSome() ? stringify(principal.get()) : "ANY") |
| << "' to unreserve resources '" << unreserve.resources() << "'"; |
| |
| if (authorizations.empty()) { |
| return authorizer.get()->authorized(request); |
| } |
| |
| return collectAuthorizations(authorizations); |
| } |
| |
| |
| Future<bool> Master::authorizeCreateVolume( |
| const Offer::Operation::Create& create, |
| const Option<Principal>& principal) |
| { |
| if (authorizer.isNone()) { |
| return true; // Authorization is disabled. |
| } |
| |
| authorization::Request request; |
| request.set_action(authorization::CREATE_VOLUME); |
| |
| Option<authorization::Subject> subject = createSubject(principal); |
| if (subject.isSome()) { |
| request.mutable_subject()->CopyFrom(subject.get()); |
| } |
| |
| // The operation will be authorized if the entity is allowed to create |
| // volumes for all roles included in `create.volumes`. |
| // Add an element to `request.roles` for each unique role in the volumes. |
| hashset<string> roles; |
| vector<Future<bool>> authorizations; |
| foreach (const Resource& volume, create.volumes()) { |
| string role; |
| if (volume.reservations_size() > 0) { |
| // Check for role in the "post-reservation-refinement" format. |
| // |
| // If there is a stack of reservations, we only perform authorization |
| // for the most refined reservation, since we only support "pushing" |
| // one reservation at a time. That is, all of the previous reservations |
| // must have already been authorized. |
| role = volume.reservations().rbegin()->role(); |
| } else { |
| // Check for role in the "pre-reservation-refinement" format. |
| role = volume.role(); |
| } |
| |
| if (!roles.contains(role)) { |
| roles.insert(role); |
| |
| request.mutable_object()->mutable_resource()->CopyFrom(volume); |
| request.mutable_object()->set_value(role); |
| authorizations.push_back(authorizer.get()->authorized(request)); |
| } |
| } |
| |
| LOG(INFO) << "Authorizing principal '" |
| << (principal.isSome() ? stringify(principal.get()) : "ANY") |
| << "' to create volumes '" << create.volumes() << "'"; |
| |
| if (authorizations.empty()) { |
| return authorizer.get()->authorized(request); |
| } |
| |
| return collectAuthorizations(authorizations); |
| } |
| |
| |
| Future<bool> Master::authorizeDestroyVolume( |
| const Offer::Operation::Destroy& destroy, |
| const Option<Principal>& principal) |
| { |
| if (authorizer.isNone()) { |
| return true; // Authorization is disabled. |
| } |
| |
| authorization::Request request; |
| request.set_action(authorization::DESTROY_VOLUME); |
| |
| Option<authorization::Subject> subject = createSubject(principal); |
| if (subject.isSome()) { |
| request.mutable_subject()->CopyFrom(subject.get()); |
| } |
| |
| vector<Future<bool>> authorizations; |
| foreach (const Resource& volume, destroy.volumes()) { |
| // NOTE: Since validation of this operation may be performed after |
| // authorization, we must check here that this resource is a persistent |
| // volume. If it isn't, the error will be caught during validation. |
| if (volume.has_disk() && volume.disk().has_persistence()) { |
| request.mutable_object()->mutable_resource()->CopyFrom(volume); |
| request.mutable_object()->set_value( |
| volume.disk().persistence().principal()); |
| |
| authorizations.push_back(authorizer.get()->authorized(request)); |
| } |
| } |
| |
| LOG(INFO) << "Authorizing principal '" |
| << (principal.isSome() ? stringify(principal.get()) : "ANY") |
| << "' to destroy volumes '" << destroy.volumes() << "'"; |
| |
| if (authorizations.empty()) { |
| return authorizer.get()->authorized(request); |
| } |
| |
| return collectAuthorizations(authorizations); |
| } |
| |
| |
| Future<bool> Master::authorizeResizeVolume( |
| const Resource& volume, |
| const Option<Principal>& principal) |
| { |
| if (authorizer.isNone()) { |
| return true; // Authorization is disabled. |
| } |
| |
| authorization::Request request; |
| request.set_action(authorization::RESIZE_VOLUME); |
| |
| Option<authorization::Subject> subject = createSubject(principal); |
| if (subject.isSome()) { |
| request.mutable_subject()->CopyFrom(subject.get()); |
| } |
| |
| request.mutable_object()->mutable_resource()->CopyFrom(volume); |
| |
| string role; |
| if (volume.reservations_size() > 0) { |
| // Check for role in the "post-reservation-refinement" format. |
| role = volume.reservations().rbegin()->role(); |
| } else { |
| // Check for role in the "pre-reservation-refinement" format. |
| role = volume.role(); |
| } |
| |
| request.mutable_object()->set_value(role); |
| |
| LOG(INFO) << "Authorizing principal '" |
| << (principal.isSome() ? stringify(principal.get()) : "ANY") |
| << "' to resize volume '" << volume << "'"; |
| |
| return authorizer.get()->authorized(request); |
| } |
| |
| |
| Future<bool> Master::authorizeCreateDisk( |
| const Offer::Operation::CreateDisk& createDisk, |
| const Option<Principal>& principal) |
| { |
| if (authorizer.isNone()) { |
| return true; // Authorization is disabled. |
| } |
| |
| const Resource& resource = createDisk.source(); |
| |
| Option<authorization::Action> action; |
| switch (createDisk.target_type()) { |
| case Resource::DiskInfo::Source::MOUNT: { |
| action = authorization::CREATE_MOUNT_DISK; |
| break; |
| } |
| case Resource::DiskInfo::Source::BLOCK: { |
| action = authorization::CREATE_BLOCK_DISK; |
| break; |
| } |
| case Resource::DiskInfo::Source::UNKNOWN: |
| case Resource::DiskInfo::Source::PATH: |
| case Resource::DiskInfo::Source::RAW: { |
| return Failure( |
| "Failed to authorize principal '" + |
| (principal.isSome() ? stringify(principal.get()) : "ANY") + |
| "' to create a " + stringify(createDisk.target_type()) + |
| " disk from '" + stringify(resource) + "': Unsupported disk type"); |
| } |
| } |
| |
| authorization::Request request; |
| request.set_action(CHECK_NOTNONE(action)); |
| |
| Option<authorization::Subject> subject = createSubject(principal); |
| if (subject.isSome()) { |
| request.mutable_subject()->CopyFrom(subject.get()); |
| } |
| |
| request.mutable_object()->mutable_resource()->CopyFrom(resource); |
| |
| // We set `object.value` in addition to `object.resource` to support legacy |
| // authorizers making only use of this deprecated field. |
| // |
| // NOTE: We rely on the master to ensure that the resource is in the |
| // post-reservation-refinement format and set the value to the most refined |
| // role, or default to '*' for consistency if there is no reservation. |
| CHECK(!resource.has_role()) << resource; |
| CHECK(!resource.has_reservation()) << resource; |
| request.mutable_object()->set_value( |
| resource.reservations().empty() |
| ? "*" |
| : resource.reservations().rbegin()->role()); |
| |
| LOG(INFO) << "Authorizing principal '" |
| << (principal.isSome() ? stringify(principal.get()) : "ANY") |
| << "' to create a " << createDisk.target_type() << " disk from '" |
| << createDisk.source() << "'"; |
| |
| return authorizer.get()->authorized(request); |
| } |
| |
| |
| // Asks the authorizer whether `principal` may destroy the disk named by |
| // `destroyDisk.source()`. |
| // |
| // Resolves to `true` immediately when no authorizer is configured, and |
| // returns a `Failure` for `UNKNOWN` and `PATH` disk sources. In every |
| // other case the outcome is whatever the authorizer decides. |
| Future<bool> Master::authorizeDestroyDisk( |
|     const Offer::Operation::DestroyDisk& destroyDisk, |
|     const Option<Principal>& principal) |
| { |
|   if (authorizer.isNone()) { |
|     return true; // Authorization is disabled. |
|   } |
| |
|   const Resource& disk = destroyDisk.source(); |
| |
|   // Rendered once and shared by the failure message and the log line. |
|   const string principalName = |
|     principal.isSome() ? stringify(principal.get()) : "ANY"; |
| |
|   // Map the disk source type onto the matching authorization action; |
|   // source types without a dedicated action are rejected up front. |
|   Option<authorization::Action> action; |
|   switch (disk.disk().source().type()) { |
|     case Resource::DiskInfo::Source::UNKNOWN: |
|     case Resource::DiskInfo::Source::PATH: { |
|       return Failure( |
|           "Failed to authorize principal '" + principalName + |
|           "' to destroy disk '" + stringify(disk) + |
|           "': Unsupported disk type"); |
|     } |
|     case Resource::DiskInfo::Source::MOUNT: { |
|       action = authorization::DESTROY_MOUNT_DISK; |
|       break; |
|     } |
|     case Resource::DiskInfo::Source::BLOCK: { |
|       action = authorization::DESTROY_BLOCK_DISK; |
|       break; |
|     } |
|     case Resource::DiskInfo::Source::RAW: { |
|       action = authorization::DESTROY_RAW_DISK; |
|       break; |
|     } |
|   } |
| |
|   authorization::Request request; |
|   request.set_action(CHECK_NOTNONE(action)); |
| |
|   const Option<authorization::Subject> subject = createSubject(principal); |
|   if (subject.isSome()) { |
|     request.mutable_subject()->CopyFrom(subject.get()); |
|   } |
| |
|   auto* object = request.mutable_object(); |
|   object->mutable_resource()->CopyFrom(disk); |
| |
|   // Legacy authorizers only look at the deprecated `object.value` field, |
|   // so it is filled in alongside `object.resource`. |
|   // |
|   // NOTE: The master is relied upon to hand us a resource in the |
|   // post-reservation-refinement format. The value is the most refined |
|   // role, or '*' when the resource carries no reservation. |
|   CHECK(!disk.has_role()) << disk; |
|   CHECK(!disk.has_reservation()) << disk; |
|   if (disk.reservations().empty()) { |
|     object->set_value("*"); |
|   } else { |
|     object->set_value(disk.reservations().rbegin()->role()); |
|   } |
| |
|   LOG(INFO) << "Authorizing principal '" << principalName |
|             << "' to destroy disk '" << disk << "'"; |
| |
|   return authorizer.get()->authorized(request); |
| } |
| |
| |
| // Authorizes an agent described by `slaveInfo` on behalf of `principal`. |
| // |
| // Two checks may be issued: a REGISTER_AGENT request, and, when the agent |
| // advertises reserved resources, an authorization of those reservations. |
| // The collected result of all issued checks is returned. With no |
| // authorizer configured the agent is accepted right away. |
| Future<bool> Master::authorizeSlave( |
|     const SlaveInfo& slaveInfo, |
|     const Option<Principal>& principal) |
| { |
|   if (authorizer.isNone()) { |
|     return true; |
|   } |
| |
|   const Resources agentResources(slaveInfo.resources()); |
| |
|   string principalDescription; |
|   if (principal.isSome()) { |
|     principalDescription = |
|       "with principal '" + stringify(principal.get()) + "'"; |
|   } else { |
|     principalDescription = "without a principal"; |
|   } |
| |
|   // Step one: may the agent register at all? |
|   LOG(INFO) << "Authorizing agent providing resources " |
|             << "'" << stringify(agentResources) << "' " |
|             << principalDescription; |
| |
|   authorization::Request request; |
|   request.set_action(authorization::REGISTER_AGENT); |
| |
|   const Option<authorization::Subject> subject = createSubject(principal); |
|   if (subject.isSome()) { |
|     request.mutable_subject()->CopyFrom(subject.get()); |
|   } |
| |
|   // The request's object is left unset on purpose: the authorizer |
|   // implicitly treats it as ANY. |
|   vector<Future<bool>> authorizations; |
|   authorizations.push_back(authorizer.get()->authorized(request)); |
| |
|   // Step two: authorize static reservations, if there are any. |
|   // |
|   // NOTE: Dynamic reservations in checkpointed resources are not looked |
|   // at here because they should already have been authorized against |
|   // the framework / operator's principal when they were created. Static |
|   // reservations, in contrast, are initiated by the agent's principal, |
|   // and authorizing them helps prevent agents from advertising reserved |
|   // resources of arbitrary roles. |
|   if (!agentResources.reserved().empty()) { |
|     authorizations.push_back( |
|         authorizeReserveResources(agentResources, principal)); |
|   } |
| |
|   return collectAuthorizations(authorizations); |
| } |
| |
| |
| // Returns true when `slave` has no executor `executorId` for `framework`, |
| // and false when it does. |
| // |
| // As a consistency check, an executor that the agent does not have must |
| // not be recorded on the framework either. |
| bool Master::isLaunchExecutor( |
|     const ExecutorID& executorId, |
|     Framework* framework, |
|     Slave* slave) const |
| { |
|   CHECK_NOTNULL(framework); |
|   CHECK_NOTNULL(slave); |
| |
|   // The agent already has this executor. |
|   if (slave->hasExecutor(framework->id(), executorId)) { |
|     return false; |
|   } |
| |
|   CHECK(!framework->hasExecutor(slave->id, executorId)) |
|     << "Executor '" << executorId |
|     << "' known to the framework " << *framework |
|     << " but unknown to the agent " << *slave; |
| |
|   return true; |
| } |
| |
| |
| // Records the executor `executorInfo` on `slave` first and then on |
| // `framework`. Both pointers must be non-null and the agent must be |
| // connected. |
| void Master::addExecutor( |
|     const ExecutorInfo& executorInfo, |
|     Framework* framework, |
|     Slave* slave) |
| { |
|   CHECK_NOTNULL(framework); |
|   CHECK_NOTNULL(slave); |
|   CHECK(slave->connected) |
|     << "Adding executor " << executorInfo.executor_id() |
|     << " to disconnected agent " << *slave; |
| |
|   const FrameworkID& frameworkId = framework->id(); |
|   const SlaveID& agentId = slave->id; |
| |
|   slave->addExecutor(frameworkId, executorInfo); |
|   framework->addExecutor(agentId, executorInfo); |
| } |
| |
| |
| // Creates a `Task` in state TASK_STAGING from `task` and registers it with |
| // `slave` first and then with `framework`. Both pointers must be non-null |
| // and the agent must be connected. |
| void Master::addTask( |
|     const TaskInfo& task, |
|     Framework* framework, |
|     Slave* slave) |
| { |
|   CHECK_NOTNULL(framework); |
|   CHECK_NOTNULL(slave); |
|   CHECK(slave->connected) |
|     << "Adding task " << task.task_id() |
|     << " to disconnected agent " << *slave; |
| |
|   // The same `Task` pointer is handed to both the agent and the framework. |
|   Task* staged = |
|     new Task(protobuf::createTask(task, TASK_STAGING, framework->id())); |
| |
|   slave->addTask(staged); |
|   framework->addTask(staged); |
| } |
| |
| |
| // Handles an ACCEPT call from `framework`. |
| // |
| // The work happens in stages: |
| //   1. Bump the launch / decline metrics. |
| //   2. Validate the offers and remove every offer that is still known. If |
| //      validation fails, the offers' resources are recovered, the tasks of |
| //      launch operations get TASK_DROPPED (or TASK_LOST when the framework |
| //      is not partition-aware), the other operations are dropped, and we |
| //      return. |
| //   3. Validate and upgrade the resources of each operation, keeping only |
| //      the operations that pass and whose feedback request, if any, can |
| //      be honored. |
| //   4. Normalize the remaining operations (allocation info, executor |
| //      framework ID, health check type, task group executor). |
| //   5. Kick off authorization for every remaining operation / task and |
| //      continue in `_accept` once all authorizations have completed. |
| void Master::accept( |
|     Framework* framework, |
|     scheduler::Call::Accept&& accept) |
| { |
|   CHECK_NOTNULL(framework); |
| |
|   // Bump metrics. |
|   foreach (const Offer::Operation& operation, accept.operations()) { |
|     if (operation.type() == Offer::Operation::LAUNCH) { |
|       if (!operation.launch().task_infos().empty()) { |
|         ++metrics->messages_launch_tasks; |
|       } else { |
|         ++metrics->messages_decline_offers; |
|         LOG(WARNING) << "Implicitly declining offers: " << accept.offer_ids() |
|                      << " in ACCEPT call for framework " << framework->id() |
|                      << " as the launch operation specified no tasks"; |
|       } |
|     } |
| |
|     // TODO(mpark): Add metrics for LAUNCH_GROUP operation. |
|     // TODO(jieyu): Add metrics for non launch operations. |
|   } |
| |
|   // TODO(bmahler): We currently only support using multiple offers |
|   // for a single slave. |
|   Resources offeredResources; |
|   Option<SlaveID> slaveId = None(); |
|   Option<Error> error = None(); |
|   Option<Resource::AllocationInfo> allocationInfo = None(); |
| |
|   if (accept.offer_ids().empty()) { |
|     error = Error("No offers specified"); |
|   } else { |
|     // Validate the offers. |
|     error = validation::offer::validate(accept.offer_ids(), this, framework); |
| |
|     size_t offersAccepted = 0; |
| |
|     // Compute offered resources and remove the offers. If the |
|     // validation failed, return resources to the allocator. |
|     foreach (const OfferID& offerId, accept.offer_ids()) { |
|       Offer* offer = getOffer(offerId); |
|       if (offer != nullptr) { |
|         // Don't bother adding resources to `offeredResources` in case |
|         // validation failed; just recover them. |
|         if (error.isSome()) { |
|           allocator->recoverResources( |
|               offer->framework_id(), |
|               offer->slave_id(), |
|               offer->resources(), |
|               None()); |
|         } else { |
|           slaveId = offer->slave_id(); |
|           allocationInfo = offer->allocation_info(); |
|           offeredResources += offer->resources(); |
| |
|           offersAccepted++; |
|         } |
| |
|         removeOffer(offer); |
|         continue; |
|       } |
| |
|       // If the offer was not in our offer set, then this offer is no |
|       // longer valid. |
|       LOG(WARNING) << "Ignoring accept of offer " << offerId |
|                    << " since it is no longer valid"; |
|     } |
| |
|     framework->metrics.offers_accepted += offersAccepted; |
|   } |
| |
|   // If invalid, send TASK_DROPPED for the launch attempts. If the |
|   // framework is not partition-aware, send TASK_LOST instead. If |
|   // other operations have their `id` field set, then send |
|   // OPERATION_DROPPED updates for them. |
|   // |
|   // TODO(jieyu): Consider adding a 'drop' overload for ACCEPT call to |
|   // consistently handle message dropping. It would be ideal if the |
|   // 'drop' overload can handle both resource recovery and lost task |
|   // notifications. |
|   if (error.isSome()) { |
|     LOG(WARNING) << "ACCEPT call used invalid offers '" << accept.offer_ids() |
|                  << "': " << error->message; |
| |
|     TaskState newTaskState = TASK_DROPPED; |
|     if (!framework->capabilities.partitionAware) { |
|       newTaskState = TASK_LOST; |
|     } |
| |
|     foreach (const Offer::Operation& operation, accept.operations()) { |
|       if (operation.type() != Offer::Operation::LAUNCH && |
|           operation.type() != Offer::Operation::LAUNCH_GROUP) { |
|         drop(framework, |
|              operation, |
|              "Operation attempted with invalid offers: " + error->message); |
|         continue; |
|       } |
| |
|       // NOTE: The explicit reference return type matters. A deduced |
|       // return type would decay to a value and deep-copy every task. |
|       const RepeatedPtrField<TaskInfo>& tasks = |
|         [&]() -> const RepeatedPtrField<TaskInfo>& { |
|           if (operation.type() == Offer::Operation::LAUNCH) { |
|             return operation.launch().task_infos(); |
|           } else if (operation.type() == Offer::Operation::LAUNCH_GROUP) { |
|             return operation.launch_group().task_group().tasks(); |
|           } |
|           UNREACHABLE(); |
|         }(); |
| |
|       foreach (const TaskInfo& task, tasks) { |
|         const StatusUpdate& update = protobuf::createStatusUpdate( |
|             framework->id(), |
|             task.slave_id(), |
|             task.task_id(), |
|             newTaskState, |
|             TaskStatus::SOURCE_MASTER, |
|             None(), |
|             "Task launched with invalid offers: " + error->message, |
|             TaskStatus::REASON_INVALID_OFFERS); |
| |
|         if (framework->capabilities.partitionAware) { |
|           metrics->tasks_dropped++; |
|         } else { |
|           metrics->tasks_lost++; |
|         } |
| |
|         metrics->incrementTasksStates( |
|             newTaskState, |
|             TaskStatus::SOURCE_MASTER, |
|             TaskStatus::REASON_INVALID_OFFERS); |
| |
|         forward(update, UPID(), framework); |
|       } |
|     } |
| |
|     return; |
|   } |
| |
|   CHECK_SOME(slaveId); |
|   Slave* slave = slaves.registered.get(slaveId.get()); |
|   CHECK_NOTNULL(slave); |
| |
|   // Validate and upgrade all of the resources in `accept.operations`: |
|   // |
|   // For an operation except LAUNCH and LAUNCH_GROUP which contains invalid |
|   // resources, |
|   //   - if the framework has elected to receive feedback by setting the `id` |
|   //     field, then we send an offer operation status update with a state of |
|   //     OPERATION_ERROR. |
|   //   - if the framework has not set the `id` field, then we simply drop the |
|   //     operation. |
|   // |
|   // If a LAUNCH or LAUNCH_GROUP operation contains invalid resources, we send |
|   // a TASK_ERROR status update per task. |
|   // |
|   // If the framework is requesting offer operation status updates by setting |
|   // the `id` field in an operation, then also verify that the relevant agent |
|   // has the RESOURCE_PROVIDER capability. If it does not, then send an offer |
|   // operation status update with a state of OPERATION_ERROR. |
|   // |
|   // LAUNCH and LAUNCH_GROUP operations cannot receive offer operation status |
|   // updates, so we send a TASK_ERROR status update per task when these |
|   // operations set the `id` field. |
|   { |
|     // Used to send TASK_ERROR status updates for tasks in invalid LAUNCH |
|     // and LAUNCH_GROUP operations. Note that we don't need to recover |
|     // the resources here because we always continue onto `_accept` |
|     // which recovers the unused resources at the end. |
|     // |
|     // TODO(mpark): Consider pulling this out in a more reusable manner. |
|     auto sendStatusUpdates = [&]( |
|         const RepeatedPtrField<TaskInfo>& tasks, |
|         TaskStatus::Reason reason, |
|         const string& message) { |
|       foreach (const TaskInfo& task, tasks) { |
|         const StatusUpdate& update = protobuf::createStatusUpdate( |
|             framework->id(), |
|             task.slave_id(), |
|             task.task_id(), |
|             TASK_ERROR, |
|             TaskStatus::SOURCE_MASTER, |
|             None(), |
|             message, |
|             reason); |
| |
|         metrics->tasks_error++; |
| |
|         metrics->incrementTasksStates( |
|             TASK_ERROR, TaskStatus::SOURCE_MASTER, reason); |
| |
|         forward(update, UPID(), framework); |
|       } |
|     }; |
| |
|     // Move the operations out of `accept` by swapping with an empty |
|     // field, which leaves `accept.operations` empty without copying. |
|     // Operations that pass the checks below are swapped back in, with |
|     // their resources validated and upgraded. |
|     RepeatedPtrField<Offer::Operation> operations; |
|     operations.Swap(accept.mutable_operations()); |
| |
|     foreach (Offer::Operation& operation, operations) { |
|       Option<Error> error = validateAndUpgradeResources(&operation); |
|       if (error.isSome()) { |
|         switch (operation.type()) { |
|           case Offer::Operation::RESERVE: |
|           case Offer::Operation::UNRESERVE: |
|           case Offer::Operation::CREATE: |
|           case Offer::Operation::DESTROY: |
|           case Offer::Operation::GROW_VOLUME: |
|           case Offer::Operation::SHRINK_VOLUME: |
|           case Offer::Operation::CREATE_DISK: |
|           case Offer::Operation::DESTROY_DISK: { |
|             drop(framework, |
|                  operation, |
|                  "Operation attempted with invalid resources: " + |
|                  error->message); |
|             break; |
|           } |
|           case Offer::Operation::LAUNCH: { |
|             sendStatusUpdates( |
|                 operation.launch().task_infos(), |
|                 TaskStatus::REASON_TASK_INVALID, |
|                 error->message); |
| |
|             break; |
|           } |
|           case Offer::Operation::LAUNCH_GROUP: { |
|             sendStatusUpdates( |
|                 operation.launch_group().task_group().tasks(), |
|                 TaskStatus::REASON_TASK_GROUP_INVALID, |
|                 error->message); |
| |
|             break; |
|           } |
|           case Offer::Operation::UNKNOWN: { |
|             LOG(WARNING) << "Ignoring unknown operation"; |
|             break; |
|           } |
|         } |
|       } else if (operation.has_id()) { |
|         // The `id` field is set, which means operation feedback is requested. |
|         // |
|         // Operation feedback is not supported for LAUNCH or LAUNCH_GROUP |
|         // operations, so we drop them and send TASK_ERROR status updates. |
|         // |
|         // For other operations, verify that they have been sent by an HTTP |
|         // framework and that they are destined for an agent with the |
|         // RESOURCE_PROVIDER capability. |
|         switch (operation.type()) { |
|           case Offer::Operation::LAUNCH: { |
|             sendStatusUpdates( |
|                 operation.launch().task_infos(), |
|                 TaskStatus::REASON_TASK_INVALID, |
|                 "The `id` field cannot be set on LAUNCH operations"); |
| |
|             break; |
|           } |
|           case Offer::Operation::LAUNCH_GROUP: { |
|             sendStatusUpdates( |
|                 operation.launch_group().task_group().tasks(), |
|                 TaskStatus::REASON_TASK_GROUP_INVALID, |
|                 "The `id` field cannot be set on LAUNCH_GROUP operations"); |
| |
|             break; |
|           } |
|           case Offer::Operation::RESERVE: |
|           case Offer::Operation::UNRESERVE: |
|           case Offer::Operation::CREATE: |
|           case Offer::Operation::DESTROY: |
|           case Offer::Operation::GROW_VOLUME: |
|           case Offer::Operation::SHRINK_VOLUME: |
|           case Offer::Operation::CREATE_DISK: |
|           case Offer::Operation::DESTROY_DISK: { |
|             if (framework->http.isNone()) { |
|               const string message = |
|                 "The 'id' field was set in an offer operation, but operation" |
|                 " feedback is not supported for the SchedulerDriver API"; |
| |
|               LOG(WARNING) << "Dropping " |
|                            << Offer::Operation::Type_Name(operation.type()) |
|                            << " operation from framework " << *framework << ": " |
|                            << message; |
| |
|               // Send an error which will cause the scheduler driver to abort. |
|               FrameworkErrorMessage frameworkError; |
|               frameworkError.set_message( |
|                   message + |
|                   "; please use the HTTP scheduler API for this feature"); |
|               framework->send(frameworkError); |
| |
|               break; |
|             } |
| |
|             if (getResourceProviderId(operation).isNone()) { |
|               drop(framework, |
|                    operation, |
|                    "Operation requested feedback, but it affects resources not" |
|                    " managed by a resource provider"); |
|               break; |
|             } |
| |
|             if (!slave->capabilities.resourceProvider) { |
|               drop(framework, |
|                    operation, |
|                    "Operation requested feedback, but agent " + |
|                    stringify(slaveId.get()) + |
|                    " does not have the required RESOURCE_PROVIDER capability"); |
|               break; |
|             } |
| |
|             // `operation` is not used again, so hand it over via swap. |
|             accept.add_operations()->Swap(&operation); |
|             break; |
|           } |
|           case Offer::Operation::UNKNOWN: { |
|             LOG(WARNING) << "Ignoring unknown operation"; |
|             break; |
|           } |
|         } |
|       } else { |
|         // Resource validation succeeded and feedback is not requested, |
|         // so add the operation. It is not used again, so swap it in. |
|         accept.add_operations()->Swap(&operation); |
|       } |
|     } |
|   } |
| |
|   // We make various adjustments to the `Offer::Operation`s, |
|   // typically for backward/forward compatibility. |
|   // TODO(mpark): Pull this out to a master normalization utility. |
|   foreach (Offer::Operation& operation, *accept.mutable_operations()) { |
|     // With the addition of the MULTI_ROLE capability, the resources |
|     // within an offer now contain an `AllocationInfo`. We therefore |
|     // inject the offer's allocation info into the operation's |
|     // resources if the scheduler has not done so already. |
|     CHECK_SOME(allocationInfo); |
|     protobuf::injectAllocationInfo(&operation, allocationInfo.get()); |
| |
|     switch (operation.type()) { |
|       case Offer::Operation::RESERVE: |
|       case Offer::Operation::UNRESERVE: |
|       case Offer::Operation::CREATE: |
|       case Offer::Operation::DESTROY: |
|       case Offer::Operation::GROW_VOLUME: |
|       case Offer::Operation::SHRINK_VOLUME: |
|       case Offer::Operation::CREATE_DISK: |
|       case Offer::Operation::DESTROY_DISK: { |
|         // No-op. |
|         break; |
|       } |
|       case Offer::Operation::LAUNCH: { |
|         foreach ( |
|             TaskInfo& task, *operation.mutable_launch()->mutable_task_infos()) { |
|           // TODO(haosdent): Once we have internal `TaskInfo` separate from |
|           // the v0 `TaskInfo` (see MESOS-6268), consider extracting the |
|           // following adaptation code into devolve methods from v0 and v1 |
|           // `TaskInfo` to internal `TaskInfo`. |
|           // |
|           // Fill in the missing `framework_id` in `ExecutorInfo` if needed |
|           // (the task is adjusted in place). This field was added to the |
|           // API later and thus was made optional. |
|           if (task.has_executor() && !task.executor().has_framework_id()) { |
|             task.mutable_executor()->mutable_framework_id()->CopyFrom( |
|                 framework->id()); |
|           } |
| |
|           // For backwards compatibility with the v0 and v1 API, when |
|           // the type of the health check is not specified, determine |
|           // its type from the `http` and `command` fields. The type is |
|           // left unset when both or neither of the fields are present. |
|           // |
|           // TODO(haosdent): Remove this after the deprecation cycle which |
|           // starts in 2.0. |
|           if (task.has_health_check() && !task.health_check().has_type()) { |
|             LOG(WARNING) << "The type of health check is not set; use of " |
|                          << "'HealthCheck' without specifying 'type' will be " |
|                          << "deprecated in Mesos 2.0"; |
| |
|             const HealthCheck& healthCheck = task.health_check(); |
|             if (healthCheck.has_command() && !healthCheck.has_http()) { |
|               task.mutable_health_check()->set_type(HealthCheck::COMMAND); |
|             } else if (healthCheck.has_http() && !healthCheck.has_command()) { |
|               task.mutable_health_check()->set_type(HealthCheck::HTTP); |
|             } |
|           } |
|         } |
| |
|         break; |
|       } |
|       case Offer::Operation::LAUNCH_GROUP: { |
|         const ExecutorInfo& executor = operation.launch_group().executor(); |
| |
|         TaskGroupInfo* taskGroup = |
|           operation.mutable_launch_group()->mutable_task_group(); |
| |
|         // Mutate `TaskInfo` to include `ExecutorInfo` to make it easy |
|         // for operator API and WebUI to get access to the corresponding |
|         // executor for tasks in the task group. |
|         foreach (TaskInfo& task, *taskGroup->mutable_tasks()) { |
|           if (!task.has_executor()) { |
|             task.mutable_executor()->CopyFrom(executor); |
|           } |
|         } |
| |
|         break; |
|       } |
|       case Offer::Operation::UNKNOWN: { |
|         // No-op. |
|         break; |
|       } |
|     } |
|   } |
| |
|   LOG(INFO) << "Processing ACCEPT call for offers: " << accept.offer_ids() |
|             << " on agent " << *slave << " for framework " << *framework; |
| |
|   // Every non-launch operation is authorized against the framework's |
|   // principal, if it has one, so compute it once up front. |
|   const Option<Principal> principal = framework->info.has_principal() |
|     ? Principal(framework->info.principal()) |
|     : Option<Principal>::none(); |
| |
|   // NOTE: The futures are appended in the order of the operations and |
|   // their tasks. |
|   vector<Future<bool>> futures; |
|   foreach (const Offer::Operation& operation, accept.operations()) { |
|     switch (operation.type()) { |
|       case Offer::Operation::LAUNCH: |
|       case Offer::Operation::LAUNCH_GROUP: { |
|         // NOTE: The explicit reference return type matters. A deduced |
|         // return type would decay to a value and deep-copy every task. |
|         const RepeatedPtrField<TaskInfo>& tasks = |
|           [&]() -> const RepeatedPtrField<TaskInfo>& { |
|             if (operation.type() == Offer::Operation::LAUNCH) { |
|               return operation.launch().task_infos(); |
|             } else if (operation.type() == Offer::Operation::LAUNCH_GROUP) { |
|               return operation.launch_group().task_group().tasks(); |
|             } |
|             UNREACHABLE(); |
|           }(); |
| |
|         // Authorize the tasks. A task is in 'framework->pendingTasks' |
|         // and 'slave->pendingTasks' before it is authorized. |
|         foreach (const TaskInfo& task, tasks) { |
|           futures.push_back(authorizeTask(task, framework)); |
| |
|           // Add to the framework's list of pending tasks. |
|           // |
|           // NOTE: If two tasks have the same ID, the second one will |
|           // not be put into 'framework->pendingTasks', therefore |
|           // will not be launched (and TASK_ERROR will be sent). |
|           // Unfortunately, we can't tell the difference between a |
|           // duplicate TaskID and getting killed while pending |
|           // (removed from the map). So it's possible that we send |
|           // a TASK_ERROR after a TASK_KILLED (see _accept())! |
|           if (!framework->pendingTasks.contains(task.task_id())) { |
|             framework->pendingTasks[task.task_id()] = task; |
|           } |
| |
|           // Add to the slave's list of pending tasks. |
|           if (!slave->pendingTasks.contains(framework->id()) || |
|               !slave->pendingTasks[framework->id()].contains(task.task_id())) { |
|             slave->pendingTasks[framework->id()][task.task_id()] = task; |
|           } |
|         } |
|         break; |
|       } |
| |
|       // NOTE: When handling RESERVE and UNRESERVE operations, authorization |
|       // will proceed even if no principal is specified, although currently |
|       // resources cannot be reserved or unreserved unless a principal is |
|       // provided. Any RESERVE/UNRESERVE operation with no associated principal |
|       // will be found invalid when `validate()` is called in `_accept()` below. |
| |
|       // The RESERVE operation allows a principal to reserve resources. |
|       case Offer::Operation::RESERVE: { |
|         futures.push_back( |
|             authorizeReserveResources(operation.reserve(), principal)); |
| |
|         break; |
|       } |
| |
|       // The UNRESERVE operation allows a principal to unreserve resources. |
|       case Offer::Operation::UNRESERVE: { |
|         futures.push_back( |
|             authorizeUnreserveResources(operation.unreserve(), principal)); |
| |
|         break; |
|       } |
| |
|       // The CREATE operation allows the creation of a persistent volume. |
|       case Offer::Operation::CREATE: { |
|         futures.push_back( |
|             authorizeCreateVolume(operation.create(), principal)); |
| |
|         break; |
|       } |
| |
|       // The DESTROY operation allows the destruction of a persistent volume. |
|       case Offer::Operation::DESTROY: { |
|         futures.push_back( |
|             authorizeDestroyVolume(operation.destroy(), principal)); |
| |
|         break; |
|       } |
| |
|       case Offer::Operation::GROW_VOLUME: { |
|         futures.push_back( |
|             authorizeResizeVolume( |
|                 operation.grow_volume().volume(), principal)); |
| |
|         break; |
|       } |
| |
|       case Offer::Operation::SHRINK_VOLUME: { |
|         futures.push_back( |
|             authorizeResizeVolume( |
|                 operation.shrink_volume().volume(), principal)); |
| |
|         break; |
|       } |
| |
|       case Offer::Operation::CREATE_DISK: { |
|         futures.push_back( |
|             authorizeCreateDisk(operation.create_disk(), principal)); |
| |
|         break; |
|       } |
| |
|       case Offer::Operation::DESTROY_DISK: { |
|         futures.push_back( |
|             authorizeDestroyDisk(operation.destroy_disk(), principal)); |
| |
|         break; |
|       } |
| |
|       case Offer::Operation::UNKNOWN: { |
|         // TODO(vinod): Send an error event to the scheduler? |
|         LOG(WARNING) << "Ignoring unknown operation"; |
|         break; |
|       } |
|     } |
|   } |
| |
|   // Wait for all the operations and tasks to be authorized. |
|   await(futures) |
|     .onAny(defer(self(), |
|                  &Master::_accept, |
|                  framework->id(), |
|                  slaveId.get(), |
|                  offeredResources, |
|                  std::move(accept), |
|                  lambda::_1)); |
| } |
| |
| |
| void Master::_accept( |
| const FrameworkID& frameworkId, |
| const SlaveID& slaveId, |
| const Resources& offeredResources, |
| scheduler::Call::Accept&& accept, |
| const Future<vector<Future<bool>>>& _authorizations) |
| { |
| Framework* framework = getFramework(frameworkId); |
| |
| // TODO(jieyu): Consider using the 'drop' overload mentioned in |
| // 'accept' to consistently handle dropping ACCEPT calls. |
| if (framework == nullptr) { |
| LOG(WARNING) |
| << "Ignoring ACCEPT call for framework " << frameworkId |
| << " because the framework cannot be found"; |
| |
| // Tell the allocator about the recovered resources. |
| allocator->recoverResources( |
| frameworkId, |
| slaveId, |
| offeredResources, |
| None()); |
| |
| return; |
| } |
| |
| Slave* slave = slaves.registered.get(slaveId); |
| |
| if (slave == nullptr || !slave->connected) { |
| TaskState newTaskState = TASK_DROPPED; |
| if (!framework->capabilities.partitionAware) { |
| newTaskState = TASK_LOST; |
| } |
| |
| foreach (const Offer::Operation& operation, accept.operations()) { |
| if (operation.type() != Offer::Operation::LAUNCH && |
| operation.type() != Offer::Operation::LAUNCH_GROUP) { |
| continue; |
| } |
| |
| const RepeatedPtrField<TaskInfo>& tasks = [&]() { |
| if (operation.type() == Offer::Operation::LAUNCH) { |
| return operation.launch().task_infos(); |
| } else { |
| CHECK_EQ(Offer::Operation::LAUNCH_GROUP, operation.type()); |
| return operation.launch_group().task_group().tasks(); |
| } |
| }(); |
| |
| foreach (const TaskInfo& task, tasks) { |
| // Remove the task from being pending. |
| framework->pendingTasks.erase(task.task_id()); |
| if (slave != nullptr) { |
| slave->pendingTasks[framework->id()].erase(task.task_id()); |
| if (slave->pendingTasks[framework->id()].empty()) { |
| slave->pendingTasks.erase(framework->id()); |
| } |
| } |
| |
| const TaskStatus::Reason reason = |
| slave == nullptr ? TaskStatus::REASON_SLAVE_REMOVED |
| : TaskStatus::REASON_SLAVE_DISCONNECTED; |
| const StatusUpdate& update = protobuf::createStatusUpdate( |
| framework->id(), |
| task.slave_id(), |
| task.task_id(), |
| newTaskState, |
| TaskStatus::SOURCE_MASTER, |
| None(), |
| slave == nullptr ? "Agent removed" : "Agent disconnected", |
| reason); |
| |
| if (framework->capabilities.partitionAware) { |
| metrics->tasks_dropped++; |
| } else { |
| metrics->tasks_lost++; |
| } |
| |
| metrics->incrementTasksStates( |
| newTaskState, |
| TaskStatus::SOURCE_MASTER, |
| reason); |
| |
| forward(update, UPID(), framework); |
| } |
| } |
| |
| // Tell the allocator about the recovered resources. |
| allocator->recoverResources( |
| frameworkId, |
| slaveId, |
| offeredResources, |
| None()); |
| |
| return; |
| } |
| |
| // Some operations update the offered resources. We keep |
| // updated offered resources here. When a task is successfully |
| // launched, we remove its resource from offered resources. |
| Resources _offeredResources = offeredResources; |
| |
| // Converted resources from volume resizes. These converted resources are not |
| // put into `_offeredResources`, so no other operations can consume them. |
| // TODO(zhitao): This will be unnecessary once `GROW_VOLUME` and |
| // `SHRINK_VOLUME` become non-speculative. |
| Resources resizedResources; |
| |
| // We keep track of the shared resources from the offers separately. |
| // `offeredSharedResources` can be modified by CREATE/DESTROY but we |
| // don't remove from it when a task is successfully launched so this |
| // variable always tracks the *total* amount. We do this to support |
| // validation of tasks involving shared resources. See comments in |
| // the LAUNCH case below. |
| Resources offeredSharedResources = offeredResources.shared(); |
| |
| // Maintain a list of resource conversions to pass to the allocator |
| // as a result of operations. Note that: |
| // 1) We drop invalid operations. |
| // 2) For LAUNCH operations, we drop invalid tasks. LAUNCH operation |
| // will result in resource conversions because of shared |
| // resources. |
| // 3) Currently, LAUNCH_GROUP won't result in resource conversions |
| // because shared resources are not supported yet if the |
| // framework uses LAUNCH_GROUP operation. |
| // |
| // The order of the conversions is important and preserved. |
| vector<ResourceConversion> conversions; |
| |
| // The order of `authorizations` must match the order of the operations and/or |
| // tasks in `accept.operations()` as they are iterated through simultaneously. |
| CHECK_READY(_authorizations); |
| std::deque<Future<bool>> authorizations( |
| _authorizations->begin(), _authorizations->end()); |
| |
| foreach (const Offer::Operation& operation, accept.operations()) { |
| switch (operation.type()) { |
| // The RESERVE operation allows a principal to reserve resources. |
| case Offer::Operation::RESERVE: { |
| CHECK(!authorizations.empty()); |
| Future<bool> authorization = authorizations.front(); |
| authorizations.pop_front(); |
| |
| CHECK(!authorization.isDiscarded()); |
| |
| if (authorization.isFailed()) { |
| // TODO(greggomann): We may want to retry this failed authorization |
| // request rather than dropping it immediately. |
| drop(framework, |
| operation, |
| "Authorization of principal '" + framework->info.principal() + |
| "' to reserve resources failed: " + authorization.failure()); |
| |
| continue; |
| } else if (!authorization.get()) { |
| drop(framework, |
| operation, |
| "Not authorized to reserve resources as '" + |
| framework->info.principal() + "'"); |
| |
| continue; |
| } |
| |
| Option<Principal> principal = framework->info.has_principal() |
| ? Principal(framework->info.principal()) |
| : Option<Principal>::none(); |
| |
| // Make sure this reserve operation is valid. |
| Option<Error> error = validation::operation::validate( |
| operation.reserve(), |
| principal, |
| slave->capabilities, |
| framework->info); |
| |
| if (error.isSome()) { |
| drop( |
| framework, |
| operation, |
| error->message + "; on agent " + stringify(*slave)); |
| continue; |
| } |
| |
| // Test the given operation on the included resources. |
| Try<vector<ResourceConversion>> _conversions = |
| getResourceConversions(operation); |
| |
| if (_conversions.isError()) { |
| drop(framework, operation, _conversions.error()); |
| continue; |
| } |
| |
| Try<Resources> resources = _offeredResources.apply(_conversions.get()); |
| if (resources.isError()) { |
| drop(framework, operation, resources.error()); |
| continue; |
| } |
| |
| _offeredResources = resources.get(); |
| |
| LOG(INFO) << "Applying RESERVE operation for resources " |
| << operation.reserve().resources() << " from framework " |
| << *framework << " to agent " << *slave; |
| |
| _apply(slave, framework, operation); |
| |
| conversions.insert( |
| conversions.end(), |
| _conversions->begin(), |
| _conversions->end()); |
| |
| break; |
| } |
| |
| // The UNRESERVE operation allows a principal to unreserve resources. |
| case Offer::Operation::UNRESERVE: { |
| CHECK(!authorizations.empty()); |
| Future<bool> authorization = authorizations.front(); |
| authorizations.pop_front(); |
| |
| CHECK(!authorization.isDiscarded()); |
| |
| if (authorization.isFailed()) { |
| // TODO(greggomann): We may want to retry this failed authorization |
| // request rather than dropping it immediately. |
| drop(framework, |
| operation, |
| "Authorization of principal '" + framework->info.principal() + |
| "' to unreserve resources failed: " + |
| authorization.failure()); |
| |
| continue; |
| } else if (!authorization.get()) { |
| drop(framework, |
| operation, |
| "Not authorized to unreserve resources as '" + |
| framework->info.principal() + "'"); |
| |
| continue; |
| } |
| |
| // Make sure this unreserve operation is valid. |
| Option<Error> error = |
| validation::operation::validate(operation.unreserve()); |
| |
| if (error.isSome()) { |
| drop(framework, operation, error->message); |
| continue; |
| } |
| |
| // Test the given operation on the included resources. |
| Try<vector<ResourceConversion>> _conversions = |
| getResourceConversions(operation); |
| |
| if (_conversions.isError()) { |
| drop(framework, operation, _conversions.error()); |
| continue; |
| } |
| |
| Try<Resources> resources = _offeredResources.apply(_conversions.get()); |
| if (resources.isError()) { |
| drop(framework, operation, resources.error()); |
| continue; |
| } |
| |
| _offeredResources = resources.get(); |
| |
| LOG(INFO) << "Applying UNRESERVE operation for resources " |
| << operation.unreserve().resources() << " from framework " |
| << *framework << " to agent " << *slave; |
| |
| _apply(slave, framework, operation); |
| |
| conversions.insert( |
| conversions.end(), |
| _conversions->begin(), |
| _conversions->end()); |
| |
| break; |
| } |
| |
| case Offer::Operation::CREATE: { |
| CHECK(!authorizations.empty()); |
| Future<bool> authorization = authorizations.front(); |
| authorizations.pop_front(); |
| |
| CHECK(!authorization.isDiscarded()); |
| |
| if (authorization.isFailed()) { |
| // TODO(greggomann): We may want to retry this failed authorization |
| // request rather than dropping it immediately. |
| drop(framework, |
| operation, |
| "Authorization of principal '" + framework->info.principal() + |
| "' to create persistent volumes failed: " + |
| authorization.failure()); |
| |
| continue; |
| } else if (!authorization.get()) { |
| drop(framework, |
| operation, |
| "Not authorized to create persistent volumes as '" + |
| framework->info.principal() + "'"); |
| |
| continue; |
| } |
| |
| Option<Principal> principal = framework->info.has_principal() |
| ? Principal(framework->info.principal()) |
| : Option<Principal>::none(); |
| |
| // Make sure this create operation is valid. |
| Option<Error> error = validation::operation::validate( |
| operation.create(), |
| slave->checkpointedResources, |
| principal, |
| slave->capabilities, |
| framework->info); |
| |
| if (error.isSome()) { |
| drop( |
| framework, |
| operation, |
| error->message + "; on agent " + stringify(*slave)); |
| continue; |
| } |
| |
| // Test the given operation on the included resources. |
| Try<vector<ResourceConversion>> _conversions = |
| getResourceConversions(operation); |
| |
| if (_conversions.isError()) { |
| drop(framework, operation, _conversions.error()); |
| continue; |
| } |
| |
| Try<Resources> resources = _offeredResources.apply(_conversions.get()); |
| if (resources.isError()) { |
| drop(framework, operation, resources.error()); |
| continue; |
| } |
| |
| _offeredResources = resources.get(); |
| offeredSharedResources = _offeredResources.shared(); |
| |
| LOG(INFO) << "Applying CREATE operation for volumes " |
| << operation.create().volumes() << " from framework " |
| << *framework << " to agent " << *slave; |
| |
| _apply(slave, framework, operation); |
| |
| conversions.insert( |
| conversions.end(), |
| _conversions->begin(), |
| _conversions->end()); |
| |
| break; |
| } |
| |
| case Offer::Operation::DESTROY: { |
| CHECK(!authorizations.empty()); |
| Future<bool> authorization = authorizations.front(); |
| authorizations.pop_front(); |
| |
| CHECK(!authorization.isDiscarded()); |
| |
| if (authorization.isFailed()) { |
| // TODO(greggomann): We may want to retry this failed authorization |
| // request rather than dropping it immediately. |
| drop(framework, |
| operation, |
| "Authorization of principal '" + framework->info.principal() + |
| "' to destroy persistent volumes failed: " + |
| authorization.failure()); |
| |
| continue; |
| } else if (!authorization.get()) { |
| drop(framework, |
| operation, |
| "Not authorized to destroy persistent volumes as '" + |
| framework->info.principal() + "'"); |
| |
| continue; |
| } |
| |
| // Make sure this destroy operation is valid. |
| Option<Error> error = validation::operation::validate( |
| operation.destroy(), |
| slave->checkpointedResources, |
| slave->usedResources, |
| slave->pendingTasks); |
| |
| if (error.isSome()) { |
| drop(framework, operation, error->message); |
| continue; |
| } |
| |
| // If any offer from this slave contains a volume that needs |
| // to be destroyed, we should process it, but we should also |
| // rescind those offers. |
| foreach (Offer* offer, utils::copy(slave->offers)) { |
| const Resources& offered = offer->resources(); |
| |
| foreach (const Resource& volume, operation.destroy().volumes()) { |
| if (offered.contains(volume)) { |
| allocator->recoverResources( |
| offer->framework_id(), |
| offer->slave_id(), |
| offered, |
| None()); |
| |
| removeOffer(offer, true); |
| |
| // This offer may contain other volumes that are being destroyed. |
| // However, we have already rescinded it, so we should move on |
| // to the next offer. |
| break; |
| } |
| } |
| } |
| |
| // Test the given operation on the included resources. |
| Try<vector<ResourceConversion>> _conversions = |
| getResourceConversions(operation); |
| |
| if (_conversions.isError()) { |
| drop(framework, operation, _conversions.error()); |
| continue; |
| } |
| |
| Try<Resources> resources = _offeredResources.apply(_conversions.get()); |
| if (resources.isError()) { |
| drop(framework, operation, resources.error()); |
| continue; |
| } |
| |
| _offeredResources = resources.get(); |
| offeredSharedResources = _offeredResources.shared(); |
| |
| LOG(INFO) << "Applying DESTROY operation for volumes " |
| << operation.destroy().volumes() << " from framework " |
| << *framework << " to agent " << *slave; |
| |
| _apply(slave, framework, operation); |
| |
| conversions.insert( |
| conversions.end(), |
| _conversions->begin(), |
| _conversions->end()); |
| |
| break; |
| } |
| |
| case Offer::Operation::GROW_VOLUME: { |
| CHECK(!authorizations.empty()); |
| Future<bool> authorization = authorizations.front(); |
| authorizations.pop_front(); |
| |
| CHECK(!authorization.isDiscarded()); |
| |
| if (authorization.isFailed()) { |
| // TODO(greggomann): We may want to retry this failed authorization |
| // request rather than dropping it immediately. |
| drop(framework, |
| operation, |
| "Authorization of principal '" + framework->info.principal() + |
| "' to grow a volume failed: " + |
| authorization.failure()); |
| |
| continue; |
| } else if (!authorization.get()) { |
| drop(framework, |
| operation, |
| "Not authorized to grow a volume as '" + |
| framework->info.principal() + "'"); |
| |
| continue; |
| } |
| |
| // Make sure this grow volume operation is valid. |
| Option<Error> error = validation::operation::validate( |
| operation.grow_volume(), slave->capabilities); |
| |
| if (error.isSome()) { |
| drop( |
| framework, |
| operation, |
| error->message + "; on agent " + stringify(*slave)); |
| continue; |
| } |
| |
| // TODO(zhitao): Convert this operation to non-speculative once we can |
| // support that in the operator API. |
| Try<vector<ResourceConversion>> _conversions = |
| getResourceConversions(operation); |
| |
| if (_conversions.isError()) { |
| drop(framework, operation, _conversions.error()); |
| continue; |
| } |
| |
| CHECK_EQ(1u, _conversions->size()); |
| const Resources& consumed = _conversions->at(0).consumed; |
| const Resources& converted = _conversions->at(0).converted; |
| |
| if (!_offeredResources.contains(consumed)) { |
| drop( |
| framework, |
| operation, |
| "Invalid GROW_VOLUME operation: " + |
| stringify(_offeredResources) + " does not contain " + |
| stringify(consumed)); |
| |
| continue; |
| } |
| |
| _offeredResources -= consumed; |
| resizedResources += converted; |
| |
| LOG(INFO) << "Processing GROW_VOLUME operation for volume " |
| << operation.grow_volume().volume() |
| << " with additional resource " |
| << operation.grow_volume().addition() |
| << " from framework " |
| << *framework << " on agent " << *slave; |
| |
| _apply(slave, framework, operation); |
| |
| conversions.insert( |
| conversions.end(), |
| _conversions->begin(), |
| _conversions->end()); |
| |
| break; |
| } |
| |
| case Offer::Operation::SHRINK_VOLUME: { |
| CHECK(!authorizations.empty()); |
| Future<bool> authorization = authorizations.front(); |
| authorizations.pop_front(); |
| |
| CHECK(!authorization.isDiscarded()); |
| |
| if (authorization.isFailed()) { |
| // TODO(greggomann): We may want to retry this failed authorization |
| // request rather than dropping it immediately. |
| drop(framework, |
| operation, |
| "Authorization of principal '" + framework->info.principal() + |
| "' to shrink a volume failed: " + |
| authorization.failure()); |
| |
| continue; |
| } else if (!authorization.get()) { |
| drop(framework, |
| operation, |
| "Not authorized to shrink a volume as '" + |
| framework->info.principal() + "'"); |
| |
| continue; |
| } |
| |
| // Make sure this shrink volume operation is valid. |
| Option<Error> error = validation::operation::validate( |
| operation.shrink_volume(), slave->capabilities); |
| |
| if (error.isSome()) { |
| drop( |
| framework, |
| operation, |
| error->message + "; on agent " + stringify(*slave)); |
| continue; |
| } |
| |
| // TODO(zhitao): Convert this operation to non-speculative once we can |
| // support that in the operator API. |
| Try<vector<ResourceConversion>> _conversions = |
| getResourceConversions(operation); |
| |
| if (_conversions.isError()) { |
| drop(framework, operation, _conversions.error()); |
| continue; |
| } |
| |
| CHECK_EQ(1u, _conversions->size()); |
| const Resources& consumed = _conversions->at(0).consumed; |
| const Resources& converted = _conversions->at(0).converted; |
| |
| if (!_offeredResources.contains(consumed)) { |
| drop( |
| framework, |
| operation, |
| "Invalid SHRINK_VOLUME operation: " + |
| stringify(_offeredResources) + " does not contain " + |
| stringify(consumed)); |
| |
| continue; |
| } |
| |
| _offeredResources -= consumed; |
| resizedResources += converted; |
| |
| LOG(INFO) << "Processing SHRINK_VOLUME operation for volume " |
| << operation.shrink_volume().volume() |
| << " subtracting scalar value " |
| << operation.shrink_volume().subtract() |
| << " from framework " |
| << *framework << " on agent " << *slave; |
| |
| _apply(slave, framework, operation); |
| |
| conversions.insert( |
| conversions.end(), |
| _conversions->begin(), |
| _conversions->end()); |
| |
| break; |
| } |
| |
| case Offer::Operation::LAUNCH: { |
| foreach (const TaskInfo& task, operation.launch().task_infos()) { |
| CHECK(!authorizations.empty()); |
| Future<bool> authorization = authorizations.front(); |
| authorizations.pop_front(); |
| |
| // The task will not be in `pendingTasks` if it has been |
| // killed in the interim. No need to send TASK_KILLED in |
| // this case as it has already been sent. Note however that |
| // we cannot currently distinguish between the task being |
| // killed and the task having a duplicate TaskID within |
| // `pendingTasks`. Therefore we must still validate the task |
| // to ensure we send the TASK_ERROR in the case that it has a |
| // duplicate TaskID. |
| // |
| // TODO(bmahler): We may send TASK_ERROR after a TASK_KILLED |
| // if a task was killed (removed from `pendingTasks`) *and* |
| // the task is invalid or unauthorized here. |
| |
| bool pending = framework->pendingTasks.contains(task.task_id()); |
| framework->pendingTasks.erase(task.task_id()); |
| slave->pendingTasks[framework->id()].erase(task.task_id()); |
| if (slave->pendingTasks[framework->id()].empty()) { |
| slave->pendingTasks.erase(framework->id()); |
| } |
| |
| CHECK(!authorization.isDiscarded()); |
| |
| if (authorization.isFailed() || !authorization.get()) { |
| string user = framework->info.user(); // Default user. |
| if (task.has_command() && task.command().has_user()) { |
| user = task.command().user(); |
| } else if (task.has_executor() && |
| task.executor().command().has_user()) { |
| user = task.executor().command().user(); |
| } |
| |
| const StatusUpdate& update = protobuf::createStatusUpdate( |
| framework->id(), |
| task.slave_id(), |
| task.task_id(), |
| TASK_ERROR, |
| TaskStatus::SOURCE_MASTER, |
| None(), |
| authorization.isFailed() ? |
| "Authorization failure: " + authorization.failure() : |
| "Not authorized to launch as user '" + user + "'", |
| TaskStatus::REASON_TASK_UNAUTHORIZED); |
| |
| metrics->tasks_error++; |
| |
| metrics->incrementTasksStates( |
| TASK_ERROR, |
| TaskStatus::SOURCE_MASTER, |
| TaskStatus::REASON_TASK_UNAUTHORIZED); |
| |
| forward(update, UPID(), framework); |
| |
| continue; // Continue to the next task. |
| } |
| |
| // Validate the task. |
| |
| // We add back offered shared resources for validation even if they |
| // are already consumed by other tasks in the same ACCEPT call. This |
| // allows these tasks to use more copies of the same shared resource |
| // than those being offered. e.g., 2 tasks can be launched on 1 copy |
| // of a shared persistent volume from the offer; 3 tasks can be |
| // launched on 2 copies of a shared persistent volume from 2 offers. |
| Resources available = |
| _offeredResources.nonShared() + offeredSharedResources; |
| |
| Option<Error> error = |
| validation::task::validate(task, framework, slave, available); |
| |
| if (error.isSome()) { |
| const StatusUpdate& update = protobuf::createStatusUpdate( |
| framework->id(), |
| task.slave_id(), |
| task.task_id(), |
| TASK_ERROR, |
| TaskStatus::SOURCE_MASTER, |
| None(), |
| error->message, |
| TaskStatus::REASON_TASK_INVALID); |
| |
| metrics->tasks_error++; |
| |
| metrics->incrementTasksStates( |
| TASK_ERROR, |
| TaskStatus::SOURCE_MASTER, |
| TaskStatus::REASON_TASK_INVALID); |
| |
| forward(update, UPID(), framework); |
| |
| continue; // Continue to the next task. |
| } |
| |
| // Add task. |
| if (pending) { |
| Resources consumed; |
| |
| bool launchExecutor = true; |
| if (task.has_executor()) { |
| launchExecutor = isLaunchExecutor( |
| task.executor().executor_id(), framework, slave); |
| |
| // Master tracks the new executor only if the task is not a |
| // command task. |
| if (launchExecutor) { |
| addExecutor(task.executor(), framework, slave); |
| consumed += task.executor().resources(); |
| } |
| } |
| |
| addTask(task, framework, slave); |
| consumed += task.resources(); |
| |
| CHECK(available.contains(consumed)) |
| << available << " does not contain " << consumed; |
| |
| // Determine the additional instances of shared resources |
| // needed to be added to the allocations since we support |
| // tasks requesting more instances of shared resources |
| // than those being offered. |
| const Resources& consumedShared = consumed.shared(); |
| |
| // Check that offered resources contain at least one copy |
| // of each consumed shared resource (guaranteed by master |
| // validation). |
| foreach (const Resource& resource, consumedShared) { |
| CHECK(offeredSharedResources.contains(resource)); |
| } |
| |
| Resources additional = consumedShared - _offeredResources.shared(); |
| if (!additional.empty()) { |
| LOG(INFO) << "Allocating additional resources " << additional |
| << " for task " << task.task_id() |
| << " of framework " << *framework |
| << " on agent " << *slave; |
| |
| conversions.emplace_back(Resources(), additional); |
| } |
| |
| _offeredResources -= consumed; |
| |
| RunTaskMessage message; |
| message.mutable_framework()->MergeFrom(framework->info); |
| |
| hashmap<Option<ResourceProviderID>, UUID> resourceVersions; |
| if (slave->resourceVersion.isSome()) { |
| resourceVersions.put(None(), slave->resourceVersion.get()); |
| } |
| |
| foreachpair ( |
| const ResourceProviderID& resourceProviderId, |
| const Slave::ResourceProvider& resourceProvider, |
| slave->resourceProviders) { |
| resourceVersions.put( |
| resourceProviderId, resourceProvider.resourceVersion); |
| } |
| |
| message.mutable_resource_version_uuids()->CopyFrom( |
| protobuf::createResourceVersions(resourceVersions)); |
| |
| // TODO(anand): We set 'pid' to UPID() for http frameworks |
| // as 'pid' was made optional in 0.24.0. In 0.25.0, we |
| // no longer have to set pid here for http frameworks. |
| message.set_pid(framework->pid.getOrElse(UPID())); |
| message.mutable_task()->MergeFrom(task); |
| |
| message.set_launch_executor(launchExecutor); |
| |
| if (HookManager::hooksAvailable()) { |
| // Set labels retrieved from label-decorator hooks. |
| message.mutable_task()->mutable_labels()->CopyFrom( |
| HookManager::masterLaunchTaskLabelDecorator( |
| task, |
| framework->info, |
| slave->info)); |
| } |
| |
| // If the agent does not support reservation refinement, downgrade |
| // the task / executor resources to the "pre-reservation-refinement" |
| // format. This cannot contain any refined reservations since |
| // the master rejects attempts to create refined reservations |
| // on non-capable agents. |
| if (!slave->capabilities.reservationRefinement) { |
| CHECK_SOME(downgradeResources(&message)); |
| } |
| |
| LOG(INFO) << "Launching task " << task.task_id() << " of framework " |
| << *framework << " with resources " << task.resources() |
| << " on agent " << *slave << " on " |
| << (launchExecutor ? |
| " new executor" : " existing executor"); |
| |
| // Increment this metric here for LAUNCH since it |
| // does not make use of the `_apply()` function. |
| framework->metrics.incrementOperation(operation); |
| |
| send(slave->pid, message); |
| } |
| } |
| |
| break; |
| } |
| |
| case Offer::Operation::LAUNCH_GROUP: { |
| // We must ensure that the entire group can be launched. This |
| // means all tasks in the group must be authorized and valid. |
| // If any tasks in the group have been killed in the interim |
| // we must kill the entire group. |
| const ExecutorInfo& executor = operation.launch_group().executor(); |
| const TaskGroupInfo& taskGroup = operation.launch_group().task_group(); |
| |
| // Remove all the tasks from being pending. |
| hashset<TaskID> killed; |
| foreach (const TaskInfo& task, taskGroup.tasks()) { |
| bool pending = framework->pendingTasks.contains(task.task_id()); |
| framework->pendingTasks.erase(task.task_id()); |
| slave->pendingTasks[framework->id()].erase(task.task_id()); |
| if (slave->pendingTasks[framework->id()].empty()) { |
| slave->pendingTasks.erase(framework->id()); |
| } |
| |
| if (!pending) { |
| killed.insert(task.task_id()); |
| } |
| } |
| |
| // Note that we do not fill in the `ExecutorInfo.framework_id` |
| // since we do not have to support backwards compatibility like |
| // in the `Launch` operation case. |
| |
| // TODO(bmahler): Consider injecting some default (cpus, mem, disk) |
| // resources when the framework omits the executor resources. |
| |
| // See if there are any authorization or validation errors. |
| // Note that we'll only report the first error we encounter |
| // for the group. |
| // |
| // TODO(anindya_sinha): If task group uses shared resources, this |
| // validation needs to be enhanced to accommodate multiple copies |
| // of shared resources across tasks within the task group. |
| Option<Error> error; |
| Option<TaskStatus::Reason> reason; |
| |
| // NOTE: We check for the authorization errors first and never break the |
| // loop to ensure that all authorization futures for this task group are |
| // iterated through. |
| foreach (const TaskInfo& task, taskGroup.tasks()) { |
| CHECK(!authorizations.empty()); |
| Future<bool> authorization = authorizations.front(); |
| authorizations.pop_front(); |
| |
| CHECK(!authorization.isDiscarded()); |
| |
| if (authorization.isFailed()) { |
| error = Error("Failed to authorize task" |
| " '" + stringify(task.task_id()) + "'" |
| ": " + authorization.failure()); |
| } else if (!authorization.get()) { |
| string user = framework->info.user(); // Default user. |
| if (task.has_command() && task.command().has_user()) { |
| user = task.command().user(); |
| } |
| |
| error = Error("Task '" + stringify(task.task_id()) + "'" |
| " is not authorized to launch as" |
| " user '" + user + "'"); |
| } |
| } |
| |
| if (error.isSome()) { |
| reason = TaskStatus::REASON_TASK_GROUP_UNAUTHORIZED; |
| } else { |
| error = validation::task::group::validate( |
| taskGroup, executor, framework, slave, _offeredResources); |
| |
| if (error.isSome()) { |
| reason = TaskStatus::REASON_TASK_GROUP_INVALID; |
| } |
| } |
| |
| if (error.isSome()) { |
| CHECK_SOME(reason); |
| |
| // NOTE: If some of these invalid or unauthorized tasks were |
| // killed already, here we end up sending a TASK_ERROR after |
| // having already sent TASK_KILLED. |
| foreach (const TaskInfo& task, taskGroup.tasks()) { |
| const StatusUpdate& update = protobuf::createStatusUpdate( |
| framework->id(), |
| task.slave_id(), |
| task.task_id(), |
| TASK_ERROR, |
| TaskStatus::SOURCE_MASTER, |
| None(), |
| error->message, |
| reason.get()); |
| |
| metrics->tasks_error++; |
| |
| metrics->incrementTasksStates( |
| TASK_ERROR, TaskStatus::SOURCE_MASTER, reason.get()); |
| |
| forward(update, UPID(), framework); |
| } |
| |
| continue; |
| } |
| |
| // If task(s) were killed, send TASK_KILLED for |
| // all of the remaining tasks, since a TaskGroup must |
| // be delivered in its entirety. |
| // |
| // TODO(bmahler): Do this killing when processing |
| // the `Kill` call, rather than doing it here. |
| if (!killed.empty()) { |
| foreach (const TaskInfo& task, taskGroup.tasks()) { |
| if (!killed.contains(task.task_id())) { |
| const StatusUpdate& update = protobuf::createStatusUpdate( |
| framework->id(), |
| task.slave_id(), |
| task.task_id(), |
| TASK_KILLED, |
| TaskStatus::SOURCE_MASTER, |
| None(), |
| "A task within the task group was killed before" |
| " delivery to the agent", |
| TaskStatus::REASON_TASK_KILLED_DURING_LAUNCH); |
| |
| metrics->tasks_killed++; |
| |
| // TODO(bmahler): Increment the task state source metric, |
| // we currently cannot because it requires each source |
| // requires a reason. |
| |
| forward(update, UPID(), framework); |
| } |
| } |
| |
| continue; |
| } |
| |
| // Now launch the task group! |
| RunTaskGroupMessage message; |
| message.mutable_framework()->CopyFrom(framework->info); |
| message.mutable_executor()->CopyFrom(executor); |
| message.mutable_task_group()->CopyFrom(taskGroup); |
| |
| hashmap<Option<ResourceProviderID>, UUID> resourceVersions; |
| if (slave->resourceVersion.isSome()) { |
| resourceVersions.put(None(), slave->resourceVersion.get()); |
| } |
| |
| foreachpair ( |
| const ResourceProviderID& resourceProviderId, |
| const Slave::ResourceProvider& resourceProvider, |
| slave->resourceProviders) { |
| resourceVersions.put( |
| resourceProviderId, resourceProvider.resourceVersion); |
| } |
| |
| message.mutable_resource_version_uuids()->CopyFrom( |
| protobuf::createResourceVersions(resourceVersions)); |
| |
| set<TaskID> taskIds; |
| Resources totalResources; |
| Resources executorResources; |
| |
| bool launchExecutor = |
| isLaunchExecutor(executor.executor_id(), framework, slave); |
| |
| if (launchExecutor) { |
| addExecutor(executor, framework, slave); |
| executorResources = executor.resources(); |
| totalResources += executorResources; |
| } |
| |
| message.set_launch_executor(launchExecutor); |
| |
| foreach ( |
| TaskInfo& task, *message.mutable_task_group()->mutable_tasks()) { |
| taskIds.insert(task.task_id()); |
| totalResources += task.resources(); |
| |
| addTask(task, framework, slave); |
| |
| if (HookManager::hooksAvailable()) { |
| // Set labels retrieved from label-decorator hooks. |
| task.mutable_labels()->CopyFrom( |
| HookManager::masterLaunchTaskLabelDecorator( |
| task, |
| framework->info, |
| slave->info)); |
| } |
| } |
| |
| CHECK(_offeredResources.contains(totalResources)) |
| << _offeredResources << " does not contain " << totalResources; |
| |
| _offeredResources -= totalResources; |
| |
| // If the agent does not support reservation refinement, downgrade |
| // the task and executor resources to the "pre-reservation-refinement" |
| // format. This cannot contain any refined reservations since |
| // the master rejects attempts to create refined reservations |
| // on non-capable agents. |
| if (!slave->capabilities.reservationRefinement) { |
| CHECK_SOME(downgradeResources(&message)); |
| } |
| |
| LOG(INFO) << "Launching task group " << stringify(taskIds) |
| << " of framework " << *framework << " with resources " |
| << totalResources - executorResources << " on agent " |
| << *slave << " on " |
| << (launchExecutor ? " new executor" : " existing executor"); |
| |
| // Increment this metric here for LAUNCH_GROUP since it |
| // does not make use of the `_apply()` function. |
| framework->metrics.incrementOperation(operation); |
| |
| send(slave->pid, message); |
| |
| break; |
| } |
| |
| case Offer::Operation::CREATE_DISK: { |
| const Resource::DiskInfo::Source::Type diskType = |
| operation.create_disk().target_type(); |
| |
| CHECK(!authorizations.empty()); |
| Future<bool> authorization = authorizations.front(); |
| authorizations.pop_front(); |
| |
| CHECK(!authorization.isDiscarded()); |
| |
| if (authorization.isFailed()) { |
| // TODO(greggomann): We may want to retry this failed authorization |
| // request rather than dropping it immediately. |
| drop(framework, |
| operation, |
| "Authorization of principal '" + framework->info.principal() + |
| "' to create a " + stringify(diskType) + " disk failed: " + |
| authorization.failure()); |
| |
| continue; |
| } else if (!authorization.get()) { |
| drop(framework, |
| operation, |
| "Not authorized to create a " + stringify(diskType) + |
| " disk as '" + framework->info.principal() + "'"); |
| |
| continue; |
| } |
| |
| if (!slave->capabilities.resourceProvider) { |
| drop(framework, |
| operation, |
| "Not supported on agent " + stringify(*slave) + |
| " because it does not have RESOURCE_PROVIDER capability"); |
| continue; |
| } |
| |
| Option<Error> error = validation::operation::validate( |
| operation.create_disk()); |
| |
| if (error.isSome()) { |
| drop(framework, operation, error->message); |
| continue; |
| } |
| |
| const Resource& consumed = operation.create_disk().source(); |
| |
| if (!_offeredResources.contains(consumed)) { |
| drop(framework, |
| operation, |
| "Invalid CREATE_DISK Operation: " + |
| stringify(_offeredResources) + " does not contain " + |
| stringify(consumed)); |
| continue; |
| } |
| |
| _offeredResources -= consumed; |
| |
| LOG(INFO) << "Processing CREATE_DISK operation with source " |
| << operation.create_disk().source() << " from framework " |
| << *framework << " to agent " << *slave; |
| |
| _apply(slave, framework, operation); |
| |
| break; |
| } |
| |
| case Offer::Operation::DESTROY_DISK: { |
| const Resource::DiskInfo::Source::Type diskType = |
| operation.destroy_disk().source().disk().source().type(); |
| |
| CHECK(!authorizations.empty()); |
| Future<bool> authorization = authorizations.front(); |
| authorizations.pop_front(); |
| |
| CHECK(!authorization.isDiscarded()); |
| |
| if (authorization.isFailed()) { |
| // TODO(greggomann): We may want to retry this failed authorization |
| // request rather than dropping it immediately. |
| drop(framework, |
| operation, |
| "Authorization of principal '" + framework->info.principal() + |
| "' to destroy a " + stringify(diskType) + " disk failed: " + |
| authorization.failure()); |
| |
| continue; |
| } else if (!authorization.get()) { |
| drop(framework, |
| operation, |
| "Not authorized to destroy a " + stringify(diskType) + |
| " disk as '" + framework->info.principal() + "'"); |
| |
| continue; |
| } |
| |
| if (!slave->capabilities.resourceProvider) { |
| drop(framework, |
| operation, |
| "Not supported on agent " + stringify(*slave) + |
| " because it does not have RESOURCE_PROVIDER capability"); |
| continue; |
| } |
| |
| Option<Error> error = validation::operation::validate( |
| operation.destroy_disk()); |
| |
| if (error.isSome()) { |
| drop(framework, operation, error->message); |
| continue; |
| } |
| |
| const Resource& consumed = operation.destroy_disk().source(); |
| |
| if (!_offeredResources.contains(consumed)) { |
| drop(framework, |
| operation, |
| "Invalid DESTROY_DISK Operation: " + |
| stringify(_offeredResources) + " does not contain " + |
| stringify(consumed)); |
| continue; |
| } |
| |
| _offeredResources -= consumed; |
| |
| LOG(INFO) << "Processing DESTROY_DISK operation for volume " |
| << operation.destroy_disk().source() << " from framework " |
| << *framework << " to agent " << *slave; |
| |
| _apply(slave, framework, operation); |
| |
| break; |
| } |
| |
| case Offer::Operation::UNKNOWN: { |
| LOG(WARNING) << "Ignoring unknown operation"; |
| break; |
| } |
| } |
| } |
| |
| CHECK(authorizations.empty()) |
| << "Authorization results not processed: " |
| << stringify( |
| vector<Future<bool>>(authorizations.begin(), authorizations.end())); |
| |
| // Update the allocator based on the operations. |
| if (!conversions.empty()) { |
| allocator->updateAllocation( |
| frameworkId, |
| slaveId, |
| offeredResources, |
| conversions); |
| } |
| |
| // We now need to compute the amounts of remaining (1) speculatively converted |
| // resources to recover without a filter and (2) resources that are implicitly |
| // declined with the filter: |
| // |
| // Speculatively converted resources |
| // = (offered resources).apply(speculative operations) |
| // - resources consumed by non-speculative operations |
| // - offered resources not consumed by any operation |
| // = `_offeredResources` - offered resources not consumed by any operation |
| // = `_offeredResources` - offered resources |
| // |
| // (The last equality holds because resource subtraction yields no negatives.) |
| // |
| // Implicitly declined resources |
| // = (offered resources).apply(speculative operations) |
| // - resources consumed by non-speculative operations |
| // - speculatively converted resources |
| // = `_offeredResources` - speculatively converted resources |
| // |
| // TODO(zhitao): Right now `GROW_VOLUME` and `SHRINK_VOLUME` are implemented |
| // as speculative operations. Since the plan is to make them non-speculative |
| // in the future, their results are not in `_offeredResources`, so we add them |
| // back here. Remove this once the operations become non-speculative. |
| Resources speculativelyConverted = |
| _offeredResources + resizedResources - offeredResources; |
| Resources implicitlyDeclined = _offeredResources - speculativelyConverted; |
| |
| // Tell the allocator about the net speculatively converted resources. These |
| // resources should not be implicitly declined. |
| if (!speculativelyConverted.empty()) { |
| allocator->recoverResources( |
| frameworkId, slaveId, speculativelyConverted, None()); |
| } |
| |
| // Tell the allocator about the implicitly declined resources. |
| if (!implicitlyDeclined.empty()) { |
| allocator->recoverResources( |
| frameworkId, slaveId, implicitlyDeclined, accept.filters()); |
| } |
| } |
| |
| |
| void Master::acceptInverseOffers( |
| Framework* framework, |
| const scheduler::Call::AcceptInverseOffers& accept) |
| { |
| CHECK_NOTNULL(framework); |
| |
| Option<Error> error; |
| |
| if (accept.inverse_offer_ids().size() == 0) { |
| error = Error("No inverse offers specified"); |
| } else { |
| LOG(INFO) << "Processing ACCEPT_INVERSE_OFFERS call for inverse offers: " |
| << accept.inverse_offer_ids() << " for framework " << *framework; |
| |
| // Validate the inverse offers. |
| error = validation::offer::validateInverseOffers( |
| accept.inverse_offer_ids(), |
| this, |
| framework); |
| |
| // Update each inverse offer in the allocator with the accept and |
| // filter. |
| // TODO(anand): Notify the framework if some of the offers were invalid. |
| foreach (const OfferID& offerId, accept.inverse_offer_ids()) { |
| InverseOffer* inverseOffer = getInverseOffer(offerId); |
| if (inverseOffer != nullptr) { |
| mesos::allocator::InverseOfferStatus status; |
| status.set_status(mesos::allocator::InverseOfferStatus::ACCEPT); |
| status.mutable_framework_id()->CopyFrom(inverseOffer->framework_id()); |
| status.mutable_timestamp()->CopyFrom(protobuf::getCurrentTime()); |
| |
| allocator->updateInverseOffer( |
| inverseOffer->slave_id(), |
| inverseOffer->framework_id(), |
| UnavailableResources{ |
| inverseOffer->resources(), |
| inverseOffer->unavailability()}, |
| status, |
| accept.filters()); |
| |
| removeInverseOffer(inverseOffer); |
| continue; |
| } |
| |
| // If the offer was not in our inverse offer set, then this |
| // offer is no longer valid. |
| LOG(WARNING) << "Ignoring accept of inverse offer " << offerId |
| << " since it is no longer valid"; |
| } |
| } |
| |
| if (error.isSome()) { |
| LOG(WARNING) << "ACCEPT_INVERSE_OFFERS call used invalid offers '" |
| << accept.inverse_offer_ids() << "': " << error->message; |
| } |
| } |
| |
| |
| void Master::decline( |
| Framework* framework, |
| scheduler::Call::Decline&& decline) |
| { |
| CHECK_NOTNULL(framework); |
| |
| LOG(INFO) << "Processing DECLINE call for offers: " << decline.offer_ids() |
| << " for framework " << *framework; |
| |
| ++metrics->messages_decline_offers; |
| |
| size_t offersDeclined = 0; |
| |
| // Return resources to the allocator. |
| foreach (const OfferID& offerId, decline.offer_ids()) { |
| Offer* offer = getOffer(offerId); |
| if (offer != nullptr) { |
| allocator->recoverResources( |
| offer->framework_id(), |
| offer->slave_id(), |
| offer->resources(), |
| decline.filters()); |
| |
| removeOffer(offer); |
| |
| offersDeclined++; |
| continue; |
| } |
| |
| // If the offer was not in our offer set, then this offer is no |
| // longer valid. |
| LOG(WARNING) << "Ignoring decline of offer " << offerId |
| << " since it is no longer valid"; |
| } |
| |
| framework->metrics.offers_declined += offersDeclined; |
| } |
| |
| |
| void Master::declineInverseOffers( |
| Framework* framework, |
| const scheduler::Call::DeclineInverseOffers& decline) |
| { |
| CHECK_NOTNULL(framework); |
| |
| LOG(INFO) << "Processing DECLINE_INVERSE_OFFERS call for inverse offers: " |
| << decline.inverse_offer_ids() << " for framework " << *framework; |
| |
| // Update each inverse offer in the allocator with the decline and |
| // filter. |
| foreach (const OfferID& offerId, decline.inverse_offer_ids()) { |
| // Try it as an inverse offer. If this fails then the offer is no |
| // longer valid. |
| InverseOffer* inverseOffer = getInverseOffer(offerId); |
| if (inverseOffer != nullptr) { // If this is an inverse offer. |
| mesos::allocator::InverseOfferStatus status; |
| status.set_status(mesos::allocator::InverseOfferStatus::DECLINE); |
| status.mutable_framework_id()->CopyFrom(inverseOffer->framework_id()); |
| status.mutable_timestamp()->CopyFrom(protobuf::getCurrentTime()); |
| |
| allocator->updateInverseOffer( |
| inverseOffer->slave_id(), |
| inverseOffer->framework_id(), |
| UnavailableResources{ |
| inverseOffer->resources(), |
| inverseOffer->unavailability()}, |
| status, |
| decline.filters()); |
| |
| removeInverseOffer(inverseOffer); |
| continue; |
| } |
| |
| // If the offer was not in our inverse offer set, then this |
| // offer is no longer valid. |
| LOG(WARNING) << "Ignoring decline of inverse offer " << offerId |
| << " since it is no longer valid"; |
| } |
| } |
| |
| |
| void Master::reviveOffers( |
| const UPID& from, |
| const FrameworkID& frameworkId, |
| const vector<string>& roles) |
| { |
| Framework* framework = getFramework(frameworkId); |
| |
| if (framework == nullptr) { |
| LOG(WARNING) |
| << "Ignoring revive offers message for framework " << frameworkId |
| << " because the framework cannot be found"; |
| |
| return; |
| } |
| |
| if (framework->pid != from) { |
| LOG(WARNING) |
| << "Ignoring revive offers message for framework " << *framework |
| << " because it is not expected from " << from; |
| |
| return; |
| } |
| |
| scheduler::Call::Revive call; |
| foreach (const string& role, roles) { |
| call.add_roles(role); |
| } |
| |
| revive(framework, call); |
| } |
| |
| |
| void Master::revive( |
| Framework* framework, |
| const scheduler::Call::Revive& revive) |
| { |
| CHECK_NOTNULL(framework); |
| |
| LOG(INFO) << "Processing REVIVE call for framework " << *framework; |
| |
| ++metrics->messages_revive_offers; |
| |
| set<string> roles; |
| |
| // Validate the roles, if provided. We need to make sure the |
| // roles is valid and also contained within the framework roles. |
| // Note that if a single role is invalid, we drop the entire |
| // call and do not suppress the valid roles. |
| foreach (const string& role, revive.roles()) { |
| Option<Error> roleError = roles::validate(role); |
| if (roleError.isSome()) { |
| drop(framework, |
| revive, |
| "revive role '" + role + "' is invalid: " + roleError->message); |
| return; |
| } |
| |
| if (framework->roles.count(role) == 0) { |
| drop(framework, |
| revive, |
| "revive role '" + role + "' is not one" |
| " of the frameworks's subscribed roles"); |
| return; |
| } |
| |
| roles.insert(role); |
| } |
| |
| allocator->reviveOffers(framework->id(), roles); |
| } |
| |
| |
| void Master::killTask( |
| const UPID& from, |
| const FrameworkID& frameworkId, |
| const TaskID& taskId) |
| { |
| Framework* framework = getFramework(frameworkId); |
| |
| if (framework == nullptr) { |
| LOG(WARNING) |
| << "Ignoring kill task message for task " << taskId << " of framework " |
| << frameworkId << " because the framework cannot be found"; |
| |
| return; |
| } |
| |
| if (framework->pid != from) { |
| LOG(WARNING) |
| << "Ignoring kill task message for task " << taskId << " of framework " |
| << *framework << " because it is not expected from " << from; |
| |
| return; |
| } |
| |
| scheduler::Call::Kill call; |
| call.mutable_task_id()->CopyFrom(taskId); |
| // NOTE: Kill policy in kill task is not supported for schedulers |
| // sending `KillTaskMessage` instead of `scheduler::Call::Kill`. |
| |
| kill(framework, call); |
| } |
| |
| |
| void Master::kill(Framework* framework, const scheduler::Call::Kill& kill) |
| { |
| CHECK_NOTNULL(framework); |
| |
| const TaskID& taskId = kill.task_id(); |
| const Option<SlaveID> slaveId = |
| kill.has_slave_id() ? Option<SlaveID>(kill.slave_id()) : None(); |
| |
| LOG(INFO) << "Processing KILL call for task '" << taskId << "'" |
| << " of framework " << *framework; |
| |
| ++metrics->messages_kill_task; |
| |
| if (framework->pendingTasks.contains(taskId)) { |
| // Remove from pending tasks. |
| framework->pendingTasks.erase(taskId); |
| |
| if (slaveId.isSome()) { |
| Slave* slave = slaves.registered.get(slaveId.get()); |
| |
| if (slave != nullptr) { |
| slave->pendingTasks[framework->id()].erase(taskId); |
| if (slave->pendingTasks[framework->id()].empty()) { |
| slave->pendingTasks.erase(framework->id()); |
| } |
| } |
| } |
| |
| const StatusUpdate& update = protobuf::createStatusUpdate( |
| framework->id(), |
| slaveId, |
| taskId, |
| TASK_KILLED, |
| TaskStatus::SOURCE_MASTER, |
| None(), |
| "Killed before delivery to the agent", |
| TaskStatus::REASON_TASK_KILLED_DURING_LAUNCH); |
| |
| forward(update, UPID(), framework); |
| |
| return; |
| } |
| |
| Task* task = framework->getTask(taskId); |
| if (task == nullptr) { |
| LOG(WARNING) << "Cannot kill task " << taskId |
| << " of framework " << *framework |
| << " because it is unknown; performing reconciliation"; |
| |
| scheduler::Call::Reconcile message; |
| scheduler::Call::Reconcile::Task* t = message.add_tasks(); |
| |
| *t->mutable_task_id() = taskId; |
| |
| if (slaveId.isSome()) { |
| *t->mutable_slave_id() = slaveId.get(); |
| } |
| |
| reconcile(framework, std::move(message)); |
| return; |
| } |
| |
| if (slaveId.isSome() && slaveId.get() != task->slave_id()) { |
| LOG(WARNING) << "Cannot kill task " << taskId << " of agent " |
| << slaveId.get() << " of framework " << *framework |
| << " because it belongs to different agent " |
| << task->slave_id(); |
| |
| // TODO(vinod): Return a "Bad Request" when using HTTP API. |
| return; |
| } |
| |
| Slave* slave = slaves.registered.get(task->slave_id()); |
| CHECK(slave != nullptr) << "Unknown agent " << task->slave_id(); |
| |
| // We add the task to 'killedTasks' here because the slave |
| // might be partitioned or disconnected but the master |
| // doesn't know it yet. |
| slave->killedTasks.put(framework->id(), taskId); |
| |
| // NOTE: This task will be properly reconciled when the disconnected slave |
| // reregisters with the master. |
| // We send the KillTaskMessage even if we have already sent one, just in case |
| // the previous one was dropped by the network but it didn't trigger a slave |
| // re-registration (and hence reconciliation). |
| if (slave->connected) { |
| LOG(INFO) << "Telling agent " << *slave |
| << " to kill task " << taskId |
| << " of framework " << *framework; |
| |
| KillTaskMessage message; |
| message.mutable_framework_id()->MergeFrom(framework->id()); |
| message.mutable_task_id()->MergeFrom(taskId); |
| if (kill.has_kill_policy()) { |
| message.mutable_kill_policy()->MergeFrom(kill.kill_policy()); |
| } |
| |
| send(slave->pid, message); |
| } else { |
| LOG(WARNING) << "Cannot kill task " << taskId |
| << " of framework " << *framework |
| << " because the agent " << *slave << " is disconnected." |
| << " Kill will be retried if the agent reregisters"; |
| } |
| } |
| |
| |
| void Master::statusUpdateAcknowledgement( |
| const UPID& from, |
| StatusUpdateAcknowledgementMessage&& statusUpdateAcknowledgementMessage) |
| { |
| const SlaveID& slaveId = |
| statusUpdateAcknowledgementMessage.slave_id(); |
| const FrameworkID& frameworkId = |
| statusUpdateAcknowledgementMessage.framework_id(); |
| const TaskID& taskId = |
| statusUpdateAcknowledgementMessage.task_id(); |
| const string& uuid = |
| statusUpdateAcknowledgementMessage.uuid(); |
| |
| // TODO(bmahler): Consider adding a message validator abstraction |
| // for the master that takes care of all this boilerplate. Ideally |
| // by the time we process messages in the critical master code, we |
| // can assume that they are valid. This will become especially |
| // important as validation logic is moved out of the scheduler |
| // driver and into the master. |
| |
| Try<id::UUID> uuid_ = id::UUID::fromBytes(uuid); |
| if (uuid_.isError()) { |
| LOG(WARNING) |
| << "Ignoring status update acknowledgement " |
| << " for task " << taskId << " of framework " << frameworkId |
| << " on agent " << slaveId << " due to: " << uuid_.error(); |
| |
| metrics->invalid_status_update_acknowledgements++; |
| return; |
| } |
| |
| Framework* framework = getFramework(frameworkId); |
| |
| if (framework == nullptr) { |
| LOG(WARNING) |
| << "Ignoring status update acknowledgement for status " |
| << uuid_.get() << " of task " << taskId << " of framework " |
| << frameworkId << " on agent " << slaveId << " because the framework " |
| << "cannot be found"; |
| |
| metrics->invalid_status_update_acknowledgements++; |
| return; |
| } |
| |
| if (framework->pid != from) { |
| LOG(WARNING) |
| << "Ignoring status update acknowledgement for status " |
| << uuid_.get() << " of task " << taskId << " of framework " |
| << *framework << " on agent " << slaveId << " because it is not " |
| << "expected from " << from; |
| |
| metrics->invalid_status_update_acknowledgements++; |
| return; |
| } |
| |
| scheduler::Call::Acknowledge message; |
| |
| *message.mutable_slave_id() = |
| std::move(*statusUpdateAcknowledgementMessage.mutable_slave_id()); |
| *message.mutable_task_id() = |
| std::move(*statusUpdateAcknowledgementMessage.mutable_task_id()); |
| *message.mutable_uuid() = |
| std::move(*statusUpdateAcknowledgementMessage.mutable_uuid()); |
| |
| acknowledge(framework, std::move(message)); |
| } |
| |
| |
| void Master::acknowledge( |
| Framework* framework, |
| scheduler::Call::Acknowledge&& acknowledge) |
| { |
| CHECK_NOTNULL(framework); |
| |
| metrics->messages_status_update_acknowledgement++; |
| |
| const SlaveID& slaveId = acknowledge.slave_id(); |
| const TaskID& taskId = acknowledge.task_id(); |
| |
| Try<id::UUID> uuid_ = id::UUID::fromBytes(acknowledge.uuid()); |
| CHECK_SOME(uuid_); |
| const id::UUID uuid = uuid_.get(); |
| |
| Slave* slave = slaves.registered.get(slaveId); |
| |
| if (slave == nullptr) { |
| LOG(WARNING) |
| << "Cannot send status update acknowledgement for status " << uuid |
| << " of task " << taskId << " of framework " << *framework |
| << " to agent " << slaveId << " because agent is not registered"; |
| |
| metrics->invalid_status_update_acknowledgements++; |
| return; |
| } |
| |
| if (!slave->connected) { |
| LOG(WARNING) |
| << "Cannot send status update acknowledgement for status " << uuid |
| << " of task " << taskId << " of framework " << *framework |
| << " to agent " << *slave << " because agent is disconnected"; |
| |
| metrics->invalid_status_update_acknowledgements++; |
| return; |
| } |
| |
| LOG(INFO) |
| << "Processing ACKNOWLEDGE call for status " << uuid |
| << " for task " << taskId |
| << " of framework " << *framework |
| << " on agent " << slaveId; |
| |
| Task* task = slave->getTask(framework->id(), taskId); |
| |
| if (task != nullptr) { |
| // Status update state and uuid should be either set or unset |
| // together. |
| CHECK_EQ(task->has_status_update_uuid(), task->has_status_update_state()); |
| |
| if (!task->has_status_update_state()) { |
| // Task should have status update state set because it must have |
| // been set when the update corresponding to this |
| // acknowledgement was processed by the master. But in case this |
| // acknowledgement was intended for the old run of the master |
| // and the task belongs to a 0.20.0 slave, we could be here. |
| // Dropping the acknowledgement is safe because the slave will |
| // retry the update, at which point the master will set the |
| // status update state. |
| LOG(WARNING) |
| << "Ignoring status update acknowledgement for status " << uuid |
| << " of task " << taskId << " of framework " << *framework |
| << " to agent " << *slave << " because the update was not" |
| << " sent by this master"; |
| |
| metrics->invalid_status_update_acknowledgements++; |
| return; |
| } |
| |
| // Remove the task once the terminal update is acknowledged. |
| if (protobuf::isTerminalState(task->status_update_state()) && |
| id::UUID::fromBytes(task->status_update_uuid()).get() == uuid) { |
| removeTask(task); |
| } |
| } |
| |
| StatusUpdateAcknowledgementMessage message; |
| *message.mutable_slave_id() = std::move(*acknowledge.mutable_slave_id()); |
| *message.mutable_framework_id() = framework->id(); |
| *message.mutable_task_id() = std::move(*acknowledge.mutable_task_id()); |
| *message.mutable_uuid() = std::move(*acknowledge.mutable_uuid()); |
| |
| send(slave->pid, message); |
| |
| metrics->valid_status_update_acknowledgements++; |
| } |
| |
| |
| void Master::acknowledgeOperationStatus( |
| Framework* framework, |
| scheduler::Call::AcknowledgeOperationStatus&& acknowledge) |
| { |
| CHECK_NOTNULL(framework); |
| |
| metrics->messages_operation_status_update_acknowledgement++; |
| |
| const OperationID& operationId = acknowledge.operation_id(); |
| |
| Try<id::UUID> statusUuid_ = id::UUID::fromBytes(acknowledge.uuid()); |
| |
| CHECK_SOME(statusUuid_); |
| const id::UUID statusUuid = statusUuid_.get(); |
| |
| CHECK(acknowledge.has_slave_id()); |
| const SlaveID& slaveId = acknowledge.slave_id(); |
| |
| CHECK(acknowledge.has_resource_provider_id()); |
| |
| Slave* slave = slaves.registered.get(slaveId); |
| if (slave == nullptr) { |
| LOG(WARNING) |
| << "Cannot send operation status update acknowledgement for status " |
| << statusUuid << " of operation '" << operationId << "'" |
| << " of framework " << *framework << " to agent " << slaveId |
| << " because agent is not registered"; |
| |
| metrics->invalid_operation_status_update_acknowledgements++; |
| return; |
| } |
| |
| if (!slave->connected) { |
| LOG(WARNING) |
| << "Cannot send operation status update acknowledgement for status " |
| << statusUuid << " of operation '" << operationId << "'" |
| << " of framework " << *framework << " to agent " << slaveId |
| << " because agent is disconnected"; |
| |
| metrics->invalid_operation_status_update_acknowledgements++; |
| return; |
| } |
| |
| if (!slave->capabilities.resourceProvider) { |
| LOG(WARNING) |
| << "Cannot send operation status update acknowledgement for status " |
| << statusUuid << " of operation '" << operationId << "'" |
| << " of framework " << *framework << " to agent " << slaveId |
| << " because the agent does not support resource providers"; |
| |
| metrics->invalid_operation_status_update_acknowledgements++; |
| return; |
| } |
| |
| const Option<UUID> operationUuid_ = |
| framework->operationUUIDs.get(operationId); |
| |
| if (operationUuid_.isNone()) { |
| LOG(WARNING) |
| << "Cannot send operation status update acknowledgement for status " |
| << statusUuid << " of operation '" << operationId << "'" |
| << " of framework" << *framework << " to agent " << slaveId |
| << " because the operation is unknown"; |
| |
| metrics->invalid_operation_status_update_acknowledgements++; |
| return; |
| } |
| const UUID operationUuid = operationUuid_.get(); |
| |
| Operation* operation = slave->getOperation(operationUuid); |
| CHECK_NOTNULL(operation); |
| |
| auto it = std::find_if( |
| operation->statuses().begin(), |
| operation->statuses().end(), |
| [&statusUuid](const OperationStatus& operationStatus) { |
| return operationStatus.has_uuid() && |
| operationStatus.uuid().value() == statusUuid.toBytes(); |
| }); |
| |
| if (it == operation->statuses().end()) { |
| LOG(WARNING) |
| << "Ignoring operation status acknowledgement for status " << statusUuid |
| << " of operation '" << operationId << "'" |
| << " (uuid " << operationUuid << ")" |
| << " of framework" << *framework |
| << " because the operation status is unknown"; |
| |
| metrics->invalid_status_update_acknowledgements++; |
| return; |
| } |
| |
| const OperationStatus& acknowledgedStatus = *it; |
| |
| LOG(INFO) << "Processing ACKNOWLEDGE_OPERATION_STATUS call for status " |
| << statusUuid << " of operation '" << operationId << "'" |
| << " (uuid " << operationUuid << ")" |
| << " of framework " << *framework << " on agent " << slaveId; |
| |
| // If the acknowledged status update is terminal, remove the operation. |
| if (protobuf::isTerminalState(acknowledgedStatus.state())) { |
| removeOperation(operation); |
| } |
| |
| AcknowledgeOperationStatusMessage message; |
| message.mutable_status_uuid()->set_value(statusUuid.toBytes()); |
| *message.mutable_operation_uuid() = std::move(operationUuid); |
| *message.mutable_resource_provider_id() = |
| std::move(*acknowledge.mutable_resource_provider_id()); |
| |
| send(slave->pid, message); |
| |
| metrics->valid_operation_status_update_acknowledgements++; |
| } |
| |
| |
| void Master::schedulerMessage( |
| const UPID& from, |
| FrameworkToExecutorMessage&& frameworkToExecutorMessage) |
| { |
| const FrameworkID& frameworkId = frameworkToExecutorMessage.framework_id(); |
| const ExecutorID& executorId = frameworkToExecutorMessage.executor_id(); |
| |
| Framework* framework = getFramework(frameworkId); |
| |
| if (framework == nullptr) { |
| LOG(WARNING) << "Ignoring framework message" |
| << " for executor '" << executorId << "'" |
| << " of framework " << frameworkId |
| << " because the framework cannot be found"; |
| |
| metrics->invalid_framework_to_executor_messages++; |
| return; |
| } |
| |
| if (framework->pid != from) { |
| LOG(WARNING) |
| << "Ignoring framework message for executor '" << executorId |
| << "' of framework " << *framework |
| << " because it is not expected from " << from; |
| |
| metrics->invalid_framework_to_executor_messages++; |
| return; |
| } |
| |
| scheduler::Call::Message message_; |
| *message_.mutable_slave_id() = |
| std::move(*frameworkToExecutorMessage.mutable_slave_id()); |
| *message_.mutable_executor_id() = |
| std::move(*frameworkToExecutorMessage.mutable_executor_id()); |
| *message_.mutable_data() = |
| std::move(*frameworkToExecutorMessage.mutable_data()); |
| |
| message(framework, std::move(message_)); |
| } |
| |
| |
| void Master::executorMessage( |
| const UPID& from, |
| ExecutorToFrameworkMessage&& executorToFrameworkMessage) |
| { |
| const SlaveID& slaveId = executorToFrameworkMessage.slave_id(); |
| const FrameworkID& frameworkId = executorToFrameworkMessage.framework_id(); |
| const ExecutorID& executorId = executorToFrameworkMessage.executor_id(); |
| |
| metrics->messages_executor_to_framework++; |
| |
| if (slaves.removed.get(slaveId).isSome()) { |
| // If the slave has been removed, drop the executor message. The |
| // master is no longer trying to health check this slave; when the |
| // slave realizes it hasn't received any pings from the master, it |
| // will eventually try to reregister. |
| LOG(WARNING) << "Ignoring executor message" |
| << " from executor" << " '" << executorId << "'" |
| << " of framework " << frameworkId |
| << " on removed agent " << slaveId; |
| |
| metrics->invalid_executor_to_framework_messages++; |
| return; |
| } |
| |
| // The slave should (re-)register with the master before |
| // forwarding executor messages. |
| Slave* slave = slaves.registered.get(slaveId); |
| |
| if (slave == nullptr) { |
| LOG(WARNING) << "Ignoring executor message" |
| << " from executor '" << executorId << "'" |
| << " of framework " << frameworkId |
| << " on unknown agent " << slaveId; |
| |
| metrics->invalid_executor_to_framework_messages++; |
| return; |
| } |
| |
| Framework* framework = getFramework(frameworkId); |
| |
| if (framework == nullptr) { |
| LOG(WARNING) << "Not forwarding executor message" |
| << " for executor '" << executorId << "'" |
| << " of framework " << frameworkId |
| << " on agent " << *slave |
| << " because the framework is unknown"; |
| |
| metrics->invalid_executor_to_framework_messages++; |
| return; |
| } |
| |
| if (!framework->connected()) { |
| LOG(WARNING) << "Not forwarding executor message for executor '" |
| << executorId << "' of framework " << frameworkId |
| << " on agent " << *slave |
| << " because the framework is disconnected"; |
| |
| metrics->invalid_executor_to_framework_messages++; |
| return; |
| } |
| |
| ExecutorToFrameworkMessage message; |
| *message.mutable_slave_id() = |
| std::move(*executorToFrameworkMessage.mutable_slave_id()); |
| *message.mutable_framework_id() = |
| std::move(*executorToFrameworkMessage.mutable_framework_id()); |
| *message.mutable_executor_id() = |
| std::move(*executorToFrameworkMessage.mutable_executor_id()); |
| *message.mutable_data() = |
| std::move(*executorToFrameworkMessage.mutable_data()); |
| |
| framework->send(message); |
| |
| metrics->valid_executor_to_framework_messages++; |
| } |
| |
| |
| void Master::message( |
| Framework* framework, |
| scheduler::Call::Message&& message) |
| { |
| CHECK_NOTNULL(framework); |
| |
| metrics->messages_framework_to_executor++; |
| |
| Slave* slave = slaves.registered.get(message.slave_id()); |
| |
| if (slave == nullptr) { |
| LOG(WARNING) << "Cannot send framework message for framework " |
| << *framework << " to agent " << message.slave_id() |
| << " because agent is not registered"; |
| |
| metrics->invalid_framework_to_executor_messages++; |
| return; |
| } |
| |
| if (!slave->connected) { |
| LOG(WARNING) << "Cannot send framework message for framework " |
| << *framework << " to agent " << *slave |
| << " because agent is disconnected"; |
| |
| metrics->invalid_framework_to_executor_messages++; |
| return; |
| } |
| |
| LOG(INFO) << "Processing MESSAGE call from framework " |
| << *framework << " to agent " << *slave; |
| |
| FrameworkToExecutorMessage message_; |
| *message_.mutable_slave_id() = std::move(*message.mutable_slave_id()); |
| *message_.mutable_framework_id() = framework->id(); |
| *message_.mutable_executor_id() = std::move(*message.mutable_executor_id()); |
| *message_.mutable_data() = std::move(*message.mutable_data()); |
| |
| send(slave->pid, message_); |
| |
| metrics->valid_framework_to_executor_messages++; |
| } |
| |
| |
| void Master::registerSlave( |
| const UPID& from, |
| RegisterSlaveMessage&& registerSlaveMessage) |
| { |
| ++metrics->messages_register_slave; |
| |
| if (authenticating.contains(from)) { |
| LOG(INFO) << "Queuing up registration request from " << from |
| << " because authentication is still in progress"; |
| |
| authenticating[from] |
| .onReady(defer(self(), |
| &Self::registerSlave, |
| from, |
| std::move(registerSlaveMessage))); |
| return; |
| } |
| |
| if (flags.authenticate_agents && !authenticated.contains(from)) { |
| // This could happen if another authentication request came |
| // through before we are here or if a slave tried to register |
| // without authentication. |
| LOG(WARNING) << "Refusing registration of agent at " << from |
| << " because it is not authenticated"; |
| return; |
| } |
| |
| Option<Error> error = |
| validation::master::message::registerSlave(registerSlaveMessage); |
| |
| if (error.isSome()) { |
| LOG(WARNING) << "Dropping registration of agent at " << from |
| << " because it sent an invalid registration: " |
| << error->message; |
| |
| return; |
| } |
| |
| if (slaves.registering.contains(from)) { |
| LOG(INFO) << "Ignoring register agent message from " << from |
| << " (" << registerSlaveMessage.slave().hostname() |
| << ") as registration is already in progress"; |
| |
| return; |
| } |
| |
| LOG(INFO) << "Received register agent message from " << from |
| << " (" << registerSlaveMessage.slave().hostname() << ")"; |
| |
| slaves.registering.insert(from); |
| |
| // Update all resources passed by the agent to `POST_RESERVATION_REFINEMENT` |
| // format. We do this as early as possible so that we only use a single |
| // format inside master, and downgrade again if necessary when they leave the |
| // master (e.g. when writing to the registry). |
| upgradeResources(®isterSlaveMessage); |
| |
| // Note that the principal may be empty if authentication is not |
| // required. Also it is passed along because it may be removed from |
| // `authenticated` while the authorization is pending. |
| Option<Principal> principal = authenticated.contains(from) |
| ? Principal(authenticated.at(from)) |
| : Option<Principal>::none(); |
| |
| // Calling the `onAny` continuation below separately so we can move |
| // `registerSlaveMessage` without it being evaluated before it's used |
| // by `authorizeSlave`. |
| Future<bool> authorization = |
| authorizeSlave(registerSlaveMessage.slave(), principal); |
| |
| authorization |
| .onAny(defer(self(), |
| &Self::_registerSlave, |
| from, |
| std::move(registerSlaveMessage), |
| principal, |
| lambda::_1)); |
| } |
| |
| |
| void Master::_registerSlave( |
| const UPID& pid, |
| RegisterSlaveMessage&& registerSlaveMessage, |
| const Option<Principal>& principal, |
| const Future<bool>& authorized) |
| { |
| CHECK(!authorized.isDiscarded()); |
| CHECK(slaves.registering.contains(pid)); |
| |
| const SlaveInfo& slaveInfo = registerSlaveMessage.slave(); |
| |
| Option<string> authorizationError = None(); |
| |
| if (authorized.isFailed()) { |
| authorizationError = "Authorization failure: " + authorized.failure(); |
| } else if (!authorized.get()) { |
| authorizationError = |
| "Not authorized to register agent providing resources " |
| "'" + stringify(Resources(slaveInfo.resources())) + "' " + |
| (principal.isSome() |
| ? "with principal '" + stringify(principal.get()) + "'" |
| : "without a principal"); |
| } |
| |
| if (authorizationError.isSome()) { |
| LOG(WARNING) << "Refusing registration of agent at " << pid |
| << " (" << slaveInfo.hostname() << ")" |
| << ": " << authorizationError.get(); |
| |
| slaves.registering.erase(pid); |
| return; |
| } |
| |
| VLOG(1) << "Authorized registration of agent at " << pid |
| << " (" << slaveInfo.hostname() << ")"; |
| |
| MachineID machineId; |
| machineId.set_hostname(slaveInfo.hostname()); |
| machineId.set_ip(stringify(pid.address.ip)); |
| |
| // Slaves are not allowed to register while the machine they are on is in |
| // `DOWN` mode. |
| if (machines.contains(machineId) && |
| machines[machineId].info.mode() == MachineInfo::DOWN) { |
| LOG(WARNING) << "Refusing registration of agent at " << pid |
| << " because the machine '" << machineId << "' that it is " |
| << "running on is `DOWN`"; |
| |
| ShutdownMessage message; |
| message.set_message("Machine is `DOWN`"); |
| send(pid, message); |
| |
| slaves.registering.erase(pid); |
| return; |
| } |
| |
| // Ignore registration attempts by agents running old Mesos versions. |
| // We expect that the agent's version is in SemVer format; if the |
| // version cannot be parsed, the registration attempt is ignored. |
| const string& version = registerSlaveMessage.version(); |
| Try<Version> parsedVersion = Version::parse(version); |
| |
| if (parsedVersion.isError()) { |
| LOG(WARNING) << "Failed to parse version '" << version << "'" |
| << " of agent at " << pid << ": " << parsedVersion.error() |
| << "; ignoring agent registration attempt"; |
| |
| slaves.registering.erase(pid); |
| return; |
| } else if (parsedVersion.get() < MINIMUM_AGENT_VERSION) { |
| LOG(WARNING) << "Ignoring registration attempt from old agent at " |
| << pid << ": agent version is " << parsedVersion.get() |
| << ", minimum supported agent version is " |
| << MINIMUM_AGENT_VERSION; |
| |
| slaves.registering.erase(pid); |
| return; |
| } |
| |
| // If the agent is configured with a domain but the master is not, |
| // we can't determine whether the agent is remote. To be safe, we |
| // don't allow the agent to register. We don't shutdown the agent so |
| // that any tasks on the agent can continue to run. |
| // |
| // TODO(neilc): Consider sending a warning to agent (MESOS-7615). |
| if (slaveInfo.has_domain() && !info_.has_domain()) { |
| LOG(WARNING) << "Agent at " << pid << " is configured with " |
| << "domain " << slaveInfo.domain() << " " |
| << "but the master has no configured domain. " |
| << "Ignoring agent registration attempt"; |
| |
| slaves.registering.erase(pid); |
| return; |
| } |
| |
| // Don't allow agents without domain if domains are required. |
| // We don't shutdown the agent to allow it to restart itself with |
| // the correct domain and without losing tasks. |
| if (flags.require_agent_domain && !slaveInfo.has_domain()) { |
| LOG(WARNING) << "Agent at " << pid << " attempted to register without " |
| << "a domain, but this master is configured to require agent " |
| << "domains. Ignoring agent registration attempt"; |
| |
| slaves.registering.erase(pid); |
| return; |
| } |
| |
| // Check if this slave is already registered (because it retries). |
| if (Slave* slave = slaves.registered.get(pid)) { |
| if (!slave->connected) { |
| // The slave was previously disconnected but it is now trying |
| // to register as a new slave. |
| // There are several possible reasons for this to happen: |
| // - If the slave failed recovery and hence registering as a new |
| // slave before the master removed the old slave from its map. |
| // - If the slave was shutting down while it had a registration |
| // retry scheduled. See MESOS-8463. |
| LOG(INFO) << "Removing old disconnected agent " << *slave |
| << " because a registration attempt occurred"; |
| |
| removeSlave(slave, |
| "a new agent registered at the same address", |
| metrics->slave_removals_reason_registered); |
| } else { |
| CHECK(slave->active) |
| << "Unexpected connected but deactivated agent " << *slave; |
| |
| LOG(INFO) << "Agent " << *slave << " already registered," |
| << " resending acknowledgement"; |
| |
| Duration pingTimeout = |
| flags.agent_ping_timeout * flags.max_agent_ping_timeouts; |
| MasterSlaveConnection connection; |
| connection.set_total_ping_timeout_seconds(pingTimeout.secs()); |
| |
| SlaveRegisteredMessage message; |
| message.mutable_slave_id()->CopyFrom(slave->id); |
| message.mutable_connection()->CopyFrom(connection); |
| send(pid, message); |
| |
| slaves.registering.erase(pid); |
| return; |
| } |
| } |
| |
| // Create and add the slave id. |
| SlaveID slaveId = newSlaveId(); |
| |
| LOG(INFO) << "Registering agent at " << pid << " (" |
| << slaveInfo.hostname() << ") with id " << slaveId; |
| |
| SlaveInfo slaveInfo_ = slaveInfo; |
| slaveInfo_.mutable_id()->CopyFrom(slaveId); |
| |
| registerSlaveMessage.mutable_slave()->mutable_id()->CopyFrom(slaveId); |
| |
| registrar->apply(Owned<RegistryOperation>(new AdmitSlave(slaveInfo_))) |
| .onAny(defer(self(), |
| &Self::__registerSlave, |
| pid, |
| std::move(registerSlaveMessage), |
| lambda::_1)); |
| } |
| |
| |
| void Master::__registerSlave( |
| const UPID& pid, |
| RegisterSlaveMessage&& registerSlaveMessage, |
| const Future<bool>& admit) |
| { |
| CHECK(slaves.registering.contains(pid)); |
| |
| CHECK(!admit.isDiscarded()); |
| |
| const SlaveInfo& slaveInfo = registerSlaveMessage.slave(); |
| |
| if (admit.isFailed()) { |
| LOG(FATAL) << "Failed to admit agent " << slaveInfo.id() << " at " << pid |
| << " (" << slaveInfo.hostname() << "): " << admit.failure(); |
| } |
| |
| if (!admit.get()) { |
| // This should only happen if there is a slaveID collision, but that |
| // is extremely unlikely in practice: slaveIDs are prefixed with the |
| // master ID, which is a randomly generated UUID. In this situation, |
| // we ignore the registration attempt. The slave will eventually try |
| // to register again and be assigned a new slave ID. |
| LOG(WARNING) << "Agent " << slaveInfo.id() << " at " << pid |
| << " (" << slaveInfo.hostname() << ") was assigned" |
| << " an agent ID that already appears in the registry;" |
| << " ignoring registration attempt"; |
| |
| slaves.registering.erase(pid); |
| return; |
| } |
| |
| VLOG(1) << "Admitted agent " << slaveInfo.id() << " at " << pid |
| << " (" << slaveInfo.hostname() << ")"; |
| |
| MachineID machineId; |
| machineId.set_hostname(slaveInfo.hostname()); |
| machineId.set_ip(stringify(pid.address.ip)); |
| |
| vector<SlaveInfo::Capability> agentCapabilities = google::protobuf::convert( |
| std::move(*registerSlaveMessage.mutable_agent_capabilities())); |
| vector<Resource> checkpointedResources = google::protobuf::convert( |
| std::move(*registerSlaveMessage.mutable_checkpointed_resources())); |
| |
| Option<UUID> resourceVersion; |
| if (registerSlaveMessage.has_resource_version_uuid()) { |
| resourceVersion = registerSlaveMessage.resource_version_uuid(); |
| } |
| |
| Slave* slave = new Slave( |
| this, |
| slaveInfo, |
| pid, |
| machineId, |
| registerSlaveMessage.version(), |
| std::move(agentCapabilities), |
| Clock::now(), |
| std::move(checkpointedResources), |
| resourceVersion); |
| |
| ++metrics->slave_registrations; |
| |
| addSlave(slave, {}); |
| |
| Duration pingTimeout = |
| flags.agent_ping_timeout * flags.max_agent_ping_timeouts; |
| MasterSlaveConnection connection; |
| connection.set_total_ping_timeout_seconds(pingTimeout.secs()); |
| |
| SlaveRegisteredMessage message; |
| message.mutable_slave_id()->CopyFrom(slave->id); |
| message.mutable_connection()->CopyFrom(connection); |
| send(slave->pid, message); |
| |
| // Note that we convert to `Resources` for output as it's faster than |
| // logging raw protobuf data. Conversion is safe, as resources have |
| // already passed validation. |
| LOG(INFO) << "Registered agent " << *slave |
| << " with " << Resources(slave->info.resources()); |
| |
| slaves.registering.erase(pid); |
| } |
| |
| |
| void Master::reregisterSlave( |
| const UPID& from, |
| ReregisterSlaveMessage&& reregisterSlaveMessage) |
| { |
| ++metrics->messages_reregister_slave; |
| |
| if (authenticating.contains(from)) { |
| LOG(INFO) << "Queuing up re-registration request from " << from |
| << " because authentication is still in progress"; |
| |
| authenticating[from] |
| .onReady(defer(self(), |
| &Self::reregisterSlave, |
| from, |
| std::move(reregisterSlaveMessage))); |
| return; |
| } |
| |
| if (flags.authenticate_agents && !authenticated.contains(from)) { |
| // This could happen if another authentication request came |
| // through before we are here or if a slave tried to |
| // reregister without authentication. |
| LOG(WARNING) << "Refusing re-registration of agent at " << from |
| << " because it is not authenticated"; |
| return; |
| } |
| |
| // TODO(bevers): Technically this behaviour seems to be incorrect, since we |
| // discard the newer re-registration attempt, which might have additional |
| // capabilities or a higher version (or a changed SlaveInfo, after Mesos 1.5). |
| // However, this should very rarely happen in practice, and nobody seems to |
| // have complained about it so far. |
| const SlaveInfo& slaveInfo = reregisterSlaveMessage.slave(); |
| if (slaves.reregistering.contains(slaveInfo.id())) { |
| LOG(INFO) |
| << "Ignoring reregister agent message from agent " |
| << slaveInfo.id() << " at " << from << " (" |
| << slaveInfo.hostname() << ") as re-registration is already in progress"; |
| |
| return; |
| } |
| |
| if (slaves.markingGone.contains(slaveInfo.id())) { |
| LOG(INFO) |
| << "Ignoring reregister agent message from agent " |
| << slaveInfo.id() << " at " << from << " (" |
| << slaveInfo.hostname() << ") as a gone operation is already in progress"; |
| |
| return; |
| } |
| |
| if (slaves.gone.contains(slaveInfo.id())) { |
| LOG(WARNING) << "Refusing re-registration of agent at " << from |
| << " because it is already marked gone"; |
| |
| ShutdownMessage message; |
| message.set_message("Agent has been marked gone"); |
| send(from, message); |
| return; |
| } |
| |
| Option<Error> error = |
| validation::master::message::reregisterSlave(reregisterSlaveMessage); |
| |
| if (error.isSome()) { |
| LOG(WARNING) << "Dropping re-registration of agent at " << from |
| << " because it sent an invalid re-registration: " |
| << error->message; |
| |
| return; |
| } |
| |
| LOG(INFO) << "Received reregister agent message from agent " |
| << slaveInfo.id() << " at " << from << " (" |
| << slaveInfo.hostname() << ")"; |
| |
| // TODO(bevers): Create a guard object calling `insert()` in its constructor |
| // and `erase()` in its destructor, to avoid the manual bookkeeping. |
| slaves.reregistering.insert(slaveInfo.id()); |
| |
| // Update all resources passed by the agent to `POST_RESERVATION_REFINEMENT` |
| // format. We do this as early as possible so that we only use a single |
| // format inside master, and downgrade again if necessary when they leave the |
| // master (e.g. when writing to the registry). |
| upgradeResources(&reregisterSlaveMessage); |
| |
| // Note that the principal may be empty if authentication is not |
| // required. Also it is passed along because it may be removed from |
| // `authenticated` while the authorization is pending. |
| Option<Principal> principal = authenticated.contains(from) |
| ? Principal(authenticated.at(from)) |
| : Option<Principal>::none(); |
| |
| // Calling the `onAny` continuation below separately so we can move |
| // `reregisterSlaveMessage` without it being evaluated before it's used |
| // by `authorizeSlave`. |
| Future<bool> authorization = |
| authorizeSlave(reregisterSlaveMessage.slave(), principal); |
| |
| authorization |
| .onAny(defer(self(), |
| &Self::_reregisterSlave, |
| from, |
| std::move(reregisterSlaveMessage), |
| principal, |
| lambda::_1)); |
| } |
| |
| |
| void Master::_reregisterSlave( |
| const UPID& pid, |
| ReregisterSlaveMessage&& reregisterSlaveMessage, |
| const Option<Principal>& principal, |
| const Future<bool>& authorized) |
| { |
| CHECK(!authorized.isDiscarded()); |
| |
| const SlaveInfo& slaveInfo = reregisterSlaveMessage.slave(); |
| CHECK(slaves.reregistering.contains(slaveInfo.id())); |
| |
| Option<string> authorizationError = None(); |
| |
| if (authorized.isFailed()) { |
| authorizationError = "Authorization failure: " + authorized.failure(); |
| } else if (!authorized.get()) { |
| authorizationError = |
| "Not authorized to reregister agent providing resources " |
| "'" + stringify(Resources(slaveInfo.resources())) + "' " + |
| (principal.isSome() |
| ? "with principal '" + stringify(principal.get()) + "'" |
| : "without a principal"); |
| } |
| |
| if (authorizationError.isSome()) { |
| LOG(WARNING) << "Refusing re-registration of agent " << slaveInfo.id() |
| << " at " << pid << " (" << slaveInfo.hostname() << ")" |
| << ": " << authorizationError.get(); |
| |
| slaves.reregistering.erase(slaveInfo.id()); |
| return; |
| } |
| |
| if (slaves.markingGone.contains(slaveInfo.id())) { |
| LOG(INFO) |
| << "Ignoring reregister agent message from agent " |
| << slaveInfo.id() << " at " << pid << " (" |
| << slaveInfo.hostname() << ") as a gone operation is already in progress"; |
| |
| slaves.reregistering.erase(slaveInfo.id()); |
| return; |
| } |
| |
| if (slaves.gone.contains(slaveInfo.id())) { |
| LOG(WARNING) << "Refusing re-registration of agent at " << pid |
| << " because it is already marked gone"; |
| |
| ShutdownMessage message; |
| message.set_message("Agent has been marked gone"); |
| send(pid, message); |
| |
| slaves.reregistering.erase(slaveInfo.id()); |
| return; |
| } |
| |
| VLOG(1) << "Authorized re-registration of agent " << slaveInfo.id() |
| << " at " << pid << " (" << slaveInfo.hostname() << ")"; |
| |
| MachineID machineId; |
| machineId.set_hostname(slaveInfo.hostname()); |
| machineId.set_ip(stringify(pid.address.ip)); |
| |
| // Slaves are not allowed to reregister while the machine they are on is in |
| // 'DOWN` mode. |
| if (machines.contains(machineId) && |
| machines[machineId].info.mode() == MachineInfo::DOWN) { |
| LOG(WARNING) << "Refusing re-registration of agent at " << pid |
| << " because the machine '" << machineId << "' that it is " |
| << "running on is `DOWN`"; |
| |
| ShutdownMessage message; |
| message.set_message("Machine is `DOWN`"); |
| send(pid, message); |
| |
| slaves.reregistering.erase(slaveInfo.id()); |
| return; |
| } |
| |
| // Ignore re-registration attempts by agents running old Mesos versions. |
| // We expect that the agent's version is in SemVer format; if the |
| // version cannot be parsed, the re-registration attempt is ignored. |
| const string& version = reregisterSlaveMessage.version(); |
| Try<Version> parsedVersion = Version::parse(version); |
| |
| if (parsedVersion.isError()) { |
| LOG(WARNING) << "Failed to parse version '" << version << "'" |
| << " of agent at " << pid << ": " << parsedVersion.error() |
| << "; ignoring agent re-registration attempt"; |
| |
| slaves.reregistering.erase(slaveInfo.id()); |
| return; |
| } else if (parsedVersion.get() < MINIMUM_AGENT_VERSION) { |
| LOG(WARNING) << "Ignoring re-registration attempt from old agent at " |
| << pid << ": agent version is " << parsedVersion.get() |
| << ", minimum supported agent version is " |
| << MINIMUM_AGENT_VERSION; |
| |
| slaves.reregistering.erase(slaveInfo.id()); |
| return; |
| } |
| |
| // If the agent is configured with a domain but the master is not, |
| // we can't determine whether the agent is remote. To be safe, we |
| // don't allow the agent to reregister. We don't shutdown the agent |
| // so that any tasks on the agent can continue to run. |
| // |
| // TODO(neilc): Consider sending a warning to agent (MESOS-7615). |
| if (slaveInfo.has_domain() && !info_.has_domain()) { |
| LOG(WARNING) << "Agent at " << pid << " is configured with " |
| << "domain " << slaveInfo.domain() << " " |
| << "but the master has no configured domain." |
| << "Ignoring agent re-registration attempt"; |
| |
| slaves.reregistering.erase(slaveInfo.id()); |
| return; |
| } |
| |
| // Don't allow agents without domain if domains are required. |
| // We don't shutdown the agent to allow it to restart itself with |
| // the correct domain and without losing tasks. |
| if (flags.require_agent_domain && !slaveInfo.has_domain()) { |
| LOG(WARNING) << "Agent at " << pid << " attempted to register without " |
| << "a domain, but this master is configured to require agent " |
| << "domains. Ignoring agent re-registration attempt"; |
| |
| slaves.reregistering.erase(slaveInfo.id()); |
| return; |
| } |
| |
| if (Slave* slave = slaves.registered.get(slaveInfo.id())) { |
| CHECK(!slaves.recovered.contains(slaveInfo.id())); |
| |
| // NOTE: This handles the case where a slave tries to |
| // reregister with an existing master (e.g. because of a |
| // spurious Zookeeper session expiration or after the slave |
| // recovers after a restart). |
| // For now, we assume this slave is not nefarious (eventually |
| // this will be handled by orthogonal security measures like key |
| // based authentication). |
| VLOG(1) << "Agent is already marked as registered: " << slaveInfo.id() |
| << " at " << pid << " (" << slaveInfo.hostname() << ")"; |
| |
| // We don't allow reregistering this way with a different IP or |
| // hostname. This is because maintenance is scheduled at the |
| // machine level; so we would need to re-validate the slave's |
| // unavailability if the machine it is running on changed. |
| if (slave->pid.address.ip != pid.address.ip || |
| slave->info.hostname() != slaveInfo.hostname()) { |
| LOG(WARNING) << "Agent " << slaveInfo.id() << " at " << pid |
| << " (" << slaveInfo.hostname() << ") attempted to " |
| << "reregister with different IP / hostname; expected " |
| << slave->pid.address.ip << " (" << slave->info.hostname() |
| << ") shutting it down"; |
| |
| ShutdownMessage message; |
| message.set_message( |
| "Agent attempted to reregister with different IP / hostname"); |
| |
| send(pid, message); |
| |
| slaves.reregistering.erase(slaveInfo.id()); |
| return; |
| } |
| |
| // Skip updating the registry if `slaveInfo` did not change from its |
| // previously known state. |
| if (slaveInfo == slave->info) { |
| ___reregisterSlave( |
| pid, |
| std::move(reregisterSlaveMessage), |
| true); |
| } else { |
| registrar->apply(Owned<RegistryOperation>(new UpdateSlave(slaveInfo))) |
| .onAny(defer(self(), |
| &Self::___reregisterSlave, |
| pid, |
| std::move(reregisterSlaveMessage), |
| lambda::_1)); |
| } |
| } else if (slaves.recovered.contains(slaveInfo.id())) { |
| // The agent likely is reregistering after a master failover as it |
| // is in the list recovered from the registry. |
| VLOG(1) << "Re-admitting recovered agent " << slaveInfo.id() |
| << " at " << pid << "(" << slaveInfo.hostname() << ")"; |
| |
| SlaveInfo recoveredInfo = slaves.recovered.at(slaveInfo.id()); |
| |
| // Skip updating the registry if `slaveInfo` did not change from its |
| // previously known state (see also MESOS-7711). |
| if (slaveInfo == recoveredInfo) { |
| __reregisterSlave( |
| pid, |
| std::move(reregisterSlaveMessage), |
| true); |
| } else { |
| registrar->apply(Owned<RegistryOperation>(new UpdateSlave(slaveInfo))) |
| .onAny(defer(self(), |
| &Self::__reregisterSlave, |
| pid, |
| std::move(reregisterSlaveMessage), |
| lambda::_1)); |
| } |
| } else { |
| // In the common case, the slave has been marked unreachable |
| // by the master, so we move the slave to the reachable list and |
| // readmit it. If the slave isn't in the unreachable list (which |
| // might occur if the slave's entry in the unreachable list is |
| // GC'd), we admit the slave anyway. |
| VLOG(1) << "Consulting registry about agent " << slaveInfo.id() |
| << " at " << pid << "(" << slaveInfo.hostname() << ")"; |
| |
| registrar->apply(Owned<RegistryOperation>( |
| new MarkSlaveReachable(slaveInfo))) |
| .onAny(defer(self(), |
| &Self::__reregisterSlave, |
| pid, |
| std::move(reregisterSlaveMessage), |
| lambda::_1)); |
| } |
| } |
| |
| |
| void Master::__reregisterSlave( |
| const UPID& pid, |
| ReregisterSlaveMessage&& reregisterSlaveMessage, |
| const Future<bool>& future) |
| { |
| const SlaveInfo& slaveInfo = reregisterSlaveMessage.slave(); |
| CHECK(slaves.reregistering.contains(slaveInfo.id())); |
| |
| if (future.isFailed()) { |
| LOG(FATAL) << "Failed to update registry for agent " << slaveInfo.id() |
| << " at " << pid << " (" << slaveInfo.hostname() << "): " |
| << future.failure(); |
| } |
| |
| CHECK(!future.isDiscarded()); |
| |
| // Neither the `UpdateSlave` nor `MarkSlaveReachable` registry operations |
| // should ever fail. |
| CHECK(future.get()); |
| |
| if (slaves.markingGone.contains(slaveInfo.id())) { |
| LOG(INFO) |
| << "Ignoring reregister agent message from agent " |
| << slaveInfo.id() << " at " << pid << " (" |
| << slaveInfo.hostname() << ") as a gone operation is already in progress"; |
| |
| slaves.reregistering.erase(slaveInfo.id()); |
| return; |
| } |
| |
| if (slaves.gone.contains(slaveInfo.id())) { |
| LOG(WARNING) << "Refusing re-registration of agent at " << pid |
| << " because it is already marked gone"; |
| |
| ShutdownMessage message; |
| message.set_message("Agent has been marked gone"); |
| send(pid, message); |
| |
| slaves.reregistering.erase(slaveInfo.id()); |
| return; |
| } |
| |
| VLOG(1) << "Re-admitted agent " << slaveInfo.id() << " at " << pid |
| << " (" << slaveInfo.hostname() << ")"; |
| |
| // For agents without the MULTI_ROLE capability, |
| // we need to inject the allocation role inside |
| // the task and executor resources; |
| auto injectAllocationInfo = []( |
| RepeatedPtrField<Resource>* resources, |
| const FrameworkInfo& frameworkInfo) |
| { |
| set<string> roles = protobuf::framework::getRoles(frameworkInfo); |
| |
| foreach (Resource& resource, *resources) { |
| if (!resource.has_allocation_info()) { |
| if (roles.size() != 1) { |
| LOG(FATAL) << "Missing 'Resource.AllocationInfo' for resources" |
| << " allocated to MULTI_ROLE framework" |
| << " '" << frameworkInfo.name() << "'"; |
| } |
| |
| resource.mutable_allocation_info()->set_role(*roles.begin()); |
| } |
| } |
| }; |
| |
| vector<SlaveInfo::Capability> agentCapabilities = |
| google::protobuf::convert(reregisterSlaveMessage.agent_capabilities()); |
| |
| // Adjust the agent's task and executor infos to ensure |
| // compatibility with old agents without certain capabilities. |
| protobuf::slave::Capabilities slaveCapabilities(agentCapabilities); |
| |
| // If the agent is not multi-role capable, inject allocation info. |
| if (!slaveCapabilities.multiRole) { |
| hashmap<FrameworkID, reference_wrapper<const FrameworkInfo>> frameworks; |
| |
| foreach (const FrameworkInfo& framework, |
| reregisterSlaveMessage.frameworks()) { |
| frameworks.emplace(framework.id(), framework); |
| } |
| |
| foreach (Task& task, *reregisterSlaveMessage.mutable_tasks()) { |
| CHECK(frameworks.contains(task.framework_id())); |
| |
| injectAllocationInfo( |
| task.mutable_resources(), |
| frameworks.at(task.framework_id())); |
| } |
| |
| foreach (ExecutorInfo& executor, |
| *reregisterSlaveMessage.mutable_executor_infos()) { |
| CHECK(frameworks.contains(executor.framework_id())); |
| |
| injectAllocationInfo( |
| executor.mutable_resources(), |
| frameworks.at(executor.framework_id())); |
| } |
| } |
| |
| MachineID machineId; |
| machineId.set_hostname(slaveInfo.hostname()); |
| machineId.set_ip(stringify(pid.address.ip)); |
| |
| // For easy lookup, first determine the set of FrameworkIDs on the |
| // reregistering agent that are partition-aware. |
| hashset<FrameworkID> partitionAwareFrameworks; |
| |
| foreach (const FrameworkInfo& framework, |
| reregisterSlaveMessage.frameworks()) { |
| if (protobuf::frameworkHasCapability( |
| framework, FrameworkInfo::Capability::PARTITION_AWARE)) { |
| partitionAwareFrameworks.insert(framework.id()); |
| } |
| } |
| |
| // All tasks except the ones from completed frameworks are re-added to the |
| // master (those tasks were previously marked "unreachable", so they |
| // should be removed from that collection). |
| vector<Task> recoveredTasks; |
| foreach (Task& task, *reregisterSlaveMessage.mutable_tasks()) { |
| const FrameworkID& frameworkId = task.framework_id(); |
| |
| // Don't re-add tasks whose framework has been shutdown at the |
| // master. Such frameworks will be shutdown on the agent below. |
| if (isCompletedFramework(frameworkId)) { |
| continue; |
| } |
| |
| if (!slaves.recovered.contains(slaveInfo.id())) { |
| Framework* framework = getFramework(frameworkId); |
| if (framework != nullptr) { |
| framework->unreachableTasks.erase(task.task_id()); |
| |
| // The master transitions task to terminal state on its own in certain |
| // scenarios (e.g., framework or agent teardown) before instructing the |
| // agent to remove it. However, we are not guaranteed that the message |
| // reaches the agent and is processed by it. If the agent fails to act |
| // on the message, tasks the master has declared terminal might reappear |
| // from the agent as non-terminal, see e.g., MESOS-9940. |
| // |
| // Avoid tracking a task as both terminal and non-terminal by |
| // garbage-collected completed tasks which come back as running. |
| framework->completedTasks.erase( |
| std::remove_if( |
| framework->completedTasks.begin(), |
| framework->completedTasks.end(), |
| [&](const Owned<Task>& task_) { |
| return task_.get() && task_->task_id() == task.task_id(); |
| }), |
| framework->completedTasks.end()); |
| } |
| |
| const string message = slaves.unreachable.contains(slaveInfo.id()) |
| ? "Unreachable agent re-reregistered" |
| : "Unknown agent reregistered"; |
| |
| const StatusUpdate& update = protobuf::createStatusUpdate( |
| task.framework_id(), |
| task.slave_id(), |
| task.task_id(), |
| task.state(), |
| TaskStatus::SOURCE_MASTER, |
| None(), |
| message, |
| TaskStatus::REASON_SLAVE_REREGISTERED, |
| (task.has_executor_id() |
| ? Option<ExecutorID>(task.executor_id()) : None()), |
| protobuf::getTaskHealth(task), |
| protobuf::getTaskCheckStatus(task), |
| None(), |
| protobuf::getTaskContainerStatus(task)); |
| |
| if (framework == nullptr || !framework->connected()) { |
| LOG(WARNING) << "Dropping update " << update |
| << (update.status().has_message() |
| ? " '" + update.status().message() + "'" |
| : "") |
| << " for " |
| << (framework == nullptr ? "unknown" : "disconnected") |
| << " framework " << frameworkId; |
| } else { |
| forward(update, UPID(), framework); |
| } |
| } |
| |
| recoveredTasks.push_back(std::move(task)); |
| } |
| |
| // All tasks from this agent are now reachable so clean them up from |
| // the master's unreachable task records. |
| if (slaves.unreachableTasks.contains(slaveInfo.id())) { |
| foreachkey (const FrameworkID& frameworkId, |
| slaves.unreachableTasks.at(slaveInfo.id())) { |
| Framework* framework = getFramework(frameworkId); |
| if (framework != nullptr) { |
| foreach (const TaskID& taskId, |
| slaves.unreachableTasks.at(slaveInfo.id()).at(frameworkId)) { |
| framework->unreachableTasks.erase(taskId); |
| } |
| } |
| } |
| } |
| |
| slaves.unreachableTasks.erase(slaveInfo.id()); |
| |
| vector<Resource> checkpointedResources = google::protobuf::convert( |
| std::move(*reregisterSlaveMessage.mutable_checkpointed_resources())); |
| vector<ExecutorInfo> executorInfos = google::protobuf::convert( |
| std::move(*reregisterSlaveMessage.mutable_executor_infos())); |
| |
| Option<UUID> resourceVersion; |
| if (reregisterSlaveMessage.has_resource_version_uuid()) { |
| resourceVersion = reregisterSlaveMessage.resource_version_uuid(); |
| } |
| |
| slaves.recovered.erase(slaveInfo.id()); |
| |
| Slave* slave = new Slave( |
| this, |
| slaveInfo, |
| pid, |
| machineId, |
| reregisterSlaveMessage.version(), |
| std::move(agentCapabilities), |
| Clock::now(), |
| std::move(checkpointedResources), |
| resourceVersion, |
| std::move(executorInfos), |
| std::move(recoveredTasks)); |
| |
| slave->reregisteredTime = Clock::now(); |
| |
| ++metrics->slave_reregistrations; |
| |
| slaves.removed.erase(slave->id); |
| slaves.unreachable.erase(slave->id); |
| |
| vector<Archive::Framework> completedFrameworks = google::protobuf::convert( |
| std::move(*reregisterSlaveMessage.mutable_completed_frameworks())); |
| |
| addSlave(slave, std::move(completedFrameworks)); |
| |
| Duration pingTimeout = |
| flags.agent_ping_timeout * flags.max_agent_ping_timeouts; |
| MasterSlaveConnection connection; |
| connection.set_total_ping_timeout_seconds(pingTimeout.secs()); |
| |
| SlaveReregisteredMessage message; |
| message.mutable_slave_id()->CopyFrom(slave->id); |
| message.mutable_connection()->CopyFrom(connection); |
| send(slave->pid, message); |
| |
| // Note that we convert to `Resources` for output as it's faster than |
| // logging raw protobuf data. Conversion is safe, as resources have |
| // already passed validation. |
| LOG(INFO) << "Re-registered agent " << *slave |
| << " with " << Resources(slave->info.resources()); |
| |
| // Any framework that is completed at the master but still running |
| // at the slave is shutdown. This can occur if the framework was |
| // removed when the slave was partitioned. NOTE: This is just a |
| // short-term hack because information about completed frameworks is |
| // lost when the master fails over. Also, we only store a limited |
| // number of completed frameworks. A proper fix likely involves |
| // storing framework information in the registry (MESOS-1719). |
| foreach (const FrameworkInfo& framework, |
| reregisterSlaveMessage.frameworks()) { |
| if (isCompletedFramework(framework.id())) { |
| LOG(INFO) << "Shutting down framework " << framework.id() |
| << " at reregistered agent " << *slave |
| << " because the framework has been shutdown at the master"; |
| |
| ShutdownFrameworkMessage message; |
| message.mutable_framework_id()->MergeFrom(framework.id()); |
| send(slave->pid, message); |
| } |
| } |
| |
| // TODO(bmahler): Consider moving this in to `updateSlaveFrameworks`, |
| // would be helpful when there are a large total number of frameworks |
| // in the cluster. |
| const vector<FrameworkInfo> frameworks = google::protobuf::convert( |
| std::move(*reregisterSlaveMessage.mutable_frameworks())); |
| |
| updateSlaveFrameworks(slave, frameworks); |
| |
| slaves.reregistering.erase(slaveInfo.id()); |
| } |
| |
| |
| void Master::___reregisterSlave( |
| const process::UPID& pid, |
| ReregisterSlaveMessage&& reregisterSlaveMessage, |
| const process::Future<bool>& updated) |
| { |
| const SlaveInfo& slaveInfo = reregisterSlaveMessage.slave(); |
| CHECK(slaves.reregistering.contains(slaveInfo.id())); |
| |
| CHECK_READY(updated); |
| CHECK(updated.get()); |
| |
| VLOG(1) << "Registry updated for slave " << slaveInfo.id() << " at " << pid |
| << "(" << slaveInfo.hostname() << ")"; |
| |
| if (slaves.markingGone.contains(slaveInfo.id())) { |
| LOG(INFO) |
| << "Ignoring reregister agent message from agent " |
| << slaveInfo.id() << " at " << pid << " (" |
| << slaveInfo.hostname() << ") as a gone operation is already in progress"; |
| |
| slaves.reregistering.erase(slaveInfo.id()); |
| return; |
| } |
| |
| if (slaves.gone.contains(slaveInfo.id())) { |
| LOG(WARNING) << "Refusing re-registration of agent at " << pid |
| << " because it is already marked gone"; |
| |
| ShutdownMessage message; |
| message.set_message("Agent has been marked gone"); |
| send(pid, message); |
| |
| slaves.reregistering.erase(slaveInfo.id()); |
| return; |
| } |
| |
| if (!slaves.registered.contains(slaveInfo.id())) { |
| LOG(WARNING) |
| << "Dropping ongoing re-registration attempt of slave " << slaveInfo.id() |
| << " at " << pid << "(" << slaveInfo.hostname() << ") " |
| << "because the re-registration timeout was reached."; |
| |
| slaves.reregistering.erase(slaveInfo.id()); |
| // Don't send a ShutdownMessage here because tasks from partition-aware |
| // frameworks running on this host might still be recovered when the slave |
| // retries the re-registration. |
| return; |
| } |
| |
| Slave* slave = slaves.registered.get(slaveInfo.id()); |
| |
| // Update the slave pid and relink to it. |
| // NOTE: Re-linking the slave here always rather than only when |
| // the slave is disconnected can lead to multiple exited events |
| // in succession for a disconnected slave. As a result, we |
| // ignore duplicate exited events for disconnected slaves. |
| // See: https://issues.apache.org/jira/browse/MESOS-675 |
| slave->pid = pid; |
| link(slave->pid); |
| |
| const string& version = reregisterSlaveMessage.version(); |
| const vector<SlaveInfo::Capability> agentCapabilities = |
| google::protobuf::convert(reregisterSlaveMessage.agent_capabilities()); |
| |
| Option<UUID> resourceVersion; |
| if (reregisterSlaveMessage.has_resource_version_uuid()) { |
| resourceVersion = reregisterSlaveMessage.resource_version_uuid(); |
| } |
| |
| // Update our view of checkpointed agent resources for resource |
| // provider-capable agents; for other agents the master will resend |
| // checkpointed resources after reregistration. |
| const Resources checkpointedResources = |
| slave->capabilities.resourceProvider |
| ? Resources(reregisterSlaveMessage.checkpointed_resources()) |
| : slave->checkpointedResources; |
| |
| Try<Nothing> stateUpdated = slave->update( |
| slaveInfo, |
| version, |
| agentCapabilities, |
| checkpointedResources, |
| resourceVersion); |
| |
| // As of now, the only way `slave->update()` can fail is if the agent sent |
| // different checkpointed resources than it had before. A well-behaving |
| // agent shouldn't do this, so this one is either malicious or buggy. Either |
| // way, we refuse the re-registration attempt. |
| if (stateUpdated.isError()) { |
| LOG(WARNING) << "Refusing re-registration of agent " << slaveInfo.id() |
| << " at " << pid << " (" << slaveInfo.hostname() << ")" |
| << " because state update failed: " << stateUpdated.error(); |
| |
| ShutdownMessage message; |
| message.set_message(stateUpdated.error()); |
| send(pid, message); |
| |
| slaves.reregistering.erase(slaveInfo.id()); |
| return; |
| } |
| |
| slave->reregisteredTime = Clock::now(); |
| |
| allocator->updateSlave( |
| slave->id, |
| slave->info, |
| slave->totalResources, |
| agentCapabilities); |
| |
| const vector<ExecutorInfo> executorInfos = |
| google::protobuf::convert(reregisterSlaveMessage.executor_infos()); |
| const vector<Task> tasks = |
| google::protobuf::convert(reregisterSlaveMessage.tasks()); |
| const vector<FrameworkInfo> frameworks = |
| google::protobuf::convert(reregisterSlaveMessage.frameworks()); |
| |
| // Reconcile tasks between master and slave, and send the |
| // `SlaveReregisteredMessage`. |
| reconcileKnownSlave(slave, executorInfos, tasks); |
| |
| // If this is a disconnected slave, add it back to the allocator. |
| // This is done after reconciliation to ensure the allocator's |
| // offers include the recovered resources initially on this |
| // slave. |
| if (!slave->connected) { |
| CHECK(slave->reregistrationTimer.isSome()); |
| Clock::cancel(slave->reregistrationTimer.get()); |
| |
| slave->connected = true; |
| dispatch(slave->observer, &SlaveObserver::reconnect); |
| |
| slave->active = true; |
| allocator->activateSlave(slave->id); |
| } |
| |
| CHECK(slave->active) |
| << "Unexpected connected but deactivated agent " << *slave; |
| |
| // Inform the agent of the new framework pids for its tasks, and |
| // recover any unknown frameworks from the slave info. |
| updateSlaveFrameworks(slave, frameworks); |
| |
| slaves.reregistering.erase(slaveInfo.id()); |
| |
| // If the agent is not resource provider capable (legacy agent), |
| // send checkpointed resources to the agent. This is important for |
| // the cases where the master didn't fail over. In that case, the |
| // master might have already applied an operation that the agent |
| // didn't see (e.g., due to a breaking connection). This message |
| // will sync the state between the master and the agent about |
| // checkpointed resources. |
| // |
| // New agents that are resource provider capable will always |
| // update the master with total resources during re-registration. |
| // Therefore, no need to send checkpointed resources to the new |
| // agent in this case. |
| if (!slave->capabilities.resourceProvider) { |
| CheckpointResourcesMessage message; |
| |
| message.mutable_resources()->CopyFrom(slave->checkpointedResources); |
| |
| if (!slave->capabilities.reservationRefinement) { |
| // If the agent is not refinement-capable, don't send it |
| // checkpointed resources that contain refined reservations. This |
| // might occur if a reservation refinement is created but never |
| // reaches the agent (e.g., due to network partition), and then |
| // the agent is downgraded before the partition heals. |
| // |
| // TODO(neilc): It would probably be better to prevent the agent |
| // from reregistering in this scenario. |
| Try<Nothing> result = downgradeResources(&message); |
| if (result.isError()) { |
| LOG(WARNING) << "Not sending updated checkpointed resources " |
| << slave->checkpointedResources |
| << " with refined reservations, since agent " << *slave |
| << " is not RESERVATION_REFINEMENT-capable."; |
| |
| return; |
| } |
| } |
| |
| LOG(INFO) << "Sending updated checkpointed resources " |
| << slave->checkpointedResources |
| << " to agent " << *slave; |
| |
| send(slave->pid, message); |
| } |
| } |
| |
| |
| void Master::updateSlaveFrameworks( |
| Slave* slave, |
| const vector<FrameworkInfo>& frameworks) |
| { |
| CHECK_NOTNULL(slave); |
| |
| // Send the latest framework pids to the slave. |
| foreach (const FrameworkInfo& frameworkInfo, frameworks) { |
| CHECK(frameworkInfo.has_id()); |
| Framework* framework = getFramework(frameworkInfo.id()); |
| |
| if (framework != nullptr) { |
| // TODO(bmahler): Copying the framework info here can be |
| // expensive, consider only sending this message when |
| // there has been a change vs what the agent reported. |
| UpdateFrameworkMessage message; |
| message.mutable_framework_id()->CopyFrom(framework->id()); |
| message.mutable_framework_info()->CopyFrom(framework->info); |
| |
| // TODO(anand): We set 'pid' to UPID() for http frameworks |
| // as 'pid' was made optional in 0.24.0. In 0.25.0, we |
| // no longer have to set pid here for http frameworks. |
| message.set_pid(framework->pid.getOrElse(UPID())); |
| |
| send(slave->pid, message); |
| } else { |
| // The agent is running a framework that the master doesn't know |
| // about. Recover the framework using the `FrameworkInfo` |
| // supplied by the agent. |
| |
| // We skip recovering the framework if it has already been |
| // marked completed at the master. In this situation, the master |
| // has already told the agent to shutdown the framework in |
| // `__reregisterSlave`. |
| if (isCompletedFramework(frameworkInfo.id())) { |
| continue; |
| } |
| |
| LOG(INFO) << "Recovering framework " << frameworkInfo.id() |
| << " from reregistering agent " << *slave; |
| |
| recoverFramework(frameworkInfo, {}); |
| } |
| } |
| } |
| |
| |
| void Master::unregisterSlave(const UPID& from, const SlaveID& slaveId) |
| { |
| ++metrics->messages_unregister_slave; |
| |
| Slave* slave = slaves.registered.get(slaveId); |
| |
| if (slave == nullptr) { |
| LOG(WARNING) << "Ignoring unregister agent message from " << from |
| << " for unknown agent"; |
| |
| return; |
| } |
| |
| if (slave->pid != from) { |
| LOG(WARNING) << "Ignoring unregister agent message from " << from |
| << " because it is not the agent " << slave->pid; |
| |
| return; |
| } |
| |
| removeSlave(slave, |
| "the agent unregistered", |
| metrics->slave_removals_reason_unregistered); |
| } |
| |
| |
| void Master::updateFramework( |
| Framework* framework, |
| const FrameworkInfo& frameworkInfo, |
| const set<string>& suppressedRoles) |
| { |
| LOG(INFO) << "Updating framework " << *framework << " with roles " |
| << stringify(suppressedRoles) << " suppressed"; |
| |
| // NOTE: The allocator takes care of activating/deactivating |
| // the frameworks from the added/removed roles, respectively. |
| allocator->updateFramework(framework->id(), frameworkInfo, suppressedRoles); |
| |
| // First, remove the offers allocated to roles being removed. |
| foreach (Offer* offer, utils::copy(framework->offers)) { |
| set<string> newRoles = protobuf::framework::getRoles(frameworkInfo); |
| if (newRoles.count(offer->allocation_info().role()) > 0) { |
| continue; |
| } |
| |
| allocator->recoverResources( |
| offer->framework_id(), |
| offer->slave_id(), |
| offer->resources(), |
| None()); |
| |
| removeOffer(offer, true); // Rescind! |
| } |
| |
| framework->update(frameworkInfo); |
| } |
| |
| |
| void Master::updateSlave(UpdateSlaveMessage&& message) |
| { |
| ++metrics->messages_update_slave; |
| |
| upgradeResources(&message); |
| |
| const SlaveID& slaveId = message.slave_id(); |
| |
| if (slaves.removed.get(slaveId).isSome()) { |
| // If the slave has been removed, drop the status update. The |
| // master is no longer trying to health check this slave; when the |
| // slave realizes it hasn't received any pings from the master, it |
| // will eventually try to reregister. |
| LOG(WARNING) << "Ignoring update on removed agent " << slaveId; |
| return; |
| } |
| |
| Slave* slave = slaves.registered.get(slaveId); |
| |
| if (slave == nullptr) { |
| LOG(WARNING) << "Ignoring update on removed agent " << slaveId; |
| return; |
| } |
| |
| // NOTE: We must *first* update the agent's resources before we |
| // recover the resources. If we recovered the resources first, |
| // an allocation could trigger between recovering resources and |
| // updating the agent in the allocator. This would lead us to |
| // re-send out the stale oversubscribed resources! |
| |
| // If agent does not specify the `update_oversubscribed_resources` |
| // field, we assume we should set `oversubscribedResources` to be |
| // backwards-compatibility with older agents (version < 1.5). |
| const bool hasOversubscribed = |
| !message.has_update_oversubscribed_resources() || |
| message.update_oversubscribed_resources(); |
| |
| Option<Resources> newOversubscribed; |
| |
| if (hasOversubscribed) { |
| const Resources& oversubscribedResources = |
| message.oversubscribed_resources(); |
| |
| LOG(INFO) << "Received update of agent " << *slave << " with total" |
| << " oversubscribed resources " << oversubscribedResources; |
| |
| newOversubscribed = oversubscribedResources; |
| } |
| |
| Resources newResourceProviderResources; |
| if (message.has_resource_providers()) { |
| foreach ( |
| const UpdateSlaveMessage::ResourceProvider& resourceProvider, |
| message.resource_providers().providers()) { |
| newResourceProviderResources += resourceProvider.total_resources(); |
| } |
| } |
| |
| auto agentResources = [](const Resource& resource) { |
| return !resource.has_provider_id(); |
| }; |
| |
| const Resources newSlaveResources = |
| slave->totalResources.nonRevocable().filter(agentResources) + |
| newOversubscribed.getOrElse( |
| slave->totalResources.revocable().filter(agentResources)) + |
| newResourceProviderResources; |
| |
| // TODO(bbannier): We only need to update if any changes from |
| // resource providers are reported. |
| bool updated = slave->totalResources != newSlaveResources; |
| |
| // Check if the agent's resource version changed. |
| if (!updated && message.has_resource_version_uuid() && |
| (slave->resourceVersion.isNone() || |
| (slave->resourceVersion.isSome() && |
| message.resource_version_uuid() != slave->resourceVersion.get()))) { |
| updated = true; |
| } |
| |
| // Check if the known operations for this agent changed. |
| if (!updated) { |
| // Below we loop over all received operations and check whether |
| // they are known to the master; operations can be unknown to the |
| // master after a master failover. To handle dropped operations on |
| // agent failover we explicitly track the received operations and |
| // compare them against the operations known to the master. |
| hashset<UUID> receivedOperations; |
| |
| foreach (const Operation& operation, message.operations().operations()) { |
| if (!slave->operations.contains(operation.uuid())) { |
| updated = true; |
| break; |
| } |
| |
| if (*slave->operations.at(operation.uuid()) != operation) { |
| updated = true; |
| break; |
| } |
| |
| receivedOperations.insert(operation.uuid()); |
| } |
| |
| if (receivedOperations.size() != slave->operations.size()) { |
| updated = true; |
| } |
| } |
| |
| // Check if resource provider information changed. |
| if (!updated && message.has_resource_providers()) { |
| foreach ( |
| const UpdateSlaveMessage::ResourceProvider& receivedProvider, |
| message.resource_providers().providers()) { |
| CHECK(receivedProvider.has_info()); |
| CHECK(receivedProvider.info().has_id()); |
| |
| const ResourceProviderID& resourceProviderId = |
| receivedProvider.info().id(); |
| |
| if (!slave->resourceProviders.contains(resourceProviderId)) { |
| updated = true; |
| break; |
| } |
| |
| const Slave::ResourceProvider& storedProvider = |
| slave->resourceProviders.at(resourceProviderId); |
| |
| if (storedProvider.info != receivedProvider.info() || |
| storedProvider.totalResources != receivedProvider.total_resources() || |
| storedProvider.resourceVersion != |
| receivedProvider.resource_version_uuid()) { |
| updated = true; |
| break; |
| } |
| |
| foreach ( |
| const Operation& operation, |
| receivedProvider.operations().operations()) { |
| if (!storedProvider.operations.contains(operation.uuid())) { |
| updated = true; |
| break; |
| } |
| |
| if (*storedProvider.operations.at(operation.uuid()) != operation) { |
| updated = true; |
| break; |
| } |
| } |
| } |
| } |
| |
| if (!updated) { |
| LOG(INFO) << "Ignoring update on agent " << *slave |
| << " as it reports no changes"; |
| |
| return; |
| } |
| |
| // Check invariants of the received update. |
| { |
| foreach ( |
| const UpdateSlaveMessage::ResourceProvider& resourceProvider, |
| message.resource_providers().providers()) { |
| CHECK(resourceProvider.has_info()); |
| CHECK(resourceProvider.info().has_id()); |
| const ResourceProviderID& providerId = resourceProvider.info().id(); |
| |
| const Option<Slave::ResourceProvider>& oldProvider = |
| slave->resourceProviders.get(providerId); |
| |
| if (oldProvider.isSome()) { |
| // For known resource providers the master should always know at least |
| // as many non-terminal operations as the agent. While an |
| // operation might get lost on the way to the agent or resource |
| // provider, or become terminal inside the agent, the master would never |
| // make an operation known to the agent terminal without the agent |
| // doing that first. |
| // |
| // NOTE: We only consider non-terminal operations here as there is an |
| // edge case where the master removes a terminal operation from |
| // its own state when it passes on an acknowledgement from a framework |
| // to the agent, but the agent fails over before it can process the |
| // acknowledgement, or the agent initiates an unrelated |
| // `UpdateSlaveMessage`. |
| foreach ( |
| const Operation& operation, |
| resourceProvider.operations().operations()) { |
| if (!protobuf::isTerminalState(operation.latest_status().state())) { |
| CHECK(oldProvider->operations.contains(operation.uuid())) |
| << "Agent tried to reconcile unknown non-terminal operation " |
| << operation.uuid(); |
| } |
| } |
| } |
| } |
| } |
| |
| // Update master and allocator state. |
| |
| if (hasOversubscribed) { |
| slave->totalResources -= slave->totalResources.revocable(); |
| slave->totalResources += message.oversubscribed_resources(); |
| |
| // TODO(bbannier): Track oversubscribed resources for resource |
| // providers as well. |
| } |
| |
| ReconcileOperationsMessage reconcile; |
| |
| // Reconcile operations on agent-default resources. |
| hashset<UUID> newOperations; |
| foreach (const Operation& operation, message.operations().operations()) { |
| newOperations.insert(operation.uuid()); |
| } |
| |
| foreachkey (const UUID& uuid, slave->operations) { |
| if (!message.has_operations() || !newOperations.contains(uuid)) { |
| LOG(WARNING) << "Performing explicit reconciliation with agent for" |
| << " known operation " << uuid |
| << " since it was not present in original" |
| << " reconciliation message from agent"; |
| |
| ReconcileOperationsMessage::Operation* reconcileOperation = |
| reconcile.add_operations(); |
| |
| reconcileOperation->mutable_operation_uuid()->CopyFrom(uuid); |
| } |
| } |
| |
| foreach ( |
| const UpdateSlaveMessage::ResourceProvider& resourceProvider, |
| message.resource_providers().providers()) { |
| CHECK(resourceProvider.has_info()); |
| CHECK(resourceProvider.info().has_id()); |
| const ResourceProviderID& providerId = resourceProvider.info().id(); |
| |
| // Below we only add operations to our state from resource |
| // providers which are unknown, or possibly remove them for known |
| // resource providers. This works since the master should always |
| // know more operations of known resource providers than any |
| // resource provider itself. |
| // |
| // NOTE: We do not mutate operation statuses here; that is the |
| // responsibility of the `updateOperationStatus` handler. |
| // |
| // There still exists an edge case where the master might remove a |
| // terminal operation from its state when passing an |
| // acknowledgement from a framework on to the agent, with the |
| // agent failing over before the acknowledgement can be processed. |
| // In that case the agent would track an operation unknown to the |
| // master. |
| // |
| // TODO(bbannier): We might want to consider to also learn about |
| // new (terminal) operations when observing messages from status |
| // update managers to frameworks. |
| if (!slave->resourceProviders.contains(providerId)) { |
| // If this is a not previously seen resource provider we had a master |
| // failover. Add the resources and operations to our state. |
| CHECK( |
| resourceProvider.total_resources().empty() || |
| !slave->totalResources.contains(resourceProvider.total_resources())); |
| |
| // We add the resource provider to the master first so |
| // that it can be found when e.g., adding operations. |
| slave->resourceProviders.put( |
| providerId, |
| {resourceProvider.info(), |
| resourceProvider.total_resources(), |
| resourceProvider.resource_version_uuid(), |
| {}}); |
| |
| hashmap<FrameworkID, Resources> usedByOperations; |
| |
| foreach ( |
| const Operation& operation, |
| resourceProvider.operations().operations()) { |
| // Update to bookkeeping of operations. |
| Framework* framework = nullptr; |
| if (operation.has_framework_id()) { |
| framework = getFramework(operation.framework_id()); |
| } |
| |
| addOperation(framework, slave, new Operation(operation)); |
| |
| if (!protobuf::isTerminalState(operation.latest_status().state()) && |
| operation.has_framework_id()) { |
| // If we do not yet know the `FrameworkInfo` of the framework the |
| // operation originated from, we cannot properly track the operation |
| // at this point. |
| // |
| // TODO(bbannier): Consider introducing ways of making sure an agent |
| // always knows the `FrameworkInfo` of operations triggered on its |
| // resources, e.g., by adding an explicit `FrameworkInfo` to |
| // operations like is already done for `RunTaskMessage`, see |
| // MESOS-8582. |
| if (framework == nullptr) { |
| LOG(WARNING) |
| << "Cannot properly account for operation " << operation.uuid() |
| << " learnt in reconciliation of agent " << slaveId |
| << " since framework " << operation.framework_id() |
| << " is unknown; this can lead to assertion failures after the" |
| " operation terminates, see MESOS-8536"; |
| continue; |
| } |
| |
| Try<Resources> consumedResources = |
| protobuf::getConsumedResources(operation.info()); |
| |
| CHECK_SOME(consumedResources) |
| << "Could not determine resources consumed by operation " |
| << operation.uuid(); |
| |
| usedByOperations[operation.framework_id()] += |
| consumedResources.get(); |
| } |
| } |
| |
| slave->totalResources += resourceProvider.total_resources(); |
| |
| allocator->addResourceProvider( |
| slaveId, resourceProvider.total_resources(), usedByOperations); |
| } else { |
| // If this is a known resource provider its total capacity cannot have |
| // changed, and it would not know about any non-terminal operations not |
| // already known to the master. However, it might not have received an |
| // operation for a couple different reasons: |
| // |
| // - The resource provider or agent could have failed over |
| // before the operation's `ApplyOperationMessage` could be |
| // received. |
| // - The operation's `ApplyOperationMessage` could have raced |
| // with this `UpdateSlaveMessage`. |
| // |
| // In both of these cases, we need to reconcile such operations explicitly |
| // with the agent. For operations which the agent or resource provider |
| // does not recognize, an OPERATION_DROPPED status update will be |
| // generated and the master will remove the operation from its state upon |
| // receipt of that update. |
| CHECK(slave->resourceProviders.contains(providerId)); |
| |
| Slave::ResourceProvider& oldProvider = |
| slave->resourceProviders.at(providerId); |
| |
| hashmap<UUID, const Operation*> newOperations; |
| foreach ( |
| const Operation& operation, |
| resourceProvider.operations().operations()) { |
| newOperations.put(operation.uuid(), &operation); |
| } |
| |
| foreachpair ( |
| const UUID& uuid, Operation* oldOperation, oldProvider.operations) { |
| if (!newOperations.contains(uuid)) { |
| LOG(WARNING) << "Performing explicit reconciliation with agent for" |
| << " known operation " << uuid |
| << " since it was not present in original" |
| << " reconciliation message from agent"; |
| |
| ReconcileOperationsMessage::Operation* reconcileOperation = |
| reconcile.add_operations(); |
| |
| reconcileOperation->mutable_operation_uuid()->CopyFrom(uuid); |
| reconcileOperation->mutable_resource_provider_id()->CopyFrom( |
| providerId); |
| } else { |
| // If a known operation became terminal between any previous offer |
| // operation status update and this `UpdateSlaveMessage`, the total |
| // resources we were sent already had the operation applied. We need |
| // to update the state of the operation to terminal here so that any |
| // update sent by the agent later does not cause us to apply the |
| // operation again. |
| |
| const Operation* newOperation = newOperations.at(uuid); |
| |
| if (!protobuf::isTerminalState( |
| oldOperation->latest_status().state()) && |
| protobuf::isTerminalState( |
| newOperation->latest_status().state())) { |
| Operation* operation = CHECK_NOTNULL(slave->getOperation(uuid)); |
| |
| UpdateOperationStatusMessage update = |
| protobuf::createUpdateOperationStatusMessage( |
| uuid, |
| newOperation->latest_status(), |
| newOperation->latest_status(), |
| operation->framework_id(), |
| operation->slave_id()); |
| |
| updateOperation( |
| operation, update, false); // Do not update resources. |
| } |
| } |
| } |
| |
| // Reconcile the total resources. This includes undoing |
| // speculated operations which are only visible in the total, |
| // but never in the used resources. We explicitly allow for |
| // resource providers to change from or to zero capacity. |
| const Resources oldResources = |
| slave->totalResources.filter([&providerId](const Resource& resource) { |
| return resource.provider_id() == providerId; |
| }); |
| |
| slave->totalResources -= oldResources; |
| slave->totalResources += resourceProvider.total_resources(); |
| |
| oldProvider.totalResources = resourceProvider.total_resources(); |
| |
| // Reconcile resource versions. |
| oldProvider.resourceVersion = resourceProvider.resource_version_uuid(); |
| } |
| } |
| |
| if (reconcile.operations_size() > 0) { |
| send(slave->pid, reconcile); |
| } |
| |
| // Now update the agent's state and total resources in the allocator. |
| allocator->updateSlave(slaveId, slave->info, slave->totalResources); |
| |
| // Then rescind outstanding offers affected by the update. |
| // NOTE: Need a copy of offers because the offers are removed inside the loop. |
| foreach (Offer* offer, utils::copy(slave->offers)) { |
| bool rescind = false; |
| |
| const Resources& offered = offer->resources(); |
| // Since updates of the agent's oversubscribed resources are sent at regular |
| // intervals, we only rescind offers containing revocable resources to |
| // reduce churn. |
| if (hasOversubscribed && !offered.revocable().empty()) { |
| LOG(INFO) << "Removing offer " << offer->id() |
| << " with revocable resources " << offered << " on agent " |
| << *slave; |
| |
| rescind = true; |
| } |
| |
| // Updates on resource providers can change the agent total |
| // resources, so we rescind all offers. |
| // |
| // TODO(bbannier): Only rescind offers possibly containing |
| // affected resources. |
| const Resources offeredResourceProviderResources = offered.filter( |
| [](const Resource& resource) { return resource.has_provider_id(); }); |
| if (message.has_resource_providers() && |
| !offeredResourceProviderResources.empty()) { |
| LOG(INFO) |
| << "Removing offer " << offer->id() |
| << " with resources " << offered << " on agent " << *slave; |
| |
| rescind = true; |
| } |
| |
| if (!rescind) { |
| continue; |
| } |
| |
| allocator->recoverResources( |
| offer->framework_id(), |
| offer->slave_id(), |
| offered, |
| None()); |
| |
| removeOffer(offer, true); // Rescind. |
| } |
| |
| // NOTE: We don't need to rescind inverse offers here as they are unrelated to |
| // oversubscription. |
| } |
| |
| |
| void Master::updateUnavailability( |
| const MachineID& machineId, |
| const Option<Unavailability>& unavailability) |
| { |
| if (unavailability.isSome()) { |
| machines[machineId].info.mutable_unavailability()->CopyFrom( |
| unavailability.get()); |
| } else { |
| machines[machineId].info.clear_unavailability(); |
| } |
| |
| // TODO(jmlvanre): Only update allocator and rescind offers if the |
| // unavailability has actually changed. |
| if (machines.contains(machineId)) { |
| // For every slave on this machine, update the allocator. |
| foreach (const SlaveID& slaveId, machines[machineId].slaves) { |
| // The slave should not be in the machines mapping if it is removed. |
| CHECK(slaves.removed.get(slaveId).isNone()); |
| |
| // The slave should be registered if it is in the machines mapping. |
| CHECK(slaves.registered.contains(slaveId)); |
| |
| Slave* slave = slaves.registered.get(slaveId); |
| |
| if (unavailability.isSome()) { |
| // TODO(jmlvanre): Add stream operator for unavailability. |
| LOG(INFO) << "Updating unavailability of agent " << *slave |
| << ", starting at " |
| << Nanoseconds(unavailability->start().nanoseconds()); |
| } else { |
| LOG(INFO) << "Removing unavailability of agent " << *slave; |
| } |
| |
| // Remove and rescind offers since we want to inform frameworks of the |
| // unavailability change as soon as possible. |
| foreach (Offer* offer, utils::copy(slave->offers)) { |
| allocator->recoverResources( |
| offer->framework_id(), slave->id, offer->resources(), None()); |
| |
| removeOffer(offer, true); // Rescind! |
| } |
| |
| // Remove and rescind inverse offers since the allocator will send new |
| // inverse offers for the updated unavailability. |
| foreach (InverseOffer* inverseOffer, utils::copy(slave->inverseOffers)) { |
| allocator->updateInverseOffer( |
| slave->id, |
| inverseOffer->framework_id(), |
| UnavailableResources{ |
| inverseOffer->resources(), |
| inverseOffer->unavailability()}, |
| None()); |
| |
| removeInverseOffer(inverseOffer, true); // Rescind! |
| } |
| |
| // We remove / rescind all the offers first so that any calls to the |
| // allocator to modify its internal state are queued before the update of |
| // the unavailability in the allocator. We do this so that the allocator's |
| // state can start from a "clean slate" for the new unavailability. |
| // NOTE: Any calls from the Allocator back into the master, for example |
| // `offer()`, are guaranteed to happen after this function exits due to |
| // the Actor pattern. |
| |
| allocator->updateUnavailability(slaveId, unavailability); |
| } |
| } |
| } |
| |
| |
| // TODO(vinod): Since 0.22.0, we can use 'from' instead of 'pid' |
| // because the status updates will be sent by the slave. |
| // |
| // TODO(vinod): Add a benchmark test for status update handling. |
| void Master::statusUpdate(StatusUpdateMessage&& statusUpdateMessage) |
| { |
| const StatusUpdate& update = statusUpdateMessage.update(); |
| const UPID& pid = statusUpdateMessage.pid(); |
| |
| CHECK_NE(pid, UPID()); |
| |
| ++metrics->messages_status_update; |
| |
| if (slaves.removed.get(update.slave_id()).isSome()) { |
| // If the slave has been removed, drop the status update. The |
| // master is no longer trying to health check this slave; when the |
| // slave realizes it hasn't received any pings from the master, it |
| // will eventually try to reregister. |
| LOG(WARNING) << "Ignoring status update " << update |
| << " from removed agent " << pid |
| << " with id " << update.slave_id(); |
| |
| metrics->invalid_status_updates++; |
| return; |
| } |
| |
| Slave* slave = slaves.registered.get(update.slave_id()); |
| |
| if (slave == nullptr) { |
| LOG(WARNING) << "Ignoring status update " << update |
| << " from unknown agent " << pid |
| << " with id " << update.slave_id(); |
| |
| metrics->invalid_status_updates++; |
| return; |
| } |
| |
| Try<id::UUID> uuid = id::UUID::fromBytes(update.uuid()); |
| if (uuid.isError()) { |
| LOG(WARNING) << "Ignoring status update " |
| << " from agent " << *slave |
| << ": " << uuid.error(); |
| |
| ++metrics->invalid_status_updates; |
| return; |
| } |
| |
| LOG(INFO) << "Status update " << update << " from agent " << *slave; |
| |
| // Agents >= 0.26 should always correctly set task status uuid. |
| CHECK(update.status().has_uuid()); |
| |
| bool validStatusUpdate = true; |
| |
| Framework* framework = getFramework(update.framework_id()); |
| |
| // A framework might not have reregistered upon a master failover or |
| // got disconnected. |
| if (framework != nullptr && framework->connected()) { |
| forward(update, pid, framework); |
| } else { |
| validStatusUpdate = false; |
| LOG(WARNING) << "Received status update " << update << " from agent " |
| << *slave << " for " |
| << (framework == nullptr ? "an unknown " : "a disconnected ") |
| << "framework"; |
| } |
| |
| // Lookup the task and see if we need to update anything locally. |
| Task* task = slave->getTask(update.framework_id(), update.status().task_id()); |
| if (task == nullptr) { |
| // TODO(neilc): We might see status updates for non-partition |
| // aware tasks running on a partitioned agent that has |
| // reregistered with the master. The master marks such tasks |
| // completed when the agent partitions; it will shutdown the |
| // framework when the agent-reregisters, but we may see a number |
| // of status updates before the framework is shutdown. |
| LOG(WARNING) << "Could not lookup task for status update " << update |
| << " from agent " << *slave; |
| |
| metrics->invalid_status_updates++; |
| return; |
| } |
| |
| updateTask(task, update); |
| |
| validStatusUpdate |
| ? metrics->valid_status_updates++ : metrics->invalid_status_updates++; |
| } |
| |
| |
| void Master::forward( |
| const StatusUpdate& update, |
| const UPID& acknowledgee, |
| Framework* framework) |
| { |
| CHECK_NOTNULL(framework); |
| |
| if (!acknowledgee) { |
| LOG(INFO) << "Sending status update " << update |
| << (update.status().has_message() |
| ? " '" + update.status().message() + "'" |
| : ""); |
| } else { |
| LOG(INFO) << "Forwarding status update " << update; |
| } |
| |
| // The task might not exist in master's memory (e.g., failed task validation). |
| Task* task = framework->getTask(update.status().task_id()); |
| if (task != nullptr) { |
| // Set the status update state and uuid for the task. Note that |
| // master-generated updates are terminal and do not have a uuid |
| // (in which case the master also calls `removeTask()`). |
| if (update.has_uuid()) { |
| task->set_status_update_state(update.status().state()); |
| task->set_status_update_uuid(update.status().uuid()); |
| } |
| } |
| |
| StatusUpdateMessage message; |
| message.mutable_update()->MergeFrom(update); |
| message.set_pid(acknowledgee); |
| framework->send(message); |
| } |
| |
| |
| void Master::updateOperationStatus(UpdateOperationStatusMessage&& update) |
| { |
| CHECK(update.has_slave_id()) |
| << "External resource provider is not supported yet"; |
| |
| const SlaveID& slaveId = update.slave_id(); |
| |
| // The status update for the operation might be for an |
| // operator API call, thus the framework ID here is optional. |
| Option<FrameworkID> frameworkId = update.has_framework_id() |
| ? update.framework_id() |
| : Option<FrameworkID>::none(); |
| |
| Slave* slave = slaves.registered.get(slaveId); |
| |
| const UUID& uuid = update.operation_uuid(); |
| |
| // This is possible if the agent is marked as unreachable or gone, |
| // or has initiated a graceful shutdown. In either of those cases, |
| // ignore the operation status update. |
| // |
| // TODO(jieyu): If the agent is unreachable or has initiated a |
| // graceful shutdown, we can still forward the update to the |
| // framework so that the framework can get notified about the offer |
| // operation early. However, the acknowledgement of the update won't |
| // be able to reach the agent in those cases. If the agent is gone, |
| // we cannot forward the update because the master might already |
| // tell the framework that the operation is gone. |
| if (slave == nullptr) { |
| LOG(WARNING) << "Ignoring status update for operation '" |
| << update.status().operation_id() |
| << "' (uuid: " << uuid << ") for " |
| << (frameworkId.isSome() |
| ? "framework " + stringify(frameworkId.get()) |
| : "an operator API call") |
| << ": Agent " << slaveId << " is not registered"; |
| |
| return; |
| } |
| |
| Operation* operation = slave->getOperation(update.operation_uuid()); |
| if (operation == nullptr) { |
| LOG(ERROR) << "Failed to find the operation '" |
| << update.status().operation_id() << "' (uuid: " << uuid << ")" |
| << " for " << (frameworkId.isSome() |
| ? "framework " + stringify(frameworkId.get()) |
| : "an operator API call") |
| << " on agent " << slaveId; |
| |
| return; |
| } |
| |
| if (operation->info().has_id()) { |
| // Agents don't include the framework and operation IDs when sending |
| // operation status updates for dropped operations in response to a |
| // `ReconcileOperationsMessage`, but they can be deduced from the operation |
| // info kept on the master. |
| |
| // Only operations done via the scheduler API can have an ID. |
| CHECK(operation->has_framework_id()); |
| |
| frameworkId = operation->framework_id(); |
| |
| update.mutable_status()->mutable_operation_id()->CopyFrom( |
| operation->info().id()); |
| } |
| |
| updateOperation(operation, update); |
| |
| CHECK(operation->statuses_size() > 0); |
| |
| const OperationStatus& latestStatus = *operation->statuses().rbegin(); |
| |
| if (operation->info().has_id()) { |
| // Forward the status update to the framework. |
| Framework* framework = getFramework(frameworkId.get()); |
| |
| if (framework == nullptr || !framework->connected()) { |
| LOG(WARNING) << "Received operation status update " << update |
| << ", but the framework is " |
| << (framework == nullptr ? "unknown" : "disconnected"); |
| } else { |
| LOG(INFO) << "Forwarding operation status update " << update; |
| framework->send(update); |
| } |
| |
| if (protobuf::isTerminalState(latestStatus.state()) && |
| !latestStatus.has_uuid()) { |
| // Remove the operation if the update is terminal and it is not |
| // reliably sent. |
| removeOperation(operation); |
| } |
| } else { |
| if (latestStatus.has_uuid()) { |
| // This update is being sent reliably, and it doesn't have an operation |
| // ID, so the master has to send an acknowledgement. |
| |
| Result<ResourceProviderID> resourceProviderId = |
| getResourceProviderId(operation->info()); |
| |
| // TODO(greggomann): Remove this CHECK once the agent is sending reliable |
| // updates for operations on its default resources. See MESOS-8194. |
| CHECK_SOME(resourceProviderId); |
| |
| AcknowledgeOperationStatusMessage acknowledgement; |
| acknowledgement.mutable_status_uuid()->CopyFrom(latestStatus.uuid()); |
| acknowledgement.mutable_operation_uuid()->CopyFrom(operation->uuid()); |
| acknowledgement.mutable_resource_provider_id()->CopyFrom( |
| resourceProviderId.get()); |
| |
| CHECK(slave->capabilities.resourceProvider); |
| |
| send(slave->pid, acknowledgement); |
| } |
| |
| if (protobuf::isTerminalState(latestStatus.state())) { |
| removeOperation(operation); |
| } |
| } |
| } |
| |
| |
| void Master::exitedExecutor( |
| const UPID& from, |
| const SlaveID& slaveId, |
| const FrameworkID& frameworkId, |
| const ExecutorID& executorId, |
| int32_t status) |
| { |
| ++metrics->messages_exited_executor; |
| |
| if (slaves.removed.get(slaveId).isSome()) { |
| // If the slave has been removed, drop the executor message. The |
| // master is no longer trying to health check this slave; when the |
| // slave realizes it hasn't received any pings from the master, it |
| // will eventually try to reregister. |
| LOG(WARNING) << "Ignoring exited executor '" << executorId |
| << "' of framework " << frameworkId |
| << " on removed agent " << slaveId; |
| |
| return; |
| } |
| |
| Slave* slave = slaves.registered.get(slaveId); |
| |
| if (slave == nullptr) { |
| LOG(WARNING) << "Ignoring exited executor '" << executorId |
| << "' of framework " << frameworkId |
| << " on unknown agent " << slaveId; |
| |
| return; |
| } |
| |
| // Only update master's internal data structures here for proper |
| // accounting. The TASK_LOST updates are handled by the slave. |
| |
| if (!slave->hasExecutor(frameworkId, executorId)) { |
| LOG(WARNING) << "Ignoring unknown exited executor '" << executorId |
| << "' of framework " << frameworkId |
| << " on agent " << *slave; |
| |
| return; |
| } |
| |
| LOG(INFO) << "Executor '" << executorId |
| << "' of framework " << frameworkId |
| << " on agent " << *slave << ": " |
| << WSTRINGIFY(status); |
| |
| removeExecutor(slave, frameworkId, executorId); |
| |
| // TODO(vinod): Reliably forward this message to the scheduler. |
| Framework* framework = getFramework(frameworkId); |
| if (framework == nullptr || !framework->connected()) { |
| string status = (framework == nullptr ? "unknown" : "disconnected"); |
| |
| LOG(WARNING) |
| << "Not forwarding exited executor message for executor '" << executorId |
| << "' of framework " << frameworkId << " on agent " << *slave |
| << " because the framework is " << status; |
| |
| return; |
| } |
| |
| ExitedExecutorMessage message; |
| message.mutable_executor_id()->CopyFrom(executorId); |
| message.mutable_framework_id()->CopyFrom(frameworkId); |
| message.mutable_slave_id()->CopyFrom(slaveId); |
| message.set_status(status); |
| |
| framework->send(message); |
| } |
| |
| |
| void Master::shutdown( |
| Framework* framework, |
| const scheduler::Call::Shutdown& shutdown) |
| { |
| CHECK_NOTNULL(framework); |
| |
| // TODO(vinod): Add a metric for executor shutdowns. |
| |
| const SlaveID& slaveId = shutdown.slave_id(); |
| const ExecutorID& executorId = shutdown.executor_id(); |
| const FrameworkID& frameworkId = framework->id(); |
| |
| Slave* slave = slaves.registered.get(slaveId); |
| |
| if (slave == nullptr) { |
| LOG(WARNING) << "Unable to shutdown executor '" << executorId |
| << "' of framework " << frameworkId |
| << " of unknown agent " << slaveId; |
| |
| return; |
| } |
| |
| LOG(INFO) << "Processing SHUTDOWN call for executor '" << executorId |
| << "' of framework " << *framework << " on agent " << slaveId; |
| |
| ShutdownExecutorMessage message; |
| message.mutable_executor_id()->CopyFrom(executorId); |
| message.mutable_framework_id()->CopyFrom(frameworkId); |
| send(slave->pid, message); |
| } |
| |
| |
| Future<bool> Master::markUnreachable( |
| const SlaveInfo& slave, |
| bool duringMasterFailover, |
| const string& message) |
| { |
| if (duringMasterFailover && !slaves.recovered.contains(slave.id())) { |
| LOG(INFO) << "Skipping transition of agent" |
| << " " << slave.id() << " (" << slave.hostname() << ")" |
| << " to unreachable because it reregistered in the interim"; |
| |
| return false; |
| } |
| |
| if (!duringMasterFailover && !slaves.registered.contains(slave.id())) { |
| // Possible when the `SlaveObserver` dispatches a message to |
| // mark an unhealthy slave as unreachable, but the slave is |
| // concurrently removed for another reason (e.g., |
| // `UnregisterSlaveMessage` is received). |
| LOG(WARNING) << "Skipping transition of agent" |
| << " " << slave.id() << " (" << slave.hostname() << ")" |
| << " to unreachable because it has already been removed" |
| << " or marked unreachable"; |
| |
| return false; |
| } |
| |
| // The slave might be in the process of reregistering without |
| // the marking unreachable having been canceled. |
| if (slaves.reregistering.contains(slave.id())) { |
| LOG(INFO) << "Skipping transition of agent" |
| << " " << slave.id() << " (" << slave.hostname() << ")" |
| << " to unreachable because it is reregistering"; |
| |
| return false; |
| } |
| |
| if (slaves.markingUnreachable.contains(slave.id())) { |
| // We might already be marking this slave unreachable. This is |
| // possible if marking the slave unreachable in the registry takes |
| // a long time. While the registry operation is in progress, the |
| // `SlaveObserver` will continue to ping the slave; if the slave |
| // fails another health check, the `SlaveObserver` will trigger |
| // another attempt to mark it unreachable. Also possible if |
| // `agentReregisterTimeout` marks the slave unreachable |
| // concurrently with the slave observer doing so. |
| LOG(WARNING) << "Skipping transition of agent" |
| << " " << slave.id() << " (" << slave.hostname() << ")" |
| << " to unreachable because another unreachable" |
| << " transition is already in progress"; |
| |
| return false; |
| } |
| |
| if (slaves.removing.contains(slave.id())) { |
| LOG(WARNING) << "Skipping transition of agent" |
| << " " << slave.id() << " (" << slave.hostname() << ")" |
| << " to unreachable because it is being removed"; |
| |
| return false; |
| } |
| |
| if (slaves.removed.get(slave.id()).isSome()) { |
| LOG(WARNING) << "Skipping transition of agent" |
| << " " << slave.id() << " (" << slave.hostname() << ")" |
| << " to unreachable because it has been removed"; |
| |
| return false; |
| } |
| |
| if (slaves.markingGone.contains(slave.id())) { |
| LOG(WARNING) << "Skipping transition of agent" |
| << " " << slave.id() << " (" << slave.hostname() << ")" |
| << " to unreachable because it is being marked as gone"; |
| |
| return false; |
| } |
| |
| if (slaves.gone.contains(slave.id())) { |
| LOG(WARNING) << "Skipping transition of agent" |
| << " " << slave.id() << " (" << slave.hostname() << ")" |
| << " to unreachable because it has been marked as gone"; |
| |
| return false; |
| } |
| |
| LOG(INFO) << "Marking agent " << slave.id() << " (" << slave.hostname() << ")" |
| << " unreachable: " << message; |
| |
| CHECK(!slaves.unreachable.contains(slave.id())); |
| slaves.markingUnreachable.insert(slave.id()); |
| |
| // Use the same timestamp for all status updates sent below; |
| // we also use this timestamp when updating the registry. |
| TimeInfo unreachableTime = protobuf::getCurrentTime(); |
| |
| const string failure = "Failed to mark agent " + stringify(slave.id()) + |
| " (" + slave.hostname() + ") as unreachable in the registry"; |
| |
| // Update the registry to move this slave from the list of admitted |
| // slaves to the list of unreachable slaves. After this is complete, |
| // we can remove the slave from the master's in-memory state and |
| // send TASK_UNREACHABLE / TASK_LOST updates to the frameworks. |
| return undiscardable( |
| registrar->apply(Owned<RegistryOperation>( |
| new MarkSlaveUnreachable(slave, unreachableTime))) |
| .onFailed(lambda::bind(fail, failure, lambda::_1)) |
| .onDiscarded(lambda::bind(fail, failure, "discarded")) |
| .then(defer(self(), [=](bool result) { |
| _markUnreachable( |
| slave, unreachableTime, duringMasterFailover, message, result); |
| return true; |
| }))); |
| } |
| |
| |
| void Master::_markUnreachable( |
| const SlaveInfo& slave, |
| const TimeInfo& unreachableTime, |
| bool duringMasterFailover, |
| const string& message, |
| bool registrarResult) |
| { |
| // `MarkSlaveUnreachable` registry operation should never fail. |
| CHECK(registrarResult); |
| |
| CHECK(slaves.markingUnreachable.contains(slave.id())); |
| slaves.markingUnreachable.erase(slave.id()); |
| |
| LOG(INFO) << "Marked agent" |
| << " " << slave.id() << " (" << slave.hostname() << ")" |
| << " unreachable: " << message; |
| |
| ++metrics->slave_removals; |
| ++metrics->slave_removals_reason_unhealthy; |
| |
| CHECK(!slaves.unreachable.contains(slave.id())); |
| slaves.unreachable[slave.id()] = unreachableTime; |
| |
| if (duringMasterFailover) { |
| CHECK(slaves.recovered.contains(slave.id())); |
| slaves.recovered.erase(slave.id()); |
| |
| ++metrics->recovery_slave_removals; |
| |
| // TODO(bmahler): Tell partition aware frameworks that the |
| // agent is unreachable rather than lost. This requires a |
| // new capability. |
| sendSlaveLost(slave); |
| } else { |
| CHECK(slaves.registered.contains(slave.id())); |
| |
| __removeSlave(slaves.registered.get(slave.id()), message, unreachableTime); |
| } |
| } |
| |
| |
| void Master::markGone(const SlaveID& slaveId, const TimeInfo& goneTime) |
| { |
| CHECK(slaves.markingGone.contains(slaveId)); |
| |
| slaves.markingGone.erase(slaveId); |
| |
| slaves.gone[slaveId] = goneTime; |
| |
| const string message = "Agent has been marked gone"; |
| |
| Slave* slave = slaves.registered.get(slaveId); |
| |
| // If the `Slave` struct does not exist, then the agent |
| // must be either recovered or unreachable. |
| if (slave == nullptr) { |
| CHECK(slaves.recovered.contains(slaveId) || |
| slaves.unreachable.contains(slaveId)); |
| |
| // When a recovered agent is marked gone, we have no task metadata to use in |
| // order to send task status updates. We could retain this agent ID and send |
| // updates upon reregistration but do not currently do this. See MESOS-9739. |
| if (slaves.recovered.contains(slaveId)) { |
| return; |
| } |
| |
| slaves.unreachable.erase(slaveId); |
| |
| // TODO(vinod): Consider moving these tasks into `completedTasks` by |
| // transitioning them to a terminal state and sending status updates. |
| // But it's not clear what this state should be. If a framework |
| // reconciles these tasks after this point it would get `TASK_UNKNOWN` |
| // which seems appropriate but we don't keep tasks in this state in-memory. |
| if (slaves.unreachableTasks.contains(slaveId)) { |
| foreachkey (const FrameworkID& frameworkId, |
| slaves.unreachableTasks.at(slaveId)) { |
| Framework* framework = getFramework(frameworkId); |
| if (framework == nullptr) { |
| continue; |
| } |
| |
| TaskState newTaskState = TASK_GONE_BY_OPERATOR; |
| TaskStatus::Reason newTaskReason = |
| TaskStatus::REASON_SLAVE_REMOVED_BY_OPERATOR; |
| |
| if (!framework->capabilities.partitionAware) { |
| newTaskState = TASK_LOST; |
| newTaskReason = TaskStatus::REASON_SLAVE_REMOVED; |
| } |
| |
| foreach (const TaskID& taskId, |
| slaves.unreachableTasks.at(slaveId).at(frameworkId)) { |
| if (framework->unreachableTasks.contains(taskId)) { |
| const Owned<Task>& task = framework->unreachableTasks.at(taskId); |
| |
| const StatusUpdate& update = protobuf::createStatusUpdate( |
| task->framework_id(), |
| task->slave_id(), |
| task->task_id(), |
| newTaskState, |
| TaskStatus::SOURCE_MASTER, |
| None(), |
| message, |
| newTaskReason, |
| (task->has_executor_id() |
| ? Option<ExecutorID>(task->executor_id()) |
| : None())); |
| |
| updateTask(task.get(), update); |
| |
| if (!framework->connected()) { |
| LOG(WARNING) << "Dropping update " << update |
| << " for disconnected " |
| << " framework " << frameworkId; |
| } else { |
| forward(update, UPID(), framework); |
| } |
| |
| // Move task from unreachable map to completed map. |
| framework->addCompletedTask(std::move(*task)); |
| framework->unreachableTasks.erase(taskId); |
| } |
| } |
| } |
| |
| slaves.unreachableTasks.erase(slaveId); |
| } |
| |
| return; |
| } |
| |
| // Shutdown the agent if it transitioned to gone. |
| ShutdownMessage shutdownMessage; |
| shutdownMessage.set_message(message); |
| send(slave->pid, shutdownMessage); |
| |
| __removeSlave(slave, message, None()); |
| } |
| |
| |
| void Master::reconcileTasks( |
| const UPID& from, |
| ReconcileTasksMessage&& reconcileTasksMessage) |
| { |
| const FrameworkID& frameworkId = reconcileTasksMessage.framework_id(); |
| |
| Framework* framework = getFramework(frameworkId); |
| if (framework == nullptr) { |
| LOG(WARNING) << "Unknown framework " << frameworkId << " at " << from |
| << " attempted to reconcile tasks"; |
| |
| return; |
| } |
| |
| if (framework->pid != from) { |
| LOG(WARNING) |
| << "Ignoring reconcile tasks message for framework " << *framework |
| << " because it is not expected from " << from; |
| |
| return; |
| } |
| |
| scheduler::Call::Reconcile message; |
| message.mutable_tasks()->Reserve(reconcileTasksMessage.statuses_size()); |
| |
| foreach (TaskStatus& status, *reconcileTasksMessage.mutable_statuses()) { |
| scheduler::Call::Reconcile::Task* t = message.add_tasks(); |
| |
| *t->mutable_task_id() = std::move(status.task_id()); |
| |
| if (status.has_slave_id()) { |
| *t->mutable_slave_id() = std::move(status.slave_id()); |
| } |
| } |
| |
| reconcile(framework, std::move(message)); |
| } |
| |
| |
| void Master::reconcile( |
| Framework* framework, |
| scheduler::Call::Reconcile&& reconcile) |
| { |
| CHECK_NOTNULL(framework); |
| |
| ++metrics->messages_reconcile_tasks; |
| |
| if (reconcile.tasks().empty()) { |
| // Implicit reconciliation. |
| LOG(INFO) << "Performing implicit task state reconciliation" |
| " for framework " << *framework; |
| |
| foreachvalue (const TaskInfo& task, framework->pendingTasks) { |
| StatusUpdate update = protobuf::createStatusUpdate( |
| framework->id(), |
| task.slave_id(), |
| task.task_id(), |
| TASK_STAGING, |
| TaskStatus::SOURCE_MASTER, |
| None(), |
| "Reconciliation: Latest task state", |
| TaskStatus::REASON_RECONCILIATION); |
| |
| VLOG(1) << "Sending implicit reconciliation state " |
| << update.status().state() |
| << " for task " << update.status().task_id() |
| << " of framework " << *framework; |
| |
| // TODO(bmahler): Consider using forward(); might lead to too |
| // much logging. |
| StatusUpdateMessage message; |
| *message.mutable_update() = std::move(update); |
| framework->send(message); |
| } |
| |
| foreachvalue (Task* task, framework->tasks) { |
| const TaskState& state = task->has_status_update_state() |
| ? task->status_update_state() |
| : task->state(); |
| |
| const Option<ExecutorID>& executorId = task->has_executor_id() |
| ? Option<ExecutorID>(task->executor_id()) |
| : None(); |
| |
| StatusUpdate update = protobuf::createStatusUpdate( |
| framework->id(), |
| task->slave_id(), |
| task->task_id(), |
| state, |
| TaskStatus::SOURCE_MASTER, |
| None(), |
| "Reconciliation: Latest task state", |
| TaskStatus::REASON_RECONCILIATION, |
| executorId, |
| protobuf::getTaskHealth(*task), |
| protobuf::getTaskCheckStatus(*task), |
| None(), |
| protobuf::getTaskContainerStatus(*task)); |
| |
| VLOG(1) << "Sending implicit reconciliation state " |
| << update.status().state() |
| << " for task " << update.status().task_id() |
| << " of framework " << *framework; |
| |
| // TODO(bmahler): Consider using forward(); might lead to too |
| // much logging. |
| StatusUpdateMessage message; |
| *message.mutable_update() = std::move(update); |
| framework->send(message); |
| } |
| |
| return; |
| } |
| |
| // Explicit reconciliation. |
| LOG(INFO) << "Performing explicit task state reconciliation" |
| << " for " << reconcile.tasks().size() << " tasks" |
| << " of framework " << *framework; |
| |
| // Explicit reconciliation occurs for the following cases: |
| // (1) Task is known, but pending: TASK_STAGING. |
| // (2) Task is known: send the latest state. |
| // (3) Task is unknown, slave is recovered: no-op. |
| // (4) Task is unknown, slave is registered: TASK_GONE. |
| // (5) Task is unknown, slave is unreachable: TASK_UNREACHABLE. |
| // (6) Task is unknown, slave is gone: TASK_GONE_BY_OPERATOR. |
| // (7) Task is unknown, slave is unknown: TASK_UNKNOWN. |
| // |
| // For case (3), if the slave ID is not provided, we err on the |
| // side of caution and do not reply if there are *any* recovered |
| // slaves that haven't reregistered, since the task could reside |
| // on one of these slaves. |
| // |
| // For cases (4), (5), (6) and (7) TASK_LOST is sent instead if the |
| // framework has not opted-in to the PARTITION_AWARE capability. |
| foreach (const scheduler::Call::Reconcile::Task& t, reconcile.tasks()) { |
| Option<SlaveID> slaveId = None(); |
| if (t.has_slave_id()) { |
| slaveId = t.slave_id(); |
| } |
| |
| Option<StatusUpdate> update = None(); |
| Task* task = framework->getTask(t.task_id()); |
| |
| if (framework->pendingTasks.contains(t.task_id())) { |
| // (1) Task is known, but pending: TASK_STAGING. |
| const TaskInfo& task_ = framework->pendingTasks[t.task_id()]; |
| update = protobuf::createStatusUpdate( |
| framework->id(), |
| task_.slave_id(), |
| task_.task_id(), |
| TASK_STAGING, |
| TaskStatus::SOURCE_MASTER, |
| None(), |
| "Reconciliation: Latest task state", |
| TaskStatus::REASON_RECONCILIATION); |
| } else if (task != nullptr) { |
| // (2) Task is known: send the latest status update state. |
| const TaskState& state = task->has_status_update_state() |
| ? task->status_update_state() |
| : task->state(); |
| |
| const Option<ExecutorID> executorId = task->has_executor_id() |
| ? Option<ExecutorID>(task->executor_id()) |
| : None(); |
| |
| update = protobuf::createStatusUpdate( |
| framework->id(), |
| task->slave_id(), |
| task->task_id(), |
| state, |
| TaskStatus::SOURCE_MASTER, |
| None(), |
| "Reconciliation: Latest task state", |
| TaskStatus::REASON_RECONCILIATION, |
| executorId, |
| protobuf::getTaskHealth(*task), |
| protobuf::getTaskCheckStatus(*task), |
| None(), |
| protobuf::getTaskContainerStatus(*task)); |
| } else if ((slaveId.isSome() && slaves.recovered.contains(slaveId.get())) || |
| (slaveId.isNone() && !slaves.recovered.empty())) { |
| // (3) Task is unknown, slave is recovered: no-op. The framework |
| // will have to retry this and will not receive a response until |
| // the agent either registers, or is marked unreachable after the |
| // timeout. |
| LOG(INFO) << "Dropping reconciliation of task " << t.task_id() |
| << " for framework " << *framework << " because " |
| << (slaveId.isSome() ? |
| "agent " + stringify(slaveId.get()) + " has" : |
| "some agents have") |
| << " not yet reregistered with the master"; |
| } else if (slaveId.isSome() && slaves.registered.contains(slaveId.get())) { |
| // (4) Task is unknown, slave is registered: TASK_GONE. If the |
| // framework does not have the PARTITION_AWARE capability, send |
| // TASK_LOST for backward compatibility. |
| TaskState taskState = TASK_GONE; |
| if (!framework->capabilities.partitionAware) { |
| taskState = TASK_LOST; |
| } |
| |
| update = protobuf::createStatusUpdate( |
| framework->id(), |
| slaveId.get(), |
| t.task_id(), |
| taskState, |
| TaskStatus::SOURCE_MASTER, |
| None(), |
| "Reconciliation: Task is unknown to the agent", |
| TaskStatus::REASON_RECONCILIATION); |
| } else if (slaveId.isSome() && slaves.unreachable.contains(slaveId.get())) { |
| // (5) Slave is unreachable: TASK_UNREACHABLE. If the framework |
| // does not have the PARTITION_AWARE capability, send TASK_LOST |
| // for backward compatibility. In either case, the status update |
| // also includes the time when the slave was marked unreachable. |
| const TimeInfo& unreachableTime = slaves.unreachable.at(slaveId.get()); |
| |
| TaskState taskState = TASK_UNREACHABLE; |
| if (!framework->capabilities.partitionAware) { |
| taskState = TASK_LOST; |
| } |
| |
| update = protobuf::createStatusUpdate( |
| framework->id(), |
| slaveId.get(), |
| t.task_id(), |
| taskState, |
| TaskStatus::SOURCE_MASTER, |
| None(), |
| "Reconciliation: Task is unreachable", |
| TaskStatus::REASON_RECONCILIATION, |
| None(), |
| None(), |
| None(), |
| None(), |
| None(), |
| unreachableTime); |
| } else if (slaveId.isSome() && slaves.gone.contains(slaveId.get())) { |
| // (6) Slave is gone: TASK_GONE_BY_OPERATOR. If the framework |
| // does not have the PARTITION_AWARE capability, send TASK_LOST |
| // for backward compatibility. |
| TaskState taskState = TASK_GONE_BY_OPERATOR; |
| if (!framework->capabilities.partitionAware) { |
| taskState = TASK_LOST; |
| } |
| |
| update = protobuf::createStatusUpdate( |
| framework->id(), |
| slaveId.get(), |
| t.task_id(), |
| taskState, |
| TaskStatus::SOURCE_MASTER, |
| None(), |
| "Reconciliation: Task is gone", |
| TaskStatus::REASON_RECONCILIATION); |
| } else { |
| // (7) Task is unknown, slave is unknown: TASK_UNKNOWN. If the |
| // framework does not have the PARTITION_AWARE capability, send |
| // TASK_LOST for backward compatibility. |
| TaskState taskState = TASK_UNKNOWN; |
| if (!framework->capabilities.partitionAware) { |
| taskState = TASK_LOST; |
| } |
| |
| update = protobuf::createStatusUpdate( |
| framework->id(), |
| slaveId, |
| t.task_id(), |
| taskState, |
| TaskStatus::SOURCE_MASTER, |
| None(), |
| "Reconciliation: Task is unknown", |
| TaskStatus::REASON_RECONCILIATION); |
| } |
| |
| if (update.isSome()) { |
| VLOG(1) << "Sending explicit reconciliation state " |
| << update->status().state() |
| << " for task " << update->status().task_id() |
| << " of framework " << *framework; |
| |
| // TODO(bmahler): Consider using forward(); might lead to too |
| // much logging. |
| StatusUpdateMessage message; |
| *message.mutable_update() = std::move(update.get()); |
| framework->send(message); |
| } |
| } |
| } |
| |
| |
| scheduler::Response::ReconcileOperations Master::reconcileOperations( |
| Framework* framework, |
| const scheduler::Call::ReconcileOperations& reconcile) |
| { |
| CHECK_NOTNULL(framework); |
| |
| ++metrics->messages_reconcile_operations; |
| |
| scheduler::Response::ReconcileOperations response; |
| |
| if (reconcile.operations_size() == 0) { |
| // Implicit reconciliation. |
| LOG(INFO) << "Performing implicit operation state reconciliation" |
| " for framework " << *framework; |
| |
| response.mutable_operation_statuses()->Reserve( |
| framework->operations.size()); |
| |
| foreachvalue (Operation* operation, framework->operations) { |
| if (operation->statuses().empty()) { |
| // This can happen if the operation is pending. |
| response.add_operation_statuses()->CopyFrom(operation->latest_status()); |
| } else { |
| response.add_operation_statuses()->CopyFrom( |
| *operation->statuses().rbegin()); |
| } |
| } |
| |
| return response; |
| } |
| |
| // Explicit reconciliation. |
| LOG(INFO) << "Performing explicit operation state reconciliation for " |
| << reconcile.operations_size() << " operations of framework " |
| << *framework; |
| |
| // Explicit reconciliation occurs for the following cases: |
| // (1) Operation is known: the latest status sent to the framework. |
| // (2) Operation is unknown, slave is recovered: OPERATION_RECOVERING. |
| // (3) Operation is unknown, slave is registered: OPERATION_UNKNOWN. |
| // (4) Operation is unknown, slave is unreachable: OPERATION_UNREACHABLE. |
| // (5) Operation is unknown, slave is gone: OPERATION_GONE_BY_OPERATOR. |
| // (6) Operation is unknown, slave is unknown: OPERATION_UNKNOWN. |
| // (7) Operation is unknown, slave ID is not specified: OPERATION_UNKNOWN. |
| |
| foreach (const scheduler::Call::ReconcileOperations::Operation& operation, |
| reconcile.operations()) { |
| Option<SlaveID> slaveId = None(); |
| if (operation.has_slave_id()) { |
| slaveId = operation.slave_id(); |
| } |
| |
| Option<ResourceProviderID> resourceProviderId = None(); |
| if (operation.has_resource_provider_id()) { |
| resourceProviderId = operation.resource_provider_id(); |
| } |
| |
| Option<Operation*> frameworkOperation = |
| framework->getOperation(operation.operation_id()); |
| |
| OperationStatus* status = response.add_operation_statuses(); |
| if (frameworkOperation.isSome()) { |
| // (1) Operation is known: resend the latest status sent to the framework. |
| if (frameworkOperation.get()->statuses().empty()) { |
| // This can happen if the operation is pending. |
| *status = frameworkOperation.get()->latest_status(); |
| } else { |
| *status = *frameworkOperation.get()->statuses().rbegin(); |
| } |
| } else if (slaveId.isSome() && slaves.recovered.contains(slaveId.get())) { |
| // (2) Operation is unknown, slave is recovered: OPERATION_RECOVERING. |
| *status = protobuf::createOperationStatus( |
| OperationState::OPERATION_RECOVERING, |
| operation.operation_id(), |
| "Reconciliation: Agent is recovered but has not re-registered", |
| None(), |
| None(), |
| slaveId, |
| resourceProviderId); |
| } else if (slaveId.isSome() && slaves.registered.contains(slaveId.get())) { |
| // (3) Operation is unknown, slave is registered: OPERATION_UNKNOWN. |
| *status = protobuf::createOperationStatus( |
| OperationState::OPERATION_UNKNOWN, |
| operation.operation_id(), |
| "Reconciliation: Operation is unknown", |
| None(), |
| None(), |
| slaveId, |
| resourceProviderId); |
| } else if (slaveId.isSome() && slaves.unreachable.contains(slaveId.get())) { |
| // (4) Operation is unknown, slave is unreachable: OPERATION_UNREACHABLE. |
| *status = protobuf::createOperationStatus( |
| OperationState::OPERATION_UNREACHABLE, |
| operation.operation_id(), |
| "Reconciliation: Agent is unreachable", |
| None(), |
| None(), |
| slaveId, |
| resourceProviderId); |
| } else if (slaveId.isSome() && slaves.gone.contains(slaveId.get())) { |
| // (5) Operation is unknown, slave is gone: OPERATION_GONE_BY_OPERATOR. |
| *status = protobuf::createOperationStatus( |
| OperationState::OPERATION_GONE_BY_OPERATOR, |
| operation.operation_id(), |
| "Reconciliation: Agent marked gone by operator", |
| None(), |
| None(), |
| slaveId, |
| resourceProviderId); |
| } else if (slaveId.isSome()) { |
| // (6) Operation is unknown, slave is unknown: OPERATION_UNKNOWN. |
| *status = protobuf::createOperationStatus( |
| OperationState::OPERATION_UNKNOWN, |
| operation.operation_id(), |
| "Reconciliation: Both operation and agent are unknown", |
| None(), |
| None(), |
| slaveId, |
| resourceProviderId); |
| } else { |
| // (7) Operation is unknown, slave is unknown: OPERATION_UNKNOWN. |
| *status = protobuf::createOperationStatus( |
| OperationState::OPERATION_UNKNOWN, |
| operation.operation_id(), |
| "Reconciliation: Operation is unknown and no 'agent_id' was" |
| " provided", |
| None(), |
| None(), |
| slaveId, |
| resourceProviderId); |
| } |
| } |
| |
| return response; |
| } |
| |
| |
| void Master::frameworkFailoverTimeout(const FrameworkID& frameworkId, |
| const Time& reregisteredTime) |
| { |
| Framework* framework = getFramework(frameworkId); |
| |
| if (framework != nullptr && !framework->connected()) { |
| // If the re-registration time has not changed, then the framework |
| // has not reregistered within the failover timeout. |
| if (framework->reregisteredTime == reregisteredTime) { |
| LOG(INFO) << "Framework failover timeout, removing framework " |
| << *framework; |
| |
| removeFramework(framework); |
| } |
| } |
| } |
| |
| |
| void Master::offer( |
| const FrameworkID& frameworkId, |
| const hashmap<string, hashmap<SlaveID, Resources>>& resources) |
| { |
| if (!frameworks.registered.contains(frameworkId) || |
| !frameworks.registered[frameworkId]->active()) { |
| LOG(WARNING) << "Master returning resources offered to framework " |
| << frameworkId << " because the framework" |
| << " has terminated or is inactive"; |
| |
| foreachkey (const string& role, resources) { |
| foreachpair (const SlaveID& slaveId, |
| const Resources& offered, |
| resources.at(role)) { |
| allocator->recoverResources(frameworkId, slaveId, offered, None()); |
| } |
| } |
| return; |
| } |
| |
| Framework* framework = CHECK_NOTNULL(frameworks.registered.at(frameworkId)); |
| |
| // Each offer we create is tied to a single agent |
| // and a single allocation role. |
| ResourceOffersMessage message; |
| |
| // We keep track of the offer IDs so that we can log them. |
| vector<OfferID> offerIds; |
| |
| foreachkey (const string& role, resources) { |
| foreachpair (const SlaveID& slaveId, |
| const Resources& offered, |
| resources.at(role)) { |
| Slave* slave = slaves.registered.get(slaveId); |
| |
| if (slave == nullptr) { |
| LOG(WARNING) |
| << "Master returning resources offered to framework " << *framework |
| << " because agent " << slaveId << " is not valid"; |
| |
| allocator->recoverResources(frameworkId, slaveId, offered, None()); |
| continue; |
| } |
| |
| // This could happen if the allocator dispatched 'Master::offer' before |
| // the slave was deactivated in the allocator. |
| if (!slave->active) { |
| LOG(WARNING) |
| << "Master returning resources offered because agent " << *slave |
| << " is " << (slave->connected ? "deactivated" : "disconnected"); |
| |
| allocator->recoverResources(frameworkId, slaveId, offered, None()); |
| continue; |
| } |
| |
| #ifdef ENABLE_PORT_MAPPING_ISOLATOR |
| // TODO(dhamon): This flag is required as the static allocation of |
| // ephemeral ports leads to a maximum number of containers that can |
| // be created on each slave. Once MESOS-1654 is fixed and ephemeral |
| // ports are a first class resource, this can be removed. |
| if (flags.max_executors_per_agent.isSome()) { |
| // Check that we haven't hit the executor limit. |
| size_t numExecutors = 0; |
| foreachkey (const FrameworkID& frameworkId, slave->executors) { |
| numExecutors += slave->executors[frameworkId].keys().size(); |
| } |
| |
| if (numExecutors >= flags.max_executors_per_agent.get()) { |
| LOG(WARNING) << "Master returning resources offered because agent " |
| << *slave << " has reached the maximum number of " |
| << "executors"; |
| |
| // Pass a default filter to avoid getting this same offer immediately |
| // from the allocator. |
| allocator->recoverResources(frameworkId, slaveId, offered, Filters()); |
| continue; |
| } |
| } |
| #endif // ENABLE_PORT_MAPPING_ISOLATOR |
| |
| // TODO(vinod): Split regular and revocable resources into |
| // separate offers, so that rescinding offers with revocable |
| // resources does not affect offers with regular resources. |
| |
| // TODO(bmahler): Set "https" if only "https" is supported. |
| mesos::URL url; |
| url.set_scheme("http"); |
| url.mutable_address()->set_hostname(slave->info.hostname()); |
| url.mutable_address()->set_ip(stringify(slave->pid.address.ip)); |
| url.mutable_address()->set_port(slave->pid.address.port); |
| url.set_path("/" + slave->pid.id); |
| |
| Offer* offer = new Offer(); |
| offer->mutable_id()->MergeFrom(newOfferId()); |
| offer->mutable_framework_id()->MergeFrom(framework->id()); |
| offer->mutable_slave_id()->MergeFrom(slave->id); |
| offer->set_hostname(slave->info.hostname()); |
| offer->mutable_url()->MergeFrom(url); |
| offer->mutable_resources()->MergeFrom(offered); |
| offer->mutable_attributes()->MergeFrom(slave->info.attributes()); |
| offer->mutable_allocation_info()->set_role(role); |
| |
| if (slave->info.has_domain()) { |
| offer->mutable_domain()->MergeFrom(slave->info.domain()); |
| } |
| |
| // Add all framework's executors running on this slave. |
| if (slave->executors.contains(framework->id())) { |
| const hashmap<ExecutorID, ExecutorInfo>& executors = |
| slave->executors[framework->id()]; |
| foreachkey (const ExecutorID& executorId, executors) { |
| offer->add_executor_ids()->MergeFrom(executorId); |
| } |
| } |
| |
| // If the slave in this offer is planned to be unavailable due to |
| // maintenance in the future, then set the Unavailability. |
| CHECK(machines.contains(slave->machineId)); |
| if (machines[slave->machineId].info.has_unavailability()) { |
| offer->mutable_unavailability()->CopyFrom( |
| machines[slave->machineId].info.unavailability()); |
| } |
| |
| offers[offer->id()] = offer; |
| |
| framework->addOffer(offer); |
| slave->addOffer(offer); |
| |
| if (flags.offer_timeout.isSome()) { |
| // Rescind the offer after the timeout elapses. |
| offerTimers[offer->id()] = |
| delay(flags.offer_timeout.get(), |
| self(), |
| &Self::offerTimeout, |
| offer->id()); |
| } |
| |
| // TODO(jieyu): For now, we strip 'ephemeral_ports' resource from |
| // offers so that frameworks do not see this resource. This is a |
| // short term workaround. Revisit this once we resolve MESOS-1654. |
| Offer offer_ = *offer; |
| offer_.clear_resources(); |
| |
| foreach (const Resource& resource, offered) { |
| if (resource.name() != "ephemeral_ports") { |
| offer_.add_resources()->CopyFrom(resource); |
| } |
| } |
| |
| // Per MESOS-8237, it is problematic to show the |
| // `Resource.allocation_info` for pre-MULTI_ROLE schedulers. |
| // Pre-MULTI_ROLE schedulers are not `AllocationInfo` aware, |
| // and since they may be performing operations that |
| // implicitly uses all of Resource's state (e.g. equality |
| // comparison), we strip the `AllocationInfo` from `Resource`, |
| // as well as Offer. The idea here is that since the |
| // information doesn't provide any value to a pre-MULTI_ROLE |
| // scheduler, we preserve the old `Offer` format for them. |
| if (!framework->capabilities.multiRole) { |
| offer_.clear_allocation_info(); |
| |
| foreach (Resource& resource, *offer_.mutable_resources()) { |
| resource.clear_allocation_info(); |
| } |
| } |
| |
| if (!framework->capabilities.reservationRefinement) { |
| convertResourceFormat( |
| offer_.mutable_resources(), PRE_RESERVATION_REFINEMENT); |
| } |
| |
| // Add the offer *AND* the corresponding slave's PID. |
| message.add_offers()->MergeFrom(offer_); |
| message.add_pids(slave->pid); |
| |
| offerIds.push_back(offer_.id()); |
| |
| VLOG(2) << "Sending offer " << offer_.id() |
| << " containing resources " << offered |
| << " on agent " << *slave |
| << " to framework " << *framework; |
| } |
| } |
| |
| if (message.offers().size() == 0) { |
| return; |
| } |
| |
| LOG(INFO) << "Sending offers " << offerIds << " to framework " << *framework; |
| |
| framework->metrics.offers_sent += message.offers().size(); |
| framework->send(message); |
| } |
| |
| |
| void Master::inverseOffer( |
| const FrameworkID& frameworkId, |
| const hashmap<SlaveID, UnavailableResources>& resources) |
| { |
| if (!frameworks.registered.contains(frameworkId) || |
| !frameworks.registered[frameworkId]->active()) { |
| LOG(INFO) << "Master ignoring inverse offers to framework " << frameworkId |
| << " because the framework has terminated or is inactive"; |
| |
| return; |
| } |
| |
| // Create an inverse offer for each slave and add it to the message. |
| InverseOffersMessage message; |
| |
| Framework* framework = CHECK_NOTNULL(frameworks.registered[frameworkId]); |
| foreachpair (const SlaveID& slaveId, |
| const UnavailableResources& unavailableResources, |
| resources) { |
| Slave* slave = slaves.registered.get(slaveId); |
| |
| if (slave == nullptr) { |
| LOG(INFO) |
| << "Master ignoring inverse offers to framework " << *framework |
| << " because agent " << slaveId << " is not valid"; |
| |
| continue; |
| } |
| |
| // This could happen if the allocator dispatched 'Master::inverseOffer' |
| // before the slave was deactivated in the allocator. |
| if (!slave->active) { |
| LOG(INFO) |
| << "Master ignoring inverse offers to framework " << *framework |
| << " because agent " << *slave << " is " |
| << (slave->connected ? "deactivated" : "disconnected"); |
| |
| continue; |
| } |
| |
| // This could happen if the allocator dispatched `Master::inverseOffer` |
| // before the unavailability was removed in the master. |
| if (!machines.contains(slave->machineId) || |
| !machines.at(slave->machineId).info.has_unavailability()) { |
| LOG(INFO) |
| << "Master dropping inverse offers to framework " << *framework |
| << " because agent " << *slave << " had its unavailability revoked."; |
| |
| continue; |
| } |
| |
| // TODO(bmahler): Set "https" if only "https" is supported. |
| mesos::URL url; |
| url.set_scheme("http"); |
| url.mutable_address()->set_hostname(slave->info.hostname()); |
| url.mutable_address()->set_ip(stringify(slave->pid.address.ip)); |
| url.mutable_address()->set_port(slave->pid.address.port); |
| url.set_path("/" + slave->pid.id); |
| |
| InverseOffer* inverseOffer = new InverseOffer(); |
| |
| // We use the same id generator as regular offers so that we can |
| // have unique ids across both. This way we can re-use some of the |
| // `OfferID` only messages. |
| inverseOffer->mutable_id()->CopyFrom(newOfferId()); |
| inverseOffer->mutable_framework_id()->CopyFrom(framework->id()); |
| inverseOffer->mutable_slave_id()->CopyFrom(slave->id); |
| inverseOffer->mutable_url()->CopyFrom(url); |
| inverseOffer->mutable_unavailability()->CopyFrom( |
| unavailableResources.unavailability); |
| |
| inverseOffers[inverseOffer->id()] = inverseOffer; |
| |
| framework->addInverseOffer(inverseOffer); |
| slave->addInverseOffer(inverseOffer); |
| |
| // TODO(jmlvanre): Do we want a separate flag for inverse offer |
| // timeout? |
| if (flags.offer_timeout.isSome()) { |
| // Rescind the inverse offer after the timeout elapses. |
| inverseOfferTimers[inverseOffer->id()] = |
| delay(flags.offer_timeout.get(), |
| self(), |
| &Self::inverseOfferTimeout, |
| inverseOffer->id()); |
| } |
| |
| // Add the inverse offer *AND* the corresponding slave's PID. |
| message.add_inverse_offers()->CopyFrom(*inverseOffer); |
| message.add_pids(slave->pid); |
| } |
| |
| if (message.inverse_offers().size() == 0) { |
| return; |
| } |
| |
| vector<OfferID> inverseOfferIds; |
| foreach (const InverseOffer& inverseOffer, message.inverse_offers()) { |
| inverseOfferIds.push_back(inverseOffer.id()); |
| } |
| |
| LOG(INFO) << "Sending inverse offers " << inverseOfferIds << " to framework " |
| << *framework; |
| |
| framework->send(message); |
| } |
| |
| |
| // TODO(vinod): If due to network partition there are two instances |
| // of the framework that think they are leaders and try to |
| // authenticate with master they would be stepping on each other's |
| // toes. Currently it is tricky to detect this case because the |
| // 'authenticate' message doesn't contain the 'FrameworkID'. |
| // 'from' is the authenticatee process with which to communicate. |
| // 'pid' is the framework/slave process being authenticated. |
| void Master::authenticate(const UPID& from, const UPID& pid) |
| { |
| ++metrics->messages_authenticate; |
| |
| // An authentication request is sent by a client (slave/framework) |
| // in the following cases: |
| // |
| // 1. First time the client is connecting. |
| // This is straightforward; just proceed with authentication. |
| // |
| // 2. Client retried because of ZK expiration / authentication timeout. |
| // If the client is already authenticated, it will be removed from |
| // the 'authenticated' map and authentication is retried. |
| // |
| // 3. Client restarted. |
| // 3.1. We are here after receiving 'exited()' from old client. |
| // This is safe because the client will be first marked as |
| // disconnected and then when it reregisters it will be |
| // marked as connected. |
| // |
| // 3.2. We are here before receiving 'exited()' from old client. |
| // This is tricky only if the PID of the client doesn't change |
| // after restart; true for slave but not for framework. |
| // If the PID doesn't change the master might mark the client |
| // disconnected *after* the client reregisters. |
| // This is safe because the client (slave) will be informed |
| // about this discrepancy via ping messages so that it can |
| // reregister. |
| |
| bool erased = authenticated.erase(pid) > 0; |
| |
| if (authenticator.isNone()) { |
| // The default authenticator is CRAM-MD5 rather than none. |
| // Since the default parameters specify CRAM-MD5 authenticator, no |
| // required authentication, and no credentials, we must support |
| // this for starting successfully. |
| // In this case, we must allow non-authenticating frameworks / |
| // slaves to register without authentication, but we will return |
| // an AuthenticationError if they actually try to authenticate. |
| |
| // TODO(tillt): We need to make sure this does not cause retries. |
| // See MESOS-2379. |
| LOG(ERROR) << "Received authentication request from " << pid |
| << " but authenticator is not loaded"; |
| |
| AuthenticationErrorMessage message; |
| message.set_error("No authenticator loaded"); |
| send(from, message); |
| |
| return; |
| } |
| |
| // If a new authentication is occurring for a client that already |
| // has an authentication in progress, we discard the old one |
| // (since the client is no longer interested in it) and |
| // immediately proceed with the new authentication. |
| if (authenticating.contains(pid)) { |
| authenticating.at(pid).discard(); |
| authenticating.erase(pid); |
| |
| LOG(INFO) << "Re-authenticating " << pid << ";" |
| << " discarding outstanding authentication"; |
| } else { |
| LOG(INFO) << "Authenticating " << pid |
| << (erased ? "; clearing previous authentication" : ""); |
| } |
| |
| // Start authentication. |
| const Future<Option<string>> future = authenticator.get()->authenticate(from); |
| |
| // Save our state. |
| authenticating[pid] = future; |
| |
| future.onAny(defer(self(), &Self::_authenticate, pid, future)); |
| |
| // Don't wait for authentication to complete forever. |
| delay(flags.authentication_v0_timeout, |
| self(), |
| &Self::authenticationTimeout, |
| future); |
| } |
| |
| |
| void Master::_authenticate( |
| const UPID& pid, |
| const Future<Option<string>>& future) |
| { |
| // Ignore stale authentication results (if the authentication |
| // future has been overwritten). |
| if (authenticating.get(pid) != future) { |
| LOG(INFO) << "Ignoring stale authentication result of " << pid; |
| return; |
| } |
| |
| if (future.isReady() && future->isSome()) { |
| LOG(INFO) << "Successfully authenticated principal '" << future->get() |
| << "' at " << pid; |
| |
| authenticated.put(pid, future->get()); |
| } else if (future.isReady() && future->isNone()) { |
| LOG(INFO) << "Authentication of " << pid << " was unsuccessful:" |
| << " Invalid credentials"; |
| } else if (future.isFailed()) { |
| LOG(WARNING) << "An error ocurred while attempting to authenticate " << pid |
| << ": " << future.failure(); |
| } else { |
| LOG(INFO) << "Authentication of " << pid << " was discarded"; |
| } |
| |
| authenticating.erase(pid); |
| } |
| |
| |
| void Master::authenticationTimeout(Future<Option<string>> future) |
| { |
| // Note that a 'discard' here is safe even if another |
| // authenticator is in progress because this copy of the future |
| // corresponds to the original authenticator that started the timer. |
| if (future.discard()) { // This is a no-op if the future is already ready. |
| LOG(WARNING) << "Authentication timed out"; |
| } |
| } |
| |
| |
| void Master::reconcileKnownSlave( |
| Slave* slave, |
| const vector<ExecutorInfo>& executors, |
| const vector<Task>& tasks) |
| { |
| CHECK_NOTNULL(slave); |
| |
| // TODO(bmahler): There's an implicit assumption here the slave |
| // cannot have tasks unknown to the master. This _should_ be the |
| // case since the causal relationship is: |
| // slave removes task -> master removes task |
| // Add error logging for any violations of this assumption! |
| |
| // We convert the 'tasks' into a map for easier lookup below. |
| multihashmap<FrameworkID, TaskID> slaveTasks; |
| foreach (const Task& task, tasks) { |
| slaveTasks.put(task.framework_id(), task.task_id()); |
| } |
| |
| // Look for tasks missing in the slave's re-registration message. |
| // This can occur when: |
| // (1) a launch message was dropped (e.g. slave failed over), or |
| // (2) the slave re-registration raced with a launch message, in |
| // which case the slave actually received the task. |
| // To resolve both cases correctly, we must reconcile through the |
| // slave. For slaves that do not support reconciliation, we keep |
| // the old semantics and cover only case (1) via TASK_LOST. |
| Duration pingTimeout = |
| flags.agent_ping_timeout * flags.max_agent_ping_timeouts; |
| MasterSlaveConnection connection; |
| connection.set_total_ping_timeout_seconds(pingTimeout.secs()); |
| |
| SlaveReregisteredMessage reregistered; |
| reregistered.mutable_slave_id()->CopyFrom(slave->id); |
| reregistered.mutable_connection()->CopyFrom(connection); |
| |
| foreachkey (const FrameworkID& frameworkId, slave->tasks) { |
| ReconcileTasksMessage reconcile; |
| |
| foreachvalue (Task* task, slave->tasks[frameworkId]) { |
| if (!slaveTasks.contains(task->framework_id(), task->task_id())) { |
| LOG(WARNING) << "Task " << task->task_id() |
| << " of framework " << task->framework_id() |
| << " unknown to the agent " << *slave |
| << " during re-registration: reconciling with the agent"; |
| |
| // NOTE: The slave doesn't look at the task state when it |
| // reconciles the task. We send the master's view of the |
| // current task state since it might be useful in the future. |
| const TaskState& state = task->has_status_update_state() |
| ? task->status_update_state() |
| : task->state(); |
| |
| TaskStatus* status = reconcile.add_statuses(); |
| status->mutable_task_id()->CopyFrom(task->task_id()); |
| status->mutable_slave_id()->CopyFrom(slave->id); |
| status->set_state(state); |
| status->set_source(TaskStatus::SOURCE_MASTER); |
| status->set_message("Reconciliation request"); |
| status->set_reason(TaskStatus::REASON_RECONCILIATION); |
| status->set_timestamp(Clock::now().secs()); |
| } |
| } |
| |
| if (reconcile.statuses_size() > 0) { |
| // NOTE: This function is only invoked when a slave reregisters |
| // with a master that previously knew about the slave and has |
| // not marked it unreachable. If the master has any tasks for |
| // the agent that are not known to the agent itself, it MUST |
| // have the FrameworkInfo for those tasks. This is because if a |
| // master has a task that the agent doesn't know about, the |
| // framework must have reregistered with this master since the |
| // last master failover. |
| Framework* framework = CHECK_NOTNULL(getFramework(frameworkId)); |
| CHECK(!framework->recovered()); |
| |
| reconcile.mutable_framework_id()->CopyFrom(frameworkId); |
| reconcile.mutable_framework()->CopyFrom(framework->info); |
| |
| reregistered.add_reconciliations()->CopyFrom(reconcile); |
| } |
| } |
| |
| // Re-register the slave. |
| send(slave->pid, reregistered); |
| |
| // Likewise, any executors that are present in the master but |
| // not present in the slave must be removed to correctly account |
| // for resources. First we index the executors for fast lookup below. |
| multihashmap<FrameworkID, ExecutorID> slaveExecutors; |
| foreach (const ExecutorInfo& executor, executors) { |
| // Master validates that `framework_id` is set during task launch. |
| CHECK(executor.has_framework_id()); |
| slaveExecutors.put(executor.framework_id(), executor.executor_id()); |
| } |
| |
| // Now that we have the index for lookup, remove all the executors |
| // in the master that are not known to the slave. |
| // |
| // NOTE: A copy is needed because removeExecutor modifies |
| // slave->executors. |
| foreachkey (const FrameworkID& frameworkId, utils::copy(slave->executors)) { |
| foreachkey (const ExecutorID& executorId, |
| utils::copy(slave->executors[frameworkId])) { |
| if (!slaveExecutors.contains(frameworkId, executorId)) { |
| // TODO(bmahler): Reconcile executors correctly between the |
| // master and the slave, see: |
| // MESOS-1466, MESOS-1800, and MESOS-1720. |
| LOG(WARNING) << "Executor '" << executorId |
| << "' of framework " << frameworkId |
| << " possibly unknown to the agent " << *slave; |
| |
| removeExecutor(slave, frameworkId, executorId); |
| } |
| } |
| } |
| |
| // Send KillTaskMessages for tasks in 'killedTasks' that are |
| // still alive on the slave. This could happen if the slave |
| // did not receive KillTaskMessage because of a partition or |
| // disconnection. |
| foreach (const Task& task, tasks) { |
| if (!protobuf::isTerminalState(task.state()) && |
| slave->killedTasks.contains(task.framework_id(), task.task_id())) { |
| LOG(WARNING) << " Agent " << *slave |
| << " has non-terminal task " << task.task_id() |
| << " that is supposed to be killed. Killing it now!"; |
| |
| KillTaskMessage message; |
| message.mutable_framework_id()->MergeFrom(task.framework_id()); |
| message.mutable_task_id()->MergeFrom(task.task_id()); |
| send(slave->pid, message); |
| } |
| } |
| |
| // Send ShutdownFrameworkMessages for frameworks that are completed. |
| // This could happen if the message wasn't received by the slave |
| // (e.g., slave was down, partitioned). |
| // |
| // NOTE: This is a short-term hack because this information is lost |
| // when the master fails over. Also, we only store a limited number |
| // of completed frameworks. |
| // |
| // TODO(vinod): Revisit this when registrar is in place. It would |
| // likely involve storing this information in the registrar. |
| foreachvalue (const Owned<Framework>& framework, |
| frameworks.completed) { |
| if (slaveTasks.contains(framework->id())) { |
| LOG(WARNING) << "Agent " << *slave |
| << " reregistered with completed framework " << *framework |
| << ". Shutting down the framework on the agent"; |
| |
| ShutdownFrameworkMessage message; |
| message.mutable_framework_id()->MergeFrom(framework->id()); |
| send(slave->pid, message); |
| } |
| } |
| } |
| |
| |
| void Master::addFramework( |
| Framework* framework, |
| const set<string>& suppressedRoles) |
| { |
| CHECK_NOTNULL(framework); |
| |
| CHECK(!frameworks.registered.contains(framework->id())) |
| << "Framework " << *framework << " already exists!"; |
| |
| LOG(INFO) << "Adding framework " << *framework << " with roles " |
| << stringify(suppressedRoles) << " suppressed"; |
| |
| frameworks.registered[framework->id()] = framework; |
| |
| if (framework->connected()) { |
| if (framework->pid.isSome()) { |
| link(framework->pid.get()); |
| } else { |
| CHECK_SOME(framework->http); |
| |
| const HttpConnection& http = framework->http.get(); |
| |
| http.closed() |
| .onAny(defer(self(), &Self::exited, framework->id(), http)); |
| } |
| } |
| |
| // There should be no offered resources yet! |
| CHECK_EQ(Resources(), framework->totalOfferedResources); |
| |
| allocator->addFramework( |
| framework->id(), |
| framework->info, |
| framework->usedResources, |
| framework->active(), |
| suppressedRoles); |
| |
| // Export framework metrics if a principal is specified in `FrameworkInfo`. |
| |
| Option<string> principal = framework->info.has_principal() |
| ? Option<string>(framework->info.principal()) |
| : None(); |
| |
| if (framework->pid.isSome()) { |
| CHECK(!frameworks.principals.contains(framework->pid.get())); |
| frameworks.principals.put(framework->pid.get(), principal); |
| } |
| |
| if (principal.isSome()) { |
| // Create new framework metrics if this framework is the first |
| // one of this principal. Otherwise existing metrics are reused. |
| if (!metrics->frameworks.contains(principal.get())) { |
| metrics->frameworks.put( |
| principal.get(), |
| Owned<Metrics::Frameworks>( |
| new Metrics::Frameworks(principal.get()))); |
| } |
| } |
| } |
| |
| |
| void Master::recoverFramework( |
| const FrameworkInfo& info, |
| const set<string>& suppressedRoles) |
| { |
| CHECK(!frameworks.registered.contains(info.id())); |
| |
| Framework* framework = new Framework(this, flags, info); |
| |
| // Send a `FRAMEWORK_ADDED` event to subscribers before adding recovered tasks |
| // so the framework ID referred by any succeeding `TASK_ADDED` event will be |
| // known to subscribers. |
| if (!subscribers.subscribed.empty()) { |
| subscribers.send(protobuf::master::event::createFrameworkAdded(*framework)); |
| } |
| |
| // Add active operations, tasks, and executors to the framework. |
| foreachvalue (Slave* slave, slaves.registered) { |
| if (slave->tasks.contains(framework->id())) { |
| foreachvalue (Task* task, slave->tasks.at(framework->id())) { |
| framework->addTask(task); |
| } |
| } |
| |
| if (slave->executors.contains(framework->id())) { |
| foreachvalue (const ExecutorInfo& executor, |
| slave->executors.at(framework->id())) { |
| framework->addExecutor(slave->id, executor); |
| } |
| } |
| |
| foreachvalue (Operation* operation, slave->operations) { |
| if (operation->has_framework_id() && |
| operation->framework_id() == framework->id()) { |
| framework->addOperation(operation); |
| } |
| } |
| |
| foreachvalue (const Slave::ResourceProvider& resourceProvider, |
| slave->resourceProviders) { |
| foreachvalue (Operation* operation, resourceProvider.operations) { |
| if (operation->has_framework_id() && |
| operation->framework_id() == framework->id()) { |
| framework->addOperation(operation); |
| } |
| } |
| } |
| } |
| |
| addFramework(framework, suppressedRoles); |
| } |
| |
| |
| Try<Nothing> Master::activateRecoveredFramework( |
| Framework* framework, |
| const FrameworkInfo& frameworkInfo, |
| const Option<UPID>& pid, |
| const Option<HttpConnection>& http, |
| const set<string>& suppressedRoles) |
| { |
| // Exactly one of `pid` or `http` must be provided. |
| CHECK(pid.isSome() != http.isSome()); |
| |
| CHECK_NOTNULL(framework); |
| CHECK(framework->recovered()); |
| CHECK(framework->offers.empty()); |
| CHECK(framework->inverseOffers.empty()); |
| CHECK(framework->pid.isNone()); |
| CHECK(framework->http.isNone()); |
| |
| updateFramework(framework, frameworkInfo, suppressedRoles); |
| |
| // Updating `registeredTime` here is debatable: ideally, |
| // `registeredTime` would be the time at which the framework first |
| // registered with the master. However, we cannot determine this |
| // because the time at which a framework first registered is not |
| // persisted across master failover. |
| framework->registeredTime = Clock::now(); |
| framework->reregisteredTime = Clock::now(); |
| |
| // Update the framework's connection state. |
| if (pid.isSome()) { |
| framework->updateConnection(pid.get()); |
| link(pid.get()); |
| } else { |
| framework->updateConnection(http.get()); |
| http->closed() |
| .onAny(defer(self(), &Self::exited, framework->id(), http.get())); |
| } |
| |
| // Activate the framework. |
| framework->setFrameworkState(Framework::State::ACTIVE); |
| allocator->activateFramework(framework->id()); |
| |
| // Export framework metrics if a principal is specified in `FrameworkInfo`. |
| Option<string> principal = framework->info.has_principal() |
| ? Option<string>(framework->info.principal()) |
| : None(); |
| |
| if (framework->pid.isSome()) { |
| CHECK(!frameworks.principals.contains(framework->pid.get())); |
| frameworks.principals.put(framework->pid.get(), principal); |
| } |
| |
| // We expect the framework metrics for this principal to be created |
| // when the framework is recovered. This implies that the framework |
| // principal cannot change on re-registration, which is currently |
| // the case (MESOS-2842). |
| if (principal.isSome()) { |
| CHECK(metrics->frameworks.contains(principal.get())); |
| } |
| |
| if (pid.isSome()) { |
| // TODO(bmahler): We have to send a registered message here for |
| // the reregistering framework, per the API contract. Send |
| // reregister here per MESOS-786; requires deprecation or it |
| // will break frameworks. |
| FrameworkRegisteredMessage message; |
| message.mutable_framework_id()->MergeFrom(framework->id()); |
| message.mutable_master_info()->MergeFrom(info_); |
| framework->send(message); |
| } else { |
| FrameworkReregisteredMessage message; |
| message.mutable_framework_id()->MergeFrom(framework->id()); |
| message.mutable_master_info()->MergeFrom(info_); |
| framework->send(message); |
| |
| // Start the heartbeat after sending SUBSCRIBED event. |
| framework->heartbeat(); |
| } |
| |
| return Nothing(); |
| } |
| |
| |
| void Master::failoverFramework(Framework* framework, const HttpConnection& http) |
| { |
| CHECK_NOTNULL(framework); |
| |
| // Notify the old connected framework that it has failed over. |
| // This is safe to do even if it is a retry because the framework is expected |
| // to close the old connection (and hence not receive any more responses) |
| // before sending subscription request on a new connection. |
| if (framework->connected()) { |
| FrameworkErrorMessage message; |
| message.set_message("Framework failed over"); |
| framework->send(message); |
| } |
| |
| // If this is an upgrade, clear the authentication related data. |
| if (framework->pid.isSome()) { |
| authenticated.erase(framework->pid.get()); |
| |
| CHECK(frameworks.principals.contains(framework->pid.get())); |
| Option<string> principal = frameworks.principals[framework->pid.get()]; |
| |
| frameworks.principals.erase(framework->pid.get()); |
| } |
| |
| framework->updateConnection(http); |
| |
| http.closed() |
| .onAny(defer(self(), &Self::exited, framework->id(), http)); |
| |
| _failoverFramework(framework); |
| |
| // Start the heartbeat after sending SUBSCRIBED event. |
| framework->heartbeat(); |
| } |
| |
| |
| // Replace the scheduler for a framework with a new process ID, in the |
| // event of a scheduler failover. |
| void Master::failoverFramework(Framework* framework, const UPID& newPid) |
| { |
| CHECK_NOTNULL(framework); |
| |
| const Option<UPID> oldPid = framework->pid; |
| |
| // There are a few failover cases to consider: |
| // 1. The pid has changed or it was previously a HTTP based scheduler. |
| // In these cases we definitely want to send a FrameworkErrorMessage to |
| // shut down the older scheduler. |
| // 2. The pid has not changed. |
| // 2.1 The old scheduler on that pid failed over to a new |
| // instance on the same pid. No need to shut down the old |
| // scheduler as it is necessarily dead. |
| // 2.2 This is a duplicate message. In this case, the scheduler |
| // has not failed over, so we do not want to shut it down. |
| if (oldPid != newPid && framework->connected()) { |
| FrameworkErrorMessage message; |
| message.set_message("Framework failed over"); |
| framework->send(message); |
| } |
| |
| framework->updateConnection(newPid); |
| link(newPid); |
| |
| _failoverFramework(framework); |
| |
| CHECK_SOME(framework->pid); |
| |
| // Update the principal mapping for this framework, which is |
| // needed to keep the per-principal framework metrics accurate. |
| if (oldPid.isSome() && frameworks.principals.contains(oldPid.get())) { |
| frameworks.principals.erase(oldPid.get()); |
| } |
| |
| frameworks.principals[newPid] = authenticated.get(newPid); |
| } |
| |
| |
| void Master::_failoverFramework(Framework* framework) |
| { |
| // Remove the framework's offers (if they weren't removed before). |
| foreach (Offer* offer, utils::copy(framework->offers)) { |
| allocator->recoverResources( |
| offer->framework_id(), offer->slave_id(), offer->resources(), None()); |
| |
| removeOffer(offer); |
| } |
| |
| // Also remove the inverse offers. |
| foreach (InverseOffer* inverseOffer, utils::copy(framework->inverseOffers)) { |
| allocator->updateInverseOffer( |
| inverseOffer->slave_id(), |
| inverseOffer->framework_id(), |
| UnavailableResources{ |
| inverseOffer->resources(), |
| inverseOffer->unavailability()}, |
| None()); |
| |
| removeInverseOffer(inverseOffer); |
| } |
| |
| CHECK(!framework->recovered()); |
| |
| // Reactivate the framework, if needed. |
| // NOTE: We do this after recovering resources (above) so that |
| // the allocator has the correct view of the framework's share. |
| if (!framework->active()) { |
| framework->setFrameworkState(Framework::State::ACTIVE); |
| allocator->activateFramework(framework->id()); |
| } |
| |
| // The scheduler driver safely ignores any duplicate registration |
| // messages, so we don't need to compare the old and new pids here. |
| FrameworkRegisteredMessage message; |
| message.mutable_framework_id()->MergeFrom(framework->id()); |
| message.mutable_master_info()->MergeFrom(info_); |
| framework->send(message); |
| } |
| |
| |
| void Master::teardown(Framework* framework) |
| { |
| CHECK_NOTNULL(framework); |
| |
| LOG(INFO) << "Processing TEARDOWN call for framework " << *framework; |
| |
| ++metrics->messages_unregister_framework; |
| |
| removeFramework(framework); |
| } |
| |
| |
| void Master::removeFramework(Framework* framework) |
| { |
| CHECK_NOTNULL(framework); |
| |
| LOG(INFO) << "Removing framework " << *framework; |
| |
| if (framework->active()) { |
| // Deactivate framework, but don't bother rescinding offers |
| // because the framework is being removed. |
| deactivate(framework, false); |
| } |
| |
| // The framework's offers should have been removed when the |
| // framework was deactivated. |
| CHECK(framework->offers.empty()); |
| CHECK(framework->inverseOffers.empty()); |
| |
| foreachvalue (Slave* slave, slaves.registered) { |
| // Remove the pending tasks from the slave. |
| slave->pendingTasks.erase(framework->id()); |
| |
| // Tell slaves to shutdown the framework. |
| ShutdownFrameworkMessage message; |
| message.mutable_framework_id()->MergeFrom(framework->id()); |
| send(slave->pid, message); |
| } |
| |
| // Remove the pending tasks from the framework. |
| framework->pendingTasks.clear(); |
| |
| // Remove pointers to the framework's tasks in slaves and mark those |
| // tasks as completed. |
| foreachvalue (Task* task, utils::copy(framework->tasks)) { |
| Slave* slave = slaves.registered.get(task->slave_id()); |
| |
| // Since we only find out about tasks when the slave reregisters, |
| // it must be the case that the slave exists! |
| CHECK(slave != nullptr) |
| << "Unknown agent " << task->slave_id() |
| << " for task " << task->task_id(); |
| |
| // The task is implicitly killed, and TASK_KILLED is the closest |
| // state we have by now. We mark the task and remove it, without |
| // sending the update. However, a task may finish during the |
| // executor graceful shutdown period. By marking such task as |
| // killed and moving it to completed, we lose the opportunity to |
| // collect the possible finished status. We tolerate this, |
| // because we expect that if the framework has been asked to shut |
| // down, its user is not interested in results anymore. |
| // |
| // TODO(alex): Consider a more descriptive state, e.g. TASK_ABANDONED. |
| // |
| // TODO(neilc): Marking the task KILLED before it has actually |
| // terminated is misleading. Instead, we should consider leaving |
| // the task in its current state at the master; if/when the agent |
| // shuts down the framework, we should arrange for a terminal |
| // status update to be delivered to the master and update the |
| // state of the task at that time (MESOS-6608). |
| const StatusUpdate& update = protobuf::createStatusUpdate( |
| task->framework_id(), |
| task->slave_id(), |
| task->task_id(), |
| TASK_KILLED, |
| TaskStatus::SOURCE_MASTER, |
| None(), |
| "Framework " + framework->id().value() + " removed", |
| TaskStatus::REASON_FRAMEWORK_REMOVED, |
| (task->has_executor_id() |
| ? Option<ExecutorID>(task->executor_id()) |
| : None())); |
| |
| updateTask(task, update); |
| removeTask(task); |
| } |
| |
| // Mark the framework's unreachable tasks as completed. |
| foreach (const TaskID& taskId, framework->unreachableTasks.keys()) { |
| const Owned<Task>& task = framework->unreachableTasks.at(taskId); |
| |
| // TODO(neilc): Per comment above, using TASK_KILLED here is not |
| // ideal. It would be better to use TASK_UNREACHABLE here and only |
| // transition it to a terminal state when the agent reregisters |
| // and the task is shutdown (MESOS-6608). |
| const StatusUpdate& update = protobuf::createStatusUpdate( |
| task->framework_id(), |
| task->slave_id(), |
| task->task_id(), |
| TASK_KILLED, |
| TaskStatus::SOURCE_MASTER, |
| None(), |
| "Framework " + framework->id().value() + " removed", |
| TaskStatus::REASON_FRAMEWORK_REMOVED, |
| (task->has_executor_id() |
| ? Option<ExecutorID>(task->executor_id()) |
| : None())); |
| |
| updateTask(task.get(), update); |
| |
| // We don't need to remove the task from the slave, because the |
| // task was removed when the agent was marked unreachable. |
| CHECK(!slaves.registered.contains(task->slave_id())) |
| << "Unreachable task " << task->task_id() |
| << " of framework " << task->framework_id() |
| << " was found on registered agent " << task->slave_id(); |
| |
| // Move task from unreachable map to completed map. |
| framework->addCompletedTask(std::move(*task)); |
| framework->unreachableTasks.erase(taskId); |
| } |
| |
| // Remove the framework's executors for correct resource accounting. |
| foreachkey (const SlaveID& slaveId, utils::copy(framework->executors)) { |
| Slave* slave = slaves.registered.get(slaveId); |
| |
| if (slave != nullptr) { |
| foreachkey (const ExecutorID& executorId, |
| utils::copy(framework->executors[slaveId])) { |
| removeExecutor(slave, framework->id(), executorId); |
| } |
| } |
| } |
| |
| foreachvalue (Operation* operation, utils::copy(framework->operations)) { |
| framework->removeOperation(operation); |
| } |
| |
| // TODO(benh): Similar code between removeFramework and |
| // failoverFramework needs to be shared! |
| |
| // TODO(benh): unlink(framework->pid); |
| |
| // For http frameworks, close the connection. |
| if (framework->http.isSome()) { |
| framework->http->close(); |
| } |
| |
| framework->unregisteredTime = Clock::now(); |
| |
| foreach (const string& role, framework->roles) { |
| framework->untrackUnderRole(role); |
| } |
| |
| // TODO(anand): This only works for pid based frameworks. We would |
| // need similar authentication logic for http frameworks. |
| if (framework->pid.isSome()) { |
| authenticated.erase(framework->pid.get()); |
| |
| CHECK(frameworks.principals.contains(framework->pid.get())); |
| Option<string> principal = frameworks.principals[framework->pid.get()]; |
| |
| frameworks.principals.erase(framework->pid.get()); |
| |
| // Remove the metrics for the principal if this framework is the |
| // last one with this principal. |
| if (principal.isSome() && |
| !frameworks.principals.containsValue(principal.get())) { |
| CHECK(metrics->frameworks.contains(principal.get())); |
| metrics->frameworks.erase(principal.get()); |
| } |
| } |
| |
| // Remove the framework. |
| frameworks.registered.erase(framework->id()); |
| allocator->removeFramework(framework->id()); |
| |
| // The framework pointer is now owned by `frameworks.completed`. |
| frameworks.completed.set(framework->id(), Owned<Framework>(framework)); |
| |
| if (!subscribers.subscribed.empty()) { |
| subscribers.send( |
| protobuf::master::event::createFrameworkRemoved(framework->info)); |
| } |
| } |
| |
| |
| void Master::removeFramework(Slave* slave, Framework* framework) |
| { |
| CHECK_NOTNULL(slave); |
| CHECK_NOTNULL(framework); |
| |
| LOG(INFO) << "Removing framework " << *framework |
| << " from agent " << *slave; |
| |
| // Remove pointers to framework's tasks in slaves, and send status |
| // updates. |
| // NOTE: A copy is needed because removeTask modifies slave->tasks. |
| foreachvalue (Task* task, utils::copy(slave->tasks[framework->id()])) { |
| // Remove tasks that belong to this framework. |
| if (task->framework_id() == framework->id()) { |
| // A framework might not actually exist because the master failed |
| // over and the framework hasn't reconnected yet. For more info |
| // please see the comments in 'removeFramework(Framework*)'. |
| const StatusUpdate& update = protobuf::createStatusUpdate( |
| task->framework_id(), |
| task->slave_id(), |
| task->task_id(), |
| TASK_LOST, |
| TaskStatus::SOURCE_MASTER, |
| None(), |
| "Agent " + slave->info.hostname() + " disconnected", |
| TaskStatus::REASON_SLAVE_DISCONNECTED, |
| (task->has_executor_id() |
| ? Option<ExecutorID>(task->executor_id()) : None())); |
| |
| updateTask(task, update); |
| removeTask(task); |
| |
| if (framework->connected()) { |
| forward(update, UPID(), framework); |
| } |
| } |
| } |
| |
| // Remove the framework's executors from the slave and framework |
| // for proper resource accounting. |
| if (slave->executors.contains(framework->id())) { |
| foreachkey (const ExecutorID& executorId, |
| utils::copy(slave->executors[framework->id()])) { |
| removeExecutor(slave, framework->id(), executorId); |
| } |
| } |
| } |
| |
| |
| void Master::addSlave( |
| Slave* slave, |
| vector<Archive::Framework>&& completedFrameworks) |
| { |
| CHECK_NOTNULL(slave); |
| CHECK(!slaves.registered.contains(slave->id)); |
| CHECK(!slaves.unreachable.contains(slave->id)); |
| CHECK(slaves.removed.get(slave->id).isNone()); |
| |
| slaves.registered.put(slave); |
| |
| link(slave->pid); |
| |
| // Map the slave to the machine it is running on. |
| CHECK(!machines[slave->machineId].slaves.contains(slave->id)); |
| machines[slave->machineId].slaves.insert(slave->id); |
| |
| // Set up an observer for the slave. |
| slave->observer = new SlaveObserver( |
| slave->pid, |
| slave->info, |
| slave->id, |
| self(), |
| slaves.limiter, |
| metrics, |
| flags.agent_ping_timeout, |
| flags.max_agent_ping_timeouts); |
| |
| spawn(slave->observer); |
| |
| // Add the slave's executors to the frameworks. |
| foreachkey (const FrameworkID& frameworkId, slave->executors) { |
| Framework* framework = getFramework(frameworkId); |
| |
| // If the framework has not reregistered yet and this is the |
| // first agent to reregister that is running the framework, we |
| // skip adding the framework's executors here. Instead, the |
| // framework will be recovered in `__reregisterSlave` and its |
| // executors will be added by `recoverFramework`. |
| if (framework == nullptr) { |
| continue; |
| } |
| |
| foreachvalue (const ExecutorInfo& executorInfo, |
| slave->executors[frameworkId]) { |
| framework->addExecutor(slave->id, executorInfo); |
| } |
| } |
| |
| // Add the slave's tasks to the frameworks. |
| foreachkey (const FrameworkID& frameworkId, slave->tasks) { |
| Framework* framework = getFramework(frameworkId); |
| |
| // If the framework has not reregistered yet and this is the |
| // first agent to reregister that is running the framework, we |
| // skip adding the framework's tasks here. Instead, the framework |
| // will be recovered in `__reregisterSlave` and its tasks will be |
| // added by `recoverFramework`. |
| if (framework == nullptr) { |
| continue; |
| } |
| |
| foreachvalue (Task* task, slave->tasks[frameworkId]) { |
| framework->addTask(task); |
| } |
| } |
| |
| // Re-add completed tasks reported by the slave. |
| // |
| // Note that a slave considers a framework completed when it has no |
| // tasks/executors running for that framework. But a master |
| // considers a framework completed when the framework is removed |
| // after a failover timeout. |
| // |
| // TODO(vinod): Reconcile the notion of a completed framework across |
| // the master and slave. |
| foreach (Archive::Framework& completedFramework, completedFrameworks) { |
| Framework* framework = getFramework( |
| completedFramework.framework_info().id()); |
| |
| foreach (Task& task, *completedFramework.mutable_tasks()) { |
| if (framework != nullptr) { |
| VLOG(2) << "Re-adding completed task " << task.task_id() |
| << " of framework " << *framework |
| << " that ran on agent " << *slave; |
| |
| framework->addCompletedTask(std::move(task)); |
| } else { |
| // The framework might not be reregistered yet. |
| // |
| // TODO(vinod): Revisit these semantics when we store frameworks' |
| // information in the registrar. |
| LOG(WARNING) << "Possibly orphaned completed task " << task.task_id() |
| << " of framework " << task.framework_id() |
| << " that ran on agent " << *slave; |
| } |
| } |
| } |
| |
| CHECK(machines.contains(slave->machineId)); |
| |
| // Only set unavailability if the protobuf has one set. |
| Option<Unavailability> unavailability = None(); |
| if (machines[slave->machineId].info.has_unavailability()) { |
| unavailability = machines[slave->machineId].info.unavailability(); |
| } |
| |
| allocator->addSlave( |
| slave->id, |
| slave->info, |
| google::protobuf::convert(slave->capabilities.toRepeatedPtrField()), |
| unavailability, |
| slave->totalResources, |
| slave->usedResources); |
| |
| if (!subscribers.subscribed.empty()) { |
| subscribers.send(protobuf::master::event::createAgentAdded(*slave)); |
| } |
| } |
| |
| |
| void Master::removeSlave( |
| Slave* slave, |
| const string& message, |
| Option<Counter> reason) |
| { |
| CHECK_NOTNULL(slave); |
| |
| // It would be better to remove the slave here instead of continuing |
| // to mark it unreachable, but probably not worth the complexity. |
| if (slaves.markingUnreachable.contains(slave->id)) { |
| LOG(WARNING) << "Ignoring removal of agent " << *slave |
| << " that is in the process of being marked unreachable"; |
| |
| return; |
| } |
| |
| if (slaves.markingGone.contains(slave->id)) { |
| LOG(WARNING) << "Ignoring removal of agent " << *slave |
| << " that is in the process of being marked gone"; |
| |
| return; |
| } |
| |
| // This should not be possible, but we protect against it anyway for |
| // the sake of paranoia. |
| if (slaves.removing.contains(slave->id)) { |
| LOG(WARNING) << "Ignoring removal of agent " << *slave |
| << " that is in the process of being removed"; |
| |
| return; |
| } |
| |
| slaves.removing.insert(slave->id); |
| |
| LOG(INFO) << "Removing agent " << *slave << ": " << message; |
| |
| // Remove this slave from the registrar. Note that we update the |
| // registry BEFORE we update the master's in-memory state; this |
| // means that until the registry operation has completed, the slave |
| // is not considered to be removed (so we might offer its resources |
| // to frameworks, etc.). Ensuring that the registry update succeeds |
| // before we modify in-memory state ensures that external clients |
| // see consistent behavior if the master fails over. |
| registrar->apply(Owned<RegistryOperation>(new RemoveSlave(slave->info))) |
| .onAny(defer(self(), |
| &Self::_removeSlave, |
| slave, |
| lambda::_1, |
| message, |
| reason)); |
| } |
| |
| |
| void Master::_removeSlave( |
| Slave* slave, |
| const Future<bool>& registrarResult, |
| const string& removalCause, |
| Option<Counter> reason) |
| { |
| CHECK_NOTNULL(slave); |
| CHECK(slaves.removing.contains(slave->info.id())); |
| slaves.removing.erase(slave->info.id()); |
| |
| CHECK(!registrarResult.isDiscarded()); |
| |
| if (registrarResult.isFailed()) { |
| LOG(FATAL) << "Failed to remove agent " << *slave |
| << " from the registrar: " << registrarResult.failure(); |
| } |
| |
| // Should not happen: the master will only try to remove agents that |
| // are currently admitted. |
| CHECK(registrarResult.get()) |
| << "Agent " << *slave |
| << "already removed from the registrar"; |
| |
| LOG(INFO) << "Removed agent " << *slave << ": " << removalCause; |
| |
| ++metrics->slave_removals; |
| if (reason.isSome()) { |
| ++utils::copy(reason.get()); // Remove const. |
| } |
| |
| // We want to remove the slave first, to avoid the allocator |
| // re-allocating the recovered resources. |
| // |
| // NOTE: Removing the slave is not sufficient for recovering the |
| // resources in the allocator, because the "Sorters" are updated |
| // only within recoverResources() (see MESOS-621). The calls to |
| // recoverResources() below are therefore required, even though |
| // the slave is already removed. |
| allocator->removeSlave(slave->id); |
| |
| // Transition the tasks to lost and remove them. |
| foreachkey (const FrameworkID& frameworkId, utils::copy(slave->tasks)) { |
| Framework* framework = getFramework(frameworkId); |
| |
| foreachvalue (Task* task, utils::copy(slave->tasks[frameworkId])) { |
| // TODO(bmahler): Differentiate between agent removal reasons |
| // (e.g. unhealthy vs. unregistered for maintenance). |
| const StatusUpdate& update = protobuf::createStatusUpdate( |
| task->framework_id(), |
| task->slave_id(), |
| task->task_id(), |
| TASK_LOST, |
| TaskStatus::SOURCE_MASTER, |
| None(), |
| "Agent " + slave->info.hostname() + " removed: " + removalCause, |
| TaskStatus::REASON_SLAVE_REMOVED, |
| (task->has_executor_id() ? |
| Option<ExecutorID>(task->executor_id()) : None())); |
| |
| updateTask(task, update); |
| removeTask(task); |
| |
| if (framework == nullptr || !framework->connected()) { |
| LOG(WARNING) << "Dropping update " << update |
| << " for unknown framework " << frameworkId; |
| } else { |
| forward(update, UPID(), framework); |
| } |
| } |
| } |
| |
| // Remove executors from the slave for proper resource accounting. |
| foreachkey (const FrameworkID& frameworkId, utils::copy(slave->executors)) { |
| foreachkey (const ExecutorID& executorId, |
| utils::copy(slave->executors[frameworkId])) { |
| removeExecutor(slave, frameworkId, executorId); |
| } |
| } |
| |
| foreach (Offer* offer, utils::copy(slave->offers)) { |
| // TODO(vinod): We don't need to call 'Allocator::recoverResources' |
| // once MESOS-621 is fixed. |
| allocator->recoverResources( |
| offer->framework_id(), slave->id, offer->resources(), None()); |
| |
| // Remove and rescind offers. |
| removeOffer(offer, true); // Rescind! |
| } |
| |
| // Remove inverse offers because sending them for a slave that is |
| // gone doesn't make sense. |
| foreach (InverseOffer* inverseOffer, utils::copy(slave->inverseOffers)) { |
| // We don't need to update the allocator because we've already called |
| // `RemoveSlave()`. |
| // Remove and rescind inverse offers. |
| removeInverseOffer(inverseOffer, true); // Rescind! |
| } |
| |
| // Remove the pending tasks from the slave. |
| slave->pendingTasks.clear(); |
| |
| // Mark the slave as being removed. |
| slaves.registered.remove(slave); |
| slaves.removed.put(slave->id, Nothing()); |
| authenticated.erase(slave->pid); |
| |
| // Remove the slave from the `machines` mapping. |
| CHECK(machines.contains(slave->machineId)); |
| CHECK(machines[slave->machineId].slaves.contains(slave->id)); |
| machines[slave->machineId].slaves.erase(slave->id); |
| |
| // Kill the slave observer. |
| terminate(slave->observer); |
| wait(slave->observer); |
| delete slave->observer; |
| |
| // TODO(benh): unlink(slave->pid); |
| |
| sendSlaveLost(slave->info); |
| |
| if (!subscribers.subscribed.empty()) { |
| subscribers.send(protobuf::master::event::createAgentRemoved(slave->id)); |
| } |
| |
| delete slave; |
| } |
| |
| |
| void Master::__removeSlave( |
| Slave* slave, |
| const string& message, |
| const Option<TimeInfo>& unreachableTime) |
| { |
| // We want to remove the slave first, to avoid the allocator |
| // re-allocating the recovered resources. |
| // |
| // NOTE: Removing the slave is not sufficient for recovering the |
| // resources in the allocator, because the "Sorters" are updated |
| // only within recoverResources() (see MESOS-621). The calls to |
| // recoverResources() below are therefore required, even though |
| // the slave is already removed. |
| allocator->removeSlave(slave->id); |
| |
| // Transition tasks to TASK_UNREACHABLE/TASK_GONE_BY_OPERATOR/TASK_LOST |
| // and remove them. We only use TASK_UNREACHABLE/TASK_GONE_BY_OPERATOR if |
| // the framework has opted in to the PARTITION_AWARE capability. |
| foreachkey (const FrameworkID& frameworkId, utils::copy(slave->tasks)) { |
| Framework* framework = getFramework(frameworkId); |
| CHECK_NOTNULL(framework); |
| |
| TaskState newTaskState = TASK_UNREACHABLE; |
| TaskStatus::Reason newTaskReason = TaskStatus::REASON_SLAVE_REMOVED; |
| |
| // Needed to convey task unreachability because we lose this |
| // information from the task state if `TASK_LOST` is used. |
| bool unreachable = true; |
| |
| if (!framework->capabilities.partitionAware) { |
| newTaskState = TASK_LOST; |
| } else if (unreachableTime.isNone()) { |
| unreachable = false; |
| newTaskState = TASK_GONE_BY_OPERATOR; |
| newTaskReason = TaskStatus::REASON_SLAVE_REMOVED_BY_OPERATOR; |
| } |
| |
| foreachvalue (Task* task, utils::copy(slave->tasks[frameworkId])) { |
| const StatusUpdate& update = protobuf::createStatusUpdate( |
| task->framework_id(), |
| task->slave_id(), |
| task->task_id(), |
| newTaskState, |
| TaskStatus::SOURCE_MASTER, |
| None(), |
| message, |
| newTaskReason, |
| (task->has_executor_id() ? |
| Option<ExecutorID>(task->executor_id()) : None()), |
| None(), |
| None(), |
| None(), |
| None(), |
| unreachableTime.isSome() ? unreachableTime : None()); |
| |
| updateTask(task, update); |
| removeTask(task, unreachable); |
| |
| if (!framework->connected()) { |
| LOG(WARNING) << "Dropping update " << update |
| << " for disconnected " |
| << " framework " << frameworkId; |
| } else { |
| forward(update, UPID(), framework); |
| } |
| } |
| } |
| |
| // Remove executors from the slave for proper resource accounting. |
| foreachkey (const FrameworkID& frameworkId, utils::copy(slave->executors)) { |
| foreachkey (const ExecutorID& executorId, |
| utils::copy(slave->executors[frameworkId])) { |
| removeExecutor(slave, frameworkId, executorId); |
| } |
| } |
| |
| foreach (Offer* offer, utils::copy(slave->offers)) { |
| // TODO(vinod): We don't need to call 'Allocator::recoverResources' |
| // once MESOS-621 is fixed. |
| allocator->recoverResources( |
| offer->framework_id(), slave->id, offer->resources(), None()); |
| |
| // Remove and rescind offers. |
| removeOffer(offer, true); // Rescind! |
| } |
| |
| // Remove inverse offers because sending them for a slave that is |
| // unreachable doesn't make sense. |
| foreach (InverseOffer* inverseOffer, utils::copy(slave->inverseOffers)) { |
| // We don't need to update the allocator because we've already called |
| // `RemoveSlave()`. |
| // Remove and rescind inverse offers. |
| removeInverseOffer(inverseOffer, true); // Rescind! |
| } |
| |
| // Mark the slave as being removed. |
| slaves.registered.remove(slave); |
| slaves.removed.put(slave->id, Nothing()); |
| authenticated.erase(slave->pid); |
| |
| // Remove the slave from the `machines` mapping. |
| CHECK(machines.contains(slave->machineId)); |
| CHECK(machines[slave->machineId].slaves.contains(slave->id)); |
| machines[slave->machineId].slaves.erase(slave->id); |
| |
| // Kill the slave observer. |
| terminate(slave->observer); |
| wait(slave->observer); |
| delete slave->observer; |
| |
| // TODO(benh): unlink(slave->pid); |
| |
| // TODO(bmahler): Tell partition aware frameworks that the |
| // agent is unreachable rather than lost, if applicable. |
| // This requires a new capability. |
| sendSlaveLost(slave->info); |
| |
| delete slave; |
| } |
| |
| |
| void Master::updateTask(Task* task, const StatusUpdate& update) |
| { |
| CHECK_NOTNULL(task); |
| |
| // Get the unacknowledged status. |
| const TaskStatus& status = update.status(); |
| |
| // NOTE: Refer to comments on `StatusUpdate` message in messages.proto for |
| // the difference between `update.latest_state()` and `status.state()`. |
| |
| // Updates from the slave have 'latest_state' set. |
| Option<TaskState> latestState; |
| if (update.has_latest_state()) { |
| latestState = update.latest_state(); |
| } |
| |
| const TaskState updateState = latestState.getOrElse(status.state()); |
| |
| // Determine whether the task transitioned to terminal or |
| // unreachable prior to changing the task state. |
| auto isTerminalOrUnreachableState = [](const TaskState& state) { |
| return protobuf::isTerminalState(state) || state == TASK_UNREACHABLE; |
| }; |
| |
| bool transitionedToTerminalOrUnreachable = |
| !isTerminalOrUnreachableState(task->state()) && |
| isTerminalOrUnreachableState(updateState); |
| |
| // Indicates whether we should send a notification to subscribers, |
| // set if the task transitioned to a new state. |
| bool sendSubscribersUpdate = false; |
| |
| Framework* framework = getFramework(task->framework_id()); |
| |
| // If the task has already transitioned to a terminal state, |
| // do not update its state. Note that we are being defensive |
| // here because this should not happen unless there is a bug |
| // in the master code. |
| // |
| // TODO(bmahler): Check that we're not transitioning from |
| // TASK_UNREACHABLE to another state. |
| if (!protobuf::isTerminalState(task->state())) { |
| if (task->state() != updateState && framework != nullptr) { |
| // When we observe a transition away from a non-terminal state, |
| // decrement the relevant metric. |
| framework->metrics.decrementActiveTaskState(task->state()); |
| |
| framework->metrics.incrementTaskState(updateState); |
| } |
| |
| task->set_state(updateState); |
| } |
| |
| // If this is a (health) check status update, always forward it to |
| // subscribers. |
| if (status.reason() == TaskStatus::REASON_TASK_CHECK_STATUS_UPDATED || |
| status.reason() == TaskStatus::REASON_TASK_HEALTH_CHECK_STATUS_UPDATED) { |
| sendSubscribersUpdate = true; |
| } |
| |
| // TODO(brenden): Consider wiping the `message` field? |
| if (task->statuses_size() > 0 && |
| task->statuses(task->statuses_size() - 1).state() == status.state()) { |
| task->mutable_statuses()->RemoveLast(); |
| } else { |
| // Send a `TASK_UPDATED` event for every new task state. |
| sendSubscribersUpdate = true; |
| } |
| task->add_statuses()->CopyFrom(status); |
| |
| // Delete data (maybe very large since it's stored by on-top framework) we |
| // are not interested in to avoid OOM. |
| // For example: mesos-master is running on a machine with 4GB free memory, |
| // if every task stores 10MB data into TaskStatus, then mesos-master will be |
| // killed by OOM killer after 400 tasks have finished. |
| // MESOS-1746. |
| task->mutable_statuses(task->statuses_size() - 1)->clear_data(); |
| |
| if (sendSubscribersUpdate && !subscribers.subscribed.empty()) { |
| // If the framework has been removed, the task would have already |
| // transitioned to `TASK_KILLED` by `removeFramework()`, thus |
| // `sendSubscribersUpdate` shouldn't have been set to true. |
| // TODO(chhsiao): This may be changed after MESOS-6608 is resolved. |
| CHECK_NOTNULL(framework); |
| |
| subscribers.send( |
| protobuf::master::event::createTaskUpdated( |
| *task, task->state(), status), |
| framework->info, |
| *task); |
| } |
| |
| LOG(INFO) << "Updating the state of task " << task->task_id() |
| << " of framework " << task->framework_id() |
| << " (latest state: " << task->state() |
| << ", status update state: " << status.state() << ")"; |
| |
| // Once the task transitioned to terminal or unreachable, |
| // recover the resources. |
| if (transitionedToTerminalOrUnreachable) { |
| allocator->recoverResources( |
| task->framework_id(), |
| task->slave_id(), |
| task->resources(), |
| None()); |
| |
| // The slave owns the Task object and cannot be nullptr. |
| Slave* slave = slaves.registered.get(task->slave_id()); |
| CHECK_NOTNULL(slave); |
| |
| slave->recoverResources(task); |
| |
| if (framework != nullptr) { |
| framework->recoverResources(task); |
| } |
| |
| switch (status.state()) { |
| case TASK_FINISHED: ++metrics->tasks_finished; break; |
| case TASK_FAILED: ++metrics->tasks_failed; break; |
| case TASK_KILLED: ++metrics->tasks_killed; break; |
| case TASK_LOST: ++metrics->tasks_lost; break; |
| case TASK_ERROR: ++metrics->tasks_error; break; |
| case TASK_DROPPED: ++metrics->tasks_dropped; break; |
| case TASK_GONE: ++metrics->tasks_gone; break; |
| case TASK_GONE_BY_OPERATOR: ++metrics->tasks_gone_by_operator; break; |
| |
| // The following are non-terminal and use gauge based metrics. |
| case TASK_STARTING: break; |
| case TASK_STAGING: break; |
| case TASK_RUNNING: break; |
| case TASK_KILLING: break; |
| case TASK_UNREACHABLE: break; |
| |
| // Should not happen. |
| case TASK_UNKNOWN: |
| LOG(FATAL) << "Unexpected TASK_UNKNOWN for in-memory task"; |
| break; |
| } |
| |
| if (status.has_reason()) { |
| metrics->incrementTasksStates( |
| status.state(), |
| status.source(), |
| status.reason()); |
| } |
| } |
| } |
| |
| |
| void Master::removeTask(Task* task, bool unreachable) |
| { |
| CHECK_NOTNULL(task); |
| |
| // The slave owns the Task object and cannot be nullptr. |
| Slave* slave = slaves.registered.get(task->slave_id()); |
| CHECK_NOTNULL(slave); |
| |
| // Note that we explicitly convert from protobuf to `Resources` here |
| // and then use the result below to avoid performance penalty for multiple |
| // conversions and validations implied by conversion. |
| // Conversion is safe, as resources have already passed validation. |
| const Resources resources = task->resources(); |
| |
| // The invariant here is that the master will recover the resources |
| // prior to removing terminal or unreachable tasks. If the task is |
| // not terminal or unreachable, we must recover the resources here. |
| // |
| // TODO(bmahler): Currently, only `Master::finalize()` will call |
| // `removeTask()` with a non-terminal task. Consider fixing this |
| // and instead CHECKing here to simplify the logic. |
| if (!protobuf::isTerminalState(task->state()) && |
| task->state() != TASK_UNREACHABLE) { |
| CHECK(!unreachable) << task->task_id(); |
| |
| // Note that we use `Resources` for output as it's faster than |
| // logging raw protobuf data. |
| LOG(WARNING) << "Removing task " << task->task_id() |
| << " with resources " << resources |
| << " of framework " << task->framework_id() |
| << " on agent " << *slave |
| << " in non-terminal state " << task->state(); |
| |
| allocator->recoverResources( |
| task->framework_id(), |
| task->slave_id(), |
| resources, |
| None()); |
| } else { |
| // Note that we use `Resources` for output as it's faster than |
| // logging raw protobuf data. |
| LOG(INFO) << "Removing task " << task->task_id() |
| << " with resources " << resources |
| << " of framework " << task->framework_id() |
| << " on agent " << *slave; |
| } |
| |
| if (unreachable) { |
| slaves.unreachableTasks[slave->id][task->framework_id()] |
| .push_back(task->task_id()); |
| } |
| |
| // Remove from framework. |
| Framework* framework = getFramework(task->framework_id()); |
| if (framework != nullptr) { // A framework might not be reregistered yet. |
| framework->removeTask(task, unreachable); |
| } |
| |
| // Remove from slave. |
| slave->removeTask(task); |
| |
| delete task; |
| } |
| |
| |
| void Master::removeExecutor( |
| Slave* slave, |
| const FrameworkID& frameworkId, |
| const ExecutorID& executorId) |
| { |
| CHECK_NOTNULL(slave); |
| CHECK(slave->hasExecutor(frameworkId, executorId)); |
| |
| ExecutorInfo executor = slave->executors[frameworkId][executorId]; |
| |
| LOG(INFO) << "Removing executor '" << executorId |
| << "' with resources " << executor.resources() |
| << " of framework " << frameworkId << " on agent " << *slave; |
| |
| allocator->recoverResources( |
| frameworkId, slave->id, executor.resources(), None()); |
| |
| Framework* framework = getFramework(frameworkId); |
| if (framework != nullptr) { // The framework might not be reregistered yet. |
| framework->removeExecutor(slave->id, executorId); |
| } |
| |
| slave->removeExecutor(frameworkId, executorId); |
| } |
| |
| |
| void Master::addOperation( |
| Framework* framework, |
| Slave* slave, |
| Operation* operation) |
| { |
| CHECK_NOTNULL(operation); |
| CHECK_NOTNULL(slave); |
| |
| slave->addOperation(operation); |
| |
| if (framework != nullptr) { |
| framework->addOperation(operation); |
| } |
| } |
| |
| |
| void Master::updateOperation( |
| Operation* operation, |
| const UpdateOperationStatusMessage& update, |
| bool convertResources) |
| { |
| CHECK_NOTNULL(operation); |
| |
| const OperationStatus& status = |
| update.has_latest_status() ? update.latest_status() : update.status(); |
| |
| LOG(INFO) << "Updating the state of operation '" << operation->info().id() |
| << "' (uuid: " << update.operation_uuid() << ") for" |
| << (operation->has_framework_id() |
| ? " framework " + stringify(operation->framework_id()) |
| : " an operator API call") |
| << " (latest state: " << operation->latest_status().state() |
| << ", status update state: " << status.state() << ")"; |
| |
| // Whether the operation has just become terminated. |
| const bool terminated = |
| !protobuf::isTerminalState(operation->latest_status().state()) && |
| protobuf::isTerminalState(status.state()); |
| |
| // If the operation has already transitioned to a terminal state, |
| // do not update its state. |
| if (!protobuf::isTerminalState(operation->latest_status().state())) { |
| operation->mutable_latest_status()->CopyFrom(status); |
| } |
| |
| // TODO(gkleiman): Revisit the de-duplication logic (MESOS-8441) - if two |
| // different terminal statuses arrive, we could end up with different states |
| // in `latest_status` and the front of statuses list. |
| if (operation->statuses().empty() || |
| *(operation->statuses().rbegin()) != status) { |
| operation->add_statuses()->CopyFrom(status); |
| } |
| |
| if (!terminated) { |
| return; |
| } |
| |
| // Update resource accounting in the master and in the allocator. |
| // NOTE: For the "old" operations (RESERVE, UNRESERVE, CREATE, |
| // DESTROY), the master speculatively assumes that the operation |
| // will be successful when it accepts the operations. Therefore, we |
| // don't need to update the resource accounting for those types of |
| // operations in the master and in the allocator states upon |
| // receiving a terminal status update. |
| if (protobuf::isSpeculativeOperation(operation->info())) { |
| return; |
| } |
| |
| // We currently do not support non-speculated operations not |
| // triggered by a framework (e.g., over the operator API). |
| CHECK(operation->has_framework_id()); |
| |
| Try<Resources> consumed = protobuf::getConsumedResources(operation->info()); |
| CHECK_SOME(consumed); |
| |
| CHECK(operation->has_slave_id()) |
| << "External resource provider is not supported yet"; |
| |
| // The slave owns the Operation object and cannot be nullptr. |
| // TODO(jieyu): Revisit this once we introduce support for external |
| // resource provider. |
| Slave* slave = slaves.registered.get(operation->slave_id()); |
| CHECK_NOTNULL(slave); |
| |
| switch (operation->latest_status().state()) { |
| // Terminal state, and the conversion is successful. |
| case OPERATION_FINISHED: { |
| const Resources converted = |
| operation->latest_status().converted_resources(); |
| |
| if (convertResources) { |
| allocator->updateAllocation( |
| operation->framework_id(), |
| operation->slave_id(), |
| consumed.get(), |
| {ResourceConversion(consumed.get(), converted)}); |
| |
| allocator->recoverResources( |
| operation->framework_id(), |
| operation->slave_id(), |
| converted, |
| None()); |
| |
| Resources consumedUnallocated = consumed.get(); |
| consumedUnallocated.unallocate(); |
| |
| Resources convertedUnallocated = converted; |
| convertedUnallocated.unallocate(); |
| |
| slave->apply( |
| {ResourceConversion(consumedUnallocated, convertedUnallocated)}); |
| } else { |
| allocator->recoverResources( |
| operation->framework_id(), |
| operation->slave_id(), |
| consumed.get(), |
| None()); |
| } |
| |
| break; |
| } |
| |
| // Terminal state, and the conversion has failed. |
| case OPERATION_FAILED: |
| case OPERATION_ERROR: |
| case OPERATION_DROPPED: { |
| allocator->recoverResources( |
| operation->framework_id(), |
| operation->slave_id(), |
| consumed.get(), |
| None()); |
| |
| break; |
| } |
| |
| // Non-terminal or not expected from an agent. This shouldn't happen. |
| case OPERATION_UNSUPPORTED: |
| case OPERATION_PENDING: |
| case OPERATION_UNREACHABLE: |
| case OPERATION_GONE_BY_OPERATOR: |
| case OPERATION_RECOVERING: |
| case OPERATION_UNKNOWN: { |
| LOG(FATAL) << "Unexpected operation state " |
| << operation->latest_status().state(); |
| |
| break; |
| } |
| } |
| |
| slave->recoverResources(operation); |
| |
| Framework* framework = getFramework(operation->framework_id()); |
| |
| if (framework != nullptr) { |
| framework->recoverResources(operation); |
| } |
| } |
| |
| |
| void Master::removeOperation(Operation* operation) |
| { |
| CHECK_NOTNULL(operation); |
| |
| // Remove from framework. |
| Framework* framework = operation->has_framework_id() |
| ? getFramework(operation->framework_id()) |
| : nullptr; |
| |
| if (framework != nullptr) { |
| framework->removeOperation(operation); |
| } |
| |
| // Remove from slave. |
| CHECK(operation->has_slave_id()) |
| << "External resource provider is not supported yet"; |
| |
| Slave* slave = slaves.registered.get(operation->slave_id()); |
| CHECK_NOTNULL(slave); |
| |
| slave->removeOperation(operation); |
| |
| // If the operation was not speculated and is not terminal we |
| // need to also recover its used resources in the allocator. |
| if (!protobuf::isSpeculativeOperation(operation->info()) && |
| !protobuf::isTerminalState(operation->latest_status().state())) { |
| Try<Resources> consumed = protobuf::getConsumedResources(operation->info()); |
| CHECK_SOME(consumed); |
| |
| allocator->recoverResources( |
| operation->framework_id(), |
| operation->slave_id(), |
| consumed.get(), |
| None()); |
| } |
| |
| delete operation; |
| } |
| |
| |
| Future<Nothing> Master::apply(Slave* slave, const Offer::Operation& operation) |
| { |
| CHECK_NOTNULL(slave); |
| |
| return allocator->updateAvailable(slave->id, {operation}) |
| .onReady(defer(self(), &Master::_apply, slave, nullptr, operation)); |
| } |
| |
| |
| void Master::_apply( |
| Slave* slave, |
| Framework* framework, |
| const Offer::Operation& operationInfo) |
| { |
| CHECK_NOTNULL(slave); |
| |
| if (slave->capabilities.resourceProvider) { |
| Result<ResourceProviderID> resourceProviderId = |
| getResourceProviderId(operationInfo); |
| |
| // This must have been validated by the caller. |
| CHECK(!resourceProviderId.isError()); |
| |
| CHECK( |
| resourceProviderId.isNone() || |
| slave->resourceProviders.contains(resourceProviderId.get())) |
| << "Resource provider " + stringify(resourceProviderId.get()) + |
| " is unknown"; |
| |
| CHECK_SOME(slave->resourceVersion); |
| |
| const UUID resourceVersion = resourceProviderId.isNone() |
| ? slave->resourceVersion.get() |
| : slave->resourceProviders.get(resourceProviderId.get())->resourceVersion; |
| |
| Operation* operation = new Operation(protobuf::createOperation( |
| operationInfo, |
| protobuf::createOperationStatus( |
| OPERATION_PENDING, |
| operationInfo.has_id() |
| ? operationInfo.id() |
| : Option<OperationID>::none(), |
| None(), |
| None(), |
| None(), |
| slave->id, |
| resourceProviderId.isSome() |
| ? Some(resourceProviderId.get()) |
| : Option<ResourceProviderID>::none()), |
| framework != nullptr ? framework->id() : Option<FrameworkID>::none(), |
| slave->id)); |
| |
| addOperation(framework, slave, operation); |
| |
| if (protobuf::isSpeculativeOperation(operation->info())) { |
| Offer::Operation strippedOperationInfo = operation->info(); |
| protobuf::stripAllocationInfo(&strippedOperationInfo); |
| |
| Try<vector<ResourceConversion>> conversions = |
| getResourceConversions(strippedOperationInfo); |
| |
| CHECK_SOME(conversions); |
| |
| slave->apply(conversions.get()); |
| } |
| |
| ApplyOperationMessage message; |
| if (framework != nullptr) { |
| message.mutable_framework_id()->CopyFrom(framework->id()); |
| } |
| message.mutable_operation_info()->CopyFrom(operation->info()); |
| message.mutable_operation_uuid()->CopyFrom(operation->uuid()); |
| if (resourceProviderId.isSome()) { |
| message.mutable_resource_version_uuid() |
| ->mutable_resource_provider_id() |
| ->CopyFrom(resourceProviderId.get()); |
| } |
| |
| message.mutable_resource_version_uuid()->mutable_uuid()->CopyFrom( |
| resourceVersion); |
| |
| LOG(INFO) << "Sending operation '" << operation->info().id() |
| << "' (uuid: " << operation->uuid() << ") " |
| << "to agent " << *slave; |
| |
| send(slave->pid, message); |
| } else { |
| if (!protobuf::isSpeculativeOperation(operationInfo)) { |
| LOG(FATAL) << "Unexpected operation to apply on agent " << *slave; |
| } |
| |
| // We need to strip the allocation info from the operation's |
| // resources in order to apply the operation successfully |
| // since the agent's total is stored as unallocated resources. |
| Offer::Operation strippedOperationInfo = operationInfo; |
| protobuf::stripAllocationInfo(&strippedOperationInfo); |
| |
| Try<vector<ResourceConversion>> conversions = |
| getResourceConversions(strippedOperationInfo); |
| |
| CHECK_SOME(conversions); |
| |
| slave->apply(conversions.get()); |
| |
| CheckpointResourcesMessage message; |
| |
| message.mutable_resources()->CopyFrom(slave->checkpointedResources); |
| |
| if (!slave->capabilities.reservationRefinement) { |
| // If the agent is not refinement-capable, don't send it |
| // checkpointed resources that contain refined reservations. This |
| // might occur if a reservation refinement is created but never |
| // reaches the agent (e.g., due to network partition), and then |
| // the agent is downgraded before the partition heals. |
| // |
| // TODO(neilc): It would probably be better to prevent the agent |
| // from reregistering in this scenario. |
| Try<Nothing> result = downgradeResources(&message); |
| if (result.isError()) { |
| LOG(WARNING) << "Not sending updated checkpointed resources " |
| << slave->checkpointedResources |
| << " with refined reservations, since agent " << *slave |
| << " is not RESERVATION_REFINEMENT-capable."; |
| |
| return; |
| } |
| } |
| |
| LOG(INFO) << "Sending updated checkpointed resources " |
| << slave->checkpointedResources |
| << " to agent " << *slave; |
| |
| send(slave->pid, message); |
| } |
| |
| if (framework != nullptr) { |
| // We increment per-framework operation metrics for all operations except |
| // LAUNCH and LAUNCH_GROUP here. |
| framework->metrics.incrementOperation(operationInfo); |
| } |
| } |
| |
| |
| void Master::offerTimeout(const OfferID& offerId) |
| { |
| Offer* offer = getOffer(offerId); |
| if (offer != nullptr) { |
| allocator->recoverResources( |
| offer->framework_id(), offer->slave_id(), offer->resources(), None()); |
| removeOffer(offer, true); |
| } |
| } |
| |
| |
| // TODO(vinod): Instead of 'removeOffer()', consider implementing |
| // 'useOffer()', 'discardOffer()' and 'rescindOffer()' for clarity. |
| void Master::removeOffer(Offer* offer, bool rescind) |
| { |
| // Remove from framework. |
| Framework* framework = getFramework(offer->framework_id()); |
| CHECK(framework != nullptr) |
| << "Unknown framework " << offer->framework_id() |
| << " in the offer " << offer->id(); |
| |
| framework->removeOffer(offer); |
| |
| // Remove from slave. |
| Slave* slave = slaves.registered.get(offer->slave_id()); |
| |
| CHECK(slave != nullptr) |
| << "Unknown agent " << offer->slave_id() |
| << " in the offer " << offer->id(); |
| |
| slave->removeOffer(offer); |
| |
| if (rescind) { |
| RescindResourceOfferMessage message; |
| message.mutable_offer_id()->MergeFrom(offer->id()); |
| framework->metrics.offers_rescinded++; |
| framework->send(message); |
| } |
| |
| // Remove and cancel offer removal timers. Canceling the Timers is |
| // only done to avoid having too many active Timers in libprocess. |
| if (offerTimers.contains(offer->id())) { |
| Clock::cancel(offerTimers[offer->id()]); |
| offerTimers.erase(offer->id()); |
| } |
| |
| // Delete it. |
| LOG(INFO) << "Removing offer " << offer->id(); |
| offers.erase(offer->id()); |
| delete offer; |
| } |
| |
| |
| void Master::inverseOfferTimeout(const OfferID& inverseOfferId) |
| { |
| InverseOffer* inverseOffer = getInverseOffer(inverseOfferId); |
| if (inverseOffer != nullptr) { |
| allocator->updateInverseOffer( |
| inverseOffer->slave_id(), |
| inverseOffer->framework_id(), |
| UnavailableResources{ |
| inverseOffer->resources(), |
| inverseOffer->unavailability()}, |
| None()); |
| |
| removeInverseOffer(inverseOffer, true); |
| } |
| } |
| |
| |
| void Master::removeInverseOffer(InverseOffer* inverseOffer, bool rescind) |
| { |
| // Remove from framework. |
| Framework* framework = getFramework(inverseOffer->framework_id()); |
| CHECK(framework != nullptr) |
| << "Unknown framework " << inverseOffer->framework_id() |
| << " in the inverse offer " << inverseOffer->id(); |
| |
| framework->removeInverseOffer(inverseOffer); |
| |
| // Remove from slave. |
| Slave* slave = slaves.registered.get(inverseOffer->slave_id()); |
| |
| CHECK(slave != nullptr) |
| << "Unknown agent " << inverseOffer->slave_id() |
| << " in the inverse offer " << inverseOffer->id(); |
| |
| slave->removeInverseOffer(inverseOffer); |
| |
| if (rescind) { |
| RescindInverseOfferMessage message; |
| message.mutable_inverse_offer_id()->CopyFrom(inverseOffer->id()); |
| framework->send(message); |
| } |
| |
| // Remove and cancel inverse offer removal timers. Canceling the Timers is |
| // only done to avoid having too many active Timers in libprocess. |
| if (inverseOfferTimers.contains(inverseOffer->id())) { |
| Clock::cancel(inverseOfferTimers[inverseOffer->id()]); |
| inverseOfferTimers.erase(inverseOffer->id()); |
| } |
| |
| // Delete it. |
| inverseOffers.erase(inverseOffer->id()); |
| delete inverseOffer; |
| } |
| |
| |
| bool Master::isCompletedFramework(const FrameworkID& frameworkId) |
| { |
| return frameworks.completed.contains(frameworkId); |
| } |
| |
| |
| // TODO(bmahler): Consider killing this. |
| Framework* Master::getFramework(const FrameworkID& frameworkId) const |
| { |
| return frameworks.registered.contains(frameworkId) |
| ? frameworks.registered.at(frameworkId) |
| : nullptr; |
| } |
| |
| |
| // TODO(bmahler): Consider killing this. |
| Offer* Master::getOffer(const OfferID& offerId) const |
| { |
| return offers.contains(offerId) ? offers.at(offerId) : nullptr; |
| } |
| |
| |
| // TODO(bmahler): Consider killing this. |
| InverseOffer* Master::getInverseOffer(const OfferID& inverseOfferId) const |
| { |
| return inverseOffers.contains(inverseOfferId) |
| ? inverseOffers.at(inverseOfferId) |
| : nullptr; |
| } |
| |
| |
| // Create a new framework ID. We format the ID as MASTERID-FWID, where |
| // MASTERID is the ID of the master (randomly generated UUID) and FWID |
| // is an increasing integer. |
| FrameworkID Master::newFrameworkId() |
| { |
| std::ostringstream out; |
| |
| out << info_.id() << "-" << std::setw(4) |
| << std::setfill('0') << nextFrameworkId++; |
| |
| FrameworkID frameworkId; |
| frameworkId.set_value(out.str()); |
| |
| return frameworkId; |
| } |
| |
| |
| OfferID Master::newOfferId() |
| { |
| OfferID offerId; |
| offerId.set_value(info_.id() + "-O" + stringify(nextOfferId++)); |
| return offerId; |
| } |
| |
| |
| SlaveID Master::newSlaveId() |
| { |
| SlaveID slaveId; |
| slaveId.set_value(info_.id() + "-S" + stringify(nextSlaveId++)); |
| return slaveId; |
| } |
| |
| |
| double Master::_slaves_connected() |
| { |
| double count = 0.0; |
| foreachvalue (Slave* slave, slaves.registered) { |
| if (slave->connected) { |
| count++; |
| } |
| } |
| return count; |
| } |
| |
| |
| double Master::_slaves_disconnected() |
| { |
| double count = 0.0; |
| foreachvalue (Slave* slave, slaves.registered) { |
| if (!slave->connected) { |
| count++; |
| } |
| } |
| return count; |
| } |
| |
| |
| double Master::_slaves_active() |
| { |
| double count = 0.0; |
| foreachvalue (Slave* slave, slaves.registered) { |
| if (slave->active) { |
| count++; |
| } |
| } |
| return count; |
| } |
| |
| |
| double Master::_slaves_inactive() |
| { |
| double count = 0.0; |
| foreachvalue (Slave* slave, slaves.registered) { |
| if (!slave->active) { |
| count++; |
| } |
| } |
| return count; |
| } |
| |
| |
| double Master::_slaves_unreachable() |
| { |
| return static_cast<double>(slaves.unreachable.size()); |
| } |
| |
| |
| double Master::_frameworks_connected() |
| { |
| double count = 0.0; |
| foreachvalue (Framework* framework, frameworks.registered) { |
| if (framework->connected()) { |
| count++; |
| } |
| } |
| return count; |
| } |
| |
| |
| double Master::_frameworks_disconnected() |
| { |
| double count = 0.0; |
| foreachvalue (Framework* framework, frameworks.registered) { |
| if (!framework->connected()) { |
| count++; |
| } |
| } |
| return count; |
| } |
| |
| |
| double Master::_frameworks_active() |
| { |
| double count = 0.0; |
| foreachvalue (Framework* framework, frameworks.registered) { |
| if (framework->active()) { |
| count++; |
| } |
| } |
| return count; |
| } |
| |
| |
| double Master::_frameworks_inactive() |
| { |
| double count = 0.0; |
| foreachvalue (Framework* framework, frameworks.registered) { |
| if (!framework->active()) { |
| count++; |
| } |
| } |
| return count; |
| } |
| |
| |
| double Master::_tasks_staging() |
| { |
| double count = 0.0; |
| |
| // Add the tasks pending validation / authorization. |
| foreachvalue (Framework* framework, frameworks.registered) { |
| count += framework->pendingTasks.size(); |
| } |
| |
| foreachvalue (Slave* slave, slaves.registered) { |
| typedef hashmap<TaskID, Task*> TaskMap; |
| foreachvalue (const TaskMap& tasks, slave->tasks) { |
| foreachvalue (const Task* task, tasks) { |
| if (task->state() == TASK_STAGING) { |
| count++; |
| } |
| } |
| } |
| } |
| |
| return count; |
| } |
| |
| |
| double Master::_tasks_starting() |
| { |
| double count = 0.0; |
| |
| foreachvalue (Slave* slave, slaves.registered) { |
| typedef hashmap<TaskID, Task*> TaskMap; |
| foreachvalue (const TaskMap& tasks, slave->tasks) { |
| foreachvalue (const Task* task, tasks) { |
| if (task->state() == TASK_STARTING) { |
| count++; |
| } |
| } |
| } |
| } |
| |
| return count; |
| } |
| |
| |
| double Master::_tasks_running() |
| { |
| double count = 0.0; |
| |
| foreachvalue (Slave* slave, slaves.registered) { |
| typedef hashmap<TaskID, Task*> TaskMap; |
| foreachvalue (const TaskMap& tasks, slave->tasks) { |
| foreachvalue (const Task* task, tasks) { |
| if (task->state() == TASK_RUNNING) { |
| count++; |
| } |
| } |
| } |
| } |
| |
| return count; |
| } |
| |
| |
| double Master::_tasks_unreachable() |
| { |
| double count = 0.0; |
| |
| foreachvalue (Framework* framework, frameworks.registered) { |
| foreachvalue (const Owned<Task>& task, framework->unreachableTasks) { |
| if (task->state() == TASK_UNREACHABLE) { |
| count++; |
| } |
| } |
| } |
| |
| return count; |
| } |
| |
| |
| double Master::_tasks_killing() |
| { |
| double count = 0.0; |
| |
| foreachvalue (Slave* slave, slaves.registered) { |
| typedef hashmap<TaskID, Task*> TaskMap; |
| foreachvalue (const TaskMap& tasks, slave->tasks) { |
| foreachvalue (const Task* task, tasks) { |
| if (task->state() == TASK_KILLING) { |
| count++; |
| } |
| } |
| } |
| } |
| |
| return count; |
| } |
| |
| |
| double Master::_resources_total(const string& name) |
| { |
| double total = 0.0; |
| |
| foreachvalue (Slave* slave, slaves.registered) { |
| foreach (const Resource& resource, slave->info.resources()) { |
| if (resource.name() == name && resource.type() == Value::SCALAR) { |
| total += resource.scalar().value(); |
| } |
| } |
| } |
| |
| return total; |
| } |
| |
| |
| double Master::_resources_used(const string& name) |
| { |
| double used = 0.0; |
| |
| foreachvalue (Slave* slave, slaves.registered) { |
| // We use `Resources` arithmetic to accummulate the resources since the |
| // `+=` operator de-duplicates the same shared resources across frameworks. |
| Resources slaveUsed; |
| |
| foreachvalue (const Resources& resources, slave->usedResources) { |
| slaveUsed += resources.nonRevocable(); |
| } |
| |
| used += |
| slaveUsed.get<Value::Scalar>(name).getOrElse(Value::Scalar()).value(); |
| } |
| |
| return used; |
| } |
| |
| |
| double Master::_resources_percent(const string& name) |
| { |
| double total = _resources_total(name); |
| |
| if (total == 0.0) { |
| return 0.0; |
| } |
| |
| return _resources_used(name) / total; |
| } |
| |
| |
| double Master::_resources_revocable_total(const string& name) |
| { |
| double total = 0.0; |
| |
| foreachvalue (Slave* slave, slaves.registered) { |
| foreach (const Resource& resource, slave->totalResources.revocable()) { |
| if (resource.name() == name && resource.type() == Value::SCALAR) { |
| total += resource.scalar().value(); |
| } |
| } |
| } |
| |
| return total; |
| } |
| |
| |
| double Master::_resources_revocable_used(const string& name) |
| { |
| double used = 0.0; |
| |
| foreachvalue (Slave* slave, slaves.registered) { |
| // We use `Resources` arithmetic to accummulate the resources since the |
| // `+=` operator de-duplicates the same shared resources across frameworks. |
| Resources slaveUsed; |
| |
| foreachvalue (const Resources& resources, slave->usedResources) { |
| slaveUsed += resources.revocable(); |
| } |
| |
| used += |
| slaveUsed.get<Value::Scalar>(name).getOrElse(Value::Scalar()).value(); |
| } |
| |
| return used; |
| } |
| |
| |
| double Master::_resources_revocable_percent(const string& name) |
| { |
| double total = _resources_revocable_total(name); |
| |
| if (total == 0.0) { |
| return 0.0; |
| } |
| |
| return _resources_revocable_used(name) / total; |
| } |
| |
| |
| static bool isValidFailoverTimeout(const FrameworkInfo& frameworkInfo) |
| { |
| return Duration::create(frameworkInfo.failover_timeout()).isSome(); |
| } |
| |
| |
| void Master::Subscribers::send( |
| mesos::master::Event&& event, |
| const Option<FrameworkInfo>& frameworkInfo, |
| const Option<Task>& task) |
| { |
| VLOG(1) << "Notifying all active subscribers about " << event.type() |
| << " event"; |
| |
| // Create a single copy of the event for all subscribers to share. |
| Shared<mesos::master::Event> sharedEvent( |
| new mesos::master::Event(std::move(event))); |
| |
| // Create a single copy of `FrameworkInfo` and `Task` for all |
| // subscribers to share. |
| Shared<FrameworkInfo> sharedFrameworkInfo( |
| frameworkInfo.isSome() |
| ? new FrameworkInfo(frameworkInfo.get()) : nullptr); |
| Shared<Task> sharedTask(task.isSome() ? new Task(task.get()) : nullptr); |
| |
| foreachvalue (const Owned<Subscriber>& subscriber, subscribed) { |
| subscriber->getApprovers( |
| master->authorizer, |
| {VIEW_ROLE, VIEW_FRAMEWORK, VIEW_TASK, VIEW_EXECUTOR}) |
| .then(defer( |
| master->self(), |
| [=](const Owned<ObjectApprovers>& approvers) { |
| subscriber->send( |
| sharedEvent, |
| approvers, |
| sharedFrameworkInfo, |
| sharedTask); |
| |
| return Nothing(); |
| })); |
| } |
| } |
| |
| |
| Future<Owned<ObjectApprovers>> Master::Subscribers::Subscriber::getApprovers( |
| const Option<Authorizer*>& authorizer, |
| std::initializer_list<authorization::Action> actions) |
| { |
| Future<Owned<ObjectApprovers>> approvers = |
| ObjectApprovers::create(authorizer, principal, actions); |
| |
| return approversSequence.add<Owned<ObjectApprovers>>( |
| [approvers] { return approvers; }); |
| } |
| |
| |
| void Master::Subscribers::Subscriber::send( |
| const Shared<mesos::master::Event>& event, |
| const Owned<ObjectApprovers>& approvers, |
| const Shared<FrameworkInfo>& frameworkInfo, |
| const Shared<Task>& task) |
| { |
| switch (event->type()) { |
| case mesos::master::Event::TASK_ADDED: { |
| CHECK_NOTNULL(frameworkInfo.get()); |
| |
| if (approvers->approved<VIEW_TASK>( |
| event->task_added().task(), *frameworkInfo) && |
| approvers->approved<VIEW_FRAMEWORK>(*frameworkInfo)) { |
| http.send<mesos::master::Event, v1::master::Event>(*event); |
| } |
| break; |
| } |
| case mesos::master::Event::TASK_UPDATED: { |
| CHECK_NOTNULL(frameworkInfo.get()); |
| CHECK_NOTNULL(task.get()); |
| |
| if (approvers->approved<VIEW_TASK>(*task, *frameworkInfo) && |
| approvers->approved<VIEW_FRAMEWORK>(*frameworkInfo)) { |
| http.send<mesos::master::Event, v1::master::Event>(*event); |
| } |
| break; |
| } |
| case mesos::master::Event::FRAMEWORK_ADDED: { |
| if (approvers->approved<VIEW_FRAMEWORK>( |
| event->framework_added().framework().framework_info())) { |
| mesos::master::Event event_(*event); |
| event_.mutable_framework_added()->mutable_framework()-> |
| mutable_allocated_resources()->Clear(); |
| event_.mutable_framework_added()->mutable_framework()-> |
| mutable_offered_resources()->Clear(); |
| |
| foreach( |
| const Resource& resource, |
| event->framework_added().framework().allocated_resources()) { |
| if (approvers->approved<VIEW_ROLE>(resource)) { |
| event_.mutable_framework_added()->mutable_framework()-> |
| add_allocated_resources()->CopyFrom(resource); |
| } |
| } |
| |
| foreach( |
| const Resource& resource, |
| event->framework_added().framework().offered_resources()) { |
| if (approvers->approved<VIEW_ROLE>(resource)) { |
| event_.mutable_framework_added()->mutable_framework()-> |
| add_offered_resources()->CopyFrom(resource); |
| } |
| } |
| |
| http.send<mesos::master::Event, v1::master::Event>(event_); |
| } |
| break; |
| } |
| case mesos::master::Event::FRAMEWORK_UPDATED: { |
| if (approvers->approved<VIEW_FRAMEWORK>( |
| event->framework_updated().framework().framework_info())) { |
| mesos::master::Event event_(*event); |
| event_.mutable_framework_updated()->mutable_framework()-> |
| mutable_allocated_resources()->Clear(); |
| event_.mutable_framework_updated()->mutable_framework()-> |
| mutable_offered_resources()->Clear(); |
| |
| foreach( |
| const Resource& resource, |
| event->framework_updated().framework().allocated_resources()) { |
| if (approvers->approved<VIEW_ROLE>(resource)) { |
| event_.mutable_framework_updated()->mutable_framework()-> |
| add_allocated_resources()->CopyFrom(resource); |
| } |
| } |
| |
| foreach( |
| const Resource& resource, |
| event->framework_updated().framework().offered_resources()) { |
| if (approvers->approved<VIEW_ROLE>(resource)) { |
| event_.mutable_framework_updated()->mutable_framework()-> |
| add_offered_resources()->CopyFrom(resource); |
| } |
| } |
| |
| http.send<mesos::master::Event, v1::master::Event>(event_); |
| } |
| break; |
| } |
| case mesos::master::Event::FRAMEWORK_REMOVED: { |
| if (approvers->approved<VIEW_FRAMEWORK>( |
| event->framework_removed().framework_info())) { |
| http.send<mesos::master::Event, v1::master::Event>(*event); |
| } |
| break; |
| } |
| case mesos::master::Event::AGENT_ADDED: { |
| mesos::master::Event event_(*event); |
| event_.mutable_agent_added()->mutable_agent()-> |
| mutable_total_resources()->Clear(); |
| |
| foreach( |
| const Resource& resource, |
| event->agent_added().agent().total_resources()) { |
| if (approvers->approved<VIEW_ROLE>(resource)) { |
| event_.mutable_agent_added()->mutable_agent()->add_total_resources() |
| ->CopyFrom(resource); |
| } |
| } |
| |
| http.send<mesos::master::Event, v1::master::Event>(event_); |
| break; |
| } |
| case mesos::master::Event::AGENT_REMOVED: |
| case mesos::master::Event::SUBSCRIBED: |
| case mesos::master::Event::HEARTBEAT: |
| case mesos::master::Event::UNKNOWN: |
| http.send<mesos::master::Event, v1::master::Event>(*event); |
| break; |
| } |
| } |
| |
| |
| void Master::exited(const id::UUID& id) |
| { |
| if (!subscribers.subscribed.contains(id)) { |
| LOG(WARNING) << "Unknown subscriber " << id << " disconnected"; |
| return; |
| } |
| |
| LOG(INFO) << "Removed subscriber " << id |
| << " from the list of active subscribers"; |
| |
| subscribers.subscribed.erase(id); |
| } |
| |
| |
| void Master::subscribe( |
| const HttpConnection& http, |
| const Option<Principal>& principal) |
| { |
| LOG(INFO) << "Added subscriber " << http.streamId |
| << " to the list of active subscribers"; |
| |
| http.closed() |
| .onAny(defer(self(), |
| [this, http](const Future<Nothing>&) { |
| exited(http.streamId); |
| })); |
| |
| subscribers.subscribed.put( |
| http.streamId, |
| Owned<Subscribers::Subscriber>( |
| new Subscribers::Subscriber{http, principal})); |
| } |
| |
| |
| Slave::Slave( |
| Master* const _master, |
| SlaveInfo _info, |
| const UPID& _pid, |
| const MachineID& _machineId, |
| const string& _version, |
| vector<SlaveInfo::Capability> _capabilites, |
| const Time& _registeredTime, |
| vector<Resource> _checkpointedResources, |
| const Option<UUID>& _resourceVersion, |
| vector<ExecutorInfo> executorInfos, |
| vector<Task> tasks) |
| : master(_master), |
| id(_info.id()), |
| info(std::move(_info)), |
| machineId(_machineId), |
| pid(_pid), |
| version(_version), |
| capabilities(std::move(_capabilites)), |
| registeredTime(_registeredTime), |
| connected(true), |
| active(true), |
| checkpointedResources(std::move(_checkpointedResources)), |
| resourceVersion(_resourceVersion), |
| observer(nullptr) |
| { |
| CHECK(info.has_id()); |
| |
| Try<Resources> resources = applyCheckpointedResources( |
| info.resources(), |
| checkpointedResources); |
| |
| // NOTE: This should be validated during slave recovery. |
| CHECK_SOME(resources); |
| totalResources = resources.get(); |
| |
| foreach (ExecutorInfo& executorInfo, executorInfos) { |
| CHECK(executorInfo.has_framework_id()); |
| addExecutor(executorInfo.framework_id(), std::move(executorInfo)); |
| } |
| |
| foreach (Task& task, tasks) { |
| addTask(new Task(std::move(task))); |
| } |
| } |
| |
| |
| Slave::~Slave() |
| { |
| if (reregistrationTimer.isSome()) { |
| process::Clock::cancel(reregistrationTimer.get()); |
| } |
| } |
| |
| |
| Task* Slave::getTask(const FrameworkID& frameworkId, const TaskID& taskId) const |
| { |
| if (tasks.contains(frameworkId) && tasks.at(frameworkId).contains(taskId)) { |
| return tasks.at(frameworkId).at(taskId); |
| } |
| return nullptr; |
| } |
| |
| |
| void Slave::addTask(Task* task) |
| { |
| const TaskID& taskId = task->task_id(); |
| const FrameworkID& frameworkId = task->framework_id(); |
| |
| CHECK(!tasks[frameworkId].contains(taskId)) |
| << "Duplicate task " << taskId << " of framework " << frameworkId; |
| |
| // Verify that Resource.AllocationInfo is set, |
| // this should be guaranteed by the master. |
| foreach (const Resource& resource, task->resources()) { |
| CHECK(resource.has_allocation_info()); |
| } |
| |
| tasks[frameworkId][taskId] = task; |
| |
| // Note that we explicitly convert from protobuf to `Resources` here |
| // and then use the result below to avoid performance penalty for multiple |
| // conversions and validations implied by conversion. |
| // Conversion is safe, as resources have already passed validation. |
| const Resources resources = task->resources(); |
| |
| CHECK(task->state() != TASK_UNREACHABLE) |
| << "Task '" << taskId << "' of framework " << frameworkId |
| << " added in TASK_UNREACHABLE state"; |
| |
| if (!protobuf::isTerminalState(task->state())) { |
| usedResources[frameworkId] += resources; |
| } |
| |
| // Note that we use `Resources` for output as it's faster than |
| // logging raw protobuf data. |
| LOG(INFO) << "Adding task " << taskId |
| << " with resources " << resources |
| << " on agent " << *this; |
| } |
| |
| |
| void Slave::recoverResources(Task* task) |
| { |
| const TaskID& taskId = task->task_id(); |
| const FrameworkID& frameworkId = task->framework_id(); |
| |
| CHECK(protobuf::isTerminalState(task->state()) || |
| task->state() == TASK_UNREACHABLE) |
| << "Task '" << taskId << "' of framework " << frameworkId |
| << " is in unexpected state " << task->state(); |
| |
| CHECK(tasks.at(frameworkId).contains(taskId)) |
| << "Unknown task " << taskId << " of framework " << frameworkId; |
| |
| usedResources[frameworkId] -= task->resources(); |
| if (usedResources[frameworkId].empty()) { |
| usedResources.erase(frameworkId); |
| } |
| } |
| |
| |
| void Slave::removeTask(Task* task) |
| { |
| const TaskID& taskId = task->task_id(); |
| const FrameworkID& frameworkId = task->framework_id(); |
| |
| CHECK(tasks.at(frameworkId).contains(taskId)) |
| << "Unknown task " << taskId << " of framework " << frameworkId; |
| |
| // The invariant here is that the master will have already called |
| // `recoverResources()` prior to removing terminal or unreachable tasks. |
| // |
| // TODO(bmahler): The unreachable case could be avoided if |
| // we updated `removeSlave` in the allocator to recover the |
| // resources (see MESOS-621) so that the master could just |
| // remove the unreachable agent from the allocator. |
| if (!protobuf::isTerminalState(task->state()) && |
| task->state() != TASK_UNREACHABLE) { |
| // We cannot call `Slave::recoverResources()` here because |
| // it expects the task to be terminal or unreachable. |
| usedResources[frameworkId] -= task->resources(); |
| if (usedResources[frameworkId].empty()) { |
| usedResources.erase(frameworkId); |
| } |
| } |
| |
| tasks[frameworkId].erase(taskId); |
| if (tasks[frameworkId].empty()) { |
| tasks.erase(frameworkId); |
| } |
| |
| killedTasks.remove(frameworkId, taskId); |
| } |
| |
| |
| void Slave::addOperation(Operation* operation) |
| { |
| Result<ResourceProviderID> resourceProviderId = |
| getResourceProviderId(operation->info()); |
| |
| CHECK(!resourceProviderId.isError()) << resourceProviderId.error(); |
| |
| if (resourceProviderId.isNone()) { |
| operations.put(operation->uuid(), operation); |
| } else { |
| CHECK(resourceProviders.contains(resourceProviderId.get())); |
| |
| ResourceProvider& resourceProvider = |
| resourceProviders.at(resourceProviderId.get()); |
| |
| resourceProvider.operations.put(operation->uuid(), operation); |
| } |
| |
| if (!protobuf::isSpeculativeOperation(operation->info()) && |
| !protobuf::isTerminalState(operation->latest_status().state())) { |
| Try<Resources> consumed = protobuf::getConsumedResources(operation->info()); |
| |
| CHECK_SOME(consumed); |
| |
| // There isn't support for non-speculative operations using the |
| // operator API. We can assume the framework ID has been set. |
| CHECK(operation->has_framework_id()); |
| |
| usedResources[operation->framework_id()] += consumed.get(); |
| } |
| } |
| |
| |
| void Slave::recoverResources(Operation* operation) |
| { |
| // TODO(jieyu): Currently, we do not keep track of used resources |
| // for operations that are created by the operator through the |
| // operator API endpoint. |
| if (!operation->has_framework_id()) { |
| return; |
| } |
| |
| const FrameworkID& frameworkId = operation->framework_id(); |
| |
| if (protobuf::isSpeculativeOperation(operation->info())) { |
| return; |
| } |
| |
| Try<Resources> consumed = protobuf::getConsumedResources(operation->info()); |
| CHECK_SOME(consumed); |
| |
| CHECK(usedResources[frameworkId].contains(consumed.get())) |
| << "Unknown resources " << consumed.get() |
| << " of framework " << frameworkId; |
| |
| usedResources[frameworkId] -= consumed.get(); |
| if (usedResources[frameworkId].empty()) { |
| usedResources.erase(frameworkId); |
| } |
| } |
| |
| |
| void Slave::removeOperation(Operation* operation) |
| { |
| const UUID& uuid = operation->uuid(); |
| |
| Result<ResourceProviderID> resourceProviderId = |
| getResourceProviderId(operation->info()); |
| |
| CHECK(!resourceProviderId.isError()) << resourceProviderId.error(); |
| |
| // Recover the resource used by this operation. |
| if (!protobuf::isSpeculativeOperation(operation->info()) && |
| !protobuf::isTerminalState(operation->latest_status().state())) { |
| recoverResources(operation); |
| } |
| |
| // Remove the operation. |
| if (resourceProviderId.isNone()) { |
| CHECK(operations.contains(uuid)) |
| << "Unknown operation (uuid: " << uuid << ")" |
| << " to agent " << *this; |
| |
| operations.erase(operation->uuid()); |
| } else { |
| CHECK(resourceProviders.contains(resourceProviderId.get())) |
| << "resource provider " << resourceProviderId.get() << " is unknown"; |
| |
| ResourceProvider& resourceProvider = |
| resourceProviders.at(resourceProviderId.get()); |
| |
| CHECK(resourceProvider.operations.contains(uuid)) |
| << "Unknown operation (uuid: " << uuid << ")" |
| << " to resource provider " << resourceProviderId.get() |
| << " on agent " << *this; |
| |
| resourceProvider.operations.erase(operation->uuid()); |
| } |
| } |
| |
| |
| Operation* Slave::getOperation(const UUID& uuid) const |
| { |
| if (operations.contains(uuid)) { |
| return operations.at(uuid); |
| } |
| |
| foreachvalue (const ResourceProvider& resourceProvider, resourceProviders) { |
| if (resourceProvider.operations.contains(uuid)) { |
| return resourceProvider.operations.at(uuid); |
| } |
| } |
| |
| return nullptr; |
| } |
| |
| |
| void Slave::addOffer(Offer* offer) |
| { |
| CHECK(!offers.contains(offer)) << "Duplicate offer " << offer->id(); |
| |
| offers.insert(offer); |
| offeredResources += offer->resources(); |
| } |
| |
| |
| void Slave::removeOffer(Offer* offer) |
| { |
| CHECK(offers.contains(offer)) << "Unknown offer " << offer->id(); |
| |
| offeredResources -= offer->resources(); |
| offers.erase(offer); |
| } |
| |
| |
| void Slave::addInverseOffer(InverseOffer* inverseOffer) |
| { |
| CHECK(!inverseOffers.contains(inverseOffer)) |
| << "Duplicate inverse offer " << inverseOffer->id(); |
| |
| inverseOffers.insert(inverseOffer); |
| } |
| |
| |
| void Slave::removeInverseOffer(InverseOffer* inverseOffer) |
| { |
| CHECK(inverseOffers.contains(inverseOffer)) |
| << "Unknown inverse offer " << inverseOffer->id(); |
| |
| inverseOffers.erase(inverseOffer); |
| } |
| |
| |
| bool Slave::hasExecutor(const FrameworkID& frameworkId, |
| const ExecutorID& executorId) const |
| { |
| return executors.contains(frameworkId) && |
| executors.at(frameworkId).contains(executorId); |
| } |
| |
| |
| void Slave::addExecutor(const FrameworkID& frameworkId, |
| const ExecutorInfo& executorInfo) |
| { |
| CHECK(!hasExecutor(frameworkId, executorInfo.executor_id())) |
| << "Duplicate executor '" << executorInfo.executor_id() |
| << "' of framework " << frameworkId; |
| |
| // Verify that Resource.AllocationInfo is set, |
| // this should be guaranteed by the master. |
| foreach (const Resource& resource, executorInfo.resources()) { |
| CHECK(resource.has_allocation_info()); |
| } |
| |
| executors[frameworkId][executorInfo.executor_id()] = executorInfo; |
| usedResources[frameworkId] += executorInfo.resources(); |
| } |
| |
| |
| void Slave::removeExecutor(const FrameworkID& frameworkId, |
| const ExecutorID& executorId) |
| { |
| CHECK(hasExecutor(frameworkId, executorId)) |
| << "Unknown executor '" << executorId << "' of framework " << frameworkId; |
| |
| usedResources[frameworkId] -= |
| executors[frameworkId][executorId].resources(); |
| if (usedResources[frameworkId].empty()) { |
| usedResources.erase(frameworkId); |
| } |
| |
| executors[frameworkId].erase(executorId); |
| if (executors[frameworkId].empty()) { |
| executors.erase(frameworkId); |
| } |
| } |
| |
| |
| void Slave::apply(const vector<ResourceConversion>& conversions) |
| { |
| Try<Resources> resources = totalResources.apply(conversions); |
| CHECK_SOME(resources); |
| |
| totalResources = resources.get(); |
| |
| checkpointedResources = totalResources.filter(needCheckpointing); |
| |
| // Also apply the conversion to the explicitly maintained resource |
| // provider resources. |
| foreach (const ResourceConversion& conversion, conversions) { |
| Result<ResourceProviderID> providerId = |
| getResourceProviderId(conversion.consumed); |
| |
| if (providerId.isNone()) { |
| continue; |
| } |
| |
| CHECK_SOME(providerId); |
| CHECK(resourceProviders.contains(providerId.get())); |
| ResourceProvider& provider = resourceProviders.at(providerId.get()); |
| |
| CHECK(provider.totalResources.contains(conversion.consumed)); |
| provider.totalResources -= conversion.consumed; |
| provider.totalResources += conversion.converted; |
| } |
| } |
| |
| |
| Try<Nothing> Slave::update( |
| const SlaveInfo& _info, |
| const string& _version, |
| const vector<SlaveInfo::Capability>& _capabilities, |
| const Resources& _checkpointedResources, |
| const Option<UUID>& _resourceVersion) |
| { |
| Try<Resources> resources = applyCheckpointedResources( |
| _info.resources(), |
| _checkpointedResources); |
| |
| // This should be validated during slave recovery. |
| if (resources.isError()) { |
| return Error(resources.error()); |
| } |
| |
| version = _version; |
| capabilities = _capabilities; |
| info = _info; |
| checkpointedResources = _checkpointedResources; |
| |
| // There is a short window here where `totalResources` can have an old value, |
| // but it should be relatively short because the agent will send |
| // an `UpdateSlaveMessage` with the new total resources immediately after |
| // reregistering in this case. |
| totalResources = resources.get(); |
| |
| resourceVersion = _resourceVersion; |
| |
| return Nothing(); |
| } |
| |
| } // namespace master { |
| } // namespace internal { |
| } // namespace mesos { |