// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <errno.h>
#include <signal.h>
#include <stdlib.h> // For random().

#include <algorithm>
#include <cmath>
#include <deque>
#include <iomanip>
#include <list>
#include <map>
#include <memory>
#include <set>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

#include <glog/logging.h>

#include <mesos/type_utils.hpp>

#include <mesos/authentication/secret_generator.hpp>

#include <mesos/module/authenticatee.hpp>

#include <mesos/state/leveldb.hpp>
#include <mesos/state/in_memory.hpp>

#include <mesos/resource_provider/storage/disk_profile_adaptor.hpp>

#include <process/after.hpp>
#include <process/async.hpp>
#include <process/check.hpp>
#include <process/collect.hpp>
#include <process/defer.hpp>
#include <process/delay.hpp>
#include <process/dispatch.hpp>
#include <process/http.hpp>
#include <process/id.hpp>
#include <process/loop.hpp>
#include <process/reap.hpp>
#include <process/time.hpp>

#include <process/ssl/flags.hpp>

#include <stout/bytes.hpp>
#include <stout/check.hpp>
#include <stout/duration.hpp>
#include <stout/exit.hpp>
#include <stout/fs.hpp>
#include <stout/json.hpp>
#include <stout/lambda.hpp>
#include <stout/net.hpp>
#include <stout/numify.hpp>
#include <stout/option.hpp>
#include <stout/os.hpp>
#include <stout/path.hpp>
#include <stout/protobuf.hpp>
#include <stout/stringify.hpp>
#include <stout/strings.hpp>
#include <stout/try.hpp>
#include <stout/utils.hpp>
#include <stout/uuid.hpp>

#include <stout/os/realpath.hpp>

#include "authentication/cram_md5/authenticatee.hpp"

#include "common/authorization.hpp"
#include "common/build.hpp"
#include "common/protobuf_utils.hpp"
#include "common/resources_utils.hpp"
#include "common/status_utils.hpp"
#include "common/validation.hpp"

#include "credentials/credentials.hpp"

#include "hook/manager.hpp"

#ifdef __linux__
#include "linux/fs.hpp"
#endif // __linux__

#include "logging/logging.hpp"

#include "master/detector/standalone.hpp"

#include "module/manager.hpp"

#include "slave/compatibility.hpp"
#include "slave/constants.hpp"
#include "slave/flags.hpp"
#include "slave/paths.hpp"
#include "slave/slave.hpp"
#include "slave/state.pb.h"
#include "slave/task_status_update_manager.hpp"

#ifdef __WINDOWS__
// Used to install a Windows console ctrl handler.
// https://msdn.microsoft.com/en-us/library/windows/desktop/ms682066(v=vs.85).aspx
#include <slave/windows_ctrlhandler.hpp>
#else
// Used to install a handler for POSIX signal.
// http://pubs.opengroup.org/onlinepubs/009695399/functions/sigaction.html
#include <slave/posix_signalhandler.hpp>
#endif // __WINDOWS__

namespace http = process::http;

using google::protobuf::RepeatedPtrField;

using mesos::SecretGenerator;

using mesos::authorization::createSubject;
using mesos::authorization::ACCESS_SANDBOX;

using mesos::executor::Call;

using mesos::master::detector::MasterDetector;

using mesos::slave::ContainerConfig;
using mesos::slave::ContainerTermination;
using mesos::slave::QoSController;
using mesos::slave::QoSCorrection;
using mesos::slave::ResourceEstimator;

using std::deque;
using std::find;
using std::list;
using std::map;
using std::ostream;
using std::ostringstream;
using std::set;
using std::shared_ptr;
using std::string;
using std::vector;

using process::after;
using process::async;
using process::wait; // Necessary on some OS's to disambiguate.
using process::Break;
using process::Clock;
using process::Continue;
using process::ControlFlow;
using process::Failure;
using process::Future;
using process::Owned;
using process::PID;
using process::Promise;
using process::Time;
using process::UPID;

using process::http::authentication::Principal;

namespace mesos {
namespace internal {
namespace slave {

using namespace state;

// Forward declarations.

// Needed for logging task/task group.
//
// File-local helper (internal linkage) that returns a `string` built from
// an optional `TaskInfo` and an optional `TaskGroupInfo`. Only the
// declaration is visible here; the definition is expected to appear later
// in this file.
static string taskOrTaskGroup(
    const Option<TaskInfo>& task,
    const Option<TaskGroupInfo>& taskGroup);


// Returns the command info for default executor.
//
// File-local helper (internal linkage). Takes the launcher directory and
// an optional user name; how these are folded into the returned
// `CommandInfo` is determined by the definition, which is expected to
// appear later in this file.
static CommandInfo defaultExecutorCommandInfo(
    const std::string& launcherDir,
    const Option<std::string>& user);


// Sets the executor resource limit (the `limit` parameter) based on the resource
// passed in (the `value` parameter).
//
// `limit` is an in/out parameter taken by non-const reference, so the
// caller's `Option` is modified in place; `value` is read-only.
static void setLimit(Option<Value::Scalar>& limit, const Value::Scalar& value);


// Constructs the agent process.
//
// The constructor only records the flags and the collaborators handed in
// by the caller and sets the initial member values; the body is empty.
// The agent starts in the `RECOVERING` state, unauthenticated, and with no
// credential loaded.
//
// NOTE(review): of the pointer arguments, none is deleted by the
// destructor visible in this file, which suggests they are not owned by
// the agent - confirm against the callers.
Slave::Slave(const string& id,
             const slave::Flags& _flags,
             MasterDetector* _detector,
             Containerizer* _containerizer,
             Files* _files,
             GarbageCollector* _gc,
             TaskStatusUpdateManager* _taskStatusUpdateManager,
             ResourceEstimator* _resourceEstimator,
             QoSController* _qosController,
             SecretGenerator* _secretGenerator,
             VolumeGidManager* _volumeGidManager,
             PendingFutureTracker* _futureTracker,
#ifndef __WINDOWS__
             const Option<process::network::unix::Socket>& _executorSocket,
#endif // __WINDOWS__
             const Option<Authorizer*>& _authorizer)
  : ProcessBase(id),
    state(RECOVERING),
    flags(_flags),
    http(this),
    // Use the built-in `AGENT_CAPABILITIES()` unless `flags.agent_features`
    // is set, in which case the capabilities listed there are used instead.
    capabilities(
        _flags.agent_features.isNone()
          ? protobuf::slave::Capabilities(AGENT_CAPABILITIES())
          : protobuf::slave::Capabilities(
                _flags.agent_features->capabilities())),
    completedFrameworks(MAX_COMPLETED_FRAMEWORKS),
    detector(_detector),
    containerizer(_containerizer),
    files(_files),
    metrics(*this),
    gc(_gc),
    taskStatusUpdateManager(_taskStatusUpdateManager),
    masterPingTimeout(DEFAULT_MASTER_PING_TIMEOUT()),
    // The meta directory is derived from the (copied) `flags` member, not
    // from the `_flags` argument.
    metaDir(paths::getMetaRootDir(flags.work_dir)),
    recoveryErrors(0),
    credential(None()),
    authenticatee(nullptr),
    authenticating(None()),
    authenticated(false),
    reauthenticate(false),
    executorDirectoryMaxAllowedAge(age(0)),
    resourceEstimator(_resourceEstimator),
    qosController(_qosController),
    secretGenerator(_secretGenerator),
    volumeGidManager(_volumeGidManager),
    futureTracker(_futureTracker),
#ifndef __WINDOWS__
    // The executor socket parameter and member only exist on non-Windows
    // builds.
    executorSocket(_executorSocket),
#endif // __WINDOWS__
    authorizer(_authorizer),
    resourceVersion(protobuf::createUUID()) {}


// Destroys the agent, releasing the `Framework` objects still tracked in
// `frameworks` and the authenticatee (if one was created).
Slave::~Slave()
{
  // TODO(benh): Shut down frameworks?

  // TODO(benh): Shut down executors? The executor should get an "exited"
  // event and initiate a shut down itself.

  // Each mapped value is a heap-allocated `Framework`; free them all.
  for (auto& entry : frameworks) {
    delete entry.second;
  }

  // `authenticatee` starts out as `nullptr`; deleting a null pointer is a
  // no-op.
  delete authenticatee;
}


void Slave::signaled(int signal, int uid)
{
  if (signal == SIGUSR1) {
    Result<string> user = os::user(uid);

    shutdown(
        UPID(),
        "Received SIGUSR1 signal" +
        (user.isSome() ? " from user " + user.get() : ""));
  }
}


// Performs the agent's start-up work.
//
// In order, this function:
//   1. logs the start-up banner and validates a number of flags;
//   2. loads the agent credential, the HTTP credentials and (in SSL
//      builds) the JWT secret key, and initializes the HTTP authenticators
//      for every realm whose authentication is enabled;
//   3. initializes the resource estimator and QoS controller, creates the
//      work directory, the disk profile adaptor and the local resource
//      provider daemon;
//   4. determines the agent's resources (verifying that disk sources are
//      accessible), attributes and hostname, and fills in `info`;
//   5. initializes and pauses the status update managers and schedules
//      the disk usage checks;
//   6. installs the protobuf message handlers and the HTTP routes
//      (including the optional executor domain socket server);
//   7. attaches the agent log file, installs the signal handler and
//      finally kicks off recovery.
//
// Any configuration problem detected along the way terminates the process
// through `EXIT(EXIT_FAILURE)` (or `LOG(FATAL)` for the executor socket).
void Slave::initialize()
{
  // NOTE(review): `substr(5)` drops the first five characters of the
  // stringified PID - presumably the "slave" id prefix; confirm.
  LOG(INFO) << "Mesos agent started on " << string(self()).substr(5);
  LOG(INFO) << "Flags at startup: " << flags;

  if (self().address.ip.isLoopback()) {
    LOG(WARNING) << "\n**************************************************\n"
                 << "Agent bound to loopback interface!"
                 << " Cannot communicate with remote master(s)."
                 << " You might want to set '--ip' flag to a routable"
                 << " IP address.\n"
                 << "**************************************************";
  }

  // Reject a backoff factor that exceeds the maximum retry interval.
  if (flags.registration_backoff_factor > REGISTER_RETRY_INTERVAL_MAX) {
    EXIT(EXIT_FAILURE)
      << "Invalid value '" << flags.registration_backoff_factor << "'"
      << " for --registration_backoff_factor:"
      << " Must be less than " << REGISTER_RETRY_INTERVAL_MAX;
  }

  authenticateeName = flags.authenticatee;

  // Load credential for agent authentication with the master.
  if (flags.credential.isSome()) {
    Result<Credential> _credential =
      credentials::readCredential(flags.credential.get());
    if (_credential.isError()) {
      EXIT(EXIT_FAILURE) << _credential.error() << " (see --credential flag)";
    } else if (_credential.isNone()) {
      EXIT(EXIT_FAILURE)
        << "Empty credential file '" << flags.credential.get() << "'"
        << " (see --credential flag)";
    } else {
      credential = _credential.get();
      LOG(INFO) << "Agent using credential for: "
                << credential->principal();
    }
  }

  // Load the credentials handed to the HTTP authenticators, if a
  // credentials file was given. Both failure branches terminate the
  // process, so `credentials.get()` below is only reached on success.
  Option<Credentials> httpCredentials;
  if (flags.http_credentials.isSome()) {
    Result<Credentials> credentials =
      credentials::read(flags.http_credentials.get());
    if (credentials.isError()) {
       EXIT(EXIT_FAILURE)
         << credentials.error() << " (see --http_credentials flag)";
    } else if (credentials.isNone()) {
       EXIT(EXIT_FAILURE)
         << "Credentials file must contain at least one credential"
         << " (see --http_credentials flag)";
    }
    httpCredentials = credentials.get();
  }

  // Choose the HTTP authenticators: an explicit flag value wins. Without
  // it, SSL builds with executor authentication enabled default to the
  // basic and JWT authenticators; everything else defaults to basic only.
  string httpAuthenticators;
  if (flags.http_authenticators.isSome()) {
    httpAuthenticators = flags.http_authenticators.get();
#ifdef USE_SSL_SOCKET
  } else if (flags.authenticate_http_executors) {
    httpAuthenticators =
      string(DEFAULT_BASIC_HTTP_AUTHENTICATOR) + "," +
      string(DEFAULT_JWT_HTTP_AUTHENTICATOR);
#endif // USE_SSL_SOCKET
  } else {
    httpAuthenticators = DEFAULT_BASIC_HTTP_AUTHENTICATOR;
  }

  // The JWT secret key is only read in SSL builds; otherwise it stays
  // `None`.
  Option<string> jwtSecretKey;
#ifdef USE_SSL_SOCKET
  if (flags.jwt_secret_key.isSome()) {
    Try<string> jwtSecretKey_ = os::read(flags.jwt_secret_key.get());
    if (jwtSecretKey_.isError()) {
      EXIT(EXIT_FAILURE) << "Failed to read the file specified by "
                         << "--jwt_secret_key";
    }

    // TODO(greggomann): Factor the following code out into a common helper,
    // since we also do this when loading credentials.
    // Permission problems on the key file only produce warnings; they do
    // not prevent start-up.
    Try<os::Permissions> permissions =
      os::permissions(flags.jwt_secret_key.get());
    if (permissions.isError()) {
      LOG(WARNING) << "Failed to stat jwt secret key file '"
                   << flags.jwt_secret_key.get()
                   << "': " << permissions.error();
    } else if (permissions->others.rwx) {
      LOG(WARNING) << "Permissions on executor secret key file '"
                   << flags.jwt_secret_key.get()
                   << "' are too open; it is recommended that your"
                   << " key file is NOT accessible by others";
    }

    jwtSecretKey = jwtSecretKey_.get();
  }

  // Executor authentication requires a JWT secret key.
  if (flags.authenticate_http_executors) {
    if (flags.jwt_secret_key.isNone()) {
      EXIT(EXIT_FAILURE) << "--jwt_secret_key must be specified when "
                         << "--authenticate_http_executors is set to true";
    }

    Try<Nothing> result = initializeHttpAuthenticators(
        EXECUTOR_HTTP_AUTHENTICATION_REALM,
        strings::split(httpAuthenticators, ","),
        httpCredentials,
        jwtSecretKey);

    if (result.isError()) {
      EXIT(EXIT_FAILURE) << result.error();
    }
  }
#endif // USE_SSL_SOCKET

  // Initialize the authenticators for the read-only realm, if enabled.
  if (flags.authenticate_http_readonly) {
    Try<Nothing> result = initializeHttpAuthenticators(
        READONLY_HTTP_AUTHENTICATION_REALM,
        strings::split(httpAuthenticators, ","),
        httpCredentials,
        jwtSecretKey);

    if (result.isError()) {
      EXIT(EXIT_FAILURE) << result.error();
    }
  }

  // Initialize the authenticators for the read-write realm, if enabled.
  if (flags.authenticate_http_readwrite) {
    Try<Nothing> result = initializeHttpAuthenticators(
        READWRITE_HTTP_AUTHENTICATION_REALM,
        strings::split(httpAuthenticators, ","),
        httpCredentials,
        jwtSecretKey);

    if (result.isError()) {
      EXIT(EXIT_FAILURE) << result.error();
    }
  }

  if ((flags.gc_disk_headroom < 0) || (flags.gc_disk_headroom > 1)) {
    EXIT(EXIT_FAILURE)
      << "Invalid value '" << flags.gc_disk_headroom << "'"
      << " for --gc_disk_headroom. Must be between 0.0 and 1.0";
  }

  // The resource estimator and the QoS controller both receive a
  // callback that is deferred to `usage` on this process.
  Try<Nothing> initialize =
    resourceEstimator->initialize(defer(self(), &Self::usage));

  if (initialize.isError()) {
    EXIT(EXIT_FAILURE)
      << "Failed to initialize the resource estimator: " << initialize.error();
  }

  initialize = qosController->initialize(defer(self(), &Self::usage));

  if (initialize.isError()) {
    EXIT(EXIT_FAILURE)
      << "Failed to initialize the QoS Controller: " << initialize.error();
  }

  // Ensure slave work directory exists.
  Try<Nothing> mkdir = os::mkdir(flags.work_dir);
  if (mkdir.isError()) {
    EXIT(EXIT_FAILURE)
      << "Failed to create agent work directory '" << flags.work_dir << "': "
      << mkdir.error();
  }

  // Create the DiskProfileAdaptor module and set it globally so
  // any component that needs the module can share this instance.
  Try<DiskProfileAdaptor*> _diskProfileAdaptor =
    DiskProfileAdaptor::create(flags.disk_profile_adaptor);

  if (_diskProfileAdaptor.isError()) {
    EXIT(EXIT_FAILURE)
      << "Failed to create disk profile adaptor: "
      << _diskProfileAdaptor.error();
  }

  // The raw pointer returned by `create` is handed to a `shared_ptr`,
  // which owns it from here on.
  diskProfileAdaptor =
    shared_ptr<DiskProfileAdaptor>(_diskProfileAdaptor.get());

  DiskProfileAdaptor::setAdaptor(diskProfileAdaptor);

  // Build the URL of this agent's resource provider API endpoint; the
  // scheme is "https" only when SSL is compiled in and enabled.
  string scheme = "http";

#ifdef USE_SSL_SOCKET
  if (process::network::openssl::flags().enabled) {
    scheme = "https";
  }
#endif

  http::URL localResourceProviderURL(
      scheme,
      self().address.ip,
      self().address.port,
      self().id + "/api/v1/resource_provider");

  Try<Owned<LocalResourceProviderDaemon>> _localResourceProviderDaemon =
    LocalResourceProviderDaemon::create(
        localResourceProviderURL,
        flags,
        secretGenerator);

  if (_localResourceProviderDaemon.isError()) {
    EXIT(EXIT_FAILURE)
      << "Failed to create local resource provider daemon: "
      << _localResourceProviderDaemon.error();
  }

  localResourceProviderDaemon = std::move(_localResourceProviderDaemon.get());

  Try<Resources> resources = Containerizer::resources(flags);
  if (resources.isError()) {
    EXIT(EXIT_FAILURE)
      << "Failed to determine agent resources: " << resources.error();
  }

  // Ensure disk `source`s are accessible.
  // Only resources that carry a disk with a source are inspected.
  foreach (
      const Resource& resource,
      resources->filter([](const Resource& _resource) {
        return _resource.has_disk() && _resource.disk().has_source();
      })) {
    const Resource::DiskInfo::Source& source = resource.disk().source();
    switch (source.type()) {
      case Resource::DiskInfo::Source::PATH: {
        // For `PATH` sources we create them if they do not exist.
        CHECK(source.has_path());

        if (!source.path().has_root()) {
          EXIT(EXIT_FAILURE)
            << "PATH disk root directory is not specified "
            << "'" << resource << "'";
        }

        Try<Nothing> mkdir = os::mkdir(source.path().root(), true);

        if (mkdir.isError()) {
          EXIT(EXIT_FAILURE)
            << "Failed to create DiskInfo path directory "
            << "'" << source.path().root() << "': " << mkdir.error();
        }
        break;
      }
      case Resource::DiskInfo::Source::MOUNT: {
        CHECK(source.has_mount());

        if (!source.mount().has_root()) {
          EXIT(EXIT_FAILURE)
            << "MOUNT disk root directory is not specified "
            << "'" << resource << "'";
        }

        // For `MOUNT` sources we fail if they don't exist.
        // On Linux we test the mount table for existence.
#ifdef __linux__
        // Get the `realpath` of the `root` to verify it against the
        // mount table entries.
        // TODO(jmlvanre): Consider enforcing allowing only real paths
        // as opposed to symlinks. This would prevent the ability for
        // an operator to change the underlying data while the slave
        // checkpointed `root` had the same value. We could also check
        // the UUID of the underlying block device to catch this case.
        Result<string> realpath = os::realpath(source.mount().root());

        if (!realpath.isSome()) {
          EXIT(EXIT_FAILURE)
            << "Failed to determine `realpath` for DiskInfo mount in resource '"
            << resource << "' with path '" << source.mount().root() << "': "
            << (realpath.isError() ? realpath.error() : "no such path");
        }

        // TODO(jmlvanre): Consider moving this out of the for loop.
        Try<fs::MountTable> mountTable = fs::MountTable::read("/proc/mounts");
        if (mountTable.isError()) {
          EXIT(EXIT_FAILURE)
            << "Failed to open mount table to verify mounts: "
            << mountTable.error();
        }

        // The resolved root must exactly match the directory of some
        // mount table entry.
        bool foundEntry = false;
        foreach (const fs::MountTable::Entry& entry, mountTable->entries) {
          if (entry.dir == realpath.get()) {
            foundEntry = true;
            break;
          }
        }

        if (!foundEntry) {
          EXIT(EXIT_FAILURE)
            << "Failed to find mount '" << realpath.get()
            << "' in /proc/mounts";
        }
#else // __linux__
        // On other platforms we test whether that provided `root` exists.
        if (!os::exists(source.mount().root())) {
          EXIT(EXIT_FAILURE)
            << "Failed to find mount point '" << source.mount().root() << "'";
        }
#endif // __linux__
        break;
      }
      // All remaining source types are rejected at start-up.
      case Resource::DiskInfo::Source::BLOCK:
      case Resource::DiskInfo::Source::RAW:
      case Resource::DiskInfo::Source::UNKNOWN: {
        EXIT(EXIT_FAILURE)
          << "Unsupported 'DiskInfo.Source.Type' in '" << resource << "'";
      }
    }
  }

  Attributes attributes;
  if (flags.attributes.isSome()) {
    attributes = Attributes::parse(flags.attributes.get());
  }

  // Determine our hostname or use the hostname provided.
  string hostname;

  if (flags.hostname.isNone()) {
    if (flags.hostname_lookup) {
      Try<string> result = net::getHostname(self().address.ip);

      if (result.isError()) {
        EXIT(EXIT_FAILURE) << "Failed to get hostname: " << result.error();
      }

      hostname = result.get();
    } else {
      // We use the IP address for hostname if the user requested us
      // NOT to look it up, and it wasn't explicitly set via --hostname:
      hostname = stringify(self().address.ip);
    }
  } else {
    hostname = flags.hostname.get();
  }

  // Initialize slave info.
  info.set_hostname(hostname);
  info.set_port(self().address.port);

  // If hooks are available, the resources are replaced by whatever the
  // resources decorator returns for the current `info`.
  info.mutable_resources()->CopyFrom(resources.get());
  if (HookManager::hooksAvailable()) {
    info.mutable_resources()->CopyFrom(
        HookManager::slaveResourcesDecorator(info));
  }

  // Initialize `totalResources` with `info.resources`, checkpointed
  // resources will be applied later during recovery.
  totalResources = info.resources();

  LOG(INFO) << "Agent resources: " << info.resources();

  // Attributes are handled the same way as resources: set from the flag,
  // then optionally replaced by the attributes decorator.
  info.mutable_attributes()->CopyFrom(attributes);
  if (HookManager::hooksAvailable()) {
    info.mutable_attributes()->CopyFrom(
        HookManager::slaveAttributesDecorator(info));
  }

  LOG(INFO) << "Agent attributes: " << info.attributes();

  // Checkpointing of slaves is always enabled.
  info.set_checkpoint(true);

  if (flags.domain.isSome()) {
    info.mutable_domain()->CopyFrom(flags.domain.get());
  }

  LOG(INFO) << "Agent hostname: " << info.hostname();

  // Status updates handed back by the task status update manager are
  // deferred to `forward` on this process.
  taskStatusUpdateManager->initialize(defer(self(), &Slave::forward, lambda::_1)
    .operator std::function<void(StatusUpdate)>());

  // We pause the status update managers so that they don't forward any updates
  // while the agent is still recovering. They are unpaused/resumed when the
  // agent (re-)registers with the master.
  taskStatusUpdateManager->pause();
  operationStatusUpdateManager.pause();

  // Start disk monitoring.
  // NOTE: We send a delayed message here instead of directly calling
  // checkDiskUsage, to make disabling this feature easy (e.g by specifying
  // a very large disk_watch_interval).
  delay(flags.disk_watch_interval, self(), &Slave::checkDiskUsage);

  // Start image store disk monitoring. Please note that image layers
  // garbage collection is only enabled if the agent flag `--image_gc_config`
  // is set.
  // TODO(gilbert): Consider moving the image auto GC logic to containerizers
  // respectively. For now, it is only enabled for the Mesos Containerizer.
  if (flags.image_gc_config.isSome() &&
      flags.image_providers.isSome() &&
      strings::contains(flags.containerizers, "mesos")) {
    delay(
        Nanoseconds(
            flags.image_gc_config->image_disk_watch_interval().nanoseconds()),
        self(),
        &Slave::checkImageDiskUsage);
  }

  startTime = Clock::now();

  // Install protobuf handlers.
  // Handlers installed with field accessors receive those fields as
  // arguments; handlers installed without them are given no accessors
  // here.
  install<SlaveRegisteredMessage>(
      &Slave::registered,
      &SlaveRegisteredMessage::slave_id,
      &SlaveRegisteredMessage::connection);

  install<SlaveReregisteredMessage>(
      &Slave::reregistered,
      &SlaveReregisteredMessage::slave_id,
      &SlaveReregisteredMessage::reconciliations,
      &SlaveReregisteredMessage::connection);

  install<RunTaskMessage>(
      &Slave::handleRunTaskMessage);

  install<RunTaskGroupMessage>(
      &Slave::handleRunTaskGroupMessage);

  install<KillTaskMessage>(
      &Slave::killTask);

  install<ShutdownExecutorMessage>(
      &Slave::shutdownExecutor,
      &ShutdownExecutorMessage::framework_id,
      &ShutdownExecutorMessage::executor_id);

  install<ShutdownFrameworkMessage>(
      &Slave::shutdownFramework,
      &ShutdownFrameworkMessage::framework_id);

  install<FrameworkToExecutorMessage>(
      &Slave::schedulerMessage,
      &FrameworkToExecutorMessage::slave_id,
      &FrameworkToExecutorMessage::framework_id,
      &FrameworkToExecutorMessage::executor_id,
      &FrameworkToExecutorMessage::data);

  install<UpdateFrameworkMessage>(
      &Slave::updateFramework);

  install<CheckpointResourcesMessage>(
      &Slave::checkpointResourcesMessage,
      &CheckpointResourcesMessage::resources);

  install<ApplyOperationMessage>(
      &Slave::applyOperation);

  install<ReconcileOperationsMessage>(
      &Slave::reconcileOperations);

  install<StatusUpdateAcknowledgementMessage>(
      &Slave::statusUpdateAcknowledgement,
      &StatusUpdateAcknowledgementMessage::slave_id,
      &StatusUpdateAcknowledgementMessage::framework_id,
      &StatusUpdateAcknowledgementMessage::task_id,
      &StatusUpdateAcknowledgementMessage::uuid);

  install<AcknowledgeOperationStatusMessage>(
      &Slave::operationStatusAcknowledgement);

  install<RegisterExecutorMessage>(
      &Slave::registerExecutor,
      &RegisterExecutorMessage::framework_id,
      &RegisterExecutorMessage::executor_id);

  install<ReregisterExecutorMessage>(
      &Slave::reregisterExecutor,
      &ReregisterExecutorMessage::framework_id,
      &ReregisterExecutorMessage::executor_id,
      &ReregisterExecutorMessage::tasks,
      &ReregisterExecutorMessage::updates);

  install<StatusUpdateMessage>(
      &Slave::statusUpdate,
      &StatusUpdateMessage::update,
      &StatusUpdateMessage::pid);

  install<ExecutorToFrameworkMessage>(
      &Slave::executorMessage,
      &ExecutorToFrameworkMessage::slave_id,
      &ExecutorToFrameworkMessage::framework_id,
      &ExecutorToFrameworkMessage::executor_id,
      &ExecutorToFrameworkMessage::data);

  install<ShutdownMessage>(
      &Slave::shutdown,
      &ShutdownMessage::message);

  install<DrainSlaveMessage>(&Slave::drain);

  install<PingSlaveMessage>(
      &Slave::ping,
      &PingSlaveMessage::connected);

  // Setup the '/api/v1' handler for streaming requests.
  RouteOptions options;
  options.requestStreaming = true;
  route("/api/v1",
        // TODO(benh): Is this authentication realm sufficient or do
        // we need some kind of hybrid if we expect both executors
        // and operators/tooling to use this endpoint?
        READWRITE_HTTP_AUTHENTICATION_REALM,
        Http::API_HELP(),
        [this](const http::Request& request,
               const Option<Principal>& principal) {
          logRequest(request);
          return http.api(request, principal);
        },
        options);

#ifndef __WINDOWS__
  if (executorSocket.isSome()) {
    // We use `http::Server` to manage the communication channel.
    // Since `http::Server` currently doesn't offer support for
    // authentication we then inject the request received by the
    // server into normal agent routing logic.
    Try<http::Server> server = http::Server::create(
        *executorSocket,
        process::defer(
            self(),
            [this](const process::network::Socket&, http::Request request)
              -> Future<http::Response> {
              // Restrict access to only allow `/slave(N)/api/v1/executor`
              // and `/slave(N)/api/v1`. Executors need to be able to
              // access the first to subscribe and the latter to e.g.,
              // launch containers or perform other operator API calls.
              string selfPrefix = "/" + self().id;
              if (request.url.path != selfPrefix + "/api/v1/executor" &&
                  request.url.path != selfPrefix + "/api/v1") {
                LOG(INFO)
                  << "Blocking request for " << request.url.path
                  << " over executor socket";
                return http::Forbidden();
              }

              // Create an `HttpEvent` with the needed information which can
              // be consumed by the agent. The event contains e.g., the
              // requested path so the expected route `/api/v1/executor` is
              // routed when consuming the event.
              std::unique_ptr<Promise<http::Response>> promise(
                  new Promise<http::Response>());

              // Grab the future before the promise is moved into the
              // event below.
              Future<http::Response> response = promise->future();

              process::HttpEvent event(
                  std::unique_ptr<http::Request>(new http::Request(request)),
                  std::move(promise));

              std::move(event).consume(this);

              return response;
            }),
        {
          /* .scheme =*/process::http::Scheme::HTTP_UNIX,
          /* .backlog =*/16384,
        });

    if (server.isError()) {
      LOG(FATAL) << "Could not start listening on executor socket: "
                 << server.error();
    } else {
      executorSocketServer = std::move(*server);

      Future<Nothing> executorSocketServerTerminated =
        executorSocketServer->run();

      // Only a failure that is already visible right after `run()`
      // returns is detected here.
      if (executorSocketServerTerminated.isFailed()) {
        LOG(FATAL) << "Could not start listening on executor socket: "
                   << executorSocketServerTerminated.failure();
      }
    }
  }
#endif // __WINDOWS__

  // Register the remaining HTTP routes. Every handler except `/health`
  // logs the request first; `/state` and `/containers` additionally log
  // the response once it is ready.
  route("/api/v1/executor",
        EXECUTOR_HTTP_AUTHENTICATION_REALM,
        Http::EXECUTOR_HELP(),
        [this](const http::Request& request,
               const Option<Principal>& principal) {
          logRequest(request);
          return http.executor(request, principal);
        });
  route(
      "/api/v1/resource_provider",
      RESOURCE_PROVIDER_HTTP_AUTHENTICATION_REALM,
      Http::RESOURCE_PROVIDER_HELP(),
      [this](const http::Request& request, const Option<Principal>& principal)
        -> Future<http::Response> {
        logRequest(request);

        // Answer with 503 while no resource provider manager is set.
        if (resourceProviderManager.get() == nullptr) {
          return http::ServiceUnavailable();
        }

        return resourceProviderManager->api(request, principal);
      });
  route("/state",
        READONLY_HTTP_AUTHENTICATION_REALM,
        Http::STATE_HELP(),
        [this](const http::Request& request,
               const Option<Principal>& principal) {
          logRequest(request);
          return http.state(request, principal)
            .onReady([request](const process::http::Response& response) {
              logResponse(request, response);
            });
        });
  route("/flags",
        READONLY_HTTP_AUTHENTICATION_REALM,
        Http::FLAGS_HELP(),
        [this](const http::Request& request,
               const Option<Principal>& principal) {
          logRequest(request);
          return http.flags(request, principal);
        });
  // `/health` is registered without an authentication realm.
  route("/health",
        Http::HEALTH_HELP(),
        [this](const http::Request& request) {
          return http.health(request);
        });
  route("/monitor/statistics",
        READONLY_HTTP_AUTHENTICATION_REALM,
        Http::STATISTICS_HELP(),
        [this](const http::Request& request,
               const Option<Principal>& principal) {
          logRequest(request);
          return http.statistics(request, principal);
        });
  route("/containers",
        READONLY_HTTP_AUTHENTICATION_REALM,
        Http::CONTAINERS_HELP(),
        [this](const http::Request& request,
               const Option<Principal>& principal) {
          logRequest(request);
          return http.containers(request, principal)
            .onReady([request](const process::http::Response& response) {
              logResponse(request, response);
            });
        });
  route("/containerizer/debug",
        READONLY_HTTP_AUTHENTICATION_REALM,
        Http::CONTAINERIZER_DEBUG_HELP(),
        [this](const http::Request& request,
               const Option<Principal>& principal) {
          logRequest(request);
          return http.containerizerDebug(request, principal);
        });

  // TODO(tillt): Use generalized lambda capture once we adopt C++14.
  // The authorizer is copied into a local so that the lambda below can
  // capture it by value instead of capturing `this`.
  Option<Authorizer*> _authorizer = authorizer;

  auto authorize = [_authorizer](const Option<Principal>& principal) {
    return authorization::authorizeLogAccess(_authorizer, principal);
  };

  // Expose the log file for the webui. Fall back to 'log_dir' if
  // an explicit file was not specified.
  if (flags.external_log_file.isSome()) {
    files->attach(
        flags.external_log_file.get(), AGENT_LOG_VIRTUAL_PATH, authorize)
      .onAny(defer(self(),
                   &Self::fileAttached,
                   lambda::_1,
                   flags.external_log_file.get(),
                   AGENT_LOG_VIRTUAL_PATH));
  } else if (flags.log_dir.isSome()) {
    Try<string> log =
      logging::getLogFile(logging::getLogSeverity(flags.logging_level));

    // A missing log file is logged as an error but does not abort
    // start-up.
    if (log.isError()) {
      LOG(ERROR) << "Agent log file cannot be found: " << log.error();
    } else {
      files->attach(log.get(), AGENT_LOG_VIRTUAL_PATH, authorize)
        .onAny(defer(self(),
                     &Self::fileAttached,
                     lambda::_1,
                     log.get(),
                     AGENT_LOG_VIRTUAL_PATH));
    }
  }

  // Check that the reconfiguration_policy flag is valid.
  if (flags.reconfiguration_policy != "equal" &&
      flags.reconfiguration_policy != "additive") {
    EXIT(EXIT_FAILURE)
      << "Unknown option for 'reconfiguration_policy' flag "
      << flags.reconfiguration_policy << "."
      << " Please run the agent with '--help' to see the valid options.";
  }

  // Check that the recover flag is valid.
  if (flags.recover != "reconnect" && flags.recover != "cleanup") {
    EXIT(EXIT_FAILURE)
      << "Unknown option for 'recover' flag " << flags.recover << "."
      << " Please run the agent with '--help' to see the valid options";
  }

  // Wrap `signaled` in a `std::function` that defers to this process, and
  // hand it to the platform-specific installer (console ctrl handler on
  // Windows, signal handler elsewhere).
  // NOTE(review): `signalHandler` is a local and only its address is
  // passed on - presumably the installer copies it; confirm.
  auto signalHandler = defer(self(), &Slave::signaled, lambda::_1, lambda::_2)
    .operator std::function<void(int, int)>();

#ifdef __WINDOWS__
  if (!os::internal::installCtrlHandler(&signalHandler)) {
    EXIT(EXIT_FAILURE)
      << "Failed to configure console handlers: " << WindowsError().message;
  }
#else
  if (os::internal::configureSignal(&signalHandler) < 0) {
    EXIT(EXIT_FAILURE)
      << "Failed to configure signal handlers: " << os::strerror(errno);
  }
#endif  // __WINDOWS__

  // Do recovery.
  // `state::recover` is started through `async`; the follow-up steps
  // (`recover`, `_recover`) and the final `__recover` callback are all
  // deferred to this process.
  async(&state::recover, metaDir, flags.strict)
    .then(defer(self(), &Slave::recover, lambda::_1))
    .then(defer(self(), &Slave::_recover))
    .onAny(defer(self(), &Slave::__recover, lambda::_1));
}


// Invoked when the agent process is terminating. Shuts down every
// framework that has not enabled checkpointing and then releases the
// resource provider manager.
void Slave::finalize()
{
  LOG(INFO) << "Agent terminating";

  // NOTE: Iterate over a snapshot of the keys, since 'shutdownFramework'
  // can potentially remove a framework from 'frameworks'.
  foreach (const FrameworkID& frameworkId, frameworks.keys()) {
    Framework* framework = frameworks[frameworkId];

    // NOTE: Checkpointing frameworks are deliberately left alone. This
    // is because slave recovery tests terminate the slave to simulate
    // slave restart.
    if (framework->info.checkpoint()) {
      continue;
    }

    // TODO(benh): Because a shut down isn't instantaneous (but has
    // a shut down/kill phases) we might not actually propagate all
    // the status updates appropriately here. Consider providing
    // an alternative function which skips the shut down phase and
    // simply does a kill (sending all status updates
    // immediately). Of course, this still isn't sufficient
    // because those status updates might get lost and we won't
    // resend them unless we build that into the system.
    shutdownFramework(UPID(), frameworkId);
  }

  // Explicitly tear down the resource provider manager to ensure that the
  // wrapped process is terminated and releases the underlying storage.
  resourceProviderManager.reset();
}


// Shuts the agent down.
//
// `from` is the PID of the requester. If it is set, it must match the
// currently known master, otherwise the request is ignored. If it is
// unset, the shutdown is self-initiated; in that case an agent that
// already has an ID sends an `UnregisterSlaveMessage` to the master
// (when one is known).
//
// `message` is an optional human readable reason that is only used for
// logging.
//
// The agent transitions to TERMINATING and either terminates right away
// (no frameworks) or shuts down every framework first.
void Slave::shutdown(const UPID& from, const string& message)
{
  if (from && master != from) {
    LOG(WARNING) << "Ignoring shutdown message from " << from
                 << " because it is not from the registered master: "
                 << (master.isSome() ? stringify(master.get()) : "None");
    return;
  }

  if (from) {
    LOG(INFO) << "Agent asked to shut down by " << from
              << (message.empty() ? "" : " because '" + message + "'");
  } else if (info.has_id()) {
    if (message.empty()) {
      LOG(INFO) << "Unregistering and shutting down";
    } else {
      LOG(INFO) << message << "; unregistering and shutting down";
    }

    // `master` is reset to `None` whenever the leading master is lost
    // (see `detected()`), so a self-initiated shutdown can happen while
    // no master is known. Only unregister when there is a master to
    // talk to; calling `get()` on an empty `Option` is invalid.
    if (master.isSome()) {
      UnregisterSlaveMessage message_;
      message_.mutable_slave_id()->MergeFrom(info.id());
      send(master.get(), message_);
    } else {
      LOG(WARNING) << "Unable to unregister because no master is detected";
    }
  } else {
    if (message.empty()) {
      LOG(INFO) << "Shutting down";
    } else {
      LOG(INFO) << message << "; shutting down";
    }
  }

  state = TERMINATING;

  if (frameworks.empty()) { // Terminate slave if there are no frameworks.
    terminate(self());
  } else {
    // NOTE: The slave will terminate after all the executors have
    // terminated.
    // NOTE: We use 'frameworks.keys()' here because 'shutdownFramework'
    // can potentially remove a framework from 'frameworks'.
    foreach (const FrameworkID& frameworkId, frameworks.keys()) {
      shutdownFramework(from, frameworkId);
    }
  }
}


// Handles a request to drain the agent.
//
// If the agent has no frameworks and no operations, this only logs and
// returns. Otherwise the `DrainConfig` from `drainSlaveMessage` is
// checkpointed, stored in `drainConfig` (replacing any previous one),
// the drain start time is recorded, and every pending, launched and
// queued task is killed. When the config carries a max grace period, it
// overrides a task's kill policy whenever the task has no grace period
// or has one greater than the maximum.
//
// `from` is not used by this function.
void Slave::drain(
    const UPID& from,
    DrainSlaveMessage&& drainSlaveMessage)
{
  if (operations.empty() && frameworks.empty()) {
    LOG(INFO)
      << "Received DrainConfig " << drainSlaveMessage.config()
      << (drainConfig.isSome()
          ? "; previously stored DrainConfig " + stringify(*drainConfig)
          : "")
      << "; agent has no stored frameworks, tasks, or operations,"
         " so draining is already complete";

    return;
  }

  // Collect the IDs of all tasks known to the agent, grouped by
  // framework, purely for the log message below.
  hashmap<FrameworkID, hashset<TaskID>> pendingTaskIds;
  hashmap<FrameworkID, hashset<TaskID>> queuedTaskIds;
  hashmap<FrameworkID, hashset<TaskID>> launchedTaskIds;

  foreachvalue (Framework* framework, frameworks) {
    // `pendingTasks` holds one task map per executor, so the IDs must be
    // accumulated; assigning each map's keys would keep only the tasks
    // of whichever executor happened to be visited last.
    foreachvalue (const auto& taskMap, framework->pendingTasks) {
      foreachkey (const TaskID& taskId, taskMap) {
        pendingTaskIds[framework->id()].insert(taskId);
      }
    }

    foreachvalue (Executor* executor, framework->executors) {
      foreachkey (const TaskID& taskId, executor->queuedTasks) {
        queuedTaskIds[framework->id()].insert(taskId);
      }

      foreachkey (const TaskID& taskId, executor->launchedTasks) {
        launchedTaskIds[framework->id()].insert(taskId);
      }
    }
  }

  LOG(INFO)
    << "Initiating drain with DrainConfig " << drainSlaveMessage.config()
    << (drainConfig.isSome()
        ? "; overwriting previous DrainConfig " + stringify(*drainConfig)
        : "")
    << "; agent has (pending tasks, queued tasks, launched tasks, operations)"
    << " == ("
    << stringify(pendingTaskIds) << ", "
    << stringify(queuedTaskIds) << ", "
    << stringify(launchedTaskIds) << ", "
    << stringify(operations.keys()) << ")";

  // Persist the config before acting on it.
  CHECK_SOME(state::checkpoint(
      paths::getDrainConfigPath(metaDir, info.id()),
      drainSlaveMessage.config()))
    << "Failed to checkpoint DrainConfig";

  drainConfig = drainSlaveMessage.config();
  estimatedDrainStartTime = Clock::now();

  const Option<DurationInfo> maxGracePeriod =
    drainConfig->has_max_grace_period()
      ? drainConfig->max_grace_period()
      : Option<DurationInfo>::none();

  // Returns the kill policy override to apply to a task with the given
  // kill policy, or `None` if the task's own policy should be used.
  auto calculateKillPolicy =
    [&](const Option<KillPolicy>& killPolicy) -> Option<KillPolicy> {
      if (maxGracePeriod.isNone()) {
        return None();
      }

      KillPolicy killPolicyOverride;
      killPolicyOverride.mutable_grace_period()->CopyFrom(maxGracePeriod.get());

      // Task kill policy is not set or unknown.
      if (killPolicy.isNone() || !killPolicy->has_grace_period()) {
        return killPolicyOverride;
      }

      // Task kill policy is greater than the override.
      if (maxGracePeriod.get() < killPolicy->grace_period()) {
        return killPolicyOverride;
      }

      return None();
    };

  // Frameworks may be removed within `kill()` or `killPendingTask()` below,
  // so we must copy them and their members before looping.
  foreachvalue (Framework* framework, utils::copy(frameworks)) {
    typedef hashmap<TaskID, TaskInfo> TaskMap;
    foreachvalue (const TaskMap& tasks, utils::copy(framework->pendingTasks)) {
      foreachvalue (const TaskInfo& task, tasks) {
        killPendingTask(framework->id(), framework, task.task_id());
      }
    }

    foreachvalue (Executor* executor, utils::copy(framework->executors)) {
      foreachvalue (Task* task, executor->launchedTasks) {
        kill(framework->id(),
             framework,
             executor,
             task->task_id(),
             calculateKillPolicy(
                task->has_kill_policy()
                  ? task->kill_policy()
                  : Option<KillPolicy>::none()));
      }

      foreachvalue (const TaskInfo& task, utils::copy(executor->queuedTasks)) {
        kill(framework->id(),
             framework,
             executor,
             task.task_id(),
             calculateKillPolicy(
                task.has_kill_policy()
                  ? task.kill_policy()
                  : Option<KillPolicy>::none()));
      }
    }
  }
}


// Continuation for an attach request: logs whether attaching `path`
// to `virtualPath` succeeded, failed, or was discarded.
void Slave::fileAttached(
    const Future<Nothing>& result,
    const string& path,
    const string& virtualPath)
{
  if (!result.isReady()) {
    // Either failed or discarded; report which one.
    LOG(ERROR) << "Failed to attach '" << path << "'"
               << " to virtual path '" << virtualPath << "': "
               << (result.isFailed() ? result.failure() : "discarded");
    return;
  }

  VLOG(1) << "Successfully attached '" << path << "'"
          << " to virtual path '" << virtualPath << "'";
}


// TODO(vinod/bmahler): Get rid of this helper.
//
// Detaches `path` via `files` and returns `Nothing`.
// NOTE(review): presumably the `Nothing` return type exists so that this
// can be chained as a continuation; confirm against the callers.
Nothing Slave::detachFile(const string& path)
{
  files->detach(path);
  return Nothing();
}


void Slave::attachTaskVolumeDirectory(
    const ExecutorInfo& executorInfo,
    const ContainerID& executorContainerId,
    const Task& task)
{
  CHECK(executorInfo.has_type() &&
        executorInfo.type() == ExecutorInfo::DEFAULT);

  CHECK_EQ(task.executor_id(), executorInfo.executor_id());

  // This is the case that the task has disk resources specified.
  foreach (const Resource& resource, task.resources()) {
    // Ignore if there are no disk resources or if the
    // disk resources did not specify a volume mapping.
    if (!resource.has_disk() || !resource.disk().has_volume()) {
      continue;
    }

    const Volume& volume = resource.disk().volume();

    const string executorRunPath = paths::getExecutorRunPath(
        flags.work_dir,
        info.id(),
        task.framework_id(),
        task.executor_id(),
        executorContainerId);

    const string executorDirectoryPath =
      path::join(executorRunPath, volume.container_path());

    const string taskPath = paths::getTaskPath(
        flags.work_dir,
        info.id(),
        task.framework_id(),
        task.executor_id(),
        executorContainerId,
        task.task_id());

    const string taskDirectoryPath =
      path::join(taskPath, volume.container_path());

    files->attach(executorDirectoryPath, taskDirectoryPath)
      .onAny(defer(
          self(),
          &Self::fileAttached,
          lambda::_1,
          executorDirectoryPath,
          taskDirectoryPath));
  }

  // This is the case that the executor has disk resources specified
  // and the task's ContainerInfo has a `SANDBOX_PATH` volume with type
  // `PARENT` to share the executor's disk volume.
  hashset<string> executorContainerPaths;
  foreach (const Resource& resource, executorInfo.resources()) {
    // Ignore if there are no disk resources or if the
    // disk resources did not specify a volume mapping.
    if (!resource.has_disk() || !resource.disk().has_volume()) {
      continue;
    }

    const Volume& volume = resource.disk().volume();
    executorContainerPaths.insert(volume.container_path());
  }

  if (executorContainerPaths.empty()) {
    return;
  }

  if (task.has_container()) {
    foreach (const Volume& volume, task.container().volumes()) {
      if (!volume.has_source() ||
          volume.source().type() != Volume::Source::SANDBOX_PATH) {
        continue;
      }

      CHECK(volume.source().has_sandbox_path());

      const Volume::Source::SandboxPath& sandboxPath =
        volume.source().sandbox_path();

      if (sandboxPath.type() != Volume::Source::SandboxPath::PARENT) {
        continue;
      }

      if (!executorContainerPaths.contains(sandboxPath.path())) {
        continue;
      }

      const string executorRunPath = paths::getExecutorRunPath(
          flags.work_dir,
          info.id(),
          task.framework_id(),
          task.executor_id(),
          executorContainerId);

      const string executorDirectoryPath =
        path::join(executorRunPath, sandboxPath.path());

      const string taskPath = paths::getTaskPath(
          flags.work_dir,
          info.id(),
          task.framework_id(),
          task.executor_id(),
          executorContainerId,
          task.task_id());

      const string taskDirectoryPath =
        path::join(taskPath, volume.container_path());

      files->attach(executorDirectoryPath, taskDirectoryPath)
        .onAny(defer(
            self(),
            &Self::fileAttached,
            lambda::_1,
            executorDirectoryPath,
            taskDirectoryPath));
    }
  }
}


// Detaches the virtual volume directories of the given tasks, i.e. the
// counterpart of `attachTaskVolumeDirectory` for a list of tasks.
//
// For each task this detaches:
//   1. The task directory path of every disk resource of the task that
//      specifies a volume.
//   2. The task directory path of every `SANDBOX_PATH` volume of type
//      `PARENT` in the task's `ContainerInfo` whose path matches the
//      container path of a disk volume of the executor.
//
// `tasks` must be empty unless `executorInfo` describes a `DEFAULT`
// executor, and every task must belong to that executor.
void Slave::detachTaskVolumeDirectories(
    const ExecutorInfo& executorInfo,
    const ContainerID& executorContainerId,
    const vector<Task>& tasks)
{
  // NOTE: If the executor is not a default executor, this function will
  // still be called but with an empty list of tasks.
  CHECK(tasks.empty() ||
        (executorInfo.has_type() &&
         executorInfo.type() == ExecutorInfo::DEFAULT));

  hashset<string> executorContainerPaths;
  foreach (const Resource& resource, executorInfo.resources()) {
    // Ignore if there are no disk resources or if the
    // disk resources did not specify a volume mapping.
    if (!resource.has_disk() || !resource.disk().has_volume()) {
      continue;
    }

    const Volume& volume = resource.disk().volume();
    executorContainerPaths.insert(volume.container_path());
  }

  foreach (const Task& task, tasks) {
    CHECK_EQ(task.executor_id(), executorInfo.executor_id());

    // The task's base path is the same for all of its volumes, so
    // compute it once per task instead of once per volume.
    const string taskPath = paths::getTaskPath(
        flags.work_dir,
        info.id(),
        task.framework_id(),
        task.executor_id(),
        executorContainerId,
        task.task_id());

    // This is the case that the task has disk resources specified.
    foreach (const Resource& resource, task.resources()) {
      // Ignore if there are no disk resources or if the
      // disk resources did not specify a volume mapping.
      if (!resource.has_disk() || !resource.disk().has_volume()) {
        continue;
      }

      const Volume& volume = resource.disk().volume();

      files->detach(path::join(taskPath, volume.container_path()));
    }

    // This is the case that the executor has disk resources specified
    // and the task's ContainerInfo has a `SANDBOX_PATH` volume with type
    // `PARENT` to share the executor's disk volume.
    if (executorContainerPaths.empty() || !task.has_container()) {
      continue;
    }

    foreach (const Volume& volume, task.container().volumes()) {
      if (!volume.has_source() ||
          volume.source().type() != Volume::Source::SANDBOX_PATH) {
        continue;
      }

      CHECK(volume.source().has_sandbox_path());

      const Volume::Source::SandboxPath& sandboxPath =
        volume.source().sandbox_path();

      if (sandboxPath.type() != Volume::Source::SandboxPath::PARENT) {
        continue;
      }

      if (!executorContainerPaths.contains(sandboxPath.path())) {
        continue;
      }

      files->detach(path::join(taskPath, volume.container_path()));
    }
  }
}


void Slave::detected(const Future<Option<MasterInfo>>& _master)
{
  CHECK(state == DISCONNECTED ||
        state == RUNNING ||
        state == TERMINATING) << state;

  if (state != TERMINATING) {
    state = DISCONNECTED;
  }

  // Pause the status updates.
  taskStatusUpdateManager->pause();
  operationStatusUpdateManager.pause();

  if (_master.isFailed()) {
    EXIT(EXIT_FAILURE) << "Failed to detect a master: " << _master.failure();
  }

  Option<MasterInfo> latest;

  if (_master.isDiscarded()) {
    LOG(INFO) << "Re-detecting master";
    latest = None();
    master = None();
  } else if (_master->isNone()) {
    LOG(INFO) << "Lost leading master";
    latest = None();
    master = None();
  } else {
    latest = _master.get();
    master = UPID(latest->pid());

    LOG(INFO) << "New master detected at " << master.get();

    // Cancel the pending registration timer to avoid spurious attempts
    // at reregistration. `Clock::cancel` is idempotent, so this call
    // is safe even if no timer is active or pending.
    Clock::cancel(agentRegistrationTimer);

    if (state == TERMINATING) {
      LOG(INFO) << "Skipping registration because agent is terminating";
      return;
    }

    if (requiredMasterCapabilities.agentUpdate) {
      protobuf::master::Capabilities masterCapabilities(
          latest->capabilities());

      if (!masterCapabilities.agentUpdate) {
        EXIT(EXIT_FAILURE) <<
          "Agent state changed on restart, but the detected master lacks the "
          "AGENT_UPDATE capability. Refusing to connect.";
        return;
      }

      if (dynamic_cast<mesos::master::detector::StandaloneMasterDetector*>(
          detector)) {
        LOG(WARNING) <<
          "The AGENT_UPDATE master capability is required, "
          "but the StandaloneMasterDetector does not have the ability to read "
          "master capabilities.";
      }
    }

    // Wait for a random amount of time before authentication or
    // registration.
    //
    // TODO(mzhu): Specialize this for authentication.
    Duration duration =
      flags.registration_backoff_factor * ((double) os::random() / RAND_MAX);

    if (credential.isSome()) {
      // Authenticate with the master.
      // TODO(vinod): Consider adding an "AUTHENTICATED" state to the
      // slave instead of "authenticate" variable.
      Duration maxTimeout = flags.authentication_timeout_min +
                            flags.authentication_backoff_factor * 2;

      delay(
          duration,
          self(),
          &Slave::authenticate,
          flags.authentication_timeout_min,
          std::min(maxTimeout, flags.authentication_timeout_max));
    } else {
      // Proceed with registration without authentication.
      LOG(INFO) << "No credentials provided."
                << " Attempting to register without authentication";

      delay(duration,
            self(),
            &Slave::doReliableRegistration,
            flags.registration_backoff_factor * 2); // Backoff.
    }
  }

  // Keep detecting masters.
  LOG(INFO) << "Detecting new master";
  detection = detector->detect(latest)
    .onAny(defer(self(), &Slave::detected, lambda::_1));
}


// Starts an authentication attempt with the current master.
//
// The attempt is given a timeout picked at random between `minTimeout`
// and `maxTimeout`; once it completes (in any way) `_authenticate` is
// invoked with the same bounds. If an attempt is already in flight it is
// discarded and flagged for a retry instead of starting a second one.
void Slave::authenticate(Duration minTimeout, Duration maxTimeout)
{
  authenticated = false;

  if (master.isNone()) {
    return;
  }

  if (authenticating.isSome()) {
    // Authentication is in progress. Try to cancel it.
    // Note that it is possible that 'authenticating' is ready
    // and the dispatch to '_authenticate' is enqueued when we
    // are here, making the 'discard' here a no-op. This is ok
    // because we set 'reauthenticate' here which enforces a retry
    // in '_authenticate'.
    Future<bool> inFlight = authenticating.get();
    inFlight.discard();
    reauthenticate = true;
    return;
  }

  LOG(INFO) << "Authenticating with master " << master.get();

  // Ensure there is a link to the master before we start
  // communicating with it.
  link(master.get());

  CHECK(authenticatee == nullptr);

  if (authenticateeName == DEFAULT_AUTHENTICATEE) {
    LOG(INFO) << "Using default CRAM-MD5 authenticatee";
    authenticatee = new cram_md5::CRAMMD5Authenticatee();
  } else {
    // Any other name refers to an authenticatee provided by a module.
    Try<Authenticatee*> module =
      modules::ModuleManager::create<Authenticatee>(authenticateeName);

    if (module.isError()) {
      EXIT(EXIT_FAILURE)
        << "Could not create authenticatee module '"
        << authenticateeName << "': " << module.error();
    }

    LOG(INFO) << "Using '" << authenticateeName << "' authenticatee";
    authenticatee = module.get();
  }

  CHECK_SOME(credential);

  // We pick a random duration between `minTimeout` and `maxTimeout`.
  const double fraction = (double) os::random() / RAND_MAX;
  Duration timeout = minTimeout + (maxTimeout - minTimeout) * fraction;

  authenticating =
    authenticatee->authenticate(master.get(), self(), credential.get())
      .onAny(defer(self(), &Self::_authenticate, minTimeout, maxTimeout))
      .after(timeout, [](Future<bool> attempt) {
        // NOTE: Discarded future results in a retry in '_authenticate()'.
        // This is a no-op if the future is already ready.
        if (attempt.discard()) {
          LOG(WARNING) << "Authentication timed out";
        }

        return attempt;
      });
}


// Continuation of `authenticate`, invoked once the authentication
// attempt has completed in any way.
//
// Destroys the authenticatee, then either gives up (master lost),
// retries with a doubled timeout range capped at
// `flags.authentication_timeout_max` (attempt not ready, or a retry was
// requested), exits (master refused), or proceeds with registration.
void Slave::_authenticate(
    Duration currentMinTimeout, Duration currentMaxTimeout)
{
  delete CHECK_NOTNULL(authenticatee);
  authenticatee = nullptr;

  CHECK_SOME(authenticating);
  const Future<bool>& attempt = authenticating.get();

  if (master.isNone()) {
    LOG(INFO) << "Ignoring _authenticate because the master is lost";
    authenticating = None();
    // Set it to false because we do not want further retries until
    // a new master is detected.
    // We obviously do not need to reauthenticate either even if
    // 'reauthenticate' is currently true because the master is
    // lost.
    reauthenticate = false;
    return;
  }

  const bool retry = reauthenticate || !attempt.isReady();

  if (retry) {
    // Work out why this attempt did not succeed.
    string reason;
    if (reauthenticate) {
      reason = "master changed";
    } else if (attempt.isFailed()) {
      reason = attempt.failure();
    } else {
      reason = "future discarded";
    }

    LOG(WARNING)
      << "Failed to authenticate with master " << master.get() << ": "
      << reason;

    authenticating = None();
    reauthenticate = false;

    // Grow the timeout range using exponential backoff:
    //
    //   [min, min + factor * 2^0]
    //   [min, min + factor * 2^1]
    //   ...
    //   [min, min + factor * 2^N]
    //   ...
    //   [min, max] // Stop at max.
    Duration widenedMaxTimeout =
      currentMinTimeout + (currentMaxTimeout - currentMinTimeout) * 2;

    authenticate(
        currentMinTimeout,
        std::min(widenedMaxTimeout, flags.authentication_timeout_max));
    return;
  }

  const bool accepted = attempt.get();

  if (!accepted) {
    // For refused authentication, we exit instead of doing a shutdown
    // to keep possibly active executors running.
    EXIT(EXIT_FAILURE)
      << "Master " << master.get() << " refused authentication";
  }

  LOG(INFO) << "Successfully authenticated with master " << master.get();

  authenticated = true;
  authenticating = None();

  // Proceed with registration.
  doReliableRegistration(flags.registration_backoff_factor * 2);
}


// Handles the master's acknowledgement of a first-time registration.
//
// `from` must be the currently known master, otherwise the message is
// ignored. `slaveId` is the ID assigned to this agent and `connection`
// optionally carries the total ping timeout to use.
//
// When DISCONNECTED, the agent becomes RUNNING: it stores and
// checkpoints its ID, resumes both status update managers, initializes
// the resource provider manager, starts the local resource providers
// daemon and arms the ping timer. When already RUNNING, a mismatching ID
// is fatal. When TERMINATING, the message is ignored entirely. Any other
// state is fatal.
void Slave::registered(
    const UPID& from,
    const SlaveID& slaveId,
    const MasterSlaveConnection& connection)
{
  if (master != from) {
    LOG(WARNING) << "Ignoring registration message from " << from
                 << " because it is not the expected master: "
                 << (master.isSome() ? stringify(master.get()) : "None");
    return;
  }

  CHECK_SOME(master);

  if (connection.has_total_ping_timeout_seconds()) {
    masterPingTimeout =
      Seconds(static_cast<int64_t>(connection.total_ping_timeout_seconds()));
  } else {
    masterPingTimeout = DEFAULT_MASTER_PING_TIMEOUT();
  }

  switch (state) {
    case DISCONNECTED: {
      LOG(INFO) << "Registered with master " << master.get()
                << "; given agent ID " << slaveId;

      state = RUNNING;

      // Cancel the pending registration timer to avoid spurious attempts
      // at reregistration. `Clock::cancel` is idempotent, so this call
      // is safe even if no timer is active or pending.
      Clock::cancel(agentRegistrationTimer);

      taskStatusUpdateManager->resume(); // Resume status updates.

      info.mutable_id()->CopyFrom(slaveId); // Store the slave id.

      // Create the slave meta directory.
      paths::createSlaveDirectory(metaDir, slaveId);

      // Initialize and resume the operation status update manager.
      //
      // NOTE: There is no need to recover the operation status update manager,
      // because its streams are checkpointed within the slave meta directory
      // which was just created.
      operationStatusUpdateManager.initialize(
          defer(self(), &Self::sendOperationStatusUpdate, lambda::_1),
          std::bind(
              &slave::paths::getSlaveOperationUpdatesPath,
              metaDir,
              info.id(),
              lambda::_1));

      operationStatusUpdateManager.resume();

      // Checkpoint slave info.
      const string path = paths::getSlaveInfoPath(metaDir, slaveId);

      VLOG(1) << "Checkpointing SlaveInfo to '" << path << "'";

      CHECK_SOME(state::checkpoint(path, info));

      // If we registered with this agent ID for the first time initialize
      // the resource provider manager with it; if the manager was already
      // initialized with a recovered agent ID this is a no-op.
      initializeResourceProviderManager(flags, info.id());

      // We start the local resource providers daemon once the agent is
      // running, so the resource providers can use the agent API.
      localResourceProviderDaemon->start(info.id());

      // Setup a timer so that the agent attempts to reregister if it
      // doesn't receive a ping from the master for an extended period
      // of time. This needs to be done once registered, in case we
      // never receive an initial ping.
      Clock::cancel(pingTimer);

      pingTimer = delay(
          masterPingTimeout,
          self(),
          &Slave::pingTimeout,
          detection);

      break;
    }
    case RUNNING:
      // Already registered!
      if (info.id() != slaveId) {
       EXIT(EXIT_FAILURE)
         << "Registered but got wrong id: " << slaveId
         << " (expected: " << info.id() << "). Committing suicide";
      }
      LOG(WARNING) << "Already registered with master " << master.get();

      break;
    case TERMINATING:
      // The registration is ignored, so do not forward an agent update
      // below either. This matches the handling of TERMINATING in
      // `reregistered()`.
      LOG(WARNING) << "Ignoring registration because agent is terminating";
      return;
    case RECOVERING:
    default:
      LOG(FATAL) << "Unexpected agent state " << state;
      break;
  }

  // If this agent can support resource providers or has had any oversubscribed
  // resources set, send an `UpdateSlaveMessage` to the master to inform it of
  // any changes between completion of recovery and agent registration.
  if (capabilities.resourceProvider || oversubscribedResources.isSome()) {
    UpdateSlaveMessage message = generateUpdateSlaveMessage();

    LOG(INFO) << "Forwarding agent update " << JSON::protobuf(message);

    send(master.get(), message);
  }
}


void Slave::reregistered(
    const UPID& from,
    const SlaveID& slaveId,
    const vector<ReconcileTasksMessage>& reconciliations,
    const MasterSlaveConnection& connection)
{
  if (master != from) {
    LOG(WARNING) << "Ignoring re-registration message from " << from
                 << " because it is not the expected master: "
                 << (master.isSome() ? stringify(master.get()) : "None");
    return;
  }

  CHECK_SOME(master);

  if (info.id() != slaveId) {
    EXIT(EXIT_FAILURE)
      << "Re-registered but got wrong id: " << slaveId
      << " (expected: " << info.id() << "). Committing suicide";
  }

  if (connection.has_total_ping_timeout_seconds()) {
    masterPingTimeout =
      Seconds(static_cast<int64_t>(connection.total_ping_timeout_seconds()));
  } else {
    masterPingTimeout = DEFAULT_MASTER_PING_TIMEOUT();
  }

  switch (state) {
    case DISCONNECTED:
      LOG(INFO) << "Re-registered with master " << master.get();
      state = RUNNING;
      taskStatusUpdateManager->resume(); // Resume status updates.
      operationStatusUpdateManager.resume();

      // We start the local resource providers daemon once the agent is
      // running, so the resource providers can use the agent API.
      localResourceProviderDaemon->start(info.id());

      // Setup a timer so that the agent attempts to reregister if it
      // doesn't receive a ping from the master for an extended period
      // of time. This needs to be done once reregistered, in case we
      // never receive an initial ping.
      Clock::cancel(pingTimer);

      pingTimer = delay(
          masterPingTimeout,
          self(),
          &Slave::pingTimeout,
          detection);

      break;
    case RUNNING:
      LOG(WARNING) << "Already reregistered with master " << master.get();
      break;
    case TERMINATING:
      LOG(WARNING) << "Ignoring re-registration because agent is terminating";
      return;
    case RECOVERING:
      // It's possible to receive a message intended for the previous
      // run of the slave here. Short term we can leave this as is and
      // crash in this case. Ideally responses can be tied to a
      // particular run of the slave, see:
      // https://issues.apache.org/jira/browse/MESOS-676
      // https://issues.apache.org/jira/browse/MESOS-677
    default:
      LOG(FATAL) << "Unexpected agent state " << state;
      return;
  }

  // If this agent can support resource providers or has had any oversubscribed
  // resources set, send an `UpdateSlaveMessage` to the master to inform it of a
  // possible changes between completion of recovery and agent registration.
  if (capabilities.resourceProvider || oversubscribedResources.isSome()) {
    UpdateSlaveMessage message = generateUpdateSlaveMessage();

    LOG(INFO) << "Forwarding agent update " << JSON::protobuf(message);
    send(master.get(), message);
  }

  // Reconcile any tasks per the master's request.
  foreach (const ReconcileTasksMessage& reconcile, reconciliations) {
    Framework* framework = getFramework(reconcile.framework_id());

    foreach (const TaskStatus& status, reconcile.statuses()) {
      const TaskID& taskId = status.task_id();

      bool known = false;
      if (framework != nullptr) {
        known = framework->hasTask(taskId);
      }

      // Send a terminal status update for each task that is known to
      // the master but not known to the agent. This ensures that the
      // master will cleanup any state associated with the task, which
      // is not running. We send TASK_DROPPED to partition-aware
      // frameworks; frameworks that are not partition-aware are sent
      // TASK_LOST for backward compatibility.
      //
      // If the task is known to the agent, we don't need to send a
      // status update to the master: because the master already knows
      // about the task, any subsequent status updates will be
      // propagated correctly.
      if (!known) {
        // NOTE: The `framework` field of the `ReconcileTasksMessage`
        // is only set by masters running Mesos 1.1.0 or later. If the
        // field is unset, we assume the framework is not partition-aware.
        mesos::TaskState taskState = TASK_LOST;

        if (reconcile.has_framework() &&
            protobuf::frameworkHasCapability(
                reconcile.framework(),
                FrameworkInfo::Capability::PARTITION_AWARE)) {
          taskState = TASK_DROPPED;
        }

        LOG(WARNING) << "Agent reconciling task " << taskId
                     << " of framework " << reconcile.framework_id()
                     << " in state " << taskState
                     << ": task unknown to the agent";

        const StatusUpdate update = protobuf::createStatusUpdate(
            reconcile.framework_id(),
            info.id(),
            taskId,
            taskState,
            TaskStatus::SOURCE_SLAVE,
            id::UUID::random(),
            "Reconciliation: task unknown to the agent",
            TaskStatus::REASON_RECONCILIATION);

        // NOTE: We can't use statusUpdate() here because it drops
        // updates for unknown frameworks.
        taskStatusUpdateManager->update(update, info.id())
          .onAny(defer(self(),
                       &Slave::___statusUpdate,
                       lambda::_1,
                       update,
                       UPID()));
      }
    }
  }
}


// Attempts to (re-)register this agent with the current master and
// arranges for the attempt to be repeated after a randomized backoff.
//
// The attempt is skipped entirely (no message, no retry timer) when:
//   - no master is known,
//   - a credential is configured but the agent is not authenticated,
//   - the agent is already RUNNING, or
//   - the agent is TERMINATING.
//
// An agent without an ID sends a `RegisterSlaveMessage`; an agent that
// already has an ID sends a `ReregisterSlaveMessage` that additionally
// carries its frameworks, tasks, executors and completed frameworks.
//
// `maxBackoff` is the upper bound of the random retry delay. It is capped
// at `REGISTER_RETRY_INTERVAL_MAX` and doubled for the following attempt.
void Slave::doReliableRegistration(Duration maxBackoff)
{
  if (master.isNone()) {
    LOG(INFO) << "Skipping registration because no master present";
    return;
  }

  const bool awaitingAuthentication = credential.isSome() && !authenticated;
  if (awaitingAuthentication) {
    LOG(INFO) << "Skipping registration because not authenticated";
    return;
  }

  // A RUNNING agent has already (re-)registered with the master, so
  // the retry chain simply ends here.
  if (state == RUNNING) {
    return;
  }

  if (state == TERMINATING) {
    LOG(INFO) << "Skipping registration because agent is terminating";
    return;
  }

  CHECK(state == DISCONNECTED) << state;

  CHECK_NE("cleanup", flags.recover);

  // Link to the master before talking to it. Linking happens here,
  // i.e. after the initial registration backoff, so that the agents do
  // not all establish connections with the master at the same time.
  // See MESOS-5330.
  link(master.get());

  const bool firstRegistration = !info.has_id();

  if (firstRegistration) {
    RegisterSlaveMessage registration;
    registration.set_version(MESOS_VERSION);
    registration.mutable_slave()->CopyFrom(info);

    registration.mutable_agent_capabilities()->CopyFrom(
        capabilities.toRepeatedPtrField());

    // Checkpointed resources are part of the registration.
    registration.mutable_checkpointed_resources()->CopyFrom(
        checkpointedResources);

    registration.mutable_resource_version_uuid()->CopyFrom(resourceVersion);

    // The `Try` returned by `downgradeResources` is deliberately not
    // inspected: on `Error` the resources are still sent to the master in
    // a partially downgraded state. Consequently an agent with refined
    // reservations cannot work with masters that predate reservation
    // refinement support (introduced in 1.4.0).
    //
    // TODO(mpark): Do something smarter with the result once something
    //              like a master capability is introduced.
    downgradeResources(&registration);

    send(master.get(), registration);
  } else {
    // The agent already has an ID: re-register and report what it knows.
    ReregisterSlaveMessage reregistration;
    reregistration.set_version(MESOS_VERSION);
    reregistration.mutable_slave()->CopyFrom(info);

    reregistration.mutable_agent_capabilities()->CopyFrom(
        capabilities.toRepeatedPtrField());

    // Checkpointed resources are part of the re-registration.
    reregistration.mutable_checkpointed_resources()->CopyFrom(
        checkpointedResources);

    reregistration.mutable_resource_version_uuid()->CopyFrom(resourceVersion);

    foreachvalue (Framework* activeFramework, frameworks) {
      reregistration.add_frameworks()->CopyFrom(activeFramework->info);

      // Pending tasks are reported as TASK_STAGING.
      //
      // TODO(bmahler): We need to send the executors for these
      // pending tasks, and we need to send exited events if they
      // cannot be launched, see MESOS-1715, MESOS-1720, MESOS-1800.
      //
      // The alias keeps the comma of the template argument list out of
      // the `foreachvalue` macro invocation.
      using PendingTaskMap = hashmap<TaskID, TaskInfo>;
      foreachvalue (
          const PendingTaskMap& pendingOfExecutor,
          activeFramework->pendingTasks) {
        foreachvalue (const TaskInfo& pendingTask, pendingOfExecutor) {
          const Task stagingTask = protobuf::createTask(
              pendingTask, TASK_STAGING, activeFramework->id());

          reregistration.add_tasks()->CopyFrom(stagingTask);
        }
      }

      foreachvalue (Executor* executor, activeFramework->executors) {
        // Report launched, terminated and queued tasks, in that order.
        // Terminated executors only have terminated unacknowledged
        // tasks. Each reported task carries its latest state and
        // status update state (if any).
        foreachvalue (Task* launchedTask, executor->launchedTasks) {
          reregistration.add_tasks()->CopyFrom(*launchedTask);
        }

        foreachvalue (Task* terminatedTask, executor->terminatedTasks) {
          reregistration.add_tasks()->CopyFrom(*terminatedTask);
        }

        foreachvalue (const TaskInfo& queuedTask, executor->queuedTasks) {
          const Task stagingTask = protobuf::createTask(
              queuedTask, TASK_STAGING, activeFramework->id());

          reregistration.add_tasks()->CopyFrom(stagingTask);
        }

        // Command (or Docker) executors are generated by the slave and
        // are not stored by the master, so they are not reported.
        // Terminated executors are not reported either because they do
        // not consume any resources.
        const bool reportExecutor =
          !executor->isGeneratedForCommandTask() &&
          executor->state != Executor::TERMINATED;

        if (reportExecutor) {
          ExecutorInfo* reportedInfo = reregistration.add_executor_infos();
          reportedInfo->MergeFrom(executor->info);

          // Scheduler Driver will ensure the framework id is set in
          // ExecutorInfo, effectively making it a required field.
          CHECK(reportedInfo->has_framework_id());
        }
      }
    }

    // Report completed frameworks together with the tasks of their
    // completed executors.
    foreachvalue (const Owned<Framework>& completed, completedFrameworks) {
      VLOG(1) << "Reregistering completed framework " << completed->id();

      Archive::Framework* archived =
        reregistration.add_completed_frameworks();

      archived->mutable_framework_info()->CopyFrom(completed->info);

      if (completed->pid.isSome()) {
        archived->set_pid(completed->pid.get());
      }

      foreach (const Owned<Executor>& completedExecutor,
               completed->completedExecutors) {
        VLOG(2) << "Reregistering completed executor '"
                << completedExecutor->id << "' with "
                << completedExecutor->terminatedTasks.size()
                << " terminated tasks, "
                << completedExecutor->completedTasks.size()
                << " completed tasks";

        foreachvalue (const Task* terminatedTask,
                      completedExecutor->terminatedTasks) {
          VLOG(2) << "Reregistering terminated task "
                  << terminatedTask->task_id();
          archived->add_tasks()->CopyFrom(*terminatedTask);
        }

        foreach (const shared_ptr<Task>& completedTask,
                 completedExecutor->completedTasks) {
          VLOG(2) << "Reregistering completed task "
                  << completedTask->task_id();
          archived->add_tasks()->CopyFrom(*completedTask);
        }
      }
    }

    // As for first-time registration, the `Try` returned by
    // `downgradeResources` is not inspected: on `Error` the resources
    // are sent in a partially downgraded state, which means an agent
    // with refined reservations cannot work with masters that predate
    // reservation refinement support (introduced in 1.4.0).
    //
    // TODO(mpark): Do something smarter with the result once something
    // like a master capability is introduced.
    downgradeResources(&reregistration);

    CHECK_SOME(master);
    send(master.get(), reregistration);
  }

  // Never back off for longer than 'REGISTER_RETRY_INTERVAL_MAX'.
  maxBackoff = std::min(maxBackoff, REGISTER_RETRY_INTERVAL_MAX);

  // The next attempt happens after a random duration between 0 and
  // 'maxBackoff'.
  const double fraction = static_cast<double>(os::random()) / RAND_MAX;
  Duration retryDelay = maxBackoff * fraction;

  VLOG(1) << "Will retry registration in " << retryDelay << " if necessary";

  // Schedule the retry, doubling the backoff bound for next time.
  agentRegistrationTimer = process::delay(
      retryDelay,
      self(),
      &Slave::doReliableRegistration,
      maxBackoff * 2);
}


void Slave::handleRunTaskMessage(
    const UPID& from,
    RunTaskMessage&& runTaskMessage)
{
  runTask(
      from,
      runTaskMessage.framework(),
      runTaskMessage.framework_id(),
      runTaskMessage.pid(),
      runTaskMessage.task(),
      google::protobuf::convert(runTaskMessage.resource_version_uuids()),
      runTaskMessage.has_launch_executor() ?
          Option<bool>(runTaskMessage.launch_executor()) : None());
}


// TODO(vinod): Instead of crashing the slave on checkpoint errors,
// send TASK_LOST to the framework.
void Slave::runTask(
    const UPID& from,
    const FrameworkInfo& frameworkInfo,
    const FrameworkID& frameworkId,
    const UPID& pid,
    const TaskInfo& task,
    const vector<ResourceVersionUUID>& resourceVersionUuids,
    const Option<bool>& launchExecutor)
{
  CHECK_NE(task.has_executor(), task.has_command())
    << "Task " << task.task_id()
    << " should have either CommandInfo or ExecutorInfo set but not both";

  if (master != from) {
    LOG(WARNING) << "Ignoring run task message from " << from
                 << " because it is not the expected master: "
                 << (master.isSome() ? stringify(master.get()) : "None");
    return;
  }

  if (!frameworkInfo.has_id()) {
    LOG(ERROR) << "Ignoring run task message from " << from
               << " because it does not have a framework ID";
    return;
  }

  const ExecutorInfo executorInfo = getExecutorInfo(frameworkInfo, task);

  bool executorGeneratedForCommandTask = !task.has_executor();

  run(frameworkInfo,
      executorInfo,
      task,
      None(),
      resourceVersionUuids,
      pid,
      launchExecutor,
      executorGeneratedForCommandTask);
}


// Checks that every task which sets a "cpus" or "mem" resource limit
// can have that limit enforced given the agent's `--isolation` flag.
//
// Tasks whose container type is DOCKER are not checked. For all other
// tasks, a "cpus" limit requires 'cgroups/cpu' or 'cgroups/all' to
// appear in the isolation flag, and a "mem" limit requires
// 'cgroups/mem' or 'cgroups/all'.
//
// Returns the error for the first offending task, or `None()` if all
// tasks pass.
Option<Error> Slave::validateResourceLimitsAndIsolators(
    const vector<TaskInfo>& tasks)
{
  // The isolation flag does not depend on the task, so the isolator
  // lookups are done once up front.
  const bool allCgroupsLoaded =
    strings::contains(flags.isolation, "cgroups/all");

  const bool cpuLimitsSupported =
    allCgroupsLoaded || strings::contains(flags.isolation, "cgroups/cpu");

  const bool memLimitsSupported =
    allCgroupsLoaded || strings::contains(flags.isolation, "cgroups/mem");

  foreach (const TaskInfo& candidate, tasks) {
    const bool dockerContainer =
      candidate.has_container() &&
      candidate.container().type() == ContainerInfo::DOCKER;

    if (dockerContainer) {
      continue;
    }

    if (candidate.limits().count("cpus") && !cpuLimitsSupported) {
      return Error(
          "CPU limits can only be set on tasks launched in Mesos containers"
          " when the agent has loaded the 'cgroups/cpu' isolator");
    }

    if (candidate.limits().count("mem") && !memLimitsSupported) {
      return Error(
          "Memory limits can only be set on tasks launched in Mesos"
          " containers when the agent has loaded the 'cgroups/mem' isolator");
    }
  }

  return None();
}


// Common entry point for launching either a single task or a task group
// on this agent. Exactly one of `task` and `taskGroup` must be set.
//
// In order, this function:
//   1. Fills in any missing `Resource.AllocationInfo` and upgrades the
//      resources of the executor and of the task(s).
//   2. Drops the request if any task names a different agent ID, or if
//      the agent is RECOVERING or TERMINATING.
//   3. Creates and registers the `Framework` if it is not known yet.
//   4. Validates resource limits against the loaded isolators and sends
//      TASK_DROPPED / TASK_LOST updates if validation fails.
//   5. Applies the run task label decorator hooks, records the task(s)
//      as pending and unschedules gc of the framework and executor
//      directories that exist on disk.
//   6. Chains `_run()` behind the unschedule futures and, through a
//      per-executor sequence, invokes `__run()` on success or performs
//      failure handling otherwise.
//
// Parameters:
//   frameworkInfo  Framework the task(s) belong to; its ID is used as
//                  the framework ID throughout.
//   executorInfo   Executor for the task(s); taken by value because it is
//                  modified locally.
//   task           The single task to launch, if any.
//   taskGroup      The task group to launch, if any.
//   resourceVersionUuids
//                  Forwarded unchanged to `_run()` and `__run()`.
//   pid            Framework pid; a default-constructed `UPID` is treated
//                  as "no pid" when a new `Framework` is created.
//   launchExecutor Forwarded to `_run()` and `__run()`. If set to true and
//                  the launch chain fails, an `ExitedExecutorMessage` is
//                  sent for the executor.
//   executorGeneratedForCommandTask
//                  Forwarded unchanged to `__run()`.
void Slave::run(
    const FrameworkInfo& frameworkInfo,
    ExecutorInfo executorInfo,
    Option<TaskInfo> task,
    Option<TaskGroupInfo> taskGroup,
    const vector<ResourceVersionUUID>& resourceVersionUuids,
    const UPID& pid,
    const Option<bool>& launchExecutor,
    bool executorGeneratedForCommandTask)
{
  CHECK_NE(task.isSome(), taskGroup.isSome())
    << "Either task or task group should be set but not both";

  // Sets `Resource.AllocationInfo` on every resource that lacks it,
  // using the framework's single role. If the framework does not have
  // exactly one role there is no single role to fall back to, and the
  // agent aborts via LOG(FATAL).
  auto injectAllocationInfo = [](
      RepeatedPtrField<Resource>* resources,
      const FrameworkInfo& frameworkInfo) {
    set<string> roles = protobuf::framework::getRoles(frameworkInfo);

    foreach (Resource& resource, *resources) {
      if (!resource.has_allocation_info()) {
        if (roles.size() != 1) {
          LOG(FATAL) << "Missing 'Resource.AllocationInfo' for resources"
                     << " allocated to MULTI_ROLE framework"
                     << " '" << frameworkInfo.name() << "'";
        }

        resource.mutable_allocation_info()->set_role(*roles.begin());
      }
    }
  };

  // Normalize the resources of the executor and of the task(s),
  // including the executor embedded in each task, if present.
  injectAllocationInfo(executorInfo.mutable_resources(), frameworkInfo);
  upgradeResources(&executorInfo);

  if (task.isSome()) {
    injectAllocationInfo(task->mutable_resources(), frameworkInfo);

    if (task->has_executor()) {
      injectAllocationInfo(
          task->mutable_executor()->mutable_resources(),
          frameworkInfo);
    }

    upgradeResources(&task.get());
  }

  if (taskGroup.isSome()) {
    foreach (TaskInfo& task, *taskGroup->mutable_tasks()) {
      injectAllocationInfo(task.mutable_resources(), frameworkInfo);

      if (task.has_executor()) {
        injectAllocationInfo(
            task.mutable_executor()->mutable_resources(),
            frameworkInfo);
      }
    }

    upgradeResources(&taskGroup.get());
  }

  // Flatten the task / task group into one list so that the remaining
  // logic can treat both cases uniformly.
  vector<TaskInfo> tasks;
  if (task.isSome()) {
    tasks.push_back(task.get());
  } else {
    foreach (const TaskInfo& task, taskGroup->tasks()) {
      tasks.push_back(task);
    }
  }

  const FrameworkID& frameworkId = frameworkInfo.id();

  LOG(INFO) << "Got assigned " << taskOrTaskGroup(task, taskGroup)
            << " for framework " << frameworkId;

  // Drop the whole request as soon as one task names another agent ID.
  foreach (const TaskInfo& _task, tasks) {
    if (_task.slave_id() != info.id()) {
      LOG(WARNING)
        << "Agent " << info.id() << " ignoring running "
        << taskOrTaskGroup(_task, taskGroup) << " because "
        << "it was intended for old agent " << _task.slave_id();
      return;
    }
  }

  CHECK(state == RECOVERING || state == DISCONNECTED ||
        state == RUNNING || state == TERMINATING)
    << state;

  // TODO(bmahler): Also ignore if we're DISCONNECTED.
  if (state == RECOVERING || state == TERMINATING) {
    LOG(WARNING) << "Ignoring running " << taskOrTaskGroup(task, taskGroup)
                 << " because the agent is " << state;

    // We do not send `ExitedExecutorMessage` here because the disconnected
    // agent is expected to (eventually) reregister and reconcile the executor
    // states with the master.

    // TODO(vinod): Consider sending a TASK_LOST here.
    // Currently it is tricky because 'statusUpdate()'
    // ignores updates for unknown frameworks.
    return;
  }

  // Futures of all gc unschedule requests issued below; the launch
  // continues only once they have all completed.
  vector<Future<bool>> unschedules;

  // If we are about to create a new framework, unschedule the work
  // and meta directories from getting gc'ed.
  Framework* framework = getFramework(frameworkId);
  if (framework == nullptr) {
    // Unschedule framework work directory.
    string path = paths::getFrameworkPath(
        flags.work_dir, info.id(), frameworkId);

    if (os::exists(path)) {
      unschedules.push_back(gc->unschedule(path));
    }

    // Unschedule framework meta directory.
    path = paths::getFrameworkPath(metaDir, info.id(), frameworkId);
    if (os::exists(path)) {
      unschedules.push_back(gc->unschedule(path));
    }

    // A default-constructed `UPID` means that no framework pid was
    // supplied.
    Option<UPID> frameworkPid = None();

    if (pid != UPID()) {
      frameworkPid = pid;
    }

    framework = new Framework(
        this,
        flags,
        frameworkInfo,
        frameworkPid);

    frameworks[frameworkId] = framework;
    if (frameworkInfo.checkpoint()) {
      framework->checkpointFramework();
    }

    // Does this framework ID already exist in `completedFrameworks`?
    // If so, move the completed executors of the old framework to
    // this new framework and remove the old completed framework.
    if (completedFrameworks.contains(frameworkId)) {
      Owned<Framework>& completedFramework =
        completedFrameworks.at(frameworkId);

      framework->completedExecutors = completedFramework->completedExecutors;
      completedFrameworks.erase(frameworkId);
    }
  }

  CHECK_NOTNULL(framework);

  // The task(s) have not been added to the framework's pending tasks
  // yet, so a validation failure only needs to send status updates and
  // possibly remove the (idle) framework.
  Option<Error> error = validateResourceLimitsAndIsolators(tasks);
  if (error.isSome()) {
    // We report TASK_DROPPED to the framework because the task was
    // never launched. For non-partition-aware frameworks, we report
    // TASK_LOST for backward compatibility.
    mesos::TaskState taskState = TASK_DROPPED;
    if (!protobuf::frameworkHasCapability(
            frameworkInfo, FrameworkInfo::Capability::PARTITION_AWARE)) {
      taskState = TASK_LOST;
    }

    // NOTE(review): The update below uses `REASON_GC_ERROR` although
    // the error originates from limit / isolator validation rather than
    // from gc -- confirm that this reason is intended.
    foreach (const TaskInfo& _task, tasks) {
      const StatusUpdate update = protobuf::createStatusUpdate(
          frameworkId,
          info.id(),
          _task.task_id(),
          taskState,
          TaskStatus::SOURCE_SLAVE,
          id::UUID::random(),
          error->message,
          TaskStatus::REASON_GC_ERROR);

      statusUpdate(update, UPID());
    }

    if (framework->idle()) {
      removeFramework(framework);
    }

    return;
  }

  const ExecutorID& executorId = executorInfo.executor_id();

  if (HookManager::hooksAvailable()) {
    // Set task labels from run task label decorator.
    for (auto it = tasks.begin(); it != tasks.end(); ++it) {
      (*it).mutable_labels()->CopyFrom(
          HookManager::slaveRunTaskLabelDecorator(
              (*it), executorInfo, frameworkInfo, info));
    }

    // Update `task`/`taskGroup` to reflect the task label updates.
    if (task.isSome()) {
      CHECK_EQ(1u, tasks.size());
      task->mutable_labels()->CopyFrom(tasks[0].labels());
    } else {
      for (int i = 0; i < taskGroup->tasks().size(); ++i) {
        taskGroup->mutable_tasks(i)->mutable_labels()->
          CopyFrom(tasks[i].labels());
      }
    }
  }

  // Track the pending task / task group to ensure the framework is
  // not removed and the framework and top level executor directories
  // are not scheduled for deletion before '_run()' is called.
  //
  // TODO(bmahler): Can we instead track pending tasks within the
  // `Executor` struct by creating it earlier?
  if (task.isSome()) {
    framework->addPendingTask(executorId, task.get());
  } else {
    framework->addPendingTaskGroup(executorId, taskGroup.get());
  }

  // If we are about to create a new executor, unschedule the top
  // level work and meta directories from getting gc'ed.
  Executor* executor = framework->getExecutor(executorId);
  if (executor == nullptr) {
    // Unschedule executor work directory.
    string path = paths::getExecutorPath(
        flags.work_dir, info.id(), frameworkId, executorId);

    if (os::exists(path)) {
      unschedules.push_back(gc->unschedule(path));
    }

    // Unschedule executor meta directory.
    path = paths::getExecutorPath(metaDir, info.id(), frameworkId, executorId);

    if (os::exists(path)) {
      unschedules.push_back(gc->unschedule(path));
    }
  }

  // Failure handler for the collected unschedule futures. It removes
  // the task(s) from the pending tasks, sends TASK_DROPPED / TASK_LOST
  // updates and removes the framework if it became idle. It then returns
  // the failed future it was given, or a new failure if the framework no
  // longer exists.
  auto onUnscheduleGCFailure =
    [=](const Future<vector<bool>>& unschedules) -> Future<vector<bool>> {
      LOG(ERROR) << "Failed to unschedule directories scheduled for gc: "
                 << unschedules.failure();

      // Look the framework up again instead of using the pointer from
      // the enclosing scope: it may have been removed in the meantime.
      Framework* _framework = getFramework(frameworkId);
      if (_framework == nullptr) {
        const string error =
          "Cannot handle unschedule GC failure for " +
          taskOrTaskGroup(task, taskGroup) + " because the framework " +
          stringify(frameworkId) + " does not exist";

        LOG(WARNING) << error;

        return Failure(error);
      }

      // We report TASK_DROPPED to the framework because the task was
      // never launched. For non-partition-aware frameworks, we report
      // TASK_LOST for backward compatibility.
      mesos::TaskState taskState = TASK_DROPPED;
      if (!protobuf::frameworkHasCapability(
              frameworkInfo, FrameworkInfo::Capability::PARTITION_AWARE)) {
        taskState = TASK_LOST;
      }

      foreach (const TaskInfo& _task, tasks) {
        _framework->removePendingTask(_task.task_id());

        const StatusUpdate update = protobuf::createStatusUpdate(
            frameworkId,
            info.id(),
            _task.task_id(),
            taskState,
            TaskStatus::SOURCE_SLAVE,
            id::UUID::random(),
            "Could not launch the task because we failed to unschedule"
            " directories scheduled for gc",
            TaskStatus::REASON_GC_ERROR);

        // TODO(vinod): Ensure that the task status update manager
        // reliably delivers this update. Currently, we don't guarantee
        // this because removal of the framework causes the status
        // update manager to stop retrying for its un-acked updates.
        statusUpdate(update, UPID());
      }

      if (_framework->idle()) {
        removeFramework(_framework);
      }

      return unschedules;
  };

  // `taskLaunch` encapsulates each task's launch steps from this point
  // to the end of `_run` (the completion of task authorization).
  Future<Nothing> taskLaunch = collect(unschedules)
    // Handle the failure iff unschedule GC fails.
    .repair(defer(self(), onUnscheduleGCFailure))
    // If unschedule GC succeeds, trigger the next continuation.
    .then(defer(
        self(),
        &Self::_run,
        frameworkInfo,
        executorInfo,
        task,
        taskGroup,
        resourceVersionUuids,
        launchExecutor));

  // Use a sequence to ensure that task launch order is preserved.
  framework->taskLaunchSequences[executorId]
    .add<Nothing>([taskLaunch]() -> Future<Nothing> {
      // We use this sequence only to maintain the task launching order. If the
      // sequence is deleted, we do not want the resulting discard event to
      // propagate up the chain, which would prevent the previous `.then()` or
      // `.repair()` callbacks from being invoked. Thus, we use `undiscardable`
      // to protect each `taskLaunch`.
      return undiscardable(taskLaunch);
    })
    // We register `onAny` on the future returned by the sequence (referred to
    // as `seqFuture` below). The following scenarios could happen:
    //
    // (1) `seqFuture` becomes ready. This happens when all previous tasks'
    // `taskLaunch` futures are in non-pending state AND this task's own
    // `taskLaunch` future is in ready state. The `onReady` call registered
    // below will be triggered and continue the success path.
    //
    // (2) `seqFuture` becomes failed. This happens when all previous tasks'
    // `taskLaunch` futures are in non-pending state AND this task's own
    // `taskLaunch` future is in failed state (e.g. due to unschedule GC
    // failure or some other failure). The `onFailed` call registered below
    // will be triggered to handle the failure.
    //
    // (3) `seqFuture` becomes discarded. This happens when the sequence is
    // destructed (see declaration of `taskLaunchSequences` on its lifecycle)
    // while the `seqFuture` is still pending. In this case, we wait until
    // this task's own `taskLaunch` future becomes non-pending and trigger
    // callbacks accordingly.
    //
    // TODO(mzhu): In case (3), the destruction of the sequence means that the
    // agent will eventually discover that the executor is absent and drop
    // the task. While `__run` is capable of handling this case, it is more
    // optimal to handle the failure earlier here rather than waiting for
    // the `taskLaunch` transition and directing control to `__run`.
    .onAny(defer(self(), [=](const Future<Nothing>&) {
      // We only want to execute the following callbacks once the work performed
      // in the `taskLaunch` chain is complete. Thus, we add them onto the
      // `taskLaunch` chain rather than dispatching directly.
      taskLaunch
        .onReady(defer(
            self(),
            &Self::__run,
            frameworkInfo,
            executorInfo,
            task,
            taskGroup,
            resourceVersionUuids,
            launchExecutor,
            executorGeneratedForCommandTask))
        .onFailed(defer(self(), [=](const string& failure) {
          // The missing framework is only logged here; the executor
          // bookkeeping below still runs in that case.
          Framework* _framework = getFramework(frameworkId);
          if (_framework == nullptr) {
            LOG(WARNING) << "Ignoring running "
                         << taskOrTaskGroup(task, taskGroup)
                         << " because the framework " << stringify(frameworkId)
                         << " does not exist";
          }

          if (launchExecutor.isSome() && launchExecutor.get()) {
            // Master expects a new executor to be launched for this task(s).
            // To keep the master executor entries updated, the agent needs to
            // send `ExitedExecutorMessage` even though no executor launched.
            sendExitedExecutorMessage(frameworkId, executorInfo.executor_id());

            // See the declaration of `taskLaunchSequences` regarding its
            // lifecycle management.
            if (_framework != nullptr) {
              _framework->taskLaunchSequences.erase(executorInfo.executor_id());
            }
          }
        }));
    }));

  // TODO(mzhu): Consolidate error handling code in `__run` here.
}


// Second stage of a task / task group launch, chained by `run()` behind
// the gc unschedule futures. Exactly one of `task` and `taskGroup` must
// be set.
//
// Returns a failed future, without authorizing anything, if:
//   - the framework no longer exists,
//   - the framework is terminating (the task(s) are then removed from
//     the pending tasks and an idle framework is removed), or
//   - none of the task(s) is pending any more.
//
// Otherwise every task is authorized via `authorizeTask()`. The returned
// future becomes ready only if all authorizations complete and all of
// them grant access. If an authorization future fails, or a task is not
// authorized, TASK_ERROR updates are sent for all task(s) and the
// returned future fails.
//
// `resourceVersionUuids` and `launchExecutor` are not referenced in the
// body of this function.
Future<Nothing> Slave::_run(
    const FrameworkInfo& frameworkInfo,
    const ExecutorInfo& executorInfo,
    const Option<TaskInfo>& task,
    const Option<TaskGroupInfo>& taskGroup,
    const std::vector<ResourceVersionUUID>& resourceVersionUuids,
    const Option<bool>& launchExecutor)
{
  // TODO(anindya_sinha): Consider refactoring the initial steps common
  // to `_run()` and `__run()`.
  CHECK_NE(task.isSome(), taskGroup.isSome())
    << "Either task or task group should be set but not both";

  // Flatten the task / task group into one list.
  vector<TaskInfo> tasks;
  if (task.isSome()) {
    tasks.push_back(task.get());
  } else {
    foreach (const TaskInfo& _task, taskGroup->tasks()) {
      tasks.push_back(_task);
    }
  }

  const FrameworkID& frameworkId = frameworkInfo.id();
  Framework* framework = getFramework(frameworkId);
  if (framework == nullptr) {
    const string error =
      "Ignoring running " + taskOrTaskGroup(task, taskGroup) +
      " because the framework " + stringify(frameworkId) + " does not exist";

    LOG(WARNING) << error;

    return Failure(error);
  }

  // We don't send a status update here because a terminating
  // framework cannot send acknowledgements.
  if (framework->state == Framework::TERMINATING) {
    const string error = "Ignoring running " +
                         taskOrTaskGroup(task, taskGroup) + " of framework " +
                         stringify(frameworkId) +
                         " because the framework is terminating";

    LOG(WARNING) << error;

    // Although we cannot send a status update in this case, we remove
    // the affected tasks from the pending tasks.
    foreach (const TaskInfo& _task, tasks) {
      framework->removePendingTask(_task.task_id());
    }

    if (framework->idle()) {
      removeFramework(framework);
    }

    return Failure(error);
  }

  // Ignore the launch if killed in the interim. The invariant here
  // is that all tasks in the group are still pending, or all were
  // removed due to a kill arriving for one of the tasks in the group.
  bool allPending = true;
  bool allRemoved = true;
  foreach (const TaskInfo& _task, tasks) {
    if (framework->isPending(_task.task_id())) {
      allRemoved = false;
    } else {
      allPending = false;
    }
  }

  // Exactly one of the two flags must remain true; a mix of pending and
  // removed tasks violates the invariant above.
  CHECK(allPending != allRemoved)
    << "BUG: The " << taskOrTaskGroup(task, taskGroup)
    << " was partially killed";

  if (allRemoved) {
    const string error = "Ignoring running " +
                         taskOrTaskGroup(task, taskGroup) + " of framework " +
                         stringify(frameworkId) +
                         " because it has been killed in the meantime";

    LOG(WARNING) << error;

    return Failure(error);
  }

  // Authorize the task or tasks (as in a task group) to ensure that the
  // task user is allowed to launch tasks on the agent. If authorization
  // fails, the task (or all tasks in a task group) are not launched.
  vector<Future<bool>> authorizations;

  LOG(INFO) << "Authorizing " << taskOrTaskGroup(task, taskGroup)
            << " for framework " << frameworkId;

  foreach (const TaskInfo& _task, tasks) {
    authorizations.push_back(authorizeTask(_task, frameworkInfo));
  }

  // Shared failure handling for both a failed authorization future and
  // a denied authorization: removes all task(s) from the pending tasks,
  // sends a TASK_ERROR update for each of them and removes the framework
  // if it became idle. `_framework` must not be null.
  auto onTaskAuthorizationFailure =
    [=](const string& error, Framework* _framework) {
      CHECK_NOTNULL(_framework);

      // For failed authorization, we send a TASK_ERROR status update
      // for all tasks.
      const TaskStatus::Reason reason = task.isSome()
        ? TaskStatus::REASON_TASK_UNAUTHORIZED
        : TaskStatus::REASON_TASK_GROUP_UNAUTHORIZED;

      LOG(ERROR) << "Authorization failed for "
                 << taskOrTaskGroup(task, taskGroup) << " of framework "
                 << frameworkId << ": " << error;

      foreach (const TaskInfo& _task, tasks) {
        _framework->removePendingTask(_task.task_id());

        const StatusUpdate update = protobuf::createStatusUpdate(
            frameworkId,
            info.id(),
            _task.task_id(),
            TASK_ERROR,
            TaskStatus::SOURCE_SLAVE,
            id::UUID::random(),
            error,
            reason);

        statusUpdate(update, UPID());
      }

      if (_framework->idle()) {
        removeFramework(_framework);
      }
  };

  return collect(authorizations)
    // Handles a failure of the collected authorization futures. The
    // framework is looked up again because it may have been removed
    // while the authorizations were in flight.
    .repair(defer(self(),
      [=](const Future<vector<bool>>& future) -> Future<vector<bool>> {
        Framework* _framework = getFramework(frameworkId);
        if (_framework == nullptr) {
          const string error =
            "Authorization failed for " + taskOrTaskGroup(task, taskGroup) +
            " because the framework " + stringify(frameworkId) +
            " does not exist";

            LOG(WARNING) << error;

          return Failure(error);
        }

        const string error =
          "Failed to authorize " + taskOrTaskGroup(task, taskGroup) +
          ": " + future.failure();

        onTaskAuthorizationFailure(error, _framework);

        // Hand the failed future back so that the returned chain fails
        // as well.
        return future;
      }
    ))
    // Evaluates the individual authorization results once all of them
    // are available.
    .then(defer(self(),
      [=](const Future<vector<bool>>& future) -> Future<Nothing> {
        Framework* _framework = getFramework(frameworkId);
        if (_framework == nullptr) {
          const string error =
            "Ignoring running " + taskOrTaskGroup(task, taskGroup) +
            " because the framework " + stringify(frameworkId) +
            " does not exist";

            LOG(WARNING) << error;

          return Failure(error);
        }

        // The results are consumed front to back, one per task, in the
        // same order in which the authorizations were requested above.
        deque<bool> authorizations(future->begin(), future->end());

        foreach (const TaskInfo& _task, tasks) {
          bool authorized = authorizations.front();
          authorizations.pop_front();

          // If authorization for this task fails, we fail all tasks (in case
          // of a task group) with this specific error.
          if (!authorized) {
            const string error =
              "Framework " + stringify(frameworkId) +
              " is not authorized to launch task " + stringify(_task);

            onTaskAuthorizationFailure(error, _framework);

            return Failure(error);
          }
        }

        return Nothing();
      }
    ));
}


void Slave::__run(
    const FrameworkInfo& frameworkInfo,
    const ExecutorInfo& executorInfo,
    const Option<TaskInfo>& task,
    const Option<TaskGroupInfo>& taskGroup,
    const vector<ResourceVersionUUID>& resourceVersionUuids,
    const Option<bool>& launchExecutor,
    bool executorGeneratedForCommandTask)
{
  CHECK_NE(task.isSome(), taskGroup.isSome())
    << "Either task or task group should be set but not both";

  vector<TaskInfo> tasks;
  if (task.isSome()) {
    tasks.push_back(task.get());
  } else {
    foreach (const TaskInfo& _task, taskGroup->tasks()) {
      tasks.push_back(_task);
    }
  }

  const FrameworkID& frameworkId = frameworkInfo.id();
  Framework* framework = getFramework(frameworkId);
  if (framework == nullptr) {
    LOG(WARNING) << "Ignoring running " << taskOrTaskGroup(task, taskGroup)
                 << " because the framework " << frameworkId
                 << " does not exist";

    if (launchExecutor.isSome() && launchExecutor.get()) {
      // Master expects a new executor to be launched for this task(s).
      // To keep the master executor entries updated, the agent needs to send
      // `ExitedExecutorMessage` even though no executor launched.
      sendExitedExecutorMessage(frameworkId, executorInfo.executor_id());

      // There is no need to clean up the task launch sequence here since
      // the framework (along with the sequence) no longer exists.
    }

    return;
  }

  const ExecutorID& executorId = executorInfo.executor_id();

  // We report TASK_DROPPED to the framework because the task was
  // never launched. For non-partition-aware frameworks, we report
  // TASK_LOST for backward compatibility.
  auto sendTaskDroppedUpdate =
    [&](TaskStatus::Reason reason, const string& message) {
      mesos::TaskState taskState = TASK_DROPPED;

      if (!protobuf::frameworkHasCapability(
              frameworkInfo, FrameworkInfo::Capability::PARTITION_AWARE)) {
        taskState = TASK_LOST;
      }

      foreach (const TaskInfo& _task, tasks) {
        const StatusUpdate update = protobuf::createStatusUpdate(
            frameworkId,
            info.id(),
            _task.task_id(),
            taskState,
            TaskStatus::SOURCE_SLAVE,
            id::UUID::random(),
            message,
            reason,
            executorId);

        statusUpdate(update, UPID());
      }
    };

  // We don't send a status update here because a terminating
  // framework cannot send acknowledgements.
  if (framework->state == Framework::TERMINATING) {
    LOG(WARNING) << "Ignoring running " << taskOrTaskGroup(task, taskGroup)
                 << " of framework " << frameworkId
                 << " because the framework is terminating";

    // Although we cannot send a status update in this case, we remove
    // the affected tasks from the list of pending tasks.
    foreach (const TaskInfo& _task, tasks) {
      framework->removePendingTask(_task.task_id());
    }

    if (framework->idle()) {
      removeFramework(framework);
    }

    if (launchExecutor.isSome() && launchExecutor.get()) {
      // Master expects a new executor to be launched for this task(s).
      // To keep the master executor entries updated, the agent needs to send
      // `ExitedExecutorMessage` even though no executor launched.
      sendExitedExecutorMessage(frameworkId, executorInfo.executor_id());

      // See the declaration of `taskLaunchSequences` regarding its lifecycle
      // management.
      framework->taskLaunchSequences.erase(executorInfo.executor_id());
    }

    return;
  }

  // Ignore the launch if killed in the interim. The invariant here
  // is that all tasks in the group are still pending, or all were
  // removed due to a kill arriving for one of the tasks in the group.
  bool allPending = true;
  bool allRemoved = true;
  foreach (const TaskInfo& _task, tasks) {
    if (framework->isPending(_task.task_id())) {
      allRemoved = false;
    } else {
      allPending = false;
    }
  }

  CHECK(allPending != allRemoved)
    << "BUG: The " << taskOrTaskGroup(task, taskGroup)
    << " was partially killed";

  if (allRemoved) {
    LOG(WARNING) << "Ignoring running " << taskOrTaskGroup(task, taskGroup)
                 << " of framework " << frameworkId
                 << " because it has been killed in the meantime";

    if (launchExecutor.isSome() && launchExecutor.get()) {
      // Master expects a new executor to be launched for this task(s).
      // To keep the master executor entries updated, the agent needs to send
      // `ExitedExecutorMessage` even though no executor launched.
      sendExitedExecutorMessage(frameworkId, executorInfo.executor_id());

      // See the declaration of `taskLaunchSequences` regarding its lifecycle
      // management.
      framework->taskLaunchSequences.erase(executorInfo.executor_id());
    }

    return;
  }

  foreach (const TaskInfo& _task, tasks) {
    CHECK(framework->removePendingTask(_task.task_id()));
  }

  // Check task launch invariants.
  //
  // TODO(bbannier): Instead of copy-pasting identical code to deal
  // with cases where tasks need to be terminated, consolidate code
  // below to decouple checking from terminating.
  Option<string> kill = None();

  // Fail the launch if the agent is draining.
  if (drainConfig.isSome()) {
    LOG(WARNING) << "Ignoring running " << taskOrTaskGroup(task, taskGroup)
                 << " of framework " << frameworkId
                 << " because the agent is draining";

    kill = "Task was received while agent was already draining";
  }

  if (kill.isSome()) {
    sendTaskDroppedUpdate(TaskStatus::REASON_SLAVE_DRAINING, *kill);

    // Refer to the comment after 'framework->removePendingTask' above
    // for why we need this.
    if (framework->idle()) {
      removeFramework(framework);
    }

    if (launchExecutor.isSome() && launchExecutor.get()) {
      // Master expects a new executor to be launched for this task(s).
      // To keep the master executor entries updated, the agent needs to send
      // `ExitedExecutorMessage` even though no executor launched.
      sendExitedExecutorMessage(frameworkId, executorInfo.executor_id());

      // See the declaration of `taskLaunchSequences` regarding its lifecycle
      // management.
      framework->taskLaunchSequences.erase(executorInfo.executor_id());
    }

    return;
  }

  CHECK_NONE(kill);

  // If the master sent resource versions, perform a best-effort check
  // that they are consistent with the resources the task uses.
  //
  // TODO(bbannier): Also check executor resources.
  if (!resourceVersionUuids.empty()) {
    hashset<Option<ResourceProviderID>> usedResourceProviderIds;
    foreach (const TaskInfo& _task, tasks) {
      foreach (const Resource& resource, _task.resources()) {
        usedResourceProviderIds.insert(resource.has_provider_id()
           ? Option<ResourceProviderID>(resource.provider_id())
           : None());
      }
    }

    const hashmap<Option<ResourceProviderID>, UUID>
      receivedResourceVersions = protobuf::parseResourceVersions(
          {resourceVersionUuids.begin(), resourceVersionUuids.end()});

    foreach (const Option<ResourceProviderID>& resourceProviderId,
             usedResourceProviderIds) {
      if (resourceProviderId.isNone()) {
        CHECK(receivedResourceVersions.contains(None()));

        if (resourceVersion != receivedResourceVersions.at(None())) {
          kill = "Task assumes outdated resource state";
        }
      } else {
        ResourceProvider* resourceProvider =
          getResourceProvider(resourceProviderId.get());

        if (resourceProvider == nullptr ||
            resourceProvider->resourceVersion !=
              receivedResourceVersions.at(resourceProviderId.get())) {
          kill = "Task assumes outdated resource state";
        }
      }
    }
  }

  if (kill.isSome()) {
    sendTaskDroppedUpdate(TaskStatus::REASON_INVALID_OFFERS, *kill);

    // Refer to the comment after 'framework->removePendingTask' above
    // for why we need this.
    if (framework->idle()) {
      removeFramework(framework);
    }

    if (launchExecutor.isSome() && launchExecutor.get()) {
      // Master expects a new executor to be launched for this task(s).
      // To keep the master executor entries updated, the agent needs to send
      // `ExitedExecutorMessage` even though no executor launched.
      sendExitedExecutorMessage(frameworkId, executorInfo.executor_id());

      // See the declaration of `taskLaunchSequences` regarding its lifecycle
      // management.
      framework->taskLaunchSequences.erase(executorInfo.executor_id());
    }

    return;
  }

  auto unallocated = [](const Resources& resources) {
    Resources result = resources;
    result.unallocate();
    return result;
  };

  CHECK_NONE(kill);

  // NOTE: If the task/task group or executor uses resources that are
  // checkpointed on the slave (e.g. persistent volumes), we should
  // already know about it. If the slave doesn't know about them (e.g.
  // CheckpointResourcesMessage was dropped or came out of order), we
  // send TASK_DROPPED status updates here since restarting the task
  // may succeed in the event that CheckpointResourcesMessage arrives
  // out of order.
  foreach (const TaskInfo& _task, tasks) {
    // We must unallocate the resources to check whether they are
    // contained in the unallocated total checkpointed resources.
    Resources checkpointedTaskResources =
      unallocated(_task.resources()).filter(needCheckpointing);

    foreach (const Resource& resource, checkpointedTaskResources) {
      if (!checkpointedResources.contains(resource)) {
        LOG(WARNING) << "Unknown checkpointed resource " << resource
                     << " for task " << _task
                     << " of framework " << frameworkId;

        kill =
          "The checkpointed resources being used by the task or task group are "
          "unknown to the agent";
        break;
      }
    }
  }

  if (kill.isSome()) {
    sendTaskDroppedUpdate(TaskStatus::REASON_RESOURCES_UNKNOWN, *kill);

    // Refer to the comment after 'framework->removePendingTask' above
    // for why we need this.
    if (framework->idle()) {
      removeFramework(framework);
    }

    if (launchExecutor.isSome() && launchExecutor.get()) {
      // Master expects a new executor to be launched for this task(s).
      // To keep the master executor entries updated, the agent needs to send
      // `ExitedExecutorMessage` even though no executor launched.
      sendExitedExecutorMessage(frameworkId, executorInfo.executor_id());

      // See the declaration of `taskLaunchSequences` regarding its lifecycle
      // management.
      framework->taskLaunchSequences.erase(executorInfo.executor_id());
    }

    return;
  }

  CHECK_NONE(kill);

  // Refer to the comment above when looping across tasks on
  // why we need to unallocate resources.
  Resources checkpointedExecutorResources =
    unallocated(executorInfo.resources()).filter(needCheckpointing);

  foreach (const Resource& resource, checkpointedExecutorResources) {
    if (!checkpointedResources.contains(resource)) {
      LOG(WARNING) << "Unknown checkpointed resource " << resource
                   << " for executor '" << executorId
                   << "' of framework " << frameworkId;

      kill =
        "The checkpointed resources being used by the executor are unknown "
        "to the agent";
      break;
    }
  }

  if (kill.isSome()) {
    sendTaskDroppedUpdate(TaskStatus::REASON_RESOURCES_UNKNOWN, *kill);

    // Refer to the comment after 'framework->removePendingTask' above
    // for why we need this.
    if (framework->idle()) {
      removeFramework(framework);
    }

    if (launchExecutor.isSome() && launchExecutor.get()) {
      // Master expects a new executor to be launched for this task(s).
      // To keep the master executor entries updated, the agent needs to send
      // `ExitedExecutorMessage` even though no executor launched.
      sendExitedExecutorMessage(frameworkId, executorInfo.executor_id());

      // See the declaration of `taskLaunchSequences` regarding its lifecycle
      // management.
      framework->taskLaunchSequences.erase(executorInfo.executor_id());
    }

    return;
  }

  // NOTE: The slave cannot be in 'RECOVERING' because the task would
  // have been rejected in 'run()' in that case.
  CHECK(state == DISCONNECTED || state == RUNNING || state == TERMINATING)
    << state;

  if (state == TERMINATING) {
    LOG(WARNING) << "Ignoring running " << taskOrTaskGroup(task, taskGroup)
                 << " of framework " << frameworkId
                 << " because the agent is terminating";

    // Refer to the comment after 'framework->removePendingTask' above
    // for why we need this.
    if (framework->idle()) {
      removeFramework(framework);
    }

    // We don't send TASK_LOST or ExitedExecutorMessage here because the slave
    // is terminating.
    return;
  }

  CHECK(framework->state == Framework::RUNNING) << framework->state;

  LOG(INFO) << "Launching " << taskOrTaskGroup(task, taskGroup)
            << " for framework " << frameworkId;

  Executor* executor = framework->getExecutor(executorId);

  // If launchExecutor is NONE, this is the legacy case where the master
  // did not set the `launch_executor` flag. Executor will be launched if
  // there is none.

  if (launchExecutor.isSome()) {
    if (taskGroup.isNone() && task->has_command()) {
      // We are dealing with command task; a new command executor will be
      // launched.
      CHECK(executor == nullptr);
    } else {
      // Master set the `launch_executor` flag and this is not a command task.
      if (launchExecutor.get() && executor != nullptr) {
        // Master requests launching executor but an executor still exits
        // on the agent. In this case we will drop tasks. This could happen if
        // the executor is already terminated on the agent (and agent has sent
        // out the `ExitedExecutorMessage` and it was received by the master)
        // but the agent is still waiting for all the status updates to be
        // acked before removing the executor struct.

        sendTaskDroppedUpdate(
            TaskStatus::REASON_EXECUTOR_TERMINATED,
            "Master wants to launch executor, but one already exists");

        // Master expects a new executor to be launched for this task(s).
        // To keep the master executor entries updated, the agent needs to
        // send `ExitedExecutorMessage` even though no executor launched.
        if (executor->state == Executor::TERMINATED) {
          sendExitedExecutorMessage(frameworkId, executorInfo.executor_id());
        } else {
          // This could happen if the following sequence of events happen:
          //
          //  (1) Master sends `runTaskMessage` to agent with
          //      `launch_executor = true`;
          //
          //  (2) Before the agent got the `runTaskMessage`, it reconnects and
          //      reconciles with the master. Master then removes the executor
          //      entry it asked the agent to launch in step (1);
          //
          //  (3) Agent got the `runTaskMessage` sent in step (1), launches
          //      the task and the executor (that the master does not know
          //      about).
          //
          //  (4) Master now sends another `runTaskMessage` for the same
          //      executor id with `launch_executor = true`.
          //
          // The agent ends up with a lingering executor that the master does
          // not know about. We will shutdown the executor.
          //
          // TODO(mzhu): This could be avoided if the agent can
          // tell whether the master's message was sent before or after the
          // reconnection and discard the message in the former case.
          //
          // TODO(mzhu): Master needs to do proper executor reconciliation
          // with the agent to avoid this from happening.
          _shutdownExecutor(framework, executor);
        }

        return;
      }

      if (!launchExecutor.get() && executor == nullptr) {
        // Master wants no new executor launched and there is none running on
        // the agent. This could happen if the task expects some previous
        // tasks to launch the executor. However, the earlier task got killed
        // or dropped hence did not launch the executor but the master doesn't
        // know about it yet because the `ExitedExecutorMessage` is still in
        // flight. In this case, we will drop the task.

        sendTaskDroppedUpdate(
            TaskStatus::REASON_EXECUTOR_TERMINATED,
            "No executor is expected to launch and there is none running");

        // We do not send `ExitedExecutorMessage` here because the expectation
        // is that there is already one on the fly to master. If the message
        // gets dropped, we will hopefully reconcile with the master later.

        return;
      }
    }
  }

  // Either the master explicitly requests launching a new executor
  // or we are in the legacy case of launching one if there wasn't
  // one already. Either way, let's launch executor now.
  if (executor == nullptr) {
    Try<Executor*> added =
      framework->addExecutor(executorInfo, executorGeneratedForCommandTask);

    if (added.isError()) {
      CHECK(framework->getExecutor(executorId) == nullptr);

      sendTaskDroppedUpdate(
          TaskStatus::REASON_EXECUTOR_TERMINATED,
          added.error());

      // Refer to the comment after 'framework->removePendingTask' above
      // for why we need this.
      if (framework->idle()) {
        removeFramework(framework);
      }

      if (launchExecutor.isSome() && launchExecutor.get()) {
        // Master expects a new executor to be launched for this task(s).
        // To keep the master executor entries updated, the agent needs to send
        // `ExitedExecutorMessage` even though no executor launched.
        sendExitedExecutorMessage(frameworkId, executorInfo.executor_id());

        // See the declaration of `taskLaunchSequences` regarding its lifecycle
        // management.
        framework->taskLaunchSequences.erase(executorInfo.executor_id());
      }

      return;
    }

    executor = added.get();

    // NOTE: We make a copy of the executor info because we may mutate it with
    // some default fields and resources.
    ExecutorInfo executorInfo_ = executorInfo;

    // Populate the command info for default executor. We modify the executor
    // info to avoid resetting command info upon reregistering with the master
    // since the master doesn't store them; they are generated by the slave.
    if (executorInfo_.has_type() &&
        executorInfo_.type() == ExecutorInfo::DEFAULT) {
      CHECK(!executorInfo_.has_command());

      *executorInfo_.mutable_command() =
        defaultExecutorCommandInfo(flags.launcher_dir, executor->user);
    }

    // We modify the ExecutorInfo to include the task/task group's resources
    // when launching the executor so that the containerizer has non-zero
    // resources to work with when the executor has no resources. And this is
    // also helpful for the executor to have enough resources to start because
    // usually the resources that framework gives to executor are pretty small
    // (e.g., Marathon/mesos-execute give 0.1 CPUs to the default executor) so
    // the executor may be throttled by CFS, see MESOS-9925 for details.
    Resources tasksResources;
    foreach (const TaskInfo& _task, tasks) {
      tasksResources += _task.resources();
    }

    *executorInfo_.mutable_resources() =
      Resources(executorInfo.resources()) + tasksResources;

    // Add the default container info to the executor info.
    // TODO(jieyu): Rename the flag to be default_mesos_container_info.
    if (!executorInfo_.has_container() &&
        flags.default_container_info.isSome()) {
      *executorInfo_.mutable_container() = flags.default_container_info.get();
    }

    publishResources(executor->containerId, executorInfo_.resources())
      .then(defer(
          self(),
          &Self::generateSecret,
          frameworkId,
          executorId,
          executor->containerId))
      .onAny(defer(
          self(),
          &Self::launchExecutor,
          lambda::_1,
          frameworkId,
          executorInfo_,
          computeExecutorLimits(executorInfo.resources(), tasks),
          taskGroup.isNone() ? task.get() : Option<TaskInfo>::none()));
  }

  CHECK_NOTNULL(executor);

  switch (executor->state) {
    case Executor::TERMINATING:
    case Executor::TERMINATED: {
      string executorState;

      if (executor->state == Executor::TERMINATING) {
        executorState = "terminating";
      } else {
        executorState = "terminated";
      }

      LOG(WARNING) << "Asked to run " << taskOrTaskGroup(task, taskGroup)
                   << "' for framework " << frameworkId
                   << " with executor '" << executorId
                   << "' which is " << executorState;

      // We report TASK_DROPPED to the framework because the task was
      // never launched. For non-partition-aware frameworks, we report
      // TASK_LOST for backward compatibility.
      mesos::TaskState taskState = TASK_DROPPED;
      if (!protobuf::frameworkHasCapability(
              frameworkInfo, FrameworkInfo::Capability::PARTITION_AWARE)) {
        taskState = TASK_LOST;
      }

      foreach (const TaskInfo& _task, tasks) {
        const StatusUpdate update = protobuf::createStatusUpdate(
            frameworkId,
            info.id(),
            _task.task_id(),
            taskState,
            TaskStatus::SOURCE_SLAVE,
            id::UUID::random(),
            "Executor " + executorState,
            TaskStatus::REASON_EXECUTOR_TERMINATED);

        statusUpdate(update, UPID());
      }

      break;
    }
    case Executor::REGISTERING:
      if (executor->checkpoint) {
        foreach (const TaskInfo& _task, tasks) {
          executor->checkpointTask(_task);
        }
      }

      if (taskGroup.isSome()) {
        executor->enqueueTaskGroup(taskGroup.get());
      } else {
        foreach (const TaskInfo& _task, tasks) {
          executor->enqueueTask(_task);
        }
      }

      LOG(INFO) << "Queued " << taskOrTaskGroup(task, taskGroup)
                << " for executor " << *executor;

      break;
    case Executor::RUNNING: {
      if (executor->checkpoint) {
        foreach (const TaskInfo& _task, tasks) {
          executor->checkpointTask(_task);
        }
      }

      // Queue tasks until the containerizer is updated
      // with new resource limits (MESOS-998).
      if (taskGroup.isSome()) {
        executor->enqueueTaskGroup(taskGroup.get());
      } else {
        foreach (const TaskInfo& _task, tasks) {
          executor->enqueueTask(_task);
        }
      }

      LOG(INFO) << "Queued " << taskOrTaskGroup(task, taskGroup)
                << " for executor " << *executor;

      const ContainerID& containerId = executor->containerId;
      const Resources& resourceRequests = executor->allocatedResources();
      const google::protobuf::Map<string, Value::Scalar>& resourceLimits =
        computeExecutorLimits(
            executor->info.resources(),
            executor->queuedTasks.values(),
            executor->launchedTasks.values());

      publishResources(containerId, resourceRequests)
        .then(defer(
            self(),
            [this, containerId, resourceRequests, resourceLimits] {
              // NOTE: The executor struct could have been removed before
              // containerizer update, so we use the captured container ID,
              // resource requests and limits here. If this happens, the
              // containerizer would simply skip updating a destroyed container.
              return containerizer->update(
                  containerId, resourceRequests, resourceLimits);
        }))
        .onAny(defer(self(),
                     &Self::___run,
                     lambda::_1,
                     frameworkId,
                     executorId,
                     executor->containerId,
                     task.isSome()
                       ? vector<TaskInfo>({task.get()})
                       : vector<TaskInfo>(),
                     taskGroup.isSome()
                       ? vector<TaskGroupInfo>({taskGroup.get()})
                       : vector<TaskGroupInfo>()));

      break;
    }
    default:
      LOG(FATAL) << "Executor " << *executor << " is in unexpected state "
                 << executor->state;
      break;
  }

  // We don't perform the checks for 'removeFramework' here since
  // we're guaranteed by 'addExecutor' that 'framework->executors'
  // will be non-empty.
  CHECK(!framework->executors.empty());
}


void Slave::___run(
    const Future<Nothing>& future,
    const FrameworkID& frameworkId,
    const ExecutorID& executorId,
    const ContainerID& containerId,
    const vector<TaskInfo>& tasks,
    const vector<TaskGroupInfo>& taskGroups)
{
  if (!future.isReady()) {
    LOG(ERROR) << "Failed to update resources for container " << containerId
               << " of executor '" << executorId
               << "' of framework " << frameworkId
               << ", destroying container: "
               << (future.isFailed() ? future.failure() : "discarded");

    containerizer->destroy(containerId);

    Executor* executor = getExecutor(frameworkId, executorId);
    if (executor != nullptr) {
      Framework* framework = getFramework(frameworkId);
      CHECK_NOTNULL(framework);

      // Send TASK_GONE because the task was started but has now
      // been terminated. If the framework is not partition-aware,
      // we send TASK_LOST instead for backward compatibility.
      mesos::TaskState taskState = TASK_GONE;
      if (!framework->capabilities.partitionAware) {
        taskState = TASK_LOST;
      }

      ContainerTermination termination;
      termination.set_state(taskState);
      termination.set_reason(TaskStatus::REASON_CONTAINER_UPDATE_FAILED);
      termination.set_message(
          "Failed to update resources for container: " +
          (future.isFailed() ? future.failure() : "discarded"));

      executor->pendingTermination = termination;

      // TODO(jieyu): Set executor->state to be TERMINATING.
    }

    return;
  }

  // Needed for logging.
  auto tasksAndTaskGroups = [&tasks, &taskGroups]() {
    ostringstream out;
    if (!tasks.empty()) {
      vector<TaskID> taskIds;
      foreach (const TaskInfo& task, tasks) {
        taskIds.push_back(task.task_id());
      }
      out << "tasks " << stringify(taskIds);
    }

    if (!taskGroups.empty()) {
      if (!tasks.empty()) {
        out << " and ";
      }

      out << "task groups ";

      vector<vector<TaskID>> taskIds;
      for (auto it = taskGroups.begin(); it != taskGroups.end(); it++) {
        vector<TaskID> taskIds_;
        foreach (const TaskInfo& task, (*it).tasks()) {
          taskIds_.push_back(task.task_id());
        }
        taskIds.push_back(taskIds_);
      }

      out << stringify(taskIds);
    }

    return out.str();
  };

  Framework* framework = getFramework(frameworkId);
  if (framework == nullptr) {
    LOG(WARNING) << "Ignoring sending queued " << tasksAndTaskGroups()
                 << " to executor '" << executorId
                 << "' of framework " << frameworkId
                 << " because the framework does not exist";
    return;
  }

  // No need to send the task to the executor because the framework is
  // being shutdown. No need to send status update for the task as
  // well because the framework is terminating!
  if (framework->state == Framework::TERMINATING) {
    LOG(WARNING) << "Ignoring sending queued " << tasksAndTaskGroups()
                 << " to executor '" << executorId
                 << "' of framework " << frameworkId
                 << " because the framework is terminating";
    return;
  }

  Executor* executor = framework->getExecutor(executorId);
  if (executor == nullptr) {
    LOG(WARNING) << "Ignoring sending queued " << tasksAndTaskGroups()
                 << " to executor '" << executorId
                 << "' of framework " << frameworkId
                 << " because the executor does not exist";
    return;
  }

  // This is the case where the original instance of the executor has
  // been shutdown and a new instance is brought up. No need to send
  // status update as well because it should have already been sent
  // when the original instance of the executor was shutting down.
  if (executor->containerId != containerId) {
    LOG(WARNING) << "Ignoring sending queued " << tasksAndTaskGroups()
                 << "' to executor " << *executor
                 << " because the target container " << containerId
                 << " has exited";
    return;
  }

  CHECK(executor->state == Executor::RUNNING ||
        executor->state == Executor::TERMINATING ||
        executor->state == Executor::TERMINATED)
    << executor->state;

  // No need to send the task to the executor because the executor is
  // terminating or has been terminated. No need to send status update
  // for the task as well because it will be properly handled by
  // 'executorTerminated'.
  if (executor->state != Executor::RUNNING) {
    LOG(WARNING) << "Ignoring sending queued " << tasksAndTaskGroups()
                 << " to executor " << *executor
                 << " because the executor is in "
                 << executor->state << " state";
    return;
  }

  // At this point, we must have either sent some tasks to the running
  // executor or there are queued tasks that need to be delivered.
  // Otherwise, the executor state would have been synchronously
  // transitioned to TERMINATING when the queued tasks were killed.
  CHECK(executor->everSentTask() || !executor->queuedTasks.empty());

  foreach (const TaskInfo& task, tasks) {
    // This is the case where the task is killed. No need to send
    // status update because it should be handled in 'killTask'.
    if (!executor->queuedTasks.contains(task.task_id())) {
      LOG(WARNING) << "Ignoring sending queued task '" << task.task_id()
                   << "' to executor " << *executor
                   << " because the task has been killed";
      continue;
    }

    CHECK_SOME(executor->dequeueTask(task.task_id()));
    executor->addLaunchedTask(task);

    LOG(INFO) << "Sending queued task '" << task.task_id()
              << "' to executor " << *executor;

    RunTaskMessage message;
    message.mutable_framework()->MergeFrom(framework->info);
    message.mutable_task()->MergeFrom(task);

    // Note that 0.23.x executors require the 'pid' to be set
    // to decode the message, but do not use the field.
    message.set_pid(framework->pid.getOrElse(UPID()));

    executor->send(message);
  }

  foreach (const TaskGroupInfo& taskGroup, taskGroups) {
    // The invariant here is that all queued tasks in the group
    // are still queued, or all were removed due to a kill arriving
    // for one of the tasks in the group.
    bool allQueued = true;
    bool allRemoved = true;
    foreach (const TaskInfo& task, taskGroup.tasks()) {
      if (executor->queuedTasks.contains(task.task_id())) {
        allRemoved = false;
      } else {
        allQueued = false;
      }
    }

    CHECK(allQueued != allRemoved)
      << "BUG: The " << taskOrTaskGroup(None(), taskGroup)
      << " was partially killed";

    if (allRemoved) {
      // This is the case where the task group is killed. No need to send
      // status update because it should be handled in 'killTask'.
      LOG(WARNING) << "Ignoring sending queued "
                   << taskOrTaskGroup(None(), taskGroup) << " to executor "
                   << *executor << " because the task group has been killed";
      continue;
    }

    LOG(INFO) << "Sending queued " << taskOrTaskGroup(None(), taskGroup)
              << " to executor " << *executor;

    foreach (const TaskInfo& task, taskGroup.tasks()) {
      CHECK_SOME(executor->dequeueTask(task.task_id()));
      executor->addLaunchedTask(task);
    }

    executor::Event event;
    event.set_type(executor::Event::LAUNCH_GROUP);

    executor::Event::LaunchGroup* launchGroup = event.mutable_launch_group();
    launchGroup->mutable_task_group()->CopyFrom(taskGroup);

    executor->send(event);
  }
}


// Generates a secret for the executor identified by the given framework,
// executor and container ids.
//
// Returns None if no secret generator is configured. Otherwise the secret
// is generated for a principal carrying the three ids as the claims "fid",
// "eid" and "cid"; the returned future fails if the generated secret does
// not validate or is not of VALUE type.
Future<Option<Secret>> Slave::generateSecret(
    const FrameworkID& frameworkId,
    const ExecutorID& executorId,
    const ContainerID& containerId)
{
  // Nothing to generate without a secret generator.
  if (!secretGenerator) {
    return None();
  }

  // The principal has no value of its own, only claims.
  Principal principal(
      Option<string>::none(),
      {
        {"fid", frameworkId.value()},
        {"eid", executorId.value()},
        {"cid", containerId.value()}
      });

  return secretGenerator->generate(principal)
    .then([](const Secret& generated) -> Future<Option<Secret>> {
      const Option<Error> invalid =
        common::validation::validateSecret(generated);

      if (invalid.isSome()) {
        return Failure(
            "Failed to validate generated secret: " + invalid->message);
      }

      // A well-formed secret of any other type is rejected as well.
      if (generated.type() != Secret::VALUE) {
        return Failure(
            "Expecting generated secret to be of VALUE type instead of " +
            stringify(generated.type()) + " type; " +
            "only VALUE type secrets are supported at this time");
      }

      return generated;
    });
}


// Launches an executor which was previously created.
void Slave::launchExecutor(
    const Future<Option<Secret>>& authenticationToken,
    const FrameworkID& frameworkId,
    const ExecutorInfo& executorInfo,
    const google::protobuf::Map<string, Value::Scalar>& executorLimits,
    const Option<TaskInfo>& taskInfo)
{
  Framework* framework = getFramework(frameworkId);
  if (framework == nullptr) {
    LOG(WARNING) << "Ignoring launching executor '"
                 << executorInfo.executor_id() << "' because the framework "
                 << frameworkId << " does not exist";
    return;
  }

  if (framework->state == Framework::TERMINATING) {
    LOG(WARNING) << "Ignoring launching executor '"
                 << executorInfo.executor_id() << "' of framework "
                 << frameworkId << " because the framework is terminating";
    return;
  }

  Executor* executor = framework->getExecutor(executorInfo.executor_id());
  if (executor == nullptr) {
    LOG(WARNING) << "Ignoring launching executor '"
                 << executorInfo.executor_id() << "' of framework "
                 << frameworkId << " because the executor does not exist";
    return;
  }

  if (executor->state == Executor::TERMINATING ||
      executor->state == Executor::TERMINATED) {
    string executorState;
    if (executor->state == Executor::TERMINATING) {
      executorState = "terminating";
    } else {
      executorState = "terminated";
    }

    LOG(WARNING) << "Ignoring launching executor " << *executor
                 << " in container " << executor->containerId
                 << " because the executor is " << executorState;

    // The framework may have shutdown this executor already, transitioning it
    // to the TERMINATING/TERMINATED state. However, the executor still exists
    // in the agent's map, so we must send status updates for any queued tasks
    // and perform cleanup via `executorTerminated`.
    ContainerTermination termination;
    termination.set_state(TASK_FAILED);
    termination.set_reason(TaskStatus::REASON_CONTAINER_LAUNCH_FAILED);
    termination.set_message("Executor " + executorState);

    executorTerminated(frameworkId, executor->id, termination);

    return;
  }

  CHECK_EQ(Executor::REGISTERING, executor->state);

  if (!authenticationToken.isReady()) {
    const string message = "Secret generation failed: " +
      (authenticationToken.isFailed()
         ? authenticationToken.failure() : "future discarded");

    LOG(ERROR) << "Failed to launch executor " << *executor << " in container "
               << executor->containerId << ": " << message;

    ContainerTermination termination;
    termination.set_state(TASK_FAILED);
    termination.set_reason(TaskStatus::REASON_CONTAINER_LAUNCH_FAILED);
    termination.set_message(message);

    executorTerminated(frameworkId, executor->id, termination);

    return;
  }

  // Tell the containerizer to launch the executor.

  // Bundle all the container launch fields together.
  ContainerConfig containerConfig;
  *containerConfig.mutable_executor_info() = executorInfo;
  *containerConfig.mutable_command_info() = executorInfo.command();
  *containerConfig.mutable_resources() = executorInfo.resources();
  containerConfig.set_directory(executor->directory);

  if (!executorLimits.empty()) {
    *containerConfig.mutable_limits() = executorLimits;
  }

  if (executor->user.isSome()) {
    containerConfig.set_user(executor->user.get());
  }

  // For both of the following cases, `ExecutorInfo.container` is what
  // we want to tell the containerizer about the container to be
  // launched:
  // (1) If this is a command task case (i.e., the framework specifies
  //     the `TaskInfo` but not `ExecutorInfo`), the
  //     `ExecutorInfo.container` is already copied from
  //     `TaskInfo.container` in `Slave::getExecutorInfo`. As a
  //     result, we should just inform the containerizer about
  //     `ExecutorInfo.container`.
  // (2) If this is a non command task (e.g., default executor, custom
  //     executor), the `ExecutorInfo.container` is what we want to
  //     tell the containerizer anyway.
  if (executorInfo.has_container()) {
    *containerConfig.mutable_container_info() = executorInfo.container();
  }

  if (executor->isGeneratedForCommandTask()) {
    CHECK_SOME(taskInfo)
      << "Command (or Docker) executor does not support task group";

    containerConfig.mutable_task_info()->CopyFrom(taskInfo.get());
  }

  // Prepare environment variables for the executor.
  map<string, string> environment = executorEnvironment(
      flags,
      executorInfo,
      executor->directory,
      info.id(),
      self(),
      authenticationToken.get(),
      framework->info.checkpoint());

  // Prepare the filename of the pidfile, for checkpoint-enabled frameworks.
  Option<string> pidCheckpointPath = None();
  if (framework->info.checkpoint()){
    pidCheckpointPath = slave::paths::getForkedPidPath(
        slave::paths::getMetaRootDir(flags.work_dir),
        info.id(),
        framework->id(),
        executor->id,
        executor->containerId);
  }

  LOG(INFO) << "Launching container " << executor->containerId
            << " for executor '" << executor->id
            << "' of framework " << framework->id();

  // Launch the container.
  //
  // NOTE: This must be called synchronously to avoid launching a container for
  // a removed executor.
  containerizer->launch(
      executor->containerId, containerConfig, environment, pidCheckpointPath)
    .onAny(defer(
        self(),
        &Self::executorLaunched,
        frameworkId,
        executor->id,
        executor->containerId,
        lambda::_1));

  // Make sure the executor registers within the given timeout.
  delay(flags.executor_registration_timeout,
        self(),
        &Self::registerExecutorTimeout,
        frameworkId,
        executor->id,
        executor->containerId);

  return;
}


void Slave::handleRunTaskGroupMessage(
    const UPID& from,
    RunTaskGroupMessage&& runTaskGroupMessage)
{
  runTaskGroup(
      from,
      runTaskGroupMessage.framework(),
      runTaskGroupMessage.executor(),
      runTaskGroupMessage.task_group(),
      google::protobuf::convert(runTaskGroupMessage.resource_version_uuids()),
      runTaskGroupMessage.has_launch_executor() ?
          Option<bool>(runTaskGroupMessage.launch_executor()) : None());
}


void Slave::runTaskGroup(
    const UPID& from,
    const FrameworkInfo& frameworkInfo,
    const ExecutorInfo& executorInfo,
    const TaskGroupInfo& taskGroupInfo,
    const vector<ResourceVersionUUID>& resourceVersionUuids,
    const Option<bool>& launchExecutor)
{
  if (master != from) {
    LOG(WARNING) << "Ignoring run task group message from " << from
                 << " because it is not the expected master: "
                 << (master.isSome() ? stringify(master.get()) : "None");
    return;
  }

  if (!frameworkInfo.has_id()) {
    LOG(ERROR) << "Ignoring run task group message from " << from
               << " because it does not have a framework ID";
    return;
  }

  // TODO(mzhu): Consider doing a `CHECK` here since this shouldn't be possible.
  if (taskGroupInfo.tasks().empty()) {
    LOG(ERROR) << "Ignoring run task group message from " << from
               << " for framework " << frameworkInfo.id()
               << " because it has no tasks";

    return;
  }

  // Executors for task groups are injected by the master, not the agent.
  constexpr bool executorGeneratedForCommandTask = false;

  run(frameworkInfo,
      executorInfo,
      None(),
      taskGroupInfo,
      resourceVersionUuids,
      UPID(),
      launchExecutor,
      executorGeneratedForCommandTask);
}


void Slave::killTask(
    const UPID& from,
    const KillTaskMessage& killTaskMessage)
{
  if (master != from) {
    LOG(WARNING) << "Ignoring kill task message from " << from
                 << " because it is not the expected master: "
                 << (master.isSome() ? stringify(master.get()) : "None");
    return;
  }

  const FrameworkID& frameworkId = killTaskMessage.framework_id();
  const TaskID& taskId = killTaskMessage.task_id();

  LOG(INFO) << "Asked to kill task " << taskId
            << " of framework " << frameworkId;

  CHECK(state == RECOVERING || state == DISCONNECTED ||
        state == RUNNING || state == TERMINATING)
    << state;

  // TODO(bmahler): Also ignore if we're DISCONNECTED.
  if (state == RECOVERING || state == TERMINATING) {
    LOG(WARNING) << "Cannot kill task " << taskId
                 << " of framework " << frameworkId
                 << " because the agent is " << state;
    // TODO(vinod): Consider sending a TASK_LOST here.
    // Currently it is tricky because 'statusUpdate()'
    // ignores updates for unknown frameworks.
    return;
  }

  Framework* framework = getFramework(frameworkId);
  if (framework == nullptr) {
    LOG(WARNING) << "Ignoring kill task " << taskId
                 << " of framework " << frameworkId
                 << " because no such framework is running";
    return;
  }

  // We don't send a status update here because a terminating
  // framework cannot send acknowledgements.
  if (framework->state == Framework::TERMINATING) {
    LOG(WARNING) << "Ignoring kill task " << taskId
                 << " of framework " << frameworkId
                 << " because the framework is terminating";
    return;
  }

  CHECK(framework->state == Framework::RUNNING) << framework->state;

  if (framework->isPending(taskId)) {
    killPendingTask(frameworkId, framework, taskId);

    return;
  }

  Executor* executor = framework->getExecutor(taskId);
  if (executor == nullptr) {
    LOG(WARNING) << "Cannot kill task " << taskId
                 << " of framework " << frameworkId
                 << " because no corresponding executor is running";

    // We send a TASK_DROPPED update because this task has never been
    // launched on this slave. If the framework is not partition-aware,
    // we send TASK_LOST for backward compatibility.
    mesos::TaskState taskState = TASK_DROPPED;
    if (!framework->capabilities.partitionAware) {
      taskState = TASK_LOST;
    }

    const StatusUpdate update = protobuf::createStatusUpdate(
        frameworkId,
        info.id(),
        taskId,
        taskState,
        TaskStatus::SOURCE_SLAVE,
        id::UUID::random(),
        "Cannot find executor",
        TaskStatus::REASON_EXECUTOR_TERMINATED);

    statusUpdate(update, UPID());
    return;
  }

  kill(frameworkId,
       framework,
       executor,
       taskId,
       (killTaskMessage.has_kill_policy()
          ? killTaskMessage.kill_policy()
          : Option<KillPolicy>::none()));
}


void Slave::killPendingTask(
    const FrameworkID& frameworkId,
    Framework* framework,
    const TaskID& taskId)
{
  LOG(WARNING) << "Killing task " << taskId
               << " of framework " << frameworkId
               << " before it was launched";

  Option<TaskGroupInfo> taskGroup =
    framework->getTaskGroupForPendingTask(taskId);

  vector<StatusUpdate> updates;
  if (taskGroup.isSome()) {
    foreach (const TaskInfo& task, taskGroup->tasks()) {
      updates.push_back(protobuf::createStatusUpdate(
          frameworkId,
          info.id(),
          task.task_id(),
          TASK_KILLED,
          TaskStatus::SOURCE_SLAVE,
          id::UUID::random(),
          "A task within the task group was killed before"
          " delivery to the executor",
          TaskStatus::REASON_TASK_KILLED_DURING_LAUNCH,
          CHECK_NOTNONE(
              framework->getExecutorIdForPendingTask(task.task_id()))));
    }
  } else {
    updates.push_back(protobuf::createStatusUpdate(
        frameworkId,
        info.id(),
        taskId,
        TASK_KILLED,
        TaskStatus::SOURCE_SLAVE,
        id::UUID::random(),
        "Killed before delivery to the executor",
        TaskStatus::REASON_TASK_KILLED_DURING_LAUNCH,
        CHECK_NOTNONE(
            framework->getExecutorIdForPendingTask(taskId))));
  }

  foreach (const StatusUpdate& update, updates) {
    // NOTE: Sending a terminal update (TASK_KILLED) synchronously
    // removes the task/task group from 'framework->pendingTasks'
    // and 'framework->pendingTaskGroups', so that it will not be
    // launched.
    statusUpdate(update, UPID());
  }
}


void Slave::kill(
    const FrameworkID& frameworkId,
    Framework* framework,
    Executor* executor,
    const TaskID& taskId,
    const Option<KillPolicy>& killPolicy)
{
  // This function should only be called on tasks which are queued or launched,
  // so both the framework and executor should always exist.
  CHECK_NOTNULL(framework);
  CHECK_NOTNULL(executor);

  switch (executor->state) {
    case Executor::REGISTERING: {
      LOG(WARNING) << "Transitioning the state of task " << taskId
                   << " of framework " << frameworkId
                   << " to TASK_KILLED because the executor is not registered";

      // This task might be part of a task group. If so, we need to
      // send a TASK_KILLED update for all tasks in the group.
      Option<TaskGroupInfo> taskGroup = executor->getQueuedTaskGroup(taskId);

      vector<StatusUpdate> updates;
      if (taskGroup.isSome()) {
        foreach (const TaskInfo& task, taskGroup->tasks()) {
          updates.push_back(protobuf::createStatusUpdate(
              frameworkId,
              info.id(),
              task.task_id(),
              TASK_KILLED,
              TaskStatus::SOURCE_SLAVE,
              id::UUID::random(),
              "A task within the task group was killed before"
              " delivery to the executor",
              TaskStatus::REASON_TASK_KILLED_DURING_LAUNCH,
              executor->id));
        }
      } else {
        updates.push_back(protobuf::createStatusUpdate(
            frameworkId,
            info.id(),
            taskId,
            TASK_KILLED,
            TaskStatus::SOURCE_SLAVE,
            id::UUID::random(),
            "Killed before delivery to the executor",
            TaskStatus::REASON_TASK_KILLED_DURING_LAUNCH,
            executor->id));
      }

      foreach (const StatusUpdate& update, updates) {
        // NOTE: Sending a terminal update (TASK_KILLED) removes the
        // task/task group from 'executor->queuedTasks' and
        // 'executor->queuedTaskGroup', so that if the executor registers at
        // a later point in time, it won't get this task or task group.
        statusUpdate(update, UPID());
      }

      // TODO(mzhu): Consider shutting down the executor here
      // if all of its initial tasks are killed rather than
      // waiting for it to register.

      break;
    }
    case Executor::TERMINATING:
      LOG(WARNING) << "Ignoring kill task " << taskId
                   << " because the executor " << *executor
                   << " is terminating";
      break;
    case Executor::TERMINATED:
      LOG(WARNING) << "Ignoring kill task " << taskId
                   << " because the executor " << *executor
                   << " is terminated";
      break;
    case Executor::RUNNING: {
      if (executor->queuedTasks.contains(taskId)) {
        // This is the case where the task has not yet been sent to
        // the executor (e.g., waiting for containerizer update to
        // finish).

        // This task might be part of a task group. If so, we need to
        // send a TASK_KILLED update for all the other tasks.
        Option<TaskGroupInfo> taskGroup = executor->getQueuedTaskGroup(taskId);

        vector<StatusUpdate> updates;
        if (taskGroup.isSome()) {
          foreach (const TaskInfo& task, taskGroup->tasks()) {
            updates.push_back(protobuf::createStatusUpdate(
                frameworkId,
                info.id(),
                task.task_id(),
                TASK_KILLED,
                TaskStatus::SOURCE_SLAVE,
                id::UUID::random(),
                "Killed before delivery to the executor",
                TaskStatus::REASON_TASK_KILLED_DURING_LAUNCH,
                executor->id));
          }
        } else {
          updates.push_back(protobuf::createStatusUpdate(
              frameworkId,
              info.id(),
              taskId,
              TASK_KILLED,
              TaskStatus::SOURCE_SLAVE,
              id::UUID::random(),
              "Killed before delivery to the executor",
              TaskStatus::REASON_TASK_KILLED_DURING_LAUNCH,
              executor->id));
        }

        foreach (const StatusUpdate& update, updates) {
          // NOTE: Sending a terminal update (TASK_KILLED) removes the
          // task/task group from 'executor->queuedTasks' and
          // 'executor->queuedTaskGroup', so that if the executor registers at
          // a later point in time, it won't get this task.
          statusUpdate(update, UPID());
        }

        // Shutdown the executor if all of its initial tasks are killed.
        // See MESOS-8411. This is a workaround for those executors (e.g.,
        // command executor, default executor) that do not have a proper
        // self terminating logic when they haven't received the task or
        // task group within a timeout.
        if (!executor->everSentTask() && executor->queuedTasks.empty()) {
          LOG(WARNING) << "Shutting down executor " << *executor
                       << " because it has never been sent a task and all of"
                       << " its queued tasks have been killed before delivery";

          _shutdownExecutor(framework, executor);
        }
      } else {
        // Send a message to the executor and wait for
        // it to send us a status update.
        KillTaskMessage message;
        message.mutable_framework_id()->MergeFrom(frameworkId);
        message.mutable_task_id()->MergeFrom(taskId);
        if (killPolicy.isSome()) {
          message.mutable_kill_policy()->MergeFrom(killPolicy.get());
        }

        executor->send(message);
      }
      break;
    }
    default:
      LOG(FATAL) << "Executor " << *executor << " is in unexpected state "
                 << executor->state;
      break;
  }
}


// TODO(benh): Consider sending a boolean that specifies if the
// shut down should be graceful or immediate. Likewise, consider
// sending back a shut down acknowledgement, because otherwise you
// could get into a state where a shut down was sent, dropped, and
// therefore never processed.
void Slave::shutdownFramework(
    const UPID& from,
    const FrameworkID& frameworkId)
{
  // Allow shutdownFramework() only if
  // its called directly (e.g. Slave::finalize()) or
  // its a message from the currently registered master.
  if (from && master != from) {
    LOG(WARNING) << "Ignoring shutdown framework message for " << frameworkId
                 << " from " << from
                 << " because it is not from the registered master ("
                 << (master.isSome() ? stringify(master.get()) : "None") << ")";
    return;
  }

  VLOG(1) << "Asked to shut down framework " << frameworkId
          << " by " << from;

  CHECK(state == RECOVERING || state == DISCONNECTED ||
        state == RUNNING || state == TERMINATING)
    << state;

  if (state == RECOVERING || state == DISCONNECTED) {
    LOG(WARNING) << "Ignoring shutdown framework message for " << frameworkId
                 << " because the agent has not yet registered with the master";
    return;
  }

  Framework* framework = getFramework(frameworkId);
  if (framework == nullptr) {
    VLOG(1) << "Cannot shut down unknown framework " << frameworkId;
    return;
  }

  switch (framework->state) {
    case Framework::TERMINATING:
      LOG(WARNING) << "Ignoring shutdown framework " << framework->id()
                   << " because it is terminating";
      break;
    case Framework::RUNNING:
      LOG(INFO) << "Shutting down framework " << framework->id();

      framework->state = Framework::TERMINATING;

      // Shut down all executors of this framework.
      // NOTE: We use 'executors.keys()' here because 'shutdownExecutor'
      // and 'removeExecutor' can remove an executor from 'executors'.
      foreach (const ExecutorID& executorId, framework->executors.keys()) {
        Executor* executor = framework->executors[executorId];
        CHECK(executor->state == Executor::REGISTERING ||
              executor->state == Executor::RUNNING ||
              executor->state == Executor::TERMINATING ||
              executor->state == Executor::TERMINATED)
          << executor->state;

        if (executor->state == Executor::REGISTERING ||
            executor->state == Executor::RUNNING) {
          _shutdownExecutor(framework, executor);
        } else if (executor->state == Executor::TERMINATED) {
          // NOTE: We call remove here to ensure we can remove an
          // executor (of a terminating framework) that is terminated
          // but waiting for acknowledgements.
          removeExecutor(framework, executor);
        } else {
          // Executor is terminating. Ignore.
        }
      }

      // Remove this framework if it has no pending executors and tasks.
      if (framework->idle()) {
        removeFramework(framework);
      }
      break;
    default:
      LOG(FATAL) << "Framework " << frameworkId
                 << " is in unexpected state " << framework->state;
      break;
  }
}


// Relays a framework message to one of the framework's executors.
//
// The message is dropped, and counted in
// `metrics.invalid_framework_messages`, unless the agent, the framework and
// the executor are all running. A relayed message is counted in
// `metrics.valid_framework_messages`.
void Slave::schedulerMessage(
    const SlaveID& slaveId,
    const FrameworkID& frameworkId,
    const ExecutorID& executorId,
    const string& data)
{
  CHECK(state == RECOVERING || state == DISCONNECTED ||
        state == RUNNING || state == TERMINATING)
    << state;

  if (state != RUNNING) {
    LOG(WARNING) << "Dropping message from framework " << frameworkId
                 << " because the agent is in " << state << " state";
    ++metrics.invalid_framework_messages;
    return;
  }

  Framework* framework = getFramework(frameworkId);
  if (framework == nullptr) {
    LOG(WARNING) << "Dropping message from framework " << frameworkId
                 << " because framework does not exist";
    ++metrics.invalid_framework_messages;
    return;
  }

  CHECK(framework->state == Framework::RUNNING ||
        framework->state == Framework::TERMINATING)
    << framework->state;

  if (framework->state == Framework::TERMINATING) {
    LOG(WARNING) << "Dropping message from framework " << frameworkId
                 << " because framework is terminating";
    ++metrics.invalid_framework_messages;
    return;
  }

  Executor* executor = framework->getExecutor(executorId);
  if (executor == nullptr) {
    LOG(WARNING) << "Dropping message for executor " << executorId
                 << " because executor does not exist";
    ++metrics.invalid_framework_messages;
    return;
  }

  switch (executor->state) {
    case Executor::RUNNING: {
      FrameworkToExecutorMessage message;
      message.mutable_slave_id()->CopyFrom(slaveId);
      message.mutable_framework_id()->CopyFrom(frameworkId);
      message.mutable_executor_id()->CopyFrom(executorId);
      message.set_data(data);

      executor->send(message);
      ++metrics.valid_framework_messages;
      break;
    }
    case Executor::REGISTERING:
    case Executor::TERMINATING:
    case Executor::TERMINATED:
      // TODO(*): If executor is not yet registered, queue framework
      // message? It's probably okay to just drop it since frameworks
      // can have the executor send a message to the master to say when
      // it's ready.
      LOG(WARNING) << "Dropping message for executor " << *executor
                   << " because executor is not running";
      ++metrics.invalid_framework_messages;
      break;
    default:
      LOG(FATAL) << "Executor " << *executor << " is in unexpected state "
                 << executor->state;
      break;
  }
}


void Slave::updateFramework(
    const UpdateFrameworkMessage& message)
{
  CHECK(state == RECOVERING || state == DISCONNECTED ||
        state == RUNNING || state == TERMINATING)
    << state;

  const FrameworkID& frameworkId = message.framework_id();
  const UPID& pid = message.pid();

  if (state != RUNNING) {
    LOG(WARNING) << "Dropping updateFramework message for " << frameworkId
                 << " because the agent is in " << state << " state";
    metrics.invalid_framework_messages++;
    return;
  }

  Framework* framework = getFramework(frameworkId);
  if (framework == nullptr) {
    LOG(INFO) << "Ignoring info update for framework " << frameworkId
              << " because it does not exist";
    return;
  }

  switch (framework->state) {
    case Framework::TERMINATING:
      LOG(WARNING) << "Ignoring info update for framework " << frameworkId
                   << " because it is terminating";
      break;
    case Framework::RUNNING: {
      LOG(INFO) << "Updating info for framework " << frameworkId
                << (pid != UPID() ? " with pid updated to " + stringify(pid)
                                  : "");

      // The framework info was added in 1.3, so it will not be set
      // if from a master older than 1.3.
      if (message.has_framework_info()) {
        framework->info.CopyFrom(message.framework_info());
        framework->capabilities = message.framework_info().capabilities();
      }

      if (pid == UPID()) {
        framework->pid = None();
      } else {
        framework->pid = pid;
      }

      if (framework->info.checkpoint()) {
        framework->checkpointFramework();
      }

      // Inform task status update manager to immediately resend any pending
      // updates.
      taskStatusUpdateManager->resume();

      break;
    }
    default:
      LOG(FATAL) << "Framework " << framework->id()
                << " is in unexpected state " << framework->state;
      break;
  }
}


// Checkpoints the given resources, together with the agent operations that
// need checkpointing, to the agent's meta directory.
//
// `resources` must not contain resource provider resources (this is
// CHECKed). If `changeTotal` is true, `totalResources` is replaced by the
// result of applying the new checkpointed resources to `info.resources()`.
//
// Nothing is written if both the resources and the operations to checkpoint
// are identical to what is currently checkpointed. The agent aborts (CHECK)
// or exits (EXIT) if any step of writing the checkpoint fails.
//
// TODO(nfnt): Have this function return a `Result`.
void Slave::checkpointResourceState(
    vector<Resource> resources,
    bool changeTotal)
{
  // TODO(jieyu): Here we assume that CheckpointResourcesMessages are
  // ordered (i.e., slave receives them in the same order master sends
  // them). This should be true in most of the cases because TCP
  // enforces in order delivery per connection. However, the ordering
  // is technically not guaranteed because master creates multiple
  // connections to the slave in some cases (e.g., persistent socket
  // to slave breaks and master uses ephemeral socket). This could
  // potentially be solved by using a version number and rejecting
  // stale messages according to the version number.
  //
  // If CheckpointResourcesMessages are delivered out-of-order, there
  // are two cases to consider:
  //  (1) If master does not fail over, it will reconcile the state
  //      with the slave if the framework later changes the
  //      checkpointed resources. Since master is the source of truth
  //      for reservations, the inconsistency is not exposed to
  //      frameworks.
  //  (2) If master does fail over, the slave will inform the new
  //      master about the incorrect checkpointed resources. When that
  //      happens, we expect framework to reconcile based on the
  //      offers they get.

  // An agent with resource providers requires an operation feedback protocol
  // instead of simply checkpointing results by the master. Fail hard here
  // instead of applying an incompatible message.
  const bool checkpointingResourceProviderResources = std::any_of(
      resources.begin(),
      resources.end(),
      [](const Resource& resource) { return resource.has_provider_id(); });

  CHECK(!checkpointingResourceProviderResources)
    << "Resource providers must perform their own checkpointing";

  upgradeResources(&resources);

  Resources resourcesToCheckpoint = resources;

  // Tests if the given Operation needs to be checkpointed on the agent.
  //
  // The agent checkpoints pending CREATE/DESTROY operations on agent default
  // resources and terminal operations on agent default resources that have
  // unacknowledged status updates.
  auto operationNeedsCheckpointing = [](const Operation& operation) {
    Result<ResourceProviderID> resourceProviderId =
      getResourceProviderId(operation.info());

    CHECK(!resourceProviderId.isError())
      << "Failed to get resource provider ID: "
      << resourceProviderId.error();

    // Operations that have a resource provider ID are never checkpointed
    // here.
    if (resourceProviderId.isSome()) {
      return false;
    }

    const OperationStatus& status(operation.latest_status());

    // Creating and destroying a persistent volume isn't atomic, so non-terminal
    // CREATE/DESTROY operations on agent default resources have to be
    // checkpointed to retry the creation/removal of persistent volumes.
    if (!protobuf::isTerminalState(status.state())) {
      Offer::Operation::Type type = operation.info().type();

      return type == Offer::Operation::CREATE ||
             type == Offer::Operation::DESTROY;
    }

    // Terminal operations are checkpointed only if their latest status
    // carries a UUID.
    return status.has_uuid();
  };

  hashmap<UUID, Operation> operationsToCheckpoint;

  foreachpair (const UUID& uuid, Operation* operation, operations) {
    if (operationNeedsCheckpointing(*operation)) {
      operationsToCheckpoint.put(uuid, *operation);
    }
  }

  // Nothing to do if neither the resources nor the operations changed.
  if (resourcesToCheckpoint == checkpointedResources &&
      operationsToCheckpoint == checkpointedOperations) {
    VLOG(1) << "Ignoring new checkpointed resources and operations identical "
            << "to the current version";
    return;
  }

  // Assemble the combined resources + operations state to be written.
  ResourceState resourceState;

  foreach (const Resource& resource, resourcesToCheckpoint) {
    resourceState.add_resources()->CopyFrom(resource);
  }

  foreach (const Operation& operation, operationsToCheckpoint.values()) {
    resourceState.add_operations()->CopyFrom(operation);
  }

  // This is a sanity check to verify that the new checkpointed
  // resources are compatible with the agent resources specified
  // through the '--resources' command line flag. The resources
  // should be guaranteed compatible by the master.
  Try<Resources> _totalResources = applyCheckpointedResources(
      info.resources(),
      resourcesToCheckpoint);

  CHECK_SOME(_totalResources)
    << "Failed to apply checkpointed resources "
    << resourcesToCheckpoint << " to agent's resources "
    << info.resources();

  if (changeTotal) {
    totalResources = _totalResources.get();
  }

  // Store the target checkpoint resources. We commit the checkpoint by renaming
  // the target file only after all operations are successful. If any of the
  // operations fail, the agent exits and the update to checkpointed resources
  // is re-attempted after the agent restarts before agent reregistration.
  //
  // Since we commit the checkpoint after all operations are successful,
  // we avoid a case of inconsistency between the master and the agent if
  // the agent restarts during handling of `CheckpointResourcesMessage`.
  //
  // NOTE: Since the addition of operation feedback on the agent, the resources
  // are checkpointed in two formats:
  // 1) Pre-operation-feedback, where only resources are written to a target
  //    file, then moved to the final checkpoint location once any persistent
  //    volumes have been committed to disk.
  // 2) Post-operation-feedback, where both resources and operations are written
  //    to a target file, then moved to the final checkpoint location once any
  //    persistent volumes have been committed to disk.
  //
  // Both of these formats continue to be written to disk in order to permit
  // agent downgrades.

  CHECK_SOME(state::checkpoint(
      paths::getResourceStateTargetPath(metaDir),
      resourceState,
      false,
      false))
    << "Failed to checkpoint resources " << resourceState.resources()
    << " and operations " << resourceState.operations() << " to target file";

  // The resources-only checkpoint is rewritten only when the resources
  // themselves changed.
  if (resourcesToCheckpoint != checkpointedResources) {
    CHECK_SOME(state::checkpoint(
        paths::getResourcesTargetPath(metaDir),
        resourcesToCheckpoint))
      << "Failed to checkpoint resources target " << resourcesToCheckpoint;

    Try<Nothing> syncResult = syncCheckpointedResources(resourcesToCheckpoint);

    if (syncResult.isError()) {
      // Exit the agent (without committing the checkpoint) on failure.
      EXIT(EXIT_FAILURE)
        << "Failed to sync checkpointed resources: "
        << syncResult.error();
    }

    // Rename the target checkpoint to the committed checkpoint.
    Try<Nothing> renameResult = os::rename(
        paths::getResourcesTargetPath(metaDir),
        paths::getResourcesInfoPath(metaDir));

    if (renameResult.isError()) {
      // Exit the agent since the checkpoint could not be committed.
      EXIT(EXIT_FAILURE)
        << "Failed to checkpoint resources " << resourcesToCheckpoint
        << ": " << renameResult.error();
    }

    LOG(INFO) << "Updated checkpointed resources from "
              << checkpointedResources << " to "
              << resourcesToCheckpoint;

    checkpointedResources = std::move(resourcesToCheckpoint);
  }

  // At this point, `syncCheckpointedResources()` has ensured that any change in
  // checkpointed resources (e.g. persistent volumes) is now reflected on disk.
  // We rename the target resource state file to the actual resource state file,
  // which is our source of truth for the current state of the agent resources.
  Try<Nothing> renameResult = os::rename(
      paths::getResourceStateTargetPath(metaDir),
      paths::getResourceStatePath(metaDir));

  if (renameResult.isError()) {
    // Exit the agent since the checkpoint could not be committed.
    EXIT(EXIT_FAILURE)
      << "Failed to move target resources " << resourceState.resources()
      << " and operations " << resourceState.operations()
      << ": " << renameResult.error();
  }

  // Remember the operations that are now checkpointed.
  if (operationsToCheckpoint != checkpointedOperations) {
    LOG(INFO) << "Updated checkpointed operations from "
              << checkpointedOperations.values() << " to "
              << operationsToCheckpoint.values();

    checkpointedOperations = std::move(operationsToCheckpoint);
  }
}


void Slave::checkpointResourceState(
    const Resources& resources,
    bool changeTotal)
{
  checkpointResourceState({resources.begin(), resources.end()}, changeTotal);
}


// Checkpoints `resources` by delegating to `checkpointResourceState` with
// `changeTotal` set to true.
void Slave::checkpointResourcesMessage(
    const vector<Resource>& resources)
{
  const bool changeTotal = true;

  checkpointResourceState(resources, changeTotal);
}


// Brings the persistent volume directories on disk in line with
// `newCheckpointedResources`, using the currently checkpointed resources
// as the baseline:
//   - volumes present only in the new set get their directory created;
//   - volumes present only in the old set get their contents removed.
//
// Returns an `Error` as soon as a directory cannot be created or removed,
// and `Nothing` once every path has been processed.
Try<Nothing> Slave::syncCheckpointedResources(
    const Resources& newCheckpointedResources)
{
  // Indexes the persistent volumes contained in `resources` by the path
  // they occupy under `workDir`.
  auto indexVolumesByPath = [](
      const string& workDir,
      const Resources& resources) {
    hashmap<string, Resource> volumesByPath;
    const Resources volumes = resources.persistentVolumes();

    foreach (const Resource& volume, volumes) {
      // This is validated in master.
      CHECK(Resources::isReserved(volume));

      volumesByPath[paths::getPersistentVolumePath(workDir, volume)] = volume;
    }

    return volumesByPath;
  };

  const hashmap<string, Resource> previousVolumes =
    indexVolumesByPath(flags.work_dir, checkpointedResources);

  const hashmap<string, Resource> updatedVolumes =
    indexVolumesByPath(flags.work_dir, newCheckpointedResources);

  const hashset<string> previousPaths = previousVolumes.keys();
  const hashset<string> updatedPaths = updatedVolumes.keys();

  const hashset<string> pathsToCreate = updatedPaths - previousPaths;
  const hashset<string> pathsToDelete = previousPaths - updatedPaths;

  // Create persistent volumes that do not already exist.
  //
  // TODO(jieyu): Consider introducing a volume manager once we start
  // to support multiple disks, or raw disks. Depending on the
  // DiskInfo, we may want to create either directories under a root
  // directory, or LVM volumes from a given device.
  foreach (const string& path, pathsToCreate) {
    // Nothing to do if the directory is already there. For a MOUNT disk
    // the directory is a pre-existing mount point; otherwise there may
    // be data at this path from before it was managed by the agent.
    // Since volume destruction is retried until it succeeds, leftovers
    // from earlier persistent volumes are not a concern here.
    if (os::exists(path)) {
      continue;
    }

    // The directory is missing: only proceed if it can be created. The
    // caller exits the agent when this function returns an error.
    Try<Nothing> created = os::mkdir(path, true);
    if (created.isError()) {
      const Resource& volume = updatedVolumes.at(path);

      const string volumeDescription = "persistent volume " +
        volume.disk().persistence().id() + " at '" + path + "'";

      return Error("Failed to create the " +
          volumeDescription + ": " + created.error());
    }
  }

  // A persistent volume that was part of the previously checkpointed
  // resources but is absent from the new ones has been explicitly
  // destroyed, so its filesystem objects are removed right away. For
  // MOUNT disks the root directory (the mount point) itself is kept.
  foreach (const string& path, pathsToDelete) {
    const Resource& volume = previousVolumes.at(path);

    LOG(INFO) << "Deleting persistent volume '"
              << volume.disk().persistence().id()
              << "' at '" << path << "'";

    if (os::exists(path)) {
      const bool removeRoot =
        volume.disk().source().type() != Resource::DiskInfo::Source::MOUNT;

      // We should proceed only if the directory is removed.
      Try<Nothing> removed = os::rmdir(path, true, removeRoot);
      if (removed.isError()) {
        return Error(
            "Failed to remove persistent volume '" +
            stringify(volume.disk().persistence().id()) +
            "' at '" + path + "': " + removed.error());
      }
    } else {
      LOG(WARNING) << "Failed to find persistent volume '"
                   << volume.disk().persistence().id()
                   << "' at '" << path << "'";
    }

#ifndef __WINDOWS__
    // Deallocate the shared persistent volume's gid. Please note that the
    // gid is allocated when the shared persistent volume is first used by
    // a container rather than when it is created.
    if (volumeGidManager) {
      volumeGidManager->deallocate(path);
    }
#endif // __WINDOWS__
  }

  return Nothing();
}


void Slave::applyOperation(const ApplyOperationMessage& message)
{
  // The operation might be from an operator API call, thus the framework ID
  // here is optional.
  Option<FrameworkID> frameworkId = message.has_framework_id()
    ? message.framework_id()
    : Option<FrameworkID>::none();

  Option<OperationID> operationId = message.operation_info().has_id()
    ? message.operation_info().id()
    : Option<OperationID>::none();

  Result<ResourceProviderID> resourceProviderId =
    getResourceProviderId(message.operation_info());

  const UUID& uuid = message.operation_uuid();

  if (resourceProviderId.isError()) {
    LOG(ERROR) << "Failed to get the resource provider ID of operation "
               << "'" << message.operation_info().id() << "' "
               << "(uuid: " << uuid << ") from "
               << (frameworkId.isSome()
                     ? "framework " + stringify(frameworkId.get())
                     : "an operator API call")
               << ": " << resourceProviderId.error();
    return;
  }

  Operation* operation = new Operation(protobuf::createOperation(
      message.operation_info(),
      protobuf::createOperationStatus(
          OPERATION_PENDING,
          operationId,
          None(),
          None(),
          None(),
          info.id(),
          resourceProviderId.isSome()
            ? resourceProviderId.get() : Option<ResourceProviderID>::none()),
      frameworkId,
      info.id(),
      uuid));

  addOperation(operation);

  // TODO(jieyu): We should drop the operation if the resource version
  // uuid in the operation does not match that of the agent. This is
  // currently not possible because if any speculative operation for
  // agent default resources fails, the agent will crash. We might
  // want to change that behavior in the future. Revisit this once we
  // change that behavior.
  checkpointResourceState(
      totalResources.filter(mesos::needCheckpointing), false);

  if (protobuf::isSpeculativeOperation(message.operation_info())) {
    apply(operation);
  }

  if (resourceProviderId.isSome()) {
    CHECK_NOTNULL(resourceProviderManager.get())->applyOperation(message);
    return;
  }

  CHECK(protobuf::isSpeculativeOperation(message.operation_info()));

  UpdateOperationStatusMessage update =
    protobuf::createUpdateOperationStatusMessage(
        uuid,
        protobuf::createOperationStatus(
            OPERATION_FINISHED,
            operationId,
            None(),
            None(),
            id::UUID::random(),
            info.id(),
            resourceProviderId.isSome()
              ? resourceProviderId.get() : Option<ResourceProviderID>::none()),
        None(),
        frameworkId,
        info.id());

  updateOperation(operation, update);

  checkpointResourceState(
      totalResources.filter(mesos::needCheckpointing), false);

  operationStatusUpdateManager.update(update);
}


void Slave::reconcileOperations(const ReconcileOperationsMessage& message)
{
  // If the `framework_id` field in the message is set, then this reconciliation
  // request was initiated by the framework. This means the operations in this
  // message were not known to the master at the time of reconciliation. If the
  // resource provider manager doesn't recognize the operation either, then we
  // will return OPERATION_UNKNOWN.
  if (message.has_framework_id()) {
    foreach (
        const ReconcileOperationsMessage::Operation& operation,
        message.operations()) {
      Option<UUID> operationUuid;
      if (operation.has_operation_uuid()) {
        operationUuid = operation.operation_uuid();
      } else if (operation.has_operation_id()) {
        auto key = std::make_pair(
            message.framework_id(), operation.operation_id());
        if (operationIds.contains(key)) {
          operationUuid = operationIds.at(key);
        }
      }

      if (operationUuid.isSome()) {
        Operation* storedOperation = getOperation(operationUuid.get());

        // If the agent knows this operation, then the reconciliation request
        // must have raced with an `UpdateSlaveMessage` from the agent. We
        // satisfy this reconciliation request with the latest stored state of
        // the operation.
        if (storedOperation != nullptr) {
          // Clear the status UUID from the latest status since this update is
          // not sent reliably and thus does not require acknowledgement.
          OperationStatus status = storedOperation->latest_status();
          status.clear_uuid();

          UpdateOperationStatusMessage update =
            protobuf::createUpdateOperationStatusMessage(
                operationUuid.get(),
                status,
                None(),
                message.framework_id(),
                info.id());

          send(master.get(), update);

          continue;
        }
      }

      // If the agent doesn't know this operation and the operation includes a
      // resource provider ID, then we forward the reconciliation to the
      // resource provider manager to satisfy it based on whether or not the
      // specified resource provider is known.
      CHECK_NOTNULL(resourceProviderManager.get())
        ->reconcileOperations(message);
    }
  // If the `framework_id` field in the message is not set, then this
  // reconciliation was initiated by the master. We help the master reconcile
  // its in-memory state below. If operations known by the master are not known
  // by the agent/RP, then we return OPERATION_DROPPED to indicate that the
  // operation never made it to the agent.
  } else {
    bool forwardToResourceProvider = false;

    foreach (
        const ReconcileOperationsMessage::Operation& operation,
        message.operations()) {
      // The `operation_uuid` field should always be set for
      // master-initiated reconciliations.
      CHECK(operation.has_operation_uuid());

      if (operation.has_resource_provider_id()) {
        forwardToResourceProvider = true;
        continue;
      }

      // The master reconciles when it notices that an operation is missing from
      // an `UpdateSlaveMessage`. If we cannot find an operation in the agent
      // state, we send an update to inform the master. If we do find the
      // operation, then the master and agent state are consistent and we do not
      // need to do anything.
      Operation* storedOperation = getOperation(operation.operation_uuid());
      if (storedOperation == nullptr) {
        // For agent default resources, we send best-effort operation status
        // updates to the master. This is satisfactory because a dropped message
        // would imply a subsequent agent reregistration, after which an
        // `UpdateSlaveMessage` would be sent with pending operations.
        UpdateOperationStatusMessage update =
          protobuf::createUpdateOperationStatusMessage(
              operation.operation_uuid(),
              protobuf::createOperationStatus(
                  OPERATION_DROPPED,
                  None(),
                  None(),
                  None(),
                  None(),
                  info.id()),
              None(),
              None(),
              info.id());

        send(master.get(), update);
      }
    }

    if (forwardToResourceProvider) {
      CHECK_NOTNULL(resourceProviderManager.get())
        ->reconcileOperations(message);
    }
  }
}


void Slave::statusUpdateAcknowledgement(
    const UPID& from,
    const SlaveID& slaveId,
    const FrameworkID& frameworkId,
    const TaskID& taskId,
    const string& uuid)
{
  // Originally, all status update acknowledgements were sent from the
  // scheduler driver. We'd like to have all acknowledgements sent by
  // the master instead. See: MESOS-1389.
  // For now, we handle acknowledgements from the leading master and
  // from the scheduler driver, for backwards compatibility.
  // TODO(bmahler): Aim to have the scheduler driver no longer
  // sending acknowledgements in 0.20.0. Stop handling those messages
  // here in 0.21.0.
  // NOTE: We must reject those acknowledgements coming from
  // non-leading masters because we may have already sent the terminal
  // un-acknowledged task to the leading master! Unfortunately, the
  // master's pid will not change across runs on the same machine, so
  // we may process a message from the old master on the same machine,
  // but this is a more general problem!
  if (strings::startsWith(from.id, "master")) {
    if (state != RUNNING) {
      LOG(WARNING) << "Dropping status update acknowledgement message for "
                   << frameworkId << " because the agent is in "
                   << state << " state";
      return;
    }

    if (master != from) {
      LOG(WARNING) << "Ignoring status update acknowledgement message from "
                   << from << " because it is not the expected master: "
                   << (master.isSome() ? stringify(master.get()) : "None");
      return;
    }
  }

  UUID uuid_;
  uuid_.set_value(uuid);

  taskStatusUpdateManager->acknowledgement(
      taskId, frameworkId, id::UUID::fromBytes(uuid).get())
    .onAny(defer(self(),
                 &Slave::_statusUpdateAcknowledgement,
                 lambda::_1,
                 taskId,
                 frameworkId,
                 uuid_));
}


void Slave::_statusUpdateAcknowledgement(
    const Future<bool>& future,
    const TaskID& taskId,
    const FrameworkID& frameworkId,
    const UUID& uuid)
{
  // The future could fail if this is a duplicate status update acknowledgement.
  if (!future.isReady()) {
    LOG(ERROR) << "Failed to handle status update acknowledgement (UUID: "
               << uuid << ") for task " << taskId
               << " of framework " << frameworkId << ": "
               << (future.isFailed() ? future.failure() : "future discarded");
    return;
  }

  VLOG(1) << "Task status update manager successfully handled status update"
          << " acknowledgement (UUID: " << uuid
          << ") for task " << taskId
          << " of framework " << frameworkId;

  CHECK(state == RECOVERING || state == DISCONNECTED ||
        state == RUNNING || state == TERMINATING)
    << state;

  Framework* framework = getFramework(frameworkId);
  if (framework == nullptr) {
    LOG(ERROR) << "Status update acknowledgement (UUID: " << uuid
               << ") for task " << taskId
               << " of unknown framework " << frameworkId;
    return;
  }

  CHECK(framework->state == Framework::RUNNING ||
        framework->state == Framework::TERMINATING)
    << framework->state;

  // Find the executor that has this update.
  Executor* executor = framework->getExecutor(taskId);
  if (executor == nullptr) {
    LOG(ERROR) << "Status update acknowledgement (UUID: " << uuid
               << ") for task " << taskId
               << " of unknown executor";
    return;
  }

  CHECK(executor->state == Executor::REGISTERING ||
        executor->state == Executor::RUNNING ||
        executor->state == Executor::TERMINATING ||
        executor->state == Executor::TERMINATED)
    << executor->state;

  // If the task has reached terminal state and all its updates have
  // been acknowledged, mark it completed.
  if (executor->terminatedTasks.contains(taskId) && !future.get()) {
    executor->completeTask(taskId);
  }

  // Remove the executor if it has terminated and there are no more
  // incomplete tasks.
  if (executor->state == Executor::TERMINATED && !executor->incompleteTasks()) {
    removeExecutor(framework, executor);
  }

  // Remove this framework if it has no pending executors and tasks.
  if (framework->idle()) {
    removeFramework(framework);
  }
}


void Slave::operationStatusAcknowledgement(
    const UPID& from,
    const AcknowledgeOperationStatusMessage& acknowledgement)
{
  Operation* operation = getOperation(acknowledgement.operation_uuid());

  if (operation == nullptr) {
    LOG(WARNING) << "Dropping operation update acknowledgement with"
      << " status_uuid " << acknowledgement.status_uuid() << " and"
      << " operation_uuid " << acknowledgement.operation_uuid()
      << " because the operation was not found";

    return;
  }

  // If the operation was on resource provider resources forward the
  // acknowledgement to the resource provider manager as well.
  Result<ResourceProviderID> resourceProviderId =
    getResourceProviderId(operation->info());

  CHECK(!resourceProviderId.isError())
    << "Could not determine resource provider of operation " << operation
    << ": " << resourceProviderId.error();

  if (resourceProviderId.isSome()) {
    CHECK_NOTNULL(resourceProviderManager.get())
      ->acknowledgeOperationStatus(acknowledgement);

    CHECK(operation->statuses_size() > 0);
    if (protobuf::isTerminalState(
            operation->statuses(operation->statuses_size() - 1).state())) {
      // Note that if this acknowledgement is dropped due to resource provider
      // disconnection, the resource provider will inform the agent about the
      // operation via an UPDATE_STATE call after it reregisters, which will
      // cause the agent to add the operation back.
      removeOperation(operation);
    }

    return;
  }

  // Acknowledgement was for an operation on the agent's default resources.
  auto statusUuid = id::UUID::fromBytes(
      acknowledgement.status_uuid().value());

  auto operationUuid = id::UUID::fromBytes(
      acknowledgement.operation_uuid().value());

  if (operationUuid.isError() || statusUuid.isError()) {
    LOG(WARNING) << "Dropping acknowledgement for operation " << operation
      << " with provided operation uuid "
      << acknowledgement.operation_uuid().value()
      << " and status uuid "
      << acknowledgement.status_uuid().value() << ".";
    return;
  }

  auto err = [](const id::UUID& uuid, const string& message) {
    LOG(ERROR)
      << "Failed to acknowledge status update for operation (uuid: " << uuid
      << "): " << message;
  };

  // NOTE: It is possible that an incoming acknowledgement races with an
  // outgoing retry of status update, and then a duplicated
  // acknowledgement will be received. In this case, the following call
  // will fail, so we just leave an error log.
  operationStatusUpdateManager
    .acknowledgement(operationUuid.get(), statusUuid.get())
    .then(defer(self(), [=](bool continuation) {
      if (!continuation) {
        removeOperation(operation);

        // Garbage collect the status update stream.

        const string path = slave::paths::getSlaveOperationPath(
            metaDir,
            info.id(),
            operationUuid.get());

        // NOTE: We check if the path exists since we do not checkpoint some
        // status updates, such as OPERATION_DROPPED.
        if (os::exists(path)) {
          Try<Nothing> rmdir = os::rmdir(path);
          if (rmdir.isError()) {
            LOG(ERROR) << "Failed to remove operation status update stream "
                       << "directory '" << path << "': " << rmdir.error();
          }
        }
      }

      return Nothing();
    }))
    .onFailed(std::bind(err, operationUuid.get(), lambda::_1))
    .onDiscarded(std::bind(err, operationUuid.get(), "future discarded"));
}


// Handles a Subscribe call from an HTTP executor.
//
// `http` is the streaming connection the call arrived on, `subscribe` the
// call payload (including the executor's unacknowledged updates and tasks),
// and `framework`/`executor` the agent-side state, both of which must be
// non-null.
//
// The executor is told to shut down (and `http` is closed) if the agent or
// the framework is terminating, or if the executor is TERMINATING or
// TERMINATED. Otherwise the executor transitions to RUNNING, the connection
// is stored, a heartbeater is created, unacknowledged updates are replayed,
// STAGING tasks unknown to the executor are transitioned to
// TASK_DROPPED/TASK_LOST, a SUBSCRIBED event is sent, and processing
// continues in `___run` once resources have been published (if needed) and
// the containerizer has been updated.
void Slave::subscribe(
    StreamingHttpConnection<v1::executor::Event> http,
    const Call::Subscribe& subscribe,
    Framework* framework,
    Executor* executor)
{
  CHECK_NOTNULL(framework);
  CHECK_NOTNULL(executor);

  LOG(INFO) << "Received Subscribe request for HTTP executor " << *executor;

  CHECK(state == RECOVERING || state == DISCONNECTED ||
        state == RUNNING || state == TERMINATING)
    << state;

  if (state == TERMINATING) {
    LOG(WARNING) << "Shutting down executor " << *executor << " as the agent "
                 << "is terminating";
    http.send(ShutdownExecutorMessage());
    http.close();
    return;
  }

  CHECK(framework->state == Framework::RUNNING ||
        framework->state == Framework::TERMINATING)
    << framework->state;

  if (framework->state == Framework::TERMINATING) {
    LOG(WARNING) << "Shutting down executor " << *executor << " as the "
                 << "framework is terminating";
    http.send(ShutdownExecutorMessage());
    http.close();
    return;
  }

  // The remaining handling depends on the executor's current state.
  switch (executor->state) {
    case Executor::TERMINATING:
    case Executor::TERMINATED:
      // TERMINATED is possible if the executor forks, the parent process
      // terminates and the child process (driver) tries to register!
      LOG(WARNING) << "Shutting down executor " << *executor
                   << " because it is in unexpected state " << executor->state;
      http.send(ShutdownExecutorMessage());
      http.close();
      break;
    case Executor::RUNNING:
    case Executor::REGISTERING: {
      // Close the earlier connection if one existed. This can even
      // be a retried Subscribe request from an already connected
      // executor.
      if (executor->http.isSome()) {
        LOG(WARNING) << "Closing already existing HTTP connection from "
                     << "executor " << *executor;
        executor->http->close();
      }

      executor->state = Executor::RUNNING;

      // Save the connection for the executor.
      executor->http = http;
      executor->pid = None();

      // Create a heartbeater for HTTP executors.
      executor::Event heartbeatEvent;
      heartbeatEvent.set_type(executor::Event::HEARTBEAT);

      executor->heartbeater.reset(
          new ResponseHeartbeater<executor::Event, v1::executor::Event>(
              "executor " + stringify(executor->id),
              heartbeatEvent,
              http,
              DEFAULT_EXECUTOR_HEARTBEAT_INTERVAL,
              DEFAULT_EXECUTOR_HEARTBEAT_INTERVAL));

      if (framework->info.checkpoint()) {
        // Write a marker file to indicate that this executor
        // is HTTP based.
        const string path = paths::getExecutorHttpMarkerPath(
            metaDir,
            info.id(),
            framework->id(),
            executor->id,
            executor->containerId);

        LOG(INFO) << "Creating a marker file for HTTP based executor "
                  << *executor << " at path '" << path << "'";
        CHECK_SOME(os::touch(path));
      }

      // Handle all the pending updates.
      // The task status update manager might have already checkpointed
      // some of these pending updates (for example, if the slave died
      // right after it checkpointed the update but before it could send
      // the ACK to the executor). This is ok because the status update
      // manager correctly handles duplicate updates.
      foreach (const Call::Update& update, subscribe.unacknowledged_updates()) {
        // NOTE: This also updates the executor's resources!
        statusUpdate(protobuf::createStatusUpdate(
            framework->id(),
            update.status(),
            info.id()),
            None());
      }

      // Index the tasks the executor reported as unacknowledged by task ID
      // for the lookup in the loop below.
      hashmap<TaskID, TaskInfo> unackedTasks;
      foreach (const TaskInfo& task, subscribe.unacknowledged_tasks()) {
        unackedTasks[task.task_id()] = task;
      }

      // Now, if there is any task still in STAGING state and not in
      // unacknowledged 'tasks' known to the executor, the slave must
      // have died before the executor received the task! We should
      // transition it to TASK_DROPPED. We only consider/store
      // unacknowledged 'tasks' at the executor driver because if a
      // task has been acknowledged, the slave must have received an
      // update for that task and transitioned it out of STAGING!
      //
      // TODO(vinod): Consider checkpointing 'TaskInfo' instead of
      // 'Task' so that we can relaunch such tasks! Currently we don't
      // do it because 'TaskInfo.data' could be huge.
      foreach (Task* task, executor->launchedTasks.values()) {
        if (task->state() == TASK_STAGING &&
            !unackedTasks.contains(task->task_id())) {
          // Frameworks without the PARTITION_AWARE capability get
          // TASK_LOST instead of TASK_DROPPED.
          mesos::TaskState newTaskState = TASK_DROPPED;
          if (!protobuf::frameworkHasCapability(
                  framework->info,
                  FrameworkInfo::Capability::PARTITION_AWARE)) {
            newTaskState = TASK_LOST;
          }

          LOG(INFO) << "Transitioning STAGED task " << task->task_id()
                    << " to " << newTaskState
                    << " because it is unknown to the executor "
                    << executor->id;

          const StatusUpdate update = protobuf::createStatusUpdate(
              framework->id(),
              info.id(),
              task->task_id(),
              newTaskState,
              TaskStatus::SOURCE_SLAVE,
              id::UUID::random(),
              "Task launched during agent restart",
              TaskStatus::REASON_SLAVE_RESTARTED,
              executor->id);

          statusUpdate(update, UPID());
        }
      }

      // Shutdown the executor if all of its initial tasks are killed.
      // See MESOS-8411. This is a workaround for those executors (e.g.,
      // command executor, default executor) that do not have a proper
      // self terminating logic when they haven't received the task or
      // task group within a timeout.
      if (!executor->everSentTask() && executor->queuedTasks.empty()) {
        LOG(WARNING) << "Shutting down executor " << *executor
                     << " because it has never been sent a task and all of"
                     << " its queued tasks have been killed before delivery";

        _shutdownExecutor(framework, executor);

        return;
      }

      // Tell executor it's registered and give it any queued tasks
      // or task groups.
      executor::Event event;
      event.set_type(executor::Event::SUBSCRIBED);

      executor::Event::Subscribed* subscribed = event.mutable_subscribed();
      subscribed->mutable_executor_info()->CopyFrom(executor->info);
      subscribed->mutable_framework_info()->MergeFrom(framework->info);
      subscribed->mutable_slave_info()->CopyFrom(info);
      subscribed->mutable_container_id()->CopyFrom(executor->containerId);

      executor->send(event);

      // Split the queued tasks between the task groups and tasks.
      // `queuedTasks` is a copy from which every task that belongs to a
      // queued task group is erased.
      LinkedHashMap<TaskID, TaskInfo> queuedTasks = executor->queuedTasks;

      foreach (const TaskGroupInfo& taskGroup, executor->queuedTaskGroups) {
        foreach (const TaskInfo& task, taskGroup.tasks()) {
          queuedTasks.erase(task.task_id());
        }
      }

      const ContainerID& containerId = executor->containerId;
      const Resources& resourceRequests = executor->allocatedResources();
      const google::protobuf::Map<string, Value::Scalar>& resourceLimits =
        computeExecutorLimits(
            executor->info.resources(),
            executor->queuedTasks.values(),
            executor->launchedTasks.values());

      Future<Nothing> resourcesPublished;
      if (executor->queuedTasks.empty()) {
        // Since no task is queued, all resources should have been published
        // before, so we skip resource publishing here. This avoids failures due
        // to unregistered resource providers during recovery (see MESOS-9711).
        //
        // NOTE: It is safe to not update the published resources when the
        // executor reduces its resource consumption (e.g., due to task
        // completion) because we don't require resources to be unpublished
        // after use. See comments in `publishResources` for details.
        resourcesPublished = Nothing();
      } else {
        resourcesPublished = publishResources(containerId, resourceRequests);
      }

      // Once resources are published, update the container and then continue
      // in `___run` with the remaining queued tasks and task groups.
      resourcesPublished
        .then(defer(
            self(),
            [this, containerId, resourceRequests, resourceLimits] {
              // NOTE: The executor struct could have been removed before
              // containerizer update, so we use the captured container ID,
              // resource requests and limits here. If this happens, the
              // containerizer would simply skip updating a destroyed container.
              return containerizer->update(
                  containerId, resourceRequests, resourceLimits);
        }))
        .onAny(defer(self(),
                     &Self::___run,
                     lambda::_1,
                     framework->id(),
                     executor->id,
                     executor->containerId,
                     queuedTasks.values(),
                     executor->queuedTaskGroups));

      break;
    }
    default:
      LOG(FATAL) << "Executor " << *executor << " is in unexpected state "
                 << executor->state;
      break;
  }
}


// Handles the registration message of a PID-based executor.
//
// `from` is the sender's PID; `frameworkId` and `executorId` identify the
// executor that is registering.
//
// A `ShutdownExecutorMessage` is sent in reply if the agent is recovering or
// terminating, the framework is unknown or terminating, the executor is
// unknown, or the executor is in any state other than REGISTERING. A
// REGISTERING executor transitions to RUNNING, its PID is stored, linked and
// (for checkpointing frameworks) checkpointed, an
// `ExecutorRegisteredMessage` is sent, and processing continues in `___run`
// once resources have been published and the containerizer has been updated.
void Slave::registerExecutor(
    const UPID& from,
    const FrameworkID& frameworkId,
    const ExecutorID& executorId)
{
  LOG(INFO) << "Got registration for executor '" << executorId
            << "' of framework " << frameworkId << " from "
            << stringify(from);

  CHECK(state == RECOVERING || state == DISCONNECTED ||
        state == RUNNING || state == TERMINATING)
    << state;

  if (state == RECOVERING) {
    LOG(WARNING) << "Shutting down executor '" << executorId
                 << "' of framework " << frameworkId
                 << " because the agent is still recovering";
    reply(ShutdownExecutorMessage());
    return;
  }

  if (state == TERMINATING) {
    LOG(WARNING) << "Shutting down executor '" << executorId
                 << "' of framework " << frameworkId
                 << " because the agent is terminating";
    reply(ShutdownExecutorMessage());
    return;
  }

  Framework* framework = getFramework(frameworkId);
  if (framework == nullptr) {
    LOG(WARNING) << "Shutting down executor '" << executorId
                 << "' as the framework " << frameworkId
                 << " does not exist";

    reply(ShutdownExecutorMessage());
    return;
  }

  CHECK(framework->state == Framework::RUNNING ||
        framework->state == Framework::TERMINATING)
    << framework->state;

  if (framework->state == Framework::TERMINATING) {
    LOG(WARNING) << "Shutting down executor '" << executorId
                 << "' as the framework " << frameworkId
                 << " is terminating";

    reply(ShutdownExecutorMessage());
    return;
  }

  Executor* executor = framework->getExecutor(executorId);

  // Check the status of the executor.
  if (executor == nullptr) {
    LOG(WARNING) << "Unexpected executor '" << executorId
                 << "' registering for framework " << frameworkId;
    reply(ShutdownExecutorMessage());
    return;
  }

  // Only a REGISTERING executor may register; every other state results in
  // a shutdown reply.
  switch (executor->state) {
    case Executor::TERMINATING:
    case Executor::TERMINATED:
      // TERMINATED is possible if the executor forks, the parent process
      // terminates and the child process (driver) tries to register!
    case Executor::RUNNING:
      LOG(WARNING) << "Shutting down executor " << *executor
                   << " because it is in unexpected state " << executor->state;
      reply(ShutdownExecutorMessage());
      break;
    case Executor::REGISTERING: {
      executor->state = Executor::RUNNING;

      // Save the pid for the executor.
      executor->pid = from;
      link(from);

      if (framework->info.checkpoint()) {
        // TODO(vinod): This checkpointing should be done
        // asynchronously as it is in the fast path of the slave!

        // Checkpoint the libprocess pid.
        string path = paths::getLibprocessPidPath(
            metaDir,
            info.id(),
            executor->frameworkId,
            executor->id,
            executor->containerId);

        VLOG(1) << "Checkpointing executor pid '"
                << executor->pid.get() << "' to '" << path << "'";
        CHECK_SOME(state::checkpoint(path, executor->pid.get()));
      }

      // Here, we kill the executor if it no longer has any task to run
      // (e.g., framework sent a `killTask()`). This is a workaround for those
      // single task executors (e.g., command executor) that do not have a
      // proper self terminating logic when they haven't received the task
      // within a timeout. Also note even if the agent restarts before sending
      // this shutdown message, it is safe because the executor driver shuts
      // down the executor if it gets disconnected from the agent before
      // registration.
      if (!executor->everSentTask() && executor->queuedTasks.empty()) {
        LOG(WARNING) << "Shutting down registering executor " << *executor
                     << " because it has no tasks to run";

        _shutdownExecutor(framework, executor);

        return;
      }

      // Tell executor it's registered and give it any queued tasks
      // or task groups.
      ExecutorRegisteredMessage message;
      message.mutable_executor_info()->MergeFrom(executor->info);
      message.mutable_framework_id()->MergeFrom(framework->id());
      message.mutable_framework_info()->MergeFrom(framework->info);
      message.mutable_slave_id()->MergeFrom(info.id());
      message.mutable_slave_info()->MergeFrom(info);
      executor->send(message);

      // Split the queued tasks between the task groups and tasks.
      // `queuedTasks` is a copy from which every task that belongs to a
      // queued task group is erased.
      LinkedHashMap<TaskID, TaskInfo> queuedTasks = executor->queuedTasks;

      foreach (const TaskGroupInfo& taskGroup, executor->queuedTaskGroups) {
        foreach (const TaskInfo& task, taskGroup.tasks()) {
          queuedTasks.erase(task.task_id());
        }
      }

      const ContainerID& containerId = executor->containerId;
      const Resources& resourceRequests = executor->allocatedResources();
      const google::protobuf::Map<string, Value::Scalar>& resourceLimits =
        computeExecutorLimits(
            executor->info.resources(),
            executor->queuedTasks.values(),
            executor->launchedTasks.values());

      // Publish the executor's resources, update the container, and then
      // continue in `___run` with the remaining queued tasks and task groups.
      publishResources(containerId, resourceRequests)
        .then(defer(
            self(),
            [this, containerId, resourceRequests, resourceLimits] {
              // NOTE: The executor struct could have been removed before
              // containerizer update, so we use the captured container ID,
              // resource requests and limits here. If this happens, the
              // containerizer would simply skip updating a destroyed container.
              return containerizer->update(
                  containerId, resourceRequests, resourceLimits);
        }))
        .onAny(defer(self(),
                     &Self::___run,
                     lambda::_1,
                     frameworkId,
                     executorId,
                     executor->containerId,
                     queuedTasks.values(),
                     executor->queuedTaskGroups));

      break;
    }
    default:
      LOG(FATAL) << "Executor " << *executor << " is in unexpected state "
                 << executor->state;
      break;
  }
}


// Handles a re-registration message sent by the executor at `from`.
//
// The executor is told to shut down (via `ShutdownExecutorMessage`) when:
//   - the agent is TERMINATING,
//   - the framework is unknown or TERMINATING,
//   - the executor is unknown, or
//   - the executor is already TERMINATING/TERMINATED.
//
// A RUNNING executor is shut down only when
// `flags.executor_reregistration_retry_interval` is unset; otherwise the
// message is ignored.
//
// A REGISTERING executor is transitioned to RUNNING: it is linked and
// acknowledged with `ExecutorReregisteredMessage`, its pending `updates`
// are replayed through `statusUpdate()`, the containerizer is asked to
// update the container's resources, and any task still in TASK_STAGING
// that is not listed in `tasks` is transitioned to TASK_DROPPED (or
// TASK_LOST for frameworks that are not partition-aware).
//
// `tasks` are the unacknowledged tasks known to the executor and
// `updates` are the status updates the executor considers pending.
void Slave::reregisterExecutor(
    const UPID& from,
    const FrameworkID& frameworkId,
    const ExecutorID& executorId,
    const vector<TaskInfo>& tasks,
    const vector<StatusUpdate>& updates)
{
  CHECK(state == RECOVERING || state == DISCONNECTED ||
        state == RUNNING || state == TERMINATING)
    << state;

  LOG(INFO) << "Received re-registration message from"
            << " executor '" << executorId << "'"
            << " of framework " << frameworkId;

  if (state == TERMINATING) {
    LOG(WARNING) << "Shutting down executor '" << executorId << "'"
                 << " of framework " << frameworkId
                 << " because the agent is terminating";

    reply(ShutdownExecutorMessage());
    return;
  }

  if (!frameworks.contains(frameworkId)) {
    LOG(WARNING) << "Shutting down executor '" << executorId << "'"
                 << " of framework " << frameworkId
                 << " because the framework is unknown";

    reply(ShutdownExecutorMessage());
    return;
  }

  Framework* framework = frameworks.at(frameworkId);

  CHECK(framework->state == Framework::RUNNING ||
        framework->state == Framework::TERMINATING)
    << framework->state;

  if (framework->state == Framework::TERMINATING) {
    LOG(WARNING) << "Shutting down executor '" << executorId << "'"
                 << " of framework " << frameworkId
                 << " because the framework is terminating";

    reply(ShutdownExecutorMessage());
    return;
  }

  Executor* executor = framework->getExecutor(executorId);

  if (executor == nullptr) {
    LOG(WARNING) << "Shutting down unknown executor '" << executorId << "'"
                 << " of framework " << frameworkId;

    reply(ShutdownExecutorMessage());
    return;
  }

  switch (executor->state) {
    case Executor::TERMINATING:
    case Executor::TERMINATED:
      // TERMINATED is possible if the executor forks, the parent process
      // terminates and the child process (driver) tries to register!
      LOG(WARNING) << "Shutting down executor " << *executor
                   << " because it is in unexpected state " << executor->state;
      reply(ShutdownExecutorMessage());
      break;

    case Executor::RUNNING:
      if (flags.executor_reregistration_retry_interval.isNone()) {
        // Previously, when an executor sends a re-registration while
        // in the RUNNING state, we would shut the executor down. We
        // preserve that behavior when the optional reconnect retry
        // is not enabled.
        LOG(WARNING) << "Shutting down executor " << *executor
                     << " because it is in unexpected state "
                     << executor->state;
        reply(ShutdownExecutorMessage());
      } else {
        // When the agent is configured to retry the reconnect requests
        // to executors, we ignore any further re-registrations. This
        // is because we can't easily handle reregistering libprocess
        // based executors in the steady state, and we plan to move to
        // only allowing v1 HTTP executors (where re-subscription in
        // the steady state is supported). Also, ignoring this message
        // ensures that any executors mimicking the libprocess protocol
        // do not have any illusion of being able to reregister without
        // an agent restart (hopefully they will terminate themselves
        // if they fail to reregister).
        LOG(WARNING) << "Ignoring executor re-registration message from "
                     << *executor << " because it is already registered";
      }
      break;

    case Executor::REGISTERING: {
      executor->state = Executor::RUNNING;

      executor->pid = from; // Update the pid.
      link(from);

      // Send re-registration message to the executor.
      ExecutorReregisteredMessage message;
      message.mutable_slave_id()->MergeFrom(info.id());
      message.mutable_slave_info()->MergeFrom(info);
      send(executor->pid.get(), message);

      // Handle all the pending updates.
      // The task status update manager might have already checkpointed
      // some of these pending updates (for example, if the slave died
      // right after it checkpointed the update but before it could send
      // the ACK to the executor). This is ok because the status update
      // manager correctly handles duplicate updates.
      foreach (const StatusUpdate& update, updates) {
        // NOTE: This also updates the executor's resources!
        statusUpdate(update, executor->pid.get());
      }

      // Tell the containerizer to update the resources. A failed or
      // discarded update is handled in `_reregisterExecutor()`.
      containerizer->update(
          executor->containerId,
          executor->allocatedResources(),
          computeExecutorLimits(
              executor->info.resources(),
              executor->queuedTasks.values(),
              executor->launchedTasks.values()))
        .onAny(defer(self(),
                     &Self::_reregisterExecutor,
                     lambda::_1,
                     frameworkId,
                     executorId,
                     executor->containerId));

      // Index the tasks reported by the executor by task ID so that the
      // loop below can check membership.
      hashmap<TaskID, TaskInfo> unackedTasks;
      foreach (const TaskInfo& task, tasks) {
        unackedTasks[task.task_id()] = task;
      }

      // Now, if there is any task still in STAGING state and not in
      // unacknowledged 'tasks' known to the executor, the slave must
      // have died before the executor received the task! We should
      // transition it to TASK_DROPPED. We only consider/store
      // unacknowledged 'tasks' at the executor driver because if a
      // task has been acknowledged, the slave must have received
      // an update for that task and transitioned it out of STAGING!
      //
      // TODO(vinod): Consider checkpointing 'TaskInfo' instead of
      // 'Task' so that we can relaunch such tasks! Currently we
      // don't do it because 'TaskInfo.data' could be huge.
      foreach (Task* task, executor->launchedTasks.values()) {
        if (task->state() == TASK_STAGING &&
            !unackedTasks.contains(task->task_id())) {
          // Frameworks that are not partition-aware get TASK_LOST
          // instead of TASK_DROPPED.
          mesos::TaskState newTaskState = TASK_DROPPED;
          if (!protobuf::frameworkHasCapability(
                  framework->info,
                  FrameworkInfo::Capability::PARTITION_AWARE)) {
            newTaskState = TASK_LOST;
          }

          LOG(INFO) << "Transitioning STAGED task " << task->task_id()
                    << " to " << newTaskState
                    << " because it is unknown to the executor '"
                    << executorId << "'";

          const StatusUpdate update = protobuf::createStatusUpdate(
              frameworkId,
              info.id(),
              task->task_id(),
              newTaskState,
              TaskStatus::SOURCE_SLAVE,
              id::UUID::random(),
              "Task launched during agent restart",
              TaskStatus::REASON_SLAVE_RESTARTED,
              executorId);

          // An empty `UPID` marks the update as generated by the agent.
          statusUpdate(update, UPID());
        }
      }

      // Shutdown the executor if all of its initial tasks are killed.
      // This is a workaround for those executors (e.g.,
      // command executor, default executor) that do not have a proper
      // self terminating logic when they haven't received the task or
      // task group within a timeout.
      if (!executor->everSentTask() && executor->queuedTasks.empty()) {
        LOG(WARNING) << "Shutting down reregistering executor " << *executor
                     << " because it has no tasks to run and"
                     << " has never been sent a task";

        _shutdownExecutor(framework, executor);

        return;
      }

      break;
    }
    default:
      // Mirror the other executor state switches in this file: an
      // unexpected state is a programming error, not something to
      // silently ignore.
      LOG(FATAL) << "Executor " << *executor << " is in unexpected state "
                 << executor->state;
      break;
  }
}


void Slave::_reregisterExecutor(
    const Future<Nothing>& future,
    const FrameworkID& frameworkId,
    const ExecutorID& executorId,
    const ContainerID& containerId)
{
  if (!future.isReady()) {
    LOG(ERROR) << "Failed to update resources for container " << containerId
               << " of executor '" << executorId
               << "' of framework " << frameworkId
               << ", destroying container: "
               << (future.isFailed() ? future.failure() : "discarded");

    containerizer->destroy(containerId);

    Executor* executor = getExecutor(frameworkId, executorId);
    if (executor != nullptr) {
      Framework* framework = getFramework(frameworkId);
      CHECK_NOTNULL(framework);

      // Send TASK_GONE because the task was started but has now
      // been terminated. If the framework is not partition-aware,
      // we send TASK_LOST instead for backward compatibility.
      mesos::TaskState taskState = TASK_GONE;
      if (!framework->capabilities.partitionAware) {
        taskState = TASK_LOST;
      }

      ContainerTermination termination;
      termination.set_state(taskState);
      termination.set_reason(TaskStatus::REASON_CONTAINER_UPDATE_FAILED);
      termination.set_message(
          "Failed to update resources for container: " +
          (future.isFailed() ? future.failure() : "discarded"));

      executor->pendingTermination = termination;

      // TODO(jieyu): Set executor->state to be TERMINATING.
    }
  }
}


void Slave::reregisterExecutorTimeout()
{
  CHECK(state == RECOVERING || state == TERMINATING) << state;

  LOG(INFO) << "Cleaning up un-reregistered executors";

  foreachvalue (Framework* framework, frameworks) {
    CHECK(framework->state == Framework::RUNNING ||
          framework->state == Framework::TERMINATING)
      << framework->state;

    foreachvalue (Executor* executor, framework->executors) {
      switch (executor->state) {
        case Executor::RUNNING:     // Executor reregistered.
        case Executor::TERMINATING:
        case Executor::TERMINATED:
          break;
        case Executor::REGISTERING: {
          // If we are here, the executor must have been hung and not
          // exited! This is because if the executor properly exited,
          // it should have already been identified by the isolator
          // (via the reaper) and cleaned up!
          LOG(INFO) << "Killing un-reregistered executor " << *executor;

          containerizer->destroy(executor->containerId);

          executor->state = Executor::TERMINATING;

          // Send TASK_GONE because the task was started but has now
          // been terminated. If the framework is not partition-aware,
          // we send TASK_LOST instead for backward compatibility.
          mesos::TaskState taskState = TASK_GONE;
          if (!protobuf::frameworkHasCapability(
                  framework->info,
                  FrameworkInfo::Capability::PARTITION_AWARE)) {
            taskState = TASK_LOST;
          }

          ContainerTermination termination;
          termination.set_state(taskState);
          termination.set_reason(
              TaskStatus::REASON_EXECUTOR_REREGISTRATION_TIMEOUT);
          termination.set_message(
              "Executor did not reregister within " +
              stringify(flags.executor_reregistration_timeout));

          executor->pendingTermination = termination;
          break;
        }
        default:
          LOG(FATAL) << "Executor " << *executor << " is in unexpected state "
                     << executor->state;
          break;
      }
    }
  }

  // Replay any active draining.
  if (drainConfig.isSome()) {
    DrainSlaveMessage drainSlaveMessage;
    *drainSlaveMessage.mutable_config() = *drainConfig;

    LOG(INFO) << "Replaying in-process agent draining";

    drain(self(), std::move(drainSlaveMessage));
  }

  // Signal the end of recovery.
  // TODO(greggomann): Allow the agent to complete recovery before the executor
  // re-registration timeout has elapsed. See MESOS-7539
  recoveryInfo.recovered.set(Nothing());
}


// Entry point for task status updates on the agent.
//
// This can be called in two ways:
// 1) When a status update from the executor is received.
// 2) When slave generates task updates (e.g., LOST/KILLED/FAILED).
//
// NOTE: We set the pid in 'Slave::___statusUpdate()' to 'pid' so that
// whoever sent this update will get an ACK. This is important because
// we allow executors to send updates for tasks that belong to other
// executors. Currently we allow this because we cannot guarantee
// reliable delivery of status updates. Since executor driver caches
// unacked updates it is important that whoever sent the update gets
// acknowledgement for it.
//
// `update` is taken by value because it is normalized in place (uuid,
// source, executor ID, draining reason/state, hook decorations) before
// being passed on. `pid` equal to a default-constructed `UPID` marks the
// update as agent-generated (`SOURCE_SLAVE`); any other value marks it
// as `SOURCE_EXECUTOR`.
//
// Updates that fail validation (missing uuid, agent ID mismatch,
// unknown or terminating framework) are dropped and counted in
// `metrics.invalid_status_updates`.
void Slave::statusUpdate(StatusUpdate update, const Option<UPID>& pid)
{
  LOG(INFO) << "Handling status update " << update
            << (pid.isSome() ? " from " + stringify(pid.get()) : "");

  CHECK(state == RECOVERING || state == DISCONNECTED ||
        state == RUNNING || state == TERMINATING)
    << state;

  if (!update.has_uuid()) {
    LOG(WARNING) << "Ignoring status update " << update << " without 'uuid'";
    metrics.invalid_status_updates++;
    return;
  }

  if (update.slave_id() != info.id()) {
    LOG(WARNING) << "Ignoring status update " << update << " due to "
                 << "Slave ID mismatch; expected '" << info.id()
                 << "', received '" << update.slave_id() << "'";
    metrics.invalid_status_updates++;
    return;
  }

  if (update.status().slave_id() != info.id()) {
    LOG(WARNING) << "Ignoring status update " << update << " due to "
                 << "Slave ID mismatch; expected '" << info.id()
                 << "', received '" << update.status().slave_id() << "'";
    metrics.invalid_status_updates++;
    return;
  }

  // TODO(bmahler): With the HTTP API, we must validate the UUID
  // inside the TaskStatus. For now, we ensure that the uuid of task
  // status matches the update's uuid, in case the executor is using
  // pre 0.23.x driver.
  update.mutable_status()->set_uuid(update.uuid());

  // Set the source before forwarding the status update.
  update.mutable_status()->set_source(
      pid == UPID() ? TaskStatus::SOURCE_SLAVE : TaskStatus::SOURCE_EXECUTOR);

  // Set TaskStatus.executor_id if not already set; overwrite existing
  // value if already set.
  if (update.has_executor_id()) {
    if (update.status().has_executor_id() &&
        update.status().executor_id() != update.executor_id()) {
      LOG(WARNING) << "Executor ID mismatch in status update"
                   << (pid.isSome() ? " from " + stringify(pid.get()) : "")
                   << "; overwriting received '"
                   << update.status().executor_id() << "' with expected '"
                   << update.executor_id() << "'";
    }
    update.mutable_status()->mutable_executor_id()->CopyFrom(
        update.executor_id());
  }

  Framework* framework = getFramework(update.framework_id());
  if (framework == nullptr) {
    LOG(WARNING) << "Ignoring status update " << update
                 << " for unknown framework " << update.framework_id();
    metrics.invalid_status_updates++;
    return;
  }

  CHECK(framework->state == Framework::RUNNING ||
        framework->state == Framework::TERMINATING)
    << framework->state;

  // We don't send update when a framework is terminating because
  // it cannot send acknowledgements.
  if (framework->state == Framework::TERMINATING) {
    LOG(WARNING) << "Ignoring status update " << update
                 << " for terminating framework " << framework->id();
    metrics.invalid_status_updates++;
    return;
  }

  // If the agent is draining we provide additional
  // information for KILLING or KILLED states.
  if (drainConfig.isSome()) {
    switch (update.status().state()) {
      case TASK_STAGING:
      case TASK_STARTING:
      case TASK_RUNNING:
      case TASK_FAILED:
      case TASK_FINISHED:
      case TASK_ERROR:
      case TASK_LOST:
      case TASK_DROPPED:
      case TASK_UNREACHABLE:
      case TASK_GONE:
      case TASK_GONE_BY_OPERATOR:
      case TASK_UNKNOWN: {
        break;
      }
      case TASK_KILLING:
      case TASK_KILLED: {
        // We unconditionally overwrite any previous reason to provide a
        // consistent signal that this task went away during draining.
        update.mutable_status()->set_reason(TaskStatus::REASON_SLAVE_DRAINING);

        // If the draining marks the agent as gone report tasks as
        // gone by operator.
        if (drainConfig->mark_gone() &&
            framework->capabilities.partitionAware) {
          update.mutable_status()->set_state(TASK_GONE_BY_OPERATOR);
        }
        break;
      }
    }
  }

  if (HookManager::hooksAvailable()) {
    // Even though the hook(s) return a TaskStatus, we only use two fields:
    // container_status and labels. Remaining fields are discarded.
    TaskStatus statusFromHooks =
      HookManager::slaveTaskStatusDecorator(
          update.framework_id(), update.status());
    if (statusFromHooks.has_labels()) {
      update.mutable_status()->mutable_labels()->CopyFrom(
          statusFromHooks.labels());
    }

    if (statusFromHooks.has_container_status()) {
      update.mutable_status()->mutable_container_status()->CopyFrom(
          statusFromHooks.container_status());
    }
  }

  // From here on the status is only read; all normalization of `update`
  // happened above.
  const TaskStatus& status = update.status();

  // For pending tasks, we must synchronously remove them
  // to guarantee that the launch is prevented.
  //
  // TODO(bmahler): Ideally we store this task as terminated
  // but with unacknowledged updates (same as the `Executor`
  // struct does).
  if (framework->isPending(status.task_id())) {
    CHECK(framework->removePendingTask(status.task_id()));

    if (framework->idle()) {
      removeFramework(framework);
    }

    metrics.valid_status_updates++;

    taskStatusUpdateManager->update(update, info.id())
      .onAny(defer(self(), &Slave::___statusUpdate, lambda::_1, update, pid));

    return;
  }

  Executor* executor = framework->getExecutor(status.task_id());
  if (executor == nullptr) {
    LOG(WARNING) << "Could not find the executor for "
                 << "status update " << update;
    metrics.valid_status_updates++;

    // NOTE: We forward the update here because this update could be
    // generated by the slave when the executor is unknown to it
    // (e.g., killTask(), _run()) or sent by an executor for a
    // task that belongs to another executor.
    // We also end up here if 1) the previous slave died after
    // checkpointing a _terminal_ update but before it could send an
    // ACK to the executor AND 2) after recovery the status update
    // manager successfully retried the update, got the ACK from the
    // scheduler and cleaned up the stream before the executor
    // reregistered. In this case, the slave cannot find the executor
    // corresponding to this task because the task has been moved to
    // 'Executor::completedTasks'.
    //
    // NOTE: We do not set the `ContainerStatus` (including the
    // `NetworkInfo` within the `ContainerStatus`) for this case,
    // because the container is unknown. We cannot use the slave IP
    // address here (for the `NetworkInfo`) since we do not know the
    // type of network isolation used for this container.
    taskStatusUpdateManager->update(update, info.id())
      .onAny(defer(self(), &Slave::___statusUpdate, lambda::_1, update, pid));

    return;
  }

  CHECK(executor->state == Executor::REGISTERING ||
        executor->state == Executor::RUNNING ||
        executor->state == Executor::TERMINATING ||
        executor->state == Executor::TERMINATED)
    << executor->state;

  // Failing this validation on the executor driver used to cause the
  // driver to abort. Now that the validation is done by the slave, it
  // should shutdown the executor to be consistent.
  //
  // TODO(arojas): Once the HTTP API is the default, return a
  // 400 Bad Request response, indicating the reason in the body.
  if (status.source() == TaskStatus::SOURCE_EXECUTOR &&
      status.state() == TASK_STAGING) {
    LOG(ERROR) << "Received TASK_STAGING from executor " << *executor
               << " which is not allowed. Shutting down the executor";

    _shutdownExecutor(framework, executor);
    return;
  }

  // TODO(vinod): Revisit these semantics when we disallow executors
  // from sending updates for tasks that belong to other executors.
  if (pid.isSome() &&
      pid != UPID() &&
      executor->pid.isSome() &&
      executor->pid != pid) {
    LOG(WARNING) << "Received status update " << update << " from " << pid.get()
                 << " on behalf of a different executor '" << executor->id
                 << "' (" << executor->pid.get() << ")";
  }

  metrics.valid_status_updates++;

  executor->addPendingTaskStatus(status);

  // Before sending update, we need to retrieve the container status
  // if the task reached the executor. For tasks that are queued, we
  // do not need to send the container status and we must
  // synchronously transition the task to ensure that it is removed
  // from the queued tasks before the run task path continues.
  //
  // Also if the task is in `launchedTasks` but was dropped by the
  // agent, we know that the task did not reach the executor. We
  // will synchronously transition the task to ensure that the
  // agent re-registration logic can call `everSentTask()` after
  // dropping tasks.
  if (executor->queuedTasks.contains(status.task_id())) {
    CHECK(protobuf::isTerminalState(status.state()))
        << "Queued tasks can only be transitioned to terminal states";

    _statusUpdate(update, pid, executor->id, None());
  } else if (executor->launchedTasks.contains(status.task_id()) &&
            (status.state() == TASK_DROPPED || status.state() == TASK_LOST) &&
            status.source() == TaskStatus::SOURCE_SLAVE) {
    _statusUpdate(update, pid, executor->id, None());
  } else {
    // NOTE: If the executor sets the ContainerID inside the
    // ContainerStatus, that indicates that the Task this status update
    // is associated with is tied to that container (could be nested).
    // Therefore, we need to get the status of that container, instead
    // of the top level executor container.
    ContainerID containerId = executor->containerId;
    if (update.status().has_container_status() &&
        update.status().container_status().has_container_id()) {
      containerId = update.status().container_status().container_id();
    }

    containerizer->status(containerId)
      .onAny(defer(self(),
                   &Slave::_statusUpdate,
                   update,
                   pid,
                   executor->id,
                   lambda::_1));
  }
}


// Continuation of `statusUpdate()`.
//
// Merges `containerStatus` into the update when it is present and ready,
// records the new task state on the executor identified by `executorId`,
// and hands the update to `__statusUpdate()`. For terminal states this
// happens only after the containerizer has been asked to update the
// container's resources; for other states it happens immediately.
//
// `containerStatus` is `None()` when the caller did not query the
// containerizer. The update is dropped if the executor no longer exists.
// If the executor rejects the state transition, the update is still
// acknowledged via `___statusUpdate()` so that the sender does not retry.
void Slave::_statusUpdate(
    StatusUpdate update,
    const Option<process::UPID>& pid,
    const ExecutorID& executorId,
    const Option<Future<ContainerStatus>>& containerStatus)
{
  // There can be cases where a container is already removed from the
  // containerizer before the `status` call is dispatched to the
  // containerizer, leading to the failure of the returned `Future`.
  // In such a case we should simply not update the `ContainerStatus`
  // with the return `Future` but continue processing the
  // `StatusUpdate`.
  if (containerStatus.isSome() && containerStatus->isReady()) {
    ContainerStatus* status =
      update.mutable_status()->mutable_container_status();

    status->MergeFrom(containerStatus->get());

    // Fill in the container IP address with the IP from the agent
    // PID, if not already filled in.
    //
    // TODO(karya): Fill in the IP address by looking up the executor PID.
    if (status->network_infos().size() == 0) {
      NetworkInfo* networkInfo = status->add_network_infos();
      NetworkInfo::IPAddress* ipAddress = networkInfo->add_ip_addresses();

      // Set up IPv4 address.
      //
      // NOTE: By default the protocol is set to IPv4 and therefore we
      // don't explicitly set the protocol here.
      ipAddress->set_ip_address(stringify(self().address.ip));

      // Set up IPv6 address.
      if (self().addresses.v6.isSome()) {
        ipAddress = networkInfo->add_ip_addresses();
        ipAddress->set_ip_address(stringify(self().addresses.v6->ip));
        ipAddress->set_protocol(NetworkInfo::IPv6);
      }
    }
  }

  const TaskStatus& status = update.status();

  // The executor is looked up again by ID because this function may run
  // as a deferred continuation.
  Executor* executor = getExecutor(update.framework_id(), executorId);
  if (executor == nullptr) {
    LOG(WARNING) << "Ignoring container status update for framework "
                 << update.framework_id()
                 << " for a non-existent executor";
    return;
  }

  // We set the latest state of the task here so that the slave can
  // inform the master about the latest state (via status update or
  // ReregisterSlaveMessage message) as soon as possible. Master can use
  // this information, for example, to release resources as soon as the
  // latest state of the task reaches a terminal state. This is
  // important because task status update manager queues updates and
  // only sends one update per task at a time; the next update for a
  // task is sent only after the acknowledgement for the previous one is
  // received, which could take a long time if the framework is backed
  // up or is down.
  Try<Nothing> updated = executor->updateTaskState(status);

  // If we fail to update the task state, drop the update. Note that
  // we have to acknowledge the executor so that it does not retry.
  if (updated.isError()) {
    LOG(ERROR) << "Failed to update state of task '" << status.task_id() << "'"
               << " to " << status.state() << ": " << updated.error();

    // NOTE: This may lead to out-of-order acknowledgements since other
    // update acknowledgements may be waiting for the containerizer or
    // task status update manager.
    ___statusUpdate(Nothing(), update, pid);
    return;
  }

  if (protobuf::isTerminalState(status.state())) {
    // If the task terminated, wait until the container's resources
    // have been updated before sending the status update. Note that
    // duplicate terminal updates are not possible here because they
    // lead to an error from `Executor::updateTaskState`.
    containerizer->update(
        executor->containerId,
        executor->allocatedResources(),
        computeExecutorLimits(
            executor->info.resources(),
            executor->queuedTasks.values(),
            executor->launchedTasks.values()))
      .onAny(defer(self(),
                   &Slave::__statusUpdate,
                   lambda::_1,
                   update,
                   pid,
                   executor->id,
                   executor->containerId,
                   executor->checkpoint));
  } else {
    // Immediately send the status update.
    __statusUpdate(None(),
                   update,
                   pid,
                   executor->id,
                   executor->containerId,
                   executor->checkpoint);
  }
}


void Slave::__statusUpdate(
    const Option<Future<Nothing>>& future,
    const StatusUpdate& update,
    const Option<UPID>& pid,
    const ExecutorID& executorId,
    const ContainerID& containerId,
    bool checkpoint)
{
  if (future.isSome() && !future->isReady()) {
    LOG(ERROR) << "Failed to update resources for container " << containerId
               << " of executor '" << executorId
               << "' running task " << update.status().task_id()
               << " on status update for terminal task, destroying container: "
               << (future->isFailed() ? future->failure() : "discarded");

    containerizer->destroy(containerId);

    Executor* executor = getExecutor(update.framework_id(), executorId);
    if (executor != nullptr) {
      Framework* framework = getFramework(update.framework_id());
      CHECK_NOTNULL(framework);

      // Send TASK_GONE because the task was started but has now
      // been terminated. If the framework is not partition-aware,
      // we send TASK_LOST instead for backward compatibility.
      mesos::TaskState taskState = TASK_GONE;
      if (!framework->capabilities.partitionAware) {
        taskState = TASK_LOST;
      }

      ContainerTermination termination;
      termination.set_state(taskState);
      termination.set_reason(TaskStatus::REASON_CONTAINER_UPDATE_FAILED);
      termination.set_message(
          "Failed to update resources for container: " +
          (future->isFailed() ? future->failure() : "discarded"));

      executor->pendingTermination = termination;

      // TODO(jieyu): Set executor->state to be TERMINATING.
    }
  }

  if (checkpoint) {
    // Ask the task status update manager to checkpoint and reliably send the
    // update.
    taskStatusUpdateManager->update(update, info.id(), executorId, containerId)
      .onAny(defer(self(), &Slave::___statusUpdate, lambda::_1, update, pid));
  } else {
    // Ask the task status update manager to just retry the update.
    taskStatusUpdateManager->update(update, info.id())
      .onAny(defer(self(), &Slave::___statusUpdate, lambda::_1, update, pid));
  }
}


void Slave::___statusUpdate(
    const Future<Nothing>& future,
    const StatusUpdate& update,
    const Option<UPID>& pid)
{
  CHECK_READY(future) << "Failed to handle status update " << update;

  VLOG(1) << "Task status update manager successfully handled status update "
          << update;

  const TaskStatus& status = update.status();

  Executor* executor = nullptr;
  Framework* framework = getFramework(update.framework_id());
  if (framework != nullptr) {
    executor = framework->getExecutor(status.task_id());
    if (executor != nullptr) {
      executor->removePendingTaskStatus(status);
    }
  }

  if (pid == UPID()) {
    return;
  }

  StatusUpdateAcknowledgementMessage message;
  message.mutable_framework_id()->MergeFrom(update.framework_id());
  message.mutable_slave_id()->MergeFrom(update.slave_id());
  message.mutable_task_id()->MergeFrom(status.task_id());
  message.set_uuid(update.uuid());

  // Task status update manager successfully handled the status update.
  // Acknowledge the executor, if we have a valid pid.
  if (pid.isSome()) {
    LOG(INFO) << "Sending acknowledgement for status update " << update
              << " to " << pid.get();

    send(pid.get(), message);
  } else {
    // Acknowledge the HTTP based executor.
    if (framework == nullptr) {
      LOG(WARNING) << "Ignoring sending acknowledgement for status update "
                   << update << " of unknown framework";
      return;
    }

    if (executor == nullptr) {
      // Refer to the comments in 'statusUpdate()' on when this can
      // happen.
      LOG(WARNING) << "Ignoring sending acknowledgement for status update "
                   << update << " of unknown executor";
      return;
    }

    executor->send(message);
  }
}


// Forwards a status update coming from the task status update manager to
// the master, after recording the update on the corresponding task (if the
// agent still knows about it).
//
// NOTE: An acknowledgement for this update might have already been
// processed by the slave but not the task status update manager.
void Slave::forward(StatusUpdate update)
{
  CHECK(state == RECOVERING || state == DISCONNECTED ||
        state == RUNNING || state == TERMINATING)
    << state;

  // Updates are only forwarded while the agent is RUNNING.
  if (state != RUNNING) {
    LOG(WARNING) << "Dropping status update " << update
                 << " sent by task status update manager because the agent"
                 << " is in " << state << " state";
    return;
  }

  // Make sure the task status uuid is set even if this update was sent
  // by the task status update manager after recovering the updates of a
  // pre 0.23.x slave/executor driver. This lets the master code
  // (in >= 0.27.0) assume the uuid is always set for retryable updates.
  CHECK(update.has_uuid())
    << "Expecting updates without 'uuid' to have been rejected";

  update.mutable_status()->set_uuid(update.uuid());

  const TaskID& taskId = update.status().task_id();

  // Locate the task this update refers to. Any of the framework, the
  // executor or the task may be unknown at this point.
  Framework* framework = getFramework(update.framework_id());

  Executor* executor =
    framework == nullptr ? nullptr : framework->getExecutor(taskId);

  // NOTE: Queued tasks are not considered because no update is expected
  // for a task until it is launched. Completed tasks are not considered
  // either because the state of a completed task shouldn't be changed.
  Task* task = nullptr;
  if (executor != nullptr) {
    if (executor->launchedTasks.contains(taskId)) {
      task = executor->launchedTasks[taskId];
    } else if (executor->terminatedTasks.contains(taskId)) {
      task = executor->terminatedTasks[taskId];
    }
  }

  if (task != nullptr) {
    // The status update state of the task is set here because in steady
    // state the master updates it when it receives this update. If the
    // master fails over, the slave reregisters with this task in this
    // status update state. An acknowledgement for this update might
    // already be enqueued on the task status update manager when we get
    // here; that is ok because the status update state will be updated
    // when the next update is forwarded to the slave.
    task->set_status_update_state(update.status().state());
    task->set_status_update_uuid(update.uuid());

    // Include the latest state of the task in the update. See the
    // comments in 'statusUpdate()' on why informing the master about
    // the latest state of the task is important.
    update.set_latest_state(task->state());
  }

  CHECK_SOME(master);
  LOG(INFO) << "Forwarding the update " << update << " to " << master.get();

  // NOTE: The update is forwarded even if the framework/executor/task
  // doesn't exist because the task status update manager will be
  // expecting an acknowledgement for it. This could happen for example
  // if this is a retried terminal update and, before getting here, the
  // slave has already processed the acknowledgement of the original
  // update and removed the framework/executor/task. Also, slave
  // re-registration can generate updates when framework/executor/task
  // are unknown.
  StatusUpdateMessage message;
  message.mutable_update()->MergeFrom(update);
  message.set_pid(self()); // The ACK will be first received by the slave.

  send(master.get(), message);
}


void Slave::sendOperationStatusUpdate(
    const UpdateOperationStatusMessage& update)
{
  const UUID& operationUUID = update.operation_uuid();

  Operation* operation = getOperation(operationUUID);

  // TODO(greggomann): Make a note here of which cases may lead to
  // the operation being unknown by the agent.
  if (operation != nullptr) {
    updateOperation(operation, update);
  }

  switch (state) {
    case RECOVERING:
    case DISCONNECTED:
    case TERMINATING: {
      LOG(WARNING)
        << "Dropping status update of operation"
        << (update.status().has_operation_id()
             ? " '" + stringify(update.status().operation_id()) + "'"
             : " with no ID")
        << " (operation_uuid: " << operationUUID << ")"
        << (update.has_framework_id()
             ? " for framework " + stringify(update.framework_id())
             : " for an operator API call")
        << " because agent is in " << state << " state";
      break;
    }
    case RUNNING: {
      LOG(INFO)
        << "Forwarding status update of"
        << (operation == nullptr ? " unknown" : "") << " operation"
        << (update.status().has_operation_id()
             ? " '" + stringify(update.status().operation_id()) + "'"
             : " with no ID")
        << " (operation_uuid: " << operationUUID << ")"
        << (update.has_framework_id()
             ? " for framework " + stringify(update.framework_id())
             : " for an operator API call");

      send(master.get(), update);
      break;
    }
  }
}


void Slave::executorMessage(
    const SlaveID& slaveId,
    const FrameworkID& frameworkId,
    const ExecutorID& executorId,
    const string& data)
{
  CHECK(state == RECOVERING || state == DISCONNECTED ||
        state == RUNNING || state == TERMINATING)
    << state;

  if (state != RUNNING) {
    LOG(WARNING) << "Dropping framework message from executor '"
                 << executorId << "' to framework " << frameworkId
                 << " because the agent is in " << state << " state";
    metrics.invalid_framework_messages++;
    return;
  }

  Framework* framework = getFramework(frameworkId);
  if (framework == nullptr) {
    LOG(WARNING) << "Cannot send framework message from executor '"
                 << executorId << "' to framework " << frameworkId
                 << " because framework does not exist";
    metrics.invalid_framework_messages++;
    return;
  }

  CHECK(framework->state == Framework::RUNNING ||
        framework->state == Framework::TERMINATING)
    << framework->state;

  if (framework->state == Framework::TERMINATING) {
    LOG(WARNING) << "Ignoring framework message from executor '"
                 << executorId << "' to framework " << frameworkId
                 << " because framework is terminating";
    metrics.invalid_framework_messages++;
    return;
  }

  ExecutorToFrameworkMessage message;
  message.mutable_slave_id()->MergeFrom(slaveId);
  message.mutable_framework_id()->MergeFrom(frameworkId);
  message.mutable_executor_id()->MergeFrom(executorId);
  message.set_data(data);

  CHECK_SOME(master);

  if (framework->pid.isSome()) {
    LOG(INFO) << "Sending message for framework " << frameworkId
              << " to " << framework->pid.get();
    send(framework->pid.get(), message);
  } else {
    LOG(INFO) << "Sending message for framework " << frameworkId
              << " through the master " << master.get();
    send(master.get(), message);
  }

  metrics.valid_framework_messages++;
}


// Handles a ping from the master: re-arms the ping timeout and answers
// with a pong.
//
// NOTE: Pings are answered even when the agent is not in the RUNNING
// state, because agent recovery might take longer than the master's ping
// timeout and we don't want to cause cluster churn by having such agents
// marked unreachable. If the master sees a broken agent socket, it waits
// `agent_reregister_timeout` for the agent to reregister, which implies
// that recovery should finish within that (more generous) timeout.
void Slave::ping(const UPID& from, bool connected)
{
  VLOG(2) << "Received ping from " << from;

  if (state == RUNNING && !connected) {
    // A one-way partition between the master and the slave can lead to
    // this: the master gets an exited event and marks the slave as
    // disconnected while the slave still believes it is connected.
    // Force a re-registration with the master to reconcile.
    LOG(INFO) << "Master marked the agent as disconnected but the agent"
              << " considers itself registered! Forcing re-registration.";
    detection.discard();
  }

  // A ping just arrived from the master, so restart the ping timer.
  Clock::cancel(pingTimer);

  pingTimer =
    delay(masterPingTimeout, self(), &Slave::pingTimeout, detection);

  send(from, PongSlaveMessage());
}


// Fired when the ping timer elapses. Discards `future` (the master
// detection future passed in by `ping()`) if no newer ping has re-armed
// the timer in the meantime.
void Slave::pingTimeout(Future<Option<MasterInfo>> future)
{
  // A new ping may have arrived after this timeout fired but before it
  // could be cancelled. In that case the current timer has not expired
  // and there is no point in trying to re-detect.
  if (!pingTimer.timeout().expired()) {
    return;
  }

  LOG(INFO) << "No pings from master received within "
            << masterPingTimeout;

  future.discard();
}


// Logs the exit of a linked process. A warning is emitted when the agent
// has no master or when the exited process is the current master.
void Slave::exited(const UPID& pid)
{
  LOG(INFO) << "Got exited event for " << pid;

  const bool masterExited = master.isNone() || master.get() == pid;
  if (!masterExited) {
    return;
  }

  // TODO(neilc): Try to re-link to the master (MESOS-1963).
  // TODO(benh): After so long waiting for a master, commit suicide.
  LOG(WARNING) << "Master disconnected!"
               << " Waiting for a new master to be elected";
}


// Returns the framework registered under `frameworkId`, or `nullptr` if
// the agent does not know about such a framework.
Framework* Slave::getFramework(const FrameworkID& frameworkId) const
{
  auto it = frameworks.find(frameworkId);

  return it != frameworks.end() ? it->second : nullptr;
}


// Returns the executor `executorId` of framework `frameworkId`, or
// `nullptr` when the framework is unknown. For a known framework the
// result is whatever `Framework::getExecutor()` returns.
Executor* Slave::getExecutor(
    const FrameworkID& frameworkId,
    const ExecutorID& executorId) const
{
  Framework* framework = getFramework(frameworkId);
  if (framework == nullptr) {
    return nullptr;
  }

  return framework->getExecutor(executorId);
}


// Returns the executor whose container is the root container of
// `containerId`, or `nullptr` if no executor of any framework matches.
Executor* Slave::getExecutor(const ContainerID& containerId) const
{
  const ContainerID root = protobuf::getRootContainerId(containerId);

  // There is no index from container id to executor, so simply scan all
  // executors of all frameworks. This likely won't have a significant
  // performance impact given the low number of executors per agent.
  foreachvalue (Framework* framework, frameworks) {
    foreachvalue (Executor* candidate, framework->executors) {
      if (candidate->containerId == root) {
        return candidate;
      }
    }
  }

  return nullptr;
}


// Returns the ExecutorInfo to use for `task`. If the task carries its own
// ExecutorInfo it is returned unchanged; otherwise an ExecutorInfo for a
// command executor (`mesos-executor`) is generated from the task's
// CommandInfo, container, labels, discovery info and kill policy.
ExecutorInfo Slave::getExecutorInfo(
    const FrameworkInfo& frameworkInfo,
    const TaskInfo& task) const
{
  // For tasks launched as part of a task group, the master injects the
  // task group's ExecutorInfo into each TaskInfo; hand it back as is.
  if (task.has_executor()) {
    return task.executor();
  }

  const CommandInfo& command = task.command();
  const string& taskId = task.task_id().value();

  ExecutorInfo executor;

  // A command executor shares its id with the task.
  executor.mutable_executor_id()->set_value(taskId);
  executor.mutable_framework_id()->CopyFrom(frameworkInfo.id());

  if (task.has_container()) {
    // Keep the container info in the executor info so that it gets
    // checkpointed. This allows the correct containerizer to recover
    // this task on restart.
    executor.mutable_container()->CopyFrom(task.container());
  }

  // Shortens the command shown in the executor name: anything longer
  // than 15 characters is cut to its first 12 followed by "...".
  auto abbreviate = [](const string& text) -> string {
    return text.length() > 15 ? text.substr(0, 12) + "..." : text;
  };

  // Build an executor name that describes the command being launched.
  string name = "(Task: " + taskId + ") ";

  if (!command.has_value()) {
    name += command.shell()
      ? "(Command: NO COMMAND)"
      : "(Command: NO EXECUTABLE)";
  } else if (command.shell()) {
#ifdef __WINDOWS__
    name += "(Command: cmd /c '";
#else
    name += "(Command: sh -c '";
#endif // __WINDOWS__
    name += abbreviate(command.value()) + "')";
  } else {
    const string args =
      command.value() + ", " + strings::join(", ", command.arguments());

    name += "(Command: [" + abbreviate(args) + "])";
  }

  executor.set_name("Command Executor " + name);
  executor.set_source(taskId);

  // Only the [uris, environment, container, user] fields are taken over
  // from the task's CommandInfo: the URIs to download, the environment
  // variables to set, the container information and the user to run the
  // executor as. The rest of the executor command is set up below in
  // order to invoke 'mesos-executor'.
  CommandInfo* executorCommand = executor.mutable_command();

  executorCommand->mutable_uris()->MergeFrom(command.uris());

  if (command.has_environment()) {
    executorCommand->mutable_environment()->MergeFrom(command.environment());
  }

  // Fields which can be relevant for authorization (depending on the
  // Authorizer).
  if (task.has_labels()) {
    executor.mutable_labels()->MergeFrom(task.labels());
  }

  if (task.has_discovery()) {
    executor.mutable_discovery()->MergeFrom(task.discovery());
  }

  // When a kill policy is set, adjust the executor shutdown grace
  // period. A small buffer of time is added to avoid destroying the
  // container before `TASK_KILLED` is sent by the executor.
  //
  // TODO(alexr): Remove `MAX_REAP_INTERVAL` once the reaper signals
  // immediately after the watched process has exited.
  if (task.has_kill_policy() && task.kill_policy().has_grace_period()) {
    const Duration gracePeriod =
      Nanoseconds(task.kill_policy().grace_period().nanoseconds()) +
      process::MAX_REAP_INTERVAL() +
      Seconds(1);

    executor.mutable_shutdown_grace_period()->set_nanoseconds(
        gracePeriod.ns());
  }

  if (command.has_user()) {
    executorCommand->set_user(command.user());
  }

  Result<string> launcherPath = os::realpath(
      path::join(flags.launcher_dir, MESOS_EXECUTOR));

  if (launcherPath.isSome()) {
    executorCommand->set_shell(false);
    executorCommand->set_value(launcherPath.get());
    executorCommand->add_arguments(MESOS_EXECUTOR);
    executorCommand->add_arguments("--launcher_dir=" + flags.launcher_dir);

    // TODO(jieyu): We should move those Mesos containerizer specific
    // logic (e.g., 'hasRootfs') to Mesos containerizer.
    const bool hasRootfs =
      task.has_container() &&
      task.container().type() == ContainerInfo::MESOS &&
      task.container().mesos().has_image();

    if (hasRootfs) {
      executorCommand->add_arguments(
          "--sandbox_directory=" + flags.sandbox_directory);

#ifndef __WINDOWS__
      // NOTE: if switch_user flag is false and the slave runs under
      // a non-root user, the task will be rejected by the Posix
      // filesystem isolator. Linux filesystem isolator requires slave
      // to have root permission.
      if (flags.switch_user) {
        const string user =
          command.has_user() ? command.user() : frameworkInfo.user();

        executorCommand->add_arguments("--user=" + user);
      }
#endif // __WINDOWS__
    }
  } else {
    // The executor binary could not be resolved: generate a command
    // that prints the reason and exits with a failure.
    executorCommand->set_shell(true);
    executorCommand->set_value(
        "echo '" +
        (launcherPath.isError()
           ? launcherPath.error()
           : "No such file or directory") +
        "'; exit 1");
  }

  // Add an allowance for the command (or docker) executor. This does
  // lead to a small overcommit of resources.
  //
  // NOTE: The size of the memory is truncated here to preserve the
  // existing behavior for backward compatibility.
  // TODO(vinod): If a task is using revocable resources, mark the
  // corresponding executor resource (e.g., cpus) to be also
  // revocable. Currently, it is OK because the containerizer is
  // given task + executor resources on task launch resulting in
  // the container being correctly marked as revocable.
  Resources executorOverhead = Resources::parse(
      "cpus:" + stringify(DEFAULT_EXECUTOR_CPUS) + ";" +
      "mem:" + stringify(
          DEFAULT_EXECUTOR_MEM.bytes() / Bytes::MEGABYTES)).get();

  // If the task has an allocation role, inject it into the executor as
  // well. Note that old masters will not ensure the allocation info is
  // set, and the agent will inject this later, when storing the
  // task/executor.
  Option<string> role = None();
  foreach (const Resource& resource, task.resources()) {
    Option<string> resourceRole = None();
    if (resource.has_allocation_info()) {
      resourceRole = resource.allocation_info().role();
    }

    // The first allocated resource determines the role.
    if (role.isNone()) {
      role = resourceRole;
    }

    // All resources of the task must agree on the role.
    CHECK(role == resourceRole)
      << (role.isSome() ? role.get() : "None")
      << " vs " << (resourceRole.isSome() ? resourceRole.get() : "None");
  }

  if (role.isSome()) {
    executorOverhead.allocate(role.get());
  }

  executor.mutable_resources()->CopyFrom(executorOverhead);

  return executor;
}


void Slave::executorLaunched(
    const FrameworkID& frameworkId,
    const ExecutorID& executorId,
    const ContainerID& containerId,
    const Future<Containerizer::LaunchResult>& future)
{
  // Set up callback for executor termination. Note that we do this
  // regardless of whether or not we have successfully launched the
  // executor because even if we failed to launch the executor the
  // result of calling 'wait' will make sure everything gets properly
  // cleaned up. Note that we do this here instead of where we do
  // Containerizer::launch because we want to guarantee the contract
  // with the Containerizer that we won't call 'wait' until after the
  // launch has completed.
  containerizer->wait(containerId)
    .onAny(defer(self(),
                 &Self::executorTerminated,
                 frameworkId,
                 executorId,
                 lambda::_1));

  if (!future.isReady()) {
    LOG(ERROR) << "Container '" << containerId
               << "' for executor '" << executorId
               << "' of framework " << frameworkId
               << " failed to start: "
               << (future.isFailed() ? future.failure() : "future discarded");

    ++metrics.container_launch_errors;

    containerizer->destroy(containerId);

    Executor* executor = getExecutor(frameworkId, executorId);
    if (executor != nullptr) {
      ContainerTermination termination;
      termination.set_state(TASK_FAILED);
      termination.set_reason(TaskStatus::REASON_CONTAINER_LAUNCH_FAILED);
      termination.set_message(
          "Failed to launch container: " +
          (future.isFailed() ? future.failure() : "discarded"));

      executor->pendingTermination = termination;

      // TODO(jieyu): Set executor->state to be TERMINATING.
    }

    return;
  } else if (future.get() == Containerizer::LaunchResult::NOT_SUPPORTED) {
    LOG(ERROR) << "Container '" << containerId
               << "' for executor '" << executorId
               << "' of framework " << frameworkId
               << " failed to start: None of the enabled containerizers ("
               << flags.containerizers << ") could create a container for the "
               << "provided TaskInfo/ExecutorInfo message";

    ++metrics.container_launch_errors;
    return;
  } else if (future.get() == Containerizer::LaunchResult::ALREADY_LAUNCHED) {
    // This should be extremely rare, as the user would need to launch a
    // standalone container with a user-specified UUID that happens to
    // collide with the Agent-generated ContainerID for this launch.
    LOG(ERROR) << "Container '" << containerId
               << "' for executor '" << executorId
               << "' of framework " << frameworkId
               << " has already been launched.";
    return;
  }

  Framework* framework = getFramework(frameworkId);
  if (framework == nullptr) {
    LOG(WARNING) << "Framework '" << frameworkId
                 << "' for executor '" << executorId
                 << "' is no longer valid";
    return;
  }

  CHECK(framework->state == Framework::RUNNING ||
        framework->state == Framework::TERMINATING)
    << framework->state;

  if (framework->state == Framework::TERMINATING) {
    LOG(WARNING) << "Killing executor '" << executorId
                 << "' of framework " << frameworkId
                 << " because the framework is terminating";
    containerizer->destroy(containerId);
    return;
  }

  Executor* executor = framework->getExecutor(executorId);
  if (executor == nullptr) {
    LOG(WARNING) << "Killing unknown executor '" << executorId
                 << "' of framework " << frameworkId;
    containerizer->destroy(containerId);
    return;
  }

  switch (executor->state) {
    case Executor::TERMINATING:
      LOG(WARNING) << "Killing executor " << *executor
                   << " because the executor is terminating";

      containerizer->destroy(containerId);
      break;
    case Executor::REGISTERING:
    case Executor::RUNNING:
      break;
    case Executor::TERMINATED:
    default:
      LOG(FATAL) << "Executor " << *executor << " is in an unexpected state "
                 << executor->state;

      break;
  }
}


// Callback for the result of waiting on an executor's container.
// Derives an exit status from `termination` (-1 whenever no status is
// available), marks the executor as TERMINATED, transitions its tasks,
// optionally notifies the master via `sendExitedExecutorMessage()`, and
// finally removes the executor and the framework when they are no longer
// needed. Nothing beyond logging happens if the framework or the executor
// is unknown to the agent.
void Slave::executorTerminated(
    const FrameworkID& frameworkId,
    const ExecutorID& executorId,
    const Future<Option<ContainerTermination>>& termination)
{
  // Every branch of the chain below assigns `status`.
  int status;
  // A termination failure indicates the containerizer could not destroy a
  // container.
  // TODO(idownes): This is a serious error so consider aborting the slave if
  // this occurs.
  if (!termination.isReady()) {
    LOG(ERROR) << "Termination of executor '" << executorId
               << "' of framework " << frameworkId
               << " failed: "
               << (termination.isFailed()
                   ? termination.failure()
                   : "discarded");
    // Set a special status for failure.
    status = -1;
  } else if (termination->isNone()) {
    LOG(ERROR) << "Termination of executor '" << executorId
               << "' of framework " << frameworkId
               << " failed: unknown container";
    // Set a special status for failure.
    status = -1;
  } else if (!termination->get().has_status()) {
    LOG(INFO) << "Executor '" << executorId
              << "' of framework " << frameworkId
              << " has terminated with unknown status";
    // Set a special status for an unknown (missing) exit status.
    status = -1;
  } else {
    status = termination->get().status();
    LOG(INFO) << "Executor '" << executorId
              << "' of framework " << frameworkId << " "
              << WSTRINGIFY(status);
  }

  Framework* framework = getFramework(frameworkId);
  if (framework == nullptr) {
    LOG(WARNING) << "Framework " << frameworkId
                 << " for executor '" << executorId
                 << "' does not exist";
    return;
  }

  CHECK(framework->state == Framework::RUNNING ||
        framework->state == Framework::TERMINATING)
    << framework->state;

  Executor* executor = framework->getExecutor(executorId);
  if (executor == nullptr) {
    LOG(WARNING) << "Executor '" << executorId
                 << "' of framework " << frameworkId
                 << " does not exist";
    return;
  }

  switch (executor->state) {
    case Executor::REGISTERING:
    case Executor::RUNNING:
    case Executor::TERMINATING: {
      ++metrics.executors_terminated;

      executor->state = Executor::TERMINATED;

      // Transition all live tasks to TASK_GONE/TASK_FAILED.
      // If the containerizer killed the executor (e.g., due to OOM event)
      // or if this is a command executor, we send TASK_FAILED status updates
      // instead of TASK_GONE.
      // NOTE: We don't send updates if the framework is terminating because we
      // don't want the task status update manager to keep retrying these
      // updates since it won't receive ACKs from the scheduler.  Also, the task
      // status update manager should have already cleaned up all the status
      // update streams for a framework that is terminating.
      if (framework->state != Framework::TERMINATING) {
        // Transition all live launched tasks. Note that entries are
        // removed from the map from within the loop as a result of the
        // terminal status updates, which is why the loop walks the
        // result of `keys()` rather than the map itself.
        foreach (const TaskID& taskId, executor->launchedTasks.keys()) {
          Task* task = executor->launchedTasks.at(taskId);

          if (!protobuf::isTerminalState(task->state())) {
            sendExecutorTerminatedStatusUpdate(
                taskId, termination, frameworkId, executor);
          }
        }

        // Transition all queued tasks. As above, entries are removed
        // from the map from within the loop as a result of the terminal
        // status updates.
        foreach (const TaskID& taskId, executor->queuedTasks.keys()) {
          sendExecutorTerminatedStatusUpdate(
              taskId, termination, frameworkId, executor);
        }
      } else {
        // When the framework is TERMINATING, we cannot send status updates
        // for "launched tasks", but these tasks no longer belong in the
        // `launchedTasks` structure. These tasks will continue to show
        // in the agent's state (as a completed executor) for some time
        // after the framework/executor terminates.
        //
        // The loop walks a copy of the map because entries are erased
        // from `launchedTasks` inside the loop body.
        foreachpair (
            const TaskID& taskId,
            Task* task,
            utils::copy(executor->launchedTasks)) {
          executor->launchedTasks.erase(taskId);
          executor->terminatedTasks[taskId] = task;
        }
      }

      // Only send ExitedExecutorMessage if it is not a Command (or
      // Docker) Executor because the master doesn't store them; they
      // are generated by the slave.
      // TODO(vinod): Reliably forward this message to the master.
      if (!executor->isGeneratedForCommandTask()) {
        sendExitedExecutorMessage(frameworkId, executorId, status);
      }

      // Remove the executor if either the slave or framework is
      // terminating or there are no incomplete tasks.
      if (state == TERMINATING ||
          framework->state == Framework::TERMINATING ||
          !executor->incompleteTasks()) {
        removeExecutor(framework, executor);
      }

      // Remove this framework if it has no pending executors and tasks.
      // This check has to come after the executor removal above.
      if (framework->idle()) {
        removeFramework(framework);
      }
      break;
    }
    default:
      // Any other state is unexpected when a termination is reported.
      LOG(FATAL) << "Executor '" << executor->id
                 << "' of framework " << framework->id()
                 << " in unexpected state " << executor->state;
      break;
  }
}


// Removes a terminated executor from `framework`: writes the sentinel
// file for checkpointed executors, schedules the executor's work (and,
// if checkpointing, meta) directories for garbage collection, runs the
// executor removal hook and finally destroys the executor.
void Slave::removeExecutor(Framework* framework, Executor* executor)
{
  CHECK_NOTNULL(framework);
  CHECK_NOTNULL(executor);

  LOG(INFO) << "Cleaning up executor " << *executor;

  CHECK(framework->state == Framework::RUNNING ||
        framework->state == Framework::TERMINATING)
    << framework->state;

  // Only terminated executors may be removed.
  CHECK(executor->state == Executor::TERMINATED) << executor->state;

  // Either 1) the executor has no tasks with pending updates or 2) the
  // slave/framework is terminating, in which case no acknowledgements
  // might be received.
  CHECK(!executor->incompleteTasks() ||
        state == TERMINATING ||
        framework->state == Framework::TERMINATING);

  // Write a sentinel file to mark this executor as completed.
  if (executor->checkpoint) {
    const string sentinelPath = paths::getExecutorSentinelPath(
        metaDir,
        info.id(),
        framework->id(),
        executor->id,
        executor->containerId);

    CHECK_SOME(os::touch(sentinelPath));
  }

  // TODO(vinod): Move the responsibility of gc'ing to the
  // Executor struct.

  // The executor run work directory, to be garbage collected below.
  const string workRunPath = paths::getExecutorRunPath(
      flags.work_dir,
      info.id(),
      framework->id(),
      executor->id,
      executor->containerId);

  // NOTE: We keep a list of default executor tasks here for detaching
  // the task volume directories, since the executor may already be
  // destroyed when the GC completes (MESOS-8460).
  vector<Task> defaultExecutorTasks;
  if (executor->info.has_type() &&
      executor->info.type() == ExecutorInfo::DEFAULT) {
    foreachvalue (const Task* task, executor->launchedTasks) {
      defaultExecutorTasks.push_back(*task);
    }

    foreachvalue (const Task* task, executor->terminatedTasks) {
      defaultExecutorTasks.push_back(*task);
    }

    foreach (const shared_ptr<Task>& task, executor->completedTasks) {
      defaultExecutorTasks.push_back(*task);
    }
  }

  os::utime(workRunPath); // Update the modification time.
  garbageCollect(workRunPath)
    .onAny(defer(self(), &Self::detachFile, workRunPath))
    .onAny(defer(
        self(),
        &Self::detachTaskVolumeDirectories,
        executor->info,
        executor->containerId,
        defaultExecutorTasks));

  // The top level executor directories are only scheduled if the
  // framework doesn't have any 'pending' tasks for this executor.
  const bool noPendingTasks = !framework->pendingTasks.contains(executor->id);

  // Schedule the top level executor work directory.
  if (noPendingTasks) {
    const string workExecutorPath = paths::getExecutorPath(
        flags.work_dir, info.id(), framework->id(), executor->id);

    // Make sure we detach both real and virtual paths for "latest"
    // symlink. We prefer users to use the virtual paths because
    // they do not expose the `work_dir` and agent ID, but the real
    // paths remains for compatibility reason.
    const string latestPath = paths::getExecutorLatestRunPath(
        flags.work_dir,
        info.id(),
        framework->id(),
        executor->id);

    const string virtualLatestPath = paths::getExecutorVirtualPath(
        framework->id(),
        executor->id);

    os::utime(workExecutorPath); // Update the modification time.
    garbageCollect(workExecutorPath)
      .onAny(defer(self(), &Self::detachFile, latestPath))
      .onAny(defer(self(), &Self::detachFile, virtualLatestPath));
  }

  if (executor->checkpoint) {
    // Schedule the executor run meta directory to get garbage collected.
    const string metaRunPath = paths::getExecutorRunPath(
        metaDir,
        info.id(),
        framework->id(),
        executor->id,
        executor->containerId);

    os::utime(metaRunPath); // Update the modification time.
    garbageCollect(metaRunPath);

    // Schedule the top level executor meta directory.
    if (noPendingTasks) {
      const string metaExecutorPath = paths::getExecutorPath(
          metaDir, info.id(), framework->id(), executor->id);

      os::utime(metaExecutorPath); // Update the modification time.
      garbageCollect(metaExecutorPath);
    }
  }

  if (HookManager::hooksAvailable()) {
    HookManager::slaveRemoveExecutorHook(framework->info, executor->info);
  }

  // Must come last: `executor` is not used after this point.
  framework->destroyExecutor(executor->id);
}


// Removes an idle framework from the agent: closes its task status update
// streams, schedules its work (and, if checkpointing, meta) directories
// for garbage collection, moves it from `frameworks` to
// `completedFrameworks` and terminates the agent if it was the last
// framework of a TERMINATING agent.
void Slave::removeFramework(Framework* framework)
{
  CHECK_NOTNULL(framework);

  LOG(INFO) << "Cleaning up framework " << framework->id();

  // Report the offending state on failure, like the other framework
  // state checks in this file do.
  CHECK(framework->state == Framework::RUNNING ||
        framework->state == Framework::TERMINATING)
    << framework->state;

  // We only remove frameworks once they become idle.
  CHECK(framework->idle());

  // Close all task status update streams for this framework.
  taskStatusUpdateManager->cleanup(framework->id());

  // Schedule the framework work and meta directories for garbage
  // collection.
  // TODO(vinod): Move the responsibility of gc'ing to the
  // Framework struct.

  const string workPath = paths::getFrameworkPath(
      flags.work_dir, info.id(), framework->id());

  os::utime(workPath); // Update the modification time.
  garbageCollect(workPath);

  if (framework->info.checkpoint()) {
    // Schedule the framework meta directory to get garbage collected.
    const string metaPath = paths::getFrameworkPath(
        metaDir, info.id(), framework->id());

    os::utime(metaPath); // Update the modification time.
    garbageCollect(metaPath);
  }

  frameworks.erase(framework->id());

  // Pass ownership of the framework pointer.
  completedFrameworks.set(framework->id(), Owned<Framework>(framework));

  updateDrainStatus();

  // A terminating agent shuts down once its last framework is gone.
  if (state == TERMINATING && frameworks.empty()) {
    terminate(self());
  }
}


void Slave::shutdownExecutor(
    const UPID& from,
    const FrameworkID& frameworkId,
    const ExecutorID& executorId)
{
  if (from && master != from) {
    LOG(WARNING) << "Ignoring shutdown executor message for executor '"
                 << executorId << "' of framework " << frameworkId
                 << " from " << from << " because it is not from the"
                 << " registered master ("
                 << (master.isSome() ? stringify(master.get()) : "None") << ")";
    return;
  }

  LOG(INFO) << "Asked to shut down executor '" << executorId
            << "' of framework " << frameworkId << " by " << from;

  CHECK(state == RECOVERING || state == DISCONNECTED ||
        state == RUNNING || state == TERMINATING)
    << state;

  if (state == RECOVERING || state == DISCONNECTED) {
    LOG(WARNING) << "Ignoring shutdown executor message for executor '"
                 << executorId << "' of framework " << frameworkId
                 << " because the agent has not yet registered with the master";
    return;
  }

  Framework* framework = getFramework(frameworkId);
  if (framework == nullptr) {
    LOG(WARNING) << "Cannot shut down executor '" << executorId
                 << "' of unknown framework " << frameworkId;
    return;
  }

  CHECK(framework->state == Framework::RUNNING ||
        framework->state == Framework::TERMINATING)
    << framework->state;

  if (framework->state == Framework::TERMINATING) {
    LOG(WARNING) << "Ignoring shutdown executor '" << executorId
                 << "' of framework " << frameworkId
                 << " because the framework is terminating";
    return;
  }

  if (!framework->executors.contains(executorId)) {
    LOG(WARNING) << "Ignoring shutdown of unknown executor '" << executorId
                 << "' of framework " << frameworkId;
    return;
  }

  Executor* executor = framework->executors[executorId];
  CHECK(executor->state == Executor::REGISTERING ||
        executor->state == Executor::RUNNING ||
        executor->state == Executor::TERMINATING ||
        executor->state == Executor::TERMINATED)
    << executor->state;

  if (executor->state == Executor::TERMINATING) {
    LOG(WARNING) << "Ignoring shutdown executor '" << executorId
                 << "' of framework " << frameworkId
                 << " because the executor is terminating";
    return;
  }

  if (executor->state == Executor::TERMINATED) {
    LOG(WARNING) << "Ignoring shutdown executor '" << executorId
                 << "' of framework " << frameworkId
                 << " because the executor is terminated";
    return;
  }

  _shutdownExecutor(framework, executor);
}


// Moves `executor` into the TERMINATING state, sends it a
// `ShutdownExecutorMessage`, and arms a timer that invokes
// `shutdownExecutorTimeout()` once the shutdown grace period elapses.
//
// Both pointers must be non-null, and the executor must currently be
// REGISTERING or RUNNING.
void Slave::_shutdownExecutor(Framework* framework, Executor* executor)
{
  CHECK_NOTNULL(framework);
  CHECK_NOTNULL(executor);

  LOG(INFO) << "Shutting down executor " << *executor;

  CHECK(framework->state == Framework::RUNNING ||
        framework->state == Framework::TERMINATING)
    << framework->state;

  CHECK(executor->state == Executor::REGISTERING ||
        executor->state == Executor::RUNNING)
    << executor->state;

  executor->state = Executor::TERMINATING;

  // If the executor hasn't yet registered, this message
  // will be dropped to the floor!
  executor->send(ShutdownExecutorMessage());

  // A grace period specified by the executor itself takes precedence
  // over the agent-wide default.
  const Duration gracePeriod = [&]() -> Duration {
    if (executor->info.has_shutdown_grace_period()) {
      return Nanoseconds(
          executor->info.shutdown_grace_period().nanoseconds());
    }

    return flags.executor_shutdown_grace_period;
  }();

  // Prepare for sending a kill if the executor doesn't comply.
  delay(gracePeriod,
        self(),
        &Slave::shutdownExecutorTimeout,
        framework->id(),
        executor->id,
        executor->containerId);
}


void Slave::shutdownExecutorTimeout(
    const FrameworkID& frameworkId,
    const ExecutorID& executorId,
    const ContainerID& containerId)
{
  Framework* framework = getFramework(frameworkId);
  if (framework == nullptr) {
    LOG(INFO) << "Framework " << frameworkId
              << " seems to have exited. Ignoring shutdown timeout"
              << " for executor '" << executorId << "'";
    return;
  }

  CHECK(framework->state == Framework::RUNNING ||
        framework->state == Framework::TERMINATING)
    << framework->state;

  Executor* executor = framework->getExecutor(executorId);
  if (executor == nullptr) {
    VLOG(1) << "Executor '" << executorId
            << "' of framework " << frameworkId
            << " seems to have exited. Ignoring its shutdown timeout";
    return;
  }

  // Make sure this timeout is valid.
  if (executor->containerId != containerId) {
    LOG(INFO) << "A new executor " << *executor
              << " with run " << executor->containerId
              << " seems to be active. Ignoring the shutdown timeout"
              << " for the old executor run " << containerId;
    return;
  }

  switch (executor->state) {
    case Executor::TERMINATED:
      LOG(INFO) << "Executor " << *executor << " has already terminated";
      break;
    case Executor::TERMINATING:
      LOG(INFO) << "Killing executor " << *executor;

      containerizer->destroy(executor->containerId);
      break;
    default:
      LOG(FATAL) << "Executor " << *executor << " is in unexpected state "
                 << executor->state;
      break;
  }
}


void Slave::registerExecutorTimeout(
    const FrameworkID& frameworkId,
    const ExecutorID& executorId,
    const ContainerID& containerId)
{
  Framework* framework = getFramework(frameworkId);
  if (framework == nullptr) {
    LOG(INFO) << "Framework " << frameworkId
              << " seems to have exited. Ignoring registration timeout"
              << " for executor '" << executorId << "'";
    return;
  }

  CHECK(framework->state == Framework::RUNNING ||
        framework->state == Framework::TERMINATING)
    << framework->state;

  if (framework->state == Framework::TERMINATING) {
    LOG(INFO) << "Ignoring registration timeout for executor '" << executorId
              << "' because the  framework " << frameworkId
              << " is terminating";
    return;
  }

  Executor* executor = framework->getExecutor(executorId);
  if (executor == nullptr) {
    VLOG(1) << "Executor '" << executorId
            << "' of framework " << frameworkId
            << " seems to have exited. Ignoring its registration timeout";
    return;
  }

  if (executor->containerId != containerId) {
    LOG(INFO) << "A new executor " << *executor
              << " with run " << executor->containerId
              << " seems to be active. Ignoring the registration timeout"
              << " for the old executor run " << containerId;
    return;
  }

  switch (executor->state) {
    case Executor::RUNNING:
    case Executor::TERMINATING:
    case Executor::TERMINATED:
      // Ignore the registration timeout.
      break;
    case Executor::REGISTERING: {
      LOG(INFO) << "Terminating executor " << *executor
                << " because it did not register within "
                << flags.executor_registration_timeout;

      // Immediately kill the executor.
      containerizer->destroy(containerId);

      executor->state = Executor::TERMINATING;

      ContainerTermination termination;
      termination.set_state(TASK_FAILED);
      termination.set_reason(TaskStatus::REASON_EXECUTOR_REGISTRATION_TIMEOUT);
      termination.set_message(
          "Executor did not register within " +
          stringify(flags.executor_registration_timeout));

      executor->pendingTermination = termination;
      break;
    }
    default:
      LOG(FATAL) << "Executor " << *executor << " is in unexpected state "
                 << executor->state;
      break;
  }
}


// Maps a disk usage fraction to a duration: `gc_delay` scaled by the
// fraction of disk left once both `usage` and `gc_disk_headroom` are
// subtracted, clamped so that the result is never negative.
//
// TODO(vinod): Figure out a way to express this function via cmd line.
Duration Slave::age(double usage)
{
  const double remaining = 1.0 - flags.gc_disk_headroom - usage;

  return flags.gc_delay * std::max(0.0, remaining);
}


void Slave::checkDiskUsage()
{
  // TODO(vinod): We are making usage a Future, so that we can plug in
  // fs::usage() into async.
  // NOTE: We calculate disk usage of the file system on which the
  // slave work directory is mounted.
  Future<double>(::fs::usage(flags.work_dir))
    .onAny(defer(self(), &Slave::_checkDiskUsage, lambda::_1));
}


// Continuation of `checkDiskUsage()`. On success, recomputes
// `executorDirectoryMaxAllowedAge` from the measured usage and prunes
// the garbage collector accordingly; on failure, logs the error. In
// both cases the next disk usage check is scheduled after
// `disk_watch_interval`.
void Slave::_checkDiskUsage(const Future<double>& usage)
{
  if (usage.isReady()) {
    const double fraction = usage.get();

    executorDirectoryMaxAllowedAge = age(fraction);

    LOG(INFO) << "Current disk usage " << std::setiosflags(std::ios::fixed)
              << std::setprecision(2) << 100 * fraction << "%."
              << " Max allowed age: " << executorDirectoryMaxAllowedAge;

    // We prune all directories whose deletion time is within
    // the next 'gc_delay - age'. Since a directory is always
    // scheduled for deletion 'gc_delay' into the future, only directories
    // that are at least 'age' old are deleted.
    gc->prune(flags.gc_delay - executorDirectoryMaxAllowedAge);
  } else {
    LOG(ERROR) << "Failed to get disk usage: "
               << (usage.isFailed() ? usage.failure() : "future discarded");
  }

  // Keep polling regardless of the outcome of this check.
  delay(flags.disk_watch_interval, self(), &Slave::checkDiskUsage);
}


// Checks whether the `current` agent info is an acceptable change
// relative to the `previous` one, using the check selected by the
// `reconfiguration_policy` flag ("equal" or "additive").
Try<Nothing> Slave::compatible(
  const SlaveInfo& previous,
  const SlaveInfo& current) const
{
  // TODO(vinod): Also check for version compatibility.

  const string& policy = flags.reconfiguration_policy;

  if (policy == "equal") {
    return compatibility::equal(previous, current);
  }

  if (policy == "additive") {
    return compatibility::additive(previous, current);
  }

  // Should have been validated during startup.
  UNREACHABLE();
}


// TODO(gilbert): Consider to read the Image GC config dynamically.
// For now, the Image GC config can only be updated after the agent
// restarts.
void Slave::checkImageDiskUsage()
{
  // TODO(gilbert): Container image gc is supported for docker image
  // in Mesos Containerizer for now. Add more image store gc supports
  // if necessary.
  Future<double>(::fs::usage(flags.docker_store_dir))
    .onAny(defer(self(), &Slave::_checkImageDiskUsage, lambda::_1));
}


void Slave::_checkImageDiskUsage(const Future<double>& usage)
{
  CHECK(flags.image_gc_config.isSome());

  if (!usage.isReady()) {
    LOG(ERROR) << "Failed to get image store disk usage: "
               << (usage.isFailed() ? usage.failure() : "future discarded");
  } else {
    LOG(INFO) << "Current docker image store disk usage: "
              << std::setiosflags(std::ios::fixed) << std::setprecision(2)
              << 100 * usage.get() << "%.";

    if ((flags.image_gc_config->image_disk_headroom() + usage.get()) > 1.0) {
      LOG(INFO) << "Image store disk usage exceeds the threshold '"
                << 100 * (1.0 - flags.image_gc_config->image_disk_headroom())
                << "%'. Container Image GC is triggered.";

      vector<Image> excludedImages(
          flags.image_gc_config->excluded_images().begin(),
          flags.image_gc_config->excluded_images().end());

      containerizer->pruneImages(excludedImages);
    }
  }

  delay(
      Nanoseconds(
          flags.image_gc_config->image_disk_watch_interval().nanoseconds()),
      self(),
      &Slave::checkImageDiskUsage);
}


// Recovers the agent from previously checkpointed `state`.
//
// The steps, in order, are:
//   1. Fail immediately if `state` holds an error.
//   2. Inject missing `AllocationInfo` into the resources of recovered
//      executors and tasks, remembering which ones were modified.
//   3. Recover checkpointed resources, syncing and committing a pending
//      "target" checkpoint if one exists, and verify that they are
//      compatible with the agent's resources.
//   4. If a previous `SlaveInfo` was recovered, check it against the
//      current one: on success the agent id is reused and frameworks are
//      recovered; on failure the agent either falls back to starting as
//      a new agent (if the host rebooted) or the recovery fails.
//   5. Chain the recovery of the volume gid manager, the task status
//      updates, the containerizer and the operations.
//
// Returns a future that fails if any of the above steps fails.
Future<Nothing> Slave::recover(const Try<state::State>& state)
{
  if (state.isError()) {
    return Failure(state.error());
  }

  LOG(INFO) << "Finished recovering checkpointed state from '" << metaDir
            << "', beginning agent recovery";

  // Work on copies since the recovered state may be adjusted below.
  Option<ResourcesState> resourcesState = state->resources;
  Option<SlaveState> slaveState = state->slave;

  // With the addition of frameworks with multiple roles, we
  // need to inject the allocated role into each allocated
  // `Resource` object that we've persisted. Note that we
  // also must do this for MULTI_ROLE frameworks since they
  // may have tasks that were present before the framework
  // upgraded into MULTI_ROLE.
  //
  // Returns true if at least one resource was modified.
  auto injectAllocationInfo = [](
      RepeatedPtrField<Resource>* resources,
      const FrameworkInfo& frameworkInfo) {
    set<string> roles = protobuf::framework::getRoles(frameworkInfo);

    bool injectedAllocationInfo = false;
    foreach (Resource& resource, *resources) {
      if (!resource.has_allocation_info()) {
        // The role can only be inferred when the framework has exactly
        // one role.
        if (roles.size() != 1) {
          LOG(FATAL) << "Missing 'Resource.AllocationInfo' for resources"
                     << " allocated to MULTI_ROLE framework"
                     << " '" << frameworkInfo.name() << "'";
        }

        resource.mutable_allocation_info()->set_role(*roles.begin());
        injectedAllocationInfo = true;
      }
    }

    return injectedAllocationInfo;
  };

  // In order to allow frameworks to change their role(s), we need to keep
  // track of the fact that the resources used to be implicitly allocated to
  // `FrameworkInfo.role` before the agent upgrade. To this end, we inject
  // the `AllocationInfo` to the resources in `ExecutorState` and `TaskState`,
  // and re-checkpoint them if necessary.

  hashset<ExecutorID> injectedExecutors;
  hashmap<ExecutorID, hashset<TaskID>> injectedTasks;

  if (slaveState.isSome()) {
    foreachvalue (FrameworkState& frameworkState, slaveState->frameworks) {
      // Entries without recovered info are skipped at every level.
      if (!frameworkState.info.isSome()) {
        continue;
      }

      foreachvalue (ExecutorState& executorState, frameworkState.executors) {
        if (!executorState.info.isSome()) {
          continue;
        }

        if (injectAllocationInfo(
                executorState.info->mutable_resources(),
                frameworkState.info.get())) {
          injectedExecutors.insert(executorState.id);
        }

        foreachvalue (RunState& runState, executorState.runs) {
          foreachvalue (TaskState& taskState, runState.tasks) {
            if (!taskState.info.isSome()) {
              continue;
            }

            if (injectAllocationInfo(
                    taskState.info->mutable_resources(),
                    frameworkState.info.get())) {
              injectedTasks[executorState.id].insert(taskState.id);
            }
          }
        }
      }
    }
  }

  // Recover checkpointed resources.
  // NOTE: 'resourcesState' is None if the slave rootDir does not
  // exist or the resources checkpoint file cannot be found.
  if (resourcesState.isSome()) {
    if (resourcesState->errors > 0) {
      LOG(WARNING) << "Errors encountered during resources recovery: "
                   << resourcesState->errors;

      metrics.recovery_errors += resourcesState->errors;
    }

    checkpointedResources = resourcesState->resources;

    if (resourcesState->target.isSome()) {
      Resources targetResources = resourcesState->target.get();

      // Sync the checkpointed resources from the target (which was
      // only created when there are pending changes in the
      // checkpointed resources). If there is any failure, the
      // checkpoint is not committed and the agent exits. In that
      // case, sync of checkpoints will be reattempted on the next
      // agent restart (before agent reregistration).
      Try<Nothing> syncResult = syncCheckpointedResources(targetResources);

      if (syncResult.isError()) {
        return Failure(
            "Target checkpointed resources " +
            stringify(targetResources) +
            " failed to sync from current checkpointed resources " +
            stringify(checkpointedResources) + ": " +
            syncResult.error());
      }

      // At this point, `syncCheckpointedResources()` has ensured that any
      // change in checkpointed resources (e.g. persistent volumes) is now
      // reflected on disk. We rename the target resource state file to the
      // actual resource state file, which is our source of truth for the
      // current state of the agent resources.
      Try<Nothing> renameResult = os::rename(
          paths::getResourceStateTargetPath(metaDir),
          paths::getResourceStatePath(metaDir));

      if (renameResult.isError()) {
        return Failure(
            "Failed to rename target resources " +
            stringify(targetResources) + " and associated operations: " +
            renameResult.error());
      }

      // The following rename call handles the pre-operation-feedback
      // checkpoint format for backward compatibility.
      renameResult = os::rename(
          paths::getResourcesTargetPath(metaDir),
          paths::getResourcesInfoPath(metaDir));

      if (renameResult.isError()) {
        return Failure(
            "Failed to rename target resources " +
            stringify(targetResources) + ": " +
            renameResult.error());
      }

      // Since we synced the target resources to the committed resources, we
      // check resource compatibility with `--resources` command line flag
      // based on the committed checkpoint.
      checkpointedResources = targetResources;
    }

    // This is to verify that the checkpointed resources are
    // compatible with the agent resources specified through the
    // '--resources' command line flag. The compatibility has been
    // verified by the old agent but the flag may have changed during
    // agent restart in an incompatible way and the operator may need
    // to either fix the flag or the checkpointed resources.
    Try<Resources> _totalResources = applyCheckpointedResources(
        info.resources(), checkpointedResources);

    if (_totalResources.isError()) {
      return Failure(
          "Checkpointed resources " +
          stringify(checkpointedResources) +
          " are incompatible with agent resources " +
          stringify(info.resources()) + ": " +
          _totalResources.error());
    }

    totalResources = _totalResources.get();
  }

  if (slaveState.isSome() && slaveState->info.isSome()) {
    if (slaveState->errors > 0) {
      LOG(WARNING) << "Errors encountered during agent recovery: "
                   << slaveState->errors;

      metrics.recovery_errors += slaveState->errors;
    }

    // Save the previous id into the current `SlaveInfo`, so we can compare
    // both of them for equality. This is safe because if it turned out that
    // we can not reuse the id, we will either crash or erase it again.
    info.mutable_id()->CopyFrom(slaveState->info->id());

    drainConfig = slaveState->drainConfig;

    // Check for SlaveInfo compatibility.
    Try<Nothing> _compatible =
      compatible(slaveState->info.get(), info);

    if (_compatible.isSome()) {
      // Permitted change, so we reuse the recovered agent id and reconnect
      // to running executors.

      // Prior to Mesos 1.5, the master expected that an agent would never
      // change its `SlaveInfo` and keep the same slave id, and therefore would
      // not update it's internal data structures on agent re-registration.
      if (!(slaveState->info.get() == info)) {
        requiredMasterCapabilities.agentUpdate = true;
      }

      // If we restarted the agent process and will reuse the same agent ID
      // we can immediately start the resource provider manager. This allows
      // executors recovered later on to resubscribe immediately.
      initializeResourceProviderManager(flags, info.id());

      // Recover the frameworks.
      foreachvalue (const FrameworkState& frameworkState,
                    slaveState->frameworks) {
        recoverFramework(frameworkState, injectedExecutors, injectedTasks);
      }
    } else if (state->rebooted) {
      // Prior to Mesos 1.4 we directly bypass the state recovery and
      // start as a new agent upon reboot (introduced in MESOS-844).
      // This unnecessarily discards the existing agent ID (MESOS-6223).
      // Starting in Mesos 1.4 we'll attempt to recover the slave state
      // even after reboot but in case of an incompatible slave info change
      // we'll fall back to recovering as a new agent (existing behavior).
      // Prior to Mesos 1.5, an incompatible change would be any slave info
      // mismatch.
      // This prevents the agent from flapping if the slave info (resources,
      // attributes, etc.) change is due to host maintenance associated
      // with the reboot.

      LOG(WARNING) << "Falling back to recover as a new agent due to error: "
                   << _compatible.error();

      // Cleaning up the slave state to avoid any state recovery for the
      // old agent.
      info.clear_id();
      slaveState = None();

      // Remove the "latest" symlink if it exists to "checkpoint" the
      // decision to recover as a new agent.
      const string& latest = paths::getLatestSlavePath(metaDir);
      if (os::exists(latest)) {
        CHECK_SOME(os::rm(latest))
          << "Failed to remove latest symlink '" << latest << "'";
      }
    } else {
      // Incompatible change without a reboot: recovery fails.
      return Failure(_compatible.error());
    }
  }

  // Run the remaining recovery stages one after another. `slaveState`
  // is None here if the agent is starting as a new agent.
  return _recoverVolumeGidManager(state->rebooted)
    .then(defer(self(), &Slave::_recoverTaskStatusUpdates, slaveState))
    .then(defer(self(), &Slave::_recoverContainerizer, slaveState))
    .then(defer(self(), &Slave::_recoverOperations, slaveState));
}


// Recovers the volume gid manager when one is configured. On Windows,
// or when no manager is set, this completes immediately.
Future<Nothing> Slave::_recoverVolumeGidManager(bool rebooted)
{
#ifndef __WINDOWS__
  if (volumeGidManager) {
    return volumeGidManager->recover(rebooted);
  }
#endif // __WINDOWS__

  // Nothing to recover.
  return Nothing();
}


// Recovers the task status update manager from `metaDir` and `state`,
// then passes `state` through unchanged once that recovery completes.
Future<Option<SlaveState>> Slave::_recoverTaskStatusUpdates(
    const Option<SlaveState>& state)
{
  auto recovered = taskStatusUpdateManager->recover(metaDir, state);

  // `state` is captured by value so that it outlives this call.
  return recovered.then([state]() -> Future<Option<SlaveState>> {
    return state;
  });
}


// Delegates recovery of the given agent `state` to the containerizer.
Future<Nothing> Slave::_recoverContainerizer(
    const Option<SlaveState>& state)
{
  Future<Nothing> recovered = containerizer->recover(state);

  return recovered;
}


// Recovers checkpointed operations and their status update streams.
//
// Does nothing if `state` is None. Otherwise it initializes the
// operation status update manager, re-adds every checkpointed
// operation, collects the UUIDs of the status update stream
// directories that belong to known operations (removing directories of
// unknown ones), and hands those UUIDs to the manager for recovery. The
// result is processed by `__recoverOperations()`.
Future<Nothing> Slave::_recoverOperations(
    const Option<state::SlaveState>& state)
{
  if (state.isNone()) {
    return Nothing();
  }

  operationStatusUpdateManager.initialize(
      defer(self(), &Self::sendOperationStatusUpdate, lambda::_1),
      std::bind(
          &slave::paths::getSlaveOperationUpdatesPath,
          metaDir,
          info.id(),
          lambda::_1));

  if (state->operations.isSome()) {
    foreach (const Operation& checkpointed, state->operations.get()) {
      Result<ResourceProviderID> resourceProviderId =
        getResourceProviderId(checkpointed.info());

      // Only operations affecting agent default resources are checkpointed.
      CHECK(resourceProviderId.isNone());

      addOperation(new Operation(checkpointed));
    }
  }

  // Walk the operation status update streams directories in order to generate
  // the list of streams to recover.
  //
  // NOTE: It is possible for the agent to fail over right after having
  // checkpointed an operation in a `ResourceState` message, but before having
  // created the corresponding stream.
  //
  // In that case the checkpointed message will contain the operation, but the
  // corresponding directory for the operation status update stream will not
  // exist.
  //
  // Since the SUM recovery process will return an error if invoked with an
  // operation ID for which a stream hasn't been created, we can't extract the
  // list of streams to recover from the content of the checkpointed
  // `ResourceState` message.
  Try<list<string>> streamPaths =
    slave::paths::getSlaveOperationPaths(metaDir, info.id());

  if (streamPaths.isError()) {
    return Failure(
        "Failed to find operation status update streams: " +
        streamPaths.error());
  }

  list<id::UUID> streamUuids;
  foreach (const string& streamPath, streamPaths.get()) {
    Try<id::UUID> parsed =
      slave::paths::parseSlaveOperationPath(metaDir, info.id(), streamPath);

    if (parsed.isError()) {
      return Failure(
          "Failed to parse operation status update stream path '" +
          streamPath + "': " + parsed.error());
    }

    // `operations` is keyed by the protobuf `UUID`, so convert first.
    UUID protoUuid;
    protoUuid.set_value(parsed->toBytes());

    if (operations.contains(protoUuid)) {
      streamUuids.emplace_back(std::move(parsed.get()));
      continue;
    }

    // NOTE: This could happen if we failed to remove the operation path before.
    LOG(WARNING)
      << "Garbage collecting status update stream for unknown operation"
      << " (uuid: " << parsed.get() << ")";

    Try<Nothing> removed = os::rmdir(streamPath);
    if (removed.isError()) {
      LOG(ERROR)
        << "Failed to remove directory '" << streamPath
        << "': " << removed.error();
    }
  }

  return operationStatusUpdateManager.recover(streamUuids, flags.strict)
    .then(defer(self(), &Slave::__recoverOperations, lambda::_1));
}


// Continuation of `_recoverOperations()`, invoked with the outcome of
// the operation status update manager recovery.
//
// Exits the process if that recovery did not succeed. Otherwise it:
//   1. Accounts recovery errors in the metrics.
//   2. Removes operations whose streams are already terminated and
//      deletes their stream directories.
//   3. For each remaining operation, either finishes a still-pending
//      operation (creating, applying and checkpointing an
//      `OPERATION_FINISHED` update) or re-submits the latest status to
//      the status update manager if no stream state was recovered for it.
Future<Nothing> Slave::__recoverOperations(
  const Future<OperationStatusUpdateManagerState>& state)
{
  if (!state.isReady()) {
    EXIT(EXIT_FAILURE)
      << "Failed to recover operation status update manager: "
      << (state.isFailed() ? state.failure() : "future discarded") << "\n";
  }

  if (state->errors > 0) {
    LOG(WARNING)
      << "Errors encountered during operation status update manager recovery: "
      << state->errors;

    metrics.recovery_errors += state->errors;
  }

  // Clean up operations with terminated streams.
  //
  // These are operations with terminal updates that have already been
  // acknowledged. They could still be checkpointed if the agent failed
  // over just before removing them from its state.
  using StreamState = typename OperationStatusUpdateManagerState::StreamState;
  vector<id::UUID> completedOperations;
  foreachpair (const id::UUID& uuid,
               const Option<StreamState>& stream,
               state->streams) {
    if (stream.isSome() && stream->terminated) {
      // `getOperation()` is keyed by the protobuf `UUID`.
      UUID operationUuid;
      operationUuid.set_value(uuid.toBytes());

      Operation* operation = getOperation(operationUuid);
      if (operation != nullptr) {
        removeOperation(operation);
        completedOperations.push_back(uuid);
      }
    }
  }

  // Garbage collect the operation streams.
  foreach (const id::UUID& uuid, completedOperations) {
    const string path =
      slave::paths::getSlaveOperationPath(metaDir, info.id(), uuid);

    Try<Nothing> rmdir = os::rmdir(path);
    if (rmdir.isError()) {
      LOG(ERROR) << "Failed to remove operation status update stream "
                 << "directory '" << path << "': " << rmdir.error();
    }
  }

  foreachpair (const UUID& uuid, Operation* operation, operations) {
    const id::UUID operationUuid(
        CHECK_NOTERROR(id::UUID::fromBytes(uuid.value())));

    // The operation might be from an operator API call, thus the framework
    // ID here is optional.
    Option<FrameworkID> frameworkId =
      operation->has_framework_id()
        ? operation->framework_id()
        : Option<FrameworkID>::none();

    if (operation->latest_status().state() == OPERATION_PENDING) {
      // The agent failed over before the checkpoint of the
      // `OPERATION_FINISHED` update completed.
      CHECK(
          !state->streams.contains(operationUuid) ||
          state->streams.at(operationUuid).isNone());

      Option<OperationID> operationId =
        operation->info().has_id()
          ? operation->info().id()
          : Option<OperationID>::none();

      UpdateOperationStatusMessage update =
        protobuf::createUpdateOperationStatusMessage(
            operation->uuid(),
            protobuf::createOperationStatus(
                OPERATION_FINISHED,
                operationId,
                None(),
                None(),
                id::UUID::random(),
                info.id(),
                Option<ResourceProviderID>::none()),
            None(),
            frameworkId,
            info.id());

      updateOperation(operation, update);

      CHECK(protobuf::isSpeculativeOperation(operation->info()));
      apply(operation);

      checkpointResourceState(
          totalResources.filter(mesos::needCheckpointing), false);

      operationStatusUpdateManager.update(update);
    } else if (!state->streams.contains(operationUuid) ||
               state->streams.at(operationUuid).isNone()) {
      // The agent failed over after creating the `OPERATION_FINISHED` update,
      // but before the operation status update manager checkpointed it.
      //
      // NOTE: `at()` is used (as in the check above) rather than `get()`
      // so that the stream state is inspected in place instead of being
      // copied into a temporary.
      operationStatusUpdateManager.update(
          protobuf::createUpdateOperationStatusMessage(
              operation->uuid(),
              operation->latest_status(),
              None(),
              frameworkId,
              info.id()));
    }
  }

  return Nothing();
}


// Reconnects to, or cleans up, the executors of all recovered
// frameworks.
//
// For every executor a termination callback is registered with the
// containerizer. Then, depending on the `recover` flag:
//   - "reconnect": PID-based executors are sent a reconnect message
//     (optionally retried), while executors without a pid are expected
//     to subscribe on their own.
//   - otherwise: executors are asked to shut down, or their container
//     is destroyed if they cannot be contacted.
//
// When reconnecting and at least one framework was recovered, the
// returned future is `recoveryInfo.recovered`, and a call to
// `reregisterExecutorTimeout()` is scheduled after
// `executor_reregistration_timeout`. Otherwise the returned future is
// already ready.
Future<Nothing> Slave::_recover()
{
  LOG(INFO) << "Recovering executors";

  // Allow HTTP based executors to subscribe after the
  // containerizer recovery is complete.
  recoveryInfo.reconnect = true;

  foreachvalue (Framework* framework, frameworks) {
    foreachvalue (Executor* executor, framework->executors) {
      // Set up callback for executor termination.
      containerizer->wait(executor->containerId)
        .onAny(defer(self(),
                     &Self::executorTerminated,
                     framework->id(),
                     executor->id,
                     lambda::_1));

      if (flags.recover == "reconnect") {
        // We send a reconnect message for PID based executors
        // as we can initiate communication with them. Recovered
        // HTTP executors, on the other hand, are responsible for
        // subscribing back with the agent using a retry interval.
        // Note that recovered http executors are marked with
        // http.isNone and pid.isNone (see comments in the header).
        if (executor->pid.isSome() && executor->pid.get()) {
          LOG(INFO)
            << "Sending reconnect request to executor " << *executor;

          ReconnectExecutorMessage message;
          message.mutable_slave_id()->MergeFrom(info.id());
          send(executor->pid.get(), message);

          // PID-based executors using Mesos libraries >= 1.1.2 always
          // re-link with the agent upon receiving the reconnect message.
          // This avoids the executor replying on a half-open TCP
          // connection to the old agent (possible if netfilter is
          // dropping packets, see: MESOS-7057). However, PID-based
          // executors using Mesos libraries < 1.1.2 do not re-link
          // and are therefore prone to replying on a half-open connection
          // after the agent restarts. If we only send a single reconnect
          // message, these "old" executors will reply on their half-open
          // connection and receive a RST; without any retries, they will
          // fail to reconnect and be killed by the agent once the executor
          // re-registration timeout elapses. To ensure these "old"
          // executors can reconnect in the presence of netfilter dropping
          // packets, we introduced optional retries of the reconnect
          // message. This results in "old" executors correctly establishing
          // a link when processing the second reconnect message.
          if (flags.executor_reregistration_retry_interval.isSome()) {
            const Duration& retryInterval =
              flags.executor_reregistration_retry_interval.get();

            const FrameworkID& frameworkId = framework->id();
            const ExecutorID& executorId = executor->id;

            // The loop body looks the framework and executor up again
            // by id on every iteration (the ids and the message are
            // captured by value) and stops as soon as the agent leaves
            // the RECOVERING state, the framework or executor is gone,
            // or the executor is no longer REGISTERING.
            process::loop(
                self(),
                [retryInterval]() {
                  return after(retryInterval);
                },
                [this, frameworkId, executorId, message](Nothing)
                    -> ControlFlow<Nothing> {
                  if (state != RECOVERING) {
                    return Break();
                  }

                  Framework* framework = getFramework(frameworkId);
                  if (framework == nullptr) {
                    return Break();
                  }

                  Executor* executor = framework->getExecutor(executorId);
                  if (executor == nullptr) {
                    return Break();
                  }

                  if (executor->state != Executor::REGISTERING) {
                    return Break();
                  }

                  LOG(INFO) << "Re-sending reconnect request to executor "
                            << *executor;

                  send(executor->pid.get(), message);
                  return Continue();
                });
          }
        } else if (executor->pid.isNone()) {
          LOG(INFO) << "Waiting for executor " << *executor
                    << " to subscribe";
        } else {
          // The pid is set but evaluates to false, so there is nothing
          // to reconnect to.
          LOG(INFO) << "Unable to reconnect to executor " << *executor
                    << " because no pid or http checkpoint file was found";
        }
      } else {
        // For PID-based executors, we ask the executor to shut
        // down and give it time to terminate. For HTTP executors,
        // we do the same, however, the shutdown will only be sent
        // when the executor subscribes.
        if ((executor->pid.isSome() && executor->pid.get()) ||
            executor->pid.isNone()) {
          LOG(INFO) << "Sending shutdown to executor " << *executor;
          _shutdownExecutor(framework, executor);
        } else {
          LOG(INFO) << "Killing executor " << *executor
                    << " because no pid or http checkpoint file was found";

          containerizer->destroy(executor->containerId);
        }
      }
    }
  }

  if (!frameworks.empty() && flags.recover == "reconnect") {
    // Cleanup unregistered executors after a delay.
    delay(flags.executor_reregistration_timeout,
          self(),
          &Slave::reregisterExecutorTimeout);

    // We set 'recovered' flag inside reregisterExecutorTimeout(),
    // so that when the slave reregisters with master it can
    // correctly inform the master about the launched tasks.
    return recoveryInfo.recovered.future();
  }

  // Nothing to wait for.
  return Nothing();
}


void Slave::__recover(const Future<Nothing>& future)
{
  if (!future.isReady()) {
    EXIT(EXIT_FAILURE)
      << "Failed to perform recovery: "
      << (future.isFailed() ? future.failure() : "future discarded") << "\n"
      << "If recovery failed due to a change in configuration and you want to\n"
      << "keep the current agent id, you might want to change the\n"
      << "`--reconfiguration_policy` flag to a more permissive value.\n"
      << "\n"
      << "To restart this agent with a new agent id instead, do as follows:\n"
      << "rm -f " << paths::getLatestSlavePath(metaDir) << "\n"
      << "This ensures that the agent does not recover old live executors.\n"
      << "\n"
      << "If you use the Docker containerizer and think that the Docker\n"
      << "daemon state is broken, you can try to clear it. But be careful:\n"
      << "these commands will erase all containers and images from this host,\n"
      << "not just those started by Mesos!\n"
      << "docker kill $(docker ps -q)\n"
      << "docker rm $(docker ps -a -q)\n"
      << "docker rmi $(docker images -q)\n"
      << "\n"
      << "Finally, restart the agent.";
  }

  LOG(INFO) << "Finished recovery";

  CHECK_EQ(RECOVERING, state);

  // Checkpoint boot ID.
  Try<string> bootId = os::bootId();
  if (bootId.isError()) {
    LOG(ERROR) << "Could not retrieve boot id: " << bootId.error();
  } else {
    const string path = paths::getBootIdPath(metaDir);
    CHECK_SOME(state::checkpoint(path, bootId.get()));
  }

  // Schedule all old slave directories for garbage collection.
  // TODO(vinod): Do this as part of recovery. This needs a fix
  // in the recovery code, to recover all slaves instead of only
  // the latest slave.
  const string directory = path::join(flags.work_dir, "slaves");
  Try<list<string>> entries = os::ls(directory);
  if (entries.isSome()) {
    foreach (const string& entry, entries.get()) {
      string path = path::join(directory, entry);
      // Ignore non-directory entries.
      if (!os::stat::isdir(path)) {
        continue;
      }

      // We garbage collect a directory if either the slave has not
      // recovered its id (hence going to get a new id when it
      // registers with the master) or if it is an old work directory.
      SlaveID slaveId;
      slaveId.set_value(entry);
      if (!info.has_id() || slaveId != info.id()) {
        LOG(INFO) << "Garbage collecting old agent " << slaveId;

        // NOTE: We update the modification time of the slave work/meta
        // directories even though these are old because these
        // directories might not have been scheduled for gc before.

        // GC the slave work directory.
        os::utime(path); // Update the modification time.
        garbageCollect(path);

        // GC the slave meta directory.
        path = paths::getSlavePath(metaDir, slaveId);
        if (os::exists(path)) {
          os::utime(path); // Update the modification time.
          garbageCollect(path);
        }
      }
    }
  }

  if (flags.recover == "reconnect") {
    state = DISCONNECTED;

    // Start detecting masters.
    detection = detector->detect()
      .onAny(defer(self(), &Slave::detected, lambda::_1));

    if (info.has_id()) {
      initializeResourceProviderManager(flags, info.id());
    }

    // Forward oversubscribed resources.
    forwardOversubscribed();

    // Start acting on correction from QoS Controller.
    qosCorrections();
  } else {
    // Slave started in cleanup mode.
    CHECK_EQ("cleanup", flags.recover);
    state = TERMINATING;

    if (frameworks.empty()) {
      terminate(self());
    }

    // If there are active executors/frameworks, the slave will
    // shutdown when all the executors are terminated. Note that
    // the executors are guaranteed to terminate because they
    // are sent shutdown signal in '_recover()' which results in
    // 'Containerizer::destroy()' being called if the termination
    // doesn't happen within a timeout.
  }

  recoveryInfo.recovered.set(Nothing()); // Signal recovery.

  metrics.setRecoveryTime(process::Clock::now() - startTime);
}


void Slave::recoverFramework(
    const FrameworkState& state,
    const hashset<ExecutorID>& executorsToRecheckpoint,
    const hashmap<ExecutorID, hashset<TaskID>>& tasksToRecheckpoint)
{
  LOG(INFO) << "Recovering framework " << state.id;

  if (state.executors.empty()) {
    // GC the framework work directory.
    garbageCollect(
        paths::getFrameworkPath(flags.work_dir, info.id(), state.id));

    // GC the framework meta directory.
    garbageCollect(
        paths::getFrameworkPath(metaDir, info.id(), state.id));

    return;
  }

  CHECK(!frameworks.contains(state.id));

  CHECK_SOME(state.info);
  FrameworkInfo frameworkInfo = state.info.get();

  // Mesos 0.22 and earlier didn't write the FrameworkID into the FrameworkInfo.
  // In this case, we we update FrameworkInfo.framework_id from directory name,
  // and rewrite the new format when we are done.
  bool recheckpoint = false;
  if (!frameworkInfo.has_id()) {
    frameworkInfo.mutable_id()->CopyFrom(state.id);
    recheckpoint = true;
  }

  CHECK(frameworkInfo.has_id());
  CHECK(frameworkInfo.checkpoint());

  // In 0.24.0, HTTP schedulers are supported and these do not
  // have a 'pid'. In this case, the slave will checkpoint UPID().
  CHECK_SOME(state.pid);

  Option<UPID> pid = state.pid.get();

  if (pid.get() == UPID()) {
    pid = None();
  }

  Framework* framework = new Framework(
      this, flags, frameworkInfo, pid);

  frameworks[framework->id()] = framework;

  if (recheckpoint) {
    framework->checkpointFramework();
  }

  // Now recover the executors for this framework.
  foreachvalue (const ExecutorState& executorState, state.executors) {
    framework->recoverExecutor(
        executorState,
        executorsToRecheckpoint.contains(executorState.id),
        tasksToRecheckpoint.contains(executorState.id)
            ? tasksToRecheckpoint.at(executorState.id)
            : hashset<TaskID>{});
  }

  // Remove the framework in case we didn't recover any executors.
  if (framework->executors.empty()) {
    removeFramework(framework);
  }
}


// Schedules `path` for garbage collection.
//
// The delay handed to the garbage collector is `flags.gc_delay`
// reduced by the time that has passed since the path's modification
// time. Returns a failure if the modification time cannot be read;
// otherwise returns whatever `gc->schedule` returns.
Future<Nothing> Slave::garbageCollect(const string& path)
{
  Try<long> modified = os::stat::mtime(path);
  if (modified.isError()) {
    LOG(ERROR) << "Failed to find the mtime of '" << path
               << "': " << modified.error();
    return Failure(modified.error());
  }

  // It is unsafe for testing to use unix time directly, we must use
  // Time::create to convert into a Time object that reflects the
  // possibly advanced state of the libprocess Clock.
  Try<Time> modifiedTime = Time::create(modified.get());
  CHECK_SOME(modifiedTime);

  // GC based on the modification time.
  const Duration age = Clock::now() - modifiedTime.get();
  const Duration remaining = flags.gc_delay - age;

  return gc->schedule(remaining, path);
}


void Slave::forwardOversubscribed()
{
  VLOG(2) << "Querying resource estimator for oversubscribable resources";

  resourceEstimator->oversubscribable()
    .onAny(defer(self(), &Self::_forwardOversubscribed, lambda::_1));
}


void Slave::_forwardOversubscribed(const Future<Resources>& oversubscribable)
{
  if (!oversubscribable.isReady()) {
    LOG(ERROR) << "Failed to get oversubscribable resources: "
               << (oversubscribable.isFailed()
                   ? oversubscribable.failure() : "future discarded");
  } else {
    VLOG(2) << "Received oversubscribable resources "
            << oversubscribable.get() << " from the resource estimator";

    // Oversubscribable resources must be tagged as revocable.
    //
    // TODO(bmahler): Consider tagging input as revocable
    // rather than rejecting and crashing here.
    CHECK_EQ(oversubscribable.get(), oversubscribable->revocable());

    auto unallocated = [](const Resources& resources) {
      Resources result = resources;
      result.unallocate();
      return result;
    };

    // Calculate the latest allocation of oversubscribed resources.
    // Note that this allocation value might be different from the
    // master's view because new task/executor might be in flight from
    // the master or pending on the slave etc. This is ok because the
    // allocator only considers the slave's view of allocation when
    // calculating the available oversubscribed resources to offer.
    Resources oversubscribed;
    foreachvalue (Framework* framework, frameworks) {
      oversubscribed += unallocated(
          framework->allocatedResources().revocable());
    }

    // Add oversubscribable resources to the total.
    oversubscribed += oversubscribable.get();

    // Only forward the estimate if it's different from the previous
    // estimate. We also send this whenever we get (re-)registered
    // (i.e. whenever we transition into the RUNNING state).
    if (state == RUNNING && oversubscribedResources != oversubscribed) {
      LOG(INFO) << "Forwarding total oversubscribed resources "
                << oversubscribed;

      // We do not update the agent's resource version since
      // oversubscribed resources cannot be used for any operations
      // but launches. Since oversubscription is run at regular
      // intervals updating the version could cause a lot of
      // operation churn.
      //
      // TODO(bbannier): Revisit this if we modify the operations
      // possible on oversubscribed resources.

      UpdateSlaveMessage message;
      message.mutable_slave_id()->CopyFrom(info.id());
      message.set_update_oversubscribed_resources(true);
      message.mutable_oversubscribed_resources()->CopyFrom(oversubscribed);

      CHECK_SOME(master);
      send(master.get(), message);
    }

    // Update the estimate.
    oversubscribedResources = oversubscribed;
  }

  delay(flags.oversubscribed_resources_interval,
        self(),
        &Self::forwardOversubscribed);
}


// Builds an `UpdateSlaveMessage` carrying the agent's resource
// version, the operations for which no resource provider id is
// found, and one entry per resource provider that has reported a
// resource version. `update_oversubscribed_resources` is set to
// false.
UpdateSlaveMessage Slave::generateResourceProviderUpdate() const
{
  // Agent information (total resources, operations, resource
  // versions) is not passed as part of some `ResourceProvider`, but
  // globally in `UpdateStateMessage`.
  //
  // TODO(bbannier): Pass agent information as a resource provider.
  UpdateSlaveMessage message;
  message.mutable_slave_id()->CopyFrom(info.id());
  message.set_update_oversubscribed_resources(false);
  message.mutable_resource_version_uuid()->CopyFrom(resourceVersion);

  // Touching the field marks it as set even if nothing is added.
  auto* agentOperations = message.mutable_operations();

  foreachvalue (const Operation* operation, operations) {
    Result<ResourceProviderID> providerId =
      getResourceProviderId(operation->info());

    // Operations with a resource provider id are reported with
    // their resource provider below.
    if (providerId.isNone()) {
      agentOperations->add_operations()->CopyFrom(*operation);
    }
  }

  // Always add a `resource_providers` field so we can distinguish the
  // empty and unset case.
  UpdateSlaveMessage::ResourceProviders* providers =
    message.mutable_resource_providers();

  foreachvalue (ResourceProvider* resourceProvider, resourceProviders) {
    // If the resource provider has not updated its state we do not
    // need to and cannot include its information in an
    // `UpdateSlaveMessage` since it requires a resource version.
    if (resourceProvider->resourceVersion.isNone()) {
      continue;
    }

    UpdateSlaveMessage::ResourceProvider* entry = providers->add_providers();

    entry->mutable_info()->CopyFrom(resourceProvider->info);
    entry->mutable_total_resources()->CopyFrom(
        resourceProvider->totalResources);
    entry->mutable_resource_version_uuid()->CopyFrom(
        resourceProvider->resourceVersion.get());

    // As above, the field is set even without any operations.
    auto* providerOperations = entry->mutable_operations();

    foreachvalue (const Operation* operation,
                  resourceProvider->operations) {
      providerOperations->add_operations()->CopyFrom(*operation);
    }
  }

  return message;
}


// Returns the message produced by `generateResourceProviderUpdate`,
// additionally carrying the last stored oversubscribed resources if
// there are any.
UpdateSlaveMessage Slave::generateUpdateSlaveMessage() const
{
  UpdateSlaveMessage message = generateResourceProviderUpdate();

  // No stored estimate: the message is returned as generated.
  if (oversubscribedResources.isNone()) {
    return message;
  }

  message.set_update_oversubscribed_resources(true);
  message.mutable_oversubscribed_resources()->CopyFrom(
      oversubscribedResources.get());

  return message;
}


// Handles one message from the resource provider manager and then
// waits for the next one.
//
// Depending on the message type this
//   * SUBSCRIBE: starts tracking a resource provider or refreshes
//     its info,
//   * UPDATE_STATE: syncs the provider's total resources, operations
//     and resource version,
//   * UPDATE_OPERATION_STATUS: updates the tracked operation (if
//     known) and relays the status update,
//   * DISCONNECT: stops tracking the provider and its resources,
//   * REMOVE: terminates the provider's pending operations and stops
//     tracking the provider and its resources.
//
// Anything destined for the master is only sent while the agent is
// in the RUNNING state.
void Slave::handleResourceProviderMessage(
    const Future<ResourceProviderMessage>& message)
{
  // Ignore terminal messages which are not ready. These
  // can arise e.g., if the `Future` was discarded.
  if (!message.isReady()) {
    LOG(ERROR) << "Last resource provider message became terminal before "
                  "becoming ready: "
               << (message.isFailed() ? message.failure() : "future discarded");

    // Wait for the next message.
    CHECK_NOTNULL(resourceProviderManager.get())->messages().get()
      .onAny(defer(self(), &Self::handleResourceProviderMessage, lambda::_1));

    return;
  }

  LOG(INFO) << "Handling resource provider message '" << message.get() << "'";

  switch(message->type) {
    case ResourceProviderMessage::Type::SUBSCRIBE: {
      CHECK_SOME(message->subscribe);

      const ResourceProviderMessage::Subscribe& subscribe =
        message->subscribe.get();

      CHECK(subscribe.info.has_id());

      ResourceProvider* resourceProvider =
        getResourceProvider(subscribe.info.id());

      if (resourceProvider == nullptr) {
        resourceProvider = new ResourceProvider(subscribe.info, {}, None());

        addResourceProvider(resourceProvider);
      } else {
        // Always update the resource provider info.
        resourceProvider->info = subscribe.info;
      }
      break;
    }
    case ResourceProviderMessage::Type::UPDATE_STATE: {
      CHECK_SOME(message->updateState);

      const ResourceProviderMessage::UpdateState& updateState =
        message->updateState.get();

      ResourceProvider* resourceProvider =
        getResourceProvider(updateState.resourceProviderId);

      CHECK(resourceProvider);

      if (resourceProvider->totalResources != updateState.totalResources) {
        // Update the 'total' in the Slave.
        CHECK(totalResources.contains(resourceProvider->totalResources));
        totalResources -= resourceProvider->totalResources;
        totalResources += updateState.totalResources;

        // Update the 'total' in the resource provider.
        resourceProvider->totalResources = updateState.totalResources;
      }

      // Update operation state.
      //
      // We only update operations which are not contained in both
      // the known and just received sets. All other operations will
      // be updated via relayed operation status updates.
      const hashset<UUID> knownUuids = resourceProvider->operations.keys();
      const hashset<UUID> receivedUuids = updateState.operations.keys();

      // Handle operations known to the agent but not reported by
      // the resource provider. These could be operations where the
      // agent has started tracking an operation, but the resource
      // provider failed over before it could bookkeep the
      // operation.
      //
      // NOTE: We do not mutate operations statuses here; this would
      // be the responsibility of an operation status update handler.
      hashset<UUID> disappearedUuids = knownUuids - receivedUuids;
      foreach (const UUID& uuid, disappearedUuids) {
        // TODO(bbannier): Instead of simply dropping an operation
        // with `removeOperation` here we should instead send a
        // `Reconcile` message with a failed state to the resource
        // provider so its status update manager can reliably
        // deliver the operation status to the framework.
        removeOperation(resourceProvider->operations.at(uuid));
      }

      // Handle operations known to the resource provider but not
      // the agent. This can happen if the agent failed over and the
      // resource provider reregistered.
      hashset<UUID> reappearedUuids = receivedUuids - knownUuids;
      foreach (const UUID& uuid, reappearedUuids) {
        // Start tracking this operation.
        //
        // NOTE: We do not need to update total resources here as its
        // state was synced explicitly with the received total above.
        addOperation(new Operation(updateState.operations.at(uuid)));
      }

      // Handle operations known to both the agent and the resource provider.
      //
      // If an operation became terminal, its result is already reflected in
      // the total resources reported by the resource provider, and thus it
      // should not be applied again in an operation status update handler
      // when its terminal status update arrives. So we set the terminal
      // `latest_status` here to prevent resource conversions elsewhere.
      //
      // NOTE: We only update the `latest_status` of a known operation if it
      // is not terminal yet here; its `statuses` would be updated by an
      // operation status update handler.
      hashset<UUID> matchedUuids = knownUuids - disappearedUuids;
      foreach (const UUID& uuid, matchedUuids) {
        const Operation& operation = updateState.operations.at(uuid);
        if (operation.has_latest_status() &&
            protobuf::isTerminalState(operation.latest_status().state())) {
          updateOperationLatestStatus(
              getOperation(uuid),
              operation.latest_status());
        }
      }

      // Update resource version of this resource provider.
      resourceProvider->resourceVersion = updateState.resourceVersion;

      // Send the updated resources to the master if the agent is running. Note
      // that since we have already updated our copy of the latest resource
      // provider resources, it is safe to consume this message and wait for the
      // next one; even if we do not send the update to the master right now, an
      // update will be sent once the agent reregisters.
      switch (state) {
        case RECOVERING:
        case DISCONNECTED:
        case TERMINATING: {
          break;
        }
        case RUNNING: {
          LOG(INFO) << "Forwarding new total resources " << totalResources;

          // Inform the master about the update from the resource provider.
          send(master.get(), generateResourceProviderUpdate());

          break;
        }
      }
      break;
    }
    case ResourceProviderMessage::Type::UPDATE_OPERATION_STATUS: {
      CHECK_SOME(message->updateOperationStatus);

      // The status update from the resource provider didn't provide
      // the agent ID (because the resource provider doesn't know it),
      // hence we inject it here.
      UpdateOperationStatusMessage update =
        message->updateOperationStatus->update;

      update.mutable_slave_id()->CopyFrom(info.id());
      update.mutable_status()->mutable_slave_id()->CopyFrom(info.id());
      if (update.has_latest_status()) {
        update.mutable_latest_status()->mutable_slave_id()->CopyFrom(info.id());
      }

      const UUID& operationUUID = update.operation_uuid();

      Operation* operation = getOperation(operationUUID);

      if (operation != nullptr) {
        // It is possible for the resource provider to forget or incorrectly
        // copy the OperationID in its status update. We make sure the ID
        // is filled in with the correct value before proceeding.
        if (operation->info().has_id()) {
          update.mutable_status()->mutable_operation_id()
            ->CopyFrom(operation->info().id());

          if (update.has_latest_status()) {
            update.mutable_latest_status()->mutable_operation_id()
              ->CopyFrom(operation->info().id());
          }
        }

        // The agent might not know about the operation in the
        // following cases:
        //
        // Case 1:
        // (1) The agent sends to a resource provider an ACK for a
        //     terminal operation status update and removes the
        //     operation.
        // (2) The resource provider doesn't get the ACK.
        // (3) The resource provider's status update manager resends
        //     the operation status update.
        //
        // Case 2:
        // (1) The master knows an operation that the agent doesn't
        //     know, because an ApplyOperationMessage was dropped.
        // (2) The master sends a ReconcileOperationsMessage
        //     message to the agent, who forwards it to a resource
        //     provider.
        // (3) The resource provider doesn't know the operation, so it
        //     sends an operation status update with the state
        //     OPERATION_DROPPED.
        //
        // In both cases the agent should not update its internal
        // state, but it should still forward the operation status
        // update.
        updateOperation(operation, update);
      }

      switch (state) {
        case RECOVERING:
        case DISCONNECTED:
        case TERMINATING: {
          LOG(WARNING)
            << "Dropping status update of operation"
            << (update.status().has_operation_id()
                 ? " '" + stringify(update.status().operation_id()) + "'"
                 : " with no ID")
            << " (operation_uuid: " << operationUUID << ")"
            << (update.has_framework_id()
                 ? " for framework " + stringify(update.framework_id())
                 : " for an operator API call")
            << " because agent is in " << state << " state";
          break;
        }
        case RUNNING: {
          LOG(INFO)
            << "Forwarding status update of"
            << (operation == nullptr ? " unknown" : "") << " operation"
            << (update.status().has_operation_id()
                 ? " '" + stringify(update.status().operation_id()) + "'"
                 : " with no ID")
            << " (operation_uuid: " << operationUUID << ")"
            << (update.has_framework_id()
                 ? " for framework " + stringify(update.framework_id())
                 : " for an operator API call");

          send(master.get(), update);
          break;
        }
      }
      break;
    }
    case ResourceProviderMessage::Type::DISCONNECT: {
      CHECK_SOME(message->disconnect);

      const ResourceProviderID& resourceProviderId =
        message->disconnect->resourceProviderId;

      ResourceProvider* resourceProvider =
        getResourceProvider(resourceProviderId);

      if (resourceProvider == nullptr) {
        LOG(ERROR) << "Failed to find the disconnected resource provider "
                   << resourceProviderId << ", ignoring the message";
        break;
      }

      // Remove the resource provider's resources from the agent's
      // total resources and remove it from our internal tracking.
      CHECK(totalResources.contains(resourceProvider->totalResources));
      totalResources -= resourceProvider->totalResources;

      // NOTE(review): Only the map entry is erased; the
      // `ResourceProvider` object itself is not deleted here.
      // Confirm who owns it.
      resourceProviders.erase(resourceProviderId);

      // Send the updated resources to the master if the agent is running. Note
      // that since we have already updated our copy of the latest resource
      // provider resources, it is safe to consume this message and wait for the
      // next one; even if we do not send the update to the master right now, an
      // update will be sent once the agent reregisters.
      switch (state) {
        case RECOVERING:
        case DISCONNECTED:
        case TERMINATING: {
          break;
        }
        case RUNNING: {
          LOG(INFO) << "Forwarding new total resources " << totalResources;

          // Inform the master about the update from the resource provider.
          send(master.get(), generateResourceProviderUpdate());

          break;
        }
      }
      break;
    }
    case ResourceProviderMessage::Type::REMOVE: {
      CHECK_SOME(message->remove);

      const ResourceProviderID& resourceProviderId =
        message->remove->resourceProviderId;

      if (!resourceProviders.contains(resourceProviderId)) {
        break;
      }

      const ResourceProvider* resourceProvider =
        resourceProviders.at(resourceProviderId);

      CHECK_NOTNULL(resourceProvider);

      // Transition all non-terminal operations on the resource provider to a
      // terminal state.
      //
      // NOTE: We operate on a copy of the operations container since we trigger
      // removal of current operation in below loop. This invalidates the loop
      // iterator so it cannot be safely incremented after the loop body.
      const hashmap<UUID, Operation*> operations = resourceProvider->operations;
      foreachpair (const UUID& uuid, Operation * operation, operations) {
        CHECK_NOTNULL(operation);

        if (protobuf::isTerminalState(operation->latest_status().state())) {
          continue;
        }

        // The operation might be from an operator API call, thus the framework
        // ID here is optional.
        Option<FrameworkID> frameworkId =
          operation->has_framework_id()
            ? operation->framework_id()
            : Option<FrameworkID>::none();

        Option<OperationID> operationId =
          operation->info().has_id()
            ? operation->info().id()
            : Option<OperationID>::none();

        UpdateOperationStatusMessage update =
          protobuf::createUpdateOperationStatusMessage(
              uuid,
              protobuf::createOperationStatus(
                  OPERATION_GONE_BY_OPERATOR,
                  operationId,
                  "The resource provider was removed before a terminal "
                  "operation status update was received",
                  None(),
                  None(),
                  info.id()),
              None(),
              frameworkId);

        updateOperation(operation, update);

        // NOTE: `operation` is deleted by `removeOperation`; it must
        // not be used below.
        removeOperation(operation);

        // Forward the operation status update to the master, but only
        // while the agent is running, like every other send in this
        // function. In any other state `master` might be unset, and
        // the update is dropped with a warning instead.
        if (state == RUNNING) {
          // The generated status update does not carry the agent ID
          // at the message level, so we inject it here.
          UpdateOperationStatusMessage _update;
          _update.CopyFrom(update);
          _update.mutable_slave_id()->CopyFrom(info.id());
          send(master.get(), _update);
        } else {
          LOG(WARNING)
            << "Dropping status update of operation"
            << (operationId.isSome()
                 ? " '" + stringify(operationId.get()) + "'"
                 : " with no ID")
            << " (operation_uuid: " << uuid << ")"
            << (frameworkId.isSome()
                 ? " for framework " + stringify(frameworkId.get())
                 : " for an operator API call")
            << " because agent is in " << state << " state";
        }
      }

      // TODO(bbannier): Consider transitioning all tasks using resources from
      // this resource provider to e.g., `TASK_GONE_BY_OPERATOR` and terminating
      // them.

      // Remove the resources of the resource provider from the agent's total.
      // This needs to be done after triggering the operation status update so
      // that master does not receive an operation status update for an unknown
      // operation (gone from `UpdateSlaveMessage`).
      totalResources -= resourceProvider->totalResources;

      // NOTE(review): Only the map entry is erased; the
      // `ResourceProvider` object itself is not deleted here.
      // Confirm who owns it.
      resourceProviders.erase(resourceProviderId);

      switch (state) {
        case RECOVERING:
        case DISCONNECTED:
        case TERMINATING: {
          break;
        }
        case RUNNING: {
          LOG(INFO) << "Forwarding new total resources " << totalResources;

          // Inform the master about the updated resources.
          send(master.get(), generateResourceProviderUpdate());

          break;
        }
      }

      LOG(INFO) << "Removed resource provider '" << resourceProviderId << "'";
      break;
    }
  }

  // Wait for the next message.
  CHECK_NOTNULL(resourceProviderManager.get())->messages().get()
    .onAny(defer(self(), &Self::handleResourceProviderMessage, lambda::_1));
}


// Starts tracking `operation`: it is stored by UUID, indexed by
// (framework id, operation id) when both are present, and registered
// with its resource provider when the operation has one.
void Slave::addOperation(Operation* operation)
{
  operations.put(operation->uuid(), operation);

  // The id index only covers operations carrying both ids.
  const bool indexable =
    operation->info().has_id() && operation->has_framework_id();

  if (indexable) {
    operationIds.put(
        std::make_pair(operation->framework_id(), operation->info().id()),
        operation->uuid());
  }

  Result<ResourceProviderID> providerId =
    getResourceProviderId(operation->info());

  CHECK(!providerId.isError())
    << "Failed to get resource provider ID: "
    << providerId.error();

  // Operations without a resource provider need no further tracking.
  if (providerId.isNone()) {
    return;
  }

  // The resource provider must already be known to the agent.
  CHECK_NOTNULL(getResourceProvider(providerId.get()))
    ->addOperation(operation);
}


// Applies the operation status update `update` to the tracked
// `operation` (which must not be null).
//
// The operation's `latest_status` is updated via
// `updateOperationLatestStatus`, using the update's `latest_status`
// when present and its `status` otherwise. The update's `status` is
// appended to the operation's `statuses` unless a stored status with
// the same UUID exists. If this update made a non-speculative
// operation terminal with state OPERATION_FINISHED, the operation is
// applied via `apply`.
void Slave::updateOperation(
    Operation* operation,
    const UpdateOperationStatusMessage& update)
{
  CHECK_NOTNULL(operation);

  const OperationStatus& status = update.status();

  // The status deciding the operation's latest state.
  const OperationStatus& latestStatus =
    update.has_latest_status() ? update.latest_status() : status;

  // Whether the operation has just become terminated. This must be
  // computed before the stored latest status is updated below.
  const bool terminated =
    !protobuf::isTerminalState(operation->latest_status().state()) &&
    protobuf::isTerminalState(latestStatus.state());

  updateOperationLatestStatus(operation, latestStatus);

  // Adding the update's status to the stored operation below is the one place
  // in this function where we mutate the operation state irrespective of the
  // value of `terminated`. We check to see if this status update is a retry;
  // if so, we do nothing.
  bool isRetry = false;
  if (status.has_uuid()) {
    foreach (const OperationStatus& storedStatus, operation->statuses()) {
      if (storedStatus.has_uuid() && storedStatus.uuid() == status.uuid()) {
        isRetry = true;
        break;
      }
    }
  }

  if (!isRetry) {
    operation->add_statuses()->CopyFrom(status);
  }

  LOG(INFO) << "Updating the state of operation"
            << (operation->info().has_id()
                 ? " '" + stringify(operation->info().id()) + "'"
                 : " with no ID")
            << " (uuid: " << operation->uuid() << ")"
            << (operation->has_framework_id()
                 ? " for framework " + stringify(operation->framework_id())
                 : " for an operator API call")
            << " (latest state: " << operation->latest_status().state()
            << ", status update state: " << status.state() << ")";

  // Resources are only converted when this very update made the
  // operation terminal.
  if (!terminated) {
    return;
  }

  // Nothing is applied here for speculative operations.
  if (protobuf::isSpeculativeOperation(operation->info())) {
    return;
  }

  switch (operation->latest_status().state()) {
    // Terminal state, and the conversion is successful.
    case OPERATION_FINISHED: {
      apply(operation);
      break;
    }

    // Terminal state, and the conversion has failed.
    case OPERATION_DROPPED:
    case OPERATION_ERROR:
    case OPERATION_FAILED:
    case OPERATION_GONE_BY_OPERATOR: {
      break;
    }

    // Non-terminal or not sent by resource providers. This shouldn't happen.
    case OPERATION_UNSUPPORTED:
    case OPERATION_PENDING:
    case OPERATION_UNREACHABLE:
    case OPERATION_RECOVERING:
    case OPERATION_UNKNOWN: {
      LOG(FATAL)
        << "Unexpected operation state " << operation->latest_status().state();
    }
  }
}


// Overwrites the `latest_status` of `operation` (which must not be
// null) with `status`, unless the stored latest status is already
// terminal.
void Slave::updateOperationLatestStatus(
    Operation* operation,
    const OperationStatus& status)
{
  CHECK_NOTNULL(operation);

  // A terminal latest status is never replaced.
  if (protobuf::isTerminalState(operation->latest_status().state())) {
    return;
  }

  operation->mutable_latest_status()->CopyFrom(status);
}


// Stops tracking `operation` and deletes it: the operation is
// removed from its resource provider (if it has one), from the id
// index and from the UUID map. Afterwards the checkpointed resource
// state and the drain status are refreshed.
void Slave::removeOperation(Operation* operation)
{
  // NOTE: This refers into `*operation` and therefore must not be
  // used once the operation has been deleted below.
  const UUID& uuid = operation->uuid();

  Result<ResourceProviderID> providerId =
    getResourceProviderId(operation->info());

  CHECK(!providerId.isError())
    << "Failed to get resource provider ID: "
    << providerId.error();

  if (providerId.isSome()) {
    // The resource provider must still be known to the agent.
    CHECK_NOTNULL(getResourceProvider(providerId.get()))
      ->removeOperation(operation);
  }

  CHECK(operations.contains(uuid))
    << "Unknown operation (uuid: " << uuid << ")";

  // Only operations carrying both ids are part of the id index.
  const bool indexed =
    operation->info().has_id() && operation->has_framework_id();

  if (indexed) {
    operationIds.erase(
        std::make_pair(operation->framework_id(), operation->info().id()));
  }

  operations.erase(uuid);
  delete operation;

  checkpointResourceState(
      totalResources.filter(mesos::needCheckpointing), false);

  updateDrainStatus();
}


// Returns the tracked operation with the given `uuid`, or `nullptr`
// if no such operation is tracked.
Operation* Slave::getOperation(const UUID& uuid) const
{
  return operations.contains(uuid) ? operations.at(uuid) : nullptr;
}


// Starts tracking `resourceProvider` under its ID. The provider must
// already carry an ID, and that ID must not be tracked yet.
void Slave::addResourceProvider(ResourceProvider* resourceProvider)
{
  const auto& providerInfo = resourceProvider->info;

  CHECK(providerInfo.has_id());
  CHECK(!resourceProviders.contains(providerInfo.id()));

  resourceProviders.put(providerInfo.id(), resourceProvider);
}


// Returns the resource provider tracked under `id`, or `nullptr` if no
// such provider is known to the agent.
ResourceProvider* Slave::getResourceProvider(const ResourceProviderID& id) const
{
  auto found = resourceProviders.find(id);

  return found != resourceProviders.end() ? found->second : nullptr;
}


// Asks the resource provider manager to remove the given resource
// provider.
//
// Fails if the resource provider manager has not been created yet, or
// if the agent tracks the provider and it still has resources.
Future<Nothing> Slave::markResourceProviderGone(
    const ResourceProviderID& resourceProviderId) const
{
  // Builds the failure text for a given `reason`.
  auto describe = [&resourceProviderId](const string& reason) {
    return
      "Could not mark resource provider '" + stringify(resourceProviderId) +
      "' as gone: " + reason;
  };

  if (!resourceProviderManager.get()) {
    return Failure(describe("Agent has not registered yet"));
  }

  auto provider = resourceProviders.find(resourceProviderId);

  if (provider != resourceProviders.end() &&
      !provider->second->totalResources.empty()) {
    return Failure(describe("Resource provider has resources"));
  }

  return resourceProviderManager->removeResourceProvider(resourceProviderId);
}

// Applies the resource conversions of `operation` to the agent's
// `totalResources` and, if the operation targets a resource provider,
// to that provider's `totalResources` as well.
void Slave::apply(Operation* operation)
{
  // NOTE: 'totalResources' don't have allocations set, so allocation
  // info is removed from the conversions before they are applied.
  vector<ResourceConversion> conversions;

  if (!protobuf::isSpeculativeOperation(operation->info())) {
    // The converted resources of a non-speculative operation are only
    // known once it is terminal, so the conversion can only be applied
    // after the operation has finished.
    CHECK_EQ(OPERATION_FINISHED, operation->latest_status().state());

    Try<Resources> consumed = protobuf::getConsumedResources(operation->info());
    CHECK_SOME(consumed);

    Resources converted = operation->latest_status().converted_resources();

    consumed->unallocate();
    converted.unallocate();

    conversions.emplace_back(consumed.get(), converted);
  } else {
    Offer::Operation unallocated = operation->info();
    protobuf::stripAllocationInfo(&unallocated);

    Try<vector<ResourceConversion>> speculated =
      getResourceConversions(unallocated);

    CHECK_SOME(speculated);

    conversions = speculated.get();
  }

  // Update the agent-wide total first.
  Try<Resources> agentTotal = totalResources.apply(conversions);
  CHECK_SOME(agentTotal);

  totalResources = agentTotal.get();

  Result<ResourceProviderID> resourceProviderId =
    getResourceProviderId(operation->info());

  CHECK(!resourceProviderId.isError())
    << "Failed to get resource provider ID: "
    << resourceProviderId.error();

  // Operations without a resource provider only affect the agent total.
  if (resourceProviderId.isNone()) {
    return;
  }

  // Keep the resource provider's own total in sync with the agent's.
  ResourceProvider* resourceProvider =
    getResourceProvider(resourceProviderId.get());

  CHECK_NOTNULL(resourceProvider);

  Try<Resources> providerTotal =
    resourceProvider->totalResources.apply(conversions);

  CHECK_SOME(providerTotal);

  resourceProvider->totalResources = providerTotal.get();
}


// Asks the resource provider manager to publish the resources in
// `resources` that belong to a resource provider, on behalf of the
// container `containerId`.
//
// Resources without a provider ID are ignored here. For every provider
// ID found in `resources`, one publish request is issued; it contains
// the container's resources from that provider plus the resources from
// that provider allocated to every other executor on the agent.
//
// Returns the result of `collect`ing all publish requests, with each
// request's failure rewritten into a message naming the container. If
// at least one provider is involved but no executor with `containerId`
// exists, a warning is logged and `Nothing()` is returned without
// issuing (further) requests.
Future<Nothing> Slave::publishResources(
    const ContainerID& containerId, const Resources& resources)
{
  // Determine the set of resource providers involved.
  hashset<ResourceProviderID> resourceProviderIds;
  foreach (const Resource& resource, resources) {
    if (resource.has_provider_id()) {
      resourceProviderIds.insert(resource.provider_id());
    }
  }

  vector<Future<Nothing>> futures;
  foreach (const ResourceProviderID& resourceProviderId, resourceProviderIds) {
    // Selects the resources that come from the current resource provider.
    auto hasResourceProviderId = [&](const Resource& resource) {
      return resource.has_provider_id() &&
             resource.provider_id() == resourceProviderId;
    };

    // NOTE: For resources providers that serve quantity-based resources without
    // identifier (such as cpus and mem), we cannot achieve idempotency with
    // diff-based resource publishing, so we have to implement the "ensure-all"
    // semantics, and always calculate the total resources to publish.
    Option<Resources> containerResources;
    Resources complementaryResources;
    foreachvalue (const Framework* framework, frameworks) {
      foreachvalue (const Executor* executor, framework->executors) {
        if (executor->containerId == containerId) {
          // For the target container, use the requested `resources`
          // rather than what is currently allocated to the executor.
          containerResources = resources.filter(hasResourceProviderId);
        } else {
          complementaryResources +=
            executor->allocatedResources().filter(hasResourceProviderId);
        }
      }
    }

    if (containerResources.isNone()) {
      // NOTE: This actually should not happen, as the callers have already
      // ensured the existence of the executor before calling this function
      // synchronously. However we still treat this as a nonfatal error since
      // this might change in the future.
      LOG(WARNING) << "Ignoring publishing resources for container "
                   << containerId << ": Executor cannot be found";

      return Nothing();
    }

    // Since we already have resources from any resource provider in the
    // resource pool, the resource provider manager must have been created.
    //
    // The continuation captures `containerResources` and `containerId`
    // by copy, so it does not depend on this stack frame.
    futures.push_back(
        CHECK_NOTNULL(resourceProviderManager.get())
          ->publishResources(containerResources.get() + complementaryResources)
          .repair([=](const Future<Nothing>& future) -> Future<Nothing> {
            // TODO(chhsiao): Consider surfacing the set of published resources
            // and only fail if `published - complementaryResources` does not
            // contain `containerResources`.
            return Failure(
                "Failed to publish resources '" +
                stringify(containerResources.get()) + "' for container " +
                stringify(containerId) + ": " + future.failure());
          }));
  }

  // NOTE: Resource cleanups (e.g., unpublishing) are not performed at task
  // completion, but rather done __lazily__ when necessary. This is not just an
  // optimization but required because resource allocations are tied to task
  // lifecycles. As a result, no cleanup is needed here if any future fails.
  return collect(futures).then([] { return Nothing(); });
}


// Requests corrections from the QoS controller and hands the outcome,
// whatever it is, to `_qosCorrections` on this actor.
void Slave::qosCorrections()
{
  auto corrections = qosController->corrections();

  corrections.onAny(defer(self(), &Self::_qosCorrections, lambda::_1));
}


// Continuation of `qosCorrections()`: schedules the next round of
// corrections and then applies the ones delivered in `future`.
//
// Nothing is applied while the agent is recovering or terminating, or
// if `future` is not ready. Only `QoSCorrection::KILL` corrections are
// handled; other types are logged and ignored. A KILL is carried out by
// destroying the executor's container, and is skipped with a warning
// when it lacks a framework or executor ID, when the framework or
// executor is unknown, when the framework is terminating, when the
// named container is not the executor's current container, or when the
// executor is already terminating or terminated.
void Slave::_qosCorrections(const Future<list<QoSCorrection>>& future)
{
  // Make sure correction handler is scheduled again.
  delay(flags.qos_correction_interval_min, self(), &Self::qosCorrections);

  // Verify slave state.
  CHECK(state == RECOVERING || state == DISCONNECTED ||
        state == RUNNING || state == TERMINATING)
    << state;

  if (state == RECOVERING || state == TERMINATING) {
    LOG(WARNING) << "Cannot perform QoS corrections because the agent is "
                 << state;
    return;
  }

  if (!future.isReady()) {
    LOG(WARNING) << "Failed to get corrections from QoS Controller: "
                  << (future.isFailed() ? future.failure() : "discarded");
    return;
  }

  const list<QoSCorrection>& corrections = future.get();

  VLOG(1) << "Received " << corrections.size() << " QoS corrections";

  foreach (const QoSCorrection& correction, corrections) {
    // TODO(nnielsen): Print correction, once the operator overload
    // for QoSCorrection has been implemented.
    if (correction.type() == QoSCorrection::KILL) {
      const QoSCorrection::Kill& kill = correction.kill();

      if (!kill.has_framework_id()) {
        LOG(WARNING) << "Ignoring QoS correction KILL: "
                     << "framework id not specified.";
        continue;
      }

      const FrameworkID& frameworkId = kill.framework_id();

      if (!kill.has_executor_id()) {
        // TODO(nnielsen): For now, only executor killing is supported. Check
        // can be removed when task killing is supported as well.
        LOG(WARNING) << "Ignoring QoS correction KILL on framework "
                     << frameworkId << ": executor id not specified";
        continue;
      }

      const ExecutorID& executorId = kill.executor_id();

      Framework* framework = getFramework(frameworkId);
      if (framework == nullptr) {
        LOG(WARNING) << "Ignoring QoS correction KILL on framework "
                     << frameworkId << ": framework cannot be found";
        continue;
      }

      // Verify framework state.
      CHECK(framework->state == Framework::RUNNING ||
            framework->state == Framework::TERMINATING)
        << framework->state;

      if (framework->state == Framework::TERMINATING) {
        LOG(WARNING) << "Ignoring QoS correction KILL on framework "
                     << frameworkId << ": framework is terminating.";
        continue;
      }

      Executor* executor = framework->getExecutor(executorId);
      if (executor == nullptr) {
        LOG(WARNING) << "Ignoring QoS correction KILL on executor '"
                     << executorId << "' of framework " << frameworkId
                     << ": executor cannot be found";
        continue;
      }

      // A correction that does not name a container targets the
      // executor's current container; one that does must match it.
      const ContainerID containerId =
          kill.has_container_id() ? kill.container_id() : executor->containerId;
      if (containerId != executor->containerId) {
        LOG(WARNING) << "Ignoring QoS correction KILL on container '"
                     << containerId << "' for executor " << *executor
                     << ": container cannot be found";
        continue;
      }

      switch (executor->state) {
        case Executor::REGISTERING:
        case Executor::RUNNING: {
          LOG(INFO) << "Killing container '" << containerId
                    << "' for executor " << *executor
                    << " as QoS correction";

          containerizer->destroy(containerId);

          // TODO(nnielsen): We should ensure that we are addressing
          // the _container_ which the QoS controller intended to
          // kill. Without this check, we may run into a scenario
          // where the executor has terminated and one with the same
          // id has started in the interim i.e. running in a different
          // container than the one the QoS controller targeted
          // (MESOS-2875).
          executor->state = Executor::TERMINATING;

          // Send TASK_GONE because the task was started but has now
          // been terminated. If the framework is not partition-aware,
          // we send TASK_LOST instead for backward compatibility.
          mesos::TaskState taskState = TASK_GONE;
          if (!protobuf::frameworkHasCapability(
                  framework->info,
                  FrameworkInfo::Capability::PARTITION_AWARE)) {
            taskState = TASK_LOST;
          }

          ContainerTermination termination;
          termination.set_state(taskState);
          termination.set_reason(TaskStatus::REASON_CONTAINER_PREEMPTED);
          termination.set_message("Container preempted by QoS correction");

          // Record the state, reason and message on the executor;
          // `sendExecutorTerminatedStatusUpdate` falls back to this
          // pending termination for fields the container's own
          // termination does not provide.
          executor->pendingTermination = termination;

          ++metrics.executors_preempted;
          break;
        }
        case Executor::TERMINATING:
        case Executor::TERMINATED:
          LOG(WARNING) << "Ignoring QoS correction KILL on executor "
                       << *executor << " because the executor is in "
                       << executor->state << " state";
          break;
        default:
          LOG(FATAL) << "Executor " << *executor << " is in unexpected state "
                     << executor->state;
          break;
      }
    } else {
      LOG(WARNING) << "QoS correction type " << correction.type()
                   << " is not supported";
    }
  }
}


// Builds a `ResourceUsage` snapshot of the agent: one entry per
// executor that has not terminated (with its launched tasks and, when
// available, container statistics) plus the agent's total resources.
Future<ResourceUsage> Slave::usage()
{
  // NOTE: The snapshot is held in an 'Owned' to avoid an expensive copy
  // into the continuation below. C++11 lambdas can only capture
  // variables that have copy constructors. Revisit once the copy
  // constructor of Owned is removed (or C++14 generalized lambda
  // capture is supported).
  Owned<ResourceUsage> usage(new ResourceUsage());
  vector<Future<ResourceStatistics>> statistics;

  foreachvalue (const Framework* framework, frameworks) {
    foreachvalue (const Executor* executor, framework->executors) {
      // Statistics and status of an executor that has already
      // terminated are of no interest.
      if (executor->state == Executor::TERMINATED) {
        continue;
      }

      ResourceUsage::Executor* executorUsage = usage->add_executors();
      executorUsage->mutable_executor_info()->CopyFrom(executor->info);
      executorUsage->mutable_allocated()->CopyFrom(
          executor->allocatedResources());
      executorUsage->mutable_container_id()->CopyFrom(executor->containerId);

      // Launched (non-terminal) tasks are part of the usage entry.
      foreachvalue (const Task* task, executor->launchedTasks) {
        ResourceUsage::Executor::Task* taskUsage = executorUsage->add_tasks();
        taskUsage->set_name(task->name());
        taskUsage->mutable_id()->CopyFrom(task->task_id());
        taskUsage->mutable_resources()->CopyFrom(task->resources());

        if (task->has_labels()) {
          taskUsage->mutable_labels()->CopyFrom(task->labels());
        }
      }

      statistics.push_back(containerizer->usage(executor->containerId));
    }
  }

  usage->mutable_total()->CopyFrom(totalResources);

  return await(statistics).then(
      [usage](const vector<Future<ResourceStatistics>>& results) {
        // NOTE: Executor entries were added to 'usage' in the same
        // order as the statistics futures were requested, so entry
        // `i` and result `i` describe the same executor.
        CHECK_EQ(results.size(), static_cast<size_t>(usage->executors_size()));

        for (size_t i = 0; i < results.size(); ++i) {
          const Future<ResourceStatistics>& result = results[i];

          ResourceUsage::Executor* executor =
            usage->mutable_executors(static_cast<int>(i));

          if (!result.isReady()) {
            LOG(WARNING) << "Failed to get resource statistics for executor '"
                         << executor->executor_info().executor_id() << "'"
                         << " of framework "
                         << executor->executor_info().framework_id() << ": "
                         << (result.isFailed() ? result.failure()
                                               : "discarded");
            continue;
          }

          executor->mutable_statistics()->CopyFrom(result.get());
        }

        return Future<ResourceUsage>(*usage);
      });
}


// As a principle, actions that the master has already authorized are
// not authorized again on the agent. RUN_TASK is the exception; it is
// re-authorized here even though the master has authorized it because:
// a) in cases where hosts have heterogeneous user-account configurations,
//    it makes sense to set the ACL on the agent instead of on the master
// b) compared to other actions such as killing a task and shutting down a
//    framework, it's a greater security risk if malicious tasks are launched
//    as a superuser on the agent.
//
// Returns `true` right away when no authorizer is configured; otherwise
// returns the authorizer's decision.
Future<bool> Slave::authorizeTask(
    const TaskInfo& task,
    const FrameworkInfo& frameworkInfo)
{
  if (authorizer.isNone()) {
    return true;
  }

  const bool hasPrincipal = frameworkInfo.has_principal();

  // Assemble the RUN_TASK authorization request.
  authorization::Request request;
  request.set_action(authorization::RUN_TASK);

  // The subject is left unset when the framework has no principal.
  if (hasPrincipal) {
    request.mutable_subject()->set_value(frameworkInfo.principal());
  }

  authorization::Object* subjectOf = request.mutable_object();
  subjectOf->mutable_task_info()->CopyFrom(task);
  subjectOf->mutable_framework_info()->CopyFrom(frameworkInfo);

  LOG(INFO)
    << "Authorizing framework principal '"
    << (hasPrincipal ? frameworkInfo.principal() : "ANY")
    << "' to launch task " << task.task_id();

  return authorizer.get()->authorized(request);
}


// Decides whether `principal` may access the sandbox of the given
// executor of the given framework.
//
// Returns `true` right away when no authorizer is configured. Otherwise
// the decision is made by the ACCESS_SANDBOX approver, which is given
// the framework and executor info when the agent knows them.
Future<bool> Slave::authorizeSandboxAccess(
    const Option<Principal>& principal,
    const FrameworkID& frameworkId,
    const ExecutorID& executorId)
{
  if (authorizer.isNone()) {
    return true;
  }

  return ObjectApprovers::create(authorizer, principal, {ACCESS_SANDBOX})
    .then(defer(
        self(),
        [=](const Owned<ObjectApprovers>& approvers) -> Future<bool> {
          // Fill in as much of the authorization object as is known;
          // unknown frameworks/executors leave the fields unset.
          ObjectApprover::Object object;

          auto framework = frameworks.find(frameworkId);

          if (framework != frameworks.end()) {
            object.framework_info = &(framework->second->info);

            auto executor = framework->second->executors.find(executorId);

            if (executor != framework->second->executors.end()) {
              object.executor_info = &(executor->second->info);
            }
          }

          return approvers->approved<ACCESS_SANDBOX>(object);
        }));
}


void Slave::sendExecutorTerminatedStatusUpdate(
    const TaskID& taskId,
    const Future<Option<ContainerTermination>>& termination,
    const FrameworkID& frameworkId,
    const Executor* executor)
{
  CHECK_NOTNULL(executor);

  mesos::TaskState state;
  TaskStatus::Reason reason;
  string message;

  const bool haveTermination = termination.isReady() && termination->isSome();

  // Determine the task state for the status update.
  if (haveTermination && termination->get().has_state()) {
    state = termination->get().state();
  } else if (executor->pendingTermination.isSome() &&
             executor->pendingTermination->has_state()) {
    state = executor->pendingTermination->state();
  } else {
    state = TASK_FAILED;
  }

  // Determine the task reason for the status update.
  if (haveTermination && termination->get().has_reason()) {
    reason = termination->get().reason();
  } else if (executor->pendingTermination.isSome() &&
             executor->pendingTermination->has_reason()) {
    reason = executor->pendingTermination->reason();
  } else {
    reason = TaskStatus::REASON_EXECUTOR_TERMINATED;
  }

  // Determine the message for the status update.
  vector<string> messages;

  if (executor->pendingTermination.isSome() &&
      executor->pendingTermination->has_message()) {
    messages.push_back(executor->pendingTermination->message());
  }

  if (!termination.isReady()) {
    messages.push_back(
        "Abnormal executor termination: " +
        (termination.isFailed() ? termination.failure() : "discarded future"));
  } else if (termination->isNone()) {
    messages.push_back("Abnormal executor termination: unknown container");
  } else if (termination->get().has_message()) {
    messages.push_back(termination->get().message());
  }

  if (messages.empty()) {
    message = "Executor terminated";
  } else {
    message = strings::join("; ", messages);
  }

  Option<Resources> limitedResources;

  if (haveTermination && !termination->get().limited_resources().empty()) {
    limitedResources = termination->get().limited_resources();
  }

  statusUpdate(
      protobuf::createStatusUpdate(
          frameworkId,
          info.id(),
          taskId,
          state,
          TaskStatus::SOURCE_SLAVE,
          id::UUID::random(),
          message,
          reason,
          executor->id,
          None(),
          None(),
          None(),
          None(),
          None(),
          limitedResources),
      UPID());
}


void Slave::sendExitedExecutorMessage(
    const FrameworkID& frameworkId,
    const ExecutorID& executorId,
    const Option<int>& status)
{
  ExitedExecutorMessage message;
  message.mutable_slave_id()->MergeFrom(info.id());
  message.mutable_framework_id()->MergeFrom(frameworkId);
  message.mutable_executor_id()->MergeFrom(executorId);
  message.set_status(status.getOrElse(-1));

  if (master.isSome()) {
    send(master.get(), message);
  }
}


// TODO(dhamon): Move these to their own metrics.hpp|cpp.
double Slave::_tasks_staging()
{
  double count = 0.0;
  foreachvalue (Framework* framework, frameworks) {
    typedef hashmap<TaskID, TaskInfo> TaskMap;
    foreachvalue (const TaskMap& tasks, framework->pendingTasks) {
      count += tasks.size();
    }

    foreachvalue (Executor* executor, framework->executors) {
      count += executor->queuedTasks.size();

      foreachvalue (Task* task, executor->launchedTasks) {
        if (task->state() == TASK_STAGING) {
          count++;
        }
      }
    }
  }
  return count;
}


// Counts the launched tasks, across all executors of all frameworks,
// that are in TASK_STARTING.
double Slave::_tasks_starting()
{
  double starting = 0.0;

  foreachvalue (Framework* framework, frameworks) {
    foreachvalue (Executor* executor, framework->executors) {
      foreachvalue (Task* task, executor->launchedTasks) {
        if (task->state() != TASK_STARTING) {
          continue;
        }

        ++starting;
      }
    }
  }

  return starting;
}


// Counts the launched tasks, across all executors of all frameworks,
// that are in TASK_RUNNING.
double Slave::_tasks_running()
{
  double running = 0.0;

  foreachvalue (Framework* framework, frameworks) {
    foreachvalue (Executor* executor, framework->executors) {
      foreachvalue (Task* task, executor->launchedTasks) {
        if (task->state() != TASK_RUNNING) {
          continue;
        }

        ++running;
      }
    }
  }

  return running;
}


// Counts the launched tasks, across all executors of all frameworks,
// that are in TASK_KILLING.
double Slave::_tasks_killing()
{
  double killing = 0.0;

  foreachvalue (Framework* framework, frameworks) {
    foreachvalue (Executor* executor, framework->executors) {
      foreachvalue (Task* task, executor->launchedTasks) {
        if (task->state() != TASK_KILLING) {
          continue;
        }

        ++killing;
      }
    }
  }

  return killing;
}


// Counts the executors, across all frameworks, that are in the
// REGISTERING state.
double Slave::_executors_registering()
{
  double registering = 0.0;

  foreachvalue (Framework* framework, frameworks) {
    foreachvalue (Executor* executor, framework->executors) {
      if (executor->state != Executor::REGISTERING) {
        continue;
      }

      ++registering;
    }
  }

  return registering;
}


// Counts the executors, across all frameworks, that are in the RUNNING
// state.
double Slave::_executors_running()
{
  double running = 0.0;

  foreachvalue (Framework* framework, frameworks) {
    foreachvalue (Executor* executor, framework->executors) {
      if (executor->state != Executor::RUNNING) {
        continue;
      }

      ++running;
    }
  }

  return running;
}


// Counts the executors, across all frameworks, that are in the
// TERMINATING state.
double Slave::_executors_terminating()
{
  double terminating = 0.0;

  foreachvalue (Framework* framework, frameworks) {
    foreachvalue (Executor* executor, framework->executors) {
      if (executor->state != Executor::TERMINATING) {
        continue;
      }

      ++terminating;
    }
  }

  return terminating;
}


// Reports `executorDirectoryMaxAllowedAge` in seconds.
double Slave::_executor_directory_max_allowed_age_secs()
{
  const double seconds = executorDirectoryMaxAllowedAge.secs();

  return seconds;
}


// Sums the values of all scalar resources called `name` in the agent's
// `info.resources()`.
double Slave::_resources_total(const string& name)
{
  double sum = 0.0;

  foreach (const Resource& resource, info.resources()) {
    // Only scalar resources with a matching name contribute.
    if (resource.name() != name || resource.type() != Value::SCALAR) {
      continue;
    }

    sum += resource.scalar().value();
  }

  return sum;
}


double Slave::_resources_used(const string& name)
{
  // We use `Resources` arithmetic to accummulate the resources since the
  // `+=` operator de-duplicates the same shared resources across executors.
  Resources used;

  foreachvalue (Framework* framework, frameworks) {
    used += framework->allocatedResources().nonRevocable();
  }

  return used.get<Value::Scalar>(name).getOrElse(Value::Scalar()).value();
}


// Reports the used fraction of the resource `name`, i.e. used divided
// by total, or 0 when the total is 0.
double Slave::_resources_percent(const string& name)
{
  const double total = _resources_total(name);

  // Avoid dividing by zero when the agent has none of this resource.
  return total == 0.0 ? 0.0 : _resources_used(name) / total;
}


// Sums the values of all scalar resources called `name` among the
// oversubscribed resources; 0 when there are no oversubscribed
// resources.
double Slave::_resources_revocable_total(const string& name)
{
  if (oversubscribedResources.isNone()) {
    return 0.0;
  }

  double sum = 0.0;

  foreach (const Resource& resource, oversubscribedResources.get()) {
    // Only scalar resources with a matching name contribute.
    if (resource.name() != name || resource.type() != Value::SCALAR) {
      continue;
    }

    sum += resource.scalar().value();
  }

  return sum;
}


double Slave::_resources_revocable_used(const string& name)
{
  // We use `Resources` arithmetic to accummulate the resources since the
  // `+=` operator de-duplicates the same shared resources across executors.
  Resources used;

  foreachvalue (Framework* framework, frameworks) {
    used += framework->allocatedResources().revocable();
  }

  return used.get<Value::Scalar>(name).getOrElse(Value::Scalar()).value();
}


// Reports the used fraction of the revocable resource `name`, i.e.
// used divided by total, or 0 when the total is 0.
double Slave::_resources_revocable_percent(const string& name)
{
  const double total = _resources_revocable_total(name);

  // Avoid dividing by zero when nothing is oversubscribed.
  return total == 0.0 ? 0.0 : _resources_revocable_used(name) / total;
}


// Creates the agent's resource provider manager together with the
// registrar backing it.
//
// This is a no-op if the manager already exists. The registrar's
// storage is LevelDB at the path given by
// `paths::getResourceProviderRegistryPath(flags.work_dir, slaveId)`,
// except on Windows where in-memory storage is used. Failure to create
// the registrar aborts the agent. If the agent has the
// `resourceProvider` capability, `handleResourceProviderMessage` is
// hooked up to the manager's `messages()`.
void Slave::initializeResourceProviderManager(
    const Flags& flags,
    const SlaveID& slaveId)
{
  // To simplify reasoning about lifetimes we do not allow
  // reinitialization of the resource provider manager.
  if (resourceProviderManager.get() != nullptr) {
    return;
  }

  // The registrar uses LevelDB as underlying storage. Since LevelDB
  // is currently not supported on Windows (see MESOS-5932), we fall
  // back to in-memory storage there.
  //
  // TODO(bbannier): Remove this Windows workaround once MESOS-5932 is fixed.
#ifndef __WINDOWS__
  Owned<mesos::state::Storage> storage(new mesos::state::LevelDBStorage(
      paths::getResourceProviderRegistryPath(flags.work_dir, slaveId)));
#else
  LOG(WARNING)
    << "Persisting resource provider manager state is not supported on Windows";
  Owned<mesos::state::Storage> storage(new mesos::state::InMemoryStorage());
#endif // __WINDOWS__

  // The storage is moved into the registrar, and the registrar into
  // the manager below; neither local is used after being moved from.
  Try<Owned<resource_provider::Registrar>> resourceProviderRegistrar =
    resource_provider::Registrar::create(std::move(storage));

  CHECK_SOME(resourceProviderRegistrar)
    << "Could not construct resource provider registrar: "
    << resourceProviderRegistrar.error();

  resourceProviderManager.reset(
      new ResourceProviderManager(std::move(resourceProviderRegistrar.get())));

  if (capabilities.resourceProvider) {
    // Start listening for messages from the resource provider manager.
    resourceProviderManager->messages().get().onAny(
        defer(self(), &Self::handleResourceProviderMessage, lambda::_1));
  }
}


// Computes the "cpus" and "mem" resource limits of an executor from its
// own resources and from the tasks it hosts, which are given as
// `TaskInfo`s (`taskInfos`) and/or as `Task`s (`tasks`).
//
// For each of the two resources, a limit is only produced if at least
// one task specifies a limit for it. In that case the task limits, the
// summed requests of the tasks without a limit, and the executor's own
// request are all passed to `setLimit` for the executor limit.
// NOTE(review): `setLimit` is defined outside this function; presumably
// it accumulates the given value into the limit -- confirm.
//
// On Linux, no "cpus" limit is returned if some task contributes a CPU
// request without a CPU limit while `flags.cgroups_enable_cfs` is off.
//
// Returns a map with a "cpus" and/or "mem" entry for every limit that
// was computed; the map may be empty.
google::protobuf::Map<string, Value::Scalar> Slave::computeExecutorLimits(
    const Resources& executorResources,
    const vector<TaskInfo>& taskInfos,
    const vector<Task*>& tasks) const
{
  // The limits stay `None` until some task specifies a limit; the
  // requests sum up the resources of tasks that specify no limit.
  Option<Value::Scalar> executorCpuLimit, executorMemLimit;
  Value::Scalar cpuRequest, memRequest;
  foreach (const TaskInfo& taskInfo, taskInfos) {
    // Count the task's CPU limit into the executor's CPU limit.
    if (taskInfo.limits().count("cpus")) {
      setLimit(executorCpuLimit, taskInfo.limits().at("cpus"));
    } else {
      Option<Value::Scalar> taskCpus =
        Resources(taskInfo.resources()).get<Value::Scalar>("cpus");

      if (taskCpus.isSome()) {
        cpuRequest += taskCpus.get();
      }
    }

    // Count the task's memory limit into the executor's memory limit.
    if (taskInfo.limits().count("mem")) {
      setLimit(executorMemLimit, taskInfo.limits().at("mem"));
    } else {
      Option<Value::Scalar> taskMem =
        Resources(taskInfo.resources()).get<Value::Scalar>("mem");

      if (taskMem.isSome()) {
        memRequest += taskMem.get();
      }
    }
  }

  // Same accounting as above, for tasks given as `Task` objects.
  foreach (const Task* task, tasks) {
    CHECK_NOTNULL(task);

    // Count the task's CPU limit into the executor's CPU limit.
    if (task->limits().count("cpus")) {
      setLimit(executorCpuLimit, task->limits().at("cpus"));
    } else {
      Option<Value::Scalar> taskCpus =
        Resources(task->resources()).get<Value::Scalar>("cpus");

      if (taskCpus.isSome()) {
        cpuRequest += taskCpus.get();
      }
    }

    // Count the task's memory limit into the executor's memory limit.
    if (task->limits().count("mem")) {
      setLimit(executorMemLimit, task->limits().at("mem"));
    } else {
      Option<Value::Scalar> taskMem =
        Resources(task->resources()).get<Value::Scalar>("mem");

      if (taskMem.isSome()) {
        memRequest += taskMem.get();
      }
    }
  }

  if (executorCpuLimit.isSome()) {
    // Count the executor's CPU request into its CPU limit as well, this is to
    // ensure the executor's CPU limit is always greater than its CPU request.
    Option<Value::Scalar> executorCpus =
      executorResources.get<Value::Scalar>("cpus");

    if (executorCpus.isSome()) {
      setLimit(executorCpuLimit, executorCpus.get());
    }

    // For the tasks which do not have CPU limit, count their CPU requests
    // into the executor's CPU limit as well, this is also to ensure the
    // executor's CPU limit is always greater than its CPU request. Please
    // note that if the flag `cgroups_enable_cfs` is not enabled, we should
    // not set the executor's CPU limit, otherwise the tasks which do not
    // have CPU limit will be throttled implicitly by the executor's CPU limit.
    if (cpuRequest.value() > 0) {
#ifdef __linux__
      if (flags.cgroups_enable_cfs) {
        setLimit(executorCpuLimit, cpuRequest);
      } else {
        executorCpuLimit = None();
      }
#else
      setLimit(executorCpuLimit, cpuRequest);
#endif // __linux__
    }
  }

  if (executorMemLimit.isSome()) {
    // Count the executor's memory request into its memory limit as well,
    // this is to ensure the executor's memory limit is always greater
    // than its memory request.
    Option<Value::Scalar> executorMem =
      executorResources.get<Value::Scalar>("mem");

    if (executorMem.isSome()) {
      setLimit(executorMemLimit, executorMem.get());
    }

    // For the tasks which do not have memory limit, count their memory
    // requests into the executor's memory limit as well, this is also
    // to ensure the executor's memory limit is always greater than its
    // memory request.
    if (memRequest.value() > 0) {
      setLimit(executorMemLimit, memRequest);
    }
  }

  // Only limits that were actually computed end up in the result.
  google::protobuf::Map<string, Value::Scalar> executorLimits;
  if (executorCpuLimit.isSome()) {
    executorLimits.insert({"cpus", executorCpuLimit.get()});
  }

  if (executorMemLimit.isSome()) {
    executorLimits.insert({"mem", executorMemLimit.get()});
  }

  return executorLimits;
}


void Slave::updateDrainStatus()
{
  if (drainConfig.isNone()) {
    return;
  }

  bool drained = operations.empty() && frameworks.empty();

  if (!drained) {
    return;
  }

  LOG(INFO) << "Agent finished draining";

  const string drainConfigPath = paths::getDrainConfigPath(metaDir, info.id());

  Try<Nothing> rm = os::rm(drainConfigPath);

  if (rm.isError()) {
    EXIT(EXIT_FAILURE) << "Could not remove persisted drain configuration "
                       << "'" << drainConfigPath << "': " << rm.error();
  }

  drainConfig = None();
  estimatedDrainStartTime = None();
}


// Constructs the agent-side representation of a framework.
//
// The framework starts out in the `RUNNING` state, keeps a pointer to
// the owning `_slave`, and copies `_info` (deriving `capabilities` from
// its capability list) and `_pid`. `completedExecutors` is initialized
// with `slaveFlags.max_completed_executors_per_framework`.
// NOTE(review): presumably that value bounds how many completed
// executors are retained -- confirm against the container's type.
Framework::Framework(
    Slave* _slave,
    const Flags& slaveFlags,
    const FrameworkInfo& _info,
    const Option<UPID>& _pid)
  : state(RUNNING),
    slave(_slave),
    info(_info),
    capabilities(_info.capabilities()),
    pid(_pid),
    completedExecutors(slaveFlags.max_completed_executors_per_framework) {}


// Destroys the framework together with the executors it still tracks
// in `executors`.
Framework::~Framework()
{
  // The non-completed executor pointers are owned by the framework, so
  // they have to be freed here.
  foreachvalue (Executor* owned, executors) {
    delete owned;
  }
}


// Returns whether the framework has neither executors nor pending
// tasks.
bool Framework::idle() const
{
  const bool busy = !executors.empty() || !pendingTasks.empty();

  return !busy;
}


void Framework::checkpointFramework() const
{
  // Checkpoint the framework info.
  string path = paths::getFrameworkInfoPath(
      slave->metaDir, slave->info.id(), id());

  VLOG(1) << "Checkpointing FrameworkInfo to '" << path << "'";

  CHECK_SOME(state::checkpoint(path, info));

  // Checkpoint the framework pid, note that we checkpoint a
  // UPID() when it is None (for HTTP schedulers) because
  // 0.23.x slaves consider a missing pid file to be an
  // error.
  path = paths::getFrameworkPidPath(
      slave->metaDir, slave->info.id(), id());

  VLOG(1) << "Checkpointing framework pid"
          << " '" << pid.getOrElse(UPID()) << "'"
          << " to '" << path << "'";

  CHECK_SOME(state::checkpoint(path, pid.getOrElse(UPID())));
}


// Creates, registers and returns a new executor for this framework.
//
// The steps are: pick a fresh ContainerID, determine the user the
// executor runs as, create the sandbox directory, construct the
// `Executor` (checkpointing it when the framework info has
// checkpointing enabled) and attach the sandbox to the /files
// endpoint.
//
// Returns an Error if the sandbox directory cannot be created. It is
// a fatal error (CHECK) to add an executor whose ID is already
// tracked in `executors`.
Try<Executor*> Framework::addExecutor(
  const ExecutorInfo& executorInfo,
  bool isGeneratedForCommandTask)
{
  const ExecutorID& executorId = executorInfo.executor_id();

  // Adding an executor that is already tracked is an invariant
  // violation. We verify this before touching any on-disk state so
  // that a violation cannot create a new sandbox or overwrite the
  // checkpointed state of the existing executor before we abort.
  CHECK(!executors.contains(executorId))
    << "Duplicate executor '" << executorId << "'";

  // Verify that Resource.AllocationInfo is set, if coming
  // from a MULTI_ROLE master this will be set, otherwise
  // the agent will inject it when receiving the executor.
  foreach (const Resource& resource, executorInfo.resources()) {
    CHECK(resource.has_allocation_info());
  }

  // Generate an ID for the executor's container.
  // TODO(idownes) This should be done by the containerizer but we need the
  // ContainerID to create the executor's directory and generate the secret.
  // Consider fixing this since 'launchExecutor()' is handled asynchronously.
  ContainerID containerId;
  containerId.set_value(id::UUID::random().toString());

  // Stays None unless `switch_user` is enabled (never on Windows).
  Option<string> user = None();

#ifndef __WINDOWS__
  if (slave->flags.switch_user) {
    // The command (either in form of task or executor command) can
    // define a specific user to run as. If present, this precedes the
    // framework user value. The selected user will have been verified by
    // the master at this point through the active ACLs.
    // NOTE: The global invariant is that the executor info at this
    // point is (1) the user provided task.executor() or (2) a command
    // executor constructed by the slave from the task.command().
    // If this changes, we need to check the user in both
    // task.command() and task.executor().command() below.
    user = info.user();
    if (executorInfo.command().has_user()) {
      user = executorInfo.command().user();
    }
  }
#endif // __WINDOWS__

  // Create a directory for the executor.
  Try<string> directory = paths::createExecutorDirectory(
      slave->flags.work_dir,
      slave->info.id(),
      id(),
      executorId,
      containerId,
      user);

  if (directory.isError()) {
    return Error(directory.error());
  }

  Executor* executor = new Executor(
      slave,
      id(),
      executorInfo,
      containerId,
      directory.get(),
      user,
      info.checkpoint(),
      isGeneratedForCommandTask);

  if (executor->checkpoint) {
    executor->checkpointExecutor();
  }

  // From here on the framework owns the executor pointer.
  executors[executorId] = executor;

  LOG(INFO) << "Launching executor '" << executorId
            << "' of framework " << id()
            << " with resources " << executorInfo.resources()
            << " in work directory '" << directory.get() << "'";

  FrameworkID frameworkId = id();

  const PID<Slave> slavePid = slave->self();

  // The authorization callback copies everything it needs, since it
  // may be invoked after this function has returned.
  auto authorize =
    [slavePid, executorId, frameworkId](const Option<Principal>& principal) {
      return dispatch(
          slavePid,
          &Slave::authorizeSandboxAccess,
          principal,
          frameworkId,
          executorId);
    };

  // We expose the executor's sandbox in the /files endpoint
  // via the following paths:
  //
  //  (1) /agent_workdir/frameworks/FID/executors/EID/runs/CID
  //  (2) /agent_workdir/frameworks/FID/executors/EID/runs/latest
  //  (3) /frameworks/FID/executors/EID/runs/latest
  //
  // Originally we just exposed the real path (1) and later
  // exposed the 'latest' symlink (2) since it's not easy for
  // users to know the run's container ID. We deprecated
  // (1) and (2) by exposing a virtual path (3) since we do not
  // want to expose the agent's work directory and it's not
  // something users care about in this context.
  //
  // TODO(zhitao): Remove (1) and (2) per MESOS-7960 once we
  // pass 2.0. They remain now for backwards compatibility.
  const string latestPath = paths::getExecutorLatestRunPath(
      slave->flags.work_dir,
      slave->info.id(),
      id(),
      executorId);

  const string virtualLatestPath = paths::getExecutorVirtualPath(
      id(),
      executorId);

  // Path (2).
  slave->files->attach(executor->directory, latestPath, authorize)
    .onAny(defer(
        slave,
        &Slave::fileAttached,
        lambda::_1,
        executor->directory,
        latestPath));

  // Path (3).
  slave->files->attach(executor->directory, virtualLatestPath, authorize)
    .onAny(defer(
        slave,
        &Slave::fileAttached,
        lambda::_1,
        executor->directory,
        virtualLatestPath));

  // Path (1).
  slave->files->attach(executor->directory, executor->directory, authorize)
    .onAny(defer(
        slave,
        &Slave::fileAttached,
        lambda::_1,
        executor->directory,
        executor->directory));

  return executor;
}


// Looks up a (non-completed) executor by its ID; returns nullptr when
// no such executor is tracked.
Executor* Framework::getExecutor(const ExecutorID& executorId) const
{
  auto found = executors.find(executorId);

  return found != executors.end() ? found->second : nullptr;
}


// Returns the executor that holds `taskId` among its queued, launched
// or terminated tasks, or nullptr when no executor knows the task.
Executor* Framework::getExecutor(const TaskID& taskId) const
{
  for (const auto& entry : executors) {
    Executor* candidate = entry.second;

    const bool known =
      candidate->queuedTasks.contains(taskId) ||
      candidate->launchedTasks.contains(taskId) ||
      candidate->terminatedTasks.contains(taskId);

    if (known) {
      return candidate;
    }
  }

  return nullptr;
}


// Stops tracking the given executor and hands its pointer over to
// `completedExecutors`. Unknown executor IDs are ignored.
void Framework::destroyExecutor(const ExecutorID& executorId)
{
  auto found = executors.find(executorId);
  if (found == executors.end()) {
    return;
  }

  Executor* executor = found->second;
  executors.erase(found);

  // See the declaration of `taskLaunchSequences` regarding its
  // lifecycle management.
  taskLaunchSequences.erase(executorId);

  // Ownership of the pointer moves into the `Owned` wrapper.
  completedExecutors.push_back(Owned<Executor>(executor));
}


// Rebuilds an `Executor` of this framework from its checkpointed
// `state`.
//
// If the state has no runs, no latest run or no executor info, the
// executor is skipped and its top level work and meta directories are
// scheduled for GC. Otherwise all runs other than the latest one are
// scheduled for GC, an `Executor` is constructed for the latest run,
// its pid and tasks are recovered, its sandbox is attached to the
// /files endpoint and it is added to `executors`. If the latest run
// was already completed, the executor is additionally marked as
// TERMINATED, its directories are scheduled for GC and it is moved
// to `completedExecutors`.
//
// `recheckpointExecutor`: when true, the recovered executor is
// checkpointed again.
// `tasksToRecheckpoint`: tasks for which `Executor::recoverTask` is
// asked to checkpoint the task again.
void Framework::recoverExecutor(
    const ExecutorState& state,
    bool recheckpointExecutor,
    const hashset<TaskID>& tasksToRecheckpoint)
{
  LOG(INFO) << "Recovering executor '" << state.id
            << "' of framework " << id();

  CHECK_NOTNULL(slave);

  if (state.runs.empty() || state.latest.isNone() || state.info.isNone()) {
    LOG(WARNING) << "Skipping recovery of executor '" << state.id
                 << "' of framework " << id()
                 << " because its latest run or executor info"
                 << " cannot be recovered";

    // GC the top level executor work directory.
    slave->garbageCollect(paths::getExecutorPath(
        slave->flags.work_dir, slave->info.id(), id(), state.id));

    // GC the top level executor meta directory.
    slave->garbageCollect(paths::getExecutorPath(
        slave->metaDir, slave->info.id(), id(), state.id));

    return;
  }

  // Verify that Resource.AllocationInfo is set, this should
  // be injected by the agent when recovering.
  foreach (const Resource& resource, state.info->resources()) {
    CHECK(resource.has_allocation_info());
  }

  // We are only interested in the latest run of the executor!
  // So, we GC all the old runs.
  // NOTE: We don't schedule the top level executor work and meta
  // directories for GC here, because they will be scheduled when
  // the latest executor run terminates.
  const ContainerID& latest = state.latest.get();
  foreachvalue (const RunState& run, state.runs) {
    CHECK_SOME(run.id);
    const ContainerID& runId = run.id.get();
    if (latest != runId) {
      // GC the executor run's work directory.
      // TODO(vinod): Expose this directory to webui by recovering the
      // tasks and doing a 'files->attach()'.
      slave->garbageCollect(paths::getExecutorRunPath(
          slave->flags.work_dir, slave->info.id(), id(), state.id, runId));

      // GC the executor run's meta directory.
      slave->garbageCollect(paths::getExecutorRunPath(
          slave->metaDir, slave->info.id(), id(), state.id, runId));
    }
  }

  // The latest run must be among the recovered runs.
  Option<RunState> run = state.runs.get(latest);
  CHECK_SOME(run)
      << "Cannot find latest run " << latest << " for executor " << state.id
      << " of framework " << id();

  // Create executor.
  const string directory = paths::getExecutorRunPath(
      slave->flags.work_dir, slave->info.id(), id(), state.id, latest);

  // NOTE: The recovered executor is given the framework's user.
  Executor* executor = new Executor(
      slave,
      id(),
      state.info.get(),
      latest,
      directory,
      info.user(),
      info.checkpoint(),
      state.generatedForCommandTask);

  // Recover the libprocess PID if possible for PID based executors.
  if (run->http.isSome()) {
    if (!run->http.get()) {
      // When recovering in non-strict mode, the assumption is that the
      // slave can die after checkpointing the forked pid but before the
      // libprocess pid. So, it is not possible for the libprocess pid
      // to exist but not the forked pid. If so, it is a really bad
      // situation (e.g., disk corruption).
      CHECK_SOME(run->forkedPid)
        << "Failed to get forked pid for executor " << state.id
        << " of framework " << id();

      executor->pid = run->libprocessPid.get();
    } else {
      // We set the PID to None() to signify that this is a HTTP based
      // executor.
      executor->pid = None();
    }
  } else {
    // We set the PID to UPID() to signify that the connection type for this
    // executor is unknown.
    executor->pid = UPID();
  }

  // And finally recover all the executor's tasks.
  foreachvalue (const TaskState& taskState, run->tasks) {
    executor->recoverTask(
        taskState,
        tasksToRecheckpoint.contains(taskState.id));
  }

  ExecutorID executorId = state.id;
  FrameworkID frameworkId = id();

  const PID<Slave> slavePid = slave->self();

  // The authorization callback captures copies of the IDs and the
  // agent's PID.
  auto authorize =
    [slavePid, executorId, frameworkId](const Option<Principal>& principal) {
      return dispatch(
          slavePid,
          &Slave::authorizeSandboxAccess,
          principal,
          frameworkId,
          executorId);
    };

  // We expose the executor's sandbox in the /files endpoint
  // via the following paths:
  //
  //  (1) /agent_workdir/frameworks/FID/executors/EID/runs/CID
  //  (2) /agent_workdir/frameworks/FID/executors/EID/runs/latest
  //  (3) /frameworks/FID/executors/EID/runs/latest
  //
  // Originally we just exposed the real path (1) and later
  // exposed the 'latest' symlink (2) since it's not easy for
  // users to know the run's container ID. We deprecated
  // (1) and (2) by exposing a virtual path (3) since we do not
  // want to expose the agent's work directory and it's not
  // something users care about in this context.
  //
  // TODO(zhitao): Remove (1) and (2) per MESOS-7960 once we
  // pass 2.0. They remain now for backwards compatibility.
  const string latestPath = paths::getExecutorLatestRunPath(
      slave->flags.work_dir,
      slave->info.id(),
      id(),
      state.id);

  const string virtualLatestPath = paths::getExecutorVirtualPath(
      id(),
      state.id);

  // Path (2).
  slave->files->attach(executor->directory, latestPath, authorize)
    .onAny(defer(
        slave,
        &Slave::fileAttached,
        lambda::_1,
        executor->directory,
        latestPath));

  // Path (3).
  slave->files->attach(executor->directory, virtualLatestPath, authorize)
    .onAny(defer(
        slave,
        &Slave::fileAttached,
        lambda::_1,
        executor->directory,
        virtualLatestPath));

  // Expose the executor's files, i.e., path (1).
  slave->files->attach(executor->directory, executor->directory, authorize)
    .onAny(defer(
        slave,
        &Slave::fileAttached,
        lambda::_1,
        executor->directory,
        executor->directory));

  // Add the executor to the framework.
  executors[executor->id] = executor;
  if (recheckpointExecutor) {
    executor->checkpointExecutor();
  }

  // If the latest run of the executor was completed (i.e., terminated
  // and all updates are acknowledged) in the previous run, we
  // transition its state to 'TERMINATED' and gc the directories.
  if (run->completed) {
    ++slave->metrics.executors_terminated;

    executor->state = Executor::TERMINATED;

    CHECK_SOME(run->id);
    const ContainerID& runId = run->id.get();

    // GC the executor run's work directory.
    const string path = paths::getExecutorRunPath(
        slave->flags.work_dir, slave->info.id(), id(), state.id, runId);

    // NOTE: We keep a list of default executor tasks here for
    // detaching task volume directories, since the executor may be
    // already destroyed when the GC completes (MESOS-8460).
    vector<Task> defaultExecutorTasks;
    if (executor->info.has_type() &&
        executor->info.type() == ExecutorInfo::DEFAULT) {
      foreachvalue (const Task* task, executor->launchedTasks) {
        defaultExecutorTasks.push_back(*task);
      }

      foreachvalue (const Task* task, executor->terminatedTasks) {
        defaultExecutorTasks.push_back(*task);
      }

      foreach (const shared_ptr<Task>& task, executor->completedTasks) {
        defaultExecutorTasks.push_back(*task);
      }
    }

    // The deferred callbacks receive copies of the executor info,
    // container ID and tasks rather than the executor itself.
    slave->garbageCollect(path)
      .onAny(defer(slave, &Slave::detachFile, path))
      .onAny(defer(
          slave,
          &Slave::detachTaskVolumeDirectories,
          executor->info,
          executor->containerId,
          defaultExecutorTasks));

    // Make sure there are no "launched tasks" on a recovered completed
    // executor. We can only encounter these non-terminal terminated tasks
    // when recovering a checkpointed executor that is missing a terminal
    // status update. See MESOS-9750 for one way to enter this state.
    //
    // NOTE: We iterate over a copy since `launchedTasks` is modified
    // inside the loop.
    foreachpair (
        const TaskID& taskId,
        Task* task,
        utils::copy(executor->launchedTasks)) {
      executor->launchedTasks.erase(taskId);
      executor->terminatedTasks[taskId] = task;
    }

    // GC the executor run's meta directory.
    slave->garbageCollect(paths::getExecutorRunPath(
        slave->metaDir, slave->info.id(), id(), state.id, runId));

    // GC the top level executor work directory.
    slave->garbageCollect(paths::getExecutorPath(
        slave->flags.work_dir, slave->info.id(), id(), state.id))
        .onAny(defer(slave, &Slave::detachFile, latestPath))
        .onAny(defer(slave, &Slave::detachFile, virtualLatestPath));

    // GC the top level executor meta directory.
    slave->garbageCollect(paths::getExecutorPath(
        slave->metaDir, slave->info.id(), id(), state.id));

    // Move the executor to 'completedExecutors'.
    destroyExecutor(executor->id);
  }
}


// Records `task` as pending for the given executor, replacing any
// pending entry with the same task ID.
void Framework::addPendingTask(
    const ExecutorID& executorId,
    const TaskInfo& task)
{
  auto& tasksOfExecutor = pendingTasks[executorId];
  tasksOfExecutor[task.task_id()] = task;
}


void Framework::addPendingTaskGroup(
    const ExecutorID& executorId,
    const TaskGroupInfo& taskGroup)
{
  foreach (const TaskInfo& task, taskGroup.tasks()) {
    pendingTasks[executorId][task.task_id()] = task;
  }

  pendingTaskGroups.push_back(taskGroup);
}


bool Framework::hasTask(const TaskID& taskId) const
{
  foreachkey (const ExecutorID& executorId, pendingTasks) {
    if (pendingTasks.at(executorId).contains(taskId)) {
      return true;
    }
  }

  foreachvalue (Executor* executor, executors) {
    if (executor->queuedTasks.contains(taskId) ||
        executor->launchedTasks.contains(taskId) ||
        executor->terminatedTasks.contains(taskId)) {
      return true;
    }
  }

  return false;
}


// Returns true if `taskId` is pending for any executor.
bool Framework::isPending(const TaskID& taskId) const
{
  for (const auto& entry : pendingTasks) {
    if (entry.second.contains(taskId)) {
      return true;
    }
  }

  return false;
}


// Returns the first pending task group that contains `taskId`, or
// None if no pending task group contains it.
Option<TaskGroupInfo> Framework::getTaskGroupForPendingTask(
    const TaskID& taskId)
{
  for (const TaskGroupInfo& group : pendingTaskGroups) {
    const bool containsTask = std::any_of(
        group.tasks().begin(),
        group.tasks().end(),
        [&taskId](const TaskInfo& member) {
          return member.task_id() == taskId;
        });

    if (containsTask) {
      return group;
    }
  }

  return None();
}


bool Framework::removePendingTask(const TaskID& taskId)
{
  bool removed = false;

  foreachkey (const ExecutorID& executorId, pendingTasks) {
    if (pendingTasks.at(executorId).contains(taskId)) {
      pendingTasks.at(executorId).erase(taskId);
      if (pendingTasks.at(executorId).empty()) {
        pendingTasks.erase(executorId);
      }

      removed = true;
      break;
    }
  }

  // We also remove the pending task group if all of its
  // tasks have been removed.
  for (auto it = pendingTaskGroups.begin();
       it != pendingTaskGroups.end();
       ++it) {
    foreach (const TaskInfo& t, it->tasks()) {
      if (t.task_id() == taskId) {
        // Found its task group, check if all tasks within
        // the group have been removed.
        bool allRemoved = true;

        foreach (const TaskInfo& t_, it->tasks()) {
          if (hasTask(t_.task_id())) {
            allRemoved = false;
            break;
          }
        }

        if (allRemoved) {
          pendingTaskGroups.erase(it);
        }

        return removed;
      }
    }
  }

  return removed;
}


// Returns the ID of the executor that `taskId` is pending for, or
// None if the task is not pending.
Option<ExecutorID> Framework::getExecutorIdForPendingTask(
    const TaskID& taskId) const
{
  for (const auto& entry : pendingTasks) {
    if (entry.second.contains(taskId)) {
      return entry.first;
    }
  }

  return None();
}


// Sums up the resources allocated to this framework on the agent:
// those of every tracked executor, those of every pending task and,
// for pending tasks whose executor is not tracked yet, the resources
// of that executor (counted once per executor).
Resources Framework::allocatedResources() const
{
  Resources total;

  for (const auto& entry : executors) {
    total += entry.second->allocatedResources();
  }

  // Not yet tracked executors whose resources were already added.
  hashset<ExecutorID> countedExecutors;

  for (const auto& perExecutor : pendingTasks) {
    for (const auto& pending : perExecutor.second) {
      const TaskInfo& task = pending.second;

      total += task.resources();

      ExecutorInfo executorInfo = slave->getExecutorInfo(info, task);
      const ExecutorID& executorId = executorInfo.executor_id();

      const bool alreadyCounted =
        executors.contains(executorId) ||
        countedExecutors.contains(executorId);

      if (!alreadyCounted) {
        total += executorInfo.resources();
        countedExecutors.insert(executorId);
      }
    }
  }

  return total;
}


// Constructs an executor in the REGISTERING state with neither an
// HTTP connection nor a pid. The executor ID is taken from `_info`.
//
// `_slave` must not be null. `_checkpoint` is stored in `checkpoint`
// and `isGeneratedForCommandTask` is stored as is; the latter is
// exposed through `isGeneratedForCommandTask()`.
Executor::Executor(
    Slave* _slave,
    const FrameworkID& _frameworkId,
    const ExecutorInfo& _info,
    const ContainerID& _containerId,
    const string& _directory,
    const Option<string>& _user,
    bool _checkpoint,
    bool isGeneratedForCommandTask)
  : state(REGISTERING),
    slave(_slave),
    id(_info.executor_id()),
    info(_info),
    frameworkId(_frameworkId),
    containerId(_containerId),
    directory(_directory),
    user(_user),
    checkpoint(_checkpoint),
    http(None()),
    pid(None()),
    isGeneratedForCommandTask_(isGeneratedForCommandTask)
{
  CHECK_NOTNULL(slave);

  // NOTE: This should be greater than zero because the agent looks
  // for completed tasks to determine (with false positives) whether
  // an executor ever received tasks. See MESOS-8411.
  //
  // TODO(mzhu): Remove this check once we can determine whether an
  // executor ever received tasks without looking through the
  // completed tasks.
  static_assert(
      MAX_COMPLETED_TASKS_PER_EXECUTOR > 0,
      "Max completed tasks per executor should be greater than zero");

  // Bound the history of completed tasks kept for this executor.
  completedTasks =
    circular_buffer<shared_ptr<Task>>(MAX_COMPLETED_TASKS_PER_EXECUTOR);
}


Executor::~Executor()
{
  // Close the HTTP connection, if there is one.
  if (http.isSome()) {
    closeHttpConnection();
  }

  // Launched and terminated tasks are held as raw pointers, so they
  // have to be released explicitly.
  for (const auto& launched : launchedTasks) {
    delete launched.second;
  }

  for (const auto& terminated : terminatedTasks) {
    delete terminated.second;
  }
}


void Executor::enqueueTask(const TaskInfo& task)
{
  queuedTasks[task.task_id()] = task;
}


void Executor::enqueueTaskGroup(const TaskGroupInfo& taskGroup)
{
  foreach (const TaskInfo& task, taskGroup.tasks()) {
    queuedTasks[task.task_id()] = task;
  }

  queuedTaskGroups.push_back(taskGroup);
}


// Removes `taskId` from the queued tasks and returns the removed
// TaskInfo (None if the task was not queued). The first queued task
// group containing the task is dropped once none of its tasks is
// queued anymore.
Option<TaskInfo> Executor::dequeueTask(const TaskID& taskId)
{
  Option<TaskInfo> dequeued = queuedTasks.get(taskId);

  queuedTasks.erase(taskId);

  // Locate the task group (if any) that the task belongs to.
  auto group = std::find_if(
      queuedTaskGroups.begin(),
      queuedTaskGroups.end(),
      [&taskId](const TaskGroupInfo& candidate) {
        return std::any_of(
            candidate.tasks().begin(),
            candidate.tasks().end(),
            [&taskId](const TaskInfo& member) {
              return member.task_id() == taskId;
            });
      });

  if (group != queuedTaskGroups.end()) {
    const bool anyQueued = std::any_of(
        group->tasks().begin(),
        group->tasks().end(),
        [this](const TaskInfo& member) {
          return queuedTasks.contains(member.task_id());
        });

    // Remove the task group if all of its tasks have been dequeued.
    if (!anyQueued) {
      queuedTaskGroups.erase(group);
    }
  }

  return dequeued;
}


// Creates a `Task` in TASK_STAGING for `task`, stores it in
// `launchedTasks` and returns it. The task must already have been
// dequeued and must not be launched yet.
Task* Executor::addLaunchedTask(const TaskInfo& task)
{
  const TaskID& taskId = task.task_id();

  CHECK(!queuedTasks.contains(taskId))
    << "Task " << taskId << " was not dequeued";

  // The master should enforce unique task IDs, but just in case
  // maybe we shouldn't make this a fatal error.
  CHECK(!launchedTasks.contains(taskId))
    << "Duplicate task " << taskId;

  // Resource.AllocationInfo has to be set: a MULTI_ROLE master sets
  // it, otherwise the agent injects it when receiving the task.
  for (const Resource& resource : task.resources()) {
    CHECK(resource.has_allocation_info());
  }

  Task* launched =
    new Task(protobuf::createTask(task, TASK_STAGING, frameworkId));

  launchedTasks[taskId] = launched;

  const bool isDefaultExecutor =
    info.has_type() && info.type() == ExecutorInfo::DEFAULT;

  if (isDefaultExecutor) {
    slave->attachTaskVolumeDirectory(info, containerId, *launched);
  }

  return launched;
}


void Executor::completeTask(const TaskID& taskId)
{
  VLOG(1) << "Completing task " << taskId;

  CHECK(terminatedTasks.contains(taskId))
    << "Failed to find terminated task " << taskId;

  // If `completedTasks` is full and this is a default executor, we need
  // to detach the volume directory for the first task in `completedTasks`
  // before pushing a task into it, otherwise, we will never have chance
  // to do the detach for that task which would be a leak.
  if (info.has_type() &&
      info.type() == ExecutorInfo::DEFAULT &&
      completedTasks.full()) {
    const shared_ptr<Task>& firstTask = completedTasks.front();
    slave->detachTaskVolumeDirectories(info, containerId, {*firstTask});
  }

  // Mark the task metadata (TaskInfo and status updates) for garbage
  // collection. This is important for keeping the metadata of long-lived,
  // multi-task executors within reasonable levels.
  if (checkpoint) {
    slave->garbageCollect(paths::getTaskPath(
        slave->metaDir,
        slave->info.id(),
        frameworkId,
        id,
        containerId,
        taskId));
  }

  Task* task = terminatedTasks[taskId];
  completedTasks.push_back(shared_ptr<Task>(task));
  terminatedTasks.erase(taskId);
}


void Executor::checkpointExecutor()
{
  CHECK(checkpoint);

  // Checkpoint the executor info.
  const string path = paths::getExecutorInfoPath(
      slave->metaDir, slave->info.id(), frameworkId, id);

  VLOG(1) << "Checkpointing ExecutorInfo to '" << path << "'";

  CHECK_SOME(state::checkpoint(path, info));

  // Sync state of sentinel indicating whether the executor was
  // generated by the agent.
  CHECK_SOME(state::checkpoint(
    paths::getExecutorGeneratedForCommandTaskPath(
        slave->metaDir, slave->info.id(), frameworkId, id),
    stringify(static_cast<int>(isGeneratedForCommandTask_)),
    true));


  // Create the meta executor directory.
  // NOTE: This creates the 'latest' symlink in the meta directory.
  Try<string> mkdir = paths::createExecutorDirectory(
      slave->metaDir, slave->info.id(), frameworkId, id, containerId);

  CHECK_SOME(mkdir);
}


void Executor::checkpointTask(const TaskInfo& task)
{
  checkpointTask(protobuf::createTask(task, TASK_STAGING, frameworkId));
}


void Executor::checkpointTask(const Task& task)
{
  CHECK(checkpoint);

  const string path = paths::getTaskInfoPath(
      slave->metaDir,
      slave->info.id(),
      frameworkId,
      id,
      containerId,
      task.task_id());

  VLOG(1) << "Checkpointing TaskInfo to '" << path << "'";

  CHECK_SOME(state::checkpoint(path, task));
}


void Executor::recoverTask(const TaskState& state, bool recheckpointTask)
{
  if (state.info.isNone()) {
    LOG(WARNING) << "Skipping recovery of task " << state.id
                 << " because its info cannot be recovered";
    return;
  }

  // Verify that Resource.AllocationInfo is set, the agent
  // should inject it during recovery.
  foreach (const Resource& resource, state.info->resources()) {
    CHECK(resource.has_allocation_info());
  }

  Task* task = new Task(state.info.get());
  if (recheckpointTask) {
    checkpointTask(*task);
  }

  launchedTasks[state.id] = task;

  if (info.has_type() && info.type() == ExecutorInfo::DEFAULT) {
    slave->attachTaskVolumeDirectory(info, containerId, *task);
  }

  // Read updates to get the latest state of the task.
  foreach (const StatusUpdate& update, state.updates) {
    Try<Nothing> updated = updateTaskState(update.status());

    // TODO(bmahler): We only log this error because we used to
    // allow multiple terminal updates and so we may encounter
    // this when recovering an old executor. We can hard-CHECK
    // this 6 months from 1.1.0.
    if (updated.isError()) {
      LOG(ERROR) << "Failed to update state of recovered task"
                 << " '" << state.id << "' to " << update.status().state()
                 << ": " << updated.error();

      // The only case that should be possible here is when the
      // task had multiple terminal updates persisted.
      continue;
    }

    // Complete the task if it is terminal and
    // the update has been acknowledged.
    if (protobuf::isTerminalState(update.status().state())) {
      CHECK(update.has_uuid())
        << "Expecting updates without 'uuid' to have been rejected";

      if (state.acks.contains(id::UUID::fromBytes(update.uuid()).get())) {
        completeTask(state.id);
      }
      break;
    }
  }
}


void Executor::addPendingTaskStatus(const TaskStatus& status)
{
  auto uuid = id::UUID::fromBytes(status.uuid()).get();
  pendingStatusUpdates[status.task_id()][uuid] = status;
}


void Executor::removePendingTaskStatus(const TaskStatus& status)
{
  const TaskID& taskId = status.task_id();

  auto uuid = id::UUID::fromBytes(status.uuid()).get();

  if (!pendingStatusUpdates.contains(taskId) ||
      !pendingStatusUpdates[taskId].contains(uuid)) {
    LOG(WARNING) << "Unknown pending status update (uuid: " << uuid << ")";
    return;
  }

  pendingStatusUpdates[taskId].erase(uuid);

  if (pendingStatusUpdates[taskId].empty()) {
    pendingStatusUpdates.erase(taskId);
  }
}


// Applies `status` to the task it refers to and returns an Error if
// the update cannot be applied:
//
//   - Queued task: only a terminal update is accepted; the task is
//     dequeued and a new `Task` in the given state is created.
//   - Launched task: for a terminal update, if there are pending
//     status updates for the task, the update must be the first
//     terminal one among them; the task is then removed from
//     `launchedTasks`.
//   - Terminated or unknown task: rejected.
//
// On success the status is recorded on the task and, for a terminal
// update, the task is stored in `terminatedTasks` and the matching
// task metric is incremented.
Try<Nothing> Executor::updateTaskState(const TaskStatus& status)
{
  bool terminal = protobuf::isTerminalState(status.state());

  const TaskID& taskId = status.task_id();

  Task* task = nullptr;

  if (queuedTasks.contains(taskId)) {
    if (!terminal) {
      return Error("Cannot send non-terminal update for queued task");
    }

    TaskInfo taskInfo = CHECK_NOTNONE(dequeueTask(taskId));

    // NOTE: The update is terminal here, so the new task is stored in
    // `terminatedTasks` below.
    task = new Task(protobuf::createTask(
        taskInfo,
        status.state(),
        frameworkId));
  } else if (launchedTasks.contains(taskId)) {
    task = launchedTasks.at(status.task_id());

    if (terminal) {
      if (pendingStatusUpdates.contains(status.task_id())) {
        auto statusUpdates = pendingStatusUpdates[status.task_id()].values();

        auto firstTerminal = std::find_if(
            statusUpdates.begin(),
            statusUpdates.end(),
            [](const TaskStatus& status) {
              return protobuf::isTerminalState(status.state());
            });

        // The pending updates are expected to contain a terminal one.
        CHECK(firstTerminal != statusUpdates.end());

        // Only the first pending terminal update may be applied.
        if (firstTerminal->uuid() != status.uuid()) {
          return Error("Unexpected terminal status update after first status"
                       " update " + stringify(firstTerminal->state()));
        }
      }

      launchedTasks.erase(taskId);
    }
  } else if (terminatedTasks.contains(taskId)) {
    return Error("Task is already terminated with state"
                 " " + stringify(terminatedTasks.at(taskId)->state()));
  } else {
    return Error("Task is unknown");
  }

  CHECK_NOTNULL(task);

  // If the most recent recorded status has the same state, it is
  // replaced rather than appended to.
  // TODO(brenden): Consider wiping the `data` and `message` fields?
  if (task->statuses_size() > 0 &&
      task->statuses(task->statuses_size() - 1).state() == status.state()) {
    task->mutable_statuses()->RemoveLast();
  }

  task->add_statuses()->CopyFrom(status);
  task->set_state(status.state());

  // TODO(bmahler): This only increments the state when the update
  // can be handled. Should we always increment the state?
  if (terminal) {
    terminatedTasks[task->task_id()] = task;

    // Terminal states without a case below are only logged.
    switch (status.state()) {
      case TASK_FINISHED: ++slave->metrics.tasks_finished; break;
      case TASK_FAILED:   ++slave->metrics.tasks_failed;   break;
      case TASK_KILLED:   ++slave->metrics.tasks_killed;   break;
      case TASK_LOST:     ++slave->metrics.tasks_lost;     break;
      case TASK_GONE:     ++slave->metrics.tasks_gone;     break;
      default:
        LOG(ERROR) << "Unexpected terminal task state " << status.state();
        break;
    }
  }

  return Nothing();
}


// Returns true if the executor still has any queued, launched or
// terminated task.
bool Executor::incompleteTasks()
{
  const bool noneLeft =
    queuedTasks.empty() &&
    launchedTasks.empty() &&
    terminatedTasks.empty();

  return !noneLeft;
}


bool Executor::everSentTask() const
{
  if (!launchedTasks.empty()) {
    return true;
  }

  foreachvalue (Task* task, terminatedTasks) {
    foreach (const TaskStatus& status, task->statuses()) {
      if (status.source() == TaskStatus::SOURCE_EXECUTOR) {
        return true;
      }
    }
  }

  foreach (const shared_ptr<Task>& task, completedTasks) {
    foreach (const TaskStatus& status, task->statuses()) {
      if (status.source() == TaskStatus::SOURCE_EXECUTOR) {
        return true;
      }
    }
  }

  return false;
}


// Returns the `isGeneratedForCommandTask` flag this executor was
// constructed with.
bool Executor::isGeneratedForCommandTask() const
{
  return isGeneratedForCommandTask_;
}


// Closes the executor's HTTP connection and resets `http` to None.
// Requires `http` to be set; a failed close is only logged.
void Executor::closeHttpConnection()
{
  CHECK_SOME(http);

  const bool closed = http->close();
  if (!closed) {
    LOG(WARNING) << "Failed to close HTTP pipe for " << *this;
  }

  http = None();
}


// Returns the first queued task group that contains `taskId`, or
// None if no queued task group contains it.
Option<TaskGroupInfo> Executor::getQueuedTaskGroup(const TaskID& taskId)
{
  for (const TaskGroupInfo& group : queuedTaskGroups) {
    const bool containsTask = std::any_of(
        group.tasks().begin(),
        group.tasks().end(),
        [&taskId](const TaskInfo& member) {
          return member.task_id() == taskId;
        });

    if (containsTask) {
      return group;
    }
  }

  return None();
}


// Sums up the executor's own resources and the resources of its
// queued and launched tasks.
Resources Executor::allocatedResources() const
{
  Resources total = info.resources();

  for (const auto& queued : queuedTasks) {
    total += queued.second.resources();
  }

  for (const auto& launched : launchedTasks) {
    total += launched.second->resources();
  }

  return total;
}


// Starts tracking `operation` under its UUID. Adding an operation
// whose UUID is already tracked is fatal.
void ResourceProvider::addOperation(Operation* operation)
{
  const UUID& operationUuid = operation->uuid();

  const bool alreadyTracked = operations.contains(operationUuid);
  CHECK(!alreadyTracked)
    << "Operation (uuid: " << operationUuid << ") already exists";

  operations.put(operationUuid, operation);
}


// Stops tracking `operation`. Removing an operation whose UUID is not
// tracked is fatal.
void ResourceProvider::removeOperation(Operation* operation)
{
  const UUID& operationUuid = operation->uuid();

  const bool tracked = operations.contains(operationUuid);
  CHECK(tracked)
    << "Unknown operation (uuid: " << operationUuid << ")";

  operations.erase(operationUuid);
}


map<string, string> executorEnvironment(
    const Flags& flags,
    const ExecutorInfo& executorInfo,
    const string& directory,
    const SlaveID& slaveId,
    const PID<Slave>& slavePid,
    const Option<Secret>& authenticationToken,
    bool checkpoint)
{
  map<string, string> environment;

  // In cases where DNS is not available on the slave, the absence of
  // LIBPROCESS_IP in the executor's environment will cause an error when the
  // new executor process attempts a hostname lookup. Thus, we pass the slave's
  // LIBPROCESS_IP through here, even if the executor environment is specified
  // explicitly. Note that a LIBPROCESS_IP present in the provided flags will
  // override this value.
  Option<string> libprocessIP = os::getenv("LIBPROCESS_IP");
  if (libprocessIP.isSome()) {
    environment["LIBPROCESS_IP"] = libprocessIP.get();
  }

  if (flags.executor_environment_variables.isSome()) {
    foreachpair (const string& key,
                 const JSON::Value& value,
                 flags.executor_environment_variables->values) {
      // See slave/flags.cpp where we validate each value is a string.
      CHECK(value.is<JSON::String>());
      environment[key] = value.as<JSON::String>().value;
    }
  }

  // Set LIBPROCESS_PORT so that we bind to a random free port (since
  // this might have been set via --port option). We do this before
  // the environment variables below in case it is included.
  environment["LIBPROCESS_PORT"] = "0";

  // Also add MESOS_NATIVE_JAVA_LIBRARY if it's not already present (and
  // like above, we do this before the environment variables below in
  // case the framework wants to override).
  // TODO(tillt): Adapt library towards JNI specific name once libmesos
  // has been split.
  if (environment.count("MESOS_NATIVE_JAVA_LIBRARY") == 0) {
    const string path =
      path::join(LIBDIR, os::libraries::expandName("mesos-" VERSION));
    if (os::exists(path)) {
      environment["MESOS_NATIVE_JAVA_LIBRARY"] = path;
    }
  }

  // Also add MESOS_NATIVE_LIBRARY if it's not already present.
  // This environment variable is kept for offering non JVM-based
  // frameworks a more compact and JNI independent library.
  if (environment.count("MESOS_NATIVE_LIBRARY") == 0) {
    const string path =
      path::join(LIBDIR, os::libraries::expandName("mesos-" VERSION));
    if (os::exists(path)) {
      environment["MESOS_NATIVE_LIBRARY"] = path;
    }
  }

  environment["MESOS_FRAMEWORK_ID"] = executorInfo.framework_id().value();
  environment["MESOS_EXECUTOR_ID"] = executorInfo.executor_id().value();
  environment["MESOS_DIRECTORY"] = directory;
  environment["MESOS_SLAVE_ID"] = slaveId.value();
  environment["MESOS_SLAVE_PID"] = stringify(slavePid);
  environment["MESOS_AGENT_ENDPOINT"] = stringify(slavePid.address);
  environment["MESOS_CHECKPOINT"] = checkpoint ? "1" : "0";
  environment["MESOS_HTTP_COMMAND_EXECUTOR"] =
    flags.http_command_executor ? "1" : "0";

#ifndef __WINDOWS__
  if (flags.http_executor_domain_sockets) {
    // If `http_executor_domain_sockets` is true, the location should have
    // been set either by the user or automatically during agent startup.
    CHECK(flags.domain_socket_location.isSome());
    environment["MESOS_DOMAIN_SOCKET"] = *flags.domain_socket_location;
  }
#endif // __WINDOWS__

  // Set executor's shutdown grace period. If set, the customized value
  // from `ExecutorInfo` overrides the default from agent flags.
  Duration executorShutdownGracePeriod = flags.executor_shutdown_grace_period;
  if (executorInfo.has_shutdown_grace_period()) {
    executorShutdownGracePeriod =
      Nanoseconds(executorInfo.shutdown_grace_period().nanoseconds());
  }

  environment["MESOS_EXECUTOR_SHUTDOWN_GRACE_PERIOD"] =
    stringify(executorShutdownGracePeriod);

  if (checkpoint) {
    environment["MESOS_RECOVERY_TIMEOUT"] = stringify(flags.recovery_timeout);

    // The maximum backoff duration to be used by an executor between two
    // retries when disconnected.
    environment["MESOS_SUBSCRIPTION_BACKOFF_MAX"] =
      stringify(flags.executor_reregistration_timeout);
  }

  if (authenticationToken.isSome()) {
    CHECK(authenticationToken->has_value());

    environment["MESOS_EXECUTOR_AUTHENTICATION_TOKEN"] =
      authenticationToken->value().data();
  }

  if (HookManager::hooksAvailable()) {
    // Include any environment variables from Hooks.
    // TODO(karya): Call environment decorator hook _after_ putting all
    // variables from executorInfo into 'env'. This would prevent the
    // ones provided by hooks from being overwritten by the ones in
    // executorInfo in case of a conflict. The overwriting takes place
    // at the callsites of executorEnvironment (e.g., ___launch function
    // in src/slave/containerizer/docker.cpp)
    // TODO(karya): Provide a mechanism to pass the new environment
    // variables created above (MESOS_*) on to the hook modules.
    const Environment& hooksEnvironment =
      HookManager::slaveExecutorEnvironmentDecorator(executorInfo);

    foreach (const Environment::Variable& variable,
             hooksEnvironment.variables()) {
      environment[variable.name()] = variable.value();
    }
  }

  return environment;
}


// Writes a short description of `executor` to `stream`: its ID and
// framework ID, followed by either " at <pid>" or the marker
// " (via HTTP)". Nothing is appended when neither case applies.
ostream& operator<<(ostream& stream, const Executor& executor)
{
  stream << "'" << executor.id << "' of framework " << executor.frameworkId;

  // The PID is only printed when it is set and evaluates to true.
  if (executor.pid.isSome() && executor.pid.get()) {
    return stream << " at " << executor.pid.get();
  }

  // An executor that has an HTTP connection is tagged as HTTP-based.
  if (executor.http.isSome()) {
    return stream << " (via HTTP)";
  }

  // With neither `http` nor `pid` set, the executor is still tagged as
  // HTTP-based while the agent is recovering and the executor is in
  // the REGISTERING state.
  // NOTE(review): presumably this covers HTTP executors that have not
  // yet reconnected after an agent restart -- confirm against the
  // `Executor` struct documentation.
  const bool unconnectedDuringRecovery =
    executor.slave->state == Slave::RECOVERING &&
    executor.state == Executor::REGISTERING &&
    executor.http.isNone() &&
    executor.pid.isNone();

  if (unconnectedDuringRecovery) {
    stream << " (via HTTP)";
  }

  return stream;
}


// Writes the name of the given executor state to `stream`. Values
// that do not match any listed enumerator are printed as "UNKNOWN".
ostream& operator<<(ostream& stream, Executor::State state)
{
  const char* label = "UNKNOWN";

  switch (state) {
    case Executor::REGISTERING:
      label = "REGISTERING";
      break;
    case Executor::RUNNING:
      label = "RUNNING";
      break;
    case Executor::TERMINATING:
      label = "TERMINATING";
      break;
    case Executor::TERMINATED:
      label = "TERMINATED";
      break;
    default:
      break;
  }

  return stream << label;
}


// Writes the name of the given framework state to `stream`. Values
// that do not match any listed enumerator are printed as "UNKNOWN".
ostream& operator<<(ostream& stream, Framework::State state)
{
  const char* label = "UNKNOWN";

  switch (state) {
    case Framework::RUNNING:
      label = "RUNNING";
      break;
    case Framework::TERMINATING:
      label = "TERMINATING";
      break;
    default:
      break;
  }

  return stream << label;
}


// Writes the name of the given agent state to `stream`. Values that
// do not match any listed enumerator are printed as "UNKNOWN".
ostream& operator<<(ostream& stream, Slave::State state)
{
  const char* label = "UNKNOWN";

  switch (state) {
    case Slave::RECOVERING:
      label = "RECOVERING";
      break;
    case Slave::DISCONNECTED:
      label = "DISCONNECTED";
      break;
    case Slave::RUNNING:
      label = "RUNNING";
      break;
    case Slave::TERMINATING:
      label = "TERMINATING";
      break;
    default:
      break;
  }

  return stream << label;
}


// Returns a description suitable for log messages: "task '<id>'" when
// `task` is set, otherwise "task group containing tasks <ids>" built
// from the task IDs in `taskGroup`. `taskGroup` must be set whenever
// `task` is not; this is enforced with a CHECK.
static string taskOrTaskGroup(
    const Option<TaskInfo>& task,
    const Option<TaskGroupInfo>& taskGroup)
{
  ostringstream description;

  if (task.isSome()) {
    description << "task '" << task->task_id() << "'";
    return description.str();
  }

  CHECK_SOME(taskGroup);

  // Gather the IDs of all tasks in the group so that they can be
  // streamed as a single list.
  vector<TaskID> groupTaskIds;
  foreach (const TaskInfo& member, taskGroup->tasks()) {
    groupTaskIds.push_back(member.task_id());
  }

  description << "task group containing tasks " << groupTaskIds;
  return description.str();
}


// Builds the `CommandInfo` used to launch the default executor binary
// (`MESOS_DEFAULT_EXECUTOR`) located in `launcherDir`.
//
// If the binary's real path can be resolved, the command invokes it
// directly (no shell) and passes `--launcher_dir`. Otherwise the
// command is a shell snippet that echoes the failure reason and exits
// with status 1. In both cases `user` is set on the command if given.
static CommandInfo defaultExecutorCommandInfo(
    const string& launcherDir,
    const Option<string>& user)
{
  Result<string> executorPath =
    os::realpath(path::join(launcherDir, MESOS_DEFAULT_EXECUTOR));

  CommandInfo command;

  if (user.isSome()) {
    command.set_user(user.get());
  }

  if (executorPath.isSome()) {
    command.set_shell(false);
    command.set_value(executorPath.get());
    command.add_arguments(MESOS_DEFAULT_EXECUTOR);
    command.add_arguments("--launcher_dir=" + launcherDir);
    return command;
  }

  // The path could not be resolved: either `realpath` reported an
  // error, or it returned no result at all.
  const string reason =
    executorPath.isError() ? executorPath.error()
                           : "No such file or directory";

  command.set_shell(true);
  command.set_value("echo '" + reason + "'; exit 1");

  return command;
}


// Accumulates `delta` into `limit`, treating infinity as absorbing:
//   - If `limit` is already infinite, it is left untouched.
//   - If `limit` is unset, or `delta` is infinite, `limit` becomes
//     `delta`.
//   - Otherwise `limit` becomes the sum of its current value and
//     `delta`.
static void setLimit(Option<Value::Scalar>& limit, const Value::Scalar& delta)
{
  if (limit.isSome() && std::isinf(limit->value())) {
    // Just return if the limit is already infinite.
    return;
  }

  Value::Scalar scalar;
  if (limit.isNone() || std::isinf(delta.value())) {
    // Set limit directly if it is the first time or the value to be
    // added is infinite.
    scalar.set_value(delta.value());
  } else {
    // Add the value into the limit.
    scalar.set_value(limit->value() + delta.value());
  }

  limit = scalar;
}

} // namespace slave {
} // namespace internal {
} // namespace mesos {
