// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <list>
#include <map>
#include <set>
#include <string>
#include <vector>
#include <mesos/slave/container_logger.hpp>
#include <mesos/slave/containerizer.hpp>
#include <process/check.hpp>
#include <process/collect.hpp>
#include <process/defer.hpp>
#include <process/delay.hpp>
#include <process/io.hpp>
#include <process/network.hpp>
#include <process/owned.hpp>
#include <process/reap.hpp>
#include <process/subprocess.hpp>
#ifdef __WINDOWS__
#include <process/windows/jobobject.hpp>
#endif // __WINDOWS__
#include <stout/adaptor.hpp>
#include <stout/fs.hpp>
#include <stout/hashmap.hpp>
#include <stout/hashset.hpp>
#include <stout/jsonify.hpp>
#include <stout/os.hpp>
#include <stout/path.hpp>
#include <stout/protobuf.hpp>
#include <stout/uuid.hpp>
#include <stout/os/killtree.hpp>
#include <stout/os/which.hpp>
#ifdef __WINDOWS__
#include <stout/os/windows/jobobject.hpp>
#endif // __WINDOWS__
#include "common/status_utils.hpp"
#include "hook/manager.hpp"
#ifdef __linux__
#include "linux/cgroups.hpp"
#include "linux/fs.hpp"
#include "linux/systemd.hpp"
#endif // __linux__
#include "slave/paths.hpp"
#include "slave/slave.hpp"
#include "slave/containerizer/containerizer.hpp"
#include "slave/containerizer/docker.hpp"
#include "slave/containerizer/fetcher.hpp"
#include "slave/containerizer/mesos/isolators/cgroups/constants.hpp"
#include "usage/usage.hpp"
using namespace process;
using std::list;
using std::map;
using std::set;
using std::string;
using std::vector;
using mesos::slave::ContainerConfig;
using mesos::slave::ContainerIO;
using mesos::slave::ContainerLogger;
using mesos::slave::ContainerTermination;
using mesos::internal::slave::state::SlaveState;
using mesos::internal::slave::state::FrameworkState;
using mesos::internal::slave::state::ExecutorState;
using mesos::internal::slave::state::RunState;
namespace mesos {
namespace internal {
namespace slave {
// Declared in header, see explanation there.
const string DOCKER_NAME_PREFIX = "mesos-";
// Declared in header, see explanation there.
const string DOCKER_NAME_SEPERATOR = ".";
// Declared in header, see explanation there.
const string DOCKER_SYMLINK_DIRECTORY = path::join("docker", "links");
#ifdef __WINDOWS__
const string MESOS_DOCKER_EXECUTOR = "mesos-docker-executor.exe";
#else
const string MESOS_DOCKER_EXECUTOR = "mesos-docker-executor";
#endif // __WINDOWS__
// Parse the ContainerID from a Docker container and return None if
// the container was not launched from Mesos.
Option<ContainerID> parse(const Docker::Container& container)
{
Option<string> name = None();
Option<ContainerID> containerId = None();
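// Depending on the Docker version, the name reported for the container
// may or may not be prefixed with a forward slash, so check both forms.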
if (strings::startsWith(container.name, DOCKER_NAME_PREFIX)) {
name = strings::remove(
container.name, DOCKER_NAME_PREFIX, strings::PREFIX);
} else if (strings::startsWith(container.name, "/" + DOCKER_NAME_PREFIX)) {
name = strings::remove(
container.name, "/" + DOCKER_NAME_PREFIX, strings::PREFIX);
}
if (name.isSome()) {
// For Mesos versions 0.23 to 1.3 (inclusive), the docker
// container name format was:
// DOCKER_NAME_PREFIX + SlaveID + DOCKER_NAME_SEPERATOR + ContainerID.
//
// In versions <= 0.22 or >= 1.4, the name format is:
// DOCKER_NAME_PREFIX + ContainerID.
//
// To be backward compatible during upgrade, we still have to
// support all formats.
if (!strings::contains(name.get(), DOCKER_NAME_SEPERATOR)) {
ContainerID id;
id.set_value(name.get());
containerId = id;
} else {
vector<string> parts = strings::split(name.get(), DOCKER_NAME_SEPERATOR);
if (parts.size() == 2 || parts.size() == 3) {
ContainerID id;
id.set_value(parts[1]);
containerId = id;
}
}
// Check if id is a valid UUID.
if (containerId.isSome()) {
Try<id::UUID> uuid = id::UUID::fromString(containerId->value());
if (uuid.isError()) {
return None();
}
}
}
return containerId;
}
Try<DockerContainerizer*> DockerContainerizer::create(
const Flags& flags,
Fetcher* fetcher,
const Option<NvidiaComponents>& nvidia)
{
// Create and initialize the container logger module.
Try<ContainerLogger*> logger =
ContainerLogger::create(flags.container_logger);
if (logger.isError()) {
return Error("Failed to create container logger: " + logger.error());
}
Try<Owned<Docker>> create = Docker::create(
flags.docker,
flags.docker_socket,
true,
flags.docker_config);
if (create.isError()) {
return Error("Failed to create docker: " + create.error());
}
Shared<Docker> docker = create->share();
// TODO(tnachen): We should also mark the work directory as a shared
// mount here; for more details please refer to MESOS-3483.
return new DockerContainerizer(
flags,
fetcher,
Owned<ContainerLogger>(logger.get()),
docker,
nvidia);
}
DockerContainerizer::DockerContainerizer(
const Owned<DockerContainerizerProcess>& _process)
: process(_process)
{
spawn(process.get());
}
DockerContainerizer::DockerContainerizer(
const Flags& flags,
Fetcher* fetcher,
const Owned<ContainerLogger>& logger,
Shared<Docker> docker,
const Option<NvidiaComponents>& nvidia)
: process(new DockerContainerizerProcess(
flags,
fetcher,
logger,
docker,
nvidia))
{
spawn(process.get());
}
DockerContainerizer::~DockerContainerizer()
{
terminate(process.get());
process::wait(process.get());
}
// Constructs the flags for the `mesos-docker-executor`.
// Custom docker executors will also be invoked with these flags.
//
// NOTE: `taskEnvironment` is currently used to propagate environment variables
// from a hook: `slavePreLaunchDockerEnvironmentDecorator`.
::mesos::internal::docker::Flags dockerFlags(
const Flags& flags,
const string& name,
const string& directory,
const Option<map<string, string>>& taskEnvironment)
{
::mesos::internal::docker::Flags dockerFlags;
dockerFlags.container = name;
dockerFlags.docker = flags.docker;
dockerFlags.sandbox_directory = directory;
dockerFlags.mapped_directory = flags.sandbox_directory;
dockerFlags.docker_socket = flags.docker_socket;
dockerFlags.launcher_dir = flags.launcher_dir;
if (taskEnvironment.isSome()) {
dockerFlags.task_environment = string(jsonify(taskEnvironment.get()));
}
if (flags.default_container_dns.isSome()) {
dockerFlags.default_container_dns =
string(jsonify(JSON::Protobuf(flags.default_container_dns.get())));
}
#ifdef __linux__
dockerFlags.cgroups_enable_cfs = flags.cgroups_enable_cfs;
#endif
// TODO(alexr): Remove this after the deprecation cycle (started in 1.0).
dockerFlags.stop_timeout = flags.docker_stop_timeout;
return dockerFlags;
}
Try<DockerContainerizerProcess::Container*>
DockerContainerizerProcess::Container::create(
const ContainerID& id,
const ContainerConfig& containerConfig,
const map<string, string>& environment,
const Option<string>& pidCheckpointPath,
const Flags& flags)
{
// We need to extract a SlaveID from the sandbox directory in order
// to work around a limitation of the Docker CLI: if the sandbox
// directory contains a colon, it cannot be mounted directly into
// the container. Instead, we symlink the sandbox directory and
// mount the symlink. See MESOS-1833 for more details.
Try<paths::ExecutorRunPath> runPath =
paths::parseExecutorRunPath(flags.work_dir, containerConfig.directory());
CHECK_SOME(runPath) << "Unable to determine SlaveID from sandbox directory";
string dockerSymlinkPath = path::join(
paths::getSlavePath(flags.work_dir, runPath->slaveId),
DOCKER_SYMLINK_DIRECTORY);
Try<Nothing> mkdir = os::mkdir(dockerSymlinkPath);
if (mkdir.isError()) {
return Error("Unable to create symlink folder for docker " +
dockerSymlinkPath + ": " + mkdir.error());
}
bool symlinked = false;
string containerWorkdir = containerConfig.directory();
if (strings::contains(containerConfig.directory(), ":")) {
containerWorkdir = path::join(dockerSymlinkPath, id.value());
Try<Nothing> symlink =
::fs::symlink(containerConfig.directory(), containerWorkdir);
if (symlink.isError()) {
return Error(
"Failed to symlink directory '" + containerConfig.directory() +
"' to '" + containerWorkdir + "': " + symlink.error());
}
symlinked = true;
}
Option<ContainerInfo> containerInfo = None();
Option<CommandInfo> commandInfo = None();
bool launchesExecutorContainer = false;
if (containerConfig.has_task_info() && flags.docker_mesos_image.isSome()) {
// Override the container and command to launch an executor
// in a docker container.
ContainerInfo newContainerInfo;
// Mount in the docker socket so the executor can communicate with
// the host docker daemon. We assume the current instance is
// launching docker containers against the host daemon as well.
Volume* dockerSockVolume = newContainerInfo.add_volumes();
dockerSockVolume->set_host_path(flags.docker_socket);
dockerSockVolume->set_container_path(flags.docker_socket);
dockerSockVolume->set_mode(Volume::RO);
// Mount in the sandbox so the executor's logs are persisted
// across container failures.
Volume* sandboxVolume = newContainerInfo.add_volumes();
sandboxVolume->set_host_path(containerWorkdir);
sandboxVolume->set_container_path(containerWorkdir);
sandboxVolume->set_mode(Volume::RW);
ContainerInfo::DockerInfo dockerInfo;
dockerInfo.set_image(flags.docker_mesos_image.get());
// `--pid=host` is required for `mesos-docker-executor` to find
// the pid of the task in `/proc` when running
// `mesos-docker-executor` in a separate docker container.
Parameter* pidParameter = dockerInfo.add_parameters();
pidParameter->set_key("pid");
pidParameter->set_value("host");
// `--cap-add=SYS_ADMIN` and `--cap-add=SYS_PTRACE` are required
// for `mesos-docker-executor` to enter the namespaces of the task
// during health checking when running `mesos-docker-executor` in a
// separate docker container.
Parameter* capAddParameter = dockerInfo.add_parameters();
capAddParameter->set_key("cap-add");
capAddParameter->set_value("SYS_ADMIN");
capAddParameter = dockerInfo.add_parameters();
capAddParameter->set_key("cap-add");
capAddParameter->set_value("SYS_PTRACE");
newContainerInfo.mutable_docker()->CopyFrom(dockerInfo);
// NOTE: We do not set the optional `taskEnvironment` here as
// this field is currently used to propagate environment variables
// from a hook. This hook is called after `Container::create`.
::mesos::internal::docker::Flags dockerExecutorFlags = dockerFlags(
flags,
Container::name(id),
containerWorkdir,
None());
// Override the command with the docker command executor.
CommandInfo newCommandInfo;
newCommandInfo.set_shell(false);
newCommandInfo.set_value(
path::join(flags.launcher_dir, MESOS_DOCKER_EXECUTOR));
// Stringify the flags as arguments.
// This minimizes the need for escaping flag values.
foreachvalue (const flags::Flag& flag, dockerExecutorFlags) {
Option<string> value = flag.stringify(dockerExecutorFlags);
if (value.isSome()) {
newCommandInfo.add_arguments(
"--" + flag.effective_name().value + "=" + value.get());
}
}
if (containerConfig.task_info().has_command()) {
newCommandInfo.mutable_uris()
->CopyFrom(containerConfig.task_info().command().uris());
}
containerInfo = newContainerInfo;
commandInfo = newCommandInfo;
launchesExecutorContainer = true;
}
return new Container(
id,
containerConfig,
environment,
pidCheckpointPath,
symlinked,
containerWorkdir,
commandInfo,
containerInfo,
launchesExecutorContainer);
}
Future<Nothing> DockerContainerizerProcess::fetch(
const ContainerID& containerId)
{
CHECK(containers_.contains(containerId));
Container* container = containers_.at(containerId);
return fetcher->fetch(
containerId,
container->command,
container->containerWorkDir,
container->containerConfig.has_user() ? container->containerConfig.user()
: Option<string>::none());
}
Future<Nothing> DockerContainerizerProcess::pull(
const ContainerID& containerId)
{
if (!containers_.contains(containerId)) {
return Failure("Container is already destroyed");
}
Container* container = containers_.at(containerId);
container->state = Container::PULLING;
string image = container->image();
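// Record the duration of the image pull in the `image_pull` metric.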
Future<Docker::Image> future = metrics.image_pull.time(docker->pull(
container->containerWorkDir,
image,
container->forcePullImage()));
containers_.at(containerId)->pull = future;
return future.then(defer(self(), [=]() {
VLOG(1) << "Docker pull " << image << " completed";
return Nothing();
}));
}
Try<Nothing> DockerContainerizerProcess::updatePersistentVolumes(
const ContainerID& containerId,
const string& directory,
const Resources& current,
const Resources& updated)
{
// Persistent volumes are currently only supported on Linux.
#ifdef __linux__
// Unmount all persistent volumes that are no longer present.
foreach (const Resource& resource, current.persistentVolumes()) {
// This is enforced by the master.
CHECK(resource.disk().has_volume());
// Ignore absolute and nested paths.
const string& containerPath = resource.disk().volume().container_path();
if (strings::contains(containerPath, "/")) {
LOG(WARNING) << "Skipping updating mount for persistent volume "
<< resource << " of container " << containerId
<< " because the container path '" << containerPath
<< "' contains slash";
continue;
}
if (updated.contains(resource)) {
continue;
}
const string target = path::join(
directory, resource.disk().volume().container_path());
Try<Nothing> unmount = fs::unmount(target);
if (unmount.isError()) {
return Error("Failed to unmount persistent volume at '" + target +
"': " + unmount.error());
}
// TODO(tnachen): Remove mount point after unmounting. This requires
// making sure the work directory is marked as a shared mount. For
// more details please refer to MESOS-3483.
}
// Get user and group info for this task based on the sandbox directory.
struct stat s;
if (::stat(directory.c_str(), &s) < 0) {
return Error("Failed to get ownership for '" + directory + "': " +
os::strerror(errno));
}
const uid_t uid = s.st_uid;
const gid_t gid = s.st_gid;
// Mount all new persistent volumes added.
foreach (const Resource& resource, updated.persistentVolumes()) {
// This is enforced by the master.
CHECK(resource.disk().has_volume());
if (current.contains(resource)) {
continue;
}
const string source =
paths::getPersistentVolumePath(flags.work_dir, resource);
// Ignore absolute and nested paths.
const string& containerPath = resource.disk().volume().container_path();
if (strings::contains(containerPath, "/")) {
LOG(WARNING) << "Skipping updating mount for persistent volume "
<< resource << " of container " << containerId
<< " because the container path '" << containerPath
<< "' contains slash";
continue;
}
bool isVolumeInUse = false;
foreachpair (const ContainerID& _containerId,
const Container* _container,
containers_) {
// Skip self.
if (_containerId == containerId) {
continue;
}
if (_container->resourceRequests.contains(resource)) {
isVolumeInUse = true;
break;
}
}
// Set the ownership of the persistent volume to match that of the sandbox
// directory if the volume is not already in use. If the volume is
// currently in use by other containers, tasks in this container may fail
// to read from or write to the persistent volume due to incompatible
// ownership and file system permissions.
if (!isVolumeInUse) {
LOG(INFO) << "Changing the ownership of the persistent volume at '"
<< source << "' with uid " << uid << " and gid " << gid;
Try<Nothing> chown = os::chown(uid, gid, source, false);
if (chown.isError()) {
return Error(
"Failed to change the ownership of the persistent volume at '" +
source + "' with uid " + stringify(uid) +
" and gid " + stringify(gid) + ": " + chown.error());
}
}
// TODO(tnachen): We should check if the target already exists
// when we support updating persistent mounts.
const string target = path::join(directory, containerPath);
Try<Nothing> mkdir = os::mkdir(target);
if (mkdir.isError()) {
return Error("Failed to create persistent mount point at '" + target
+ "': " + mkdir.error());
}
LOG(INFO) << "Mounting '" << source << "' to '" << target
<< "' for persistent volume " << resource
<< " of container " << containerId;
const unsigned flags =
MS_BIND | (resource.disk().volume().mode() == Volume::RO ? MS_RDONLY : 0);
// Bind mount the persistent volume to the container.
Try<Nothing> mount = fs::mount(source, target, None(), flags, nullptr);
if (mount.isError()) {
return Error(
"Failed to mount persistent volume from '" +
source + "' to '" + target + "': " + mount.error());
}
}
#else
if (!current.persistentVolumes().empty() ||
!updated.persistentVolumes().empty()) {
return Error("Persistent volumes are only supported on linux");
}
#endif // __linux__
return Nothing();
}
Future<Nothing> DockerContainerizerProcess::mountPersistentVolumes(
const ContainerID& containerId)
{
if (!containers_.contains(containerId)) {
return Failure("Container is already destroyed");
}
Container* container = containers_.at(containerId);
container->state = Container::MOUNTING;
if (!container->containerConfig.has_task_info() &&
!container->resourceRequests.persistentVolumes().empty()) {
LOG(ERROR) << "Persistent volumes found with container '" << containerId
<< "' but are not supported with custom executors";
return Nothing();
}
Try<Nothing> updateVolumes = updatePersistentVolumes(
containerId,
container->containerWorkDir,
Resources(),
container->resourceRequests);
if (updateVolumes.isError()) {
return Failure(updateVolumes.error());
}
return Nothing();
}
/**
* Unmount persistent volumes that are mounted for a container.
*/
Try<Nothing> DockerContainerizerProcess::unmountPersistentVolumes(
const ContainerID& containerId)
{
// We assume volumes are only supported on Linux, and that
// the target path contains the containerId.
#ifdef __linux__
Try<fs::MountInfoTable> table = fs::MountInfoTable::read();
if (table.isError()) {
return Error("Failed to get mount table: " + table.error());
}
vector<string> unmountErrors;
foreach (const fs::MountInfoTable::Entry& entry,
adaptor::reverse(table->entries)) {
// TODO(tnachen): We assume there is only one docker container
// running per container Id and no other mounts will have the
// container Id name. We might need to revisit if this is no
// longer true.
//
// TODO(jieyu): Currently, we don't enforce that slave's work_dir
// is a slave+shared mount (similar to what we did in the Linux
// filesystem isolator). Therefore, it's likely that the
// persistent volume mounts are propagated to other mount points in
// the system (MESOS-4832). That's the reason we need the
// 'startsWith' check below. Consider making sure slave's work_dir
// is a slave+shared mount. In that way, we can lift that check.
//
// TODO(jieyu): Consider checking if 'entry.root' is under volume
// root directory or not as well.
if (strings::startsWith(entry.target, flags.work_dir) &&
strings::contains(entry.target, containerId.value())) {
LOG(INFO) << "Unmounting volume for container '" << containerId << "'";
// TODO(jieyu): Use MNT_DETACH here to work around an issue of
// incorrect handling of container destroy failures. Currently,
// if unmount fails there, the containerizer will still treat
// the container as terminated, and the agent will schedule the
// cleanup of the container's sandbox. Since the mount hasn't
// been removed in the sandbox, that'll result in data in the
// persistent volume being incorrectly deleted. Use MNT_DETACH
// here so that the mount point in the sandbox will be removed
// immediately. See MESOS-7366 for more details.
Try<Nothing> unmount = fs::unmount(entry.target, MNT_DETACH);
if (unmount.isError()) {
// NOTE: Instead of short-circuiting, we try to perform as many
// unmounts as possible and accumulate the errors at the end.
unmountErrors.push_back(
"Failed to unmount volume '" + entry.target +
"': " + unmount.error());
}
}
}
if (!unmountErrors.empty()) {
return Error(strings::join(", ", unmountErrors));
}
#endif // __linux__
return Nothing();
}
#ifdef __linux__
Future<Nothing> DockerContainerizerProcess::allocateNvidiaGpus(
const ContainerID& containerId,
const size_t count)
{
if (!nvidia.isSome()) {
return Failure("Attempted to allocate GPUs"
" without Nvidia libraries available");
}
if (!containers_.contains(containerId)) {
return Failure("Container is already destroyed");
}
return nvidia->allocator.allocate(count)
.then(defer(
self(),
&Self::_allocateNvidiaGpus,
containerId,
lambda::_1));
}
Future<Nothing> DockerContainerizerProcess::_allocateNvidiaGpus(
const ContainerID& containerId,
const set<Gpu>& allocated)
{
if (!containers_.contains(containerId)) {
return nvidia->allocator.deallocate(allocated);
}
foreach (const Gpu& gpu, allocated) {
containers_.at(containerId)->gpus.insert(gpu);
}
return Nothing();
}
Future<Nothing> DockerContainerizerProcess::deallocateNvidiaGpus(
const ContainerID& containerId)
{
if (!nvidia.isSome()) {
return Failure("Attempted to deallocate GPUs"
" without Nvidia libraries available");
}
return nvidia->allocator.deallocate(containers_.at(containerId)->gpus)
.then(defer(
self(),
&Self::_deallocateNvidiaGpus,
containerId,
containers_.at(containerId)->gpus));
}
Future<Nothing> DockerContainerizerProcess::_deallocateNvidiaGpus(
const ContainerID& containerId,
const set<Gpu>& deallocated)
{
if (containers_.contains(containerId)) {
foreach (const Gpu& gpu, deallocated) {
containers_.at(containerId)->gpus.erase(gpu);
}
}
return Nothing();
}
#endif // __linux__
Try<Nothing> DockerContainerizerProcess::checkpoint(
const ContainerID& containerId,
pid_t pid)
{
CHECK(containers_.contains(containerId));
Container* container = containers_.at(containerId);
container->executorPid = pid;
if (container->pidCheckpointPath.isSome()) {
LOG(INFO) << "Checkpointing pid " << pid
<< " to '" << container->pidCheckpointPath.get() << "'";
return slave::state::checkpoint(
container->pidCheckpointPath.get(), stringify(pid));
}
return Nothing();
}
Future<Nothing> DockerContainerizer::recover(
const Option<SlaveState>& state)
{
return dispatch(
process.get(),
&DockerContainerizerProcess::recover,
state);
}
Future<Containerizer::LaunchResult> DockerContainerizer::launch(
const ContainerID& containerId,
const ContainerConfig& containerConfig,
const map<string, string>& environment,
const Option<string>& pidCheckpointPath)
{
return dispatch(
process.get(),
&DockerContainerizerProcess::launch,
containerId,
containerConfig,
environment,
pidCheckpointPath);
}
Future<Nothing> DockerContainerizer::update(
const ContainerID& containerId,
const Resources& resourceRequests,
const google::protobuf::Map<string, Value::Scalar>& resourceLimits)
{
return dispatch(
process.get(),
&DockerContainerizerProcess::update,
containerId,
resourceRequests,
resourceLimits,
false);
}
Future<ResourceStatistics> DockerContainerizer::usage(
const ContainerID& containerId)
{
return dispatch(
process.get(),
&DockerContainerizerProcess::usage,
containerId);
}
Future<ContainerStatus> DockerContainerizer::status(
const ContainerID& containerId)
{
return dispatch(
process.get(),
&DockerContainerizerProcess::status,
containerId);
}
Future<Option<ContainerTermination>> DockerContainerizer::wait(
const ContainerID& containerId)
{
return dispatch(
process.get(),
&DockerContainerizerProcess::wait,
containerId);
}
Future<Option<ContainerTermination>> DockerContainerizer::destroy(
const ContainerID& containerId)
{
return dispatch(
process.get(),
&DockerContainerizerProcess::destroy,
containerId, true);
}
Future<hashset<ContainerID>> DockerContainerizer::containers()
{
return dispatch(process.get(), &DockerContainerizerProcess::containers);
}
Future<Nothing> DockerContainerizer::pruneImages(
const vector<Image>& excludedImages)
{
VLOG(1) << "DockerContainerizer does not support pruneImages";
return Nothing();
}
Future<Nothing> DockerContainerizerProcess::recover(
const Option<SlaveState>& state)
{
LOG(INFO) << "Recovering Docker containers";
// Get the list of all Docker containers (running and exited) in
// order to remove any orphans and reconcile checkpointed executors.
return docker->ps(true, DOCKER_NAME_PREFIX)
.then(defer(self(), &Self::_recover, state, lambda::_1));
}
Future<Nothing> DockerContainerizerProcess::_recover(
const Option<SlaveState>& state,
const vector<Docker::Container>& _containers)
{
LOG(INFO) << "Got the list of Docker containers";
if (state.isSome()) {
// This mapping of ContainerIDs to running Docker container names
// is established for two reasons:
// * Docker containers launched by Mesos versions prior to 0.23
// did not checkpoint the container type, so the Docker
// Containerizer does not know if it should recover that
// container or not.
// * The naming scheme of Docker containers changed in Mesos
// versions 0.23 and 1.4. The Docker Containerizer code needs
// to use the name of the container when interacting with the
// Docker CLI, rather than generating the container name
// based on the current version's scheme.
hashmap<ContainerID, string> existingContainers;
// Tracks all the task containers that launched an executor in
// a docker container.
hashset<ContainerID> executorContainers;
foreach (const Docker::Container& container, _containers) {
Option<ContainerID> id = parse(container);
if (id.isSome()) {
// NOTE: The container name returned by `docker inspect` may
// sometimes be prefixed with a forward slash. While this is
// technically part of the container name, subsequent calls
// to the Docker CLI do not expect the prefix.
existingContainers[id.get()] = strings::remove(
container.name, "/", strings::PREFIX);
if (strings::contains(container.name, ".executor")) {
executorContainers.insert(id.get());
}
}
}
foreachvalue (const FrameworkState& framework, state->frameworks) {
foreachvalue (const ExecutorState& executor, framework.executors) {
if (executor.info.isNone()) {
LOG(WARNING) << "Skipping recovery of executor '" << executor.id
<< "' of framework " << framework.id
<< " because its info could not be recovered";
continue;
}
if (executor.latest.isNone()) {
LOG(WARNING) << "Skipping recovery of executor '" << executor.id
<< "' of framework " << framework.id
<< " because its latest run could not be recovered";
continue;
}
// We are only interested in the latest run of the executor!
const ContainerID& containerId = executor.latest.get();
Option<RunState> run = executor.runs.get(containerId);
CHECK_SOME(run);
CHECK_SOME(run->id);
CHECK_EQ(containerId, run->id.get());
// We need the pid so the reaper can monitor the executor so skip this
// executor if it's not present. We will also skip this executor if the
// libprocess pid is not present which means the slave exited before
// checkpointing it, in which case the executor will shutdown itself
// immediately. Both of these two cases are safe to skip because the
// slave will try to wait on the container which will return `None()`
// and everything will get cleaned up.
if (run->forkedPid.isNone() || run->libprocessPid.isNone()) {
continue;
}
if (run->completed) {
VLOG(1) << "Skipping recovery of executor '" << executor.id
<< "' of framework " << framework.id
<< " because its latest run "
<< containerId << " is completed";
continue;
}
const ExecutorInfo executorInfo = executor.info.get();
if (executorInfo.has_container() &&
executorInfo.container().type() != ContainerInfo::DOCKER) {
LOG(INFO) << "Skipping recovery of executor '" << executor.id
<< "' of framework " << framework.id
<< " because it was not launched from docker "
<< "containerizer";
continue;
}
if (!executorInfo.has_container() &&
!existingContainers.contains(containerId)) {
LOG(INFO) << "Skipping recovery of executor '" << executor.id
<< "' of framework " << framework.id
<< " because its executor is not marked as docker "
<< "and the docker container doesn't exist";
continue;
}
LOG(INFO) << "Recovering container '" << containerId
<< "' for executor '" << executor.id
<< "' of framework " << framework.id;
// Create and store a container.
Container* container = new Container(containerId);
containers_[containerId] = container;
container->state = Container::RUNNING;
container->generatedForCommandTask = executor.generatedForCommandTask;
container->launchesExecutorContainer =
executorContainers.contains(containerId);
if (existingContainers.contains(containerId)) {
container->containerName = existingContainers.at(containerId);
}
// Only reap the executor process if the executor can be connected
// to; otherwise just set `container->status` to `None()`. This is to
// avoid reaping an irrelevant process, e.g., the agent process is
// stopped for a long time, during which the executor terminates and
// its pid happens to be reused by an irrelevant process. When the
// agent is restarted, it still considers this executor incomplete
// (i.e., `run->completed` is false), so we would reap the irrelevant
// process if we did not check whether that process can be connected to.
// Note that if both the pid and the port of the executor are reused,
// by one process or by two processes respectively, after the agent
// host reboots, we will still reap an irrelevant process, but that
// should be highly unlikely.
pid_t pid = run->forkedPid.get();
// Create a TCP socket.
Try<int_fd> socket = net::socket(AF_INET, SOCK_STREAM, 0);
if (socket.isError()) {
return Failure(
"Failed to create socket for connecting to executor '" +
stringify(executor.id) + "': " + socket.error());
}
Try<Nothing, SocketError> connect = process::network::connect(
socket.get(),
run->libprocessPid->address);
if (connect.isSome()) {
container->status.set(process::reap(pid));
} else {
LOG(WARNING) << "Failed to connect to executor '" << executor.id
<< "' of framework " << framework.id << ": "
<< connect.error().message;
container->status.set(Future<Option<int>>(None()));
}
// Shutdown and close the socket.
::shutdown(socket.get(), SHUT_RDWR);
os::close(socket.get());
container->status.future()
->onAny(defer(self(), &Self::reaped, containerId));
const string sandboxDirectory = paths::getExecutorRunPath(
flags.work_dir,
state->id,
framework.id,
executor.id,
containerId);
container->containerWorkDir = sandboxDirectory;
}
}
}
if (flags.docker_kill_orphans) {
return __recover(_containers);
}
return Nothing();
}
Future<Nothing> DockerContainerizerProcess::__recover(
const vector<Docker::Container>& _containers)
{
vector<ContainerID> containerIds;
vector<Future<Nothing>> futures;
foreach (const Docker::Container& container, _containers) {
VLOG(1) << "Checking if Docker container named '"
<< container.name << "' was started by Mesos";
Option<ContainerID> id = parse(container);
// Ignore containers that Mesos didn't start.
if (id.isNone()) {
continue;
}
VLOG(1) << "Checking if Mesos container with ID '"
<< stringify(id.get()) << "' has been orphaned";
// Check if we're watching an executor for this container ID and
// if not, rm -f the Docker container.
if (!containers_.contains(id.get())) {
// TODO(alexr): After the deprecation cycle (started in 1.0), update
// this to omit the timeout. Graceful shutdown of the container is not
// a containerizer responsibility; it is the responsibility of the agent
// in co-operation with the executor. Once `destroy()` is called, the
// container should be destroyed forcefully.
futures.push_back(
docker->stop(
container.id,
flags.docker_stop_timeout,
true));
containerIds.push_back(id.get());
}
}
return collect(futures)
.then(defer(self(), [=]() -> Future<Nothing> {
foreach (const ContainerID& containerId, containerIds) {
Try<Nothing> unmount = unmountPersistentVolumes(containerId);
if (unmount.isError()) {
return Failure("Unable to unmount volumes for Docker container '" +
containerId.value() + "': " + unmount.error());
}
}
LOG(INFO) << "Finished processing orphaned Docker containers";
return Nothing();
}));
}
Future<Containerizer::LaunchResult> DockerContainerizerProcess::launch(
const ContainerID& containerId,
const ContainerConfig& containerConfig,
const map<string, string>& environment,
const Option<string>& pidCheckpointPath)
{
if (containerId.has_parent()) {
return Failure("Nested containers are not supported");
}
if (containers_.contains(containerId)) {
return Failure("Container already started");
}
if (!containerConfig.has_container_info()) {
LOG(INFO) << "No container info found, skipping launch";
return Containerizer::LaunchResult::NOT_SUPPORTED;
}
if (containerConfig.container_info().type() != ContainerInfo::DOCKER) {
LOG(INFO) << "Skipping non-docker container";
return Containerizer::LaunchResult::NOT_SUPPORTED;
}
Try<Container*> container = Container::create(
containerId,
containerConfig,
environment,
pidCheckpointPath,
flags);
if (container.isError()) {
return Failure("Failed to create container: " + container.error());
}
containers_[containerId] = container.get();
LOG(INFO)
<< "Starting container '" << containerId
<< (containerConfig.has_task_info()
? "' for task '" + stringify(containerConfig.task_info().task_id())
: "")
<< "' (and executor '" << containerConfig.executor_info().executor_id()
<< "') of framework " << containerConfig.executor_info().framework_id();
Future<Nothing> f = Nothing();
if (HookManager::hooksAvailable()) {
f = HookManager::slavePreLaunchDockerTaskExecutorDecorator(
containerConfig.has_task_info()
? containerConfig.task_info()
: Option<TaskInfo>::none(),
containerConfig.executor_info(),
container.get()->containerName,
container.get()->containerWorkDir,
flags.sandbox_directory,
container.get()->environment)
.then(defer(self(), [this, containerId, containerConfig](
const DockerTaskExecutorPrepareInfo& decoratorInfo)
-> Future<Nothing> {
if (!containers_.contains(containerId)) {
return Failure("Container is already destroyed");
}
Container* container = containers_.at(containerId);
if (decoratorInfo.has_executorenvironment()) {
foreach (
const Environment::Variable& variable,
decoratorInfo.executorenvironment().variables()) {
// TODO(tillt): Tell the user about overrides possibly
// happening here while making sure we state the source
// hook causing this conflict.
container->environment[variable.name()] =
variable.value();
}
}
if (!decoratorInfo.has_taskenvironment()) {
return Nothing();
}
map<string, string> taskEnvironment;
foreach (
const Environment::Variable& variable,
decoratorInfo.taskenvironment().variables()) {
taskEnvironment[variable.name()] = variable.value();
}
if (containerConfig.has_task_info()) {
container->taskEnvironment = taskEnvironment;
// For dockerized command executors, the flags have already
// been serialized into the command, albeit without these
// environment variables. Append the last flag to the
// overridden command.
if (container->launchesExecutorContainer) {
container->command.add_arguments(
"--task_environment=" +
string(jsonify(taskEnvironment)));
}
} else {
// For custom executors, the environment variables from a
// hook are passed directly into the executor. It is up to
// the custom executor whether individual tasks should
// inherit these variables.
foreachpair (
const string& key,
const string& value,
taskEnvironment) {
container->environment[key] = value;
}
}
return Nothing();
}));
}
return f.then(defer(
self(),
&Self::_launch,
containerId,
containerConfig));
}
Future<Containerizer::LaunchResult> DockerContainerizerProcess::_launch(
const ContainerID& containerId,
const ContainerConfig& containerConfig)
{
if (!containers_.contains(containerId)) {
return Failure("Container is already destroyed");
}
Container* container = containers_.at(containerId);
if (containerConfig.has_task_info() && flags.docker_mesos_image.isNone()) {
// Launch the task by forking a subprocess to run the docker executor.
// TODO(steveniemitz): We should call 'update' to set CPU/CFS/mem
// quotas after 'launchExecutorProcess'. However, there is a race
// where 'update' can be called before mesos-docker-executor
// creates the Docker container for the task. See more details in
// the comments of r33174.
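// The launch sequence here is: fetch the URIs, pull the image, run
// the post-fetch hook, mount persistent volumes, fork the
// `mesos-docker-executor` process, and finally reap that process.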
return container->launch = fetch(containerId)
.then(defer(self(), [=]() {
return pull(containerId);
}))
.then(defer(self(), [=]() {
if (HookManager::hooksAvailable()) {
HookManager::slavePostFetchHook(
containerId, containerConfig.directory());
}
return mountPersistentVolumes(containerId);
}))
.then(defer(self(), [=]() {
return launchExecutorProcess(containerId);
}))
.then(defer(self(), [=](pid_t pid) {
return reapExecutor(containerId, pid);
}))
.then([]() {
return Containerizer::LaunchResult::SUCCESS;
});
}
string containerName = container->containerName;
if (container->executorName().isSome()) {
// Launch the container with the executor name as we expect the
// executor will launch the docker container.
containerName = container->executorName().get();
}
// Launch the task or executor by launching a separate docker
// container to run the executor.
// We need to do this when launching a task because the slave is
// running in a container (via the docker_mesos_image flag) and we
// want the executor to keep running when the slave container dies.
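// The sequence mirrors the one above, except that the executor runs
// inside its own Docker container; we additionally call `update()` to
// apply resource limits and checkpoint the executor pid obtained from
// `docker inspect`.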
return container->launch = fetch(containerId)
.then(defer(self(), [=]() {
return pull(containerId);
}))
.then(defer(self(), [=]() {
if (HookManager::hooksAvailable()) {
HookManager::slavePostFetchHook(
containerId, containerConfig.directory());
}
return mountPersistentVolumes(containerId);
}))
.then(defer(self(), [=]() {
return launchExecutorContainer(containerId, containerName);
}))
.then(defer(self(), [=](const Docker::Container& dockerContainer) {
// Call update to set CPU/CFS/mem quotas at launch.
// TODO(steveniemitz): Once the minimum docker version supported
// is >= 1.7 this can be changed to pass --cpu-period and
// --cpu-quota to the 'docker run' call in
// launchExecutorContainer.
return update(
containerId,
containerConfig.executor_info().resources(),
containerConfig.limits(),
true)
.then([=]() {
return Future<Docker::Container>(dockerContainer);
});
}))
.then(defer(self(), [=](const Docker::Container& dockerContainer) {
return checkpointExecutor(containerId, dockerContainer);
}))
.then(defer(self(), [=](pid_t pid) {
return reapExecutor(containerId, pid);
}))
.then([]() {
return Containerizer::LaunchResult::SUCCESS;
});
}
Future<Docker::Container> DockerContainerizerProcess::launchExecutorContainer(
const ContainerID& containerId,
const string& containerName)
{
if (!containers_.contains(containerId)) {
return Failure("Container is already destroyed");
}
if (containers_[containerId]->state == Container::DESTROYING) {
return Failure(
"Container is being destroyed during launching excutor container");
}
Container* container = containers_.at(containerId);
container->state = Container::RUNNING;
return logger->prepare(container->id, container->containerConfig)
.then(defer(
self(),
[=](const ContainerIO& containerIO)
-> Future<Docker::Container> {
// We need to pass `flags.default_container_dns` only when the agent is not
// running in a Docker container. This is to handle the case of launching a
// custom executor in a Docker container. If the agent is running in a
// Docker container (i.e., flags.docker_mesos_image.isSome() == true), that
// is the case of launching `mesos-docker-executor` in a Docker container
// with the Docker image `flags.docker_mesos_image`. In that case we already
// set `flags.default_container_dns` in the method `dockerFlags()`.
Try<Docker::RunOptions> runOptions = Docker::RunOptions::create(
container->container,
container->command,
containerName,
container->containerWorkDir,
flags.sandbox_directory,
container->resourceRequests,
#ifdef __linux__
flags.cgroups_enable_cfs,
#else
false,
#endif
container->environment,
None(), // No extra devices.
flags.docker_mesos_image.isNone() ? flags.default_container_dns : None(),
container->resourceLimits);
if (runOptions.isError()) {
return Failure(runOptions.error());
}
// Start the executor in a Docker container.
// This executor could either be a custom executor specified by an
// ExecutorInfo, or the docker executor.
Future<Option<int>> run = docker->run(
runOptions.get(),
containerIO.out,
containerIO.err);
// It's possible that 'run' terminates before we're able to
// obtain an 'inspect' result. It's also possible that 'run'
// fails in such a manner that we will never see the container
// via 'inspect'. In these cases we discard the 'inspect' and
// propagate a failure back.
auto promise = std::make_shared<Promise<Docker::Container>>();
Future<Docker::Container> inspect =
docker->inspect(containerName, slave::DOCKER_INSPECT_DELAY);
inspect
.onAny([=](Future<Docker::Container> container) {
promise->associate(container);
});
run.onAny([=]() mutable {
if (!run.isReady()) {
promise->fail(run.isFailed() ? run.failure() : "discarded");
inspect.discard();
} else if (run->isNone()) {
promise->fail("Failed to obtain exit status of container");
inspect.discard();
} else {
if (!WSUCCEEDED(run->get())) {
promise->fail("Container " + WSTRINGIFY(run->get()));
inspect.discard();
}
// TODO(bmahler): Handle the case where the 'run' exits
// cleanly but no 'inspect' result is available.
}
});
return promise->future();
}));
}
Future<pid_t> DockerContainerizerProcess::launchExecutorProcess(
const ContainerID& containerId)
{
if (!containers_.contains(containerId)) {
return Failure("Container is already destroyed");
}
if (containers_[containerId]->state == Container::DESTROYING) {
return Failure(
"Container is being destroyed during launching executor process");
}
Container* container = containers_.at(containerId);
container->state = Container::RUNNING;
// Prepare environment variables for the executor.
map<string, string> environment = container->environment;
// Include any environment variables from ExecutorInfo.
foreach (const Environment::Variable& variable,
container->containerConfig.executor_info()
.command().environment().variables()) {
const string& name = variable.name();
const string& value = variable.value();
if (environment.count(name)) {
VLOG(1) << "Overwriting environment variable '"
<< name << "', original: '"
<< environment[name] << "', new: '"
<< value << "', for container "
<< container->id;
}
environment[name] = value;
}
// Pass GLOG flag to the executor.
const Option<string> glog = os::getenv("GLOG_v");
if (glog.isSome()) {
environment["GLOG_v"] = glog.get();
}
if (environment.count("PATH") == 0) {
environment["PATH"] = os::host_default_path();
// TODO(andschwa): We will consider removing the `#ifdef` in the future, as
// other platforms may benefit from being pointed to the same `docker` in
// both Agent and Executor (there is a chance that the cleaned path results
// in using a different docker, if multiple dockers are installed).
#ifdef __WINDOWS__
// Docker is generally not installed in `os::host_default_path()` on
// Windows, so the executor will not be able to find `docker`. We search for
// `docker` in `PATH` and prepend the parent directory to
// `environment["PATH"]`. We prepend instead of append so that in the off
// chance that `docker` is in `host_default_path`, the executor and agent
// will use the same `docker`.
Option<string> dockerPath = os::which("docker");
if (dockerPath.isSome()) {
environment["PATH"] =
Path(dockerPath.get()).dirname() + ";" + environment["PATH"];
}
#endif // __WINDOWS__
}
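// Only the executor binary name goes into `argv`; all configuration is
// passed via the flags object and the environment prepared above.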
vector<string> argv;
argv.push_back(MESOS_DOCKER_EXECUTOR);
Future<Nothing> allocateGpus = Nothing();
#ifdef __linux__
Option<double> gpus = Resources(container->resourceRequests).gpus();
if (gpus.isSome() && gpus.get() > 0) {
// Make sure that the `gpus` resource is not fractional.
// We rely on scalar resources only having 3 digits of precision.
if (static_cast<long long>(gpus.get() * 1000.0) % 1000 != 0) {
return Failure("The 'gpus' resource must be an unsigned integer");
}
allocateGpus = allocateNvidiaGpus(containerId, gpus.get());
}
#endif // __linux__
return allocateGpus
.then(defer(self(), [=]() {
return logger->prepare(container->id, container->containerConfig);
}))
.then(defer(
self(),
[=](const ContainerIO& containerIO)
-> Future<pid_t> {
// NOTE: The child process will be blocked until all hooks have been
// executed.
vector<Subprocess::ParentHook> parentHooks;
// NOTE: Currently we don't care about the order of the hooks, as
// both hooks are independent.
// A hook that is executed in the parent process. It attempts to checkpoint
// the process pid.
//
// NOTE:
// - The child process is blocked by the hook infrastructure while
// these hooks are executed.
// - It is safe to bind `this`, as hooks are executed immediately
// in a `subprocess` call.
// - If `checkpoint` returns an Error, the child process will be killed.
parentHooks.emplace_back(Subprocess::ParentHook(lambda::bind(
&DockerContainerizerProcess::checkpoint,
this,
containerId,
lambda::_1)));
#ifdef __linux__
// If we are on systemd, then extend the life of the executor. Any
// grandchildren's lives will also be extended.
if (systemd::enabled()) {
parentHooks.emplace_back(Subprocess::ParentHook(
&systemd::mesos::extendLifetime));
}
#elif defined(__WINDOWS__)
parentHooks.emplace_back(Subprocess::ParentHook::CREATE_JOB());
// Setting the "kill on close" job object limit ties the lifetime of the
// docker processes to that of the executor. This ensures that if the
// executor exits, the docker processes aren't leaked.
parentHooks.emplace_back(Subprocess::ParentHook(
[](pid_t pid) { return os::set_job_kill_on_close_limit(pid); }));
#endif // __linux__
// Prepare the flags to pass to the mesos docker executor process.
::mesos::internal::docker::Flags launchFlags = dockerFlags(
flags,
container->containerName,
container->containerWorkDir,
container->taskEnvironment);
VLOG(1) << "Launching 'mesos-docker-executor' with flags '"
<< launchFlags << "'";
// Construct the mesos-docker-executor using the "name" we gave the
// container (to distinguish it from Docker containers not created
// by Mesos).
Try<Subprocess> s = subprocess(
path::join(flags.launcher_dir, MESOS_DOCKER_EXECUTOR),
argv,
Subprocess::PIPE(),
containerIO.out,
containerIO.err,
&launchFlags,
environment,
None(),
parentHooks,
{Subprocess::ChildHook::SETSID(),
Subprocess::ChildHook::CHDIR(container->containerWorkDir)});
if (s.isError()) {
return Failure("Failed to fork executor: " + s.error());
}
return s->pid();
}));
}
Future<pid_t> DockerContainerizerProcess::checkpointExecutor(
const ContainerID& containerId,
const Docker::Container& dockerContainer)
{
// After we do Docker::run we shouldn't remove a container until
// after we set Container::status.
CHECK(containers_.contains(containerId));
Option<pid_t> pid = dockerContainer.pid;
if (!pid.isSome()) {
return Failure("Unable to get executor pid after launch");
}
Try<Nothing> checkpointed = checkpoint(containerId, pid.get());
if (checkpointed.isError()) {
return Failure(
"Failed to checkpoint executor's pid: " + checkpointed.error());
}
return pid.get();
}
Future<Nothing> DockerContainerizerProcess::reapExecutor(
const ContainerID& containerId,
pid_t pid)
{
// After we do Docker::run we shouldn't remove a container until
// after we set 'status', which we do in this function.
CHECK(containers_.contains(containerId));
Container* container = containers_.at(containerId);
// And finally watch for when the container gets reaped.
container->status.set(process::reap(pid));
container->status.future()
->onAny(defer(self(), &Self::reaped, containerId));
return Nothing();
}
Future<Nothing> DockerContainerizerProcess::update(
const ContainerID& containerId,
const Resources& resourceRequests,
const google::protobuf::Map<string, Value::Scalar>& resourceLimits,
bool force)
{
CHECK(!containerId.has_parent());
if (!containers_.contains(containerId)) {
LOG(WARNING) << "Ignoring updating unknown container " << containerId;
return Nothing();
}
Container* container = containers_.at(containerId);
if (container->state == Container::DESTROYING) {
LOG(INFO) << "Ignoring updating container " << containerId
<< " that is being destroyed";
return Nothing();
}
if (container->generatedForCommandTask) {
// Store the resources for usage().
container->resourceRequests = resourceRequests;
container->resourceLimits = resourceLimits;
LOG(INFO) << "Ignoring updating container " << containerId
<< " because it is generated for a command task";
return Nothing();
}
if (container->resourceRequests == resourceRequests &&
container->resourceLimits == resourceLimits &&
!force) {
LOG(INFO) << "Ignoring updating container " << containerId
<< " because resources passed to update are identical to"
<< " existing resources";
return Nothing();
}
// TODO(tnachen): Support updating persistent volumes, which requires
// Docker mount propagation support.
// TODO(gyliu): Support updating GPU resources.
// Store the resources for usage().
container->resourceRequests = resourceRequests;
container->resourceLimits = resourceLimits;
#ifdef __linux__
if (!resourceRequests.cpus().isSome() &&
!resourceRequests.mem().isSome() &&
!resourceLimits.count("cpus") &&
!resourceLimits.count("mem")) {
LOG(WARNING) << "Ignoring update as no supported resources are present";
return Nothing();
}
// Skip inspecting the docker container if we already have the cgroups.
if (container->cpuCgroup.isSome() && container->memoryCgroup.isSome()) {
return __update(containerId, resourceRequests, resourceLimits);
}
string containerName = containers_.at(containerId)->containerName;
// Since the Docker daemon might hang, we have to retry the inspect command.
//
// NOTE: This code is duplicated from the built-in docker executor, but
// the retry interval is not passed to `inspect`, because the container might
// be terminated.
// TODO(abudnik): Consider using a class helper for retrying docker commands.
auto inspectLoop = loop(
self(),
[=]() {
return await(
docker->inspect(containerName)
.after(
slave::DOCKER_INSPECT_TIMEOUT,
[=](Future<Docker::Container> future) {
LOG(WARNING) << "Docker inspect timed out after "
<< slave::DOCKER_INSPECT_TIMEOUT
<< " for container "
<< "'" << containerName << "'";
// We need to clean up the hanging Docker CLI process.
// Discarding the inspect future triggers a callback in
// the Docker library that kills the subprocess and
// transitions the future.
future.discard();
return future;
}));
},
[](const Future<Docker::Container>& future)
-> Future<ControlFlow<Docker::Container>> {
if (future.isReady()) {
return Break(future.get());
}
if (future.isFailed()) {
return Failure(future.failure());
}
return Continue();
});
return inspectLoop
.then(defer(
self(),
&Self::_update,
containerId,
resourceRequests,
resourceLimits,
lambda::_1));
#else
return Nothing();
#endif // __linux__
}
#ifdef __linux__
Future<Nothing> DockerContainerizerProcess::_update(
const ContainerID& containerId,
const Resources& resourceRequests,
const google::protobuf::Map<string, Value::Scalar>& resourceLimits,
const Docker::Container& container)
{
if (container.pid.isNone()) {
return Nothing();
}
if (!containers_.contains(containerId)) {
LOG(INFO) << "Container has been removed after docker inspect, "
<< "skipping update";
return Nothing();
}
containers_.at(containerId)->pid = container.pid.get();
// NOTE: Normally, a Docker container should be in its own cgroup.
// However, a zombie process (exited but not reaped) will be
// temporarily moved into the system root cgroup. We add some
// defensive check here to make sure we are not changing the knobs
// in the root cgroup. See MESOS-8480 for details.
const string systemRootCgroup = stringify(os::PATH_SEPARATOR);
// We need to find the cgroup(s) this container is currently running
// in for both the hierarchy with the 'cpu' subsystem attached and
// the hierarchy with the 'memory' subsystem attached so we can
// update the proper cgroup control files.
// Determine the cgroup for the 'cpu' subsystem (based on the
// container's pid).
Result<string> cpuCgroup = cgroups::cpu::cgroup(container.pid.get());
if (cpuCgroup.isError()) {
return Failure("Failed to determine cgroup for the 'cpu' subsystem: " +
cpuCgroup.error());
} else if (cpuCgroup.isNone()) {
LOG(WARNING) << "Container " << containerId
<< " does not appear to be a member of a cgroup"
<< " where the 'cpu' subsystem is mounted";
} else if (cpuCgroup.get() == systemRootCgroup) {
LOG(WARNING)
<< "Process '" << container.pid.get()
<< "' should not be in the system root cgroup (being destroyed?)";
} else {
// Cache the CPU cgroup.
containers_.at(containerId)->cpuCgroup = cpuCgroup.get();
}
// Now determine the cgroup for the 'memory' subsystem.
Result<string> memoryCgroup = cgroups::memory::cgroup(container.pid.get());
if (memoryCgroup.isError()) {
return Failure("Failed to determine cgroup for the 'memory' subsystem: " +
memoryCgroup.error());
} else if (memoryCgroup.isNone()) {
LOG(WARNING) << "Container " << containerId
<< " does not appear to be a member of a cgroup"
<< " where the 'memory' subsystem is mounted";
} else if (memoryCgroup.get() == systemRootCgroup) {
LOG(WARNING)
<< "Process '" << container.pid.get()
<< "' should not be in the system root cgroup (being destroyed?)";
} else {
// Cache the memory cgroup.
containers_.at(containerId)->memoryCgroup = memoryCgroup.get();
}
if (containers_.at(containerId)->cpuCgroup.isNone() &&
containers_.at(containerId)->memoryCgroup.isNone()) {
return Nothing();
}
return __update(containerId, resourceRequests, resourceLimits);
}
Future<Nothing> DockerContainerizerProcess::__update(
const ContainerID& containerId,
const Resources& resourceRequests,
const google::protobuf::Map<string, Value::Scalar>& resourceLimits)
{
CHECK(containers_.contains(containerId));
Container* container = containers_.at(containerId);
// Determine the cgroups hierarchies where the 'cpu' and
// 'memory' subsystems are mounted (they may be the same). Note that
// we make these static so we can reuse the result for subsequent
// calls.
static Result<string> cpuHierarchy = cgroups::hierarchy("cpu");
static Result<string> memHierarchy = cgroups::hierarchy("memory");
if (cpuHierarchy.isError()) {
return Failure("Failed to determine the cgroup hierarchy "
"where the 'cpu' subsystem is mounted: " +
cpuHierarchy.error());
}
if (memHierarchy.isError()) {
return Failure("Failed to determine the cgroup hierarchy "
"where the 'memory' subsystem is mounted: " +
memHierarchy.error());
}
Option<string> cpuCgroup = container->cpuCgroup;
Option<string> memCgroup = container->memoryCgroup;
Option<double> cpuRequest = resourceRequests.cpus();
Option<Bytes> memRequest = resourceRequests.mem();
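// Extract the optional CPU and memory limits from the resource limits map.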
Option<double> cpuLimit, memLimit;
foreach (auto&& limit, resourceLimits) {
if (limit.first == "cpus") {
cpuLimit = limit.second.value();
} else if (limit.first == "mem") {
memLimit = limit.second.value();
}
}
// Update the CPU shares and CFS quota (if applicable).
if (cpuHierarchy.isSome() && cpuCgroup.isSome()) {
if (cpuRequest.isSome()) {
uint64_t shares = std::max(
(uint64_t) (CPU_SHARES_PER_CPU * cpuRequest.get()), MIN_CPU_SHARES);
Try<Nothing> write =
cgroups::cpu::shares(cpuHierarchy.get(), cpuCgroup.get(), shares);
if (write.isError()) {
return Failure("Failed to update 'cpu.shares': " + write.error());
}
LOG(INFO) << "Updated 'cpu.shares' to " << shares
<< " at " << path::join(cpuHierarchy.get(), cpuCgroup.get())
<< " for container " << containerId;
}
// Set CFS quota to CPU limit (if any) or to CPU request (if the
// flag `--cgroups_enable_cfs` is true).
if (cpuLimit.isSome() || (flags.cgroups_enable_cfs && cpuRequest.isSome())) {
Try<Nothing> write = cgroups::cpu::cfs_period_us(
cpuHierarchy.get(),
cpuCgroup.get(),
CPU_CFS_PERIOD);
if (write.isError()) {
return Failure(
"Failed to update 'cpu.cfs_period_us': " + write.error());
}
if (cpuLimit.isSome() && std::isinf(cpuLimit.get())) {
write = cgroups::write(
cpuHierarchy.get(), cpuCgroup.get(), "cpu.cfs_quota_us", "-1");
if (write.isError()) {
return Failure(
"Failed to update 'cpu.cfs_quota_us': " + write.error());
}
LOG(INFO) << "Updated 'cpu.cfs_period_us' to " << CPU_CFS_PERIOD
<< " and 'cpu.cfs_quota_us' to -1 at "
<< path::join(cpuHierarchy.get(), cpuCgroup.get())
<< " for container " << containerId;
} else {
const double& quota =
cpuLimit.isSome() ? cpuLimit.get() : cpuRequest.get();
Duration duration = std::max(CPU_CFS_PERIOD * quota, MIN_CPU_CFS_QUOTA);
write = cgroups::cpu::cfs_quota_us(
cpuHierarchy.get(), cpuCgroup.get(), duration);
if (write.isError()) {
return Failure(
"Failed to update 'cpu.cfs_quota_us': " + write.error());
}
LOG(INFO) << "Updated 'cpu.cfs_period_us' to " << CPU_CFS_PERIOD
<< " and 'cpu.cfs_quota_us' to " << duration << " (cpus "
<< quota << ") at "
<< path::join(cpuHierarchy.get(), cpuCgroup.get())
<< " for container " << containerId;
}
}
}
// Update the memory limits (if applicable).
if (memHierarchy.isSome() && memCgroup.isSome()) {
// TODO(tnachen): investigate and handle OOM with docker.
if (memRequest.isSome()) {
Bytes softLimit = std::max(memRequest.get(), MIN_MEMORY);
// Always set the soft limit.
Try<Nothing> write = cgroups::memory::soft_limit_in_bytes(
memHierarchy.get(), memCgroup.get(), softLimit);
if (write.isError()) {
return Failure("Failed to set 'memory.soft_limit_in_bytes': " +
write.error());
}
LOG(INFO) << "Updated 'memory.soft_limit_in_bytes' to " << softLimit
<< " at " << path::join(memHierarchy.get(), memCgroup.get())
<< " for container " << containerId;
}
// Read the existing hard limit.
Try<Bytes> currentHardLimit = cgroups::memory::limit_in_bytes(
memHierarchy.get(), memCgroup.get());
if (currentHardLimit.isError()) {
return Failure(
"Failed to read 'memory.limit_in_bytes': " +
currentHardLimit.error());
}
bool isInfiniteLimit = false;
Option<Bytes> hardLimit = None();
if (memLimit.isSome()) {
if (std::isinf(memLimit.get())) {
isInfiniteLimit = true;
} else {
hardLimit = std::max(
Megabytes(static_cast<uint64_t>(memLimit.get())), MIN_MEMORY);
}
} else if (memRequest.isSome()) {
hardLimit = std::max(memRequest.get(), MIN_MEMORY);
}
// Only update if new limit is infinite or higher than current limit.
// TODO(benh): Introduce a MemoryWatcherProcess which monitors the
// discrepancy between usage and soft limit and introduces a
// "manual oom" if necessary.
if (isInfiniteLimit) {
Try<Nothing> write = cgroups::write(
memHierarchy.get(), memCgroup.get(), "memory.limit_in_bytes", "-1");
if (write.isError()) {
return Failure(
"Failed to update 'memory.limit_in_bytes': " + write.error());
}
LOG(INFO) << "Updated 'memory.limit_in_bytes' to -1 at "
<< path::join(memHierarchy.get(), memCgroup.get())
<< " for container " << containerId;
} else if (hardLimit.isSome() && hardLimit.get() > currentHardLimit.get()) {
Try<Nothing> write = cgroups::memory::limit_in_bytes(
memHierarchy.get(), memCgroup.get(), hardLimit.get());
if (write.isError()) {
return Failure(
"Failed to set 'memory.limit_in_bytes': " + write.error());
}
LOG(INFO) << "Updated 'memory.limit_in_bytes' to " << hardLimit.get()
<< " at " << path::join(memHierarchy.get(), memCgroup.get())
<< " for container " << containerId;
}
}
return Nothing();
}
#endif // __linux__
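// Collects resource statistics for the container. On Linux the bulk
// of the statistics comes from the container's cgroups; the resource
// requests and limits recorded at launch time are used to fill in the
// soft and hard limit fields. If the container's pid is not yet known,
// 'docker inspect' is run first to obtain it.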
Future<ResourceStatistics> DockerContainerizerProcess::usage(
const ContainerID& containerId)
{
CHECK(!containerId.has_parent());
if (!containers_.contains(containerId)) {
return Failure("Unknown container: " + stringify(containerId));
}
Container* container = containers_.at(containerId);
if (container->state == Container::DESTROYING) {
return Failure("Container is being removed: " + stringify(containerId));
}
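// Helper that computes the statistics once the container's root pid
// is known. For command tasks the default resources of the command
// executor are subtracted so that only the task's share is reported.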
auto collectUsage = [this, containerId](
pid_t pid) -> Future<ResourceStatistics> {
// First make sure container is still there.
if (!containers_.contains(containerId)) {
return Failure("Container has been destroyed: " + stringify(containerId));
}
Container* container = containers_.at(containerId);
if (container->state == Container::DESTROYING) {
return Failure("Container is being removed: " + stringify(containerId));
}
ResourceStatistics result;
#ifdef __linux__
const Try<ResourceStatistics> cgroupStats = cgroupsStatistics(pid);
if (cgroupStats.isError()) {
return Failure("Failed to collect cgroup stats: " + cgroupStats.error());
}
result = cgroupStats.get();
#endif // __linux__
Option<double> cpuRequest, cpuLimit, memLimit;
Option<Bytes> memRequest;
// For command tasks, we should subtract the default resources (0.1 cpus and
// 32MB memory) for the command executor from the container's resource
// requests and limits; otherwise we would report incorrect resource
// statistics.
if (container->resourceRequests.cpus().isSome()) {
if (container->generatedForCommandTask) {
cpuRequest =
container->resourceRequests.cpus().get() - DEFAULT_EXECUTOR_CPUS;
} else {
cpuRequest = container->resourceRequests.cpus();
}
}
if (container->resourceRequests.mem().isSome()) {
if (container->generatedForCommandTask) {
memRequest =
container->resourceRequests.mem().get() - DEFAULT_EXECUTOR_MEM;
} else {
memRequest = container->resourceRequests.mem();
}
}
foreach (auto&& limit, container->resourceLimits) {
if (limit.first == "cpus") {
if (container->generatedForCommandTask &&
!std::isinf(limit.second.value())) {
cpuLimit = limit.second.value() - DEFAULT_EXECUTOR_CPUS;
} else {
cpuLimit = limit.second.value();
}
} else if (limit.first == "mem") {
if (container->generatedForCommandTask &&
!std::isinf(limit.second.value())) {
memLimit = limit.second.value() -
DEFAULT_EXECUTOR_MEM.bytes() / Bytes::MEGABYTES;
} else {
memLimit = limit.second.value();
}
}
}
if (cpuRequest.isSome()) {
result.set_cpus_soft_limit(cpuRequest.get());
}
if (cpuLimit.isSome()) {
// Get the total number of CPUs on this node; we will use it as the
// container's hard CPU limit if the CPU limit specified by the
// framework is infinite.
static Option<long> totalCPUs;
if (totalCPUs.isNone()) {
Try<long> cpus = os::cpus();
if (cpus.isError()) {
return Failure(
"Failed to auto-detect the number of cpus: " + cpus.error());
}
totalCPUs = cpus.get();
}
CHECK_SOME(totalCPUs);
result.set_cpus_limit(
std::isinf(cpuLimit.get()) ? totalCPUs.get() : cpuLimit.get());
#ifdef __linux__
} else if (flags.cgroups_enable_cfs && cpuRequest.isSome()) {
result.set_cpus_limit(cpuRequest.get());
#endif
}
if (memLimit.isSome()) {
// Get the total memory of this node; we will use it as the
// container's hard memory limit if the memory limit specified by
// the framework is infinite.
static Option<Bytes> totalMem;
if (totalMem.isNone()) {
Try<os::Memory> mem = os::memory();
if (mem.isError()) {
return Failure(
"Failed to auto-detect the size of main memory: " + mem.error());
}
totalMem = mem->total;
}
CHECK_SOME(totalMem);
result.set_mem_limit_bytes(
std::isinf(memLimit.get())
? totalMem->bytes()
: Megabytes(static_cast<uint64_t>(memLimit.get())).bytes());
if (memRequest.isSome()) {
result.set_mem_soft_limit_bytes(memRequest->bytes());
}
} else if (memRequest.isSome()) {
result.set_mem_limit_bytes(memRequest->bytes());
}
return result;
};
// Skip inspecting the docker container if we already have the pid.
if (container->pid.isSome()) {
return collectUsage(container->pid.get());
}
return docker->inspect(container->containerName)
.then(defer(
self(),
[this, containerId, collectUsage]
(const Docker::Container& _container) -> Future<ResourceStatistics> {
const Option<pid_t> pid = _container.pid;
if (pid.isNone()) {
return Failure("Container is not running");
}
if (!containers_.contains(containerId)) {
return Failure(
"Container has been destroyed:" + stringify(containerId));
}
Container* container = containers_.at(containerId);
// Update the container's pid now. We ran inspect because we didn't have
// a pid for the container.
container->pid = pid;
return collectUsage(pid.get());
}));
}
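// Reads CPU and memory statistics for the given pid from the
// 'cpuacct' and 'memory' cgroup subsystems, plus CFS throttling data
// from 'cpu.stat' when CFS is enabled. Only supported on Linux.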
Try<ResourceStatistics> DockerContainerizerProcess::cgroupsStatistics(
pid_t pid) const
{
#ifndef __linux__
return Error("Does not support cgroups on non-linux platform");
#else
static const Result<string> cpuacctHierarchy = cgroups::hierarchy("cpuacct");
static const Result<string> memHierarchy = cgroups::hierarchy("memory");
// NOTE: Normally, a Docker container should be in its own cgroup.
// However, a zombie process (exited but not reaped) will be
// temporarily moved into the system root cgroup. We add some
// defensive check here to make sure we are not reporting statistics
// for the root cgroup. See MESOS-8480 for details.
const string systemRootCgroup = stringify(os::PATH_SEPARATOR);
if (cpuacctHierarchy.isError()) {
return Error(
"Failed to determine the cgroup 'cpuacct' subsystem hierarchy: " +
cpuacctHierarchy.error());
}
if (memHierarchy.isError()) {
return Error(
"Failed to determine the cgroup 'memory' subsystem hierarchy: " +
memHierarchy.error());
}
const Result<string> cpuacctCgroup = cgroups::cpuacct::cgroup(pid);
if (cpuacctCgroup.isError()) {
return Error(
"Failed to determine cgroup for the 'cpuacct' subsystem: " +
cpuacctCgroup.error());
} else if (cpuacctCgroup.isNone()) {
return Error("Unable to find 'cpuacct' cgroup subsystem");
} else if (cpuacctCgroup.get() == systemRootCgroup) {
return Error(
"Process '" + stringify(pid) +
"' should not be in the system root cgroup (being destroyed?)");
}
const Result<string> memCgroup = cgroups::memory::cgroup(pid);
if (memCgroup.isError()) {
return Error(
"Failed to determine cgroup for the 'memory' subsystem: " +
memCgroup.error());
} else if (memCgroup.isNone()) {
return Error("Unable to find 'memory' cgroup subsystem");
} else if (memCgroup.get() == systemRootCgroup) {
return Error(
"Process '" + stringify(pid) +
"' should not be in the system root cgroup (being destroyed?)");
}
const Try<cgroups::cpuacct::Stats> cpuAcctStat =
cgroups::cpuacct::stat(cpuacctHierarchy.get(), cpuacctCgroup.get());
if (cpuAcctStat.isError()) {
return Error("Failed to get cpu.stat: " + cpuAcctStat.error());
}
const Try<hashmap<string, uint64_t>> memStats =
cgroups::stat(memHierarchy.get(), memCgroup.get(), "memory.stat");
if (memStats.isError()) {
return Error(
"Error getting memory statistics from cgroups memory subsystem: " +
memStats.error());
}
if (!memStats->contains("rss")) {
return Error("cgroups memory stats does not contain 'rss' data");
}
ResourceStatistics result;
result.set_timestamp(Clock::now().secs());
result.set_cpus_system_time_secs(cpuAcctStat->system.secs());
result.set_cpus_user_time_secs(cpuAcctStat->user.secs());
result.set_mem_rss_bytes(memStats->at("rss"));
// Add the cpu.stat information only if CFS is enabled.
if (flags.cgroups_enable_cfs) {
static const Result<string> cpuHierarchy = cgroups::hierarchy("cpu");
if (cpuHierarchy.isError()) {
return Error(
"Failed to determine the cgroup 'cpu' subsystem hierarchy: " +
cpuHierarchy.error());
}
const Result<string> cpuCgroup = cgroups::cpu::cgroup(pid);
if (cpuCgroup.isError()) {
return Error(
"Failed to determine cgroup for the 'cpu' subsystem: " +
cpuCgroup.error());
} else if (cpuCgroup.isNone()) {
return Error("Unable to find 'cpu' cgroup subsystem");
} else if (cpuCgroup.get() == systemRootCgroup) {
return Error(
"Process '" + stringify(pid) +
"' should not be in the system root cgroup (being destroyed?)");
}
const Try<hashmap<string, uint64_t>> stat =
cgroups::stat(cpuHierarchy.get(), cpuCgroup.get(), "cpu.stat");
if (stat.isError()) {
return Error("Failed to read cpu.stat: " + stat.error());
}
Option<uint64_t> nr_periods = stat->get("nr_periods");
if (nr_periods.isSome()) {
result.set_cpus_nr_periods(nr_periods.get());
}
Option<uint64_t> nr_throttled = stat->get("nr_throttled");
if (nr_throttled.isSome()) {
result.set_cpus_nr_throttled(nr_throttled.get());
}
Option<uint64_t> throttled_time = stat->get("throttled_time");
if (throttled_time.isSome()) {
result.set_cpus_throttled_time_secs(
Nanoseconds(throttled_time.get()).secs());
}
}
return result;
#endif // __linux__
}
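// The Docker containerizer does not report any additional container
// status; only the container id is echoed back.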
Future<ContainerStatus> DockerContainerizerProcess::status(
const ContainerID& containerId)
{
ContainerStatus result;
result.mutable_container_id()->CopyFrom(containerId);
return result;
}
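// Returns the container's termination once it has been set, or None
// if the container is unknown.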
Future<Option<ContainerTermination>> DockerContainerizerProcess::wait(
const ContainerID& containerId)
{
CHECK(!containerId.has_parent());
if (!containers_.contains(containerId)) {
return None();
}
return containers_.at(containerId)->termination.future()
.then(Option<ContainerTermination>::some);
}
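// Destroys the container, handling each launch state (FETCHING,
// PULLING, MOUNTING, RUNNING) separately. 'killed' indicates whether
// the destroy was requested explicitly; when false (e.g., the
// executor exited on its own) we skip 'docker stop' and the SIGTERM
// to the executor and go straight to cleanup.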
Future<Option<ContainerTermination>> DockerContainerizerProcess::destroy(
const ContainerID& containerId,
bool killed)
{
if (!containers_.contains(containerId)) {
// TODO(bmahler): Currently the agent does not log destroy
// failures or unknown containers, so we log it here for now.
// Move this logging into the callers.
LOG(WARNING) << "Attempted to destroy unknown container " << containerId;
return None();
}
// TODO(klueska): Ideally, we would do this check as the first thing
// we do after entering this function. However, the containerizer
// API currently requires callers of `launch()` to also call
// `destroy()` if the launch fails (MESOS-6214). As such, putting
// the check at the top of this function would cause the
// containerizer to crash if the launch failure was due to the
// container having its `parent` field set. Once we remove the
// requirement for `destroy()` to be called explicitly after launch
// failures, we should move this check to the top of this function.
CHECK(!containerId.has_parent());
Container* container = containers_.at(containerId);
if (container->launch.isFailed()) {
VLOG(1) << "Container " << containerId << " launch failed";
// This means we failed to launch the container and we're trying to
// cleanup.
CHECK_PENDING(container->status.future());
ContainerTermination termination;
// NOTE: The launch error message will be retrieved by the slave
// and properly set in the corresponding status update.
container->termination.set(termination);
containers_.erase(containerId);
delete container;
return termination;
}
if (container->state == Container::DESTROYING) {
return container->termination.future()
.then(Option<ContainerTermination>::some);
}
// It's possible that destroy is getting called before
// DockerContainerizer::launch has completed (i.e., after we've
// returned a future but before we've completed the fetching of the
// URIs, or the Docker::run, or the wait, etc.).
//
// If we're FETCHING, we want to stop the fetching and then
// cleanup. Note, we need to make sure that we deal with the race
// with trying to terminate the fetcher so that even if the fetcher
// returns successfully we won't try to do a Docker::run.
//
// If we're PULLING, we want to terminate the 'docker pull' and then
// cleanup. Just as above, we'll need to deal with the race with
// 'docker pull' returning successfully.
//
// If we're MOUNTING, we want to unmount all the persistent volumes
// that have been mounted.
//
// If we're RUNNING, we want to wait for the status to get set, then
// do a Docker::kill, then wait for the status to complete, then
// cleanup.
if (container->state == Container::FETCHING) {
LOG(INFO) << "Destroying container " << containerId << " in FETCHING state";
fetcher->kill(containerId);
ContainerTermination termination;
termination.set_message("Container destroyed while fetching");
container->termination.set(termination);
// Even if the fetch succeeded just before we did the killtree,
// removing the container here means that we won't proceed with
// the Docker::run.
containers_.erase(containerId);
delete container;
return termination;
}
if (container->state == Container::PULLING) {
LOG(INFO) << "Destroying container " << containerId << " in PULLING state";
container->pull.discard();
ContainerTermination termination;
termination.set_message("Container destroyed while pulling image");
container->termination.set(termination);
containers_.erase(containerId);
delete container;
return termination;
}
if (container->state == Container::MOUNTING) {
LOG(INFO) << "Destroying container " << containerId << " in MOUNTING state";
// Persistent volumes might already have been mounted; remove them
// if necessary.
Try<Nothing> unmount = unmountPersistentVolumes(containerId);
if (unmount.isError()) {
LOG(WARNING) << "Failed to remove persistent volumes on destroy for"
<< " container " << containerId << ": " << unmount.error();
}
ContainerTermination termination;
termination.set_message("Container destroyed while mounting volumes");
container->termination.set(termination);
containers_.erase(containerId);
delete container;
return termination;
}
CHECK(container->state == Container::RUNNING);
LOG(INFO) << "Destroying container " << containerId << " in RUNNING state";
container->state = Container::DESTROYING;
if (killed && container->executorPid.isSome()) {
LOG(INFO) << "Sending SIGTERM to executor with pid: "
<< container->executorPid.get();
// We need to clean up the executor as the executor might not have
// received the run task due to a failed containerizer update.
// We also kill the executor first since container->status below
// is waiting for the executor to finish.
Try<list<os::ProcessTree>> kill =
os::killtree(container->executorPid.get(), SIGTERM);
if (kill.isError()) {
// Ignoring the error from killing the executor as it may already
// have exited.
VLOG(1) << "Ignoring error when killing executor pid "
<< container->executorPid.get() << " in destroy, error: "
<< kill.error();
}
}
// Otherwise, wait for Docker::run to succeed, in which case we'll
// continue in _destroy (calling Docker::kill) or for Docker::run to
// fail, in which case we'll re-execute this function and cleanup
// above.
container->status.future()
.onAny(defer(self(), &Self::_destroy, containerId, killed));
return container->termination.future()
.then(Option<ContainerTermination>::some);
}
void DockerContainerizerProcess::_destroy(
const ContainerID& containerId,
bool killed)
{
CHECK(containers_.contains(containerId));
Container* container = containers_.at(containerId);
CHECK(container->state == Container::DESTROYING);
// Do a 'docker stop' which we'll then find out about in '__destroy'
// after we've reaped either the container's root process (in the
// event that we had just launched a container for an executor) or
// the mesos-docker-executor (in the case where we launched a
// container for a task).
LOG(INFO) << "Running docker stop on container " << containerId;
if (killed) {
// TODO(alexr): After the deprecation cycle (started in 1.0), update
// this to omit the timeout. Graceful shutdown of the container is not
// a containerizer responsibility; it is the responsibility of the agent
// in co-operation with the executor. Once `destroy()` is called, the
// container should be destroyed forcefully.
// The `after` fallback should remain as a precaution against the docker
// stop command hanging.
docker->stop(container->containerName, flags.docker_stop_timeout)
.after(
flags.docker_stop_timeout + DOCKER_FORCE_KILL_TIMEOUT,
defer(self(), &Self::destroyTimeout, containerId, lambda::_1))
.onAny(defer(self(), &Self::__destroy, containerId, killed, lambda::_1));
} else {
__destroy(containerId, killed, Nothing());
}
}
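// Continuation of _destroy(): invoked once 'docker stop' has completed
// (or immediately when 'killed' is false). If the kill failed and the
// container's exit status is not yet available, the termination is
// failed; otherwise we chain to ___destroy() with the exit status.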
void DockerContainerizerProcess::__destroy(
const ContainerID& containerId,
bool killed,
const Future<Nothing>& kill)
{
CHECK(containers_.contains(containerId));
Container* container = containers_.at(containerId);
if (!kill.isReady() && !container->status.future().isReady()) {
// TODO(benh): This means we've failed to do a Docker::kill, which
// means it's possible that the container is still going to be
// running after we return! We either need to have a periodic
// "garbage collector", or we need to retry the Docker::kill
// indefinitely until it has been successful.
string failure = "Failed to kill the Docker container: " +
(kill.isFailed() ? kill.failure() : "discarded future");
#ifdef __linux__
// TODO(gyliu): We will never de-allocate these GPUs,
// unless the agent is restarted!
if (!container->gpus.empty()) {
failure += ": " + stringify(container->gpus.size()) + " GPUs leaked";
}
#endif // __linux__
container->termination.fail(failure);
containers_.erase(containerId);
delay(
flags.docker_remove_delay,
self(),
&Self::remove,
container->containerName,
container->executorName());
delete container;
return;
}
// Status must be ready since we did a Docker::kill.
CHECK_READY(container->status.future());
container->status.future()
->onAny(defer(self(), &Self::___destroy, containerId, killed, lambda::_1));
}
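// Unmounts any persistent volumes and (on Linux) deallocates Nvidia
// GPUs before finalizing the termination in ____destroy().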
void DockerContainerizerProcess::___destroy(
const ContainerID& containerId,
bool killed,
const Future<Option<int>>& status)
{
CHECK(containers_.contains(containerId));
Try<Nothing> unmount = unmountPersistentVolumes(containerId);
if (unmount.isError()) {
// TODO(tnachen): Failing to unmount a persistent volume now
// leads to leaving the volume on the host, and we won't retry
// again since the Docker container is removed. We should consider
// not removing the container so we can retry.
LOG(WARNING) << "Failed to remove persistent volumes on destroy for"
<< " container " << containerId << ": " << unmount.error();
}
Future<Nothing> deallocateGpus = Nothing();
#ifdef __linux__
// Deallocate GPU resources before we destroy container.
if (!containers_.at(containerId)->gpus.empty()) {
deallocateGpus = deallocateNvidiaGpus(containerId);
}
#endif // __linux__
deallocateGpus
.onAny(defer(self(), &Self::____destroy, containerId, killed, status));
}
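// Final step of the destroy chain: sets the termination (including
// the exit status, if available), removes the container from our map,
// and schedules removal of the Docker container after
// 'docker_remove_delay'.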
void DockerContainerizerProcess::____destroy(
const ContainerID& containerId,
bool killed,
const Future<Option<int>>& status)
{
Container* container = containers_.at(containerId);
ContainerTermination termination;
if (status.isReady() && status->isSome()) {
termination.set_status(status->get());
}
termination.set_message(
killed ? "Container killed" : "Container terminated");
container->termination.set(termination);
containers_.erase(containerId);
delay(
flags.docker_remove_delay,
self(),
&Self::remove,
container->containerName,
container->executorName());
delete container;
}
Future<Nothing> DockerContainerizerProcess::destroyTimeout(
const ContainerID& containerId,
Future<Nothing> future)
{
CHECK(containers_.contains(containerId));
LOG(WARNING) << "Docker stop timed out for container " << containerId;
Container* container = containers_.at(containerId);
// A hanging `docker stop` could be a problem with docker or even a kernel
// bug. Assuming that this is a docker problem, circumventing docker and
// killing the process run by it ourselves might help here.
if (container->pid.isSome()) {
LOG(WARNING) << "Sending SIGKILL to process with pid "
<< container->pid.get();
Try<list<os::ProcessTree>> kill =
os::killtree(container->pid.get(), SIGKILL);
if (kill.isError()) {
// Ignoring the error from killing the process as it may already
// have exited.
VLOG(1) << "Ignoring error when killing process pid "
<< container->pid.get() << " in destroy, error: "
<< kill.error();
}
}
return future;
}
Future<hashset<ContainerID>> DockerContainerizerProcess::containers()
{
return containers_.keys();
}
void DockerContainerizerProcess::reaped(const ContainerID& containerId)
{
if (!containers_.contains(containerId)) {
return;
}
LOG(INFO) << "Executor for container " << containerId << " has exited";
// The executor has exited so destroy the container.
destroy(containerId, false);
}
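// Forcibly removes the Docker container and, if a separate executor
// container was used, that container as well.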
void DockerContainerizerProcess::remove(
const string& containerName,
const Option<string>& executor)
{
docker->rm(containerName, true);
if (executor.isSome()) {
docker->rm(executor.get(), true);
}
}
} // namespace slave {
} // namespace internal {
} // namespace mesos {