// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <errno.h>
#ifdef __linux__
#include <sched.h>
#include <signal.h>
#include <sys/prctl.h>
#endif // __linux__
#include <string.h>
#include <algorithm>
#include <iostream>
#include <set>
#include <string>
#include <glog/logging.h>
#include <glog/raw_logging.h>
#include <process/subprocess.hpp>
#include <stout/adaptor.hpp>
#include <stout/foreach.hpp>
#include <stout/fs.hpp>
#include <stout/option.hpp>
#include <stout/os.hpp>
#include <stout/path.hpp>
#include <stout/protobuf.hpp>
#include <stout/stringify.hpp>
#include <stout/try.hpp>
#include <stout/unreachable.hpp>
#include <stout/os/exec.hpp>
#include <stout/os/int_fd.hpp>
#include <stout/os/open.hpp>
#include <stout/os/realpath.hpp>
#include <stout/os/shell.hpp>
#include <stout/os/which.hpp>
#include <stout/os/write.hpp>
#ifdef __WINDOWS__
#include <stout/os/windows/jobobject.hpp>
#endif // __WINDOWS__
#include <mesos/mesos.hpp>
#include <mesos/type_utils.hpp>
#include <mesos/slave/containerizer.hpp>
#include "common/parse.hpp"
#include "common/status_utils.hpp"
#ifdef __linux__
#include "linux/capabilities.hpp"
#include "linux/fs.hpp"
#include "linux/ns.hpp"
#endif
#ifdef ENABLE_SECCOMP_ISOLATOR
#include "linux/seccomp/seccomp.hpp"
#endif
#ifndef __WINDOWS__
#include "posix/rlimits.hpp"
#endif // __WINDOWS__
#include "slave/containerizer/mesos/launch.hpp"
#include "slave/containerizer/mesos/paths.hpp"
using std::cerr;
using std::endl;
using std::set;
using std::string;
using std::vector;
using process::Owned;
#ifdef __linux__
using mesos::internal::capabilities::Capabilities;
using mesos::internal::capabilities::Capability;
using mesos::internal::capabilities::ProcessCapabilities;
#endif // __linux__
#ifdef ENABLE_SECCOMP_ISOLATOR
using mesos::internal::seccomp::SeccompFilter;
#endif
using mesos::slave::ContainerFileOperation;
using mesos::slave::ContainerLaunchInfo;
using mesos::slave::ContainerMountInfo;
namespace mesos {
namespace internal {
namespace slave {
const string MesosContainerizerLaunch::NAME = "launch";
MesosContainerizerLaunch::Flags::Flags()
{
add(&Flags::launch_info,
"launch_info",
"");
add(&Flags::pipe_read,
"pipe_read",
"The read end of the control pipe. This is a file descriptor \n"
"on Posix, or a handle on Windows. It's caller's responsibility \n"
"to make sure the file descriptor or the handle is inherited \n"
"properly in the subprocess. It's used to synchronize with the \n"
"parent process. If not specified, no synchronization will happen.");
add(&Flags::pipe_write,
"pipe_write",
"The write end of the control pipe. This is a file descriptor \n"
"on Posix, or a handle on Windows. It's caller's responsibility \n"
"to make sure the file descriptor or the handle is inherited \n"
"properly in the subprocess. It's used to synchronize with the \n"
"parent process. If not specified, no synchronization will happen.");
add(&Flags::runtime_directory,
"runtime_directory",
"The runtime directory for the container (used for checkpointing)");
#ifdef __linux__
add(&Flags::namespace_mnt_target,
"namespace_mnt_target",
"The target 'pid' of the process whose mount namespace we'd like\n"
"to enter before executing the command.");
add(&Flags::unshare_namespace_mnt,
"unshare_namespace_mnt",
"Whether to launch the command in a new mount namespace.",
false);
#endif // __linux__
}
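// State used by `execute()` and by the signal handling helpers below
// (`signalHandler`, `exitWithSignal`, `exitWithStatus`). These are kept
// as globals because signal handlers cannot take extra arguments and
// must remain async signal safe.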
static Option<pid_t> containerPid = None();
static Option<string> containerStatusPath = None();
static Option<int_fd> containerStatusFd = None();
static void exitWithSignal(int sig);
static void exitWithStatus(int status);
#ifndef __WINDOWS__
static void signalSafeWriteStatus(int status)
{
const string statusString = std::to_string(status);
ssize_t result =
os::signal_safe::write(containerStatusFd.get(), statusString);
if (result < 0) {
// NOTE: We use RAW_LOG instead of LOG because RAW_LOG doesn't
// allocate any memory or grab locks. And according to
// https://code.google.com/p/google-glog/issues/detail?id=161
// it should work in 'most' cases in signal handlers.
RAW_LOG(ERROR, "Failed to write container status '%d': %d", status, errno);
}
}
// When launching the executor with an 'init' process, we need to
// forward all relevant signals to it. The functions below help to
// enable this forwarding.
static void signalHandler(int sig)
{
// If we don't yet have a container pid, we treat
// receiving a signal like a failure and exit.
if (containerPid.isNone()) {
exitWithSignal(sig);
}
// Otherwise we simply forward the signal to `containerPid`. We
// purposefully ignore the error here since we have to remain async
// signal safe. The only possible error scenario relevant to us is
// ESRCH, but if that happens it means the container pid is already
// gone and this process will exit soon. So we are safe.
os::kill(containerPid.get(), sig);
}
static Try<Nothing> installSignalHandlers()
{
// Install handlers for all standard POSIX signals
// (i.e. any signal less than `NSIG`).
for (int i = 1; i < NSIG; i++) {
// We don't want to forward the SIGCHLD signal, nor do we want to
// handle it ourselves because we reap all children inline in the
// `execute` function.
if (i == SIGCHLD) {
continue;
}
// We can't catch or ignore these signals, so we shouldn't try
// to register a handler for them.
if (i == SIGKILL || i == SIGSTOP) {
continue;
}
// The NSIG constant is used to determine the number of signals
// available on a system. However, Darwin, Linux, and BSD differ
// on their interpretation of the value of NSIG. Linux, for
// example, sets it to 65, whereas Darwin sets it to 32. The reason
// for the discrepancy is that Linux includes the real-time
// signals in this count, whereas Darwin does not. However, even on
// Linux, we are not able to arbitrarily install signal handlers
// for all the real-time signals -- they must not already have been
// registered with the system. For this reason, we
// standardize on verifying the installation of handlers for
// signals 1-31 (since these are defined in the POSIX standard),
// but we continue to attempt to install handlers up to the value
// of NSIG without verification.
const int posixLimit = 32;
if (os::signals::install(i, signalHandler) != 0 && i < posixLimit) {
return ErrnoError("Unable to register signal"
" '" + stringify(strsignal(i)) + "'");
}
}
return Nothing();
}
#endif // __WINDOWS__
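// NOTE: On POSIX, when a status fd has been checkpointed (see
// `execute()` below), both exit helpers encode the container's fate as
// a wait(2)-style status word via `W_EXITCODE(status, sig)`. As a rough
// sketch of the reader side (hypothetical, not part of this binary),
// the checkpointed value can be decoded with the standard macros:
//
//   int status = ...;  // value read back from the status file
//   if (WIFSIGNALED(status)) {
//     // Terminated by signal WTERMSIG(status).
//   } else if (WIFEXITED(status)) {
//     // Exited with code WEXITSTATUS(status).
//   }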
static void exitWithSignal(int sig)
{
#ifndef __WINDOWS__
if (containerStatusFd.isSome()) {
signalSafeWriteStatus(W_EXITCODE(0, sig));
os::close(containerStatusFd.get());
}
#endif // __WINDOWS__
::_exit(EXIT_FAILURE);
}
static void exitWithStatus(int status)
{
#ifndef __WINDOWS__
if (containerStatusFd.isSome()) {
signalSafeWriteStatus(W_EXITCODE(status, 0));
os::close(containerStatusFd.get());
}
#endif // __WINDOWS__
::_exit(status);
}
#ifdef __linux__
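// Performs a single mount described by a `ContainerMountInfo` message by
// forwarding its fields to `fs::mount()`. As a rough illustration only
// (the field values here are hypothetical, not taken from this file), a
// read-only bind mount of a host volume could arrive as:
//
//   source: "/var/lib/volumes/foo"
//   target: "/mnt/mesos/sandbox/foo"
//   flags:  MS_BIND | MS_RDONLY
//
// with `type` and `options` left unset.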
static Try<Nothing> mountContainerFilesystem(const ContainerMountInfo& mount)
{
return fs::mount(
mount.has_source() ? Option<string>(mount.source()) : None(),
mount.target(),
mount.has_type() ? Option<string>(mount.type()) : None(),
mount.has_flags() ? mount.flags() : 0,
mount.has_options() ? Option<string>(mount.options()) : None());
}
#endif // __linux__
static Try<Nothing> prepareMounts(const ContainerLaunchInfo& launchInfo)
{
#ifdef __linux__
bool cloneMountNamespace = std::find(
launchInfo.clone_namespaces().begin(),
launchInfo.clone_namespaces().end(),
CLONE_NEWNS) != launchInfo.clone_namespaces().end();
if (!cloneMountNamespace) {
// Mounts are not supported if the mount namespace is not cloned.
// Otherwise, we'll pollute the parent mount namespace.
if (!launchInfo.mounts().empty()) {
return Error(
"Mounts are not supported if the mount namespace is not cloned");
}
return Nothing();
}
// Now, set up the mount propagation for the container.
// 1) If there is no shared mount (i.e., no mount with "bidirectional"
// propagation), mark the root as recursively slave
// (i.e., --make-rslave) so that mounts do not leak to the parent
// mount namespace.
// 2) If there exist shared mounts, scan the mount table and mark
// the rest (i.e., the non-shared mounts) as slave one by one.
//
// TODO(jieyu): Currently, if the container has its own rootfs, the
// 'fs::chroot::enter' function will mark `/` as recursively slave.
// This will cause problems for shared mounts. As a result,
// bidirectional mount propagation does not work for containers that
// have rootfses.
//
// TODO(jieyu): Another caveat right now is that the CNI isolator
// will mark `/` as recursively slave in `isolate()` method if the
// container joins a named network. As a result, bidirectional mount
// propagation currently does not work for containers that want to
// join a CNI network.
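// As a shell-level analogy only (this code calls `fs::mount()` directly,
// not the mount(8) utility): case (1) is roughly `mount --make-rslave /`,
// while case (2) amounts to running `mount --make-slave <target>` for
// every mount table entry whose target is not a requested shared mount.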
bool hasSharedMount = std::find_if(
launchInfo.mounts().begin(),
launchInfo.mounts().end(),
[](const ContainerMountInfo& mount) {
return (mount.flags() & MS_SHARED) != 0;
}) != launchInfo.mounts().end();
if (!hasSharedMount) {
Try<Nothing> mnt =
fs::mount(None(), "/", None(), MS_SLAVE | MS_REC, None());
if (mnt.isError()) {
return Error("Failed to mark '/' as rslave: " + mnt.error());
}
cerr << "Marked '/' as rslave" << endl;
} else {
hashset<string> sharedMountTargets;
foreach (const ContainerMountInfo& mount, launchInfo.mounts()) {
// Skip normal mounts.
if ((mount.flags() & MS_SHARED) == 0) {
continue;
}
sharedMountTargets.insert(mount.target());
}
Try<fs::MountInfoTable> table = fs::MountInfoTable::read();
if (table.isError()) {
return Error("Failed to get mount table: " + table.error());
}
foreach (const fs::MountInfoTable::Entry& entry,
adaptor::reverse(table->entries)) {
if (!sharedMountTargets.contains(entry.target)) {
Try<Nothing> mnt = fs::mount(
None(),
entry.target,
None(),
MS_SLAVE,
None());
if (mnt.isError()) {
return Error(
"Failed to mark '" + entry.target +
"' as slave: " + mnt.error());
}
}
}
}
foreach (const ContainerMountInfo& mount, launchInfo.mounts()) {
// Skip those mounts that are used for setting up propagation.
if ((mount.flags() & MS_SHARED) != 0) {
continue;
}
// If a bidirectional mount exists, we will not mark `/` as
// recursively slave (otherwise, the bidirectional mount
// propagation won't work).
//
// At the same time, we want to prevent mounts in the child
// process from being propagated to the host mount namespace,
// except for the ones that set the propagation mode to be
// bidirectional. This ensures a clean host mount table, and
// greatly simplifies the container cleanup.
//
// If the target of a volume mount is under a non-shared mount,
// the mount won't be propagated to the host mount namespace,
// which is what we want. Otherwise, the volume mount will be
// propagated to the host mount namespace, which will make proper
// cleanup almost impossible. Therefore, we perform a sanity check
// here to make sure the propagation to host mount namespace does
// not happen.
//
// One implication of this check is that if the target of a
// volume mount is under a mount that has to be shared (e.g.,
// explicitly specified by the user using 'MountPropagation' in
// HOST_PATH volume), the volume mount will fail.
//
// TODO(jieyu): Some isolators are still using `pre_exe_commands`
// to do mounts. Those isolators thus will escape this check. We
// should consider forcing all isolators to use
// `ContainerMountInfo` for volume mounts.
if (hasSharedMount) {
Try<fs::MountInfoTable> table = fs::MountInfoTable::read();
if (table.isError()) {
return Error("Failed to read mount table: " + table.error());
}
Try<fs::MountInfoTable::Entry> entry =
table->findByTarget(mount.target());
if (entry.isError()) {
return Error(
"Cannot find the mount containing the mount target '" +
mount.target() + "': " + entry.error());
}
if (entry->shared().isSome()) {
return Error(
"Cannot perform mount '" + stringify(JSON::protobuf(mount)) +
"' because the target is under a shared mount "
"'" + entry->target + "'");
}
}
// If the mount target doesn't exist yet, create it. The isolator
// is responsible for ensuring the target path is safe.
if (mount.has_source() && !os::exists(mount.target())) {
const string dirname = Path(mount.target()).dirname();
if (!os::exists(dirname)) {
Try<Nothing> mkdir = os::mkdir(Path(mount.target()).dirname());
if (mkdir.isError()) {
return Error(
"Failed to create mount target directory '" + dirname + "': " +
mkdir.error());
}
}
// The mount source could be a file, a directory, or a Linux
// pseudo-filesystem. In the last case, the target must be a
// directory, so if the source doesn't exist, we default to
// mounting on a directory.
Try<Nothing> target =
(!os::exists(mount.source()) || os::stat::isdir(mount.source()))
? os::mkdir(mount.target())
: os::touch(mount.target());
if (target.isError()) {
return Error(
"Failed to create mount target '" + mount.target() + "': " +
target.error());
}
}
Try<Nothing> mnt = mountContainerFilesystem(mount);
if (mnt.isError()) {
return Error(
"Failed to mount '" + stringify(JSON::protobuf(mount)) +
"': " + mnt.error());
}
}
#endif // __linux__
return Nothing();
}
static Try<Nothing> maskPath(const string& target)
{
Try<Nothing> mnt = Nothing();
#ifdef __linux__
if (os::stat::isfile(target)) {
mnt = fs::mount("/dev/null", target, None(), MS_BIND | MS_RDONLY, None());
} else if (os::stat::isdir(target)) {
mnt = fs::mount(None(), target, "tmpfs", MS_RDONLY, "size=0");
}
#endif // __linux__
if (mnt.isError()) {
return Error("Failed to mask '" + target + "': " + mnt.error());
}
return Nothing();
}
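// Executes a single `ContainerFileOperation`. The operations handled by
// the switch below are SYMLINK, MOUNT (Linux only), RENAME (with a
// best-effort fallback to spawning `mv -f`), and MKDIR.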
static Try<Nothing> executeFileOperation(const ContainerFileOperation& op)
{
switch (op.operation()) {
case ContainerFileOperation::SYMLINK: {
Try<Nothing> result =
::fs::symlink(op.symlink().source(), op.symlink().target());
if (result.isError()) {
return Error(
"Failed to link '" + op.symlink().source() + "' as '" +
op.symlink().target() + "': " + result.error());
}
return Nothing();
}
case ContainerFileOperation::MOUNT: {
#ifdef __linux__
Try<Nothing> result = mountContainerFilesystem(op.mount());
if (result.isError()) {
return Error(
"Failed to mount '" + stringify(JSON::protobuf(op.mount())) +
"': " + result.error());
}
return Nothing();
#else
return Error("Container mount is not supported on non-Linux systems");
#endif // __linux__
}
case ContainerFileOperation::RENAME: {
Try<Nothing> result =
os::rename(op.rename().source(), op.rename().target());
// TODO(jpeach): We should only fall back to `mv` if the error
// is EXDEV, in which case a rename is a copy+unlink.
if (result.isError()) {
Option<int> status = os::spawn(
"mv", {"mv", "-f", op.rename().source(), op.rename().target()});
if (status.isNone()) {
return Error(
"Failed to rename '" + op.rename().source() + "' to '" +
op.rename().target() + "': spawn failed");
}
if (!WSUCCEEDED(status.get())) {
return Error(
"Failed to rename '" + op.rename().source() + "' to '" +
op.rename().target() + "': " + WSTRINGIFY(status.get()));
}
result = Nothing();
}
return Nothing();
}
case ContainerFileOperation::MKDIR: {
Try<Nothing> result =
os::mkdir(op.mkdir().target(), op.mkdir().recursive());
if (result.isError()) {
return Error(
"Failed to create directory '" + op.mkdir().target() + "': " +
result.error());
}
return result;
}
}
return Nothing();
}
static Try<Nothing> installResourceLimits(const RLimitInfo& limits)
{
#ifdef __WINDOWS__
return Error("Rlimits are not supported on Windows");
#else
foreach (const RLimitInfo::RLimit& limit, limits.rlimits()) {
Try<Nothing> set = rlimits::set(limit);
if (set.isError()) {
return Error(
"Failed to set " +
RLimitInfo::RLimit::Type_Name(limit.type()) + " limit: " +
set.error());
}
}
return Nothing();
#endif // __WINDOWS__
}
static Try<Nothing> enterChroot(const string& rootfs)
{
#ifdef __WINDOWS__
return Error("Changing rootfs is not supported on Windows");
#else
#ifdef __linux__
Try<Nothing> chroot = fs::chroot::enter(rootfs);
#else
// For any other platform we'll just use POSIX chroot.
Try<Nothing> chroot = os::chroot(rootfs);
#endif // __linux__
if (chroot.isError()) {
return Error(
"Failed to enter chroot '" + rootfs + "': " +
chroot.error());
}
return Nothing();
#endif // __WINDOWS__
}
#ifdef __linux__
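// Derives the capability sets for the container process from the launch
// info. As a sketch (the capability named here is only an example): if
// the task is granted effective capability NET_RAW, all five sets
// (ambient, effective, permitted, inheritable, bounding) become
// {NET_RAW}; a bounding set, if also specified, then overrides the
// bounding set and, via the final clipping step, the inheritable set.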
static void calculateCapabilities(
const ContainerLaunchInfo& launchInfo,
ProcessCapabilities* capabilities)
{
// If the task has any effective capabilities, grant them to all
// the capability sets.
if (launchInfo.has_effective_capabilities()) {
set<Capability> target =
capabilities::convert(launchInfo.effective_capabilities());
capabilities->set(capabilities::AMBIENT, target);
capabilities->set(capabilities::EFFECTIVE, target);
capabilities->set(capabilities::PERMITTED, target);
capabilities->set(capabilities::INHERITABLE, target);
capabilities->set(capabilities::BOUNDING, target);
}
// If we also have bounding capabilities, apply that in preference to
// the effective capabilities.
if (launchInfo.has_bounding_capabilities()) {
set<Capability> bounding =
capabilities::convert(launchInfo.bounding_capabilities());
capabilities->set(capabilities::BOUNDING, bounding);
}
// Force the inherited set to be the same as the bounding set. If we
// are root and capabilities have not been specified, then this is a
// no-op. If capabilities have been specified, then we need to clip the
// inherited set to prevent file-based capabilities granting privileges
// outside the bounding set.
capabilities->set(
capabilities::INHERITABLE,
capabilities->get(capabilities::BOUNDING));
}
#endif // __linux__
int MesosContainerizerLaunch::execute()
{
if (flags.help) {
cerr << flags.usage();
return EXIT_SUCCESS;
}
// The existence of the `runtime_directory` flag implies that we
// want to checkpoint the container's status upon exit.
if (flags.runtime_directory.isSome()) {
containerStatusPath = path::join(
flags.runtime_directory.get(),
containerizer::paths::STATUS_FILE);
Try<int_fd> open = os::open(
containerStatusPath.get(),
O_WRONLY | O_CREAT | O_CLOEXEC,
S_IRUSR | S_IWUSR);
if (open.isError()) {
cerr << "Failed to open file for writing the container status"
<< " '" << containerStatusPath.get() << "':"
<< " " << open.error() << endl;
exitWithStatus(EXIT_FAILURE);
}
containerStatusFd = open.get();
}
#ifndef __WINDOWS__
// We need a signal fence here to ensure that `containerStatusFd` is
// actually written to memory and not just to a temporary register.
// Without this, it's possible that the signal handler we are about
// to install would never see the correct value since there's no
// guarantee that it is written to memory until this function
// completes (which won't happen for a really long time because we
// do a blocking `waitpid()` below).
std::atomic_signal_fence(std::memory_order_relaxed);
// Install signal handlers for all incoming signals.
Try<Nothing> signals = installSignalHandlers();
if (signals.isError()) {
cerr << "Failed to install signal handlers: " << signals.error() << endl;
exitWithStatus(EXIT_FAILURE);
}
#else
// We need a handle to the job object which this container is associated with.
// Without this handle, the job object would be destroyed by the OS when the
// agent exits (or crashes), making recovery impossible. By holding a handle,
// we tie the lifetime of the job object to the container itself. In this way,
// a recovering agent can reattach to the container by opening a new handle to
// the job object.
const pid_t pid = ::GetCurrentProcessId();
const Try<std::wstring> name = os::name_job(pid);
if (name.isError()) {
cerr << "Failed to create job object name from pid: " << name.error()
<< endl;
exitWithStatus(EXIT_FAILURE);
}
// NOTE: This handle will not be destructed, even though it is a
// `SharedHandle`, because it will (purposefully) never go out of scope.
Try<SharedHandle> handle = os::open_job(JOB_OBJECT_QUERY, false, name.get());
if (handle.isError()) {
cerr << "Failed to open job object '" << stringify(name.get())
<< "' for the current container: " << handle.error() << endl;
exitWithStatus(EXIT_FAILURE);
}
#endif // __WINDOWS__
if (flags.launch_info.isNone()) {
cerr << "Flag --launch_info is not specified" << endl;
exitWithStatus(EXIT_FAILURE);
}
Try<ContainerLaunchInfo> _launchInfo =
::protobuf::parse<ContainerLaunchInfo>(flags.launch_info.get());
if (_launchInfo.isError()) {
cerr << "Failed to parse launch info: " << _launchInfo.error() << endl;
exitWithStatus(EXIT_FAILURE);
}
ContainerLaunchInfo launchInfo = _launchInfo.get();
if (!launchInfo.has_command()) {
cerr << "Launch command is not specified" << endl;
exitWithStatus(EXIT_FAILURE);
}
// Validate the command.
if (launchInfo.command().shell()) {
if (!launchInfo.command().has_value()) {
cerr << "Shell command is not specified" << endl;
exitWithStatus(EXIT_FAILURE);
}
} else {
if (!launchInfo.command().has_value()) {
cerr << "Executable path is not specified" << endl;
exitWithStatus(EXIT_FAILURE);
}
}
if ((flags.pipe_read.isSome() && flags.pipe_write.isNone()) ||
(flags.pipe_read.isNone() && flags.pipe_write.isSome())) {
cerr << "Flag --pipe_read and --pipe_write should either be "
<< "both set or both not set" << endl;
exitWithStatus(EXIT_FAILURE);
}
bool controlPipeSpecified =
flags.pipe_read.isSome() && flags.pipe_write.isSome();
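// The handshake implemented below: close our inherited copy of the
// write end, then block reading a single byte from the read end. The
// parent is expected to write one byte once we may proceed; anything
// else (notably a zero-length read, i.e. EOF, if the parent exited and
// closed its write end) is treated as a failure to synchronize.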
if (controlPipeSpecified) {
int_fd pipe[2] = { flags.pipe_read.get(), flags.pipe_write.get() };
Try<Nothing> close = os::close(pipe[1]);
if (close.isError()) {
cerr << "Failed to close pipe[1]: " << close.error() << endl;
exitWithStatus(EXIT_FAILURE);
}
// Do a blocking read on the pipe until the parent signals us to continue.
char dummy;
ssize_t length;
while ((length = os::read(pipe[0], &dummy, sizeof(dummy))) == -1 &&
errno == EINTR);
if (length != sizeof(dummy)) {
// There's a reasonable probability this will occur during
// agent restarts across a large/busy cluster.
cerr << "Failed to synchronize with agent "
<< "(it's probably exited)" << endl;
exitWithStatus(EXIT_FAILURE);
}
close = os::close(pipe[0]);
if (close.isError()) {
cerr << "Failed to close pipe[0]: " << close.error() << endl;
exitWithStatus(EXIT_FAILURE);
}
}
#ifndef __WINDOWS__
if (launchInfo.has_tty_slave_path()) {
Try<Nothing> setctty = os::setctty(STDIN_FILENO);
if (setctty.isError()) {
cerr << "Failed to set control tty: " << setctty.error() << endl;
exitWithStatus(EXIT_FAILURE);
}
}
#endif // __WINDOWS__
#ifdef __linux__
// If we need a new mount namespace, we have to do it before
// we make the mounts needed to prepare the rootfs template.
if (flags.unshare_namespace_mnt) {
if (unshare(CLONE_NEWNS) != 0) {
cerr << "Failed to unshare mount namespace: "
<< os::strerror(errno) << endl;
exitWithStatus(EXIT_FAILURE);
}
}
if (flags.namespace_mnt_target.isSome()) {
if (!launchInfo.mounts().empty()) {
cerr << "Mounts are not supported if "
<< "'namespace_mnt_target' is set" << endl;
exitWithStatus(EXIT_FAILURE);
}
if (launchInfo.has_rootfs()) {
cerr << "Container rootfs is not supported if "
<< "'namespace_mnt_target' is set" << endl;
exitWithStatus(EXIT_FAILURE);
}
}
#endif // __linux__
// Verify that the rootfs is an absolute path.
if (launchInfo.has_rootfs()) {
const string& rootfs = launchInfo.rootfs();
cerr << "Preparing rootfs at '" << rootfs << "'" << endl;
Result<string> realpath = os::realpath(rootfs);
if (realpath.isError()) {
cerr << "Failed to determine if rootfs '" << rootfs
<< "' is an absolute path: " << realpath.error() << endl;
exitWithStatus(EXIT_FAILURE);
} else if (realpath.isNone()) {
cerr << "Rootfs path '" << rootfs << "' does not exist" << endl;
exitWithStatus(EXIT_FAILURE);
} else if (realpath.get() != rootfs) {
cerr << "Rootfs path '" << rootfs << "' is not an absolute path" << endl;
exitWithStatus(EXIT_FAILURE);
}
#ifdef __WINDOWS__
cerr << "Changing rootfs is not supported on Windows";
exitWithStatus(EXIT_FAILURE);
#endif // __WINDOWS__
}
Try<Nothing> mount = prepareMounts(launchInfo);
if (mount.isError()) {
cerr << "Failed to prepare mounts: " << mount.error() << endl;
exitWithStatus(EXIT_FAILURE);
}
foreach (const string& target, launchInfo.masked_paths()) {
mount = maskPath(target);
if (mount.isError()) {
cerr << "Failed to mask container path '" << target << "': "
<< mount.error() << endl;
exitWithStatus(EXIT_FAILURE);
}
}
foreach (const ContainerFileOperation& op, launchInfo.file_operations()) {
Try<Nothing> result = executeFileOperation(op);
if (result.isError()) {
cerr << result.error() << endl;
exitWithStatus(EXIT_FAILURE);
}
}
// Run additional preparation commands. These are run as the same
// user and with the same environment as the agent.
foreach (const CommandInfo& command, launchInfo.pre_exec_commands()) {
if (!command.has_value()) {
cerr << "The 'value' of a preparation command is not specified" << endl;
exitWithStatus(EXIT_FAILURE);
}
cerr << "Executing pre-exec command '"
<< JSON::protobuf(command) << "'" << endl;
Option<int> status;
if (command.shell()) {
// Execute the command using the system shell.
status = os::system(command.value());
} else {
// Directly spawn all non-shell commands to prohibit users
// from injecting arbitrary shell commands in the arguments.
vector<string> args;
foreach (const string& arg, command.arguments()) {
args.push_back(arg);
}
status = os::spawn(command.value(), args);
}
if (status.isNone() || !WSUCCEEDED(status.get())) {
cerr << "Failed to execute pre-exec command '"
<< JSON::protobuf(command) << "': ";
if (status.isNone()) {
cerr << "exited with unknown status" << endl;
} else {
cerr << WSTRINGIFY(status.get()) << endl;
}
exitWithStatus(EXIT_FAILURE);
}
}
#ifndef __WINDOWS__
// NOTE: If 'user' is set, we will get the uid, gid, and the
// supplementary group ids associated with the specified user before
// changing the filesystem root. This is because after changing the
// filesystem root, the current process might no longer have access
// to /etc/passwd and /etc/group on the host.
Option<uid_t> uid;
Option<gid_t> gid;
vector<gid_t> gids;
// TODO(gilbert): For the case where the container user exists, support
// framework/task/default user -> container user mapping once
// user namespaces and container capabilities are available for
// the mesos container.
if (launchInfo.has_user()) {
Result<uid_t> _uid = os::getuid(launchInfo.user());
if (!_uid.isSome()) {
cerr << "Failed to get the uid of user '" << launchInfo.user() << "': "
<< (_uid.isError() ? _uid.error() : "not found") << endl;
exitWithStatus(EXIT_FAILURE);
}
// No need to change user/groups if the specified user is the same
// as the effective user of the current process.
if (_uid.get() != ::geteuid()) {
Result<gid_t> _gid = os::getgid(launchInfo.user());
if (!_gid.isSome()) {
cerr << "Failed to get the gid of user '" << launchInfo.user() << "': "
<< (_gid.isError() ? _gid.error() : "not found") << endl;
exitWithStatus(EXIT_FAILURE);
}
Try<vector<gid_t>> _gids = os::getgrouplist(launchInfo.user());
if (_gids.isError()) {
cerr << "Failed to get the supplementary gids of user '"
<< launchInfo.user() << "': "
<< (_gids.isError() ? _gids.error() : "not found") << endl;
exitWithStatus(EXIT_FAILURE);
}
foreach (uint32_t supplementaryGroup, launchInfo.supplementary_groups()) {
_gids->push_back(supplementaryGroup);
}
uid = _uid.get();
gid = _gid.get();
gids = _gids.get();
}
}
#else
if (launchInfo.has_user()) {
cerr << "Switching user is not supported on Windows" << endl;
exitWithStatus(EXIT_FAILURE);
}
#endif // __WINDOWS__
#ifdef __linux__
// Initialize capabilities support if necessary.
Option<Capabilities> capabilitiesManager = None();
const bool needSetCapabilities = launchInfo.has_effective_capabilities() ||
launchInfo.has_bounding_capabilities();
if (needSetCapabilities || launchInfo.has_seccomp_profile()) {
Try<Capabilities> _capabilitiesManager = Capabilities::create();
if (_capabilitiesManager.isError()) {
cerr << "Failed to initialize capabilities support: "
<< _capabilitiesManager.error() << endl;
exitWithStatus(EXIT_FAILURE);
}
capabilitiesManager = _capabilitiesManager.get();
}
// Prevent clearing of capabilities on `setuid`.
if (needSetCapabilities && uid.isSome()) {
Try<Nothing> keepCaps = capabilitiesManager->setKeepCaps();
if (keepCaps.isError()) {
cerr << "Failed to set process control for keeping capabilities "
<< "on potential uid change: " << keepCaps.error() << endl;
exitWithStatus(EXIT_FAILURE);
}
}
#else
if (launchInfo.has_effective_capabilities() ||
launchInfo.has_bounding_capabilities()) {
cerr << "Capabilities are not supported on non Linux system" << endl;
exitWithStatus(EXIT_FAILURE);
}
#endif // __linux__
#ifdef __linux__
if (flags.namespace_mnt_target.isSome()) {
string path = path::join(
"/proc",
stringify(flags.namespace_mnt_target.get()),
"ns",
"mnt");
Try<Nothing> setns = ns::setns(path, "mnt", false);
if (setns.isError()) {
cerr << "Failed to enter mount namespace: "
<< setns.error() << endl;
exitWithStatus(EXIT_FAILURE);
}
}
#endif // __linux__
// Change root to a new root, if provided.
if (launchInfo.has_rootfs()) {
cerr << "Changing root to " << launchInfo.rootfs() << endl;
Try<Nothing> enter = enterChroot(launchInfo.rootfs());
if (enter.isError()) {
cerr << enter.error() << endl;
exitWithStatus(EXIT_FAILURE);
}
}
// Install resource limits for the process.
if (launchInfo.has_rlimits()) {
Try<Nothing> set = installResourceLimits(launchInfo.rlimits());
if (set.isError()) {
cerr << set.error() << endl;
exitWithStatus(EXIT_FAILURE);
}
}
if (launchInfo.has_working_directory()) {
// If working directory does not exist (e.g., being removed from
// the container image), create an empty directory even though it
// may not be used. Note that this case is only possible
// if an image has 'WORKDIR' specified in its manifest but that
// 'WORKDIR' does not exist in the image's rootfs.
//
// TODO(gilbert): Set the proper ownership to this working
// directory to make sure a specified non-root user has the
// permission to write to this working directory. Right now
// it is owned by root, and any non-root user will fail to
// write to this directory. Note that this is identical to the
// semantics of the docker daemon, which can be verified by:
// 'docker run -ti -u nobody quay.io/spinnaker/front50:master bash'
// The ownership of '/workdir' is root, so creating any file under
// '/workdir' will fail with 'Permission denied'.
Try<Nothing> mkdir = os::mkdir(launchInfo.working_directory());
if (mkdir.isError()) {
cerr << "Failed to create working directory "
<< "'" << launchInfo.working_directory() << "': "
<< mkdir.error() << endl;
}
Try<Nothing> chdir = os::chdir(launchInfo.working_directory());
if (chdir.isError()) {
cerr << "Failed to chdir into current working directory "
<< "'" << launchInfo.working_directory() << "': "
<< chdir.error() << endl;
exitWithStatus(EXIT_FAILURE);
}
}
#ifdef ENABLE_SECCOMP_ISOLATOR
if (launchInfo.has_seccomp_profile()) {
CHECK_SOME(capabilitiesManager);
Try<ProcessCapabilities> capabilities = capabilitiesManager->get();
if (capabilities.isError()) {
cerr << "Failed to get capabilities for the current process: "
<< capabilities.error() << endl;
exitWithStatus(EXIT_FAILURE);
}
calculateCapabilities(launchInfo, &capabilities.get());
Try<Owned<SeccompFilter>> seccompFilter = SeccompFilter::create(
launchInfo.seccomp_profile(),
capabilities.get());
if (seccompFilter.isError()) {
cerr << "Failed to create Seccomp filter: "
<< seccompFilter.error() << endl;
exitWithStatus(EXIT_FAILURE);
}
Try<Nothing> load = seccompFilter.get()->load();
if (load.isError()) {
cerr << "Failed to load Seccomp filter: " << load.error() << endl;
exitWithStatus(EXIT_FAILURE);
}
}
#endif // ENABLE_SECCOMP_ISOLATOR
#ifndef __WINDOWS__
// Change user if provided. Note that we do that after executing the
// preparation commands so that those commands will be run with the
// same privilege as the mesos-agent.
if (uid.isSome()) {
Try<Nothing> setgid = os::setgid(gid.get());
if (setgid.isError()) {
cerr << "Failed to set gid to " << gid.get()
<< ": " << setgid.error() << endl;
exitWithStatus(EXIT_FAILURE);
}
Try<Nothing> setgroups = os::setgroups(gids, uid);
if (setgroups.isError()) {
cerr << "Failed to set supplementary gids: "
<< setgroups.error() << endl;
exitWithStatus(EXIT_FAILURE);
}
Try<Nothing> setuid = os::setuid(uid.get());
if (setuid.isError()) {
cerr << "Failed to set uid to " << uid.get()
<< ": " << setuid.error() << endl;
exitWithStatus(EXIT_FAILURE);
}
}
#endif // __WINDOWS__
#ifdef __linux__
if (needSetCapabilities) {
CHECK_SOME(capabilitiesManager);
Try<ProcessCapabilities> capabilities = capabilitiesManager->get();
if (capabilities.isError()) {
cerr << "Failed to get capabilities for the current process: "
<< capabilities.error() << endl;
exitWithStatus(EXIT_FAILURE);
}
// After 'setuid', the 'effective' set is cleared. Since `SETPCAP`
// is required in the `effective` set of a process to change the
// bounding set, we need to restore it first so we can make the
// final capability changes.
capabilities->add(capabilities::EFFECTIVE, capabilities::SETPCAP);
Try<Nothing> setPcap = capabilitiesManager->set(capabilities.get());
if (setPcap.isError()) {
cerr << "Failed to add SETPCAP to the effective set: "
<< setPcap.error() << endl;
exitWithStatus(EXIT_FAILURE);
}
calculateCapabilities(launchInfo, &capabilities.get());
Try<Nothing> set = capabilitiesManager->set(capabilities.get());
if (set.isError()) {
cerr << "Failed to set process capabilities: " << set.error() << endl;
exitWithStatus(EXIT_FAILURE);
}
}
if (launchInfo.has_no_new_privileges()) {
const int val = launchInfo.no_new_privileges() ? 1 : 0;
if (prctl(PR_SET_NO_NEW_PRIVS, val, 0, 0, 0) == -1) {
cerr << "Failed to set NO_NEW_PRIVS: " << os::strerror(errno) << endl;
exitWithStatus(EXIT_FAILURE);
}
}
#endif // __linux__
// Prepare the executable and the argument list for the child.
string executable(launchInfo.command().shell()
? os::Shell::name
: launchInfo.command().value().c_str());
// Search for the executable in the current working directory as well.
// execvpe and execvp will only search the current working directory
// for the executable if the PATH environment variable is not set.
if (!path::is_absolute(executable) &&
launchInfo.has_working_directory()) {
Option<string> which = os::which(
executable,
launchInfo.working_directory());
if (which.isSome()) {
executable = which.get();
}
}
os::raw::Argv argv(launchInfo.command().shell()
? vector<string>({
os::Shell::arg0,
os::Shell::arg1,
launchInfo.command().value()})
: vector<string>(
launchInfo.command().arguments().begin(),
launchInfo.command().arguments().end()));
// Prepare the environment for the child. If 'environment' is not
// specified, inherit the environment of the current process.
Option<os::raw::Envp> envp;
if (launchInfo.has_environment()) {
// TODO(tillt): `Environment::Variable` is not a string anymore,
// consider cleaning this up with the complete rollout of `Secrets`.
// This entire merging should be handled by the solution introduced
// by MESOS-7299.
hashmap<string, string> environment;
foreach (const Environment::Variable& variable,
launchInfo.environment().variables()) {
const string& name = variable.name();
const string& value = variable.value();
// TODO(tillt): Once we have a solution for MESOS-7292, allow
// logging of values.
if (environment.contains(name) && environment[name] != value) {
cerr << "Overwriting environment variable '" << name << "'" << endl;
}
environment[name] = value;
}
if (!environment.contains("PATH")) {
environment["PATH"] = os::host_default_path();
}
envp = os::raw::Envp(environment);
}
#ifndef __WINDOWS__
// Construct a set of file descriptors to close before `exec`'ing.
//
// On Windows, all new processes created by Mesos go through the
// `create_process` wrapper which, with the completion of MESOS-8926,
// will prevent inadvertent leaks, making this code unnecessary there.
Try<vector<int_fd>> fds = os::lsof();
CHECK_SOME(fds);
// If we have `containerStatusFd` set, then we need to fork-exec the
// command we are launching and checkpoint its status on exit. We
// use fork-exec directly (as opposed to `process::subprocess()`) to
// avoid initializing libprocess for this simple helper binary.
//
// TODO(klueska): Once we move the majority of `process::subprocess()`
// into stout, update the code below to use it.
if (containerStatusFd.isSome()) {
pid_t pid = ::fork();
if (pid == -1) {
cerr << "Failed to fork() the command: " << os::strerror(errno) << endl;
exitWithStatus(EXIT_FAILURE);
}
// If we are the parent...
if (pid > 0) {
// Set the global `containerPid` variable to enable signal forwarding.
//
// NOTE: We need a signal fence here to ensure that `containerPid`
// is actually written to memory and not just to a temporary register.
// Without this, it's possible that the signal handler would
// never notice the change since there's no guarantee that it is
// written out to memory until this function completes (which
// won't happen until it's too late because we loop inside a
// blocking `waitpid()` call below).
containerPid = pid;
std::atomic_signal_fence(std::memory_order_relaxed);
// Wait for the newly created process to finish.
int status = 0;
Result<pid_t> waitpid = None();
// Reap all descendants, but only continue once we reap the
// process we just launched.
while (true) {
waitpid = os::waitpid(-1, &status, 0);
if (waitpid.isError()) {
// If the error was an EINTR, we were interrupted by a
// signal and should just call `waitpid()` over again.
if (errno == EINTR) {
continue;
}
cerr << "Failed to os::waitpid(): " << waitpid.error() << endl;
exitWithStatus(EXIT_FAILURE);
}
if (waitpid.isNone()) {
cerr << "Calling os::waitpid() with blocking semantics"
<< "returned asynchronously" << endl;
exitWithStatus(EXIT_FAILURE);
}
// We only forward the signal if the child has terminated. If
// the child has stopped due to some signal (e.g., SIGSTOP),
// we will simply ignore it.
if (WIFSTOPPED(status)) {
continue;
}
if (pid == waitpid.get()) {
break;
}
}
signalSafeWriteStatus(status);
os::close(containerStatusFd.get());
::_exit(EXIT_SUCCESS);
}
// Avoid leaking unneeded file descriptors into the forked process.
foreach (int_fd fd, fds.get()) {
if (fd != STDIN_FILENO && fd != STDOUT_FILENO && fd != STDERR_FILENO) {
// NOTE: Set "FD_CLOEXEC" on the fd, instead of closing it
// because exec below might exec a memfd.
int flags = ::fcntl(fd, F_GETFD);
if (flags == -1) {
ABORT("Failed to get FD flags");
}
if (::fcntl(fd, F_SETFD, flags | FD_CLOEXEC) == -1) {
ABORT("Failed to set FD_CLOEXEC");
}
}
}
}
#endif // __WINDOWS__
// NOTE: On Windows, these functions call `CreateProcess` and then wait for
// the new process to exit. Because of this, the `SharedHandle` to the job
// object does not go out of scope. This is unlike the POSIX behavior of
// `exec`, as the process image is intentionally not replaced.
if (envp.isSome()) {
os::execvpe(executable.c_str(), argv, envp.get());
} else {
os::execvp(executable.c_str(), argv);
}
// If we get here, the exec call failed.
cerr << "Failed to execute '" << executable << "': "
<< os::strerror(errno) << endl;
exitWithStatus(EXIT_FAILURE);
UNREACHABLE();
}
} // namespace slave {
} // namespace internal {
} // namespace mesos {