// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include <vector>
#include <process/collect.hpp>
#include <process/id.hpp>
#include <stout/fs.hpp>
#include <stout/os.hpp>
#include <stout/path.hpp>
#include <stout/os/realpath.hpp>
#include "slave/paths.hpp"
#include "slave/containerizer/mesos/isolators/filesystem/posix.hpp"
using namespace process;
using std::string;
using std::vector;
using mesos::slave::ContainerConfig;
using mesos::slave::ContainerLaunchInfo;
using mesos::slave::ContainerLimitation;
using mesos::slave::ContainerState;
using mesos::slave::Isolator;
namespace mesos {
namespace internal {
namespace slave {
const Flags& _flags,
VolumeGidManager* _volumeGidManager)
: ProcessBase(process::ID::generate("posix-filesystem-isolator")),
volumeGidManager(_volumeGidManager) {}
// Nothing to release explicitly: the destructor body is empty.
PosixFilesystemIsolatorProcess::~PosixFilesystemIsolatorProcess() = default;
// Factory for the POSIX filesystem isolator.
//
// Allocates a `PosixFilesystemIsolatorProcess` with the given flags and
// (possibly null) volume gid manager, hands it to a `process::Owned`
// wrapper, and returns a newly allocated `MesosIsolator` around it.
Try<Isolator*> PosixFilesystemIsolatorProcess::create(
    const Flags& flags,
    VolumeGidManager* volumeGidManager)
{
  process::Owned<MesosIsolatorProcess> process(
      new PosixFilesystemIsolatorProcess(flags, volumeGidManager));

  return new MesosIsolator(process);
}
// Rebuilds the per-container bookkeeping after an agent restart.
//
// For every recovered container an `Info` entry keyed by its container
// ID is (re)created from the container's sandbox directory. The
// `orphans` set is not consulted by this isolator.
//
// NOTE: The recovered `Info` starts without any recorded resources, so
// a later `update()` will revisit volumes that are already symlinked.
Future<Nothing> PosixFilesystemIsolatorProcess::recover(
    const vector<ContainerState>& states,
    const hashset<ContainerID>& orphans)
{
  foreach (const ContainerState& state, states) {
    infos.put(state.container_id(), Owned<Info>(new Info(state.directory())));
  }

  return Nothing();
}
// Prepares the filesystem view for a container that is about to launch.
//
// Fails if the container has already been prepared, if it requests its
// own root filesystem image, or if its `ContainerInfo` carries volumes.
// Otherwise an `Info` entry is registered for the container and the
// executor's resources are applied through `update()`. Once that
// completes, the gids recorded for the container are returned in a
// `ContainerLaunchInfo` as supplementary groups.
Future<Option<ContainerLaunchInfo>> PosixFilesystemIsolatorProcess::prepare(
    const ContainerID& containerId,
    const ContainerConfig& containerConfig)
{
  if (infos.contains(containerId)) {
    return Failure("Container has already been prepared");
  }

  const ExecutorInfo& executorInfo = containerConfig.executor_info();

  if (executorInfo.has_container()) {
    CHECK_EQ(executorInfo.container().type(), ContainerInfo::MESOS);

    // Return failure if the container changes the filesystem root
    // because the symlinks will become invalid in the new root.
    if (executorInfo.container().mesos().has_image()) {
      return Failure("Container root filesystems not supported");
    }

    if (executorInfo.container().volumes().size() > 0) {
      return Failure("Volumes in ContainerInfo is not supported");
    }
  }

  infos.put(containerId, Owned<Info>(new Info(containerConfig.directory())));

  // The continuation is deferred onto this process so that `infos` is
  // only touched from the isolator's own execution context.
  return update(containerId, executorInfo.resources())
    .then(defer(
        self(),
        [this, containerId, containerConfig]() mutable
            -> Future<Option<ContainerLaunchInfo>> {
      // The container may have been removed while `update()` was pending.
      if (!infos.contains(containerId)) {
        return Failure("Unknown container");
      }

      ContainerLaunchInfo launchInfo;
      foreach (gid_t gid, infos[containerId]->gids) {
        // For command task with its own rootfs, the command executor will
        // run as root and the task itself will run as the specified normal
        // user, so here we add the supplementary group for the task and the
        // command executor will set it accordingly when launching the task.
        if (containerConfig.has_task_info() &&
            containerConfig.has_rootfs()) {
          launchInfo.add_task_supplementary_groups(gid);
        } else {
          launchInfo.add_supplementary_groups(gid);
        }
      }

      return launchInfo;
    }));
}
// Reconciles the persistent volume symlinks in a container's sandbox
// with the requested resources.
//
// The work happens in two passes:
//   1. Symlinks for persistent volumes that the container currently
//      holds but that are absent from `resourceRequests` are removed.
//   2. For every requested persistent volume not yet held, access is
//      arranged (gid allocation through the volume gid manager, or a
//      `chown` to the sandbox owner) and a symlink to the volume is
//      created in the sandbox, or an existing one is validated.
//
// Volumes whose container path contains a slash are skipped with a
// warning in both passes. `resourceLimits` is not consulted here.
//
// Returns a failure if the container is unknown, if the sandbox cannot
// be `stat`ed, or if any symlink/ownership operation fails. On success
// the gids produced by the volume gid manager are stored in the
// container's `Info`.
Future<Nothing> PosixFilesystemIsolatorProcess::update(
    const ContainerID& containerId,
    const Resources& resourceRequests,
    const google::protobuf::Map<string, Value::Scalar>& resourceLimits)
{
  if (!infos.contains(containerId)) {
    return Failure("Unknown container");
  }

  const Owned<Info>& info = infos[containerId];

  // TODO(jieyu): Currently, we only allow non-nested relative
  // container paths for volumes. This is enforced by the master. For
  // those volumes, we create symlinks in the executor directory.
  Resources current = info->resources;

  // We first remove unneeded persistent volumes.
  foreach (const Resource& resource, current.persistentVolumes()) {
    // This is enforced by the master.
    CHECK(resource.disk().has_volume());

    // Ignore absolute and nested paths.
    const string& containerPath = resource.disk().volume().container_path();
    if (strings::contains(containerPath, "/")) {
      LOG(WARNING) << "Skipping updating symlink for persistent volume "
                   << resource << " of container " << containerId
                   << " because the container path '" << containerPath
                   << "' contains slash";
      continue;
    }

    // The volume is still requested, so its symlink must stay.
    if (resourceRequests.contains(resource)) {
      continue;
    }

    string link = path::join(info->directory, containerPath);

    LOG(INFO) << "Removing symlink '" << link << "' for persistent volume "
              << resource << " of container " << containerId;

    Try<Nothing> rm = os::rm(link);
    if (rm.isError()) {
      return Failure(
          "Failed to remove the symlink for the unneeded "
          "persistent volume at '" + link + "'");
    }
  }

  // Get user and group info for this task based on the task's sandbox.
  struct stat s;
  if (::stat(info->directory.c_str(), &s) < 0) {
    return Failure("Failed to get ownership for '" + info->directory +
                   "': " + os::strerror(errno));
  }

  const uid_t uid = s.st_uid;
  const gid_t gid = s.st_gid;

  // Pending gid allocations; collected at the end of this function.
  vector<Future<gid_t>> futures;

  // We then link additional persistent volumes.
  foreach (const Resource& resource, resourceRequests.persistentVolumes()) {
    // This is enforced by the master.
    CHECK(resource.disk().has_volume());

    // Ignore absolute and nested paths.
    const string& containerPath = resource.disk().volume().container_path();
    if (strings::contains(containerPath, "/")) {
      LOG(WARNING) << "Skipping updating symlink for persistent volume "
                   << resource << " of container " << containerId
                   << " because the container path '" << containerPath
                   << "' contains slash";
      continue;
    }

    // The volume is already held by the container; nothing to add.
    if (current.contains(resource)) {
      continue;
    }

    string original = paths::getPersistentVolumePath(flags.work_dir, resource);

    // If the container's user is root (uid == 0), we do not need to do any
    // changes about the volume's ownership since it has the full permissions
    // to access the volume.
    if (uid != 0) {
      // For persistent volumes not from resource providers, if volume gid
      // manager is enabled, call volume gid manager to allocate a gid to
      // make sure the container has the permission to access the volume.
      //
      // TODO(qianzhang): Support gid allocation for persistent volumes from
      // resource providers.
      if (!Resources::hasResourceProvider(resource) &&
          volumeGidManager) {
#ifndef __WINDOWS__
        LOG(INFO) << "Invoking volume gid manager to allocate gid to the "
                  << "volume path '" << original << "' for container "
                  << containerId;

        futures.push_back(
            volumeGidManager->allocate(original, VolumeGidInfo::PERSISTENT));
#endif // __WINDOWS__
      } else {
        bool isVolumeInUse = false;

        // Check if the shared persistent volume is currently used by another
        // container. We do not need to do this check for local persistent
        // volume since it can only be used by one container at a time.
        if (resource.has_shared()) {
          foreachpair (const ContainerID& _containerId,
                       const Owned<Info>& _info,
                       infos) {
            // Skip self.
            if (_containerId == containerId) {
              continue;
            }

            if (_info->resources.contains(resource)) {
              isVolumeInUse = true;
              break;
            }
          }
        }

        // Set the ownership of the persistent volume to match that of the
        // sandbox directory if the volume is not already in use. If the
        // volume is currently in use by other containers, tasks in this
        // container may fail to read from or write to the persistent volume
        // due to incompatible ownership and file system permissions.
        if (!isVolumeInUse) {
          // TODO(hausdorff): (MESOS-5461) Persistent volumes maintain the
          // invariant that they are used by one task at a time. This is
          // currently enforced by `os::chown`. Windows does not support
          // `os::chown`, we will need to revisit this later.
#ifndef __WINDOWS__
          LOG(INFO) << "Changing the ownership of the persistent volume at '"
                    << original << "' with uid " << uid << " and gid " << gid;

          Try<Nothing> chown = os::chown(uid, gid, original, false);
          if (chown.isError()) {
            return Failure(
                "Failed to change the ownership of the persistent volume at '" +
                original + "' with uid " + stringify(uid) +
                " and gid " + stringify(gid) + ": " + chown.error());
          }
#endif // __WINDOWS__
        }
      }
    }

    string link = path::join(info->directory, containerPath);

    if (os::exists(link)) {
      // NOTE: This is possible because 'info->resources' will be
      // reset when slave restarts and recovers. When the slave calls
      // 'containerizer->update' after the executor reregisters,
      // we'll try to relink all the already symlinked volumes.
      Result<string> realpath = os::realpath(link);
      if (!realpath.isSome()) {
        return Failure(
            "Failed to get the realpath of symlink '" + link + "': " +
            (realpath.isError() ? realpath.error() : "No such directory"));
      }

      // A sanity check to make sure the target of the symlink does
      // not change. In fact, this is not supposed to happen.
      // NOTE: Here, we compare the realpaths because 'original' might
      // contain symbolic links.
      Result<string> _original = os::realpath(original);
      if (!_original.isSome()) {
        return Failure(
            "Failed to get the realpath of volume '" + original + "': " +
            (_original.isError() ? _original.error() : "No such directory"));
      }

      // `realpath` is where the existing symlink resolves to, while
      // `_original` is the resolved location of the volume to link.
      if (realpath.get() != _original.get()) {
        return Failure(
            "The existing symlink '" + link + "' points to '" +
            realpath.get() + "' and the new target is '" +
            _original.get() + "'");
      }
    } else {
      LOG(INFO) << "Adding symlink from '" << original << "' to '"
                << link << "' for persistent volume " << resource
                << " of container " << containerId;

      // NOTE: We cannot enforce read-only access given the symlink without
      // changing the source so we just log a warning here.
      if (resource.disk().volume().mode() == Volume::RO) {
        LOG(WARNING) << "Allowing read-write access to read-only volume '"
                     << original << "' of container " << containerId;
      }

      Try<Nothing> symlink = ::fs::symlink(original, link);
      if (symlink.isError()) {
        return Failure(
            "Failed to symlink persistent volume from '" +
            original + "' to '" + link + "'");
      }
    }
  }

  // Store the updated resources.
  info->resources = resourceRequests;

  // Record the allocated gids once every allocation has completed. The
  // continuation is deferred onto this process before touching `infos`.
  return collect(futures)
    .then(defer(self(), [this, containerId](const vector<gid_t>& gids)
        -> Future<Nothing> {
      // The container may have been removed while allocations were pending.
      if (!infos.contains(containerId)) {
        return Failure("Unknown container");
      }

      infos[containerId]->gids = gids;
      return Nothing();
    }));
}
// Drops the isolator's bookkeeping for a terminated container.
//
// Erasing an unknown container ID is a no-op, so this always succeeds.
Future<Nothing> PosixFilesystemIsolatorProcess::cleanup(
    const ContainerID& containerId)
{
  // Symlinks for persistent resources will be removed when the work
  // directory is GC'ed, therefore no need to do explicit cleanup.
  //
  // The `Info` entry must still be removed: otherwise it would leak and
  // the shared-volume "in use" check in `update()` would keep counting
  // this container's resources after it is gone.
  infos.erase(containerId);

  return Nothing();
}
} // namespace slave {
} // namespace internal {
} // namespace mesos {