// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.
#include <fts.h>
#include <sys/types.h>
#include <string>
#include <mesos/resources.hpp>
#include <process/async.hpp>
#include <process/collect.hpp>
#include <process/dispatch.hpp>
#include <process/id.hpp>
#include <process/metrics/metrics.hpp>
#include <process/metrics/push_gauge.hpp>
#include <stout/os/exists.hpp>
#include <stout/os/su.hpp>
#include "common/values.hpp"
#ifdef __linux__
#include "linux/fs.hpp"
#endif // __linux__
#include "slave/paths.hpp"
#include "slave/state.hpp"
#include "slave/volume_gid_manager/volume_gid_manager.hpp"
using std::pair;
using std::string;
using std::vector;
using process::async;
using process::dispatch;
using process::Failure;
using process::Future;
using process::Owned;
using process::Promise;
using mesos::internal::values::intervalSetToRanges;
using mesos::internal::values::rangesToIntervalSet;
namespace mesos {
namespace internal {
namespace slave {
// Recursively changes the owner group of everything under `path` to `gid`
// and sets or unsets the `setgid` bit on the directories found there.
//
// The tree is walked with fts(3). Directories (visited in preorder),
// regular files and symbolic links get their group changed via `lchown`,
// which is passed the entry's current uid so that the owner is preserved.
// Directories additionally have `S_ISGID | S_IWGRP` added to their mode
// when `setgid` is true, or cleared from it when `setgid` is false.
//
// Returns `Nothing` on success, otherwise an error describing the
// failure (open, chown, chmod, unreadable entry, traversal or close).
static Try<Nothing> setVolumeOwnership(
const string& path,
gid_t gid,
bool setgid)
LOG(INFO) << "Start setting the owner group of the volume path '"
<< path << "' " << (setgid ? "" : "back ") << "to " << gid;
// `fts_open` expects a null-terminated array of non-const C strings,
// hence the `const_cast` and the trailing `nullptr`.
char* path_[] = {const_cast<char*>(path.c_str()), nullptr};
FTS* tree = ::fts_open(path_, FTS_NOCHDIR | FTS_PHYSICAL, nullptr);
if (tree == nullptr) {
return ErrnoError("Failed to open '" + path + "'");
FTSENT *node;
while ((node = ::fts_read(tree)) != nullptr) {
// Note that this shadows the `path` parameter inside the loop: here
// `path` is the entry currently being visited.
const Path path = Path(node->fts_path);
switch (node->fts_info) {
// Preorder directory.
case FTS_D:
// Regular file.
case FTS_F:
// Symbolic link.
case FTS_SL: {
// Change the owner group to the given gid. The entry's current uid is
// passed back unchanged, so only the group is affected.
if (::lchown(node->fts_path, node->fts_statp->st_uid, gid) < 0) {
Error error = ErrnoError();
return Error(
"Chown failed on '" + path.string() + "': " + error.message);
// The mode bits are only adjusted for directories.
if (node->fts_info == FTS_D) {
// Set the `setgid` bit for directories and add the write
// permission for the owner group.
if (setgid) {
if (::chmod(
node->fts_statp->st_mode | S_ISGID | S_IWGRP)) {
Error error = ErrnoError();
return Error(
"Chmod failed on '" + path.string() + "': " + error.message);
} else {
// Unset the `setgid` bit for directories and remove the write
// permission for the owner group.
if (::chmod(
node->fts_statp->st_mode & ~S_ISGID & ~S_IWGRP)) {
Error error = ErrnoError();
return Error(
"Chmod failed on '" + path.string() + "': " + error.message);
// Unreadable directory.
case FTS_DNR:
// Error; errno is set.
case FTS_ERR:
// `stat(2)` failed.
case FTS_NS: {
// For these entry types the cause is taken from the entry's own
// `fts_errno` field rather than from the global `errno`.
Error error = ErrnoError(node->fts_errno);
return Error(
"Failed to read '" + path.string() + "': " + error.message);
// Surface any error left in `errno` to the caller.
if (errno != 0) {
Error error = ErrnoError();
return error;
// A failure to close the traversal handle is reported as an error too.
if (::fts_close(tree) != 0) {
return ErrnoError("Failed to stop traversing file system");
LOG(INFO) << "Finished setting the owner group of the volume path '"
<< path << "' " << (setgid ? "" : "back ") << "to " << gid;
return Nothing();
// Process that keeps track of which gids of the configured range are
// allocated to which volume paths. The `VolumeGidManager` methods further
// down in this file reach it through `dispatch`.
class VolumeGidManagerProcess : public process::Process<VolumeGidManagerProcess>
// Constructor arguments: `gids` is the configured (total) gid range and
// `workDir` is the agent's work directory.
const IntervalSet<gid_t>& gids,
const string& workDir)
: ProcessBase(process::ID::generate("volume-gid-manager")),
// At the beginning, the free gid range is the same as the
// configured gid range (i.e., the total gid range).
LOG(INFO) << "Allocating " << totalGids.size()
<< " volume gids from the range " << totalGids;
// Publish the initial sizes of the total and the free gid ranges.
metrics.volume_gids_total = totalGids.size();
metrics.volume_gids_free = freeGids.size();
// Restores the gid allocations from the checkpointed volume gids file,
// if that file exists.
//
// Every gid found in the file is removed from the free range, and the
// entries whose gid is still within the configured range are tracked in
// `infos`. Paths collected in `orphans` are deallocated at the end.
//
// `rebooted` tells whether the agent has been rebooted; in that case the
// gids of PARENT type SANDBOX_PATH volumes are deallocated here.
//
// Returns a failure if the file exists but cannot be read.
Future<Nothing> recover(bool rebooted)
LOG(INFO) << "Recovering volume gid manager";
const string volumeGidsPath = paths::getVolumeGidsPath(metaDir);
// Without a checkpoint file there is nothing to recover.
if (os::exists(volumeGidsPath)) {
Result<VolumeGidInfos> volumeGidInfos =
if (volumeGidInfos.isError()) {
return Failure(
"Failed to read volume gid infos from '" + volumeGidsPath +
"' " + volumeGidInfos.error());
} else if (volumeGidInfos.isNone()) {
// This could happen if the agent is hard rebooted after the file is
// created but before the data is synced on disk.
LOG(WARNING) << "The volume gids file '"
<< volumeGidsPath << "' is empty";
} else {
// Volume paths whose gid has to be deallocated once all the
// checkpointed entries have been processed.
hashset<string> orphans;
foreach (const VolumeGidInfo& info, volumeGidInfos->infos()) {
freeGids -= info.gid();
// The operator could have changed the volume gid range, so as per
// deallocate(), we should only count this if it is still in range.
if (totalGids.contains(info.gid())) {
infos.put(info.path(), info);
// Normally the gid allocated to the PARENT type SANDBOX_PATH
// volume is deallocated when the parent container is destroyed.
// However, after agent reboot, containerizer will not destroy any
// containers since all containers are already gone, so to avoid
// gid leak in this case, we need to deallocate gid for the PARENT
// type SANDBOX_PATH volume here.
if (rebooted && info.type() == VolumeGidInfo::SANDBOX_PATH) {
LOG(INFO) << "Deallocating gid " << info.gid() << " for the PARENT "
<< "type SANDBOX_PATH volume '" << info.path()
<< "' after agent reboot";
// This could happen in the case that agent crashes after the
// shared persistent volume is deleted but before volume gid
// manager deallocates its gid.
if (!os::exists(info.path())) {
LOG(WARNING) << "Deallocating gid " << info.gid() << " for the "
<< "non-existent volume path '" << info.path() << "'";
// Deallocate all the orphaned paths.
foreach (const string& path, orphans) {
return Nothing();
// This method will be called when a container running as non-root user tries
// to use a shared persistent volume or a PARENT type SANDBOX_PATH volume; the
// parameter `path` will be the source path of the volume.
//
// The gid returned for `path` is determined as follows:
//   1. If a gid is already recorded for `path` in `infos`, that gid is
//      reused. If the ownership of `path` is still being set, the future
//      of that pending operation is returned instead.
//   2. Otherwise, if the current group of `path` already lies within the
//      total gid range, that gid is used as is.
//   3. Otherwise the lower bound of the first free interval is taken from
//      the free range, recorded in `infos`, persisted, and the ownership
//      of `path` is changed through `setVolumeOwnership`.
//
// Fails if `path` cannot be stat'ed, if the free gid range is exhausted,
// if the state cannot be persisted, or if setting the ownership fails.
Future<gid_t> allocate(const string& path, VolumeGidInfo::Type type)
gid_t gid;
// If a gid has already been allocated for the specified path,
// just return the gid.
if (infos.contains(path)) {
gid = infos[path].gid();
LOG(INFO) << "Use the allocated gid " << gid << " of the volume path '"
<< path << "'";
// If we are already setting ownership for the specified path, skip the
// additional setting.
if (setting.contains(path)) {
return setting[path]->future();
} else {
// Look at the current group of the path to decide whether a new gid
// has to be allocated at all.
struct stat s;
if (::stat(path.c_str(), &s) < 0) {
return Failure("Failed to stat '" + path + "': " + os::strerror(errno));
// If the gid of the specified path is in the total gid range, just
// return the gid. This could happen in the case that nested container
// uses persistent volume, in which case we did a workaround in the
// default executor to set up a volume mapping (i.e., map the persistent
// volume to a PARENT type SANDBOX_PATH volume for the nested container)
// so that the nested container can access the persistent volume.
// Please note that in the case of shared persistent volume, operator
// should NOT restart agent with a different total gid range, otherwise
// the gid of the shared persistent volume may be overwritten if a nested
// container tries to use the shared persistent volume after the restart.
if (totalGids.contains(s.st_gid)) {
gid = s.st_gid;
LOG(INFO) << "Use the gid " << gid << " for the volume path '" << path
<< "' which should be the mount point of another volume "
<< "which is actually allocated with the gid";
} else {
// Allocate a free gid to the specified path and then set the
// ownership for it.
if (freeGids.empty()) {
return Failure(
"Failed to allocate gid to the volume path '" + path +
"' because the free gid range is exhausted");
// Take the lower bound of the first interval of the free range.
gid = freeGids.begin()->lower();
LOG(INFO) << "Allocating gid " << gid << " to the volume path '"
<< path << "'";
freeGids -= gid;
VolumeGidInfo info;
infos.put(path, info);
// Checkpoint the new allocation before touching the volume itself.
Try<Nothing> status = persist();
if (status.isError()) {
return Failure(
"Failed to save state of volume gid infos: " + status.error());
// This promise is stored in `setting` under `path`, so that a request
// for the same path that arrives while the ownership is still being
// set is handed its future (see the `setting` lookup above).
Owned<Promise<gid_t>> promise(new Promise<gid_t>());
Future<gid_t> future = async(&setVolumeOwnership, path, gid, true)
.then([path, gid](const Try<Nothing>& result) -> Future<gid_t> {
if (result.isError()) {
return Failure(
"Failed to set the owner group of the volume path '" + path +
"' to " + stringify(gid) + ": " + result.error());
return gid;
.onAny(defer(self(), [=](const Future<gid_t>&) {
setting[path] = promise;
return promise->future();
return gid;
// This method will be called in two cases:
// 1. When a shared persistent volume is destroyed by agent, the parameter
// `path` will be the shared persistent volume's path.
// 2. When a container is destroyed by containerizer, the parameter `path`
// will be the container's sandbox path.
// We search if the given path is contained in `infos` (for the case 1) or is
// the parent directory of any volume paths in `infos` (for the case 2, i.e.,
// the PARENT type SANDBOX_PATH volume must be a subdirectory in the parent
// container's sandbox) and then free the allocated gid for the found path(s).
//
// Failures to change the owner group of a volume back are only logged as
// warnings. A failure is returned when persisting the updated state fails.
Future<Nothing> deallocate(const string& path)
// PARENT type SANDBOX_PATH volumes found under `path`; their owner group
// is changed back further down.
vector<string> sandboxPathVolumes;
// Whether `infos` was modified and therefore has to be persisted again.
bool changed = false;
for (auto it = infos.begin(); it != infos.end(); ) {
const VolumeGidInfo& info = it->second;
const string& volumePath = info.path();
// NOTE(review): this is a plain string prefix test, so a volume under a
// sibling directory whose name merely starts with `path` (e.g.
// `<path>2/...`) would match as well - confirm callers cannot hit that.
if (strings::startsWith(volumePath, path)) {
if (volumePath != path) {
// This is the case of the PARENT type SANDBOX_PATH volume.
gid_t gid = info.gid();
LOG(INFO) << "Deallocated gid " << gid << " for the volume path '"
<< volumePath << "'";
// Only return the gid to the free range if it is in the total
// range. The gid may not be in the total range in the case that
// Mesos agent is restarted with a different total range and we
// deallocate gid for a previous volume path from the old range.
if (totalGids.contains(gid)) {
freeGids += gid;
// `erase` returns the iterator to continue the loop with.
it = infos.erase(it);
changed = true;
} else {
// For the PARENT type SANDBOX_PATH volume, it will exist for a while
// (depending on GC policy) after the container is destroyed. So to
// avoid leaking it to other containers in the case that its gid is
// allocated to another volume, we need to change its owner group back
// to the original one (i.e., the primary group of its owner).
vector<Future<Try<Nothing>>> futures;
// Kept in the same order as `futures`, so that a result can be matched
// with its volume path and gid by index.
vector<pair<string, gid_t>> volumeGids;
foreach (const string& volume, sandboxPathVolumes) {
// Get the uid of the volume's owner.
struct stat s;
if (::stat(volume.c_str(), &s) < 0) {
LOG(WARNING) << "Failed to stat '" << volume << "': "
<< os::strerror(errno);
Result<string> user = os::user(s.st_uid);
if (!user.isSome()) {
LOG(WARNING) << "Failed to get username for the uid " << s.st_uid
<< ": " << (user.isError() ? user.error() : "not found");
// Get the primary group ID of the user.
Result<gid_t> gid = os::getgid(user.get());
if (!gid.isSome()) {
LOG(WARNING) << "Failed to get gid for the user '" << user.get()
<< "': " << (gid.isError() ? gid.error() : "not found");
futures.push_back(async(&setVolumeOwnership, volume, gid.get(), false));
volumeGids.push_back({volume, gid.get()});
return await(futures)
[=](const vector<Future<Try<Nothing>>>& results) -> Future<Nothing> {
for (size_t i = 0; i < results.size(); ++i) {
const Future<Try<Nothing>>& result = results[i];
const string& path = volumeGids[i].first;
const gid_t gid = volumeGids[i].second;
// Either the future itself did not become ready, or it became ready
// with an error; both cases are logged without failing the call.
if (!result.isReady()) {
LOG(WARNING) << "Failed to set the owner group of the volume "
<< "path '" << path << "' back to " << gid << ": "
<< (result.isFailed() ?
result.failure() : "discarded");
} else if (result->isError()) {
LOG(WARNING) << "Failed to set the owner group of the volume "
<< "path '" << path << "' back to " << gid << ": "
<< result->error();
// Only checkpoint again when an entry was actually removed.
if (changed) {
Try<Nothing> status = persist();
if (status.isError()) {
return Failure(
"Failed to save state of volume gid infos: " +
return Nothing();
// Checkpoints the gid infos currently held in `infos` to the volume gids
// path derived from `metaDir`.
//
// Returns an error if the checkpoint cannot be performed.
Try<Nothing> persist()
VolumeGidInfos volumeGidInfos;
foreachvalue (const VolumeGidInfo& info, infos) {
Try<Nothing> status = state::checkpoint(
paths::getVolumeGidsPath(metaDir), volumeGidInfos);
if (status.isError()) {
return Error("Failed to perform checkpoint: " + status.error());
return Nothing();
// The configured gid range (i.e., the total gid range).
const IntervalSet<gid_t> totalGids;
// The gids that are currently free to be allocated.
IntervalSet<gid_t> freeGids;
// Passed to `paths::getVolumeGidsPath()` to locate the checkpointed
// volume gids file.
const string metaDir;
// Promises of the ownership changes started by `allocate()`, keyed by the
// volume path.
hashmap<string, Owned<Promise<gid_t>>> setting;
// Allocated gid infos keyed by the volume path.
hashmap<string, VolumeGidInfo> infos;
// Gauges reporting the sizes of the total and the free gid ranges.
struct Metrics
: volume_gids_total("volume_gid_manager/volume_gids_total"),
process::metrics::PushGauge volume_gids_total;
process::metrics::PushGauge volume_gids_free;
} metrics;
// Creates a volume gid manager from the agent flags.
//
// `flags.volume_gid_range` is parsed as a "gids" resource, which has to be
// of type RANGES and must not be empty. `flags.work_dir` is handed to the
// underlying `VolumeGidManagerProcess`.
//
// Returns an error if the effective user is not root, or if the gid range
// cannot be parsed, has the wrong type, is invalid or is empty. On success
// the caller receives a newly allocated manager.
Try<VolumeGidManager*> VolumeGidManager::create(const Flags& flags)
if (geteuid() != 0) {
return Error("Volume gid manager requires root privileges");
Try<Resource> parse =
Resources::parse("gids", flags.volume_gid_range.get(), "*");
if (parse.isError()) {
return Error(
"Failed to parse volume gid range '" +
flags.volume_gid_range.get() + "'");
if (parse->type() != Value::RANGES) {
return Error(
"Invalid volume gid range type " +
mesos::Value_Type_Name(parse->type()) +
", expecting " +
Try<IntervalSet<gid_t>> gids =
if (gids.isError()) {
return Error("Invalid volume gid range '" +
stringify(parse->ranges()) + "': " + gids.error());
} else if (gids->empty()) {
return Error("Empty volume gid range");
return new VolumeGidManager(Owned<VolumeGidManagerProcess>(
new VolumeGidManagerProcess(gids.get(), flags.work_dir)));
// Stores the process that the methods of this manager dispatch to.
const Owned<VolumeGidManagerProcess>& _process)
: process(_process)
// Dispatches to `VolumeGidManagerProcess::recover` on the wrapped process,
// passing `rebooted` through.
Future<Nothing> VolumeGidManager::recover(bool rebooted) const
return dispatch(process.get(), &VolumeGidManagerProcess::recover, rebooted);
// Dispatches the allocation request for the volume at `path` to the
// wrapped process and returns the resulting future.
Future<gid_t> VolumeGidManager::allocate(
const string& path,
VolumeGidInfo::Type type) const
return dispatch(process.get(),
// Dispatches to `VolumeGidManagerProcess::deallocate` on the wrapped
// process, passing `path` through.
Future<Nothing> VolumeGidManager::deallocate(const string& path) const
return dispatch(process.get(), &VolumeGidManagerProcess::deallocate, path);
} // namespace slave {
} // namespace internal {
} // namespace mesos {