blob: 1b257d3befd1361c9b2830003ae9453ff9625c00 [file] [log] [blame]
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
#include <errno.h>
#include <fts.h>
#include <signal.h>
#include <stdint.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <glog/logging.h>
#include <fstream>
#include <list>
#include <map>
#include <set>
#include <sstream>
#include <string>
#include <vector>
#include <process/collect.hpp>
#include <process/defer.hpp>
#include <process/delay.hpp>
#include <process/io.hpp>
#include <process/process.hpp>
#include <stout/duration.hpp>
#include <stout/error.hpp>
#include <stout/foreach.hpp>
#include <stout/hashset.hpp>
#include <stout/lambda.hpp>
#include <stout/none.hpp>
#include <stout/option.hpp>
#include <stout/os.hpp>
#include <stout/path.hpp>
#include <stout/proc.hpp>
#include <stout/stringify.hpp>
#include <stout/strings.hpp>
#include "linux/cgroups.hpp"
#include "linux/fs.hpp"
using namespace process;
// TODO(benh): Move linux/fs.hpp out of the 'mesos' namespace.
using namespace mesos::internal;
using std::list;
using std::map;
using std::set;
using std::string;
using std::vector;
namespace cgroups {
namespace internal {
// Snapshot of a subsystem (modeled after a line in /proc/cgroups).
struct SubsystemInfo
{
  // Creates an empty record: no name, hierarchy ID 0, zero cgroups
  // and not enabled.
  SubsystemInfo()
    : hierarchy(0),
      cgroups(0),
      enabled(false) {}

  // Creates a record from the four fields of a subsystem line.
  SubsystemInfo(const std::string& _name,
                int _hierarchy,
                int _cgroups,
                bool _enabled)
    : name(_name),
      hierarchy(_hierarchy),
      cgroups(_cgroups),
      enabled(_enabled) {}

  std::string name; // Name of the subsystem.
  int hierarchy;    // ID of the hierarchy the subsystem is attached to.
  int cgroups;      // Number of cgroups for the subsystem.
  bool enabled;     // Whether the subsystem is enabled or not.
};
// Return information about subsystems on the current machine. The
// information is read from the /proc/cgroups file; every non-empty
// line that does not start with '#' describes one subsystem as
// "<name> <hierarchy> <cgroups> <enabled>".
// @return A map from subsystem names to SubsystemInfo instances if
//         succeeds. Error if the file cannot be opened, read or
//         parsed.
static Try<map<string, SubsystemInfo> > subsystems()
{
  // TODO(benh): Use os::read to get better error information.
  std::ifstream file("/proc/cgroups");

  if (!file.is_open()) {
    return Error("Failed to open /proc/cgroups");
  }

  map<string, SubsystemInfo> infos;

  while (!file.eof()) {
    string line;
    std::getline(file, line);

    if (file.fail()) {
      // Hitting end-of-file while reading the last (empty) line also
      // sets the fail bit; only a failure before EOF is a real error.
      if (!file.eof()) {
        return Error("Failed to read /proc/cgroups");
      }
      continue;
    }

    if (line.empty() || line.find_first_of('#') == 0) {
      // Skip empty lines and the title line, which starts with '#'.
      continue;
    }

    // Parse line to get subsystem info. The fields are initialized so
    // that a short line can never leak indeterminate values into the
    // resulting SubsystemInfo.
    string name;
    int hierarchy = 0;
    int cgroups = 0;
    bool enabled = false;

    std::istringstream ss(line);
    ss >> std::dec >> name >> hierarchy >> cgroups >> enabled;

    // Check for any read/parse errors.
    if (ss.fail() && !ss.eof()) {
      return Error("Failed to parse /proc/cgroups");
    }

    infos[name] = SubsystemInfo(name, hierarchy, cgroups, enabled);
  }

  return infos;
}
// Mount a cgroups virtual file system (with proper subsystems
// attached) to a given directory (hierarchy root). The cgroups
// virtual file system is the interface exposed by the kernel to
// control cgroups. Each directory created inside the hierarchy root
// is a cgroup. Therefore, cgroups are organized in a tree like
// structure. User can specify what subsystems to be attached to the
// hierarchy root so that these subsystems can be controlled through
// normal file system APIs. A subsystem can only be attached to one
// hierarchy. The hierarchy path must not exist yet; it is created
// here and removed again (best effort) if the mount itself fails.
// @param   hierarchy   Path to the hierarchy root.
// @param   subsystems  Comma-separated subsystem names.
// @return  Some if the operation succeeds.
//          Error if the operation fails.
static Try<Nothing> mount(const string& hierarchy, const string& subsystems)
{
  if (os::exists(hierarchy)) {
    return Error("'" + hierarchy + "' already exists in the file system");
  }

  // Make sure all subsystems are enabled and not busy.
  foreach (const string& subsystem, strings::tokenize(subsystems, ",")) {
    Try<bool> result = enabled(subsystem);
    if (result.isError()) {
      return Error(result.error());
    } else if (!result.get()) {
      return Error("'" + subsystem + "' is not enabled by the kernel");
    }

    result = busy(subsystem);
    if (result.isError()) {
      return Error(result.error());
    } else if (result.get()) {
      return Error(
          "'" + subsystem + "' is already attached to another hierarchy");
    }
  }

  // Create the directory for the hierarchy.
  Try<Nothing> mkdir = os::mkdir(hierarchy);
  if (mkdir.isError()) {
    return Error(
        "Failed to create directory '" + hierarchy + "': " + mkdir.error());
  }

  // Mount the virtual file system (attach subsystems).
  Try<Nothing> result =
    fs::mount(subsystems, hierarchy, "cgroup", 0, subsystems.c_str());
  if (result.isError()) {
    // Do a best effort rmdir of hierarchy (ignoring success or failure)
    // so that a failed mount does not leave the directory we just
    // created behind.
    os::rmdir(hierarchy);
    return result;
  }

  return Nothing();
}
// Detach the cgroups virtual file system mounted at the given
// hierarchy root. Callers are expected to have removed every cgroup in
// the hierarchy beforehand, and the hierarchy is assumed to currently
// be mounted with a cgroups virtual file system.
// @param   hierarchy   Path to the hierarchy root.
// @return  Some if the operation succeeds.
//          Error if the operation fails.
static Try<Nothing> unmount(const string& hierarchy)
{
  Try<Nothing> unmounted = fs::unmount(hierarchy);
  return unmounted;
}
// Propagates the 'cpuset.cpus' and 'cpuset.mems' settings of a parent
// cgroup into one of its children. Without these values the child
// cgroup cannot run tasks (attaching fails with 'Device or resource
// busy').
// @param   hierarchy     Path to hierarchy root.
// @param   parentCgroup  Path to parent cgroup relative to the hierarchy root.
// @param   childCgroup   Path to child cgroup relative to the hierarchy root.
// @return  Some if the operation succeeds.
//          Error if the operation fails.
static Try<Nothing> cloneCpusetCpusMems(
    const string& hierarchy,
    const string& parentCgroup,
    const string& childCgroup)
{
  // Read both parent values first; nothing is written unless both
  // reads succeed.
  Try<string> parentCpus =
    cgroups::read(hierarchy, parentCgroup, "cpuset.cpus");
  if (parentCpus.isError()) {
    return Error(
        "Failed to read control 'cpuset.cpus': " + parentCpus.error());
  }

  Try<string> parentMems =
    cgroups::read(hierarchy, parentCgroup, "cpuset.mems");
  if (parentMems.isError()) {
    return Error(
        "Failed to read control 'cpuset.mems': " + parentMems.error());
  }

  Try<Nothing> written = cgroups::write(
      hierarchy, childCgroup, "cpuset.cpus", parentCpus.get());
  if (written.isError()) {
    return Error(
        "Failed to write control 'cpuset.cpus': " + written.error());
  }

  written = cgroups::write(
      hierarchy, childCgroup, "cpuset.mems", parentMems.get());
  if (written.isError()) {
    return Error(
        "Failed to write control 'cpuset.mems': " + written.error());
  }

  return Nothing();
}
// Create a cgroup in a given hierarchy by creating the matching
// directory inside the cgroups virtual file system. The cgroup is
// given as a path relative to the hierarchy, which is assumed to be
// valid and mounted. Directories are NOT created recursively, so the
// call fails if any parent cgroup is missing. When the 'cpuset'
// subsystem is attached to the hierarchy, the parent's 'cpuset.cpus'
// and 'cpuset.mems' are copied into the new cgroup.
// @param   hierarchy   Path to the hierarchy root.
// @param   cgroup      Path to the cgroup relative to the hierarchy root.
// @return  Some if the operation succeeds.
//          Error if the operation fails.
static Try<Nothing> create(const string& hierarchy, const string& cgroup)
{
  const string directory = path::join(hierarchy, cgroup);

  Try<Nothing> made = os::mkdir(directory, false); // Not recursive.
  if (made.isError()) {
    return Error(
        "Failed to create directory '" + directory + "': " + made.error());
  }

  Try<set<string> > attached = cgroups::subsystems(hierarchy);
  if (attached.isError()) {
    return Error(
        "Failed to determine if hierarchy '" + hierarchy +
        "' has the 'cpuset' subsystem attached: " + attached.error());
  }

  // Nothing else to do unless 'cpuset' is attached.
  if (attached.get().count("cpuset") == 0) {
    return Nothing();
  }

  Try<string> parent = os::dirname(path::join("/", cgroup));
  if (parent.isError()) {
    return Error(
        "Failed to determine parent cgroup of " + cgroup +
        ": " + parent.error());
  }

  return cloneCpusetCpusMems(hierarchy, parent.get(), cgroup);
}
// Remove a cgroup from a given hierarchy by removing its directory in
// the cgroups virtual file system. Only the single directory is
// removed (no recursion), so the call returns an error when the cgroup
// still contains processes or sub-cgroups.
// @param   hierarchy   Path to the hierarchy root.
// @param   cgroup      Path to the cgroup relative to the hierarchy root.
// @return  Some if the operation succeeds.
//          Error if the operation fails.
static Try<Nothing> remove(const string& hierarchy, const string& cgroup)
{
  const string directory = path::join(hierarchy, cgroup);

  // Do NOT recursively remove cgroups.
  Try<Nothing> removed = os::rmdir(directory, false);
  if (!removed.isError()) {
    return removed;
  }

  return Error(
      "Failed to remove cgroup '" + directory + "': " + removed.error());
}
// Read a control file. Control files are the gateway to monitor and
// control cgroups. This function assumes the cgroups virtual file
// systems are properly mounted on the given hierarchy, and the given
// cgroup has been already created properly. The given control file
// name should also be valid.
// @param   hierarchy   Path to the hierarchy root.
// @param   cgroup      Path to the cgroup relative to the hierarchy root.
// @param   control     Name of the control file.
// @return  The value read from the control file.
//          Error if the file cannot be opened or read.
static Try<string> read(
    const string& hierarchy,
    const string& cgroup,
    const string& control)
{
  string path = path::join(hierarchy, cgroup, control);

  // TODO(benh): Use os::read. Note that we do not use os::read
  // currently because it cannot correctly read /proc or cgroups
  // control files since lseek (in os::read) will return error.
  std::ifstream file(path.c_str());

  if (!file.is_open()) {
    return Error("Failed to open file " + path);
  }

  // Drain the whole file into a string buffer.
  std::ostringstream ss;
  ss << file.rdbuf();

  if (file.fail()) {
    ErrnoError error; // TODO(jieyu): Does std::ifstream actually set errno?
    return error;
  }

  return ss.str();
}
// Write a control file. The value is written followed by a newline.
// @param   hierarchy   Path to the hierarchy root.
// @param   cgroup      Path to the cgroup relative to the hierarchy root.
// @param   control     Name of the control file.
// @param   value       Value to be written.
// @return  Some if the operation succeeds.
//          Error if the operation fails.
static Try<Nothing> write(
    const string& hierarchy,
    const string& cgroup,
    const string& control,
    const string& value)
{
  string path = path::join(hierarchy, cgroup, control);
  std::ofstream file(path.c_str());

  if (!file.is_open()) {
    return Error("Failed to open file " + path);
  }

  // std::endl is deliberate: the flush pushes the value out of the
  // stream buffer now, so a write failure is visible in the check below.
  file << value << std::endl;

  if (file.fail()) {
    ErrnoError error; // TODO(jieyu): Does std::ofstream actually set errno?
    return error;
  }

  return Nothing();
}
} // namespace internal {
// Prepares the hierarchy '<baseHierarchy>/<subsystem>' for use:
//   1. checks that cgroups are available and that we run as root,
//   2. reuses the hierarchy if it is already mounted with exactly the
//      requested subsystem attached, otherwise mounts it,
//   3. creates the given cgroup if it does not exist yet, and
//   4. creates and removes a nested 'test' cgroup to verify that the
//      kernel supports nested cgroups.
// @param   baseHierarchy   Directory under which hierarchies are mounted.
// @param   subsystem       Name of the subsystem to attach.
// @param   cgroup          Path to the cgroup relative to the hierarchy root.
// @return  The path of the prepared hierarchy if the operation succeeds.
//          Error if the operation fails.
Try<string> prepare(
    const std::string& baseHierarchy,
    const std::string& subsystem,
    const std::string& cgroup)
{
  // Construct the hierarchy for this subsystem.
  std::string hierarchy = path::join(baseHierarchy, subsystem);

  // Ensure cgroups are enabled in the kernel.
  if (!cgroups::enabled()) {
    return Error("No cgroups support detected in this kernel");
  }

  // Ensure we have root permissions.
  if (geteuid() != 0) {
    return Error("Using cgroups requires root permissions");
  }

  // Check if the hierarchy is already mounted, and if not, mount it.
  Try<bool> mounted = cgroups::mounted(hierarchy);
  if (mounted.isError()) {
    return Error("Failed to determine if " + hierarchy +
                 " is already mounted: " + mounted.error());
  }

  if (mounted.get()) {
    // Make sure that desired subsystem is attached to the already
    // mounted hierarchy.
    Try<std::set<std::string> > attached = cgroups::subsystems(hierarchy);
    if (attached.isError()) {
      return Error(string("Failed to determine the attached subsystems ") +
                   "for the cgroup hierarchy at " + hierarchy + ": " +
                   attached.error());
    }

    if (attached.get().count(subsystem) == 0) {
      return Error("The cgroups hierarchy at " + hierarchy +
                   " can not be used because it does not have the '" +
                   subsystem + "' subsystem attached");
    }

    // The subsystem must be the only one attached to this hierarchy.
    if (attached.get().size() > 1) {
      return Error("The " + subsystem + " subsystem is co-mounted at " +
                   hierarchy + " with other subsystems");
    }
  } else {
    // Attempt to mount the hierarchy ourselves.
    if (os::exists(hierarchy)) {
      // The path specified by the given hierarchy already exists in
      // the file system. We try to remove it if it is an empty
      // directory. This helps us better deal with slave restarts
      // since we won't need to manually remove the directory.
      Try<Nothing> rmdir = os::rmdir(hierarchy, false);
      if (rmdir.isError()) {
        return Error("Failed to mount cgroups hierarchy at '" + hierarchy +
                     "' because we could not remove the existing directory: " +
                     rmdir.error());
      }
    }

    // Mount the subsystem.
    Try<Nothing> mount = cgroups::mount(hierarchy, subsystem);
    if (mount.isError()) {
      return Error("Failed to mount cgroups hierarchy at '" + hierarchy +
                   "': " + mount.error());
    }
  }

  // Create the cgroup if it doesn't exist.
  Try<bool> exists = cgroups::exists(hierarchy, cgroup);
  if (exists.isError()) {
    return Error("Failed to check existence of root cgroup " +
                 path::join(hierarchy, cgroup) +
                 ": " + exists.error());
  }

  if (!exists.get()) {
    // No cgroup exists, create it.
    Try<Nothing> create = cgroups::create(hierarchy, cgroup);
    if (create.isError()) {
      return Error("Failed to create root cgroup " +
                   path::join(hierarchy, cgroup) +
                   ": " + create.error());
    }
  }

  const string testCgroup = path::join(cgroup, "test");

  // Create a nested test cgroup if it doesn't exist.
  exists = cgroups::exists(hierarchy, testCgroup);
  if (exists.isError()) {
    return Error("Failed to check existence of nested test cgroup " +
                 path::join(hierarchy, testCgroup) +
                 ": " + exists.error());
  }

  if (!exists.get()) {
    // Make sure this kernel supports creating nested cgroups.
    Try<Nothing> create = cgroups::create(hierarchy, testCgroup);
    if (create.isError()) {
      return Error(string("Failed to create a nested 'test' cgroup.") +
                   " Your kernel might be too old to use the" +
                   " cgroups isolator: " + create.error());
    }
  }

  // Remove the nested 'test' cgroup.
  Try<Nothing> remove = cgroups::remove(hierarchy, testCgroup);
  if (remove.isError()) {
    return Error("Failed to remove the nested test cgroup: " + remove.error());
  }

  return hierarchy;
}
// Validates a (hierarchy, cgroup, control) triple. An error is
// produced when (a) the hierarchy is not mounted, (b) a cgroup was
// given but does not exist, or (c) a control was given but the control
// file does not exist. A control may only be passed together with a
// cgroup. Returns None when everything checks out.
static Option<Error> verify(
    const string& hierarchy,
    const string& cgroup = "",
    const string& control = "")
{
  Try<bool> isMounted = cgroups::mounted(hierarchy);
  if (isMounted.isError()) {
    return Error(
        "Failed to determine if the hierarchy at '" + hierarchy +
        "' is mounted: " + isMounted.error());
  }

  if (!isMounted.get()) {
    return Error("'" + hierarchy + "' is not a valid hierarchy");
  }

  if (!cgroup.empty() && !os::exists(path::join(hierarchy, cgroup))) {
    return Error("'" + cgroup + "' is not a valid cgroup");
  }

  if (!control.empty()) {
    CHECK(cgroup != "");

    if (!os::exists(path::join(hierarchy, cgroup, control))) {
      return Error(
          "'" + control + "' is not a valid control (is subsystem attached?)");
    }
  }

  return None();
}
bool enabled()
return os::exists("/proc/cgroups");
// Returns the canonical paths of all mount points of type "cgroup"
// listed in /proc/mounts.
// @return  The set of mounted hierarchies if the operation succeeds.
//          Error if the mount table cannot be read or a mount point
//          cannot be canonicalized.
Try<set<string> > hierarchies()
{
  // Read currently mounted file systems from /proc/mounts.
  Try<fs::MountTable> table = fs::MountTable::read("/proc/mounts");
  if (table.isError()) {
    return Error(table.error());
  }

  set<string> results;
  foreach (const fs::MountTable::Entry& entry, table.get().entries) {
    if (entry.type == "cgroup") {
      Result<string> realpath = os::realpath(entry.dir);
      if (!realpath.isSome()) {
        // Not Some means either a real error or (None) a missing path.
        return Error(
            "Failed to determine canonical path of " + entry.dir + ": " +
            (realpath.isError()
             ? realpath.error()
             : "No such file or directory"));
      }

      results.insert(realpath.get());
    }
  }

  return results;
}
// Finds a mounted hierarchy that has all of the given subsystems
// attached. An empty 'subsystems' matches any mounted hierarchy.
// @param   subsystems  Comma-separated subsystem names (may be empty).
// @return  The first matching hierarchy, None if there is no match,
//          Error if the mounted hierarchies cannot be inspected.
Result<std::string> hierarchy(const std::string& subsystems)
{
  Result<std::string> hierarchy = None();
  Try<std::set<std::string> > hierarchies = cgroups::hierarchies();
  if (hierarchies.isError()) {
    return Error(hierarchies.error());
  }

  foreach (const std::string& candidate, hierarchies.get()) {
    // With no requirements, the first mounted hierarchy will do.
    if (subsystems.empty()) {
      hierarchy = candidate;
      break;
    }

    // Check and see if this candidate meets our subsystem requirements.
    Try<bool> mounted = cgroups::mounted(candidate, subsystems);
    if (mounted.isError()) {
      return Error(mounted.error());
    } else if (mounted.get()) {
      hierarchy = candidate;
      break;
    }
  }

  return hierarchy;
}
// Checks whether all of the given subsystems are enabled.
// @param   subsystems  Comma-separated subsystem names.
// @return  True if every listed subsystem is enabled, false if at
//          least one is disabled. Error if subsystem information is
//          unavailable or any listed subsystem is unknown.
Try<bool> enabled(const string& subsystems)
{
  Try<map<string, internal::SubsystemInfo> > infosResult =
    internal::subsystems();
  if (infosResult.isError()) {
    return Error(infosResult.error());
  }

  const map<string, internal::SubsystemInfo>& infos = infosResult.get();
  bool disabled = false; // Whether some subsystems are not enabled.

  foreach (const string& subsystem, strings::tokenize(subsystems, ",")) {
    map<string, internal::SubsystemInfo>::const_iterator info =
      infos.find(subsystem);
    if (info == infos.end()) {
      return Error("'" + subsystem + "' not found");
    }

    if (!info->second.enabled) {
      // Here, we don't return false immediately because we want to return
      // error if any of the given subsystems is missing.
      disabled = true;
    }
  }

  return !disabled;
}
// Checks whether any of the given subsystems is already attached to a
// hierarchy (i.e., has a non-zero hierarchy ID).
// @param   subsystems  Comma-separated subsystem names.
// @return  True if at least one listed subsystem is attached, false
//          otherwise. Error if subsystem information is unavailable or
//          any listed subsystem is unknown.
Try<bool> busy(const string& subsystems)
{
  Try<map<string, internal::SubsystemInfo> > infosResult =
    internal::subsystems();
  if (infosResult.isError()) {
    return Error(infosResult.error());
  }

  const map<string, internal::SubsystemInfo>& infos = infosResult.get();
  bool busy = false;

  foreach (const string& subsystem, strings::tokenize(subsystems, ",")) {
    map<string, internal::SubsystemInfo>::const_iterator info =
      infos.find(subsystem);
    if (info == infos.end()) {
      return Error("'" + subsystem + "' not found");
    }

    if (info->second.hierarchy != 0) {
      // Here, we don't return true immediately because we want to return
      // error if any of the given subsystems is missing.
      busy = true;
    }
  }

  return busy;
}
// Returns the names of all subsystems that are currently enabled.
// @return  The set of enabled subsystem names if the operation
//          succeeds. Error if subsystem information is unavailable.
Try<set<string> > subsystems()
{
  Try<map<string, internal::SubsystemInfo> > infos = internal::subsystems();
  if (infos.isError()) {
    return Error(infos.error());
  }

  set<string> names;
  foreachvalue (const internal::SubsystemInfo& info, infos.get()) {
    if (info.enabled) {
      names.insert(info.name);
    }
  }

  return names;
}
// Returns the subsystems attached to the given hierarchy, computed as
// the enabled subsystems that also appear among the mount options of
// the hierarchy's "cgroup" entry in /proc/mounts.
// @param   hierarchy   Path to the hierarchy root.
// @return  The set of attached subsystem names if the operation
//          succeeds. Error if the hierarchy is not a cgroup mount point
//          or the required information cannot be obtained.
Try<set<string> > subsystems(const string& hierarchy)
{
  // We compare the canonicalized absolute paths.
  Result<string> hierarchyAbsPath = os::realpath(hierarchy);
  if (!hierarchyAbsPath.isSome()) {
    return Error(
        "Failed to determine canonical path of '" + hierarchy + "': " +
        (hierarchyAbsPath.isError()
         ? hierarchyAbsPath.error()
         : "No such file or directory"));
  }

  // Read currently mounted file systems from /proc/mounts.
  Try<fs::MountTable> table = fs::MountTable::read("/proc/mounts");
  if (table.isError()) {
    return Error("Failed to read mount table: " + table.error());
  }

  // Check if hierarchy is a mount point of type cgroup.
  Option<fs::MountTable::Entry> hierarchyEntry;
  foreach (const fs::MountTable::Entry& entry, table.get().entries) {
    if (entry.type == "cgroup") {
      Result<string> dirAbsPath = os::realpath(entry.dir);
      if (!dirAbsPath.isSome()) {
        return Error(
            "Failed to determine canonical path of '" + entry.dir + "': " +
            (dirAbsPath.isError()
             ? dirAbsPath.error()
             : "No such file or directory"));
      }

      // Seems that a directory can be mounted more than once. Previous mounts
      // are obscured by the later mounts. Therefore, we must see all entries to
      // make sure we find the last one that matches.
      if (dirAbsPath.get() == hierarchyAbsPath.get()) {
        hierarchyEntry = entry;
      }
    }
  }

  if (hierarchyEntry.isNone()) {
    return Error("'" + hierarchy + "' is not a valid hierarchy");
  }

  // Get the intersection of the currently enabled subsystems and mount
  // options. Notice that mount options may contain some things (e.g. rw)
  // that are not in the set of enabled subsystems.
  Try<set<string> > names = subsystems();
  if (names.isError()) {
    return Error(names.error());
  }

  set<string> result;
  foreach (const string& name, names.get()) {
    if (hierarchyEntry.get().hasOption(name)) {
      result.insert(name);
    }
  }

  return result;
}
// Mounts a cgroups hierarchy with the given subsystems attached,
// repeating a failed attempt up to 'retry' more times. The result of
// the last attempt is returned.
Try<Nothing> mount(const string& hierarchy, const string& subsystems, int retry)
{
  Try<Nothing> attempt = internal::mount(hierarchy, subsystems);

  // TODO(tmarshall) The retry option was added as a fix for a kernel
  // bug in Ubuntu 12.04 that resulted in cgroups not being entirely
  // cleaned up even once they have been completely unmounted from the
  // file system. We should reevaluate this in the future, and
  // hopefully remove it once the bug is no longer an issue.
  while (attempt.isError() && retry > 0) {
    --retry;
    attempt = internal::mount(hierarchy, subsystems);
  }

  return attempt;
}
// Unmounts the given hierarchy after verifying that it is mounted,
// then removes the hierarchy directory itself.
Try<Nothing> unmount(const string& hierarchy)
{
  const Option<Error> invalid = verify(hierarchy);
  if (invalid.isSome()) {
    return invalid.get();
  }

  Try<Nothing> detached = internal::unmount(hierarchy);
  if (detached.isError()) {
    return detached;
  }

  Try<Nothing> removed = os::rmdir(hierarchy);
  if (!removed.isError()) {
    return Nothing();
  }

  return Error(
      "Failed to remove directory '" + hierarchy + "': " + removed.error());
}
// Checks whether the given path is a mounted cgroups hierarchy that
// has all of the given subsystems attached.
// @param   hierarchy   Path to the hierarchy root.
// @param   subsystems  Comma-separated subsystem names.
// @return  True if the hierarchy is mounted with every listed
//          subsystem attached, false if the path does not exist, is
//          not a mounted hierarchy or lacks one of the subsystems.
//          Error if the required information cannot be obtained.
Try<bool> mounted(const string& hierarchy, const string& subsystems)
{
  if (!os::exists(hierarchy)) {
    return false;
  }

  // We compare canonicalized absolute paths.
  Result<string> realpath = os::realpath(hierarchy);
  if (!realpath.isSome()) {
    return Error(
        "Failed to determine canonical path of '" + hierarchy + "': " +
        (realpath.isError()
         ? realpath.error()
         : "No such file or directory"));
  }

  Try<set<string> > hierarchies = cgroups::hierarchies();
  if (hierarchies.isError()) {
    return Error(
        "Failed to get mounted hierarchies: " + hierarchies.error());
  }

  if (hierarchies.get().count(realpath.get()) == 0) {
    return false;
  }

  // Now make sure all the specified subsystems are attached.
  Try<set<string> > attached = cgroups::subsystems(hierarchy);
  if (attached.isError()) {
    return Error(
        "Failed to get subsystems attached to hierarchy '" +
        hierarchy + "': " + attached.error());
  }

  foreach (const string& subsystem, strings::tokenize(subsystems, ",")) {
    if (attached.get().count(subsystem) == 0) {
      return false;
    }
  }

  return true;
}
// Creates the given cgroup once the hierarchy has been verified to be
// mounted.
Try<Nothing> create(const string& hierarchy, const string& cgroup)
{
  const Option<Error> invalid = verify(hierarchy);
  if (invalid.isNone()) {
    return internal::create(hierarchy, cgroup);
  }

  return invalid.get();
}
// Removes the given cgroup after verifying that it exists in a mounted
// hierarchy. The removal is refused while nested cgroups are present.
Try<Nothing> remove(const string& hierarchy, const string& cgroup)
{
  const Option<Error> invalid = verify(hierarchy, cgroup);
  if (invalid.isSome()) {
    return invalid.get();
  }

  Try<vector<string> > nested = cgroups::get(hierarchy, cgroup);
  if (nested.isError()) {
    return Error("Failed to get nested cgroups: " + nested.error());
  }

  if (nested.get().empty()) {
    return internal::remove(hierarchy, cgroup);
  }

  return Error("Nested cgroups exist");
}
// Tells whether the given cgroup exists inside the hierarchy; the
// hierarchy itself must be mounted, otherwise an error is returned.
Try<bool> exists(const string& hierarchy, const string& cgroup)
{
  const Option<Error> invalid = verify(hierarchy);
  if (invalid.isNone()) {
    return os::exists(path::join(hierarchy, cgroup));
  }

  return invalid.get();
}
// Lists all cgroups nested below the given cgroup. The file system is
// walked with fts(3) and directories are recorded when visited in
// post-order, so nested cgroups are listed before their parents. The
// given cgroup itself is not part of the result.
// @param   hierarchy   Path to the hierarchy root.
// @param   cgroup      Path to the cgroup relative to the hierarchy root.
// @return  Paths of the nested cgroups, relative to the hierarchy
//          root, if the operation succeeds.
//          Error if the operation fails.
Try<vector<string> > get(const string& hierarchy, const string& cgroup)
{
  Option<Error> error = verify(hierarchy, cgroup);
  if (error.isSome()) {
    return error.get();
  }

  Result<string> hierarchyAbsPath = os::realpath(hierarchy);
  if (!hierarchyAbsPath.isSome()) {
    return Error(
        "Failed to determine canonical path of '" + hierarchy + "': " +
        (hierarchyAbsPath.isError()
         ? hierarchyAbsPath.error()
         : "No such file or directory"));
  }

  Result<string> destAbsPath = os::realpath(path::join(hierarchy, cgroup));
  if (!destAbsPath.isSome()) {
    return Error(
        "Failed to determine canonical path of '" +
        path::join(hierarchy, cgroup) + "': " +
        (destAbsPath.isError()
         ? destAbsPath.error()
         : "No such file or directory"));
  }

  char* paths[] = { const_cast<char*>(destAbsPath.get().c_str()), NULL };

  FTS* tree = fts_open(paths, FTS_NOCHDIR, NULL);
  if (tree == NULL) {
    return ErrnoError("Failed to start traversing file system");
  }

  vector<string> cgroups;

  // fts_read() returns NULL both when the traversal is complete and on
  // error; the two are told apart through errno, so it must be cleared
  // first or a stale value would be reported as a traversal failure.
  errno = 0;

  FTSENT* node;
  while ((node = fts_read(tree)) != NULL) {
    // Use post-order walk here. fts_level is the depth of the traversal,
    // numbered from -1 to N, where the file/dir was found. The traversal root
    // itself is numbered 0. fts_info includes flags for the current node.
    // FTS_DP indicates a directory being visited in postorder.
    // NOTE(review): fts(3) documents fts_info as a single value rather
    // than a bit mask, so '== FTS_DP' looks like the intended test --
    // confirm before changing.
    if (node->fts_level > 0 && node->fts_info & FTS_DP) {
      // Strip the hierarchy prefix to get the hierarchy-relative path.
      string path =
        strings::trim(node->fts_path + hierarchyAbsPath.get().length(), "/");
      cgroups.push_back(path);
    }
  }

  if (errno != 0) {
    // Capture errno before fts_close() gets a chance to overwrite it,
    // and make sure the traversal handle is not leaked.
    ErrnoError failure("Failed to read a node while traversing file system");
    fts_close(tree);
    return failure;
  }

  if (fts_close(tree) != 0) {
    return ErrnoError("Failed to stop traversing file system");
  }

  return cgroups;
}
// Delivers the given signal to every process listed for the cgroup.
// Processes that can no longer be signalled (ESRCH) are skipped; any
// other delivery failure aborts the operation with an error.
Try<Nothing> kill(
    const string& hierarchy,
    const string& cgroup,
    int signal)
{
  const Option<Error> invalid = verify(hierarchy, cgroup);
  if (invalid.isSome()) {
    return invalid.get();
  }

  Try<set<pid_t> > targets = processes(hierarchy, cgroup);
  if (targets.isError()) {
    return Error("Failed to get processes of cgroup: " + targets.error());
  }

  foreach (pid_t pid, targets.get()) {
    // If errno is set to ESRCH, it means that either a) this process already
    // terminated, or b) it's in a 'zombie' state and we can't signal it
    // anyway. In either case, ignore the error.
    if (::kill(pid, signal) == -1 && errno != ESRCH) {
      return ErrnoError(
          "Failed to send " + string(strsignal(signal)) +
          " to process " + stringify(pid));
    }
  }

  return Nothing();
}
// Reads a control file after verifying that the hierarchy, the cgroup
// and the control file all exist.
Try<string> read(
    const string& hierarchy,
    const string& cgroup,
    const string& control)
{
  const Option<Error> invalid = verify(hierarchy, cgroup, control);
  if (invalid.isNone()) {
    return internal::read(hierarchy, cgroup, control);
  }

  return invalid.get();
}
// Writes a value to a control file after verifying that the hierarchy,
// the cgroup and the control file all exist.
Try<Nothing> write(
    const string& hierarchy,
    const string& cgroup,
    const string& control,
    const string& value)
{
  const Option<Error> invalid = verify(hierarchy, cgroup, control);
  if (invalid.isNone()) {
    return internal::write(hierarchy, cgroup, control, value);
  }

  return invalid.get();
}
// Tells whether the given control file exists inside the cgroup. The
// hierarchy must be mounted and the cgroup must exist, otherwise an
// error is returned.
Try<bool> exists(
    const string& hierarchy,
    const string& cgroup,
    const string& control)
{
  const Option<Error> invalid = verify(hierarchy, cgroup);
  if (invalid.isNone()) {
    return os::exists(path::join(hierarchy, cgroup, control));
  }

  return invalid.get();
}
// Returns the IDs listed in the 'cgroup.procs' control file of the
// given cgroup.
// @param   hierarchy   Path to the hierarchy root.
// @param   cgroup      Path to the cgroup relative to the hierarchy root.
// @return  The set of process IDs if the operation succeeds.
//          Error if the control cannot be read or parsed.
Try<set<pid_t> > processes(const string& hierarchy, const string& cgroup)
{
  // Note: (from cgroups/cgroups.txt documentation)
  // cgroup.procs: list of thread group IDs in the cgroup. This list is not
  // guaranteed to be sorted or free of duplicate TGIDs, and userspace should
  // sort/uniquify the list if this property is required.
  Try<string> value = cgroups::read(hierarchy, cgroup, "cgroup.procs");
  if (value.isError()) {
    return Error("Failed to read cgroups control 'cgroup.procs': " + value.error());
  }

  // Parse the values read from the control file and insert into a set. This
  // ensures they are unique (and also sorted).
  set<pid_t> pids;
  std::istringstream ss(value.get());
  ss >> std::dec;
  while (!ss.eof()) {
    pid_t pid;
    ss >> pid;

    if (ss.fail()) {
      // Running into the end of the input (e.g. trailing whitespace)
      // also sets the fail bit; only a failure before EOF means the
      // content is malformed.
      if (!ss.eof()) {
        return Error("Failed to parse '" + value.get() + "'");
      }
    } else {
      pids.insert(pid);
    }
  }

  return pids;
}
// Moves the given pid into the cgroup by writing it to the cgroup's
// 'tasks' control file.
Try<Nothing> assign(const string& hierarchy, const string& cgroup, pid_t pid)
{
  const string value = stringify(pid);
  return cgroups::write(hierarchy, cgroup, "tasks", value);
}
namespace internal {
// Flag values for eventfd. They are only defined here when the
// platform headers have not already provided them.
#ifndef EFD_SEMAPHORE
#define EFD_SEMAPHORE (1 << 0)
#endif

#ifndef EFD_CLOEXEC
#define EFD_CLOEXEC 02000000
#endif

#ifndef EFD_NONBLOCK
#define EFD_NONBLOCK 04000
#endif

// Creates an eventfd through a raw system call. When only the older
// 'eventfd' syscall (which takes no flags) is available, the CLOEXEC
// and NONBLOCK flags are applied manually afterwards.
// @param   initval   Initial value of the eventfd counter.
// @param   flags     Bitwise OR of EFD_* flags.
// @return  The new file descriptor, or -1 if the operation fails.
static int eventfd(unsigned int initval, int flags)
{
#ifdef __NR_eventfd2
  return ::syscall(__NR_eventfd2, initval, flags);
#elif defined(__NR_eventfd)
  int fd = ::syscall(__NR_eventfd, initval);
  if (fd == -1) {
    return -1;
  }

  // Manually set CLOEXEC and NONBLOCK. The descriptor is closed on
  // failure so that it does not leak.
  if ((flags & EFD_CLOEXEC) != 0) {
    if (os::cloexec(fd).isError()) {
      os::close(fd);
      return -1;
    }
  }

  if ((flags & EFD_NONBLOCK) != 0) {
    if (os::nonblock(fd).isError()) {
      os::close(fd);
      return -1;
    }
  }

  // Return the file descriptor.
  return fd;
#else
#error "The eventfd syscall is not available."
#endif
}
// In cgroups, there is a mechanism which allows getting notifications
// about the changing status of a cgroup. It is based on Linux eventfd.
// See more information in the kernel documentation ("Notification
// API"). This function creates an eventfd and writes
// "<eventfd> <control fd> [args]" to the 'cgroup.event_control' file
// to correlate the eventfd with a type of event, so that users can
// start polling on the eventfd to get notified. This function assumes
// all the parameters are valid. The eventfd is created close-on-exec
// and non-blocking; the caller owns it and should release it with
// unregisterNotifier.
// @param   hierarchy   Path to the hierarchy root.
// @param   cgroup      Path to the cgroup relative to the hierarchy root.
// @param   control     Name of the control file.
// @param   args        Control specific arguments.
// @return  The eventfd if the operation succeeds.
//          Error if the operation fails.
static Try<int> registerNotifier(
    const string& hierarchy,
    const string& cgroup,
    const string& control,
    const Option<string>& args = None())
{
  int efd = internal::eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
  if (efd < 0) {
    return ErrnoError("Failed to create an eventfd");
  }

  // Open the control file.
  string path = path::join(hierarchy, cgroup, control);
  Try<int> cfd = os::open(path, O_RDWR);
  if (cfd.isError()) {
    os::close(efd); // Do not leak the eventfd.
    return Error("Failed to open '" + path + "': " + cfd.error());
  }

  // Write the event control file (cgroup.event_control).
  std::ostringstream out;
  out << std::dec << efd << " " << cfd.get();
  if (args.isSome()) {
    out << " " << args.get();
  }

  Try<Nothing> write = internal::write(
      hierarchy, cgroup, "cgroup.event_control", out.str());
  if (write.isError()) {
    os::close(efd);
    os::close(cfd.get());
    return Error(
        "Failed to write control 'cgroup.event_control': " + write.error());
  }

  // The control file descriptor was only needed for the registration.
  os::close(cfd.get());

  return efd;
}
// Releases a notifier by closing its eventfd.
// @param   fd      The eventfd returned by registerNotifier.
// @return  Some if the operation succeeds.
//          Error if the operation fails.
static Try<Nothing> unregisterNotifier(int fd)
{
  Try<Nothing> closed = os::close(fd);
  return closed;
}
// The process listening on event notifier. This class is invisible to users.
// It registers an eventfd for the given control, performs a single
// nonblocking read on it and completes its promise with the 8-byte
// value read (or fails it), after which the process terminates.
// NOTE(review): the promise completion and termination calls below
// were reconstructed from the surrounding comments and error messages
// -- confirm against the upstream source.
class EventListener : public Process<EventListener>
{
public:
  EventListener(const string& _hierarchy,
                const string& _cgroup,
                const string& _control,
                const Option<string>& _args)
    : hierarchy(_hierarchy),
      cgroup(_cgroup),
      control(_control),
      args(_args),
      data(0) {}

  virtual ~EventListener() {}

  // Future that is completed with the value read from the eventfd.
  Future<uint64_t> future() { return promise.future(); }

protected:
  virtual void initialize()
  {
    // Stop the listener if no one cares. Note that here we explicitly specify
    // the type of the terminate function because it is an overloaded function.
    // The compiler complains if we do not do it.
    promise.future().onDiscarded(lambda::bind(
        static_cast<void (*)(const UPID&, bool)>(terminate), self(), true));

    // Register an eventfd "notifier" for the given control.
    Try<int> fd = internal::registerNotifier(hierarchy, cgroup, control, args);
    if (fd.isError()) {
      promise.fail("Failed to register notification eventfd: " + fd.error());
      terminate(self());
      return;
    }

    // Remember the opened event file descriptor.
    eventfd = fd.get();

    // Perform nonblocking read on the event file. The nonblocking read will
    // start polling on the event file until it becomes readable. If we can
    // successfully read 8 bytes (sizeof uint64_t) from the event file, it
    // indicates an event has occurred.
    reading = io::read(eventfd.get(), &data, sizeof(data));
    reading.onAny(defer(self(), &EventListener::notified, lambda::_1));
  }

  virtual void finalize()
  {
    // Discard the nonblocking read.
    reading.discard();

    // Unregister the eventfd if needed.
    if (eventfd.isSome()) {
      Try<Nothing> unregister = internal::unregisterNotifier(eventfd.get());
      if (unregister.isError()) {
        LOG(ERROR) << "Failed to unregistering eventfd: " << unregister.error();
      }
    }
  }

private:
  // This function is called when the nonblocking read on the eventfd has
  // result, either because the event has happened, or an error has occurred.
  void notified(const Future<size_t>&)
  {
    // Ignore this function if the promise is no longer pending.
    if (!promise.future().isPending()) {
      return;
    }

    // Since the future reading can only be discarded when the promise is no
    // longer pending, we shall never see a discarded reading here because of
    // the check in the beginning of the function.
    if (reading.isFailed()) {
      promise.fail("Failed to read eventfd: " + reading.failure());
    } else {
      if (reading.get() == sizeof(data)) {
        promise.set(data);
      } else {
        promise.fail("Read less than expected");
      }
    }

    // The single notification has been delivered; stop the process.
    terminate(self());
  }

  const string hierarchy;
  const string cgroup;
  const string control;
  const Option<string> args;

  Promise<uint64_t> promise;
  Future<size_t> reading;
  Option<int> eventfd;  // The eventfd if opened.
  uint64_t data;        // The data read from the eventfd.
};
} // namespace internal {
// Starts listening for an event on the given control of a cgroup. The
// arguments are verified first; an invalid hierarchy, cgroup or
// control yields a failed future. Otherwise an EventListener process is
// spawned (the second argument to spawn is 'true') and its future is
// returned.
Future<uint64_t> listen(
    const string& hierarchy,
    const string& cgroup,
    const string& control,
    const Option<string>& args)
{
  const Option<Error> invalid = verify(hierarchy, cgroup, control);
  if (invalid.isSome()) {
    return Failure(invalid.get());
  }

  internal::EventListener* process =
    new internal::EventListener(hierarchy, cgroup, control, args);

  // Grab the future before spawning the process.
  Future<uint64_t> notification = process->future();
  spawn(process, true);
  return notification;
}
namespace internal {
// The process that freezes or thaws the cgroup.
//
// Constructor arguments:
//   _hierarchy, _cgroup: identify the cgroup whose 'freezer.state' control
//                        is read and written.
//   _action:   "FREEZE" or "THAW"; any other value trips a CHECK in
//              initialize().
//   _interval: delay between successive state polls; must be non-negative
//              (CHECKed in initialize()).
//   _retries:  attempt threshold used by watchFrozen(); defaults to
//              FREEZE_RETRIES.
// NOTE(review): only 'hierarchy' and 'retries' appear in the initializer
// list in this copy; the initializers of the other const members, and the
// class braces / access specifiers, look to have been lost.
class Freezer : public Process<Freezer>
Freezer(const string& _hierarchy,
const string& _cgroup,
const string& _action,
const Duration& _interval,
unsigned int _retries = FREEZE_RETRIES)
: hierarchy(_hierarchy),
retries(_retries) {}
virtual ~Freezer() {}
// Return a future indicating the state of the freezer.
Future<bool> future() { return promise.future(); }
// Startup hook: validates the constructor arguments and dispatches on the
// requested action.
// NOTE(review): the beginning of the statement that ends in
// "terminate), self(), true));" and the bodies of the two action branches
// are missing from this copy; restore them from version control.
virtual void initialize()
// Stop the process if no one cares.
// The static_cast selects the (const UPID&, bool) overload of terminate.
static_cast<void (*)(const UPID&, bool)>(terminate), self(), true));
// A negative polling interval is treated as a programming error.
CHECK(interval >= Seconds(0));
// Start the action.
CHECK(action == "FREEZE" || action == "THAW");
if (action == "FREEZE") {
} else if (action == "THAW") {
// Requests a freeze by writing "FROZEN" to the cgroup's 'freezer.state'
// control.
// NOTE(review): the statement that should consume the error message, and
// the body of the success ('else') branch, are missing from this copy.
void freeze()
LOG(INFO) << "Trying to freeze cgroup " << path::join(hierarchy, cgroup);
Try<Nothing> write = internal::write(
hierarchy, cgroup, "freezer.state", "FROZEN");
if (write.isError()) {"Failed to write control 'freezer.state': " + write.error());
} else {
// Requests a thaw by writing "THAWED" to the cgroup's 'freezer.state'
// control.
// NOTE(review): the statement that should consume the error message, and
// the body of the success ('else') branch, are missing from this copy.
void thaw()
LOG(INFO) << "Trying to thaw cgroup " << path::join(hierarchy, cgroup);
Try<Nothing> write = internal::write(
hierarchy, cgroup, "freezer.state", "THAWED");
if (write.isError()) {"Failed to write control 'freezer.state': " + write.error());
} else {
// Polls 'freezer.state' and reacts to the state it reports:
//   FROZEN   - logs success.
//   FREEZING - sends SIGCONT to stopped/traced tasks, re-writes "FROZEN"
//              once 'attempt' exceeds 'retries', and schedules another poll
//              after 'interval'.
//   other    - aborts via LOG(FATAL).
// 'attempt' is zero-based, which is why the log messages print attempt + 1
// and retries + 1.
// NOTE(review): several statements are truncated in this copy (every line
// that begins with a bare string literal has lost the call that should
// consume it, and block-closing braces / early exits are absent), so the
// exact nesting of the steps below cannot be confirmed from here.
void watchFrozen(unsigned int attempt = 0)
Try<string> state = internal::read(hierarchy, cgroup, "freezer.state");
if (state.isError()) {"Failed to read control 'freezer.state': " + state.error());
if (strings::trim(state.get()) == "FROZEN") {
LOG(INFO) << "Successfully froze cgroup " << path::join(hierarchy, cgroup)
<< " after " << attempt + 1 << " attempts";
} else if (strings::trim(state.get()) == "FREEZING") {
// The freezer.state is in FREEZING state. This is because not all the
// processes in the given cgroup can be frozen at the moment. The main
// cause is that some of the processes are in stopped/traced state ('T'
// state shown in ps command). It is likely that the freezer.state keeps
// in FREEZING state if these stopped/traced processes are not resumed.
// Therefore, here we send SIGCONT to those stopped/traced processes to
// make sure that the freezer can finish.
// TODO(jieyu): This code can be removed in the future as the newer
// version of the kernel solves this problem (e.g. Linux-3.2.0).
Try<set<pid_t> > pids = processes(hierarchy, cgroup);
if (pids.isError()) {"Failed to get processes of cgroup: " + pids.error());
// It appears possible for processes to go away while the cgroup
// is in the FREEZING state. We ignore such processes.
// See:
foreach (pid_t pid, pids.get()) {
Result<proc::ProcessStatus> status = proc::status(pid);
// A missing status covers both a lookup error and a pid that has vanished.
if (!status.isSome()) {
<< "Failed to get process status for pid " << pid << ": "
<< (status.isError() ? status.error() : "pid does not exist");
// Check whether the process is in stopped/traced state.
if (status.get().state == 'T') {
// Send a SIGCONT signal to the process.
if (::kill(pid, SIGCONT) == -1) {
"Failed to send SIGCONT to process " + stringify(pid) +
": " + string(strerror(errno)));
// Past the retry threshold: warn and write "FROZEN" again.
if (attempt > retries) {
LOG(WARNING) << "Unable to freeze " << path::join(hierarchy, cgroup)
<< " within " << retries + 1 << " attempts";
// Retry the freezing operation.
Try<Nothing> write = internal::write(
hierarchy, cgroup, "freezer.state", "FROZEN");
if (write.isError()) {
"Failed to write control 'freezer.state': " + write.error());
// Not done yet, keep watching (and possibly retrying).
delay(interval, self(), &Freezer::watchFrozen, attempt + 1);
} else {
LOG(FATAL) << "Unexpected state: " << strings::trim(state.get())
<< " of cgroup " << path::join(hierarchy, cgroup);
// Polls 'freezer.state' and reacts to the state it reports:
//   THAWED - logs success.
//   FROZEN - schedules another poll after 'interval'.
//   other  - aborts via LOG(FATAL).
// NOTE(review): the statement that should consume the read-error message
// and the completion statements of the THAWED branch are missing from this
// copy; restore them from version control.
void watchThawed()
Try<string> state = internal::read(hierarchy, cgroup, "freezer.state");
if (state.isError()) {"Failed to read control 'freezer.state': " + state.error());
if (strings::trim(state.get()) == "THAWED") {
LOG(INFO) << "Successfully thawed " << path::join(hierarchy, cgroup);
} else if (strings::trim(state.get()) == "FROZEN") {
// Not done yet, keep watching.
delay(interval, self(), &Freezer::watchThawed);
} else {
LOG(FATAL) << "Unexpected state: " << strings::trim(state.get())
<< " of cgroup " << path::join(hierarchy, cgroup);
// The cgroup whose 'freezer.state' control is read and written.
const string hierarchy;
const string cgroup;
// Requested operation: "FREEZE" or "THAW" (CHECKed in initialize()).
const string action;
// Delay between successive polls of 'freezer.state'.
const Duration interval;
// Attempt threshold after which watchFrozen() re-writes "FROZEN".
const unsigned int retries;
// Backs the future returned by future().
Promise<bool> promise;
} // namespace internal {
// Freezes all tasks in the given cgroup.
//
// Fails immediately if the cgroup/control cannot be verified, if 'interval'
// is negative, or if 'freezer.state' cannot be read. Returns true right
// away when the cgroup already reports FROZEN; otherwise the work is handed
// to an internal::Freezer process and its future is returned.
//
//   hierarchy: path of the mounted cgroups hierarchy.
//   cgroup:    cgroup within the hierarchy.
//   interval:  delay between polls of the freezer state (non-negative).
//   retries:   attempt threshold forwarded to the Freezer.
Future<bool> freeze(
    const string& hierarchy,
    const string& cgroup,
    const Duration& interval,
    unsigned int retries)
{
  Option<Error> error = verify(hierarchy, cgroup, "freezer.state");
  if (error.isSome()) {
    return Failure(error.get());
  }

  if (interval < Seconds(0)) {
    return Failure("Interval should be non-negative");
  }

  // Check the current freezer state.
  Try<string> state = internal::read(hierarchy, cgroup, "freezer.state");
  if (state.isError()) {
    return Failure(
        "Failed to read control 'freezer.state': " + state.error());
  } else if (strings::trim(state.get()) == "FROZEN") {
    // Immediately return success.
    return true;
  }

  internal::Freezer* freezer =
    new internal::Freezer(hierarchy, cgroup, "FREEZE", interval, retries);

  // Take the future before spawning: the freezer is spawned as a managed
  // process, so libprocess owns it from then on.
  Future<bool> future = freezer->future();
  spawn(freezer, true);
  return future;
}
// Thaws all tasks in the given cgroup.
//
// Fails immediately if the cgroup/control cannot be verified, if 'interval'
// is negative, or if 'freezer.state' cannot be read. Returns true right
// away when the cgroup already reports THAWED; otherwise the work is handed
// to an internal::Freezer process and its future is returned.
//
//   hierarchy: path of the mounted cgroups hierarchy.
//   cgroup:    cgroup within the hierarchy.
//   interval:  delay between polls of the freezer state (non-negative).
Future<bool> thaw(
    const string& hierarchy,
    const string& cgroup,
    const Duration& interval)
{
  Option<Error> error = verify(hierarchy, cgroup, "freezer.state");
  if (error.isSome()) {
    return Failure(error.get());
  }

  if (interval < Seconds(0)) {
    return Failure("Interval should be non-negative");
  }

  // Check the current freezer state.
  Try<string> state = internal::read(hierarchy, cgroup, "freezer.state");
  if (state.isError()) {
    return Failure(
        "Failed to read control 'freezer.state': " + state.error());
  } else if (strings::trim(state.get()) == "THAWED") {
    // Immediately return success.
    return true;
  }

  internal::Freezer* freezer =
    new internal::Freezer(hierarchy, cgroup, "THAW", interval);

  // Take the future before spawning: the freezer is spawned as a managed
  // process, so libprocess owns it from then on.
  Future<bool> future = freezer->future();
  spawn(freezer, true);
  return future;
}
namespace internal {
// The process used to wait for a cgroup to become empty (no task in it).
//
// Constructor arguments:
//   _hierarchy, _cgroup: identify the cgroup whose task list is polled.
//   _interval: delay between successive checks; must be non-negative
//              (CHECKed in initialize()).
//   _retries:  attempt threshold used by check(); defaults to
//              EMPTY_WATCHER_RETRIES.
// NOTE(review): only 'hierarchy' and 'retries' appear in the initializer
// list in this copy; the initializers of the other const members, and the
// class braces / access specifiers, look to have been lost.
class EmptyWatcher: public Process<EmptyWatcher>
EmptyWatcher(const string& _hierarchy,
const string& _cgroup,
const Duration& _interval,
unsigned int _retries = EMPTY_WATCHER_RETRIES)
: hierarchy(_hierarchy),
retries(_retries) {}
virtual ~EmptyWatcher() {}
// Return a future indicating the state of the watcher.
// There are three outcomes:
// 1. true: the cgroup became empty.
// 2. false: the cgroup did not become empty within the retry limit.
// 3. error: invalid arguments, or an unexpected error occurred.
Future<bool> future() { return promise.future(); }
// Startup hook: validates the polling interval.
// NOTE(review): the beginning of the statement that ends in
// "terminate), self(), true));" is missing from this copy, and no call that
// starts the first check is visible; restore from version control.
virtual void initialize()
// Stop when no one cares.
// The static_cast selects the (const UPID&, bool) overload of terminate.
static_cast<void (*)(const UPID&, bool)>(terminate), self(), true));
// A negative polling interval is treated as a programming error.
CHECK(interval >= Seconds(0));
// Reads the cgroup's task list and distinguishes an empty cgroup from a
// non-empty one; for the non-empty case it compares 'attempt' with
// 'retries' and schedules another check after 'interval'.
// NOTE(review): the statement that should consume the error message and the
// bodies of the "empty" and "retries exceeded" branches are missing from
// this copy, so the outcomes they produce cannot be confirmed from here.
void check(unsigned int attempt = 0)
Try<set<pid_t> > pids = processes(hierarchy, cgroup);
if (pids.isError()) {"Failed to get processes of cgroup: " + pids.error());
if (pids.get().empty()) {
} else {
if (attempt > retries) {
// Re-check needed.
delay(interval, self(), &EmptyWatcher::check, attempt + 1);
// The cgroup whose task list is polled.
const string hierarchy;
const string cgroup;
// Delay between successive checks.
const Duration interval;
// Attempt threshold compared against in check().
const unsigned int retries;
// Backs the future returned by future().
Promise<bool> promise;
// The process used to atomically kill all tasks in a cgroup.
//
// Constructor arguments:
//   _hierarchy, _cgroup: identify the cgroup whose tasks are killed.
//   _interval: polling interval forwarded to the freeze/thaw/empty steps;
//              must be non-negative (CHECKed in initialize()).
// NOTE(review): the initializer for 'cgroup', and the class braces / access
// specifiers, are not present in this copy of the file.
class TasksKiller : public Process<TasksKiller>
TasksKiller(const string& _hierarchy,
const string& _cgroup,
const Duration& _interval)
: hierarchy(_hierarchy),
interval(_interval) {}
virtual ~TasksKiller() {}
// Return a future indicating the state of the killer.
Future<bool> future() { return promise.future(); }
// Startup hook: validates the polling interval.
// NOTE(review): the beginning of the statement that ends in
// "terminate), self(), true));" is missing from this copy, and no call that
// starts the kill chain is visible; restore from version control.
virtual void initialize()
// Stop when no one cares.
// The static_cast selects the (const UPID&, bool) overload of terminate.
static_cast<void(*)(const UPID&, bool)>(terminate), self(), true));
// A negative polling interval is treated as a programming error.
CHECK(interval >= Seconds(0));
// Cleanup hook of the killer.
// NOTE(review): the body of the 'if' below (presumably discarding 'chain' —
// confirm against version control) is missing from this copy.
virtual void finalize()
// Cancel the chain of operations if the user discards the future.
if (promise.future().isDiscarded()) {
// The sequence of operations to kill a cgroup is as follows:
// SIGSTOP -> SIGKILL -> empty -> freeze -> SIGKILL -> thaw -> empty
// This process is repeated until the cgroup becomes empty.
//
// The whole sequence is stored in 'chain' so that it can be referred to
// later; finished() is invoked with the result of the final 'empty' step
// regardless of how the chain ends.
void killTasks() {
  // Chain together the steps needed to kill the tasks. Note that we
  // ignore the return values of freeze, kill, and thaw because,
  // provided there are no errors, we'll just retry the chain as
  // long as tasks still exist.
  chain = kill(SIGSTOP)                        // Send stop signal to all tasks.
    .then(defer(self(), &Self::kill, SIGKILL)) // Now send kill signal.
    .then(defer(self(), &Self::empty))         // Wait until cgroup is empty.
    .then(defer(self(), &Self::freeze))        // Freeze cgroup.
    .then(defer(self(), &Self::kill, SIGKILL)) // Send kill signal to any remaining tasks.
    .then(defer(self(), &Self::thaw))          // Thaw cgroup to deliver signals.
    .then(defer(self(), &Self::empty));        // Wait until cgroup is empty.

  chain.onAny(defer(self(), &Self::finished, lambda::_1));
}
// Chain step: freezes this killer's cgroup via cgroups::freeze, polling at
// 'interval'.
Future<bool> freeze()
{
  return cgroups::freeze(hierarchy, cgroup, interval);
}
// Chain step: sends 'signal' to the tasks of this killer's cgroup via
// cgroups::kill. Returns Nothing on success, or a Failure carrying the
// error reported by cgroups::kill.
Future<Nothing> kill(const int signal)
{
  // Named 'result' rather than 'kill' so the local does not shadow this
  // method.
  Try<Nothing> result = cgroups::kill(hierarchy, cgroup, signal);
  if (result.isError()) {
    return Failure(result.error());
  }

  return Nothing();
}
// Chain step: thaws this killer's cgroup via cgroups::thaw, polling at
// 'interval'.
Future<bool> thaw()
{
  return cgroups::thaw(hierarchy, cgroup, interval);
}
// Chain step: spawns an EmptyWatcher for this killer's cgroup and returns
// its future (see EmptyWatcher::future() for the possible outcomes).
Future<bool> empty()
{
  EmptyWatcher* watcher = new EmptyWatcher(hierarchy, cgroup, interval);

  // Take the future before spawning: the watcher is spawned as a managed
  // process, so libprocess owns it from then on.
  Future<bool> future = watcher->future();
  spawn(watcher, true);
  return future;
}
// Completion callback of the kill chain; 'empty' is the result of the final
// "wait until empty" step. It must be either failed or ready (CHECKed).
// Three cases are distinguished: the chain failed, the cgroup is empty, or
// the cgroup is still not empty.
// NOTE(review): the bodies of all three branches are missing from this
// copy (the first one is reduced to a stray ';'); restore them from version
// control.
void finished(const Future<bool>& empty)
CHECK(!empty.isPending() && !empty.isDiscarded());
if (empty.isFailed()) {;
} else if (empty.get()) {
} else {
// The cgroup was not empty after the retry limit.
// We need to re-attempt the freeze/kill/thaw/watch chain.
// The cgroup whose tasks are being killed.
const string hierarchy;
const string cgroup;
// Polling interval forwarded to the freeze, thaw and empty steps.
const Duration interval;
// Backs the future returned by future().
Promise<bool> promise;
Future<bool> chain; // Used to discard the "chain" of operations.
// The process used to destroy a cgroup.
//
// Constructor arguments:
//   _hierarchy: the hierarchy containing the cgroups.
//   _cgroups:   the cgroups to destroy; a TasksKiller is spawned for each
//               in initialize(), and remove() removes them in the order
//               given.
//   _interval:  polling interval forwarded to each TasksKiller; must be
//               non-negative (CHECKed in initialize()).
// NOTE(review): the initializer for 'cgroups', and the class braces /
// access specifiers, are not present in this copy of the file.
class Destroyer : public Process<Destroyer>
Destroyer(const string& _hierarchy,
const vector<string>& _cgroups,
const Duration& _interval)
: hierarchy(_hierarchy),
interval(_interval) {}
virtual ~Destroyer() {}
// Return a future indicating the state of the destroyer.
Future<bool> future() { return promise.future(); }
// Startup hook: validates the interval and spawns one managed TasksKiller
// per cgroup; killed() is registered as the completion callback.
// NOTE(review): this copy is missing the beginning of the statement that
// ends in "terminate), self(), true));", the statement that records each
// killer's future, and the expression on which ".onAny(...)" is invoked;
// restore them from version control.
virtual void initialize()
// Stop when no one cares.
// The static_cast selects the (const UPID&, bool) overload of terminate.
static_cast<void(*)(const UPID&, bool)>(terminate), self(), true));
// A negative polling interval is treated as a programming error.
CHECK(interval >= Seconds(0));
// Kill tasks in the given cgroups in parallel. Use collect mechanism to
// wait until all kill processes finish.
foreach (const string& cgroup, cgroups) {
internal::TasksKiller* killer =
new internal::TasksKiller(hierarchy, cgroup, interval);
spawn(killer, true);
.onAny(defer(self(), &Destroyer::killed, lambda::_1));
// Cleanup hook of the destroyer.
// NOTE(review): the body of the 'if' below (presumably discarding the
// outstanding killer futures — confirm against version control) is missing
// from this copy.
virtual void finalize()
// Cancel the operation if the user discards the future.
if (promise.future().isDiscarded()) {
// Completion callback for the collected TasksKiller results. 'kill' must be
// either ready or failed (CHECKed).
// NOTE(review): the body of the "ready" branch and the statement that
// should consume the failure message are missing from this copy; restore
// them from version control.
void killed(const Future<list<bool> >& kill)
CHECK(!kill.isPending() && !kill.isDiscarded());
if (kill.isReady()) {
} else if (kill.isFailed()) {"Failed to kill tasks in nested cgroups: " + kill.failure());
// Removes every cgroup in 'cgroups', in the order stored, via
// internal::remove().
// NOTE(review): the statement that should consume the error message, and
// whatever completes the operation after the loop, are missing from this
// copy; restore them from version control.
void remove()
foreach (const string& cgroup, cgroups) {
Try<Nothing> remove = internal::remove(hierarchy, cgroup);
if (remove.isError()) {
"Failed to remove cgroup '" + cgroup + "': " + remove.error());
// The hierarchy containing the cgroups to destroy.
const string hierarchy;
// The cgroups to destroy, iterated in this order by initialize() and
// remove().
const vector<string> cgroups;
// Polling interval forwarded to each TasksKiller.
const Duration interval;
// Backs the future returned by future().
Promise<bool> promise;
// The killer processes used to atomically kill tasks in each cgroup.
list<Future<bool> > killers;
} // namespace internal {
// Destroys 'cgroup' together with the cgroups returned for it by
// cgroups::get().
//
// Fails immediately if 'interval' is negative or the nested cgroups cannot
// be listed. When verify() accepts 'freezer.state' for the cgroup, the work
// is delegated to a managed internal::Destroyer process; otherwise each
// candidate is removed directly with cgroups::remove().
// NOTE(review): the body of the 'if (cgroup != "/")' branch and all closing
// braces are missing from this copy; restore them from version control.
Future<bool> destroy(
const string& hierarchy,
const string& cgroup,
const Duration& interval)
if (interval < Seconds(0)) {
return Failure("Interval should be non-negative");
// Construct the vector of cgroups to destroy.
Try<vector<string> > cgroups = cgroups::get(hierarchy, cgroup);
if (cgroups.isError()) {
return Failure(
"Failed to get nested cgroups: " + cgroups.error());
vector<string> candidates = cgroups.get();
if (cgroup != "/") {
// Nothing to destroy.
if (candidates.empty()) {
return true;
// If the freezer subsystem is available, destroy the cgroups.
Option<Error> error = verify(hierarchy, cgroup, "freezer.state");
if (error.isNone()) {
internal::Destroyer* destroyer =
new internal::Destroyer(hierarchy, candidates, interval);
Future<bool> future = destroyer->future();
spawn(destroyer, true);
return future;
} else {
// Otherwise, attempt to remove the cgroups in bottom-up fashion.
// The loop variable shadows the 'cgroup' parameter.
foreach (const std::string& cgroup, candidates) {
Try<Nothing> remove = cgroups::remove(hierarchy, cgroup);
if (remove.isError()) {
return Failure(remove.error());
return true;
// Forward declaration.
Future<bool> _cleanup(const string& hierarchy);


// Cleans up the given hierarchy.
//
// If the hierarchy is mounted, all of its cgroups are destroyed first and
// the remaining work is continued in _cleanup(). If it is not mounted, only
// the directory is removed, when it still exists.
//
// Returns true on success, or a Failure carrying the error of whichever
// step failed (mount check, destroy, or directory removal).
Future<bool> cleanup(const string& hierarchy)
{
  Try<bool> mounted = cgroups::mounted(hierarchy);
  if (mounted.isError()) {
    return Failure(mounted.error());
  }

  if (mounted.get()) {
    // Destroy all cgroups and then cleanup.
    return destroy(hierarchy)
      .then(lambda::bind(_cleanup, hierarchy));
  } else {
    // Remove the directory if it still exists.
    if (os::exists(hierarchy)) {
      Try<Nothing> rmdir = os::rmdir(hierarchy);
      if (rmdir.isError()) {
        return Failure(rmdir.error());
      }
    }
  }

  return true;
}
// Continuation of cleanup(): unmounts the hierarchy and then removes its
// directory if it still exists.
//
// Returns true on success, or a Failure carrying the unmount or rmdir
// error.
Future<bool> _cleanup(const string& hierarchy)
{
  // Remove the hierarchy.
  Try<Nothing> unmount = cgroups::unmount(hierarchy);
  if (unmount.isError()) {
    return Failure(unmount.error());
  }

  // Remove the directory if it still exists.
  if (os::exists(hierarchy)) {
    Try<Nothing> rmdir = os::rmdir(hierarchy);
    if (rmdir.isError()) {
      return Failure(rmdir.error());
    }
  }

  return true;
}
// Parses a "name value" style control file of a cgroup into a map.
//
// Reads 'file' from the given cgroup, splits the contents on newlines and
// extracts one whitespace-separated (name, unsigned 64-bit value) pair per
// line; a later occurrence of a name overwrites an earlier one.
//
// Returns the map, or an Error if the file cannot be read or a line does
// not match the expected format.
// NOTE(review): the statement that skips empty lines and the condition of
// the garbled 'if ( {' below (presumably a stream failure check — confirm)
// are missing from this copy; restore them from version control.
Try<hashmap<string, uint64_t> > stat(
const string& hierarchy,
const string& cgroup,
const string& file)
Try<std::string> contents = cgroups::read(hierarchy, cgroup, file);
if (contents.isError()) {
return Error(contents.error());
hashmap<string, uint64_t> result;
foreach (const string& line, strings::split(contents.get(), "\n")) {
// Skip empty lines.
if (strings::trim(line).empty()) {
string name;
uint64_t value;
// Expected line format: "%s %llu".
std::istringstream stream(line);
stream >> name >> value;
if ( {
return Error("Unexpected line format in " + file + ": " + line);
result[name] = value;
return result;
namespace cpu {
// Sets the CPU shares of the given cgroup by delegating to cgroups::write.
// NOTE(review): the argument list of the cgroups::write call (presumably
// the 'cpu.shares' control and the stringified value — confirm) is missing
// from this copy; restore it from version control.
Try<Nothing> shares(
const string& hierarchy,
const string& cgroup,
size_t shares)
return cgroups::write(
// Sets the CFS period of the given cgroup by delegating to cgroups::write.
// NOTE(review): the argument list of the cgroups::write call (presumably
// the 'cpu.cfs_period_us' control and the duration in microseconds —
// confirm) is missing from this copy; restore it from version control.
Try<Nothing> cfs_period_us(
const string& hierarchy,
const string& cgroup,
const Duration& duration)
return cgroups::write(
// Reads the CFS quota of the given cgroup.
//
// The raw contents of 'cpu.cfs_quota_us' are trimmed, suffixed with "us"
// and parsed as a Duration. Returns an Error if the control cannot be read;
// otherwise returns whatever Duration::parse yields for that string.
Try<Duration> cfs_quota_us(
    const string& hierarchy,
    const string& cgroup)
{
  Try<string> read = cgroups::read(hierarchy, cgroup, "cpu.cfs_quota_us");
  if (read.isError()) {
    return Error(read.error());
  }

  return Duration::parse(strings::trim(read.get()) + "us");
}
// Sets the CFS quota of the given cgroup by delegating to cgroups::write.
// NOTE(review): the argument list of the cgroups::write call (presumably
// the 'cpu.cfs_quota_us' control and the duration in microseconds —
// confirm) is missing from this copy; restore it from version control.
Try<Nothing> cfs_quota_us(
const string& hierarchy,
const string& cgroup,
const Duration& duration)
return cgroups::write(
} // namespace cpu {
namespace memory {
// Reads the memory limit of the given cgroup.
//
// The raw contents of 'memory.limit_in_bytes' are trimmed, suffixed with
// "B" and parsed as Bytes. Returns an Error if the control cannot be read;
// otherwise returns whatever Bytes::parse yields for that string.
Try<Bytes> limit_in_bytes(const string& hierarchy, const string& cgroup)
{
  Try<string> read = cgroups::read(
      hierarchy, cgroup, "memory.limit_in_bytes");

  if (read.isError()) {
    return Error(read.error());
  }

  return Bytes::parse(strings::trim(read.get()) + "B");
}
// Sets the memory limit of the given cgroup by writing the limit, expressed
// as a plain byte count, to 'memory.limit_in_bytes'. Returns the result of
// cgroups::write.
Try<Nothing> limit_in_bytes(
    const string& hierarchy,
    const string& cgroup,
    const Bytes& limit)
{
  return cgroups::write(
      hierarchy, cgroup, "memory.limit_in_bytes", stringify(limit.bytes()));
}
// Reads the soft memory limit of the given cgroup.
//
// The raw contents of 'memory.soft_limit_in_bytes' are trimmed, suffixed
// with "B" and parsed as Bytes. Returns an Error if the control cannot be
// read; otherwise returns whatever Bytes::parse yields for that string.
Try<Bytes> soft_limit_in_bytes(const string& hierarchy, const string& cgroup)
{
  Try<string> read = cgroups::read(
      hierarchy, cgroup, "memory.soft_limit_in_bytes");

  if (read.isError()) {
    return Error(read.error());
  }

  return Bytes::parse(strings::trim(read.get()) + "B");
}
// Sets the soft memory limit of the given cgroup by delegating to
// cgroups::write.
// NOTE(review): the argument list of the cgroups::write call (presumably
// the 'memory.soft_limit_in_bytes' control and the stringified byte count,
// mirroring the hard-limit setter — confirm) is missing from this copy;
// restore it from version control.
Try<Nothing> soft_limit_in_bytes(
const string& hierarchy,
const string& cgroup,
const Bytes& limit)
return cgroups::write(
// Reads the current memory usage of the given cgroup.
//
// The raw contents of 'memory.usage_in_bytes' are trimmed, suffixed with
// "B" and parsed as Bytes. Returns an Error if the control cannot be read;
// otherwise returns whatever Bytes::parse yields for that string.
Try<Bytes> usage_in_bytes(const string& hierarchy, const string& cgroup)
{
  Try<string> read = cgroups::read(
      hierarchy, cgroup, "memory.usage_in_bytes");

  if (read.isError()) {
    return Error(read.error());
  }

  return Bytes::parse(strings::trim(read.get()) + "B");
}
// Reads the recorded maximum memory usage of the given cgroup.
//
// The raw contents of 'memory.max_usage_in_bytes' are trimmed, suffixed
// with "B" and parsed as Bytes. Returns an Error if the control cannot be
// read; otherwise returns whatever Bytes::parse yields for that string.
Try<Bytes> max_usage_in_bytes(const string& hierarchy, const string& cgroup)
{
  Try<string> read = cgroups::read(
      hierarchy, cgroup, "memory.max_usage_in_bytes");

  if (read.isError()) {
    return Error(read.error());
  }

  return Bytes::parse(strings::trim(read.get()) + "B");
}
} // namespace memory {
} // namespace cgroups {