| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| #include <fts.h> |
| |
| #include "linux/cgroups2.hpp" |
| |
| #include <iterator> |
| #include <ostream> |
| #include <set> |
| #include <string> |
| #include <vector> |
| |
| #include <process/after.hpp> |
| #include <process/loop.hpp> |
| #include <process/pid.hpp> |
| |
| #include <stout/adaptor.hpp> |
| #include <stout/none.hpp> |
| #include <stout/numify.hpp> |
| #include <stout/os.hpp> |
| #include <stout/path.hpp> |
| #include <stout/unreachable.hpp> |
| #include <stout/stringify.hpp> |
| #include <stout/try.hpp> |
| |
| #include "linux/ebpf.hpp" |
| #include "linux/fs.hpp" |
| |
| using std::ostream; |
| using std::set; |
| using std::string; |
| using std::vector; |
| |
| using process::Break; |
| using process::Continue; |
| using process::ControlFlow; |
| using process::Failure; |
| using process::Future; |
| using process::loop; |
| |
| using mesos::internal::fs::MountTable; |
| |
| namespace cgroups2 { |
| |
| // Name of the cgroups v2 filesystem as found in /proc/filesystems. |
| const string FILE_SYSTEM = "cgroup2"; |
| |
| // Mount point for the cgroups2 file system. |
| const string MOUNT_POINT = "/sys/fs/cgroup"; |
| |
| |
| template <typename T> |
| Try<T> read(const string& cgroup, const string& control); |
| |
| |
| template <> |
| Try<string> read(const string& cgroup, const string& control) |
| { |
| return os::read(path::join(cgroups2::path(cgroup), control)); |
| } |
| |
| |
| template <> |
| Try<uint64_t> read(const string& cgroup, const string& control) |
| { |
| Try<string> content = read<string>(cgroup, control); |
| if (content.isError()) { |
| return Error(content.error()); |
| } |
| |
| return numify<uint64_t>(strings::trim(*content)); |
| } |
| |
| |
| Try<Nothing> write( |
| const string& cgroup, |
| const string& control, |
| const string& value) |
| { |
| return os::write(path::join(cgroups2::path(cgroup), control), value); |
| } |
| |
| |
| Try<Nothing> write( |
| const string& cgroup, |
| const string& control, |
| const uint64_t& value) |
| { |
| return write(cgroup, control, stringify(value)); |
| } |
| |
| namespace control { |
| |
| // Interface files found in all cgroups. |
| const std::string CONTROLLERS = "cgroup.controllers"; |
| const std::string EVENTS = "cgroup.events"; |
| const std::string FREEZE = "cgroup.freeze"; |
| const std::string IRQ_PRESSURE = "irq.pressure"; |
| const std::string KILL = "cgroup.kill"; |
| const std::string MAX_DEPTH = "cgroup.max.depth"; |
| const std::string MAX_DESCENDANTS = "cgroup.max.descendants"; |
| const std::string PRESSURE = "cgroup.pressure"; |
| const std::string PROCESSES = "cgroup.procs"; |
| const std::string STATS = "cgroup.stat"; |
| const std::string SUBTREE_CONTROLLERS = "cgroup.subtree_control"; |
| const std::string THREADS = "cgroup.threads"; |
| const std::string TYPE = "cgroup.type"; |
| |
| namespace subtree_control { |
| |
| struct State |
| { |
| State() = default; |
| |
| // We don't return errors here because enabling something |
| // unknown will fail when writing it back out. |
| void enable(const vector<string>& controllers) |
| { |
| foreach (const string& controller, controllers) { |
| enable(controller); |
| } |
| } |
| |
| // We don't return errors here because enabling something |
| // unknown will fail when writing it back out. |
| void enable(const string& controller) |
| { |
| _disabled.erase(controller); |
| _enabled.insert(controller); |
| } |
| |
| // We don't return errors here since disabling something |
| // unknown will fail when writing it back out. |
| void disable(const string& controller) |
| { |
| _enabled.erase(controller); |
| _disabled.insert(controller); |
| } |
| |
| void disable(const set<string>& controllers) |
| { |
| foreach (const string& controller, controllers) { |
| disable(controller); |
| } |
| } |
| |
| set<string> enabled() const { return _enabled; } |
| set<string> disabled() const { return _disabled; } |
| |
| bool enabled(const string& controller) const |
| { |
| return _enabled.find(controller) != _enabled.end(); |
| } |
| |
| static State parse(const string& contents) |
| { |
| State control; |
| |
| // Trim trailing newline. |
| const string trimmed = strings::trim(contents); |
| if (trimmed.empty()) { |
| return control; |
| } |
| |
| vector<string> controllers = strings::split(trimmed, " "); |
| control._enabled.insert( |
| std::make_move_iterator(controllers.begin()), |
| std::make_move_iterator(controllers.end())); |
| return control; |
| } |
| |
| private: |
| set<string> _enabled; |
| set<string> _disabled; |
| }; |
| |
| |
| std::ostream& operator<<(std::ostream& stream, const State& state) |
| { |
| foreach (const string& system, state.enabled()) { |
| stream << "+" << system << " "; |
| } |
| foreach (const string& system, state.disabled()) { |
| stream << "-" << system << " "; |
| } |
| return stream; |
| } |
| |
| |
| Try<State> read(const string& cgroup) |
| { |
| Try<string> contents = |
| cgroups2::read<string>(cgroup, cgroups2::control::SUBTREE_CONTROLLERS); |
| |
| if (contents.isError()) { |
| return Error( |
| "Failed to read 'cgroup.subtree_control' for cgroup '" + cgroup + "': " |
| + contents.error()); |
| } |
| |
| return State::parse(*contents); |
| } |
| |
| |
| Try<Nothing> write(const string& cgroup, const State& state) |
| { |
| return cgroups2::write( |
| cgroup, control::SUBTREE_CONTROLLERS, stringify(state)); |
| } |
| |
| } // namespace subtree_control { |
| |
| } // namespace control { |
| |
| |
| bool enabled() |
| { |
| Try<bool> supported = mesos::internal::fs::supported(cgroups2::FILE_SYSTEM); |
| return supported.isSome() && *supported; |
| } |
| |
| |
| Try<Nothing> mount() |
| { |
| if (!cgroups2::enabled()) { |
| return Error("cgroups2 is not enabled"); |
| } |
| |
| Try<bool> mounted = cgroups2::mounted(); |
| if (mounted.isError()) { |
| return Error("Failed to check if cgroups2 filesystem is mounted: " |
| + mounted.error()); |
| } |
| if (*mounted) { |
| return Error("cgroup2 filesystem is already mounted at" |
| " '" + cgroups2::MOUNT_POINT + "'"); |
| } |
| |
| Try<Nothing> mkdir = os::mkdir(cgroups2::MOUNT_POINT); |
| if (mkdir.isError()) { |
| return Error("Failed to create cgroups2 directory" |
| " '" + cgroups2::MOUNT_POINT + "': " + mkdir.error()); |
| } |
| |
| return mesos::internal::fs::mount( |
| None(), |
| cgroups2::MOUNT_POINT, |
| cgroups2::FILE_SYSTEM, |
| 0, |
| None()); |
| } |
| |
| |
| Try<bool> mounted() |
| { |
| Try<MountTable> mountTable = MountTable::read("/proc/mounts"); |
| if (mountTable.isError()) { |
| return Error("Failed to read /proc/mounts: " + mountTable.error()); |
| } |
| |
| foreach (MountTable::Entry entry, mountTable->entries) { |
| if (entry.type == cgroups2::FILE_SYSTEM) { |
| if (entry.dir == MOUNT_POINT) { |
| return true; |
| } |
| return Error("Found cgroups2 mount at an unexpected location" |
| " '" + entry.dir + "'"); |
| } |
| } |
| |
| return false; |
| } |
| |
| |
| Try<Nothing> unmount() |
| { |
| Try<bool> mounted = cgroups2::mounted(); |
| if (mounted.isError()) { |
| return Error("Failed to check if the cgroup2 filesystem is mounted: " |
| + mounted.error()); |
| } |
| |
| if (!*mounted) { |
| return Error("cgroups2 filesystem is not mounted"); |
| } |
| |
| Try<Nothing> result = mesos::internal::fs::unmount(MOUNT_POINT); |
| if (result.isError()) { |
| return Error("Failed to unmount the cgroup2 hierarchy" |
| " '" + cgroups2::MOUNT_POINT + "': " + result.error()); |
| } |
| |
| Try<Nothing> rmdir = os::rmdir(cgroups2::MOUNT_POINT); |
| if (rmdir.isError()) { |
| return Error("Failed to remove directory '" + cgroups2::MOUNT_POINT + "': " |
| + rmdir.error()); |
| } |
| |
| return Nothing(); |
| } |
| |
| |
| bool exists(const string& cgroup) |
| { |
| return os::exists(cgroups2::path(cgroup)); |
| } |
| |
| |
| Try<set<string>> get(const string& cgroup) |
| { |
| const string& path = cgroups2::path(cgroup); |
| char* paths[] = {const_cast<char*>(path.c_str()), nullptr}; |
| |
| FTS* tree = fts_open(paths, FTS_NOCHDIR, nullptr); |
| if (tree == nullptr) { |
| return ErrnoError("Failed to start traversing filesystem"); |
| } |
| |
| FTSENT* node; |
| set<string> cgroups; |
| while ((node = fts_read(tree)) != nullptr) { |
| // Use post-order walk here. fts_level is the depth of the traversal, |
| // numbered from -1 to N, where the file/dir was found. The traversal root |
| // itself is numbered 0. fts_info includes flags for the current node. |
| // FTS_DP indicates a directory being visited in postorder. |
| if (node->fts_level > 0 && node->fts_info & FTS_DP) { |
| string _cgroup = strings::trim( |
| node->fts_path + MOUNT_POINT.length(), "/"); |
| cgroups.insert(_cgroup); |
| } |
| } |
| |
| if (errno != 0) { |
| Error error = |
| ErrnoError("Failed to read a node while traversing the filesystem"); |
| fts_close(tree); |
| return error; |
| } |
| |
| if (fts_close(tree) != 0) { |
| return ErrnoError("Failed to stop traversing file system"); |
| } |
| |
| return cgroups; |
| } |
| |
| |
| Try<Nothing> create(const string& cgroup, bool recursive) |
| { |
| const string path = cgroups2::path(cgroup); |
| |
| Try<Nothing> mkdir = os::mkdir(path, recursive); |
| if (mkdir.isError()) { |
| return Error("Failed to create directory '" + path + "': " + mkdir.error()); |
| } |
| |
| return Nothing(); |
| } |
| |
| |
| Try<Nothing> kill(const std::string& cgroup) |
| { |
| if (!cgroups2::exists(cgroup)) { |
| return Error("Cgroup does not exist"); |
| } |
| |
| return cgroups2::write(cgroup, cgroups2::control::KILL, "1"); |
| } |
| |
| |
| Future<Nothing> destroy(const string& cgroup) |
| { |
| if (!cgroups2::exists(cgroup)) { |
| return Failure("Cgroup '" + cgroup + "' does not exist"); |
| } |
| |
| // To destroy a subtree of cgroups we first kill all of the processes inside |
| // of the cgroup and then remove all of the cgroup directories, removing |
| // the most deeply nested directories first. |
| |
| Try<Nothing> kill = cgroups2::kill(cgroup); |
| if (kill.isError()) { |
| return Failure("Failed to kill processes in cgroup: " + kill.error()); |
| } |
| |
| // Wait until all of the processes have been killed. |
| int retries = 50; |
| Future<Nothing> emptied = loop( |
| []() { return process::after(Milliseconds(1)); }, |
| [=](const Nothing&) mutable -> Future<ControlFlow<Nothing>> { |
| Try<set<pid_t>> pids = cgroups2::processes(cgroup, true); |
| if (pids.isError()) { |
| return Failure("Failed to fetch pids in cgroup: " + pids.error()); |
| } |
| |
| if (pids->empty()) { |
| return Break(); |
| } |
| |
| --retries; |
| if (retries == 0) { |
| return Failure("Processes were still found: " + stringify(*pids)); |
| } |
| |
| return Continue(); |
| }); |
| |
| return emptied |
| .then([=]() -> Future<Nothing> { |
| Try<set<string>> cgroups = cgroups2::get(cgroup); |
| if (cgroups.isError()) { |
| return Failure("Failed to get nested cgroups: " + cgroups.error()); |
| } |
| |
| cgroups->insert(cgroup); |
| |
| // Remove the cgroups in bottom-up order. |
| foreach (const string& cgroup, adaptor::reverse(*cgroups)) { |
| const string path = cgroups2::path(cgroup); |
| |
| // Remove the cgroup's directory. If the directory does not exist, |
| // ignore the error to protect against races. |
| if (::rmdir(path.c_str()) < 0) { |
| ErrnoError error = ErrnoError(); |
| if (error.code != ENOENT) { |
| return Failure( |
| "Failed to remove directory '" + path + "': " + error.message); |
| } |
| } |
| } |
| |
| return Nothing(); |
| }); |
| } |
| |
| |
| Try<Nothing> assign(const string& cgroup, pid_t pid) |
| { |
| if (!cgroups2::exists(cgroup)) { |
| return Error("Cgroup '" + cgroup + "' does not exist"); |
| } |
| |
| return cgroups2::write(cgroup, control::PROCESSES, stringify(pid)); |
| } |
| |
| |
| Try<string> cgroup(pid_t pid) |
| { |
| // A process's cgroup membership is listed in /proc/{pid}/cgroup. |
| // The format, e.g if the process belongs to /sys/fs/cgroup/foo/bar, is: |
| // |
| // 0::/foo/bar |
| // or |
| // 0::/foo/bar (deleted) |
| // |
| // See: https://docs.kernel.org/admin-guide/cgroup-v2.html#processes |
| // https://man7.org/linux/man-pages/man7/cgroups.7.html |
| const string& cgroupFile = path::join("/proc", stringify(pid), "cgroup"); |
| if (!os::exists(cgroupFile)) { |
| return Error("'" + cgroupFile + "' does not exist"); |
| } |
| |
| Try<string> read = os::read(cgroupFile); |
| if (read.isError()) { |
| return Error("Failed to read '" + cgroupFile + "': " + read.error()); |
| } |
| |
| string content = strings::trim(*read); |
| if (!strings::startsWith(content, "0::/")) { |
| return Error("process belongs to a v1 cgroup: " + content); |
| } |
| |
| content = strings::remove(content, "0::/", strings::Mode::PREFIX); |
| content = strings::remove(content, " (deleted)", strings::Mode::SUFFIX); |
| |
| return content; |
| } |
| |
| |
| Try<set<pid_t>> processes(const string& cgroup, bool recursive) |
| { |
| if (!cgroups2::exists(cgroup)) { |
| return Error("Cgroup '" + cgroup + "' does not exist"); |
| } |
| |
| set<string> cgroups = {cgroup}; |
| |
| if (recursive) { |
| Try<set<string>> descendants = cgroups2::get(cgroup); |
| if (descendants.isError()) { |
| return Error("Failed to list cgroups: " + descendants.error()); |
| } |
| cgroups.insert(descendants->begin(), descendants->end()); |
| } |
| |
| set<pid_t> pids; |
| |
| foreach (const string& cgroup, cgroups) { |
| Try<string> contents = cgroups2::read<string>(cgroup, control::PROCESSES); |
| |
| if (contents.isError() && !exists(cgroup)) { |
| continue; // Ignore missing cgroups due to races. |
| } |
| |
| if (contents.isError()) { |
| return Error("Failed to read cgroup.procs in '" + cgroup + "': " |
| + contents.error()); |
| } |
| |
| foreach (const string& line, strings::split(*contents, "\n")) { |
| if (line.empty()) continue; |
| |
| Try<pid_t> pid = numify<pid_t>(line); |
| if (pid.isError()) { |
| return Error("Failed to parse '" + line + "' as a pid: " + pid.error()); |
| } |
| |
| pids.insert(*pid); |
| } |
| } |
| |
| return pids; |
| } |
| |
| |
| Try<set<pid_t>> threads(const string& cgroup) |
| { |
| Try<string> contents = cgroups2::read<string>(cgroup, control::THREADS); |
| if (contents.isError()) { |
| return Error("Failed to read 'cgroup.threads' in" |
| " '" + cgroup + "': " + contents.error()); |
| } |
| |
| set<pid_t> tids; |
| foreach (const string& line, strings::split(*contents, "\n")) { |
| if (line.empty()) continue; |
| |
| Try<pid_t> tid = numify<pid_t>(line); |
| if (tid.isError()) { |
| return Error("Failed to parse '" + line + "' as a tid: " + tid.error()); |
| } |
| |
| tids.insert(*tid); |
| } |
| |
| return tids; |
| } |
| |
| |
| string path(const string& cgroup) |
| { |
| return path::join(cgroups2::MOUNT_POINT, cgroup); |
| } |
| |
| namespace controllers { |
| |
| Try<set<string>> available(const string& cgroup) |
| { |
| Try<string> read = |
| cgroups2::read<string>(cgroup, cgroups2::control::CONTROLLERS); |
| |
| if (read.isError()) { |
| return Error("Failed to read cgroup.controllers in '" + cgroup + "': " |
| + read.error()); |
| } |
| |
| // Trim trailing newline. |
| const string contents = strings::trim(*read); |
| if (contents.empty()) { |
| return set<string>(); |
| } |
| |
| vector<string> controllers = strings::split(contents, " "); |
| return set<string>( |
| std::make_move_iterator(controllers.begin()), |
| std::make_move_iterator(controllers.end())); |
| } |
| |
| |
| Try<Nothing> enable(const string& cgroup, const vector<string>& controllers) |
| { |
| using State = control::subtree_control::State; |
| Try<State> control = cgroups2::control::subtree_control::read(cgroup); |
| |
| if (control.isError()) { |
| return Error(control.error()); |
| } |
| |
| control->enable(controllers); |
| return cgroups2::control::subtree_control::write(cgroup, *control); |
| } |
| |
| |
| Try<Nothing> disable(const string& cgroup, const set<string>& controllers) |
| { |
| using State = control::subtree_control::State; |
| Try<State> control = cgroups2::control::subtree_control::read(cgroup); |
| |
| if (control.isError()) { |
| return Error(control.error()); |
| } |
| |
| control->disable(controllers); |
| return cgroups2::control::subtree_control::write(cgroup, *control); |
| } |
| |
| |
| Try<set<string>> enabled(const string& cgroup) |
| { |
| Try<string> contents = |
| cgroups2::read<string>(cgroup, cgroups2::control::SUBTREE_CONTROLLERS); |
| if (contents.isError()) { |
| return Error("Failed to read 'cgroup.subtree_control' in '" + cgroup + "'" |
| ": " + contents.error()); |
| } |
| |
| using State = control::subtree_control::State; |
| State control = State::parse(*contents); |
| return control.enabled(); |
| } |
| |
| } // namespace controllers { |
| |
| namespace cpu { |
| |
| BandwidthLimit::BandwidthLimit(Duration _limit, Duration _period) |
| : limit{_limit}, |
| period{_period} {} |
| |
| |
| Try<BandwidthLimit> parse_bandwidth(const string& content) |
| { |
| // Format |
| // ----------------------------- |
| // $MAX $PERIOD |
| // ----------------------------- |
| // $MAX Maximum CPU time, in microseconds, processes in the cgroup can |
| // collectively use during one $PERIOD. If set to "max" then there |
| // is no limit. |
| // |
| // $PERIOD Length of one period, in microseconds. |
| vector<string> split = strings::split(strings::trim(content), " "); |
| if (split.size() != 2) { |
| return Error("Expected format '$MAX $PERIOD'" |
| " but received '" + content + "'"); |
| } |
| |
| if (split[0] == "max") { |
| return cpu::BandwidthLimit(); |
| } |
| |
| Try<Duration> limit = Duration::parse(split[0] + "us"); |
| if (limit.isError()) { |
| return Error("Failed to parse cpu.max's limit of '" + split[0] + "': " |
| + limit.error()); |
| } |
| |
| Try<Duration> period = Duration::parse(split[1] + "us"); |
| if (period.isError()) { |
| return Error("Failed to parse cpu.max's period of '" + split[1] + "': " |
| + period.error()); |
| } |
| |
| return BandwidthLimit(*limit, *period); |
| } |
| |
| namespace control { |
| |
| const std::string IDLE = "cpu.idle"; |
| const std::string MAX = "cpu.max"; |
| const std::string MAX_BURST = "cpu.max.burst"; |
| const std::string PRESSURE = "cpu.pressure"; |
| const std::string STATS = "cpu.stat"; |
| const std::string UCLAMP_MAX = "cpu.uclamp.max"; |
| const std::string UCLAMP_MIN = "cpu.uclamp.min"; |
| const std::string WEIGHT = "cpu.weight"; |
| const std::string WEIGHT_NICE = "cpu.weight.nice"; |
| |
| namespace stat { |
| |
| Try<Stats> parse(const string& content) |
| { |
| const vector<string> lines = strings::split(content, "\n"); |
| cpu::Stats stats; |
| |
| foreach (const string& line, lines) { |
| if (line.empty()) { |
| continue; |
| } |
| |
| vector<string> tokens = strings::split(line, " "); |
| if (tokens.size() != 2) { |
| return Error("Invalid line format in 'cpu.stat' expected " |
| "<key> <value> received: '" + line + "'"); |
| } |
| |
| const string& field = tokens[0]; |
| const string& value = tokens[1]; |
| |
| Try<uint64_t> number = numify<uint64_t>(value); |
| if (number.isError()) { |
| return Error("Failed to parse '" + field + "': " + number.error()); |
| } |
| Duration duration = Microseconds(static_cast<int64_t>(*number)); |
| |
| if (field == "usage_usec") { stats.usage = duration; } |
| else if (field == "user_usec") { stats.user_time = duration; } |
| else if (field == "system_usec") { stats.system_time = duration; } |
| else if (field == "nr_periods") { stats.periods = *number; } |
| else if (field == "nr_throttled") { stats.throttled = *number; } |
| else if (field == "throttled_usec") { stats.throttle_time = duration; } |
| else if (field == "nr_burst") { stats.bursts = *number; } |
| else if (field == "burst_usec") { stats.bursts_time = duration; } |
| } |
| |
| return stats; |
| } |
| |
| } // namespace stat { |
| |
| } // namespace control { |
| |
| |
| Try<Nothing> weight(const string& cgroup, uint64_t weight) |
| { |
| if (cgroup == ROOT_CGROUP) { |
| return Error("Operation not supported for the root cgroup"); |
| } |
| |
| return cgroups2::write(cgroup, cpu::control::WEIGHT, weight); |
| } |
| |
| |
| Try<uint64_t> weight(const string& cgroup) |
| { |
| if (cgroup == ROOT_CGROUP) { |
| return Error("Operation not supported for the root cgroup"); |
| } |
| |
| return cgroups2::read<uint64_t>(cgroup, cpu::control::WEIGHT); |
| } |
| |
| |
| Try<cpu::Stats> stats(const string& cgroup) |
| { |
| Try<string> content = cgroups2::read<string>( |
| cgroup, cgroups2::cpu::control::STATS); |
| |
| if (content.isError()) { |
| return Error("Failed to read 'cpu.stat' for the cgroup '" + cgroup + "': " |
| + content.error()); |
| } |
| |
| return cpu::control::stat::parse(*content); |
| } |
| |
| |
| Try<Nothing> set_max(const string& cgroup, const cpu::BandwidthLimit& limit) |
| { |
| if (cgroup == ROOT_CGROUP) { |
| return Error("Operation not supported for the root cgroup"); |
| } |
| |
| if (limit.limit.isNone()) { |
| return cgroups2::write(cgroup, cpu::control::MAX, "max"); |
| } |
| |
| if (limit.period.isNone()) { |
| return Error("Invalid bandwidth limit: period can only be None" |
| " for a limitless bandwidth limit"); |
| } |
| |
| if (limit.period->ns() < 0 || limit.limit->ns() < 0 |
| || limit.period->ns() % 1000 > 0 || limit.limit->ns() % 1000 > 0) { |
| return Error("Invalid bandwidth limit: period and limit must be" |
| " positive and microsecond level granularity, received" |
| " period=" + stringify(*limit.period) |
| + " limit=" + stringify(*limit.limit)); |
| } |
| |
| return cgroups2::write( |
| cgroup, |
| cpu::control::MAX, |
| stringify(static_cast<uint64_t>(limit.limit->us())) |
| + " " |
| + stringify(static_cast<uint64_t>(limit.period->us()))); |
| } |
| |
| |
| Try<cpu::BandwidthLimit> max(const string& cgroup) |
| { |
| if (cgroup == ROOT_CGROUP) { |
| return Error("Operation not supported for the root cgroup"); |
| } |
| |
| Try<string> content = cgroups2::read<string>(cgroup, cpu::control::MAX); |
| if (content.isError()) { |
| return Error("Failed the read 'cpu.max' for cgroup '" + cgroup + "': " |
| + content.error()); |
| } |
| |
| Try<BandwidthLimit> limit = parse_bandwidth(*content); |
| if (limit.isError()) { |
| return Error("Failed to parse '" + *content + "' as a bandwidth limit: " |
| + limit.error()); |
| } |
| |
| return *limit; |
| } |
| |
| } // namespace cpu { |
| |
| namespace memory { |
| |
| namespace internal { |
| |
| // Parse a byte limit from a string. |
| // |
| // Format: "max" OR a u64_t string representing bytes. |
| Result<Bytes> parse_bytelimit(const string& value) |
| { |
| const string trimmed = strings::trim(value); |
| if (trimmed == "max") { |
| return None(); |
| } |
| |
| Try<uint64_t> bytes = numify<uint64_t>(trimmed); |
| if (bytes.isError()) { |
| return Error("Failed to numify '" + trimmed + "': " + bytes.error()); |
| } |
| |
| return Bytes(*bytes); |
| } |
| |
| } // namespace internal { |
| |
| |
| namespace control { |
| |
| const string CURRENT = "memory.current"; |
| const string EVENTS = "memory.events"; |
| const string LOW = "memory.low"; |
| const string HIGH = "memory.high"; |
| const string MAX = "memory.max"; |
| const string MIN = "memory.min"; |
| const string STAT = "memory.stat"; |
| |
| namespace stat { |
| |
| Try<Stats> parse(const string& content) |
| { |
| Stats stats; |
| |
| foreach (const string& line, strings::split(content, "\n")) { |
| if (line.empty()) { |
| continue; |
| } |
| |
| vector<string> tokens = strings::split(line, " "); |
| if (tokens.size() != 2) { |
| return Error("Invalid line format in 'memory.stat'; expected " |
| "<key> <value> received: '" + line + "'"); |
| } |
| |
| const string& key = tokens[0]; |
| const string& value = tokens[1]; |
| |
| Try<uint64_t> n = numify<uint64_t>(value); |
| if (n.isError()) { |
| return Error("Failed to numify '" + value + "': " + n.error()); |
| } |
| const Bytes bytes(*n); |
| |
| if (key == "anon") { stats.anon = bytes; } |
| else if (key == "file") { stats.file = bytes; } |
| else if (key == "kernel") { stats.kernel = bytes; } |
| else if (key == "kernel_stack") { stats.kernel_stack = bytes; } |
| else if (key == "pagetables") { stats.pagetables = bytes; } |
| else if (key == "sock") { stats.sock = bytes; } |
| else if (key == "vmalloc") { stats.vmalloc = bytes; } |
| else if (key == "file_mapped") { stats.file_mapped = bytes; } |
| } |
| |
| return stats; |
| } |
| |
| } // namespace stat { |
| |
| } // namespace control { |
| |
| namespace events { |
| |
| Try<Events> parse(const string& content) |
| { |
| Events events; |
| |
| foreach (const string& line, strings::split(content, "\n")) { |
| if (line.empty()) { |
| continue; |
| } |
| |
| vector<string> tokens = strings::split(line, " "); |
| if (tokens.size() != 2) { |
| return Error("Invalid line format in 'memory.events' expected " |
| "<key> <value> received: '" + line + "'"); |
| } |
| |
| const string& field = tokens[0]; |
| const string& value = tokens[1]; |
| |
| Try<uint64_t> count = numify<uint64_t>(value); |
| if (count.isError()) { |
| return Error("Failed to numify '" + value + "': " + count.error()); |
| } |
| |
| if (field == "low") { events.low = *count; } |
| else if (field == "high") { events.high = *count; } |
| else if (field == "max") { events.max = *count; } |
| else if (field == "oom") { events.oom = *count; } |
| else if (field == "oom_kill") { events.oom_kill = *count; } |
| else if (field == "oom_group_kill") { events.oom_group_kill = *count; } |
| } |
| |
| return events; |
| } |
| |
| } // namespace events { |
| |
| Future<Nothing> oom(const string& cgroup) |
| { |
| // TODO(dleamy): Update this to use inotify, rather than polling. |
| return loop( |
| []() { |
| return process::after(Milliseconds(100)); |
| }, |
| [=](const Nothing&) -> Future<ControlFlow<Nothing>> { |
| Try<string> content = cgroups2::read<string>(cgroup, control::EVENTS); |
| if (content.isError()) { |
| return Failure("Failed to read 'memory.events': " + content.error()); |
| } |
| |
| Try<Events> events = events::parse(strings::trim(*content)); |
| if (events.isError()) { |
| return Failure("Failed to parse 'memory.events': " + events.error()); |
| } |
| |
| if (events->oom > 0) { |
| return Break(Nothing()); |
| } |
| return Continue(); |
| }); |
| } |
| |
| |
| Try<Bytes> usage(const string& cgroup) |
| { |
| Try<uint64_t> contents = cgroups2::read<uint64_t>( |
| cgroup, memory::control::CURRENT); |
| if (contents.isError()) { |
| return Error("Failed to read 'memory.current': " + contents.error()); |
| } |
| |
| return Bytes(*contents); |
| } |
| |
| |
| Try<Nothing> set_low(const string& cgroup, const Bytes& bytes) |
| { |
| return cgroups2::write(cgroup, control::LOW, bytes.bytes()); |
| } |
| |
| |
| Try<Bytes> low(const string& cgroup) |
| { |
| Try<uint64_t> contents = cgroups2::read<uint64_t>(cgroup, control::LOW); |
| if (contents.isError()) { |
| return Error("Failed to read 'memory.low': " + contents.error()); |
| } |
| |
| return Bytes(*contents); |
| } |
| |
| |
| Try<Nothing> set_min(const string& cgroup, const Bytes& bytes) |
| { |
| return cgroups2::write(cgroup, control::MIN, bytes.bytes()); |
| } |
| |
| |
| Try<Bytes> min(const string& cgroup) |
| { |
| Try<uint64_t> contents = cgroups2::read<uint64_t>(cgroup, control::MIN); |
| if (contents.isError()) { |
| return Error("Failed to read 'memory.min': " + contents.error()); |
| } |
| |
| return Bytes(*contents); |
| } |
| |
| |
| Try<Nothing> set_max(const string& cgroup, const Option<Bytes>& limit) |
| { |
| return cgroups2::write( |
| cgroup, |
| control::MAX, |
| limit.isNone() ? "max" : stringify(limit->bytes())); |
| } |
| |
| |
| Result<Bytes> max(const string& cgroup) |
| { |
| Try<string> contents = cgroups2::read<string>(cgroup, control::MAX); |
| if (contents.isError()) { |
| return Error("Failed to read 'memory.max': " + contents.error()); |
| } |
| |
| return internal::parse_bytelimit(*contents); |
| } |
| |
| |
| Try<Nothing> set_high(const string& cgroup, const Option<Bytes>& limit) |
| { |
| return cgroups2::write( |
| cgroup, |
| control::HIGH, |
| limit.isNone() ? "max" : stringify(limit->bytes())); |
| } |
| |
| |
| Result<Bytes> high(const string& cgroup) |
| { |
| Try<string> contents = cgroups2::read<string>(cgroup, control::HIGH); |
| if (contents.isError()) { |
| return Error("Failed to read 'memory.high': " + contents.error()); |
| } |
| |
| return internal::parse_bytelimit(*contents); |
| } |
| |
| |
| Try<Stats> stats(const string& cgroup) |
| { |
| Try<string> contents = cgroups2::read<string>(cgroup, control::STAT); |
| if (contents.isError()) { |
| return Error("Failed to read 'memory.stat': " + contents.error()); |
| } |
| |
| return control::stat::parse(*contents); |
| } |
| |
| } // namespace memory { |
| |
| namespace devices { |
| |
| // Utility class to construct an eBPF program to whitelist or blacklist |
| // select device accesses. |
| class DeviceProgram |
| { |
| public: |
| DeviceProgram() : program{ebpf::Program(BPF_PROG_TYPE_CGROUP_DEVICE)} |
| { |
| // The BPF_PROG_TYPE_CGROUP_DEVICE program takes in |
| // `struct bpf_cgroup_dev_ctx*` as input. We extract the fields into |
| // registers r2-5. |
| // |
| // The device type is encoded in the first 16 bits of `access_type` and |
| // the access type is encoded in the last 16 bits of `access_type`. |
| program.append({ |
| // r2: Type ('c', 'b', '?') |
| BPF_LDX_MEM( |
| BPF_W, BPF_REG_2, BPF_REG_1, offsetof(bpf_cgroup_dev_ctx, access_type)), |
| BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF), |
| // r3: Access ('r', 'w', 'm') |
| BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, |
| offsetof(bpf_cgroup_dev_ctx, access_type)), |
| BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16), |
| // r4: Major Version |
| BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, |
| offsetof(bpf_cgroup_dev_ctx, major)), |
| // r5: Minor Version |
| BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1, |
| offsetof(bpf_cgroup_dev_ctx, minor)), |
| }); |
| } |
| |
| Try<Nothing> allow(const Entry entry) { return addDevice(entry, true); } |
| Try<Nothing> deny(const Entry entry) { return addDevice(entry, false); } |
| |
| ebpf::Program build() |
| { |
| if (!hasCatchAll) { |
| // Exit instructions. |
| // If no entry granted access, then deny the access. |
| program.append({ |
| BPF_MOV64_IMM (BPF_REG_0, DENY_ACCESS), |
| BPF_EXIT_INSN(), |
| }); |
| } |
| return program; |
| } |
| |
| private: |
| Try<Nothing> addDevice(const Entry entry, bool allow) |
| { |
| if (hasCatchAll) { |
| return Nothing(); |
| } |
| |
| // We create a block of bytecode with the format: |
| // 1. Major Version Check |
| // 2. Minor Version Check |
| // 3. Type Check |
| // 4. Access Check |
| // 5. Allow/Deny Access |
| // |
| // 6. NEXT BLOCK |
| // |
| // Either: |
| // 1. The device access is matched by (1,2,3,4) and the Allow/Deny access |
| // block (5) is executed. |
| // 2. One of (1,2,3,4) does not match the requested access and we skip |
| // to the next block (6). |
| |
| const Entry::Selector& selector = entry.selector; |
| const Entry::Access& access = entry.access; |
| |
| bool check_major = selector.major.isSome(); |
| bool check_minor = selector.minor.isSome(); |
| bool check_type = selector.type != Entry::Selector::Type::ALL; |
| bool check_access = !access.mknod || !access.read || !access.write; |
| |
| // Number of instructions to the [NEXT BLOCK]. This is used if a check |
| // fails (meaning this entry does not apply) and we want to skip the |
| // subsequent checks. |
| short jmp_size = 1 + (check_major ? 1 : 0) + (check_minor ? 1 : 0) + |
| (check_access ? 3 : 0) + (check_type ? 1 : 0); |
| |
| // Check major version (r4) against entry. |
| if (check_major) { |
| program.append({ |
| BPF_JMP_IMM(BPF_JNE, BPF_REG_4, (int)selector.major.get(), jmp_size), |
| }); |
| --jmp_size; |
| } |
| |
| // Check minor version (r5) against entry. |
| if (check_minor) { |
| program.append({ |
| BPF_JMP_IMM(BPF_JNE, BPF_REG_5, (int)selector.minor.get(), jmp_size), |
| }); |
| --jmp_size; |
| } |
| |
| // Check type (r2) against entry. |
| if (check_type) { |
| int bpf_type = [selector]() { |
| switch (selector.type) { |
| case Entry::Selector::Type::BLOCK: return BPF_DEVCG_DEV_BLOCK; |
| case Entry::Selector::Type::CHARACTER: return BPF_DEVCG_DEV_CHAR; |
| case Entry::Selector::Type::ALL: UNREACHABLE(); |
| } |
| }(); |
| |
| program.append({ |
| BPF_JMP_IMM(BPF_JNE, BPF_REG_2, bpf_type, jmp_size), |
| }); |
| --jmp_size; |
| } |
| |
| // Check access (r3) against entry. |
| if (check_access) { |
| int bpf_access = 0; |
| bpf_access |= access.read ? BPF_DEVCG_ACC_READ : 0; |
| bpf_access |= access.write ? BPF_DEVCG_ACC_WRITE : 0; |
| bpf_access |= access.mknod ? BPF_DEVCG_ACC_MKNOD : 0; |
| |
| program.append({ |
| BPF_MOV32_REG(BPF_REG_1, BPF_REG_3), |
| BPF_ALU32_IMM(BPF_AND, BPF_REG_1, bpf_access), |
| BPF_JMP_REG( |
| BPF_JNE, BPF_REG_1, BPF_REG_3, static_cast<short>(jmp_size - 2)), |
| }); |
| jmp_size -= 3; |
| } |
| |
| if (!check_major && !check_minor && !check_type && !check_access) { |
| // The exit instructions as well as any additional device entries would |
| // generate unreachable blocks. |
| hasCatchAll = true; |
| } |
| |
| // Allow/Deny access block. |
| program.append({ |
| BPF_MOV64_IMM(BPF_REG_0, allow ? ALLOW_ACCESS : DENY_ACCESS), |
| BPF_EXIT_INSN(), |
| }); |
| |
| return Nothing(); |
| } |
| |
| ebpf::Program program; |
| |
| // Whether the program has a device entry that allows or denies ALL accesses. |
| // Such cases need to be specially handled because any instructions added |
| // after it will be unreachable, and thus will cause the eBPF verifier to |
| // reject the program. |
| bool hasCatchAll = false; |
| |
| static const int ALLOW_ACCESS = 1; |
| static const int DENY_ACCESS = 0; |
| }; |
| |
| |
| Try<Nothing> configure( |
| const string& cgroup, |
| const vector<Entry>& allow, |
| const vector<Entry>& deny) |
| { |
| DeviceProgram program = DeviceProgram(); |
| foreach (const Entry entry, allow) { |
| program.allow(entry); |
| } |
| foreach (const Entry entry, deny) { |
| program.deny(entry); |
| } |
| |
| Try<Nothing> attach = ebpf::cgroups2::attach( |
| cgroups2::path(cgroup), |
| program.build()); |
| |
| if (attach.isError()) { |
| return Error("Failed to attach BPF_PROG_TYPE_CGROUP_DEVICE program: " + |
| attach.error()); |
| } |
| |
| return Nothing(); |
| } |
| |
| } // namespace devices { |
| |
| } // namespace cgroups2 { |