// src/linux/cgroups2.cpp - mesos - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #include <fts.h>

 #include "linux/cgroups2.hpp"

 #include <iterator>
 #include <ostream>
 #include <set>
 #include <string>
 #include <vector>
 #include <utility>

 #include <process/after.hpp>
 #include <process/loop.hpp>
 #include <process/pid.hpp>
 #include <process/io.hpp>
 #include <process/owned.hpp>

 #include <stout/adaptor.hpp>
 #include <stout/linkedhashmap.hpp>
 #include <stout/none.hpp>
 #include <stout/numify.hpp>
 #include <stout/os.hpp>
 #include <stout/path.hpp>
 #include <stout/unreachable.hpp>
 #include <stout/stringify.hpp>
 #include <stout/try.hpp>

 #include "linux/ebpf.hpp"
 #include "linux/fs.hpp"

 using std::ostream;
 using std::set;
 using std::string;
 using std::unique_ptr;
 using std::vector;

 using process::Break;
 using process::Continue;
 using process::ControlFlow;
 using process::Failure;
 using process::Future;
 using process::loop;
 using process::io::Watcher;
 using process::Owned;
 using process::Promise;

 using mesos::internal::fs::MountTable;

 namespace cgroups2 {

 // Name of the cgroups v2 filesystem as found in /proc/filesystems.
 // Used to probe for kernel support, as the filesystem type when mounting,
 // and to recognize cgroups2 entries when scanning the mount table.
 const string FILE_SYSTEM = "cgroup2";

 // Mount point for the cgroups2 file system. Relative cgroup names are
 // resolved against this directory by `cgroups2::path`.
 const string MOUNT_POINT = "/sys/fs/cgroup";


 // Reads the control file `control` that lives in the directory of `cgroup`
 // and converts its contents to `T`. Only the explicit specializations
 // defined in this file are available.
 template <typename T>
 Try<T> read(const string& cgroup, const string& control);


 // String specialization: hands back the file contents exactly as
 // `os::read` produced them, or the error `os::read` reported.
 template <>
 Try<string> read(const string& cgroup, const string& control)
 {
   const string file = path::join(cgroups2::path(cgroup), control);
   return os::read(file);
 }


 // Integer specialization: reads the control file as text, strips the
 // surrounding whitespace and parses what is left as an unsigned 64-bit
 // number. A read failure is forwarded unchanged; a parse failure is
 // whatever `numify` reports.
 template <>
 Try<uint64_t> read(const string& cgroup, const string& control)
 {
   const Try<string> raw = read<string>(cgroup, control);
   if (raw.isError()) {
     return Error(raw.error());
   }

   const string digits = strings::trim(*raw);
   return numify<uint64_t>(digits);
 }


 // Writes `value` to the control file `control` inside the directory of
 // `cgroup`. The result of `os::write` is returned as is.
 Try<Nothing> write(
     const string& cgroup,
     const string& control,
     const string& value)
 {
   const string file = path::join(cgroups2::path(cgroup), control);
   return os::write(file, value);
 }


 // Numeric convenience overload: formats `value` as text and delegates to
 // the string overload of `write`.
 Try<Nothing> write(
     const string& cgroup,
     const string& control,
     const uint64_t& value)
 {
   const string text = stringify(value);
   return write(cgroup, control, text);
 }

 namespace control {

 // Interface files found in all cgroups.
 //
 // Each constant is a file name relative to a cgroup's directory; the
 // `cgroups2::read` / `cgroups2::write` helpers join it onto the path of
 // the cgroup being accessed.
 const std::string CONTROLLERS = "cgroup.controllers";
 const std::string EVENTS = "cgroup.events";
 const std::string FREEZE = "cgroup.freeze";
 const std::string IRQ_PRESSURE = "irq.pressure";
 const std::string KILL = "cgroup.kill";
 const std::string MAX_DEPTH = "cgroup.max.depth";
 const std::string MAX_DESCENDANTS = "cgroup.max.descendants";
 const std::string PRESSURE = "cgroup.pressure";
 const std::string PROCESSES = "cgroup.procs";
 const std::string STATS = "cgroup.stat";
 const std::string SUBTREE_CONTROLLERS = "cgroup.subtree_control";
 const std::string THREADS = "cgroup.threads";
 const std::string TYPE = "cgroup.type";

 namespace subtree_control {

 struct State
 {
   State() = default;

   // We don't return errors here because enabling something
   // unknown will fail when writing it back out.
   void enable(const set<string>& controllers)
   {
     foreach (const string& controller, controllers) {
       enable(controller);
     }
   }

   // We don't return errors here because enabling something
   // unknown will fail when writing it back out.
   void enable(const string& controller)
   {
     _disabled.erase(controller);
     _enabled.insert(controller);
   }

   // We don't return errors here since disabling something
   // unknown will fail when writing it back out.
   void disable(const string& controller)
   {
     _enabled.erase(controller);
     _disabled.insert(controller);
   }

   void disable(const set<string>& controllers)
   {
     foreach (const string& controller, controllers) {
       disable(controller);
     }
   }

   set<string> enabled()  const { return _enabled; }
   set<string> disabled() const { return _disabled; }

   bool enabled(const string& controller) const
   {
     return _enabled.find(controller) != _enabled.end();
   }

   static State parse(const string& contents)
   {
     State control;

     // Trim trailing newline.
     const string trimmed = strings::trim(contents);
     if (trimmed.empty()) {
       return control;
     }

     vector<string> controllers = strings::split(trimmed, " ");
     control._enabled.insert(
       std::make_move_iterator(controllers.begin()),
       std::make_move_iterator(controllers.end()));
     return control;
   }

 private:
   set<string> _enabled;
   set<string> _disabled;
 };


 std::ostream& operator<<(std::ostream& stream, const State& state)
 {
   foreach (const string& system, state.enabled()) {
     stream << "+" << system << " ";
   }
   foreach (const string& system, state.disabled()) {
     stream << "-" << system << " ";
   }
   return stream;
 }


 // Loads and parses the 'cgroup.subtree_control' file of `cgroup`.
 // Returns an error naming the cgroup if the file cannot be read.
 Try<State> read(const string& cgroup)
 {
   const Try<string> raw =
     cgroups2::read<string>(cgroup, cgroups2::control::SUBTREE_CONTROLLERS);

   if (raw.isSome()) {
     return State::parse(*raw);
   }

   return Error(
       "Failed to read 'cgroup.subtree_control' for cgroup '" + cgroup + "': "
       + raw.error());
 }


 // Serializes `state` and writes it to the 'cgroup.subtree_control' file
 // of `cgroup`.
 Try<Nothing> write(const string& cgroup, const State& state)
 {
   const string serialized = stringify(state);
   return cgroups2::write(cgroup, control::SUBTREE_CONTROLLERS, serialized);
 }

 } // namespace subtree_control {

 } // namespace control {


 bool enabled()
 {
   Try<bool> supported = mesos::internal::fs::supported(cgroups2::FILE_SYSTEM);
   return supported.isSome() && *supported;
 }


 // Mounts the cgroup2 filesystem at MOUNT_POINT, creating the directory
 // first.
 //
 // Fails if cgroups2 is not enabled, if the mount state cannot be
 // determined, if the filesystem is already mounted at MOUNT_POINT, or if
 // the directory cannot be created. Otherwise the result of the mount call
 // itself is returned.
 Try<Nothing> mount()
 {
   if (!cgroups2::enabled()) {
     return Error("cgroups2 is not enabled");
   }

   const Try<bool> alreadyMounted = cgroups2::mounted();
   if (alreadyMounted.isError()) {
     return Error("Failed to check if cgroups2 filesystem is mounted: "
                  + alreadyMounted.error());
   }

   if (*alreadyMounted) {
     return Error("cgroup2 filesystem is already mounted at '"
                  + cgroups2::MOUNT_POINT + "'");
   }

   const Try<Nothing> created = os::mkdir(cgroups2::MOUNT_POINT);
   if (created.isError()) {
     return Error("Failed to create cgroups2 directory '"
                  + cgroups2::MOUNT_POINT + "': " + created.error());
   }

   return mesos::internal::fs::mount(
       None(),
       cgroups2::MOUNT_POINT,
       cgroups2::FILE_SYSTEM,
       0,
       None());
 }


 // Checks /proc/mounts for a cgroup2 filesystem.
 //
 // Returns true if the first cgroup2 entry found is mounted at MOUNT_POINT,
 // false if there is no cgroup2 entry at all, and an error if the mount
 // table cannot be read or the first cgroup2 entry is mounted somewhere
 // other than MOUNT_POINT.
 Try<bool> mounted()
 {
   Try<MountTable> mountTable = MountTable::read("/proc/mounts");
   if (mountTable.isError()) {
     return Error("Failed to read /proc/mounts: " + mountTable.error());
   }

   // Iterate by const reference: the entries are only inspected, so there
   // is no reason to copy each one.
   foreach (const MountTable::Entry& entry, mountTable->entries) {
     if (entry.type == cgroups2::FILE_SYSTEM) {
       if (entry.dir == MOUNT_POINT) {
         return true;
       }
       return Error("Found cgroups2 mount at an unexpected location"
                    " '" + entry.dir + "'");
     }
   }

   return false;
 }


 // Unmounts the cgroup2 filesystem from MOUNT_POINT and then removes the
 // mount point directory.
 //
 // Fails if the mount state cannot be determined, if the filesystem is not
 // mounted, or if either the unmount or the directory removal fails.
 Try<Nothing> unmount()
 {
   const Try<bool> isMounted = cgroups2::mounted();
   if (isMounted.isError()) {
     return Error("Failed to check if the cgroup2 filesystem is mounted: "
                  + isMounted.error());
   }

   if (!*isMounted) {
     return Error("cgroups2 filesystem is not mounted");
   }

   const Try<Nothing> unmounted = mesos::internal::fs::unmount(MOUNT_POINT);
   if (unmounted.isError()) {
     return Error("Failed to unmount the cgroup2 hierarchy '"
                  + cgroups2::MOUNT_POINT + "': " + unmounted.error());
   }

   const Try<Nothing> removed = os::rmdir(cgroups2::MOUNT_POINT);
   if (removed.isError()) {
     return Error("Failed to remove directory '" + cgroups2::MOUNT_POINT
                  + "': " + removed.error());
   }

   return Nothing();
 }


 bool exists(const string& cgroup)
 {
   return os::exists(cgroups2::path(cgroup));
 }


 // Lists every cgroup nested, at any depth, below `cgroup`. The cgroup
 // itself is not part of the result. Names are produced by dropping the
 // first MOUNT_POINT.length() characters of each visited path and trimming
 // '/' from both ends.
 //
 // Returns an error if the directory walk cannot be started, fails part
 // way through, or cannot be closed.
 //
 // NOTE(review): the prefix stripping assumes every visited path starts
 // with MOUNT_POINT; confirm that callers never pass an absolute path that
 // lies outside of it.
 Try<set<string>> get(const string& cgroup)
 {
   const string& path = cgroups2::path(cgroup);
   char* paths[] = {const_cast<char*>(path.c_str()), nullptr};

   FTS* tree = fts_open(paths, FTS_NOCHDIR, nullptr);
   if (tree == nullptr) {
     return ErrnoError("Failed to start traversing filesystem");
   }

   FTSENT* node;
   set<string> cgroups;
   while ((node = fts_read(tree)) != nullptr) {
     // Use post-order walk here. fts_level is the depth of the traversal,
     // numbered from -1 to N, where the file/dir was found. The traversal root
     // itself is numbered 0. fts_info describes the kind of the current node.
     // FTS_DP indicates a directory being visited in postorder.
     //
     // fts_info holds exactly one of the FTS_* values; it is not a set of
     // flags. It therefore has to be compared with `==`: a bitwise `&`
     // against FTS_DP would also accept unrelated node kinds whose numeric
     // value happens to share a bit with FTS_DP.
     if (node->fts_level > 0 && node->fts_info == FTS_DP) {
       string _cgroup = strings::trim(
           node->fts_path + MOUNT_POINT.length(), "/");
       cgroups.insert(_cgroup);
     }
   }

   // fts_read returns nullptr both when the walk is complete and when it
   // failed; errno tells the two cases apart.
   if (errno != 0) {
     // Capture errno in the error before fts_close gets a chance to
     // overwrite it.
     Error error =
       ErrnoError("Failed to read a node while traversing the filesystem");
     fts_close(tree);
     return error;
   }

   if (fts_close(tree) != 0) {
     return ErrnoError("Failed to stop traversing file system");
   }

   return cgroups;
 }


 // Creates the directory backing `cgroup`. `recursive` is forwarded to
 // `os::mkdir`.
 Try<Nothing> create(const string& cgroup, bool recursive)
 {
   const string directory = cgroups2::path(cgroup);

   const Try<Nothing> made = os::mkdir(directory, recursive);
   if (made.isSome()) {
     return Nothing();
   }

   return Error(
       "Failed to create directory '" + directory + "': " + made.error());
 }


 // Requests that the processes of `cgroup` be killed by writing "1" to its
 // 'cgroup.kill' file. Fails if the cgroup does not exist.
 Try<Nothing> kill(const std::string& cgroup)
 {
   if (cgroups2::exists(cgroup)) {
     return cgroups2::write(cgroup, cgroups2::control::KILL, "1");
   }

   return Error("Cgroup does not exist");
 }


 // Destroys `cgroup` together with every cgroup nested below it.
 //
 // The returned future fails immediately if the cgroup does not exist or
 // if its processes cannot be killed. Otherwise it completes once all
 // directories have been removed, or fails if removal hits an error other
 // than EBUSY / ENOENT or if the retry budget is used up.
 Future<Nothing> destroy(const string& cgroup)
 {
   if (!cgroups2::exists(cgroup)) {
     return Failure("Cgroup '" + cgroup + "' does not exist");
   }

   // To destroy a subtree of cgroups we first kill all of the processes inside
   // of the cgroup and then remove all of the cgroup directories, removing
   // the most deeply nested directories first.

   Try<Nothing> kill = cgroups2::kill(cgroup);
   if (kill.isError()) {
     return Failure("Failed to kill processes in cgroup: " + kill.error());
   }

   // In order to reliably destroy a cgroup, one has to retry on EBUSY
   // *even if* all the processes are no longer found in cgroup.procs.
   // We retry for up to ~5 seconds, based on how crun destroys its
   // cgroups:
   //
   // https://github.com/containers/crun/blob/10b3038c1398b7db20b1826f
   // 94e9d4cb444e9568/src/libcrun/cgroup-utils.c#L471
   //
   // `retries` is captured by value in the mutable lambda below, so the
   // budget is a single counter for the whole subtree, not one per
   // directory.
   int retries = 5000;
   Future<Nothing> removal = loop(
     // Wait 1ms before every attempt, including the first one.
     []() { return process::after(Milliseconds(1)); },
     [=](const Nothing&) mutable -> Future<ControlFlow<Nothing>> {
       // The set of nested cgroups is recomputed on every attempt.
       Try<set<string>> cgroups = cgroups2::get(cgroup);
       if (cgroups.isError()) {
         return Failure("Failed to get nested cgroups: " + cgroups.error());
       }
       cgroups->insert(cgroup);

       // Remove the cgroups in bottom-up order. The set is ordered, so
       // walking it in reverse visits a nested cgroup before the cgroup
       // whose name is its prefix. Note that the loop variable shadows the
       // `cgroup` parameter of the enclosing function.
       foreach (const string& cgroup, adaptor::reverse(*cgroups)) {
         const string path = cgroups2::path(cgroup);

         // Remove the cgroup's directory. If the directory does not exist,
         // ignore the error to protect against races.
         if (::rmdir(path.c_str()) < 0) {
           ErrnoError error = ErrnoError();
           if (error.code == EBUSY) {
             // Give up on this pass and start over after the next delay.
             --retries;
             if (retries == 0) {
               return Failure("Failed to remove cgroup after 5000 attempts");
             }
             return Continue();
           } else if (error.code != ENOENT) {
             return Failure(
                 "Failed to remove directory '" + path + "': " + error.message);
           }
         }
       }

       // Every directory was removed (or was already gone).
       return Break();
     });

   return removal;
 }


 // Moves the process `pid` into `cgroup` by writing the pid to the
 // cgroup's 'cgroup.procs' file. Fails if the cgroup does not exist.
 Try<Nothing> assign(const string& cgroup, pid_t pid)
 {
   if (cgroups2::exists(cgroup)) {
     return cgroups2::write(cgroup, control::PROCESSES, stringify(pid));
   }

   return Error("Cgroup '" + cgroup + "' does not exist");
 }


 // Returns the cgroup that process `pid` belongs to, relative to the
 // cgroups2 root (no leading '/').
 //
 // Membership is read from /proc/{pid}/cgroup. For a process that lives in
 // /sys/fs/cgroup/foo/bar the file contains either of:
 //
 //   0::/foo/bar
 //   0::/foo/bar (deleted)
 //
 // Content that does not start with "0::/" is reported as a v1 membership
 // error.
 //
 // See: https://docs.kernel.org/admin-guide/cgroup-v2.html#processes
 // https://man7.org/linux/man-pages/man7/cgroups.7.html
 Try<string> cgroup(pid_t pid)
 {
   const string procFile = path::join("/proc", stringify(pid), "cgroup");
   if (!os::exists(procFile)) {
     return Error("'" + procFile + "' does not exist");
   }

   const Try<string> raw = os::read(procFile);
   if (raw.isError()) {
     return Error("Failed to read '" + procFile + "': " + raw.error());
   }

   const string membership = strings::trim(*raw);
   if (!strings::startsWith(membership, "0::/")) {
     return Error("process belongs to a v1 cgroup: " + membership);
   }

   // Strip the "0::/" marker first, then the optional " (deleted)" tail.
   const string withoutPrefix =
     strings::remove(membership, "0::/", strings::Mode::PREFIX);

   return strings::remove(withoutPrefix, " (deleted)", strings::Mode::SUFFIX);
 }


 // Collects the pids listed in the 'cgroup.procs' file of `cgroup` and,
 // when `recursive` is set, of every cgroup nested below it.
 //
 // Fails if `cgroup` does not exist, if the nested cgroups cannot be
 // listed, if a 'cgroup.procs' file of a still existing cgroup cannot be
 // read, or if a line cannot be parsed as a pid.
 Try<set<pid_t>> processes(const string& cgroup, bool recursive)
 {
   if (!cgroups2::exists(cgroup)) {
     return Error("Cgroup '" + cgroup + "' does not exist");
   }

   set<string> targets;
   targets.insert(cgroup);

   if (recursive) {
     Try<set<string>> nested = cgroups2::get(cgroup);
     if (nested.isError()) {
       return Error("Failed to list cgroups: " + nested.error());
     }
     targets.insert(nested->begin(), nested->end());
   }

   set<pid_t> result;

   for (const string& target : targets) {
     const Try<string> procs =
       cgroups2::read<string>(target, control::PROCESSES);

     if (procs.isError()) {
       // A cgroup may disappear between being listed and being read;
       // such races are ignored.
       if (!exists(target)) {
         continue;
       }

       return Error("Failed to read cgroup.procs in '" + target + "': "
                    + procs.error());
     }

     const vector<string> lines = strings::split(*procs, "\n");
     for (const string& line : lines) {
       if (line.empty()) {
         continue;
       }

       const Try<pid_t> pid = numify<pid_t>(line);
       if (pid.isError()) {
         return Error(
             "Failed to parse '" + line + "' as a pid: " + pid.error());
       }

       result.insert(*pid);
     }
   }

   return result;
 }


 // Returns the thread ids listed in the 'cgroup.threads' file of `cgroup`,
 // one id per non-empty line. Fails if the file cannot be read or a line
 // cannot be parsed.
 Try<set<pid_t>> threads(const string& cgroup)
 {
   const Try<string> raw = cgroups2::read<string>(cgroup, control::THREADS);
   if (raw.isError()) {
     return Error("Failed to read 'cgroup.threads' in '"
                  + cgroup + "': " + raw.error());
   }

   set<pid_t> result;

   const vector<string> lines = strings::split(*raw, "\n");
   for (const string& line : lines) {
     if (line.empty()) {
       continue;
     }

     const Try<pid_t> tid = numify<pid_t>(line);
     if (tid.isError()) {
       return Error("Failed to parse '" + line + "' as a tid: " + tid.error());
     }

     result.insert(*tid);
   }

   return result;
 }


 // Maps a cgroup name to a filesystem path. A name that already begins
 // with '/' is returned untouched; anything else (including the empty
 // name) is joined onto MOUNT_POINT.
 string path(const string& cgroup)
 {
   const bool absolute = !cgroup.empty() && cgroup.front() == '/';
   if (absolute) {
     return cgroup;
   }

   return path::join(cgroups2::MOUNT_POINT, cgroup);
 }

 namespace controllers {

 // Returns the controller names listed in the 'cgroup.controllers' file of
 // `cgroup` (a space separated list). Blank content yields an empty set.
 Try<set<string>> available(const string& cgroup)
 {
   const Try<string> raw =
     cgroups2::read<string>(cgroup, cgroups2::control::CONTROLLERS);

   if (raw.isError()) {
     return Error("Failed to read cgroup.controllers in '" + cgroup + "': "
                  + raw.error());
   }

   set<string> result;

   // Drop surrounding whitespace, notably the trailing newline.
   const string listing = strings::trim(*raw);
   if (!listing.empty()) {
     vector<string> names = strings::split(listing, " ");
     for (string& name : names) {
       result.insert(std::move(name));
     }
   }

   return result;
 }


 // Enables `controllers` for the children of `cgroup`: reads the current
 // 'cgroup.subtree_control' state, marks the given controllers as enabled
 // and writes the state back.
 Try<Nothing> enable(const string& cgroup, const set<string>& controllers)
 {
   namespace subtree = cgroups2::control::subtree_control;

   Try<subtree::State> state = subtree::read(cgroup);
   if (state.isError()) {
     return Error(state.error());
   }

   state->enable(controllers);

   return subtree::write(cgroup, *state);
 }


 // Disables `controllers` for the children of `cgroup`: reads the current
 // 'cgroup.subtree_control' state, marks the given controllers as disabled
 // and writes the state back.
 Try<Nothing> disable(const string& cgroup, const set<string>& controllers)
 {
   namespace subtree = cgroups2::control::subtree_control;

   Try<subtree::State> state = subtree::read(cgroup);
   if (state.isError()) {
     return Error(state.error());
   }

   state->disable(controllers);

   return subtree::write(cgroup, *state);
 }


 // Returns the controllers currently listed in the
 // 'cgroup.subtree_control' file of `cgroup`.
 Try<set<string>> enabled(const string& cgroup)
 {
   const Try<string> raw =
     cgroups2::read<string>(cgroup, cgroups2::control::SUBTREE_CONTROLLERS);

   if (raw.isError()) {
     return Error("Failed to read 'cgroup.subtree_control' in '" + cgroup
                  + "': " + raw.error());
   }

   return control::subtree_control::State::parse(*raw).enabled();
 }

 } // namespace controllers {

 namespace cpu {

 // Builds a bandwidth limit that grants `_limit` of CPU time per `_period`.
 BandwidthLimit::BandwidthLimit(Duration _limit, Duration _period)
   : limit(_limit), period(_period)
 {
 }


 // Parses the contents of a 'cpu.max' file.
 //
 // Format
 // -----------------------------
 // $MAX $PERIOD
 // -----------------------------
 // $MAX        Maximum CPU time, in microseconds, processes in the cgroup can
 //             collectively use during one $PERIOD. If set to "max" then there
 //             is no limit.
 //
 // $PERIOD     Length of one period, in microseconds.
 //
 // When $MAX is "max" a default constructed BandwidthLimit is returned and
 // $PERIOD is not looked at.
 Try<BandwidthLimit> parse_bandwidth(const string& content)
 {
   const vector<string> fields = strings::split(strings::trim(content), " ");
   if (fields.size() != 2) {
     return Error("Expected format '$MAX $PERIOD' but received '"
                  + content + "'");
   }

   const string& quota = fields[0];
   const string& window = fields[1];

   if (quota == "max") {
     return cpu::BandwidthLimit();
   }

   // Both values are plain microsecond counts, so a unit suffix is added
   // before handing them to the Duration parser.
   const Try<Duration> limit = Duration::parse(quota + "us");
   if (limit.isError()) {
     return Error("Failed to parse cpu.max's limit of '" + quota + "': "
                  + limit.error());
   }

   const Try<Duration> period = Duration::parse(window + "us");
   if (period.isError()) {
     return Error("Failed to parse cpu.max's period of '" + window + "': "
                  + period.error());
   }

   return BandwidthLimit(*limit, *period);
 }

 namespace control {

 // Names of the cpu controller's interface files, relative to a cgroup's
 // directory.
 const std::string IDLE = "cpu.idle";
 const std::string MAX = "cpu.max";
 const std::string MAX_BURST = "cpu.max.burst";
 const std::string PRESSURE = "cpu.pressure";
 const std::string STATS = "cpu.stat";
 const std::string UCLAMP_MAX = "cpu.uclamp.max";
 const std::string UCLAMP_MIN = "cpu.uclamp.min";
 const std::string WEIGHT = "cpu.weight";
 const std::string WEIGHT_NICE = "cpu.weight.nice";

 namespace stat {

 // Parses the contents of a 'cpu.stat' file: one "<key> <value>" pair per
 // line, every value an unsigned integer. Empty lines are skipped and
 // unrecognized keys are ignored. Keys ending in "_usec" are stored as
 // durations, the "nr_" keys as plain counts.
 Try<Stats> parse(const string& content)
 {
   cpu::Stats result;

   const vector<string> entries = strings::split(content, "\n");
   for (const string& entry : entries) {
     if (entry.empty()) {
       continue;
     }

     const vector<string> parts = strings::split(entry, " ");
     if (parts.size() != 2) {
       return Error("Invalid line format in 'cpu.stat' expected "
                    "<key> <value> received: '" + entry + "'");
     }

     const string& name = parts[0];

     const Try<uint64_t> parsed = numify<uint64_t>(parts[1]);
     if (parsed.isError()) {
       return Error("Failed to parse '" + name + "': " + parsed.error());
     }

     const uint64_t count = *parsed;
     const Duration elapsed = Microseconds(static_cast<int64_t>(count));

     if (name == "usage_usec") {
       result.usage = elapsed;
     } else if (name == "user_usec") {
       result.user_time = elapsed;
     } else if (name == "system_usec") {
       result.system_time = elapsed;
     } else if (name == "nr_periods") {
       result.periods = count;
     } else if (name == "nr_throttled") {
       result.throttled = count;
     } else if (name == "throttled_usec") {
       result.throttle_time = elapsed;
     } else if (name == "nr_burst") {
       result.bursts = count;
     } else if (name == "burst_usec") {
       result.bursts_time = elapsed;
     }
   }

   return result;
 }

 } // namespace stat {

 } // namespace control {


 // Writes `weight` to the 'cpu.weight' file of `cgroup`. Not supported for
 // the root cgroup.
 Try<Nothing> weight(const string& cgroup, uint64_t weight)
 {
   if (cgroup != ROOT_CGROUP) {
     return cgroups2::write(cgroup, cpu::control::WEIGHT, weight);
   }

   return Error("Operation not supported for the root cgroup");
 }


 // Reads the 'cpu.weight' file of `cgroup` as a number. Not supported for
 // the root cgroup.
 Try<uint64_t> weight(const string& cgroup)
 {
   if (cgroup != ROOT_CGROUP) {
     return cgroups2::read<uint64_t>(cgroup, cpu::control::WEIGHT);
   }

   return Error("Operation not supported for the root cgroup");
 }


 // Reads and parses the 'cpu.stat' file of `cgroup`.
 Try<cpu::Stats> stats(const string& cgroup)
 {
   const Try<string> raw =
     cgroups2::read<string>(cgroup, cgroups2::cpu::control::STATS);

   if (raw.isSome()) {
     return cpu::control::stat::parse(*raw);
   }

   return Error("Failed to read 'cpu.stat' for the cgroup '" + cgroup + "': "
                + raw.error());
 }


 // Writes `limit` to the 'cpu.max' file of `cgroup`. Not supported for the
 // root cgroup.
 //
 // A limit without a quota is written as "max". Otherwise both quota and
 // period must be present, non-negative and whole multiples of a
 // microsecond; they are written as "<quota us> <period us>".
 Try<Nothing> set_max(const string& cgroup, const cpu::BandwidthLimit& limit)
 {
   if (cgroup == ROOT_CGROUP) {
     return Error("Operation not supported for the root cgroup");
   }

   // No quota: unlimited bandwidth, the period is not consulted.
   if (limit.limit.isNone()) {
     return cgroups2::write(cgroup, cpu::control::MAX, "max");
   }

   if (limit.period.isNone()) {
     return Error("Invalid bandwidth limit: period can only be None"
                  " for a limitless bandwidth limit");
   }

   const Duration& quota = *limit.limit;
   const Duration& period = *limit.period;

   const bool negative = period.ns() < 0 || quota.ns() < 0;
   const bool fractional = period.ns() % 1000 > 0 || quota.ns() % 1000 > 0;

   if (negative || fractional) {
     return Error("Invalid bandwidth limit: period and limit must be"
                  " positive and microsecond level granularity, received"
                  " period=" + stringify(period)
                  + " limit=" + stringify(quota));
   }

   const string value =
     stringify(static_cast<uint64_t>(quota.us()))
     + " "
     + stringify(static_cast<uint64_t>(period.us()));

   return cgroups2::write(cgroup, cpu::control::MAX, value);
 }


 // Reads the 'cpu.max' file of `cgroup` and parses it into a bandwidth
 // limit. Not supported for the root cgroup.
 //
 // Fails if the file cannot be read or its contents cannot be parsed by
 // `parse_bandwidth`.
 Try<cpu::BandwidthLimit> max(const string& cgroup)
 {
   if (cgroup == ROOT_CGROUP) {
     return Error("Operation not supported for the root cgroup");
   }

   Try<string> content = cgroups2::read<string>(cgroup, cpu::control::MAX);
   if (content.isError()) {
     return Error("Failed to read 'cpu.max' for cgroup '" + cgroup + "': "
                  + content.error());
   }

   Try<BandwidthLimit> limit = parse_bandwidth(*content);
   if (limit.isError()) {
     return Error("Failed to parse '" + *content + "' as a bandwidth limit: "
                  + limit.error());
   }

   return *limit;
 }

 } // namespace cpu {

 namespace memory {

 namespace internal {

 // Parses a byte limit read from a control file.
 //
 // The text is trimmed first. "max" means "no limit" and yields None; any
 // other text must be an unsigned 64-bit integer giving a number of bytes.
 Result<Bytes> parse_bytelimit(const string& value)
 {
   const string text = strings::trim(value);

   if (text == "max") {
     return None();
   }

   const Try<uint64_t> count = numify<uint64_t>(text);
   if (count.isSome()) {
     return Bytes(*count);
   }

   return Error("Failed to numify '" + text + "': " + count.error());
 }

 } // namespace internal {


 namespace control {

 // Names of the memory controller's interface files, relative to a
 // cgroup's directory.
 const string CURRENT = "memory.current";
 const string EVENTS = "memory.events";
 const string LOW = "memory.low";
 const string HIGH = "memory.high";
 const string MAX = "memory.max";
 const string MIN = "memory.min";
 const string STAT = "memory.stat";

 namespace stat {

 // Parses the contents of a 'memory.stat' file: one "<key> <value>" pair
 // per line, every value an unsigned byte count. Empty lines are skipped
 // and unrecognized keys are ignored.
 //
 // If the file has no "kernel" line, Stats::kernel is filled in as the sum
 // of kernel_stack, pagetables, sock, vmalloc and slab.
 Try<Stats> parse(const string& content)
 {
   Stats result;
   bool sawKernel = false;

   const vector<string> entries = strings::split(content, "\n");
   for (const string& entry : entries) {
     if (entry.empty()) {
       continue;
     }

     const vector<string> parts = strings::split(entry, " ");
     if (parts.size() != 2) {
       return Error("Invalid line format in 'memory.stat'; expected "
                    "<key> <value> received: '" + entry + "'");
     }

     const string& name = parts[0];
     const string& text = parts[1];

     const Try<uint64_t> parsed = numify<uint64_t>(text);
     if (parsed.isError()) {
       return Error("Failed to numify '" + text + "': " + parsed.error());
     }

     const Bytes amount(*parsed);

     if (name == "anon") {
       result.anon = amount;
     } else if (name == "file") {
       result.file = amount;
     } else if (name == "kernel") {
       result.kernel = amount;
       sawKernel = true;
     } else if (name == "kernel_stack") {
       result.kernel_stack = amount;
     } else if (name == "pagetables") {
       result.pagetables = amount;
     } else if (name == "sock") {
       result.sock = amount;
     } else if (name == "vmalloc") {
       result.vmalloc = amount;
     } else if (name == "file_mapped") {
       result.file_mapped = amount;
     } else if (name == "slab") {
       result.slab = amount;
     } else if (name == "unevictable") {
       result.unevictable = amount;
     }
   }

   // See Stats::kernel for an explanation of why this can be missing
   // and why we fill it in using these sub-metrics:
   if (!sawKernel) {
     result.kernel = result.kernel_stack
       + result.pagetables
       + result.sock
       + result.vmalloc
       + result.slab;
   }

   return result;
 }

 } // namespace stat {

 } // namespace control {

 namespace events {

 // Parses the contents of a 'memory.events' file: one "<key> <value>" pair
 // per line, every value an unsigned counter. Empty lines are skipped and
 // unrecognized keys are ignored.
 Try<Events> parse(const string& content)
 {
   Events counters;

   const vector<string> entries = strings::split(content, "\n");
   for (const string& entry : entries) {
     if (entry.empty()) {
       continue;
     }

     const vector<string> parts = strings::split(entry, " ");
     if (parts.size() != 2) {
       return Error("Invalid line format in 'memory.events' expected "
                    "<key> <value> received: '" + entry + "'");
     }

     const string& name = parts[0];
     const string& text = parts[1];

     const Try<uint64_t> parsed = numify<uint64_t>(text);
     if (parsed.isError()) {
       return Error("Failed to numify '" + text + "': " + parsed.error());
     }

     const uint64_t count = *parsed;

     if (name == "low") {
       counters.low = count;
     } else if (name == "high") {
       counters.high = count;
     } else if (name == "max") {
       counters.max = count;
     } else if (name == "oom") {
       counters.oom = count;
     } else if (name == "oom_kill") {
       counters.oom_kill = count;
     } else if (name == "oom_group_kill") {
       counters.oom_group_kill = count;
     }
   }

   return counters;
 }

 } // namespace events {


 // Process that watches the 'memory.events' files of cgroups and completes
 // a future per cgroup once that file reports a non-zero `oom` count.
 //
 // One promise is kept per watched file in `ooms`. A promise leaves the map
 // when it is satisfied, failed, or when its future is discarded by the
 // caller.
 class OomListenerProcess : public process::Process<OomListenerProcess>
 {
 public:
   // Stores `_watcher` and gives the process a generated id with the
   // "oom-listener" prefix.
   OomListenerProcess(const Watcher& _watcher)
     : ProcessBase(process::ID::generate("oom-listener")), watcher(_watcher) {}

   // Starts the loop that consumes watcher events. Write events trigger a
   // re-read of the file they refer to, a Failure event terminates the loop
   // and every other event type is ignored.
   void initialize() override
   {
     event_loop = loop(
         self(),
         [this]() {
           return watcher.events().get();
         },
         [this](const Watcher::Event& event) -> Future<ControlFlow<Nothing>> {
           if (event.type == Watcher::Event::Failure) {
             // event.path contains error message for Failure events.
             return Failure("Watcher failed: " + event.path);
           }

           if (!(event.type == Watcher::Event::Write)) {
             return Continue();
           }

           read_events(event.path);
           return Continue();
         });

     // However the loop ends, nothing will read events any more, so every
     // outstanding promise is failed with a reason describing how the loop
     // ended.
     event_loop
       .onAny(defer(self(), [this](const Future<Nothing>& f) {
         if (f.isFailed())     fail("Read loop has terminated: " + f.failure());
         if (f.isDiscarded())  fail("Read loop has terminated: discarded");
         if (f.isReady())      fail("Read loop has terminated: future is ready");
         if (f.isAbandoned())  fail("Read loop has terminated: abandoned");
       }));
   }

   // Stops the event loop and fails whatever promises are still pending.
   void finalize() override
   {
     event_loop.discard();

     // Must explicitly fail all remaining oom futures because we
     // are already in finalize, so we can't dispatch into the
     // process in the event_loop's onAny handler.
     fail("OomListenerProcess is terminating");
   }

   // Starts listening for an oom in `cgroup`. The returned future becomes
   // ready once the cgroup's 'memory.events' file shows `oom` > 0.
   //
   // Fails if the cgroup is already being listened to or if its events
   // file cannot be added to the watcher. Discarding the returned future
   // stops listening.
   Future<Nothing> listen(const string& cgroup)
   {
     string events_path = path::join(cgroups2::path(cgroup), control::EVENTS);
     if (ooms.contains(events_path)) {
       return Failure("Already listening");
     }

     Try<Nothing> add = watcher.add(events_path);
     if (add.isError()) {
       return Failure("Failed to add file to watcher: " + add.error());
     }

     Promise<Nothing> promise;
     Future<Nothing> future = promise.future();

     ooms.emplace(events_path, std::move(promise));

     // On discard: drop the promise from the map, stop watching the file
     // and mark the promise as discarded. Runs in this process's context
     // via defer.
     future
       .onDiscard(defer(self(), [this, events_path]() {
         auto it = ooms.find(events_path);
         if (it == ooms.end()) {
           return; // Already removed.
         }

         Promise<Nothing> promise = std::move(it->second);
         ooms.erase(events_path);

         // Ignoring remove failures since caller doesn't care about the file
         // anyway now.
         watcher.remove(events_path);
         promise.discard();
       }));

     // Read the events file after adding to watcher in case an oom event
     // occurred before the add was complete.
     read_events(events_path);

     return future;
   }

   // Re-reads the events file at `path` and settles its promise if
   // possible: failed when the file cannot be read or parsed, satisfied
   // when `oom` > 0, left pending otherwise. Does nothing for a path that
   // is not being listened to.
   //
   // NOTE(review): a promise settled here is erased from `ooms`, but the
   // path is not removed from the watcher in this function; confirm
   // whether that is intended.
   void read_events(const string& path)
   {
     auto it = ooms.find(path);
     if (it == ooms.end()) {
       return;
     }

     Try<string> content = os::read(path);
     if (content.isError()) {
       it->second.fail("Failed to read 'memory.events': " + content.error());
       ooms.erase(it);
       return;
     }

     Try<Events> events = events::parse(strings::trim(*content));
     if (events.isError()) {
       it->second.fail("Failed to parse 'memory.events': " + events.error());
       ooms.erase(it);
       return;
     }

     // Only the `oom` counter is consulted here.
     if (events->oom > 0) {
       it->second.set(Nothing());
       ooms.erase(it);
       return;
     }
   }

   // Fails every pending promise with `reason` and empties the map.
   void fail(const string& reason)
   {
     foreachvalue (Promise<Nothing>& promise, ooms) {
       promise.fail(reason);
     }
     ooms.clear();
   }

 private:
   // A map of cgroup memory.event file names to their respective futures.
   hashmap<string, Promise<Nothing>> ooms;

   // Future of the event-reading loop started in initialize().
   Future<Nothing> event_loop;

   // Source of the file events consumed by the loop.
   Watcher watcher;
 };


 // Defaulted move constructor: the members, including the owned process
 // pointer, are moved from the source listener.
 OomListener::OomListener(OomListener&&) = default;


 // Defaulted move assignment: the members, including the owned process
 // pointer, are move-assigned from the source listener.
 OomListener& OomListener::operator=(OomListener&&) = default;


 // Creates a listener backed by a freshly created file watcher. Fails if
 // the watcher cannot be created.
 Try<OomListener> OomListener::create()
 {
   Try<Watcher> watcher = process::io::create_watcher();
   if (watcher.isSome()) {
     unique_ptr<OomListenerProcess> listener(new OomListenerProcess(*watcher));
     return OomListener(std::move(listener));
   }

   return Error("Failed to create watcher: " + watcher.error());
 }


 // Takes ownership of `_process` and spawns it.
 OomListener::OomListener(unique_ptr<OomListenerProcess>&& _process)
   : process(std::move(_process))
 {
   spawn(*process);
 }


 // Terminates the owned process and waits for it to finish. A listener
 // that no longer owns a process (for instance after being moved from)
 // has nothing to shut down.
 OomListener::~OomListener()
 {
   if (!process) {
     return;
   }

   terminate(*process);
   process::wait(*process);
 }


 // Forwards the request to the owned process; see
 // OomListenerProcess::listen for the contract of the returned future.
 Future<Nothing> OomListener::listen(const string& cgroup)
 {
   Future<Nothing> oom =
     dispatch(*process, &OomListenerProcess::listen, cgroup);

   return oom;
 }


 // Returns the value of the 'memory.current' file of `cgroup` as bytes.
 Try<Bytes> usage(const string& cgroup)
 {
   const Try<uint64_t> current =
     cgroups2::read<uint64_t>(cgroup, memory::control::CURRENT);

   if (current.isSome()) {
     return Bytes(*current);
   }

   return Error("Failed to read 'memory.current': " + current.error());
 }


 // Writes `bytes` to the 'memory.low' file of `cgroup`.
 Try<Nothing> set_low(const string& cgroup, const Bytes& bytes)
 {
   const uint64_t amount = bytes.bytes();
   return cgroups2::write(cgroup, control::LOW, amount);
 }


 // Returns the value of the 'memory.low' file of `cgroup` as bytes. The
 // file content must be numeric.
 Try<Bytes> low(const string& cgroup)
 {
   const Try<uint64_t> value = cgroups2::read<uint64_t>(cgroup, control::LOW);

   if (value.isSome()) {
     return Bytes(*value);
   }

   return Error("Failed to read 'memory.low': " + value.error());
 }


 // Writes the byte count of `bytes` to the 'memory.min' control of `cgroup`.
 Try<Nothing> set_min(const string& cgroup, const Bytes& bytes)
 {
   const auto value = bytes.bytes();

   return cgroups2::write(cgroup, control::MIN, value);
 }


 // Reads the 'memory.min' control of `cgroup` and returns its value as
 // `Bytes`, or an error if the control could not be read.
 Try<Bytes> min(const string& cgroup)
 {
   Try<uint64_t> value = cgroups2::read<uint64_t>(cgroup, control::MIN);

   if (value.isSome()) {
     return Bytes(value.get());
   }

   return Error("Failed to read 'memory.min': " + value.error());
 }


 // Writes `limit` to the 'memory.max' control of `cgroup`. A `None` limit is
 // written as the literal "max"; otherwise the byte count is written.
 Try<Nothing> set_max(const string& cgroup, const Option<Bytes>& limit)
 {
   string value = "max";

   if (limit.isSome()) {
     value = stringify(limit->bytes());
   }

   return cgroups2::write(cgroup, control::MAX, value);
 }


 // Reads the 'memory.max' control of `cgroup` as a string and hands it to
 // `internal::parse_bytelimit`. Returns an error if the read fails.
 Result<Bytes> max(const string& cgroup)
 {
   Try<string> raw = cgroups2::read<string>(cgroup, control::MAX);

   if (raw.isSome()) {
     return internal::parse_bytelimit(raw.get());
   }

   return Error("Failed to read 'memory.max': " + raw.error());
 }


 // Writes `limit` to the 'memory.high' control of `cgroup`. A `None` limit is
 // written as the literal "max"; otherwise the byte count is written.
 Try<Nothing> set_high(const string& cgroup, const Option<Bytes>& limit)
 {
   string value = "max";

   if (limit.isSome()) {
     value = stringify(limit->bytes());
   }

   return cgroups2::write(cgroup, control::HIGH, value);
 }


 // Reads the 'memory.high' control of `cgroup` as a string and hands it to
 // `internal::parse_bytelimit`. Returns an error if the read fails.
 Result<Bytes> high(const string& cgroup)
 {
   Try<string> raw = cgroups2::read<string>(cgroup, control::HIGH);

   if (raw.isSome()) {
     return internal::parse_bytelimit(raw.get());
   }

   return Error("Failed to read 'memory.high': " + raw.error());
 }


 // Reads the 'memory.stat' control of `cgroup` as a string and parses it with
 // `control::stat::parse`. Returns an error if the read fails.
 Try<Stats> stats(const string& cgroup)
 {
   Try<string> raw = cgroups2::read<string>(cgroup, control::STAT);

   if (raw.isSome()) {
     return control::stat::parse(raw.get());
   }

   return Error("Failed to read 'memory.stat': " + raw.error());
 }

 } // namespace memory {

 namespace devices {

 // Utility class to construct an eBPF program of type
 // BPF_PROG_TYPE_CGROUP_DEVICE that allows or denies device accesses based on
 // an allow list and a deny list of entries.
 class DeviceProgram
 {
 public:
   // Builds the device program for the given `allow` and `deny` lists.
   // Every return path of this function returns the assembled program.
   //
   // We will generate one allow block for each entry in the allow list
   // and one deny block for each entry in the deny list.
   //
   // There are special cases for catch-all entries and empty allow lists.
   // They exist to avoid generating unreachable code, which is prohibited
   // by the verifier:
   // 1. If we have a catch-all in the allow entries, we will not generate any
   //    code for allow section, since there is no need to check if any entries
   //    match anything in allow.
   // 2. If we have a catch-all in the deny entries, we will immediately return
   //    with the deny value still in R0 to indicate that access is denied.
   // 3. If we have an empty allow list, we will immediately return with the deny
   //    value still in R0 to indicate that access is denied.
   //
   // ---------------------------------------------------------------------------
   // Normal code flow
   // +-------------------------------------------------------------+
   // |Initialize R0 to deny value                                  |
   // +-------------------------------------------------------------+-----------+
   // |Allow Block                                                  |           |
   // |Check each register, jump to next block if there is no match |           |
   // |                                                             |           |
   // |If each register has matched, jump over the exit instruction |           |
   // |at the end of allow blocks and go to start of deny blocks    |           |
   // +-------------------------------------------------------------+ Allow     |
   // |                                                             | Section   |
   // |Other allow blocks...                                        |           |
   // |                                                             |           |
   // +-------------------------------------------------------------+           |
   // |Exit instruction and deny access, a match in any             |           |
   // |allow block will jump over this instruction                  |           |
   // +-------------------------------------------------------------+-----------+
   // |Deny Block                                                   |           |
   // |                                                             |           |
   // |Check each register, jump to next block if there is no match |           |
   // |                                                             |           |
   // |If each register is matched, exit immediately as we have     |           |
   // |the deny value stored in result register R0                  |           |
   // +-------------------------------------------------------------+ Deny      |
   // |                                                             | Section   |
   // |Other deny blocks...                                         |           |
   // |                                                             |           |
   // +-------------------------------------------------------------+           |
   // |Exit instruction to allow access because to reach this       |           |
   // |point, there must have been a match in allow, and no         |           |
   // |matches in deny                                              |           |
   // +-------------------------------------------------------------+-----------+
   // ----------------------------------END--------------------------------------
   //
   // The code in special case 1 (allow catch-all):
   // +-------------------------------------------------------------+
   // |Initialize R0 to deny value                                  |
   // +-------------------------------------------------------------+-----------+
   // |Deny Block                                                   |           |
   // |                                                             |           |
   // |Check each register, jump to next block if there is no match |           |
   // |                                                             |           |
   // |If each register is matched, exit immediately as we have     |           |
   // |the deny value stored in result register R0                  |           |
   // +-------------------------------------------------------------+ Deny      |
   // |                                                             | Section   |
   // |Other deny blocks...                                         |           |
   // |                                                             |           |
   // +-------------------------------------------------------------+           |
   // |Exit instruction to allow access because to reach this       |           |
   // |point, allow contains a catch-all, and there were no         |           |
   // |matches in deny                                              |           |
   // +-------------------------------------------------------------+-----------+
   // ----------------------------------END--------------------------------------
   //
   // The code in special cases 2 (deny catch-all) and 3 (empty allow):
   // +-------------------------------------------------------------+
   // |Initialize R0 to deny value                                  |
   // +-------------------------------------------------------------+
   // |Exit instruction to deny access                              |
   // +-------------------------------------------------------------+
   static Try<ebpf::Program> build(
       const vector<Entry>& allow,
       const vector<Entry>& deny)
   {
     // The BPF_PROG_TYPE_CGROUP_DEVICE program takes in
     // `struct bpf_cgroup_dev_ctx*` as input (in r1). We extract the fields
     // into registers r2-5.
     //
     // The device type is encoded in the low 16 bits of `access_type` (hence
     // the `& 0xFFFF` mask) and the access type is encoded in the high 16 bits
     // of `access_type` (hence the right shift by 16).
     ebpf::Program program = ebpf::Program(BPF_PROG_TYPE_CGROUP_DEVICE);
     program.append({
       // r2: Type ('c', 'b', '?')
       BPF_LDX_MEM(
         BPF_W, BPF_REG_2, BPF_REG_1, offsetof(bpf_cgroup_dev_ctx, access_type)),
       BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF),
       // r3: Access ('r', 'w', 'm')
       BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
         offsetof(bpf_cgroup_dev_ctx, access_type)),
       BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16),
       // r4: Major number (`major` field of the context)
       BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1,
         offsetof(bpf_cgroup_dev_ctx, major)),
       // r5: Minor number (`minor` field of the context)
       BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1,
         offsetof(bpf_cgroup_dev_ctx, minor)),
     });

     // Initialize result register R0 to deny access so we can immediately
     // exit if there is a match in a deny entry, or if there is no match
     // in the allow entries.
     program.append({BPF_MOV64_IMM(BPF_REG_0, DENY_ACCESS)});

     // Special case 2. We deny access and exit if there's a catch-all in deny.
     foreach (const Entry& entry, deny) {
       if (entry.is_catch_all()) {
         program.append({BPF_EXIT_INSN()});
         return program;
       }
     }

     // Special case 3. We deny access and exit if we see nothing in allow.
     if (allow.empty()) {
       program.append({BPF_EXIT_INSN()});
       return program;
     }

     // The trailers are produced by lambdas so that their instruction counts
     // can be queried with `.size()` when computing jump offsets below.

     // Ends an allow block: an unconditional jump of the given offset, used to
     // skip ahead to the start of the deny section once an allow entry matched.
     auto allow_block_trailer = [](short jmp_size_to_deny_section) {
       return vector<bpf_insn>({BPF_JMP_A(jmp_size_to_deny_section)});
     };
     // Ends the allow section: exit with the deny value still in R0.
     auto allow_section_trailer = []() {
       return vector<bpf_insn>({BPF_EXIT_INSN()});
     };
     // Ends a deny block: exit with the deny value still in R0.
     auto deny_block_trailer = []() {
       return vector<bpf_insn>({BPF_EXIT_INSN()});
     };
     // Ends the deny section: set R0 to the allow value and exit.
     auto deny_section_trailer = []() {
       return vector<bpf_insn>({
         BPF_MOV64_IMM(BPF_REG_0, ALLOW_ACCESS),
         BPF_EXIT_INSN(),
       });
     };

     // Whether any allow entry is a catch-all (special case 1).
     bool allow_catch_all = [&allow]() {
       foreach (const Entry& entry, allow) {
         if (entry.is_catch_all()) {
           return true;
         }
       }
       return false;
     }();

     // We will only add the code for the allow section if there is no catch-all
     // allow entry present. If there is a catch-all, we will skip everything
     // in the allow section, including exit instruction at the end,
     // since we just need to check if the device is explicitly denied.
     if (!allow_catch_all) {
       // We calculate the total jump distance to skip over trailer instructions
       // at the end of the allow section, we initialize jump size to length of
       // said instructions, then add the lengths of individual allow blocks.
       short start_of_deny_jmp_size = allow_section_trailer().size();
       vector<vector<bpf_insn>> allow_device_check_blocks = {};
       short allow_block_trailer_size = allow_block_trailer(0).size();

       // First pass: generate the check instructions of every allow block and
       // accumulate the total size of the allow section.
       foreach (const Entry& entry, allow) {
         vector<bpf_insn> allow_block = add_device_checks(
             entry, allow_block_trailer_size, DeviceCheckType::ALLOW);
         allow_device_check_blocks.push_back(allow_block);

         start_of_deny_jmp_size += allow_block.size() + allow_block_trailer_size;
       }

       // Second pass: emit each block followed by its jump trailer. The size
       // of the current block (checks + trailer) is subtracted first, so that
       // the remaining value is the number of instructions between the end of
       // this block's trailer and the start of the deny section. The size must
       // be read before the block is moved into the program.
       foreach (vector<bpf_insn>& allow_block, allow_device_check_blocks) {
         start_of_deny_jmp_size -=
           (allow_block.size() + allow_block_trailer_size);
         program.append(std::move(allow_block));
         program.append(allow_block_trailer(start_of_deny_jmp_size));
       }

       // If this instruction is executed, then there is no match in allow
       // so we can deny access.
       program.append(allow_section_trailer());
     }

     // Get the deny block device check code.
     // We are either following the normal code flow or special case 1 (see
     // diagram above) if we reached this section.
     foreach (const Entry& entry, deny) {
       program.append(add_device_checks(
           entry, deny_block_trailer().size(), DeviceCheckType::DENY));
       program.append(deny_block_trailer());
     }

     // To reach this block, we must have matched with an entry in allow
     // to jump over the exit instruction at the end of allow blocks,
     // or there is a catch-all in allow. We will also have to have not
     // matched with any of the deny entries to avoid their exit instructions.
     // Meaning that the device is on the allow list, and not on the deny list.
     // Hence, we grant them access.
     program.append(deny_section_trailer());

     return program;
   }

 private:
   // Selects which flavor of access check `add_device_checks` generates.
   enum DeviceCheckType
   {
     ALLOW,
     DENY
   };

   // Generates the check instructions (without the trailer) of a single allow
   // or deny block for `entry`.
   //
   // `trailer_length` is the number of instructions the caller appends after
   // the returned block; it is needed so that a failed check can jump past
   // both the remaining checks and the trailer. `device_check_type` selects
   // the allow or deny variant of the access check.
   //
   // Returns an empty vector for a catch-all entry.
   static vector<bpf_insn> add_device_checks(
       const Entry& entry,
       short trailer_length,
       DeviceCheckType device_check_type)
   {
     // We create a block of bytecode with the format:
     // 1. Major Number Check
     // 2. Minor Number Check
     // 3. Type Check
     // 4. Access Check
     // 5. Trailer (caller-generated)
     //  5a. If block is an allow block, we jump to the start of deny blocks
     //  5b. If block is a deny block, we exit immediately
     //
     // Either:
     // 1. The device access is matched by (1,2,3,4) and the Allow/Deny trailer
     //    code is executed.
     // 2. One of (1,2,3,4) does not match the requested access and we skip
     //    the rest of the current block

     if (entry.is_catch_all()) {
       return {};
     }

     // Skip `jmp_size` instructions if the requested major number (r4)
     // differs from `major`.
     auto check_major_instructions = [](short jmp_size, int major) {
       return vector<bpf_insn>({
           BPF_JMP_IMM(BPF_JNE, BPF_REG_4, major, jmp_size),
         });
     };

     // Skip `jmp_size` instructions if the requested minor number (r5)
     // differs from `minor`.
     auto check_minor_instructions = [](short jmp_size, int minor) {
       return vector<bpf_insn>({
         BPF_JMP_IMM(BPF_JNE, BPF_REG_5, minor, jmp_size)
       });
     };

     // Deny variant of the access check: the block is skipped when the
     // requested access (r3) shares no bit with the entry's access mask.
     //
     // r1 is used as scratch space here. This overwrites the context pointer,
     // whose fields `build` has already loaded into r2-r5.
     //
     // `jmp_size` is the offset that a jump placed at the first instruction of
     // this check would need. The actual jump is the third instruction, so its
     // offset is two less.
     auto check_deny_access_instructions =
       [](short jmp_size, const Entry::Access& access)
     {
       int bpf_access = 0;
       bpf_access |= access.read ? BPF_DEVCG_ACC_READ : 0;
       bpf_access |= access.write ? BPF_DEVCG_ACC_WRITE : 0;
       bpf_access |= access.mknod ? BPF_DEVCG_ACC_MKNOD : 0;
       return vector<bpf_insn>({
           BPF_MOV32_REG(BPF_REG_1, BPF_REG_3),
           BPF_ALU32_IMM(BPF_AND, BPF_REG_1, bpf_access),
           BPF_JMP_IMM(
             BPF_JEQ,
             BPF_REG_1,
             0,
             static_cast<short>(jmp_size - 2)),
       });
     };

     // Allow variant of the access check: the block is skipped when masking
     // the requested access (r3) with the entry's access mask changes it,
     // i.e. when some requested access bit is not covered by the entry.
     //
     // Uses r1 as scratch space and adjusts the jump offset by two for the
     // same reasons as the deny variant above.
     auto check_allow_access_instructions =
       [](short jmp_size, const Entry::Access& access)
     {
       int bpf_access = 0;
       bpf_access |= access.read ? BPF_DEVCG_ACC_READ : 0;
       bpf_access |= access.write ? BPF_DEVCG_ACC_WRITE : 0;
       bpf_access |= access.mknod ? BPF_DEVCG_ACC_MKNOD : 0;
       return vector<bpf_insn>({
           BPF_MOV32_REG(BPF_REG_1, BPF_REG_3),
           BPF_ALU32_IMM(BPF_AND, BPF_REG_1, bpf_access),
           BPF_JMP_REG(
             BPF_JNE,
             BPF_REG_1,
             BPF_REG_3,
             static_cast<short>(jmp_size - 2)),
         });
     };

     // Skip `jmp_size` instructions if the requested device type (r2) differs
     // from the selector's type. Must not be called for a selector of type
     // `ALL`; that case hits `UNREACHABLE()`.
     auto check_type_instructions =
       [](short jmp_size, const Entry::Selector& selector) -> vector<bpf_insn> {
       int bpf_type = [selector]() {
         switch (selector.type) {
           case Entry::Selector::Type::BLOCK:     return BPF_DEVCG_DEV_BLOCK;
           case Entry::Selector::Type::CHARACTER: return BPF_DEVCG_DEV_CHAR;
           case Entry::Selector::Type::ALL:       break;
         }
         UNREACHABLE();
       }();
       return {
           BPF_JMP_IMM(BPF_JNE, BPF_REG_2, bpf_type, jmp_size),
       };
     };

     const Entry::Selector& selector = entry.selector;
     const Entry::Access& access = entry.access;

     // A check is only emitted for fields the entry actually constrains: a
     // major / minor number that is set, a type other than `ALL`, and an
     // access set that does not already include every access kind.
     bool check_major = selector.major.isSome();
     bool check_minor = selector.minor.isSome();
     bool check_type = selector.type != Entry::Selector::Type::ALL;
     bool check_access = !access.mknod || !access.read || !access.write;

     // The jump sizes here correspond to the size of the bpf instructions
     // that each check adds to the program. The total size of the block is
     // the trailer length plus the total length of all checks. The lambdas
     // are invoked with dummy arguments purely to obtain instruction counts.
     size_t access_insn_size =
       device_check_type == DeviceCheckType::ALLOW
         ? check_allow_access_instructions(0, access).size()
         : check_deny_access_instructions(0, access).size();
     short nxt_blk_jmp_size = trailer_length
       + (check_major ? check_major_instructions(0, 0).size() : 0)
       + (check_minor ? check_minor_instructions(0, 0).size() : 0)
       + (check_access ? access_insn_size : 0)
       + (check_type ? check_type_instructions(0, selector).size() : 0);

     // We subtract one because the program counter will be one ahead when it
     // is executing the code in this code block, so we need to jump one less
     // instruction to land at the beginning of the next entry-block
     nxt_blk_jmp_size -= 1;

     vector<bpf_insn> device_check_block = {};

     // After each emitted check, `nxt_blk_jmp_size` is reduced by the number
     // of instructions just added, so it always holds the offset needed by a
     // jump located at the next instruction to be emitted.

     // 1. Check major number (r4) against entry.
     if (check_major) {
       vector<bpf_insn> insert_instructions =
         check_major_instructions(nxt_blk_jmp_size, (int)selector.major.get());
       foreach (const bpf_insn& insn, insert_instructions) {
         device_check_block.push_back(insn);
       }
       nxt_blk_jmp_size -= insert_instructions.size();
     }

     // 2. Check minor number (r5) against entry.
     if (check_minor) {
       vector<bpf_insn> insert_instructions =
         check_minor_instructions(nxt_blk_jmp_size, (int)selector.minor.get());
       foreach (const bpf_insn& insn, insert_instructions) {
         device_check_block.push_back(insn);
       }
       nxt_blk_jmp_size -= insert_instructions.size();
     }

     // 3. Check type (r2) against entry.
     if (check_type) {
       vector<bpf_insn> insert_instructions =
         check_type_instructions(nxt_blk_jmp_size, selector);
       foreach (const bpf_insn& insn, insert_instructions) {
         device_check_block.push_back(insn);
       }
       nxt_blk_jmp_size -= insert_instructions.size();
     }

     // 4. Check access (r3) against entry. This is the last check, so the
     // remaining jump size does not need to be updated afterwards.
     if (check_access) {
       vector<bpf_insn> insert_instructions =
         device_check_type == DeviceCheckType::ALLOW
           ? check_allow_access_instructions(nxt_blk_jmp_size, access)
           : check_deny_access_instructions(nxt_blk_jmp_size, access);
       foreach (const bpf_insn& insn, insert_instructions) {
         device_check_block.push_back(insn);
       }
     }

     return device_check_block;
   }

   // Values the generated program places in R0 before exiting.
   static const int ALLOW_ACCESS = 1;
   static const int DENY_ACCESS = 0;
 };


 // Builds a device program from the `allow` and `deny` lists and attaches it
 // to `cgroup`. Both lists must already be normalized (see `normalized`);
 // otherwise an error is returned without building anything. Errors from
 // program generation and from attaching are propagated with context.
 Try<Nothing> configure(
     const string& cgroup,
     const vector<Entry>& allow,
     const vector<Entry>& deny)
 {
   const bool lists_normalized = normalized(allow) && normalized(deny);
   if (!lists_normalized) {
     return Error(
         "Failed to validate arguments: allow or deny lists are not normalized");
   }

   Try<ebpf::Program> program = DeviceProgram::build(allow, deny);
   if (program.isError()) {
     return Error("Failed to generate device program: " + program.error());
   }

   Try<Nothing> attach = ebpf::cgroups2::attach(cgroup, program.get());
   if (attach.isError()) {
     return Error("Failed to attach BPF_PROG_TYPE_CGROUP_DEVICE program: " +
                  attach.error());
   }

   return Nothing();
 }


 bool normalized(const vector<Entry>& query)
 {
   auto has_empties = [](const vector<Entry>& entries) {
     foreach (const Entry& entry, entries) {
       if (entry.access.none()) {
         return true;
       }
     }
     return false;
   };

   if (has_empties(query)) {
     return false;
   }

   auto has_duplicate_selectors = [](const vector<Entry>& entries) {
     hashset<string> selectors;
     foreach (const Entry& entry, entries) {
       selectors.insert(stringify(entry.selector));
     }
     return selectors.size() != entries.size();
   };

   if (has_duplicate_selectors(query)) {
     return false;
   }

   auto has_encompassed_entries = [](const vector<Entry>& entries) {
     foreach (const Entry& entry, entries) {
       foreach (const Entry& other, entries) {
         if ((!cgroups::devices::operator==(entry, other))
             && entry.encompasses(other)) {
           return true;
         }
       }
     }
     return false;
   };

   if (has_encompassed_entries(query)) {
     return false;
   }

   return true;
 }


 // Produces a normalized copy of `to_normalize`:
 //   1. Entries with an empty access set are dropped.
 //   2. Entries with the same (stringified) selector are merged into the
 //      first occurrence, whose access set becomes the union of all of them.
 //   3. Entries encompassed by an entry with a different selector are dropped.
 // The result is checked against `normalized` before being returned.
 vector<Entry> normalize(const vector<Entry>& to_normalize)
 {
   // Steps 1 and 2. The linked map keeps the order of first insertion.
   LinkedHashMap<string, Entry> merged;
   foreach (const Entry& entry, to_normalize) {
     if (entry.access.none()) {
       continue;
     }

     const string key = stringify(entry.selector);
     if (!merged.contains(key)) {
       merged[key] = entry;
     }

     Entry& target = merged.at(key);
     target.access.write |= entry.access.write;
     target.access.read |= entry.access.read;
     target.access.mknod |= entry.access.mknod;
   }

   const vector<Entry> unique = merged.values();

   // Step 3. Keep only entries not encompassed by another entry.
   vector<Entry> result = {};
   foreach (const Entry& entry, unique) {
     bool is_encompassed = false;
     foreach (const Entry& other, unique) {
       if (!cgroups::devices::operator==(entry.selector, other.selector)
           && other.encompasses(entry)) {
         is_encompassed = true;
         break;
       }
     }

     if (!is_encompassed) {
       result.push_back(entry);
     }
   }

   CHECK(normalized(result));
   return result;
 }

 } // namespace devices {

 } // namespace cgroups2 {