[cgroups2] Fix allow deny semantics for device access.
Currently, the EBPF program we generate has the behavior where the deny
list has no effect, as we will allow device access iff the device
matched with an allow entry.
Instead we want to grant access to a device iff it is in a cgroup's
allow list *and not in its deny list.*
This means that we need to change our existing logic, which exits on the
first match. It is not our desired behavior because the current EBPF
program construction logic puts the allow-device checks before the
deny-device checks, meaning that if a device is on both allow and deny
lists for a cgroup, it will be granted access.
This change revamps the EBPF program construction to now check both the
allow and deny list of a cgroup before determining whether access may be
granted. Specifically, if a device is matched with an entry inside the
allow list, we will also be checking if it matches with any entry on
the deny list, and deny the device's access if that is the case.
We also avoid generating specific parts of the EBPF program code to
avoid creating unreachable code, explanations with a diagram are
attached above the cgroups2::devices::DeviceProgram::build function.
Review: https://reviews.apache.org/r/75026/
diff --git a/src/linux/cgroups.cpp b/src/linux/cgroups.cpp
index c1272fb..c428949 100644
--- a/src/linux/cgroups.cpp
+++ b/src/linux/cgroups.cpp
@@ -2906,6 +2906,14 @@
}
+bool Entry::is_catch_all() const
+{
+ return !selector.major.isSome() && !selector.minor.isSome()
+ && selector.type == Entry::Selector::Type::ALL
+ && access.mknod && access.read && access.write;
+}
+
+
Try<vector<Entry>> list(
const string& hierarchy,
const string& cgroup)
diff --git a/src/linux/cgroups.hpp b/src/linux/cgroups.hpp
index 9be53e3..eaa4cdf 100644
--- a/src/linux/cgroups.hpp
+++ b/src/linux/cgroups.hpp
@@ -933,6 +933,7 @@
struct Entry
{
static Try<Entry> parse(const std::string& s);
+ bool is_catch_all() const;
struct Selector
{
diff --git a/src/linux/cgroups2.cpp b/src/linux/cgroups2.cpp
index d1fc263..59c212e 100644
--- a/src/linux/cgroups2.cpp
+++ b/src/linux/cgroups2.cpp
@@ -23,6 +23,7 @@
#include <set>
#include <string>
#include <vector>
+#include <utility>
#include <process/after.hpp>
#include <process/loop.hpp>
@@ -1086,7 +1087,85 @@
class DeviceProgram
{
public:
- DeviceProgram() : program{ebpf::Program(BPF_PROG_TYPE_CGROUP_DEVICE)}
+ // We will generate one allow block for each entry in the allow list
+ // and one deny block for each entry in the deny list.
+ //
+ // There are special cases for catch-all values or empty allow lists
+ // Which are done to avoid generating unreachable code which are prohibited
+ // by the verifier:
+ // 1. If we have a catch-all in the allow entries, we will not generate any
+ // code for allow section, since there is no need to check if any entries
+ // match anything in allow.
+ // 2. If we have a catch-all in the deny entries, we will immediately return
+ // with the deny value still in R0 to indicate that access is denied.
+ // 3. If we have an empty allow list, we will immediately return with the deny
+ // value still in R0 to indicate that access is denied.
+ //
+ // ---------------------------------------------------------------------------
+ // Normal code flow
+ // +-------------------------------------------------------------+
+ // |Initialize R0 to deny value |
+ // +-------------------------------------------------------------+-----------+
+ // |Allow Block | |
+ // |Check each register, jump to next block if there is no match | |
+ // | | |
+ // |If each register has matched, jump over the exit instruction | |
+ // |at the end of allow blocks and go to start of deny blocks | |
+ // +-------------------------------------------------------------+ Allow |
+ // | | Section |
+ // |Other allow blocks... | |
+ // | | |
+ // +-------------------------------------------------------------+ |
+ // |Exit instruction and deny access, a match in any | |
+ // |allow block will jump over this instruction | |
+ // +-------------------------------------------------------------+-----------+
+ // |Deny Block | |
+ // | | |
+ // |Check each register, jump to next block if there is no match | |
+ // | | |
+ // |If each register is matched, exit immediately as we have | |
+ // |the deny value stored in result register R0 | |
+ // +-------------------------------------------------------------+ Deny |
+ // | | Section |
+ // |Other deny blocks... | |
+ // | | |
+ // +-------------------------------------------------------------+ |
+ // |Exit instruction to allow access because to reach this | |
+ // |point, there must have been a match in allow, and no | |
+ // |matches in deny | |
+ // +-------------------------------------------------------------+-----------+
+ // ----------------------------------END--------------------------------------
+ //
+ // The code in special case 1 (allow catch-all):
+ // +-------------------------------------------------------------+
+ // |Initialize R0 to deny value |
+ // +-------------------------------------------------------------+-----------+
+ // |Deny Block | |
+ // | | |
+ // |Check each register, jump to next block if there is no match | |
+ // | | |
+ // |If each register is matched, exit immediately as we have | |
+ // |the deny value stored in result register R0 | |
+ // +-------------------------------------------------------------+ Deny |
+ // | | Section |
+ // |Other deny blocks... | |
+ // | | |
+ // +-------------------------------------------------------------+ |
+ // |Exit instruction to allow access because to reach this | |
+ // |point, there must have been a match in allow, and no | |
+ // |matches in deny | |
+ // +-------------------------------------------------------------+-----------+
+ // ----------------------------------END--------------------------------------
+ //
+ // The code in special cases 2 (deny catch-all) and 3 (empty allow):
+ // +-------------------------------------------------------------+
+ // |Initialize R0 to deny value |
+ // +-------------------------------------------------------------+
+ // |Exit instruction to deny access |
+ // +-------------------------------------------------------------+
+ static Try<ebpf::Program> build(
+ const vector<Entry>& allow,
+ const vector<Entry>& deny)
{
// The BPF_PROG_TYPE_CGROUP_DEVICE program takes in
// `struct bpf_cgroup_dev_ctx*` as input. We extract the fields into
@@ -1094,6 +1173,7 @@
//
// The device type is encoded in the first 16 bits of `access_type` and
// the access type is encoded in the last 16 bits of `access_type`.
+ ebpf::Program program = ebpf::Program(BPF_PROG_TYPE_CGROUP_DEVICE);
program.append({
// r2: Type ('c', 'b', '?')
BPF_LDX_MEM(
@@ -1110,78 +1190,160 @@
BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1,
offsetof(bpf_cgroup_dev_ctx, minor)),
});
- }
- Try<Nothing> allow(const Entry entry) { return addDevice(entry, true); }
- Try<Nothing> deny(const Entry entry) { return addDevice(entry, false); }
+ // Initialize result register R0 to deny access so we can immediately
+ // exit if there is a match in a deny entry, or if there is no match
+ // in the allow entries.
+ program.append({BPF_MOV64_IMM(BPF_REG_0, DENY_ACCESS)});
- ebpf::Program build()
- {
- if (!hasCatchAll) {
- // Exit instructions.
- // If no entry granted access, then deny the access.
- program.append({
- BPF_MOV64_IMM (BPF_REG_0, DENY_ACCESS),
+ // Special case 2. We deny access and exit if there's a catch-all in deny.
+ foreach (const Entry& entry, deny) {
+ if (entry.is_catch_all()) {
+ program.append({BPF_EXIT_INSN()});
+ return program;
+ }
+ }
+
+ // Special case 3. We deny access and exit if we see nothing in allow.
+ if (allow.empty()) {
+ program.append({BPF_EXIT_INSN()});
+ return program;
+ }
+
+ auto allow_block_trailer = [](short jmp_size_to_deny_section) {
+ return vector<bpf_insn>({BPF_JMP_A(jmp_size_to_deny_section)});
+ };
+ auto allow_section_trailer = []() {
+ return vector<bpf_insn>({BPF_EXIT_INSN()});
+ };
+ auto deny_block_trailer = []() {
+ return vector<bpf_insn>({BPF_EXIT_INSN()});
+ };
+ auto deny_section_trailer = []() {
+ return vector<bpf_insn>({
+ BPF_MOV64_IMM(BPF_REG_0, ALLOW_ACCESS),
BPF_EXIT_INSN(),
});
+ };
+
+ bool allow_catch_all = [&allow]() {
+ foreach (const Entry& entry, allow) {
+ if (entry.is_catch_all()) {
+ return true;
+ }
+ }
+ return false;
+ }();
+
+ // We will only add the code for the allow section if there is no catch-all
+ // allow entry present. If there is a catch-all, we will skip everything
+ // in the allow section, including exit instruction at the end,
+ // since we just need to check if the device is explicitly denied.
+ if (!allow_catch_all) {
+ // We calculate the total jump distance to skip over trailer instructions
+ // at the end of the allow section, we initialize jump size to length of
+ // said instructions, then add the lengths of individual allow blocks.
+ short start_of_deny_jmp_size = allow_section_trailer().size();
+ vector<vector<bpf_insn>> allow_device_check_blocks = {};
+ short allow_block_trailer_size = allow_block_trailer(0).size();
+
+ foreach (const Entry& entry, allow) {
+ vector<bpf_insn> allow_block =
+ add_device_checks(entry, allow_block_trailer_size);
+ allow_device_check_blocks.push_back(allow_block);
+
+ start_of_deny_jmp_size += allow_block.size() + allow_block_trailer_size;
+ }
+
+ foreach (vector<bpf_insn>& allow_block, allow_device_check_blocks) {
+ start_of_deny_jmp_size -=
+ (allow_block.size() + allow_block_trailer_size);
+ program.append(std::move(allow_block));
+ program.append(allow_block_trailer(start_of_deny_jmp_size));
+ }
+
+ // If this instruction is executed, then there is no match in allow
+ // so we can deny access.
+ program.append(allow_section_trailer());
}
+
+ // Get the deny block device check code.
+ // We are either following the normal code flow or special case 1 (see
+ // diagram above) if we reached this section.
+ foreach (const Entry& entry, deny) {
+ program.append(add_device_checks(entry, deny_block_trailer().size()));
+ program.append(deny_block_trailer());
+ }
+
+ // To reach this block, we must have matched with an entry in allow
+ // to jump over the exit instruction at the end of allow blocks,
+ // or there is a catch-all in allow. We will also have to have not
+ // matched with any of the deny entries to avoid their exit instructions.
+ // Meaning that the device is on the allow list, and not on the deny list.
+ // Hence, we grant them access.
+ program.append(deny_section_trailer());
+
return program;
}
private:
- Try<Nothing> addDevice(const Entry entry, bool allow)
+ static vector<bpf_insn> add_device_checks(
+ const Entry& entry,
+ short trailer_length)
{
- if (hasCatchAll) {
- return Nothing();
- }
-
// We create a block of bytecode with the format:
// 1. Major Version Check
// 2. Minor Version Check
// 3. Type Check
// 4. Access Check
- // 5. Allow/Deny Access
- //
- // 6. NEXT BLOCK
+ // 5. Trailer (caller-generated)
+ // 5a. If block is an allow block, we jump to the start of deny blocks
+ // 5b. If block is a deny block, we exit immediately
//
// Either:
- // 1. The device access is matched by (1,2,3,4) and the Allow/Deny access
- // block (5) is executed.
+ // 1. The device access is matched by (1,2,3,4) and the Allow/Deny trailer
+ // code is executed.
// 2. One of (1,2,3,4) does not match the requested access and we skip
- // to the next block (6).
+ // the rest of the current block
- const Entry::Selector& selector = entry.selector;
- const Entry::Access& access = entry.access;
-
- bool check_major = selector.major.isSome();
- bool check_minor = selector.minor.isSome();
- bool check_type = selector.type != Entry::Selector::Type::ALL;
- bool check_access = !access.mknod || !access.read || !access.write;
-
- // Number of instructions to the [NEXT BLOCK]. This is used if a check
- // fails (meaning this entry does not apply) and we want to skip the
- // subsequent checks.
- short jmp_size = 1 + (check_major ? 1 : 0) + (check_minor ? 1 : 0) +
- (check_access ? 3 : 0) + (check_type ? 1 : 0);
-
- // Check major version (r4) against entry.
- if (check_major) {
- program.append({
- BPF_JMP_IMM(BPF_JNE, BPF_REG_4, (int)selector.major.get(), jmp_size),
- });
- --jmp_size;
+ if (entry.is_catch_all()) {
+ return {};
}
- // Check minor version (r5) against entry.
- if (check_minor) {
- program.append({
- BPF_JMP_IMM(BPF_JNE, BPF_REG_5, (int)selector.minor.get(), jmp_size),
- });
- --jmp_size;
- }
+ auto check_major_instructions = [](short jmp_size, int major) {
+ return vector<bpf_insn>({
+ BPF_JMP_IMM(
+ BPF_JNE, BPF_REG_4, major, jmp_size),
+ });
+ };
- // Check type (r2) against entry.
- if (check_type) {
+ auto check_minor_instructions = [](short jmp_size, int minor) {
+ return vector<bpf_insn>({
+ BPF_JMP_IMM(
+ BPF_JNE, BPF_REG_5, minor, jmp_size),
+ });
+ };
+
+ auto check_access_instructions =
+ [](short jmp_size, const Entry::Access& access)
+ {
+ int bpf_access = 0;
+ bpf_access |= access.read ? BPF_DEVCG_ACC_READ : 0;
+ bpf_access |= access.write ? BPF_DEVCG_ACC_WRITE : 0;
+ bpf_access |= access.mknod ? BPF_DEVCG_ACC_MKNOD : 0;
+ return vector<bpf_insn>({
+ BPF_MOV32_REG(BPF_REG_1, BPF_REG_3),
+ BPF_ALU32_IMM(BPF_AND, BPF_REG_1, bpf_access),
+ BPF_JMP_REG(
+ BPF_JNE,
+ BPF_REG_1,
+ BPF_REG_3,
+ static_cast<short>(jmp_size - 2)),
+ });
+ };
+
+ auto check_type_instructions =
+ [](short jmp_size, const Entry::Selector& selector) -> vector<bpf_insn> {
int bpf_type = [selector]() {
switch (selector.type) {
case Entry::Selector::Type::BLOCK: return BPF_DEVCG_DEV_BLOCK;
@@ -1190,52 +1352,78 @@
}
UNREACHABLE();
}();
+ return {
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_2, bpf_type, jmp_size),
+ };
+ };
+ const Entry::Selector& selector = entry.selector;
+ const Entry::Access& access = entry.access;
- program.append({
- BPF_JMP_IMM(BPF_JNE, BPF_REG_2, bpf_type, jmp_size),
- });
- --jmp_size;
+ bool check_major = selector.major.isSome();
+ bool check_minor = selector.minor.isSome();
+ bool check_type = selector.type != Entry::Selector::Type::ALL;
+ bool check_access = !access.mknod || !access.read || !access.write;
+
+ // The jump sizes here correspond to the size of the bpf instructions
+ // that each check adds to the program
+ // the total size of the block is the trailer length plus the total length
+ // of all checks.
+ short nxt_blk_jmp_size =
+ trailer_length +
+ (check_major ? check_major_instructions(0, 0).size() : 0) +
+ (check_minor ? check_minor_instructions(0, 0).size() : 0) +
+ (check_access ? check_access_instructions(0, access).size() : 0) +
+ (check_type ? check_type_instructions(0, selector).size() : 0);
+
+ // We subtract one because the program counter will be one ahead when it
+ // is executing the code in this code block, so we need to jump one less
+ // instruction to land at the beginning of the next entry-block
+ nxt_blk_jmp_size -= 1;
+
+ vector<bpf_insn> device_check_block = {};
+
+ // 1. Check major version (r4) against entry.
+ if (check_major) {
+ vector<bpf_insn> insert_instructions =
+ check_major_instructions(nxt_blk_jmp_size, (int)selector.major.get());
+ foreach (const bpf_insn& insn, insert_instructions) {
+ device_check_block.push_back(insn);
+ }
+ nxt_blk_jmp_size -= insert_instructions.size();
}
- // Check access (r3) against entry.
+ // 2. Check minor version (r5) against entry.
+ if (check_minor) {
+ vector<bpf_insn> insert_instructions =
+ check_minor_instructions(nxt_blk_jmp_size, (int)selector.minor.get());
+ foreach (const bpf_insn& insn, insert_instructions) {
+ device_check_block.push_back(insn);
+ }
+ nxt_blk_jmp_size -= insert_instructions.size();
+ }
+
+ // 3. Check type (r2) against entry.
+ if (check_type) {
+ vector<bpf_insn> insert_instructions =
+ check_type_instructions(nxt_blk_jmp_size, selector);
+ foreach (const bpf_insn& insn, insert_instructions) {
+ device_check_block.push_back(insn);
+ }
+ nxt_blk_jmp_size -= insert_instructions.size();
+ }
+
+ // 4. Check access (r3) against entry.
if (check_access) {
- int bpf_access = 0;
- bpf_access |= access.read ? BPF_DEVCG_ACC_READ : 0;
- bpf_access |= access.write ? BPF_DEVCG_ACC_WRITE : 0;
- bpf_access |= access.mknod ? BPF_DEVCG_ACC_MKNOD : 0;
-
- program.append({
- BPF_MOV32_REG(BPF_REG_1, BPF_REG_3),
- BPF_ALU32_IMM(BPF_AND, BPF_REG_1, bpf_access),
- BPF_JMP_REG(
- BPF_JNE, BPF_REG_1, BPF_REG_3, static_cast<short>(jmp_size - 2)),
- });
- jmp_size -= 3;
+ vector<bpf_insn> insert_instructions =
+ check_access_instructions(nxt_blk_jmp_size, access);
+ foreach (const bpf_insn& insn, insert_instructions) {
+ device_check_block.push_back(insn);
+ }
}
- if (!check_major && !check_minor && !check_type && !check_access) {
- // The exit instructions as well as any additional device entries would
- // generate unreachable blocks.
- hasCatchAll = true;
- }
-
- // Allow/Deny access block.
- program.append({
- BPF_MOV64_IMM(BPF_REG_0, allow ? ALLOW_ACCESS : DENY_ACCESS),
- BPF_EXIT_INSN(),
- });
-
- return Nothing();
+ return device_check_block;
}
- ebpf::Program program;
-
- // Whether the program has a device entry that allows or denies ALL accesses.
- // Such cases need to be specially handled because any instructions added
- // after it will be unreachable, and thus will cause the eBPF verifier to
- // reject the program.
- bool hasCatchAll = false;
-
static const int ALLOW_ACCESS = 1;
static const int DENY_ACCESS = 0;
};
@@ -1246,17 +1434,15 @@
const vector<Entry>& allow,
const vector<Entry>& deny)
{
- DeviceProgram program = DeviceProgram();
- foreach (const Entry entry, allow) {
- program.allow(entry);
- }
- foreach (const Entry entry, deny) {
- program.deny(entry);
+ Try<ebpf::Program> program = DeviceProgram::build(allow, deny);
+
+ if (program.isError()) {
+ return Error("Failed to generate device program: " + program.error());
}
Try<Nothing> attach = ebpf::cgroups2::attach(
cgroups2::path(cgroup),
- program.build());
+ *program);
if (attach.isError()) {
return Error("Failed to attach BPF_PROG_TYPE_CGROUP_DEVICE program: " +
diff --git a/src/linux/cgroups2.hpp b/src/linux/cgroups2.hpp
index 64254d0..accaebd 100644
--- a/src/linux/cgroups2.hpp
+++ b/src/linux/cgroups2.hpp
@@ -433,6 +433,8 @@
// Configure the device access permissions for the cgroup. These permissions
// are hierarchical. I.e. if a parent cgroup does not allow an access then
// 'this' cgroup will be denied access.
+// For access to be granted, the requested access must match an entry in the
+// allow list, and not match with any entry on the deny list.
Try<Nothing> configure(
const std::string& cgroup,
const std::vector<Entry>& allow,
diff --git a/src/tests/containerizer/cgroups2_tests.cpp b/src/tests/containerizer/cgroups2_tests.cpp
index cb1e229..751c0e3 100644
--- a/src/tests/containerizer/cgroups2_tests.cpp
+++ b/src/tests/containerizer/cgroups2_tests.cpp
@@ -663,6 +663,25 @@
// read is blocked
vector<OpenArgs>{{os::DEV_NULL, O_RDWR}, {os::DEV_NULL, O_RDONLY}}
},
+ // Do not allow device if it's on both allow and deny list
+ DeviceControllerTestParams{
+ vector<devices::Entry>{
+ *devices::Entry::parse("c 1:3 w"),
+ *devices::Entry::parse("b 1:3 w")},
+ vector<devices::Entry>{*devices::Entry::parse("c 1:3 w")},
+ vector<OpenArgs>{},
+ // write-only is blocked
+ vector<OpenArgs>{{os::DEV_NULL, O_WRONLY}}
+ },
+ // Mismatched entry in deny list is ignored and write access is granted
+ DeviceControllerTestParams{
+ vector<devices::Entry>{*devices::Entry::parse("c 1:3 w")},
+ vector<devices::Entry>{*devices::Entry::parse("b 1:3 w")},
+ // write-only allowed
+ vector<OpenArgs>{{os::DEV_NULL, O_WRONLY}},
+ // read is blocked
+ vector<OpenArgs>{{os::DEV_NULL, O_RDWR}, {os::DEV_NULL, O_RDONLY}}
+ },
// Access to /dev/null is denied because the allowed access is to
// a different device type with the same major:minor numbers.
DeviceControllerTestParams{
@@ -681,6 +700,61 @@
{os::DEV_NULL, O_RDWR},
{os::DEV_NULL, O_RDONLY}},
vector<OpenArgs>{},
+ },
+ // Allow access with catch-all and specified device
+ DeviceControllerTestParams{
+ vector<devices::Entry>{
+ *devices::Entry::parse("a *:* rwm"),
+ *devices::Entry::parse("c 1:3 w")},
+ vector<devices::Entry>{},
+ vector<OpenArgs>{
+ {os::DEV_NULL, O_WRONLY},
+ {os::DEV_NULL, O_RDWR},
+ {os::DEV_NULL, O_RDONLY}},
+ vector<OpenArgs>{},
+ },
+ // Allow access to all devices except one
+ DeviceControllerTestParams{
+ vector<devices::Entry>{*devices::Entry::parse("a *:* rwm")},
+ vector<devices::Entry>{*devices::Entry::parse("c 1:3 w")},
+ // read is allowed by catch-all
+ vector<OpenArgs>{{os::DEV_NULL, O_RDONLY}},
+ // write-only is blocked
+ vector<OpenArgs>{{os::DEV_NULL, O_WRONLY}}
+ },
+ // Deny access to all devices
+ DeviceControllerTestParams{
+ vector<devices::Entry>{},
+ vector<devices::Entry>{*devices::Entry::parse("a *:* rwm")},
+ vector<OpenArgs>{},
+ vector<OpenArgs>{
+ {os::DEV_NULL, O_WRONLY},
+ {os::DEV_NULL, O_RDWR},
+ {os::DEV_NULL, O_RDONLY}},
+ },
+ // Deny access using catch-all and specified device
+ DeviceControllerTestParams{
+ vector<devices::Entry>{},
+ vector<devices::Entry>{
+ *devices::Entry::parse("c 1:3 w"),
+ *devices::Entry::parse("a *:* rwm")},
+ vector<OpenArgs>{},
+ vector<OpenArgs>{
+ {os::DEV_NULL, O_WRONLY},
+ {os::DEV_NULL, O_RDWR},
+ {os::DEV_NULL, O_RDONLY}},
+ },
+ // Catch-all in both allow and deny list
+ DeviceControllerTestParams{
+ vector<devices::Entry>{
+ *devices::Entry::parse("c 1:3 w"),
+ *devices::Entry::parse("a *:* rwm")},
+ vector<devices::Entry>{*devices::Entry::parse("a *:* rwm")},
+ vector<OpenArgs>{},
+ vector<OpenArgs>{
+ {os::DEV_NULL, O_WRONLY},
+ {os::DEV_NULL, O_RDWR},
+ {os::DEV_NULL, O_RDONLY}},
}
));