blob: 41bf811792c15cb1919bcd09b09af9131ea5ced0 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string.h>
#include <linux/capability.h>
#include <sys/prctl.h>
#include <set>
#include <string>
#include <stout/numify.hpp>
#include <stout/os.hpp>
#include <stout/stringify.hpp>
#include <stout/unreachable.hpp>
#include "linux/capabilities.hpp"
// Vendor the prctl constants for ambient capabilities so that we can build
// on systems with old glibc but still work on modern kernels.
//
// Every definition is guarded with `#if !defined(...)`, so a value that the
// system headers already provide is never overridden.
//
// `PR_CAP_AMBIENT` is the prctl option; the three constants after it are
// passed as its second argument by the code in this file.
#if !defined(PR_CAP_AMBIENT)
#define PR_CAP_AMBIENT 47
#endif
// Sub-operation used in this file to test whether one capability is in the
// ambient set.
#if !defined(PR_CAP_AMBIENT_IS_SET)
#define PR_CAP_AMBIENT_IS_SET 1
#endif
// Sub-operation used in this file to add one capability to the ambient set.
#if !defined(PR_CAP_AMBIENT_RAISE)
#define PR_CAP_AMBIENT_RAISE 2
#endif
// Sub-operation used in this file to empty the ambient set.
#if !defined(PR_CAP_AMBIENT_CLEAR_ALL)
#define PR_CAP_AMBIENT_CLEAR_ALL 4
#endif
using std::hex;
using std::ostream;
using std::string;
// We declare two functions provided in the libcap headers here to
// prevent introduction of that build-time dependency.
//
// Both are called with a pointer to a `__user_cap_header_struct` and a
// pointer to the first element of a `__user_cap_data_struct` array. This
// file also calls `capget` with a null `data` pointer when it only wants
// the kernel to report the capability version through the header.
extern "C" {
extern int capset(cap_user_header_t header, cap_user_data_t data);
extern int capget(cap_user_header_t header, const cap_user_data_t data);
}
namespace mesos {
namespace internal {
namespace capabilities {
// Path of the procfs file that is read and parsed as the highest capability
// number supported by the running kernel.
constexpr char PROC_CAP_LAST_CAP[] = "/proc/sys/kernel/cap_last_cap";
// Difference between a `CapabilityInfo::Capability` protobuf enum value and
// the matching `Capability` value; subtracted or added by the `convert`
// helpers. NOTE(review): the identifier is misspelled ("CAPABIILITY"); it is
// left unchanged here because other definitions in this file refer to it.
constexpr int CAPABIILITY_PROTOBUF_OFFSET = 1000;
// System call payload (for linux capability version 3).
//
// Bundles the header and the data array handed to `capget`/`capset`. Each
// capability set (effective, permitted, inheritable) is stored as 32-bit
// words; the accessors below present it as one 64-bit mask, with `set[0]`
// holding the low 32 bits and `set[1]` holding the high 32 bits.
struct SyscallPayload
{
  __user_cap_header_struct head;
  __user_cap_data_struct set[_LINUX_CAPABILITY_U32S_3];

  // Value-initialize both members so that every field starts out as zero.
  SyscallPayload() : head(), set() {}

  // Returns the effective set as a 64-bit mask.
  uint64_t effective() const
  {
    return join(set[0].effective, set[1].effective);
  }

  // Returns the permitted set as a 64-bit mask.
  uint64_t permitted() const
  {
    return join(set[0].permitted, set[1].permitted);
  }

  // Returns the inheritable set as a 64-bit mask.
  uint64_t inheritable() const
  {
    return join(set[0].inheritable, set[1].inheritable);
  }

  // Stores a 64-bit mask as the effective set.
  void setEffective(uint64_t effective)
  {
    set[0].effective = static_cast<uint32_t>(effective);
    set[1].effective = static_cast<uint32_t>(effective >> 32);
  }

  // Stores a 64-bit mask as the permitted set.
  void setPermitted(uint64_t permitted)
  {
    set[0].permitted = static_cast<uint32_t>(permitted);
    set[1].permitted = static_cast<uint32_t>(permitted >> 32);
  }

  // Stores a 64-bit mask as the inheritable set.
  void setInheritable(uint64_t inheritable)
  {
    set[0].inheritable = static_cast<uint32_t>(inheritable);
    set[1].inheritable = static_cast<uint32_t>(inheritable >> 32);
  }

  // Combines a low and a high 32-bit word into one 64-bit mask. The casts
  // happen before the shift so the high word is not truncated.
  static uint64_t join(uint32_t low, uint32_t high)
  {
    return static_cast<uint64_t>(low) | (static_cast<uint64_t>(high) << 32);
  }
};
// Helper function to convert capability set to bitset.
static uint64_t toCapabilityBitset(const std::set<Capability>& capabilities)
{
uint64_t result = 0;
for (int i = 0; i < MAX_CAPABILITY; i++) {
if (capabilities.count(static_cast<Capability>(i)) > 0) {
result |= (1ULL << i);
}
}
return result;
}
// Helper function to convert capability bitset to std::set.
static std::set<Capability> toCapabilitySet(uint64_t bitset)
{
std::set<Capability> result;
for (int i = 0; i < MAX_CAPABILITY; i++) {
if ((bitset & (1ULL << i)) != 0) {
result.insert(Capability(i));
}
}
return result;
}
// Returns a reference to the stored capability set selected by `type`.
// A `type` that matches none of the known values is a programming error.
const std::set<Capability>& ProcessCapabilities::get(const Type& type) const
{
  const std::set<Capability>* selected = nullptr;

  switch (type) {
    case EFFECTIVE:
      selected = &effective;
      break;
    case PERMITTED:
      selected = &permitted;
      break;
    case INHERITABLE:
      selected = &inheritable;
      break;
    case BOUNDING:
      selected = &bounding;
      break;
    case AMBIENT:
      selected = &ambient;
      break;
  }

  if (selected == nullptr) {
    UNREACHABLE();
  }

  return *selected;
}
void ProcessCapabilities::set(
const Type& type,
const std::set<Capability>& capabilities)
{
switch (type) {
case EFFECTIVE: effective = capabilities; return;
case PERMITTED: permitted = capabilities; return;
case INHERITABLE: inheritable = capabilities; return;
case BOUNDING: bounding = capabilities; return;
case AMBIENT: ambient = capabilities; return;
}
UNREACHABLE();
}
void ProcessCapabilities::add(
const Type& type,
const Capability& capability)
{
switch (type) {
case EFFECTIVE: effective.insert(capability); return;
case PERMITTED: permitted.insert(capability); return;
case INHERITABLE: inheritable.insert(capability); return;
case BOUNDING: bounding.insert(capability); return;
case AMBIENT: ambient.insert(capability); return;
}
UNREACHABLE();
}
void ProcessCapabilities::drop(
const Type& type,
const Capability& capability)
{
switch (type) {
case EFFECTIVE: effective.erase(capability); return;
case PERMITTED: permitted.erase(capability); return;
case INHERITABLE: inheritable.erase(capability); return;
case BOUNDING: bounding.erase(capability); return;
case AMBIENT: ambient.erase(capability); return;
}
UNREACHABLE();
}
// Records the two facts the rest of the class relies on: `_lastCap`, the
// highest capability number to iterate up to (inclusive), and
// `_ambientSupported`, whether ambient capability operations are attempted.
// `create()` computes both values and invokes this constructor.
Capabilities::Capabilities(int _lastCap, bool _ambientSupported)
: ambientCapabilitiesSupported(_ambientSupported),
lastCap(_lastCap) {}
// Builds a `Capabilities` instance after checking that this implementation
// can work on the running system.
//
// Returns an error if:
//   - the capability version cannot be queried,
//   - the reported version is not `_LINUX_CAPABILITY_VERSION_3`,
//   - `PROC_CAP_LAST_CAP` cannot be read or parsed as an integer, or
//   - the parsed value is not below `MAX_CAPABILITY`.
//
// Otherwise the result carries the parsed last capability number and a
// flag telling whether the ambient capability prctl is available.
Try<Capabilities> Capabilities::create()
{
  // Ask the kernel which capability version it speaks. Passing a null
  // data pointer means only the header is exchanged.
  SyscallPayload probe;
  const bool probeFailed = capget(&probe.head, nullptr) != 0;

  // If capget fails with EINVAL it still populates the version field,
  // so only other errors are fatal here.
  if (probeFailed && errno != EINVAL) {
    return ErrnoError("Failed to get linux capability version");
  }

  if (probe.head.version != _LINUX_CAPABILITY_VERSION_3) {
    return Error(
        "Unsupported linux capabilities version: " +
        stringify(probe.head.version));
  }

  // Read and check the maximum capability value.
  const Try<string> contents = os::read(PROC_CAP_LAST_CAP);
  if (contents.isError()) {
    return Error(
        "Failed to read '" + string(PROC_CAP_LAST_CAP) + "': " +
        contents.error());
  }

  // The file content ends with a newline, which is stripped before parsing.
  const Try<int> parsed =
    numify<int>(strings::trim(contents.get(), strings::SUFFIX, "\n"));

  if (parsed.isError()) {
    return Error(
        "Failed to parse system last capability value '" +
        contents.get() + "': " + parsed.error());
  }

  const int lastCapability = parsed.get();

  if (lastCapability >= MAX_CAPABILITY) {
    return Error(
        "System last capability value '" + stringify(lastCapability) +
        "' is greater than maximum supported number of capabilities '" +
        stringify(static_cast<int>(MAX_CAPABILITY)) + "'");
  }

  // Test whether the kernel supports ambient capabilities by testing
  // for the presence of an arbitrary capability in the ambient set. This
  // can only fail if the prctl option is not supported.
  const int ambientProbe =
    prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CHOWN, 0, 0);

  return Capabilities(lastCapability, ambientProbe != -1);
}
// Reads the capability sets of the calling process (pid 0 in the request
// header).
//
// The effective, permitted and inheritable sets come from a single
// `capget` call. The bounding set, and the ambient set when ambient
// capabilities are supported, are probed one capability at a time with
// `prctl` for every number from 0 up to and including `lastCap`.
//
// Returns an error only if `capget` fails.
Try<ProcessCapabilities> Capabilities::get() const
{
  SyscallPayload payload;
  payload.head.pid = 0;
  payload.head.version = _LINUX_CAPABILITY_VERSION_3;

  if (capget(&payload.head, &payload.set[0]) != 0) {
    return ErrnoError("Failed to get capabilities");
  }

  ProcessCapabilities result;
  result.set(EFFECTIVE, toCapabilitySet(payload.effective()));
  result.set(PERMITTED, toCapabilitySet(payload.permitted()));
  result.set(INHERITABLE, toCapabilitySet(payload.inheritable()));

  // TODO(bbannier): Parse bounding set from the `CapBnd` entry in
  // `/proc/self/status`.
  std::set<Capability> boundingSet;
  for (int number = 0; number <= lastCap; ++number) {
    // Only a return value of exactly 1 counts as "present".
    if (prctl(PR_CAPBSET_READ, number) == 1) {
      boundingSet.insert(static_cast<Capability>(number));
    }
  }
  result.set(BOUNDING, boundingSet);

  if (ambientCapabilitiesSupported) {
    // TODO(jpeach): Parse the ambient set from the `CapAmb` entry in
    // `/proc/self/status`.
    std::set<Capability> ambientSet;
    for (int number = 0; number <= lastCap; ++number) {
      if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, number, 0, 0) == 1) {
        ambientSet.insert(static_cast<Capability>(number));
      }
    }
    result.set(AMBIENT, ambientSet);
  }

  return result;
}
// Applies `capabilities` to the calling process in three separate steps:
//   1. Drop from the `bounding` set every capability that is not requested.
//   2. Set the `effective`, `permitted` and `inheritable` capabilities.
//   3. Clear and then raise the `ambient` capabilities.
//
// Returns an error as soon as one step fails; steps that already ran are
// not undone.
//
// NOTE: Step 3 runs only when ambient capabilities are supported. When they
// are not, requested ambient capabilities are not applied and no error is
// reported for them.
//
// TODO(jojy): Is there a way to make this atomic? Ideally, we would
// like to rollback any changes if any of the operation fails.
Try<Nothing> Capabilities::set(const ProcessCapabilities& capabilities)
{
  const std::set<Capability>& wantedAmbient = capabilities.get(AMBIENT);

  // If we are setting ambient capabilities, verify that they are consistent
  // so we don't fail after we have already changed our capabilities. The
  // intersection has the same size as the ambient set exactly when every
  // ambient capability is also in the other set.
  if (!wantedAmbient.empty()) {
    const std::set<Capability>& wantedPermitted = capabilities.get(PERMITTED);
    const std::set<Capability>& wantedInheritable =
      capabilities.get(INHERITABLE);

    if ((wantedAmbient & wantedPermitted).size() != wantedAmbient.size()) {
      return Error("Ambient capabilities are not in the permitted set");
    }

    if ((wantedAmbient & wantedInheritable).size() != wantedAmbient.size()) {
      return Error("Ambient capabilities are not in the inheritable set");
    }
  }

  // NOTE: We can only drop capabilities in the bounding set.
  for (int number = 0; number <= lastCap; ++number) {
    const bool keep =
      capabilities.get(BOUNDING).count(Capability(number)) > 0;

    if (!keep) {
      VLOG(1) << "Dropping capability " << Capability(number);

      if (prctl(PR_CAPBSET_DROP, number, 1) < 0) {
        return ErrnoError(
            "Failed to drop capability: "
            "PR_CAPBSET_DROP failed for the process");
      }
    }
  }

  SyscallPayload request;
  request.head.pid = 0;
  request.head.version = _LINUX_CAPABILITY_VERSION_3;

  request.setEffective(toCapabilityBitset(capabilities.get(EFFECTIVE)));
  request.setPermitted(toCapabilityBitset(capabilities.get(PERMITTED)));
  request.setInheritable(toCapabilityBitset(capabilities.get(INHERITABLE)));

  if (capset(&request.head, &request.set[0]) != 0) {
    return ErrnoError("Failed to set capabilities");
  }

  if (!ambientCapabilitiesSupported) {
    return Nothing();
  }

  // Start from an empty ambient set, then raise each requested capability.
  if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0, 0, 0) < 0) {
    return ErrnoError("Failed to clear ambient capabilities");
  }

  for (const Capability cap : wantedAmbient) {
    if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, cap, 0, 0) < 0) {
      return ErrnoError("Failed to raise capability " + stringify(cap) +
                        " to the ambient set");
    }
  }

  return Nothing();
}
// Turns on the `PR_SET_KEEPCAPS` flag for the calling process via `prctl`.
// Returns an error carrying `errno` if the call fails.
Try<Nothing> Capabilities::setKeepCaps()
{
  const int status = prctl(PR_SET_KEEPCAPS, 1);

  if (status < 0) {
    return ErrnoError("Failed to set PR_SET_KEEPCAPS for the process");
  }

  return Nothing();
}
std::set<Capability> Capabilities::getAllSupportedCapabilities()
{
std::set<Capability> result;
for (int i = 0; i <= lastCap; i++) {
result.insert(Capability(i));
}
return result;
}
// Converts a protobuf `CapabilityInfo::Capability` value to the matching
// `Capability` by subtracting `CAPABIILITY_PROTOBUF_OFFSET`.
//
// The two CHECKs assert that the shifted value lies in
// [0, MAX_CAPABILITY) before it is cast.
Capability convert(const CapabilityInfo::Capability& capability)
{
int value = capability - CAPABIILITY_PROTOBUF_OFFSET;
CHECK_LE(0, value);
CHECK_GT(MAX_CAPABILITY, value);
return static_cast<Capability>(value);
}
std::set<Capability> convert(const CapabilityInfo& capabilityInfo)
{
std::set<Capability> result;
foreach (int value, capabilityInfo.capabilities()) {
result.insert(convert(static_cast<CapabilityInfo::Capability>(value)));
}
return result;
}
// Builds a `CapabilityInfo` message from a set of capabilities. Each member
// is shifted up by `CAPABIILITY_PROTOBUF_OFFSET` to obtain the protobuf enum
// value and appended in the set's iteration order.
CapabilityInfo convert(const std::set<Capability>& capabilities)
{
  CapabilityInfo message;

  for (const Capability& capability : capabilities) {
    const int shifted = capability + CAPABIILITY_PROTOBUF_OFFSET;

    message.add_capabilities(
        static_cast<CapabilityInfo::Capability>(shifted));
  }

  return message;
}
// Writes the symbolic name of `capability` (for example "CHOWN") to
// `stream`. `MAX_CAPABILITY` and any value without a case below have no
// name; reaching them is a programming error.
ostream& operator<<(ostream& stream, const Capability& capability)
{
  const char* name = nullptr;

  switch (capability) {
    case CHOWN:              name = "CHOWN"; break;
    case DAC_OVERRIDE:       name = "DAC_OVERRIDE"; break;
    case DAC_READ_SEARCH:    name = "DAC_READ_SEARCH"; break;
    case FOWNER:             name = "FOWNER"; break;
    case FSETID:             name = "FSETID"; break;
    case KILL:               name = "KILL"; break;
    case SETGID:             name = "SETGID"; break;
    case SETUID:             name = "SETUID"; break;
    case SETPCAP:            name = "SETPCAP"; break;
    case LINUX_IMMUTABLE:    name = "LINUX_IMMUTABLE"; break;
    case NET_BIND_SERVICE:   name = "NET_BIND_SERVICE"; break;
    case NET_BROADCAST:      name = "NET_BROADCAST"; break;
    case NET_ADMIN:          name = "NET_ADMIN"; break;
    case NET_RAW:            name = "NET_RAW"; break;
    case IPC_LOCK:           name = "IPC_LOCK"; break;
    case IPC_OWNER:          name = "IPC_OWNER"; break;
    case SYS_MODULE:         name = "SYS_MODULE"; break;
    case SYS_RAWIO:          name = "SYS_RAWIO"; break;
    case SYS_CHROOT:         name = "SYS_CHROOT"; break;
    case SYS_PTRACE:         name = "SYS_PTRACE"; break;
    case SYS_PACCT:          name = "SYS_PACCT"; break;
    case SYS_ADMIN:          name = "SYS_ADMIN"; break;
    case SYS_BOOT:           name = "SYS_BOOT"; break;
    case SYS_NICE:           name = "SYS_NICE"; break;
    case SYS_RESOURCE:       name = "SYS_RESOURCE"; break;
    case SYS_TIME:           name = "SYS_TIME"; break;
    case SYS_TTY_CONFIG:     name = "SYS_TTY_CONFIG"; break;
    case MKNOD:              name = "MKNOD"; break;
    case LEASE:              name = "LEASE"; break;
    case AUDIT_WRITE:        name = "AUDIT_WRITE"; break;
    case AUDIT_CONTROL:      name = "AUDIT_CONTROL"; break;
    case SETFCAP:            name = "SETFCAP"; break;
    case MAC_OVERRIDE:       name = "MAC_OVERRIDE"; break;
    case MAC_ADMIN:          name = "MAC_ADMIN"; break;
    case SYSLOG:             name = "SYSLOG"; break;
    case WAKE_ALARM:         name = "WAKE_ALARM"; break;
    case BLOCK_SUSPEND:      name = "BLOCK_SUSPEND"; break;
    case AUDIT_READ:         name = "AUDIT_READ"; break;
    case PERFMON:            name = "PERFMON"; break;
    case BPF:                name = "BPF"; break;
    case CHECKPOINT_RESTORE: name = "CHECKPOINT_RESTORE"; break;
    // Deliberately left without a name; handled by the check below.
    case MAX_CAPABILITY:     break;
  }

  if (name == nullptr) {
    UNREACHABLE();
  }

  return stream << name;
}
// Writes the short label of a capability set type ("eff", "perm", "inh",
// "bnd" or "amb") to `stream`. A `type` that matches none of the known
// values is a programming error.
ostream& operator<<(ostream& stream, const Type& type)
{
  const char* label = nullptr;

  switch (type) {
    case EFFECTIVE:   label = "eff"; break;
    case PERMITTED:   label = "perm"; break;
    case INHERITABLE: label = "inh"; break;
    case BOUNDING:    label = "bnd"; break;
    case AMBIENT:     label = "amb"; break;
  }

  if (label == nullptr) {
    UNREACHABLE();
  }

  return stream << label;
}
// Writes all five capability sets of `processCapabilities` to `stream` as
// "{eff: ..., perm: ..., inh: ..., bnd: ..., amb: ...}", where each "..."
// is the `stringify` rendering of the corresponding set.
ostream& operator<<(
    ostream& stream,
    const ProcessCapabilities& processCapabilities)
{
  stream << "{";

  stream << EFFECTIVE << ": "
         << stringify(processCapabilities.effective) << ", ";

  stream << PERMITTED << ": "
         << stringify(processCapabilities.permitted) << ", ";

  stream << INHERITABLE << ": "
         << stringify(processCapabilities.inheritable) << ", ";

  stream << BOUNDING << ": "
         << stringify(processCapabilities.bounding) << ", ";

  stream << AMBIENT << ": "
         << stringify(processCapabilities.ambient);

  stream << "}";

  return stream;
}
} // namespace capabilities {
} // namespace internal {
} // namespace mesos {