blob: 6876967f4182e0cd0bc11fe124382629107eebf7 [file] [log] [blame]
/**
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef __LINUX_NS_HPP__
#define __LINUX_NS_HPP__
// This file contains Linux-only OS utilities.
#ifndef __linux__
#error "linux/ns.hpp is only available on Linux systems."
#endif
#include <sched.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <set>
#include <string>
#include <vector>
#include <stout/error.hpp>
#include <stout/hashmap.hpp>
#include <stout/nothing.hpp>
#include <stout/os.hpp>
#include <stout/path.hpp>
#include <stout/proc.hpp>
#include <stout/stringify.hpp>
#include <stout/strings.hpp>
#include <stout/try.hpp>
#include <stout/os/exists.hpp>
#include <stout/os/ls.hpp>
#include <process/collect.hpp>
#include <process/future.hpp>
#include <process/reap.hpp>
#ifndef CLONE_NEWNS
#define CLONE_NEWNS 0x00020000
#endif
#ifndef CLONE_NEWUTS
#define CLONE_NEWUTS 0x04000000
#endif
#ifndef CLONE_NEWIPC
#define CLONE_NEWIPC 0x08000000
#endif
#ifndef CLONE_NEWPID
#define CLONE_NEWPID 0x20000000
#endif
#ifndef CLONE_NEWNET
#define CLONE_NEWNET 0x40000000
#endif
#ifndef CLONE_NEWUSER
#define CLONE_NEWUSER 0x10000000
#endif
namespace ns {
// Returns all the supported namespaces by the kernel.
inline std::set<std::string> namespaces()
{
std::set<std::string> result;
Try<std::list<std::string> > entries = os::ls("/proc/self/ns");
if (entries.isSome()) {
foreach (const std::string& entry, entries.get()) {
result.insert(entry);
}
}
return result;
}
// Returns the nstype (e.g., CLONE_NEWNET, CLONE_NEWNS, etc.) for the
// given namespace which will be used when calling ::setns.
inline Try<int> nstype(const std::string& ns)
{
hashmap<std::string, int> nstypes;
nstypes["mnt"] = CLONE_NEWNS;
nstypes["uts"] = CLONE_NEWUTS;
nstypes["ipc"] = CLONE_NEWIPC;
nstypes["net"] = CLONE_NEWNET;
nstypes["user"] = CLONE_NEWUSER;
nstypes["pid"] = CLONE_NEWPID;
if (!nstypes.contains(ns)) {
return Error("Unknown namespace '" + ns + "'");
}
return nstypes[ns];
}
// Re-associate the calling process with the specified namespace. The
// path refers to one of the corresponding namespace entries in the
// /proc/[pid]/ns/ directory (or bind mounted elsewhere). We do not
// allow a process with multiple threads to call this function because
// it will lead to some weird situations where different threads of a
// process are in different namespaces.
inline Try<Nothing> setns(const std::string& path, const std::string& ns)
{
// Return error if there're multiple threads in the calling process.
Try<std::set<pid_t> > threads = proc::threads(::getpid());
if (threads.isError()) {
return Error(
"Failed to get the threads of the current process: " +
threads.error());
} else if (threads.get().size() > 1) {
return Error("Multiple threads exist in the current process");
}
if (ns::namespaces().count(ns) == 0) {
return Error("Namespace '" + ns + "' is not supported");
}
// Currently, we don't support pid namespace as its semantics is
// different from other namespaces (instead of re-associating the
// calling thread, it re-associates the *children* of the calling
// thread with the specified namespace).
if (ns == "pid") {
return Error("Pid namespace is not supported");
}
Try<int> fd = os::open(path, O_RDONLY | O_CLOEXEC);
if (fd.isError()) {
return Error("Failed to open '" + path + "': " + fd.error());
}
#ifndef O_CLOEXEC
Try<Nothing> cloexec = os::cloexec(fd.get());
if (cloexec.isError()) {
os::close(fd.get());
return Error("Failed to cloexec: " + cloexec.error());
}
#endif
Try<int> nstype = ns::nstype(ns);
if (nstype.isError()) {
return Error(nstype.error());
}
#ifdef SYS_setns
int ret = ::syscall(SYS_setns, fd.get(), nstype.get());
#elif __x86_64__
// A workaround for those hosts that have an old glibc (older than
// 2.14) but have a new kernel. The magic number '308' here is the
// syscall number for 'setns' on x86_64 architecture.
int ret = ::syscall(308, fd.get(), nstype.get());
#else
#error "setns is not available"
#endif
if (ret == -1) {
// Save the errno as it might be overwritten by 'os::close' below.
ErrnoError error;
os::close(fd.get());
return error;
}
os::close(fd.get());
return Nothing();
}
// Re-associate the calling process with the specified namespace. The
// pid specifies the process whose namespace we will associate.
inline Try<Nothing> setns(pid_t pid, const std::string& ns)
{
if (!os::exists(pid)) {
return Error("Pid " + stringify(pid) + " does not exist");
}
std::string path = path::join("/proc", stringify(pid), "ns", ns);
if (!os::exists(path)) {
return Error("Namespace '" + ns + "' is not supported");
}
return ns::setns(path, ns);
}
// Get the inode number of the specified namespace for the specified
// pid. The inode number identifies the namespace and can be used for
// comparisons, i.e., two processes with the same inode for a given
// namespace type are in the same namespace.
inline Try<ino_t> getns(pid_t pid, const std::string& ns)
{
if (!os::exists(pid)) {
return Error("Pid " + stringify(pid) + " does not exist");
}
if (ns::namespaces().count(ns) < 1) {
return Error("Namespace '" + ns + "' is not supported");
}
std::string path = path::join("/proc", stringify(pid), "ns", ns);
struct stat s;
if (::stat(path.c_str(), &s) < 0) {
return ErrnoError("Failed to stat " + ns + " namespace handle"
" for pid " + stringify(pid));
}
return s.st_ino;
}
namespace pid {
namespace internal {
inline Nothing _nothing() { return Nothing(); }
} // namespace internal {
inline process::Future<Nothing> destroy(ino_t inode)
{
// Check we're not trying to kill the root namespace.
Try<ino_t> ns = ns::getns(1, "pid");
if (ns.isError()) {
return process::Failure(ns.error());
}
if (ns.get() == inode) {
return process::Failure("Cannot destroy root pid namespace");
}
// Or ourselves.
ns = ns::getns(::getpid(), "pid");
if (ns.isError()) {
return process::Failure(ns.error());
}
if (ns.get() == inode) {
return process::Failure("Cannot destroy own pid namespace");
}
// Signal all pids in the namespace, including the init pid if it's
// still running. Once the init pid has been signalled the kernel
// will prevent any new children forking in the namespace and will
// also signal all other pids in the namespace.
Try<std::set<pid_t>> pids = os::pids();
if (pids.isError()) {
return process::Failure("Failed to list of processes");
}
foreach (pid_t pid, pids.get()) {
// Ignore any errors, probably because the process no longer
// exists, and ignorable otherwise.
Try<ino_t> ns = ns::getns(pid, "pid");
if (ns.isSome() && ns.get() == inode) {
kill(pid, SIGKILL);
}
}
// Get a new snapshot and do a second pass of the pids to capture
// any pids that are dying so we can reap them.
pids = os::pids();
if (pids.isError()) {
return process::Failure("Failed to list of processes");
}
std::list<process::Future<Option<int>>> futures;
foreach (pid_t pid, pids.get()) {
Try<ino_t> ns = ns::getns(pid, "pid");
if (ns.isSome() && ns.get() == inode) {
futures.push_back(process::reap(pid));
}
// Ignore any errors, probably because the process no longer
// exists, and ignorable otherwise.
}
// Wait for all the signalled processes to terminate. The pid
// namespace will then be empty and will be released by the kernel
// (unless there are additional references).
return process::collect(futures)
.then(lambda::bind(&internal::_nothing));
}
} // namespace pid {
// Returns the namespace flags in the string form of bitwise-ORing the
// flags, e.g., CLONE_NEWNS | CLONE_NEWNET.
inline std::string stringify(int flags)
{
hashmap<unsigned int, std::string> names = {
{CLONE_NEWNS, "CLONE_NEWNS"},
{CLONE_NEWUTS, "CLONE_NEWUTS"},
{CLONE_NEWIPC, "CLONE_NEWIPC"},
{CLONE_NEWPID, "CLONE_NEWPID"},
{CLONE_NEWNET, "CLONE_NEWNET"},
{CLONE_NEWUSER, "CLONE_NEWUSER"}
};
std::vector<std::string> namespaces;
foreachpair (unsigned int flag, const std::string& name, names) {
if (flags & flag) {
namespaces.push_back(name);
}
}
return strings::join(" | ", namespaces);
}
} // namespace ns {
#endif // __LINUX_NS_HPP__