| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| #include "linux/ns.hpp" |
| |
| #include <unistd.h> |
| |
| #include <sys/socket.h> |
| #include <sys/wait.h> |
| |
| #include <cstring> |
| #include <type_traits> |
| #include <vector> |
| |
| #include <process/collect.hpp> |
| #include <process/future.hpp> |
| #include <process/reap.hpp> |
| |
| #include <stout/assert.hpp> |
| #include <stout/error.hpp> |
| #include <stout/hashmap.hpp> |
| #include <stout/nothing.hpp> |
| #include <stout/os.hpp> |
| #include <stout/path.hpp> |
| #include <stout/proc.hpp> |
| #include <stout/result.hpp> |
| #include <stout/stringify.hpp> |
| #include <stout/strings.hpp> |
| #include <stout/try.hpp> |
| #include <stout/version.hpp> |
| |
| #include <stout/os/exists.hpp> |
| #include <stout/os/ls.hpp> |
| #include <stout/os/socket.hpp> |
| |
| #include "common/status_utils.hpp" |
| |
| using std::set; |
| using std::string; |
| using std::vector; |
| |
| namespace ns { |
| |
| static Try<Version> kernelVersion() |
| { |
| Try<os::UTSInfo> uname = os::uname(); |
| if (!uname.isSome()) { |
| return Error("Unable to determine kernel version: " + uname.error()); |
| } |
| |
| vector<string> parts = strings::split(uname->release, "."); |
| parts.resize(2); |
| |
| Try<Version> version = Version::parse(strings::join(".", parts)); |
| if (!version.isSome()) { |
| return Error("Failed to parse kernel version '" + uname->release + |
| "': " + version.error()); |
| } |
| |
| return version; |
| } |
| |
| |
| Try<int> nstype(const string& ns) |
| { |
| const hashmap<string, int> nstypes = { |
| {"mnt", CLONE_NEWNS}, |
| {"uts", CLONE_NEWUTS}, |
| {"ipc", CLONE_NEWIPC}, |
| {"net", CLONE_NEWNET}, |
| {"user", CLONE_NEWUSER}, |
| {"pid", CLONE_NEWPID}, |
| {"cgroup", CLONE_NEWCGROUP} |
| }; |
| |
| Option<int> nstype = nstypes.get(ns); |
| |
| if (nstype.isNone()) { |
| return Error("Unknown namespace '" + ns + "'"); |
| } |
| |
| return nstype.get(); |
| } |
| |
| |
| Try<string> nsname(int nsType) |
| { |
| const hashmap<int, string> nsnames = { |
| {CLONE_NEWNS, "mnt"}, |
| {CLONE_NEWUTS, "uts"}, |
| {CLONE_NEWIPC, "ipc"}, |
| {CLONE_NEWNET, "net"}, |
| {CLONE_NEWUSER, "user"}, |
| {CLONE_NEWPID, "pid"}, |
| {CLONE_NEWCGROUP, "cgroup"} |
| }; |
| |
| Option<string> nsname = nsnames.get(nsType); |
| |
| if (nsname.isNone()) { |
| return Error("Unknown namespace"); |
| } |
| |
| return nsname.get(); |
| } |
| |
| |
| // TODO(jpeach): As we move namespace parameters from strings to CLONE |
| // constants, we should be able to eventually remove the internal uses |
| // of this function. |
| static set<string> namespaces() |
| { |
| set<string> result; |
| |
| Try<std::list<string>> entries = os::ls("/proc/self/ns"); |
| if (entries.isSome()) { |
| foreach (const string& entry, entries.get()) { |
| // Introduced in Linux 4.12, pid_for_children is a handle for the PID |
| // namespace of child processes created by the current process. |
| if (entry != "pid_for_children") { |
| result.insert(entry); |
| } |
| } |
| } |
| |
| return result; |
| } |
| |
| |
| set<int> nstypes() |
| { |
| set<int> result; |
| |
| foreach (const string& ns, namespaces()) { |
| Try<int> type = nstype(ns); |
| if (type.isSome()) { |
| result.insert(type.get()); |
| } |
| } |
| |
| return result; |
| } |
| |
| |
| Try<bool> supported(int nsTypes) |
| { |
| int supported = 0; |
| |
| foreach (const int n, nstypes()) { |
| if (nsTypes & n) { |
| supported |= n; |
| } |
| } |
| |
| if ((nsTypes & CLONE_NEWUSER) && (supported & CLONE_NEWUSER)) { |
| Try<Version> version = kernelVersion(); |
| |
| if (version.isError()) { |
| return Error(version.error()); |
| } |
| |
| if (version.get() < Version(3, 12, 0)) { |
| return false; |
| } |
| } |
| |
| return supported == nsTypes; |
| } |
| |
| |
| Try<Nothing> setns( |
| const string& path, |
| const string& ns, |
| bool checkMultithreaded) |
| { |
| if (checkMultithreaded) { |
| // Return error if there're multiple threads in the calling process. |
| Try<set<pid_t>> threads = proc::threads(::getpid()); |
| if (threads.isError()) { |
| return Error( |
| "Failed to get the threads of the current process: " + |
| threads.error()); |
| } else if (threads->size() > 1) { |
| return Error("Multiple threads exist in the current process"); |
| } |
| } |
| |
| if (ns::namespaces().count(ns) == 0) { |
| return Error("Namespace '" + ns + "' is not supported"); |
| } |
| |
| // Currently, we don't support pid namespace as its semantics is |
| // different from other namespaces (instead of re-associating the |
| // calling thread, it re-associates the *children* of the calling |
| // thread with the specified namespace). |
| if (ns == "pid") { |
| return Error("Pid namespace is not supported"); |
| } |
| |
| Try<int> fd = os::open(path, O_RDONLY | O_CLOEXEC); |
| |
| if (fd.isError()) { |
| return Error("Failed to open '" + path + "': " + fd.error()); |
| } |
| |
| Try<int> nstype = ns::nstype(ns); |
| if (nstype.isError()) { |
| return Error(nstype.error()); |
| } |
| |
| if (::setns(fd.get(), nstype.get()) == -1) { |
| // Save the errno as it might be overwritten by 'os::close' below. |
| ErrnoError error; |
| os::close(fd.get()); |
| return error; |
| } |
| |
| os::close(fd.get()); |
| return Nothing(); |
| } |
| |
| |
| Try<Nothing> setns(pid_t pid, const string& ns, bool checkMultithreaded) |
| { |
| if (!os::exists(pid)) { |
| return Error("Pid " + ::stringify(pid) + " does not exist"); |
| } |
| |
| string path = path::join("/proc", ::stringify(pid), "ns", ns); |
| if (!os::exists(path)) { |
| return Error("Namespace '" + ns + "' is not supported"); |
| } |
| |
| return ns::setns(path, ns, checkMultithreaded); |
| } |
| |
| |
| Result<ino_t> getns(pid_t pid, const string& ns) |
| { |
| if (ns::namespaces().count(ns) < 1) { |
| return Error("Namespace '" + ns + "' is not supported"); |
| } |
| |
| string path = path::join("/proc", ::stringify(pid), "ns", ns); |
| struct stat s; |
| if (::stat(path.c_str(), &s) < 0) { |
| if (errno == ENOENT) { |
| // Process is gone. |
| return None(); |
| } else { |
| return ErrnoError("Failed to stat " + ns + " namespace handle" |
| " for pid " + ::stringify(pid)); |
| } |
| } |
| |
| return s.st_ino; |
| } |
| |
| |
| // Helper for closing a container of file descriptors. |
| template < |
| typename Iterable, |
| typename = typename std::enable_if< |
| std::is_same<typename Iterable::value_type, int>::value>::type> |
| static void close(const Iterable& fds) |
| { |
| int errsav = errno; |
| |
| foreach (int fd, fds) { |
| ::close(fd); // Need to call the async-signal safe version. |
| } |
| |
| errno = errsav; |
| } |
| |
| |
| Try<pid_t> clone( |
| pid_t target, |
| int nstypes, |
| const lambda::function<int()>& f, |
| int flags) |
| { |
| // NOTE: the order in which we 'setns' is significant, so we use an |
| // array here rather than something like a map. |
| // |
| // The user namespace needs to be entered first if we need to |
| // increase the privilege and last if we want to decrease the |
| // privilege. Said another way, entering the user namespace first |
| // gives an unprivileged user the potential to enter the other |
| // namespaces. |
| const size_t NAMESPACES = 7; |
| const struct |
| { |
| int nstype; |
| string name; |
| } namespaces[NAMESPACES] = { |
| {CLONE_NEWUSER, "user"}, |
| {CLONE_NEWCGROUP, "cgroup"}, |
| {CLONE_NEWIPC, "ipc"}, |
| {CLONE_NEWUTS, "uts"}, |
| {CLONE_NEWNET, "net"}, |
| {CLONE_NEWPID, "pid"}, |
| {CLONE_NEWNS, "mnt"} |
| }; |
| |
| // Since we assume below that the parent can deallocate the stack |
| // after cloning the children, the caller must not pass CLONE_VM. |
| // That would cause the both processes to share their address space |
| // so deallocating the stack in the parent would affect the child. |
| CHECK_EQ(0, flags & CLONE_VM); |
| |
| // Support for user namespaces in all filesystems is incomplete |
| // until version 3.12 (see 'Availability' in man page of |
| // 'user_namespaces'), so for now we don't support entering them. |
| // |
| // TODO(benh): Support user namespaces if the current system can |
| // support it, e.g., check the kernel version number or try and do a |
| // clone with CLONE_NEWUSER to see if it works. NOTE: before we can |
| // fully support user namespaces, however, we must take care to |
| // either enter the user namespace first or last. We'll want to |
| // enter it first if we need to increase the privilege and last if |
| // we want to decrease the privilege. Currently nsenter.c from |
| // utils-linux does this via doing two passes to make sure we either |
| // enter first or last. We'll need to do something similar here once |
| // we support user namespaces as well. |
| if (nstypes & CLONE_NEWUSER) { |
| return Error("User namespaces are not supported"); |
| } |
| |
| // File descriptors keyed by the (parent) namespace we are entering. |
| hashmap<int, int> fds = {}; |
| |
| // NOTE: we do all of this ahead of time so we can be async signal |
| // safe after calling fork below. |
| for (size_t i = 0; i < NAMESPACES; i++) { |
| // Only open the namespace file descriptor if it's been requested. |
| if (namespaces[i].nstype & nstypes) { |
| const string path = |
| path::join("/proc", ::stringify(target), "ns", namespaces[i].name); |
| Try<int> fd = os::open(path, O_RDONLY); |
| if (fd.isError()) { |
| close(fds.values()); |
| return Error("Failed to open '" + path + |
| "' for entering namespace: " + fd.error()); |
| } |
| fds[namespaces[i].nstype] = fd.get(); |
| } |
| } |
| |
| // We use a domain socket rather than pipes so that we can send back |
| // the PID of the final child process. The parent socket is |
| // `sockets[0]` and the child socket is `sockets[1]`. Note that both |
| // sockets are both read/write but currently only the parent reads |
| // and the child writes. |
| Try<std::array<int_fd, 2>> sockets = net::socketpair(AF_UNIX, SOCK_STREAM, 0); |
| if (sockets.isError()) { |
| close(fds.values()); |
| return Error("Failed to create Unix domain socket: " + sockets.error()); |
| } |
| |
| // Need to set SO_PASSCRED option in order to receive credentials |
| // (which is how we get the pid of the clone'd process, see |
| // below). Note that apparently we only need to do this for |
| // receiving, not also for sending. |
| const int value = 1; |
| const socklen_t size = sizeof(value); |
| if (setsockopt(sockets->at(0), SOL_SOCKET, SO_PASSCRED, &value, size) == -1) { |
| close(fds.values()); |
| close(sockets.get()); |
| return ErrnoError("Failed to set socket option SO_PASSCRED"); |
| } |
| |
| // NOTE: to determine the pid of the final process executing the |
| // specified lambda we use the SCM_CREDENTIALS mechanism of |
| // 'sendmsg' and 'recvmsg'. On Linux there is also a way to do this |
| // via 'getsockopt' and SO_PEERCRED which looks easier, but IIUC |
| // requires you to do an explicit connect from the child process |
| // back to the parent so that there is only one connection per |
| // socket (unlike in our world where the socket can be used by |
| // multiple forks/clones simultaneously because it's just a file |
| // descriptor that gets copied after each fork/clone). Perhaps the |
| // SO_PEERCRED is less lines of code but this approach was taken for |
| // now. |
| |
| char base[1]; |
| |
| iovec iov = {nullptr}; |
| iov.iov_base = base; |
| iov.iov_len = sizeof(base); |
| |
| // We need to allocate a char array large enough to hold "control" data. |
| // However, since this buffer is in reality a 'cmsghdr' with the payload, we |
| // use a union to ensure that it is aligned as required for that structure. |
| union { |
| cmsghdr cmessage; |
| char control[CMSG_SPACE(sizeof(ucred))]; |
| }; |
| |
| cmessage.cmsg_len = CMSG_LEN(sizeof(ucred)); |
| cmessage.cmsg_level = SOL_SOCKET; |
| cmessage.cmsg_type = SCM_CREDENTIALS; |
| |
| msghdr message = {nullptr}; |
| message.msg_name = nullptr; |
| message.msg_namelen = 0; |
| message.msg_iov = &iov; |
| message.msg_iovlen = 1; |
| message.msg_control = control; |
| message.msg_controllen = sizeof(control); // CMSG_LEN(sizeof(ucred)); |
| |
| // Finally, the stack we'll use in the call to os::clone below (we |
| // allocate the stack here in order to keep the call to os::clone |
| // async signal safe, since otherwise it would be doing the dynamic |
| // allocation itself). |
| Try<os::Stack> stack = os::Stack::create(os::Stack::DEFAULT_SIZE); |
| if (stack.isError()) { |
| return Error("Failed to allocate stack: " + stack.error()); |
| } |
| |
| pid_t child = fork(); |
| if (child < 0) { |
| stack->deallocate(); |
| close(fds.values()); |
| close(sockets.get()); |
| return ErrnoError(); |
| } else if (child > 0) { |
| // Parent. |
| stack->deallocate(); |
| |
| close(fds.values()); |
| ::close(sockets->at(1)); |
| |
| ssize_t length = recvmsg(sockets->at(0), &message, 0); |
| |
| // TODO(benh): Note that whenever we 'kill(child, SIGKILL)' below |
| // we don't guarantee cleanup! It's possible that the |
| // greatgrandchild is still running. Require the greatgrandchild |
| // to read from the socket after sending back it's pid to ensure |
| // no orphans. |
| |
| if (length < 0) { |
| // We failed to read, close the socket and kill the child |
| // (which might die on it's own trying to write to the |
| // socket). |
| Error error = ErrnoError("Failed to receive"); |
| ::close(sockets->at(0)); |
| kill(child, SIGKILL); |
| return error; |
| } else if (length == 0) { |
| // Socket closed, child must have died, but kill anyway. |
| ::close(sockets->at(0)); |
| kill(child, SIGKILL); |
| return Error("Failed to receive: Socket closed"); |
| } |
| |
| ::close(sockets->at(0)); |
| |
| // Extract pid. |
| if (CMSG_FIRSTHDR(&message) == nullptr || |
| CMSG_FIRSTHDR(&message)->cmsg_len != CMSG_LEN(sizeof(ucred)) || |
| CMSG_FIRSTHDR(&message)->cmsg_level != SOL_SOCKET || |
| CMSG_FIRSTHDR(&message)->cmsg_type != SCM_CREDENTIALS) { |
| kill(child, SIGKILL); |
| return Error("Bad control data received"); |
| } |
| |
| ucred cred; |
| std::memcpy(&cred, CMSG_DATA(CMSG_FIRSTHDR(&message)), sizeof(ucred)); |
| |
| const pid_t pid = cred.pid; |
| |
| // Need to `waitpid` on child process to avoid a zombie. Note that |
| // it's expected that the child will terminate quickly hence |
| // blocking here. |
| int status; |
| while (true) { |
| if (waitpid(child, &status, 0) == -1) { |
| if (errno == EINTR) { |
| continue; |
| } else { |
| return ErrnoError("Failed to `waitpid` on child"); |
| } |
| } else if (WIFSTOPPED(status)) { |
| continue; |
| } else { |
| break; |
| } |
| } |
| |
| CHECK(WIFEXITED(status) || WIFSIGNALED(status)) |
| << "Unexpected wait status " << status; |
| |
| if (!WSUCCEEDED(status)) { |
| return Error("Failed to clone: " + WSTRINGIFY(status)); |
| } |
| |
| return pid; |
| } else { |
| // Child. |
| ::close(sockets->at(0)); |
| |
| // Loop through and 'setns' into all of the parent namespaces that |
| // have been requested. |
| for (size_t i = 0; i < NAMESPACES; i++) { |
| Option<int> fd = fds.get(namespaces[i].nstype); |
| if (fd.isSome()) { |
| ASSERT(namespaces[i].nstype & nstypes); |
| if (::setns(fd.get(), namespaces[i].nstype) < 0) { |
| close(fds.values()); |
| ::close(sockets->at(1)); |
| _exit(EXIT_FAILURE); |
| } |
| } |
| } |
| |
| close(fds.values()); |
| |
| auto grandchildMain = [=]() -> int { |
| // Grandchild (second child, now completely entered in the |
| // namespaces of the target). |
| // |
| // Now clone with the specified flags, close the unused socket, |
| // and execute the specified function. |
| pid_t pid = os::signal_safe::clone( |
| stack.get(), |
| flags, |
| [=]() { |
| ucred cred; |
| cred.pid = ::getpid(); |
| cred.uid = ::getuid(); |
| cred.gid = ::getgid(); |
| |
| // Now send back the pid and have it be translated appropriately |
| // by the kernel to the enclosing pid namespace. |
| // |
| // NOTE: sending back the pid is best effort because we're going |
| // to exit no matter what. |
| std::memcpy( |
| CMSG_DATA(CMSG_FIRSTHDR(&message)), &cred, sizeof(ucred)); |
| |
| if (sendmsg(sockets->at(1), &message, 0) == -1) { |
| // Failed to send the pid back to the parent! |
| _exit(EXIT_FAILURE); |
| } |
| |
| ::close(sockets->at(1)); |
| |
| return f(); |
| }); |
| |
| ::close(sockets->at(1)); |
| |
| // TODO(benh): Kill ourselves with an exit status that we can |
| // decode above to determine why `clone` failed. |
| _exit(pid < 0 ? EXIT_FAILURE : EXIT_SUCCESS); |
| UNREACHABLE(); |
| }; |
| |
| os::Stack grandchildStack(os::Stack::DEFAULT_SIZE); |
| |
| if (!grandchildStack.allocate()) { |
| ::close(sockets->at(1)); |
| _exit(EXIT_FAILURE); |
| } |
| |
| // Fork again to make sure we're actually in those namespaces |
| // (required for the pid namespace at least). |
| // |
| // NOTE: We use clone instead of fork here because of a glibc bug. |
| // glibc version < 2.25 has an assertion in 'fork()' which checks |
| // if the child process's pid is not the same as the parent. This |
| // invariant is no longer true with pid namespaces being |
| // introduced. See more details in MESOS-7858. |
| // |
| // NOTE: glibc 'fork()' also specifies 'CLONE_CHILD_SETTID' and |
| // 'CLONE_CHILD_CLEARTID' for the clone flags. However, since we |
| // are not using any pthread library in the grandchild, we don't |
| // need those flags. |
| // |
| // TODO(benh): Don't do a fork if we're not actually entering the |
| // PID namespace since the extra fork is unnecessary. |
| pid_t grandchild = |
| os::signal_safe::clone(grandchildStack, SIGCHLD, grandchildMain); |
| |
| grandchildStack.deallocate(); |
| |
| if (grandchild < 0) { |
| // TODO(benh): Exit with `errno` in order to capture `fork` error? |
| ::close(sockets->at(1)); |
| _exit(EXIT_FAILURE); |
| } else if (grandchild > 0) { |
| // Still the (first) child. |
| ::close(sockets->at(1)); |
| |
| // Need to reap the grandchild and then just exit since we're no |
| // longer necessary. Technically when the grandchild exits it'll |
| // be reaped but by doing a `waitpid` we can better propagate |
| // back any errors that might have occurred with the grandchild. |
| int status; |
| while (true) { |
| if (waitpid(grandchild, &status, 0) == -1) { |
| if (errno == EINTR) { |
| continue; |
| } else { |
| _exit(1); |
| } |
| } else if (WIFSTOPPED(status)) { |
| continue; |
| } else { |
| break; |
| } |
| } |
| |
| ASSERT(WIFEXITED(status) || WIFSIGNALED(status)); |
| |
| if (WIFEXITED(status)) { |
| _exit(WEXITSTATUS(status)); |
| } |
| |
| ASSERT(WIFSIGNALED(status)); |
| raise(WTERMSIG(status)); |
| } |
| } |
| UNREACHABLE(); |
| } |
| |
| |
| string stringify(int flags) |
| { |
| const hashmap<unsigned int, string> names = { |
| {CLONE_NEWNS, "CLONE_NEWNS"}, |
| {CLONE_NEWUTS, "CLONE_NEWUTS"}, |
| {CLONE_NEWIPC, "CLONE_NEWIPC"}, |
| {CLONE_NEWPID, "CLONE_NEWPID"}, |
| {CLONE_NEWNET, "CLONE_NEWNET"}, |
| {CLONE_NEWUSER, "CLONE_NEWUSER"}, |
| {CLONE_NEWCGROUP, "CLONE_NEWCGROUP"} |
| }; |
| |
| vector<string> namespaces; |
| foreachpair (unsigned int flag, const string& name, names) { |
| if (flags & flag) { |
| namespaces.push_back(name); |
| } |
| } |
| |
| return strings::join(" | ", namespaces); |
| } |
| |
| } // namespace ns { |