src/linux/ns.hpp - mesos - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #ifndef __LINUX_NS_HPP__
 #define __LINUX_NS_HPP__

 // This file contains Linux-only OS utilities.
 #ifndef __linux__
 #error "linux/ns.hpp is only available on Linux systems."
 #endif

 #include <sched.h>
 #include <unistd.h>

 #include <sys/socket.h>
 #include <sys/syscall.h>
 #include <sys/wait.h>

 #include <cstring>
 #include <set>
 #include <string>
 #include <vector>

 #include <stout/assert.hpp>
 #include <stout/error.hpp>
 #include <stout/hashmap.hpp>
 #include <stout/nothing.hpp>
 #include <stout/os.hpp>
 #include <stout/path.hpp>
 #include <stout/proc.hpp>
 #include <stout/result.hpp>
 #include <stout/stringify.hpp>
 #include <stout/strings.hpp>
 #include <stout/try.hpp>

 #include <stout/os/exists.hpp>
 #include <stout/os/ls.hpp>

 #include <process/collect.hpp>
 #include <process/future.hpp>
 #include <process/reap.hpp>

 #include "common/status_utils.hpp"

 #ifndef CLONE_NEWNS
 #define CLONE_NEWNS 0x00020000
 #endif

 #ifndef CLONE_NEWUTS
 #define CLONE_NEWUTS 0x04000000
 #endif

 #ifndef CLONE_NEWIPC
 #define CLONE_NEWIPC 0x08000000
 #endif

 #ifndef CLONE_NEWPID
 #define CLONE_NEWPID 0x20000000
 #endif

 #ifndef CLONE_NEWNET
 #define CLONE_NEWNET 0x40000000
 #endif

 #ifndef CLONE_NEWUSER
 #define CLONE_NEWUSER 0x10000000
 #endif

 #ifndef CLONE_NEWCGROUP
 #define CLONE_NEWCGROUP 0x02000000
 #endif

 // Define a 'setns' for compilation environments that don't already
 // have one.
 inline int setns(int fd, int nstype)
 {
 #ifdef SYS_setns
   return ::syscall(SYS_setns, fd, nstype);
 #elif defined(__x86_64__)
   // A workaround for those hosts that have an old glibc (older than
   // 2.14) but have a new kernel. The magic number '308' here is the
   // syscall number for 'setns' on x86_64 architecture.
   return ::syscall(308, fd, nstype);
 #else
 #error "setns is not available"
 #endif
 }

 namespace ns {

 // Returns the nstype (e.g., CLONE_NEWNET, CLONE_NEWNS, etc.) for the
 // given namespace which can be used when calling ::setns.
 inline Try<int> nstype(const std::string& ns)
 {
   const hashmap<std::string, int> nstypes = {
     {"mnt", CLONE_NEWNS},
     {"uts", CLONE_NEWUTS},
     {"ipc", CLONE_NEWIPC},
     {"net", CLONE_NEWNET},
     {"user", CLONE_NEWUSER},
     {"pid", CLONE_NEWPID},
     {"cgroup", CLONE_NEWCGROUP}
   };

   Option<int> nstype = nstypes.get(ns);

   if (nstype.isNone()) {
     return Error("Unknown namespace '" + ns + "'");
   }

   return nstype.get();
 }


 // Returns all the supported namespaces by the kernel.
 inline std::set<std::string> namespaces()
 {
   std::set<std::string> result;
   Try<std::list<std::string>> entries = os::ls("/proc/self/ns");
   if (entries.isSome()) {
     foreach (const std::string& entry, entries.get()) {
       // Introduced in Linux 4.12, pid_for_children is a handle for the PID
       // namespace of child processes created by the current process.
       if (entry != "pid_for_children") {
         result.insert(entry);
       }
     }
   }
   return result;
 }


 // Returns all the supported namespaces by the kernel.
 inline std::set<int> nstypes()
 {
   std::set<int> result;
   foreach (const std::string& ns, namespaces()) {
     Try<int> type = nstype(ns);
     if (type.isSome()) {
       result.insert(type.get());
     }
   }
   return result;
 }


 // Re-associate the calling process with the specified namespace. The
 // path refers to one of the corresponding namespace entries in the
 // /proc/[pid]/ns/ directory (or bind mounted elsewhere). We do not
 // allow a process with multiple threads to call this function because
 // it will lead to some weird situations where different threads of a
 // process are in different namespaces.
 inline Try<Nothing> setns(
     const std::string& path,
     const std::string& ns,
     bool checkMultithreaded = true)
 {
   if (checkMultithreaded) {
     // Return error if there're multiple threads in the calling process.
     Try<std::set<pid_t>> threads = proc::threads(::getpid());
     if (threads.isError()) {
       return Error(
           "Failed to get the threads of the current process: " +
           threads.error());
     } else if (threads.get().size() > 1) {
       return Error("Multiple threads exist in the current process");
     }
   }

   if (ns::namespaces().count(ns) == 0) {
     return Error("Namespace '" + ns + "' is not supported");
   }

   // Currently, we don't support pid namespace as its semantics is
   // different from other namespaces (instead of re-associating the
   // calling thread, it re-associates the *children* of the calling
   // thread with the specified namespace).
   if (ns == "pid") {
     return Error("Pid namespace is not supported");
   }

   Try<int> fd = os::open(path, O_RDONLY | O_CLOEXEC);

   if (fd.isError()) {
     return Error("Failed to open '" + path + "': " + fd.error());
   }

   Try<int> nstype = ns::nstype(ns);
   if (nstype.isError()) {
     return Error(nstype.error());
   }

   if (::setns(fd.get(), nstype.get()) == -1) {
     // Save the errno as it might be overwritten by 'os::close' below.
     ErrnoError error;
     os::close(fd.get());
     return error;
   }

   os::close(fd.get());
   return Nothing();
 }


 // Re-associate the calling process with the specified namespace. The
 // pid specifies the process whose namespace we will associate.
 inline Try<Nothing> setns(pid_t pid, const std::string& ns)
 {
   if (!os::exists(pid)) {
     return Error("Pid " + stringify(pid) + " does not exist");
   }

   std::string path = path::join("/proc", stringify(pid), "ns", ns);
   if (!os::exists(path)) {
     return Error("Namespace '" + ns + "' is not supported");
   }

   return ns::setns(path, ns);
 }


 // Get the inode number of the specified namespace for the specified
 // pid. The inode number identifies the namespace and can be used for
 // comparisons, i.e., two processes with the same inode for a given
 // namespace type are in the same namespace.
 inline Result<ino_t> getns(pid_t pid, const std::string& ns)
 {
   if (ns::namespaces().count(ns) < 1) {
     return Error("Namespace '" + ns + "' is not supported");
   }

   std::string path = path::join("/proc", stringify(pid), "ns", ns);
   struct stat s;
   if (::stat(path.c_str(), &s) < 0) {
     if (errno == ENOENT) {
       // Process is gone.
       return None();
     } else {
       return ErrnoError("Failed to stat " + ns + " namespace handle"
                         " for pid " + stringify(pid));
     }
   }

   return s.st_ino;
 }


 /**
  * Performs an `os::clone` after entering a set of namespaces for the
  * specified `target` process.
  *
  * This function provides two steps of functionality:
  *   (1) Enter a set of namespaces via two `fork` calls.
  *   (1) Perform a `clone` within that set of namespaces.
  *
  * Step (1) of functionality is similar to the `nsenter` command line
  * utility. Step (2) allows us to perform a clone that itself might
  * create a nested set of namespaces, which enables us to have nested
  * containers.
  *
  * Double Fork:
  *
  * In order to enter a PID namespace we need to do a double fork
  * because doing a `setns` for a PID namespace only effects future
  * children.
  *
  * Moreover, attempting to `setns` before we do any forks and then
  * have the parent `setns` back to the original namespaces does not
  * work because entering a depriviledged user namespace will not let
  * us reassociate back with the original namespace, even if we keep
  * the file descriptor of the original namespace open.
  *
  * Because we have to double fork we need to send back the actual PID
  * of the final process that's executing the provided function `f`.
  * We use domain sockets for this because in the event we've entered a
  * PID namespace we need the kernel to translate the PID to the PID in
  * our PID namespace.
  *
  * @param target Target process whose namespaces we should enter.
  * @param nstypes Namespaces we should enter.
  * @param f Function to invoke after entering the namespaces and cloning.
  * @param flags Flags to pass to `clone`.
  *
  * @return `pid_t` of the child process.
  */
 inline Try<pid_t> clone(
     pid_t target,
     int nstypes,
     const lambda::function<int()>& f,
     int flags)
 {
   // NOTE: the order in which we 'setns' is significant, so we use an
   // array here rather than something like a map.
   //
   // The user namespace needs to be entered first if we need to
   // increase the privilege and last if we want to decrease the
   // privilege. Said another way, entering the user namespace first
   // gives an unprivileged user the potential to enter the other
   // namespaces.
   const size_t NAMESPACES = 7;
   struct
   {
     int nstype;
     std::string name;
   } namespaces[NAMESPACES] = {
     {CLONE_NEWUSER, "user"},
     {CLONE_NEWCGROUP, "cgroup"},
     {CLONE_NEWIPC, "ipc"},
     {CLONE_NEWUTS, "uts"},
     {CLONE_NEWNET, "net"},
     {CLONE_NEWPID, "pid"},
     {CLONE_NEWNS, "mnt"}
   };

   // Since we assume below that the parent can deallocate the stack
   // after cloning the children, the caller must not pass CLONE_VM.
   // That would cause the both processes to share their address space
   // so deallocating the stack in the parent would affect the child.
   CHECK_EQ(0, flags & CLONE_VM);

   // Support for user namespaces in all filesystems is incomplete
   // until version 3.12 (see 'Availability' in man page of
   // 'user_namespaces'), so for now we don't support entering them.
   //
   // TODO(benh): Support user namespaces if the current system can
   // support it, e.g., check the kernel version number or try and do a
   // clone with CLONE_NEWUSER to see if it works. NOTE: before we can
   // fully support user namespaces, however, we must take care to
   // either enter the user namespace first or last. We'll want to
   // enter it first if we need to increase the privilege and last if
   // we want to decrease the privilege. Currently nsenter.c from
   // utils-linux does this via doing two passes to make sure we either
   // enter first or last. We'll need to do something similar here once
   // we support user namespaces as well.
   if (nstypes & CLONE_NEWUSER) {
     return Error("User namespaces are not supported");
   }

   // File descriptors keyed by the (parent) namespace we are entering.
   hashmap<int, int> fds = {};

   // Helper for closing a list of file descriptors.
   auto close = [](const std::list<int>& fds) {
     foreach (int fd, fds) {
       ::close(fd); // Need to call the async-signal safe version.
     }
   };

   // NOTE: we do all of this ahead of time so we can be async signal
   // safe after calling fork below.
   for (size_t i = 0; i < NAMESPACES; i++) {
     // Only open the namespace file descriptor if it's been requested.
     if (namespaces[i].nstype & nstypes) {
       std::string path =
         path::join("/proc", stringify(target), "ns", namespaces[i].name);
       Try<int> fd = os::open(path, O_RDONLY);
       if (fd.isError()) {
         close(fds.values());
         return Error("Failed to open '" + path +
                      "' for entering namespace: " + fd.error());
       }
       fds[namespaces[i].nstype] = fd.get();
     }
   }

   // We use a domain socket rather than pipes so that we can send back
   // the PID of the final child process. The parent socket is
   // `sockets[0]` and the child socket is `sockets[1]`. Note that both
   // sockets are both read/write but currently only the parent reads
   // and the child writes.
   int sockets[2] = {-1, -1};
   if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) != 0) {
     close(fds.values());
     return ErrnoError("Failed to create Unix domain socket");
   }

   // Need to set SO_PASSCRED option in order to receive credentials
   // (which is how we get the pid of the clone'd process, see
   // below). Note that apparently we only need to do this for
   // receiving, not also for sending.
   const int value = 1;
   const ssize_t size = sizeof(value);
   if (setsockopt(sockets[0], SOL_SOCKET, SO_PASSCRED, &value, size) == -1) {
     Error error = ErrnoError("Failed to set socket option SO_PASSCRED");
     close(fds.values());
     ::close(sockets[0]);
     ::close(sockets[1]);
     return error;
   }

   // NOTE: to determine the pid of the final process executing the
   // specified lambda we use the SCM_CREDENTIALS mechanism of
   // 'sendmsg' and 'recvmsg'. On Linux there is also a way to do this
   // via 'getsockopt' and SO_PEERCRED which looks easier, but IIUC
   // requires you to do an explicit connect from the child process
   // back to the parent so that there is only one connection per
   // socket (unlike in our world where the socket can be used by
   // multiple forks/clones simultaneously because it's just a file
   // descriptor that gets copied after each fork/clone). Perhaps the
   // SO_PEERCRED is less lines of code but this approach was taken for
   // now.

   char base[1];

   iovec iov = {0};
   iov.iov_base = base;
   iov.iov_len = sizeof(base);

   // Need to allocate a char array large enough to hold "control"
   // data. However, since this buffer is in reality a 'struct cmsghdr'
   // we use a union to ensure that it is aligned as required for that
   // structure.
   union {
     struct cmsghdr cmessage;
     char control[CMSG_SPACE(sizeof(struct ucred))];
   };

   cmessage.cmsg_len = CMSG_LEN(sizeof(struct ucred));
   cmessage.cmsg_level = SOL_SOCKET;
   cmessage.cmsg_type = SCM_CREDENTIALS;

   msghdr message = {0};
   message.msg_name = nullptr;
   message.msg_namelen = 0;
   message.msg_iov = &iov;
   message.msg_iovlen = 1;
   message.msg_control = control;
   message.msg_controllen = sizeof(control); // CMSG_LEN(sizeof(struct ucred));

   // Finally, the stack we'll use in the call to os::clone below (we
   // allocate the stack here in order to keep the call to os::clone
   // async signal safe, since otherwise it would be doing the dynamic
   // allocation itself).
   Try<os::Stack> stack = os::Stack::create(os::Stack::DEFAULT_SIZE);
   if (stack.isError()) {
     return Error("Failed to allocate stack: " + stack.error());
   }

   pid_t child = fork();
   if (child < 0) {
     stack->deallocate();
     close(fds.values());
     ::close(sockets[0]);
     ::close(sockets[1]);
     return ErrnoError();
   } else if (child > 0) {
     // Parent.
     stack->deallocate();

     close(fds.values());
     ::close(sockets[1]);

     ssize_t length = recvmsg(sockets[0], &message, 0);

     // TODO(benh): Note that whenever we 'kill(child, SIGKILL)' below
     // we don't guarantee cleanup! It's possible that the
     // greatgrandchild is still running. Require the greatgrandchild
     // to read from the socket after sending back it's pid to ensure
     // no orphans.

     if (length < 0) {
       // We failed to read, close the socket and kill the child
       // (which might die on it's own trying to write to the
       // socket).
       Error error = ErrnoError("Failed to receive");
       ::close(sockets[0]);
       kill(child, SIGKILL);
       return error;
     } else if (length == 0) {
       // Socket closed, child must have died, but kill anyway.
       ::close(sockets[0]);
       kill(child, SIGKILL);
       return Error("Failed to receive: Socket closed");
     }

     ::close(sockets[0]);

     // Extract pid.
     if (CMSG_FIRSTHDR(&message) == nullptr ||
         CMSG_FIRSTHDR(&message)->cmsg_len != CMSG_LEN(sizeof(struct ucred)) ||
         CMSG_FIRSTHDR(&message)->cmsg_level != SOL_SOCKET ||
         CMSG_FIRSTHDR(&message)->cmsg_type != SCM_CREDENTIALS) {
       kill(child, SIGKILL);
       return Error("Bad control data received");
     }

     struct ucred cred;
     std::memcpy(
         &cred, CMSG_DATA(CMSG_FIRSTHDR(&message)), sizeof(struct ucred));

     const pid_t pid = cred.pid;

     // Need to `waitpid` on child process to avoid a zombie. Note that
     // it's expected that the child will terminate quickly hence
     // blocking here.
     int status;
     while (true) {
       if (waitpid(child, &status, 0) == -1) {
         if (errno == EINTR) {
           continue;
         } else {
           return ErrnoError("Failed to `waitpid` on child");
         }
       } else if (WIFSTOPPED(status)) {
         continue;
       } else {
         break;
       }
     }

     CHECK(WIFEXITED(status) || WIFSIGNALED(status))
       << "Unexpected wait status " << status;

     if (!WSUCCEEDED(status)) {
       return Error("Failed to clone: " + WSTRINGIFY(status));
     }

     return pid;
   } else {
     // Child.
     ::close(sockets[0]);

     // Loop through and 'setns' into all of the parent namespaces that
     // have been requested.
     for (size_t i = 0; i < NAMESPACES; i++) {
       Option<int> fd = fds.get(namespaces[i].nstype);
       if (fd.isSome()) {
         ASSERT(namespaces[i].nstype & nstypes);
         if (::setns(fd.get(), namespaces[i].nstype) < 0) {
           close(fds.values());
           ::close(sockets[1]);
           _exit(EXIT_FAILURE);
         }
       }
     }

     close(fds.values());

     auto grandchildMain = [=]() -> int {
       // Grandchild (second child, now completely entered in the
       // namespaces of the target).
       //
       // Now clone with the specified flags, close the unused socket,
       // and execute the specified function.
       pid_t pid = os::signal_safe::clone(
           stack.get(),
           flags,
           [=]() {
             struct ucred cred;
             cred.pid = ::getpid();
             cred.uid = ::getuid();
             cred.gid = ::getgid();

             // Now send back the pid and have it be translated appropriately
             // by the kernel to the enclosing pid namespace.
             //
             // NOTE: sending back the pid is best effort because we're going
             // to exit no matter what.
             std::memcpy(
                 CMSG_DATA(CMSG_FIRSTHDR(&message)),
                 &cred,
                 sizeof(struct ucred));

             if (sendmsg(sockets[1], &message, 0) == -1) {
               // Failed to send the pid back to the parent!
               _exit(EXIT_FAILURE);
             }

             ::close(sockets[1]);

             return f();
           });

       ::close(sockets[1]);

       // TODO(benh): Kill ourselves with an exit status that we can
       // decode above to determine why `clone` failed.
       _exit(pid < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
       UNREACHABLE();
     };

     os::Stack grandchildStack(os::Stack::DEFAULT_SIZE);

     if (!grandchildStack.allocate()) {
       ::close(sockets[1]);
       _exit(EXIT_FAILURE);
     }

     // Fork again to make sure we're actually in those namespaces
     // (required for the pid namespace at least).
     //
     // NOTE: We use clone instead of fork here because of a glibc bug.
     // glibc version < 2.25 has an assertion in 'fork()' which checks
     // if the child process's pid is not the same as the parent. This
     // invariant is no longer true with pid namespaces being
     // introduced. See more details in MESOS-7858.
     //
     // NOTE: glibc 'fork()' also specifies 'CLONE_CHILD_SETTID' and
     // 'CLONE_CHILD_CLEARTID' for the clone flags. However, since we
     // are not using any pthread library in the grandchild, we don't
     // need those flags.
     //
     // TODO(benh): Don't do a fork if we're not actually entering the
     // PID namespace since the extra fork is unnecessary.
     pid_t grandchild =
       os::signal_safe::clone(grandchildStack, SIGCHLD, grandchildMain);

     grandchildStack.deallocate();

     if (grandchild < 0) {
       // TODO(benh): Exit with `errno` in order to capture `fork` error?
       ::close(sockets[1]);
       _exit(EXIT_FAILURE);
     } else if (grandchild > 0) {
       // Still the (first) child.
       ::close(sockets[1]);

       // Need to reap the grandchild and then just exit since we're no
       // longer necessary. Technically when the grandchild exits it'll
       // be reaped but by doing a `waitpid` we can better propagate
       // back any errors that might have occurred with the grandchild.
       int status;
       while (true) {
         if (waitpid(grandchild, &status, 0) == -1) {
           if (errno == EINTR) {
             continue;
           } else {
             _exit(1);
           }
         } else if (WIFSTOPPED(status)) {
           continue;
         } else {
           break;
         }
       }

       ASSERT(WIFEXITED(status) || WIFSIGNALED(status));

       if (WIFEXITED(status)) {
         _exit(WEXITSTATUS(status));
       }

       ASSERT(WIFSIGNALED(status));
       raise(WTERMSIG(status));
     }
   }
   UNREACHABLE();
 }


 // Returns the namespace flags in the string form of bitwise-ORing the
 // flags, e.g., CLONE_NEWNS | CLONE_NEWNET.
 inline std::string stringify(int flags)
 {
   hashmap<unsigned int, std::string> names = {
     {CLONE_NEWNS,   "CLONE_NEWNS"},
     {CLONE_NEWUTS,  "CLONE_NEWUTS"},
     {CLONE_NEWIPC,  "CLONE_NEWIPC"},
     {CLONE_NEWPID,  "CLONE_NEWPID"},
     {CLONE_NEWNET,  "CLONE_NEWNET"},
     {CLONE_NEWUSER, "CLONE_NEWUSER"},
     {CLONE_NEWCGROUP, "CLONE_NEWCGROUP"}
   };

   std::vector<std::string> namespaces;
   foreachpair (unsigned int flag, const std::string& name, names) {
     if (flags & flag) {
       namespaces.push_back(name);
     }
   }

   return strings::join(" | ", namespaces);
 }

 } // namespace ns {

 #endif // __LINUX_NS_HPP__
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	#ifndef __LINUX_NS_HPP__
	#define __LINUX_NS_HPP__

	// This file contains Linux-only OS utilities.
	#ifndef __linux__
	#error "linux/ns.hpp is only available on Linux systems."
	#endif

	#include <sched.h>
	#include <unistd.h>

	#include <sys/socket.h>
	#include <sys/syscall.h>
	#include <sys/wait.h>

	#include <cstring>
	#include <set>
	#include <string>
	#include <vector>

	#include <stout/assert.hpp>
	#include <stout/error.hpp>
	#include <stout/hashmap.hpp>
	#include <stout/nothing.hpp>
	#include <stout/os.hpp>
	#include <stout/path.hpp>
	#include <stout/proc.hpp>
	#include <stout/result.hpp>
	#include <stout/stringify.hpp>
	#include <stout/strings.hpp>
	#include <stout/try.hpp>

	#include <stout/os/exists.hpp>
	#include <stout/os/ls.hpp>

	#include <process/collect.hpp>
	#include <process/future.hpp>
	#include <process/reap.hpp>

	#include "common/status_utils.hpp"

	#ifndef CLONE_NEWNS
	#define CLONE_NEWNS 0x00020000
	#endif

	#ifndef CLONE_NEWUTS
	#define CLONE_NEWUTS 0x04000000
	#endif

	#ifndef CLONE_NEWIPC
	#define CLONE_NEWIPC 0x08000000
	#endif

	#ifndef CLONE_NEWPID
	#define CLONE_NEWPID 0x20000000
	#endif

	#ifndef CLONE_NEWNET
	#define CLONE_NEWNET 0x40000000
	#endif

	#ifndef CLONE_NEWUSER
	#define CLONE_NEWUSER 0x10000000
	#endif

	#ifndef CLONE_NEWCGROUP
	#define CLONE_NEWCGROUP 0x02000000
	#endif

	// Define a 'setns' for compilation environments that don't already
	// have one.
	inline int setns(int fd, int nstype)
	{
	#ifdef SYS_setns
	return ::syscall(SYS_setns, fd, nstype);
	#elif defined(__x86_64__)
	// A workaround for those hosts that have an old glibc (older than
	// 2.14) but have a new kernel. The magic number '308' here is the
	// syscall number for 'setns' on x86_64 architecture.
	return ::syscall(308, fd, nstype);
	#else
	#error "setns is not available"
	#endif
	}

	namespace ns {

	// Returns the nstype (e.g., CLONE_NEWNET, CLONE_NEWNS, etc.) for the
	// given namespace which can be used when calling ::setns.
	inline Try<int> nstype(const std::string& ns)
	{
	const hashmap<std::string, int> nstypes = {
	{"mnt", CLONE_NEWNS},
	{"uts", CLONE_NEWUTS},
	{"ipc", CLONE_NEWIPC},
	{"net", CLONE_NEWNET},
	{"user", CLONE_NEWUSER},
	{"pid", CLONE_NEWPID},
	{"cgroup", CLONE_NEWCGROUP}
	};

	Option<int> nstype = nstypes.get(ns);

	if (nstype.isNone()) {
	return Error("Unknown namespace '" + ns + "'");
	}

	return nstype.get();
	}


	// Returns all the supported namespaces by the kernel.
	inline std::set<std::string> namespaces()
	{
	std::set<std::string> result;
	Try<std::list<std::string>> entries = os::ls("/proc/self/ns");
	if (entries.isSome()) {
	foreach (const std::string& entry, entries.get()) {
	// Introduced in Linux 4.12, pid_for_children is a handle for the PID
	// namespace of child processes created by the current process.
	if (entry != "pid_for_children") {
	result.insert(entry);
	}
	}
	}
	return result;
	}


	// Returns all the supported namespaces by the kernel.
	inline std::set<int> nstypes()
	{
	std::set<int> result;
	foreach (const std::string& ns, namespaces()) {
	Try<int> type = nstype(ns);
	if (type.isSome()) {
	result.insert(type.get());
	}
	}
	return result;
	}


	// Re-associate the calling process with the specified namespace. The
	// path refers to one of the corresponding namespace entries in the
	// /proc/[pid]/ns/ directory (or bind mounted elsewhere). We do not
	// allow a process with multiple threads to call this function because
	// it will lead to some weird situations where different threads of a
	// process are in different namespaces.
	inline Try<Nothing> setns(
	const std::string& path,
	const std::string& ns,
	bool checkMultithreaded = true)
	{
	if (checkMultithreaded) {
	// Return error if there're multiple threads in the calling process.
	Try<std::set<pid_t>> threads = proc::threads(::getpid());
	if (threads.isError()) {
	return Error(
	"Failed to get the threads of the current process: " +
	threads.error());
	} else if (threads.get().size() > 1) {
	return Error("Multiple threads exist in the current process");
	}
	}

	if (ns::namespaces().count(ns) == 0) {
	return Error("Namespace '" + ns + "' is not supported");
	}

	// Currently, we don't support pid namespace as its semantics is
	// different from other namespaces (instead of re-associating the
	// calling thread, it re-associates the children of the calling
	// thread with the specified namespace).
	if (ns == "pid") {
	return Error("Pid namespace is not supported");
	}

	Try<int> fd = os::open(path, O_RDONLY \| O_CLOEXEC);

	if (fd.isError()) {
	return Error("Failed to open '" + path + "': " + fd.error());
	}

	Try<int> nstype = ns::nstype(ns);
	if (nstype.isError()) {
	return Error(nstype.error());
	}

	if (::setns(fd.get(), nstype.get()) == -1) {
	// Save the errno as it might be overwritten by 'os::close' below.
	ErrnoError error;
	os::close(fd.get());
	return error;
	}

	os::close(fd.get());
	return Nothing();
	}


	// Re-associate the calling process with the specified namespace. The
	// pid specifies the process whose namespace we will associate.
	inline Try<Nothing> setns(pid_t pid, const std::string& ns)
	{
	if (!os::exists(pid)) {
	return Error("Pid " + stringify(pid) + " does not exist");
	}

	std::string path = path::join("/proc", stringify(pid), "ns", ns);
	if (!os::exists(path)) {
	return Error("Namespace '" + ns + "' is not supported");
	}

	return ns::setns(path, ns);
	}


	// Get the inode number of the specified namespace for the specified
	// pid. The inode number identifies the namespace and can be used for
	// comparisons, i.e., two processes with the same inode for a given
	// namespace type are in the same namespace.
	inline Result<ino_t> getns(pid_t pid, const std::string& ns)
	{
	if (ns::namespaces().count(ns) < 1) {
	return Error("Namespace '" + ns + "' is not supported");
	}

	std::string path = path::join("/proc", stringify(pid), "ns", ns);
	struct stat s;
	if (::stat(path.c_str(), &s) < 0) {
	if (errno == ENOENT) {
	// Process is gone.
	return None();
	} else {
	return ErrnoError("Failed to stat " + ns + " namespace handle"
	" for pid " + stringify(pid));
	}
	}

	return s.st_ino;
	}



	/**
	* Performs an `os::clone` after entering a set of namespaces for the
	* specified `target` process.
	*
	* This function provides two steps of functionality:
	* (1) Enter a set of namespaces via two `fork` calls.
	* (1) Perform a `clone` within that set of namespaces.
	*
	* Step (1) of functionality is similar to the `nsenter` command line
	* utility. Step (2) allows us to perform a clone that itself might
	* create a nested set of namespaces, which enables us to have nested
	* containers.
	*
	* Double Fork:
	*
	* In order to enter a PID namespace we need to do a double fork
	* because doing a `setns` for a PID namespace only effects future
	* children.
	*
	* Moreover, attempting to `setns` before we do any forks and then
	* have the parent `setns` back to the original namespaces does not
	* work because entering a depriviledged user namespace will not let
	* us reassociate back with the original namespace, even if we keep
	* the file descriptor of the original namespace open.
	*
	* Because we have to double fork we need to send back the actual PID
	* of the final process that's executing the provided function `f`.
	* We use domain sockets for this because in the event we've entered a
	* PID namespace we need the kernel to translate the PID to the PID in
	* our PID namespace.
	*
	* @param target Target process whose namespaces we should enter.
	* @param nstypes Namespaces we should enter.
	* @param f Function to invoke after entering the namespaces and cloning.
	* @param flags Flags to pass to `clone`.
	*
	* @return `pid_t` of the child process.
	*/
	inline Try<pid_t> clone(
	pid_t target,
	int nstypes,
	const lambda::function<int()>& f,
	int flags)
	{
	// NOTE: the order in which we 'setns' is significant, so we use an
	// array here rather than something like a map.
	//
	// The user namespace needs to be entered first if we need to
	// increase the privilege and last if we want to decrease the
	// privilege. Said another way, entering the user namespace first
	// gives an unprivileged user the potential to enter the other
	// namespaces.
	const size_t NAMESPACES = 7;
	struct
	{
	int nstype;
	std::string name;
	} namespaces[NAMESPACES] = {
	{CLONE_NEWUSER, "user"},
	{CLONE_NEWCGROUP, "cgroup"},
	{CLONE_NEWIPC, "ipc"},
	{CLONE_NEWUTS, "uts"},
	{CLONE_NEWNET, "net"},
	{CLONE_NEWPID, "pid"},
	{CLONE_NEWNS, "mnt"}
	};

	// Since we assume below that the parent can deallocate the stack
	// after cloning the children, the caller must not pass CLONE_VM.
	// That would cause the both processes to share their address space
	// so deallocating the stack in the parent would affect the child.
	CHECK_EQ(0, flags & CLONE_VM);

	// Support for user namespaces in all filesystems is incomplete
	// until version 3.12 (see 'Availability' in man page of
	// 'user_namespaces'), so for now we don't support entering them.
	//
	// TODO(benh): Support user namespaces if the current system can
	// support it, e.g., check the kernel version number or try and do a
	// clone with CLONE_NEWUSER to see if it works. NOTE: before we can
	// fully support user namespaces, however, we must take care to
	// either enter the user namespace first or last. We'll want to
	// enter it first if we need to increase the privilege and last if
	// we want to decrease the privilege. Currently nsenter.c from
	// utils-linux does this via doing two passes to make sure we either
	// enter first or last. We'll need to do something similar here once
	// we support user namespaces as well.
	if (nstypes & CLONE_NEWUSER) {
	return Error("User namespaces are not supported");
	}

	// File descriptors keyed by the (parent) namespace we are entering.
	hashmap<int, int> fds = {};

	// Helper for closing a list of file descriptors.
	auto close = [](const std::list<int>& fds) {
	foreach (int fd, fds) {
	::close(fd); // Need to call the async-signal safe version.
	}
	};

	// NOTE: we do all of this ahead of time so we can be async signal
	// safe after calling fork below.
	for (size_t i = 0; i < NAMESPACES; i++) {
	// Only open the namespace file descriptor if it's been requested.
	if (namespaces[i].nstype & nstypes) {
	std::string path =
	path::join("/proc", stringify(target), "ns", namespaces[i].name);
	Try<int> fd = os::open(path, O_RDONLY);
	if (fd.isError()) {
	close(fds.values());
	return Error("Failed to open '" + path +
	"' for entering namespace: " + fd.error());
	}
	fds[namespaces[i].nstype] = fd.get();
	}
	}

	// We use a domain socket rather than pipes so that we can send back
	// the PID of the final child process. The parent socket is
	// `sockets[0]` and the child socket is `sockets[1]`. Note that both
	// sockets are both read/write but currently only the parent reads
	// and the child writes.
	int sockets[2] = {-1, -1};
	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) != 0) {
	close(fds.values());
	return ErrnoError("Failed to create Unix domain socket");
	}

	// Need to set SO_PASSCRED option in order to receive credentials
	// (which is how we get the pid of the clone'd process, see
	// below). Note that apparently we only need to do this for
	// receiving, not also for sending.
	const int value = 1;
	const ssize_t size = sizeof(value);
	if (setsockopt(sockets[0], SOL_SOCKET, SO_PASSCRED, &value, size) == -1) {
	Error error = ErrnoError("Failed to set socket option SO_PASSCRED");
	close(fds.values());
	::close(sockets[0]);
	::close(sockets[1]);
	return error;
	}

	// NOTE: to determine the pid of the final process executing the
	// specified lambda we use the SCM_CREDENTIALS mechanism of
	// 'sendmsg' and 'recvmsg'. On Linux there is also a way to do this
	// via 'getsockopt' and SO_PEERCRED which looks easier, but IIUC
	// requires you to do an explicit connect from the child process
	// back to the parent so that there is only one connection per
	// socket (unlike in our world where the socket can be used by
	// multiple forks/clones simultaneously because it's just a file
	// descriptor that gets copied after each fork/clone). Perhaps the
	// SO_PEERCRED is less lines of code but this approach was taken for
	// now.

	char base[1];

	iovec iov = {0};
	iov.iov_base = base;
	iov.iov_len = sizeof(base);

	// Need to allocate a char array large enough to hold "control"
	// data. However, since this buffer is in reality a 'struct cmsghdr'
	// we use a union to ensure that it is aligned as required for that
	// structure.
	union {
	struct cmsghdr cmessage;
	char control[CMSG_SPACE(sizeof(struct ucred))];
	};

	cmessage.cmsg_len = CMSG_LEN(sizeof(struct ucred));
	cmessage.cmsg_level = SOL_SOCKET;
	cmessage.cmsg_type = SCM_CREDENTIALS;

	msghdr message = {0};
	message.msg_name = nullptr;
	message.msg_namelen = 0;
	message.msg_iov = &iov;
	message.msg_iovlen = 1;
	message.msg_control = control;
	message.msg_controllen = sizeof(control); // CMSG_LEN(sizeof(struct ucred));

	// Finally, the stack we'll use in the call to os::clone below (we
	// allocate the stack here in order to keep the call to os::clone
	// async signal safe, since otherwise it would be doing the dynamic
	// allocation itself).
	Try<os::Stack> stack = os::Stack::create(os::Stack::DEFAULT_SIZE);
	if (stack.isError()) {
	return Error("Failed to allocate stack: " + stack.error());
	}

	pid_t child = fork();
	if (child < 0) {
	stack->deallocate();
	close(fds.values());
	::close(sockets[0]);
	::close(sockets[1]);
	return ErrnoError();
	} else if (child > 0) {
	// Parent.
	stack->deallocate();

	close(fds.values());
	::close(sockets[1]);

	ssize_t length = recvmsg(sockets[0], &message, 0);

	// TODO(benh): Note that whenever we 'kill(child, SIGKILL)' below
	// we don't guarantee cleanup! It's possible that the
	// greatgrandchild is still running. Require the greatgrandchild
	// to read from the socket after sending back it's pid to ensure
	// no orphans.

	if (length < 0) {
	// We failed to read, close the socket and kill the child
	// (which might die on it's own trying to write to the
	// socket).
	Error error = ErrnoError("Failed to receive");
	::close(sockets[0]);
	kill(child, SIGKILL);
	return error;
	} else if (length == 0) {
	// Socket closed, child must have died, but kill anyway.
	::close(sockets[0]);
	kill(child, SIGKILL);
	return Error("Failed to receive: Socket closed");
	}

	::close(sockets[0]);

	// Extract pid.
	if (CMSG_FIRSTHDR(&message) == nullptr \|\|
	CMSG_FIRSTHDR(&message)->cmsg_len != CMSG_LEN(sizeof(struct ucred)) \|\|
	CMSG_FIRSTHDR(&message)->cmsg_level != SOL_SOCKET \|\|
	CMSG_FIRSTHDR(&message)->cmsg_type != SCM_CREDENTIALS) {
	kill(child, SIGKILL);
	return Error("Bad control data received");
	}

	struct ucred cred;
	std::memcpy(
	&cred, CMSG_DATA(CMSG_FIRSTHDR(&message)), sizeof(struct ucred));

	const pid_t pid = cred.pid;

	// Need to `waitpid` on child process to avoid a zombie. Note that
	// it's expected that the child will terminate quickly hence
	// blocking here.
	int status;
	while (true) {
	if (waitpid(child, &status, 0) == -1) {
	if (errno == EINTR) {
	continue;
	} else {
	return ErrnoError("Failed to `waitpid` on child");
	}
	} else if (WIFSTOPPED(status)) {
	continue;
	} else {
	break;
	}
	}

	CHECK(WIFEXITED(status) \|\| WIFSIGNALED(status))
	<< "Unexpected wait status " << status;

	if (!WSUCCEEDED(status)) {
	return Error("Failed to clone: " + WSTRINGIFY(status));
	}

	return pid;
	} else {
	// Child.
	::close(sockets[0]);

	// Loop through and 'setns' into all of the parent namespaces that
	// have been requested.
	for (size_t i = 0; i < NAMESPACES; i++) {
	Option<int> fd = fds.get(namespaces[i].nstype);
	if (fd.isSome()) {
	ASSERT(namespaces[i].nstype & nstypes);
	if (::setns(fd.get(), namespaces[i].nstype) < 0) {
	close(fds.values());
	::close(sockets[1]);
	_exit(EXIT_FAILURE);
	}
	}
	}

	close(fds.values());

	auto grandchildMain = [=]() -> int {
	// Grandchild (second child, now completely entered in the
	// namespaces of the target).
	//
	// Now clone with the specified flags, close the unused socket,
	// and execute the specified function.
	pid_t pid = os::signal_safe::clone(
	stack.get(),
	flags,
	[=]() {
	struct ucred cred;
	cred.pid = ::getpid();
	cred.uid = ::getuid();
	cred.gid = ::getgid();

	// Now send back the pid and have it be translated appropriately
	// by the kernel to the enclosing pid namespace.
	//
	// NOTE: sending back the pid is best effort because we're going
	// to exit no matter what.
	std::memcpy(
	CMSG_DATA(CMSG_FIRSTHDR(&message)),
	&cred,
	sizeof(struct ucred));

	if (sendmsg(sockets[1], &message, 0) == -1) {
	// Failed to send the pid back to the parent!
	_exit(EXIT_FAILURE);
	}

	::close(sockets[1]);

	return f();
	});

	::close(sockets[1]);

	// TODO(benh): Kill ourselves with an exit status that we can
	// decode above to determine why `clone` failed.
	_exit(pid < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
	UNREACHABLE();
	};

	os::Stack grandchildStack(os::Stack::DEFAULT_SIZE);

	if (!grandchildStack.allocate()) {
	::close(sockets[1]);
	_exit(EXIT_FAILURE);
	}

	// Fork again to make sure we're actually in those namespaces
	// (required for the pid namespace at least).
	//
	// NOTE: We use clone instead of fork here because of a glibc bug.
	// glibc version < 2.25 has an assertion in 'fork()' which checks
	// if the child process's pid is not the same as the parent. This
	// invariant is no longer true with pid namespaces being
	// introduced. See more details in MESOS-7858.
	//
	// NOTE: glibc 'fork()' also specifies 'CLONE_CHILD_SETTID' and
	// 'CLONE_CHILD_CLEARTID' for the clone flags. However, since we
	// are not using any pthread library in the grandchild, we don't
	// need those flags.
	//
	// TODO(benh): Don't do a fork if we're not actually entering the
	// PID namespace since the extra fork is unnecessary.
	pid_t grandchild =
	os::signal_safe::clone(grandchildStack, SIGCHLD, grandchildMain);

	grandchildStack.deallocate();

	if (grandchild < 0) {
	// TODO(benh): Exit with `errno` in order to capture `fork` error?
	::close(sockets[1]);
	_exit(EXIT_FAILURE);
	} else if (grandchild > 0) {
	// Still the (first) child.
	::close(sockets[1]);

	// Need to reap the grandchild and then just exit since we're no
	// longer necessary. Technically when the grandchild exits it'll
	// be reaped but by doing a `waitpid` we can better propagate
	// back any errors that might have occurred with the grandchild.
	int status;
	while (true) {
	if (waitpid(grandchild, &status, 0) == -1) {
	if (errno == EINTR) {
	continue;
	} else {
	_exit(1);
	}
	} else if (WIFSTOPPED(status)) {
	continue;
	} else {
	break;
	}
	}

	ASSERT(WIFEXITED(status) \|\| WIFSIGNALED(status));

	if (WIFEXITED(status)) {
	_exit(WEXITSTATUS(status));
	}

	ASSERT(WIFSIGNALED(status));
	raise(WTERMSIG(status));
	}
	}
	UNREACHABLE();
	}


	// Returns the namespace flags in the string form of bitwise-ORing the
	// flags, e.g., CLONE_NEWNS \| CLONE_NEWNET.
	inline std::string stringify(int flags)
	{
	hashmap<unsigned int, std::string> names = {
	{CLONE_NEWNS, "CLONE_NEWNS"},
	{CLONE_NEWUTS, "CLONE_NEWUTS"},
	{CLONE_NEWIPC, "CLONE_NEWIPC"},
	{CLONE_NEWPID, "CLONE_NEWPID"},
	{CLONE_NEWNET, "CLONE_NEWNET"},
	{CLONE_NEWUSER, "CLONE_NEWUSER"},
	{CLONE_NEWCGROUP, "CLONE_NEWCGROUP"}
	};

	std::vector<std::string> namespaces;
	foreachpair (unsigned int flag, const std::string& name, names) {
	if (flags & flag) {
	namespaces.push_back(name);
	}
	}

	return strings::join(" \| ", namespaces);
	}

	} // namespace ns {

	#endif // __LINUX_NS_HPP__