| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| #ifndef __LINUX_NS_HPP__ |
| #define __LINUX_NS_HPP__ |
| |
| // This file contains Linux-only OS utilities. |
| #ifndef __linux__ |
| #error "linux/ns.hpp is only available on Linux systems." |
| #endif |
| |
| #include <sched.h> |
| |
| #include <sys/syscall.h> |
| |
| #include <queue> |
| #include <set> |
| #include <string> |
| #include <thread> |
| |
| #include <process/future.hpp> |
| |
| #include <stout/lambda.hpp> |
| #include <stout/nothing.hpp> |
| #include <stout/option.hpp> |
| #include <stout/result.hpp> |
| #include <stout/try.hpp> |
| |
| #ifndef CLONE_NEWNS |
| #define CLONE_NEWNS 0x00020000 |
| #endif |
| |
| #ifndef CLONE_NEWUTS |
| #define CLONE_NEWUTS 0x04000000 |
| #endif |
| |
| #ifndef CLONE_NEWIPC |
| #define CLONE_NEWIPC 0x08000000 |
| #endif |
| |
| #ifndef CLONE_NEWPID |
| #define CLONE_NEWPID 0x20000000 |
| #endif |
| |
| #ifndef CLONE_NEWNET |
| #define CLONE_NEWNET 0x40000000 |
| #endif |
| |
| #ifndef CLONE_NEWUSER |
| #define CLONE_NEWUSER 0x10000000 |
| #endif |
| |
| #ifndef CLONE_NEWCGROUP |
| #define CLONE_NEWCGROUP 0x02000000 |
| #endif |
| |
| // Define a 'setns' for compilation environments that don't already |
| // have one. |
| inline int setns(int fd, int nstype) |
| { |
| #ifdef SYS_setns |
| return ::syscall(SYS_setns, fd, nstype); |
| #elif defined(__x86_64__) |
| // A workaround for those hosts that have an old glibc (older than |
| // 2.14) but have a new kernel. The magic number '308' here is the |
| // syscall number for 'setns' on x86_64 architecture. |
| return ::syscall(308, fd, nstype); |
| #else |
| #error "setns is not available" |
| #endif |
| } |
| |
| namespace ns { |
| |
| // Returns the nstype (e.g., CLONE_NEWNET, CLONE_NEWNS, etc.) for the |
| // given namespace which can be used when calling ::setns. |
| Try<int> nstype(const std::string& ns); |
| |
| |
| // Given a single CLONE_NEW* constant, return the corresponding namespace |
| // name. This is the inverse of ns::nstype(). |
| Try<std::string> nsname(int nsType); |
| |
| |
| // Returns all the configured kernel namespaces. |
| std::set<int> nstypes(); |
| |
| |
| // Returns true if all the given CLONE_NEW* constants are supported |
| // in the running kernel. If CLONE_NEWUSER is specified, the kernel |
| // version must be at least 3.12.0 since prior to that version, major |
| // kernel subsystems (e.g. XFS) did not implement user namespace |
| // support. See also user_namespaces(7). |
| Try<bool> supported(int nsTypes); |
| |
| |
| // Re-associate the calling process with the specified namespace. The |
| // path refers to one of the corresponding namespace entries in the |
| // /proc/[pid]/ns/ directory (or bind mounted elsewhere). We do not |
| // allow a process with multiple threads to call this function because |
| // it will lead to some weird situations where different threads of a |
| // process are in different namespaces. |
| Try<Nothing> setns( |
| const std::string& path, |
| const std::string& ns, |
| bool checkMultithreaded = true); |
| |
| |
| // Re-associate the calling process with the specified namespace. The |
| // pid specifies the process whose namespace we will associate. |
| Try<Nothing> setns( |
| pid_t pid, |
| const std::string& ns, |
| bool checkMultithreaded = true); |
| |
| |
| // Get the inode number of the specified namespace for the specified |
| // pid. The inode number identifies the namespace and can be used for |
| // comparisons, i.e., two processes with the same inode for a given |
| // namespace type are in the same namespace. |
| Result<ino_t> getns(pid_t pid, const std::string& ns); |
| |
| |
| /** |
| * Performs an `os::clone` after entering a set of namespaces for the |
| * specified `target` process. |
| * |
| * This function provides two steps of functionality: |
| * (1) Enter a set of namespaces via two `fork` calls. |
| * (1) Perform a `clone` within that set of namespaces. |
| * |
| * Step (1) of functionality is similar to the `nsenter` command line |
| * utility. Step (2) allows us to perform a clone that itself might |
| * create a nested set of namespaces, which enables us to have nested |
| * containers. |
| * |
| * Double Fork: |
| * |
| * In order to enter a PID namespace we need to do a double fork |
| * because doing a `setns` for a PID namespace only effects future |
| * children. |
| * |
| * Moreover, attempting to `setns` before we do any forks and then |
| * have the parent `setns` back to the original namespaces does not |
| * work because entering a depriviledged user namespace will not let |
| * us reassociate back with the original namespace, even if we keep |
| * the file descriptor of the original namespace open. |
| * |
| * Because we have to double fork we need to send back the actual PID |
| * of the final process that's executing the provided function `f`. |
| * We use domain sockets for this because in the event we've entered a |
| * PID namespace we need the kernel to translate the PID to the PID in |
| * our PID namespace. |
| * |
| * @param target Target process whose namespaces we should enter. |
| * @param nstypes Namespaces we should enter. |
| * @param f Function to invoke after entering the namespaces and cloning. |
| * @param flags Flags to pass to `clone`. |
| * |
| * @return `pid_t` of the child process. |
| */ |
| Try<pid_t> clone( |
| pid_t target, |
| int nstypes, |
| const lambda::function<int()>& f, |
| int flags); |
| |
| |
| // Returns the namespace flags in the string form of bitwise-ORing the |
| // flags, e.g., CLONE_NEWNS | CLONE_NEWNET. |
| std::string stringify(int flags); |
| |
| |
| // The NamespaceRunner runs any function in a specified namespace. |
| // To do that it manages a separate thread which would be re-associated |
| // with that namespace. |
| class NamespaceRunner |
| { |
| public: |
| NamespaceRunner() |
| { |
| // Start the looper thread. |
| thread.reset(new std::thread(&NamespaceRunner::loop, this)); |
| } |
| |
| ~NamespaceRunner() |
| { |
| // Shutdown the queue. |
| queue.shutdown(); |
| // Wait for the thread to complete. |
| thread->join(); |
| thread.reset(); |
| } |
| |
| // Run any function in a specified namespace. |
| template <typename T> |
| process::Future<T> run( |
| const std::string& path, |
| const std::string& ns, |
| const lambda::function<Try<T>()>& func) |
| { |
| std::shared_ptr<process::Promise<T>> promise( |
| new process::Promise<T>); |
| process::Future<T> future = promise->future(); |
| |
| // Put a function to the queue, the function will be called |
| // in the thread. The thread will be re-associated with the |
| // specified namespace. |
| queue.put([=]{ |
| Try<Nothing> setns = ::ns::setns(path, ns, false); |
| if (setns.isError()) { |
| promise->fail(setns.error()); |
| } else { |
| promise->set(func()); |
| } |
| }); |
| |
| return future; |
| } |
| |
| private: |
| typedef lambda::function<void()> Func; |
| |
| // The thread loop. |
| void loop() |
| { |
| for (;;) { |
| // Get a function from the queue. |
| Option<Func> func = queue.get(); |
| |
| // Stop the thread if the queue is shutdowned. |
| if (func.isNone()) { |
| break; |
| } |
| |
| // Call the function, it re-associates the thread with the |
| // specified namespace and calls the initial user function. |
| func.get()(); |
| } |
| } |
| |
| // It's not safe to use process::Queue when not all of its callers are |
| // managed by libprocess. Calling Future::await() in looper thread |
| // might cause the looper thread to be donated to a libprocess Process. |
| // If that Process is very busy (e.g., master or agent Process), it's |
| // possible that the looper thread will never re-gain control. |
| // |
| // ProcessingQueue uses mutex and condition variable to solve this |
| // problem. ProcessingQueue::get() can block the thread. The main |
| // use cases for the class are thread workers and thread pools. |
| template <typename T> |
| class ProcessingQueue |
| { |
| public: |
| ProcessingQueue() : finished(false) {} |
| |
| ~ProcessingQueue() = default; |
| |
| // Add an element to the queue and notify one client. |
| void put(T&& t) |
| { |
| synchronized (mutex) { |
| queue.push(std::forward<T>(t)); |
| cond.notify_one(); |
| } |
| } |
| |
| // NOTE: This function blocks the thread. It returns the oldest |
| // element from the queue and returns None() if the queue is |
| // shutdowned. |
| Option<T> get() |
| { |
| synchronized (mutex) { |
| // Wait for either a new queue element or queue shutdown. |
| while (queue.empty() && !finished) { |
| synchronized_wait(&cond, &mutex); |
| } |
| |
| if (finished) { |
| // The queue is shutdowned. |
| return None(); |
| } |
| |
| // Return the oldest element from the queue. |
| T t = std::move(queue.front()); |
| queue.pop(); |
| return Some(std::move(t)); |
| } |
| } |
| |
| // Shutdown the queue and notify all clients. |
| void shutdown() { |
| synchronized (mutex) { |
| finished = true; |
| std::queue<T>().swap(queue); |
| cond.notify_all(); |
| } |
| } |
| |
| private: |
| std::mutex mutex; |
| std::condition_variable cond; |
| std::queue<T> queue; |
| bool finished; |
| }; |
| |
| ProcessingQueue<Func> queue; |
| std::unique_ptr<std::thread> thread; |
| }; |
| |
| } // namespace ns { |
| |
| #endif // __LINUX_NS_HPP__ |