blob: 2d058a16d98266af72b1594112c3d31a1bdc7a68 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <vector>
#include <linux/limits.h>
#include <process/owned.hpp>
#include <stout/adaptor.hpp>
#include <stout/elf.hpp>
#include <stout/error.hpp>
#include <stout/foreach.hpp>
#include <stout/fs.hpp>
#include <stout/nothing.hpp>
#include <stout/option.hpp>
#include <stout/path.hpp>
#include <stout/result.hpp>
#include <stout/strings.hpp>
#include <stout/os/mkdir.hpp>
#include <stout/os/realpath.hpp>
#include <stout/os/rmdir.hpp>
#include <stout/os/shell.hpp>
#include "linux/fs.hpp"
#include "linux/ldcache.hpp"
#include "slave/containerizer/mesos/isolators/gpu/nvml.hpp"
#include "slave/containerizer/mesos/isolators/gpu/volume.hpp"
using docker::spec::v1::ImageManifest;
using process::Owned;
using std::string;
using std::vector;
namespace mesos {
namespace internal {
namespace slave {
// Much of the logic in this file is borrowed from nvidia-docker 1.0:
// Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
// Path constants used by `NvidiaVolume`:
//   - `HOST_VOLUME_PATH_PREFIX` is concatenated with the Nvidia driver
//     version in `NvidiaVolume::create()` to form the volume's host path.
//   - `CONTAINER_VOLUME_PATH` is passed as the container path when
//     `NvidiaVolume::create()` constructs the volume object.
//   - `CONTAINER_CUDA_RUNTIME_PATH` is joined with "compat" in
//     `NvidiaVolume::ENV()` to build a library search path.
// NOTE(review): the initializer values are not visible in this copy of
// the file; confirm them against upstream.
static constexpr char HOST_VOLUME_PATH_PREFIX[] =
static constexpr char CONTAINER_VOLUME_PATH[] =
static constexpr char CONTAINER_CUDA_RUNTIME_PATH[] =
// The contents of the `BINARIES` and `LIBRARIES` arrays below are from
// libnvidia-container to support nvidia-docker 2.0:
// Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
// Names of the Nvidia executables that `NvidiaVolume::create()` looks up
// with `which` and copies into the `bin` directory of the volume. Entries
// that are commented out are not part of the array and are never copied.
static constexpr const char* BINARIES[] = {
// ----- Utility -----
"nvidia-smi", // System management interface.
"nvidia-debugdump", // GPU coredump utility.
"nvidia-persistenced", // Persistence mode utility.
// "nvidia-modprobe", // Kernel module loader.
// "nvidia-settings", // X server settings.
// "nvidia-xconfig", // X xorg.conf editor.
// ----- Compute -----
"nvidia-cuda-mps-control", // Multi process service CLI.
"nvidia-cuda-mps-server", // Multi process service server.
// Library names that `NvidiaVolume::create()` iterates over together with
// the entries parsed from the ldcache in order to populate the `lib` and
// `lib64` directories of the volume. Each entry's trailing comment
// describes the role of the library. Entries that are commented out are
// not part of the array.
// NOTE(review): every name literal appears as an empty string in this copy
// of the file; the real library names must be taken from upstream.
static constexpr const char* LIBRARIES[] = {
// -------- Display --------
// "", // GTK2.
// "", // GTK3.
// "", // Wrapped software rendering module for X server.
// "", // Driver module for X server.
// "", // GLX extension module for X server.
// -------- Utility --------
"", // Management library.
"", // GPU configuration.
// -------- Compute --------
"", // CUDA driver library.
"", // NVIDIA OpenCL ICD.
"", // PTX-SASS JIT compiler.
"", // fatbin loader.
"", // NVVM-PTX compiler for OpenCL.
// --------- Video ---------
"", // Video encoder.
"", // Video decoder.
// ------- Graphics --------
// "", // EGL wayland extensions.
"", // EGL core.
"", // OpenGL core.
"", // Thread local storage.
"", // OpenGL system interaction.
"", // Framebuffer capture.
"", // OpenGL framebuffer capture.
// --- Graphics (GLVND) ----
// "", // GLX ICD loader.
// "", // OpenGL ICD loader.
// "", // OpenGL dispatch.
"", // OpenGL/GLX ICD.
"", // EGL ICD.
"", // OpenGL ES v2 ICD.
"", // OpenGL ES v1 ICD.
// --- Graphics (compat) ---
"", // OpenGL/GLX legacy _or_ compatibility wrapper.
"", // EGL legacy _or_ ICD loader.
"", // OpenGL ES v1 legacy _or_ ICD loader.
"", // OpenGL ES v2 legacy _or_ ICD loader.
// Decides whether a library should be skipped when populating the volume.
//
// `library` is the name being processed (an element of `LIBRARIES` at the
// call site in `NvidiaVolume::create()`), and `elf` is the ELF file that
// was loaded for it.
//
// Returns `true` if the library is blacklisted and `false` otherwise.
// Returns an `Error` if the external dependencies or the ABI version
// cannot be read from the ELF file.
//
// NOTE(review): the library-name literals compared against below appear as
// empty strings, and the initializer of `dependencies` is missing, in this
// copy of the file; confirm both against upstream.
static Try<bool> isBlacklisted(
const string& library,
const Owned<elf::File>& elf)
// Blacklist EGL/OpenGL libraries issued by other vendors.
if (library == "" ||
library == "" ||
library == "" ||
library == "") {
Try<vector<string>> dependencies =
if (dependencies.isError()) {
return Error("Failed reading external dependencies in ELF file"
" '" + library + "': " + dependencies.error());
// A dependency on one of the Nvidia GL/EGL libraries means the library
// is kept (not blacklisted); otherwise it is blacklisted.
foreach (const string& dependency, dependencies.get()) {
if (dependency == "" ||
strings::startsWith(dependency, "libnvidia-gl") ||
strings::startsWith(dependency, "libnvidia-egl")) {
return false;
return true;
// Blacklist TLS libraries using the old ABI (i.e. those != 2.3.99).
if (library == "") {
Result<Version> abi = elf->get_abi_version();
// Both an error and a missing ABI version are reported as an `Error`.
if (!abi.isSome()) {
return Error(
"Failed to read ELF ABI version:"
" " + (abi.isError() ? abi.error() : "No ABI version found"));
if (abi.get() != Version(2, 3, 99)) {
return true;
return false;
// Accessor returning `hostPath` by const reference.
const string& NvidiaVolume::HOST_PATH() const
return hostPath;
// Accessor returning `containerPath` by const reference.
const string& NvidiaVolume::CONTAINER_PATH() const
return containerPath;
// Builds the `Environment` to inject for an image described by `manifest`.
//
// The `PATH` and `LD_LIBRARY_PATH` entries of the image's environment are
// split on ':' into lists. The lists are then checked for the volume's
// `bin` directory and for the library directories listed in `libraryPaths`.
// The returned `Environment` contains two variables whose values are the
// ':'-joined `paths` and `ldPaths` lists.
//
// NOTE(review): the statements that run when a path is absent from a list,
// and the calls that name the two variables, are not visible in this copy
// of the file. Presumably the missing paths are appended and the variables
// are named `PATH` and `LD_LIBRARY_PATH`; confirm against upstream.
Environment NvidiaVolume::ENV(const ImageManifest& manifest) const
vector<string> paths;
vector<string> ldPaths;
foreach (const string& env, manifest.config().env()) {
// Split into at most two tokens so that any '=' inside the value is kept.
const vector<string> tokens = strings::split(env, "=", 2);
if (tokens.size() != 2) {
if (tokens[0] == "PATH") {
paths = strings::tokenize(tokens[1], ":");
} else if (tokens[0] == "LD_LIBRARY_PATH") {
ldPaths = strings::tokenize(tokens[1], ":");
// Inject the `PATH` and `LD_LIBRARY_PATH` environment variables.
const string binaryPath = path::join(containerPath, "bin");
if (std::find(paths.begin(), paths.end(), binaryPath) == paths.end()) {
// NOTE: CUDA images may contain compatibility libraries, so we inject
// their path *BEFORE* paths to the libraries from the host. See:
// // NOLINT
// // NOLINT
const string libraryPaths[] = {
path::join(CONTAINER_CUDA_RUNTIME_PATH, "compat"),
path::join(containerPath, "lib"),
path::join(containerPath, "lib64")};
foreach (const string& libraryPath, libraryPaths) {
if (std::find(ldPaths.begin(), ldPaths.end(), libraryPath) ==
ldPaths.end()) {
Environment environment;
Environment::Variable* pathVar = environment.add_variables();
pathVar->set_value(strings::join(":", paths));
Environment::Variable* ldPathVar = environment.add_variables();
ldPathVar->set_value(strings::join(":", ldPaths));
return environment;
// Creates (or reuses) the Nvidia volume on the host and returns a
// `NvidiaVolume` describing it.
//
// Steps, in order:
//   1. Fail unless the effective uid is 0.
//   2. Initialize NVML and read the driver version; the host path is
//      `HOST_VOLUME_PATH_PREFIX` followed by that version.
//   3. Create the host directory if it does not exist and, if the
//      filesystem found for it in the mount table is marked `noexec`,
//      mount a `tmpfs` over it.
//   4. Create the `bin`, `lib` and `lib64` subdirectories.
//   5. Copy each of `BINARIES` that `which` can find into `bin`.
//   6. For each of `LIBRARIES` and each matching ldcache entry, copy the
//      fully resolved library into `lib` (32-bit ELF) or `lib64` (64-bit
//      ELF), skipping blacklisted libraries, and create symlinks to it.
//
// Returns an `Error` describing the failing step if any step fails.
// Paths that already exist in the volume are left untouched, as each
// creation/copy is guarded by an `os::exists` check.
//
// NOTE(review): several expressions in this copy of the file are
// incomplete (e.g. arguments of `strings::startsWith`, `path::join` and
// `strings::replace` calls, and one library-name literal); confirm the
// details against upstream.
Try<NvidiaVolume> NvidiaVolume::create()
if (geteuid() != 0) {
return Error("NvidiaVolume::create() requires root privileges");
// Append the Nvidia driver version to the name of the volume.
Try<Nothing> initialized = nvml::initialize();
if (initialized.isError()) {
return Error("Failed to nvml::initialize: " + initialized.error());
Try<string> version = nvml::systemGetDriverVersion();
if (version.isError()) {
return Error("Failed to nvml::systemGetDriverVersion: " + version.error());
// Create the volume on the host.
string hostPath = HOST_VOLUME_PATH_PREFIX + version.get();
if (!os::exists(hostPath)) {
Try<Nothing> mkdir = os::mkdir(hostPath);
if (mkdir.isError()) {
return Error("Failed to os::mkdir '" + hostPath + "': " + mkdir.error());
// If the filesystem where we are creating this volume has the
// `noexec` bit set, we will not be able to execute any of the
// nvidia binaries we place in the volume (e.g. `nvidia-smi`). To
// fix this, we mount a `tmpfs` over the volume `hostPath` without
// the `noexec` bit set. See MESOS-5923 for more information.
Try<fs::MountInfoTable> table = fs::MountInfoTable::read();
if (table.isError()) {
return Error("Failed to get mount table: " + table.error());
Result<string> realpath = os::realpath(hostPath);
if (!realpath.isSome()) {
return Error("Failed to os::realpath '" + hostPath + "':"
" " + (realpath.isError()
? realpath.error()
: "No such file or directory"));
// Do a reverse search through the list of mounted filesystems to
// find the filesystem that is mounted with the longest overlapping
// path to our `hostPath` (which may include the `hostPath` itself).
// Only mount a new `tmpfs` over the `hostPath` if the filesystem we
// find is marked as `noexec`.
foreach (const fs::MountInfoTable::Entry& entry,
adaptor::reverse(table->entries)) {
if (strings::startsWith(realpath.get(), {
if (strings::contains(entry.vfsOptions, "noexec")) {
Try<Nothing> mnt = fs::mount(
"tmpfs", hostPath, "tmpfs", MS_NOSUID | MS_NODEV, "mode=755");
if (mnt.isError()) {
return Error("Failed to mount '" + hostPath + "': " + mnt.error());
// Create some directories in the volume if they don't yet exist.
string directories[] = {"bin", "lib", "lib64" };
foreach (const string& directory, directories) {
string path = path::join(hostPath, directory);
if (!os::exists(path)) {
Try<Nothing> mkdir = os::mkdir(path);
if (mkdir.isError()) {
return Error("Failed to os::mkdir '" + path + "': " + mkdir.error());
// Fill in the `/bin` directory with BINARIES.
foreach (const string& binary, BINARIES) {
string path = path::join(hostPath, "bin", binary);
if (!os::exists(path)) {
// NOTE(review): the binary name and the resolved paths are concatenated
// unquoted into shell command strings here and in the `cp` commands
// below; paths containing whitespace or shell metacharacters would not
// be handled correctly.
string command = "which " + binary;
Try<string> which = os::shell(command);
if (which.isSome()) {
which = strings::trim(which.get());
Result<string> realpath = os::realpath(which.get());
if (!realpath.isSome()) {
return Error("Failed to os::realpath '" + which.get() + "':"
" " + (realpath.isError()
? realpath.error()
: "No such file or directory"));
command = "cp " + realpath.get() + " " + path;
Try<string> cp = os::shell(command);
if (cp.isError()) {
return Error("Failed to os::shell '" + command + "': " + cp.error());
// Fill in the `/lib*` directories with LIBRARIES. Process all
// versions of a library that match `lib*.so*` in the ldcache.
Try<vector<ldcache::Entry>> cache = ldcache::parse();
if (cache.isError()) {
return Error("Failed to ldcache::parse: " + cache.error());
foreach (const string& library, LIBRARIES) {
foreach (const ldcache::Entry& entry, cache.get()) {
if (strings::startsWith(, library)) {
// Copy the fully resolved `entry.path` (i.e. the path of the
// library after following all symlinks) into either the
// `/lib` folder if it is 32-bit or `/lib64` if it is 64 bit.
Result<string> realpath = os::realpath(entry.path);
if (!realpath.isSome()) {
return Error("Failed to os::realpath '" + entry.path + "':"
" " + (realpath.isError()
? realpath.error()
: "No such file or directory"));
Try<elf::File*> load = elf::File::load(realpath.get());
if (load.isError()) {
return Error("Failed to elf::File::load '" + realpath.get() + "':"
" " + load.error());
// Hand the raw pointer returned by `load` to an `Owned` wrapper.
Owned<elf::File> file(load.get());
// If the library is blacklisted, skip it.
Try<bool> blacklisted = isBlacklisted(library, file);
if (blacklisted.isError()) {
return Error("Failed to check blacklist: " + blacklisted.error());
if (blacklisted.get()) {
// Select the destination directory from the ELF class of the library.
Option<string> libraryDirectory = None();
Try<elf::Class> c = file->get_class();
if (c.isError()) {
return Error("Failed to get ELF class for '" + + "':"
" " + c.error());
if (c.get() == elf::CLASS32) {
libraryDirectory = "lib";
} else if (c.get() == elf::CLASS64) {
libraryDirectory = "lib64";
} else {
return Error("Unknown ELF class: " + stringify(c.get()));
string libraryPath = path::join(
if (!os::exists(libraryPath)) {
string command = "cp " + realpath.get() + " " + libraryPath;
Try<string> cp = os::shell(command);
if (cp.isError()) {
return Error("Failed to os::shell '" + command + "':"
" " + cp.error());
// Set up symlinks between `` and the fully resolved
// path we just copied. This preserves the list of libraries
// we have on our host system in the mounted volume. If
// `entry.path` and the fully resolved path are the same, we
// don't make a symlink.
string symlinkPath =
path::join(hostPath, libraryDirectory.get(),;
if (!os::exists(symlinkPath)) {
// The symlink target is the basename of the resolved path, i.e. it is
// relative to the directory that holds the symlink.
Try<Nothing> symlink =
::fs::symlink(Path(realpath.get()).basename(), symlinkPath);
if (symlink.isError()) {
return Error("Failed to fs::symlink"
" '" + symlinkPath + "'"
" -> '" + Path(realpath.get()).basename() + "':"
" " + symlink.error());
// GLVND requires an extra symlink for indirect GLX support.
// This is a temporary workaround and won't be needed once we
// have an indirect GLX vendor neutral library.
// TODO(klueska): Including this symlink was borrowed
// from the `nvidia-docker-plugin` code. Remove this
// symlink when `nvidia-docker-plugin` does the same.
if (library == "") {
string libraryName = strings::replace(, "GLX_nvidia", "GLX_indirect");
string symlinkPath =
path::join(hostPath, libraryDirectory.get(), libraryName);
if (!os::exists(symlinkPath)) {
Try<Nothing> symlink =
::fs::symlink(Path(realpath.get()).basename(), symlinkPath);
if (symlink.isError()) {
return Error("Failed to fs::symlink"
" '" + symlinkPath + "'"
" -> '" + Path(realpath.get()).basename() + "':"
" " + symlink.error());
// Return the actual volume object with the fully
// resolved host path and the container path set.
return NvidiaVolume(hostPath, CONTAINER_VOLUME_PATH);
// We use the `NVIDIA_VISIBLE_DEVICES` environment variable from
// nvidia-docker to decide if we should inject the volume or not. See:
// // NOLINT
// To support legacy nvidia-docker (version 1.0 and before), we also check if
// the `com.nvidia.volumes.needed` label exists. See:
//
// Returns whether the volume should be injected for the image described by
// `manifest`. As the visible statements read: an `NVIDIA_VISIBLE_DEVICES`
// entry in the image's environment decides the result on its own (`false`
// if its value is empty or "void", `true` otherwise). Without such an
// entry, the result is `true` only if the image config carries the
// `com.nvidia.volumes.needed` label.
// NOTE(review): the reference links after "See:" and some control-flow
// lines are missing from this copy of the file; confirm against upstream.
bool NvidiaVolume::shouldInject(const ImageManifest& manifest) const
foreach (const string& env, manifest.config().env()) {
// Split into at most two tokens so that any '=' inside the value is kept.
const vector<string> tokens = strings::split(env, "=", 2);
if (tokens.size() != 2 || tokens[0] != "NVIDIA_VISIBLE_DEVICES") {
if (tokens[1] == "" || tokens[1] == "void") {
return false;
return true;
if (manifest.config().labels().count("com.nvidia.volumes.needed")) {
// The label value is used as the name of the volume that
// nvidia-docker-plugin registers with Docker. We therefore
// don't need to use it as we simply pass the host path
// of the volume directly.
return true;
return false;
} // namespace slave {
} // namespace internal {
} // namespace mesos {