// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdint.h>
#include <sys/mount.h>
// This header include must be enclosed in an `extern "C"` block to
// work around a bug in glibc <= 2.12 (see MESOS-7378).
//
// TODO(neilc): Remove this when we no longer support glibc <= 2.12.
extern "C" {
#include <sys/sysmacros.h>
}
#include <algorithm>
#include <list>
#include <map>
#include <set>
#include <string>
#include <vector>
#include <process/collect.hpp>
#include <process/defer.hpp>
#include <process/future.hpp>
#include <process/id.hpp>
#include <stout/error.hpp>
#include <stout/foreach.hpp>
#include <stout/hashmap.hpp>
#include <stout/option.hpp>
#include <stout/os.hpp>
#include <stout/try.hpp>
#include "common/protobuf_utils.hpp"
#include "linux/cgroups.hpp"
#include "linux/fs.hpp"
#include "slave/flags.hpp"
#include "slave/containerizer/containerizer.hpp"
#include "slave/containerizer/mesos/isolator.hpp"
#include "slave/containerizer/mesos/isolators/cgroups/constants.hpp"
#include "slave/containerizer/mesos/isolators/gpu/allocator.hpp"
#include "slave/containerizer/mesos/isolators/gpu/isolator.hpp"
#include "slave/containerizer/mesos/isolators/gpu/nvml.hpp"
#include "slave/containerizer/mesos/paths.hpp"
using cgroups::devices::Entry;
using docker::spec::v1::ImageManifest;
using mesos::slave::ContainerClass;
using mesos::slave::ContainerConfig;
using mesos::slave::ContainerLaunchInfo;
using mesos::slave::ContainerLimitation;
using mesos::slave::ContainerMountInfo;
using mesos::slave::ContainerState;
using mesos::slave::Isolator;
using process::defer;
using process::Failure;
using process::Future;
using process::PID;
using std::list;
using std::map;
using std::set;
using std::string;
using std::vector;
namespace mesos {
namespace internal {
namespace slave {
NvidiaGpuIsolatorProcess::NvidiaGpuIsolatorProcess(
const Flags& _flags,
const string& _hierarchy,
const NvidiaGpuAllocator& _allocator,
const NvidiaVolume& _volume,
const map<Path, cgroups::devices::Entry>& _controlDeviceEntries)
: ProcessBase(process::ID::generate("mesos-nvidia-gpu-isolator")),
flags(_flags),
hierarchy(_hierarchy),
allocator(_allocator),
volume(_volume),
controlDeviceEntries(_controlDeviceEntries) {}
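// Validates that the required companion isolators are enabled, builds
// cgroup device entries for the Nvidia control device nodes, and
// constructs the isolator.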
Try<Isolator*> NvidiaGpuIsolatorProcess::create(
const Flags& flags,
const NvidiaComponents& components)
{
// Make sure both the 'cgroups/devices' (or 'cgroups/all')
// and the 'filesystem/linux' isolators are present.
vector<string> tokens = strings::tokenize(flags.isolation, ",");
auto gpuIsolator =
std::find(tokens.begin(), tokens.end(), "gpu/nvidia");
auto devicesIsolator =
std::find(tokens.begin(), tokens.end(), "cgroups/devices");
auto cgroupsAllIsolator =
std::find(tokens.begin(), tokens.end(), "cgroups/all");
auto filesystemIsolator =
std::find(tokens.begin(), tokens.end(), "filesystem/linux");
CHECK(gpuIsolator != tokens.end());
if (cgroupsAllIsolator != tokens.end()) {
    // We need to check whether the `devices` cgroups subsystem is enabled
    // because, when `cgroups/all` is specified in the `--isolation` agent
    // flag, the cgroups isolator only loads the enabled subsystems. So if
    // `cgroups/all` is specified but `devices` is not enabled, the cgroups
    // isolator will not load the `devices` subsystem, in which case we
    // should error out.
Try<bool> result = cgroups::enabled("devices");
if (result.isError()) {
return Error(
"Failed to check if the `devices` cgroups subsystem"
" is enabled by kernel: " + result.error());
} else if (!result.get()) {
return Error(
"The `devices` cgroups subsystem is not enabled by the kernel");
}
} else if (devicesIsolator == tokens.end()) {
return Error(
"The 'cgroups/devices' or 'cgroups/all' isolator must be"
" enabled in order to use the 'gpu/nvidia' isolator");
}
if (filesystemIsolator == tokens.end()) {
return Error("The 'filesystem/linux' isolator must be enabled in"
" order to use the 'gpu/nvidia' isolator");
}
// Retrieve the cgroups devices hierarchy.
Result<string> hierarchy = cgroups::hierarchy(CGROUP_SUBSYSTEM_DEVICES_NAME);
if (hierarchy.isError()) {
return Error(
"Error retrieving the 'devices' subsystem hierarchy: " +
hierarchy.error());
}
// Create device entries for `/dev/nvidiactl` and
// `/dev/nvidia-uvm`. Optionally create a device entry for
// `/dev/nvidia-uvm-tools` if it exists.
map<Path, cgroups::devices::Entry> deviceEntries;
Try<dev_t> device = os::stat::rdev("/dev/nvidiactl");
if (device.isError()) {
return Error("Failed to obtain device ID for '/dev/nvidiactl': " +
device.error());
}
cgroups::devices::Entry entry;
entry.selector.type = Entry::Selector::Type::CHARACTER;
entry.selector.major = major(device.get());
entry.selector.minor = minor(device.get());
entry.access.read = true;
entry.access.write = true;
entry.access.mknod = true;
deviceEntries[Path("/dev/nvidiactl")] = entry;
// The `nvidia-uvm` module is not typically loaded by default on
// systems that have Nvidia GPU drivers installed. Instead,
// applications that require this module use `nvidia-modprobe` to
// load it dynamically on first use. This program both loads the
// `nvidia-uvm` kernel module and creates the corresponding
// `/dev/nvidia-uvm` device that it controls.
//
// We call `nvidia-modprobe` here to ensure that `/dev/nvidia-uvm`
// is properly created so we can inject it into any containers that
// may require it.
if (!os::exists("/dev/nvidia-uvm")) {
Try<string> modprobe = os::shell("nvidia-modprobe -u -c 0");
if (modprobe.isError()) {
return Error("Failed to load '/dev/nvidia-uvm': " + modprobe.error());
}
}
device = os::stat::rdev("/dev/nvidia-uvm");
if (device.isError()) {
return Error("Failed to obtain device ID for '/dev/nvidia-uvm': " +
device.error());
}
entry.selector.type = Entry::Selector::Type::CHARACTER;
entry.selector.major = major(device.get());
entry.selector.minor = minor(device.get());
entry.access.read = true;
entry.access.write = true;
entry.access.mknod = true;
deviceEntries[Path("/dev/nvidia-uvm")] = entry;
device = os::stat::rdev("/dev/nvidia-uvm-tools");
if (device.isSome()) {
entry.selector.type = Entry::Selector::Type::CHARACTER;
entry.selector.major = major(device.get());
entry.selector.minor = minor(device.get());
entry.access.read = true;
entry.access.write = true;
entry.access.mknod = true;
deviceEntries[Path("/dev/nvidia-uvm-tools")] = entry;
}
process::Owned<MesosIsolatorProcess> process(
new NvidiaGpuIsolatorProcess(
flags,
hierarchy.get(),
components.allocator,
components.volume,
deviceEntries));
return new MesosIsolator(process);
}
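// Nested containers are supported: a nested container shares GPU
// access through its root ancestor's devices cgroup, so no per-child
// allocation state is needed (see `recover()` and `prepare()`).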
bool NvidiaGpuIsolatorProcess::supportsNesting()
{
return true;
}
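// Standalone containers are treated like any other top-level
// container for GPU allocation purposes.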
bool NvidiaGpuIsolatorProcess::supportsStandalone()
{
return true;
}
Future<Nothing> NvidiaGpuIsolatorProcess::recover(
const vector<ContainerState>& states,
const hashset<ContainerID>& orphans)
{
vector<Future<Nothing>> futures;
foreach (const ContainerState& state, states) {
const ContainerID& containerId = state.container_id();
    // If we are a nested container, we skip recovery because our
    // root ancestor will recover the GPU state from the cgroup for us.
if (containerId.has_parent()) {
continue;
}
const string cgroup = path::join(flags.cgroups_root, containerId.value());
if (!cgroups::exists(hierarchy, cgroup)) {
      // This may occur if the executor has exited and the isolator
      // has destroyed the cgroup, but the slave died before noticing
      // this. This will be detected when the containerizer tries to
      // monitor the executor's pid.
LOG(WARNING) << "Couldn't find the cgroup '" << cgroup << "' "
<< "in hierarchy '" << hierarchy << "' "
<< "for container " << containerId;
continue;
}
infos[containerId] = new Info(containerId, cgroup);
// Determine which GPUs are allocated to this container.
Try<vector<cgroups::devices::Entry>> entries =
cgroups::devices::list(hierarchy, cgroup);
if (entries.isError()) {
return Failure("Failed to obtain devices list for cgroup"
" '" + cgroup + "': " + entries.error());
}
const set<Gpu>& available = allocator.total();
set<Gpu> containerGpus;
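    // Reconstruct the container's allocation by matching the device
    // entries in its cgroup against the GPUs managed by the
    // allocator; a GPU is identified by its major/minor numbers.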
foreach (const cgroups::devices::Entry& entry, entries.get()) {
foreach (const Gpu& gpu, available) {
if (entry.selector.major == gpu.major &&
entry.selector.minor == gpu.minor) {
containerGpus.insert(gpu);
break;
}
}
}
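    // Re-mark the recovered GPUs as allocated so they are not handed
    // out again, then record them in the container's `Info` struct
    // once the allocation completes.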
futures.push_back(allocator.allocate(containerGpus)
.then(defer(self(), [=]() -> Future<Nothing> {
infos[containerId]->allocated = containerGpus;
return Nothing();
})));
}
return collect(futures).then([]() { return Nothing(); });
}
Future<Option<ContainerLaunchInfo>> NvidiaGpuIsolatorProcess::prepare(
const ContainerID& containerId,
const mesos::slave::ContainerConfig& containerConfig)
{
if (containerId.has_parent()) {
// If we are a nested container in the `DEBUG` class, then we
// don't need to do anything special to prepare ourselves for GPU
// support. All Nvidia volumes will be inherited from our parent.
if (containerConfig.has_container_class() &&
containerConfig.container_class() == ContainerClass::DEBUG) {
return None();
}
// If we are a nested container in a different class, we don't
// need to maintain an `Info()` struct about the container (since
// we don't directly allocate any GPUs to it), but we do need to
// mount the necessary Nvidia libraries into the container (since
// we live in a different mount namespace than our parent). We
// directly call `_prepare()` to do this for us.
return _prepare(containerId, containerConfig);
}
if (infos.contains(containerId)) {
return Failure("Container has already been prepared");
}
infos[containerId] = new Info(
containerId, path::join(flags.cgroups_root, containerId.value()));
// Grant access to all `controlDeviceEntries`.
//
// This allows standard NVIDIA tools like `nvidia-smi` to be
// used within the container even if no GPUs are allocated.
// Without these devices, these tools fail abnormally.
foreachkey (const Path& devicePath, controlDeviceEntries) {
Try<Nothing> allow = cgroups::devices::allow(
hierarchy,
infos[containerId]->cgroup,
controlDeviceEntries.at(devicePath));
if (allow.isError()) {
return Failure("Failed to grant cgroups access to"
" '" + stringify(devicePath) + "': " + allow.error());
}
}
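  // Allocate GPUs to match the container's initial resources, then
  // continue with `_prepare()` to inject the Nvidia volume and
  // device nodes into the container's rootfs (if it has one).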
return update(containerId, containerConfig.resources())
.then(defer(PID<NvidiaGpuIsolatorProcess>(this),
&NvidiaGpuIsolatorProcess::_prepare,
containerId,
containerConfig));
}
// If our `ContainerConfig` specifies a different `rootfs` than the
// host file system, then we need to prepare mounts (and environment
// variables) to inject our `NvidiaVolume` and the Nvidia device
// nodes into the container (if required).
Future<Option<ContainerLaunchInfo>> NvidiaGpuIsolatorProcess::_prepare(
const ContainerID& containerId,
const mesos::slave::ContainerConfig& containerConfig)
{
if (!containerConfig.has_rootfs()) {
return None();
}
  // We only support Docker images at the moment.
if (!containerConfig.has_docker()) {
// TODO(klueska): Once ContainerConfig has
// a type, include that in the error message.
return Failure("Nvidia GPU isolator does not support non-Docker images");
}
ContainerLaunchInfo launchInfo;
// Inject the Nvidia volume into the container.
if (!containerConfig.docker().has_manifest()) {
return Failure("The 'ContainerConfig' for docker is missing a manifest");
}
ImageManifest manifest = containerConfig.docker().manifest();
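  // `shouldInject()` inspects the image manifest's labels to
  // determine whether the image expects the Nvidia driver volume
  // (e.g., images built for `nvidia-docker`).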
if (volume.shouldInject(manifest)) {
const string target = path::join(
containerConfig.rootfs(),
volume.CONTAINER_PATH());
Try<Nothing> mkdir = os::mkdir(target);
if (mkdir.isError()) {
return Failure(
"Failed to create the container directory at"
" '" + target + "': " + mkdir.error());
}
*launchInfo.add_mounts() = protobuf::slave::createContainerMount(
volume.HOST_PATH(), target, MS_RDONLY | MS_BIND | MS_REC);
    // TODO(chhsiao): As a workaround, we append `NvidiaVolume` paths into the
    // `PATH` and `LD_LIBRARY_PATH` environment variables so the binaries and
    // libraries can be found. However, these variables might be overridden by
    // users, and `LD_LIBRARY_PATH` might get cleared across exec calls. Instead
    // of injecting `NvidiaVolume`, we could leverage libnvidia-container in the
    // future. See MESOS-9595.
if (containerConfig.has_task_info()) {
// Command executor.
*launchInfo.mutable_task_environment() = volume.ENV(manifest);
} else {
// Default executor, custom executor, or nested container.
*launchInfo.mutable_environment() = volume.ENV(manifest);
}
}
const string devicesDir = containerizer::paths::getContainerDevicesPath(
flags.runtime_dir, containerId);
  // The `filesystem/linux` isolator is responsible for creating the
  // devices directory and is ordered to run before this isolator.
  // Here, we can simply check that the devices directory is still
  // present.
if (!os::exists(devicesDir)) {
return Failure("Missing container devices directory '" + devicesDir + "'");
}
// Glob all Nvidia GPU devices on the system and add them to the
// list of devices injected into the chroot environment.
Try<list<string>> nvidia = os::glob("/dev/nvidia*");
if (nvidia.isError()) {
return Failure("Failed to glob /dev/nvidia*: " + nvidia.error());
}
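  // Copy each device node into the container's devices directory,
  // then bind mount it into the container's rootfs at the same path
  // it occupies on the host.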
foreach (const string& device, nvidia.get()) {
const string devicePath = path::join(
devicesDir, strings::remove(device, "/dev/", strings::PREFIX), device);
Try<Nothing> mknod =
fs::chroot::copyDeviceNode(device, devicePath);
if (mknod.isError()) {
return Failure(
"Failed to copy device '" + device + "': " + mknod.error());
}
// Since we are adding the GPU devices to the container, make
// them read/write to guarantee that they are accessible inside
// the container.
Try<Nothing> chmod = os::chmod(devicePath, 0666);
if (chmod.isError()) {
return Failure(
"Failed to set permissions on device '" + device + "': " +
chmod.error());
}
*launchInfo.add_mounts() = protobuf::slave::createContainerMount(
devicePath,
path::join(containerConfig.rootfs(), device),
MS_BIND);
}
return launchInfo;
}
Future<Nothing> NvidiaGpuIsolatorProcess::update(
const ContainerID& containerId,
const Resources& resourceRequests,
const google::protobuf::Map<string, Value::Scalar>& resourceLimits)
{
if (containerId.has_parent()) {
return Failure("Not supported for nested containers");
}
if (!infos.contains(containerId)) {
return Failure("Unknown container");
}
Info* info = CHECK_NOTNULL(infos[containerId]);
Option<double> gpus = resourceRequests.gpus();
// Make sure that the `gpus` resource is not fractional.
// We rely on scalar resources only having 3 digits of precision.
if (static_cast<long long>(gpus.getOrElse(0.0) * 1000.0) % 1000 != 0) {
return Failure("The 'gpus' resource must be an unsigned integer");
}
size_t requested =
static_cast<size_t>(resourceRequests.gpus().getOrElse(0.0));
// Update the GPU allocation to reflect the new total.
if (requested > info->allocated.size()) {
size_t additional = requested - info->allocated.size();
return allocator.allocate(additional)
.then(defer(PID<NvidiaGpuIsolatorProcess>(this),
&NvidiaGpuIsolatorProcess::_update,
containerId,
lambda::_1));
} else if (requested < info->allocated.size()) {
size_t fewer = info->allocated.size() - requested;
set<Gpu> deallocated;
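    // Revoke cgroup access to the excess GPUs and return them to the
    // allocator.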
for (size_t i = 0; i < fewer; i++) {
const auto gpu = info->allocated.begin();
cgroups::devices::Entry entry;
entry.selector.type = Entry::Selector::Type::CHARACTER;
entry.selector.major = gpu->major;
entry.selector.minor = gpu->minor;
entry.access.read = true;
entry.access.write = true;
entry.access.mknod = true;
Try<Nothing> deny = cgroups::devices::deny(
hierarchy, info->cgroup, entry);
if (deny.isError()) {
return Failure("Failed to deny cgroups access to GPU device"
" '" + stringify(entry) + "': " + deny.error());
}
deallocated.insert(*gpu);
info->allocated.erase(gpu);
}
return allocator.deallocate(deallocated);
}
return Nothing();
}
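// Grants cgroup device access to each GPU in the new allocation and
// records the allocation in the container's `Info` struct.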
Future<Nothing> NvidiaGpuIsolatorProcess::_update(
const ContainerID& containerId,
const set<Gpu>& allocation)
{
if (!infos.contains(containerId)) {
return Failure("Failed to complete GPU allocation: unknown container");
}
Info* info = CHECK_NOTNULL(infos.at(containerId));
foreach (const Gpu& gpu, allocation) {
cgroups::devices::Entry entry;
entry.selector.type = Entry::Selector::Type::CHARACTER;
entry.selector.major = gpu.major;
entry.selector.minor = gpu.minor;
entry.access.read = true;
entry.access.write = true;
entry.access.mknod = true;
Try<Nothing> allow = cgroups::devices::allow(
hierarchy, info->cgroup, entry);
if (allow.isError()) {
return Failure("Failed to grant cgroups access to GPU device"
" '" + stringify(entry) + "': " + allow.error());
}
}
info->allocated = allocation;
return Nothing();
}
Future<ResourceStatistics> NvidiaGpuIsolatorProcess::usage(
const ContainerID& containerId)
{
if (containerId.has_parent()) {
return Failure("Not supported for nested containers");
}
if (!infos.contains(containerId)) {
return Failure("Unknown container");
}
// TODO(rtodd): Obtain usage information from NVML.
ResourceStatistics result;
return result;
}
Future<Nothing> NvidiaGpuIsolatorProcess::cleanup(
const ContainerID& containerId)
{
  // If we are a nested container, we don't have an `Info()` struct to
  // clean up, so we just return immediately.
if (containerId.has_parent()) {
return Nothing();
}
  // Multiple calls may occur during test cleanup.
if (!infos.contains(containerId)) {
VLOG(1) << "Ignoring cleanup request for unknown container " << containerId;
return Nothing();
}
Info* info = CHECK_NOTNULL(infos.at(containerId));
// Make any remaining GPUs available.
return allocator.deallocate(info->allocated)
.then(defer(self(), [=]() -> Future<Nothing> {
CHECK(infos.contains(containerId));
delete infos.at(containerId);
infos.erase(containerId);
return Nothing();
}));
}
} // namespace slave {
} // namespace internal {
} // namespace mesos {