// blob: 77522ff8e63f56260d9e423544d9d788c305e7a9 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <iterator>
#include <ostream>
#include <set>
#include <string>
#include <vector>
#include <process/check.hpp>
#include <process/dispatch.hpp>
#include <process/future.hpp>
#include <process/once.hpp>
#include <process/process.hpp>
#include <stout/nothing.hpp>
#include <stout/set.hpp>
#include <stout/stringify.hpp>
#include <stout/strings.hpp>
#include <stout/try.hpp>
#include "slave/flags.hpp"
#include "slave/containerizer/mesos/isolators/gpu/allocator.hpp"
#include "slave/containerizer/mesos/isolators/gpu/nvml.hpp"
using process::Failure;
using process::Future;
using process::Once;
using process::PID;
using std::ostream;
using std::set;
using std::string;
using std::vector;
namespace mesos {
namespace internal {
namespace slave {
// Device major number associated with NVIDIA GPUs.
// NOTE(review): 195 is presumably the Linux character-device major
// number reserved for NVIDIA devices -- confirm against the kernel's
// device number list.
static constexpr unsigned int NVIDIA_MAJOR_DEVICE = 195;
namespace {
// Returns the set difference `left - right`: every element of `left`
// that does not appear in `right`. Neither operand is modified.
//
// TODO(bmahler): Move this into stout/set.hpp (for an unknown
// reason, g++ was not able to lookup the `-` operator even
// though it was able to find the `&`, `|` operators also
// defined in stout/set.hpp).
template <typename T>
std::set<T> operator-(const std::set<T>& left, const std::set<T>& right)
{
  std::set<T> result;

  // Both inputs are ordered sets, which is exactly the precondition
  // `std::set_difference` requires of its input ranges.
  std::set_difference(
      left.begin(),
      left.end(),
      right.begin(),
      right.end(),
      std::inserter(result, result.begin()));

  return result;
}
// Return the GPU devices to manage
// based on the flags and resource scalars.
//
// The NVML device indices come from `--nvidia_gpu_devices` when that
// flag is set; otherwise the indices `0 .. N-1` are used, where N is
// the "gpus" scalar found in `resources` (0 when absent). Every index
// is resolved through NVML to the device's minor number.
//
// Returns an Error if NVML fails to provide a device handle or a
// minor number for any of the indices.
static Try<set<Gpu>> enumerateGpus(
    const Flags& flags,
    const Resources& resources)
{
  vector<unsigned int> indices;

  if (flags.nvidia_gpu_devices.isSome()) {
    indices = flags.nvidia_gpu_devices.get();
  } else {
    for (size_t i = 0; i < resources.gpus().getOrElse(0); ++i) {
      indices.push_back(i);
    }
  }

  set<Gpu> gpus;

  foreach (unsigned int index, indices) {
    Try<nvmlDevice_t> handle = nvml::deviceGetHandleByIndex(index);
    if (handle.isError()) {
      return Error("Failed to nvml::deviceGetHandleByIndex: " + handle.error());
    }

    Try<unsigned int> minor = nvml::deviceGetMinorNumber(handle.get());
    if (minor.isError()) {
      return Error("Failed to nvml::deviceGetMinorNumber: " + minor.error());
    }

    Gpu gpu;
    // NOTE(review): the major number is taken from the file-level
    // `NVIDIA_MAJOR_DEVICE` constant -- confirm this is the intended
    // source for `Gpu::major`.
    gpu.major = NVIDIA_MAJOR_DEVICE;
    gpu.minor = minor.get();

    gpus.insert(gpu);
  }

  return gpus;
}
// To determine the proper number of GPU resources to return, we
// need to check both --resources and --nvidia_gpu_devices.
// There are two cases to consider:
//
//   (1) --resources includes "gpus" and --nvidia_gpu_devices is set.
//       The number of GPUs in --resources must equal the number of
//       GPUs within --nvidia_gpu_devices.
//
//   (2) --resources does not include "gpus" and --nvidia_gpu_devices
//       is not specified. Here we auto-discover GPUs using the
//       NVIDIA management Library (NVML). We special case specifying
//       `gpus:0` explicitly to not perform auto-discovery.
//
// NOTE: We also check to make sure the `gpu/nvidia` isolation flag
// is set before enumerating GPUs. We do this because we decided it
// makes sense to only do autodiscovery of GPUs when this isolator
// is turned on (unlike for CPUs, memory, and disk where
// autodiscovery happens by default). We decided to take this
// approach, because GPU support is still experimental, and is only
// known to work well if this isolator is enabled. We didn't want to
// start advertising GPUs in our resource offer and have people
// attempt to use them in scenarios we haven't considered yet. In
// the future we may support other use cases, but for now we are
// being cautious.
//
// Returns the "gpus" resources to advertise, or an Error when the
// flags are inconsistent with each other or with what NVML reports.
static Try<Resources> enumerateGpuResources(const Flags& flags)
{
  const vector<string> tokens = strings::tokenize(flags.isolation, ",");
  const set<string> isolators = set<string>(tokens.begin(), tokens.end());

  // Don't allow the `--nvidia_gpu_devices` flag without the GPU isolator.
  if (flags.nvidia_gpu_devices.isSome() && isolators.count("gpu/nvidia") == 0) {
    return Error("'--nvidia_gpu_devices' can only be specified if the"
                 " `--isolation` flag contains 'gpu/nvidia'");
  }

  // Pull out just the GPU resources from --resources.
  Try<Resources> parsed = Resources::parse(
      flags.resources.getOrElse(""), flags.default_role);

  if (parsed.isError()) {
    return Error(parsed.error());
  }

  Resources resources = parsed->filter(
      [](const Resource& resource) {
        return resource.name() == "gpus";
      });

  // Pass the GPU resources through if we're not going to do any
  // isolation or we cannot validate the resources using NVML.
  if (isolators.count("gpu/nvidia") == 0 || !nvml::isAvailable()) {
    return resources;
  }

  // Enumerate GPUs based on the flags.
  Try<Nothing> initialized = nvml::initialize();
  if (initialized.isError()) {
    return Error("Failed to nvml::initialize: " + initialized.error());
  }

  Try<unsigned int> available = nvml::deviceGetCount();
  if (available.isError()) {
    return Error("Failed to nvml::deviceGetCount: " + available.error());
  }

  // The `Resources` wrapper does not allow us to distinguish between
  // a user specifying "gpus:0" in the --resources flag and not
  // specifying "gpus" at all. To help with this we short circuit
  // this function to return an empty resource vector for the case of
  // explicitly setting "gpus:0". After doing so, it is sufficient in
  // the rest of this function to call `resources.gpus().isSome()` to
  // determine if "gpus" were explicitly specified.
  if (strings::contains(flags.resources.getOrElse(""), "gpus") &&
      resources.gpus().getOrElse(0) == 0) {
    if (flags.nvidia_gpu_devices.isSome()) {
      return Error("'--nvidia_gpu_devices' cannot be specified"
                   " when '--resources' specifies 0 GPUs");
    }

    return Resources();
  }

  if (flags.nvidia_gpu_devices.isSome() && !resources.gpus().isSome()) {
    return Error("'--nvidia_gpu_devices' cannot be set without"
                 " also setting 'gpus' in '--resources'");
  }

  if (resources.gpus().isSome() && !flags.nvidia_gpu_devices.isSome()) {
    return Error("The `gpus` resource cannot be set without also"
                 " setting `--nvidia_gpu_devices`");
  }

  if (resources.gpus().isSome()) {
    // Make sure that the value of "gpus" is an integer and not a
    // fractional amount. We take advantage of the fact that we know
    // the value of "gpus" is only precise up to 3 decimals.
    long long milli = static_cast<long long>(resources.gpus().get() * 1000);
    if ((milli % 1000) != 0) {
      return Error("The 'gpus' resource must be a non-negative integer");
    }

    // Make sure the `nvidia_gpu_devices` flag
    // contains a list of unique GPU identifiers.
    vector<unsigned int> unique = flags.nvidia_gpu_devices.get();
    std::sort(unique.begin(), unique.end());
    auto last = std::unique(unique.begin(), unique.end());
    unique.erase(last, unique.end());

    if (unique.size() != flags.nvidia_gpu_devices->size()) {
      return Error("'--nvidia_gpu_devices' contains duplicates");
    }

    if (flags.nvidia_gpu_devices->size() != resources.gpus().get()) {
      return Error("'--resources' and '--nvidia_gpu_devices' specify"
                   " different numbers of GPU devices");
    }

    if (resources.gpus().get() > available.get()) {
      return Error("The number of GPUs requested is greater than"
                   " the number of GPUs available on the machine");
    }

    return resources;
  }

  // Case (2): nothing was specified explicitly, so advertise every
  // GPU that NVML reported.
  // NOTE(review): arguments are (name, value, role) -- confirm against
  // the `Resources::parse` overload set.
  return Resources::parse(
      "gpus",
      stringify(available.get()),
      flags.default_role);
}
// Libprocess process that keeps the bookkeeping of which GPUs are
// currently free (`available`) and which are handed out (`allocated`).
// Within this file its methods are only invoked through
// `process::dispatch` from `NvidiaGpuAllocator`.
class NvidiaGpuAllocatorProcess
  : public process::Process<NvidiaGpuAllocatorProcess>
{
public:
  // Starts with every GPU in `gpus` available and none allocated.
  explicit NvidiaGpuAllocatorProcess(const set<Gpu>& gpus)
    : available(gpus) {}

  // Allocates `count` GPUs, chosen as the first `count` elements of
  // the (ordered) available set. Fails if fewer than `count` GPUs
  // are available.
  Future<set<Gpu>> allocate(size_t count)
  {
    if (available.size() < count) {
      return Failure("Requested " + stringify(count) + " gpus but only"
                     " " + stringify(available.size()) + " available");
    }

    set<Gpu> allocation(
        available.begin(),
        std::next(available.begin(), count));

    // Delegate the bookkeeping to the set-based overload and hand the
    // chosen GPUs back once it succeeds.
    return allocate(allocation)
      .then([allocation]() -> Future<set<Gpu>> { return allocation; });
  }

  // Allocates exactly the GPUs in `gpus`. Fails, without changing any
  // state, if at least one of them is not currently available.
  Future<Nothing> allocate(const set<Gpu>& gpus)
  {
    set<Gpu> allocation = available & gpus;

    if (allocation.size() < gpus.size()) {
      return Failure(stringify(gpus - allocation) + " are not available");
    }

    available = available - allocation;
    allocated = allocated | allocation;

    return Nothing();
  }

  // Returns the GPUs in `gpus` to the available pool. Fails, without
  // changing any state, if at least one of them is not currently
  // allocated.
  Future<Nothing> deallocate(const set<Gpu>& gpus)
  {
    set<Gpu> deallocation = allocated & gpus;

    if (deallocation.size() < gpus.size()) {
      return Failure(stringify(gpus - deallocation) + " are not allocated");
    }

    allocated = allocated - deallocation;
    available = available | deallocation;

    return Nothing();
  }

private:
  set<Gpu> available;
  set<Gpu> allocated;
};
} // namespace {
struct NvidiaGpuAllocator::Data
Data(const set<Gpu>& gpus_)
: gpus(gpus_),
process(process::spawn(new NvidiaGpuAllocatorProcess(gpus_), true)) {}
const set<Gpu> gpus;
PID<NvidiaGpuAllocatorProcess> process;
// Builds an allocator managing the GPUs selected by `flags` and
// `resources` (see `enumerateGpus`). Any enumeration error is
// propagated to the caller unchanged.
Try<NvidiaGpuAllocator> NvidiaGpuAllocator::create(
    const Flags& flags,
    const Resources& resources)
{
  Try<set<Gpu>> gpus = enumerateGpus(flags, resources);
  if (gpus.isError()) {
    return Error(gpus.error());
  }

  return NvidiaGpuAllocator(gpus.get());
}
// Returns the GPU resources implied by `flags`; thin public wrapper
// around the file-local `enumerateGpuResources`.
Try<Resources> NvidiaGpuAllocator::resources(const Flags& flags)
{
  return enumerateGpuResources(flags);
}
const set<Gpu>& gpus)
: data(std::make_shared<NvidiaGpuAllocator::Data>(gpus)) {}
// Returns the full set of GPUs this allocator was constructed with.
const set<Gpu>& NvidiaGpuAllocator::total() const
{
  return data->gpus;
}
// Asynchronously allocates `count` GPUs by dispatching to the
// allocator process; the future carries the GPUs that were chosen.
Future<set<Gpu>> NvidiaGpuAllocator::allocate(size_t count)
{
  // Need to disambiguate for the compiler.
  Future<set<Gpu>> (NvidiaGpuAllocatorProcess::*allocate)(size_t) =
    &NvidiaGpuAllocatorProcess::allocate;

  return process::dispatch(data->process, allocate, count);
}
// Asynchronously allocates exactly the GPUs in `gpus` by dispatching
// to the allocator process.
Future<Nothing> NvidiaGpuAllocator::allocate(const set<Gpu>& gpus)
{
  // Need to disambiguate for the compiler.
  Future<Nothing> (NvidiaGpuAllocatorProcess::*allocate)(const set<Gpu>&) =
    &NvidiaGpuAllocatorProcess::allocate;

  return process::dispatch(data->process, allocate, gpus);
}
// Asynchronously returns the GPUs in `gpus` to the available pool by
// dispatching to the allocator process.
Future<Nothing> NvidiaGpuAllocator::deallocate(const set<Gpu>& gpus)
{
  return process::dispatch(
      data->process,
      &NvidiaGpuAllocatorProcess::deallocate,
      gpus);
}
bool operator<(const Gpu& left, const Gpu& right)
if (left.major == right.major) {
return left.minor < right.minor;
return left.major < right.major;
// Defined in terms of `operator<` with the operands swapped.
bool operator>(const Gpu& left, const Gpu& right)
{
  return right < left;
}
// `left <= right` is the negation of `left > right`.
bool operator<=(const Gpu& left, const Gpu& right)
{
  return !(left > right);
}
// `left >= right` is the negation of `left < right`.
bool operator>=(const Gpu& left, const Gpu& right)
{
  return !(left < right);
}
// Two GPUs are equal when both their major and minor numbers match.
bool operator==(const Gpu& left, const Gpu& right)
{
  return left.major == right.major && left.minor == right.minor;
}
// Negation of `operator==`.
bool operator!=(const Gpu& left, const Gpu& right)
{
  return !(left == right);
}
// Writes the GPU as "<major>.<minor>" and returns the stream so that
// insertions can be chained.
ostream& operator<<(ostream& stream, const Gpu& gpu)
{
  return stream << gpu.major << '.' << gpu.minor;
}
} // namespace slave {
} // namespace internal {
} // namespace mesos {