blob: 2fddf4dda19b9b3ef28ee0ef04ab116771180980 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <dlfcn.h>
#include <glog/logging.h>
#include <map>
#include <string>
#include <process/once.hpp>
#include <stout/error.hpp>
#include <stout/exit.hpp>
#include <stout/nothing.hpp>
#include <stout/option.hpp>
#include <stout/stringify.hpp>
#include <stout/try.hpp>
#include <stout/posix/dynamiclibrary.hpp>
#include "logging/logging.hpp"
#include "slave/containerizer/mesos/isolators/gpu/nvml.hpp"
#ifndef ENABLE_NVML
// We provide dummy types and variables in case we do not use the NVML headers.
using nvmlReturn_t = int;
constexpr bool NVML_SUCCESS = true;
constexpr size_t NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE = 1;
constexpr nvmlReturn_t NVML_ERROR_INVALID_ARGUMENT{};
#endif // ENABLE_NVML
using process::Once;
using std::map;
using std::string;
namespace nvml {
static constexpr char LIBRARY_NAME[] = "libnvidia-ml.so.1";
struct NvidiaManagementLibrary
{
NvidiaManagementLibrary(
nvmlReturn_t (*_systemGetDriverVersion)(char *, unsigned int),
nvmlReturn_t (*_deviceGetCount)(unsigned int*),
nvmlReturn_t (*_deviceGetHandleByIndex)(unsigned int, nvmlDevice_t*),
nvmlReturn_t (*_deviceGetMinorNumber)(nvmlDevice_t, unsigned int*),
const char* (*_errorString)(nvmlReturn_t))
: systemGetDriverVersion(_systemGetDriverVersion),
deviceGetCount(_deviceGetCount),
deviceGetHandleByIndex(_deviceGetHandleByIndex),
deviceGetMinorNumber(_deviceGetMinorNumber),
errorString(_errorString) {}
nvmlReturn_t (*systemGetDriverVersion)(char *, unsigned int);
nvmlReturn_t (*deviceGetCount)(unsigned int*);
nvmlReturn_t (*deviceGetHandleByIndex)(unsigned int, nvmlDevice_t*);
nvmlReturn_t (*deviceGetMinorNumber)(nvmlDevice_t, unsigned int*);
const char* (*errorString)(nvmlReturn_t);
};
static const NvidiaManagementLibrary* nvml = nullptr;
static Once* initialized = new Once();
static Option<Error>* error = new Option<Error>();
static DynamicLibrary* library = new DynamicLibrary();
Try<Nothing> initialize()
{
if (initialized->once()) {
if (error->isSome()) {
return error->get();
}
return Nothing();
}
Try<Nothing> open = library->open(LIBRARY_NAME);
if (open.isError()) {
*error = Error("Failed to open '" + stringify(LIBRARY_NAME) + "': " +
open.error());
initialized->done();
return error->get();
}
// Load the needed symbols.
//
// NOTE: NVML maps nvmlInit to nvmlInit_v2 via a macro.
// This means that we have to explicitly load nvmlInit_v2.
//
// TODO(klueska): Use nvmlDeviceGetCount_v2 and nvmlDeviceGetHandleByIndex_v2.
map<string, void*> symbols = {
{ "nvmlInit_v2", nullptr },
{ "nvmlSystemGetDriverVersion", nullptr },
{ "nvmlDeviceGetCount", nullptr },
{ "nvmlDeviceGetHandleByIndex", nullptr },
{ "nvmlDeviceGetMinorNumber", nullptr },
{ "nvmlErrorString", nullptr },
};
foreachkey (const string& name, symbols) {
Try<void*> symbol = library->loadSymbol(name);
if (symbol.isError()) {
*error = Error("Failed to load symbol '" + name + "': " + symbol.error());
initialized->done();
return error->get();
}
symbols.at(name) = symbol.get();
}
auto nvmlInit =
(nvmlReturn_t (*)()) symbols.at("nvmlInit_v2");
auto nvmlErrorString =
(const char* (*)(nvmlReturn_t)) symbols.at("nvmlErrorString");
nvmlReturn_t result = nvmlInit();
if (result != NVML_SUCCESS) {
*error = Error("nvmlInit failed: " + stringify(nvmlErrorString(result)));
initialized->done();
return error->get();
}
nvml = new NvidiaManagementLibrary(
(nvmlReturn_t (*)(char*, unsigned int))
symbols.at("nvmlSystemGetDriverVersion"),
(nvmlReturn_t (*)(unsigned int*))
symbols.at("nvmlDeviceGetCount"),
(nvmlReturn_t (*)(unsigned int, nvmlDevice_t*))
symbols.at("nvmlDeviceGetHandleByIndex"),
(nvmlReturn_t (*)(nvmlDevice_t, unsigned int*))
symbols.at("nvmlDeviceGetMinorNumber"),
(const char* (*)(nvmlReturn_t))
symbols.at("nvmlErrorString"));
initialized->done();
return Nothing();
}
bool isAvailable()
{
#ifndef ENABLE_NVML
return false;
#endif // ENABLE_NVML
// Unfortunately, there is no function available in `glibc` to check
// if a dynamic library is available to open with `dlopen()`.
// Instead, availability is determined by attempting to open a
// library with `dlopen()` and if this call fails, assuming the
// library is unavailable. The problem with using this method to
// implement a generic `isAvailable()` function is knowing if we
// should call `dlclose()` on the library once we've determined that
// the library is in fact available (because some other code path
// may have already opened the library and we don't want to close it
// out from under them). Fortunately, calls to `dlopen()` are reference
// counted, so that subsequent calls to `dlclose()` simply down the
// reference count and only actually close the library when this
// reference count hits zero. Because of this, we can
// unconditionally call `dlclose()` and trust that glibc will take
// care to do the right thing. Additionally, calling `dlopen()` with
// `RTLD_LAZY` is the preferred method here because it is faster in
// cases where the library is not yet opened, and having previously
// opened it with `RTLD_NOW` will always take precedence.
void* open = ::dlopen(LIBRARY_NAME, RTLD_LAZY);
if (open == nullptr) {
return false;
}
CHECK_EQ(0, ::dlclose(open))
<< "dlcose failed: " << dlerror();
return true;
}
Try<string> systemGetDriverVersion()
{
if (nvml == nullptr) {
return Error("NVML has not been initialized");
}
// The NVML manual specifies that the version string will never
// exceed 80 characters.
char version[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE];
nvmlReturn_t result = nvml->systemGetDriverVersion(version, sizeof(version));
if (result != NVML_SUCCESS) {
return Error(nvml->errorString(result));
}
return string(version);
}
Try<unsigned int> deviceGetCount()
{
if (nvml == nullptr) {
return Error("NVML has not been initialized");
}
unsigned int count;
nvmlReturn_t result = nvml->deviceGetCount(&count);
if (result != NVML_SUCCESS) {
return Error(nvml->errorString(result));
}
return count;
}
Try<nvmlDevice_t> deviceGetHandleByIndex(unsigned int index)
{
if (nvml == nullptr) {
return Error("NVML has not been initialized");
}
nvmlDevice_t handle;
nvmlReturn_t result = nvml->deviceGetHandleByIndex(index, &handle);
if (result == NVML_ERROR_INVALID_ARGUMENT) {
return Error("GPU device not found");
}
if (result != NVML_SUCCESS) {
return Error(nvml->errorString(result));
}
return handle;
}
Try<unsigned int> deviceGetMinorNumber(nvmlDevice_t handle)
{
if (nvml == nullptr) {
return Error("NVML has not been initialized");
}
unsigned int minor;
nvmlReturn_t result = nvml->deviceGetMinorNumber(handle, &minor);
if (result != NVML_SUCCESS) {
return Error(nvml->errorString(result));
}
return minor;
}
} // namespace nvml {