/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file resource.cc
* \brief Implementation of resource manager.
*/
#include <dmlc/logging.h>
#include <dmlc/parameter.h>
#include <dmlc/thread_local.h>
#include <mxnet/base.h>
#include <mxnet/engine.h>
#include <mxnet/random_generator.h>
#include <mxnet/resource.h>
#include <limits>
#include <atomic>
#include <memory>
#include "./common/lazy_alloc_array.h"
#include "./common/utils.h"
#include "./common/cuda/utils.h"
#include "./profiler/storage_profiler.h"
namespace mxnet {
namespace resource {
// Internal space allocator: caches one device-side and one host-side
// temporary buffer and reuses them across requests.
struct SpaceAllocator {
// context that owns the cached device-side buffer
Context ctx;
// handle to the cached device-side temporary space
Storage::Handle handle;
// handle to the cached host (CPU) temporary space
Storage::Handle host_handle;
SpaceAllocator() {
handle.dptr = nullptr;
handle.size = 0;
host_handle.dptr = nullptr;
host_handle.size = 0;
}
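// free both cached buffers and reset the handles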
inline void ReleaseAll() {
Storage::Get()->DirectFree(handle);
handle.dptr = nullptr;
handle.size = 0;
Storage::Get()->DirectFree(host_handle);
host_handle.dptr = nullptr;
host_handle.size = 0;
}
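// Return a pointer to at least `size` bytes of device memory.
// The cached allocation is reused when it is already large enough;
// otherwise it is freed and a larger block is allocated.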
inline void* GetSpace(size_t size, const std::string& name) {
if (handle.size >= size)
return handle.dptr;
Storage::Get()->DirectFree(handle);
handle = Storage::Get()->Alloc(size, ctx);
handle.profiler_scope = "resource:";
handle.name = name;
#if MXNET_USE_CUDA
profiler::GpuDeviceStorageProfiler::Get()->UpdateStorageInfo(handle);
#endif // MXNET_USE_CUDA
return handle.dptr;
}
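// Return a pointer to at least `size` bytes of host (CPU) memory,
// growing the cached host allocation only when needed.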
inline void* GetHostSpace(size_t size) {
if (host_handle.size >= size)
return host_handle.dptr;
Storage::Get()->DirectFree(host_handle);
host_handle = Storage::Get()->Alloc(size, Context());
return host_handle.dptr;
}
};
// Implements resource manager
class ResourceManagerImpl : public ResourceManager {
public:
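// The number of copies kept for each resource type is configurable through
// environment variables (e.g. MXNET_CPU_TEMP_COPY). CPU resources are
// created eagerly here; GPU resources are created lazily on first request.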
ResourceManagerImpl() noexcept(false) {
cpu_temp_space_copy_ = dmlc::GetEnv("MXNET_CPU_TEMP_COPY", 4);
gpu_temp_space_copy_ = dmlc::GetEnv("MXNET_GPU_TEMP_COPY", 1);
cpu_native_rand_copy_ = dmlc::GetEnv("MXNET_CPU_PARALLEL_RAND_COPY", 1);
gpu_native_rand_copy_ = dmlc::GetEnv("MXNET_GPU_PARALLEL_RAND_COPY", 1);
#if MXNET_USE_CUDNN == 1
gpu_cudnn_dropout_state_copy_ = dmlc::GetEnv("MXNET_GPU_CUDNN_DROPOUT_STATE_COPY", 1);
#endif // MXNET_USE_CUDNN == 1
engine_ref_ = Engine::_GetSharedRef();
storage_ref_ = Storage::_GetSharedRef();
cpu_rand_ = std::make_unique<ResourceRandom<cpu>>(Context::CPU(), global_seed_);
cpu_space_ = std::make_unique<ResourceTempSpace<ResourceRequest::kTempSpace>>(
Context::CPU(), cpu_temp_space_copy_);
cpu_parallel_rand_ = std::make_unique<ResourceParallelRandom<cpu>>(
Context::CPU(), cpu_native_rand_copy_, global_seed_);
}
~ResourceManagerImpl() override {
// explicitly delete the CPU resources before the engine is destroyed
cpu_rand_.reset(nullptr);
cpu_space_.reset(nullptr);
cpu_parallel_rand_.reset(nullptr);
#if MXNET_USE_CUDA
gpu_rand_.Clear();
gpu_space_.Clear();
gpu_parallel_rand_.Clear();
#if MXNET_USE_CUDNN == 1
gpu_cudnn_dropout_state_.Clear();
#endif // MXNET_USE_CUDNN == 1
#endif
if (engine_ref_ != nullptr) {
engine_ref_ = nullptr;
}
if (storage_ref_ != nullptr) {
storage_ref_ = nullptr;
}
}
// request resources
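// Usage sketch (illustrative; `num_bytes` and the name are chosen by the caller):
//   Resource rsc = ResourceManager::Get()->Request(ctx, ResourceRequest(ResourceRequest::kTempSpace));
//   void* workspace = rsc.get_space_internal(num_bytes, "op:workspace");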
Resource Request(Context ctx, const ResourceRequest& req) override {
if (ctx.dev_mask() == Context::kCPU) {
switch (req.type) {
case ResourceRequest::kRandom:
return cpu_rand_->resource;
case ResourceRequest::kTempSpace:
return cpu_space_->GetNext();
case ResourceRequest::kParallelRandom:
return cpu_parallel_rand_->GetNext();
default:
LOG(FATAL) << "Unknown supported type " << req.type;
}
} else {
CHECK_EQ(ctx.dev_mask(), Context::kGPU);
#if MSHADOW_USE_CUDA
switch (req.type) {
case ResourceRequest::kRandom: {
return gpu_rand_
.Get(ctx.dev_id, [ctx, this]() { return new ResourceRandom<gpu>(ctx, global_seed_); })
->resource;
}
case ResourceRequest::kTempSpace: {
return gpu_space_
.Get(ctx.dev_id,
[ctx, this]() {
return new ResourceTempSpace<ResourceRequest::kTempSpace>(
ctx, gpu_temp_space_copy_);
})
->GetNext();
}
case ResourceRequest::kParallelRandom: {
return gpu_parallel_rand_
.Get(ctx.dev_id,
[ctx, this]() {
return new ResourceParallelRandom<gpu>(
ctx, gpu_native_rand_copy_, global_seed_);
})
->GetNext();
}
#if MXNET_USE_CUDNN == 1
case ResourceRequest::kCuDNNDropoutDesc: {
return gpu_cudnn_dropout_state_
.Get(ctx.dev_id,
[ctx, this]() {
return new ResourceCUDNNDropout(
ctx, gpu_cudnn_dropout_state_copy_, global_seed_);
})
->GetNext();
}
#endif // MXNET_USE_CUDNN == 1
default:
LOG(FATAL) << "Unknown supported type " << req.type;
}
#else
LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
#endif
}
Resource ret;
return ret;
}
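// Re-seed every random resource on all devices with the new global seed.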
void SeedRandom(uint32_t seed) override {
global_seed_ = seed;
cpu_rand_->SeedWithDeviceID(global_seed_);
cpu_parallel_rand_->SeedWithDeviceID(global_seed_);
#if MXNET_USE_CUDA
gpu_rand_.ForEach([seed](size_t i, ResourceRandom<gpu>* p) { p->SeedWithDeviceID(seed); });
gpu_parallel_rand_.ForEach(
[seed](size_t i, ResourceParallelRandom<gpu>* p) { p->SeedWithDeviceID(seed); });
#if MXNET_USE_CUDNN == 1
gpu_cudnn_dropout_state_.ForEach([seed](size_t i, ResourceCUDNNDropout* p) {
ResourceManagerImpl::SeedCUDNNDropout(p, seed);
});
#endif // MXNET_USE_CUDNN
#endif // MXNET_USE_CUDA
}
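// Re-seed the CPU random resources and, if `ctx` is a GPU context,
// the random resources of that device.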
void SeedRandom(Context ctx, uint32_t seed) override {
cpu_rand_->Seed(seed);
cpu_parallel_rand_->Seed(seed);
#if MXNET_USE_CUDA
if (ctx.dev_type == Context::kGPU) {
gpu_rand_.Get(ctx.dev_id, [ctx, seed, this]() { return new ResourceRandom<gpu>(ctx, seed); })
->Seed(seed);
gpu_parallel_rand_
.Get(ctx.dev_id,
[ctx, seed, this]() {
return new ResourceParallelRandom<gpu>(ctx, gpu_native_rand_copy_, seed);
})
->Seed(seed);
#if MXNET_USE_CUDNN == 1
auto dropout_state = gpu_cudnn_dropout_state_.Get(ctx.dev_id, [ctx, seed, this]() {
return new ResourceCUDNNDropout(ctx, gpu_cudnn_dropout_state_copy_, seed);
});
ResourceManagerImpl::SeedCUDNNDropout(dropout_state.get(), seed);
#endif // MXNET_USE_CUDNN
}
#endif // MXNET_USE_CUDA
}
private:
/*! \brief Maximum number of GPUs */
static constexpr std::size_t kMaxNumGPUs = 16;
/*! \brief Magic constant mixed into seeds so different random resources get distinct streams */
static constexpr uint32_t kRandMagic = 127UL;
// the random number resources
template <typename xpu>
struct ResourceRandom {
/*! \brief the context of the PRNG */
Context ctx;
/*! \brief pointer to PRNG */
mshadow::Random<xpu>* prnd;
/*! \brief resource representation */
Resource resource;
/*! \brief constructor */
explicit ResourceRandom(Context ctx, uint32_t global_seed) : ctx(ctx) {
mshadow::SetDevice<xpu>(ctx.dev_id);
resource.var = Engine::Get()->NewVariable();
prnd = new mshadow::Random<xpu>(ctx.dev_id + global_seed * kRandMagic);
resource.ptr_ = prnd;
resource.req = ResourceRequest(ResourceRequest::kRandom);
}
~ResourceRandom() {
mshadow::Random<xpu>* r = prnd;
Engine::Get()->DeleteVariable(
[r](RunContext rctx) { MSHADOW_CATCH_ERROR(delete r); }, ctx, resource.var);
}
// set seed to a PRNG using global_seed and device id
inline void SeedWithDeviceID(uint32_t global_seed) {
Seed(ctx.dev_id + global_seed * kRandMagic);
}
// set seed to a PRNG
inline void Seed(uint32_t seed) {
mshadow::Random<xpu>* r = prnd;
Engine::Get()->PushAsync(
[r, seed](RunContext rctx,
Engine::CallbackOnStart on_start,
Engine::CallbackOnComplete on_complete) {
on_start();
r->set_stream(rctx.get_stream<xpu>());
r->Seed(seed);
on_complete();
},
ctx,
{},
{resource.var},
FnProperty::kNormal,
0,
"ResourceRandomSetSeed");
}
};
// temporary space resource.
template <ResourceRequest::Type req>
struct ResourceTempSpace {
/*! \brief the context of the device */
Context ctx;
/*! \brief the underlying space */
std::vector<SpaceAllocator> space;
/*! \brief resource representation */
std::vector<Resource> resource;
/*! \brief current pointer to the round-robin allocator */
std::atomic<size_t> curr_ptr;
/*! \brief constructor */
explicit ResourceTempSpace(Context ctx, size_t ncopy)
: ctx(ctx), space(ncopy), resource(ncopy), curr_ptr(0) {
for (size_t i = 0; i < space.size(); ++i) {
resource[i].var = Engine::Get()->NewVariable();
resource[i].id = static_cast<int32_t>(i);
resource[i].ptr_ = &space[i];
resource[i].req = ResourceRequest(req);
space[i].ctx = ctx;
CHECK_EQ(space[i].handle.size, 0U);
}
}
virtual ~ResourceTempSpace() {
for (size_t i = 0; i < space.size(); ++i) {
SpaceAllocator r = space[i];
Engine::Get()->DeleteVariable(
[r](RunContext rctx) {
SpaceAllocator rcpy = r;
MSHADOW_CATCH_ERROR(rcpy.ReleaseAll());
},
ctx,
resource[i].var);
}
}
// get the next resource in a round-robin manner
inline Resource GetNext() {
const size_t kMaxDigit = std::numeric_limits<size_t>::max() / 2;
size_t ptr = ++curr_ptr;
// reset the counter before it wraps around
// (in practice this rarely happens)
if (ptr > kMaxDigit) {
curr_ptr.store((ptr + 1) % space.size());
}
return resource[ptr % space.size()];
}
};
#if MXNET_USE_CUDNN == 1
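// cuDNN dropout state resource: a temp-space resource whose cached buffers
// hold the cuDNN dropout RNG state; it is seeded at construction time.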
struct ResourceCUDNNDropout : public ResourceTempSpace<ResourceRequest::kCuDNNDropoutDesc> {
explicit ResourceCUDNNDropout(Context ctx, size_t ncopy, uint32_t global_seed)
: ResourceTempSpace<ResourceRequest::kCuDNNDropoutDesc>(ctx, ncopy) {
ResourceManagerImpl::SeedCUDNNDropout(this, global_seed);
}
};
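// Seed the cuDNN dropout state of every copy owned by `p`. For each copy an
// async engine task is pushed that allocates the dropout state buffer on
// first use and then (re)initializes it through a temporary
// cudnnDropoutDescriptor_t.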
static void SeedCUDNNDropout(ResourceCUDNNDropout* p, uint32_t seed) {
for (size_t i = 0; i < p->space.size(); ++i) {
uint32_t current_seed = p->ctx.dev_id + i * kMaxNumGPUs + seed * kRandMagic;
Resource* r = &(p->resource[i]);
Engine::Get()->PushAsync(
[r, current_seed](RunContext rctx,
Engine::CallbackOnStart on_start,
Engine::CallbackOnComplete on_complete) {
on_start();
auto state_space = static_cast<resource::SpaceAllocator*>(r->ptr_);
mshadow::Stream<gpu>* stream = rctx.get_stream<gpu>();
CHECK_EQ(state_space->ctx.dev_id, stream->dev_id)
<< "The device id of cudnn dropout state space doesn't match that from stream.";
if (!state_space->handle.size) {
// not allocated yet
size_t dropout_state_size;
CUDNN_CALL(cudnnDropoutGetStatesSize(stream->dnn_handle_, &dropout_state_size));
// reserve GPU space
Storage::Get()->DirectFree(
Storage::Get()->Alloc(dropout_state_size, state_space->ctx));
state_space->GetSpace(dropout_state_size, "cudnn_dropout_state");
}
cudnnDropoutDescriptor_t temp_descriptor;
CUDNN_CALL(cudnnCreateDropoutDescriptor(&temp_descriptor));
CUDNN_CALL(cudnnSetDropoutDescriptor(temp_descriptor,
stream->dnn_handle_,
0.5,
state_space->handle.dptr,
state_space->handle.size,
current_seed));
CUDNN_CALL(cudnnDestroyDropoutDescriptor(temp_descriptor));
cudaStream_t cuda_stream = mshadow::Stream<gpu>::GetStream(stream);
cudaStreamSynchronize(cuda_stream);
on_complete();
},
p->ctx,
{},
{r->var},
FnProperty::kNormal,
0,
"CUDNNDropoutDescriptorSeed");
}
p->curr_ptr.store(0);
}
#endif // MXNET_USE_CUDNN
// the parallel random sampler resources
// it uses the device-side RNG API on GPU
template <typename xpu>
struct ResourceParallelRandom {
/*! \brief the context of the PRNG */
Context ctx;
/*! \brief pointers to sampler */
std::vector<common::random::RandGenerator<xpu>*> sampler;
/*! \brief resource representation */
std::vector<Resource> resource;
/*! \brief current pointer to the round-robin allocator */
std::atomic<size_t> curr_ptr;
/*! \brief constructor */
explicit ResourceParallelRandom(Context ctx, size_t ncopy, uint32_t global_seed)
: ctx(ctx), sampler(ncopy), resource(ncopy), curr_ptr(0) {
for (size_t i = 0; i < sampler.size(); ++i) {
const uint32_t seed = ctx.dev_id + i * kMaxNumGPUs + global_seed * kRandMagic;
resource[i].var = Engine::Get()->NewVariable();
common::random::RandGenerator<xpu>* r = new common::random::RandGenerator<xpu>();
Engine::Get()->PushSync(
[r, seed](RunContext rctx) {
common::random::RandGenerator<xpu>::AllocState(r);
r->Seed(rctx.get_stream<xpu>(), seed);
},
ctx,
{},
{resource[i].var},
FnProperty::kNormal,
0,
"ResourceParallelRandomSetSeed");
sampler[i] = r;
resource[i].ptr_ = sampler[i];
resource[i].req = ResourceRequest(ResourceRequest::kParallelRandom);
}
}
~ResourceParallelRandom() {
for (size_t i = 0; i < sampler.size(); ++i) {
common::random::RandGenerator<xpu>* r = sampler[i];
Engine::Get()->DeleteVariable(
[r](RunContext rctx) {
MSHADOW_CATCH_ERROR(common::random::RandGenerator<xpu>::FreeState(r));
MSHADOW_CATCH_ERROR(delete r);
},
ctx,
resource[i].var);
}
}
// set seed to a sampler using global_seed and device id
inline void SeedWithDeviceID(uint32_t global_seed) {
for (size_t i = 0; i < sampler.size(); ++i) {
SeedOne(i, ctx.dev_id + i * kMaxNumGPUs + global_seed * kRandMagic);
}
// reset pointer to ensure the same result with the same seed.
curr_ptr.store(0);
}
// set seed to a sampler
inline void Seed(uint32_t seed) {
for (size_t i = 0; i < sampler.size(); ++i) {
SeedOne(i, i * kMaxNumGPUs + seed * kRandMagic);
}
// reset pointer to ensure the same result with the same seed.
curr_ptr.store(0);
}
// set seed to a sampler
inline void SeedOne(size_t i, uint32_t seed) {
common::random::RandGenerator<xpu>* r = sampler[i];
Engine::Get()->PushAsync(
[r, seed](RunContext rctx,
Engine::CallbackOnStart on_start,
Engine::CallbackOnComplete on_complete) {
on_start();
r->Seed(rctx.get_stream<xpu>(), seed);
on_complete();
},
ctx,
{},
{resource[i].var},
FnProperty::kNormal,
0,
"ResourceNativeRandomSetSeed");
}
// get the next resource in a round-robin manner
inline Resource GetNext() {
const size_t kMaxDigit = std::numeric_limits<size_t>::max() / 2;
size_t ptr = ++curr_ptr;
// reset the counter before it wraps around
// (in practice this rarely happens)
if (ptr > kMaxDigit) {
curr_ptr.store((ptr + 1) % sampler.size());
}
return resource[ptr % sampler.size()];
}
};
/*! \brief number of copies in CPU temp space */
int cpu_temp_space_copy_;
/*! \brief number of copies in GPU temp space */
int gpu_temp_space_copy_;
/*! \brief number of copies in CPU native random sampler */
int cpu_native_rand_copy_;
/*! \brief number of copies in GPU native random sampler */
int gpu_native_rand_copy_;
/*! \brief Reference to the engine */
std::shared_ptr<Engine> engine_ref_;
/*! \brief Reference to the storage */
std::shared_ptr<Storage> storage_ref_;
/*! \brief internal seed to the random number generator */
uint32_t global_seed_{static_cast<uint32_t>(time(nullptr))};
/*! \brief CPU random number resources */
std::unique_ptr<ResourceRandom<cpu>> cpu_rand_;
/*! \brief CPU temp space resources */
std::unique_ptr<ResourceTempSpace<ResourceRequest::kTempSpace>> cpu_space_;
/*! \brief CPU parallel random number resources */
std::unique_ptr<ResourceParallelRandom<cpu>> cpu_parallel_rand_;
#if MXNET_USE_CUDA
/*! \brief random number generator for GPU */
common::LazyAllocArray<ResourceRandom<gpu>> gpu_rand_;
/*! \brief temp space for GPU */
common::LazyAllocArray<ResourceTempSpace<ResourceRequest::kTempSpace>> gpu_space_;
/*! \brief GPU parallel (on device) random number resources */
common::LazyAllocArray<ResourceParallelRandom<gpu>> gpu_parallel_rand_;
#if MXNET_USE_CUDNN == 1
/*! \brief number of copies in GPU cudnn dropout descriptor resources */
int gpu_cudnn_dropout_state_copy_;
/*! \brief GPU cuDNN dropout state resources */
common::LazyAllocArray<ResourceCUDNNDropout> gpu_cudnn_dropout_state_;
#endif // MXNET_USE_CUDNN == 1
#endif
};
} // namespace resource
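// Obtain temporary device workspace of at least `size` bytes from the
// SpaceAllocator backing this resource.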
void* Resource::get_space_internal(size_t size, const std::string& name) const {
return static_cast<resource::SpaceAllocator*>(ptr_)->GetSpace(size, name);
}
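// Obtain temporary host (CPU) workspace of at least `size` bytes.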
void* Resource::get_host_space_internal(size_t size) const {
return static_cast<resource::SpaceAllocator*>(ptr_)->GetHostSpace(size);
}
#if MXNET_USE_CUDNN == 1
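// Fill `dropout_desc` for the requested dropout probability. For dropout <= 0
// no state is needed; otherwise the descriptor is restored from the
// pre-seeded dropout state held by this resource.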
void Resource::get_cudnn_dropout_desc(cudnnDropoutDescriptor_t* dropout_desc,
mshadow::Stream<gpu>* stream,
const float dropout,
const std::string& name) const {
CHECK_EQ(req.type, ResourceRequest::kCuDNNDropoutDesc);
auto state_space = static_cast<resource::SpaceAllocator*>(ptr_);
CHECK_EQ(state_space->ctx.dev_id, stream->dev_id)
<< "The device id of cudnn dropout state space doesn't match that from stream.";
if (dropout <= 0) {
CUDNN_CALL(
cudnnSetDropoutDescriptor(*dropout_desc, stream->dnn_handle_, dropout, nullptr, 0, 0));
} else {
CHECK(state_space->handle.size > 0) << "CUDNN dropout descriptor was not initialized yet!";
// cudnnRestoreDropoutDescriptor() introduced with cuDNN v7
STATIC_ASSERT_CUDNN_VERSION_GE(7000);
CUDNN_CALL(cudnnRestoreDropoutDescriptor(*dropout_desc,
stream->dnn_handle_,
dropout,
state_space->handle.dptr,
state_space->handle.size,
0));
}
}
#endif // MXNET_USE_CUDNN == 1
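// Return the thread-local ResourceManager instance.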
ResourceManager* ResourceManager::Get() {
typedef dmlc::ThreadLocalStore<resource::ResourceManagerImpl> inst;
return inst::Get();
}
} // namespace mxnet