blob: ea20a60948c5c08b95cc27bc3111e65262208eb8 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#include <mxnet/rtc.h>
#include <typeinfo>
#include "../common/cuda_utils.h"
#include "../operator/operator_common.h"
#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC
namespace mxnet {
namespace rtc {
CudaModule::Chunk::Chunk(
const char* source,
const std::vector<std::string>& options,
const std::vector<std::string>& exports) {
NVRTC_CALL(nvrtcCreateProgram(&prog_, source, "source.cu", 0, NULL, NULL));
for (const auto& i : exports) exports_.insert(i);
#if CUDA_VERSION >= 8000
for (const auto& func : exports) {
NVRTC_CALL(nvrtcAddNameExpression(prog_, func.c_str()));
}
#else
CHECK_EQ(exports.size(), 0)
<< "Exporting is only supported with CUDA 8.0 and above. "
<< "For lower version of CUDA, please prepend your kernel defintiions "
<< "with extern \"C\" instead.";
#endif
std::vector<const char*> c_options;
for (const auto& i : options) c_options.push_back(i.c_str());
nvrtcResult compile_res = nvrtcCompileProgram(prog_, c_options.size(), c_options.data());
if (compile_res != NVRTC_SUCCESS) {
size_t err_size;
NVRTC_CALL(nvrtcGetProgramLogSize(prog_, &err_size));
std::vector<char> err(err_size);
NVRTC_CALL(nvrtcGetProgramLog(prog_, err.data()));
LOG(FATAL) << err.data();
}
size_t ptx_size;
NVRTC_CALL(nvrtcGetPTXSize(prog_, &ptx_size));
ptx_ = new char[ptx_size];
NVRTC_CALL(nvrtcGetPTX(prog_, ptx_));
}
CudaModule::Chunk::~Chunk() {
for (const auto& kv : mod_) {
CUDA_DRIVER_CALL(cuModuleUnload(kv.second));
}
NVRTC_CALL(nvrtcDestroyProgram(&prog_));
delete ptx_;
}
CUfunction CudaModule::Chunk::GetFunction(
const std::string& mangled_name,
const Context& ctx) {
CHECK_EQ(ctx.dev_mask(), Context::kGPU)
<< "CUDA Runtime compilation only supports Nvidia GPU.";
auto iter = mod_.find(ctx.dev_id);
mxnet::common::cuda::DeviceStore device_store;
CUmodule module;
if (iter != mod_.end()) {
module = iter->second;
} else {
device_store.SetDevice(ctx.dev_id);
CUDA_DRIVER_CALL(cuModuleLoadDataEx(&module, ptx_, 0, 0, 0));
mod_[ctx.dev_id] = module;
}
CUfunction function;
auto err = cuModuleGetFunction(&function, module, mangled_name.c_str());
if (err == CUDA_ERROR_NOT_FOUND) {
LOG(FATAL) << "Cannot find cuda kernel with name '" << mangled_name
<< "'. Please either prepend kernel definition "
<< "with 'extern \"C\"' or add its name to exports "
<< "when creating CudaModule.";
}
CUDA_DRIVER_CALL(err);
return function;
}
std::shared_ptr<CudaModule::Kernel> CudaModule::GetKernel(
const std::string& name, const std::vector<ArgType>& signature) {
std::string mangled_name = name;
#if CUDA_VERSION >= 8000
if (ptr_->exports_.count(name)) {
const char * c_mangled_name;
NVRTC_CALL(nvrtcGetLoweredName(ptr_->prog_, name.c_str(), &c_mangled_name));
mangled_name = c_mangled_name;
}
#endif
return std::shared_ptr<Kernel>(new Kernel(ptr_, mangled_name, signature));
}
CudaModule::Kernel::Kernel(
const std::shared_ptr<CudaModule::Chunk>& mod,
const std::string& mangled_name,
const std::vector<ArgType>& signature)
: mangled_name_(mangled_name), signature_(signature), mod_(mod) {
}
void CudaModule::Kernel::Launch(
const Context& ctx, const std::vector<dmlc::any>& args,
uint32_t grid_dim_x, uint32_t grid_dim_y, uint32_t grid_dim_z,
uint32_t block_dim_x, uint32_t block_dim_y, uint32_t block_dim_z,
uint32_t shared_mem) {
CHECK_EQ(ctx.dev_mask(), Context::kGPU)
<< "CUDA Runtime compilation only supports Nvidia GPU.";
auto mod = mod_;
auto arg_types = signature();
CUfunction function;
auto iter = func_.find(ctx.dev_id);
if (iter != func_.end()) {
function = iter->second;
} else {
function = mod_->GetFunction(mangled_name_, ctx);
func_[ctx.dev_id] = function;
}
std::vector<Engine::VarHandle> read_vars, write_vars;
for (size_t i = 0; i < arg_types.size(); ++i) {
if (!arg_types[i].is_ndarray) continue;
const auto& array = dmlc::get<NDArray>(args[i]);
CHECK_EQ(array.dtype(), arg_types[i].dtype)
<< "The i-th argument is expected to be an NDArray of "
<< op::type_string(arg_types[i].dtype) << " type, but got "
<< op::type_string(array.dtype()) << " instead.";
if (arg_types[i].is_const) {
read_vars.emplace_back(array.var());
} else {
write_vars.emplace_back(array.var());
}
}
Engine::Get()->PushSync(
[function, mod, args, arg_types, grid_dim_x, grid_dim_y, grid_dim_z,
block_dim_x, block_dim_y, block_dim_z, shared_mem](RunContext rctx) {
std::vector<void*> p_args;
for (size_t i = 0; i < arg_types.size(); ++i) {
if (arg_types[i].is_ndarray) {
const auto& array = dmlc::get<NDArray>(args[i]);
p_args.push_back(reinterpret_cast<void*>(const_cast<void**>(&array.data().dptr_)));
} else {
MSHADOW_TYPE_SWITCH(arg_types[i].dtype, DType, {
const auto& number = dmlc::get<DType>(args[i]);
p_args.push_back(const_cast<DType*>(&number));
});
}
}
mshadow::Stream<gpu> *s = rctx.get_stream<gpu>();
CUDA_DRIVER_CALL(cuLaunchKernel(
function, grid_dim_x, grid_dim_y, grid_dim_z,
block_dim_x, block_dim_y, block_dim_z,
shared_mem, s->stream_,
p_args.data(), 0));
CUDA_CALL(cudaStreamSynchronize(s->stream_));
}, ctx, read_vars, write_vars, FnProperty::kNormal, 0,
mangled_name_.c_str());
}
} // namespace rtc
} // namespace mxnet
#endif // MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC