| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| #include <mxnet/rtc.h> |
| #include <typeinfo> |
| |
| #include "cuda/utils.h" |
| #include "../operator/operator_common.h" |
| |
| #if MXNET_USE_CUDA |
| |
| namespace mxnet { |
| namespace rtc { |
| |
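// Compiles `source` with NVRTC and remembers the exported kernel names.
// Exported names are registered via nvrtcAddNameExpression so that their
// mangled (lowered) names can be looked up after compilation; kernels
// declared extern "C" need no export.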
| CudaModule::Chunk::Chunk(const char* source, |
| const std::vector<std::string>& options, |
| const std::vector<std::string>& exports) { |
| NVRTC_CALL(nvrtcCreateProgram(&prog_, source, "source.cu", 0, nullptr, nullptr)); |
| for (const auto& i : exports) |
| exports_.insert(i); |
| #if CUDA_VERSION >= 8000 |
| for (const auto& func : exports) { |
| NVRTC_CALL(nvrtcAddNameExpression(prog_, func.c_str())); |
| } |
| #else |
  CHECK_EQ(exports.size(), 0)
      << "Exporting is only supported with CUDA 8.0 and above. "
      << "For older CUDA versions, please prepend your kernel definitions "
      << "with extern \"C\" instead.";
| #endif |
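  // NVRTC expects the options as a plain array of C strings; on failure,
  // surface the full compilation log instead of just the error code.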
| std::vector<const char*> c_options; |
| for (const auto& i : options) |
| c_options.push_back(i.c_str()); |
  nvrtcResult compile_res =
      nvrtcCompileProgram(prog_, static_cast<int>(c_options.size()), c_options.data());
| if (compile_res != NVRTC_SUCCESS) { |
| size_t err_size; |
| NVRTC_CALL(nvrtcGetProgramLogSize(prog_, &err_size)); |
| std::vector<char> err(err_size); |
| NVRTC_CALL(nvrtcGetProgramLog(prog_, err.data())); |
| LOG(FATAL) << err.data(); |
| } |
| |
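  // NVRTC emits PTX when targeting a virtual architecture (compute_XX) and a
  // CUBIN when targeting a real one (sm_XX); the latter requires CUDA 11.1+.
  // Either image is stored in ptx_ and later loaded via cuModuleLoadDataEx.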
| bool use_ptx = true; |
| for (const auto& opt : options) { |
| if (opt.find("sm_") != std::string::npos) { |
| use_ptx = false; |
| break; |
| } |
| } |
| |
| if (use_ptx) { |
| size_t ptx_size; |
| NVRTC_CALL(nvrtcGetPTXSize(prog_, &ptx_size)); |
| ptx_.resize(ptx_size); |
| NVRTC_CALL(nvrtcGetPTX(prog_, ptx_.data())); |
| } else { |
| #if CUDA_VERSION >= 11010 |
| size_t cubin_size; |
| NVRTC_CALL(nvrtcGetCUBINSize(prog_, &cubin_size)); |
| ptx_.resize(cubin_size); |
| NVRTC_CALL(nvrtcGetCUBIN(prog_, ptx_.data())); |
| #else |
| LOG(FATAL) << "Your CUDA version does not support compiling for sm_XX target. " |
| << "Use compute_XX target instead or upgrade to CUDA 11.1 or later."; |
| #endif |
| } |
| } |
| |
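// Unloads the per-device CUmodules created by GetFunction and destroys the
// underlying NVRTC program.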
| CudaModule::Chunk::~Chunk() { |
| for (const auto& kv : mod_) { |
| CUDA_DRIVER_CALL(cuModuleUnload(kv.second)); |
| } |
| NVRTC_CALL(nvrtcDestroyProgram(&prog_)); |
| } |
| |
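// Returns the CUfunction for `mangled_name` on the device of `ctx`, loading
// the compiled image into a per-device CUmodule on first use (after setting
// the active device) and caching it in mod_ for subsequent lookups.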
| CUfunction CudaModule::Chunk::GetFunction(const std::string& mangled_name, const Context& ctx) { |
  CHECK_EQ(ctx.dev_mask(), Context::kGPU)
      << "CUDA runtime compilation is only supported on NVIDIA GPUs.";
| auto iter = mod_.find(ctx.dev_id); |
| mxnet::common::cuda::DeviceStore device_store; |
| CUmodule module; |
| if (iter != mod_.end()) { |
| module = iter->second; |
| } else { |
| device_store.SetDevice(ctx.dev_id); |
| CUDA_DRIVER_CALL(cuModuleLoadDataEx(&module, ptx_.data(), 0, nullptr, nullptr)); |
| mod_[ctx.dev_id] = module; |
| } |
| CUfunction function; |
| auto err = cuModuleGetFunction(&function, module, mangled_name.c_str()); |
| if (err == CUDA_ERROR_NOT_FOUND) { |
| LOG(FATAL) << "Cannot find cuda kernel with name '" << mangled_name |
| << "'. Please either prepend kernel definition " |
| << "with 'extern \"C\"' or add its name to exports " |
| << "when creating CudaModule."; |
| } |
| CUDA_DRIVER_CALL(err); |
| return function; |
| } |
| |
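// Creates a Kernel handle for `name`. If the name was exported at
// compilation time, its C++-mangled (lowered) name is resolved through
// nvrtcGetLoweredName; extern "C" kernels are looked up verbatim.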
| std::shared_ptr<CudaModule::Kernel> CudaModule::GetKernel(const std::string& name, |
| const std::vector<ArgType>& signature) { |
| std::string mangled_name = name; |
| #if CUDA_VERSION >= 8000 |
| if (ptr_->exports_.count(name)) { |
| const char* c_mangled_name; |
| NVRTC_CALL(nvrtcGetLoweredName(ptr_->prog_, name.c_str(), &c_mangled_name)); |
| mangled_name = c_mangled_name; |
| } |
| #endif |
| return std::shared_ptr<Kernel>(new Kernel(ptr_, mangled_name, signature)); |
| } |
| |
| CudaModule::Kernel::Kernel(const std::shared_ptr<CudaModule::Chunk>& mod, |
| const std::string& mangled_name, |
| const std::vector<ArgType>& signature) |
| : mangled_name_(mangled_name), signature_(signature), mod_(mod) {} |
| |
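// Launches the kernel through the engine: validates the context, resolves
// (and caches) the per-device CUfunction, records NDArray arguments as
// engine read/write dependencies, and pushes the actual cuLaunchKernel call
// to the execution engine.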
| void CudaModule::Kernel::Launch(const Context& ctx, |
| const std::vector<dmlc::any>& args, |
| uint32_t grid_dim_x, |
| uint32_t grid_dim_y, |
| uint32_t grid_dim_z, |
| uint32_t block_dim_x, |
| uint32_t block_dim_y, |
| uint32_t block_dim_z, |
| uint32_t shared_mem) { |
  CHECK_EQ(ctx.dev_mask(), Context::kGPU)
      << "CUDA runtime compilation is only supported on NVIDIA GPUs.";
| |
| auto mod = mod_; |
| auto arg_types = signature(); |
| |
| CUfunction function; |
| auto iter = func_.find(ctx.dev_id); |
| if (iter != func_.end()) { |
| function = iter->second; |
| } else { |
| function = mod_->GetFunction(mangled_name_, ctx); |
| func_[ctx.dev_id] = function; |
| } |
| |
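  // Record engine dependencies: const NDArray arguments are read from,
  // non-const ones are written to, so the engine can order this launch
  // against other operations touching the same arrays.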
| std::vector<Engine::VarHandle> read_vars, write_vars; |
| for (size_t i = 0; i < arg_types.size(); ++i) { |
| if (!arg_types[i].is_ndarray) |
| continue; |
| const auto& array = dmlc::get<NDArray>(args[i]); |
    CHECK_EQ(array.dtype(), arg_types[i].dtype)
        << "The " << i << "-th argument is expected to be an NDArray of "
        << op::type_string(arg_types[i].dtype) << " type, but got "
        << op::type_string(array.dtype()) << " instead.";
| if (arg_types[i].is_const) { |
| read_vars.emplace_back(array.var()); |
| } else { |
| write_vars.emplace_back(array.var()); |
| } |
| } |
| |
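  // The lambda captures `args` and `mod` by value so the NDArrays (and the
  // module that owns the loaded code) stay alive until the kernel finishes.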
| Engine::Get()->PushSync( |
| [function, |
| mod, |
| args, |
| arg_types, |
| grid_dim_x, |
| grid_dim_y, |
| grid_dim_z, |
| block_dim_x, |
| block_dim_y, |
| block_dim_z, |
| shared_mem](RunContext rctx) { |
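        // cuLaunchKernel expects an array of pointers, one per kernel
        // parameter: for NDArrays, a pointer to the device pointer; for
        // scalars, a pointer to the value stored in the captured `args`.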
| std::vector<void*> p_args; |
| for (size_t i = 0; i < arg_types.size(); ++i) { |
| if (arg_types[i].is_ndarray) { |
| const auto& array = dmlc::get<NDArray>(args[i]); |
| p_args.push_back(reinterpret_cast<void*>(const_cast<void**>(&array.data().dptr_))); |
| } else { |
| MSHADOW_TYPE_SWITCH(arg_types[i].dtype, DType, { |
| const auto& number = dmlc::get<DType>(args[i]); |
| p_args.push_back(const_cast<DType*>(&number)); |
| }); |
| } |
| } |
| |
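        // Launch on this context's stream and block until completion so the
        // engine releases the read/write dependencies only after the kernel
        // has actually finished.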
| mshadow::Stream<gpu>* s = rctx.get_stream<gpu>(); |
| CUDA_DRIVER_CALL(cuLaunchKernel(function, |
| grid_dim_x, |
| grid_dim_y, |
| grid_dim_z, |
| block_dim_x, |
| block_dim_y, |
| block_dim_z, |
| shared_mem, |
| s->stream_, |
| p_args.data(), |
| nullptr)); |
| CUDA_CALL(cudaStreamSynchronize(s->stream_)); |
| }, |
| ctx, |
| read_vars, |
| write_vars, |
| FnProperty::kNormal, |
| 0, |
| mangled_name_.c_str()); |
| } |
| |
| } // namespace rtc |
| } // namespace mxnet |
| |
| #endif // MXNET_USE_CUDA |