| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| /*! |
| * Copyright (c) 2020 by Contributors |
| * \file cuda_graphs.h |
| * \brief Wrappers for use of the CUDA Graphs API |
| */ |
| #ifndef MXNET_IMPERATIVE_CUDA_GRAPHS_H_ |
| #define MXNET_IMPERATIVE_CUDA_GRAPHS_H_ |
| |
| #include <mxnet/base.h> |
| #include <vector> |
| #include <string> |
| #include <map> |
| #include <set> |
| #include <sstream> |
| |
| #include "./exec_pass.h" |
| #include "../common/cuda/utils.h" |
| |
| #if MXNET_USE_CUDA |
| #define CUDA_GRAPHS_AVAILABLE (CUDA_VERSION >= 10020) |
| #else |
| #define CUDA_GRAPHS_AVAILABLE (0) |
| #endif |
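| |
| // CUDA Graphs require the CUDA 10.2 toolkit or newer (CUDA_VERSION >= 10020). Even when |
| // compiled in, the feature is opt-in at runtime via the MXNET_ENABLE_CUDA_GRAPHS environment |
| // variable (with MXNET_CUDA_GRAPHS_VERBOSE for extra logging); see CudaGraphsExec below. |
| // Illustrative shell invocation (a sketch; 'train.py' stands in for any MXNet GPU workload): |
| //   MXNET_ENABLE_CUDA_GRAPHS=1 MXNET_CUDA_GRAPHS_VERBOSE=1 python train.py |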
| |
| #if CUDA_GRAPHS_AVAILABLE |
| |
| namespace mxnet { |
| namespace cuda_graphs { |
| |
| inline std::string CudaDim3ToString(const dim3& dims) { |
| std::stringstream ss; |
| if (dims.z != 1) |
| ss << "(" << dims.x << "," << dims.y << "," << dims.z << ")"; |
| else if (dims.y != 1) |
| ss << "(" << dims.x << "," << dims.y << ")"; |
| else |
| ss << "(" << dims.x << ")"; |
| return ss.str(); |
| } |
| |
| // Return the list of CUDA Graph nodes from a graph |
| inline std::vector<cudaGraphNode_t> GetCudaGraphNodes(cudaGraph_t cuda_graph) { |
| size_t numNodes; |
| CUDA_CALL(cudaGraphGetNodes(cuda_graph, static_cast<cudaGraphNode_t*>(nullptr), &numNodes)); |
| if (numNodes == 0) |
| return std::vector<cudaGraphNode_t>(); |
| std::vector<cudaGraphNode_t> graphNodes(numNodes); |
| CUDA_CALL(cudaGraphGetNodes(cuda_graph, graphNodes.data(), &numNodes)); |
| return graphNodes; |
| } |
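| |
| // Illustrative use (a sketch mirroring the verbose logging in CudaGraphsSubSegExec::MakeGraph() |
| // further below; 'captured_graph' is an assumed cudaGraph_t handle): the node list returned |
| // here is typically passed to CudaGraphNodeToString() below, e.g. |
| //   for (const auto& node : GetCudaGraphNodes(captured_graph)) |
| //     LOG(INFO) << CudaGraphNodeToString(node); |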
| |
| // Create a description of a CUDA Graph node |
| inline std::string CudaGraphNodeToString(const cudaGraphNode_t node) { |
| std::stringstream ss; |
| |
| // The following introspection calls are made through the driver API in order to bypass |
| // problems that would arise if multiple statically-linked copies of the runtime exist. |
| |
| CUgraphNode cu_node = node; |
| CUgraphNodeType t; |
| CUDA_DRIVER_CALL(cuGraphNodeGetType(cu_node, &t)); |
| switch (t) { |
| case CU_GRAPH_NODE_TYPE_KERNEL: { |
| CUDA_KERNEL_NODE_PARAMS kparams; |
| auto err = cuGraphKernelNodeGetParams(cu_node, &kparams); |
| if (err == CUDA_SUCCESS) { |
| ss << "GPUKernel@" << kparams.func; |
| dim3 gridDim(kparams.gridDimX, kparams.gridDimY, kparams.gridDimZ); |
| dim3 blockDim(kparams.blockDimX, kparams.blockDimY, kparams.blockDimZ); |
| ss << "<<<gridDim=" << CudaDim3ToString(gridDim) |
| << ", blkDim=" << CudaDim3ToString(blockDim) << ">>>"; |
| ss << "(..."; |
| if (kparams.sharedMemBytes != 0) |
| ss << ", dynSharedMemBytes=" << kparams.sharedMemBytes; |
| ss << ")"; |
| } else { |
| ss << "GPU Kernel: cuGraphKernelNodeGetParams() fails with " << err; |
| } |
| } break; |
| case CU_GRAPH_NODE_TYPE_MEMCPY: { |
| cudaMemcpy3DParms mparams = {}; |
| CUDA_CALL(cudaGraphMemcpyNodeGetParams(node, &mparams)); |
| // Describe the direction of the memcpy |
| switch (mparams.kind) { |
| case cudaMemcpyHostToHost: |
| ss << "Host->Host "; |
| break; |
| case cudaMemcpyHostToDevice: |
| ss << "Host->Device "; |
| break; |
| case cudaMemcpyDeviceToHost: |
| ss << "Device->Host "; |
| break; |
| case cudaMemcpyDeviceToDevice: |
| ss << "Device->Device "; |
| break; |
| default: |
| break; |
| } |
| ss << "Memcpy"; |
| } break; |
| case CU_GRAPH_NODE_TYPE_MEMSET: { |
| cudaMemsetParams mparams = {}; |
| CUDA_CALL(cudaGraphMemsetNodeGetParams(node, &mparams)); |
| if (mparams.height == 1 && mparams.elementSize == 1) { |
| ss << "cudaMemset(devPtr=" << mparams.dst << ", value=" << mparams.value |
| << ", count=" << mparams.width << ")"; |
| } else { |
| if (mparams.elementSize == 1) |
| ss << "cudaMemset2D"; |
| else |
| ss << "MemSet<elemBytes=" << mparams.elementSize << ">"; |
| ss << "(devPtr=" << mparams.dst << ", pitch=" << mparams.pitch |
| << ", value=" << mparams.value << ", width=" << mparams.width |
| << ", height=" << mparams.height << ")"; |
| } |
| } break; |
| case CU_GRAPH_NODE_TYPE_HOST: |
| ss << "Host (executable) node"; |
| break; |
| case CU_GRAPH_NODE_TYPE_GRAPH: |
| ss << "Node which executes an embedded graph"; |
| break; |
| case CU_GRAPH_NODE_TYPE_EMPTY: |
| ss << "Empty (no-op) node"; |
| break; |
| default: |
| ss << "Unknown/Invalid node type " << t; |
| } |
| return ss.str(); |
| } |
| |
| // CUDA Graphs are managed in RAII fashion by smart pointers below. |
| // Function objects (preferred for readability) provide the deleter function. |
| class CudaGraphDeleter { |
| public: |
| void operator()(cudaGraph_t graph) { |
| if (graph != nullptr) |
| CUDA_CALL(cudaGraphDestroy(graph)); |
| } |
| }; |
| |
| // CUDA Graphs Executors are managed in RAII fashion by smart pointers below. |
| // Function objects (preferred for readability) provide the deleter function. |
| class CudaGraphExecDeleter { |
| public: |
| void operator()(cudaGraphExec_t graph_exec) { |
| if (graph_exec != nullptr) |
| CUDA_CALL(cudaGraphExecDestroy(graph_exec)); |
| } |
| }; |
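| |
| // Illustrative sketch of the RAII pattern used in MakeGraph()/MakeGraphExec() below: the raw |
| // handle produced by stream capture is handed to a shared_ptr together with the matching |
| // deleter, so cudaGraphDestroy()/cudaGraphExecDestroy() run automatically ('cu_s' is an |
| // assumed capture stream): |
| //   cudaGraph_t raw_graph = nullptr; |
| //   CUDA_CALL(cudaStreamEndCapture(cu_s, &raw_graph)); |
| //   std::shared_ptr<std::remove_pointer<cudaGraph_t>::type> graph(raw_graph, CudaGraphDeleter()); |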
| |
| // A CUDA Graphs executor for a portion of an Operator Segment (i.e. a 'SubSegment'), |
| // characterized by a starting index in the OpExecutor list and a number of ops. |
| class CudaGraphsSubSegExec { |
| public: |
| CudaGraphsSubSegExec(const std::vector<std::shared_ptr<exec::OpExecutor>>& exec_list, |
| const RunContext& rctx, |
| bool is_gpu, |
| bool verbose, |
| int from_op_idx, |
| int num_ops, |
| bool ops_are_cuda_graph_compatible = true) |
| : from_op_idx_(from_op_idx), |
| num_ops_(num_ops), |
| graph_(nullptr), |
| graph_exec_(nullptr), |
| graph_exec_id_(0) { |
| if (ops_are_cuda_graph_compatible) { |
| MakeGraph(exec_list, rctx, is_gpu, verbose, from_op_idx, num_ops); |
| MakeGraphExec(exec_list, rctx); |
| } |
| } |
| |
| void Update(const std::vector<std::shared_ptr<exec::OpExecutor>>& exec_list, |
| const RunContext& rctx, |
| bool is_gpu, |
| bool verbose) { |
| // Current executor should be Runnable with the same parameters |
| CHECK(IsRunnable()); |
| MakeGraph(exec_list, rctx, is_gpu, verbose, from_op_idx_, num_ops_); |
| |
| cudaGraphExecUpdateResult update_result = cudaGraphExecUpdateError; |
| cudaGraphNode_t error_node; |
| cudaError_t err = |
| cudaGraphExecUpdate(graph_exec_.get(), graph_.get(), &error_node, &update_result); |
| switch (err) { |
| case cudaErrorGraphExecUpdateFailure: |
| MakeGraphExec(exec_list, rctx); |
| break; |
| case cudaSuccess: |
| CHECK_EQ(update_result, cudaGraphExecUpdateSuccess); |
| break; |
| default: |
| // Report any other unexpected cudaGraphExecUpdate() return values via the standard error check |
| CUDA_CALL(err); |
| } |
| } |
| |
| void RunSubSeg(const std::vector<std::shared_ptr<exec::OpExecutor>>& exec_list, |
| const RunContext& rctx, |
| bool is_gpu) { |
| if (IsRunnable()) { |
| auto s = rctx.get_stream<gpu>(); |
| const cudaStream_t cu_s = mshadow::Stream<gpu>::GetStream(s); |
| CUDA_CALL(cudaGraphLaunch(graph_exec_.get(), cu_s)); |
| } else { |
| // No CUDA Graph could be made for this portion of the OpSegment. Run conventionally. |
| for (int i = 0; i != num_ops_; ++i) |
| exec_list[from_op_idx_ + i]->Run(rctx, is_gpu); |
| } |
| } |
| |
| bool IsRunnable() { |
| return graph_exec_ != nullptr; |
| } |
| |
| int NumGraphNodes() { |
| size_t numNodes; |
| CUDA_CALL(cudaGraphGetNodes(graph_.get(), static_cast<cudaGraphNode_t*>(nullptr), &numNodes)); |
| return static_cast<int>(numNodes); |
| } |
| |
| private: |
| void MakeGraph(const std::vector<std::shared_ptr<exec::OpExecutor>>& exec_list, |
| const RunContext& rctx, |
| bool is_gpu, |
| bool verbose, |
| int from_op_idx, |
| int num_ops) { |
| auto s = rctx.get_stream<gpu>(); |
| const cudaStream_t cu_s = mshadow::Stream<gpu>::GetStream(s); |
| // Create CUDA Graph |
| // Use of cudaStreamCaptureModeThreadLocal allows other threads like GPU Copy workers |
| // to sync their streams without disturbing this capture. |
| CUDA_CALL(cudaStreamBeginCapture(cu_s, cudaStreamCaptureModeThreadLocal)); |
| // Run the ops in the sub-segment while capturing; no actual GPU work is launched. |
| for (int i = 0; i != num_ops; ++i) |
| exec_list[from_op_idx + i]->Run(rctx, is_gpu); |
| cudaGraph_t cuda_graph = nullptr; |
| CUDA_CALL(cudaStreamEndCapture(cu_s, &cuda_graph)); |
| graph_.reset(cuda_graph, CudaGraphDeleter()); |
| |
| if (verbose) { |
| std::vector<cudaGraphNode_t> graph_nodes = GetCudaGraphNodes(cuda_graph); |
| size_t num_nodes = graph_nodes.size(); |
| LOG(INFO) << " Graph has " << num_nodes << " nodes:"; |
| for (size_t i = 0; i != num_nodes; ++i) { |
| LOG(INFO) << " node " << i << " = " << CudaGraphNodeToString(graph_nodes[i]); |
| } |
| } |
| } |
| |
| void MakeGraphExec(const std::vector<std::shared_ptr<exec::OpExecutor>>& exec_list, |
| const RunContext& rctx) { |
| // Note that this routine is not invoked when a graph executor is merely updated. |
| cudaGraphExec_t cuda_graph_exec; |
| cudaGraphNode_t error_node; |
| char log_buffer[1000]; |
| |
| CUDA_CALL(cudaGraphInstantiate(&cuda_graph_exec, graph_.get(), &error_node, log_buffer, sizeof(log_buffer))); |
| graph_exec_.reset(cuda_graph_exec, CudaGraphExecDeleter()); |
| |
| // At this point we have a CUDA Graph executor |
| static int num_graph_creations = 0; |
| graph_exec_id_ = num_graph_creations++; |
| |
| static size_t max_log_entries = dmlc::GetEnv("MXNET_CUDA_GRAPHS_MAX_LOG_ENTRIES", 0); |
| if (graph_exec_id_ < max_log_entries) { |
| LOG(INFO) << "Created CUDA graph " << graph_exec_id_; |
| if (num_graph_creations == max_log_entries) |
| LOG(INFO) << "Further CUDA graph creation log messages are suppressed."; |
| } |
| // Create a .dot file for graph visualization if requested |
| static std::string dotfile_base = dmlc::GetEnv("MXNET_CUDA_GRAPHS_DBG_FILE", std::string()); |
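| // Illustrative example (the path is an assumption): MXNET_CUDA_GRAPHS_DBG_FILE=/tmp/graphs |
| // yields files such as /tmp/graphs-dev0-trn-3.dot, per the filename construction below. |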
| if (dotfile_base.size() > 0) { |
| #if CUDA_VERSION >= 11030 |
| static int dotfile_flags = dmlc::GetEnv("MXNET_CUDA_GRAPHS_DBG_FILE_FLAGS", |
| static_cast<int>(cudaGraphDebugDotFlagsVerbose)); |
| std::ostringstream filename; |
| const bool is_train = exec_list.size() > 0 && exec_list[0]->op_ctx.is_train; |
| int dev_id = rctx.ctx.dev_id; |
| filename << dotfile_base << "-" |
| << "dev" << dev_id << "-" << (is_train ? "trn" : "inf") << "-" << graph_exec_id_ |
| << ".dot"; |
| CUDA_CALL(cudaGraphDebugDotPrint(graph_.get(), filename.str().c_str(), dotfile_flags)); |
| #else |
| [[maybe_unused]] static bool dot_file_unsupported = []() { // NOLINT |
| LOG(INFO) << "MXNET_CUDA_GRAPHS_DBG_FILE setting ignored- requires CUDA version >= 11.3"; |
| return true; |
| }(); |
| #endif // CUDA_VERSION >= 11030 |
| } |
| } |
| |
| int from_op_idx_; |
| int num_ops_; |
| using cudaGraphStruct_t = typename std::remove_pointer<cudaGraph_t>::type; |
| using cudaGraphExecStruct_t = typename std::remove_pointer<cudaGraphExec_t>::type; |
| std::shared_ptr<cudaGraphStruct_t> graph_; |
| std::shared_ptr<cudaGraphExecStruct_t> graph_exec_; |
| size_t graph_exec_id_; |
| }; |
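| |
| // Illustrative lifecycle of a CudaGraphsSubSegExec (a sketch of how CudaGraphsExec below drives |
| // it; 'exec_list' and 'rctx' are assumed to be the segment's executors and run context): |
| //   CudaGraphsSubSegExec subseg(exec_list, rctx, /*is_gpu=*/true, /*verbose=*/false, |
| //                               /*from_op_idx=*/0, /*num_ops=*/static_cast<int>(exec_list.size())); |
| //   // Launches the captured graph if capture succeeded, otherwise runs the ops conventionally. |
| //   subseg.RunSubSeg(exec_list, rctx, /*is_gpu=*/true); |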
| |
| // The CudaGraph executor and associated Tempspace ptrs for which it is valid. |
| struct CudaGraphInfo { |
| std::vector<CudaGraphsSubSegExec> cuda_graph_subseg_execs; |
| bool has_been_run_conventionally = false; |
| std::vector<void*> tempspace_dptrs; |
| }; |
| // A CUDA graph is maintained for every combination of cudaStream_t (i.e. GPU worker) and |
| // the state of the is_train flag of the OpContext. If the tempspace_dptrs change, we don't |
| // expect to ever see the old tempspace_dptrs config again, so the cached graph executors are |
| // updated in place via recapture (see RunAll below) rather than reused as-is. |
| struct CudaGraphCacheKey { |
| cudaStream_t cu_s; |
| bool is_train; |
| // overload '<' so CudaGraphCacheKey can be used as a std::map key |
| bool operator<(const CudaGraphCacheKey& other) const { |
| return cu_s < other.cu_s || (cu_s == other.cu_s && is_train < other.is_train); |
| } |
| }; |
| using CudaGraphCache = std::map<CudaGraphCacheKey, CudaGraphInfo>; |
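| |
| // Illustrative cache lookup (a sketch of the pattern used in CudaGraphsExec::RunAll() below): |
| //   CudaGraphCacheKey key = {cu_s, is_train}; |
| //   CudaGraphInfo& info = cache[key];  // default-constructs a fresh entry on first use |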
| |
| class CudaGraphsExec { |
| public: |
| CudaGraphsExec(const std::vector<std::shared_ptr<exec::OpExecutor>>& exec_list, |
| bool is_gpu, |
| const char* opr_names) |
| : verbose_(false), is_enabled_(false) { |
| opr_names_ = opr_names ? std::string(opr_names) : std::string(); |
| if (is_gpu) { |
| is_enabled_ = dmlc::GetEnv("MXNET_ENABLE_CUDA_GRAPHS", false); |
| verbose_ = dmlc::GetEnv("MXNET_CUDA_GRAPHS_VERBOSE", false); |
| SetTempSpaces(exec_list); |
| } |
| } |
| |
| void RunAll(const std::vector<std::shared_ptr<exec::OpExecutor>>& exec_list, |
| const RunContext& rctx, |
| bool is_gpu) { |
| // If this is a CPU op or CUDA Graphs use isn't possible, run normally and return |
| if (!is_gpu || !is_enabled_) { |
| // Run all ops in the sub-graph |
| exec::OpExecutor::RunAll(exec_list, rctx, is_gpu); |
| return; |
| } |
| |
| // Also if we're in a warm-up period where tempspace pointers are likely |
| // to change, run normally and return |
| auto s = rctx.get_stream<gpu>(); |
| const cudaStream_t cu_s = mshadow::Stream<gpu>::GetStream(s); |
| // All the ops in the bulked segment will have the same setting of is_train as the first op |
| const bool is_train = exec_list.size() > 0 && exec_list[0]->op_ctx.is_train; |
| const CudaGraphCacheKey key = {cu_s, is_train}; |
| // Look-up the CUDA Graph info for this combo of stream and is_train setting |
| // This may create a default-initialized new entry. |
| auto& cuda_graph_info = cache_[key]; |
| if (!cuda_graph_info.has_been_run_conventionally) { |
| // Run all ops in the sub-graph |
| exec::OpExecutor::RunAll(exec_list, rctx, is_gpu); |
| cuda_graph_info.has_been_run_conventionally = true; |
| return; |
| } |
| |
| // At this point we will launch one or more CUDA Graphs through CUDA Graphs 'executors' |
| // (there might be more than one executor if some ops in the segment are not capturable) |
| auto before_exec_tempspace_ptrs = GetGPUTempspacePtrs(s); |
| |
| // Executors exist, but the tempspace ptrs have changed, so update them in-place via 'recapture'. |
| if (cuda_graph_info.cuda_graph_subseg_execs.size() > 0 && |
| cuda_graph_info.tempspace_dptrs != before_exec_tempspace_ptrs) { |
| // Update all runnable executors. Non-runnable executors launch their ops conventionally. |
| for (auto& subseg_exec : cuda_graph_info.cuda_graph_subseg_execs) { |
| if (subseg_exec.IsRunnable()) |
| subseg_exec.Update(exec_list, rctx, is_gpu, verbose_); |
| } |
| } else if (cuda_graph_info.cuda_graph_subseg_execs.size() == 0) { |
| // No executors exist yet, so create them. |
| if (verbose_) |
| LOG(INFO) << "Capturing CUDA graph of op segment " << opr_names_; |
| // Make one or more CUDA Graphs, avoiding ops that are not compatible. |
| for (size_t first_op_idx = 0; first_op_idx != exec_list.size();) { |
| int num_good_ops = 0; |
| for (size_t last_op_idx = first_op_idx; last_op_idx != exec_list.size(); ++last_op_idx) { |
| if (OpOK(exec_list[last_op_idx])) |
| num_good_ops++; |
| else |
| break; |
| } |
| if (num_good_ops > 0) { |
| CreateSubExecOverRegion(exec_list, |
| rctx, |
| is_gpu, |
| first_op_idx, |
| first_op_idx + num_good_ops, |
| &cuda_graph_info.cuda_graph_subseg_execs); |
| first_op_idx += num_good_ops; |
| } |
| if (first_op_idx != exec_list.size()) { |
| // We must have hit an op that was not OK. |
| if (verbose_) { |
| LOG(INFO) << "Bypassing notOK op segment[" << first_op_idx << "," << first_op_idx << "]" |
| << " of op segment " << opr_names_; |
| } |
| CudaGraphsSubSegExec notOK_opseg(exec_list, rctx, is_gpu, false, first_op_idx, 1, false); |
| cuda_graph_info.cuda_graph_subseg_execs.push_back(notOK_opseg); |
| first_op_idx++; |
| } |
| } |
| // During graph capture, the ops may ask for the temp workspace. This should |
| // not alter the base pointers, since this op segment has been executed before on this |
| // stream (i.e. on this GPU worker). Safest to double-check this though. |
| auto after_capture_tempspace_ptrs = GetGPUTempspacePtrs(s); |
| if (before_exec_tempspace_ptrs != after_capture_tempspace_ptrs) |
| LOG(FATAL) << "Internal error: saw change in TempSpace ptrs during CUDA graph use."; |
| cuda_graph_info.tempspace_dptrs = before_exec_tempspace_ptrs; |
| } |
| // Now execute the CUDA Graph that we either just created or looked-up in the cache. |
| if (verbose_) { |
| int runnable_execs = 0; |
| int bypassed_ops = 0; |
| for (auto& subseg_exec : cuda_graph_info.cuda_graph_subseg_execs) { |
| if (subseg_exec.IsRunnable()) { |
| LOG(INFO) << "Launching captured graph with " << subseg_exec.NumGraphNodes() << " nodes."; |
| runnable_execs++; |
| } else { |
| bypassed_ops++; |
| } |
| } |
| if (bypassed_ops > 0) |
| LOG(INFO) << " (bypassing " << bypassed_ops << " un-capturable ops)"; |
| } |
| for (auto& subseg_exec : cuda_graph_info.cuda_graph_subseg_execs) |
| subseg_exec.RunSubSeg(exec_list, rctx, is_gpu); |
| } |
| |
| private: |
| // Make a CUDA Graph of the region of ops [from_op_idx, upto_op_idx). If such a graph |
| // is not runnable, e.g. if it includes memcpys from unpinned CPU memory, then make a |
| // number of smaller graphs that avoid the ops performing those memcpys. |
| void CreateSubExecOverRegion(const std::vector<std::shared_ptr<exec::OpExecutor>>& exec_list, |
| const RunContext& rctx, |
| bool is_gpu, |
| size_t from_op_idx, |
| size_t upto_op_idx, |
| std::vector<CudaGraphsSubSegExec>* cuda_graph_subseg_execs) { |
| // Optimistically try to create a CUDA Graph of the entire op segment region |
| |
| int num_ops = upto_op_idx - from_op_idx; |
| CudaGraphsSubSegExec full_opseg(exec_list, rctx, is_gpu, verbose_, from_op_idx, num_ops); |
| if (full_opseg.IsRunnable()) { |
| cuda_graph_subseg_execs->push_back(full_opseg); |
| } else { |
| if (verbose_) |
| LOG(INFO) << " Graph was not runnable- creating op sub-segments..."; |
| // Enter fall-back approach to making many sub-execs |
| for (size_t first_op_idx = from_op_idx; first_op_idx != upto_op_idx;) { |
| int num_good_ops = 0; |
| for (size_t last_op_idx = first_op_idx; last_op_idx != upto_op_idx; ++last_op_idx) { |
| CudaGraphsSubSegExec single_opseg(exec_list, rctx, is_gpu, false, last_op_idx, 1); |
| if (single_opseg.IsRunnable()) |
| num_good_ops++; |
| // Is it time to create a subseg exec from accumulated good ops? |
| if (num_good_ops > 0 && (last_op_idx == upto_op_idx - 1 || !single_opseg.IsRunnable())) { |
| if (verbose_) |
| LOG(INFO) << "Capturing CUDA graph of op sub segment[" << first_op_idx << ":" |
| << (first_op_idx + num_good_ops - 1) << "]" |
| << " of op segment " << opr_names_; |
| CudaGraphsSubSegExec good_opseg( |
| exec_list, rctx, is_gpu, verbose_, first_op_idx, num_good_ops); |
| CHECK(good_opseg.IsRunnable()) << "Unexpected issue with CUDA Graphs creation"; |
| cuda_graph_subseg_execs->push_back(good_opseg); |
| first_op_idx += num_good_ops; |
| } |
| // If the last single op was not runnable, use the exec to handle that op conventionally |
| if (!single_opseg.IsRunnable()) { |
| if (verbose_) { |
| LOG(INFO) << "Bypassing op sub segment[" << last_op_idx << "," << last_op_idx << "]" |
| << " of op segment " << opr_names_; |
| // Generate throw-away exec in order to produce a diagnostic listing of graph nodes |
| CudaGraphsSubSegExec dummy(exec_list, rctx, is_gpu, verbose_, last_op_idx, 1); |
| } |
| cuda_graph_subseg_execs->push_back(single_opseg); |
| first_op_idx++; |
| break; |
| } |
| } |
| } |
| } |
| } |
| |
| // Is the Op OK to make part of a CUDA Graph? |
| bool OpOK(const std::shared_ptr<exec::OpExecutor>& exec) { |
| static auto& fgraphcompatible = Op::GetAttr<FIsCUDAGraphsCompatible>("FIsCUDAGraphsCompatible"); |
| static auto& fcompute_ex = Op::GetAttr<FComputeEx>("FComputeEx<gpu>"); |
| static auto& fstatefulcompute = Op::GetAttr<FStatefulCompute>("FStatefulCompute<gpu>"); |
| static auto& fstatefulcompute_ex = Op::GetAttr<FStatefulComputeEx>("FStatefulComputeEx<gpu>"); |
| const auto& attrs = exec->attrs; |
| if (attrs.op != nullptr) { |
| const auto f = fgraphcompatible.get(attrs.op, nullptr); |
| if (f != nullptr) { |
| return f(attrs, exec->op_ctx.is_train); |
| } |
| if (fstatefulcompute.get(attrs.op, nullptr) != nullptr || |
| fstatefulcompute_ex.get(attrs.op, nullptr) != nullptr) { |
| if (verbose_) { |
| LOG(INFO) << "Omitting stateful operator " << attrs.op->name << " from CUDA graph."; |
| } |
| return false; |
| } |
| if ((fcompute_ex.get(attrs.op, nullptr) != nullptr && |
| exec->dispatch_mode == DispatchMode::kFComputeEx) || |
| exec->dispatch_mode == DispatchMode::kFComputeFallback) { |
| if (verbose_) { |
| LOG(INFO) << "Omitting operator " << attrs.op->name |
| << " from CUDA graph due to dispatch mode " |
| << static_cast<int>(exec->dispatch_mode); |
| } |
| return false; |
| } |
| } |
| for (auto& resource : exec->op_ctx.requested) { |
| if (!(resource.req.type == ResourceRequest::kTempSpace)) { |
| if (verbose_) { |
| LOG(INFO) << "Omitting operator " << attrs.op->name |
| << " from CUDA graph due to using the resource type " |
| << static_cast<int>(resource.req.type); |
| } |
| return false; |
| } |
| } |
| return true; |
| } |
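| |
| // Illustrative sketch (assuming the usual nnvm registration syntax) of how an operator can opt |
| // out of capture via the "FIsCUDAGraphsCompatible" attribute consulted above; the op name is |
| // hypothetical: |
| //   NNVM_REGISTER_OP(_hypothetical_op) |
| //   .set_attr<FIsCUDAGraphsCompatible>("FIsCUDAGraphsCompatible", |
| //       [](const NodeAttrs& attrs, const bool is_train) { return false; }); |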
| |
| // Record the TempSpace resources used by the ops. Ops that use other resource types are |
| // excluded from capture by OpOK(). |
| void SetTempSpaces(const std::vector<std::shared_ptr<exec::OpExecutor>>& exec_list) { |
| // Gather info about the ops' use of TempSpace. |
| if (is_enabled_) { |
| std::set<Resource*> tempspaces_set; |
| for (auto& exec : exec_list) { |
| for (auto& resource : exec->op_ctx.requested) { |
| if (resource.req.type == ResourceRequest::kTempSpace) { |
| tempspaces_set.insert(&resource); |
| } |
| } |
| } |
| tempspaces_.assign(tempspaces_set.begin(), tempspaces_set.end()); |
| } |
| } |
| |
| // Return the addresses of the gpu TempSpace areas |
| std::vector<void*> GetGPUTempspacePtrs(mshadow::Stream<gpu>* s) { |
| std::vector<void*> ret; |
| for (const auto& resource : tempspaces_) { |
| // Ask for minimal allocation to get base pointer without increasing the size |
| auto* base_ptr = resource->get_space_typed<gpu, 1, char>(mshadow::Shape1(1), s).dptr_; |
| ret.push_back(static_cast<void*>(base_ptr)); |
| } |
| return ret; |
| } |
| |
| CudaGraphCache cache_; |
| std::vector<Resource*> tempspaces_; |
| std::string opr_names_; |
| bool verbose_; |
| bool is_enabled_; |
| }; |
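| |
| // Illustrative end-to-end use of CudaGraphsExec (a sketch; 'exec_list', 'rctx' and 'opr_names' |
| // are assumed to come from the engine-side op segment being executed): |
| //   CudaGraphsExec graphs_exec(exec_list, /*is_gpu=*/true, opr_names); |
| //   // Each call captures, updates, or replays CUDA graphs for the segment, falling back to |
| //   // conventional execution for ops that cannot be captured. |
| //   graphs_exec.RunAll(exec_list, rctx, /*is_gpu=*/true); |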
| |
| } // namespace cuda_graphs |
| } // namespace mxnet |
| |
| #endif // CUDA_GRAPHS_AVAILABLE |
| |
| #endif // MXNET_IMPERATIVE_CUDA_GRAPHS_H_ |