| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| #if MXNET_USE_TENSORRT |
| |
| #include "trt_graph_executor.h" |
| |
| #include <onnx/onnx_pb.h> |
| #include <NvInfer.h> |
| #include "./onnx_to_tensorrt.h" |
| #include "../operator/contrib/tensorrt-inl.h" |
| #include "../common/utils.h" |
| #include "../common/exec_utils.h" |
| |
| |
| namespace mxnet { |
| namespace exec { |
| |
| using namespace mxnet::common; |
| |
| /*! |
| * \brief TrtGraphExecutor initializer for simple bind flow in |
| * which only certain input shapes and dtypes are provided by users. |
| * The initializer uses these shapes and dtypes to perform |
| * shape and dtype inferences, and then create NDArrays |
| * to populate data entries of the graph. The created NDArrays |
| * for in_args, arg_grads and aux_states are passed to the |
| * front end to attach the created executor. |
| * In front end, if the simple_bind flow is trigger by |
| * _bind_ith_exec, the shared data arrays of DataParallelExecutorGroup |
| * and shared executor will be taken into account in creating |
| * NDArrays for in_args, arg_grads, and aux_states for reusing |
| * already allocated memory. |
| * |
| * This version of an executor exports the computation graph to TensorRT make use of fused |
| * kernels and other runtime enhancements. TRT will compile the sub-graphs to executable fused |
| * operators without intervention from the user. Operators in the original graph that are not |
| * supported by TRT will continue to be executed normally by MXNet. |
| * |
| */ |
| void TrtGraphExecutor::Init(nnvm::Symbol symbol, |
| const Context& default_ctx, |
| const std::map<std::string, Context>& ctx_map, |
| std::vector<Context> *in_arg_ctxes, |
| std::vector<Context> *arg_grad_ctxes, |
| std::vector<Context> *aux_state_ctxes, |
| std::unordered_map<std::string, mxnet::TShape> *arg_shape_map, |
| std::unordered_map<std::string, int> *arg_dtype_map, |
| std::unordered_map<std::string, int> *arg_stype_map, |
| std::vector<OpReqType> *grad_req_types, |
| const std::unordered_set<std::string>& shared_arg_names, |
| std::vector<NDArray>* in_arg_vec, |
| std::vector<NDArray>* arg_grad_vec, |
| std::vector<NDArray>* aux_state_vec, |
| std::unordered_map<std::string, NDArray>* shared_buffer, |
| Executor* shared_exec, |
| const nnvm::NodeEntryMap<NDArray>& feed_dict) { |
| symbol = symbol.Copy(); |
| nnvm::Graph g = InitGraph(symbol, default_ctx, ctx_map, *in_arg_ctxes, *arg_grad_ctxes, |
| *aux_state_ctxes, *grad_req_types); |
| |
| if (need_grad_) { |
| LOG(FATAL) << "You may be attempting to use TensorRT for training. TensorRT is an inference " |
| "only library. To re-enable legacy MXNet graph execution, which will support " |
| "training, set the MXNET_USE_TENSORRT environment variable to 0, or call " |
| "mx.contrib.tensorrt.set_use_tensorrt(False)"; |
| } |
| |
| if (shared_buffer == nullptr || shared_buffer->empty()) { |
| LOG(FATAL) << "MXNET_USE_TENSORRT = 1 but shared_buffer is empty. " |
| << "Please provide weights and other parameters, such as " |
| << "BatchNorm moments, via the shared_buffer, during simple bind call."; |
| } |
| |
  // The following shape and dtype inference and argument initialization code
  // applies to simple_bind only. The regular bind flow handles this
  // differently.
| |
| // Initialize arg_shapes and arg_dtypes for shape and type inferences. |
| // It contains all in_args and aux_states' shapes and types in a certain order. |
| const nnvm::IndexedGraph& idx = g.indexed_graph(); |
| mxnet::ShapeVector arg_shapes(idx.input_nodes().size(), mxnet::TShape()); |
| nnvm::DTypeVector arg_dtypes(idx.input_nodes().size(), -1); |
| StorageTypeVector arg_stypes(idx.input_nodes().size(), kUndefinedStorage); |
| for (size_t i = 0; i < num_forward_inputs_; ++i) { |
| const uint32_t nid = idx.input_nodes().at(i); |
| const std::string& name = idx[nid].source->attrs.name; |
| auto it1 = arg_shape_map->find(name); |
| if (arg_shape_map->end() != it1) { |
| arg_shapes[i] = it1->second; |
| } |
| auto it2 = arg_dtype_map->find(name); |
| if (arg_dtype_map->end() != it2) { |
| arg_dtypes[i] = it2->second; |
| } |
| auto it3 = arg_stype_map->find(name); |
| if (arg_stype_map->end() != it3) { |
| arg_stypes[i] = it3->second; |
| } |
| } |
| g = InferShape(std::move(g), std::move(arg_shapes), "__shape__"); |
| if (g.GetAttr<size_t>("shape_num_unknown_nodes") != 0U) { |
| HandleInferShapeError(num_forward_inputs_, g.indexed_graph(), |
| g.GetAttr<mxnet::ShapeVector>("shape")); |
| } |
| |
| g = InferType(std::move(g), std::move(arg_dtypes), "__dtype__"); |
| if (g.GetAttr<size_t>("dtype_num_unknown_nodes") != 0U) { |
| HandleInferTypeError(num_forward_inputs_, g.indexed_graph(), |
| g.GetAttr<nnvm::DTypeVector>("dtype")); |
| } |
| |
| g = InferStorageType(std::move(g), std::move(arg_stypes), "__storage_type__"); |
| if (g.GetAttr<size_t>("storage_type_num_unknown_nodes") != 0U) { |
| HandleInferStorageTypeError(num_forward_inputs_, g.indexed_graph(), |
| g.GetAttr<StorageTypeVector>("storage_type")); |
| } |
| |
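  // Partition the graph into TensorRT-compatible subsets; each subset that
  // contains more than one node is replaced by a single TRT node, and the
  // graph attributes are then re-inferred for the new topology (ReinitGraph).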
| auto trt_groups = GetTrtCompatibleSubsets(g, shared_buffer); |
| for (const auto &trt_group : trt_groups) { |
| if (trt_group.size() > 1) { |
| g = ReplaceSubgraph(std::move(g), trt_group, shared_buffer); |
| g = ReinitGraph(std::move(g), default_ctx, ctx_map, in_arg_ctxes, arg_grad_ctxes, |
| aux_state_ctxes, grad_req_types, arg_shape_map, arg_dtype_map, |
| arg_stype_map, shared_buffer); |
| } |
| } |
| |
| InitArguments(g.indexed_graph(), g.GetAttr<mxnet::ShapeVector>("shape"), |
| g.GetAttr<nnvm::DTypeVector>("dtype"), |
| g.GetAttr<StorageTypeVector>("storage_type"), |
| *in_arg_ctxes, *arg_grad_ctxes, *aux_state_ctxes, |
| *grad_req_types, shared_arg_names, shared_exec, |
| shared_buffer, in_arg_vec, arg_grad_vec, aux_state_vec); |
| |
  // The above shape and dtype inference and argument initialization code
  // applies to simple_bind only. The regular bind flow handles this
  // differently.
| |
  // Initialize the remaining attributes of the graph.
  // This function can be called by the regular bind
  // flow as well.
| FinishInitGraph(symbol, g, shared_exec, feed_dict); |
| } |
| /*! |
 * \brief Initialize in_args, arg_grads, and aux_states,
 * as well as the corresponding data_entry_ slots of the executor, using the
| * shared_buffer from DataParallelExecutorGroup |
| * and shared_exec if available. |
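 * Depending on the argument, memory is reused from shared_exec when provided
 * (a shape/dtype/stype mismatch is a fatal error), copied from shared_buffer,
 * or freshly allocated as a zero-initialized NDArray.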
| */ |
| void TrtGraphExecutor::InitArguments(const nnvm::IndexedGraph& idx, |
| const mxnet::ShapeVector& inferred_shapes, |
| const nnvm::DTypeVector& inferred_dtypes, |
| const StorageTypeVector& inferred_stypes, |
| const std::vector<Context>& in_arg_ctxes, |
| const std::vector<Context>& arg_grad_ctxes, |
| const std::vector<Context>& aux_state_ctxes, |
| const std::vector<OpReqType>& grad_req_types, |
| const std::unordered_set<std::string>& shared_arg_names, |
| const Executor* shared_exec, |
| std::unordered_map<std::string, NDArray>* shared_buffer, |
| std::vector<NDArray>* in_arg_vec, |
| std::vector<NDArray>* arg_grad_vec, |
| std::vector<NDArray>* aux_state_vec) { |
| // initialize in_args, arg_grads, and aux_states and populate grad_store_ |
| data_entry_.resize(idx.num_node_entries()); |
| size_t arg_top = 0, aux_top = 0; |
| const auto& mutable_nodes = idx.mutable_input_nodes(); |
| for (size_t i = 0; i < num_forward_inputs_; ++i) { |
| const uint32_t nid = idx.input_nodes().at(i); |
| const uint32_t eid = idx.entry_id(nid, 0); |
| const mxnet::TShape& inferred_shape = inferred_shapes[eid]; |
| const int inferred_dtype = inferred_dtypes[eid]; |
| const auto inferred_stype = (NDArrayStorageType) inferred_stypes[eid]; |
| const std::string& arg_name = idx[nid].source->attrs.name; |
| // aux_states |
| if (mutable_nodes.count(nid)) { |
| if (nullptr != shared_exec) { |
| const NDArray& aux_nd = shared_exec->aux_state_map().at(arg_name); |
| CHECK(inferred_stype == kDefaultStorage && aux_nd.storage_type() == kDefaultStorage) |
| << "Non-default storage type detected when creating auxilliary NDArray. The allocated " |
| << "memory of shared_exec.aux_array cannot be resued for argument: " |
| << arg_name << " for the current executor"; |
| CHECK_EQ(inferred_shape, aux_nd.shape()) |
| << "Inferred shape does not match shared_exec.aux_array's shape." |
| " Therefore, the allocated memory for shared_exec.aux_array cannot" |
| " be resued for creating auxilliary NDArray of the argument: " |
| << arg_name << " for the current executor"; |
| CHECK_EQ(inferred_dtype, aux_nd.dtype()) |
| << "Inferred dtype does not match shared_exec.aux_array's dtype." |
| " Therefore, the allocated memory for shared_exec.aux_array cannot" |
| " be resued for creating auxilliary NDArray of the argument: " |
| << arg_name << " for the current executor"; |
| aux_state_vec->emplace_back(aux_nd); |
| } else { |
| auto it = shared_buffer->find(arg_name); |
| if (it != shared_buffer->end()) { |
| aux_state_vec->push_back(std::move(it->second.Copy(aux_state_ctxes[aux_top]))); |
| } else { |
| aux_state_vec->push_back(std::move(InitZeros(inferred_stype, inferred_shape, |
| aux_state_ctxes[aux_top], inferred_dtype))); |
| } |
| } // if (has_shared_exec) |
| data_entry_[eid] = aux_state_vec->back(); |
| aux_state_map_.emplace(arg_name, aux_state_vec->back()); |
| ++aux_top; |
| } else { // in_args and grad for in_args |
      if (shared_arg_names.count(arg_name)) {  // model parameter
| if (nullptr != shared_exec) { |
| const NDArray& in_arg_nd = shared_exec->in_arg_map().at(arg_name); |
| auto arg_nd_stype = in_arg_nd.storage_type(); |
| // for model parameter, both default storage and row_sparse storage can be shared |
| bool shareable_arg_stype = inferred_stype == kDefaultStorage || |
| inferred_stype == kRowSparseStorage; |
| // try to reuse memory from shared_exec |
| CHECK(shareable_arg_stype) << "Inferred storage type " |
| << common::stype_string(inferred_stype) |
| << " does not support memory sharing with shared_exec.arg_array"; |
| CHECK_EQ(inferred_stype, arg_nd_stype) |
| << "Inferred stype does not match shared_exec.arg_array's stype" |
| " Therefore, the allocated memory for shared_exec.arg_array cannot" |
| " be resued for creating NDArray of the argument " |
| << arg_name << " for the current executor"; |
| CHECK_EQ(inferred_shape, in_arg_nd.shape()) |
| << "Inferred shape does not match shared_exec.arg_array's shape" |
| " Therefore, the allocated memory for shared_exec.arg_array cannot" |
| " be resued for creating NDArray of the argument " |
| << arg_name << " for the current executor"; |
| CHECK_EQ(inferred_dtype, in_arg_nd.dtype()) |
| << "Inferred dtype does not match shared_exec.arg_array's dtype" |
| " Therefore, the allocated memory for shared_exec.arg_array cannot" |
| " be resued for creating NDArray of the argument " |
| << arg_name << " for the current executor"; |
| in_arg_vec->emplace_back(in_arg_nd); |
| } else { |
          // no shared_exec to reuse memory from; allocate fresh zeros
| EmplaceBackZeros(inferred_stype, inferred_shape, in_arg_ctxes[arg_top], |
| inferred_dtype, in_arg_vec); |
| } |
| // gradient for model parameter |
| if (kNullOp == grad_req_types[arg_top]) { |
| arg_grad_vec->emplace_back(); |
| } else { |
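          // gradient entries follow the forward outputs in idx.outputs(), so
          // this gradient's output index is offset by num_forward_outputs_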
| auto grad_oid = grad_store_.size() + num_forward_outputs_; |
| auto grad_eid = idx.entry_id(idx.outputs()[grad_oid]); |
| auto grad_stype = (NDArrayStorageType) inferred_stypes[grad_eid]; |
| if (nullptr != shared_exec && grad_stype == kDefaultStorage && |
| shared_exec->arg_grad_map().at(arg_name).storage_type() == kDefaultStorage) { |
| // try to reuse memory from shared_exec |
| arg_grad_vec->emplace_back(shared_exec->arg_grad_map().at(arg_name)); |
| } else { |
| // no need to reuse memory from shared_exec for gradient of non-default storage |
| EmplaceBackZeros(grad_stype, inferred_shape, arg_grad_ctxes[arg_top], |
| inferred_dtype, arg_grad_vec); |
| } |
| grad_store_.emplace_back(grad_req_types[arg_top], arg_grad_vec->back()); |
| } |
| } else { // !shared_arg_names.count(arg_name) |
        // not a shared model parameter: initialize from shared_buffer if present
| auto it = shared_buffer->find(arg_name); |
| if (it != shared_buffer->end()) { |
| in_arg_vec->push_back(std::move(it->second.Copy(in_arg_ctxes[arg_top]))); |
| } else { |
| in_arg_vec->push_back(std::move(InitZeros(inferred_stype, inferred_shape, |
| in_arg_ctxes[arg_top], inferred_dtype))); |
| } |
        // gradient for this argument, row_sparse ndarray sharing disabled
| if (kNullOp == grad_req_types[arg_top]) { |
| arg_grad_vec->emplace_back(); |
| } else { |
| auto grad_oid = grad_store_.size() + num_forward_outputs_; |
| auto grad_eid = idx.entry_id(idx.outputs()[grad_oid]); |
| auto grad_stype = (NDArrayStorageType) inferred_stypes[grad_eid]; |
| bool enable_row_sparse_sharing = false; |
| arg_grad_vec->emplace_back(ReshapeOrCreate("grad of " + arg_name, inferred_shape, |
| inferred_dtype, grad_stype, |
| arg_grad_ctxes[arg_top], shared_buffer, |
| enable_row_sparse_sharing)); |
| grad_store_.emplace_back(grad_req_types[arg_top], arg_grad_vec->back()); |
| } // if (kNullOp == grad_req_types[arg_top]) |
| } // if (shared_arg_names.count(arg_name)) |
| in_arg_map_.emplace(arg_name, in_arg_vec->back()); |
| if (!arg_grad_vec->back().is_none()) { |
| arg_grad_map_.emplace(arg_name, arg_grad_vec->back()); |
| } |
| data_entry_[eid] = in_arg_vec->back(); |
| ++arg_top; |
| } |
| } |
| } |
| |
| |
| /*! |
| * \brief This function is triggered after each tensorrt subgraph replacement pass. |
| * Reset arguments of GraphExecutor::Init(...) as some variables (weights and biases) |
| * are absorbed into the TRT engine it also it reruns attributes inferences accordingly |
| * to the new topology. |
| */ |
| Graph TrtGraphExecutor::ReinitGraph(Graph&& g, const Context &default_ctx, |
| const std::map<std::string, Context> &ctx_map, |
| std::vector<Context> *in_arg_ctxes, |
| std::vector<Context> *arg_grad_ctxes, |
| std::vector<Context> *aux_state_ctxes, |
| std::vector<OpReqType> *grad_req_types, |
| std::unordered_map<std::string, mxnet::TShape> *arg_shape_map, |
| std::unordered_map<std::string, int> *arg_dtype_map, |
| std::unordered_map<std::string, int> *arg_stype_map, |
| std::unordered_map<std::string, NDArray> *params_map) { |
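  // Parameters that no longer appear in the rewritten graph were absorbed into
  // a TRT engine; drop them from params_map and the shape/dtype/stype maps so
  // the re-run inference passes only see the remaining inputs.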
| std::unordered_set<std::string> to_remove_params; |
| for (auto& el : *params_map) { |
| to_remove_params.insert(el.first); |
| } |
| |
| DFSVisit(g.outputs, [&to_remove_params](const nnvm::NodePtr n) { |
| to_remove_params.erase(n->attrs.name); |
| }); |
| |
| for (auto& el : to_remove_params) { |
| params_map->erase(el); |
| arg_shape_map->erase(el); |
| arg_dtype_map->erase(el); |
| arg_stype_map->erase(el); |
| } |
| const auto &idx = g.indexed_graph(); |
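  // The rewritten graph has fewer inputs, so shrink the per-argument context
  // and grad-request vectors accordingly: non-mutable inputs correspond to
  // in_args (and their grads), mutable inputs correspond to aux states.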
| num_forward_inputs_ = idx.input_nodes().size(); |
| in_arg_ctxes->resize(num_forward_inputs_ - idx.mutable_input_nodes().size()); |
| arg_grad_ctxes->resize(num_forward_inputs_ - idx.mutable_input_nodes().size()); |
| grad_req_types->resize(num_forward_inputs_ - idx.mutable_input_nodes().size()); |
| aux_state_ctxes->resize(idx.mutable_input_nodes().size()); |
| |
| // create "device" and "context" attrs for the graph |
| g = AssignContext(g, default_ctx, ctx_map, *in_arg_ctxes, *arg_grad_ctxes, |
| *aux_state_ctxes, *grad_req_types, num_forward_inputs_, |
| num_forward_outputs_); |
| |
| // get number of nodes used in forward pass |
| num_forward_nodes_ = 0; |
| for (size_t i = 0; i < num_forward_outputs_; ++i) { |
| num_forward_nodes_ = std::max( |
| num_forward_nodes_, static_cast<size_t>(idx.outputs()[i].node_id + 1)); |
| } |
| mxnet::ShapeVector arg_shapes(idx.input_nodes().size(), mxnet::TShape()); |
| nnvm::DTypeVector arg_dtypes(idx.input_nodes().size(), -1); |
| StorageTypeVector arg_stypes(idx.input_nodes().size(), kUndefinedStorage); |
| for (size_t i = 0; i < num_forward_inputs_; ++i) { |
| const uint32_t nid = idx.input_nodes().at(i); |
| const std::string &name = idx[nid].source->attrs.name; |
| auto it1 = arg_shape_map->find(name); |
| if (arg_shape_map->end() != it1) { |
| arg_shapes[i] = it1->second; |
| } |
| auto it2 = arg_dtype_map->find(name); |
| if (arg_dtype_map->end() != it2) { |
| arg_dtypes[i] = it2->second; |
| } |
| auto it3 = arg_stype_map->find(name); |
| if (arg_stype_map->end() != it3) { |
| arg_stypes[i] = it3->second; |
| } |
| } |
| g = InferShape(std::move(g), std::move(arg_shapes), "__shape__"); |
| if (g.GetAttr<size_t>("shape_num_unknown_nodes") != 0U) { |
| HandleInferShapeError(num_forward_inputs_, g.indexed_graph(), |
| g.GetAttr<mxnet::ShapeVector>("shape")); |
| } |
| |
| g = InferType(std::move(g), std::move(arg_dtypes), "__dtype__"); |
| if (g.GetAttr<size_t>("dtype_num_unknown_nodes") != 0U) { |
| HandleInferTypeError(num_forward_inputs_, g.indexed_graph(), |
| g.GetAttr<nnvm::DTypeVector>("dtype")); |
| } |
| |
| g = InferStorageType(std::move(g), std::move(arg_stypes), "__storage_type__"); |
| |
| if (g.GetAttr<size_t>("storage_type_num_unknown_nodes") != 0U) { |
| HandleInferStorageTypeError(num_forward_inputs_, g.indexed_graph(), |
| g.GetAttr<StorageTypeVector>("storage_type")); |
| } |
| |
| return g; |
| } |
| |
| |
| /*! |
| * \brief Return the "optimized" symbol contained in the graph. |
| * For optimization pass such as TensorRT pass |
| */ |
| nnvm::Symbol TrtGraphExecutor::GetOptimizedSymbol() { |
| Symbol ret; |
| ret.outputs = std::vector<nnvm::NodeEntry>(graph_.outputs.begin(), |
| graph_.outputs.begin() + num_forward_outputs_); |
| return ret.Copy(); |
| } |
| |
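/*!
 * \brief Entry point for the TensorRT simple-bind flow: constructs a
 * TrtGraphExecutor, runs Init(...) (shape/dtype/stype inference, TRT subgraph
 * replacement and argument allocation), and returns the new executor.
 */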
| Executor *TrtGraphExecutor::TensorRTBind(nnvm::Symbol symbol, |
| const Context &default_ctx, |
| const std::map<std::string, Context> &group2ctx, |
| std::vector<Context> *in_arg_ctxes, |
| std::vector<Context> *arg_grad_ctxes, |
| std::vector<Context> *aux_state_ctxes, |
| std::unordered_map<std::string, mxnet::TShape> |
| *arg_shape_map, |
| std::unordered_map<std::string, int> *arg_dtype_map, |
| std::unordered_map<std::string, int> *arg_stype_map, |
| std::vector<OpReqType> *grad_req_types, |
| const std::unordered_set<std::string> ¶m_names, |
| std::vector<NDArray> *in_args, |
| std::vector<NDArray> *arg_grads, |
| std::vector<NDArray> *aux_states, |
| std::unordered_map<std::string, NDArray> *shared_buffer, |
| Executor *shared_exec) { |
| auto exec = new exec::TrtGraphExecutor(); |
| exec->Init(std::move(symbol), default_ctx, group2ctx, |
| in_arg_ctxes, arg_grad_ctxes, aux_state_ctxes, |
| arg_shape_map, arg_dtype_map, arg_stype_map, |
| grad_req_types, param_names, |
| in_args, arg_grads, aux_states, |
| shared_buffer, shared_exec); |
| return exec; |
| } |
| |
| } // namespace exec |
| |
| } // namespace mxnet |
| |
| #endif // MXNET_USE_TENSORRT |