| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| /*! |
| * Copyright (c) 2015 by Contributors |
| * \file graph_executor.cc |
| * \brief graph executor |
| */ |
| #include <mxnet/base.h> |
| #include <nnvm/graph.h> |
| #include <nnvm/pass_functions.h> |
| #include <vector> |
| #include <set> |
| #include <algorithm> |
| |
| #include "./exec_pass.h" |
| #include "./graph_executor.h" |
| #include "./cuda_graphs.h" |
| #include "../profiler/profiler.h" |
| #include "../common/utils.h" |
| #include "../common/exec_utils.h" |
| #include "../operator/subgraph/subgraph_property.h" |
| #include "../operator/operator_common.h" |
| |
| namespace mxnet { |
| namespace exec { |
| |
| using namespace mxnet::common; |
| |
| static const std::string GetDefaultSubgraphBackend() { |
| #if MXNET_USE_MKLDNN == 1 |
| return std::string("MKLDNN"); |
| #else |
| return std::string(); |
| #endif |
| } |
| |
| GraphExecutor::GraphExecutor(const nnvm::Symbol& symbol) { |
| log_verbose_ = dmlc::GetEnv("MXNET_EXEC_VERBOSE_LOGGING", false); |
| need_grad_ = false; |
| is_dynamic_ = false; |
| subgraph_property_ = dmlc::GetEnv("MXNET_SUBGRAPH_BACKEND", GetDefaultSubgraphBackend()); |
| if (subgraph_property_ == "NONE") { |
| subgraph_property_ = std::string(); |
| LOG(INFO) << "MXNET_SUBGRAPH_BACKEND=NONE is detected, subgraph backend is not in use"; |
| } |
| engine_ref_ = Engine::_GetSharedRef(); |
| symbol_ = symbol.Copy(); |
| } |
| |
| GraphExecutor::~GraphExecutor() { |
| for (auto& n : op_nodes_) { |
| if (n.cached_opr != nullptr) { |
| Engine::Get()->DeleteOperator(n.cached_opr); |
| } |
| } |
| // clean up seg ops |
| for (auto& seg : cached_seg_opr_) { |
| if (seg.opr != nullptr) { |
| Engine::Get()->DeleteOperator(seg.opr); |
| } |
| } |
| } |
| |
| void GraphExecutor::Forward(bool is_train) { |
| RunOps(is_train, 0, num_forward_nodes_); |
| } |
| |
| void GraphExecutor::PartialForward(bool is_train, int step, int *step_left) { |
| size_t sstep = static_cast<size_t>(step); |
| if (sstep >= num_forward_nodes_) { |
| *step_left = 0; |
| return; |
| } |
| RunOps(is_train, sstep, sstep + 1); |
| *step_left = static_cast<int>(num_forward_nodes_ - sstep - 1); |
| } |
| |
| void GraphExecutor::Backward(const std::vector<NDArray>& head_grads, bool is_train) { |
| { |
| const auto& idx = graph_.indexed_graph(); |
| if (num_forward_inputs_ != idx.input_nodes().size()) { |
| for (size_t i = 0; i < head_grad_array_.size(); ++i) { |
| if (!head_grad_array_[i].is_none()) { |
| CHECK(i < head_grads.size() && !head_grads[i].is_none()) |
| << "Because the last operator is not Loss function, " |
| << "head_gradient is required when calling backward. " |
| << "If you are attempting to minimize the output as " |
| << "an objective, please modify your network and " |
| << "pass it through the make_loss symbol."; |
| const NDArray &from = head_grads[i]; |
| NDArray &to = head_grad_array_[i]; |
| if (this->is_dynamic_) { |
| to.WaitToRead(); |
| if (!shape_is_known(to.shape())) { |
| to.Init(from.shape()); |
| } |
| } |
| CopyFromTo(from, &to); |
| } |
| } |
| } |
| } |
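| // For dynamic-shape graphs, re-run shape inference and materialize any |
| // still-unknown input/output shapes before executing the backward nodes. |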
| if (this->is_dynamic_) { |
| graph_ = InferShape(std::move(graph_), {}, ""); |
| mxnet::ShapeVector rshape = graph_.MoveCopyAttr<mxnet::ShapeVector>("shape"); |
| const auto& idx = graph_.indexed_graph(); |
| for (size_t nid = 0; nid < idx.num_nodes(); ++nid) { |
| const auto& inode = idx[nid]; |
| if (inode.source->is_variable()) continue; |
| OpNode& opnode = op_nodes_[nid]; |
| if (opnode.skip_exec_node) continue; |
| for (NDArray &array : opnode.exec->in_array) { |
| array.WaitToRead(); |
| if (!shape_is_known(array.shape())) { |
| array.SetShapeFromChunk(); |
| } |
| } |
| int i = 0; |
| for (NDArray &array : opnode.exec->in_array) { |
| array.WaitToRead(); |
| if (!shape_is_known(array.shape())) { |
| array.SetShapeFromChunk(); |
| } |
| if (!shape_is_known(array.shape())) { |
| mxnet::TShape shape = rshape[idx.entry_id(inode.inputs[i])]; |
| if (shape_is_known(shape)) { |
| array.ReshapeAndAlloc(shape); |
| } |
| } |
| ++i; |
| } |
| i = 0; |
| for (NDArray &array : opnode.exec->out_array) { |
| array.WaitToRead(); |
| if (!shape_is_known(array.shape())) { |
| array.SetShapeFromChunk(); |
| } |
| if (!shape_is_known(array.shape())) { |
| mxnet::TShape shape = rshape[idx.entry_id(nid, i)]; |
| if (shape_is_known(shape)) { |
| array.ReshapeAndAlloc(shape); |
| } |
| } |
| ++i; |
| } |
| } |
| graph_.attrs["shape"] = std::make_shared<dmlc::any>(rshape); |
| } |
| const auto& idx = graph_.indexed_graph(); |
| RunOps(is_train, num_forward_nodes_, idx.num_nodes()); |
| } |
| |
| void GraphExecutor::Print(std::ostream &os) const { // NOLINT(*) |
| nnvm::Symbol s; |
| s.outputs = graph_.outputs; |
| s.Print(os); |
| // message to be backward compatible with the memonger |
| size_t total_bytes = graph_.GetAttr<size_t>("storage_allocated_bytes"); |
| os << "Total " << (total_bytes >> 20UL) << " MB allocated\n"; |
| os << "Total " << 11 << " TempSpace resource requested\n"; |
| } |
| |
| /*! |
| * \brief Return the "optimized" symbol contained in the executor graph. |
| */ |
| nnvm::Symbol GraphExecutor::GetOptimizedSymbol() { |
| Symbol ret; |
| ret.outputs = std::vector<nnvm::NodeEntry>(graph_.outputs.begin(), |
| graph_.outputs.begin() + num_forward_outputs_); |
| return ret.Copy(); |
| } |
| |
| void GraphExecutor::SetMonitorCallback(const MonitorCallback& callback, bool monitor_all) { |
| CHECK(callback) << "invalid callback"; |
| monitor_callback_ = callback; |
| monitor_all_ = monitor_all; |
| } |
| |
| const std::vector<NDArray>& GraphExecutor::outputs() const { |
| if (this->is_dynamic_) { |
| for (const NDArray &array : output_arrays_) { |
| array.WaitToRead(); |
| if (!shape_is_known(array.shape())) { |
| const_cast<NDArray &>(array).SetShapeFromChunk(); |
| } |
| } |
| } |
| return output_arrays_; |
| } |
| |
| const std::unordered_map<std::string, NDArray>& GraphExecutor::in_arg_map() const { |
| return in_arg_map_; |
| } |
| |
| const std::unordered_map<std::string, NDArray>& GraphExecutor::arg_grad_map() const { |
| return arg_grad_map_; |
| } |
| |
| const std::unordered_map<std::string, NDArray>& GraphExecutor::aux_state_map() const { |
| return aux_state_map_; |
| } |
| |
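| /*! |
| * \brief Wrap \p src in an _identity_with_attr_like_rhs node so that |
| * attribute inference (e.g. shape) for the new entry can borrow from \p like. |
| */ |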
| static nnvm::NodeEntry AttrHint(nnvm::NodeEntry src, nnvm::NodeEntry like) { |
| static const Op* id_like = Op::Get("_identity_with_attr_like_rhs"); |
| nnvm::ObjectPtr n = nnvm::Node::Create(); |
| n->attrs.op = id_like; |
| n->attrs.name = src.node->attrs.name + "_id"; |
| n->inputs = {src, like}; |
| return nnvm::NodeEntry{n, 0, 0}; |
| } |
| |
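| /*! |
| * \brief Aggregate a list of gradient entries into a single entry. |
| * Zero gradients are dropped (keeping at least one entry); short lists |
| * are summed with a single ElementWiseSum node, while lists with at least |
| * MXNET_EXEC_INPLACE_GRAD_SUM_CAP entries are accumulated through a chain |
| * of _grad_add nodes followed by an identity node. |
| */ |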
| nnvm::NodeEntry AggregateGradient(std::vector<nnvm::NodeEntry>&& v) { |
| using nnvm::Op; |
| static size_t inplace_sum_cap = dmlc::GetEnv("MXNET_EXEC_INPLACE_GRAD_SUM_CAP", 8); |
| static const Op* ewise_plus_op = Op::Get("_grad_add"); |
| static const Op* ewise_sum_op = Op::Get("ElementWiseSum"); |
| static const Op* identity_op = Op::Get("identity"); |
| static const Op* zeros_op = Op::Get("_zeros"); |
| static const Op* zeros_like_op = Op::Get("zeros_like"); |
| |
| if (v.empty()) { |
| nnvm::ObjectPtr ng = nnvm::Node::Create(); |
| ng->attrs.op = Op::Get("_zeros_without_dtype"); |
| ng->attrs.name = "zeros_without_dtype"; |
| ng->attrs.op->attr_parser(&(ng->attrs)); |
| return nnvm::NodeEntry(std::move(ng), 0, 0); |
| } |
| |
| // remove zero gradients from the sum, but keep at least one entry. |
| auto begin = std::remove_if(v.begin(), v.end(), [](const nnvm::NodeEntry& nodeEntry) { |
| CHECK(nodeEntry.node); |
| return nodeEntry.node->op() == zeros_op || nodeEntry.node->op() == zeros_like_op; |
| }); |
| if (begin == v.begin()) ++begin; |
| v.erase(begin, v.end()); |
| CHECK(!v.empty()); |
| |
| if (v.size() == 1) { |
| return std::move(v[0]); |
| } else { |
| if (v.size() < inplace_sum_cap) { |
| nnvm::ObjectPtr sum_node = nnvm::Node::Create(); |
| sum_node->attrs.op = ewise_sum_op; |
| sum_node->attrs.name = "sum_grad"; |
| sum_node->attrs.dict["num_args"] = std::to_string(v.size()); |
| sum_node->attrs.op->attr_parser(&(sum_node->attrs)); |
| sum_node->inputs = std::move(v); |
| return nnvm::NodeEntry(std::move(sum_node), 0, 0); |
| } else { |
| // use a chain of plus (_grad_add) nodes instead so the gradient |
| // sum can be accumulated in place |
| nnvm::NodeEntry ret = v[0]; |
| for (size_t i = 1; i < v.size(); ++i) { |
| // Add a control-flow dependency on the previous node. |
| // This enforces that the gradient sum is performed in the reverse |
| // order of the forward traversal. |
| // NOTE: adding control dependencies can be dangerous and create |
| // cycles in the dependency graph. The current usage is correct |
| // because of the following invariant: |
| //   assert: v[i-1] does not depend on v[i] |
| // In plain text: v is the gradient vector pushed in the order in |
| // which the gradients can be generated, so if v[i] is not yet |
| // pushed, no earlier gradient can depend on it. |
| // Note: for a symbol like the following: |
| //   data = mx.sym.Variable('data') |
| //   sym = data + data + data + data + data + data + data |
| // the node entries v passed in here all refer to the same |
| // _identity_with_attr_like_rhs node, so we must skip adding a node |
| // to its own control_deps. |
| if (v[i-1].node != v[i].node) { |
| v[i].node->control_deps.push_back(ret.node); |
| } |
| |
| std::ostringstream os; |
| os << "sum_grad_" << i; |
| nnvm::ObjectPtr x = nnvm::Node::Create(); |
| x->attrs.op = ewise_plus_op; |
| x->attrs.name = os.str(); |
| x->inputs = {ret, v[i]}; |
| ret = nnvm::NodeEntry(std::move(x), 0, 0); |
| } |
| // An identity node is used to avoid exposing the dummy plus node |
| // when its output gets assigned to another space. |
| nnvm::ObjectPtr id_node = nnvm::Node::Create(); |
| id_node->attrs.op = identity_op; |
| id_node->attrs.name = "sum_grad_final"; |
| id_node->inputs = {ret}; |
| return nnvm::NodeEntry{id_node, 0, 0}; |
| } |
| } |
| } |
| |
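| /*! |
| * \brief Look up \p key in the node's attribute dictionary and parse it |
| * into \p ValueType; returns \p default_value if the key is absent. |
| */ |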
| template<typename ValueType> |
| inline ValueType get_node_attr( |
| const nnvm::Node& node, |
| const std::string& key, ValueType default_value) { |
| auto it = node.attrs.dict.find(key); |
| if (it == node.attrs.dict.end()) { |
| return default_value; |
| } else { |
| ValueType ret; |
| dmlc::parameter::FieldEntry<ValueType> e; |
| e.Init(key, &ret, ret); |
| e.Set(&ret, it->second); |
| return ret; |
| } |
| } |
| |
| /*! |
| * \brief Create the graph for backward pass. |
| * This is triggered by both simple_bind and bind flows. |
| */ |
| nnvm::Graph GraphExecutor::InitFullGraph(nnvm::Symbol symbol, |
| const std::vector<OpReqType>& grad_req_types) { |
| using nnvm::ObjectPtr; |
| using nnvm::NodeEntry; |
| // initial information |
| num_forward_outputs_ = symbol.outputs.size(); |
| num_forward_inputs_ = symbol.ListInputs(nnvm::Symbol::kAll).size(); |
| |
| nnvm::Graph g; |
| g.outputs = symbol.outputs; |
| bool do_elim_common_expr = dmlc::GetEnv("MXNET_ELIMINATE_COMMON_EXPR", true); |
| if (do_elim_common_expr) |
| g = exec::EliminateCommonExpr(std::move(g)); |
| need_grad_ = false; |
| for (OpReqType req : grad_req_types) { |
| if (req != kNullOp) |
| need_grad_ = true; |
| } |
| if (!need_grad_) return g; |
| for (size_t i = 0; i < g.outputs.size(); ++i) { |
| NodeEntry ngrad(nnvm::Node::Create(), 0, 0); |
| ngrad.node->attrs.name = "_head_grad_" + std::to_string(i); |
| head_grad_entry_.emplace_back(AttrHint(ngrad, g.outputs[i])); |
| head_grad_map_[ngrad.node.get()] = i; |
| } |
| std::vector<ObjectPtr> args = symbol.ListInputs(nnvm::Symbol::kReadOnlyArgs); |
| std::vector<NodeEntry> xs; |
| for (size_t i = 0; i < grad_req_types.size(); ++i) { |
| if (grad_req_types[i] != kNullOp) { |
| xs.emplace_back(args[i]); |
| } |
| } |
| |
| int do_mirror = dmlc::GetEnv("MXNET_BACKWARD_DO_MIRROR", 0); |
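| // Decide per node whether its output may be recomputed (mirrored) in the |
| // backward pass instead of being kept alive. Dropout is never mirrored; |
| // a "__force_mirroring__" attribute enables mirroring even when |
| // MXNET_BACKWARD_DO_MIRROR=0; the heavy ops listed below are never mirrored. |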
| auto need_mirror = [do_mirror](const nnvm::Node& node) -> int { |
| if (node.is_variable()) return 0; |
| const std::string& type = node.attrs.op->name; |
| if (type == "Dropout") return false; |
| if (get_node_attr(node, "__force_mirroring__", false)) return true; |
| if (do_mirror == 0) return false; |
| if (type == "Convolution") return false; |
| if (type == "FullyConnected") return false; |
| if (type == "Concat") return false; |
| if (type == "SoftmaxOutput") return false; |
| return true; |
| }; |
| |
| std::vector<const nnvm::Op*> zero_ops; |
| zero_ops.push_back(nnvm::Op::Get("zeros_like")); |
| zero_ops.push_back(nnvm::Op::Get("_zeros")); |
| |
| // take gradient |
| nnvm::Graph g_grad = nnvm::pass::MXGradient( |
| g, symbol.outputs, xs, head_grad_entry_, |
| AggregateGradient, need_mirror, nullptr, |
| zero_ops, "_copy"); |
| CHECK_EQ(g_grad.outputs.size(), xs.size()); |
| for (const auto &e : g_grad.outputs) { |
| g.outputs.push_back(e); |
| } |
| |
| return g; |
| } |
| |
| /*! |
| * \brief GraphExecutor initializer for regular bind flow in which |
| * input arguments and gradients are provided by users. This initializer |
| * uses the user provided NDArrays to populate data entries of the graph. |
| */ |
| void GraphExecutor::Init(nnvm::Symbol symbol, |
| const Context& default_ctx, |
| const std::map<std::string, Context>& ctx_map, |
| const std::vector<NDArray>& in_args, |
| const std::vector<NDArray>& arg_grad_store, |
| const std::vector<OpReqType>& grad_req_types, |
| const std::vector<NDArray>& aux_states, |
| Executor* shared_exec, |
| const nnvm::NodeEntryMap<NDArray>& feed_dict) { |
| // create in_arg_ctxes, arg_grad_ctxes, aux_state_ctxes |
| auto get_ctx1 = [](const NDArray& nd) { return nd.ctx(); }; |
| auto get_ctx2 = [default_ctx](const NDArray& nd) -> Context { |
| if (nd.is_none()) return default_ctx; |
| return nd.ctx(); |
| }; |
| std::vector<Context> in_arg_ctxes(in_args.size()); |
| std::transform(in_args.begin(), in_args.end(), in_arg_ctxes.begin(), get_ctx1); |
| std::vector<Context> arg_grad_ctxes(arg_grad_store.size()); |
| std::transform(arg_grad_store.begin(), arg_grad_store.end(), arg_grad_ctxes.begin(), get_ctx2); |
| std::vector<Context> aux_state_ctxes(aux_states.size()); |
| std::transform(aux_states.begin(), aux_states.end(), aux_state_ctxes.begin(), get_ctx1); |
| |
| nnvm::Graph g = InitGraph(symbol, default_ctx, ctx_map, in_arg_ctxes, |
| arg_grad_ctxes, aux_state_ctxes, grad_req_types); |
| |
| // create arg_shapes and arg_dtypes for shape and type inferences |
| const auto& idx = g.indexed_graph(); |
| const auto& mutable_nodes = idx.mutable_input_nodes(); |
| size_t arg_top = 0, aux_top = 0; |
| data_entry_.resize(idx.num_node_entries()); |
| mxnet::ShapeVector arg_shapes; |
| nnvm::DTypeVector arg_dtypes; |
| StorageTypeVector arg_stypes(idx.num_node_entries(), -1); |
| for (size_t i = 0; i < num_forward_inputs_; ++i) { |
| const uint32_t nid = idx.input_nodes().at(i); |
| const std::string& arg_name = idx[nid].source->attrs.name; |
| size_t eid = idx.entry_id(nid, 0); |
| if (mutable_nodes.count(nid)) { |
| CHECK_LT(aux_top, aux_states.size()); |
| data_entry_[eid] = aux_states[aux_top]; |
| arg_shapes.push_back(aux_states[aux_top].shape()); |
| arg_dtypes.push_back(aux_states[aux_top].dtype()); |
| arg_stypes[eid] = aux_states[aux_top].storage_type(); |
| aux_state_map_.emplace(arg_name, aux_states[aux_top]); |
| ++aux_top; |
| } else { |
| CHECK_LT(arg_top, in_args.size()); |
| data_entry_[eid] = in_args[arg_top]; |
| arg_shapes.push_back(in_args[arg_top].shape()); |
| arg_dtypes.push_back(in_args[arg_top].dtype()); |
| arg_stypes[eid] = in_args[arg_top].storage_type(); |
| in_arg_map_.emplace(arg_name, in_args[arg_top]); |
| if (kNullOp != grad_req_types[arg_top]) { |
| auto grad_oid = grad_store_.size() + num_forward_outputs_; |
| auto grad_eid = idx.entry_id(idx.outputs()[grad_oid]); |
| arg_stypes[grad_eid] = arg_grad_store[arg_top].storage_type(); |
| grad_store_.emplace_back(grad_req_types[arg_top], arg_grad_store[arg_top]); |
| arg_grad_map_.emplace(arg_name, arg_grad_store[arg_top]); |
| if (log_verbose_) { |
| LOG(INFO) << "\tassign data entry\t" << grad_eid << " as " |
| << common::stype_string(arg_stypes[grad_eid]) << " (grad)"; |
| } |
| } |
| ++arg_top; |
| } |
| if (log_verbose_) { |
| LOG(INFO) << "\tassign data entry\t" << eid << " as " |
| << common::stype_string(data_entry_[eid].storage_type()) << " (input)"; |
| } |
| } |
| |
| // expand arg_shapes and arg_dtypes to contain backward inputs |
| arg_shapes.resize(idx.input_nodes().size(), mxnet::TShape()); |
| g = InferShape(std::move(g), std::move(arg_shapes), "__shape__"); |
| if (g.GetAttr<size_t>("shape_num_unknown_nodes") != 0U) { |
| this->is_dynamic_ = true; |
| } |
| |
| arg_dtypes.resize(idx.input_nodes().size(), -1); |
| g = InferType(std::move(g), std::move(arg_dtypes), "__dtype__"); |
| if (g.GetAttr<size_t>("dtype_num_unknown_nodes") != 0U) { |
| HandleInferTypeError(num_forward_inputs_, g.indexed_graph(), |
| g.GetAttr<nnvm::DTypeVector>("dtype")); |
| } |
| |
| g.attrs["storage_type"] = std::make_shared<dmlc::any>(std::move(arg_stypes)); |
| g = InferStorageType(std::move(g), StorageTypeVector(), ""); |
| if (g.GetAttr<size_t>("storage_type_num_unknown_nodes") != 0U) { |
| HandleInferStorageTypeError(num_forward_inputs_, g.indexed_graph(), |
| g.GetAttr<StorageTypeVector>("storage_type")); |
| } |
| |
| // Initialize the remaining attributes of the graph. |
| // This function can be called by the regular bind |
| // operation flow as well. |
| FinishInitGraph(symbol, g, shared_exec, feed_dict); |
| } |
| |
| /*! |
| * \brief Initialize in_args, arg_grads, and aux_states |
| * and their data_entry_ of the executor. This function |
| * is called for regular simple_bind flow, i.e. no |
| * shared data arrays are provided. |
| */ |
| void GraphExecutor::InitArguments(const nnvm::IndexedGraph& idx, |
| const mxnet::ShapeVector& inferred_shapes, |
| const nnvm::DTypeVector& inferred_dtypes, |
| const StorageTypeVector& inferred_stypes, |
| const std::vector<Context>& in_arg_ctxes, |
| const std::vector<Context>& arg_grad_ctxes, |
| const std::vector<Context>& aux_state_ctxes, |
| const std::vector<OpReqType>& grad_req_types, |
| std::vector<NDArray>* in_arg_vec, |
| std::vector<NDArray>* arg_grad_vec, |
| std::vector<NDArray>* aux_state_vec) { |
| // initialize in_args, arg_grads, and aux_states |
| // populate grad_store_ |
| data_entry_.resize(idx.num_node_entries()); |
| size_t arg_top = 0, aux_top = 0; |
| const auto& mutable_nodes = idx.mutable_input_nodes(); |
| for (size_t i = 0; i < num_forward_inputs_; ++i) { |
| const uint32_t nid = idx.input_nodes().at(i); |
| const uint32_t eid = idx.entry_id(nid, 0); |
| const mxnet::TShape& inferred_shape = inferred_shapes[eid]; |
| const int inferred_dtype = inferred_dtypes[eid]; |
| const NDArrayStorageType inferred_stype = (NDArrayStorageType) inferred_stypes[eid]; |
| const std::string& arg_name = idx[nid].source->attrs.name; |
| if (mutable_nodes.count(nid)) { // aux_states |
| EmplaceBackZeros(inferred_stype, inferred_shape, aux_state_ctxes[aux_top], |
| inferred_dtype, aux_state_vec); |
| data_entry_[eid] = aux_state_vec->back(); |
| aux_state_map_.emplace(arg_name, aux_state_vec->back()); |
| ++aux_top; |
| if (log_verbose_) { |
| LOG(INFO) << "\tassign aux entry\t" << eid << "\t as " |
| << common::stype_string(inferred_stype); |
| } |
| } else { // in_args |
| EmplaceBackZeros(inferred_stype, inferred_shape, in_arg_ctxes[arg_top], |
| inferred_dtype, in_arg_vec); |
| data_entry_[eid] = in_arg_vec->back(); |
| if (log_verbose_) { |
| LOG(INFO) << "\tassign data entry\t" << eid << "\tas " |
| << common::stype_string(inferred_stype); |
| } |
| // Get the storage type for grad |
| if (kNullOp == grad_req_types[arg_top]) { |
| arg_grad_vec->emplace_back(); |
| } else { |
| // Init based on storage type |
| auto grad_oid = grad_store_.size() + num_forward_outputs_; |
| auto grad_eid = idx.entry_id(idx.outputs()[grad_oid]); |
| auto grad_stype = (NDArrayStorageType) inferred_stypes[grad_eid]; |
| EmplaceBackZeros(grad_stype, inferred_shape, arg_grad_ctxes[arg_top], |
| inferred_dtype, arg_grad_vec); |
| if (log_verbose_) { |
| LOG(INFO) << "\tassign grad entry\t" << grad_eid << "\tas " |
| << common::stype_string(grad_stype); |
| } |
| grad_store_.emplace_back(grad_req_types[arg_top], arg_grad_vec->back()); |
| arg_grad_map_.emplace(arg_name, arg_grad_vec->back()); |
| } |
| in_arg_map_.emplace(arg_name, in_arg_vec->back()); |
| ++arg_top; |
| } |
| } |
| } |
| |
| /*! |
| * \brief Initialize in_args, arg_grads, and aux_states |
| * and their data_entry_ of the executor using |
| * shared_buffer from DataParallelExecutorGroup |
| * and shared_exec if available. |
| */ |
| void GraphExecutor::InitArguments(const nnvm::IndexedGraph& idx, |
| const mxnet::ShapeVector& inferred_shapes, |
| const nnvm::DTypeVector& inferred_dtypes, |
| const StorageTypeVector& inferred_stypes, |
| const std::vector<Context>& in_arg_ctxes, |
| const std::vector<Context>& arg_grad_ctxes, |
| const std::vector<Context>& aux_state_ctxes, |
| const std::vector<OpReqType>& grad_req_types, |
| const std::unordered_set<std::string>& shared_arg_names, |
| const Executor* shared_exec, |
| std::unordered_map<std::string, NDArray>* shared_buffer, |
| std::vector<NDArray>* in_arg_vec, |
| std::vector<NDArray>* arg_grad_vec, |
| std::vector<NDArray>* aux_state_vec) { |
| // initialize in_args, arg_grads, and aux_states and populate grad_store_ |
| data_entry_.resize(idx.num_node_entries()); |
| size_t arg_top = 0, aux_top = 0; |
| const auto& mutable_nodes = idx.mutable_input_nodes(); |
| for (size_t i = 0; i < num_forward_inputs_; ++i) { |
| const uint32_t nid = idx.input_nodes().at(i); |
| const uint32_t eid = idx.entry_id(nid, 0); |
| const mxnet::TShape& inferred_shape = inferred_shapes[eid]; |
| const int inferred_dtype = inferred_dtypes[eid]; |
| const NDArrayStorageType inferred_stype = (NDArrayStorageType) inferred_stypes[eid]; |
| const std::string& arg_name = idx[nid].source->attrs.name; |
| // aux_states |
| if (mutable_nodes.count(nid)) { |
| if (nullptr != shared_exec) { |
| const NDArray& aux_nd = shared_exec->aux_state_map().at(arg_name); |
| CHECK(inferred_stype == kDefaultStorage && aux_nd.storage_type() == kDefaultStorage) |
| << "Non-default storage type detected when creating auxiliary NDArray. The allocated " |
| << "memory of shared_exec.aux_array cannot be reused for argument: " |
| << arg_name << " for the current executor"; |
| CHECK_EQ(inferred_shape, aux_nd.shape()) |
| << "Inferred shape does not match shared_exec.aux_array's shape." |
| " Therefore, the allocated memory for shared_exec.aux_array cannot" |
| " be reused for creating the auxiliary NDArray of the argument: " |
| << arg_name << " for the current executor"; |
| CHECK_EQ(inferred_dtype, aux_nd.dtype()) |
| << "Inferred dtype does not match shared_exec.aux_array's dtype." |
| " Therefore, the allocated memory for shared_exec.aux_array cannot" |
| " be reused for creating the auxiliary NDArray of the argument: " |
| << arg_name << " for the current executor"; |
| aux_state_vec->emplace_back(aux_nd); |
| } else { |
| EmplaceBackZeros(inferred_stype, inferred_shape, aux_state_ctxes[aux_top], |
| inferred_dtype, aux_state_vec); |
| } // if (has_shared_exec) |
| data_entry_[eid] = aux_state_vec->back(); |
| aux_state_map_.emplace(arg_name, aux_state_vec->back()); |
| ++aux_top; |
| } else { // in_args and grad for in_args |
| if (shared_arg_names.count(arg_name)) { // model parameter |
| if (nullptr != shared_exec) { |
| const NDArray& in_arg_nd = shared_exec->in_arg_map().at(arg_name); |
| auto arg_nd_stype = in_arg_nd.storage_type(); |
| // for model parameter, both default storage and row_sparse storage can be shared |
| bool shareable_arg_stype = inferred_stype == kDefaultStorage || |
| inferred_stype == kRowSparseStorage; |
| // try to reuse memory from shared_exec |
| CHECK(shareable_arg_stype) << "Inferred storage type " |
| << common::stype_string(inferred_stype) |
| << " does not support memory sharing with shared_exec.arg_array"; |
| CHECK_EQ(inferred_stype, arg_nd_stype) |
| << "Inferred stype does not match shared_exec.arg_array's stype." |
| " Therefore, the allocated memory for shared_exec.arg_array cannot" |
| " be reused for creating the NDArray of the argument " |
| << arg_name << " for the current executor"; |
| CHECK_EQ(inferred_shape, in_arg_nd.shape()) |
| << "Inferred shape does not match shared_exec.arg_array's shape." |
| " Therefore, the allocated memory for shared_exec.arg_array cannot" |
| " be reused for creating the NDArray of the argument " |
| << arg_name << " for the current executor"; |
| CHECK_EQ(inferred_dtype, in_arg_nd.dtype()) |
| << "Inferred dtype does not match shared_exec.arg_array's dtype." |
| " Therefore, the allocated memory for shared_exec.arg_array cannot" |
| " be reused for creating the NDArray of the argument " |
| << arg_name << " for the current executor"; |
| in_arg_vec->emplace_back(in_arg_nd); |
| } else { |
| // no shared_exec available; allocate a new zero-initialized NDArray |
| EmplaceBackZeros(inferred_stype, inferred_shape, in_arg_ctxes[arg_top], |
| inferred_dtype, in_arg_vec); |
| } |
| // gradient for model parameter |
| if (kNullOp == grad_req_types[arg_top]) { |
| arg_grad_vec->emplace_back(); |
| } else { |
| auto grad_oid = grad_store_.size() + num_forward_outputs_; |
| auto grad_eid = idx.entry_id(idx.outputs()[grad_oid]); |
| auto grad_stype = (NDArrayStorageType) inferred_stypes[grad_eid]; |
| if (nullptr != shared_exec && grad_stype == kDefaultStorage && |
| shared_exec->arg_grad_map().at(arg_name).storage_type() == kDefaultStorage) { |
| // try to reuse memory from shared_exec |
| arg_grad_vec->emplace_back(shared_exec->arg_grad_map().at(arg_name)); |
| } else { |
| // no need to reuse memory from shared_exec for gradient of non-default storage |
| EmplaceBackZeros(grad_stype, inferred_shape, arg_grad_ctxes[arg_top], |
| inferred_dtype, arg_grad_vec); |
| } |
| grad_store_.emplace_back(grad_req_types[arg_top], arg_grad_vec->back()); |
| } |
| } else { // !shared_arg_names.count(arg_name) |
| // model parameter, row_sparse ndarray sharing enabled |
| bool enable_row_sparse_sharing = true; |
| in_arg_vec->emplace_back(ReshapeOrCreate(arg_name, inferred_shape, inferred_dtype, |
| inferred_stype, in_arg_ctxes[arg_top], |
| shared_buffer, enable_row_sparse_sharing)); |
| // gradient for model parameter, row_sparse ndarray sharing disabled |
| if (kNullOp == grad_req_types[arg_top]) { |
| arg_grad_vec->emplace_back(); |
| } else { |
| auto grad_oid = grad_store_.size() + num_forward_outputs_; |
| auto grad_eid = idx.entry_id(idx.outputs()[grad_oid]); |
| auto grad_stype = (NDArrayStorageType) inferred_stypes[grad_eid]; |
| bool enable_row_sparse_sharing = false; |
| arg_grad_vec->emplace_back(ReshapeOrCreate("grad of " + arg_name, inferred_shape, |
| inferred_dtype, grad_stype, |
| arg_grad_ctxes[arg_top], shared_buffer, |
| enable_row_sparse_sharing)); |
| grad_store_.emplace_back(grad_req_types[arg_top], arg_grad_vec->back()); |
| } // if (kNullOp == grad_req_types[arg_top]) |
| } // if (shared_arg_names.count(arg_name)) |
| in_arg_map_.emplace(arg_name, in_arg_vec->back()); |
| if (!arg_grad_vec->back().is_none()) { |
| arg_grad_map_.emplace(arg_name, arg_grad_vec->back()); |
| } |
| data_entry_[eid] = in_arg_vec->back(); |
| ++arg_top; |
| } |
| } |
| } |
| |
| /*! |
| * \brief Finish graph initialization after shape and dtype inferences. |
| * This function is used by both simple_bind and bind flows. |
| */ |
| void GraphExecutor::FinishInitGraph(nnvm::Symbol symbol, |
| nnvm::Graph g, |
| Executor* shared_exec, |
| const nnvm::NodeEntryMap<NDArray>& feed_dict) { |
| const auto& idx = g.indexed_graph(); |
| const auto& vstorage_type = g.GetAttr<StorageTypeVector>("storage_type"); |
| |
| // data entries for output gradients |
| for (size_t j = num_forward_outputs_; j < idx.outputs().size(); ++j) { |
| data_entry_[idx.entry_id(idx.outputs()[j])] = grad_store_[j - num_forward_outputs_].second; |
| } |
| |
| { |
| // memory allocator |
| nnvm::StorageVector arg_storage_id(idx.num_node_entries(), kBadStorageID); |
| for (size_t j = num_forward_outputs_; j < idx.outputs().size(); ++j) { |
| arg_storage_id[idx.entry_id(idx.outputs()[j])] = kExternalStorageID; |
| } |
| for (const auto& kv : feed_dict) { |
| uint32_t eid = idx.entry_id(kv.first); |
| data_entry_[eid] = kv.second; |
| arg_storage_id[eid] = kExternalStorageID; |
| } |
| for (size_t i = 0; i < idx.num_node_entries(); i++) { |
| if (vstorage_type[i] != kDefaultStorage) arg_storage_id[i] = kDynamicStorageID; |
| } |
| g.attrs["storage"] = std::make_shared<dmlc::any>(std::move(arg_storage_id)); |
| g = nnvm::ApplyPass(g, "MXPlanMemory"); |
| } |
| g = DetectInplaceAddTo(g); |
| |
| // log the static memory plan of the graph |
| static bool mem_log_verbose = dmlc::GetEnv("MXNET_MEM_PLAN_VERBOSE_LOGGING", false); |
| if (mem_log_verbose) { |
| common::LogMemoryPlan(g); |
| } |
| |
| g = AttachOpExecs(g); |
| AttachOpResources(g); |
| graph_ = std::move(g); |
| |
| if (shared_exec != nullptr) { |
| this->InitDataEntryMemory(&(dynamic_cast<GraphExecutor*>(shared_exec)->data_pool_)); |
| } else { |
| this->InitDataEntryMemory(nullptr); |
| } |
| |
| { |
| // initialize output arrays |
| auto& idx = graph_.indexed_graph(); |
| for (size_t i = 0; i < num_forward_outputs_; ++i) { |
| auto& e = idx.outputs()[i]; |
| output_arrays_.push_back(data_entry_[idx.entry_id(e)]); |
| } |
| // initialize head gradient array |
| head_grad_array_.resize(symbol.outputs.size()); |
| for (size_t i = num_forward_inputs_; i < idx.input_nodes().size(); ++i) { |
| uint32_t nid = idx.input_nodes().at(i); |
| uint32_t oid = head_grad_map_.at(idx[nid].source); |
| head_grad_array_[oid] = data_entry_[idx.entry_id(nid, 0)]; |
| } |
| } |
| this->InitCachedOps(); |
| this->InitOpSegs(); |
| } |
| |
| /*! |
| * \brief GraphExecutor initializer for simple bind flow in |
| * which only certain input shapes and dtypes are provided by users. |
| * The initializer uses these shapes and dtypes to perform |
| * shape and dtype inferences, and then create NDArrays |
| * to populate data entries of the graph. The created NDArrays |
| * for in_args, arg_grads and aux_states are passed to the |
| * front end to attach the created executor. |
| * In the front end, if the simple_bind flow is triggered by |
| * _bind_ith_exec, the shared data arrays of DataParallelExecutorGroup |
| * and the shared executor are taken into account when creating |
| * NDArrays for in_args, arg_grads, and aux_states, so that already |
| * allocated memory can be reused. |
| */ |
| void GraphExecutor::Init(nnvm::Symbol symbol, |
| const Context& default_ctx, |
| const std::map<std::string, Context>& ctx_map, |
| const std::vector<Context>& in_arg_ctxes, |
| const std::vector<Context>& arg_grad_ctxes, |
| const std::vector<Context>& aux_state_ctxes, |
| const std::unordered_map<std::string, mxnet::TShape>& arg_shape_map, |
| const std::unordered_map<std::string, int>& arg_dtype_map, |
| const std::unordered_map<std::string, int>& arg_stype_map, |
| const std::vector<OpReqType>& grad_req_types, |
| const std::unordered_set<std::string>& shared_arg_names, |
| std::vector<NDArray>* in_arg_vec, |
| std::vector<NDArray>* arg_grad_vec, |
| std::vector<NDArray>* aux_state_vec, |
| std::unordered_map<std::string, NDArray>* shared_buffer, |
| Executor* shared_exec, |
| const nnvm::NodeEntryMap<NDArray>& feed_dict) { |
| nnvm::Graph g = InitGraph(symbol, default_ctx, ctx_map, in_arg_ctxes, arg_grad_ctxes, |
| aux_state_ctxes, grad_req_types); |
| |
| // The following shape and dtype inference and argument initialization |
| // is for simple_bind only. The regular bind flow handles this |
| // differently. |
| |
| // Initialize arg_shapes and arg_dtypes for shape and type inferences. |
| // It contains all in_args and aux_states' shapes and types in a certain order. |
| const nnvm::IndexedGraph& idx = g.indexed_graph(); |
| mxnet::ShapeVector arg_shapes(idx.input_nodes().size(), mxnet::TShape()); |
| nnvm::DTypeVector arg_dtypes(idx.input_nodes().size(), -1); |
| StorageTypeVector arg_stypes(idx.input_nodes().size(), kUndefinedStorage); |
| for (size_t i = 0; i < num_forward_inputs_; ++i) { |
| const uint32_t nid = idx.input_nodes().at(i); |
| const std::string& name = idx[nid].source->attrs.name; |
| auto it1 = arg_shape_map.find(name); |
| if (arg_shape_map.end() != it1) { |
| arg_shapes[i] = it1->second; |
| } |
| auto it2 = arg_dtype_map.find(name); |
| if (arg_dtype_map.end() != it2) { |
| arg_dtypes[i] = it2->second; |
| } |
| auto it3 = arg_stype_map.find(name); |
| if (arg_stype_map.end() != it3) { |
| arg_stypes[i] = it3->second; |
| } |
| } |
| g = InferShape(std::move(g), std::move(arg_shapes), "__shape__"); |
| if (g.GetAttr<size_t>("shape_num_unknown_nodes") != 0U) { |
| HandleInferShapeError(num_forward_inputs_, g.indexed_graph(), |
| g.GetAttr<mxnet::ShapeVector>("shape")); |
| } |
| |
| g = InferType(std::move(g), std::move(arg_dtypes), "__dtype__"); |
| if (g.GetAttr<size_t>("dtype_num_unknown_nodes") != 0U) { |
| HandleInferTypeError(num_forward_inputs_, g.indexed_graph(), |
| g.GetAttr<nnvm::DTypeVector>("dtype")); |
| } |
| |
| g = InferStorageType(std::move(g), std::move(arg_stypes), "__storage_type__"); |
| if (g.GetAttr<size_t>("storage_type_num_unknown_nodes") != 0U) { |
| HandleInferStorageTypeError(num_forward_inputs_, g.indexed_graph(), |
| g.GetAttr<StorageTypeVector>("storage_type")); |
| } |
| |
| // Create in_args, arg_grads, and aux_states using |
| // the inferred shapes and dtypes. |
| if (nullptr == shared_buffer) { // regular simple bind |
| InitArguments(idx, g.GetAttr<mxnet::ShapeVector>("shape"), |
| g.GetAttr<nnvm::DTypeVector>("dtype"), |
| g.GetAttr<StorageTypeVector>("storage_type"), |
| in_arg_ctxes, arg_grad_ctxes, aux_state_ctxes, |
| grad_req_types, in_arg_vec, arg_grad_vec, aux_state_vec); |
| } else { // simple bind using shared data arrays and shared_exec |
| InitArguments(idx, g.GetAttr<mxnet::ShapeVector>("shape"), |
| g.GetAttr<nnvm::DTypeVector>("dtype"), |
| g.GetAttr<StorageTypeVector>("storage_type"), |
| in_arg_ctxes, arg_grad_ctxes, aux_state_ctxes, |
| grad_req_types, shared_arg_names, shared_exec, |
| shared_buffer, in_arg_vec, arg_grad_vec, aux_state_vec); |
| } |
| // The above shape and dtype inference and argument initialization |
| // is for simple_bind only. The regular bind flow handles this |
| // differently. |
| |
| // Initialize the remaining attributes of the graph. |
| // This function can be called by the regular bind |
| // operation flow as well. |
| FinishInitGraph(symbol, g, shared_exec, feed_dict); |
| } |
| |
| /*! |
| * \brief Return a new executor with the same symbol and shared memory, |
| * but different input/output shapes. |
| * For runtime reshaping, variable length sequences, etc. |
| * The returned executor shares state with the current one, |
| * and cannot be used in parallel with it. |
| */ |
| Executor* GraphExecutor::Reshape(const bool partial_shaping, |
| const bool allow_up_sizing, |
| const Context& default_ctx, |
| const std::map<std::string, Context>& ctx_map, |
| const std::unordered_map<std::string, mxnet::TShape>& |
| provided_arg_shapes, |
| std::vector<NDArray>* in_args, |
| std::vector<NDArray>* arg_grads, |
| std::vector<NDArray>* aux_states) { |
| nnvm::Graph g; |
| nnvm::Symbol symbol; |
| symbol.outputs = symbol_.outputs; |
| g.outputs = symbol_.outputs; |
| const nnvm::IndexedGraph& idx = g.indexed_graph(); |
| mxnet::ShapeVector arg_shapes(idx.input_nodes().size(), mxnet::TShape()); |
| for (size_t i = 0; i < num_forward_inputs_; ++i) { |
| const uint32_t nid = idx.input_nodes().at(i); |
| const std::string& name = idx[nid].source->attrs.name; |
| auto it = provided_arg_shapes.find(name); |
| if (provided_arg_shapes.end() != it) { |
| arg_shapes[i] = it->second; |
| } |
| } |
| g = InferShape(std::move(g), std::move(arg_shapes), "__shape__"); |
| if (g.GetAttr<size_t>("shape_num_unknown_nodes") != 0U) { |
| this->is_dynamic_ = true; |
| } |
| const mxnet::ShapeVector& shape_vec = g.GetAttr<mxnet::ShapeVector>("shape"); |
| std::vector<OpReqType> grad_req_types; |
| size_t grad_top = 0; |
| const size_t num_args = in_arg_map_.size(); |
| const size_t num_aux = aux_state_map_.size(); |
| in_args->reserve(num_args); |
| grad_req_types.reserve(num_args); |
| arg_grads->reserve(num_args); |
| aux_states->reserve(num_aux); |
| for (uint32_t nid : idx.input_nodes()) { |
| std::string name = idx[nid].source->attrs.name; |
| const mxnet::TShape& new_shape = shape_vec[idx.entry_id(nid, 0)]; |
| if (idx.mutable_input_nodes().count(nid) == 0) { |
| NDArray& arr = in_arg_map_.at(name); |
| auto it = arg_grad_map_.find(name); |
| if (partial_shaping || provided_arg_shapes.count(name) || new_shape == arr.shape()) { |
| if (new_shape.Size() > arr.shape().Size()) { |
| CHECK(allow_up_sizing) << "New shape of arg: " << name << " is larger than original. " |
| << "First making a big executor and then down sizing it " |
| << "is more efficient than the reverse. " |
| << "If you really want to up size, set allow_up_sizing=True " |
| << "to enable allocation of new arrays."; |
| in_args->emplace_back(new_shape, arr.ctx(), false, arr.dtype()); |
| if (it != arg_grad_map_.end()) { |
| NDArray& darr = it->second; |
| arg_grads->emplace_back(new_shape, darr.ctx(), false, darr.dtype()); |
| grad_req_types.push_back(grad_store_.at(grad_top++).first); |
| } else { |
| arg_grads->emplace_back(); |
| grad_req_types.push_back(kNullOp); |
| } |
| } else { |
| in_args->push_back(arr.Reshape(new_shape)); |
| if (it != arg_grad_map_.end()) { |
| NDArray& darr = it->second; |
| arg_grads->push_back(darr.Reshape(new_shape)); |
| grad_req_types.push_back(grad_store_.at(grad_top++).first); |
| } else { |
| arg_grads->emplace_back(); |
| grad_req_types.push_back(kNullOp); |
| } |
| } |
| } else { |
| LOG(FATAL) << "Shape of unspecifie arg: " << name << " changed. " |
| << "This can cause the new executor to not share parameters " |
| << "with the old one. Please check for error in network." |
| << "If this is intended, set partial_shaping=True to suppress this warning."; |
| } |
| } else { |
| NDArray& arr = aux_state_map_.at(name); |
| if (partial_shaping || new_shape == arr.shape()) { |
| if (new_shape.Size() > arr.shape().Size()) { |
| CHECK(allow_up_sizing) << "New shape of arg: " << name << " is larger than original. " |
| << "First making a big executor and then down sizing it " |
| << "is more efficient than the reverse. " |
| << "If you really want to up size, set allow_up_sizing=True " |
| << "to enable allocation of new arrays."; |
| aux_states->emplace_back(new_shape, arr.ctx(), false, arr.dtype()); |
| } else { |
| aux_states->push_back(arr.Reshape(new_shape)); |
| } |
| } else { |
| LOG(FATAL) << "Shape of unspecifie arg: " << name << " changed. " |
| << "This can cause the new executor to not share parameters " |
| << "with the old one. Please check for error in network." |
| << "If this is intended, set partial_shaping=True to suppress this warning."; |
| } |
| } |
| } |
| auto exec = new GraphExecutor(symbol); |
| exec->Init(symbol.Copy(), default_ctx, ctx_map, |
| *in_args, *arg_grads, grad_req_types, *aux_states, |
| this); |
| return exec; |
| } |
| |
| /*! |
| * \brief This function is triggered by both simple_bind |
| * and bind flows. |
| * Setup backward graph, create device and context |
| * attributes in the graph, and calculate the number |
| * of forward nodes. |
| */ |
| Graph GraphExecutor::InitGraph(nnvm::Symbol symbol, |
| const Context& default_ctx, |
| const std::map<std::string, Context>& ctx_map, |
| const std::vector<Context>& in_arg_ctxes, |
| const std::vector<Context>& arg_grad_ctxes, |
| const std::vector<Context>& aux_state_ctxes, |
| const std::vector<OpReqType>& grad_req_types) { |
| // setup gradient |
| nnvm::Graph g = InitFullGraph(symbol, grad_req_types); |
| |
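| // Optionally fuse pointwise ops into single fused GPU kernels. Fusion is |
| // rolled back if it changes the number or topological order of the graph |
| // inputs, and skipped entirely if any input names are duplicated. |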
| #if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC && !defined(_WIN32) |
| if (default_ctx.dev_mask() == Context::kGPU && dmlc::GetEnv("MXNET_USE_FUSION", true)) { |
| nnvm::Graph unoptimized_graph; |
| common::CopyGraph(&unoptimized_graph, g, false); |
| |
| if (common::CheckForInputNameDuplicates(unoptimized_graph.indexed_graph())) { |
| g = exec::FusePointwise(std::move(g), num_forward_outputs_); |
| // Check the topological order of inputs |
| const auto &original_inputs = unoptimized_graph.indexed_graph().input_nodes(); |
| const auto &new_inputs = g.indexed_graph().input_nodes(); |
| if (original_inputs.size() != new_inputs.size()) { |
| LOG(WARNING) |
| << "Number of inputs after fusion does not match original number of inputs. " |
| << "This is most probably a bug. Disabling fusion for this run."; |
| g = unoptimized_graph; |
| } else { |
| for (size_t i = 0; i < new_inputs.size(); ++i) { |
| if (unoptimized_graph.indexed_graph()[original_inputs[i]].source->attrs.name != |
| g.indexed_graph()[new_inputs[i]].source->attrs.name) { |
| LOG(WARNING) << "Disabling fusion due to altered topological order of inputs."; |
| g = unoptimized_graph; |
| break; |
| } |
| } |
| } |
| } else { |
| LOG(WARNING) |
| << "Graph contains duplicate names for some of its inputs - fusion is NOT enabled!"; |
| } |
| } |
| #else |
| // Only warn user if MXNET_USE_FUSION env var is explicitly set |
| if (default_ctx.dev_mask() == Context::kGPU && dmlc::GetEnv("MXNET_USE_FUSION", false)) { |
| WarnFusionNotSupported(); |
| } |
| #endif // MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC && !defined(_WIN32) |
| |
| // create "device" and "context" attrs for the graph |
| g = AssignContext(g, default_ctx, ctx_map, |
| in_arg_ctxes, |
| arg_grad_ctxes, |
| aux_state_ctxes, |
| grad_req_types, |
| num_forward_inputs_, |
| num_forward_outputs_); |
| |
| const auto& idx = g.indexed_graph(); |
| // get number of nodes used in forward pass |
| num_forward_nodes_ = 0; |
| for (size_t i = 0; i < num_forward_outputs_; ++i) { |
| num_forward_nodes_ = std::max( |
| num_forward_nodes_, static_cast<size_t>(idx.outputs()[i].node_id + 1)); |
| } |
| return g; |
| } |
| |
| // initialize the memory of each data entry |
| void GraphExecutor::InitDataEntryMemory(std::vector<NDArray>* shared_pool) { |
| using nnvm::DTypeVector; |
| using mxnet::ShapeVector; |
| using nnvm::StorageVector; |
| // get the graph |
| const auto& idx = graph_.indexed_graph(); |
| // get the storage |
| const auto& vdtype = graph_.GetAttr<DTypeVector>("dtype"); |
| const auto& vshape = graph_.GetAttr<mxnet::ShapeVector>("shape"); |
| const auto& vstorage = graph_.GetAttr<StorageVector>("storage_id"); |
| const auto& vstorage_type = graph_.GetAttr<StorageTypeVector>("storage_type"); |
| const auto& vctx = graph_.GetAttr<ContextVector>("context"); |
| CHECK_EQ(idx.num_node_entries(), vshape.size()); |
| CHECK_EQ(idx.num_node_entries(), vdtype.size()); |
| CHECK_EQ(idx.num_node_entries(), vstorage.size()); |
| CHECK_EQ(data_entry_.size(), vshape.size()); |
| std::vector<Context> data_context(idx.num_node_entries()); |
| std::vector<NDArrayStorageType> data_storage_type(idx.num_node_entries(), kUndefinedStorage); |
| for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) { |
| for (uint32_t i = 0; i < idx[nid].source->num_outputs(); ++i) { |
| auto eid = idx.entry_id(nid, i); |
| data_context[eid] = vctx[nid]; |
| CHECK_NE(vstorage_type[eid], kUndefinedStorage); |
| data_storage_type[eid] = (NDArrayStorageType) vstorage_type[eid]; |
| } |
| } |
| |
| // information about the pool |
| struct PoolEntry { |
| Context ctx; |
| size_t bytes; |
| NDArrayStorageType stype; |
| }; |
| std::vector<PoolEntry> pool_info; |
| |
| // assign array to head gradient |
| for (size_t i = num_forward_inputs_; i < idx.input_nodes().size(); ++i) { |
| uint32_t nid = idx.input_nodes().at(i); |
| uint32_t oid = head_grad_map_.at(idx[nid].source); |
| uint32_t eid = idx.entry_id(idx.outputs()[oid]); |
| NDArrayStorageType stype = (NDArrayStorageType) vstorage_type[eid]; |
| bool unknown_shape = !shape_is_known(vshape[eid]); |
| CHECK_NE(vdtype[eid], -1); |
| auto data_eid = idx.entry_id(nid, 0); |
| // initialize based on storage_type |
| if (stype != kDefaultStorage) { |
| data_entry_[data_eid] = NDArray(stype, vshape[eid], data_context[eid], true, vdtype[eid]); |
| } else if (!unknown_shape) { |
| data_entry_[data_eid] = NDArray(vshape[eid], data_context[eid], false, vdtype[eid]); |
| } else { |
| data_entry_[data_eid] = NDArray(data_context[eid], vdtype[eid]); |
| } |
| if (log_verbose_) { |
| LOG(INFO) << "\tinit head_grad entry\t" << data_eid << "\tas " |
| << common::stype_string(stype); |
| } |
| } |
| // get maximum bytes in each pool |
| for (size_t i = 0; i < vshape.size(); ++i) { |
| if (!data_entry_[i].is_none()) continue; |
| size_t shape_size = 0; |
| if (shape_is_known(vshape[i])) { |
| shape_size = vshape[i].Size(); |
| } |
| size_t bytes = shape_size * mshadow::mshadow_sizeof(vdtype[i]); |
| int storage_id = vstorage[i]; |
| // skip pool allocation for kBadStorageID, kExternalStorageID and kDynamicStorageID |
| if (storage_id < 0) continue; |
| size_t sid = static_cast<size_t>(storage_id); |
| if (sid >= pool_info.size()) { |
| pool_info.resize(sid + 1, PoolEntry{Context::CPU(), size_t(0), kUndefinedStorage}); |
| } |
| PoolEntry& info = pool_info[sid]; |
| if (info.bytes == 0) { |
| info = PoolEntry{data_context[i], bytes, data_storage_type[i]}; |
| } else { |
| info.bytes = std::max(info.bytes, bytes); |
| } |
| } |
| // construct the re-use pool, if needed |
| std::multimap<size_t, NDArray> free_pool; |
| if (shared_pool != nullptr) { |
| for (const NDArray& nd : *shared_pool) { |
| size_t bytes = 0; |
| if (shape_is_known(nd.shape())) { |
| bytes = nd.shape().Size() * mshadow::mshadow_sizeof(nd.dtype()); |
| } |
| free_pool.insert(std::make_pair(bytes, nd)); |
| } |
| } |
| // remake the data pool |
| data_pool_.clear(); |
| data_pool_.resize(pool_info.size()); |
| |
| // sort the pool info in descending order of size before allocating memory |
| std::vector<size_t> sorted_pool_index; |
| for (size_t i = 0; i < pool_info.size(); i++) { |
| sorted_pool_index.push_back(i); |
| } |
| auto pool_comparator = [&pool_info](size_t lhs, size_t rhs){ |
| return pool_info[lhs].bytes > pool_info[rhs].bytes; |
| }; |
| std::sort(sorted_pool_index.begin(), sorted_pool_index.end(), pool_comparator); |
| |
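| // Greedy allocation: for each pool entry (largest first), reuse the smallest |
| // free NDArray on the same context that is large enough; otherwise allocate |
| // a new array and, if present, register it in the shared pool. |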
| for (size_t i : sorted_pool_index) { |
| const Context& ctx = pool_info[i].ctx; |
| size_t bytes = pool_info[i].bytes; |
| bool allocated = false; |
| for (auto it = free_pool.lower_bound(bytes); it != free_pool.end(); ++it) { |
| if (it->second.ctx() == ctx && it->first >= bytes) { |
| data_pool_[i] = it->second; |
| free_pool.erase(it); |
| allocated = true; |
| break; |
| } |
| } |
| if (!allocated) { |
| size_t nword = (bytes + 3) / 4; |
| CHECK_LE(nword, std::numeric_limits<nnvm::dim_t>::max()); |
| // allocate float arrays |
| mxnet::TShape shape{static_cast<nnvm::dim_t>(nword)}; |
| // TODO(junwu): adding delay_alloc=true to create nd |
| // is a temporary solution. |
| NDArray nd(shape, ctx, true); |
| data_pool_[i] = nd; |
| // put the newly allocated array into the shared pool |
| if (shared_pool != nullptr) { |
| shared_pool->push_back(nd); |
| } |
| } |
| } |
| CHECK_EQ(data_pool_.size(), pool_info.size()); |
| // assign the data entries |
| for (size_t i = 0; i < data_entry_.size(); ++i) { |
| // avoid pre-allocated arrays |
| if (!data_entry_[i].is_none()) continue; |
| // assign allocated array by storage id |
| int storage_id = vstorage[i]; |
| auto storage_type = (NDArrayStorageType) vstorage_type[i]; |
| if (storage_type == kDefaultStorage) { |
| if (!shape_is_known(vshape[i])) { |
| data_entry_[i] = NDArray(data_context[i], vdtype[i]); |
| } else { |
| CHECK_GE(storage_id, 0) << "Runtime shape ops are not supported yet"; |
| const NDArray& src = data_pool_.at(storage_id); |
| data_entry_[i] = src.AsArray(vshape[i], vdtype[i]); |
| } |
| } else { |
| data_entry_[i] = NDArray(storage_type, vshape[i], data_context[i], |
| true, vdtype[i]); |
| } |
| if (log_verbose_) { |
| LOG(INFO) << "\tinit data entry\t" << i << "\tas " << common::stype_string(storage_type); |
| } |
| } |
| } |
| |
| |
| void GraphExecutor::InitCachedOps() { |
| // get the graph |
| const auto& idx = graph_.indexed_graph(); |
| const auto& vstorage_inplace = |
| graph_.GetAttr<std::vector<int> >("storage_inplace_index"); |
| const auto& op_execs = |
| graph_.GetAttr<OpExecVector>("op_execs"); |
| const auto& vctx = graph_.GetAttr<ContextVector>("context"); |
| const auto& addto_entry = graph_.GetAttr<std::vector<int> >("addto_entry"); |
| const auto& skip_plus_node = graph_.GetAttr<std::vector<int> >("skip_plus_node"); |
| |
| op_nodes_.resize(idx.num_nodes()); |
| // setup the array and requirements. |
| for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) { |
| const auto& inode = idx[nid]; |
| if (inode.source->is_variable()) continue; |
| op_nodes_[nid].opr_name = inode.source->op()->name.c_str(); |
| if (skip_plus_node.at(nid)) { |
| op_nodes_[nid].skip_exec_node = true; continue; |
| } |
| |
| op_nodes_[nid].exec = op_execs[nid]; |
| op_nodes_[nid].ctx = vctx[nid]; |
| auto& exec = op_nodes_[nid].exec; |
| CHECK_EQ(exec->in_array.size(), 0U); |
| CHECK_EQ(exec->out_array.size(), 0U); |
| for (const auto& e : inode.inputs) { |
| exec->in_array.push_back(data_entry_[idx.entry_id(e)]); |
| } |
| // detect inplace requirement |
| for (uint32_t index = 0; index < inode.source->num_outputs(); ++index) { |
| uint32_t eid = idx.entry_id(nid, index); |
| exec->out_array.push_back(data_entry_[eid]); |
| if (addto_entry.at(eid) != 0) { |
| exec->req.push_back(kAddTo); |
| } else if (vstorage_inplace[eid] >= 0) { |
| exec->req.push_back(kWriteInplace); |
| } else if (vstorage_inplace[eid] == -2) { |
| // -2 indicates that the entry is never referenced. |
| exec->req.push_back(kNullOp); |
| } else { |
| exec->req.push_back(kWriteTo); |
| } |
| } |
| } |
| // Note that this modifies the requirement of kWriteInplace |
| for (size_t j = num_forward_outputs_; j < idx.outputs().size(); ++j) { |
| auto& e = idx.outputs()[j]; |
| op_nodes_[e.node_id].exec->req[e.index] = |
| grad_store_[j - num_forward_outputs_].first; |
| } |
| for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) { |
| const auto& inode = idx[nid]; |
| if (inode.source->is_variable()) continue; |
| if (op_nodes_[nid].skip_exec_node) continue; |
| auto& exec = op_nodes_[nid].exec; |
| bool is_async = op_nodes_[nid].exec->exec_type() == ExecType::kAsync; |
| bool is_gpu = op_nodes_[nid].ctx.dev_mask() == gpu::kDevMask; |
| |
| // the variables |
| std::vector<Engine::VarHandle> use_vars, mutate_vars; |
| for (const auto& nd : exec->in_array) { |
| use_vars.push_back(nd.var()); |
| } |
| for (const auto& r : exec->op_ctx.requested) { |
| mutate_vars.push_back(r.var); |
| } |
| for (const auto& nd : exec->out_array) { |
| mutate_vars.push_back(nd.var()); |
| } |
| if (exec->var() != nullptr) { |
| mutate_vars.push_back(exec->var()); |
| } |
| // dedup vars |
| Engine::Get()->DeduplicateVarHandle(&use_vars, &mutate_vars); |
| // all vars include both mutate vars and use vars |
| std::vector<Engine::VarHandle> all_vars(use_vars); |
| std::copy(mutate_vars.begin(), mutate_vars.end(), |
| std::inserter(all_vars, all_vars.end())); |
| // setup exec vars |
| Engine::Get()->PushAsync( |
| [exec](RunContext rctx, Engine::CallbackOnComplete on_complete) { |
| exec->Setup(); |
| on_complete(); |
| }, Context::CPU(), {}, all_vars, FnProperty::kNormal, 0, |
| "SetupExec"); |
| auto exec_fun = [exec, is_async, is_gpu] ( |
| RunContext ctx, Engine::CallbackOnComplete on_complete) { |
| if (is_async) { |
| exec->op_ctx.async_on_complete = on_complete; |
| } |
| exec->Run(ctx, is_gpu); |
| // for non-async ops, signal completion here; async ops call on_complete themselves |
| if (!is_async) { |
| if (is_gpu) { |
| #if MXNET_USE_CUDA |
| // Wait for the GPU kernel to finish. |
| ctx.get_stream<gpu>()->Wait(); |
| #else |
| LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; |
| #endif |
| } |
| on_complete(); |
| } |
| }; |
| // setup the vars |
| op_nodes_[nid].cached_opr = Engine::Get()->NewOperator( |
| exec_fun, use_vars, mutate_vars, FnProperty::kNormal, |
| op_nodes_[nid].opr_name); |
| op_nodes_[nid].mutate_vars = mutate_vars; |
| op_nodes_[nid].use_vars = use_vars; |
| } |
| } |
| |
| void GraphExecutor::InitOpSegs() { |
| size_t total_num_nodes = graph_.indexed_graph().num_nodes(); |
| cached_seg_opr_.clear(); |
| CachedSegOpr p; |
| cached_seg_opr_.resize(total_num_nodes, p); |
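| // A monitor callback needs per-op results, so no bulk segments are created |
| // when a callback is registered. |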
| if (monitor_callback_) return; |
| |
| // Symbolic bulking is set by the same environment variables as Imperative bulking. |
| // Generate segments based on the graph structure |
| bool prefer_bulk_exec_inference = Imperative::PreferBulkExecInference(); |
| // Whether to perform bulk exec for training |
| const profiler::Profiler *prof = profiler::Profiler::Get(); |
| bool prefer_bulk_exec_train = Imperative::PreferBulkExecTrain() |
| && (!prof || !prof->AggregateEnabled()); |
| if (this->is_dynamic_) { |
| prefer_bulk_exec_inference = false; |
| prefer_bulk_exec_train = false; |
| } |
| bool is_training = num_forward_nodes_ != total_num_nodes; |
| |
| if (prefer_bulk_exec_train && is_training) { |
| // Bulk the forward portion of the graph per the bulk segment max size for forward training |
| this->BulkOpSegs(0, num_forward_nodes_, Imperative::BulkExecMaxNodeTrainFwd()); |
| // Bulk the backward portion of the graph per the bulk segment max size for backward training |
| this->BulkOpSegs(num_forward_nodes_, total_num_nodes, Imperative::BulkExecMaxNodeTrainBwd()); |
| } |
| |
| if (prefer_bulk_exec_inference && !is_training) { |
| // Bulk the entire graph as one bulk segment if possible |
| this->BulkOpSegs(0, total_num_nodes, total_num_nodes); |
| } |
| } |
| |
| |
| void GraphExecutor::BulkOpSegs(size_t from_node, size_t up_to_node, size_t segment_num_nodes_max) { |
| size_t topo_start = from_node; |
| size_t segment_node_count = 0; |
| for (size_t nid = from_node; nid < up_to_node; nid++) { |
| auto &node = graph_.indexed_graph()[nid].source; |
| auto &op_node = op_nodes_[nid]; |
| // Variables, such as learned weights, are ignored in the segment_node_count |
| bool ignore_node = node->is_variable() || op_node.skip_exec_node || op_node.exec == nullptr; |
| if (!ignore_node) |
| segment_node_count++; |
| bool can_bulk = ignore_node || op_node.exec->exec_type() == ExecType::kSync; |
| // check if we need to create the segment based on properties of this node |
| if (!can_bulk || nid == up_to_node - 1 || segment_node_count >= segment_num_nodes_max) { |
| // Create a new segment for the previous nodes; also include this node if it's bulkable |
| cached_seg_opr_[topo_start] = this->CreateCachedSegOpr(topo_start, can_bulk ? nid + 1 : nid); |
| topo_start = nid + 1; |
| segment_node_count = 0; |
| } |
| } |
| } |
| |
| void GraphExecutor::ExecuteMonInputCallback(size_t nid) { |
| static const auto& flist_inputs = |
| nnvm::Op::GetAttr<nnvm::FListInputNames>("FListInputNames"); |
| const auto& idx = graph_.indexed_graph(); |
| std::vector<std::string> input_names; |
| OpNode& opnode = op_nodes_[nid]; |
| const auto& inode = idx[nid]; |
| const auto& node = idx[nid].source; |
| if (flist_inputs.count(node->op())) { |
| input_names = flist_inputs[node->op()](node->attrs); |
| } else { |
| for (size_t i = 0; i < node->num_inputs(); ++i) { |
| input_names.emplace_back("input" + std::to_string(i)); |
| } |
| } |
| CHECK_EQ(opnode.exec->in_array.size(), input_names.size()); |
| for (size_t i = 0; i < opnode.exec->in_array.size(); ++i) { |
| if (node->inputs[i].node->is_variable()) { |
| // Monitor variable |
| NDArray *cpy = new NDArray(opnode.exec->in_array[i]); |
| std::string name = node->inputs[i].node->attrs.name; |
| this->monitor_callback_(name.c_str(), reinterpret_cast<void*>(cpy)); |
| } |
| NDArray *cpy = new NDArray(opnode.exec->in_array[i]); |
| std::string name = inode.source->attrs.name + "_" + input_names[i]; |
| this->monitor_callback_(name.c_str(), reinterpret_cast<void*>(cpy)); |
| } |
| } |
| |
| void GraphExecutor::ExecuteMonOutputCallback(size_t nid) { |
| const auto& idx = graph_.indexed_graph(); |
| OpNode& opnode = op_nodes_[nid]; |
| const auto& node = idx[nid].source; |
| for (size_t i = 0; i < opnode.exec->out_array.size(); ++i) { |
| NDArray *cpy = new NDArray(opnode.exec->out_array[i]); |
| nnvm::ObjectPtr node_ptr = std::make_shared<nnvm::Node>(*node); |
| std::string name = GetOutputName({node_ptr, static_cast<uint32_t>(i), 0}); |
| this->monitor_callback_(name.c_str(), reinterpret_cast<void*>(cpy)); |
| } |
| } |
| |
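| // Execute the nodes in the topological range [topo_start, topo_end). Cached bulk |
| // segments are pushed to the engine as a single operation when no monitor callback |
| // is registered; otherwise nodes are pushed one by one, with extra shape handling |
| // for dynamic-shape graphs. |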
| void GraphExecutor::RunOps(bool is_train, size_t topo_start, size_t topo_end) { |
| static auto& finfer_shape = nnvm::Op::GetAttr<mxnet::FInferShape>("FInferShape"); |
| static auto& is_backward = Op::GetAttr<nnvm::TIsBackward>("TIsBackward"); |
| // Update each op's execution context with the current training/grad flags |
| const auto& idx = graph_.indexed_graph(); |
| for (size_t nid = topo_start; nid < topo_end; ++nid) { |
| OpNode& opnode = op_nodes_[nid]; |
| if (opnode.skip_exec_node) continue; |
| const auto& inode = idx[nid]; |
| if (inode.source->is_variable()) continue; |
| opnode.exec->op_ctx.is_train = is_train; |
| opnode.exec->op_ctx.need_grad = need_grad_; |
| } |
| |
| mxnet::ShapeVector rshape = graph_.MoveCopyAttr<mxnet::ShapeVector>("shape"); |
| // Push Ops |
| for (size_t nid = topo_start; nid < topo_end; ++nid) { |
| auto seg_op = cached_seg_opr_[nid]; |
| // Check segments first |
| if (monitor_callback_ == nullptr && seg_op.opr != nullptr && seg_op.topo_end <= topo_end) { |
| bool profiling = profiler::Profiler::Get()->GetState() == profiler::Profiler::kRunning; |
| Engine::Get()->Push(seg_op.opr, seg_op.ctx, 0, profiling); |
| nid = seg_op.topo_end - 1; |
| continue; |
| } |
| // Normal mode |
| const auto& inode = idx[nid]; |
| const uint32_t num_inputs = inode.inputs.size(); |
| const uint32_t num_outputs = inode.source->num_outputs(); |
| if (inode.source->is_variable()) continue; |
| OpNode& opnode = op_nodes_[nid]; |
| if (op_nodes_[nid].skip_exec_node) continue; |
| // Monitor callbacks |
| if (monitor_callback_ && monitor_all_) { |
| ExecuteMonInputCallback(nid); |
| } |
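| // For dynamic-shape graphs the output shapes may still be unknown at this point: |
| // wait for the inputs, re-run the operator's FInferShape (or copy shapes from the |
| // forward node for backward ops), and allocate the outputs before pushing the op. |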
| if (this->is_dynamic_) { |
| const auto &op = inode.source->op(); |
| { |
| for (NDArray &array : opnode.exec->in_array) { |
| array.WaitToRead(); |
| if (!shape_is_known(array.shape())) { |
| array.SetShapeFromChunk(); |
| } |
| } |
| int i = 0; |
| for (NDArray &array : opnode.exec->out_array) { |
| array.WaitToRead(); |
| if (!shape_is_known(array.shape())) { |
| array.SetShapeFromChunk(); |
| } |
| if (!shape_is_known(array.shape())) { |
| mxnet::TShape shape = rshape[idx.entry_id(nid, i)]; |
| if (shape_is_known(shape)) { |
| array.ReshapeAndAlloc(shape); |
| } |
| } |
| ++i; |
| } |
| } |
| if (finfer_shape.count(op)) { |
| mxnet::ShapeVector in_shapes; |
| mxnet::ShapeVector out_shapes; |
| for (NDArray &array : opnode.exec->in_array) { |
| in_shapes.push_back(array.shape()); |
| } |
| for (NDArray &array : opnode.exec->out_array) { |
| out_shapes.push_back(array.shape()); |
| } |
| auto finfer = finfer_shape[op]; |
| try { |
| bool success = finfer(inode.source->attrs, &in_shapes, &out_shapes); |
| CHECK(success) << "InferShape failed in operator " << inode.source->attrs.name; |
| } catch (const std::exception& e) { |
| throw dmlc::Error("Error in operator " + inode.source->attrs.name + ": " + e.what()); |
| } |
| int n_out = out_shapes.size(); |
| for (int i = 0; i < n_out; ++i) { |
| NDArray &array = opnode.exec->out_array[i]; |
| if (!shape_is_known(array.shape())) { |
| array.Init(out_shapes[i]); |
| } |
| } |
| } else if (is_backward.get(inode.source->op(), false) && inode.control_deps.size()) { |
| CHECK_GE(inode.control_deps.size(), 1U) << |
| "BackwardOp need to have control_deps to its forward op"; |
| uint32_t fid = inode.control_deps[0]; |
| const OpNode& fopnode = op_nodes_[fid]; |
| CHECK_EQ(fopnode.exec->in_array.size(), opnode.exec->out_array.size()); |
| int nelem = fopnode.exec->in_array.size(); |
| std::vector<NDArray> &from = fopnode.exec->in_array; |
| std::vector<NDArray> &to = opnode.exec->out_array; |
| for (int i = 0; i < nelem; ++i) { |
| if (!shape_is_known(to[i].shape())) { |
| to[i].Init(from[i].shape()); |
| } |
| } |
| } |
| } |
| opnode.exec->op_ctx.is_train = is_train; |
| opnode.exec->op_ctx.need_grad = need_grad_; |
| if (opnode.exec->exec_type() == ExecType::kCrossDeviceCopy) { |
| CHECK_EQ(inode.inputs.size(), 1U); |
| CHECK_EQ(opnode.exec->in_array.size(), 1U); |
| CHECK_EQ(opnode.exec->out_array.size(), 1U); |
| CopyFromTo(opnode.exec->in_array[0], &(opnode.exec->out_array[0])); |
| } else if (opnode.exec->exec_type() == ExecType::kSubgraphExec) { |
| // If the node contains a subgraph, we can't execute it in the engine. |
| opnode.exec->Run(opnode.exec->op_ctx.run_ctx, false); |
| } else if (opnode.cached_opr != nullptr) { |
| bool profiling = profiler::Profiler::Get()->GetState() == profiler::Profiler::kRunning; |
| Engine::Get()->Push(opnode.cached_opr, opnode.ctx, 0, profiling); |
| if (this->is_dynamic_) { |
| for (NDArray &array : opnode.exec->out_array) { |
| array.WaitToRead(); |
| if (!shape_is_known(array.shape())) { |
| array.SetShapeFromChunk(); |
| } |
| } |
| } |
| } else { |
| LOG(FATAL) << "Not accessed"; |
| } |
| for (uint32_t i = 0; i < num_inputs; ++i) { |
| int eid = idx.entry_id(inode.inputs[i]); |
| if (!shape_is_known(rshape[eid])) { |
| rshape[eid] = opnode.exec->in_array[i].shape(); |
| } |
| } |
| for (uint32_t i = 0; i < num_outputs; ++i) { |
| int eid = idx.entry_id(nid, i); |
| if (!shape_is_known(rshape[eid])) { |
| rshape[eid] = opnode.exec->out_array[i].shape(); |
| } |
| } |
| // Monitor callbacks |
| if (monitor_callback_) { |
| ExecuteMonOutputCallback(nid); |
| } |
| } |
| graph_.attrs["shape"] = std::make_shared<dmlc::any>(rshape); |
| } |
| |
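| // Build a single engine operator that runs all synchronous ops in the topological |
| // range [topo_start, topo_end). Returns an empty CachedSegOpr (opr == nullptr) if |
| // the range is empty, mixes contexts, or contains a non-kSync op. |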
| GraphExecutor::CachedSegOpr GraphExecutor::CreateCachedSegOpr(size_t topo_start, size_t topo_end) { |
| std::vector<Engine::VarHandle> use_vars; |
| std::vector<Engine::VarHandle> mutate_vars; |
| Context *pctx = nullptr; |
| GraphExecutor::CachedSegOpr ret; |
| ret.topo_start = topo_start; |
| ret.topo_end = topo_end; |
| auto& exec_list = ret.exec_list; |
| // invalid segment |
| if (topo_end <= topo_start) { |
| return ret; |
| } |
| std::string opr_names = "["; |
| |
| const auto& idx = graph_.indexed_graph(); |
| for (size_t nid = topo_start; nid < topo_end; ++nid) { |
| std::vector<Engine::VarHandle> all_vars; |
| const auto& inode = idx[nid]; |
| OpNode& op_node = op_nodes_[nid]; |
| if (op_node.skip_exec_node) continue; |
| if (inode.source->is_variable()) continue; |
| if (op_node.exec->exec_type() != ExecType::kSync) { |
| return ret; |
| } |
| if (pctx == nullptr) pctx = &(op_node.ctx); |
| if (*pctx != op_node.ctx) { |
| return ret; |
| } |
| auto& exec = op_nodes_[nid].exec; |
| std::copy(op_node.mutate_vars.begin(), op_node.mutate_vars.end(), |
| std::inserter(mutate_vars, mutate_vars.end())); |
| std::copy(op_node.use_vars.begin(), op_node.use_vars.end(), |
| std::inserter(use_vars, use_vars.end())); |
| ret.exec_list.push_back(exec); |
| opr_names += inode.source->op()->name + ","; |
| } |
| |
| if (pctx == nullptr) |
| return ret; |
| ret.ctx = *pctx; |
| Engine::Get()->DeduplicateVarHandle(&use_vars, &mutate_vars); |
| |
| bool is_gpu = pctx->dev_mask() == gpu::kDevMask; |
| |
| #if CUDA_GRAPHS_AVAILABLE |
| // Provide an initialized `cuda_graphs_exec`, which, when captured |
| // by exec_fun, acts like a static variable inside the mutable closure. |
| cuda_graphs::CudaGraphsExec cuda_graphs_exec(exec_list, is_gpu, opr_names.c_str()); |
| auto exec_fun = [cuda_graphs_exec, exec_list, is_gpu] ( |
| RunContext rctx, Engine::CallbackOnComplete on_complete) mutable { |
| // Run all opr in the sub-graph with CUDA graphs executor if possible |
| cuda_graphs_exec.RunAll(exec_list, rctx, is_gpu); |
| #else |
| auto exec_fun = [exec_list, is_gpu] ( |
| RunContext rctx, Engine::CallbackOnComplete on_complete) { |
| // Run all opr in the sub-graph |
| OpExecutor::RunAll(exec_list, rctx, is_gpu); |
| #endif |
| if (is_gpu) { |
| #if MXNET_USE_CUDA |
| // Wait for the GPU kernels to finish. |
| rctx.get_stream<gpu>()->Wait(); |
| #else |
| LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; |
| #endif |
| } |
| on_complete(); |
| }; |
| opr_names.pop_back(); |
| opr_names += "]"; |
| ret.opr = Engine::Get()->NewOperator( |
| exec_fun, use_vars, mutate_vars, FnProperty::kNormal, |
| opr_names.c_str()); |
| return ret; |
| } |
| |
| // Infer shapes, dtypes, stypes, contexts for the forward graph |
| static nnvm::Graph InferForwardAttrs(nnvm::Graph g, |
| mxnet::ShapeVector arg_shapes, |
| nnvm::DTypeVector arg_dtypes, |
| StorageTypeVector arg_stypes, |
| const Context& default_ctx, |
| const std::map<std::string, Context>& ctx_map, |
| const std::vector<Context>& in_arg_ctxes, |
| const std::vector<Context>& aux_state_ctxes, |
| bool partial_shape = false) { |
| const auto& indexed_graph = g.indexed_graph(); |
| const auto num_forward_inputs = indexed_graph.input_nodes().size(); |
| g = AssignContext(g, default_ctx, ctx_map, in_arg_ctxes, {}, |
| aux_state_ctxes, {}, num_forward_inputs, g.outputs.size()); |
| g = InferShape(std::move(g), std::move(arg_shapes), "__shape__"); |
| if (g.GetAttr<size_t>("shape_num_unknown_nodes") != 0U) { |
| if (!partial_shape) { |
| HandleInferShapeError(num_forward_inputs, indexed_graph, |
| g.GetAttr<mxnet::ShapeVector>("shape")); |
| } |
| } |
| g = InferType(std::move(g), std::move(arg_dtypes), "__dtype__"); |
| if (g.GetAttr<size_t>("dtype_num_unknown_nodes") != 0U) { |
| HandleInferTypeError(num_forward_inputs, indexed_graph, |
| g.GetAttr<nnvm::DTypeVector>("dtype")); |
| } |
| g = InferStorageType(std::move(g), std::move(arg_stypes), "__storage_type__"); |
| if (g.GetAttr<size_t>("storage_type_num_unknown_nodes") != 0U) { |
| HandleInferStorageTypeError(num_forward_inputs, indexed_graph, |
| g.GetAttr<StorageTypeVector>("storage_type")); |
| } |
| return g; |
| } |
| |
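| // Check whether the given subgraph backend should be used: it must not be |
| // explicitly disabled via its "enable" attribute, and if it declares a "context" |
| // attribute, that context must match the default context. |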
| static bool SubgraphBackendCheck(const op::SubgraphBackendPtr& backend, |
| const Context& default_ctx, |
| int verbose = 1) { |
| if (backend->HasAttr("enable") && (backend->GetAttr<bool>("enable") != true)) { |
| if (verbose > 1) { |
| LOG(INFO) << "Subgraph backend " << backend->GetName() |
| << " isn't activated."; |
| } |
| return false; |
| } |
| if (backend->HasAttr("context") && backend->GetAttr<Context>("context") != default_ctx) { |
| if (verbose > 1) { |
| LOG(INFO) << "Subgraph backend " << backend->GetName() |
| << " isn't activated as context mismatch."; |
| } |
| return false; |
| } |
| return true; |
| } |
| |
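| // Check whether a single subgraph property of a backend should be applied: |
| // properties marked "disable" are skipped, and "inference_only" properties are |
| // skipped when any gradient is requested. |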
| static bool SubgraphPropertyCheck(const std::string& backend_name, |
| const op::SubgraphPropertyPtr& prop, bool need_grad, |
| int verbose = 1) { |
| auto full_name = |
| prop->HasAttr("property_name") ? prop->GetAttr<std::string>("property_name") : std::string(); |
| if (prop->HasAttr("disable") && prop->GetAttr<bool>("disable") == true) { |
| LOG(INFO) << "subgraph property " << full_name << " from backend " << backend_name |
| << " is disabled."; |
| return false; |
| } |
| if (prop->HasAttr("inference_only") && prop->GetAttr<bool>("inference_only") == true) { |
| if (need_grad) { |
| if (verbose > 1) { |
| LOG(INFO) << "skip partitioning graph with subgraph property " << full_name |
| << " from backend " << backend_name << " as it requires `grad_req=null`."; |
| } |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| // Given input attr arrays, partition the graph using the given subgraph property. |
| // This is a common helper for the bind and simple_bind flows. |
| static nnvm::Symbol BuildSubgraph(const nnvm::Symbol& src, op::SubgraphPropertyPtr subgraph_prop, |
| const mxnet::ShapeVector& arg_shapes, |
| const nnvm::DTypeVector& arg_dtypes, |
| const StorageTypeVector& arg_stypes, const Context& default_ctx, |
| const std::map<std::string, Context>& ctx_map, |
| const std::vector<Context>& in_arg_ctxes, |
| const std::vector<Context>& aux_state_ctxes) { |
| nnvm::Symbol ret = src.Copy(); |
| nnvm::Graph g; |
| g.outputs = ret.outputs; |
| g = InferForwardAttrs(g, arg_shapes, arg_dtypes, arg_stypes, default_ctx, ctx_map, in_arg_ctxes, |
| aux_state_ctxes, true); |
| subgraph_prop->SetAttr("graph", g); |
| g.attrs["subgraph_property"] = std::make_shared<nnvm::any>(subgraph_prop); |
| g = ApplyPass(std::move(g), "BuildSubgraph"); |
| subgraph_prop->RemoveAttr("graph"); |
| g.attrs.erase("subgraph_property"); |
| ret.outputs = g.outputs; |
| return ret; |
| } |
| |
| // Given input attr dicts, partition the graph using the given backend. |
| // This is for the simple_bind flow. |
| static nnvm::Symbol BuildSubgraph( |
| const nnvm::Symbol& src, const op::SubgraphBackendPtr backend, |
| const std::unordered_map<std::string, mxnet::TShape>& arg_shape_map, |
| const std::unordered_map<std::string, int>& arg_dtype_map, |
| const std::unordered_map<std::string, int>& arg_stype_map, const Context& default_ctx, |
| const std::map<std::string, Context>& ctx_map, std::vector<Context>* in_arg_ctxes, |
| std::vector<Context>* arg_grad_ctxes, std::vector<OpReqType>* grad_req_types, |
| std::vector<Context>* aux_state_ctxes, int verbose = 1) { |
| // setup map for in_arg_ctxes, arg_grad_ctxes, aux_state_ctxes and grad_req_types |
| std::unordered_map<std::string, Context> in_arg_ctx_map; |
| std::unordered_map<std::string, Context> arg_grad_ctx_map; |
| std::unordered_map<std::string, Context> aux_state_ctx_map; |
| std::unordered_map<std::string, OpReqType> grad_req_type_map; |
| |
| auto arg_names = src.ListInputNames(nnvm::Symbol::kReadOnlyArgs); |
| auto aux_names = src.ListInputNames(nnvm::Symbol::kAuxiliaryStates); |
| for (size_t i = 0; i < arg_names.size(); ++i) { |
| const auto& name = arg_names[i]; |
| in_arg_ctx_map[name] = in_arg_ctxes->at(i); |
| arg_grad_ctx_map[name] = arg_grad_ctxes->at(i); |
| grad_req_type_map[name] = grad_req_types->at(i); |
| } |
| |
| for (size_t i = 0; i < aux_names.size(); ++i) { |
| aux_state_ctx_map[aux_names[i]] = aux_state_ctxes->at(i); |
| } |
| |
| bool need_grad = false; |
| for (OpReqType req : *grad_req_types) { |
| if (req != kNullOp) { |
| need_grad = true; |
| break; |
| } |
| } |
| nnvm::Symbol ret = src.Copy(); |
| std::unordered_set<std::string> op_names_set; |
| const auto& backend_name = backend->GetName(); |
| const auto it = op::SubgraphPropertyOpNameSet::Get()->find(backend_name); |
| // assign an op name set to the subgraph property if it has been provided by the user |
| if (it != op::SubgraphPropertyOpNameSet::Get()->end()) { |
| LOG(INFO) << "SubgraphPropertyOpNameSet for subgraph property " << backend_name |
| << " has been assigned a value. Please make sure it is initialized" |
| " only for the testing purpose."; |
| op_names_set = it->second; |
| } |
| |
| const auto& subgraph_prop_list = backend->GetSubgraphProperties(); |
| for (auto& subgraph_prop : subgraph_prop_list) { |
| if (SubgraphPropertyCheck(backend_name, subgraph_prop, need_grad, verbose)) { |
| subgraph_prop->SetAttr("op_names", op_names_set); |
| const std::vector<std::string> input_names = ret.ListInputNames(Symbol::kAll); |
| mxnet::ShapeVector arg_shapes(input_names.size(), mxnet::TShape()); |
| nnvm::DTypeVector arg_dtypes(input_names.size(), -1); |
| StorageTypeVector arg_stypes(input_names.size(), kUndefinedStorage); |
| for (size_t i = 0; i < input_names.size(); ++i) { |
| const auto& input_name = input_names[i]; |
| const auto it1 = arg_shape_map.find(input_name); |
| if (arg_shape_map.end() != it1) { |
| arg_shapes[i] = it1->second; |
| } |
| const auto it2 = arg_dtype_map.find(input_name); |
| if (arg_dtype_map.end() != it2) { |
| arg_dtypes[i] = it2->second; |
| } |
| const auto it3 = arg_stype_map.find(input_name); |
| if (arg_stype_map.end() != it3) { |
| arg_stypes[i] = it3->second; |
| } |
| } |
| ret = BuildSubgraph(ret, subgraph_prop, arg_shapes, arg_dtypes, arg_stypes, default_ctx, |
| ctx_map, *in_arg_ctxes, *aux_state_ctxes); |
| // Reorder in_arg_ctxes, arg_grad_ctxes, aux_state_ctxes and grad_req_types according to |
| // the partitioned symbol's input sequence |
| in_arg_ctxes->clear(); |
| arg_grad_ctxes->clear(); |
| aux_state_ctxes->clear(); |
| grad_req_types->clear(); |
| auto new_arg_names = ret.ListInputNames(nnvm::Symbol::kReadOnlyArgs); |
| auto new_aux_names = ret.ListInputNames(nnvm::Symbol::kAuxiliaryStates); |
| for (const auto& arg_name : new_arg_names) { |
| CHECK(in_arg_ctx_map.count(arg_name)); |
| in_arg_ctxes->push_back(in_arg_ctx_map[arg_name]); |
| arg_grad_ctxes->push_back(arg_grad_ctx_map[arg_name]); |
| grad_req_types->push_back(grad_req_type_map[arg_name]); |
| } |
| for (const auto& arg_name : new_aux_names) { |
| CHECK(aux_state_ctx_map.count(arg_name)); |
| aux_state_ctxes->push_back(aux_state_ctx_map[arg_name]); |
| } |
| } |
| } |
| return ret; |
| } |
| |
| // Given input ndarrays, partition the graph using the given backend. |
| // This is for the bind flow. |
| static nnvm::Symbol BuildSubgraph(const nnvm::Symbol& src, const op::SubgraphBackendPtr backend, |
| const Context& default_ctx, |
| const std::map<std::string, Context>& ctx_map, |
| std::vector<NDArray>* in_args, |
| std::vector<NDArray>* arg_grad_store, |
| std::vector<OpReqType>* grad_req_type, |
| std::vector<NDArray>* aux_states, int verbose = 1) { |
| // setup map for in_args, arg_grad_store, grad_req_type and aux_states |
| std::unordered_map<std::string, NDArray> in_args_map; |
| std::unordered_map<std::string, NDArray> arg_grad_store_map; |
| std::unordered_map<std::string, OpReqType> grad_req_type_map; |
| std::unordered_map<std::string, NDArray> aux_states_map; |
| const std::vector<std::string> arg_names = src.ListInputNames(nnvm::Symbol::kReadOnlyArgs); |
| const std::vector<std::string> aux_names = src.ListInputNames(nnvm::Symbol::kAuxiliaryStates); |
| for (size_t i = 0; i < arg_names.size(); ++i) { |
| in_args_map[arg_names[i]] = in_args->at(i); |
| } |
| |
| for (size_t i = 0; i < aux_names.size(); ++i) { |
| aux_states_map[aux_names[i]] = aux_states->at(i); |
| } |
| |
| if (arg_grad_store->size()) { |
| for (size_t i = 0; i < arg_names.size(); ++i) { |
| const auto& name = arg_names[i]; |
| arg_grad_store_map[name] = arg_grad_store->at(i); |
| grad_req_type_map[name] = grad_req_type->at(i); |
| } |
| } |
| |
| bool need_grad = false; |
| for (OpReqType req : *grad_req_type) { |
| if (req != kNullOp) { |
| need_grad = true; |
| break; |
| } |
| } |
| nnvm::Symbol ret = src.Copy(); |
| std::unordered_set<std::string> op_names_set; |
| const auto& backend_name = backend->GetName(); |
| auto it = op::SubgraphPropertyOpNameSet::Get()->find(backend_name); |
| // assign an op name set to the subgraph property if it has been provided by the user |
| if (it != op::SubgraphPropertyOpNameSet::Get()->end()) { |
| LOG(INFO) << "SubgraphPropertyOpNameSet for subgraph property " << backend_name |
| << " has been assigned a value. Please make sure it is initialized" |
| " only for the testing purpose."; |
| op_names_set = it->second; |
| } |
| const auto& subgraph_prop_list = backend->GetSubgraphProperties(); |
| |
| for (auto subgraph_prop : subgraph_prop_list) { |
| if (SubgraphPropertyCheck(backend_name, subgraph_prop, need_grad, verbose)) { |
| subgraph_prop->SetAttr("op_names", op_names_set); |
| const std::vector<std::string> input_names = ret.ListInputNames(Symbol::kAll); |
| const std::vector<std::string> arg_names = ret.ListInputNames(nnvm::Symbol::kReadOnlyArgs); |
| const std::vector<std::string> aux_names = ret.ListInputNames(nnvm::Symbol::kAuxiliaryStates); |
| CHECK_EQ(arg_names.size(), in_args_map.size()); |
| CHECK_EQ(aux_names.size(), aux_states_map.size()); |
| mxnet::ShapeVector arg_shapes; // all input shapes |
| arg_shapes.reserve(input_names.size()); |
| nnvm::DTypeVector arg_dtypes; // all input dtypes |
| arg_dtypes.reserve(input_names.size()); |
| StorageTypeVector arg_stypes; // all input stypes |
| arg_stypes.reserve(input_names.size()); |
| std::vector<Context> in_arg_ctxes(in_args_map.size()); |
| std::vector<Context> aux_state_ctxes(aux_states_map.size()); |
| |
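| // input_names lists args and aux states in the symbol's input order; walk it once |
| // and pull shape/dtype/stype/context from the matching arg or aux-state NDArray. |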
| size_t i1 = 0, i2 = 0; |
| for (const auto& input_name : input_names) { |
| if (i2 < aux_names.size() && aux_names[i2] == input_name) { |
| const auto &aux_st = aux_states_map[input_name]; |
| arg_shapes.push_back(aux_st.shape()); |
| arg_dtypes.push_back(aux_st.dtype()); |
| arg_stypes.push_back(aux_st.storage_type()); |
| aux_state_ctxes[i2] = aux_st.ctx(); |
| ++i2; |
| } else { |
| CHECK(i1 < arg_names.size()); |
| CHECK_EQ(arg_names[i1], input_name); |
| const auto &in_arg = in_args_map[input_name]; |
| arg_shapes.push_back(in_arg.shape()); |
| arg_dtypes.push_back(in_arg.dtype()); |
| arg_stypes.push_back(in_arg.storage_type()); |
| in_arg_ctxes[i1] = in_arg.ctx(); |
| ++i1; |
| } |
| } |
| |
| ret = BuildSubgraph(ret, subgraph_prop, arg_shapes, arg_dtypes, arg_stypes, default_ctx, |
| ctx_map, in_arg_ctxes, aux_state_ctxes); |
| } |
| } |
| // Reorder in_args, arg_grad_store, grad_req_type and aux_states according to the |
| // partitioned symbol's input sequence |
| const auto new_arg_names = ret.ListInputNames(nnvm::Symbol::kReadOnlyArgs); |
| const auto new_aux_names = ret.ListInputNames(nnvm::Symbol::kAuxiliaryStates); |
| CHECK_EQ(arg_names.size(), new_arg_names.size()); |
| CHECK_EQ(aux_names.size(), new_aux_names.size()); |
| in_args->clear(); |
| aux_states->clear(); |
| for (const auto& arg_name : new_arg_names) { |
| CHECK(in_args_map.count(arg_name)); |
| in_args->push_back(in_args_map[arg_name]); |
| } |
| |
| for (const auto& arg_name : new_aux_names) { |
| CHECK(aux_states_map.count(arg_name)); |
| aux_states->push_back(aux_states_map[arg_name]); |
| } |
| |
| if (arg_grad_store->size()) { |
| arg_grad_store->clear(); |
| grad_req_type->clear(); |
| for (const auto& arg_name : new_arg_names) { |
| arg_grad_store->push_back(arg_grad_store_map[arg_name]); |
| grad_req_type->push_back(grad_req_type_map[arg_name]); |
| } |
| } |
| return ret; |
| } |
| } // namespace exec |
| |
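| // Create a GraphExecutor for the simple_bind flow. If a subgraph backend is active |
| // and passes SubgraphBackendCheck, the symbol is first partitioned with |
| // BuildSubgraph and the returned in_args/arg_grads/aux_states are reordered back |
| // to the caller's original argument order. |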
| Executor *Executor::SimpleBind(nnvm::Symbol symbol, |
| const Context& default_ctx, |
| const std::map<std::string, Context>& group2ctx, |
| const std::vector<Context>& in_arg_ctxes, |
| const std::vector<Context>& arg_grad_ctxes, |
| const std::vector<Context>& aux_state_ctxes, |
| const std::unordered_map<std::string, mxnet::TShape>& arg_shape_map, |
| const std::unordered_map<std::string, int>& arg_dtype_map, |
| const std::unordered_map<std::string, int>& arg_stype_map, |
| const std::vector<OpReqType>& grad_req_types, |
| const std::unordered_set<std::string>& shared_arg_names, |
| std::vector<NDArray>* in_args, |
| std::vector<NDArray>* arg_grads, |
| std::vector<NDArray>* aux_states, |
| std::unordered_map<std::string, NDArray>* shared_buffer, |
| Executor* shared_exec) { |
| auto exec = new exec::GraphExecutor(symbol); |
| bool init = false; |
| if (!exec->subgraph_property().empty()) { |
| static int verbose = dmlc::GetEnv("MXNET_SUBGRAPH_VERBOSE", 1); |
| const auto& backend_name = exec->subgraph_property(); |
| const auto& backend = op::SubgraphBackendRegistry::Get()->GetSubgraphBackend(backend_name); |
| if (exec::SubgraphBackendCheck(backend, default_ctx, verbose)) { |
| if (verbose) LOG(INFO) << "Subgraph backend " << backend_name << " is activated."; |
| std::vector<Context> tmp_in_arg_ctxes = in_arg_ctxes; |
| std::vector<Context> tmp_arg_grad_ctxes = arg_grad_ctxes; |
| std::vector<Context> tmp_aux_state_ctxes = aux_state_ctxes; |
| std::vector<OpReqType> tmp_grad_req_types = grad_req_types; |
| std::vector<NDArray> tmp_in_args; |
| std::vector<NDArray> tmp_arg_grads; |
| std::vector<NDArray> tmp_aux_states; |
| const auto arg_names = symbol.ListInputNames(nnvm::Symbol::kReadOnlyArgs); |
| const auto aux_names = symbol.ListInputNames(nnvm::Symbol::kAuxiliaryStates); |
| symbol = exec::BuildSubgraph(symbol, backend, arg_shape_map, arg_dtype_map, arg_stype_map, |
| default_ctx, group2ctx, &tmp_in_arg_ctxes, &tmp_arg_grad_ctxes, |
| &tmp_grad_req_types, &tmp_aux_state_ctxes, verbose); |
| // Subgraph cannot be recreated from unoptimized symbol |
| delete exec; |
| exec = new exec::GraphExecutor(symbol); |
| exec->Init(symbol.Copy(), default_ctx, group2ctx, tmp_in_arg_ctxes, tmp_arg_grad_ctxes, |
| tmp_aux_state_ctxes, arg_shape_map, arg_dtype_map, arg_stype_map, |
| tmp_grad_req_types, shared_arg_names, &tmp_in_args, &tmp_arg_grads, |
| &tmp_aux_states, shared_buffer, shared_exec); |
| init = true; |
| const auto new_arg_names = symbol.ListInputNames(nnvm::Symbol::kReadOnlyArgs); |
| const auto new_aux_names = symbol.ListInputNames(nnvm::Symbol::kAuxiliaryStates); |
| std::unordered_map<std::string, size_t> new_arg_names_idx_map; |
| std::unordered_map<std::string, size_t> new_aux_names_idx_map; |
| for (size_t i = 0; i != new_arg_names.size(); ++i) { |
| new_arg_names_idx_map[new_arg_names[i]] = i; |
| } |
| for (size_t i = 0; i != new_aux_names.size(); ++i) { |
| new_aux_names_idx_map[new_aux_names[i]] = i; |
| } |
| |
| in_args->reserve(arg_names.size()); |
| arg_grads->reserve(arg_names.size()); |
| for (size_t i = 0; i != arg_names.size(); ++i) { |
| const auto& arg_name = arg_names[i]; |
| const auto& it = new_arg_names_idx_map.find(arg_name); |
| CHECK(it != new_arg_names_idx_map.end()) |
| << "Subgraph doesn't support remove any input node for now."; |
| in_args->emplace_back(std::move(tmp_in_args[it->second])); |
| arg_grads->emplace_back(std::move(tmp_arg_grads[it->second])); |
| } |
| |
| aux_states->reserve(aux_names.size()); |
| for (size_t i = 0; i != aux_names.size(); ++i) { |
| const auto& aux_name = aux_names[i]; |
| const auto& it = new_aux_names_idx_map.find(aux_name); |
| CHECK(it != new_aux_names_idx_map.end()) |
| << "Subgraph doesn't support remove any input node for now."; |
| aux_states->emplace_back(std::move(tmp_aux_states[it->second])); |
| } |
| } |
| } |
| if (!init) { |
| // init without subgraph |
| exec->Init(symbol.Copy(), default_ctx, group2ctx, in_arg_ctxes, arg_grad_ctxes, aux_state_ctxes, |
| arg_shape_map, arg_dtype_map, arg_stype_map, grad_req_types, shared_arg_names, |
| in_args, arg_grads, aux_states, shared_buffer, shared_exec); |
| } |
| return exec; |
| } |
| |
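| // Create a GraphExecutor for the bind flow. If a subgraph backend is active and |
| // passes SubgraphBackendCheck, the symbol is partitioned with BuildSubgraph, which |
| // also reorders the provided NDArray lists to match the partitioned symbol's inputs. |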
| Executor *Executor::Bind(nnvm::Symbol symbol, |
| const Context& default_ctx, |
| const std::map<std::string, Context>& group2ctx, |
| const std::vector<NDArray> &in_args, |
| const std::vector<NDArray> &arg_grad_store, |
| const std::vector<OpReqType> &grad_req_type, |
| const std::vector<NDArray> &aux_states, |
| Executor* shared_exec) { |
| auto exec = new exec::GraphExecutor(symbol); |
| static int verbose = dmlc::GetEnv("MXNET_SUBGRAPH_VERBOSE", 1); |
| std::vector<NDArray> tmp_in_args = in_args; |
| std::vector<NDArray> tmp_arg_grad_store = arg_grad_store; |
| std::vector<OpReqType> tmp_grad_req_type = grad_req_type; |
| std::vector<NDArray> tmp_aux_states = aux_states; |
| |
| if (!exec->subgraph_property().empty()) { |
| const auto& backend_name = exec->subgraph_property(); |
| const auto& backend = op::SubgraphBackendRegistry::Get()->GetSubgraphBackend(backend_name); |
| if (exec::SubgraphBackendCheck(backend, default_ctx, verbose)) { |
| if (verbose) LOG(INFO) << "Subgraph backend " << backend_name << " is activated."; |
| symbol = exec::BuildSubgraph(symbol, backend, default_ctx, group2ctx, &tmp_in_args, |
| &tmp_arg_grad_store, &tmp_grad_req_type, &tmp_aux_states, |
| verbose); |
| // Subgraph cannot be recreated from unoptimized symbol |
| delete exec; |
| exec = new exec::GraphExecutor(symbol); |
| } |
| } |
| exec->Init(symbol.Copy(), default_ctx, group2ctx, tmp_in_args, tmp_arg_grad_store, |
| tmp_grad_req_type, tmp_aux_states, shared_exec); |
| return exec; |
| } |
| } // namespace mxnet |