| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| /*! |
| * \file exec_utils.h |
| * \brief Common utility functions for executors. |
| */ |
| #ifndef MXNET_COMMON_EXEC_UTILS_H_ |
| #define MXNET_COMMON_EXEC_UTILS_H_ |
| |
| #include <nnvm/graph.h> |
| #include <nnvm/pass_functions.h> |
#include <map>
#include <unordered_map>
#include <vector>
#include <string>
#include <utility>
| #include "../common/utils.h" |
| #include "../imperative/exec_pass.h" |
| |
| namespace mxnet { |
| namespace common { |
| |
| #if MXNET_USE_ONEDNN == 1 |
| // We have to make sure it's default storage and default layout. |
| #define DEFAULT_DATA(x) x.IsDefaultData() |
| #else |
| #define DEFAULT_DATA(x) (x.storage_type() == kDefaultStorage) |
| #endif |
| |
| /* |
| * \brief setup default-storage tblobs from source NDArrays. If any source NDArray has non-default |
| * storage, it creates a temp NDArray with default storage and uses the temp tblob. The |
| * function also records the indices of non-default source NDArrays and the indices of |
| * their corresponding temporary NDArrays in the temp array. |
| * \param src list of source NDArray |
| * \param blobs list of tblobs to return |
| * \param temp_src list of source NDArrays which requires temporary default storage representation |
| * \param temp_dst list of temporary destination NDArrays for default storage representation |
| * \param idx_map mapping from indices in source NDArrays to indices in temp_dst. When not set, |
| indices are not recorded |
| * \return true if any source NDArray need to cast storage |
| */ |
| inline bool SetupDefaultBlobsIn(const std::vector<NDArray>& src, |
| const std::vector<NDArray>* bufs, |
| std::vector<TBlob>* blobs, |
| std::vector<NDArray>* temp_src, |
| std::vector<NDArray>* temp_dst, |
| std::unordered_map<uint32_t, uint32_t>* idx_map) { |
| bool require_cast = false; |
| for (size_t i = 0; i < src.size(); i++) { |
| const auto& nd = src[i]; |
| if (!DEFAULT_DATA(nd)) { |
| (*idx_map)[i] = temp_dst->size(); |
| NDArray temp = |
| bufs != nullptr ? bufs->at(i) : NDArray(nd.shape(), nd.ctx(), true, nd.dtype()); |
| #if MXNET_USE_ONEDNN == 1 |
| CHECK(temp.IsDefaultData()); |
| #endif |
| temp_src->emplace_back(nd); |
| temp_dst->emplace_back(temp); |
| blobs->emplace_back(temp.data()); |
| require_cast = true; |
| } else { |
| blobs->push_back(nd.data()); |
| } |
| } |
| return require_cast; |
| } |
| |
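/*!
 * \brief setup default-storage tblobs from output NDArrays, following the same convention
 *        as SetupDefaultBlobsIn. In the oneDNN build, a kWriteInplace request is downgraded
 *        to kWriteTo when the output uses a DNNL layout (a temporary output breaks the
 *        in-place aliasing), and, when no buffer is supplied, a kAddTo output is reordered
 *        to the default layout so the values to accumulate into are preserved.
 * \param src list of output NDArrays
 * \param bufs optional list of pre-allocated default-storage buffers; when non-null,
 *        bufs->at(i) is used as the temporary array
 * \param req operation request types; may be modified as described above
 * \param blobs list of tblobs to return
 * \param temp_src list of output NDArrays that require a temporary default-storage representation
 * \param temp_dst list of temporary destination NDArrays for the default-storage representation
 * \return true if any output NDArray needs a storage cast
 */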
| inline bool SetupDefaultBlobsOut(const std::vector<NDArray>& src, |
| const std::vector<NDArray>* bufs, |
| std::vector<OpReqType>* req, |
| std::vector<TBlob>* blobs, |
| std::vector<NDArray>* temp_src, |
| std::vector<NDArray>* temp_dst) { |
| bool require_cast = false; |
| for (size_t i = 0; i < src.size(); i++) { |
| const auto& nd = src[i]; |
| |
#if MXNET_USE_ONEDNN == 1
    // If the request is write-inplace and the output array does not use the
    // default layout, a temporary output array is generated below, which means
    // the input array and the output array are no longer the same array, so
    // the request type has to be changed to kWriteTo.
    if (req->at(i) == kWriteInplace && nd.IsDNNLData())
      req->at(i) = kWriteTo;
    // We have to make sure it's default storage and default layout.
#endif
| if (!DEFAULT_DATA(nd)) { |
| #if MXNET_USE_ONEDNN == 1 |
| NDArray temp; |
| if (bufs != nullptr) { |
| temp = bufs->at(i); |
| } else if (kAddTo == req->at(i)) { |
| temp = nd.IsDNNLData() ? nd.Reorder2Default() : nd; |
| } else { |
| temp = NDArray(nd.shape(), nd.ctx(), true, nd.dtype()); |
| } |
| CHECK(temp.IsDefaultData()); |
| #else |
| NDArray temp = |
| bufs != nullptr ? bufs->at(i) : NDArray(nd.shape(), nd.ctx(), true, nd.dtype()); |
| #endif |
| temp_src->emplace_back(nd); |
| temp_dst->emplace_back(temp); |
| blobs->emplace_back(temp.data()); |
| require_cast = true; |
| } else { |
| blobs->push_back(nd.data()); |
| } |
| } |
| return require_cast; |
| } |
| |
| /* |
| * \brief setup default-storage tblobs for input and output NDArrays. |
| * If any NDArray has non-default storage, |
| * it creates a temp NDArray with default storage and uses the temp tblob. The |
| * function also records the indices of non-default source NDArrays and the indices of |
| * their corresponding temporary NDArrays in the temp array. |
| */ |
| inline void SetupDefaultBlobsInOut(const std::vector<NDArray>& ndinputs, |
| const std::vector<NDArray>& ndoutputs, |
| const std::vector<NDArray>* in_bufs, |
| const std::vector<NDArray>* out_bufs, |
| std::vector<OpReqType>* req, |
| std::vector<TBlob>* input_blobs, |
| std::vector<TBlob>* output_blobs, |
| std::vector<NDArray>* pre_temp_src, |
| std::vector<NDArray>* pre_temp_dst, |
| std::vector<NDArray>* post_temp_src, |
| std::vector<NDArray>* post_temp_dst, |
| std::unordered_map<uint32_t, uint32_t>* in_temp_idx_map, |
| const std::vector<uint32_t>& mutate_idx) { |
| // populate input blobs |
| SetupDefaultBlobsIn(ndinputs, in_bufs, input_blobs, pre_temp_src, pre_temp_dst, in_temp_idx_map); |
  // populate output blobs; note that temp_src/temp_dst are intentionally swapped here:
  // for outputs, the post-operator copy goes from the temporary (default-storage) array
  // back into the original output NDArray
  SetupDefaultBlobsOut(ndoutputs, out_bufs, req, output_blobs, post_temp_dst, post_temp_src);
| // add mutable inputs to post temp list |
| for (const auto idx : mutate_idx) { |
| auto map_iter = in_temp_idx_map->find(idx); |
| if (map_iter != in_temp_idx_map->end()) { |
| post_temp_src->push_back(pre_temp_dst->at(map_iter->second)); |
| post_temp_dst->push_back(ndinputs[idx]); |
| } |
| } |
| } |
| |
| /* |
| * \brief cast the NDArrays in `src` and store the result in NDArrays in `dst`. |
| * This is only used for storage fallback in executor. |
| * \param src list of source NDArray to cast |
| * \param dst list of destionation NDArray which hold the result of cast_storage operation |
| * \param ctx operator context for cast_storage operation |
| */ |
| inline void CastNonDefaultStorage(const std::vector<NDArray>& src, |
| const std::vector<NDArray>& dst, |
| const OpContext& ctx, |
| const bool is_gpu) { |
| CHECK_EQ(dst.size(), src.size()); |
| for (size_t i = 0; i < src.size(); i++) { |
| if (is_gpu) { |
| #if MXNET_USE_CUDA |
| CastStorageDispatch<gpu>(ctx, src[i], dst[i]); |
| #else |
| LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; |
| #endif |
| } else { |
| CastStorageDispatch<cpu>(ctx, src[i], dst[i]); |
| } |
| } |
| } |
| |
/*! \brief The default type inference function: it picks the type of the first defined
 * output (or, if no output type is defined, the first defined input) and assigns it to
 * every input and output. Returns false if no type is defined at all.
 */
| inline bool SameType(const nnvm::NodeAttrs& attrs, |
| std::vector<int>* iattr, |
| std::vector<int>* oattr) { |
| int def_v = -1; |
| for (int v : *oattr) { |
| if (v != -1) { |
| def_v = v; |
| break; |
| } |
| } |
| if (def_v == -1) { |
| for (int v : *iattr) { |
| if (v != -1) { |
| def_v = v; |
| break; |
| } |
| } |
| } |
| if (def_v == -1) |
| return false; |
| for (int& v : *oattr) { |
| v = def_v; |
| } |
| for (int& v : *iattr) { |
| v = def_v; |
| } |
| return true; |
| } |
| |
/*! \brief The default storage type inference function, which assigns kDefaultStorage to all
 * undefined storage types. If dispatch_mode is still undefined, it is set to
 * DispatchMode::kFCompute when every input and output storage type is kDefaultStorage,
 * and to DispatchMode::kFComputeFallback otherwise.
 */
| inline bool DefaultStorageType(const nnvm::NodeAttrs& attrs, |
| const int dev_mask, |
| DispatchMode* dispatch_mode, |
| std::vector<int>* iattr, |
| std::vector<int>* oattr) { |
| bool fallback = false; |
| for (int& v : *oattr) { |
| if (v == -1) |
| v = kDefaultStorage; |
| if (v != kDefaultStorage) |
| fallback = true; |
| } |
| for (int& v : *iattr) { |
| if (v == -1) |
| v = kDefaultStorage; |
| if (v != kDefaultStorage) |
| fallback = true; |
| } |
| if (*dispatch_mode == DispatchMode::kUndefined) { |
| if (fallback) { |
| *dispatch_mode = DispatchMode::kFComputeFallback; |
| } else { |
| *dispatch_mode = DispatchMode::kFCompute; |
| } |
| } |
| return true; |
| } |
| |
| // string representation of storage id |
| inline std::string storage_str(int storage_id) { |
| std::string str; |
| if (storage_id == -1) { |
| str = "var (-1)"; |
| } else if (storage_id == -2) { |
| str = "external storage (-2)"; |
| } else { |
| str = "group " + std::to_string(storage_id); |
| } |
| return str; |
| } |
| |
| /* log the static memory plan of the graph. Example: |
| node 0 var |
| node 1 _copy |
| input 0: [80,3,224,224] (47040 KB) -> var storage (-1) |
| output 1: [80,3,224,224] (47040 KB) -> group 0 |
| node 2 var |
| node 3 var |
| node 4 var |
| node 5 var |
| node 6 BatchNorm |
| input 1: [80,3,224,224] (47040 KB) -> group 0 |
| input 2: [3] (0 KB) -> var storage (-1) |
| input 3: [3] (0 KB) -> var storage (-1) |
| input 4: [3] (0 KB) -> var storage (-1) |
| input 5: [3] (0 KB) -> var storage (-1) |
| output 6: [80,3,224,224] (47040 KB) -> group 1 |
| output 7: [3] (0 KB) -> group 3 |
| output 8: [3] (0 KB) -> group 2 |
| ... |
| */ |
| inline void LogMemoryPlan(const nnvm::Graph& g) { |
| const auto& idx = g.indexed_graph(); |
| const auto& vshape = g.GetAttr<mxnet::ShapeVector>("shape"); |
| const auto& vtype = g.GetAttr<nnvm::DTypeVector>("dtype"); |
| // find node range |
| uint32_t node_start = 0, node_end = idx.num_nodes(); |
| if (g.attrs.count("node_range")) { |
| const auto& range = g.GetAttr<std::pair<uint32_t, uint32_t> >("node_range"); |
| node_start = range.first; |
| node_end = range.second; |
| } |
| for (uint32_t nid = node_start; nid < node_end; ++nid) { |
| const auto& inode = idx[nid]; |
| if (inode.source->is_variable()) { |
| LOG(INFO) << "node " << nid << " var"; |
| } else { |
| LOG(INFO) << "node " << nid << " " << inode.source->attrs.op->name; |
| for (const auto& e : inode.inputs) { |
| auto eid = idx.entry_id(e); |
| size_t kilo_bytes = vshape[eid].Size() * mshadow::mshadow_sizeof(vtype[eid]) / 1024; |
| LOG(INFO) << "\t\tinput " << eid << ": " << vshape[eid] << " (" << kilo_bytes << " KB)"; |
| } |
| for (uint32_t index = 0; index < inode.source->num_outputs(); ++index) { |
| uint32_t eid = idx.entry_id(nid, index); |
| size_t kilo_bytes = vshape[eid].Size() * mshadow::mshadow_sizeof(vtype[eid]) / 1024; |
| LOG(INFO) << "\t\toutput " << eid << ": " << vshape[eid] << " (" << kilo_bytes << " KB)"; |
| } |
| } |
| } |
| } |
| |
/* log the inferred storage types and dispatch modes of the graph. Example:
| node 0 var |
| node 1 _copy: fcompute |
| input 0: default |
| output 1: default |
| node 2 var |
| node 3 Convolution: fcompute |
| input 1: default |
| input 2: default |
| output 3: default |
| node 4 var |
| node 5 var |
| node 6 var |
| node 7 var |
| node 8 BatchNorm: fcompute |
| input 3: default |
| input 4: default |
| input 5: default |
| input 6: default |
| input 7: default |
| output 8: default |
| output 9: default |
| output 10: default |
| ... |
| */ |
| inline void LogInferStorage(const nnvm::Graph& g) { |
| const auto& idx = g.indexed_graph(); |
| const auto& vstorage_type = g.GetAttr<StorageTypeVector>("storage_type"); |
| const auto& dispatch_modes = g.GetAttr<DispatchModeVector>("dispatch_mode"); |
| uint32_t node_start = 0, node_end = idx.num_nodes(); |
| if (g.attrs.count("node_range")) { |
| const auto& range = g.GetAttr<std::pair<uint32_t, uint32_t> >("node_range"); |
| node_start = range.first; |
| node_end = range.second; |
| } |
| for (uint32_t nid = node_start; nid < node_end; ++nid) { |
| const auto& inode = idx[nid]; |
| if (inode.source->is_variable()) { |
| LOG(INFO) << "node " << nid << " var"; |
| } else { |
| LOG(INFO) << "node " << nid << " " << inode.source->attrs.op->name << ": " |
| << dispatch_mode_string(dispatch_modes[nid]); |
| for (const auto& e : inode.inputs) { |
| auto eid = idx.entry_id(e); |
| LOG(INFO) << "\t\tinput " << eid << ": " << stype_string(vstorage_type[eid]); |
| } |
| for (uint32_t index = 0; index < inode.source->num_outputs(); ++index) { |
| uint32_t eid = idx.entry_id(nid, index); |
| LOG(INFO) << "\t\toutput " << eid << ": " << stype_string(vstorage_type[eid]); |
| } |
| } |
| } |
| } |
| |
| /*! |
| * \brief If the requested ndarray's shape size is less than |
| * the corresponding shared_data_array's shape size and the |
| * storage type is shareable, reuse the memory allocation |
| * in shared_buffer; otherwise, create a zero ndarray. |
| * Shareable storages include both default storage and row_sparse storage |
| * if enable_row_sparse_sharing is `True`, otherwise default storage only. |
| */ |
| inline NDArray ReshapeOrCreate(const std::string& name, |
| const mxnet::TShape& dest_arg_shape, |
| const int dest_arg_dtype, |
| const NDArrayStorageType dest_arg_stype, |
| const Context& ctx, |
| std::unordered_map<std::string, NDArray>* shared_buffer, |
| bool enable_row_sparse_sharing) { |
| bool stype_shareable = dest_arg_stype == kDefaultStorage; |
| if (enable_row_sparse_sharing) { |
| stype_shareable = stype_shareable || dest_arg_stype == kRowSparseStorage; |
| } |
| auto it = shared_buffer->find(name); |
| if (it != shared_buffer->end()) { |
| // check if size is large enough for sharing |
| bool size_shareable = it->second.shape().Size() >= dest_arg_shape.Size(); |
| if (size_shareable && stype_shareable) { // memory can be reused |
| CHECK_EQ(it->second.dtype(), dest_arg_dtype) |
| << "Requested arg array's dtype does not match that of the reusable ndarray"; |
| CHECK_EQ(it->second.storage_type(), dest_arg_stype) |
| << "Requested arg array's stype does not match that of the reusable ndarray"; |
| return it->second.Reshape(dest_arg_shape); |
| } else if (stype_shareable) { |
      LOG(WARNING) << "Bucketing: data " << name << " has shape " << dest_arg_shape
                   << ", which is larger than the already allocated shape " << it->second.shape()
                   << ". Need to re-allocate. Consider setting the default bucket key to "
                   << "the bucket taking the largest input for better memory sharing.";
| // size is not large enough, creating a larger one for sharing |
| // the NDArrays in shared_buffer are guaranteed to be of shareable storages |
| it->second = InitZeros(dest_arg_stype, dest_arg_shape, ctx, dest_arg_dtype); |
| return it->second; |
| } else { |
| // not shareable storage |
| return InitZeros(dest_arg_stype, dest_arg_shape, ctx, dest_arg_dtype); |
| } |
| } else { |
| auto ret = InitZeros(dest_arg_stype, dest_arg_shape, ctx, dest_arg_dtype); |
| if (stype_shareable) { |
| shared_buffer->emplace(name, ret); |
| } |
| return ret; |
| } // if (it != shared_buffer->end()) |
| } |
| |
| /*! |
| * \brief Assign context to the graph. |
| * This is triggered by both simple_bind and bind flows. |
| */ |
| inline nnvm::Graph AssignContext(nnvm::Graph g, |
| const Context& default_ctx, |
| const std::map<std::string, Context>& ctx_map, |
| const std::vector<Context>& in_arg_ctxes, |
| const std::vector<Context>& arg_grad_ctxes, |
| const std::vector<Context>& aux_state_ctxes, |
| const std::vector<OpReqType>& grad_req_types, |
| size_t num_forward_inputs, |
| size_t num_forward_outputs) { |
| const auto& idx = g.indexed_graph(); |
| const auto& mutable_nodes = idx.mutable_input_nodes(); |
  // if no group2ctx map is provided, use the default context for everything.
| if (ctx_map.size() == 0) { |
| g.attrs["context"] = |
| std::make_shared<nnvm::any>(exec::ContextVector(idx.num_nodes(), default_ctx)); |
| for (const auto& x : in_arg_ctxes) { |
| CHECK(x == default_ctx) << "Input array is in " << x |
| << " while binding with ctx=" << default_ctx |
| << ". All arguments must be in global context (" << default_ctx |
| << ") unless group2ctx is specified for cross-device graph."; |
| } |
| for (const auto& x : arg_grad_ctxes) { |
| CHECK(x == default_ctx) << "Gradient array is in " << x |
| << " while binding with ctx=" << default_ctx |
| << ". All gradients must be in global context (" << default_ctx |
| << ") unless group2ctx is specified for cross-device graph."; |
| } |
| return g; |
| } |
| |
| // otherwise, use context assignment. |
| std::map<Context, int> ctx2id; // map ctx to device id |
| std::vector<Context> ctx_list; // index is device id |
| nnvm::DeviceVector device(idx.num_nodes(), -1); // index is node id |
| nnvm::DeviceAssignMap device_map; // map arg name to device id |
| |
| // loop through the user input ctx_map and |
| // populate maps and lists |
| for (auto& kv : ctx_map) { |
| if (ctx2id.count(kv.second) == 0) { // if context has no device id, create one |
| ctx2id[kv.second] = static_cast<int>(ctx_list.size()); // assign device id to ctx |
| ctx_list.push_back(kv.second); // save ctx to the list |
| } |
    // assign the device id of the corresponding ctx to the arg name
| device_map[kv.first] = ctx2id.at(kv.second); |
| } |
| |
| // loop through all the rest of input nodes not specified |
| // in the ctx_map and populate maps and lists |
| size_t arg_top = 0, aux_top = 0; |
| for (size_t i = 0; i < num_forward_inputs; ++i) { |
| const uint32_t nid = idx.input_nodes().at(i); |
| Context ctx; |
| if (mutable_nodes.count(nid)) { // aux node is mutable |
| CHECK_LT(aux_top, aux_state_ctxes.size()); |
| ctx = aux_state_ctxes[aux_top]; |
| ++aux_top; |
| } else { // regular input node is immutable |
| CHECK_LT(arg_top, in_arg_ctxes.size()); |
| ctx = in_arg_ctxes[arg_top]; |
| ++arg_top; |
| } |
| if (ctx2id.count(ctx) == 0) { // if the current ctx is not in the map of ctx and device id |
| ctx2id[ctx] = static_cast<int>(ctx_list.size()); // assign the current ctx with device id |
| ctx_list.push_back(ctx); // save the current ctx in the list |
| } |
| device[nid] = ctx2id.at(ctx); // assign device id to the current node |
| } |
| |
| // loop through backward input nodes and populate maps and lists |
  // the backward input nodes are the gradients of the loss w.r.t. the outputs
| size_t arg_grad_offset = 0; |
  // keep an offset into the arg_grad_ctxes vector,
  // since g.outputs excludes arg_grads whose req == kNullOp
| CHECK_GE(grad_req_types.size(), g.outputs.size() - num_forward_outputs) |
| << "insufficient number of grad_reqs"; |
| for (size_t i = num_forward_outputs; i < g.outputs.size(); ++i, ++arg_grad_offset) { |
| while (grad_req_types[arg_grad_offset] == kNullOp) |
| ++arg_grad_offset; |
| const uint32_t nid = idx.outputs()[i].node_id; |
| Context ctx = arg_grad_ctxes[arg_grad_offset]; |
| if (ctx2id.count(ctx) == 0) { |
| ctx2id[ctx] = static_cast<int>(ctx_list.size()); |
| ctx_list.push_back(ctx); |
| } |
| int devid = ctx2id.at(ctx); |
| if (device[nid] != -1) { |
      CHECK_EQ(device[nid], devid) << "devices assigned to the same output node are not equal";
| } else { |
| device[nid] = devid; |
| } |
| } |
| |
| g.attrs["device"] = std::make_shared<dmlc::any>(std::move(device)); |
| g = nnvm::pass::PlaceDevice(g, "__ctx_group__", device_map, "_CrossDeviceCopy"); |
| const auto& assigned_devices = g.GetAttr<nnvm::DeviceVector>("device"); |
| |
  exec::ContextVector vcontext;
  for (auto devid : assigned_devices) {
    if (devid == -1) {
      vcontext.push_back(default_ctx);
    } else {
      vcontext.push_back(ctx_list[devid]);
    }
  }
| |
  // after device planning, check again that the device assigned to each
  // gradient node matches the context of the corresponding gradient array
| auto& new_idx = g.indexed_graph(); |
| arg_grad_offset = 0; |
| for (size_t i = num_forward_outputs; i < g.outputs.size(); ++i, ++arg_grad_offset) { |
| while (grad_req_types[arg_grad_offset] == kNullOp) |
| ++arg_grad_offset; |
| const uint32_t nid = new_idx.outputs()[i].node_id; |
| Context ctx = arg_grad_ctxes[arg_grad_offset]; |
| CHECK(ctx == vcontext[nid]) << "Trying to save gradient to " << ctx |
| << " while its source node \"" << new_idx[nid].source->attrs.name |
| << "\" computes it on " << vcontext[nid] |
| << ". Check your ctx in NDArray allocation."; |
| } |
| |
| g.attrs["context"] = std::make_shared<nnvm::any>(std::move(vcontext)); |
| return g; |
| } |
| |
| /*! |
| * \brief Copy the graph, optionally leaving original Variable nodes. |
| * |
| * \param dst destination graph |
| * \param src source graph being copied |
 * \param copy_variables whether to copy or reuse Variable nodes from the
| * source graph |
| */ |
| void CopyGraph(nnvm::Graph* dst, const nnvm::Graph& src, bool copy_variables); |
| |
| /*! |
| * \brief Check whether graph contains any duplicated names in its inputs. |
| * |
| * \param idx Indexed graph being checked |
| * |
| * \return true if there are no duplicates, false otherwise |
| */ |
| bool CheckForInputNameDuplicates(const nnvm::IndexedGraph& idx); |
| |
| } // namespace common |
| } // namespace mxnet |
| #endif // MXNET_COMMON_EXEC_UTILS_H_ |