/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file exec_utils.h
* \brief Common utility functions for executors.
*/
#ifndef MXNET_COMMON_EXEC_UTILS_H_
#define MXNET_COMMON_EXEC_UTILS_H_
#include <nnvm/graph.h>
#include <nnvm/pass_functions.h>
#include <map>
#include <unordered_map>
#include <vector>
#include <string>
#include <utility>
#include "../common/utils.h"
#include "../imperative/exec_pass.h"
namespace mxnet {
namespace common {
#if MXNET_USE_ONEDNN == 1
// We have to make sure it's default storage and default layout.
#define DEFAULT_DATA(x) x.IsDefaultData()
#else
#define DEFAULT_DATA(x) (x.storage_type() == kDefaultStorage)
#endif
/*
 * \brief Set up default-storage tblobs from source NDArrays. If any source NDArray has
 *        non-default storage, it creates a temp NDArray with default storage and uses the
 *        temp tblob. The function also records, in idx_map, the index of each non-default
 *        source NDArray and the index of its corresponding temporary NDArray in temp_dst.
 * \param src list of source NDArrays
 * \param bufs optional list of pre-allocated default-storage NDArrays to use as temporaries
 * \param blobs list of tblobs to return
 * \param temp_src list of source NDArrays which require a temporary default-storage representation
 * \param temp_dst list of temporary destination NDArrays for the default-storage representation
 * \param idx_map mapping from indices in source NDArrays to indices in temp_dst. When not set,
 *        indices are not recorded
 * \return true if any source NDArray needs to cast storage
 */
inline bool SetupDefaultBlobsIn(const std::vector<NDArray>& src,
const std::vector<NDArray>* bufs,
std::vector<TBlob>* blobs,
std::vector<NDArray>* temp_src,
std::vector<NDArray>* temp_dst,
std::unordered_map<uint32_t, uint32_t>* idx_map) {
bool require_cast = false;
for (size_t i = 0; i < src.size(); i++) {
const auto& nd = src[i];
if (!DEFAULT_DATA(nd)) {
(*idx_map)[i] = temp_dst->size();
NDArray temp =
bufs != nullptr ? bufs->at(i) : NDArray(nd.shape(), nd.ctx(), true, nd.dtype());
#if MXNET_USE_ONEDNN == 1
CHECK(temp.IsDefaultData());
#endif
temp_src->emplace_back(nd);
temp_dst->emplace_back(temp);
blobs->emplace_back(temp.data());
require_cast = true;
} else {
blobs->push_back(nd.data());
}
}
return require_cast;
}
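/*
 * \brief Set up default-storage tblobs for output NDArrays, mirroring SetupDefaultBlobsIn.
 *        For a non-default output it creates (or reuses from bufs) a default-storage temporary
 *        and records the pair in temp_src/temp_dst so the result can be cast back afterwards.
 *        With oneDNN enabled, a kWriteInplace request on a DNNL-layout output is downgraded to
 *        kWriteTo, because routing the result through a temporary breaks the in-place aliasing.
 */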
inline bool SetupDefaultBlobsOut(const std::vector<NDArray>& src,
const std::vector<NDArray>* bufs,
std::vector<OpReqType>* req,
std::vector<TBlob>* blobs,
std::vector<NDArray>* temp_src,
std::vector<NDArray>* temp_dst) {
bool require_cast = false;
for (size_t i = 0; i < src.size(); i++) {
const auto& nd = src[i];
#if MXNET_USE_ONEDNN == 1
if (req->at(i) == kWriteInplace && nd.IsDNNLData())
// If it's write inplace and the output array doesn't use the default
// layout, we'll generate a temporary output array below, which means
// the input array and the output array are no longer the same array.
// We should change the request type.
req->at(i) = kWriteTo;
// We have to make sure it's default storage and default layout.
#endif
if (!DEFAULT_DATA(nd)) {
#if MXNET_USE_ONEDNN == 1
NDArray temp;
if (bufs != nullptr) {
temp = bufs->at(i);
} else if (kAddTo == req->at(i)) {
temp = nd.IsDNNLData() ? nd.Reorder2Default() : nd;
} else {
temp = NDArray(nd.shape(), nd.ctx(), true, nd.dtype());
}
CHECK(temp.IsDefaultData());
#else
NDArray temp =
bufs != nullptr ? bufs->at(i) : NDArray(nd.shape(), nd.ctx(), true, nd.dtype());
#endif
temp_src->emplace_back(nd);
temp_dst->emplace_back(temp);
blobs->emplace_back(temp.data());
require_cast = true;
} else {
blobs->push_back(nd.data());
}
}
return require_cast;
}
/*
 * \brief Set up default-storage tblobs for input and output NDArrays.
 *        If any NDArray has non-default storage,
 *        it creates a temp NDArray with default storage and uses the temp tblob. The
 *        function also records the indices of non-default source NDArrays and the indices of
 *        their corresponding temporary NDArrays in the temp lists, and adds mutable inputs
 *        that required a cast to the post-op lists so that their updates are written back.
 */
inline void SetupDefaultBlobsInOut(const std::vector<NDArray>& ndinputs,
const std::vector<NDArray>& ndoutputs,
const std::vector<NDArray>* in_bufs,
const std::vector<NDArray>* out_bufs,
std::vector<OpReqType>* req,
std::vector<TBlob>* input_blobs,
std::vector<TBlob>* output_blobs,
std::vector<NDArray>* pre_temp_src,
std::vector<NDArray>* pre_temp_dst,
std::vector<NDArray>* post_temp_src,
std::vector<NDArray>* post_temp_dst,
std::unordered_map<uint32_t, uint32_t>* in_temp_idx_map,
const std::vector<uint32_t>& mutate_idx) {
// populate input blobs
SetupDefaultBlobsIn(ndinputs, in_bufs, input_blobs, pre_temp_src, pre_temp_dst, in_temp_idx_map);
// populate output blobs
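  // note the swapped argument order: post_temp_dst is passed as temp_src and post_temp_src as
  // temp_dst, so post_temp_src ends up holding the dense temporaries while post_temp_dst holds
  // the original non-default outputs; the post-op cast then copies results from the temporaries
  // back into the caller's output arrays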
SetupDefaultBlobsOut(ndoutputs, out_bufs, req, output_blobs, post_temp_dst, post_temp_src);
// add mutable inputs to post temp list
for (const auto idx : mutate_idx) {
auto map_iter = in_temp_idx_map->find(idx);
if (map_iter != in_temp_idx_map->end()) {
post_temp_src->push_back(pre_temp_dst->at(map_iter->second));
post_temp_dst->push_back(ndinputs[idx]);
}
}
}
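/*
 * A minimal sketch of how these helpers are typically combined around an FCompute call.
 * Here `ndinputs`, `ndoutputs`, `req`, `mutate_idx`, `op_ctx`, `is_gpu` and `run_fcompute`
 * are hypothetical placeholders supplied by the caller:
 *
 *   std::vector<TBlob> in_blobs, out_blobs;
 *   std::vector<NDArray> pre_src, pre_dst, post_src, post_dst;
 *   std::unordered_map<uint32_t, uint32_t> in_idx_map;
 *   SetupDefaultBlobsInOut(ndinputs, ndoutputs, nullptr, nullptr, &req,
 *                          &in_blobs, &out_blobs, &pre_src, &pre_dst,
 *                          &post_src, &post_dst, &in_idx_map, mutate_idx);
 *   CastNonDefaultStorage(pre_src, pre_dst, op_ctx, is_gpu);   // densify non-default inputs
 *   run_fcompute(in_blobs, req, out_blobs);                    // compute on dense tblobs
 *   CastNonDefaultStorage(post_src, post_dst, op_ctx, is_gpu); // cast results back
 */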
/*
 * \brief Cast the NDArrays in `src` and store the results in the NDArrays in `dst`.
 *        This is only used for storage fallback in the executor.
 * \param src list of source NDArrays to cast
 * \param dst list of destination NDArrays which hold the results of the cast_storage operation
 * \param ctx operator context for the cast_storage operation
 * \param is_gpu whether to dispatch the cast to the GPU implementation
 */
inline void CastNonDefaultStorage(const std::vector<NDArray>& src,
const std::vector<NDArray>& dst,
const OpContext& ctx,
const bool is_gpu) {
CHECK_EQ(dst.size(), src.size());
for (size_t i = 0; i < src.size(); i++) {
if (is_gpu) {
#if MXNET_USE_CUDA
CastStorageDispatch<gpu>(ctx, src[i], dst[i]);
#else
LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
#endif
} else {
CastStorageDispatch<cpu>(ctx, src[i], dst[i]);
}
}
}
/*! \brief The default type inference function, which assigns all undefined
 * types the type of the first defined output, or, if no output type is
 * defined, the first defined input.
 */
inline bool SameType(const nnvm::NodeAttrs& attrs,
std::vector<int>* iattr,
std::vector<int>* oattr) {
int def_v = -1;
for (int v : *oattr) {
if (v != -1) {
def_v = v;
break;
}
}
if (def_v == -1) {
for (int v : *iattr) {
if (v != -1) {
def_v = v;
break;
}
}
}
if (def_v == -1)
return false;
for (int& v : *oattr) {
v = def_v;
}
for (int& v : *iattr) {
v = def_v;
}
return true;
}
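/*
 * For example (illustrative values): with *iattr = {mshadow::kFloat32, -1} and *oattr = {-1},
 * the first defined entry (kFloat32) is propagated, leaving {kFloat32, kFloat32} and {kFloat32},
 * and the function returns true; if every entry is -1, it returns false without modifying anything.
 */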
/*! \brief The default storage type inference function, which assigns all undefined
 * storage types to kDefaultStorage. If all input and output storage types
 * are kDefaultStorage, DispatchMode::kFCompute is assigned to dispatch_mode. Otherwise,
 * DispatchMode::kFComputeFallback is assigned to dispatch_mode.
 */
inline bool DefaultStorageType(const nnvm::NodeAttrs& attrs,
const int dev_mask,
DispatchMode* dispatch_mode,
std::vector<int>* iattr,
std::vector<int>* oattr) {
bool fallback = false;
for (int& v : *oattr) {
if (v == -1)
v = kDefaultStorage;
if (v != kDefaultStorage)
fallback = true;
}
for (int& v : *iattr) {
if (v == -1)
v = kDefaultStorage;
if (v != kDefaultStorage)
fallback = true;
}
if (*dispatch_mode == DispatchMode::kUndefined) {
if (fallback) {
*dispatch_mode = DispatchMode::kFComputeFallback;
} else {
*dispatch_mode = DispatchMode::kFCompute;
}
}
return true;
}
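/*
 * For example (illustrative values): with *iattr = {kDefaultStorage, -1} and *oattr = {-1},
 * the -1 entries become kDefaultStorage and dispatch_mode is set to DispatchMode::kFCompute;
 * if any entry were kRowSparseStorage or kCSRStorage instead, dispatch_mode would be set to
 * DispatchMode::kFComputeFallback.
 */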
// string representation of storage id
inline std::string storage_str(int storage_id) {
std::string str;
if (storage_id == -1) {
str = "var (-1)";
} else if (storage_id == -2) {
str = "external storage (-2)";
} else {
str = "group " + std::to_string(storage_id);
}
return str;
}
/* log the static memory plan of the graph. Example:
node 0 var
node 1 _copy
input 0: [80,3,224,224] (47040 KB) -> var storage (-1)
output 1: [80,3,224,224] (47040 KB) -> group 0
node 2 var
node 3 var
node 4 var
node 5 var
node 6 BatchNorm
input 1: [80,3,224,224] (47040 KB) -> group 0
input 2: [3] (0 KB) -> var storage (-1)
input 3: [3] (0 KB) -> var storage (-1)
input 4: [3] (0 KB) -> var storage (-1)
input 5: [3] (0 KB) -> var storage (-1)
output 6: [80,3,224,224] (47040 KB) -> group 1
output 7: [3] (0 KB) -> group 3
output 8: [3] (0 KB) -> group 2
...
*/
inline void LogMemoryPlan(const nnvm::Graph& g) {
const auto& idx = g.indexed_graph();
const auto& vshape = g.GetAttr<mxnet::ShapeVector>("shape");
const auto& vtype = g.GetAttr<nnvm::DTypeVector>("dtype");
// find node range
uint32_t node_start = 0, node_end = idx.num_nodes();
if (g.attrs.count("node_range")) {
const auto& range = g.GetAttr<std::pair<uint32_t, uint32_t> >("node_range");
node_start = range.first;
node_end = range.second;
}
for (uint32_t nid = node_start; nid < node_end; ++nid) {
const auto& inode = idx[nid];
if (inode.source->is_variable()) {
LOG(INFO) << "node " << nid << " var";
} else {
LOG(INFO) << "node " << nid << " " << inode.source->attrs.op->name;
for (const auto& e : inode.inputs) {
auto eid = idx.entry_id(e);
size_t kilo_bytes = vshape[eid].Size() * mshadow::mshadow_sizeof(vtype[eid]) / 1024;
LOG(INFO) << "\t\tinput " << eid << ": " << vshape[eid] << " (" << kilo_bytes << " KB)";
}
for (uint32_t index = 0; index < inode.source->num_outputs(); ++index) {
uint32_t eid = idx.entry_id(nid, index);
size_t kilo_bytes = vshape[eid].Size() * mshadow::mshadow_sizeof(vtype[eid]) / 1024;
LOG(INFO) << "\t\toutput " << eid << ": " << vshape[eid] << " (" << kilo_bytes << " KB)";
}
}
}
}
/* log the inferred storage types and dispatch modes of the graph. Example:
node 0 var
node 1 _copy: fcompute
input 0: default
output 1: default
node 2 var
node 3 Convolution: fcompute
input 1: default
input 2: default
output 3: default
node 4 var
node 5 var
node 6 var
node 7 var
node 8 BatchNorm: fcompute
input 3: default
input 4: default
input 5: default
input 6: default
input 7: default
output 8: default
output 9: default
output 10: default
...
*/
inline void LogInferStorage(const nnvm::Graph& g) {
const auto& idx = g.indexed_graph();
const auto& vstorage_type = g.GetAttr<StorageTypeVector>("storage_type");
const auto& dispatch_modes = g.GetAttr<DispatchModeVector>("dispatch_mode");
uint32_t node_start = 0, node_end = idx.num_nodes();
if (g.attrs.count("node_range")) {
const auto& range = g.GetAttr<std::pair<uint32_t, uint32_t> >("node_range");
node_start = range.first;
node_end = range.second;
}
for (uint32_t nid = node_start; nid < node_end; ++nid) {
const auto& inode = idx[nid];
if (inode.source->is_variable()) {
LOG(INFO) << "node " << nid << " var";
} else {
LOG(INFO) << "node " << nid << " " << inode.source->attrs.op->name << ": "
<< dispatch_mode_string(dispatch_modes[nid]);
for (const auto& e : inode.inputs) {
auto eid = idx.entry_id(e);
LOG(INFO) << "\t\tinput " << eid << ": " << stype_string(vstorage_type[eid]);
}
for (uint32_t index = 0; index < inode.source->num_outputs(); ++index) {
uint32_t eid = idx.entry_id(nid, index);
LOG(INFO) << "\t\toutput " << eid << ": " << stype_string(vstorage_type[eid]);
}
}
}
}
/*!
 * \brief If the requested ndarray's shape size is less than or equal to
 * the corresponding shared buffer entry's shape size and the
 * storage type is shareable, reuse the memory allocation
 * in shared_buffer; otherwise, create a zero-initialized ndarray.
 * Shareable storages include both default storage and row_sparse storage
 * if enable_row_sparse_sharing is true, otherwise default storage only.
 */
inline NDArray ReshapeOrCreate(const std::string& name,
const mxnet::TShape& dest_arg_shape,
const int dest_arg_dtype,
const NDArrayStorageType dest_arg_stype,
const Context& ctx,
std::unordered_map<std::string, NDArray>* shared_buffer,
bool enable_row_sparse_sharing) {
bool stype_shareable = dest_arg_stype == kDefaultStorage;
if (enable_row_sparse_sharing) {
stype_shareable = stype_shareable || dest_arg_stype == kRowSparseStorage;
}
auto it = shared_buffer->find(name);
if (it != shared_buffer->end()) {
// check if size is large enough for sharing
bool size_shareable = it->second.shape().Size() >= dest_arg_shape.Size();
if (size_shareable && stype_shareable) { // memory can be reused
CHECK_EQ(it->second.dtype(), dest_arg_dtype)
<< "Requested arg array's dtype does not match that of the reusable ndarray";
CHECK_EQ(it->second.storage_type(), dest_arg_stype)
<< "Requested arg array's stype does not match that of the reusable ndarray";
return it->second.Reshape(dest_arg_shape);
} else if (stype_shareable) {
LOG(WARNING) << "Bucketing: data " << name << " has a shape " << dest_arg_shape
<< ", which is larger than already allocated shape " << it->second.shape()
<< ". Need to re-allocate. Consider putting default bucket key to be "
<< "the bucket taking the largest input for better memory sharing.";
// the existing buffer is not large enough; create a larger one for sharing
// the NDArrays in shared_buffer are guaranteed to be of shareable storages
it->second = InitZeros(dest_arg_stype, dest_arg_shape, ctx, dest_arg_dtype);
return it->second;
} else {
// not shareable storage
return InitZeros(dest_arg_stype, dest_arg_shape, ctx, dest_arg_dtype);
}
} else {
auto ret = InitZeros(dest_arg_stype, dest_arg_shape, ctx, dest_arg_dtype);
if (stype_shareable) {
shared_buffer->emplace(name, ret);
}
return ret;
} // if (it != shared_buffer->end())
}
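/*
 * A minimal usage sketch (the buffer name, shape and dtype are hypothetical). The shared buffer
 * persists across bucketing calls, so a later request with a smaller or equal shape reuses the
 * allocation via Reshape instead of allocating a new array:
 *
 *   std::unordered_map<std::string, NDArray> shared_buffer;
 *   NDArray data = ReshapeOrCreate("data", mxnet::TShape({64, 128}), mshadow::kFloat32,
 *                                  kDefaultStorage, Context::CPU(), &shared_buffer,
 *                                  false);  // enable_row_sparse_sharing
 */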
/*!
* \brief Assign context to the graph.
* This is triggered by both simple_bind and bind flows.
*/
inline nnvm::Graph AssignContext(nnvm::Graph g,
const Context& default_ctx,
const std::map<std::string, Context>& ctx_map,
const std::vector<Context>& in_arg_ctxes,
const std::vector<Context>& arg_grad_ctxes,
const std::vector<Context>& aux_state_ctxes,
const std::vector<OpReqType>& grad_req_types,
size_t num_forward_inputs,
size_t num_forward_outputs) {
const auto& idx = g.indexed_graph();
const auto& mutable_nodes = idx.mutable_input_nodes();
// if no group-to-context map is given, use the default context everywhere.
if (ctx_map.size() == 0) {
g.attrs["context"] =
std::make_shared<nnvm::any>(exec::ContextVector(idx.num_nodes(), default_ctx));
for (const auto& x : in_arg_ctxes) {
CHECK(x == default_ctx) << "Input array is in " << x
<< " while binding with ctx=" << default_ctx
<< ". All arguments must be in global context (" << default_ctx
<< ") unless group2ctx is specified for cross-device graph.";
}
for (const auto& x : arg_grad_ctxes) {
CHECK(x == default_ctx) << "Gradient array is in " << x
<< " while binding with ctx=" << default_ctx
<< ". All gradients must be in global context (" << default_ctx
<< ") unless group2ctx is specified for cross-device graph.";
}
return g;
}
// otherwise, use context assignment.
std::map<Context, int> ctx2id; // map ctx to device id
std::vector<Context> ctx_list; // index is device id
nnvm::DeviceVector device(idx.num_nodes(), -1); // index is node id
nnvm::DeviceAssignMap device_map; // map arg name to device id
// loop through the user input ctx_map and
// populate maps and lists
for (auto& kv : ctx_map) {
if (ctx2id.count(kv.second) == 0) { // if context has no device id, create one
ctx2id[kv.second] = static_cast<int>(ctx_list.size()); // assign device id to ctx
ctx_list.push_back(kv.second); // save ctx to the list
}
// assign the device id of the corresponding ctx to the arg name
device_map[kv.first] = ctx2id.at(kv.second);
}
// loop through the remaining input nodes not specified
// in the ctx_map and populate maps and lists
size_t arg_top = 0, aux_top = 0;
for (size_t i = 0; i < num_forward_inputs; ++i) {
const uint32_t nid = idx.input_nodes().at(i);
Context ctx;
if (mutable_nodes.count(nid)) { // aux node is mutable
CHECK_LT(aux_top, aux_state_ctxes.size());
ctx = aux_state_ctxes[aux_top];
++aux_top;
} else { // regular input node is immutable
CHECK_LT(arg_top, in_arg_ctxes.size());
ctx = in_arg_ctxes[arg_top];
++arg_top;
}
if (ctx2id.count(ctx) == 0) { // if the current ctx is not in the map of ctx and device id
ctx2id[ctx] = static_cast<int>(ctx_list.size()); // assign the current ctx with device id
ctx_list.push_back(ctx); // save the current ctx in the list
}
device[nid] = ctx2id.at(ctx); // assign device id to the current node
}
// loop through backward input nodes and populate maps and lists
// the backward input nodes are the gradients of the loss w.r.t. the outputs
size_t arg_grad_offset = 0;
// keep an offset into the arg_grad_ctxes vector,
// since g.outputs excludes arg_grads whose req == kNullOp
CHECK_GE(grad_req_types.size(), g.outputs.size() - num_forward_outputs)
<< "insufficient number of grad_reqs";
for (size_t i = num_forward_outputs; i < g.outputs.size(); ++i, ++arg_grad_offset) {
while (grad_req_types[arg_grad_offset] == kNullOp)
++arg_grad_offset;
const uint32_t nid = idx.outputs()[i].node_id;
Context ctx = arg_grad_ctxes[arg_grad_offset];
if (ctx2id.count(ctx) == 0) {
ctx2id[ctx] = static_cast<int>(ctx_list.size());
ctx_list.push_back(ctx);
}
int devid = ctx2id.at(ctx);
if (device[nid] != -1) {
CHECK_EQ(device[nid], devid) << "devices assigned to the same output node do not match";
} else {
device[nid] = devid;
}
}
g.attrs["device"] = std::make_shared<dmlc::any>(std::move(device));
g = nnvm::pass::PlaceDevice(g, "__ctx_group__", device_map, "_CrossDeviceCopy");
const auto& assigned_devices = g.GetAttr<nnvm::DeviceVector>("device");
exec::ContextVector vcontext;
for (auto context : assigned_devices) {
if (context == -1) {
vcontext.push_back(default_ctx);
} else {
vcontext.push_back(ctx_list[context]);
}
}
// after device planning, we should check again
// that the assigned device of each gradient node
// matches the context of the corresponding gradient array
auto& new_idx = g.indexed_graph();
arg_grad_offset = 0;
for (size_t i = num_forward_outputs; i < g.outputs.size(); ++i, ++arg_grad_offset) {
while (grad_req_types[arg_grad_offset] == kNullOp)
++arg_grad_offset;
const uint32_t nid = new_idx.outputs()[i].node_id;
Context ctx = arg_grad_ctxes[arg_grad_offset];
CHECK(ctx == vcontext[nid]) << "Trying to save gradient to " << ctx
<< " while its source node \"" << new_idx[nid].source->attrs.name
<< "\" computes it on " << vcontext[nid]
<< ". Check your ctx in NDArray allocation.";
}
g.attrs["context"] = std::make_shared<nnvm::any>(std::move(vcontext));
return g;
}
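/*
 * A minimal sketch of the ctx_map ("group2ctx") argument (the group names are hypothetical):
 * variables whose __ctx_group__ attribute is "dev1" or "dev2" are placed on the mapped devices,
 * and _CrossDeviceCopy nodes are inserted by PlaceDevice at the group boundaries:
 *
 *   std::map<std::string, Context> ctx_map;
 *   ctx_map["dev1"] = Context::CPU();
 *   ctx_map["dev2"] = Context::GPU(0);
 */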
/*!
* \brief Copy the graph, optionally leaving original Variable nodes.
*
* \param dst destination graph
* \param src source graph being copied
 * \param copy_variables whether to copy or reuse Variable nodes from the
 *        source graph
*/
void CopyGraph(nnvm::Graph* dst, const nnvm::Graph& src, bool copy_variables);
/*!
 * \brief Check whether the graph contains any duplicated names in its inputs.
*
* \param idx Indexed graph being checked
*
* \return true if there are no duplicates, false otherwise
*/
bool CheckForInputNameDuplicates(const nnvm::IndexedGraph& idx);
} // namespace common
} // namespace mxnet
#endif // MXNET_COMMON_EXEC_UTILS_H_