| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| /*! |
| * \file plan_memory.cc |
| * \brief Assign memory tag to each of the data entries. |
| */ |
#include <nnvm/graph.h>
#include <nnvm/pass.h>
#include <nnvm/graph_attr_types.h>
#include <nnvm/op_attr_types.h>
#include <mxnet/base.h>
#include <algorithm>
#include <map>
#include <memory>
#include <utility>
#include <vector>
#include "graph_algorithm.h"
#include "../operator/operator_common.h"
| |
| namespace nnvm { |
| namespace pass { |
| |
| /*! |
| * \brief Return the storage in bytes for the corresponding data flag. |
| */ |
| size_t MXGetDTypeSize(const int type_flag) { |
| switch (type_flag) { |
| case mshadow::kUint8: |
| case mshadow::kInt8: |
| case mshadow::kBool: |
| return 1; |
| case mshadow::kFloat16: |
| case mshadow::kBfloat16: |
| case mshadow::kInt16: |
| case mshadow::kUint16: |
| return 2; |
| case mshadow::kFloat32: |
| case mshadow::kInt32: |
| case mshadow::kUint32: |
| return 4; |
| case mshadow::kFloat64: |
| case mshadow::kInt64: |
| case mshadow::kUint64: |
| return 8; |
| default: |
| LOG(FATAL) << "unknown type_flag=" << type_flag; |
| return -1; |
| } |
| } |
| |
| namespace { |
| |
| // simple graph based allocator. |
| class MXGraphAllocator { |
| public: |
| // storage id equals integer. |
| using StorageID = int; |
| |
| // bad storage id |
| static const StorageID kBadStorageID = -1; |
| // external storage id |
| static const StorageID kExternalStorageID = -2; |
| // dynamic storage id |
| static const StorageID kDynamicStorageID = -3; |
| |
| // request a free storage |
| StorageID Request(int dev_id, int dtype, mxnet::TShape shape, uint32_t node_id) { |
| if (!mxnet::shape_is_known(shape)) |
| return kBadStorageID; |
| // search memory block in [size / match_range_, size * match_range_) |
| size_t size = shape.Size() * MXGetDTypeSize(dtype); |
| if (match_range_ == 0) |
| return this->Alloc(dev_id, size); |
| auto begin = free_.lower_bound(size / match_range_); |
| auto mid = free_.lower_bound(size); |
| auto end = free_.upper_bound(size * match_range_); |
| // search for memory blocks larger than requested |
| for (auto it = mid; it != end; ++it) { |
| StorageEntry* e = it->second; |
| if (e->device_id != dev_id) |
| continue; |
| if (node_color_.size() != 0 && node_color_[e->released_by_node] != node_color_[node_id]) |
| continue; |
| // Use exect matching strategy |
| e->max_bytes = std::max(size, e->max_bytes); |
| // find a exact match, erase from map and return |
| free_.erase(it); |
| return e->id; |
| } |
| // then search for memory blocks smaller than requested space |
| for (auto it = mid; it != begin;) { |
| --it; |
| StorageEntry* e = it->second; |
| if (e->device_id != dev_id) |
| continue; |
| if (node_color_.size() != 0 && node_color_[e->released_by_node] != node_color_[node_id]) |
| continue; |
| // Use exect matching strategy |
| e->max_bytes = std::max(size, e->max_bytes); |
| // erase from map and return |
| free_.erase(it); |
| return e->id; |
| } |
| // cannot find anything return a new one. |
| return this->Alloc(dev_id, size); |
| } |
| // release a memory space. |
| void Release(StorageID id, uint32_t node_id) { |
| CHECK_NE(id, kBadStorageID); |
| if (id == kExternalStorageID || id == kDynamicStorageID) |
| return; |
| StorageEntry* e = data_[id].get(); |
| e->released_by_node = node_id; |
| free_.insert({e->max_bytes, e}); |
| } |
| |
| // totoal number of bytes allocated |
| size_t TotalAllocBytes() const { |
| size_t total = 0; |
| for (auto& p : data_) { |
| total += p->max_bytes; |
| } |
| return total; |
| } |
| |
| // constructor |
| explicit MXGraphAllocator(const IndexedGraph* idx, const size_t match_range) : idx_(idx) { |
| this->Init(match_range, dmlc::GetEnv("NNVM_EXEC_NUM_TEMP", 1)); |
| } |
| |
| private: |
| // initialize the graph allocator |
| void Init(const size_t match_range, const uint32_t num_match_color) { |
| match_range_ = match_range; |
| num_match_color_ = num_match_color; |
| if (num_match_color_ > 1) { |
| std::vector<uint32_t> importance(idx_->num_nodes(), 0); |
| for (uint32_t nid = 0; nid < idx_->num_nodes(); ++nid) { |
| if ((*idx_)[nid].source->is_variable()) |
| continue; |
| importance[nid] = 1; |
| } |
| num_match_color_ = pass::MXColorNodeGroup(*idx_, importance, num_match_color_, &node_color_); |
| } |
| } |
| |
| StorageID Alloc(int dev_id, size_t size) { |
| StorageID id = static_cast<StorageID>(data_.size()); |
| std::unique_ptr<StorageEntry> ptr(new StorageEntry()); |
| ptr->id = id; |
| ptr->device_id = dev_id; |
| ptr->max_bytes = size; |
| data_.emplace_back(std::move(ptr)); |
| return id; |
| } |
| // internal storage entry |
| struct StorageEntry { |
| // the id of the entry. |
| StorageID id; |
| // the device id of the storage. |
| int device_id; |
| // maximum size of storage requested. |
| size_t max_bytes{0}; |
| // node index that released it last time |
| uint32_t released_by_node{0}; |
| }; |
| // scale used for rough match |
| size_t match_range_; |
| // whether use color based match algorithm |
| uint32_t num_match_color_{1}; |
| // the size of each dtype |
| std::vector<size_t> dtype_size_dict_; |
| // free list of storage entry |
| std::multimap<size_t, StorageEntry*> free_; |
| // all the storage resources available |
| std::vector<std::unique_ptr<StorageEntry> > data_; |
| // color of nodes in the graph, used for auxiliary policy making. |
| std::vector<uint32_t> node_color_; |
| // internal indexed graph |
| const IndexedGraph* idx_; |
| }; |
| |
/*!
 * \brief Internal method to perform the memory allocation for a graph.
 *
 * Walks the nodes in [node_range.first, node_range.second); for each node it
 * first tries to let outputs share an input's storage (inplace optimization),
 * then requests storage from \p allocator for remaining outputs, and finally
 * releases storage whose reference count drops to zero so later nodes reuse it.
 *
 * \param ret graph providing the "shape", "dtype" and optional "device" attrs.
 * \param idx indexed view of the graph.
 * \param node_range half-open range of node ids to process.
 * \param storage_ptr in/out storage id assigned to each entry.
 * \param storage_inplace_index_ptr out: per entry, the input index whose
 *        storage is reused inplace, -2 if never referenced, -1 otherwise.
 * \param entry_ref_count number of consumers of each entry.
 * \param allocator graph allocator used to obtain and recycle storage.
 * \return the number of entries that could not be statically allocated.
 */
size_t MXAllocMemory(const Graph& ret,
                     const IndexedGraph& idx,
                     const std::pair<uint32_t, uint32_t>& node_range,
                     StorageVector* storage_ptr,
                     std::vector<int>* storage_inplace_index_ptr,
                     const std::vector<uint32_t>& entry_ref_count,
                     MXGraphAllocator* allocator) {
  static auto& finplace_option = Op::GetAttr<FInplaceOption>("FInplaceOption");
  static auto& finplace_identity = Op::GetAttr<FInplaceIdentity>("FInplaceIdentity");
  static auto& fignore_inputs = Op::GetAttr<FIgnoreInputs>("FIgnoreInputs");

  // Get reference
  auto& storage = *storage_ptr;
  auto& storage_inplace_index = *storage_inplace_index_ptr;

  // Get attributes from the graph
  const mxnet::ShapeVector& shape_vec = ret.GetAttr<mxnet::ShapeVector>("shape");
  const DTypeVector& dtype_vec = ret.GetAttr<DTypeVector>("dtype");
  const DeviceVector* device_vec = nullptr;

  if (ret.attrs.count("device") != 0) {
    device_vec = &(ret.GetAttr<DeviceVector>("device"));
  }
  size_t num_not_allocated = 0;
  // outstanding references per storage id (indexed by sid, not entry id)
  std::vector<MXGraphAllocator::StorageID> storage_ref_count(idx.num_node_entries(), 0);

  for (uint32_t nid = node_range.first; nid < node_range.second; ++nid) {
    const auto& inode = idx[nid];
    if (inode.source->is_variable())
      continue;
    // check inplace option: try to let an output share its input's storage
    if (finplace_option.count(inode.source->op()) != 0) {
      auto inplace_pairs = finplace_option[inode.source->op()](inode.source->attrs);
      std::vector<bool> identity;
      if (finplace_identity.count(inode.source->op()) != 0) {
        identity = finplace_identity[inode.source->op()](inode.source->attrs);
        CHECK_EQ(identity.size(), inplace_pairs.size())
            << "FInplaceOption and FInplaceIdentity returned vectors of different "
            << "size for operator " << inode.source->op()->name;
      } else {
        identity = std::vector<bool>(inplace_pairs.size(), false);
      }
      // each input may be taken by at most one inplace output
      std::vector<bool> taken(inode.inputs.size(), false);
      for (size_t ipair = 0; ipair < inplace_pairs.size(); ++ipair) {
        const auto& kv = inplace_pairs[ipair];
        uint32_t eid_out = idx.entry_id(nid, kv.second);
        uint32_t eid_in = idx.entry_id(inode.inputs[kv.first]);
        auto sid_out = storage[eid_out];
        auto sid_in = storage[eid_in];
        bool ignore_all_inputs = (fignore_inputs.count(inode.source->op()) != 0 &&
                                  fignore_inputs[inode.source->op()](inode.source->attrs).size() ==
                                      inode.source->num_inputs());
        // Identity should only be true if shape.Size() and types match
        bool real_identity = identity[ipair] && ndim_is_known(shape_vec[eid_out]) &&
                             ndim_is_known(shape_vec[eid_in]) &&
                             shape_vec[eid_out].Size() == shape_vec[eid_in].Size() &&
                             dtype_vec[eid_out] == dtype_vec[eid_in];
        if (taken[kv.first] == false && sid_out == MXGraphAllocator::kBadStorageID && sid_in >= 0 &&
            ((storage_ref_count[sid_in] == 1 && !ignore_all_inputs) || real_identity) &&
            entry_ref_count[eid_out] > 0 && shape_vec[eid_out].Size() == shape_vec[eid_in].Size() &&
            (dtype_vec[eid_out] == dtype_vec[eid_in] ||
             MXGetDTypeSize(dtype_vec[eid_out]) == MXGetDTypeSize(dtype_vec[eid_in]))) {
          // inplace optimization
          taken[kv.first] = true;
          storage[eid_out] = sid_in;
          // Reuse storage for output and add ref count of output
          // to storage. This will get subtracted later in free
          // input section.
          storage_ref_count[sid_in] += entry_ref_count[eid_out];
          storage_inplace_index[eid_out] = kv.first;
        }
      }
    }
    // normal allocation
    const int dev_id = (device_vec != nullptr) ? device_vec->at(nid) : 0;
    // sort output nodes based on size before allocating output
    std::multimap<size_t, uint32_t> eids;
    for (uint32_t index = 0; index < inode.source->num_outputs(); ++index) {
      uint32_t eid = idx.entry_id(nid, index);
      // only request memory for kBadStorageID
      if (storage[eid] == MXGraphAllocator::kBadStorageID) {
        auto& eshape = shape_vec[eid];
        size_t esize = ndim_is_known(shape_vec[eid]) ? eshape.Size() : 0;
        eids.insert(std::make_pair(esize, eid));
      }
    }
    // allocate the largest outputs first (reverse iteration over the multimap)
    for (auto rit = eids.rbegin(); rit != eids.rend(); ++rit) {
      uint32_t eid = rit->second;
      auto sid = allocator->Request(dev_id, dtype_vec[eid], shape_vec[eid], nid);
      if (sid >= 0) {
        storage_ref_count[sid] = entry_ref_count[eid];
      }
      storage[eid] = sid;
    }
    // check if certain inputs are ignored.
    std::vector<uint32_t> ignore_inputs;
    if (fignore_inputs.count(inode.source->op()) != 0) {
      ignore_inputs = fignore_inputs[inode.source->op()](inode.source->attrs);
      // sorted so the loop below can binary-search
      std::sort(ignore_inputs.begin(), ignore_inputs.end());
    }
    // then free inputs
    for (size_t i = 0; i < inode.inputs.size(); ++i) {
      // ref counter of ignored input is already decreased.
      if (std::binary_search(ignore_inputs.begin(), ignore_inputs.end(), i))
        continue;
      const auto& e = inode.inputs[i];
      uint32_t eid = idx.entry_id(e);
      auto sid = storage[eid];
      // negative ids (bad/external/dynamic) are not managed by the allocator
      if (sid < 0)
        continue;
      // if we decrease it to zero, means we are ready to release
      --storage_ref_count[sid];
      if (storage_ref_count[sid] == 0) {
        allocator->Release(sid, nid);
      }
    }
    // check if there are outputs that can be freed immediately:
    // these outputs are not referenced by any operator.
    for (uint32_t index = 0; index < inode.source->num_outputs(); ++index) {
      uint32_t eid = idx.entry_id(nid, index);
      auto sid = storage[eid];
      if (sid >= 0 && storage_ref_count[sid] == 0) {
        allocator->Release(sid, nid);
        // use -2 to indicate that the output is never referenced.
        storage_inplace_index[eid] = -2;
      }
      if (storage[eid] == MXGraphAllocator::kBadStorageID) {
        ++num_not_allocated;
      }
    }
  }
  return num_not_allocated;
}
| |
/*!
 * \brief Pass body: plan storage assignment for every node entry.
 *
 * Optionally searches over several allocator match-range parameters and
 * keeps the plan with the smallest total allocation, storing the result
 * in the "storage_id" / "storage_inplace_index" graph attributes.
 */
Graph MXPlanMemory(Graph ret) {
  // setup ref counter
  const IndexedGraph& idx = ret.indexed_graph();
  static auto& fignore_inputs = Op::GetAttr<FIgnoreInputs>("FIgnoreInputs");
  // restrict planning to a sub-range of nodes when "node_range" is supplied
  std::pair<uint32_t, uint32_t> node_range = {0, idx.num_nodes()};
  if (ret.attrs.count("node_range")) {
    node_range = ret.MoveCopyAttr<std::pair<uint32_t, uint32_t> >("node_range");
  }
  // reference counter of each node entry
  std::vector<uint32_t> ref_count;
  // step 1: initialize reference count
  if (ret.attrs.count("ref_count") != 0) {
    // a precomputed count was attached to the graph; take it over
    ref_count = ret.MoveCopyAttr<std::vector<uint32_t> >("ref_count");
  } else {
    ref_count.resize(idx.num_node_entries(), 0);
    for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
      const auto& inode = idx[nid];
      if (inode.source->is_variable())
        continue;
      for (const auto& e : inode.inputs) {
        ++ref_count[idx.entry_id(e)];
      }
      // no dataflow dependency is needed for inputs that are ignored;
      // revoke the dependency counter.
      if (fignore_inputs.count(inode.source->op()) != 0) {
        auto ignore_inputs = fignore_inputs[inode.source->op()](inode.source->attrs);
        for (uint32_t i : ignore_inputs) {
          --ref_count[idx.entry_id(inode.inputs[i])];
        }
      }
    }
    // graph outputs stay referenced until the end of execution
    for (const auto& e : idx.outputs()) {
      ++ref_count[idx.entry_id(e)];
    }
  }
  // step 2: allocate memory.
  StorageVector storage;
  if (ret.attrs.count("storage") != 0) {
    storage = ret.MoveCopyAttr<StorageVector>("storage");
  } else {
    // -1 (kBadStorageID) marks entries that still need a storage id
    storage.resize(idx.num_node_entries(), -1);
  }

  // Search the best NNVM_EXEC_MATCH_RANGE parameter. This is turned off by default
  size_t min_allocated_bytes = -1;  // wraps to SIZE_MAX: any plan beats it
  size_t max_match_range = dmlc::GetEnv("NNVM_EXEC_MATCH_RANGE", 16);
  size_t min_match_range =
      dmlc::GetEnv("MXNET_MEMORY_OPT", 0) || dmlc::GetEnv("NNVM_AUTO_SEARCH_MATCH_RANGE", false) ?
      1 :
      max_match_range;
  for (size_t match_range = min_match_range; match_range <= max_match_range; match_range *= 2) {
    // Make a copy of related fields so each candidate plan starts fresh
    StorageVector storage_vec(storage);
    std::vector<int> storage_inplace_index(idx.num_node_entries(), -1);

    // the allocator
    MXGraphAllocator allocator(&idx, match_range);

    // number of entries that are not statically allocated.
    size_t storage_num_not_allocated = MXAllocMemory(
        ret, idx, node_range, &storage_vec, &storage_inplace_index, ref_count, &allocator);
    size_t storage_allocated_bytes = allocator.TotalAllocBytes();

    // Choose the plan which leads to minimal memory usage
    if (min_allocated_bytes > storage_allocated_bytes) {
      ret.attrs["storage_id"] = std::make_shared<any>(std::move(storage_vec));
      ret.attrs["storage_inplace_index"] = std::make_shared<any>(std::move(storage_inplace_index));
      ret.attrs["storage_allocated_bytes"] = std::make_shared<any>(storage_allocated_bytes);
      ret.attrs["storage_num_not_allocated"] = std::make_shared<any>(storage_num_not_allocated);
      min_allocated_bytes = storage_allocated_bytes;
    }

    // match_range == 0 would never grow via *= 2; break to avoid an infinite loop
    if (max_match_range == 0) {
      break;
    }
  }
  return ret;
}
| |
// Register MXPlanMemory as a graph pass.  It consumes the "dtype" and
// "shape" attributes, leaves the graph structure unchanged, and annotates
// the graph with "storage_id" and "storage_inplace_index".
NNVM_REGISTER_PASS(MXPlanMemory)
    .describe("Plan the memory allocation of each node entries.")
    .set_body(MXPlanMemory)
    .set_change_graph(false)
    .depend_graph_attr("dtype")
    .depend_graph_attr("shape")
    .provide_graph_attr("storage_id")
    .provide_graph_attr("storage_inplace_index");
| |
| } // namespace |
| } // namespace pass |
| } // namespace nnvm |