/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* Copyright (c) 2016 by Contributors
* \file plan_memory.cc
* \brief Assign memory tag to each of the data entries.
*/
#include <nnvm/graph.h>
#include <nnvm/pass.h>
#include <nnvm/graph_attr_types.h>
#include <nnvm/op_attr_types.h>
#include <nnvm/top/tensor.h>
#include <mxnet/base.h>
#include <memory>
#include "graph_algorithm.h"
namespace nnvm {
namespace pass {
namespace {
using namespace nnvm::top;
// Return the size in bytes of the given dtype flag.
static int GetDTypeSize(int type_flag) {
switch (type_flag) {
case kUint8:
case kInt8:
return 1;
case kFloat16:
case kInt16:
case kUint16:
return 2;
case kFloat32:
case kInt32:
case kUint32:
return 4;
case kFloat64:
case kInt64:
case kUint64:
return 8;
default:
LOG(FATAL) << "unknown type_flag=" << type_flag;
return -1;
}
}
// Simple graph-based allocator that reuses freed storage via best-fit matching.
class GraphAllocator {
public:
// a storage id is simply an integer.
using StorageID = int;
// bad storage id
static const StorageID kBadStorageID = -1;
// external storage id
static const StorageID kExternalStorageID = -2;
// dynamic storage id
static const StorageID kDynamicStorageID = -3;
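// Allocation strategy note: Request() below performs a best-fit search over
// the free list, considering only blocks whose size falls within
// [size / match_range_, size * match_range_). It first scans blocks at least
// as large as the request, then smaller ones, growing a smaller block's
// recorded max_bytes to cover the new request. Candidates must also live on
// the same device and, when coloring is enabled, share the node's color.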
// request a free storage
StorageID Request(int dev_id, int dtype, mxnet::TShape shape, uint32_t node_id) {
if (shape.ndim() == 0) return kBadStorageID;
// search memory block in [size / match_range_, size * match_range_)
// TODO(tqchen) add size of the dtype, assume 4 bytes for now
size_t size = shape.Size() * 4;
if (match_range_ == 0) return this->Alloc(dev_id, size);
auto begin = free_.lower_bound(size / match_range_);
auto mid = free_.lower_bound(size);
auto end = free_.upper_bound(size * match_range_);
// search for memory blocks larger than requested
for (auto it = mid; it != end; ++it) {
StorageEntry *e = it->second;
if (e->device_id != dev_id) continue;
if (node_color_.size() != 0 &&
node_color_[e->released_by_node] != node_color_[node_id]) continue;
// Use exact matching strategy
e->max_bytes = std::max(size, e->max_bytes);
// found a match; erase it from the free list and return its id
free_.erase(it);
return e->id;
}
// then search for memory blocks smaller than requested space
for (auto it = mid; it != begin;) {
--it;
StorageEntry *e = it->second;
if (e->device_id != dev_id) continue;
if (node_color_.size() != 0 &&
node_color_[e->released_by_node] != node_color_[node_id]) continue;
// Use exact matching strategy
e->max_bytes = std::max(size, e->max_bytes);
// erase from map and return
free_.erase(it);
return e->id;
}
// cannot find anything; allocate a new block.
return this->Alloc(dev_id, size);
}
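// Note: Release() below re-inserts a block keyed by its current max_bytes,
// so a block that grew while serving a larger request stays discoverable in
// the free list at its grown size.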
// release a memory space.
void Release(StorageID id, uint32_t node_id) {
CHECK_NE(id, kBadStorageID);
if (id == kExternalStorageID || id == kDynamicStorageID) return;
StorageEntry *e = data_[id].get();
e->released_by_node = node_id;
free_.insert({e->max_bytes, e});
}
// total number of bytes allocated
size_t TotalAllocBytes() const {
size_t total = 0;
for (auto &p : data_) {
total += p->max_bytes;
}
return total;
}
// constructor
explicit GraphAllocator(const IndexedGraph* idx, const size_t match_range) : idx_(idx) {
this->Init(match_range, dmlc::GetEnv("NNVM_EXEC_NUM_TEMP", 1));
}
private:
// initialize the graph allocator
void Init(const size_t match_range, const uint32_t num_match_color) {
match_range_ = match_range;
num_match_color_ = num_match_color;
if (num_match_color_ > 1) {
std::vector<uint32_t> importance(idx_->num_nodes(), 0);
for (uint32_t nid = 0; nid < idx_->num_nodes(); ++nid) {
if ((*idx_)[nid].source->is_variable()) continue;
importance[nid] = 1;
}
num_match_color_ = pass::ColorNodeGroup(
*idx_, importance, num_match_color_, &node_color_);
}
}
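// Node coloring (enabled when NNVM_EXEC_NUM_TEMP > 1) partitions the graph
// into groups; Request() only reuses a block released by a node of the same
// color. This trades extra memory for the ability to execute different
// groups in parallel without aliasing each other's temporary storage.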
StorageID Alloc(int dev_id, size_t size) {
StorageID id = static_cast<StorageID>(data_.size());
std::unique_ptr<StorageEntry> ptr(new StorageEntry());
ptr->id = id;
ptr->device_id = dev_id;
ptr->max_bytes = size;
data_.emplace_back(std::move(ptr));
return id;
}
// internal storage entry
struct StorageEntry {
// the id of the entry.
StorageID id;
// the device id of the storage.
int device_id;
// maximum size of storage requested.
size_t max_bytes{0};
// node index that released it last time
uint32_t released_by_node{0};
};
// scale used for rough match
size_t match_range_;
// number of match colors; values > 1 enable the color-based match algorithm
uint32_t num_match_color_{1};
// the size of each dtype
std::vector<size_t> dtype_size_dict_;
// free list of storage entry
std::multimap<size_t, StorageEntry*> free_;
// all the storage resources available
std::vector<std::unique_ptr<StorageEntry> > data_;
// color of nodes in the graph, used for auxiliary policy making.
std::vector<uint32_t> node_color_;
// internal indexed graph
const IndexedGraph* idx_;
};
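// A minimal usage sketch (illustrative only; dev_id, dtype, shape and nid
// are assumed to come from the surrounding pass):
//   GraphAllocator alloc(&idx, /*match_range=*/16);
//   GraphAllocator::StorageID sid = alloc.Request(dev_id, dtype, shape, nid);
//   if (sid >= 0) {
//     // ... run the last consumer of the entry ...
//     alloc.Release(sid, nid);
//   }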
/*
 * Internal method to perform the memory allocation for a graph.
 */
size_t AllocMemory(const Graph& ret, const IndexedGraph& idx,
const std::pair<uint32_t, uint32_t>& node_range,
StorageVector* storage_ptr,
std::vector<int>* storage_inplace_index_ptr,
const std::vector<uint32_t>& entry_ref_count,
GraphAllocator* allocator) {
static auto& finplace_option = Op::GetAttr<FInplaceOption>("FInplaceOption");
static auto& finplace_identity = Op::GetAttr<FInplaceIdentity>("FInplaceIdentity");
static auto& fignore_inputs = Op::GetAttr<FIgnoreInputs>("FIgnoreInputs");
// Get references to the output vectors
auto &storage = *storage_ptr;
auto &storage_inplace_index = *storage_inplace_index_ptr;
// Get attributes from the graph
const mxnet::ShapeVector& shape_vec = ret.GetAttr<mxnet::ShapeVector>("shape");
const DTypeVector& dtype_vec = ret.GetAttr<DTypeVector>("dtype");
const DeviceVector* device_vec = nullptr;
if (ret.attrs.count("device") != 0) {
device_vec = &(ret.GetAttr<DeviceVector>("device"));
}
size_t num_not_allocated = 0;
std::vector<GraphAllocator::StorageID> storage_ref_count(idx.num_node_entries(), 0);
for (uint32_t nid = node_range.first; nid < node_range.second; ++nid) {
const auto& inode = idx[nid];
if (inode.source->is_variable()) continue;
// check inplace option
if (finplace_option.count(inode.source->op()) != 0) {
auto inplace_pairs = finplace_option[inode.source->op()](inode.source->attrs);
std::vector<bool> identity;
if (finplace_identity.count(inode.source->op()) != 0) {
identity = finplace_identity[inode.source->op()](inode.source->attrs);
CHECK_EQ(identity.size(), inplace_pairs.size())
<< "FInplaceOption and FInplaceIdentity returned vectors of different "
<< "size for operator " << inode.source->op()->name;
} else {
identity = std::vector<bool>(inplace_pairs.size(), false);
}
std::vector<bool> taken(inode.inputs.size(), false);
for (size_t ipair = 0; ipair < inplace_pairs.size(); ++ipair) {
const auto& kv = inplace_pairs[ipair];
uint32_t eid_out = idx.entry_id(nid, kv.second);
uint32_t eid_in = idx.entry_id(inode.inputs[kv.first]);
auto sid_out = storage[eid_out];
auto sid_in = storage[eid_in];
bool ignore_all_inputs = (fignore_inputs.count(inode.source->op()) != 0 &&
fignore_inputs[inode.source->op()](
inode.source->attrs).size() == inode.source->num_inputs());
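// In-place reuse is only legal when all of the following hold: this input
// slot is not already taken by another inplace pair, the output has no
// storage assigned yet, the input has real (non-external, non-dynamic)
// storage, this node holds the last live reference to that storage (or the
// op is an identity), the output is actually consumed downstream, and both
// entries occupy the same number of bytes.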
if (taken[kv.first] == false &&
sid_out == GraphAllocator::kBadStorageID &&
sid_in >= 0 &&
((storage_ref_count[sid_in] == 1 && !ignore_all_inputs) || identity[ipair]) &&
entry_ref_count[eid_out] > 0 &&
shape_vec[eid_out].Size() == shape_vec[eid_in].Size() &&
(dtype_vec[eid_out] == dtype_vec[eid_in] ||
GetDTypeSize(dtype_vec[eid_out]) == GetDTypeSize(dtype_vec[eid_in]))) {
// inplace optimization
taken[kv.first] = true;
storage[eid_out] = sid_in;
// Reuse storage for the output and add the output's ref count
// to the storage. This will be subtracted later in the
// free-inputs section.
storage_ref_count[sid_in] += entry_ref_count[eid_out];
storage_inplace_index[eid_out] = kv.first;
}
}
}
// normal allocation
const int dev_id = (device_vec != nullptr) ? device_vec->at(nid) : 0;
// sort output entries based on size before allocating output memory
std::multimap<size_t, uint32_t> eids;
for (uint32_t index = 0; index < inode.source->num_outputs(); ++index) {
uint32_t eid = idx.entry_id(nid, index);
// only request memory for entries still marked kBadStorageID
if (storage[eid] == GraphAllocator::kBadStorageID) {
auto &eshape = shape_vec[eid];
size_t esize = 0;
if (eshape.ndim() != 0) esize = eshape.Size();
eids.insert(std::make_pair(esize, eid));
}
}
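// Iterate in descending size order so the largest outputs are matched
// against the free list before smaller ones.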
for (auto rit = eids.rbegin(); rit != eids.rend(); ++rit) {
uint32_t eid = rit->second;
auto sid = allocator->Request(dev_id, dtype_vec[eid], shape_vec[eid], nid);
if (sid >= 0) {
storage_ref_count[sid] = entry_ref_count[eid];
}
storage[eid] = sid;
}
// check if certain inputs are ignored.
std::vector<uint32_t> ignore_inputs;
if (fignore_inputs.count(inode.source->op()) != 0) {
ignore_inputs = fignore_inputs[inode.source->op()](inode.source->attrs);
std::sort(ignore_inputs.begin(), ignore_inputs.end());
}
// then free inputs
for (size_t i = 0; i < inode.inputs.size(); ++i) {
// ref counter of an ignored input was already decreased.
if (std::binary_search(ignore_inputs.begin(), ignore_inputs.end(), i)) continue;
const auto& e = inode.inputs[i];
uint32_t eid = idx.entry_id(e);
auto sid = storage[eid];
// sid < 0 means unallocated, external, or dynamic storage; nothing to release
if (sid < 0) continue;
// if we decrease it to zero, we are ready to release it
--storage_ref_count[sid];
if (storage_ref_count[sid] == 0) {
allocator->Release(sid, nid);
}
}
// check if there are outputs that can be freed immediately;
// these outputs are not referenced by any operator.
for (uint32_t index = 0; index < inode.source->num_outputs(); ++index) {
uint32_t eid = idx.entry_id(nid, index);
auto sid = storage[eid];
if (sid >= 0 && storage_ref_count[sid] == 0) {
allocator->Release(sid, nid);
// use -2 to indicate that the entry is never referenced.
storage_inplace_index[eid] = -2;
}
if (storage[eid] == GraphAllocator::kBadStorageID) {
++num_not_allocated;
}
}
}
return num_not_allocated;
}
// function to plan memory
Graph PlanMemory(Graph ret) {
// setup ref counter
const IndexedGraph& idx = ret.indexed_graph();
static auto& fignore_inputs = Op::GetAttr<FIgnoreInputs>("FIgnoreInputs");
std::pair<uint32_t, uint32_t> node_range = {0, idx.num_nodes()};
if (ret.attrs.count("node_range")) {
node_range = ret.MoveCopyAttr<std::pair<uint32_t, uint32_t> >("node_range");
}
// reference counter of each node
std::vector<uint32_t> ref_count;
// step 1: initialize reference count
if (ret.attrs.count("ref_count") != 0) {
ref_count = ret.MoveCopyAttr<std::vector<uint32_t> >("ref_count");
} else {
ref_count.resize(idx.num_node_entries(), 0);
for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
const auto& inode = idx[nid];
if (inode.source->is_variable()) continue;
for (const auto& e : inode.inputs) {
++ref_count[idx.entry_id(e)];
}
// no dataflow dependency is needed for inputs that are ignored;
// revoke the dependency counter.
if (fignore_inputs.count(inode.source->op()) != 0) {
auto ignore_inputs = fignore_inputs[inode.source->op()](inode.source->attrs);
for (uint32_t i : ignore_inputs) {
--ref_count[idx.entry_id(inode.inputs[i])];
}
}
}
for (const auto& e : idx.outputs()) {
++ref_count[idx.entry_id(e)];
}
}
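// At this point ref_count[eid] holds, for each entry, the number of operator
// reads plus one for each appearance as a graph output; entries with a zero
// count are never consumed and can be released as soon as they are produced.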
// step 2: allocate memory.
StorageVector storage;
if (ret.attrs.count("storage") != 0) {
storage = ret.MoveCopyAttr<StorageVector>("storage");
} else {
storage.resize(idx.num_node_entries(), -1);
}
// Search for the best NNVM_EXEC_MATCH_RANGE parameter; the search is turned off by default.
size_t min_allocated_bytes = -1;  // wraps to SIZE_MAX, so any plan beats it
size_t max_match_range = dmlc::GetEnv("NNVM_EXEC_MATCH_RANGE", 16);
size_t min_match_range =
dmlc::GetEnv("NNVM_AUTO_SEARCH_MATCH_RANGE", false) ? 1 : max_match_range;
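// When NNVM_AUTO_SEARCH_MATCH_RANGE is set, try match_range = 1, 2, 4, ...
// up to NNVM_EXEC_MATCH_RANGE and keep whichever plan allocates the fewest
// total bytes; otherwise a single pass runs with the configured match range.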
for (size_t match_range = min_match_range; match_range <= max_match_range; match_range *= 2) {
// Make a copy of related fields
StorageVector storage_vec(storage);
std::vector<int> storage_inplace_index(idx.num_node_entries(), -1);
// the allocator
GraphAllocator allocator(&idx, match_range);
// number of entries that are not statically allocated.
size_t storage_num_not_allocated =
AllocMemory(ret, idx, node_range, &storage_vec, &storage_inplace_index,
ref_count, &allocator);
size_t storage_allocated_bytes = allocator.TotalAllocBytes();
// Choose the plan which leads to minimal memory usage
if (min_allocated_bytes > storage_allocated_bytes) {
ret.attrs["storage_id"] = std::make_shared<any>(std::move(storage_vec));
ret.attrs["storage_inplace_index"] = std::make_shared<any>(std::move(storage_inplace_index));
ret.attrs["storage_allocated_bytes"] = std::make_shared<any>(storage_allocated_bytes);
ret.attrs["storage_num_not_allocated"] = std::make_shared<any>(storage_num_not_allocated);
min_allocated_bytes = storage_allocated_bytes;
}
// match_range 0 would never grow (0 *= 2 == 0); run only one iteration
if (max_match_range == 0) {
break;
}
}
return ret;
}
NNVM_REGISTER_PASS(MXPlanMemory)
.describe("Plan the memory allocation of each node entry.")
.set_body(PlanMemory)
.set_change_graph(false)
.depend_graph_attr("dtype")
.depend_graph_attr("shape")
.provide_graph_attr("storage_id")
.provide_graph_attr("storage_inplace_index");
} // namespace
} // namespace pass
} // namespace nnvm