| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| /*! |
| * \file ndarray.cc |
| * \brief ndarray module of mxnet |
| */ |
| #include <dmlc/io.h> |
| #include <dmlc/logging.h> |
| #include <dmlc/memory_io.h> |
| #include <dmlc/registry.h> |
| #include <mshadow/tensor.h> |
| #include <mxnet/base.h> |
| #include <mxnet/imperative.h> |
| #include <mxnet/ndarray.h> |
| #include <mxnet/resource.h> |
| |
| #include "../common/utils.h" |
| #include "../operator/nn/dnnl/dnnl_base-inl.h" |
| #include "../operator/tensor/init_op.h" |
| #include "../operator/tensor/matrix_op-inl.h" |
| #include "../profiler/storage_profiler.h" |
| #include "./ndarray_function.h" |
| #if MXNET_USE_ONEDNN == 1 |
| #include <dnnl.hpp> |
| #endif |
| #if MXNET_USE_OPENCV |
| #include <opencv2/opencv.hpp> |
| #endif // MXNET_USE_OPENCV |
| |
| namespace dmlc { |
| DMLC_REGISTRY_ENABLE(::mxnet::NDArrayFunctionReg); |
| } // namespace dmlc |
| |
| namespace mxnet { |
| |
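| // Re-initialize the array with a new storage type, shape, context and dtype. |
| // For sparse storage types, missing aux types/shapes are filled with defaults |
| // and the storage shape is derived from the aux shapes when not provided. |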
| void NDArray::ReInit(const NDArrayStorageType stype, |
| const mxnet::TShape& shape, |
| Context ctx, |
| int dtype, |
| bool delay_alloc, |
| const std::vector<int>* pAux_types, |
| const mxnet::ShapeVector* pAux_shapes, |
| const mxnet::TShape* pStorage_shapes) { |
| Init(stype, shape, dtype); |
| if (stype != kDefaultStorage) { |
| const auto sparseStorage = stype == kRowSparseStorage; |
| if (!sparseStorage && stype != kCSRStorage) |
| LOG(FATAL) << "Unknown storage type " << stype; |
| |
| const auto& aux_types = (pAux_types && pAux_types->size()) ? |
| *pAux_types : |
| std::vector<int>(sparseStorage ? 1 : 2, mshadow::kInt64); |
| |
| const auto& aux_shapes = (pAux_shapes && pAux_shapes->size()) ? |
| *pAux_shapes : |
| ShapeVector(sparseStorage ? 1 : 2, TShape(mshadow::Shape1(0))); |
| |
| mxnet::TShape storage_shape; |
| if (!pStorage_shapes || !pStorage_shapes->Size()) { |
| if (sparseStorage) { |
| storage_shape = shape; |
| storage_shape[0] = aux_shapes[rowsparse::kIdx][0]; |
| } else { |
| storage_shape = aux_shapes[csr::kIdx]; |
| } |
| } else { |
| storage_shape = *pStorage_shapes; |
| } |
| ptr_ = std::make_shared<Chunk>( |
| stype, storage_shape, ctx, delay_alloc, dtype, aux_types, aux_shapes); |
| } else { |
| ptr_ = std::make_shared<Chunk>(shape, ctx, delay_alloc, dtype); |
| } |
| } |
| |
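| // Attach the profiler scope and name to the main storage handle and to every |
| // aux handle so that memory usage can be attributed in the storage profiler. |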
| void NDArray::AssignStorageInfo(const std::string& profiler_scope, const std::string& name) { |
| if (is_none()) { |
| return; |
| } |
| ptr_->shandle.profiler_scope = profiler_scope; |
| ptr_->shandle.name = name; |
| #if MXNET_USE_CUDA |
| profiler::GpuDeviceStorageProfiler::Get()->UpdateStorageInfo(ptr_->shandle); |
| #endif // MXNET_USE_CUDA |
| for (Storage::Handle& aux_handle : ptr_->aux_handles) { |
| aux_handle.profiler_scope = profiler_scope; |
| aux_handle.name = name + "_aux_data"; |
| #if MXNET_USE_CUDA |
| profiler::GpuDeviceStorageProfiler::Get()->UpdateStorageInfo(aux_handle); |
| #endif // MXNET_USE_CUDA |
| } |
| } |
| |
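| // Sync shape_ with the chunk's storage shape, unless the chunk still holds |
| // the zero-size placeholder shape and NumPy-shape semantics are disabled. |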
| void NDArray::SetShapeFromChunk() const { |
| if (Imperative::Get()->is_np_shape() || |
| !(ptr_->storage_shape.ndim() == 1 && ptr_->storage_shape[0] == 0)) { |
| shape_ = ptr_->storage_shape; |
| } |
| } |
| |
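| // Bundle of storage handles (and the oneDNN memory, if any) captured by the |
| // engine's deletion callback so they can be freed after the variable. |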
| struct ChunkMem { |
| Storage::Handle h; |
| std::vector<Storage::Handle> aux_h; |
| #if MXNET_USE_ONEDNN == 1 |
| std::shared_ptr<DNNLMemory> mem; |
| #endif |
| }; |
| |
| NDArray::Chunk::~Chunk() { |
| bool skip_free = static_data || delay_alloc; |
| ChunkMem mem; |
| mem.h = this->shandle; |
| mem.aux_h = this->aux_handles; |
| #if MXNET_USE_ONEDNN == 1 |
| // We want to delete dnnl memory after deleting the variable. |
| mem.mem = this->dnnl_mem_; |
| #endif |
| if (auto engine = engine_ref_.lock()) { |
| engine->DeleteVariable( |
| [mem, skip_free, var = this->var](RunContext s) mutable { |
| #if MXNET_USE_CUDA |
| auto& sync_obj = var->sync_object; |
| Storage::SyncObj storage_sync_obj; |
| { |
| std::lock_guard<std::mutex> l(sync_obj.mutex); |
| for (auto& ev : sync_obj.reader_events) { |
| storage_sync_obj.events.push_back(ev.event); |
| } |
| if (!sync_obj.writer_event.empty()) { |
| auto ev = sync_obj.writer_event[0]; |
| storage_sync_obj.events.push_back(ev.event); |
| } |
| } |
| mem.h.sync_obj = storage_sync_obj; |
| #endif |
| if (skip_free == false) { |
| #if MXNET_USE_ONEDNN == 1 |
| if (mem.mem) { |
| CHECK_LE(mem.mem->GetSize(), mem.h.size); |
| CHECK_EQ(mem.mem->GetDataHandle(), mem.h.dptr); |
| } |
| #endif |
| Storage::Get()->Free(mem.h); |
| for (const auto& aux : mem.aux_h) { |
| Storage::Get()->Free(aux); |
| } |
| } |
| }, |
| shandle.ctx, |
| var); |
| } |
| } |
| |
| void NDArray::Chunk::CheckAndAllocData(const mxnet::TShape& shape, int dtype) { |
| CHECK_NE(aux_shapes.size(), 0) << "data is expected to be allocated after aux_data"; |
| auto dbytes = shape.Size() * mshadow::mshadow_sizeof(dtype); |
| if (!features::is_enabled(features::INT64_TENSOR_SIZE)) { |
| CHECK_LT(shape.Size(), (int64_t{1} << 31) - 1) |
| << "[CheckAndAllocData] Size of tensor you are trying to allocate is larger than " |
| "2^31 elements. Please build with flag USE_INT64_TENSOR_SIZE=1"; |
| } |
| if (shandle.size < dbytes) { |
| // free storage |
| Storage::Get()->Free(shandle); |
| // init storage |
| shandle.size = dbytes; |
| Storage::Get()->Alloc(&shandle); |
| #if MXNET_USE_ONEDNN == 1 |
| dnnl_mem_ = nullptr; |
| #endif |
| } |
| // init shape |
| storage_shape = shape; |
| // the data storage handle is now present, so clear delay_alloc |
| delay_alloc = false; |
| } |
| |
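| // Return the gradient buffer recorded by autograd for this array, or an |
| // empty NDArray if no gradient has been attached. |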
| NDArray NDArray::grad() const { |
| if (Imperative::AGInfo::IsNone(*this)) |
| return NDArray(); |
| Imperative::AGInfo& info = Imperative::AGInfo::Get(autograd_entry_.node); |
| if (info.out_grads.size()) { |
| CHECK_EQ(info.out_grads.size(), 1); |
| return info.out_grads[0]; |
| } |
| return NDArray(); |
| } |
| |
| nnvm::Symbol NDArray::get_autograd_symbol() const { |
| CHECK(!Imperative::AGInfo::IsNone(*this)) |
| << "NDArray is not part of a computation graph. Did you forget to turn on recording?"; |
| nnvm::Symbol ret; |
| ret.outputs.emplace_back(autograd_entry_); |
| return ret; |
| } |
| |
| #if MXNET_USE_ONEDNN == 1 |
| |
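| // Construct a CPU NDArray directly from a oneDNN memory descriptor: shape and |
| // dtype come from the descriptor and the newly allocated buffer is wrapped in |
| // a DNNLMemory with that layout. |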
| NDArray::NDArray(const void* md_desc) : storage_type_(kDefaultStorage), autograd_entry_(nullptr) { |
| dnnl::memory::desc md = *static_cast<const dnnl::memory::desc*>(md_desc); |
| shape_ = mxnet::TShape(md.data.dims, md.data.dims + md.data.ndims); |
| dtype_ = get_mxnet_type(md.data.data_type); |
| ptr_ = std::make_shared<Chunk>(shape_, Context::CPU(), true, dtype_); |
| ptr_->CheckAndAlloc(md.get_size()); |
| ptr_->dnnl_mem_ = std::make_shared<DNNLMemory>(md, ptr_->shandle.dptr); |
| } |
| |
| NDArray::NDArray(const std::shared_ptr<dnnl::memory>& dnnl_mem) |
| : storage_type_(kDefaultStorage), autograd_entry_(nullptr) { |
| auto mem_desc = dnnl_mem->get_desc(); |
| shape_ = mxnet::TShape(mem_desc.data.dims, mem_desc.data.dims + mem_desc.data.ndims); |
| dtype_ = get_mxnet_type(mem_desc.data.data_type); |
| ptr_ = std::make_shared<Chunk>(shape_, Context::CPU(), true, dtype_); |
| ptr_->shandle.dptr = dnnl_mem->get_data_handle(); |
| ptr_->shandle.size = mem_desc.get_size(); |
| ptr_->delay_alloc = false; |
| ptr_->dnnl_mem_ = std::make_shared<DNNLMemory>(dnnl_mem); |
| ptr_->static_data = true; |
| } |
| |
| NDArray NDArray::DNNLDataReshape(const mxnet::TShape& shape) const { |
| CHECK(!is_none()) << "NDArray is not initialized"; |
| CHECK_GE(shape_.Size(), shape.Size()) |
| << "NDArray.Reshape: target shape size is larger current shape"; |
| CHECK_EQ(storage_type(), kDefaultStorage); |
| if (!IsDNNLData()) { |
| NDArray ret = this->Detach(); |
| ret.shape_ = shape; |
| return ret; |
| } else { |
| NDArray ret(shape, ctx(), true, dtype()); |
| // We shouldn't submit the reorder primitive here because submit will |
| // be called in operators. |
| dnnl_format_tag_t format = ptr_->dnnl_mem_->GetDefaultFormat(); |
| CHECK(ptr_->IsDNNL()); |
| dnnl::memory::desc def_desc = ptr_->dnnl_mem_->GetDesc(format); |
| dnnl::memory* def_mem = TmpMemMgr::Get()->Alloc(def_desc); |
| DNNLStream* stream = DNNLStream::Get(); |
| std::shared_ptr<dnnl::memory> curr_mem = ptr_->dnnl_mem_->GetMem(); |
| stream->RegisterMem(curr_mem); |
| std::unordered_map<int, dnnl::memory> args( |
| {{DNNL_ARG_FROM, *curr_mem}, {DNNL_ARG_TO, *def_mem}}); |
| stream->RegisterPrimArgs(dnnl::reorder(*curr_mem, *def_mem), args); |
| // def_mem points to a memory region in the temp space. It's only valid |
| // inside an operator. As such, the returned NDArray can only be valid |
| // inside an operator and the shared pointer doesn't need to do anything |
| // when it's destroyed. |
| auto tmp = std::shared_ptr<dnnl::memory>(def_mem, [](dnnl::memory* mem) {}); |
| ret.ptr_->dnnl_mem_.reset(new DNNLMemory(tmp)); |
| ret.ptr_->shandle.dptr = def_mem->get_data_handle(); |
| ret.ptr_->shandle.size = def_mem->get_desc().get_size(); |
| ret.ptr_->delay_alloc = false; |
| ret.ptr_->static_data = true; |
| ret.byte_offset_ = byte_offset_; |
| ret.reuse_ = false; |
| return ret; |
| } |
| } |
| #endif |
| |
| NDArray NDArray::Reshape(const mxnet::TShape& shape) const { |
| CHECK(!is_none()) << "NDArray is not initialized"; |
| if (Imperative::Get()->is_np_shape()) { |
| CHECK_EQ(shape_.Size(), shape.Size()) |
| << "NDArray.Reshape: target shape must have the same size as " |
| << "current shape."; |
| } else { |
| CHECK_GE(shape_.Size(), shape.Size()) |
| << "NDArray.Reshape: target shape size is larger than the current shape"; |
| } |
| NDArray ret = this->Detach(); |
| // If the shape doesn't change, we can just return it now. |
| if (ret.shape_ == shape) |
| return ret; |
| // Otherwise, reshape only works on the default layout. |
| CHECK_EQ(storage_type(), kDefaultStorage); |
| ret.shape_ = shape; |
| ret.reuse_ = false; |
| return ret; |
| } |
| |
| NDArray NDArray::ReshapeWithRecord(const mxnet::TShape& shape) { |
| bool is_recording = Imperative::Get()->is_recording(); |
| bool is_deferred_compute = Imperative::Get()->is_deferred_compute(); |
| NDArray ret; |
| if (!is_deferred_compute) { |
| // The new array shares memory with this array, thus make sure this array |
| // has already been computed. (noop if this array is not deferred) |
| Imperative::DCInfo::Compute(*this); |
| ret = this->Reshape(shape); |
| if (!is_recording) { |
| return ret; |
| } |
| } else { |
| if (shape_is_known(this->shape())) { |
| // Imperative reshape only works if shape is already known. |
| ret = this->Reshape(shape); |
| } else { |
| // Reshape called after a dynamic shape operator. |
| ret = this->Detach(); |
| } |
| } |
| |
| if (!is_deferred_compute || shape_is_known(this->shape())) { |
| CHECK_EQ(shape_.Size(), shape.Size()) |
| << "NDArray.Reshape: target shape must have the same size as " |
| << "current shape when recording with autograd " |
| << "or in deferred compute mode."; |
| } |
| |
| nnvm::NodeAttrs attrs; |
| std::ostringstream os; |
| os << shape; |
| if (!Imperative::Get()->is_np_shape()) { |
| attrs.op = nnvm::Op::Get("Reshape"); |
| attrs.dict.insert({"shape", os.str()}); |
| } else { |
| attrs.op = nnvm::Op::Get("_np_reshape"); |
| attrs.dict.insert({"newshape", os.str()}); |
| } |
| attrs.op->attr_parser(&attrs); |
| std::vector<NDArray*> inputs(1, this), outputs(1, &ret); |
| |
| if (is_recording) { |
| Imperative::Get()->RecordOp(std::move(attrs), inputs, outputs); |
| } else if (is_deferred_compute) { |
| Imperative::Get()->RecordDeferredCompute(std::move(attrs), inputs, outputs); |
| } |
| return ret; |
| } |
| |
| NDArray NDArray::Slice(index_t begin, index_t end) const { |
| CHECK(!is_none()) << "NDArray is empty"; |
| CHECK_LE(begin, end) << "Invalid slicing range [" << begin << ", " << end << ")"; |
| CHECK_GE(shape_[0], end) << "Slice end index out of range"; |
| CHECK_EQ(storage_type(), kDefaultStorage); |
| NDArray ret = this->Detach(); |
| size_t length = shape_.ProdShape(1, shape_.ndim()); |
| MSHADOW_TYPE_SWITCH_EXT_WITH_BOOL( |
| ret.dtype(), DType, { ret.byte_offset_ += begin * length * sizeof(DType); }); |
| ret.reuse_ = false; |
| ret.shape_[0] = end - begin; |
| return ret; |
| } |
| |
| NDArray NDArray::SliceWithRecord(index_t begin, index_t end) { |
| bool is_recording = Imperative::Get()->is_recording(); |
| bool is_deferred_compute = Imperative::Get()->is_deferred_compute(); |
| NDArray ret; |
| if (!is_deferred_compute) { |
| // The new array shares memory with this array, thus make sure this array |
| // has already been computed. (noop if this array is not deferred) |
| Imperative::DCInfo::Compute(*this); |
| ret = this->Slice(begin, end); |
| if (!is_recording) { |
| return ret; |
| } |
| } else { |
| if (shape_is_known(this->shape())) { |
| // Imperative slice only works if shape is already known. |
| ret = this->Slice(begin, end); |
| } else { |
| // Slice called after a dynamic shape operator. |
| ret = this->Detach(); |
| } |
| } |
| |
| // fake a slice op |
| nnvm::NodeAttrs attrs; |
| attrs.op = nnvm::Op::Get("slice"); |
| attrs.dict.insert({"begin", std::to_string(begin)}); |
| attrs.dict.insert({"end", std::to_string(end)}); |
| attrs.op->attr_parser(&attrs); |
| std::vector<NDArray*> inputs(1, this), outputs(1, &ret); |
| |
| if (is_recording) { |
| Imperative::Get()->RecordOp(std::move(attrs), inputs, outputs); |
| } else if (is_deferred_compute) { |
| Imperative::Get()->RecordDeferredCompute(std::move(attrs), inputs, outputs); |
| } |
| |
| return ret; |
| } |
| |
| NDArray NDArray::At(index_t idx) const { |
| CHECK(storage_type() == kDefaultStorage) |
| << "Storage type " << storage_type() << " doesn't support At()"; |
| NDArray ret = this->Slice(idx, idx + 1); |
| if (shape_.ndim() > 1) { |
| return ret.Reshape(mxnet::TShape(shape_.data() + 1, shape_.data() + shape_.ndim())); |
| } else { |
| return ret; |
| } |
| } |
| |
| NDArray NDArray::AtWithRecord(index_t idx) { |
| CHECK(storage_type() == kDefaultStorage) |
| << "Storage type " << storage_type() << " doesn't support At()"; |
| NDArray sliced = this->SliceWithRecord(idx, idx + 1); |
| if (shape_.ndim() > 1 || Imperative::Get()->is_np_shape()) { |
| // Imperative reshape with concrete shape |
| NDArray reshaped = |
| sliced.Reshape(mxnet::TShape(shape_.data() + 1, shape_.data() + shape_.ndim())); |
| |
| // Record reshape with magic numbers |
| nnvm::NodeAttrs attrs; |
| std::ostringstream os; |
| if (!Imperative::Get()->is_np_shape()) { |
| os << mxnet::TShape({-3, -2}); // See ndarray.py reshape for definition of magic numbers |
| attrs.op = nnvm::Op::Get("Reshape"); |
| attrs.dict.insert({"shape", os.str()}); |
| } else { |
| // See NumpyXReshapeInferShape for definition of magic numbers |
| os << mxnet::TShape({-3, -4}); |
| attrs.op = nnvm::Op::Get("_npx_reshape"); |
| attrs.dict.insert({"newshape", os.str()}); |
| } |
| attrs.op->attr_parser(&attrs); |
| std::vector<NDArray*> inputs(1, &sliced), outputs(1, &reshaped); |
| |
| bool is_recording = Imperative::Get()->is_recording(); |
| bool is_deferred_compute = Imperative::Get()->is_deferred_compute(); |
| if (is_recording) { |
| Imperative::Get()->RecordOp(std::move(attrs), inputs, outputs); |
| } else if (is_deferred_compute) { |
| Imperative::Get()->RecordDeferredCompute(std::move(attrs), inputs, outputs); |
| } |
| |
| return reshaped; |
| } else { |
| return sliced; |
| } |
| } |
| |
| /*! |
| * \brief Return deep copy of the current ndarray's aux_data(i) |
| * as an NDArray of default storage type. This function blocks. |
| */ |
| NDArray NDArray::aux_ndarray(size_t i) const { |
| CHECK_NE(storage_type(), kDefaultStorage); |
| CHECK(i < ptr_->aux_shapes.size()); |
| // create a delay_alloc default ndarray as output |
| NDArray ret(mxnet::TShape(), ctx(), true, aux_type(i)); |
| ret.SyncCopyFromNDArray(*this, i); |
| return ret; |
| } |
| |
| NDArray NDArray::data_ndarray() const { |
| NDArray ret(mxnet::TShape(), ctx(), true, dtype_); |
| ret.SyncCopyFromNDArray(*this); |
| return ret; |
| } |
| |
| struct NDArrayDLManager { |
| NDArray handle; // ref NDArray |
| DLManagedTensor tensor; |
| }; |
| |
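| // Export the array as a DLManagedTensor. The manager object keeps a copy of |
| // this NDArray alive until the consumer invokes the deleter. |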
| DLManagedTensor* NDArray::ToDLPack() const { |
| CHECK(!is_none()) << "NDArray is not initialized"; |
| NDArrayDLManager* dlmanager(new NDArrayDLManager); |
| dlmanager->handle = *this; |
| dlmanager->tensor.dl_tensor = dlmanager->handle.data().dltensor(); |
| dlmanager->tensor.manager_ctx = dlmanager; |
| dlmanager->tensor.deleter = [](DLManagedTensor* dlmanager) { |
| delete static_cast<NDArrayDLManager*>(dlmanager->manager_ctx); |
| }; |
| return &(dlmanager->tensor); |
| } |
| |
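| // Wrap an external DLPack tensor as an NDArray without copying data; the |
| // deleter forwards to the DLPack deleter and frees the transient copy if any. |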
| NDArray NDArray::FromDLPack(const DLManagedTensor* tensor, bool transient_handle) { |
| DLManagedTensor* tensor_copy = |
| transient_handle ? new DLManagedTensor(*tensor) : const_cast<DLManagedTensor*>(tensor); |
| auto deleter = [tensor_copy, transient_handle]() { |
| if (tensor_copy->deleter != nullptr) { |
| tensor_copy->deleter(tensor_copy); |
| } |
| if (transient_handle) { |
| delete tensor_copy; |
| } |
| }; |
| return NDArray(TBlob(tensor_copy->dl_tensor), tensor_copy->dl_tensor.ctx.device_id, deleter); |
| } |
| |
| bool NDArray::fresh_out_grad() const { |
| if (Imperative::AGInfo::IsNone(*this)) |
| return false; |
| Imperative::AGInfo& info = Imperative::AGInfo::Get(autograd_entry_.node); |
| return info.fresh_out_grad; |
| } |
| |
| void NDArray::set_fresh_out_grad(bool state) const { |
| CHECK(!Imperative::AGInfo::IsNone(*this)) |
| << "NDArray has not been marked as a variable and does not have gradient state"; |
| Imperative::AGInfo& info = Imperative::AGInfo::Get(autograd_entry_.node); |
| info.fresh_out_grad = state; |
| } |
| |
| #if MXNET_USE_ONEDNN == 1 |
| |
| bool NDArray::Chunk::IsDNNL() const { |
| if (storage_type != kDefaultStorage) |
| return false; |
| if (dnnl_mem_ == nullptr) |
| return false; |
| return dnnl_mem_->IsDNNL(); |
| } |
| |
| bool NDArray::Chunk::IsDefault() const { |
| if (storage_type != kDefaultStorage) |
| return false; |
| // If we don't have dnnl memory yet, we just assume the data is stored |
| // in the default format. |
| if (dnnl_mem_ == nullptr) |
| return true; |
| return !dnnl_mem_->IsDNNL(); |
| } |
| |
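| // Reorder the chunk's data from a oneDNN-specific layout back to the default |
| // (plain) layout in place, then drop the oneDNN memory wrapper. |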
| void NDArray::Chunk::Reorder2Default() { |
| if (dnnl_mem_ == nullptr) |
| return; |
| |
| if (IsDefault()) |
| return; |
| |
| dnnl_format_tag_t format = dnnl_mem_->GetDefaultFormat(); |
| dnnl::memory::desc def_desc = dnnl_mem_->GetDesc(format); |
| |
| CHECK(shandle.size >= def_desc.get_size()); |
| CheckAndAlloc(def_desc.get_size()); |
| |
| // oneDNN reorder can't be performed in-place |
| if (shandle.dptr == dnnl_mem_->GetDataHandle()) { |
| dnnl_mem_ptr def_mem(new dnnl::memory(def_desc, CpuEngine::Get()->get_engine())); |
| dnnl_mem_->ReorderTo(def_mem.get()); |
| memcpy(shandle.dptr, def_mem->get_data_handle(), def_desc.get_size()); |
| } else { |
| dnnl_mem_ptr def_mem(new dnnl::memory(def_desc, CpuEngine::Get()->get_engine(), shandle.dptr)); |
| dnnl_mem_->ReorderTo(def_mem.get()); |
| } |
| |
| dnnl_mem_ = nullptr; |
| } |
| |
| void NDArray::Chunk::DNNLDataReorder(const void* mem_desc) { |
| const dnnl::memory::desc md = *static_cast<const dnnl::memory::desc*>(mem_desc); |
| // If the memory already uses the specified layout, don't do anything. |
| if (dnnl_mem_ != nullptr && dnnl_mem_->SameFormat(md)) |
| return; |
| |
| // If the memory is default, don't do anything. |
| if (!mxnet::IsDNNL(md) && IsDefault()) |
| return; |
| if (!mxnet::IsDNNL(md)) { |
| // If the specified layout is default, we should use Reorder2Default. |
| Reorder2Default(); |
| return; |
| } |
| auto engine = CpuEngine::Get()->get_engine(); |
| dnnl::stream s(engine); |
| |
| std::shared_ptr<dnnl::memory> new_mem(new dnnl::memory(md, engine)); |
| std::shared_ptr<dnnl::memory> old_mem; |
| if (IsDefault()) { |
| dnnl_format_tag_t def_format = GetDefaultFormat(md); |
| dnnl::memory::desc def_desc = GetDesc(md, def_format); |
| old_mem.reset(new dnnl::memory(def_desc, engine, shandle.dptr)); |
| } else { |
| old_mem = this->dnnl_mem_->GetMem(); |
| } |
| CHECK(old_mem->get_desc().data.ndims == md.data.ndims); |
| |
| // This may be called in DNNL operators. We can't use DNNLStream here. |
| dnnl::reorder(*old_mem, *new_mem).execute(s, *old_mem, *new_mem); |
| |
| CHECK(shandle.size >= md.get_size()); |
| CheckAndAlloc(md.get_size()); |
| // TODO(zhengda) We need to avoid memory copy here. |
| memcpy(shandle.dptr, new_mem->get_data_handle(), md.get_size()); |
| dnnl_mem_.reset(new DNNLMemory(md, shandle.dptr)); |
| } |
| |
| void NDArray::Chunk::SetDNNLMem(const mxnet::TShape& shape, int dtype) { |
| // The shape of the array and the one of the DNNL memory may mismatch. |
| // For example, if the array stores parameters, the DNNL memory may store data |
| // in 5 dimensions while the NDArray stores data in 4 dimensions. |
| if (dnnl_mem_ && dnnl_mem_->GetDataHandle() == shandle.dptr && |
| dnnl_mem_->SameFormat(shape, dtype)) { |
| return; |
| } |
| |
| dnnl::memory::dims dims; |
| // These are shapes supported by DNNL. |
| const int MAX_ONEDNN_DIMS = 12; |
| if (shape.ndim() >= 1 && shape.ndim() <= MAX_ONEDNN_DIMS) { |
| dims.resize(shape.ndim()); |
| for (size_t i = 0; i < dims.size(); i++) |
| dims[i] = shape[i]; |
| } else { |
| LOG(FATAL) << "oneDNN doesn't support " << shape.ndim() << " dimensions"; |
| } |
| auto layout = static_cast<dnnl::memory::format_tag>(GetDefaultFormat(dims.size())); |
| dnnl::memory::desc data_md{dims, get_dnnl_type(dtype), layout}; |
| if (shandle.dptr == nullptr) { |
| CHECK(delay_alloc); |
| CheckAndAlloc(); |
| } |
| CHECK(shandle.size >= data_md.get_size()); |
| dnnl_mem_.reset(new DNNLMemory(data_md, shandle.dptr)); |
| } |
| |
| const dnnl::memory* NDArray::GetDNNLData(const void* mem_desc) const { |
| const dnnl::memory::desc desc = *static_cast<const dnnl::memory::desc*>(mem_desc); |
| if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) { |
| LOG(FATAL) << "The size of NDArray doesn't match the requested oneDNN memory desc"; |
| return nullptr; |
| } |
| const dnnl::memory* mem = GetDNNLData(); |
| dnnl::memory::desc desc1 = mem->get_desc(); |
| // If the DNNL memory has the same format and shape as required, |
| // or both use the default format, we can return the DNNL memory. |
| if (desc1 == desc || ((!mxnet::IsDNNL(desc1)) && (!mxnet::IsDNNL(desc)))) { |
| return GetDNNLExact(mem, desc); |
| } else { |
| return nullptr; |
| } |
| } |
| |
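| // Return oneDNN memory with the requested descriptor, reordering (and, if the |
| // shapes differ, reshaping) the current data into temporary memory as needed. |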
| const dnnl::memory* NDArray::GetDNNLDataReorder(const void* mem_desc) const { |
| dnnl::memory::desc new_desc = *static_cast<const dnnl::memory::desc*>(mem_desc); |
| CHECK(storage_type() == kDefaultStorage); |
| |
| const dnnl::memory* mem = GetDNNLData(); |
| // If the memory descriptor matches, it's easy. |
| DNNLStream* stream = DNNLStream::Get(); |
| if (mem->get_desc() == new_desc) { |
| return GetDNNLExact(mem, new_desc); |
| } |
| |
| dnnl::memory::desc old_desc = mem->get_desc(); |
| // Now we need to determine if we should reorder the memory. |
| // If both use the default formats, we think we don't need to reorder. |
| if ((!mxnet::IsDNNL(old_desc)) && (!mxnet::IsDNNL(new_desc))) { |
| dnnl_mem_ptr ret( |
| new dnnl::memory(new_desc, CpuEngine::Get()->get_engine(), mem->get_data_handle())); |
| stream->RegisterMem(ret); |
| return ret.get(); |
| } else if (same_shape(old_desc, new_desc)) { |
| // If they have the same shape, we can reorder data directly. |
| dnnl::memory* ret = TmpMemMgr::Get()->Alloc(new_desc); |
| std::unordered_map<int, dnnl::memory> args({{DNNL_ARG_FROM, *mem}, {DNNL_ARG_TO, *ret}}); |
| stream->RegisterPrimArgs(dnnl::reorder(*mem, *ret), args); |
| return ret; |
| } else { |
| // If they have different shapes, we need to reshape the array first. |
| // Since this method will only be used inside an operator, we can call |
| // DNNLDataReshape to reshape an array. |
| mxnet::TShape required_shape(new_desc.data.ndims, -1); |
| for (int i = 0; i < new_desc.data.ndims; i++) |
| required_shape[i] = new_desc.data.dims[i]; |
| NDArray reshaped = DNNLDataReshape(required_shape); |
| const dnnl::memory* ret = reshaped.GetDNNLData(); |
| if (ret->get_desc() == new_desc) { |
| return GetDNNLExact(ret, new_desc); |
| } else { |
| dnnl::memory* ret2 = TmpMemMgr::Get()->Alloc(new_desc); |
| std::unordered_map<int, dnnl::memory> args({{DNNL_ARG_FROM, *ret}, {DNNL_ARG_TO, *ret2}}); |
| stream->RegisterPrimArgs(dnnl::reorder(*ret, *ret2), args); |
| return ret2; |
| } |
| } |
| } |
| |
| NDArray NDArray::Reorder2Default() const { |
| CHECK(storage_type() == kDefaultStorage); |
| |
| if (ptr_->dnnl_mem_ == nullptr) |
| return *this; |
| if (!ptr_->dnnl_mem_->IsDNNL()) |
| return *this; |
| |
| // create new ndarray from dnnl layout |
| dnnl::memory::desc from_desc = ptr_->dnnl_mem_->GetDesc(); |
| mxnet::TShape tshape(from_desc.data.ndims, -1); |
| for (int i = 0; i < from_desc.data.ndims; i++) |
| tshape[i] = from_desc.data.dims[i]; |
| NDArray ret(tshape, ctx(), false, dtype()); |
| dnnl_format_tag_t format = ptr_->dnnl_mem_->GetDefaultFormat(); |
| dnnl::memory::desc def_desc = ptr_->dnnl_mem_->GetDesc(format); |
| CHECK(ret.ptr_->shandle.size >= def_desc.get_size()); |
| dnnl::memory def_mem(def_desc, CpuEngine::Get()->get_engine(), ret.ptr_->shandle.dptr); |
| ptr_->dnnl_mem_->ReorderTo(&def_mem); |
| // reshape as needed |
| ret.shape_ = shape_; |
| ret.byte_offset_ = byte_offset_; |
| ret.reuse_ = false; |
| return ret; |
| } |
| |
| void NDArray::SelfReorder2Default() { |
| if (!IsDNNLData()) |
| return; |
| |
| CHECK(storage_type() == kDefaultStorage); |
| |
| const auto dnnl_mem = ptr_->dnnl_mem_; |
| if (dnnl_mem == nullptr || !dnnl_mem->IsDNNL()) |
| return; |
| |
| // create new ndarray from dnnl layout |
| dnnl::memory::desc from_desc = dnnl_mem->GetDesc(); |
| mxnet::TShape tshape(from_desc.data.ndims, -1); |
| for (int i = 0; i < from_desc.data.ndims; i++) |
| tshape[i] = from_desc.data.dims[i]; |
| |
| const auto saved_shape = shape_; |
| const auto saved_byte_offset = byte_offset_; |
| this->ReInit(kDefaultStorage, tshape, ctx(), dtype(), false); |
| |
| dnnl_format_tag_t format = dnnl_mem->GetDefaultFormat(); |
| dnnl::memory::desc def_desc = dnnl_mem->GetDesc(format); |
| CHECK(ptr_->shandle.size >= def_desc.get_size()); |
| dnnl::memory def_mem(def_desc, CpuEngine::Get()->get_engine(), ptr_->shandle.dptr); |
| dnnl_mem->ReorderTo(&def_mem); |
| // reshape as needed |
| shape_ = saved_shape; |
| byte_offset_ = saved_byte_offset; |
| reuse_ = false; |
| } |
| |
| void NDArray::Reorder2DefaultAsync() const { |
| std::vector<Engine::VarHandle> const_vars; |
| std::vector<Engine::VarHandle> mutable_vars(1, this->var()); |
| NDArray tmp = *this; |
| Engine::Get()->PushAsync( |
| [tmp](RunContext ctx, |
| Engine::CallbackOnStart on_start, |
| Engine::CallbackOnComplete on_complete) { |
| on_start(); |
| tmp.ptr_->Reorder2Default(); |
| on_complete(); |
| }, |
| ctx(), |
| const_vars, |
| mutable_vars, |
| FnProperty::kNormal, |
| 0, |
| "Reorder2Default"); |
| } |
| |
| // currently only supports bf16->fp32 |
| NDArray NDArray::Reorder2DefaultFloatFormat() const { |
| CHECK(storage_type() == kDefaultStorage && IsView() == false); |
| if (dtype() != mshadow::kBfloat16) { |
| return Reorder2Default(); |
| } |
| NDArray ret(shape(), ctx(), false, mshadow::DataType<float>::kFlag); |
| auto src_mem = GetDNNLData(); |
| auto dst_mem = ret.GetDNNLData(); |
| ReorderTo(src_mem, dst_mem); |
| |
| return ret; |
| } |
| |
| void NDArray::DNNLDataReorderAsync(const void* mem_desc) const { |
| dnnl::memory::desc desc = *static_cast<const dnnl::memory::desc*>(mem_desc); |
| std::vector<Engine::VarHandle> const_vars; |
| std::vector<Engine::VarHandle> mutable_vars(1, this->var()); |
| NDArray tmp = *this; |
| const auto version = this->version(); |
| Engine::Get()->PushAsync( |
| [tmp, version, desc](RunContext ctx, |
| Engine::CallbackOnStart on_start, |
| Engine::CallbackOnComplete on_complete) { |
| on_start(); |
| // MXNet will try to reuse NDArray from memory planning, so we need to ensure |
| // the NDArray is still holding the original chunk data. |
| if (tmp.version() == version) { |
| tmp.ptr_->DNNLDataReorder(&desc); |
| } |
| on_complete(); |
| }, |
| ctx(), |
| const_vars, |
| mutable_vars, |
| FnProperty::kNormal, |
| 0, |
| "Reorder"); |
| } |
| |
| const dnnl::memory* NDArray::GetDNNLData() const { |
| CHECK(storage_type() == kDefaultStorage); |
| const auto is_view = IsView(); |
| if (IsDNNLData()) { |
| // If this array uses DNNL layout, we have to make sure it's not a view. |
| // Otherwise, we'll have to change the layout inside the array. |
| CHECK(!is_view); |
| DNNLStream::Get()->RegisterMem(ptr_->dnnl_mem_->GetMem()); |
| // If this array uses DNNL format, we should return now. Otherwise, |
| // SetDNNLMem may mess up dnnl_mem_. |
| return ptr_->dnnl_mem_->GetRaw(); |
| } |
| |
| CheckAndAlloc(); |
| if (is_view) { |
| // If this is a view, we can't create a DNNL memory for the chunk |
| // because we don't have the complete data type and shape information for |
| // the chunk. |
| void* off_addr = static_cast<char*>(ptr_->shandle.dptr) + byte_offset_; |
| // Create the primitive desc for the new dnnl memory. |
| dnnl::memory::dims dims(shape().ndim()); |
| for (size_t i = 0; i < dims.size(); i++) |
| dims[i] = shape()[i]; |
| |
| const auto cpp_format = static_cast<dnnl::memory::format_tag>(GetDefaultFormat(shape().ndim())); |
| dnnl::memory::desc data_md(dims, get_dnnl_type(dtype_), cpp_format); |
| std::shared_ptr<dnnl::memory> ret( |
| new dnnl::memory(data_md, CpuEngine::Get()->get_engine(), off_addr)); |
| DNNLStream::Get()->RegisterMem(ret); |
| return ret.get(); |
| } |
| |
| // If this isn't a view, we can create a DNNL memory and store it in the chunk |
| ptr_->SetDNNLMem(shape_, dtype_); |
| DNNLStream::Get()->RegisterMem(ptr_->dnnl_mem_->GetMem()); |
| return ptr_->dnnl_mem_->GetRaw(); |
| } |
| |
| void NDArray::InvalidateDNNLData() { |
| // Removing dnnl_mem_ means the NDArray will store data in the default format. |
| if (ptr_->dnnl_mem_ && ptr_->dnnl_mem_->IsDNNL()) |
| ptr_->dnnl_mem_ = nullptr; |
| } |
| |
| void NDArray::CopyFrom(const dnnl::memory& mem) { |
| CHECK(ptr_ != nullptr) << "The NDArray hasn't been initialized"; |
| if (ptr_->dnnl_mem_ && ptr_->dnnl_mem_->GetRaw() == &mem) |
| return; |
| |
| CHECK(mem.get_desc().get_size() == shape().Size() * GetTypeSize(dtype_)) |
| << "The size of NDArray doesn't match the requested oneDNN memory desc"; |
| // If this array uses DNNL layout, we have to make sure it's not a view. |
| // Otherwise, we'll have to change the layout inside the array. |
| |
| if (IsDNNLData() && IsView()) |
| ptr_->Reorder2Default(); |
| |
| const dnnl::memory* this_mem = GetDNNLData(); |
| DNNLMemoryCopy(mem, this_mem); |
| } |
| |
| dnnl::memory* NDArray::CreateDNNLData(const void* mem_desc) { |
| dnnl::memory::desc desc = *static_cast<const dnnl::memory::desc*>(mem_desc); |
| if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) { |
| LOG(FATAL) << "The size of NDArray doesn't match the requested oneDNN memory desc. " |
| << "oneDNN memory requests for " << desc.get_size() << " bytes, but got " |
| << shape().Size() * GetTypeSize(dtype_) << " bytes from NDArray"; |
| return nullptr; |
| } |
| bool isDefaultFormat = IsDefaultFormat(desc); |
| if (isDefaultFormat && !IsView()) { |
| ptr_->SetDNNLMem(shape_, dtype_); |
| DNNLStream::Get()->RegisterMem(ptr_->dnnl_mem_->GetMem()); |
| return GetDNNLExact(ptr_->dnnl_mem_->GetRaw(), desc); |
| } else if (isDefaultFormat) { |
| ptr_->CheckAndAlloc(); |
| CHECK(ptr_->shandle.dptr); |
| // When this is a view and a user wants the default layout, we can simply |
| // create a new dnnl memory that points to the right memory. |
| std::shared_ptr<dnnl::memory> mem( |
| new dnnl::memory(desc, |
| CpuEngine::Get()->get_engine(), |
| static_cast<char*>(ptr_->shandle.dptr) + byte_offset_)); |
| DNNLStream::Get()->RegisterMem(mem); |
| return mem.get(); |
| } else if (IsView()) { |
| // If this is a view and a user wants to write data to it with a special |
| // DNNL format, we should reorder the data in the array and return NULL. |
| // In this way, the user will create a new NDArray for the special format |
| // and copy data back. |
| ptr_->Reorder2Default(); |
| return nullptr; |
| } |
| |
| if (ptr_->dnnl_mem_) |
| CHECK(ptr_->dnnl_mem_->GetDataHandle() == ptr_->shandle.dptr); |
| if (ptr_->dnnl_mem_ && ptr_->dnnl_mem_->GetDesc() == desc) { |
| DNNLStream::Get()->RegisterMem(ptr_->dnnl_mem_->GetMem()); |
| return GetDNNLExact(ptr_->dnnl_mem_->GetRaw(), desc); |
| } |
| |
| CHECK(ptr_->shandle.size >= desc.get_size()); |
| ptr_->CheckAndAlloc(desc.get_size()); |
| ptr_->dnnl_mem_.reset(new DNNLMemory(desc, ptr_->shandle.dptr)); |
| DNNLStream::Get()->RegisterMem(ptr_->dnnl_mem_->GetMem()); |
| return ptr_->dnnl_mem_->GetRaw(); |
| } |
| |
| void NDArray::UpdateDNNLMemDesc(const void* mem_desc) { |
| dnnl::memory::desc desc = *static_cast<const dnnl::memory::desc*>(mem_desc); |
| auto new_desc = desc; |
| auto this_dtype = get_dnnl_type(dtype()); |
| new_desc.data.data_type = static_cast<dnnl_data_type_t>(this_dtype); |
| ptr_->dnnl_mem_.reset(new DNNLMemory(new_desc, ptr_->shandle.dptr)); |
| DNNLStream::Get()->RegisterMem(ptr_->dnnl_mem_->GetMem()); |
| } |
| |
| #endif |
| |
| void NDArray::SetTBlob() const { |
| CHECK(ptr_ != nullptr); |
| mxnet::TShape shape = shape_; |
| char* dptr = static_cast<char*>(ptr_->shandle.dptr); |
| auto stype = storage_type(); |
| if (stype == kDefaultStorage) { |
| #if MXNET_USE_ONEDNN == 1 |
| CHECK(!IsDNNLData()) << "We can't generate TBlob for oneDNN data. " |
| << "Please use Reorder2Default() to generate a new NDArray first"; |
| #endif |
| dptr += byte_offset_; |
| } else if (stype == kCSRStorage || stype == kRowSparseStorage) { |
| CHECK_EQ(byte_offset_, 0); |
| shape = storage_shape(); |
| } else { |
| LOG(FATAL) << "unknown storage type " << stype; |
| } |
| tblob_.dptr_ = dptr; |
| tblob_.shape_ = shape; |
| tblob_.type_flag_ = dtype_; |
| tblob_.SetDLTensor(ptr_->shandle.ctx.dev_mask(), ptr_->shandle.ctx.dev_id); |
| } |
| |
| /*! |
| * \brief run a ternary operation |
| * \param lhs left operand |
| * \param mhs middle operand |
| * \param rhs right operand |
| * \param out the output ndarray |
| */ |
| template <typename OP> |
| void TernaryOp(const NDArray& lhs, const NDArray& mhs, const NDArray& rhs, NDArray* out) { |
| // no check if all of them are on cpu |
| if (lhs.ctx().dev_mask() != cpu::kDevMask || mhs.ctx().dev_mask() != cpu::kDevMask || |
| rhs.ctx().dev_mask() != cpu::kDevMask) { |
| CHECK((lhs.ctx() == mhs.ctx()) && (mhs.ctx() == rhs.ctx())) << "operands context mismatch"; |
| } |
| // if out is none, allocate space |
| if (out->is_none()) { |
| *out = NDArray(OP::GetShape(lhs.shape(), mhs.shape(), rhs.shape()), lhs.ctx(), true); |
| } else { |
| // no check if both of them are on cpu |
| if (lhs.ctx().dev_mask() != cpu::kDevMask || out->ctx().dev_mask() != cpu::kDevMask) { |
| CHECK(out->ctx() == lhs.ctx()) << "target context mismatch"; |
| } |
| CHECK(out->shape() == OP::GetShape(lhs.shape(), mhs.shape(), rhs.shape())) |
| << "target shape mismatch"; |
| } |
| // important: callback must always capture by value |
| NDArray ret = *out; |
| // get the const variables |
| std::vector<Engine::VarHandle> const_vars; |
| if (lhs.var() != ret.var()) |
| const_vars.push_back(lhs.var()); |
| if (mhs.var() != ret.var()) |
| const_vars.push_back(mhs.var()); |
| if (rhs.var() != ret.var()) |
| const_vars.push_back(rhs.var()); |
| |
| // redirect everything to mshadow operations |
| switch (lhs.ctx().dev_mask()) { |
| case cpu::kDevMask: { |
| Engine::Get()->PushSync( |
| [lhs, mhs, rhs, ret](RunContext ctx) { |
| TBlob tmp = ret.data(); |
| ndarray::Eval<cpu, OP>(lhs.data(), mhs.data(), rhs.data(), &tmp, ctx); |
| }, |
| lhs.ctx(), |
| const_vars, |
| {ret.var()}, |
| FnProperty::kNormal, |
| 0, |
| PROFILER_MESSAGE_FUNCNAME); |
| break; |
| } |
| #if MXNET_USE_CUDA |
| case gpu::kDevMask: { |
| Engine::Get()->PushSync( |
| [lhs, mhs, rhs, ret](RunContext ctx) { |
| TBlob tmp = ret.data(); |
| ndarray::Eval<gpu, OP>(lhs.data(), mhs.data(), rhs.data(), &tmp, ctx); |
| }, |
| lhs.ctx(), |
| const_vars, |
| {ret.var()}, |
| FnProperty::kNormal, |
| 0, |
| PROFILER_MESSAGE_FUNCNAME); |
| break; |
| } |
| #endif |
| default: |
| LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; |
| } |
| } |
| |
| /*! |
| * \brief Performs some preparation required to apply binary operators. |
| * Checks context and shape of ndarrays, allocates space for output |
| * and prepares const variables for engine |
| * \param lhs left operand |
| * \param rhs right operand |
| * \param out the output ndarray |
| * \param binary_op the real operation |
| */ |
| template <typename OP> |
| std::vector<Engine::VarHandle> BinaryOpPrepare(const NDArray& lhs, |
| const NDArray& rhs, |
| NDArray* out) { |
| // no check if both of them are on cpu |
| if (lhs.ctx().dev_mask() != cpu::kDevMask || rhs.ctx().dev_mask() != cpu::kDevMask) { |
| CHECK(lhs.ctx() == rhs.ctx()) << "operands context mismatch"; |
| } |
| // if out is none, allocate space |
| if (out->is_none()) { |
| *out = NDArray(OP::GetShape(lhs.shape(), rhs.shape()), lhs.ctx(), true, lhs.dtype()); |
| } else { |
| // no check if both of them are on cpu |
| if (lhs.ctx().dev_mask() != cpu::kDevMask || out->ctx().dev_mask() != cpu::kDevMask) { |
| CHECK(out->ctx() == lhs.ctx()) << "target context mismatch"; |
| } |
| CHECK(out->shape() == OP::GetShape(lhs.shape(), rhs.shape())) << "target shape mismatch"; |
| } |
| std::vector<Engine::VarHandle> const_vars; |
| // prepare const variables for engine |
| if (lhs.var() != out->var()) |
| const_vars.push_back(lhs.var()); |
| if (rhs.var() != out->var()) |
| const_vars.push_back(rhs.var()); |
| return const_vars; |
| } |
| |
| /*! |
| * \brief run a binary operation using the kernel launch method |
| * \param lhs left operand |
| * \param rhs right operand |
| * \param out the output ndarray |
| * \param binary_op the real operation |
| */ |
| template <typename OP> |
| void BinaryOpKernel(const NDArray& lhs, const NDArray& rhs, NDArray* out) { |
| std::vector<Engine::VarHandle> const_vars = BinaryOpPrepare<OP>(lhs, rhs, out); |
| // important: callback must always capture by value |
| NDArray ret = *out; |
| switch (lhs.ctx().dev_mask()) { |
| case cpu::kDevMask: { |
| Engine::Get()->PushSync( |
| [lhs, rhs, ret](RunContext ctx) { |
| TBlob tmp = ret.data(); |
| mshadow::Stream<cpu>* s = ctx.get_stream<cpu>(); |
| ndarray::BinaryOpKernelImpl<OP>(s, lhs.data(), rhs.data(), &tmp); |
| }, |
| lhs.ctx(), |
| const_vars, |
| {ret.var()}, |
| FnProperty::kNormal, |
| 0, |
| PROFILER_MESSAGE_FUNCNAME); |
| break; |
| } |
| #if MXNET_USE_CUDA |
| case gpu::kDevMask: { |
| Engine::Get()->PushSync( |
| [lhs, rhs, ret](RunContext ctx) { |
| TBlob tmp = ret.data(); |
| mshadow::Stream<gpu>* s = ctx.get_stream<gpu>(); |
| ndarray::BinaryOpKernelImpl<OP>(s, lhs.data(), rhs.data(), &tmp); |
| }, |
| lhs.ctx(), |
| const_vars, |
| {ret.var()}, |
| FnProperty::kNormal, |
| 0, |
| PROFILER_MESSAGE_FUNCNAME); |
| break; |
| } |
| #endif |
| default: |
| LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; |
| } |
| } |
| |
| /*! |
| * \brief run a binary operation using mshadow operations |
| * \param lhs left operand |
| * \param rhs right operand |
| * \param out the output ndarray |
| * \param binary_op the real operation |
| */ |
| template <typename OP> |
| void BinaryOp(const NDArray& lhs, const NDArray& rhs, NDArray* out) { |
| std::vector<Engine::VarHandle> const_vars = BinaryOpPrepare<OP>(lhs, rhs, out); |
| // important: callback must always capture by value |
| NDArray ret = *out; |
| // redirect everything to mshadow operations |
| switch (lhs.ctx().dev_mask()) { |
| case cpu::kDevMask: { |
| Engine::Get()->PushSync( |
| [lhs, rhs, ret](RunContext ctx) { |
| TBlob tmp = ret.data(); |
| ndarray::Eval<cpu, OP>(lhs.data(), rhs.data(), &tmp, ctx); |
| }, |
| lhs.ctx(), |
| const_vars, |
| {ret.var()}, |
| FnProperty::kNormal, |
| 0, |
| PROFILER_MESSAGE_FUNCNAME); |
| break; |
| } |
| #if MXNET_USE_CUDA |
| case gpu::kDevMask: { |
| Engine::Get()->PushSync( |
| [lhs, rhs, ret](RunContext ctx) { |
| TBlob tmp = ret.data(); |
| ndarray::Eval<gpu, OP>(lhs.data(), rhs.data(), &tmp, ctx); |
| }, |
| lhs.ctx(), |
| const_vars, |
| {ret.var()}, |
| FnProperty::kNormal, |
| 0, |
| PROFILER_MESSAGE_FUNCNAME); |
| break; |
| } |
| #endif |
| default: |
| LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; |
| } |
| } |
| |
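| // Fill the target ndarray with a scalar value, dispatching on device and |
| // storage type inside an engine callback. |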
| void SetValueOp(const real_t& rhs, NDArray* out) { |
| CHECK_NE(out->is_none(), true) << "Set value target must not be empty"; |
| // important: callback must always capture by value |
| NDArray ret = *out; |
| const NDArrayStorageType stype = ret.storage_type(); |
| Engine::Get()->PushSync( |
| [rhs, ret, stype](RunContext ctx) { |
| TBlob tmp = ret.data(); |
| switch (ret.ctx().dev_mask()) { |
| case cpu::kDevMask: { |
| if (stype == kDefaultStorage) { |
| ndarray::Eval<cpu>(rhs, &tmp, ctx); |
| } else { |
| ndarray::Eval(ctx.get_stream<cpu>(), rhs, ret); |
| } |
| break; |
| } |
| #if MXNET_USE_CUDA |
| case gpu::kDevMask: { |
| if (stype == kDefaultStorage) { |
| ndarray::Eval<gpu>(rhs, &tmp, ctx); |
| } else { |
| ndarray::Eval(ctx.get_stream<gpu>(), rhs, ret); |
| } |
| break; |
| } |
| #endif |
| default: |
| LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; |
| } |
| }, |
| ret.ctx(), |
| {}, |
| {ret.var()}, |
| FnProperty::kNormal, |
| 0, |
| PROFILER_MESSAGE_FUNCNAME); |
| } |
| |
| /*! |
| * \brief run a scalar operation using mshadow operations |
| * \param lhs left operand (ndarray) |
| * \param rhs right operand (scalar) |
| * \param out the output ndarray |
| */ |
| template <typename OP, bool reverse> |
| void ScalarOp(const NDArray& lhs, const real_t& rhs, NDArray* out) { |
| if (out->is_none()) { |
| *out = NDArray(lhs.shape(), lhs.ctx(), true, lhs.dtype()); |
| } else { |
| CHECK(out->ctx() == lhs.ctx()) << "target context mismatch"; |
| CHECK(out->shape() == lhs.shape()) << "target shape mismatch"; |
| } |
| // important: callback must always capture by value |
| NDArray ret = *out; |
| // get the const variables |
| std::vector<Engine::VarHandle> const_vars; |
| if (lhs.var() != ret.var()) |
| const_vars.push_back(lhs.var()); |
| |
| // redirect everything to mshadow operations |
| switch (lhs.ctx().dev_mask()) { |
| case cpu::kDevMask: { |
| Engine::Get()->PushSync( |
| [lhs, rhs, ret](RunContext ctx) { |
| TBlob tmp = ret.data(); |
| ndarray::Eval<cpu, OP, reverse>(lhs.data(), rhs, &tmp, ctx); |
| }, |
| lhs.ctx(), |
| const_vars, |
| {ret.var()}, |
| FnProperty::kNormal, |
| 0, |
| PROFILER_MESSAGE_FUNCNAME); |
| break; |
| } |
| #if MXNET_USE_CUDA |
| case gpu::kDevMask: { |
| Engine::Get()->PushSync( |
| [lhs, rhs, ret](RunContext ctx) { |
| TBlob tmp = ret.data(); |
| ndarray::Eval<gpu, OP, reverse>(lhs.data(), rhs, &tmp, ctx); |
| }, |
| lhs.ctx(), |
| const_vars, |
| {ret.var()}, |
| FnProperty::kNormal, |
| 0, |
| PROFILER_MESSAGE_FUNCNAME); |
| break; |
| } |
| #endif |
| default: |
| LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; |
| } |
| } |
| |
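| // Number of auxiliary arrays used by each storage type: none for dense, |
| // indptr + indices for CSR, and row indices for row-sparse. |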
| size_t num_aux_data(NDArrayStorageType stype) { |
| size_t num = 0; |
| switch (stype) { |
| case kDefaultStorage: |
| num = 0; |
| break; |
| case kCSRStorage: |
| num = 2; |
| break; |
| case kRowSparseStorage: |
| num = 1; |
| break; |
| default: |
| LOG(FATAL) << "Unknown storage type" << stype; |
| break; |
| } |
| return num; |
| } |
| |
| // Make a copy of a CSR NDArray |
| template <typename from_xpu, typename to_xpu> |
| inline void CopyFromToCsrImpl(const NDArray& from, const NDArray& to, RunContext ctx) { |
| using namespace mshadow; |
| CHECK_EQ(from.storage_type(), to.storage_type()) << "Copying with different storage type"; |
| // if source storage is not initialized, fill destination with zeros |
| auto s = ctx.get_stream<to_xpu>(); |
| if (!from.storage_initialized()) { |
| op::FillZerosCsrImpl(s, to); |
| return; |
| } |
| // Allocate storage |
| to.CheckAndAllocAuxData(csr::kIndPtr, from.aux_shape(csr::kIndPtr)); |
| to.CheckAndAllocAuxData(csr::kIdx, from.aux_shape(csr::kIdx)); |
| to.CheckAndAllocData(from.aux_shape(csr::kIdx)); |
| TBlob val = to.data(); |
| TBlob indptr = to.aux_data(csr::kIndPtr); |
| TBlob idx = to.aux_data(csr::kIdx); |
| ndarray::Copy<from_xpu, to_xpu>(from.data(), &val, from.ctx(), to.ctx(), ctx); |
| ndarray::Copy<from_xpu, to_xpu>(from.aux_data(csr::kIndPtr), &indptr, from.ctx(), to.ctx(), ctx); |
| ndarray::Copy<from_xpu, to_xpu>(from.aux_data(csr::kIdx), &idx, from.ctx(), to.ctx(), ctx); |
| } |
| |
| // Make a copy of a row-sparse NDArray |
| template <typename from_xpu, typename to_xpu> |
| inline void CopyFromToRspImpl(const NDArray& from, const NDArray& to, RunContext ctx) { |
| using namespace mshadow; |
| CHECK_EQ(from.storage_type(), to.storage_type()) << "Copying with different storage type"; |
| // if source is zeros, fill destination with zeros, too |
| auto s = ctx.get_stream<to_xpu>(); |
| if (!from.storage_initialized()) { |
| op::FillZerosRspImpl(s, to); |
| return; |
| } |
| const auto& aux_shape = from.aux_shape(rowsparse::kIdx); |
| to.CheckAndAlloc({aux_shape}); |
| TBlob val = to.data(); |
| TBlob idx = to.aux_data(rowsparse::kIdx); |
| ndarray::Copy<from_xpu, to_xpu>(from.data(), &val, from.ctx(), to.ctx(), ctx); |
| ndarray::Copy<from_xpu, to_xpu>(from.aux_data(rowsparse::kIdx), &idx, from.ctx(), to.ctx(), ctx); |
| } |
| |
| // Make a copy of a dense NDArray |
| template <typename from_xpu, typename to_xpu> |
| inline void CopyFromToDnsImpl(const NDArray& from, const NDArray& to, RunContext ctx) { |
| #if MXNET_USE_ONEDNN == 1 |
| // If neither is DNNL, we can copy data normally. |
| if (!from.IsDNNLData() && !to.IsDNNLData()) { |
| #endif |
| using namespace mshadow; |
| CHECK_EQ(from.storage_type(), to.storage_type()) << "Copying with different storage type"; |
| TBlob tmp = to.data(); |
| ndarray::Copy<from_xpu, to_xpu>(from.data(), &tmp, from.ctx(), to.ctx(), ctx); |
| #if MXNET_USE_ONEDNN == 1 |
| } else if (SupportDNNL(from) && SupportDNNL(to) && from.ctx().dev_mask() == cpu::kDevMask && |
| to.ctx().dev_mask() == cpu::kDevMask) { |
| // If we copy data directly, we need to make sure both NDArrays are supported |
| // by DNNL. |
| auto from_mem = from.GetDNNLData(); |
| auto to_mem = to.GetDNNLData(); |
| if (from_mem->get_desc() == to_mem->get_desc()) { |
| size_t size = std::min(from_mem->get_desc().get_size(), to_mem->get_desc().get_size()); |
| memcpy(to_mem->get_data_handle(), from_mem->get_data_handle(), size); |
| } else { |
| const_cast<NDArray&>(to).CopyFrom(*from_mem); |
| DNNLStream::Get()->Submit(); |
| } |
| } else { |
| // In this case, one of the NDArrays isn't supported by DNNL, so we need |
| // to convert the DNNL array to the default format first and copy data |
| // with Copy(). |
| NDArray tmp_from = from; |
| if (tmp_from.IsDNNLData()) { |
| // TODO(zhengda) tmp_from should be cached. |
| tmp_from = NDArray(from.shape(), from.ctx(), false, from.dtype()); |
| auto tmp_mem = from.GetDNNLData(); |
| tmp_from.CopyFrom(*tmp_mem); |
| DNNLStream::Get()->Submit(); |
| } |
| CHECK(tmp_from.IsDefaultData()); |
| CHECK(to.IsDefaultData()); |
| TBlob tmp = to.data(); |
| ndarray::Copy<from_xpu, to_xpu>(tmp_from.data(), &tmp, from.ctx(), to.ctx(), ctx); |
| } |
| #endif |
| } |
| |
| // Make a copy of an NDArray based on storage type |
| template <typename from_xpu, typename to_xpu> |
| void CopyFromToImpl(const NDArray& from, |
| const NDArray& to, |
| RunContext rctx, |
| const std::vector<Resource>& requested) { |
| using namespace std; |
| using namespace mshadow; |
| // if storage type doesn't match, cast the storage first |
| const NDArrayStorageType from_stype = from.storage_type(); |
| const NDArrayStorageType to_stype = to.storage_type(); |
| CHECK(from_stype == kDefaultStorage || to_stype == kDefaultStorage || from_stype == to_stype) |
| << "Copying ndarray of stype = " << from_stype << " to stype = " << to_stype |
| << " is not supported"; |
| const Context from_ctx = from.ctx(); |
| const Context to_ctx = to.ctx(); |
| bool is_train = Imperative::Get()->is_training(); |
| |
| OpContext opctx{ |
| Imperative::Get()->is_recording(), is_train, rctx, engine::CallbackOnComplete(), requested}; |
| if (from_ctx == to_ctx && from_stype != to_stype) { |
| // same ctx, different stypes, use cast op directly without copying |
| common::CastStorageDispatch<from_xpu>(opctx, from, to); |
| } else { |
| NDArray casted_nd; // an intermediate result used when copying 'from' to 'to' |
| if (from_stype == to_stype) { |
| casted_nd = from; // same stype, no need to cast from |
| } else { // different stypes on different ctxs need a temporary casted_nd |
| const mxnet::TShape& shape = from.shape(); |
| if (to_stype == kDefaultStorage) { |
| casted_nd = NDArray(shape, from_ctx); |
| } else { |
| casted_nd = NDArray(to_stype, shape, from_ctx); |
| } |
| // convert from_nd to the same stype as to_nd |
| common::CastStorageDispatch<from_xpu>(opctx, from, casted_nd); |
| } |
| |
| if (to_stype == kDefaultStorage) { |
| CopyFromToDnsImpl<from_xpu, to_xpu>(casted_nd, to, rctx); |
| } else if (to_stype == kRowSparseStorage) { |
| CopyFromToRspImpl<from_xpu, to_xpu>(casted_nd, to, rctx); |
| } else if (to_stype == kCSRStorage) { |
| CopyFromToCsrImpl<from_xpu, to_xpu>(casted_nd, to, rctx); |
| } else { |
| LOG(FATAL) << "unknown storage type" << to_stype; |
| } |
| } |
| } |
| |
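| // Schedule an asynchronous copy from one NDArray to another, casting storage |
| // type if necessary and choosing the appropriate engine function (CPU2CPU, |
| // CPU2GPU, GPU2CPU or GPU2GPU) based on the device masks. |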
| void CopyFromTo(const NDArray& from, const NDArray& to, int priority, bool is_opr) { |
| if (from.var() == to.var() && from.byte_offset() == to.byte_offset()) { |
| // skip copying to itself |
| return; |
| } |
| CHECK(from.shape() == to.shape()) |
| << "operands shape mismatch " |
| << "from.shape = " << from.shape() << " to.shape=" << to.shape(); |
| CHECK(!mxnet::op::shape_is_none(from.shape())) << "source operands have undefined shape"; |
| // zero-size array, no need to copy |
| if (from.shape().Size() == 0U) { |
| return; |
| } |
| // important: callback must always capture by value |
| const Context from_ctx = from.ctx(); |
| const int a = from_ctx.dev_mask(); |
| const int b = to.ctx().dev_mask(); |
| std::vector<Engine::VarHandle> const_vars; |
| if (from.var() != to.var()) |
| const_vars.push_back(from.var()); |
| |
| const NDArrayStorageType from_stype = from.storage_type(); |
| const NDArrayStorageType to_stype = to.storage_type(); |
| |
| std::vector<Engine::VarHandle> mutable_vars(1, to.var()); |
| |
| std::vector<Resource> requested; |
| if (from_stype != to_stype) { |
| using namespace common; |
| static bool log = dmlc::GetEnv("MXNET_STORAGE_FALLBACK_LOG_VERBOSE", true); |
| if (log) { |
| std::ostringstream os; |
| os << "\nStorage fallback detected:\n" |
| << "Copy from " << stype_string(from_stype) << " storage type on " << dev_type_string(a) |
| << " to " << stype_string(to_stype) << " storage type on " << dev_type_string(b) |
| << ".\nA temporary ndarray with " << stype_string(to_stype) |
| << " storage type will be generated in order to perform the copy. " |
| "This does not affect the correctness of the programme. " |
| "You can set environment variable " |
| "MXNET_STORAGE_FALLBACK_LOG_VERBOSE to 0 to suppress this warning."; |
| LogOnce(os.str()); |
| } |
| |
| // request temp resource if cast_storage performs on GPU |
| if (a == gpu::kDevMask) { |
| Resource rsc = |
| ResourceManager::Get()->Request(from_ctx, ResourceRequest(ResourceRequest::kTempSpace)); |
| requested.push_back(rsc); |
| mutable_vars.push_back(rsc.var); |
| } |
| } |
| |
| if (a == cpu::kDevMask && b == cpu::kDevMask) { |
| Engine::Get()->PushAsync( |
| [from, to, requested](RunContext ctx, |
| Engine::CallbackOnStart on_start, |
| Engine::CallbackOnComplete on_complete) { |
| on_start(); |
| CopyFromToImpl<cpu, cpu>(from, to, ctx, requested); |
| on_complete(); |
| }, |
| from.ctx(), |
| const_vars, |
| mutable_vars, |
| FnProperty::kNormal, |
| priority, |
| "CopyCPU2CPU"); |
| } else { |
| #if MXNET_USE_CUDA |
| if (a == cpu::kDevMask && b == gpu::kDevMask) { |
| Engine::Get()->PushAsync( |
| [from, to, requested](RunContext ctx, |
| Engine::CallbackOnStart on_start, |
| Engine::CallbackOnComplete on_complete) { |
| on_start(); |
| CopyFromToImpl<cpu, gpu>(from, to, ctx, requested); |
| on_complete(); |
| }, |
| to.ctx(), |
| const_vars, |
| mutable_vars, |
| FnProperty::kCopyToGPU, |
| priority, |
| "CopyCPU2GPU"); |
| } else if (a == gpu::kDevMask && b == cpu::kDevMask) { |
| Engine::Get()->PushAsync( |
| [from, to, requested](RunContext ctx, |
| Engine::CallbackOnStart on_start, |
| Engine::CallbackOnComplete on_complete) { |
| on_start(); |
| CopyFromToImpl<gpu, cpu>(from, to, ctx, requested); |
| on_complete(); |
| }, |
| from.ctx(), |
| const_vars, |
| mutable_vars, |
| FnProperty::kCopyFromGPU, |
| priority, |
| "CopyGPU2CPU"); |
| } else if (a == gpu::kDevMask && b == gpu::kDevMask) { |
| Engine::Get()->PushAsync( |
| [from, to, requested](RunContext ctx, |
| Engine::CallbackOnStart on_start, |
| Engine::CallbackOnComplete on_complete) { |
| on_start(); |
| CopyFromToImpl<gpu, gpu>(from, to, ctx, requested); |
| on_complete(); |
| }, |
| from.ctx(), |
| const_vars, |
| mutable_vars, |
| from.dtype() != to.dtype() ? FnProperty::kNormal : FnProperty::kCopyFromGPU, |
| priority, |
| is_opr ? "_copyto_GPU2GPU" : "CopyGPU2GPU"); |
| } else { |
| LOG(FATAL) << "unknown device mask"; |
| } |
| #else |
| LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; |
| #endif |
| } |
| } |
| |
| void CopyFromTo(const NDArray& from, const NDArray* to, int priority) { |
| CopyFromTo(from, *to, priority); |
| } |
| |
| void ElementwiseSum(const std::vector<NDArray>& source, NDArray* out, int priority) { |
| std::vector<Engine::VarHandle> const_vars; |
| const_vars.reserve(source.size()); |
| for (const auto& source_array : source) { |
| if (source_array.var() != out->var()) { |
| const_vars.push_back(source_array.var()); |
| } |
| CHECK_EQ(source_array.shape(), out->shape()) << "operands shape mismatch"; |
| if (out->ctx().dev_mask() == Context::kCPU) { |
| CHECK_EQ(source_array.ctx().dev_mask(), Context::kCPU) << "operands context mismatch"; |
| } else { |
| CHECK_EQ(source_array.ctx(), out->ctx()) << "operands context mismatch"; |
| } |
| } |
| // important: callback must always capture by value |
| NDArray ret = *out; |
| |
| const NDArrayStorageType stype = ret.storage_type(); |
| |
| if (stype == kDefaultStorage) { |
| switch (out->ctx().dev_mask()) { |
| case cpu::kDevMask: { |
| Engine::Get()->PushSync( |
| [source, ret](RunContext ctx) { |
| std::vector<TBlob> source_tblob(source.size()); |
| for (size_t i = 0; i < source.size(); ++i) { |
| source_tblob[i] = source[i].data(); |
| } |
| TBlob tmp = ret.data(); |
| ndarray::ElementwiseSum<cpu>(source_tblob, &tmp, ctx); |
| }, |
| out->ctx(), |
| const_vars, |
| {ret.var()}, |
| FnProperty::kNormal, |
| priority, |
| PROFILER_MESSAGE_FUNCNAME); |
| break; |
| } |
| #if MXNET_USE_CUDA |
| case gpu::kDevMask: { |
| Engine::Get()->PushSync( |
| [source, ret](RunContext ctx) { |
| std::vector<TBlob> source_tblob(source.size()); |
| for (size_t i = 0; i < source.size(); ++i) { |
| source_tblob[i] = source[i].data(); |
| } |
| TBlob tmp = ret.data(); |
| ndarray::ElementwiseSum<gpu>(source_tblob, &tmp, ctx); |
| }, |
| out->ctx(), |
| const_vars, |
| {ret.var()}, |
| FnProperty::kNormal, |
| priority, |
| "DenseElementwiseSum"); |
| break; |
| } |
| #endif |
| default: |
| LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; |
| } |
| } else if (stype == kRowSparseStorage) { |
| Resource rsc = |
| ResourceManager::Get()->Request(ret.ctx(), ResourceRequest(ResourceRequest::kTempSpace)); |
| |
| Engine::Get()->PushSync( |
| [source, ret, rsc](RunContext rctx) { |
| NDArray result = ret; |
| switch (ret.ctx().dev_mask()) { |
| case cpu::kDevMask: { |
| mxnet::ndarray::ElementwiseSum(rctx.get_stream<cpu>(), rsc, source, &result); |
| break; |
| } |
| #if MXNET_USE_CUDA |
| case gpu::kDevMask: { |
| mxnet::ndarray::ElementwiseSum(rctx.get_stream<gpu>(), rsc, source, &result); |
| break; |
| } |
| #endif |
| default: |
| LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; |
| } |
| }, |
| ret.ctx(), |
| const_vars, |
| {ret.var(), rsc.var}, |
| FnProperty::kNormal, |
| priority, |
| "RowSparseElementwiseSum"); |
| } else { |
| LOG(FATAL) << "Not implemented for storage_type " << common::stype_string(stype); |
| } |
| } |
| |
| void ClipOp(const NDArray& src, const real_t& a_min, const real_t& a_max, NDArray* out) { |
| if (out->is_none()) { |
| *out = NDArray(src.shape(), src.ctx(), true, src.dtype()); |
| } else { |
| CHECK(out->ctx() == src.ctx()) << "target context mismatch"; |
| CHECK(out->shape() == src.shape()) << "target shape mismatch"; |
| } |
| NDArray ret = *out; |
| std::vector<Engine::VarHandle> const_vars; |
| if (src.var() != ret.var()) |
| const_vars.push_back(src.var()); |
| switch (src.ctx().dev_mask()) { |
| case cpu::kDevMask: { |
| Engine::Get()->PushSync( |
| [src, a_min, a_max, ret](RunContext ctx) { |
| TBlob tmp = ret.data(); |
| ndarray::EvalClip<cpu>(src.data(), a_min, a_max, &tmp, ctx); |
| }, |
| src.ctx(), |
| const_vars, |
| {ret.var()}, |
| FnProperty::kNormal, |
| 0, |
| PROFILER_MESSAGE_FUNCNAME); |
| break; |
| } |
| #if MXNET_USE_CUDA |
| case gpu::kDevMask: { |
| Engine::Get()->PushSync( |
| [src, a_min, a_max, ret](RunContext ctx) { |
| TBlob tmp = ret.data(); |
| ndarray::EvalClip<gpu>(src.data(), a_min, a_max, &tmp, ctx); |
| }, |
| src.ctx(), |
| const_vars, |
| {ret.var()}, |
| FnProperty::kNormal, |
| 0, |
| PROFILER_MESSAGE_FUNCNAME); |
| break; |
| } |
| #endif |
| default: |
| LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; |
| } |
| } |
| |
| template <typename Distribution> |
| void SampleOP(const real_t& a, const real_t& b, NDArray* out) { |
| CHECK(!out->is_none()); |
| Resource resource = ResourceManager::Get()->Request(out->ctx(), ResourceRequest::kRandom); |
| // important: callback must always capture by value |
| NDArray ret = *out; |
| // redirect everything to mshadow operations |
| switch (out->ctx().dev_mask()) { |
| case cpu::kDevMask: { |
| Engine::Get()->PushSync( |
| [a, b, resource, ret](RunContext ctx) { |
| TBlob tmp = ret.data(); |
| ndarray::EvalRandom<cpu, Distribution>(a, b, resource, &tmp, ctx); |
| }, |
| out->ctx(), |
| {}, |
| {ret.var(), resource.var}, |
| FnProperty::kNormal, |
| 0, |
| PROFILER_MESSAGE_FUNCNAME); |
| break; |
| } |
| #if MXNET_USE_CUDA |
| case gpu::kDevMask: { |
| Engine::Get()->PushSync( |
| [a, b, resource, ret](RunContext ctx) { |
| TBlob tmp = ret.data(); |
| ndarray::EvalRandom<gpu, Distribution>(a, b, resource, &tmp, ctx); |
| }, |
| out->ctx(), |
| {}, |
| {ret.var(), resource.var}, |
| FnProperty::kNormal, |
| 0, |
| PROFILER_MESSAGE_FUNCNAME); |
| break; |
| } |
| #endif |
| default: |
| LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; |
| } |
| } |
| |
| void SampleUniform(real_t begin, real_t end, NDArray* out) { |
| SampleOP<ndarray::UniformDistribution>(begin, end, out); |
| } |
| |
| void SampleGaussian(real_t mu, real_t sigma, NDArray* out) { |
| SampleOP<ndarray::GaussianDistribution>(mu, sigma, out); |
| } |
| |
| void SampleExponential(real_t lambda, NDArray* out) { |
| if (out->ctx().dev_mask() != cpu::kDevMask) { |
| LOG(FATAL) << "exponential sampling only valid on cpu"; |
| } |
  real_t dummy = 0;  // second parameter is unused by the exponential distribution
| SampleOP<ndarray::ExponentialDistribution>(lambda, dummy, out); |
| } |
| |
| void SamplePoisson(real_t lambda, NDArray* out) { |
| if (out->ctx().dev_mask() != cpu::kDevMask) { |
| LOG(FATAL) << "poisson sampling only valid on cpu"; |
| } |
  real_t dummy = 0;  // second parameter is unused by the Poisson distribution
| SampleOP<ndarray::PoissonDistribution>(lambda, dummy, out); |
| } |
| |
| void SampleNegBinomial(int32_t k, real_t p, NDArray* out) { |
| if (out->ctx().dev_mask() != cpu::kDevMask) { |
| LOG(FATAL) << "negative binomial sampling only valid on cpu"; |
| } |
| SampleOP<ndarray::NegBinomialDistribution>(k, p, out); |
| } |
| |
| void SampleGenNegBinomial(real_t mu, real_t alpha, NDArray* out) { |
| if (out->ctx().dev_mask() != cpu::kDevMask) { |
| LOG(FATAL) << "negative binomial sampling only valid on cpu"; |
| } |
| SampleOP<ndarray::GenNegBinomialDistribution>(mu, alpha, out); |
| } |
| |
| void RandomSeed(uint32_t seed) { |
| ResourceManager::Get()->SeedRandom(seed); |
| } |
| |
| void RandomSeed(Context ctx, uint32_t seed) { |
| ResourceManager::Get()->SeedRandom(ctx, seed); |
| } |
| |
| template <typename OP> |
| inline NDArray BinaryOpRet(const NDArray& lhs, const NDArray& rhs) { |
| NDArray ret; |
| BinaryOpKernel<OP>(lhs, rhs, &ret); |
| return ret; |
| } |
| |
| template <typename OP, bool reverse> |
| inline NDArray ScalarOpRet(const NDArray& lhs, const real_t& rhs) { |
| NDArray ret; |
| ScalarOp<OP, reverse>(lhs, rhs, &ret); |
| return ret; |
| } |
| |
| template <typename OP> |
| inline NDArray& BinaryOpApply(NDArray* dst, const NDArray& src) { |
| BinaryOpKernel<OP>(*dst, src, dst); |
| return *dst; |
| } |
| |
| template <typename OP> |
| inline NDArray& ScalarOpApply(NDArray* dst, const real_t& src) { |
| ScalarOp<OP, false>(*dst, src, dst); |
| return *dst; |
| } |
| |
| // Binary |
| NDArray operator+(const NDArray& lhs, const NDArray& rhs) { |
| return BinaryOpRet<ndarray::Plus>(lhs, rhs); |
| } |
| NDArray operator-(const NDArray& lhs, const NDArray& rhs) { |
| return BinaryOpRet<ndarray::Minus>(lhs, rhs); |
| } |
| NDArray operator*(const NDArray& lhs, const NDArray& rhs) { |
| return BinaryOpRet<ndarray::Mul>(lhs, rhs); |
| } |
| NDArray operator/(const NDArray& lhs, const NDArray& rhs) { |
| return BinaryOpRet<ndarray::Div>(lhs, rhs); |
| } |
| // Scalar |
| NDArray operator+(const NDArray& lhs, const real_t& rhs) { |
| return ScalarOpRet<ndarray::Plus, false>(lhs, rhs); |
| } |
| NDArray operator-(const NDArray& lhs, const real_t& rhs) { |
| return ScalarOpRet<ndarray::Minus, false>(lhs, rhs); |
| } |
| NDArray operator*(const NDArray& lhs, const real_t& rhs) { |
| return ScalarOpRet<ndarray::Mul, false>(lhs, rhs); |
| } |
| NDArray operator/(const NDArray& lhs, const real_t& rhs) { |
| return ScalarOpRet<ndarray::Div, false>(lhs, rhs); |
| } |
| |
| // Binary |
| NDArray& NDArray::operator=(real_t scalar) { |
| SetValueOp(scalar, this); |
| return *this; |
| } |
| |
| NDArray& NDArray::operator+=(const NDArray& src) { |
| return BinaryOpApply<ndarray::Plus>(this, src); |
| } |
| NDArray& NDArray::operator-=(const NDArray& src) { |
| return BinaryOpApply<ndarray::Minus>(this, src); |
| } |
| NDArray& NDArray::operator*=(const NDArray& src) { |
| return BinaryOpApply<ndarray::Mul>(this, src); |
| } |
| NDArray& NDArray::operator/=(const NDArray& src) { |
| return BinaryOpApply<ndarray::Div>(this, src); |
| } |
| // Scalar |
| NDArray& NDArray::operator+=(const real_t& src) { |
| return ScalarOpApply<ndarray::Plus>(this, src); |
| } |
| NDArray& NDArray::operator-=(const real_t& src) { |
| return ScalarOpApply<ndarray::Minus>(this, src); |
| } |
| NDArray& NDArray::operator*=(const real_t& src) { |
| return ScalarOpApply<ndarray::Mul>(this, src); |
| } |
| NDArray& NDArray::operator/=(const real_t& src) { |
| return ScalarOpApply<ndarray::Div>(this, src); |
| } |
| |
| /* magic number for ndarray version 1, with int64_t mxnet::TShape */ |
| static const uint32_t NDARRAY_V1_MAGIC = 0xF993fac8; |
| |
| /* magic number for ndarray version 2, with storage type */ |
| static const uint32_t NDARRAY_V2_MAGIC = 0xF993fac9; |
| |
| // magic number for ndarray version 3, with np shape semantics. |
| // The ndarray must be saved and loaded within np shape semantics. |
| static const uint32_t NDARRAY_V3_MAGIC = 0xF993faca; |
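
// Summary of the on-disk layout produced by NDArray::Save below (V2/V3):
//   [magic : uint32]  [storage type : int32]
//   [storage shape]               -- only for sparse storage (num_aux_data > 0)
//   [shape]                       -- an empty ndarray stops here
//   [context]  [dtype : int32]
//   [aux type : int32, aux shape] -- repeated num_aux_data times
//   [data bytes]
//   [aux data bytes]              -- repeated num_aux_data times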
| |
| void NDArray::Save(dmlc::Stream* strm) const { |
| if (Imperative::Get()->is_np_shape()) { |
| CHECK_EQ(storage_type(), kDefaultStorage) |
| << "only allow serializing ndarray of default storage type in np shape semantics"; |
| strm->Write(NDARRAY_V3_MAGIC); |
| } else { |
| // write magic number to mark this version |
| // for storage type |
| strm->Write(NDARRAY_V2_MAGIC); |
| } |
| |
| // save storage type |
| int32_t stype = storage_type(); |
| strm->Write(&stype, sizeof(stype)); |
| |
| const int32_t nad = num_aux_data(storage_type()); |
| // save storage shape if ndarray is sparse |
| if (nad > 0) { |
| storage_shape().Save(strm); |
| } |
| |
| // save shape |
| shape_.Save(strm); |
| if (is_none()) |
| return; |
| |
| // save context |
| Context ctx = this->ctx(); |
| ctx.Save(strm); |
| TBlob save_data; |
| NDArray nd_cpu; // a copy of *this on cpu |
| if (ctx.dev_mask() != cpu::kDevMask) { |
| nd_cpu = this->Copy(Context::CPU()); |
| nd_cpu.WaitToRead(); |
| save_data = nd_cpu.data(); |
| } else { |
| this->WaitToRead(); |
| nd_cpu = *this; |
| #if MXNET_USE_ONEDNN == 1 |
| if (nd_cpu.IsDNNLData()) |
| nd_cpu = nd_cpu.Reorder2Default(); |
| #endif |
| save_data = nd_cpu.data(); |
| } |
| |
| // save type flag |
| int32_t type_flag = save_data.type_flag_; |
| strm->Write(&type_flag, sizeof(type_flag)); |
| |
| // save aux_types and aux_shapes |
| if (nad > 0) { |
| for (int i = 0; i < nad; ++i) { |
| int32_t aux_type_flag = aux_type(i); |
| strm->Write(&aux_type_flag, sizeof(aux_type_flag)); |
| aux_shape(i).Save(strm); |
| } |
| } |
| |
| // save data |
| CHECK(save_data.CheckContiguous()); |
| size_t type_size = mshadow::mshadow_sizeof(type_flag); |
| // save data could be values of sparse tensors |
| // must use save_data.shape_ instead of this->shape_ |
| strm->Write(save_data.dptr_, type_size * save_data.shape_.Size()); |
| |
| // save aux data |
| if (nad > 0) { |
| for (int i = 0; i < nad; ++i) { |
      TBlob aux_save_data = nd_cpu.aux_data(i);
      // save aux_data
      CHECK(aux_save_data.CheckContiguous());
      size_t aux_type_size = mshadow::mshadow_sizeof(aux_type(i));
      strm->Write(aux_save_data.dptr_, aux_type_size * aux_save_data.Size());
| } |
| } |
| } |
| |
| bool LegacyTShapeLoad(dmlc::Stream* strm, mxnet::TShape* shape, const uint32_t magic) { |
| switch (magic) { |
| case NDARRAY_V1_MAGIC: |
| return shape->Load(strm); |
| default: |
      // legacy mxnet::TShape encountered: the magic value actually stores ndim here
| uint32_t ndim = magic; |
| *shape = mxnet::TShape(ndim, -1); |
| std::vector<uint32_t> buffer(ndim); |
| size_t nread = ndim * sizeof(uint32_t); |
| if (strm->Read(buffer.data(), nread) != nread) |
| return false; |
| nnvm::ShapeTypeCast(buffer.begin(), buffer.end(), shape->begin()); |
| return true; |
| } |
| } |
| |
| bool NDArray::LegacyLoad(dmlc::Stream* strm, const uint32_t magic) { |
| // load shape |
| mxnet::TShape shape; |
| if (!LegacyTShapeLoad(strm, &shape, magic)) |
| return false; |
| if (mxnet::op::shape_is_none(shape)) { |
| *this = NDArray(); |
| return true; |
| } |
| // load context |
| Context ctx; |
| if (!ctx.Load(strm)) |
| return false; |
| // load type flag |
| int32_t type_flag; |
| if (strm->Read(&type_flag, sizeof(type_flag)) != sizeof(type_flag)) |
| return false; |
| // load data into CPU |
| NDArray temp(shape, Context::CPU(), false, type_flag); |
| TBlob load_data = temp.data(); |
| size_t type_size = mshadow::mshadow_sizeof(type_flag); |
| size_t nread = type_size * shape.Size(); |
| |
| if (strm->Read(load_data.dptr_, nread) != nread) |
| return false; |
| if (ctx.dev_mask() == cpu::kDevMask) { |
| *this = std::move(temp); |
| return true; |
| } else { |
| #if MXNET_USE_CUDA |
| *this = temp.Copy(ctx); |
| return true; |
| #else |
| *this = std::move(temp); |
| return true; |
| #endif |
| } |
| } |
| |
| bool NDArray::Load(dmlc::Stream* strm) { |
| uint32_t magic; |
| if (strm->Read(&magic, sizeof(uint32_t)) != sizeof(uint32_t)) |
| return false; |
| if (magic == NDARRAY_V3_MAGIC) { |
| CHECK(Imperative::Get()->is_np_shape()) |
| << "ndarray was saved in np shape semantics, must be loaded in the same semantics." |
| " Please turn on np shape semantics in Python using `with np_shape(True)`" |
| " or decorator `use_np_shape` to scope the code of loading the ndarray."; |
| } else { |
    // when np shape semantics is globally on, skip the check since the flag is always on.
| CHECK(Imperative::Get()->is_np_shape() == GlobalOn || !Imperative::Get()->is_np_shape()) |
| << "ndarray was not saved in np shape semantics, but being loaded in np shape semantics." |
| " Please turn off np shape semantics in Python using `with np_shape(False)`" |
| " to scope the code of loading the ndarray."; |
| } |
| if (magic != NDARRAY_V2_MAGIC && magic != NDARRAY_V3_MAGIC) { |
| return LegacyLoad(strm, magic); |
| } |
| |
| // load storage type |
| int32_t stype; |
| if (strm->Read(&stype, sizeof(stype)) != sizeof(stype)) |
| return false; |
| if (Imperative::Get()->is_np_shape()) { |
| CHECK_EQ(stype, kDefaultStorage) |
| << "only allow deserializing ndarray of default storage type in np shape semantics"; |
| } |
| const int32_t nad = num_aux_data(static_cast<NDArrayStorageType>(stype)); |
| |
| // load storage shape |
| mxnet::TShape sshape; |
| if (nad > 0) { |
| if (!sshape.Load(strm)) |
| return false; |
| } |
| |
| // load shape |
| mxnet::TShape shape; |
| if (!shape.Load(strm)) |
| return false; |
| if (Imperative::Get()->is_np_shape()) { |
| if (!shape_is_known(shape)) { |
| *this = NDArray(); |
| return true; |
| } |
| } else if (shape.ndim() == 0) { |
| *this = NDArray(); |
| return true; |
| } |
| |
| // load context |
| Context ctx; |
| if (!ctx.Load(strm)) |
| return false; |
| |
| // load type flag |
| int32_t type_flag; |
| if (strm->Read(&type_flag, sizeof(type_flag)) != sizeof(type_flag)) |
| return false; |
| |
| // load aux_types and aux_shapes |
| std::vector<int32_t> aux_types; |
| mxnet::ShapeVector aux_shapes; |
| if (nad > 0) { |
| aux_types.resize(nad); |
| aux_shapes.resize(nad); |
| for (int i = 0; i < nad; ++i) { |
| // load aux_type(i) |
| if (strm->Read(&aux_types[i], sizeof(aux_types[i])) != sizeof(aux_types[i])) |
| return false; |
| // load aux_shapes(i) |
| if (!aux_shapes[i].Load(strm)) |
| return false; |
| } |
| } |
| |
| // load data into CPU |
| NDArray temp; |
| if (0 == nad) { |
| temp = NDArray(shape, Context::CPU(), false, type_flag); |
| } else { |
| temp = NDArray(static_cast<NDArrayStorageType>(stype), |
| shape, |
| Context::CPU(), |
| false, |
| type_flag, |
| aux_types, |
| aux_shapes, |
| sshape); |
| } |
| // load data |
| TBlob load_data = temp.data(); |
| size_t type_size = mshadow::mshadow_sizeof(type_flag); |
| size_t nread = type_size * load_data.Size(); |
| if (strm->Read(load_data.dptr_, nread) != nread) |
| return false; |
| |
| // load aux_data |
| if (nad > 0) { |
| for (int i = 0; i < nad; ++i) { |
| load_data = temp.aux_data(i); |
| type_size = mshadow::mshadow_sizeof(load_data.type_flag_); |
| nread = type_size * load_data.Size(); |
| if (strm->Read(load_data.dptr_, nread) != nread) |
| return false; |
| } |
| } |
| |
| if (ctx.dev_mask() == cpu::kDevMask) { |
| *this = std::move(temp); |
| return true; |
| } else { |
| #if MXNET_USE_CUDA |
| int device_count = -1; |
| cudaGetDeviceCount(&device_count); |
| if (device_count > 0) { |
| *this = temp.Copy(ctx); |
| return true; |
| } else { |
| *this = std::move(temp); |
| return true; |
| } |
| #else |
| *this = std::move(temp); |
| return true; |
| #endif |
| } |
| } |
| |
| const uint64_t kMXAPINDArrayListMagic = 0x112; |
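
// Layout of an NDArray list file as written/read by the two functions below:
//   [kMXAPINDArrayListMagic : uint64]  [reserved : uint64]
//   [std::vector<NDArray> data]  [std::vector<std::string> names]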
| |
| void NDArray::Save(dmlc::Stream* fo, |
| const std::vector<NDArray>& data, |
| const std::vector<std::string>& names) { |
| uint64_t header = kMXAPINDArrayListMagic, reserved = 0; |
| fo->Write(&header, sizeof(header)); |
| fo->Write(&reserved, sizeof(reserved)); |
| fo->Write(data); |
| fo->Write(names); |
| } |
| |
| void NDArray::Load(dmlc::Stream* fi, std::vector<NDArray>* data, std::vector<std::string>* keys) { |
| uint64_t header, reserved; |
| CHECK(fi->Read(&header)) << "Invalid NDArray file format"; |
| CHECK(fi->Read(&reserved)) << "Invalid NDArray file format"; |
| CHECK(header == kMXAPINDArrayListMagic) << "Invalid NDArray file format"; |
| CHECK(fi->Read(data)) << "Invalid NDArray file format"; |
| CHECK(fi->Read(keys)) << "Invalid NDArray file format"; |
| CHECK(keys->size() == 0 || keys->size() == data->size()) << "Invalid NDArray file format"; |
| } |
| |
| NDArray NDArray::Copy(Context ctx) const { |
| NDArray ret; |
| if (kDefaultStorage == storage_type()) { |
| ret = NDArray(shape(), ctx, false, dtype_); |
| } else if (kUndefinedStorage != storage_type()) { |
| ret = NDArray(storage_type(), |
| shape(), |
| ctx, |
| false, |
| dtype_, |
| ptr_->aux_types, |
| ptr_->aux_shapes, |
| storage_shape()); |
| } else { |
| LOG(FATAL) << "NDArray::Copy cannot copy undefined storage-type ndarray to ctx.dev_type=" |
| << ctx.dev_type << ", ctx.dev_id=" << ctx.dev_id; |
| } |
| CopyFromTo(*this, ret); |
| return ret; |
| } |
| |
| void NDArray::SyncCopyFromCPU(const void* data, size_t size) const { |
| mxnet::TShape dshape = this->shape(); |
| if (!features::is_enabled(features::INT64_TENSOR_SIZE)) { |
    CHECK_LT(size, (int64_t{1} << 31) - 1)
        << "[SyncCopyFromCPU] Size of the tensor you are trying to copy is larger than "
           "2^31 elements. Please build with the flag USE_INT64_TENSOR_SIZE=1";
| } |
  CHECK_EQ(dshape.Size(), size) << "Memory size does not match";
| // zero-size array, no need to copy |
| if (size == 0U) { |
| return; |
| } |
| TBlob src((void*)data, dshape, cpu::kDevMask, this->dtype_, 0); // NOLINT(*) |
| |
| if (this->ctx().dev_mask() == cpu::kDevMask) { |
| this->WaitToWrite(); |
| RunContext rctx{this->ctx(), nullptr, nullptr}; |
| TBlob dst = this->data(); |
| ndarray::Copy<cpu, cpu>(src, &dst, Context::CPU(), Context::CPU(), rctx); |
| } else { |
| #if MXNET_USE_CUDA |
| Engine::Get()->PushAsync( |
| [&](RunContext rctx, |
| Engine::CallbackOnStart on_start, |
| Engine::CallbackOnComplete on_complete) { |
| on_start(); |
| TBlob dst = this->data(); |
| ndarray::Copy<cpu, gpu>(src, &dst, Context::CPU(), this->ctx(), rctx); |
| on_complete(); |
| }, |
| this->ctx(), |
| {}, |
| {this->var()}, |
| FnProperty::kCopyToGPU, |
| 0, |
| "SyncCopyCPU2GPU"); |
| this->WaitToRead(); |
| #else |
| LOG(FATAL) << "GPU is not enabled"; |
| #endif |
| } |
| } |
| |
| /*! |
| * \brief Copy src.data()/aux_data(i) to dst->data()/aux_data(j). |
| */ |
| void NDArray::SyncCopyFromNDArray(const NDArray& src, int i, int j) { |
| if (i >= 0) { |
| CHECK_NE(src.storage_type(), kDefaultStorage); |
| } else { |
| CHECK(!src.is_none()) << "src dense ndarray must have been initialized"; |
| } |
| if (j >= 0) { |
| CHECK_NE(storage_type(), kDefaultStorage); |
| } else { |
| CHECK(!this->is_none()) << "dst dense ndarray must have been initialized"; |
| } |
| |
| if (src.var() == var()) { |
| // skip to copy to itself |
| LOG(WARNING) << "SyncCopyFromNDArray does not support copying to self"; |
| return; |
| } |
| const int src_dev_mask = src.ctx().dev_mask(); |
| const int dst_dev_mask = ctx().dev_mask(); |
| std::vector<Engine::VarHandle> const_vars; |
| const_vars.push_back(src.var()); |
| |
  // get or create a dst tblob for copying src into it:
  // if dst uses dense storage, (re)allocate its memory to match src's shape;
  // otherwise, if dst's sparse storage has not been initialized, allocate the
  // corresponding data or aux blob for it
| auto get_dst_data = [&](const mxnet::TShape& src_shape) { |
| if (this->storage_type() == kDefaultStorage) { |
| this->ReshapeAndAlloc(src_shape); |
| } else if (!this->storage_initialized()) { |
| if (j < 0) { |
| this->CheckAndAllocData(src_shape); |
| } else { |
| this->CheckAndAllocAuxData(j, src_shape); |
| } |
| } |
| TBlob dst_data = (j >= 0 ? this->aux_data(j) : this->data()); |
| CHECK_LE(src_shape.Size(), dst_data.shape_.Size()); |
| return dst_data; |
| }; |
| |
| if (src_dev_mask == cpu::kDevMask && dst_dev_mask == cpu::kDevMask) { |
| Engine::Get()->PushSync( |
| [&](RunContext rctx) { |
| const TBlob src_data = (i >= 0 ? src.aux_data(i) : src.data()); |
| TBlob dst_data = get_dst_data(src_data.shape_); |
| ndarray::Copy<cpu, cpu>(src_data, &dst_data, src.ctx(), this->ctx(), rctx); |
| }, |
| this->ctx(), |
| const_vars, |
| {this->var()}, |
| FnProperty::kNormal, |
| 0, |
| "SyncCopyFromNDArrayCPU2CPU"); |
| } else { |
| #if MXNET_USE_CUDA |
| if (src_dev_mask == cpu::kDevMask && dst_dev_mask == gpu::kDevMask) { |
| Engine::Get()->PushAsync( |
| [&](RunContext rctx, |
| Engine::CallbackOnStart on_start, |
| Engine::CallbackOnComplete on_complete) { |
| on_start(); |
| const TBlob src_data = (i >= 0 ? src.aux_data(i) : src.data()); |
| TBlob dst_data = get_dst_data(src_data.shape_); |
| ndarray::Copy<cpu, gpu>(src_data, &dst_data, src.ctx(), this->ctx(), rctx); |
| on_complete(); |
| }, |
| this->ctx(), |
| const_vars, |
| {this->var()}, |
| FnProperty::kCopyToGPU, |
| 0, |
| "SyncCopyFromNDArrayCPU2GPU"); |
| } else if (src_dev_mask == gpu::kDevMask && dst_dev_mask == cpu::kDevMask) { |
| Engine::Get()->PushAsync( |
| [&](RunContext rctx, |
| Engine::CallbackOnStart on_start, |
| Engine::CallbackOnComplete on_complete) { |
| on_start(); |
| const TBlob src_data = (i >= 0 ? src.aux_data(i) : src.data()); |
| TBlob dst_data = get_dst_data(src_data.shape_); |
| ndarray::Copy<gpu, cpu>(src_data, &dst_data, src.ctx(), this->ctx(), rctx); |
| on_complete(); |
| }, |
| src.ctx(), |
| const_vars, |
| {this->var()}, |
| FnProperty::kCopyFromGPU, |
| 0, |
| "SyncCopyFromNDArrayGPU2CPU"); |
| } else if (src_dev_mask == gpu::kDevMask && dst_dev_mask == gpu::kDevMask) { |
| Engine::Get()->PushAsync( |
| [&](RunContext rctx, |
| Engine::CallbackOnStart on_start, |
| Engine::CallbackOnComplete on_complete) { |
| on_start(); |
| const TBlob src_data = (i >= 0 ? src.aux_data(i) : src.data()); |
| TBlob dst_data = get_dst_data(src_data.shape_); |
| ndarray::Copy<gpu, gpu>(src_data, &dst_data, src.ctx(), this->ctx(), rctx); |
| on_complete(); |
| }, |
| this->ctx(), |
| const_vars, |
| {this->var()}, |
| src.dtype() != this->dtype() ? FnProperty::kNormal : FnProperty::kCopyFromGPU, |
| 0, |
| "SyncCopyFromNDArrayGPU2GPU"); |
| } else { |
| LOG(FATAL) << "unknown device mask"; |
| } |
| #else |
| LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; |
| #endif |
| } |
  // The copy operation was pushed to the engine for execution,
  // so we need to wait here for it to complete.
  // The reason for pushing the copy operation to the engine
  // is that, when copying data from a sparse tensor to the current one,
  // that sparse ndarray's storage_shape/aux_shape may not be ready yet
  // or may still change, so we need to ensure thread safety when reading
  // the correct shape info to allocate memory for the current ndarray.
| WaitToRead(); |
| } |
| |
| void NDArray::SyncCopyToCPU(void* data, size_t size) const { |
| mxnet::TShape dshape = this->shape(); |
| if (!features::is_enabled(features::INT64_TENSOR_SIZE)) { |
    CHECK_LT(size, (int64_t{1} << 31) - 1)
        << "[SyncCopyToCPU] Size of the tensor you are trying to copy is larger than "
           "2^31 elements. Please build with the flag USE_INT64_TENSOR_SIZE=1";
| } |
  CHECK_EQ(dshape.Size(), size) << "Memory size does not match";
| // zero-size array, no need to copy |
| if (size == 0U) { |
| return; |
| } |
| TBlob dst(data, dshape, cpu::kDevMask, this->dtype_, 0); // NOLINT(*) |
| |
| this->WaitToRead(); |
| |
| if (this->ctx().dev_mask() == cpu::kDevMask) { |
| RunContext rctx{this->ctx(), nullptr, nullptr}; |
| NDArray src = *this; |
| #if MXNET_USE_ONEDNN == 1 |
| if (src.IsDNNLData()) |
| src = this->Reorder2Default(); |
| #endif |
| ndarray::Copy<cpu, cpu>(src.data(), &dst, Context::CPU(), Context::CPU(), rctx); |
| } else { |
| #if MXNET_USE_CUDA |
| Engine::Get()->PushAsync( |
| [&](RunContext rctx, |
| Engine::CallbackOnStart on_start, |
| Engine::CallbackOnComplete on_complete) { |
| on_start(); |
| { |
| auto var = this->var(); |
| auto& sync_obj = var->sync_object; |
| std::lock_guard<std::mutex> lock{sync_obj.mutex}; |
| bool has_writer = false; |
| std::shared_ptr<cudaEvent_t> w_ev_ptr; |
| if (!sync_obj.writer_event.empty()) { |
| w_ev_ptr = sync_obj.writer_event[0].event.lock(); |
| has_writer = w_ev_ptr ? true : false; |
| } |
| for (auto ev : sync_obj.reader_events) { |
| auto event_ptr = ev.event.lock(); |
| if (!event_ptr) { |
| continue; |
| } |
| cudaEvent_t event = *event_ptr; |
| if (has_writer) { |
| auto w_ev = sync_obj.writer_event[0]; |
| if (w_ev.stream == ev.stream) { |
| event = w_ev.pool_index > ev.pool_index ? *w_ev_ptr : *event_ptr; |
| has_writer = false; |
| } |
| } |
| CUDA_CALL(cudaEventSynchronize(event)); |
| } |
| if (has_writer) { |
| CUDA_CALL(cudaEventSynchronize(*w_ev_ptr)); |
| } |
| } |
| ndarray::Copy<gpu, cpu>(this->data(), &dst, this->ctx(), Context::CPU(), rctx); |
| on_complete(); |
| }, |
| this->ctx(), |
| {this->var()}, |
| {}, |
| FnProperty::kCopyFromGPU, |
| 0, |
| "SyncCopyGPU2CPU"); |
| this->WaitToWrite(); |
| #else |
| LOG(FATAL) << "GPU is not enabled"; |
| #endif |
| } |
| } |
| |
| void NDArray::SyncCheckFormat(const bool full_check) const { |
| int32_t err = kNormalErr; |
| TBlob err_cpu(&err, mshadow::Shape1(1), cpu::kDevMask, 0); |
| if (this->ctx().dev_mask() == cpu::kDevMask) { |
| Engine::Get()->PushSync( |
| [&](RunContext rctx) { common::CheckFormatWrapper<cpu>(rctx, *this, err_cpu, full_check); }, |
| this->ctx(), |
| {this->var()}, |
| {}, |
| FnProperty::kNormal, |
| 0, |
| "CheckFormat"); |
| } else { |
| #if MXNET_USE_CUDA |
| Engine::Get()->PushSync( |
| [&](RunContext rctx) { common::CheckFormatWrapper<gpu>(rctx, *this, err_cpu, full_check); }, |
| this->ctx(), |
| {this->var()}, |
| {}, |
| FnProperty::kNormal, |
| 0, |
| "CheckFormat"); |
| #else |
| LOG(FATAL) << "GPU is not enabled"; |
| #endif |
| } |
| this->WaitToWrite(); |
| CHECK_NE(err, kCSRShapeErr) << "Shape mismatch of this csr NDArray"; |
| CHECK_NE(err, kCSRIndPtrErr) |
| << "IndPtr of csr NDArray should be non-negative, in non-decreasing order, " |
| << "start with 0, and end with value equal with size of indices."; |
| CHECK_NE(err, kCSRIdxErr) |
| << "Indices of csr NDArray should be non-negative, in ascending order per row " |
| << " and less than the number of columns."; |
| CHECK_NE(err, kRSPShapeErr) << "Shape mismatch of this row_sparse NDArray"; |
| CHECK_NE(err, kRSPIdxErr) << "Indices of row_sparse NDArray should be non-negative, " |
| << "less than the size of first dimension and in ascending order"; |
| CHECK_EQ(err, kNormalErr) << "Check the validity of this sparse NDArray"; |
| } |
| |
| void NDArray::WaitToRead() const { |
| if (is_none()) |
| return; |
| Imperative::DCInfo::Compute(*this); |
| Engine::Get()->WaitForVar(ptr_->var); |
| } |
| |
| void NDArray::WaitToWrite() const { |
| if (is_none()) |
| return; |
| Imperative::DCInfo::Compute(*this); |
| // Push an empty mutable function to flush all preceding reads to the variable. |
| Engine::Get()->PushAsync( |
| [](RunContext, Engine::CallbackOnStart on_start, Engine::CallbackOnComplete on_complete) { |
| on_start(); |
| on_complete(); |
| }, |
| Context{}, |
| {}, |
| {ptr_->var}); |
| Engine::Get()->WaitForVar(ptr_->var); |
| } |
| |
| void NDArray::StreamSync(int stream) const { |
| if (is_none()) |
| return; |
| Imperative::DCInfo::Compute(*this); |
| #if MXNET_USE_CUDA |
| Engine::Get()->PushAsync( |
| [this, stream](RunContext ctx, |
| Engine::CallbackOnStart on_start, |
| Engine::CallbackOnComplete on_complete) { |
| on_start(); |
| cudaStream_t consumer = reinterpret_cast<cudaStream_t>(stream); |
| std::unordered_map<cudaStream_t, engine::EventInfo> events_per_stream; |
| auto& sync_obj = this->var()->sync_object; |
| std::lock_guard<std::mutex> l(sync_obj.mutex); |
| auto& reader_events = sync_obj.reader_events; |
| reader_events.erase( |
| std::remove_if(reader_events.begin(), |
| reader_events.end(), |
| [&](const engine::EventInfo e_i) { return e_i.event.expired(); }), |
| reader_events.end()); |
| for (auto& writer : sync_obj.writer_event) { |
| if (writer.event.expired()) { |
| sync_obj.writer_event.clear(); |
| break; |
| } |
| if (writer.stream != consumer) { |
| bool found = false; |
| for (const auto& reader : reader_events) { |
| if (reader.stream == consumer) { |
| found = true; |
| break; |
| } |
| } |
| if (!found) { |
| auto event_stream = writer.stream; |
| if (events_per_stream.count(event_stream) > 0) { |
| if (events_per_stream[event_stream].pool_index < writer.pool_index) { |
| events_per_stream[event_stream] = writer; |
| } |
| } else { |
| events_per_stream.emplace(event_stream, writer); |
| } |
| } |
| } |
| } |
| for (auto event : events_per_stream) { |
| auto ev = event.second.event.lock(); |
| MSHADOW_CUDA_CALL(cudaStreamWaitEvent(consumer, *ev, 0)); |
| } |
| on_complete(); |
| }, |
| this->ctx(), |
| {}, |
| {}); |
| #else |
| LOG(FATAL) << "GPU is not enabled"; |
| #endif |
| } |
| |
| #if MXNET_PREDICT_ONLY == 0 |
| // register API function |
| // those with underscore will be registered at NDArray |
| MXNET_REGISTER_NDARRAY_FUN(_set_value).set_function(SetValueOp); |
| |
| MXNET_REGISTER_NDARRAY_FUN(_onehot_encode).set_function(BinaryOp<ndarray::OneHotEncode>); |
| |
| MXNET_REGISTER_NDARRAY_FUN(fill_element_0index) |
| .set_function(TernaryOp<ndarray::MatFillRowElem>) |
| .describe( |
| "Fill one element of each line(row for python, column for R/Julia)" |
| " in lhs according to index indicated by rhs and values indicated by mhs." |
| " This function assume rhs uses 0-based index."); |
| |
| void CopyFromToSimple(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<NDArray>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<NDArray>& outputs) { |
| CopyFromTo(inputs[0], outputs[0], 0, true); |
| } |
| |
| bool CopyToType(const nnvm::NodeAttrs& attrs, |
| std::vector<int>* in_attrs, |
| std::vector<int>* out_attrs) { |
| CHECK_EQ(in_attrs->size(), 1U); |
| CHECK_EQ(out_attrs->size(), 1U); |
| int in_type = in_attrs->at(0); |
| if (out_attrs->at(0) == -1) { |
| TYPE_ASSIGN_CHECK(*out_attrs, 0, in_type); |
| } |
| return out_attrs->at(0) != -1; |
| } |
| |
// The copy function is special in that we need to remove
// kAcceptEmptyMutateTarget from it.
| NNVM_REGISTER_OP(_copyto) |
| .add_alias("_npi_copyto") |
| .set_num_inputs(1) |
| .set_num_outputs(1) |
| .set_attr<mxnet::FInferShape>("FInferShape", op::ElemwiseShape<1, 1>) |
| .set_attr<nnvm::FInferType>("FInferType", CopyToType) |
| .set_attr<FInferStorageType>("FInferStorageType", |
| [](const NodeAttrs& attrs, |
| const int dev_mask, |
| DispatchMode* dispatch_mode, |
| std::vector<int>* in_attrs, |
| std::vector<int>* out_attrs) { |
| op::dispatch_mode_assign(dispatch_mode, |
| DispatchMode::kFComputeEx); |
| if (op::storage_type_is_none((*out_attrs)[0])) { |
| (*out_attrs)[0] = (*in_attrs)[0]; |
| } |
| return true; |
| }) |
| .set_attr<FExecType>("FExecType", |
| [](const NodeAttrs& attrs) { return ExecType::kCrossDeviceCopy; }) |
| .set_attr<nnvm::FGradient>("FGradient", op::ElemwiseGradUseNone{"_copyto"}) |
| .set_attr<bool>("TIsBackward", true) |
| .set_attr<FComputeEx>("FComputeEx<cpu>", CopyFromToSimple) |
| .set_attr<FComputeEx>("FComputeEx<gpu>", CopyFromToSimple) |
| .add_argument("data", "NDArray", "input data"); |
| |
| void Imdecode(NDArray* ret, |
| NDArray mean, |
| size_t index, |
| size_t x0, |
| size_t y0, |
| size_t x1, |
| size_t y1, |
| size_t n_channels, |
| size_t size, |
| char* str_img) { |
| #if MXNET_USE_OPENCV |
| cv::Mat buf(1, size, CV_8U, str_img); |
| cv::Mat res = cv::imdecode(buf, n_channels == 1 ? 0 : -1); |
| CHECK(res.data != nullptr) << "OpenCV Failed to decode image"; |
| CHECK_LE(n_channels, static_cast<size_t>(res.channels())); |
| if (y1 - y0 == 0) { |
| x0 = 0; |
| x1 = res.cols; |
| y0 = 0; |
| y1 = res.rows; |
| } |
| CHECK(x1 <= static_cast<size_t>(res.cols) && y1 <= static_cast<size_t>(res.rows)); |
| |
| if (ret->is_none()) { |
| *ret = NDArray(mshadow::Shape3(n_channels, y1 - y0, x1 - x0), |
| Context::CPU(), |
| false, |
| mean.is_none() ? mshadow::default_type_flag : mean.dtype()); |
| } |
| NDArray buff; |
| if (ret->shape().ndim() == 3) { |
| buff = ret->Reshape(mshadow::Shape4(1, ret->shape()[0], ret->shape()[1], ret->shape()[2])); |
| } else { |
| CHECK_EQ(ret->shape().ndim(), 4U); |
| buff = ret->Slice(index, index + 1); |
| } |
| CHECK_EQ(buff.ctx().dev_mask(), Context::kCPU); |
| CHECK_EQ(n_channels, buff.shape()[1]); |
| CHECK_EQ(y1 - y0, buff.shape()[2]); |
| CHECK_EQ(x1 - x0, buff.shape()[3]); |
| buff.WaitToWrite(); |
| if (mean.is_none()) { |
| MSHADOW_TYPE_SWITCH(buff.dtype(), DType, { |
| mshadow::Tensor<cpu, 4, DType> tensor = buff.data().get<cpu, 4, DType>(); |
| for (size_t i = 0; i < y1 - y0; i++) { |
| uchar* im_data = res.ptr<uchar>(y0 + i) + res.channels() * x0; |
| for (size_t j = 0; j < x1 - x0; j++) { |
| for (size_t k = 0; k < n_channels; k++) { |
| tensor[0][k][i][j] = DType(im_data[k]); // NOLINT(*) |
| } |
| im_data += res.channels(); |
| } |
| } |
| }) |
| } else { |
| CHECK_EQ(mean.dtype(), buff.dtype()); |
| CHECK_EQ(mean.ctx().dev_mask(), Context::kCPU); |
| CHECK_EQ(mean.shape()[0], buff.shape()[1]); |
| CHECK_EQ(mean.shape()[1], buff.shape()[2]); |
| CHECK_EQ(mean.shape()[2], buff.shape()[3]); |
| mean.WaitToRead(); |
| MSHADOW_TYPE_SWITCH(buff.dtype(), DType, { |
| mshadow::Tensor<cpu, 4, DType> tensor = buff.data().get<cpu, 4, DType>(); |
| mshadow::Tensor<cpu, 3, DType> tmean = mean.data().get<cpu, 3, DType>(); |
| for (size_t i = 0; i < y1 - y0; i++) { |
| uchar* im_data = res.ptr<uchar>(y0 + i) + res.channels() * x0; |
| for (size_t j = 0; j < x1 - x0; j++) { |
| for (size_t k = 0; k < n_channels; k++) { |
| tensor[0][k][i][j] = DType(im_data[k]) - tmean[k][i][j]; // NOLINT(*) |
| } |
| im_data += res.channels(); |
| } |
| } |
| }) |
| } |
| #else |
| LOG(FATAL) << "Compile with OpenCV for image decoding."; |
| #endif // MXNET_USE_OPENCV |
| } |
| |
| MXNET_REGISTER_NDARRAY_FUN(_imdecode) |
| .set_type_mask(kAcceptEmptyMutateTarget | kNDArrayArgBeforeScalar) |
| .set_body([](NDArray** u, |
| real_t* s, |
| NDArray** out, |
| int num_params, |
| char** param_keys, |
| char** param_vals) { |
| CHECK_EQ(num_params, 1); |
| Imdecode(out[0], |
| *u[0], |
| static_cast<size_t>(s[0]), |
| static_cast<size_t>(s[1]), |
| static_cast<size_t>(s[2]), |
| static_cast<size_t>(s[3]), |
| static_cast<size_t>(s[4]), |
| static_cast<size_t>(s[5]), |
| static_cast<size_t>(s[6]), |
| param_vals[0]); |
| }) |
| .set_num_use_vars(1) |
| .set_num_scalars(7) |
| .set_num_mutate_vars(1) |
| .describe("Decode an image, clip to (x0, y0, x1, y1), subtract mean, and write to buffer") |
| .add_argument("mean", "NDArray-or-Symbol", "image mean") |
| .add_argument("index", "int", "buffer position for output") |
| .add_argument("x0", "int", "x0") |
| .add_argument("y0", "int", "y0") |
| .add_argument("x1", "int", "x1") |
| .add_argument("y1", "int", "y1") |
| .add_argument("c", "int", "channel") |
| .add_argument("size", "int", "length of str_img"); |
| #endif |
| } // namespace mxnet |