/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file test_legacy_op.h
* \brief operator unit test utility functions
* \author Chris Olivier
*
* These classes offer a framework for developing, testing, and debugging operators
* in C++. They work in both CPU and GPU modes and also provide a timing
* infrastructure for measuring individual operator performance.
*
* Operator data can be validated against general logic, against stored scalar
* values (which can be generated from an existing operator via
* LegacyOperatorExecutor::dumpC()), and against each other (i.e. checking that
* the GPU, CPU, MKL, and CUDNN variants of an operator produce the same output
* given the same input).
*
* test_util.h: General testing utility functionality
* test_perf.h: Performance-related classes
* test_op.h: Operator-specific testing classes
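*
* A minimal usage sketch (assuming the OperatorRunner interface declared in
* test_op_runner.h; "MyOpProp" and the shape/kwargs values are hypothetical
* placeholders for the operator under test):
*
* \code
* mxnet::test::op::LegacyOpRunner<MyOpProp, float, float> runner;
* runner.RunBidirectional(false,                          // isGPU
*                         {mxnet::TShape({1, 1, 2, 1})},  // input shape(s)
*                         {{"some_param", "some_value"}}, // operator kwargs
*                         1);                             // pass count
* \endcode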
*/
#ifndef TEST_LEGACY_OP_H_
#define TEST_LEGACY_OP_H_
#include <list>
#include <string>
#include <algorithm>
#include <map>
#include <vector>
#include "profiler/vtune.h"
#include "../../../include/mxnet/operator.h"
#include "./test_op.h"
#include "./test_op_runner.h"
namespace mxnet {
namespace test {
namespace op {
/*!
* \brief Manage test blobs and context, and universal test logic.
* Creates an operator from its "Prop" class and sets up the operator
* and resources for both forward and backward passes.
* \tparam DType primary data type of the operator's blobs
* \tparam AccReal real type used for accumulation and for secondary blobs
*/
template <typename DType, typename AccReal>
class LegacyOperatorExecutor : public OperatorDataInitializer<DType>,
public OperatorExecutorTiming {
public:
typedef DType DataType;
typedef AccReal AccRealType;
/*! \brief Manage test blobs and context */
LegacyOperatorExecutor(const bool isGPU, const mxnet::ShapeVector& topShapes)
#if !MXNET_USE_CUDA
: isGPU_(false)
#else
: isGPU_(isGPU)
#endif
, initializeForward_(0)   // unit testing may call inits in any order based
, initializeBackward_(0)  // upon its use case (i.e. it may not run the forward pass first)
, initializeCallback_(0) {
opContext_.is_train = true;
opContext_.run_ctx.stream = nullptr;
CHECK(!topShapes.empty());
shape_input_vec_ = topShapes;
}
inline mxnet::Context getContext() {
return isGPU_ ? mxnet::Context::GPU(0) : mxnet::Context{};
}
/*! \brief Initialize forward blob data values */
virtual void resetForward() {}
/*! \brief Initialize backward blob data values */
virtual void resetBackward() {}
/*! \brief Create the operator and allocate the input, auxiliary, and output blobs */
template <typename OperatorPropertyType>
bool initForward(const OperatorPropertyType& opProp, std::vector<int>* in_type) {
if (!initializeForward_++) {
shape_input_vec_.resize(opProp.ListArguments().size());
op_.reset(opProp.CreateOperatorEx(getContext(), &shape_input_vec_, in_type));
if (op_) {
const size_t output_count = opProp.ListOutputs().size();
const size_t aux_count = opProp.ListAuxiliaryStates().size();
// Figure out what sort of blobs we need to allocate
mxnet::ShapeVector out_shape, aux_shape;
out_shape.resize(output_count);
aux_shape.resize(aux_count);
opProp.InferShape(&shape_input_vec_, &out_shape, &aux_shape);
std::vector<int> out_type(output_count, -1), aux_type(aux_count, -1);
opProp.InferType(in_type, &out_type, &aux_type);
// Allocate input blobs
for (size_t x = 0, n = shape_input_vec_.size(); x < n; ++x) {
int type;
if (x < in_type->size()) {
type = (*in_type)[x];
} else {
type = x ? mshadow::DataType<AccReal>::kFlag : mshadow::DataType<DType>::kFlag;
}
allocateBlob(&c_.blob_input_vec_, shape_input_vec_[x], false, type);
}
// Allocate aux blobs (scratch, hidden, etc.)
for (size_t x = 0, n = aux_shape.size(); x < n; ++x) {
CHECK(x < aux_type.size());
allocateBlob(&c_.blob_aux_states_, aux_shape[x], false, aux_type[x]);
}
// Allocate output blobs
for (size_t x = 0, n = out_shape.size(); x < n; ++x) {
CHECK(x < out_type.size());
allocateBlob(&c_.blob_output_vec_, out_shape[x], false, out_type[x]);
}
// Request any temporary workspace resources needed by the forward pass
mxnet::ShapeVector inputShapes;
for (size_t x = 0, n = shape_input_vec_.size(); x < n; ++x) {
inputShapes.emplace_back(shape_input_vec_[x]);
}
allocateResources(opProp.ForwardResource(inputShapes));
resetForward();
return true;
}
return false;
} else {
return true;
}
}
/*! \brief Allocate the input- and output-gradient blobs for the backward pass */
template <typename OperatorPropertyType>
bool initBackward(const OperatorPropertyType& opProp, std::vector<int>* in_type) {
initForward(opProp, in_type);
if (!initializeBackward_++) {
for (size_t x = 0, n = static_cast<size_t>(opProp.NumVisibleOutputs()); x < n; ++x) {
CHECK_LT(x, c_.blob_output_vec_.size());
allocateBlob(&c_.blob_out_grad_,
c_.blob_output_vec_[x].shape_,
false,
c_.blob_output_vec_[x].type_flag_);
}
for (size_t x = 0, n = c_.blob_input_vec_.size(); x < n; ++x) {
allocateBlob(&c_.blob_in_grad_,
c_.blob_input_vec_[x].shape_,
false,
c_.blob_input_vec_[x].type_flag_);
}
// Request any temporary workspace resources needed by the backward pass
mxnet::ShapeVector ishapes;
allocateResources(opProp.BackwardResource(ishapes));
resetBackward();
return true;
} else {
return true;
}
}
/*! \brief Run operator forward */
void forward(const size_t count = 1) {
const std::vector<OpReqType> req(c_.blob_output_vec_.size(), kWriteTo);
// Possibly move data to/from CPU and GPU (outside of timing scope)
MXNET_CUDA_ONLY(
std::unique_ptr<GPUOpData> gpuData(isGPU_ ? new GPUOpData(c_, &opContext_) : nullptr));
perf::TimingItem timeF(&OperatorExecutorTiming::GetTiming(), Forward, "Forward", count);
if (!isGPU_) {
mxnet::profiler::vtune::VTuneResume profile; // VTune sample only this scope
for (size_t x = 0; x < count; ++x) {
op()->Forward(
opContext_, c_.blob_input_vec_, req, c_.blob_output_vec_, c_.blob_aux_states_);
}
} else {
for (size_t x = 0; x < count; ++x) {
MXNET_CUDA_ONLY(op()->Forward(opContext_,
gpuData->blob_input_vec_,
req,
gpuData->blob_output_vec_,
gpuData->blob_aux_states_));
}
}
}
/*! \brief Run operator backwards */
void backward(const size_t count = 1) {
const std::vector<OpReqType> req(c_.blob_in_grad_.size(), kWriteTo);
// Possibly move data to/from CPU and GPU (outside of timing scope)
MXNET_CUDA_ONLY(
std::unique_ptr<GPUOpData> gpuData(isGPU_ ? new GPUOpData(c_, &opContext_) : nullptr));
perf::TimingItem timeB(&OperatorExecutorTiming::GetTiming(), Backward, "Backward", count);
if (!isGPU_) {
mxnet::profiler::vtune::VTuneResume profile; // VTune sample only this scope
for (size_t x = 0; x < count; ++x) {
op()->Backward(opContext_,
c_.blob_out_grad_,
c_.blob_input_vec_,
c_.blob_output_vec_,
req,
c_.blob_in_grad_,
c_.blob_aux_states_);
}
} else {
for (size_t x = 0; x < count; ++x) {
MXNET_CUDA_ONLY(op()->Backward(opContext_,
gpuData->blob_out_grad_,
gpuData->blob_input_vec_,
gpuData->blob_output_vec_,
req,
gpuData->blob_in_grad_,
gpuData->blob_aux_states_));
}
}
}
/*!
* \brief Test if operator has a backward pass
* \return true if this operator has a backward pass
*/
MSHADOW_CINLINE bool HasBackward() const {
return true;
}
/*! \brief Getter functions for the operator */
inline Operator* op() {
return op_.get();
}
inline const Operator* op() const {
return op_.get();
}
enum BlobVectorType { kInput, kOutput, kAux, kInGrad, kOutGrad, kBlobVectorTypeCount };
#define CASE_STR(__v$) \
case (__v$): \
return #__v$
/*! \brief Convert BlobVectorType enum into a string */
static inline const char* bvt2String(const BlobVectorType bvt) {
switch (bvt) {
CASE_STR(kInput);
CASE_STR(kOutput);
CASE_STR(kAux);
CASE_STR(kInGrad);
CASE_STR(kOutGrad);
default:
CHECK(false);
return "";
}
}
#undef CASE_STR
/*! \brief Return a particular blob in a test data set */
inline const std::vector<TBlob>& getBlobVect(const BlobVectorType bvt) const {
switch (bvt) {
case kInput:
return c_.blob_input_vec_;
case kOutput:
return c_.blob_output_vec_;
case kAux:
return c_.blob_aux_states_;
case kInGrad:
return c_.blob_in_grad_;
case kOutGrad:
return c_.blob_out_grad_;
default:
CHECK(false);
return c_.blob_input_vec_;
}
}
/*! \brief Dump an operator's data set as compilable C++ data code for runtime validation.
* When writing an operator test, you can generate a "known good" operator data state as C++
* code with this function, and then use load() to restore those blob states into this
* class (LegacyOperatorExecutor).
* After that, you can compare it against the "actual" state of the operator that you
* are testing. See the sketch following this function.
*/
template <typename Stream>
inline void dumpC(Stream* _os, const std::string& label) {
Stream& os = *_os;
os << "static const std::vector< std::vector< std::vector<float> > > ___" << label
<< "_data_shape_";
const mxnet::TShape& shape = shape_input_vec_[0];
for (size_t i = 0, n = shape.ndim(); i < n; ++i) {
os << shape[i] << "_";
}
os << "__ =" << std::endl << "{" << std::endl;
for (size_t x = 0; x < kBlobVectorTypeCount; ++x) {
os << " { /* " << bvt2String(BlobVectorType(x)) << " */" << std::endl;
const std::vector<TBlob>& blobVect = getBlobVect(BlobVectorType(x));
for (size_t i = 0, n = blobVect.size(); i < n; ++i) {
os << " { ";
test::dump<DType>(&os, blobVect[i]);
os << " }";
if (i < n - 1) {
os << ",";
}
os << std::endl;
}
os << " }";
if (x < kBlobVectorTypeCount - 1) {
os << ",";
}
os << std::endl;
}
os << "};" << std::endl;
}
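/*
* A minimal sketch of the dump/load round trip ("executor" and the "MyOp" label
* are illustrative): run a known-good operator, emit its blob state with
* dumpC(), paste the generated initializer into the test source, and later
* restore that state with load() to compare against the operator under test:
*
*   executor.dumpC(&std::cout, "MyOp");  // emits ___MyOp_data_shape_<dims>__
*   // ... paste the emitted 'static const' initializer into the test ...
*   executor.load(___MyOp_data_shape_1_1_2_1__);  // restore all blob vectors
*/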
static inline void copy(const TBlob& blob,
const DType array[],
const size_t start,
const size_t end) {
const size_t blobSize = blob.Size();
DType* p = blob.dptr<DType>();
for (size_t i = 0, n = end - start; i < n; ++i) {
CHECK_LT(i, blobSize);
p[i] = array[i + start];
}
}
/*! \brief Runtime load of the C++ data code generated by dumpC() */
void load(const std::vector<std::vector<std::vector<DType>>>& cData) {
for (size_t i = 0, ni = cData.size(); i < ni; ++i) {
for (size_t j = 0, nj = cData[i].size(); j < nj; ++j) {
const TBlob& blob = getBlobVect(BlobVectorType(i))[j];
const size_t sourceDataSize = cData[i][j].size();
CHECK_EQ(sourceDataSize, blob.Size());
const DType* sourceData = &cData[i][j][0];
copy(blob, sourceData, 0, sourceDataSize);
}
}
}
/*! \brief Runtime load of the C++ data code generated by dumpC() */
void load(const std::vector<std::vector<std::vector<DType>>>& cData, const BlobVectorType type) {
CHECK_LT(type, cData.size());
for (size_t j = 0, nj = cData[type].size(); j < nj; ++j) {
const TBlob& blob = getBlobVect(type)[j];
const size_t sourceDataSize = cData[type][j].size();
CHECK_EQ(sourceDataSize, blob.Size());
const DType* sourceData = &cData[type][j][0];
copy(blob, sourceData, 0, sourceDataSize);
}
}
/*! \brief Runtime load of the C++ data code generated by dumpC() */
void load(const std::vector<std::vector<std::vector<DType>>>& cData,
const BlobVectorType type,
const int idx) {
CHECK_LT(type, cData.size());
CHECK_LT(idx, cData[type].size());
const TBlob& blob = getBlobVect(type)[idx];
const size_t sourceDataSize = cData[type][idx].size();
CHECK_EQ(sourceDataSize, blob.Size());
const DType* sourceData = &cData[type][idx][0];
copy(blob, sourceData, 0, sourceDataSize);
}
// void FillRandom() {
// for (size_t j = 0, jn = this->c_.all_blob_vects_.size(); j < jn; ++j) {
// std::vector<TBlob> *data_vect = this->c_.all_blob_vects_[j];
// if (data_vect) {
// for (size_t i = 0, n = data_vect->size(); i < n; ++i) {
// OperatorDataInitializer<DType>::FillRandom((*data_vect)[i]);
// }
// }
// }
// }
std::vector<TBlob>& inputs() {
return c_.blob_input_vec_;
}
const std::vector<TBlob>& inputs() const {
return c_.blob_input_vec_;
}
std::vector<TBlob>& outputs() {
return c_.blob_output_vec_;
}
const std::vector<TBlob>& outputs() const {
return c_.blob_output_vec_;
}
std::vector<TBlob>& bwd_inputs() {
return c_.blob_out_grad_;
}
std::vector<TBlob>& bwd_outputs() {
return c_.blob_in_grad_;
}
/*! \brief Input and output blobs */
OpContext opContext_;
mxnet::ShapeVector shape_input_vec_;
struct OpData {
std::vector<TBlob> blob_input_vec_;
std::vector<TBlob> blob_output_vec_;
std::vector<TBlob> blob_aux_states_;
std::vector<TBlob> blob_in_grad_;
std::vector<TBlob> blob_out_grad_; // Loss gradient w.r.t. the outputs, flowing back from the layers above
std::vector<std::vector<TBlob>*> all_blob_vects_;
inline OpData() {
all_blob_vects_.emplace_back(&blob_input_vec_);
all_blob_vects_.emplace_back(&blob_output_vec_);
all_blob_vects_.emplace_back(&blob_aux_states_);
all_blob_vects_.emplace_back(&blob_in_grad_);
all_blob_vects_.emplace_back(&blob_out_grad_); // loss gradient w.r.t. the outputs
}
virtual ~OpData() {}
};
#if MXNET_USE_CUDA
class GPUOpData : public OpData {
GPUOpData() = delete;
GPUOpData(const GPUOpData& o) = delete;
public:
inline GPUOpData(const OpData& cpuData, OpContext* opContext)
: cpuData_(cpuData), allocGPUStream_(opContext) {
// Copy CPU->GPU
CHECK_EQ(gpuBlobs_.size(), 0U);
CHECK_EQ(cpuData_.all_blob_vects_.size(), this->all_blob_vects_.size());
for (size_t bvt = 0, nbvt = cpuData_.all_blob_vects_.size(); bvt < nbvt; ++bvt) {
std::vector<TBlob>& bv_src = *cpuData_.all_blob_vects_[bvt];
std::vector<TBlob>& bvt_dest = *this->all_blob_vects_[bvt];
for (size_t i = 0, n = bv_src.size(); i < n; ++i) {
const TBlob& srcBlob = bv_src[i];
TBlob* destBlob =
allocateBlob(&gpuBlobs_, &bvt_dest, srcBlob.shape_, true, srcBlob.type_flag_);
Context cpu_ctx, gpu_ctx;
cpu_ctx.dev_type = Context::kCPU;
gpu_ctx.dev_type = Context::kGPU;
cpu_ctx.dev_id = gpu_ctx.dev_id = 0;
mxnet::ndarray::Copy<cpu, gpu>(
srcBlob, destBlob, cpu_ctx, gpu_ctx, allocGPUStream_.opContext_.run_ctx);
}
}
cudaDeviceSynchronize();
}
inline ~GPUOpData() {
// Copy GPU->CPU
cudaDeviceSynchronize();
for (size_t bvt = 0, nbvt = this->all_blob_vects_.size(); bvt < nbvt; ++bvt) {
std::vector<TBlob>& bv_src = *this->all_blob_vects_[bvt];
std::vector<TBlob>& bvt_dest = *cpuData_.all_blob_vects_[bvt];
for (size_t i = 0, n = bv_src.size(); i < n; ++i) {
const TBlob& srcBlob = bv_src[i];
TBlob* destBlob = &bvt_dest[i];
Context cpu_ctx, gpu_ctx;
cpu_ctx.dev_type = Context::kCPU;
gpu_ctx.dev_type = Context::kGPU;
cpu_ctx.dev_id = gpu_ctx.dev_id = 0;
mxnet::ndarray::Copy<gpu, cpu>(
srcBlob, destBlob, gpu_ctx, cpu_ctx, allocGPUStream_.opContext_.run_ctx);
}
}
gpuBlobs_.clear(); // Force deallocation of the GPU blob data
cudaDeviceSynchronize();
}
private:
/*! \brief Reference to the src/dest CPU data */
const OpData& cpuData_;
/*! \brief The GPU-allocated blobs */
std::list<std::unique_ptr<test::StandaloneBlob>> gpuBlobs_;
/*! \brief Scoped GPU stream */
GPUStreamScope allocGPUStream_;
};
#endif // MXNET_USE_CUDA
protected:
OpData c_;
/*! \brief Allocate the operator's resource requests */
void allocateResources(const std::vector<ResourceRequest>& reqs) {
std::map<Context, Resource> cached_temp;
Context ctx;
ctx.dev_type = isGPU_ ? Context::kGPU : Context::kCPU;
ctx.dev_id = 0;
for (const ResourceRequest& req : reqs) {
switch (req.type) {
case ResourceRequest::kTempSpace: {
if (cached_temp.count(ctx) != 0) {
opContext_.requested.emplace_back(cached_temp.at(ctx));
} else {
Resource r = ResourceManager::Get()->Request(ctx, req);
opContext_.requested.emplace_back(r);
cached_temp[ctx] = r;
}
break;
}
case ResourceRequest::kRandom: {
opContext_.requested.emplace_back(ResourceManager::Get()->Request(ctx, req));
break;
}
case ResourceRequest::kParallelRandom: {
Resource rm = ResourceManager::Get()->Request(ctx, req);
if (ctx.dev_mask() == Context::kCPU) {
common::random::RandGenerator<cpu, DType>::AllocState(
rm.get_parallel_random<cpu, DType>());
}
opContext_.requested.emplace_back(rm);
break;
}
#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7
case ResourceRequest::kCuDNNDropoutDesc: {
opContext_.requested.push_back(ResourceManager::Get()->Request(ctx, req));
break;
}
#endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7
default:
LOG(FATAL) << "resource type " << req.type << " is not yet supported";
}
}
}
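/*
* For example, an operator property that declares a temp-space requirement
* (a sketch; the override shown is the standard OperatorProperty hook):
*
*   std::vector<ResourceRequest> ForwardResource(
*       const mxnet::ShapeVector& in_shape) const override {
*     return {ResourceRequest(ResourceRequest::kTempSpace)};
*   }
*
* will have a matching temp-space Resource appended to opContext_.requested
* above (and cached per context, so repeated requests share one resource).
*/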
/*! \brief Locally allocate a managed TBlob and insert it into the supplied vector */
static TBlob* allocateBlob(std::list<std::unique_ptr<test::StandaloneBlob>>* standalone_blobs,
std::vector<TBlob>* dest,
const mxnet::TShape& shape,
const bool isGPU,
const int dtype) {
test::StandaloneBlob* blob = new test::StandaloneBlob(shape, isGPU, dtype);
CHECK_NE(blob, static_cast<TBlob*>(nullptr));
standalone_blobs->emplace_back(std::unique_ptr<test::StandaloneBlob>(blob));
(*dest).emplace_back(*blob);
return blob;
}
/*! \brief Locally allocate a managed TBlob and insert it into the supplied vector */
inline TBlob* allocateBlob(std::vector<TBlob>* dest,
const mxnet::TShape& shape,
const bool isGPU,
const int dtype) {
return allocateBlob(&standalone_blobs_, dest, shape, isGPU, dtype);
}
/*! \brief Performance timing categories */
enum TimingId { Forward, Backward };
/*! \brief The operator */
std::unique_ptr<Operator> op_;
/*! \brief Is this for a GPU? */
const bool isGPU_;
/*! \brief Ensure that forward initialization happens only once */
std::atomic<int> initializeForward_;
/*! \brief Ensure that backward initialization happens only once */
std::atomic<int> initializeBackward_;
/*! \brief Ensure that the callback is initialized only once */
std::atomic<int> initializeCallback_;
/*! \brief scoped lifecycle management of allocated blobs */
std::list<std::unique_ptr<test::StandaloneBlob>> standalone_blobs_;
};
template <typename OperatorProp, typename DType, typename AccReal>
using LegacyOpRunner =
mxnet::test::OperatorRunner<OperatorProp, LegacyOperatorExecutor<DType, AccReal>>;
} // namespace op
} // namespace test
} // namespace mxnet
#endif // TEST_LEGACY_OP_H_