// src/operator/mkl/mkl_pooling-inl.h - mxnet-test - Git at Google

 /*******************************************************************************
 * Copyright 2016 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * \file mkl_pooling-inl.h
 * \brief Pooling operator implemented with the Intel MKL DNN pooling primitives.
 * \author zhenlin.luo@intel.com
 *         lingyan.guo@intel.com
 *
 *******************************************************************************/

 #ifndef MXNET_OPERATOR_MKL_MKL_POOLING_INL_H_
 #define MXNET_OPERATOR_MKL_MKL_POOLING_INL_H_
 #include <vector>
 #include <string>
 #include <utility>
 #include "../operator_common.h"
 #include "../pooling-inl.h"
 #include "./mkl_util-inl.h"

 namespace mxnet {
 namespace op {


 template<typename xpu, typename DType>
 class MKLPoolingOp : public Operator {
  public:
   static std::string getName() {
     return "MKLPoolingOp";
   }
   explicit MKLPoolingOp(PoolingParam p) {
     poolingFwd = static_cast<dnnPrimitive_t>(NULL);
     poolingBwd = static_cast<dnnPrimitive_t>(NULL);
     max_idx_data = static_cast<DType*>(NULL);
     fwd_top_data = MKLData<DType>::create();
     fwd_bottom_data = MKLData<DType>::create();
     bwd_top_diff = MKLData<DType>::create();
     bwd_bottom_diff = MKLData<DType>::create();
     this->param_ = p;
     init_mkldnn_ = false;
   }
   virtual ~MKLPoolingOp() {
     if (poolingFwd != NULL) {
       dnnDelete<DType>(poolingFwd);
       poolingFwd = NULL;
     }
     if (poolingBwd != NULL) {
       dnnDelete<DType>(poolingBwd);
       poolingBwd = NULL;
     }
     if (max_idx_data != NULL) {
       dnnReleaseBuffer<DType>(max_idx_data);
       max_idx_data = NULL;
     }
   }

  private:
   void LayerSetUp(const mshadow::Tensor<xpu, 4, DType> &data,
                   const mshadow::Tensor<xpu, 4, DType> &out) {
     channels_ = data.shape_[1];
     height_ = data.shape_[2];
     width_ = data.shape_[3];
     num_ = data.shape_[0];
     global_pooling_ = param_.global_pool;
     if (global_pooling_) {
       kernel_h_ = height_;
       kernel_w_ = width_;
     } else {
       kernel_h_ = param_.kernel[0];
       kernel_w_ = param_.kernel[1];
     }
     CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
     CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
     pad_h_ = param_.pad[0];
     pad_w_ = param_.pad[1];
     if (global_pooling_) {
       stride_h_ = stride_w_ = 1;
     } else {
       stride_h_ = param_.stride[0];
       stride_w_ = param_.stride[1];
     }
     if (global_pooling_) {
       CHECK(pad_h_ == 0 && pad_w_ == 0 && stride_h_ == 1 && stride_w_ == 1)
         << "With Global_pooling: true; only pad = 0 and stride = 1";
     }
     if (pad_h_ != 0 || pad_w_ != 0) {
       CHECK(param_.pool_type == pool_enum::kAvgPooling
           || param_.pool_type == pool_enum::kMaxPooling)
         << "Padding implemented only for average and max pooling.";
       CHECK_LT(pad_h_, kernel_h_);
       CHECK_LT(pad_w_, kernel_w_);
     }
     pooled_height_ = out.shape_[2];
     pooled_width_ = out.shape_[3];

     size_t dim = 4;
     size_t src_sizes[4], src_strides[4];
     size_t dst_sizes[4], dst_strides[4];
     src_sizes[0] = width_;
     src_sizes[1] = height_;
     src_sizes[2] = channels_;
     src_sizes[3] = num_;
     src_strides[0] = 1;
     src_strides[1] = src_sizes[0];
     src_strides[2] = src_sizes[0] * src_sizes[1];
     src_strides[3] = src_sizes[0] * src_sizes[1] * src_sizes[2];
     dst_sizes[0] = pooled_width_;
     dst_sizes[1] = pooled_height_;
     dst_sizes[2] = src_sizes[2];
     dst_sizes[3] = src_sizes[3];
     dst_strides[0] = 1;
     dst_strides[1] = dst_sizes[0];
     dst_strides[2] = dst_sizes[0] * dst_sizes[1];
     dst_strides[3] = dst_sizes[0] * dst_sizes[1] * dst_sizes[2];
     src_offset[0] = -pad_w_;
     src_offset[1] = -pad_h_;
     src_offset[2] = -pad_w_;
     src_offset[3] = -pad_h_;
     kernel_stride[0] = stride_w_;
     kernel_stride[1] = stride_h_;
     kernel_size[0] = kernel_w_;
     kernel_size[1] = kernel_h_;

     // Names are for debugging only
     fwd_bottom_data->name = "fwd_bottom_data   @ " + getName();
     fwd_top_data->name = "fwd_top_data      @ " + getName();
     bwd_top_diff->name = "bwd_top_diff      @ " + getName();
     bwd_bottom_diff->name = "bwd_bottom_diff   @ " + getName();

     fwd_bottom_data->create_user_layout(dim, src_sizes, src_strides);
     fwd_top_data->create_user_layout(dim, dst_sizes, dst_strides);
     bwd_bottom_diff->create_user_layout(dim, src_sizes, src_strides);
     bwd_top_diff->create_user_layout(dim, dst_sizes, dst_strides);

     // Primitives will be allocated during the first fwd pass
     poolingFwd = NULL;
     poolingBwd = NULL;
     max_idx_data = NULL;
   }

  public:
   virtual void Forward(const OpContext &ctx,
                        const std::vector<TBlob> &in_data,
                        const std::vector<OpReqType> &req,
                        const std::vector<TBlob> &out_data,
                        const std::vector<TBlob> &aux_args) {
     using namespace mshadow;
     using namespace mshadow::expr;
     CHECK_EQ(in_data.size(), 1);
     CHECK_EQ(out_data.size(), 1);
     Stream<xpu> *s = ctx.get_stream<xpu>();
     if (param_.kernel.ndim() >= 3) {
       LOG(FATAL) << "Not implmented";
     }
     Tensor<xpu, 4, DType> data = mkl_experimental_direct_get<xpu, 4, DType>(
       in_data[pool_enum::kData], s);
     Tensor<xpu, 4, DType> out = mkl_experimental_direct_get<xpu, 4, DType>(
       out_data[pool_enum::kOut], s);
     if (!init_mkldnn_) {
       LayerSetUp(data, out);
       init_mkldnn_ = true;
     }
     auto first_pass = false;
     if (poolingFwd == NULL) first_pass = true;

     dnnAlgorithm_t algorithm = dnnAlgorithmPoolingMax;

     switch (param_.pool_type) {
     case pool_enum::kMaxPooling:
       algorithm = dnnAlgorithmPoolingMax;
       break;
     case pool_enum::kAvgPooling:
       algorithm = (param_.pooling_convention == pool_enum::kValid) ?
           dnnAlgorithmPoolingAvgIncludePadding : dnnAlgorithmPoolingAvg;

       break;
     default:
       LOG(FATAL) << "Unknown pooling method.";
     }

     dnnError_t status;
     void* pooling_res[dnnResourceNumber];

     void* bottom_data = NULL;
 #if MKL_EXPERIMENTAL == 1
     bottom_data =
           reinterpret_cast<void *>(mkl_prv_data<DType>(in_data[pool_enum::kData]));
 #endif
     dnnBorder_t border_type = dnnBorderZerosAsymm;
     switch (param_.pooling_convention) {
     case pool_enum::kFull:
       border_type = dnnBorderZeros;
       break;
     case pool_enum::kValid:
       border_type = dnnBorderZerosAsymm;
       break;
     default:
       border_type = dnnBorderZerosAsymm;
       break;
     }
     if (NULL == bottom_data) {
       bottom_data = data.dptr_;
       if (NULL == poolingFwd) {
         status = dnnPoolingCreateForward<DType>(&poolingFwd, NULL,
                                                 algorithm, fwd_bottom_data->layout_usr,
                                                 kernel_size, kernel_stride,
                                                 src_offset, border_type);
       CHECK_EQ(status, E_SUCCESS);
       // Now create poolingBwd
       status = dnnPoolingCreateBackward<DType>(&poolingBwd, NULL,
                                                algorithm, fwd_bottom_data->layout_usr,
                                                kernel_size, kernel_stride,
                                                src_offset, border_type);
       CHECK_EQ(status, E_SUCCESS);
       }
     }
 #if MKL_EXPERIMENTAL == 1
     if (NULL != bottom_data) {
        if (NULL == poolingFwd) {
           std::shared_ptr<MKLMemHolder> bottom_data_mem = in_data[pool_enum::kData].Mkl_mem_;
           std::shared_ptr<PrvMemDescr> bottom_prv_descriptor =
             bottom_data_mem->get_prv_descriptor();
           CHECK_EQ(bottom_prv_descriptor->get_descr_type(),
                    PrvMemDescr::PRV_DESCR_MKL2017);
           std::shared_ptr<MKLData<DType> > mem_descr
             = std::static_pointer_cast<MKLData<DType>>(bottom_prv_descriptor);
           CHECK(mem_descr != nullptr);
           fwd_bottom_data = mem_descr;

           status = dnnPoolingCreateForward<DType>(&poolingFwd, NULL,
                                                   algorithm, fwd_bottom_data->layout_int,
                                                   kernel_size, kernel_stride,
                                                   src_offset, border_type);
           CHECK_EQ(status, E_SUCCESS);
           fwd_top_data->create_internal_layout(poolingFwd, dnnResourceDst);

           // Now create poolingBwd
           status = dnnPoolingCreateBackward<DType>(&poolingBwd, NULL,
                                                    algorithm, fwd_bottom_data->layout_int,
                                                    kernel_size, kernel_stride,
                                                    src_offset, border_type);
           CHECK_EQ(status, E_SUCCESS);
           bwd_top_diff->create_internal_layout(poolingFwd, dnnResourceDst);
           bwd_bottom_diff->create_internal_layout(poolingFwd, dnnResourceSrc);
         }
     }
 #endif

     if (first_pass) {
       dnnLayout_t max_idx_datal = NULL;
       status = dnnLayoutCreateFromPrimitive<DType>(
           &max_idx_datal, poolingFwd, dnnResourceWorkspace);
       CHECK_EQ(status, E_SUCCESS);
       status = dnnAllocateBuffer<DType>(reinterpret_cast<void**>(&max_idx_data), max_idx_datal);
       CHECK_EQ(status, E_SUCCESS);
 #if MKL_EXPERIMENTAL == 0
       fwd_bottom_data->create_internal_layout(poolingFwd, dnnResourceSrc);
       fwd_top_data->create_internal_layout(poolingFwd, dnnResourceDst);
       bwd_top_diff->create_internal_layout(poolingBwd, dnnResourceDiffDst);
       bwd_bottom_diff->create_internal_layout(poolingBwd, dnnResourceDiffSrc);
 #endif
       dnnLayoutDelete<DType>(max_idx_datal);
       first_pass = false;
     }
     pooling_res[dnnResourceSrc] = bottom_data;
     pooling_res[dnnResourceWorkspace] = max_idx_data;

     std::shared_ptr<MKLMemHolder> top_mem = NULL;
 #if MKL_EXPERIMENTAL == 1
     top_mem = out_data[pool_enum::kOut].Mkl_mem_;
 #endif
     pooling_res[dnnResourceDst] = fwd_top_data->get_output_ptr(
       out.dptr_, fwd_top_data, top_mem);
     status = dnnExecute<DType>(poolingFwd, pooling_res);
     CHECK_EQ(status, E_SUCCESS);
 #if MKL_EXPERIMENTAL == 0
     if (fwd_top_data->conversion_needed()) {
       fwd_top_data->convert_from_prv(out.dptr_);
     }
 #endif
   }
   virtual void Backward(const OpContext &ctx,
                         const std::vector<TBlob> &out_grad,
                         const std::vector<TBlob> &in_data,
                         const std::vector<TBlob> &out_data,
                         const std::vector<OpReqType> &req,
                         const std::vector<TBlob> &in_grad,
                         const std::vector<TBlob> &aux_args) {
     if (!req[0]) {
       return;
     }
     using namespace mshadow;
     using namespace mshadow::expr;
     CHECK_EQ(out_grad.size(), 1);
     CHECK_EQ(in_data.size(), 1);
     CHECK_EQ(out_data.size(), 1);
     CHECK_EQ(req.size(), 1);
     CHECK_EQ(in_grad.size(), 1);
     if (param_.kernel.ndim() >= 3) {
       LOG(FATAL) << "Not implmented";
     }
     Stream<xpu> *s = ctx.get_stream<xpu>();
     Tensor<xpu, 4, DType> grad = mkl_experimental_direct_get<xpu, 4, DType>(
       out_grad[pool_enum::kOut], s);
     Tensor<xpu, 4, DType> input_grad = mkl_experimental_direct_get<xpu, 4, DType>(
       in_grad[pool_enum::kData], s);
     dnnError_t e;
     void* pooling_res[dnnResourceNumber];
     pooling_res[dnnResourceWorkspace] = reinterpret_cast<void *>(max_idx_data);

     pooling_res[dnnResourceDiffDst] =
       bwd_top_diff->get_converted_prv(grad.dptr_, true, out_grad[pool_enum::kOut]);

     std::shared_ptr<MKLMemHolder> bottom_diff_mem = NULL;
 #if MKL_EXPERIMENTAL == 1
     bottom_diff_mem = in_grad[pool_enum::kData].Mkl_mem_;
 #endif
     pooling_res[dnnResourceDiffSrc] = bwd_bottom_diff->get_output_ptr(
       input_grad.dptr_, bwd_bottom_diff, bottom_diff_mem);
     e = dnnExecute<DType>(poolingBwd, pooling_res);
     CHECK_EQ(e, E_SUCCESS);
 #if MKL_EXPERIMENTAL == 0
     if (bwd_bottom_diff->conversion_needed()) {
       bwd_bottom_diff->convert_from_prv(input_grad.dptr_);
     }
 #endif
   }

  private:
   PoolingParam param_;
   int kernel_h_, kernel_w_;
   int stride_h_, stride_w_;
   int pad_h_, pad_w_;
   int channels_, num_;
   int height_, width_;
   int pooled_height_, pooled_width_;
   bool global_pooling_;

  private:
   size_t kernel_size[2],
          kernel_stride[4];
   int src_offset[4];  // 2*(dimension-2)
   dnnPrimitive_t poolingFwd, poolingBwd;
   DType *max_idx_data;

   std::shared_ptr<MKLData<DType> > fwd_top_data;
   std::shared_ptr<MKLData<DType> > fwd_bottom_data;
   std::shared_ptr<MKLData<DType> > bwd_top_diff;
   std::shared_ptr<MKLData<DType> > bwd_bottom_diff;
   bool init_mkldnn_;
 };  // class MKLPoolingOp
 }   // namespace op
 }   // namespace mxnet

 #endif  // MXNET_OPERATOR_MKL_MKL_POOLING_INL_H_
	/*******************************************************************************
	* Copyright 2016 Intel Corporation
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*
	* \file mkl_pooling-inl.h
	* \brief Pooling operator implemented with the Intel MKL DNN pooling primitives.
	* \author zhenlin.luo@intel.com
	* lingyan.guo@intel.com
	*
	*******************************************************************************/

	#ifndef MXNET_OPERATOR_MKL_MKL_POOLING_INL_H_
	#define MXNET_OPERATOR_MKL_MKL_POOLING_INL_H_
	#include <vector>
	#include <string>
	#include <utility>
	#include "../operator_common.h"
	#include "../pooling-inl.h"
	#include "./mkl_util-inl.h"

	namespace mxnet {
	namespace op {


	template<typename xpu, typename DType>
	class MKLPoolingOp : public Operator {
	public:
	static std::string getName() {
	return "MKLPoolingOp";
	}
	explicit MKLPoolingOp(PoolingParam p) {
	poolingFwd = static_cast<dnnPrimitive_t>(NULL);
	poolingBwd = static_cast<dnnPrimitive_t>(NULL);
	max_idx_data = static_cast<DType*>(NULL);
	fwd_top_data = MKLData<DType>::create();
	fwd_bottom_data = MKLData<DType>::create();
	bwd_top_diff = MKLData<DType>::create();
	bwd_bottom_diff = MKLData<DType>::create();
	this->param_ = p;
	init_mkldnn_ = false;
	}
	virtual ~MKLPoolingOp() {
	if (poolingFwd != NULL) {
	dnnDelete<DType>(poolingFwd);
	poolingFwd = NULL;
	}
	if (poolingBwd != NULL) {
	dnnDelete<DType>(poolingBwd);
	poolingBwd = NULL;
	}
	if (max_idx_data != NULL) {
	dnnReleaseBuffer<DType>(max_idx_data);
	max_idx_data = NULL;
	}
	}

	private:
	void LayerSetUp(const mshadow::Tensor<xpu, 4, DType> &data,
	const mshadow::Tensor<xpu, 4, DType> &out) {
	channels_ = data.shape_[1];
	height_ = data.shape_[2];
	width_ = data.shape_[3];
	num_ = data.shape_[0];
	global_pooling_ = param_.global_pool;
	if (global_pooling_) {
	kernel_h_ = height_;
	kernel_w_ = width_;
	} else {
	kernel_h_ = param_.kernel[0];
	kernel_w_ = param_.kernel[1];
	}
	CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
	CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
	pad_h_ = param_.pad[0];
	pad_w_ = param_.pad[1];
	if (global_pooling_) {
	stride_h_ = stride_w_ = 1;
	} else {
	stride_h_ = param_.stride[0];
	stride_w_ = param_.stride[1];
	}
	if (global_pooling_) {
	CHECK(pad_h_ == 0 && pad_w_ == 0 && stride_h_ == 1 && stride_w_ == 1)
	<< "With Global_pooling: true; only pad = 0 and stride = 1";
	}
	if (pad_h_ != 0 \|\| pad_w_ != 0) {
	CHECK(param_.pool_type == pool_enum::kAvgPooling
	\|\| param_.pool_type == pool_enum::kMaxPooling)
	<< "Padding implemented only for average and max pooling.";
	CHECK_LT(pad_h_, kernel_h_);
	CHECK_LT(pad_w_, kernel_w_);
	}
	pooled_height_ = out.shape_[2];
	pooled_width_ = out.shape_[3];

	size_t dim = 4;
	size_t src_sizes[4], src_strides[4];
	size_t dst_sizes[4], dst_strides[4];
	src_sizes[0] = width_;
	src_sizes[1] = height_;
	src_sizes[2] = channels_;
	src_sizes[3] = num_;
	src_strides[0] = 1;
	src_strides[1] = src_sizes[0];
	src_strides[2] = src_sizes[0] * src_sizes[1];
	src_strides[3] = src_sizes[0] * src_sizes[1] * src_sizes[2];
	dst_sizes[0] = pooled_width_;
	dst_sizes[1] = pooled_height_;
	dst_sizes[2] = src_sizes[2];
	dst_sizes[3] = src_sizes[3];
	dst_strides[0] = 1;
	dst_strides[1] = dst_sizes[0];
	dst_strides[2] = dst_sizes[0] * dst_sizes[1];
	dst_strides[3] = dst_sizes[0] * dst_sizes[1] * dst_sizes[2];
	src_offset[0] = -pad_w_;
	src_offset[1] = -pad_h_;
	src_offset[2] = -pad_w_;
	src_offset[3] = -pad_h_;
	kernel_stride[0] = stride_w_;
	kernel_stride[1] = stride_h_;
	kernel_size[0] = kernel_w_;
	kernel_size[1] = kernel_h_;

	// Names are for debugging only
	fwd_bottom_data->name = "fwd_bottom_data @ " + getName();
	fwd_top_data->name = "fwd_top_data @ " + getName();
	bwd_top_diff->name = "bwd_top_diff @ " + getName();
	bwd_bottom_diff->name = "bwd_bottom_diff @ " + getName();

	fwd_bottom_data->create_user_layout(dim, src_sizes, src_strides);
	fwd_top_data->create_user_layout(dim, dst_sizes, dst_strides);
	bwd_bottom_diff->create_user_layout(dim, src_sizes, src_strides);
	bwd_top_diff->create_user_layout(dim, dst_sizes, dst_strides);

	// Primitives will be allocated during the first fwd pass
	poolingFwd = NULL;
	poolingBwd = NULL;
	max_idx_data = NULL;
	}

	public:
	virtual void Forward(const OpContext &ctx,
	const std::vector<TBlob> &in_data,
	const std::vector<OpReqType> &req,
	const std::vector<TBlob> &out_data,
	const std::vector<TBlob> &aux_args) {
	using namespace mshadow;
	using namespace mshadow::expr;
	CHECK_EQ(in_data.size(), 1);
	CHECK_EQ(out_data.size(), 1);
	Stream<xpu> *s = ctx.get_stream<xpu>();
	if (param_.kernel.ndim() >= 3) {
	LOG(FATAL) << "Not implmented";
	}
	Tensor<xpu, 4, DType> data = mkl_experimental_direct_get<xpu, 4, DType>(
	in_data[pool_enum::kData], s);
	Tensor<xpu, 4, DType> out = mkl_experimental_direct_get<xpu, 4, DType>(
	out_data[pool_enum::kOut], s);
	if (!init_mkldnn_) {
	LayerSetUp(data, out);
	init_mkldnn_ = true;
	}
	auto first_pass = false;
	if (poolingFwd == NULL) first_pass = true;

	dnnAlgorithm_t algorithm = dnnAlgorithmPoolingMax;

	switch (param_.pool_type) {
	case pool_enum::kMaxPooling:
	algorithm = dnnAlgorithmPoolingMax;
	break;
	case pool_enum::kAvgPooling:
	algorithm = (param_.pooling_convention == pool_enum::kValid) ?
	dnnAlgorithmPoolingAvgIncludePadding : dnnAlgorithmPoolingAvg;

	break;
	default:
	LOG(FATAL) << "Unknown pooling method.";
	}

	dnnError_t status;
	void* pooling_res[dnnResourceNumber];

	void* bottom_data = NULL;
	#if MKL_EXPERIMENTAL == 1
	bottom_data =
	reinterpret_cast<void *>(mkl_prv_data<DType>(in_data[pool_enum::kData]));
	#endif
	dnnBorder_t border_type = dnnBorderZerosAsymm;
	switch (param_.pooling_convention) {
	case pool_enum::kFull:
	border_type = dnnBorderZeros;
	break;
	case pool_enum::kValid:
	border_type = dnnBorderZerosAsymm;
	break;
	default:
	border_type = dnnBorderZerosAsymm;
	break;
	}
	if (NULL == bottom_data) {
	bottom_data = data.dptr_;
	if (NULL == poolingFwd) {
	status = dnnPoolingCreateForward<DType>(&poolingFwd, NULL,
	algorithm, fwd_bottom_data->layout_usr,
	kernel_size, kernel_stride,
	src_offset, border_type);
	CHECK_EQ(status, E_SUCCESS);
	// Now create poolingBwd
	status = dnnPoolingCreateBackward<DType>(&poolingBwd, NULL,
	algorithm, fwd_bottom_data->layout_usr,
	kernel_size, kernel_stride,
	src_offset, border_type);
	CHECK_EQ(status, E_SUCCESS);
	}
	}
	#if MKL_EXPERIMENTAL == 1
	if (NULL != bottom_data) {
	if (NULL == poolingFwd) {
	std::shared_ptr<MKLMemHolder> bottom_data_mem = in_data[pool_enum::kData].Mkl_mem_;
	std::shared_ptr<PrvMemDescr> bottom_prv_descriptor =
	bottom_data_mem->get_prv_descriptor();
	CHECK_EQ(bottom_prv_descriptor->get_descr_type(),
	PrvMemDescr::PRV_DESCR_MKL2017);
	std::shared_ptr<MKLData<DType> > mem_descr
	= std::static_pointer_cast<MKLData<DType>>(bottom_prv_descriptor);
	CHECK(mem_descr != nullptr);
	fwd_bottom_data = mem_descr;

	status = dnnPoolingCreateForward<DType>(&poolingFwd, NULL,
	algorithm, fwd_bottom_data->layout_int,
	kernel_size, kernel_stride,
	src_offset, border_type);
	CHECK_EQ(status, E_SUCCESS);
	fwd_top_data->create_internal_layout(poolingFwd, dnnResourceDst);

	// Now create poolingBwd
	status = dnnPoolingCreateBackward<DType>(&poolingBwd, NULL,
	algorithm, fwd_bottom_data->layout_int,
	kernel_size, kernel_stride,
	src_offset, border_type);
	CHECK_EQ(status, E_SUCCESS);
	bwd_top_diff->create_internal_layout(poolingFwd, dnnResourceDst);
	bwd_bottom_diff->create_internal_layout(poolingFwd, dnnResourceSrc);
	}
	}
	#endif

	if (first_pass) {
	dnnLayout_t max_idx_datal = NULL;
	status = dnnLayoutCreateFromPrimitive<DType>(
	&max_idx_datal, poolingFwd, dnnResourceWorkspace);
	CHECK_EQ(status, E_SUCCESS);
	status = dnnAllocateBuffer<DType>(reinterpret_cast<void**>(&max_idx_data), max_idx_datal);
	CHECK_EQ(status, E_SUCCESS);
	#if MKL_EXPERIMENTAL == 0
	fwd_bottom_data->create_internal_layout(poolingFwd, dnnResourceSrc);
	fwd_top_data->create_internal_layout(poolingFwd, dnnResourceDst);
	bwd_top_diff->create_internal_layout(poolingBwd, dnnResourceDiffDst);
	bwd_bottom_diff->create_internal_layout(poolingBwd, dnnResourceDiffSrc);
	#endif
	dnnLayoutDelete<DType>(max_idx_datal);
	first_pass = false;
	}
	pooling_res[dnnResourceSrc] = bottom_data;
	pooling_res[dnnResourceWorkspace] = max_idx_data;

	std::shared_ptr<MKLMemHolder> top_mem = NULL;
	#if MKL_EXPERIMENTAL == 1
	top_mem = out_data[pool_enum::kOut].Mkl_mem_;
	#endif
	pooling_res[dnnResourceDst] = fwd_top_data->get_output_ptr(
	out.dptr_, fwd_top_data, top_mem);
	status = dnnExecute<DType>(poolingFwd, pooling_res);
	CHECK_EQ(status, E_SUCCESS);
	#if MKL_EXPERIMENTAL == 0
	if (fwd_top_data->conversion_needed()) {
	fwd_top_data->convert_from_prv(out.dptr_);
	}
	#endif
	}
	virtual void Backward(const OpContext &ctx,
	const std::vector<TBlob> &out_grad,
	const std::vector<TBlob> &in_data,
	const std::vector<TBlob> &out_data,
	const std::vector<OpReqType> &req,
	const std::vector<TBlob> &in_grad,
	const std::vector<TBlob> &aux_args) {
	if (!req[0]) {
	return;
	}
	using namespace mshadow;
	using namespace mshadow::expr;
	CHECK_EQ(out_grad.size(), 1);
	CHECK_EQ(in_data.size(), 1);
	CHECK_EQ(out_data.size(), 1);
	CHECK_EQ(req.size(), 1);
	CHECK_EQ(in_grad.size(), 1);
	if (param_.kernel.ndim() >= 3) {
	LOG(FATAL) << "Not implmented";
	}
	Stream<xpu> *s = ctx.get_stream<xpu>();
	Tensor<xpu, 4, DType> grad = mkl_experimental_direct_get<xpu, 4, DType>(
	out_grad[pool_enum::kOut], s);
	Tensor<xpu, 4, DType> input_grad = mkl_experimental_direct_get<xpu, 4, DType>(
	in_grad[pool_enum::kData], s);
	dnnError_t e;
	void* pooling_res[dnnResourceNumber];
	pooling_res[dnnResourceWorkspace] = reinterpret_cast<void *>(max_idx_data);

	pooling_res[dnnResourceDiffDst] =
	bwd_top_diff->get_converted_prv(grad.dptr_, true, out_grad[pool_enum::kOut]);

	std::shared_ptr<MKLMemHolder> bottom_diff_mem = NULL;
	#if MKL_EXPERIMENTAL == 1
	bottom_diff_mem = in_grad[pool_enum::kData].Mkl_mem_;
	#endif
	pooling_res[dnnResourceDiffSrc] = bwd_bottom_diff->get_output_ptr(
	input_grad.dptr_, bwd_bottom_diff, bottom_diff_mem);
	e = dnnExecute<DType>(poolingBwd, pooling_res);
	CHECK_EQ(e, E_SUCCESS);
	#if MKL_EXPERIMENTAL == 0
	if (bwd_bottom_diff->conversion_needed()) {
	bwd_bottom_diff->convert_from_prv(input_grad.dptr_);
	}
	#endif
	}

	private:
	PoolingParam param_;
	int kernel_h_, kernel_w_;
	int stride_h_, stride_w_;
	int pad_h_, pad_w_;
	int channels_, num_;
	int height_, width_;
	int pooled_height_, pooled_width_;
	bool global_pooling_;

	private:
	size_t kernel_size[2],
	kernel_stride[4];
	int src_offset[4]; // 2*(dimension-2)
	dnnPrimitive_t poolingFwd, poolingBwd;
	DType *max_idx_data;

	std::shared_ptr<MKLData<DType> > fwd_top_data;
	std::shared_ptr<MKLData<DType> > fwd_bottom_data;
	std::shared_ptr<MKLData<DType> > bwd_top_diff;
	std::shared_ptr<MKLData<DType> > bwd_bottom_diff;
	bool init_mkldnn_;
	}; // class MKLPoolingOp
	} // namespace op
	} // namespace mxnet

	#endif // MXNET_OPERATOR_MKL_MKL_POOLING_INL_H_