| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| /*! |
| * Copyright (c) 2017 by Contributors |
 * \file quantized_pooling.cu
 * \brief GPU implementation of the quantized pooling operator, backed by cuDNN.
| */ |
| #include <mxnet/operator_util.h> |
| #include <vector> |
| #include "../nn/pooling-inl.h" |
| #include "../mshadow_op.h" |
| |
| namespace mxnet { |
| namespace op { |
| |
| #if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 6 && CUDA_VERSION >= 8000 |
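// 2D max/avg pooling over quantized data, implemented with cuDNN. Besides the
// data itself, the op carries the float min/max thresholds that define the
// quantization range; pooling leaves that range unchanged, so the thresholds
// are forwarded as-is. Instantiated below with DType = int8_t.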
| template<typename DType> |
| class QuantizedCuDNNPoolingOp { |
| public: |
| QuantizedCuDNNPoolingOp() { |
| CUDNN_CALL(cudnnCreatePoolingDescriptor(&pool_desc_)); |
| CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_)); |
| CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_)); |
| } |
| |
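  // Set up the cuDNN pooling and tensor descriptors from the operator
  // parameters. dshape and oshape are the NCHW input and output data shapes.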
| void Init(const PoolingParam& param, const mxnet::TShape& dshape, const mxnet::TShape& oshape) { |
    const int N = 0, H = 2, W = 3, C = 1;  // axis indices in the NCHW layout
| const cudnnDataType_t dtype = mshadow::DataType<DType>::kCudnnFlag; |
    CHECK_EQ(param.kernel.ndim(), 2) << "Only supports 2D pooling";
| if (param.pool_type == pool_enum::kMaxPooling) { |
| mode_ = CUDNN_POOLING_MAX; |
| } else if (param.pool_type == pool_enum::kAvgPooling) { |
| mode_ = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; |
| } else { |
| LOG(FATAL) << "QuantizedCuDNNPoolingOp only supports pool_type=max/avg"; |
| } |
| CUDNN_CALL(cudnnSetTensor4dDescriptor(in_desc_, |
| CUDNN_TENSOR_NCHW, |
| dtype, |
| dshape[N], |
| dshape[C], |
| dshape[H], |
| dshape[W])); |
| CUDNN_CALL(cudnnSetTensor4dDescriptor(out_desc_, |
| CUDNN_TENSOR_NCHW, |
| dtype, |
| oshape[N], |
| oshape[C], |
| oshape[H], |
| oshape[W])); |
    CUDNN_CALL(cudnnSetPooling2dDescriptor(pool_desc_,
                                           mode_,
                                           CUDNN_NOT_PROPAGATE_NAN,
                                           param.global_pool ? dshape[H] : param.kernel[0],
                                           param.global_pool ? dshape[W] : param.kernel[1],
                                           param.pad[0],
                                           param.pad[1],
                                           param.global_pool ? 1 : param.stride[0],
                                           param.global_pool ? 1 : param.stride[1]));
| } |
| |
| ~QuantizedCuDNNPoolingOp() { |
| CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_)); |
| CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_)); |
| CUDNN_CALL(cudnnDestroyPoolingDescriptor(pool_desc_)); |
| } |
| |
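  // inputs:  {int8 data, float min_range, float max_range};
  // outputs mirror the same layout. Runs cuDNN pooling on the data and
  // copies the quantization thresholds through to the outputs.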
| void Forward(mshadow::Stream<gpu>* s, |
| const std::vector<TBlob> &inputs, |
| const std::vector<OpReqType> &req, |
| const std::vector<TBlob> &outputs) { |
| CHECK_EQ(inputs.size(), 3U); |
| CHECK_EQ(outputs.size(), 3U); |
| using namespace mshadow; |
| using namespace mshadow::expr; |
| CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle); |
| float alpha = 1.0f; |
| float beta = 0.0f; |
| CUDNN_CALL(cudnnPoolingForward(s->dnn_handle_, |
| pool_desc_, |
| &alpha, |
| in_desc_, |
| inputs[0].dptr_, |
| &beta, |
| out_desc_, |
| outputs[0].dptr_)); |
| |
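    // Pooling keeps every output value inside the input's quantized range,
    // so the min/max thresholds are passed through to the outputs unchanged.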
| Tensor<gpu, 1, float> omin_range = outputs[1].FlatTo1D<gpu, float>(s); |
| Tensor<gpu, 1, float> omax_range = outputs[2].FlatTo1D<gpu, float>(s); |
| ASSIGN_DISPATCH(omin_range, req[1], |
| F<mshadow_op::identity>(inputs[1].FlatTo1D<gpu, float>(s))); |
| ASSIGN_DISPATCH(omax_range, req[2], |
| F<mshadow_op::identity>(inputs[2].FlatTo1D<gpu, float>(s))); |
| } |
| |
| private: |
| cudnnPoolingMode_t mode_; |
| cudnnTensorDescriptor_t in_desc_; |
| cudnnTensorDescriptor_t out_desc_; |
| cudnnPoolingDescriptor_t pool_desc_; |
| }; // class QuantizedCuDNNPoolingOp |
| #endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 6 && CUDA_VERSION >= 8000 |
| |
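// FCompute<gpu> entry point for the _contrib_quantized_pooling operator.
// Dispatches to the cuDNN-backed op above; fails loudly when cuDNN support
// was not compiled in.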
| void QuantizedPoolingForwardGPU(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| const PoolingParam& param = nnvm::get<PoolingParam>(attrs.parsed); |
  CHECK_EQ(param.kernel.ndim(), 2U)
      << "QuantizedPoolingForward<gpu> only supports 2D pooling for now";
| #if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 6 && CUDA_VERSION >= 8000 |
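  // Reuse one descriptor-holding op instance per thread so the cuDNN
  // descriptors are not recreated on every forward call.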
| #if DMLC_CXX11_THREAD_LOCAL |
| static thread_local QuantizedCuDNNPoolingOp<int8_t> op; |
| #else |
| static MX_THREAD_LOCAL QuantizedCuDNNPoolingOp<int8_t> op; |
| #endif // DMLC_CXX11_THREAD_LOCAL |
  op.Init(param, inputs[0].shape_, outputs[0].shape_);
| op.Forward(ctx.get_stream<gpu>(), inputs, req, outputs); |
| #else |
| LOG(FATAL) << "QuantizedPoolingForward<gpu> only supports cudnnPoolingForward " |
| "with CUDNN >= 6.0 and CUDA >= 8.0"; |
| #endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 6 && CUDA_VERSION >= 8000 |
| } |
| |
| NNVM_REGISTER_OP(_contrib_quantized_pooling) |
| .set_attr<FCompute>("FCompute<gpu>", QuantizedPoolingForwardGPU); |
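
// Example invocation from the Python frontend (a sketch, assuming the usual
// exposure of "_contrib_"-prefixed ops under mx.nd.contrib; the shapes and
// quantization ranges below are illustrative only):
//
//   import mxnet as mx
//   data  = mx.nd.zeros((1, 16, 8, 8), dtype='int8', ctx=mx.gpu())
//   min_r = mx.nd.array([-1.0], ctx=mx.gpu())
//   max_r = mx.nd.array([1.0], ctx=mx.gpu())
//   out, omin, omax = mx.nd.contrib.quantized_pooling(
//       data, min_r, max_r, kernel=(2, 2), pool_type='max', stride=(2, 2))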
| |
| } // namespace op |
| } // namespace mxnet |