// src/model/layer/opencl_convolution.cc - singa - Git at Google

 /**
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #include "opencl_convolution.h"

 #ifdef USE_OPENCL

 namespace singa {

 // Registers the OpenclConvolution class under the identifier
 // `singacl_convolution`. NOTE(review): the macro is defined outside this
 // file; presumably it adds the class to the layer factory -- confirm.
 RegisterLayerClass(singacl_convolution, OpenclConvolution);

 /// \copydoc Layer::Forward(int flag, const Tensor&)
 ///
 /// The batch is processed one sample at a time: Im2Col unrolls the sample
 /// into a {col_height_, col_width_} column buffer, the buffer is multiplied
 /// by weight_, bias_ is added when bias_term_ is set, and the result is
 /// copied into that sample's slot of the output tensor.
 const Tensor OpenclConvolution::Forward(int flag, const Tensor &input) {
   CHECK(buf_.empty());
   CHECK_EQ(input.device()->lang(), kOpencl);
   CHECK_EQ(input.nDim(), 4u);

   // Only a training pass stashes the input; Backward pops it again.
   if (flag & kTrain) {
     buf_.push(input);
   }

   auto dev = input.device();
   auto dtype = input.data_type();
   auto num_samples = input.shape(0);
   auto sample_size = input.Size() / num_samples;

   // TODO(wangwei) update the layer config if the input sample shape changes
   CHECK(input.shape(1) == channels_ && input.shape(2) == height_ &&
         input.shape(3) == width_) << "input sample shape should not change";

   Tensor output(Shape{num_samples, num_filters_, conv_height_, conv_width_},
                 dev, dtype);

   // One column buffer is shared by every iteration of the loop below.
   Shape col_shape{col_height_, col_width_};
   Tensor columns(col_shape, dev, dtype);

   for (size_t b = 0; b < num_samples; ++b) {
     // Element offset of sample b inside the input block.
     int im_offset = static_cast<int>(b * sample_size);

     // Reads the input block and writes the column block; the two block lists
     // passed to Exec declare exactly that.
     columns.device()->Exec(
         [input, im_offset, columns, this](Context *ctx) mutable {
           this->Im2Col(input.block(), im_offset,
                        height_, width_,
                        kernel_h_, kernel_w_,
                        pad_h_, pad_w_,
                        stride_h_, stride_w_,
                        conv_height_, conv_width_,
                        0, channels_,
                        columns.block(), ctx);
         },
         {input.block()},
         {columns.block()});

     Tensor sample_out = Mult(weight_, columns);
     if (bias_term_) AddColumn(bias_, &sample_out);

     // Place this sample's result at destination offset b * Size().
     CopyDataToFrom(&output, sample_out, sample_out.Size(),
                    b * sample_out.Size());
   }

   return output;
 }


 /// \copydoc Layer::Backward(int, const Tensor&, const Tensor&);
 const std::pair<Tensor, std::vector<Tensor>>
 OpenclConvolution::Backward(int flag, const Tensor &grad) {
   CHECK(!buf_.empty());
   CHECK_EQ(grad.device()->lang(), kOpencl);
   CHECK_EQ(grad.nDim(), 4u);

   std::vector<Tensor> param_grad;

   Tensor src_data = buf_.top();
   buf_.pop();

   Tensor dx, db, dw;
   dx.ResetLike(src_data);
   db.ResetLike(bias_);
   dw.ResetLike(weight_);
   dw.SetValue(0.0f);

   size_t batchsize = grad.shape(0);
   size_t imagesize = src_data.Size() / batchsize;

   if (bias_term_) {
     auto tmpshp = Shape{batchsize * num_filters_, grad.Size() / (batchsize * num_filters_)};
     Tensor tmp1 = Reshape(grad, tmpshp);

     Tensor tmp2(Shape{batchsize * num_filters_},
                 grad.device(), grad.data_type());
     SumColumns(tmp1, &tmp2);
     Tensor tmp3 = Reshape(tmp2, Shape{batchsize, num_filters_});

     SumRows(tmp3, &db);
   }

   Tensor col_data(Shape{col_height_, col_width_},
                   grad.device(), grad.data_type());

   for (size_t b = 0; b < batchsize; b++) {

     int im_offset = b * imagesize;
     int col_offset = 0; // Always keep this to zero.

     col_data.device()->Exec([src_data, col_data, im_offset, col_offset,
     this](Context * ctx) mutable {

       this->Im2Col(src_data.block(), im_offset,
       height_, width_,
       kernel_h_, kernel_w_,
       pad_h_, pad_w_,
       stride_h_, stride_w_,
       conv_height_, conv_width_,
       col_offset, channels_,
       col_data.block(), ctx);
     },
     {src_data.block()},
     {col_data.block()});

     Tensor grad_b(Shape{num_filters_, conv_height_ * conv_width_},
                   grad.device(), grad.data_type());
     CopyDataToFrom(&grad_b, grad, grad_b.Size(), 0, b * grad_b.Size());

     dw += Mult(grad_b, Transpose(col_data));
     Tensor dcol_b = Mult(Transpose(weight_), grad_b);

     dx.device()->Exec([dcol_b, dx, im_offset, col_offset,
     this](Context * ctx) mutable {

       this->Col2Im(dcol_b.block(), col_offset,
       height_, width_,
       kernel_h_, kernel_w_,
       pad_h_, pad_w_,
       stride_h_, stride_w_,
       conv_height_, conv_width_,
       im_offset, channels_,
       dx.block(), ctx);
     },
     {dcol_b.block()},
     {dx.block()});
   }

   param_grad.push_back(dw);
   param_grad.push_back(db);

   return std::make_pair(dx, param_grad);
 }


 void OpenclConvolution::Setup(const Shape &in_sample, const LayerConf &conf) {
   Convolution::Setup(in_sample, conf);
 }


 void OpenclConvolution::ToDevice(std::shared_ptr<Device> device) {
   Convolution::ToDevice(device);
 }


 /// Enqueues the "im2col" kernel of the "opencl_im2col" program on the
 /// ViennaCL context identified by ctx->vcl_ctx_id.
 ///
 /// \param src block holding the image data; \p data_im_off is forwarded to
 ///        the kernel as the offset into it.
 /// \param dst block receiving the column data; \p col_data_off is forwarded
 ///        to the kernel as the offset into it.
 /// The remaining geometry arguments are passed to the kernel unchanged.
 void OpenclConvolution::Im2Col(Block* src, int data_im_off,
                                const int height, const int width,
                                const int kernel_h, const int kernel_w,
                                const int pad_h, const int pad_w,
                                const int stride_h, const int stride_w,
                                const int conv_h, const int conv_w,
                                const int col_data_off, const int channels,
                                Block* dst, Context* ctx) {
   auto vcl_context = viennacl::ocl::get_context(ctx->vcl_ctx_id);
   auto im2col_kernel = vcl_context.get_kernel("opencl_im2col", "im2col");

   // Wrap the raw cl_mem buffers so they can be passed as kernel arguments.
   auto image_mem =
       WrapHandle(static_cast<cl_mem>(src->mutable_data()), vcl_context);
   auto column_mem =
       WrapHandle(static_cast<cl_mem>(dst->mutable_data()), vcl_context);

   // First kernel argument: channels * conv_h * conv_w.
   int work_items = channels * conv_h * conv_w;

   // NOTE(review): the two literal 1s presumably are the kernel's dilation
   // parameters -- confirm against the opencl_im2col kernel source.
   viennacl::ocl::enqueue(
       im2col_kernel(work_items, image_mem, data_im_off,
                     height, width,
                     kernel_h, kernel_w,
                     pad_h, pad_w,
                     stride_h, stride_w,
                     1, 1,
                     conv_h, conv_w,
                     column_mem, col_data_off));
 }


 /// Enqueues the "col2im" kernel of the "opencl_im2col" program on the
 /// ViennaCL context identified by ctx->vcl_ctx_id.
 ///
 /// \param src block holding the column data; \p col_data_off is forwarded to
 ///        the kernel as the offset into it.
 /// \param dst block receiving the image data; \p data_im_off is forwarded to
 ///        the kernel as the offset into it.
 /// The remaining geometry arguments are passed to the kernel unchanged.
 void OpenclConvolution::Col2Im(Block* src, const int col_data_off,
                                const int height, const int width,
                                const int kernel_h, const int kernel_w,
                                const int pad_h, const int pad_w,
                                const int stride_h, const int stride_w,
                                const int conv_h, const int conv_w,
                                const int data_im_off, const int channels,
                                Block* dst, Context* ctx) {
   auto vcl_context = viennacl::ocl::get_context(ctx->vcl_ctx_id);
   auto col2im_kernel = vcl_context.get_kernel("opencl_im2col", "col2im");

   // Wrap the raw cl_mem buffers so they can be passed as kernel arguments.
   auto column_mem =
       WrapHandle(static_cast<cl_mem>(src->mutable_data()), vcl_context);
   auto image_mem =
       WrapHandle(static_cast<cl_mem>(dst->mutable_data()), vcl_context);

   // First kernel argument: channels * height * width.
   int work_items = channels * height * width;

   // NOTE(review): the two literal 1s presumably are the kernel's dilation
   // parameters -- confirm against the opencl_im2col kernel source.
   viennacl::ocl::enqueue(
       col2im_kernel(work_items, column_mem, col_data_off, channels,
                     height, width,
                     kernel_h, kernel_w,
                     pad_h, pad_w,
                     stride_h, stride_w,
                     1, 1,
                     conv_h, conv_w,
                     image_mem, data_im_off));
 }


 } // namespace singa

 #endif // USE_OPENCL
	/**
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#include "opencl_convolution.h"

	#ifdef USE_OPENCL

	namespace singa {

	// Registers the OpenclConvolution class under the identifier
	// `singacl_convolution`. NOTE(review): the macro is defined outside this
	// file; presumably it adds the class to the layer factory -- confirm.
	RegisterLayerClass(singacl_convolution, OpenclConvolution);

	/// \copydoc Layer::Forward(int flag, const Tensor&)
	///
	/// The batch is processed one sample at a time: Im2Col unrolls the sample
	/// into a {col_height_, col_width_} column buffer, the buffer is multiplied
	/// by weight_, bias_ is added when bias_term_ is set, and the result is
	/// copied into that sample's slot of the output tensor.
	const Tensor OpenclConvolution::Forward(int flag, const Tensor &input) {
	  CHECK(buf_.empty());
	  CHECK_EQ(input.device()->lang(), kOpencl);
	  CHECK_EQ(input.nDim(), 4u);

	  // Only a training pass stashes the input; Backward pops it again.
	  if (flag & kTrain) {
	    buf_.push(input);
	  }

	  auto dev = input.device();
	  auto dtype = input.data_type();
	  auto num_samples = input.shape(0);
	  auto sample_size = input.Size() / num_samples;

	  // TODO(wangwei) update the layer config if the input sample shape changes
	  CHECK(input.shape(1) == channels_ && input.shape(2) == height_ &&
	        input.shape(3) == width_) << "input sample shape should not change";

	  Tensor output(Shape{num_samples, num_filters_, conv_height_, conv_width_},
	                dev, dtype);

	  // One column buffer is shared by every iteration of the loop below.
	  Shape col_shape{col_height_, col_width_};
	  Tensor columns(col_shape, dev, dtype);

	  for (size_t b = 0; b < num_samples; ++b) {
	    // Element offset of sample b inside the input block.
	    int im_offset = static_cast<int>(b * sample_size);

	    // Reads the input block and writes the column block; the two block
	    // lists passed to Exec declare exactly that.
	    columns.device()->Exec(
	        [input, im_offset, columns, this](Context *ctx) mutable {
	          this->Im2Col(input.block(), im_offset,
	                       height_, width_,
	                       kernel_h_, kernel_w_,
	                       pad_h_, pad_w_,
	                       stride_h_, stride_w_,
	                       conv_height_, conv_width_,
	                       0, channels_,
	                       columns.block(), ctx);
	        },
	        {input.block()},
	        {columns.block()});

	    Tensor sample_out = Mult(weight_, columns);
	    if (bias_term_) AddColumn(bias_, &sample_out);

	    // Place this sample's result at destination offset b * Size().
	    CopyDataToFrom(&output, sample_out, sample_out.Size(),
	                   b * sample_out.Size());
	  }

	  return output;
	}


	/// \copydoc Layer::Backward(int, const Tensor&, const Tensor&);
	const std::pair<Tensor, std::vector<Tensor>>
	OpenclConvolution::Backward(int flag, const Tensor &grad) {
	CHECK(!buf_.empty());
	CHECK_EQ(grad.device()->lang(), kOpencl);
	CHECK_EQ(grad.nDim(), 4u);

	std::vector<Tensor> param_grad;

	Tensor src_data = buf_.top();
	buf_.pop();

	Tensor dx, db, dw;
	dx.ResetLike(src_data);
	db.ResetLike(bias_);
	dw.ResetLike(weight_);
	dw.SetValue(0.0f);

	size_t batchsize = grad.shape(0);
	size_t imagesize = src_data.Size() / batchsize;

	if (bias_term_) {
	auto tmpshp = Shape{batchsize * num_filters_, grad.Size() / (batchsize * num_filters_)};
	Tensor tmp1 = Reshape(grad, tmpshp);

	Tensor tmp2(Shape{batchsize * num_filters_},
	grad.device(), grad.data_type());
	SumColumns(tmp1, &tmp2);
	Tensor tmp3 = Reshape(tmp2, Shape{batchsize, num_filters_});

	SumRows(tmp3, &db);
	}

	Tensor col_data(Shape{col_height_, col_width_},
	grad.device(), grad.data_type());

	for (size_t b = 0; b < batchsize; b++) {

	int im_offset = b * imagesize;
	int col_offset = 0; // Always keep this to zero.

	col_data.device()->Exec([src_data, col_data, im_offset, col_offset,
	this](Context * ctx) mutable {

	this->Im2Col(src_data.block(), im_offset,
	height_, width_,
	kernel_h_, kernel_w_,
	pad_h_, pad_w_,
	stride_h_, stride_w_,
	conv_height_, conv_width_,
	col_offset, channels_,
	col_data.block(), ctx);
	},
	{src_data.block()},
	{col_data.block()});

	Tensor grad_b(Shape{num_filters_, conv_height_ * conv_width_},
	grad.device(), grad.data_type());
	CopyDataToFrom(&grad_b, grad, grad_b.Size(), 0, b * grad_b.Size());

	dw += Mult(grad_b, Transpose(col_data));
	Tensor dcol_b = Mult(Transpose(weight_), grad_b);

	dx.device()->Exec([dcol_b, dx, im_offset, col_offset,
	this](Context * ctx) mutable {

	this->Col2Im(dcol_b.block(), col_offset,
	height_, width_,
	kernel_h_, kernel_w_,
	pad_h_, pad_w_,
	stride_h_, stride_w_,
	conv_height_, conv_width_,
	im_offset, channels_,
	dx.block(), ctx);
	},
	{dcol_b.block()},
	{dx.block()});
	}

	param_grad.push_back(dw);
	param_grad.push_back(db);

	return std::make_pair(dx, param_grad);
	}


	void OpenclConvolution::Setup(const Shape &in_sample, const LayerConf &conf) {
	Convolution::Setup(in_sample, conf);
	}


	void OpenclConvolution::ToDevice(std::shared_ptr<Device> device) {
	Convolution::ToDevice(device);
	}


	/// Enqueues the "im2col" kernel of the "opencl_im2col" program on the
	/// ViennaCL context identified by ctx->vcl_ctx_id.
	///
	/// \param src block holding the image data; \p data_im_off is forwarded to
	///        the kernel as the offset into it.
	/// \param dst block receiving the column data; \p col_data_off is forwarded
	///        to the kernel as the offset into it.
	/// The remaining geometry arguments are passed to the kernel unchanged.
	void OpenclConvolution::Im2Col(Block* src, int data_im_off,
	                               const int height, const int width,
	                               const int kernel_h, const int kernel_w,
	                               const int pad_h, const int pad_w,
	                               const int stride_h, const int stride_w,
	                               const int conv_h, const int conv_w,
	                               const int col_data_off, const int channels,
	                               Block* dst, Context* ctx) {
	  auto vcl_context = viennacl::ocl::get_context(ctx->vcl_ctx_id);
	  auto im2col_kernel = vcl_context.get_kernel("opencl_im2col", "im2col");

	  // Wrap the raw cl_mem buffers so they can be passed as kernel arguments.
	  auto image_mem =
	      WrapHandle(static_cast<cl_mem>(src->mutable_data()), vcl_context);
	  auto column_mem =
	      WrapHandle(static_cast<cl_mem>(dst->mutable_data()), vcl_context);

	  // First kernel argument: channels * conv_h * conv_w.
	  int work_items = channels * conv_h * conv_w;

	  // NOTE(review): the two literal 1s presumably are the kernel's dilation
	  // parameters -- confirm against the opencl_im2col kernel source.
	  viennacl::ocl::enqueue(
	      im2col_kernel(work_items, image_mem, data_im_off,
	                    height, width,
	                    kernel_h, kernel_w,
	                    pad_h, pad_w,
	                    stride_h, stride_w,
	                    1, 1,
	                    conv_h, conv_w,
	                    column_mem, col_data_off));
	}


	/// Enqueues the "col2im" kernel of the "opencl_im2col" program on the
	/// ViennaCL context identified by ctx->vcl_ctx_id.
	///
	/// \param src block holding the column data; \p col_data_off is forwarded to
	///        the kernel as the offset into it.
	/// \param dst block receiving the image data; \p data_im_off is forwarded to
	///        the kernel as the offset into it.
	/// The remaining geometry arguments are passed to the kernel unchanged.
	void OpenclConvolution::Col2Im(Block* src, const int col_data_off,
	                               const int height, const int width,
	                               const int kernel_h, const int kernel_w,
	                               const int pad_h, const int pad_w,
	                               const int stride_h, const int stride_w,
	                               const int conv_h, const int conv_w,
	                               const int data_im_off, const int channels,
	                               Block* dst, Context* ctx) {
	  auto vcl_context = viennacl::ocl::get_context(ctx->vcl_ctx_id);
	  auto col2im_kernel = vcl_context.get_kernel("opencl_im2col", "col2im");

	  // Wrap the raw cl_mem buffers so they can be passed as kernel arguments.
	  auto column_mem =
	      WrapHandle(static_cast<cl_mem>(src->mutable_data()), vcl_context);
	  auto image_mem =
	      WrapHandle(static_cast<cl_mem>(dst->mutable_data()), vcl_context);

	  // First kernel argument: channels * height * width.
	  int work_items = channels * height * width;

	  // NOTE(review): the two literal 1s presumably are the kernel's dilation
	  // parameters -- confirm against the opencl_im2col kernel source.
	  viennacl::ocl::enqueue(
	      col2im_kernel(work_items, column_mem, col_data_off, channels,
	                    height, width,
	                    kernel_h, kernel_w,
	                    pad_h, pad_w,
	                    stride_h, stride_w,
	                    1, 1,
	                    conv_h, conv_w,
	                    image_mem, data_im_off));
	}


	} // namespace singa

	#endif // USE_OPENCL