/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* Copyright (c) 2018 by Contributors
* \file roi_align.cu
* \brief roi align operator
* \author Hang Zhang, Shesung
* Adapted from Caffe2
*/
#include "./roi_align-inl.h"
#include "../mxnet_op.h"
namespace mxnet {
namespace op {
using namespace mshadow::cuda;
// The maximum number of blocks to use in the default kernel call.
constexpr int ROI_MAXIMUM_NUM_BLOCKS = 4096;
/**
* @brief Compute the number of blocks needed to run N threads.
*/
inline int ROI_GET_BLOCKS(const int N) {
  return std::max(
      std::min(
          (N + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock,
          ROI_MAXIMUM_NUM_BLOCKS),
      // Use at least 1 block, since CUDA does not allow an empty grid
      1);
}
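
// For scale (a sketch, assuming mshadow's kMaxThreadsPerBlock of 1024):
// ROI_GET_BLOCKS(1 << 20) = max(min(1024, 4096), 1) = 1024 blocks. When the
// ROI_MAXIMUM_NUM_BLOCKS cap kicks in, CUDA_KERNEL_LOOP below is a
// grid-stride loop, so each thread simply processes multiple elements and
// every element is still covered.

/**
 * @brief Bilinearly interpolate bottom_data (a height x width feature map)
 * at the fractional location (y, x). The result is the weighted sum
 * w1*v1 + w2*v2 + w3*v3 + w4*v4 of the four surrounding pixels, with the
 * weights given by the fractional offsets; e.g. sampling at (0.5, 0.5)
 * averages the four neighbors with weight 0.25 each.
 */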
template <typename T>
__device__ T bilinear_interpolate(
    const T* bottom_data,
    const int height,
    const int width,
    T y,
    T x,
    const int index /* index for debug only*/) {
  // deal with cases where the sampling point falls outside the feature map
  if (y < -1.0 || y > height || x < -1.0 || x > width) {
    // empty
    return 0;
  }
  if (y <= 0) {
    y = 0;
  }
  if (x <= 0) {
    x = 0;
  }
  int y_low = static_cast<int>(y);
  int x_low = static_cast<int>(x);
  int y_high;
  int x_high;
  if (y_low >= height - 1) {
    y_high = y_low = height - 1;
    y = (T)y_low;
  } else {
    y_high = y_low + 1;
  }
  if (x_low >= width - 1) {
    x_high = x_low = width - 1;
    x = (T)x_low;
  } else {
    x_high = x_low + 1;
  }
  T ly = y - y_low;
  T lx = x - x_low;
  T hy = 1. - ly, hx = 1. - lx;
  // do bilinear interpolation
  T v1 = bottom_data[y_low * width + x_low];
  T v2 = bottom_data[y_low * width + x_high];
  T v3 = bottom_data[y_high * width + x_low];
  T v4 = bottom_data[y_high * width + x_high];
  T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
  T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
  return val;
}
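
/**
 * @brief Forward kernel: one thread per pooled output element. Each thread
 * decodes its flat index into (n, c, ph, pw), reads ROI n in
 * [batch_index, x1, y1, x2, y2] layout (coordinates are multiplied by
 * spatial_scale to land in feature-map space), and averages
 * roi_bin_grid_h * roi_bin_grid_w bilinear samples taken inside bin
 * (ph, pw). For position-sensitive pooling the input channel is additionally
 * selected by (ph, pw). A negative batch index marks a padding ROI and
 * produces 0.
 */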
template <typename T>
__global__ void RoIAlignForwardKernel(
    const int nthreads,
    const T* bottom_data,
    const T spatial_scale,
    const bool position_sensitive,
    const int channels,
    const int height,
    const int width,
    const int pooled_height,
    const int pooled_width,
    const int sampling_ratio,
    const T* bottom_rois,
    T* top_data) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;

    const T* offset_bottom_rois = bottom_rois + n * 5;
    int roi_batch_ind = offset_bottom_rois[0];
    if (roi_batch_ind < 0) {
      top_data[index] = 0.;
      continue;
    }

    // Do not use rounding; this implementation detail is critical
    T roi_start_w = offset_bottom_rois[1] * spatial_scale;
    T roi_start_h = offset_bottom_rois[2] * spatial_scale;
    T roi_end_w = offset_bottom_rois[3] * spatial_scale;
    T roi_end_h = offset_bottom_rois[4] * spatial_scale;
    // T roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
    // T roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
    // T roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
    // T roi_end_h = round(offset_bottom_rois[4] * spatial_scale);

    // Force malformed ROIs to be 1x1
    T roi_width = max(roi_end_w - roi_start_w, (T)1.);
    T roi_height = max(roi_end_h - roi_start_h, (T)1.);
    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

    int c_unpooled = c;
    int channels_unpooled = channels;
    if (position_sensitive) {
      c_unpooled = c * pooled_height * pooled_width + ph * pooled_width + pw;
      channels_unpooled = channels * pooled_height * pooled_width;
    }
    const T* offset_bottom_data =
        bottom_data + (roi_batch_ind * channels_unpooled + c_unpooled)
                          * height * width;

    // We use roi_bin_grid to sample the grid and mimic integral pooling
    int roi_bin_grid_h = (sampling_ratio > 0)
        ? sampling_ratio
        : ceil(roi_height / pooled_height);  // e.g., = 2
    int roi_bin_grid_w =
        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);

    // We do average (integral) pooling inside a bin
    const T count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. = 4

    T output_val = 0.;
    for (int iy = 0; iy < roi_bin_grid_h; iy++) {  // e.g., iy = 0, 1
      const T y = roi_start_h + ph * bin_size_h +
          static_cast<T>(iy + .5f) * bin_size_h /
              static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5
      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
        const T x = roi_start_w + pw * bin_size_w +
            static_cast<T>(ix + .5f) * bin_size_w /
                static_cast<T>(roi_bin_grid_w);
        T val = bilinear_interpolate(
            offset_bottom_data, height, width, y, x, index);
        output_val += val;
      }
    }
    output_val /= count;
    top_data[index] = output_val;
  }
}
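
/**
 * @brief Gradient counterpart of bilinear_interpolate: instead of a value it
 * returns the four corner weights (w1..w4) and the integer coordinates of the
 * corners, so the caller can scatter an incoming gradient to exactly the
 * pixels that contributed to the forward sample. Out-of-bounds samples get
 * zero weights and corner indices of -1, which the caller skips.
 */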
template <typename T>
__device__ void bilinear_interpolate_gradient(
    const int height,
    const int width,
    T y,
    T x,
    T* w1,
    T* w2,
    T* w3,
    T* w4,
    int* x_low,
    int* x_high,
    int* y_low,
    int* y_high,
    const int /*index*/ /* index for debug only*/) {
  // deal with cases where the sampling point falls outside the feature map
  if (y < -1.0 || y > height || x < -1.0 || x > width) {
    // empty
    *w1 = *w2 = *w3 = *w4 = 0.;
    *x_low = *x_high = *y_low = *y_high = -1;
    return;
  }
  if (y <= 0) {
    y = 0;
  }
  if (x <= 0) {
    x = 0;
  }
  *y_low = static_cast<int>(y);
  *x_low = static_cast<int>(x);
  if (*y_low >= height - 1) {
    *y_high = *y_low = height - 1;
    y = (T)*y_low;
  } else {
    *y_high = *y_low + 1;
  }
  if (*x_low >= width - 1) {
    *x_high = *x_low = width - 1;
    x = (T)*x_low;
  } else {
    *x_high = *x_low + 1;
  }
  T ly = y - *y_low;
  T lx = x - *x_low;
  T hy = 1. - ly, hx = 1. - lx;
  *w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx;
  return;
}
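
/**
 * @brief Backward kernel: mirrors the forward sampling pattern. Each thread
 * takes the gradient of one pooled output element, recomputes the same
 * sampling grid as the forward pass, and distributes top_diff * w_i / count
 * to the four corners of every sample. atomicAdd is required because
 * overlapping ROIs (and neighboring sampling points) can write to the same
 * input pixel concurrently.
 */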
template <typename T>
__global__ void RoIAlignBackwardKernel(
    const int nthreads,
    const T* top_diff,
    const int num_rois,
    const T spatial_scale,
    const bool position_sensitive,
    const int channels,
    const int height,
    const int width,
    const int pooled_height,
    const int pooled_width,
    const int sampling_ratio,
    T* bottom_diff,
    const T* bottom_rois) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;

    const T* offset_bottom_rois = bottom_rois + n * 5;
    int roi_batch_ind = offset_bottom_rois[0];
    if (roi_batch_ind < 0) continue;

    // Do not use rounding; this implementation detail is critical
    T roi_start_w = offset_bottom_rois[1] * spatial_scale;
    T roi_start_h = offset_bottom_rois[2] * spatial_scale;
    T roi_end_w = offset_bottom_rois[3] * spatial_scale;
    T roi_end_h = offset_bottom_rois[4] * spatial_scale;
    // T roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
    // T roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
    // T roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
    // T roi_end_h = round(offset_bottom_rois[4] * spatial_scale);

    // Force malformed ROIs to be 1x1
    T roi_width = max(roi_end_w - roi_start_w, (T)1.);
    T roi_height = max(roi_end_h - roi_start_h, (T)1.);
    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

    int c_unpooled = c;
    int channels_unpooled = channels;
    if (position_sensitive) {
      c_unpooled = c * pooled_height * pooled_width + ph * pooled_width + pw;
      channels_unpooled = channels * pooled_height * pooled_width;
    }
    T* offset_bottom_diff =
        bottom_diff + (roi_batch_ind * channels_unpooled + c_unpooled)
                          * height * width;

    int top_offset = (n * channels + c) * pooled_height * pooled_width;
    const T* offset_top_diff = top_diff + top_offset;
    const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];

    // We use roi_bin_grid to sample the grid and mimic integral pooling
    int roi_bin_grid_h = (sampling_ratio > 0)
        ? sampling_ratio
        : ceil(roi_height / pooled_height);  // e.g., = 2
    int roi_bin_grid_w =
        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);

    // We do average (integral) pooling inside a bin
    const T count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. = 4

    for (int iy = 0; iy < roi_bin_grid_h; iy++) {  // e.g., iy = 0, 1
      const T y = roi_start_h + ph * bin_size_h +
          static_cast<T>(iy + .5f) * bin_size_h /
              static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5
      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
        const T x = roi_start_w + pw * bin_size_w +
            static_cast<T>(ix + .5f) * bin_size_w /
                static_cast<T>(roi_bin_grid_w);

        T w1, w2, w3, w4;
        int x_low, x_high, y_low, y_high;
        bilinear_interpolate_gradient(
            height,
            width,
            y,
            x,
            &w1,
            &w2,
            &w3,
            &w4,
            &x_low,
            &x_high,
            &y_low,
            &y_high,
            index);

        T g1 = top_diff_this_bin * w1 / count;
        T g2 = top_diff_this_bin * w2 / count;
        T g3 = top_diff_this_bin * w3 / count;
        T g4 = top_diff_this_bin * w4 / count;
        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
          atomicAdd(
              offset_bottom_diff + y_low * width + x_low, static_cast<T>(g1));
          atomicAdd(
              offset_bottom_diff + y_low * width + x_high, static_cast<T>(g2));
          atomicAdd(
              offset_bottom_diff + y_high * width + x_low, static_cast<T>(g3));
          atomicAdd(
              offset_bottom_diff + y_high * width + x_high, static_cast<T>(g4));
        }  // if
      }  // ix
    }  // iy
  }  // CUDA_KERNEL_LOOP
}  // RoIAlignBackwardKernel
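
/**
 * @brief GPU forward entry point. Expects in_data = {data, rois} and
 * out_data = {out}; launches RoIAlignForwardKernel with one thread per
 * element of the pooled output, using ROI_GET_BLOCKS(count) blocks of
 * kMaxThreadsPerBlock threads on the operator's CUDA stream.
 */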
template<typename xpu>
void ROIAlignForwardCompute(const nnvm::NodeAttrs& attrs,
                            const OpContext& ctx,
                            const std::vector<TBlob>& in_data,
                            const std::vector<OpReqType>& req,
                            const std::vector<TBlob>& out_data) {
  using namespace mshadow;
  size_t expected_in = 2;
  size_t expected_out = 1;
  CHECK_EQ(in_data.size(), expected_in);
  CHECK_EQ(out_data.size(), expected_out);
  CHECK_EQ(out_data[roialign::kOut].shape_[0], in_data[roialign::kBox].shape_[0]);

  const ROIAlignParam param = nnvm::get<ROIAlignParam>(attrs.parsed);

  const int count = out_data[roialign::kOut].Size();
  const int num_rois = in_data[roialign::kBox].size(0);
  const int channels = out_data[roialign::kOut].size(1);  // channels of pooled output
  const int height = in_data[roialign::kData].size(2);
  const int width = in_data[roialign::kData].size(3);
  const int pooled_height = out_data[roialign::kOut].size(2);
  const int pooled_width = out_data[roialign::kOut].size(3);
  Stream<gpu> *s = ctx.get_stream<gpu>();
  cudaStream_t stream = mshadow::Stream<gpu>::GetStream(s);

  MSHADOW_REAL_TYPE_SWITCH(in_data[0].type_flag_, DType, {
    const DType *bottom_data = in_data[roialign::kData].dptr<DType>();
    const DType *bottom_rois = in_data[roialign::kBox].dptr<DType>();
    DType *top_data = out_data[roialign::kOut].dptr<DType>();

    RoIAlignForwardKernel<DType>
        <<<ROI_GET_BLOCKS(count),
           kMaxThreadsPerBlock,
           0,
           stream>>>(
            count,
            bottom_data,
            param.spatial_scale,
            param.position_sensitive,
            channels,
            height,
            width,
            pooled_height,
            pooled_width,
            param.sample_ratio,
            bottom_rois,
            top_data);
  })
}
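
/**
 * @brief GPU backward entry point. Per the gradient registration, inputs[0]
 * is the gradient of the pooled output and inputs[1] holds the ROIs;
 * outputs[0] receives the gradient w.r.t. the input data and outputs[1] the
 * (always zero) gradient w.r.t. the ROIs.
 */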
template<typename xpu>
void ROIAlignBackwardCompute(const nnvm::NodeAttrs& attrs,
                             const OpContext& ctx,
                             const std::vector<TBlob>& inputs,
                             const std::vector<OpReqType>& req,
                             const std::vector<TBlob>& outputs) {
  using namespace mshadow;
  CHECK_EQ(inputs.size(), 2);
  CHECK_EQ(outputs.size(), 2);
  // the order here relates to the order in ROIAlignGrad
  std::vector<TBlob> out_grad(1, inputs[0]);
  std::vector<TBlob> in_data(1, inputs[1]);
  // std::vector<TBlob> out_data(1, inputs[2]);

  CHECK_EQ(out_grad[0].shape_[0], in_data[0].shape_[0]);
  CHECK_NE(req[0], kWriteInplace) <<
      "ROIAlign: Backward doesn't support kWriteInplace.";
  CHECK_NE(req[1], kWriteInplace) <<
      "ROIAlign: Backward doesn't support kWriteInplace.";

  const ROIAlignParam param = nnvm::get<ROIAlignParam>(attrs.parsed);

  const int count = out_grad[0].Size();
  const int num_rois = in_data[0].size(0);
  const int channels = out_grad[0].size(1);  // channels of pooled output
  const int height = outputs[0].size(2);
  const int width = outputs[0].size(3);
  const int pooled_height = out_grad[0].size(2);
  const int pooled_width = out_grad[0].size(3);
  Stream<gpu> *s = ctx.get_stream<gpu>();
  cudaStream_t stream = mshadow::Stream<gpu>::GetStream(s);

  // assume all the data and gradients have the same type
  MSHADOW_REAL_TYPE_SWITCH(out_grad[0].type_flag_, DType, {
    const DType *top_diff = out_grad[0].dptr<DType>();
    const DType *bottom_rois = in_data[0].dptr<DType>();
    DType *grad_in = outputs[0].dptr<DType>();

    if (kWriteTo == req[roialign::kBox]) {
      Fill<false>(s, outputs[1], kWriteTo, static_cast<DType>(0));
    }
    if (kNullOp == req[roialign::kData]) return;
    if (kWriteTo == req[roialign::kData]) {
      Fill<false>(s, outputs[0], kWriteTo, static_cast<DType>(0));
    }

    RoIAlignBackwardKernel<DType>
        <<<ROI_GET_BLOCKS(count),
           kMaxThreadsPerBlock,
           0,
           stream>>>(
            count,
            top_diff,
            num_rois,
            param.spatial_scale,
            param.position_sensitive,
            channels,
            height,
            width,
            pooled_height,
            pooled_width,
            param.sample_ratio,
            grad_in,
            bottom_rois);
  })
}
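
// A usage sketch (hypothetical shapes): the registrations below expose this
// code on GPU through the existing _contrib_ROIAlign operator, e.g. from the
// Python frontend as
//   out = mx.nd.contrib.ROIAlign(data, rois, pooled_size=(7, 7),
//                                spatial_scale=1.0 / 16, sample_ratio=2)
// where data is (batch, channels, H, W) and each row of rois is
// [batch_index, x1, y1, x2, y2] in original image coordinates.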
NNVM_REGISTER_OP(_contrib_ROIAlign)
.set_attr<FCompute>("FCompute<gpu>", ROIAlignForwardCompute<gpu>);
NNVM_REGISTER_OP(_backward_ROIAlign)
.set_attr<FCompute>("FCompute<gpu>", ROIAlignBackwardCompute<gpu>);
} // namespace op
} // namespace mxnet