src/operator/roi_pooling.cc - mxnet-test - Git at Google

 /*!
  * Copyright (c) 2015 by Contributors
  * \file roi_pooling.cc
  * \brief roi pooling operator
  * \author Ross Girshick, Kye-Hyeon Kim, Jian Guo
 */
 #include "./roi_pooling-inl.h"
 #include <mshadow/base.h>
 #include <mshadow/tensor.h>
 #include <mshadow/packet-inl.h>
 #include <mshadow/dot_engine-inl.h>
 #include <cassert>

 using std::max;
 using std::min;
 using std::floor;
 using std::ceil;

 namespace mshadow {
 /*!
  * \brief CPU forward pass of ROI max pooling.
  *
  * Each row of `bbox` is read as [batch_index, x1, y1, x2, y2]. The box corners are
  * multiplied by `spatial_scale_` and rounded, the box is divided into
  * out.size(2) x out.size(3) bins, and for every channel the maximum inside each bin
  * (clipped to the feature map) is stored in `out`. The flat position
  * (h * width + w) of that maximum is stored in `max_idx`.
  *
  * Bins that are empty after clipping receive value 0 and index -1. For non-empty
  * bins the value already present in `out` acts as the initial maximum, so `out`
  * and `max_idx` are expected to be pre-filled by the caller; nothing is reset here.
  *
  * \param out pooled maxima, written through out.dptr_ one channel plane after another
  * \param data input feature maps, indexed as (batch, channel, height, width)
  * \param bbox ROI table, one ROI per row
  * \param max_idx receives the argmax position of every pooled value
  * \param spatial_scale_ factor applied to ROI coordinates before rounding
  */
 template<typename Dtype>
 inline void ROIPoolForward(const Tensor<cpu, 4, Dtype> &out,
                            const Tensor<cpu, 4, Dtype> &data,
                            const Tensor<cpu, 2, Dtype> &bbox,
                            const Tensor<cpu, 4, Dtype> &max_idx,
                            const float spatial_scale_) {
   const Dtype *bottom_data = data.dptr_;
   const Dtype *bottom_rois = bbox.dptr_;
   Dtype *top_data = out.dptr_;
   Dtype *argmax_data = max_idx.dptr_;
   const int channels_ = data.size(1);
   const int height_ = data.size(2);
   const int width_ = data.size(3);
   const int pooled_height_ = out.size(2);
   const int pooled_width_ = out.size(3);

   const int num_rois = bbox.size(0);
   const int data_size = data.size(1) * data.size(2) * data.size(3);
   // For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R
   for (int n = 0; n < num_rois; ++n) {
     const int roi_batch_ind = static_cast<int>(bottom_rois[0]);
     const int roi_start_w = static_cast<int>(round(bottom_rois[1] * spatial_scale_));
     const int roi_start_h = static_cast<int>(round(bottom_rois[2] * spatial_scale_));
     const int roi_end_w = static_cast<int>(round(bottom_rois[3] * spatial_scale_));
     const int roi_end_h = static_cast<int>(round(bottom_rois[4] * spatial_scale_));
     assert(roi_batch_ind >= 0);
     // The batch size is read inline so that no local is left unused when
     // asserts are compiled out (NDEBUG).
     assert(roi_batch_ind < static_cast<int>(data.size(0)));

     // force malformed ROIs to be 1 * 1
     const int roi_height = max(roi_end_h - roi_start_h + 1, 1);
     const int roi_width = max(roi_end_w - roi_start_w + 1, 1);
     const Dtype bin_size_h = static_cast<Dtype>(roi_height)
                              / static_cast<Dtype>(pooled_height_);
     const Dtype bin_size_w = static_cast<Dtype>(roi_width)
                              / static_cast<Dtype>(pooled_width_);

     // First channel plane of the image this ROI refers to.
     const Dtype* batch_data = bottom_data + data_size * roi_batch_ind;

     for (int c = 0; c < channels_; ++c) {
       for (int ph = 0; ph < pooled_height_; ++ph) {
         // Row extent of the bin; it does not depend on pw, so compute it once per ph.
         //  start (included) = floor(ph * roi_height / pooled_height_)
         //  end (excluded) = ceil((ph + 1) * roi_height / pooled_height_)
         int hstart = static_cast<int>(floor(static_cast<Dtype>(ph)
                                             * bin_size_h));
         int hend = static_cast<int>(ceil(static_cast<Dtype>(ph + 1)
                                          * bin_size_h));
         // Shift into feature-map coordinates and clip to [0, height_].
         hstart = min(max(hstart + roi_start_h, 0), height_);
         hend = min(max(hend + roi_start_h, 0), height_);

         for (int pw = 0; pw < pooled_width_; ++pw) {
           int wstart = static_cast<int>(floor(static_cast<Dtype>(pw)
                                               * bin_size_w));
           int wend = static_cast<int>(ceil(static_cast<Dtype>(pw + 1)
                                            * bin_size_w));
           wstart = min(max(wstart + roi_start_w, 0), width_);
           wend = min(max(wend + roi_start_w, 0), width_);

           const int pool_index = ph * pooled_width_ + pw;
           const bool is_empty = (hend <= hstart) || (wend <= wstart);
           if (is_empty) {
             // Nothing to pool over: emit 0 and mark "no argmax".
             top_data[pool_index] = 0;
             argmax_data[pool_index] = -1;
             continue;
           }

           // Running maximum; starts from whatever top_data already holds.
           for (int h = hstart; h < hend; ++h) {
             for (int w = wstart; w < wend; ++w) {
               const int index = h * width_ + w;
               if (batch_data[index] > top_data[pool_index]) {
                 top_data[pool_index] = batch_data[index];
                 argmax_data[pool_index] = index;
               }
             }
           }
         }
       }
       // Increment all data pointers by one channel
       batch_data += data.size(2) * data.size(3);
       top_data += out.size(2) * out.size(3);
       argmax_data += max_idx.size(2) * max_idx.size(3);
     }
     // Increment ROI data pointer
     bottom_rois += bbox.size(1);
   }

   return;
 }

 /*!
  * \brief CPU backward pass of ROI max pooling; accumulates into `in_grad`.
  *
  * For every input element (b, c, h, w) the ROIs whose batch index equals b and
  * whose scaled box contains (h, w) are visited. Within such an ROI, the gradient
  * of every pooled cell whose recorded argmax in `max_idx` equals h * width + w is
  * summed, and the total is added (+=) to the matching element of `in_grad`.
  *
  * \param in_grad gradient w.r.t. the input feature maps; accumulated into, not overwritten
  * \param out_grad gradient w.r.t. the pooled output
  * \param bbox ROI table, one [batch_index, x1, y1, x2, y2] row per ROI
  * \param max_idx argmax positions to compare against h * width + w
  * \param spatial_scale_ factor applied to ROI coordinates before rounding
  */
 template<typename Dtype>
 inline void ROIPoolBackwardAcc(const Tensor<cpu, 4, Dtype> &in_grad,
                                const Tensor<cpu, 4, Dtype> &out_grad,
                                const Tensor<cpu, 2, Dtype> &bbox,
                                const Tensor<cpu, 4, Dtype> &max_idx,
                                const float spatial_scale_) {
   const Dtype *top_diff = out_grad.dptr_;
   const Dtype *bottom_rois = bbox.dptr_;
   Dtype *bottom_diff = in_grad.dptr_;
   const Dtype *argmax_data = max_idx.dptr_;

   const int batch_size_ = in_grad.size(0);
   const int channels_ = in_grad.size(1);
   const int height_ = in_grad.size(2);
   const int width_ = in_grad.size(3);
   const int pooled_height_ = out_grad.size(2);
   const int pooled_width_ = out_grad.size(3);

   const int num_rois = bbox.size(0);
   // Row stride of the ROI table, taken from the tensor (as the forward pass does)
   // instead of a hard-coded 5.
   const int roi_stride = bbox.size(1);

   for (int b = 0; b < batch_size_; ++b) {
     for (int c = 0; c < channels_; ++c) {
       for (int h = 0; h < height_; ++h) {
         for (int w = 0; w < width_; ++w) {
           // Flat position of (h, w) inside one channel plane; this is the value
           // stored in max_idx for a pooled cell whose maximum came from here.
           const int pixel_index = h * width_ + w;
           const int offset_bottom_diff =
               (b * channels_ + c) * height_ * width_ + pixel_index;

           Dtype gradient = 0;
           // Accumulate gradient over all ROIs that pooled this element
           for (int roi_n = 0; roi_n < num_rois; ++roi_n) {
             const Dtype* offset_bottom_rois = bottom_rois + roi_n * roi_stride;
             const int roi_batch_ind = static_cast<int>(offset_bottom_rois[0]);
             assert(roi_batch_ind >= 0);
             assert(roi_batch_ind < batch_size_);
             if (b != roi_batch_ind) {
               continue;
             }

             const int roi_start_w =
                 static_cast<int>(round(offset_bottom_rois[1] * spatial_scale_));
             const int roi_start_h =
                 static_cast<int>(round(offset_bottom_rois[2] * spatial_scale_));
             const int roi_end_w =
                 static_cast<int>(round(offset_bottom_rois[3] * spatial_scale_));
             const int roi_end_h =
                 static_cast<int>(round(offset_bottom_rois[4] * spatial_scale_));

             // force malformed ROIs to be 1 * 1
             const int roi_height = max(roi_end_h - roi_start_h + 1, 1);
             const int roi_width = max(roi_end_w - roi_start_w + 1, 1);

             // Membership is tested against the clamped extent. Testing against
             // the raw end coordinate rejected every pixel of a malformed ROI
             // (end < start), although the forward pass pools such an ROI as 1 * 1.
             // NOTE(review): any device kernel mirroring this logic should be
             // checked for the same condition.
             const bool in_roi = (w >= roi_start_w && w < roi_start_w + roi_width &&
                                  h >= roi_start_h && h < roi_start_h + roi_height);
             if (!in_roi) {
               continue;
             }

             const Dtype bin_size_h = static_cast<Dtype>(roi_height)
                                      / static_cast<Dtype>(pooled_height_);
             const Dtype bin_size_w = static_cast<Dtype>(roi_width)
                                      / static_cast<Dtype>(pooled_width_);

             // compute pooled regions correspond to original (h, w) point
             int phstart = static_cast<int>(floor(static_cast<Dtype>(h - roi_start_h)
                                                  / bin_size_h));
             int pwstart = static_cast<int>(floor(static_cast<Dtype>(w - roi_start_w)
                                                  / bin_size_w));
             int phend = static_cast<int>(ceil(static_cast<Dtype>(h - roi_start_h + 1)
                                               / bin_size_h));
             int pwend = static_cast<int>(ceil(static_cast<Dtype>(w - roi_start_w + 1)
                                               / bin_size_w));

             // clip to boundaries of pooled region
             phstart = min(max(phstart, 0), pooled_height_);
             phend = min(max(phend, 0), pooled_height_);
             pwstart = min(max(pwstart, 0), pooled_width_);
             pwend = min(max(pwend, 0), pooled_width_);

             // accumulate over gradients in pooled regions
             const int offset = (roi_n * channels_ + c) * pooled_height_ * pooled_width_;
             const Dtype* offset_top_diff = top_diff + offset;
             const Dtype* offset_argmax_data = argmax_data + offset;
             for (int ph = phstart; ph < phend; ++ph) {
               for (int pw = pwstart; pw < pwend; ++pw) {
                 const int pooled_index = ph * pooled_width_ + pw;
                 // Only the cell whose maximum came from this pixel passes gradient.
                 if (static_cast<int>(offset_argmax_data[pooled_index]) == pixel_index) {
                   gradient += offset_top_diff[pooled_index];
                 }
               }
             }
           }
           bottom_diff[offset_bottom_diff] += gradient;
         }
       }
     }
   }

   return;
 }
 }  // namespace mshadow

 namespace mxnet {
 namespace op {

 /*!
  * \brief CPU specialization of the operator factory.
  *
  * Allocates a ROIPoolingOp<cpu, DType> with `param`, where DType is bound by
  * MSHADOW_REAL_TYPE_SWITCH from the runtime `dtype` code. The returned pointer
  * stays null if the switch body never assigns it.
  */
 template<>
 Operator *CreateOp<cpu>(ROIPoolingParam param, int dtype) {
   Operator *created = nullptr;
   MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
     created = new ROIPoolingOp<cpu, DType>(param);
   });
   return created;
 }

 /*!
  * \brief Runs type and shape inference on the inputs, then dispatches to CreateOp.
  *
  * Both inference calls are wrapped in CHECK, so a false return from either one
  * fails the check. The inferred output/aux vectors are local scratch and are not
  * used afterwards. CreateOp is invoked through DO_BIND_DISPATCH with `param_`
  * and the first entry of `in_type`.
  */
 Operator *ROIPoolingProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
                                            std::vector<int> *in_type) const {
   std::vector<int> out_type;
   std::vector<int> aux_type;
   CHECK(InferType(in_type, &out_type, &aux_type));

   std::vector<TShape> out_shape;
   std::vector<TShape> aux_shape;
   CHECK(InferShape(in_shape, &out_shape, &aux_shape));

   // NOTE(review): the macro presumably selects the device from `ctx` and returns
   // the created operator; confirm against its definition.
   DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0));
 }

 // Registers the parameter struct, then the `ROIPooling` operator property with
 // its user-facing description and the two tensor arguments (`data`, `rois`).
 DMLC_REGISTER_PARAMETER(ROIPoolingParam);

 MXNET_REGISTER_OP_PROPERTY(ROIPooling, ROIPoolingProp)
 .describe(R"code(Performs region of interest(ROI) pooling on the input array.

 ROI pooling is a variant of a max pooling layer, in which the output size is fixed and
 region of interest is a parameter. Its purpose is to perform max pooling on the inputs
 of non-uniform sizes to obtain fixed-size feature maps. ROI pooling is a neural-net
 layer mostly used in training a `Fast R-CNN` network for object detection.

 This operator takes a 4D feature map as an input array and region proposals as `rois`,
 then it pools over sub-regions of input and produces a fixed-sized output array
 regardless of the ROI size.

 To crop the feature map accordingly, you can resize the bounding box coordinates
 by changing the parameters `rois` and `spatial_scale`.

 The cropped feature maps are pooled by standard max pooling operation to a fixed size output
 indicated by a `pooled_size` parameter. batch_size will change to the number of region
 bounding boxes after `ROIPooling`.

 The size of each region of interest doesn't have to be perfectly divisible by
 the number of pooling sections(`pooled_size`).

 Example::

   x = [[[[  0.,   1.,   2.,   3.,   4.,   5.],
          [  6.,   7.,   8.,   9.,  10.,  11.],
          [ 12.,  13.,  14.,  15.,  16.,  17.],
          [ 18.,  19.,  20.,  21.,  22.,  23.],
          [ 24.,  25.,  26.,  27.,  28.,  29.],
          [ 30.,  31.,  32.,  33.,  34.,  35.],
          [ 36.,  37.,  38.,  39.,  40.,  41.],
          [ 42.,  43.,  44.,  45.,  46.,  47.]]]]

   // region of interest i.e. bounding box coordinates.
   y = [[0,0,0,4,4]]

   // returns array of shape (2,2) according to the given roi with max pooling.
   ROIPooling(x, y, (2,2), 1.0) = [[[[ 14.,  16.],
                                     [ 26.,  28.]]]]

   // region of interest is changed due to the change in `spatial_scale` parameter.
   ROIPooling(x, y, (2,2), 0.7) = [[[[  7.,   9.],
                                     [ 19.,  21.]]]]

 )code" ADD_FILELINE)
 .add_argument("data", "NDArray-or-Symbol", "The input array to the pooling operator, "
                                             " a 4D Feature maps ")
 .add_argument("rois", "NDArray-or-Symbol", "Bounding box coordinates, a 2D array of "
 "[[batch_index, x1, y1, x2, y2]], where (x1, y1) and (x2, y2) are top left and bottom right "
 "corners of designated region of interest. `batch_index` indicates the index of corresponding "
 "image in the input array")
 .add_arguments(ROIPoolingParam::__FIELDS__());
 }  // namespace op
 }  // namespace mxnet
	/*!
	* Copyright (c) 2015 by Contributors
	* \file roi_pooling.cc
	* \brief roi pooling operator
	* \author Ross Girshick, Kye-Hyeon Kim, Jian Guo
	*/
	#include "./roi_pooling-inl.h"
	#include <mshadow/base.h>
	#include <mshadow/tensor.h>
	#include <mshadow/packet-inl.h>
	#include <mshadow/dot_engine-inl.h>
	#include <cassert>

	using std::max;
	using std::min;
	using std::floor;
	using std::ceil;

	namespace mshadow {
	/*!
	 * \brief CPU forward pass of ROI max pooling.
	 *
	 * Each row of `bbox` is read as [batch_index, x1, y1, x2, y2]. The box corners are
	 * multiplied by `spatial_scale_` and rounded, the box is divided into
	 * out.size(2) x out.size(3) bins, and for every channel the maximum inside each bin
	 * (clipped to the feature map) is stored in `out`. The flat position
	 * (h * width + w) of that maximum is stored in `max_idx`.
	 *
	 * Bins that are empty after clipping receive value 0 and index -1. For non-empty
	 * bins the value already present in `out` acts as the initial maximum, so `out`
	 * and `max_idx` are expected to be pre-filled by the caller; nothing is reset here.
	 *
	 * \param out pooled maxima, written through out.dptr_ one channel plane after another
	 * \param data input feature maps, indexed as (batch, channel, height, width)
	 * \param bbox ROI table, one ROI per row
	 * \param max_idx receives the argmax position of every pooled value
	 * \param spatial_scale_ factor applied to ROI coordinates before rounding
	 */
	template<typename Dtype>
	inline void ROIPoolForward(const Tensor<cpu, 4, Dtype> &out,
	                           const Tensor<cpu, 4, Dtype> &data,
	                           const Tensor<cpu, 2, Dtype> &bbox,
	                           const Tensor<cpu, 4, Dtype> &max_idx,
	                           const float spatial_scale_) {
	  const Dtype *bottom_data = data.dptr_;
	  const Dtype *bottom_rois = bbox.dptr_;
	  Dtype *top_data = out.dptr_;
	  Dtype *argmax_data = max_idx.dptr_;
	  const int channels_ = data.size(1);
	  const int height_ = data.size(2);
	  const int width_ = data.size(3);
	  const int pooled_height_ = out.size(2);
	  const int pooled_width_ = out.size(3);

	  const int num_rois = bbox.size(0);
	  const int data_size = data.size(1) * data.size(2) * data.size(3);
	  // For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R
	  for (int n = 0; n < num_rois; ++n) {
	    const int roi_batch_ind = static_cast<int>(bottom_rois[0]);
	    const int roi_start_w = static_cast<int>(round(bottom_rois[1] * spatial_scale_));
	    const int roi_start_h = static_cast<int>(round(bottom_rois[2] * spatial_scale_));
	    const int roi_end_w = static_cast<int>(round(bottom_rois[3] * spatial_scale_));
	    const int roi_end_h = static_cast<int>(round(bottom_rois[4] * spatial_scale_));
	    assert(roi_batch_ind >= 0);
	    // The batch size is read inline so that no local is left unused when
	    // asserts are compiled out (NDEBUG).
	    assert(roi_batch_ind < static_cast<int>(data.size(0)));

	    // force malformed ROIs to be 1 * 1
	    const int roi_height = max(roi_end_h - roi_start_h + 1, 1);
	    const int roi_width = max(roi_end_w - roi_start_w + 1, 1);
	    const Dtype bin_size_h = static_cast<Dtype>(roi_height)
	                             / static_cast<Dtype>(pooled_height_);
	    const Dtype bin_size_w = static_cast<Dtype>(roi_width)
	                             / static_cast<Dtype>(pooled_width_);

	    // First channel plane of the image this ROI refers to.
	    const Dtype* batch_data = bottom_data + data_size * roi_batch_ind;

	    for (int c = 0; c < channels_; ++c) {
	      for (int ph = 0; ph < pooled_height_; ++ph) {
	        // Row extent of the bin; it does not depend on pw, so compute it once per ph.
	        //  start (included) = floor(ph * roi_height / pooled_height_)
	        //  end (excluded) = ceil((ph + 1) * roi_height / pooled_height_)
	        int hstart = static_cast<int>(floor(static_cast<Dtype>(ph)
	                                            * bin_size_h));
	        int hend = static_cast<int>(ceil(static_cast<Dtype>(ph + 1)
	                                         * bin_size_h));
	        // Shift into feature-map coordinates and clip to [0, height_].
	        hstart = min(max(hstart + roi_start_h, 0), height_);
	        hend = min(max(hend + roi_start_h, 0), height_);

	        for (int pw = 0; pw < pooled_width_; ++pw) {
	          int wstart = static_cast<int>(floor(static_cast<Dtype>(pw)
	                                              * bin_size_w));
	          int wend = static_cast<int>(ceil(static_cast<Dtype>(pw + 1)
	                                           * bin_size_w));
	          wstart = min(max(wstart + roi_start_w, 0), width_);
	          wend = min(max(wend + roi_start_w, 0), width_);

	          const int pool_index = ph * pooled_width_ + pw;
	          const bool is_empty = (hend <= hstart) || (wend <= wstart);
	          if (is_empty) {
	            // Nothing to pool over: emit 0 and mark "no argmax".
	            top_data[pool_index] = 0;
	            argmax_data[pool_index] = -1;
	            continue;
	          }

	          // Running maximum; starts from whatever top_data already holds.
	          for (int h = hstart; h < hend; ++h) {
	            for (int w = wstart; w < wend; ++w) {
	              const int index = h * width_ + w;
	              if (batch_data[index] > top_data[pool_index]) {
	                top_data[pool_index] = batch_data[index];
	                argmax_data[pool_index] = index;
	              }
	            }
	          }
	        }
	      }
	      // Increment all data pointers by one channel
	      batch_data += data.size(2) * data.size(3);
	      top_data += out.size(2) * out.size(3);
	      argmax_data += max_idx.size(2) * max_idx.size(3);
	    }
	    // Increment ROI data pointer
	    bottom_rois += bbox.size(1);
	  }

	  return;
	}

	/*!
	 * \brief CPU backward pass of ROI max pooling; accumulates into `in_grad`.
	 *
	 * For every input element (b, c, h, w) the ROIs whose batch index equals b and
	 * whose scaled box contains (h, w) are visited. Within such an ROI, the gradient
	 * of every pooled cell whose recorded argmax in `max_idx` equals h * width + w is
	 * summed, and the total is added (+=) to the matching element of `in_grad`.
	 *
	 * \param in_grad gradient w.r.t. the input feature maps; accumulated into, not overwritten
	 * \param out_grad gradient w.r.t. the pooled output
	 * \param bbox ROI table, one [batch_index, x1, y1, x2, y2] row per ROI
	 * \param max_idx argmax positions to compare against h * width + w
	 * \param spatial_scale_ factor applied to ROI coordinates before rounding
	 */
	template<typename Dtype>
	inline void ROIPoolBackwardAcc(const Tensor<cpu, 4, Dtype> &in_grad,
	                               const Tensor<cpu, 4, Dtype> &out_grad,
	                               const Tensor<cpu, 2, Dtype> &bbox,
	                               const Tensor<cpu, 4, Dtype> &max_idx,
	                               const float spatial_scale_) {
	  const Dtype *top_diff = out_grad.dptr_;
	  const Dtype *bottom_rois = bbox.dptr_;
	  Dtype *bottom_diff = in_grad.dptr_;
	  const Dtype *argmax_data = max_idx.dptr_;

	  const int batch_size_ = in_grad.size(0);
	  const int channels_ = in_grad.size(1);
	  const int height_ = in_grad.size(2);
	  const int width_ = in_grad.size(3);
	  const int pooled_height_ = out_grad.size(2);
	  const int pooled_width_ = out_grad.size(3);

	  const int num_rois = bbox.size(0);
	  // Row stride of the ROI table, taken from the tensor (as the forward pass does)
	  // instead of a hard-coded 5.
	  const int roi_stride = bbox.size(1);

	  for (int b = 0; b < batch_size_; ++b) {
	    for (int c = 0; c < channels_; ++c) {
	      for (int h = 0; h < height_; ++h) {
	        for (int w = 0; w < width_; ++w) {
	          // Flat position of (h, w) inside one channel plane; this is the value
	          // stored in max_idx for a pooled cell whose maximum came from here.
	          const int pixel_index = h * width_ + w;
	          const int offset_bottom_diff =
	              (b * channels_ + c) * height_ * width_ + pixel_index;

	          Dtype gradient = 0;
	          // Accumulate gradient over all ROIs that pooled this element
	          for (int roi_n = 0; roi_n < num_rois; ++roi_n) {
	            const Dtype* offset_bottom_rois = bottom_rois + roi_n * roi_stride;
	            const int roi_batch_ind = static_cast<int>(offset_bottom_rois[0]);
	            assert(roi_batch_ind >= 0);
	            assert(roi_batch_ind < batch_size_);
	            if (b != roi_batch_ind) {
	              continue;
	            }

	            const int roi_start_w =
	                static_cast<int>(round(offset_bottom_rois[1] * spatial_scale_));
	            const int roi_start_h =
	                static_cast<int>(round(offset_bottom_rois[2] * spatial_scale_));
	            const int roi_end_w =
	                static_cast<int>(round(offset_bottom_rois[3] * spatial_scale_));
	            const int roi_end_h =
	                static_cast<int>(round(offset_bottom_rois[4] * spatial_scale_));

	            // force malformed ROIs to be 1 * 1
	            const int roi_height = max(roi_end_h - roi_start_h + 1, 1);
	            const int roi_width = max(roi_end_w - roi_start_w + 1, 1);

	            // Membership is tested against the clamped extent. Testing against
	            // the raw end coordinate rejected every pixel of a malformed ROI
	            // (end < start), although the forward pass pools such an ROI as 1 * 1.
	            // NOTE(review): any device kernel mirroring this logic should be
	            // checked for the same condition.
	            const bool in_roi = (w >= roi_start_w && w < roi_start_w + roi_width &&
	                                 h >= roi_start_h && h < roi_start_h + roi_height);
	            if (!in_roi) {
	              continue;
	            }

	            const Dtype bin_size_h = static_cast<Dtype>(roi_height)
	                                     / static_cast<Dtype>(pooled_height_);
	            const Dtype bin_size_w = static_cast<Dtype>(roi_width)
	                                     / static_cast<Dtype>(pooled_width_);

	            // compute pooled regions correspond to original (h, w) point
	            int phstart = static_cast<int>(floor(static_cast<Dtype>(h - roi_start_h)
	                                                 / bin_size_h));
	            int pwstart = static_cast<int>(floor(static_cast<Dtype>(w - roi_start_w)
	                                                 / bin_size_w));
	            int phend = static_cast<int>(ceil(static_cast<Dtype>(h - roi_start_h + 1)
	                                              / bin_size_h));
	            int pwend = static_cast<int>(ceil(static_cast<Dtype>(w - roi_start_w + 1)
	                                              / bin_size_w));

	            // clip to boundaries of pooled region
	            phstart = min(max(phstart, 0), pooled_height_);
	            phend = min(max(phend, 0), pooled_height_);
	            pwstart = min(max(pwstart, 0), pooled_width_);
	            pwend = min(max(pwend, 0), pooled_width_);

	            // accumulate over gradients in pooled regions
	            const int offset = (roi_n * channels_ + c) * pooled_height_ * pooled_width_;
	            const Dtype* offset_top_diff = top_diff + offset;
	            const Dtype* offset_argmax_data = argmax_data + offset;
	            for (int ph = phstart; ph < phend; ++ph) {
	              for (int pw = pwstart; pw < pwend; ++pw) {
	                const int pooled_index = ph * pooled_width_ + pw;
	                // Only the cell whose maximum came from this pixel passes gradient.
	                if (static_cast<int>(offset_argmax_data[pooled_index]) == pixel_index) {
	                  gradient += offset_top_diff[pooled_index];
	                }
	              }
	            }
	          }
	          bottom_diff[offset_bottom_diff] += gradient;
	        }
	      }
	    }
	  }

	  return;
	}
	} // namespace mshadow

	namespace mxnet {
	namespace op {

	/*!
	 * \brief CPU specialization of the operator factory.
	 *
	 * Allocates a ROIPoolingOp<cpu, DType> with `param`, where DType is bound by
	 * MSHADOW_REAL_TYPE_SWITCH from the runtime `dtype` code. The returned pointer
	 * stays null if the switch body never assigns it.
	 */
	template<>
	Operator *CreateOp<cpu>(ROIPoolingParam param, int dtype) {
	  Operator *created = nullptr;
	  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
	    created = new ROIPoolingOp<cpu, DType>(param);
	  });
	  return created;
	}

	/*!
	 * \brief Runs type and shape inference on the inputs, then dispatches to CreateOp.
	 *
	 * Both inference calls are wrapped in CHECK, so a false return from either one
	 * fails the check. The inferred output/aux vectors are local scratch and are not
	 * used afterwards. CreateOp is invoked through DO_BIND_DISPATCH with `param_`
	 * and the first entry of `in_type`.
	 *
	 * The function returns a pointer and takes `in_shape` by pointer: `in_shape` is
	 * passed to InferShape next to the pointer arguments `&out_shape`/`&aux_shape`,
	 * and CreateOp yields an `Operator *`.
	 */
	Operator *ROIPoolingProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
	                                           std::vector<int> *in_type) const {
	  std::vector<TShape> out_shape, aux_shape;
	  std::vector<int> out_type, aux_type;
	  CHECK(InferType(in_type, &out_type, &aux_type));
	  CHECK(InferShape(in_shape, &out_shape, &aux_shape));
	  // NOTE(review): the macro presumably selects the device from `ctx` and returns
	  // the created operator; confirm against its definition.
	  DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0));
	}

	// Registers the parameter struct, then the `ROIPooling` operator property with
	// its user-facing description and the two tensor arguments (`data`, `rois`).
	DMLC_REGISTER_PARAMETER(ROIPoolingParam);

	MXNET_REGISTER_OP_PROPERTY(ROIPooling, ROIPoolingProp)
	.describe(R"code(Performs region of interest(ROI) pooling on the input array.

	ROI pooling is a variant of a max pooling layer, in which the output size is fixed and
	region of interest is a parameter. Its purpose is to perform max pooling on the inputs
	of non-uniform sizes to obtain fixed-size feature maps. ROI pooling is a neural-net
	layer mostly used in training a `Fast R-CNN` network for object detection.

	This operator takes a 4D feature map as an input array and region proposals as `rois`,
	then it pools over sub-regions of input and produces a fixed-sized output array
	regardless of the ROI size.

	To crop the feature map accordingly, you can resize the bounding box coordinates
	by changing the parameters `rois` and `spatial_scale`.

	The cropped feature maps are pooled by standard max pooling operation to a fixed size output
	indicated by a `pooled_size` parameter. batch_size will change to the number of region
	bounding boxes after `ROIPooling`.

	The size of each region of interest doesn't have to be perfectly divisible by
	the number of pooling sections(`pooled_size`).

	Example::

	  x = [[[[  0.,   1.,   2.,   3.,   4.,   5.],
	         [  6.,   7.,   8.,   9.,  10.,  11.],
	         [ 12.,  13.,  14.,  15.,  16.,  17.],
	         [ 18.,  19.,  20.,  21.,  22.,  23.],
	         [ 24.,  25.,  26.,  27.,  28.,  29.],
	         [ 30.,  31.,  32.,  33.,  34.,  35.],
	         [ 36.,  37.,  38.,  39.,  40.,  41.],
	         [ 42.,  43.,  44.,  45.,  46.,  47.]]]]

	  // region of interest i.e. bounding box coordinates.
	  y = [[0,0,0,4,4]]

	  // returns array of shape (2,2) according to the given roi with max pooling.
	  ROIPooling(x, y, (2,2), 1.0) = [[[[ 14.,  16.],
	                                    [ 26.,  28.]]]]

	  // region of interest is changed due to the change in `spatial_scale` parameter.
	  ROIPooling(x, y, (2,2), 0.7) = [[[[  7.,   9.],
	                                    [ 19.,  21.]]]]

	)code" ADD_FILELINE)
	.add_argument("data", "NDArray-or-Symbol", "The input array to the pooling operator, "
	                                            " a 4D Feature maps ")
	.add_argument("rois", "NDArray-or-Symbol", "Bounding box coordinates, a 2D array of "
	"[[batch_index, x1, y1, x2, y2]], where (x1, y1) and (x2, y2) are top left and bottom right "
	"corners of designated region of interest. `batch_index` indicates the index of corresponding "
	"image in the input array")
	.add_arguments(ROIPoolingParam::__FIELDS__());
	} // namespace op
	} // namespace mxnet