src/operator/contrib/multi_proposal.cc - mxnet - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 /*!
  * Copyright (c) 2017 Microsoft
  * Licensed under The Apache-2.0 License [see LICENSE for details]
  * \file multi_proposal.cc
  * \brief
  * \author Xizhou Zhu, Kan Wu
 */

 #include "./multi_proposal-inl.h"

 //============================
 // Bounding Box Transform Utils
 //============================
 namespace mxnet {
 namespace op {
 namespace utils {

 // bbox prediction and clip to the image borders
 inline void BBoxTransformInv(const mshadow::Tensor<cpu, 2>& boxes,
                              const mshadow::Tensor<cpu, 3>& deltas,
                              const float im_height,
                              const float im_width,
                              const int real_height,
                              const int real_width,
                              mshadow::Tensor<cpu, 2> *out_pred_boxes) {
   CHECK_GE(boxes.size(1), 4);
   CHECK_GE(out_pred_boxes->size(1), 4);
   int anchors = deltas.size(0) / 4;
   int heights = deltas.size(1);
   int widths = deltas.size(2);

   #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
   for (int index = 0; index < anchors * heights * widths; ++index) {
     // index_t index = h * (widths * anchors) + w * (anchors) + a;
     int a = index % anchors;
     int w = (index / anchors) % widths;
     int h = index / (widths * anchors);

     float width = boxes[index][2] - boxes[index][0] + 1.0;
     float height = boxes[index][3] - boxes[index][1] + 1.0;
     float ctr_x = boxes[index][0] + 0.5 * (width - 1.0);
     float ctr_y = boxes[index][1] + 0.5 * (height - 1.0);

     float dx = deltas[a*4 + 0][h][w];
     float dy = deltas[a*4 + 1][h][w];
     float dw = deltas[a*4 + 2][h][w];
     float dh = deltas[a*4 + 3][h][w];

     float pred_ctr_x = dx * width + ctr_x;
     float pred_ctr_y = dy * height + ctr_y;
     float pred_w = std::exp(dw) * width;
     float pred_h = std::exp(dh) * height;

     float pred_x1 = pred_ctr_x - 0.5 * (pred_w - 1.0);
     float pred_y1 = pred_ctr_y - 0.5 * (pred_h - 1.0);
     float pred_x2 = pred_ctr_x + 0.5 * (pred_w - 1.0);
     float pred_y2 = pred_ctr_y + 0.5 * (pred_h - 1.0);

     pred_x1 = std::max(std::min(pred_x1, im_width - 1.0f), 0.0f);
     pred_y1 = std::max(std::min(pred_y1, im_height - 1.0f), 0.0f);
     pred_x2 = std::max(std::min(pred_x2, im_width - 1.0f), 0.0f);
     pred_y2 = std::max(std::min(pred_y2, im_height - 1.0f), 0.0f);

     (*out_pred_boxes)[index][0] = pred_x1;
     (*out_pred_boxes)[index][1] = pred_y1;
     (*out_pred_boxes)[index][2] = pred_x2;
     (*out_pred_boxes)[index][3] = pred_y2;

     if (h >= real_height || w >= real_width) {
       (*out_pred_boxes)[index][4] = -1.0;
     }
   }
 }

 // iou prediction and clip to the image border
 inline void IoUTransformInv(const mshadow::Tensor<cpu, 2>& boxes,
                             const mshadow::Tensor<cpu, 3>& deltas,
                             const float im_height,
                             const float im_width,
                             const int real_height,
                             const int real_width,
                             mshadow::Tensor<cpu, 2> *out_pred_boxes) {
   CHECK_GE(boxes.size(1), 4);
   CHECK_GE(out_pred_boxes->size(1), 4);
   int anchors = deltas.size(0) / 4;
   int heights = deltas.size(1);
   int widths = deltas.size(2);

   #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
   for (int index = 0; index < anchors * heights * widths; ++index) {
     // index_t index = h * (widths * anchors) + w * (anchors) + a;
     int a = index % anchors;
     int w = (index / anchors) % widths;
     int h = index / (widths * anchors);

     float x1 = boxes[index][0];
     float y1 = boxes[index][1];
     float x2 = boxes[index][2];
     float y2 = boxes[index][3];

     float dx1 = deltas[a * 4 + 0][h][w];
     float dy1 = deltas[a * 4 + 1][h][w];
     float dx2 = deltas[a * 4 + 2][h][w];
     float dy2 = deltas[a * 4 + 3][h][w];

     float pred_x1 = x1 + dx1;
     float pred_y1 = y1 + dy1;
     float pred_x2 = x2 + dx2;
     float pred_y2 = y2 + dy2;

     pred_x1 = std::max(std::min(pred_x1, im_width - 1.0f), 0.0f);
     pred_y1 = std::max(std::min(pred_y1, im_height - 1.0f), 0.0f);
     pred_x2 = std::max(std::min(pred_x2, im_width - 1.0f), 0.0f);
     pred_y2 = std::max(std::min(pred_y2, im_height - 1.0f), 0.0f);

     (*out_pred_boxes)[index][0] = pred_x1;
     (*out_pred_boxes)[index][1] = pred_y1;
     (*out_pred_boxes)[index][2] = pred_x2;
     (*out_pred_boxes)[index][3] = pred_y2;

     if (h >= real_height || w >= real_width) {
       (*out_pred_boxes)[index][4] = -1.0f;
     }
   }
 }

 // filter box by set confidence to zero
 // * height or width < rpn_min_size
 inline void FilterBox(mshadow::Tensor<cpu, 2> *dets,
                       const float min_size) {
   #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
   for (int i = 0; i < static_cast<int>(dets->size(0)); ++i) {
     float iw = (*dets)[i][2] - (*dets)[i][0] + 1.0f;
     float ih = (*dets)[i][3] - (*dets)[i][1] + 1.0f;
     if (iw < min_size || ih < min_size) {
       (*dets)[i][0] -= min_size / 2;
       (*dets)[i][1] -= min_size / 2;
       (*dets)[i][2] += min_size / 2;
       (*dets)[i][3] += min_size / 2;
       (*dets)[i][4] = -1.0f;
     }
   }
 }

 }  // namespace utils
 }  // namespace op
 }  // namespace mxnet

 //=====================
 // NMS Utils
 //=====================
 namespace mxnet {
 namespace op {
 namespace utils {

 struct ReverseArgsortCompl {
   const float *val_;
   explicit ReverseArgsortCompl(float *val)
     : val_(val) {}
   bool operator() (float i, float j) {
     return (val_[static_cast<index_t>(i)] >
             val_[static_cast<index_t>(j)]);
   }
 };

 // copy score and init order
 inline void CopyScore(const mshadow::Tensor<cpu, 2>& dets,
                       mshadow::Tensor<cpu, 1> *score,
                       mshadow::Tensor<cpu, 1> *order) {
   #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
   for (int i = 0; i < static_cast<int>(dets.size(0)); ++i) {
     (*score)[i] = dets[i][4];
     (*order)[i] = i;
   }
 }

 // sort order array according to score
 inline void ReverseArgsort(const mshadow::Tensor<cpu, 1>& score,
                            mshadow::Tensor<cpu, 1> *order) {
   ReverseArgsortCompl cmpl(score.dptr_);
   std::stable_sort(order->dptr_, order->dptr_ + score.size(0), cmpl);
 }

 // reorder proposals according to order and keep the pre_nms_top_n proposals
 // dets.size(0) == pre_nms_top_n
 inline void ReorderProposals(const mshadow::Tensor<cpu, 2>& prev_dets,
                              const mshadow::Tensor<cpu, 1>& order,
                              const index_t pre_nms_top_n,
                              mshadow::Tensor<cpu, 2> *dets) {
   CHECK_EQ(dets->size(0), pre_nms_top_n);
   const int dets_size0 = static_cast<int>(dets->size(0));
   const int dets_size1 = static_cast<int>(dets->size(1));
   #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
   for (int k = 0; k < dets_size0 * dets_size1; ++k) {
     int i = k / dets_size1;
     int j = k % dets_size1;
     const index_t index = order[i];
     (*dets)[i][j] = prev_dets[index][j];
   }
 }

 // greedily keep the max detections (already sorted)
 inline void NonMaximumSuppression(const mshadow::Tensor<cpu, 2>& dets,
                                   const float thresh,
                                   const index_t post_nms_top_n,
                                   mshadow::Tensor<cpu, 1> *area,
                                   mshadow::Tensor<cpu, 1> *suppressed,
                                   mshadow::Tensor<cpu, 1> *keep,
                                   int *out_size) {
   CHECK_EQ(dets.shape_[1], 5) << "dets: [x1, y1, x2, y2, score]";
   CHECK_GT(dets.shape_[0], 0);
   CHECK_EQ(dets.CheckContiguous(), true);
   CHECK_EQ(area->CheckContiguous(), true);
   CHECK_EQ(suppressed->CheckContiguous(), true);
   CHECK_EQ(keep->CheckContiguous(), true);
   // calculate area
   #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
   for (int i = 0; i < static_cast<int>(dets.size(0)); ++i) {
     (*area)[i] = (dets[i][2] - dets[i][0] + 1) *
                  (dets[i][3] - dets[i][1] + 1);
   }

   // calculate nms
   *out_size = 0;
   for (index_t i = 0; i < dets.size(0) && (*out_size) < static_cast<int>(post_nms_top_n); ++i) {
     float ix1 = dets[i][0];
     float iy1 = dets[i][1];
     float ix2 = dets[i][2];
     float iy2 = dets[i][3];
     float iarea = (*area)[i];

     if ((*suppressed)[i] > 0.0f) {
       continue;
     }

     (*keep)[(*out_size)++] = i;
     #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
     for (int j = i + 1; j < static_cast<int>(dets.size(0)); ++j) {
       if ((*suppressed)[j] > 0.0f) {
         continue;
       }
       float xx1 = std::max(ix1, dets[j][0]);
       float yy1 = std::max(iy1, dets[j][1]);
       float xx2 = std::min(ix2, dets[j][2]);
       float yy2 = std::min(iy2, dets[j][3]);
       float w = std::max(0.0f, xx2 - xx1 + 1.0f);
       float h = std::max(0.0f, yy2 - yy1 + 1.0f);
       float inter = w * h;
       float ovr = inter / (iarea + (*area)[j] - inter);
       if (ovr > thresh) {
         (*suppressed)[j] = 1.0f;
       }
     }
   }
 }

 }  // namespace utils
 }  // namespace op
 }  // namespace mxnet


 namespace mxnet {
 namespace op {

 template<typename xpu>
 class MultiProposalOp : public Operator{
  public:
   explicit MultiProposalOp(MultiProposalParam param) {
     this->param_ = param;
   }

   virtual void Forward(const OpContext &ctx,
                        const std::vector<TBlob> &in_data,
                        const std::vector<OpReqType> &req,
                        const std::vector<TBlob> &out_data,
                        const std::vector<TBlob> &aux_states) {
     using namespace mshadow;
     using namespace mshadow::expr;
     CHECK_EQ(in_data.size(), 3);
     CHECK_EQ(out_data.size(), 2);
     CHECK_GT(req.size(), 1);
     CHECK_EQ(req[proposal::kOut], kWriteTo);

     Stream<xpu> *s = ctx.get_stream<xpu>();

     Tensor<cpu, 4> scores = in_data[proposal::kClsProb].get<cpu, 4, real_t>(s);
     Tensor<cpu, 4> bbox_deltas = in_data[proposal::kBBoxPred].get<cpu, 4, real_t>(s);
     Tensor<cpu, 2> im_info = in_data[proposal::kImInfo].get<cpu, 2, real_t>(s);

     Tensor<cpu, 2> out = out_data[proposal::kOut].get<cpu, 2, real_t>(s);
     Tensor<cpu, 2> out_score = out_data[proposal::kScore].get<cpu, 2, real_t>(s);

     int num_images = scores.size(0);
     int num_anchors = scores.size(1) / 2;
     int height = scores.size(2);
     int width = scores.size(3);
     int count_anchors = num_anchors * height * width;
     int rpn_pre_nms_top_n =
         (param_.rpn_pre_nms_top_n > 0) ? param_.rpn_pre_nms_top_n : count_anchors;
     rpn_pre_nms_top_n = std::min(rpn_pre_nms_top_n, count_anchors);
     int rpn_post_nms_top_n = std::min(param_.rpn_post_nms_top_n, rpn_pre_nms_top_n);

     int workspace_size =
         num_images * (count_anchors * 5 + 2 * count_anchors +
         rpn_pre_nms_top_n * 5 + 3 * rpn_pre_nms_top_n);

     Tensor<cpu, 1> workspace = ctx.requested[proposal::kTempResource].get_space<cpu>(
       Shape1(workspace_size), s);
     int start = 0;
     Tensor<cpu, 3> workspace_proposals(workspace.dptr_ +
             start, Shape3(num_images, count_anchors, 5));
     start += num_images * count_anchors * 5;
     Tensor<cpu, 3> workspace_pre_nms(workspace.dptr_ + start, Shape3(num_images, 2, count_anchors));
     start += num_images * 2 * count_anchors;
     Tensor<cpu, 3> workspace_ordered_proposals(workspace.dptr_ + start,
                                                Shape3(num_images, rpn_pre_nms_top_n, 5));
     start += num_images * rpn_pre_nms_top_n * 5;
     Tensor<cpu, 3> workspace_nms(workspace.dptr_ + start, Shape3(num_images, 3, rpn_pre_nms_top_n));
     start += num_images * 3 * rpn_pre_nms_top_n;
     CHECK_EQ(workspace_size, start) << workspace_size << " " << start << std::endl;

     // Generate anchors
     std::vector<float> base_anchor(4);
     base_anchor[0] = 0.0;
     base_anchor[1] = 0.0;
     base_anchor[2] = param_.feature_stride - 1.0;
     base_anchor[3] = param_.feature_stride - 1.0;
     CHECK_EQ(num_anchors, param_.ratios.ndim() * param_.scales.ndim());
     std::vector<float> anchors;
     utils::GenerateAnchors(base_anchor,
                            param_.ratios,
                            param_.scales,
                            &anchors);
     std::memcpy(workspace_proposals.dptr_, &anchors[0], sizeof(float) * anchors.size());

     Tensor<cpu, 2> workspace_proposals0 = workspace_proposals[0];
     // Enumerate all shifted anchors
     #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
     for (int index = 0; index < num_anchors * height * width; ++index) {
       // index_t index = j * (width * num_anchors) + k * (num_anchors) + i;
       int i = index % num_anchors;
       int k = (index / num_anchors) % width;
       int j = index / (width * num_anchors);
       workspace_proposals0[index][0] =
           workspace_proposals0[i][0] + k * param_.feature_stride;
       workspace_proposals0[index][1] =
           workspace_proposals0[i][1] + j * param_.feature_stride;
       workspace_proposals0[index][2] =
           workspace_proposals0[i][2] + k * param_.feature_stride;
       workspace_proposals0[index][3] =
           workspace_proposals0[i][3] + j * param_.feature_stride;
       workspace_proposals0[index][4] = scores[0][i + num_anchors][j][k];
     }

     // Copy shifted anchors to other images
     #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
     for (int t = count_anchors; t < num_images * count_anchors; ++t) {
         int b = t / count_anchors;
         int index = t % count_anchors;
         int i = index % num_anchors;
         int k = (index / num_anchors) % width;
         int j = index / (width * num_anchors);
         for (int w = 0; w < 4; ++w) {
             workspace_proposals[b][index][w] = workspace_proposals[0][index][w];
         }
         workspace_proposals[b][index][4] = scores[b][i + num_anchors][j][k];
     }

     // Assign Foreground Scores for each anchor
     #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
     for (int b = 0; b < num_images; ++b) {
       // prevent padded predictions
       int real_height = static_cast<int>(im_info[b][0] / param_.feature_stride);
       int real_width = static_cast<int>(im_info[b][1] / param_.feature_stride);
       CHECK_GE(height, real_height) << height << " " << real_height << std::endl;
       CHECK_GE(width, real_width) << width << " " << real_width << std::endl;

       Tensor<cpu, 2> workspace_proposals_i = workspace_proposals[b];
       Tensor<cpu, 2> workspace_pre_nms_i = workspace_pre_nms[b];
       Tensor<cpu, 2> workspace_ordered_proposals_i =
                        workspace_ordered_proposals[b];
       Tensor<cpu, 2> workspace_nms_i = workspace_nms[b];

       if (param_.iou_loss) {
         utils::IoUTransformInv(workspace_proposals_i, bbox_deltas[b], im_info[b][0], im_info[b][1],
                                real_height, real_width, &(workspace_proposals_i));
       } else {
         utils::BBoxTransformInv(workspace_proposals_i, bbox_deltas[b], im_info[b][0], im_info[b][1],
                                 real_height, real_width, &(workspace_proposals_i));
       }
       utils::FilterBox(&workspace_proposals_i, param_.rpn_min_size * im_info[b][2]);

       Tensor<cpu, 1> score = workspace_pre_nms_i[0];
       Tensor<cpu, 1> order = workspace_pre_nms_i[1];

       utils::CopyScore(workspace_proposals_i,
                        &score,
                        &order);
       utils::ReverseArgsort(score,
                             &order);
       utils::ReorderProposals(workspace_proposals_i,
                               order,
                               rpn_pre_nms_top_n,
                               &workspace_ordered_proposals_i);
       int out_size = 0;
       Tensor<cpu, 1> area = workspace_nms_i[0];
       Tensor<cpu, 1> suppressed = workspace_nms_i[1];
       Tensor<cpu, 1> keep = workspace_nms_i[2];
       suppressed = 0;  // surprised!

       utils::NonMaximumSuppression(workspace_ordered_proposals_i,
                                    param_.threshold,
                                    rpn_post_nms_top_n,
                                    &area,
                                    &suppressed,
                                    &keep,
                                    &out_size);

       // fill in output rois and output scores
       #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
       for (int i = 0; i < param_.rpn_post_nms_top_n; ++i) {
         int out_index = b * param_.rpn_post_nms_top_n + i;
         out[out_index][0] = b;
         if (i < out_size) {
           index_t index = keep[i];
           for (index_t j = 0; j < 4; ++j) {
             out[out_index][j + 1] =  workspace_ordered_proposals_i[index][j];
           }
           out_score[out_index][0] = workspace_ordered_proposals_i[index][4];
         } else {
           index_t index = keep[i % out_size];
           for (index_t j = 0; j < 4; ++j) {
             out[out_index][j + 1] = workspace_ordered_proposals_i[index][j];
           }
           out_score[out_index][0] = workspace_ordered_proposals_i[index][4];
         }
       }
     }
   }

   virtual void Backward(const OpContext &ctx,
                         const std::vector<TBlob> &out_grad,
                         const std::vector<TBlob> &in_data,
                         const std::vector<TBlob> &out_data,
                         const std::vector<OpReqType> &req,
                         const std::vector<TBlob> &in_grad,
                         const std::vector<TBlob> &aux_states) {
     using namespace mshadow;
     using namespace mshadow::expr;
     CHECK_EQ(in_grad.size(), 3);

     Stream<xpu> *s = ctx.get_stream<xpu>();
     Tensor<xpu, 4> gscores = in_grad[proposal::kClsProb].get<xpu, 4, real_t>(s);
     Tensor<xpu, 4> gbbox = in_grad[proposal::kBBoxPred].get<xpu, 4, real_t>(s);
     Tensor<xpu, 2> ginfo = in_grad[proposal::kImInfo].get<xpu, 2, real_t>(s);

     // can not assume the grad would be zero
     Assign(gscores, req[proposal::kClsProb], 0);
     Assign(gbbox, req[proposal::kBBoxPred], 0);
     Assign(ginfo, req[proposal::kImInfo], 0);
   }

  private:
   MultiProposalParam param_;
 };  // class MultiProposalOp

 template<>
 Operator *CreateOp<cpu>(MultiProposalParam param) {
   return new MultiProposalOp<cpu>(param);
 }

 Operator* MultiProposalProp::CreateOperator(Context ctx) const {
   DO_BIND_DISPATCH(CreateOp, param_);
 }

 DMLC_REGISTER_PARAMETER(MultiProposalParam);

 MXNET_REGISTER_OP_PROPERTY(_contrib_MultiProposal, MultiProposalProp)
 .describe("Generate region proposals via RPN")
 .add_argument("cls_prob", "NDArray-or-Symbol", "Score of how likely proposal is object.")
 .add_argument("bbox_pred", "NDArray-or-Symbol", "BBox Predicted deltas from anchors for proposals")
 .add_argument("im_info", "NDArray-or-Symbol", "Image size and scale.")
 .add_arguments(MultiProposalParam::__FIELDS__());

 }  // namespace op
 }  // namespace mxnet
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	/*!
	* Copyright (c) 2017 Microsoft
	* Licensed under The Apache-2.0 License [see LICENSE for details]
	* \file multi_proposal.cc
	* \brief
	* \author Xizhou Zhu, Kan Wu
	*/

	#include "./multi_proposal-inl.h"

	//============================
	// Bounding Box Transform Utils
	//============================
	namespace mxnet {
	namespace op {
	namespace utils {

	// bbox prediction and clip to the image borders
	inline void BBoxTransformInv(const mshadow::Tensor<cpu, 2>& boxes,
	const mshadow::Tensor<cpu, 3>& deltas,
	const float im_height,
	const float im_width,
	const int real_height,
	const int real_width,
	mshadow::Tensor<cpu, 2> *out_pred_boxes) {
	CHECK_GE(boxes.size(1), 4);
	CHECK_GE(out_pred_boxes->size(1), 4);
	int anchors = deltas.size(0) / 4;
	int heights = deltas.size(1);
	int widths = deltas.size(2);

	#pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
	for (int index = 0; index < anchors * heights * widths; ++index) {
	// index_t index = h * (widths * anchors) + w * (anchors) + a;
	int a = index % anchors;
	int w = (index / anchors) % widths;
	int h = index / (widths * anchors);

	float width = boxes[index][2] - boxes[index][0] + 1.0;
	float height = boxes[index][3] - boxes[index][1] + 1.0;
	float ctr_x = boxes[index][0] + 0.5 * (width - 1.0);
	float ctr_y = boxes[index][1] + 0.5 * (height - 1.0);

	float dx = deltas[a*4 + 0][h][w];
	float dy = deltas[a*4 + 1][h][w];
	float dw = deltas[a*4 + 2][h][w];
	float dh = deltas[a*4 + 3][h][w];

	float pred_ctr_x = dx * width + ctr_x;
	float pred_ctr_y = dy * height + ctr_y;
	float pred_w = std::exp(dw) * width;
	float pred_h = std::exp(dh) * height;

	float pred_x1 = pred_ctr_x - 0.5 * (pred_w - 1.0);
	float pred_y1 = pred_ctr_y - 0.5 * (pred_h - 1.0);
	float pred_x2 = pred_ctr_x + 0.5 * (pred_w - 1.0);
	float pred_y2 = pred_ctr_y + 0.5 * (pred_h - 1.0);

	pred_x1 = std::max(std::min(pred_x1, im_width - 1.0f), 0.0f);
	pred_y1 = std::max(std::min(pred_y1, im_height - 1.0f), 0.0f);
	pred_x2 = std::max(std::min(pred_x2, im_width - 1.0f), 0.0f);
	pred_y2 = std::max(std::min(pred_y2, im_height - 1.0f), 0.0f);

	(*out_pred_boxes)[index][0] = pred_x1;
	(*out_pred_boxes)[index][1] = pred_y1;
	(*out_pred_boxes)[index][2] = pred_x2;
	(*out_pred_boxes)[index][3] = pred_y2;

	if (h >= real_height \|\| w >= real_width) {
	(*out_pred_boxes)[index][4] = -1.0;
	}
	}
	}

	// iou prediction and clip to the image border
	inline void IoUTransformInv(const mshadow::Tensor<cpu, 2>& boxes,
	const mshadow::Tensor<cpu, 3>& deltas,
	const float im_height,
	const float im_width,
	const int real_height,
	const int real_width,
	mshadow::Tensor<cpu, 2> *out_pred_boxes) {
	CHECK_GE(boxes.size(1), 4);
	CHECK_GE(out_pred_boxes->size(1), 4);
	int anchors = deltas.size(0) / 4;
	int heights = deltas.size(1);
	int widths = deltas.size(2);

	#pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
	for (int index = 0; index < anchors * heights * widths; ++index) {
	// index_t index = h * (widths * anchors) + w * (anchors) + a;
	int a = index % anchors;
	int w = (index / anchors) % widths;
	int h = index / (widths * anchors);

	float x1 = boxes[index][0];
	float y1 = boxes[index][1];
	float x2 = boxes[index][2];
	float y2 = boxes[index][3];

	float dx1 = deltas[a * 4 + 0][h][w];
	float dy1 = deltas[a * 4 + 1][h][w];
	float dx2 = deltas[a * 4 + 2][h][w];
	float dy2 = deltas[a * 4 + 3][h][w];

	float pred_x1 = x1 + dx1;
	float pred_y1 = y1 + dy1;
	float pred_x2 = x2 + dx2;
	float pred_y2 = y2 + dy2;

	pred_x1 = std::max(std::min(pred_x1, im_width - 1.0f), 0.0f);
	pred_y1 = std::max(std::min(pred_y1, im_height - 1.0f), 0.0f);
	pred_x2 = std::max(std::min(pred_x2, im_width - 1.0f), 0.0f);
	pred_y2 = std::max(std::min(pred_y2, im_height - 1.0f), 0.0f);

	(*out_pred_boxes)[index][0] = pred_x1;
	(*out_pred_boxes)[index][1] = pred_y1;
	(*out_pred_boxes)[index][2] = pred_x2;
	(*out_pred_boxes)[index][3] = pred_y2;

	if (h >= real_height \|\| w >= real_width) {
	(*out_pred_boxes)[index][4] = -1.0f;
	}
	}
	}

	// filter box by set confidence to zero
	// * height or width < rpn_min_size
	inline void FilterBox(mshadow::Tensor<cpu, 2> *dets,
	const float min_size) {
	#pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
	for (int i = 0; i < static_cast<int>(dets->size(0)); ++i) {
	float iw = (dets)[i][2] - (dets)[i][0] + 1.0f;
	float ih = (dets)[i][3] - (dets)[i][1] + 1.0f;
	if (iw < min_size \|\| ih < min_size) {
	(*dets)[i][0] -= min_size / 2;
	(*dets)[i][1] -= min_size / 2;
	(*dets)[i][2] += min_size / 2;
	(*dets)[i][3] += min_size / 2;
	(*dets)[i][4] = -1.0f;
	}
	}
	}

	} // namespace utils
	} // namespace op
	} // namespace mxnet

	//=====================
	// NMS Utils
	//=====================
	namespace mxnet {
	namespace op {
	namespace utils {

	struct ReverseArgsortCompl {
	const float *val_;
	explicit ReverseArgsortCompl(float *val)
	: val_(val) {}
	bool operator() (float i, float j) {
	return (val_[static_cast<index_t>(i)] >
	val_[static_cast<index_t>(j)]);
	}
	};

	// copy score and init order
	inline void CopyScore(const mshadow::Tensor<cpu, 2>& dets,
	mshadow::Tensor<cpu, 1> *score,
	mshadow::Tensor<cpu, 1> *order) {
	#pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
	for (int i = 0; i < static_cast<int>(dets.size(0)); ++i) {
	(*score)[i] = dets[i][4];
	(*order)[i] = i;
	}
	}

	// sort order array according to score
	inline void ReverseArgsort(const mshadow::Tensor<cpu, 1>& score,
	mshadow::Tensor<cpu, 1> *order) {
	ReverseArgsortCompl cmpl(score.dptr_);
	std::stable_sort(order->dptr_, order->dptr_ + score.size(0), cmpl);
	}

	// reorder proposals according to order and keep the pre_nms_top_n proposals
	// dets.size(0) == pre_nms_top_n
	inline void ReorderProposals(const mshadow::Tensor<cpu, 2>& prev_dets,
	const mshadow::Tensor<cpu, 1>& order,
	const index_t pre_nms_top_n,
	mshadow::Tensor<cpu, 2> *dets) {
	CHECK_EQ(dets->size(0), pre_nms_top_n);
	const int dets_size0 = static_cast<int>(dets->size(0));
	const int dets_size1 = static_cast<int>(dets->size(1));
	#pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
	for (int k = 0; k < dets_size0 * dets_size1; ++k) {
	int i = k / dets_size1;
	int j = k % dets_size1;
	const index_t index = order[i];
	(*dets)[i][j] = prev_dets[index][j];
	}
	}

	// greedily keep the max detections (already sorted)
	inline void NonMaximumSuppression(const mshadow::Tensor<cpu, 2>& dets,
	const float thresh,
	const index_t post_nms_top_n,
	mshadow::Tensor<cpu, 1> *area,
	mshadow::Tensor<cpu, 1> *suppressed,
	mshadow::Tensor<cpu, 1> *keep,
	int *out_size) {
	CHECK_EQ(dets.shape_[1], 5) << "dets: [x1, y1, x2, y2, score]";
	CHECK_GT(dets.shape_[0], 0);
	CHECK_EQ(dets.CheckContiguous(), true);
	CHECK_EQ(area->CheckContiguous(), true);
	CHECK_EQ(suppressed->CheckContiguous(), true);
	CHECK_EQ(keep->CheckContiguous(), true);
	// calculate area
	#pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
	for (int i = 0; i < static_cast<int>(dets.size(0)); ++i) {
	(area)[i] = (dets[i][2] - dets[i][0] + 1)
	(dets[i][3] - dets[i][1] + 1);
	}

	// calculate nms
	*out_size = 0;
	for (index_t i = 0; i < dets.size(0) && (*out_size) < static_cast<int>(post_nms_top_n); ++i) {
	float ix1 = dets[i][0];
	float iy1 = dets[i][1];
	float ix2 = dets[i][2];
	float iy2 = dets[i][3];
	float iarea = (*area)[i];

	if ((*suppressed)[i] > 0.0f) {
	continue;
	}

	(keep)[(out_size)++] = i;
	#pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
	for (int j = i + 1; j < static_cast<int>(dets.size(0)); ++j) {
	if ((*suppressed)[j] > 0.0f) {
	continue;
	}
	float xx1 = std::max(ix1, dets[j][0]);
	float yy1 = std::max(iy1, dets[j][1]);
	float xx2 = std::min(ix2, dets[j][2]);
	float yy2 = std::min(iy2, dets[j][3]);
	float w = std::max(0.0f, xx2 - xx1 + 1.0f);
	float h = std::max(0.0f, yy2 - yy1 + 1.0f);
	float inter = w * h;
	float ovr = inter / (iarea + (*area)[j] - inter);
	if (ovr > thresh) {
	(*suppressed)[j] = 1.0f;
	}
	}
	}
	}

	} // namespace utils
	} // namespace op
	} // namespace mxnet



	namespace mxnet {
	namespace op {

	template<typename xpu>
	class MultiProposalOp : public Operator{
	public:
	explicit MultiProposalOp(MultiProposalParam param) {
	this->param_ = param;
	}

	virtual void Forward(const OpContext &ctx,
	const std::vector<TBlob> &in_data,
	const std::vector<OpReqType> &req,
	const std::vector<TBlob> &out_data,
	const std::vector<TBlob> &aux_states) {
	using namespace mshadow;
	using namespace mshadow::expr;
	CHECK_EQ(in_data.size(), 3);
	CHECK_EQ(out_data.size(), 2);
	CHECK_GT(req.size(), 1);
	CHECK_EQ(req[proposal::kOut], kWriteTo);

	Stream<xpu> *s = ctx.get_stream<xpu>();

	Tensor<cpu, 4> scores = in_data[proposal::kClsProb].get<cpu, 4, real_t>(s);
	Tensor<cpu, 4> bbox_deltas = in_data[proposal::kBBoxPred].get<cpu, 4, real_t>(s);
	Tensor<cpu, 2> im_info = in_data[proposal::kImInfo].get<cpu, 2, real_t>(s);

	Tensor<cpu, 2> out = out_data[proposal::kOut].get<cpu, 2, real_t>(s);
	Tensor<cpu, 2> out_score = out_data[proposal::kScore].get<cpu, 2, real_t>(s);

	int num_images = scores.size(0);
	int num_anchors = scores.size(1) / 2;
	int height = scores.size(2);
	int width = scores.size(3);
	int count_anchors = num_anchors * height * width;
	int rpn_pre_nms_top_n =
	(param_.rpn_pre_nms_top_n > 0) ? param_.rpn_pre_nms_top_n : count_anchors;
	rpn_pre_nms_top_n = std::min(rpn_pre_nms_top_n, count_anchors);
	int rpn_post_nms_top_n = std::min(param_.rpn_post_nms_top_n, rpn_pre_nms_top_n);

	int workspace_size =
	num_images * (count_anchors * 5 + 2 * count_anchors +
	rpn_pre_nms_top_n * 5 + 3 * rpn_pre_nms_top_n);

	Tensor<cpu, 1> workspace = ctx.requested[proposal::kTempResource].get_space<cpu>(
	Shape1(workspace_size), s);
	int start = 0;
	Tensor<cpu, 3> workspace_proposals(workspace.dptr_ +
	start, Shape3(num_images, count_anchors, 5));
	start += num_images * count_anchors * 5;
	Tensor<cpu, 3> workspace_pre_nms(workspace.dptr_ + start, Shape3(num_images, 2, count_anchors));
	start += num_images * 2 * count_anchors;
	Tensor<cpu, 3> workspace_ordered_proposals(workspace.dptr_ + start,
	Shape3(num_images, rpn_pre_nms_top_n, 5));
	start += num_images * rpn_pre_nms_top_n * 5;
	Tensor<cpu, 3> workspace_nms(workspace.dptr_ + start, Shape3(num_images, 3, rpn_pre_nms_top_n));
	start += num_images * 3 * rpn_pre_nms_top_n;
	CHECK_EQ(workspace_size, start) << workspace_size << " " << start << std::endl;

	// Generate anchors
	std::vector<float> base_anchor(4);
	base_anchor[0] = 0.0;
	base_anchor[1] = 0.0;
	base_anchor[2] = param_.feature_stride - 1.0;
	base_anchor[3] = param_.feature_stride - 1.0;
	CHECK_EQ(num_anchors, param_.ratios.ndim() * param_.scales.ndim());
	std::vector<float> anchors;
	utils::GenerateAnchors(base_anchor,
	param_.ratios,
	param_.scales,
	&anchors);
	std::memcpy(workspace_proposals.dptr_, &anchors[0], sizeof(float) * anchors.size());

	Tensor<cpu, 2> workspace_proposals0 = workspace_proposals[0];
	// Enumerate all shifted anchors
	#pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
	for (int index = 0; index < num_anchors * height * width; ++index) {
	// index_t index = j * (width * num_anchors) + k * (num_anchors) + i;
	int i = index % num_anchors;
	int k = (index / num_anchors) % width;
	int j = index / (width * num_anchors);
	workspace_proposals0[index][0] =
	workspace_proposals0[i][0] + k * param_.feature_stride;
	workspace_proposals0[index][1] =
	workspace_proposals0[i][1] + j * param_.feature_stride;
	workspace_proposals0[index][2] =
	workspace_proposals0[i][2] + k * param_.feature_stride;
	workspace_proposals0[index][3] =
	workspace_proposals0[i][3] + j * param_.feature_stride;
	workspace_proposals0[index][4] = scores[0][i + num_anchors][j][k];
	}

	// Copy shifted anchors to other images
	#pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
	for (int t = count_anchors; t < num_images * count_anchors; ++t) {
	int b = t / count_anchors;
	int index = t % count_anchors;
	int i = index % num_anchors;
	int k = (index / num_anchors) % width;
	int j = index / (width * num_anchors);
	for (int w = 0; w < 4; ++w) {
	workspace_proposals[b][index][w] = workspace_proposals[0][index][w];
	}
	workspace_proposals[b][index][4] = scores[b][i + num_anchors][j][k];
	}

	// Assign Foreground Scores for each anchor
	#pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
	for (int b = 0; b < num_images; ++b) {
	// prevent padded predictions
	int real_height = static_cast<int>(im_info[b][0] / param_.feature_stride);
	int real_width = static_cast<int>(im_info[b][1] / param_.feature_stride);
	CHECK_GE(height, real_height) << height << " " << real_height << std::endl;
	CHECK_GE(width, real_width) << width << " " << real_width << std::endl;

	Tensor<cpu, 2> workspace_proposals_i = workspace_proposals[b];
	Tensor<cpu, 2> workspace_pre_nms_i = workspace_pre_nms[b];
	Tensor<cpu, 2> workspace_ordered_proposals_i =
	workspace_ordered_proposals[b];
	Tensor<cpu, 2> workspace_nms_i = workspace_nms[b];

	if (param_.iou_loss) {
	utils::IoUTransformInv(workspace_proposals_i, bbox_deltas[b], im_info[b][0], im_info[b][1],
	real_height, real_width, &(workspace_proposals_i));
	} else {
	utils::BBoxTransformInv(workspace_proposals_i, bbox_deltas[b], im_info[b][0], im_info[b][1],
	real_height, real_width, &(workspace_proposals_i));
	}
	utils::FilterBox(&workspace_proposals_i, param_.rpn_min_size * im_info[b][2]);

	Tensor<cpu, 1> score = workspace_pre_nms_i[0];
	Tensor<cpu, 1> order = workspace_pre_nms_i[1];

	utils::CopyScore(workspace_proposals_i,
	&score,
	&order);
	utils::ReverseArgsort(score,
	&order);
	utils::ReorderProposals(workspace_proposals_i,
	order,
	rpn_pre_nms_top_n,
	&workspace_ordered_proposals_i);
	int out_size = 0;
	Tensor<cpu, 1> area = workspace_nms_i[0];
	Tensor<cpu, 1> suppressed = workspace_nms_i[1];
	Tensor<cpu, 1> keep = workspace_nms_i[2];
	suppressed = 0; // surprised!

	utils::NonMaximumSuppression(workspace_ordered_proposals_i,
	param_.threshold,
	rpn_post_nms_top_n,
	&area,
	&suppressed,
	&keep,
	&out_size);

	// fill in output rois and output scores
	#pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
	for (int i = 0; i < param_.rpn_post_nms_top_n; ++i) {
	int out_index = b * param_.rpn_post_nms_top_n + i;
	out[out_index][0] = b;
	if (i < out_size) {
	index_t index = keep[i];
	for (index_t j = 0; j < 4; ++j) {
	out[out_index][j + 1] = workspace_ordered_proposals_i[index][j];
	}
	out_score[out_index][0] = workspace_ordered_proposals_i[index][4];
	} else {
	index_t index = keep[i % out_size];
	for (index_t j = 0; j < 4; ++j) {
	out[out_index][j + 1] = workspace_ordered_proposals_i[index][j];
	}
	out_score[out_index][0] = workspace_ordered_proposals_i[index][4];
	}
	}
	}
	}

	virtual void Backward(const OpContext &ctx,
	const std::vector<TBlob> &out_grad,
	const std::vector<TBlob> &in_data,
	const std::vector<TBlob> &out_data,
	const std::vector<OpReqType> &req,
	const std::vector<TBlob> &in_grad,
	const std::vector<TBlob> &aux_states) {
	using namespace mshadow;
	using namespace mshadow::expr;
	CHECK_EQ(in_grad.size(), 3);

	Stream<xpu> *s = ctx.get_stream<xpu>();
	Tensor<xpu, 4> gscores = in_grad[proposal::kClsProb].get<xpu, 4, real_t>(s);
	Tensor<xpu, 4> gbbox = in_grad[proposal::kBBoxPred].get<xpu, 4, real_t>(s);
	Tensor<xpu, 2> ginfo = in_grad[proposal::kImInfo].get<xpu, 2, real_t>(s);

	// can not assume the grad would be zero
	Assign(gscores, req[proposal::kClsProb], 0);
	Assign(gbbox, req[proposal::kBBoxPred], 0);
	Assign(ginfo, req[proposal::kImInfo], 0);
	}

	private:
	MultiProposalParam param_;
	}; // class MultiProposalOp

	template<>
	Operator *CreateOp<cpu>(MultiProposalParam param) {
	return new MultiProposalOp<cpu>(param);
	}

	Operator* MultiProposalProp::CreateOperator(Context ctx) const {
	DO_BIND_DISPATCH(CreateOp, param_);
	}

	DMLC_REGISTER_PARAMETER(MultiProposalParam);

	MXNET_REGISTER_OP_PROPERTY(_contrib_MultiProposal, MultiProposalProp)
	.describe("Generate region proposals via RPN")
	.add_argument("cls_prob", "NDArray-or-Symbol", "Score of how likely proposal is object.")
	.add_argument("bbox_pred", "NDArray-or-Symbol", "BBox Predicted deltas from anchors for proposals")
	.add_argument("im_info", "NDArray-or-Symbol", "Image size and scale.")
	.add_arguments(MultiProposalParam::__FIELDS__());

	} // namespace op
	} // namespace mxnet