src/operator/contrib/multibox_detection.cu - mxnet - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 /*!
  * \file multibox_detection.cu
  * \brief MultiBoxDetection op
  * \author Joshua Zhang
  */
 #include "./multibox_detection-inl.h"
 #include <mshadow/cuda/tensor_gpu-inl.cuh>

 #define MULTIBOX_DETECTION_CUDA_CHECK(condition)                      \
   /* Code block avoids redefinition of cudaError_t error */           \
   do {                                                                \
     cudaError_t error = condition;                                    \
     CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \
   } while (0)

 namespace mshadow {
 namespace cuda {
 template <typename DType>
 __device__ void Clip(DType* value, const DType lower, const DType upper) {
   if ((*value) < lower)
     *value = lower;
   if ((*value) > upper)
     *value = upper;
 }

 template <typename DType>
 __device__ void CalculateOverlap(const DType* a, const DType* b, DType* iou) {
   DType w = max(DType(0), min(a[2], b[2]) - max(a[0], b[0]));
   DType h = max(DType(0), min(a[3], b[3]) - max(a[1], b[1]));
   DType i = w * h;
   DType u = (a[2] - a[0]) * (a[3] - a[1]) + (b[2] - b[0]) * (b[3] - b[1]) - i;
   (*iou)  = u <= 0.f ? static_cast<DType>(0) : static_cast<DType>(i / u);
 }

 template <typename DType>
 __global__ __launch_bounds__(cuda::kMaxThreadsPerBlock) void DetectionForwardKernel(
     DType* out,
     const DType* cls_prob,
     const DType* loc_pred,
     const DType* anchors,
     DType* temp_space,
     const int num_classes,
     const int num_anchors,
     const float threshold,
     const bool clip,
     const float vx,
     const float vy,
     const float vw,
     const float vh,
     const float nms_threshold,
     const bool force_suppress,
     const int nms_topk) {
   const int nbatch = blockIdx.x;  // each block for each batch
   int index        = threadIdx.x;
   __shared__ int valid_count;
   out += nbatch * num_anchors * 6;
   cls_prob += nbatch * num_anchors * num_classes;
   loc_pred += nbatch * num_anchors * 4;

   if (index == 0) {
     valid_count = 0;
   }
   __syncthreads();

   // apply prediction to anchors
   for (int i = index; i < num_anchors; i += blockDim.x) {
     DType score = -1;
     int id      = 0;
     for (int j = 1; j < num_classes; ++j) {
       DType temp = cls_prob[j * num_anchors + i];
       if (temp > score) {
         score = temp;
         id    = j;
       }
     }
     if (id > 0 && score < threshold) {
       id = 0;
     }

     if (id > 0) {
       // valid class
       int pos          = atomicAdd(&valid_count, 1);
       out[pos * 6]     = id - 1;  // restore original class id
       out[pos * 6 + 1] = (id == 0 ? DType(-1) : score);
       int offset       = i * 4;
       DType al         = anchors[offset];
       DType at         = anchors[offset + 1];
       DType ar         = anchors[offset + 2];
       DType ab         = anchors[offset + 3];
       DType aw         = ar - al;
       DType ah         = ab - at;
       DType ax         = (al + ar) / 2.f;
       DType ay         = (at + ab) / 2.f;
       DType ox         = loc_pred[offset] * vx * aw + ax;
       DType oy         = loc_pred[offset + 1] * vy * ah + ay;
       DType ow         = exp(loc_pred[offset + 2] * vw) * aw / 2;
       DType oh         = exp(loc_pred[offset + 3] * vh) * ah / 2;
       DType xmin       = ox - ow;
       DType ymin       = oy - oh;
       DType xmax       = ox + ow;
       DType ymax       = oy + oh;
       if (clip) {
         Clip(&xmin, DType(0), DType(1));
         Clip(&ymin, DType(0), DType(1));
         Clip(&xmax, DType(0), DType(1));
         Clip(&ymax, DType(0), DType(1));
       }
       out[pos * 6 + 2] = xmin;
       out[pos * 6 + 3] = ymin;
       out[pos * 6 + 4] = xmax;
       out[pos * 6 + 5] = ymax;
     }
   }
   __syncthreads();

   if (valid_count < 1 || nms_threshold <= 0 || nms_threshold > 1)
     return;
   // if (index == 0) printf("%d\n", valid_count);

   // descent sort according to scores
   const int size = valid_count;
   temp_space += nbatch * num_anchors * 6;
   DType* src = out;
   DType* dst = temp_space;
   for (int width = 2; width < (size << 1); width <<= 1) {
     int slices = (size - 1) / (blockDim.x * width) + 1;
     int start  = width * index * slices;
     for (int slice = 0; slice < slices; ++slice) {
       if (start >= size)
         break;
       int middle = start + (width >> 1);
       if (middle > size)
         middle = size;
       int end = start + width;
       if (end > size)
         end = size;
       int i = start;
       int j = middle;
       for (int k = start; k < end; ++k) {
         DType score_i = i < size ? src[i * 6 + 1] : DType(-1);
         DType score_j = j < size ? src[j * 6 + 1] : DType(-1);
         if (i < middle && (j >= end || score_i > score_j)) {
           for (int n = 0; n < 6; ++n) {
             dst[k * 6 + n] = src[i * 6 + n];
           }
           ++i;
         } else {
           for (int n = 0; n < 6; ++n) {
             dst[k * 6 + n] = src[j * 6 + n];
           }
           ++j;
         }
       }
       start += width;
     }
     __syncthreads();
     src = src == out ? temp_space : out;
     dst = dst == out ? temp_space : out;
   }
   __syncthreads();

   if (src == temp_space) {
     // copy from temp to out
     for (int i = index; i < size * 6; i += blockDim.x) {
       out[i] = temp_space[i];
     }
     __syncthreads();
   }

   // keep top k detections
   int ntop = size;
   if (nms_topk > 0 && nms_topk < ntop) {
     ntop = nms_topk;
     for (int i = ntop + index; i < size; i += blockDim.x) {
       out[i * 6] = -1;
     }
     __syncthreads();
   }

   // apply NMS
   for (int compare_pos = 0; compare_pos < ntop; ++compare_pos) {
     DType compare_id = out[compare_pos * 6];
     if (compare_id < 0)
       continue;  // not a valid positive detection, skip
     DType* compare_loc_ptr = out + compare_pos * 6 + 2;
     for (int i = compare_pos + index + 1; i < ntop; i += blockDim.x) {
       DType class_id = out[i * 6];
       if (class_id < 0)
         continue;
       if (force_suppress || (class_id == compare_id)) {
         DType iou;
         CalculateOverlap(compare_loc_ptr, out + i * 6 + 2, &iou);
         if (iou >= nms_threshold) {
           out[i * 6] = -1;
         }
       }
     }
     __syncthreads();
   }
 }
 }  // namespace cuda

 template <typename DType>
 inline void MultiBoxDetectionForward(const Tensor<gpu, 3, DType>& out,
                                      const Tensor<gpu, 3, DType>& cls_prob,
                                      const Tensor<gpu, 2, DType>& loc_pred,
                                      const Tensor<gpu, 2, DType>& anchors,
                                      const Tensor<gpu, 3, DType>& temp_space,
                                      const float threshold,
                                      const bool clip,
                                      const mxnet::Tuple<float>& variances,
                                      const float nms_threshold,
                                      const bool force_suppress,
                                      const int nms_topk) {
   CHECK_EQ(variances.ndim(), 4) << "Variance size must be 4";
   const int num_classes = cls_prob.size(1);
   const int num_anchors = cls_prob.size(2);
   const int num_batches = cls_prob.size(0);
   const int num_threads = cuda::kMaxThreadsPerBlock;
   int num_blocks        = num_batches;
   cuda::CheckLaunchParam(num_blocks, num_threads, "MultiBoxDetection Forward");
   cudaStream_t stream = Stream<gpu>::GetStream(out.stream_);
   cuda::DetectionForwardKernel<<<num_blocks, num_threads, 0, stream>>>(out.dptr_,
                                                                        cls_prob.dptr_,
                                                                        loc_pred.dptr_,
                                                                        anchors.dptr_,
                                                                        temp_space.dptr_,
                                                                        num_classes,
                                                                        num_anchors,
                                                                        threshold,
                                                                        clip,
                                                                        variances[0],
                                                                        variances[1],
                                                                        variances[2],
                                                                        variances[3],
                                                                        nms_threshold,
                                                                        force_suppress,
                                                                        nms_topk);
   MULTIBOX_DETECTION_CUDA_CHECK(cudaGetLastError());
 }
 }  // namespace mshadow

 namespace mxnet {
 namespace op {
 template <>
 Operator* CreateOp<gpu>(MultiBoxDetectionParam param, int dtype) {
   Operator* op = nullptr;
   MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { op = new MultiBoxDetectionOp<gpu, DType>(param); });
   return op;
 }
 }  // namespace op
 }  // namespace mxnet
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	/*!
	* \file multibox_detection.cu
	* \brief MultiBoxDetection op
	* \author Joshua Zhang
	*/
	#include "./multibox_detection-inl.h"
	#include <mshadow/cuda/tensor_gpu-inl.cuh>

	#define MULTIBOX_DETECTION_CUDA_CHECK(condition) \
	/* Code block avoids redefinition of cudaError_t error */ \
	do { \
	cudaError_t error = condition; \
	CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \
	} while (0)

	namespace mshadow {
	namespace cuda {
	template <typename DType>
	__device__ void Clip(DType* value, const DType lower, const DType upper) {
	if ((*value) < lower)
	*value = lower;
	if ((*value) > upper)
	*value = upper;
	}

	template <typename DType>
	__device__ void CalculateOverlap(const DType* a, const DType* b, DType* iou) {
	DType w = max(DType(0), min(a[2], b[2]) - max(a[0], b[0]));
	DType h = max(DType(0), min(a[3], b[3]) - max(a[1], b[1]));
	DType i = w * h;
	DType u = (a[2] - a[0]) * (a[3] - a[1]) + (b[2] - b[0]) * (b[3] - b[1]) - i;
	(*iou) = u <= 0.f ? static_cast<DType>(0) : static_cast<DType>(i / u);
	}

	template <typename DType>
	__global__ __launch_bounds__(cuda::kMaxThreadsPerBlock) void DetectionForwardKernel(
	DType* out,
	const DType* cls_prob,
	const DType* loc_pred,
	const DType* anchors,
	DType* temp_space,
	const int num_classes,
	const int num_anchors,
	const float threshold,
	const bool clip,
	const float vx,
	const float vy,
	const float vw,
	const float vh,
	const float nms_threshold,
	const bool force_suppress,
	const int nms_topk) {
	const int nbatch = blockIdx.x; // each block for each batch
	int index = threadIdx.x;
	__shared__ int valid_count;
	out += nbatch * num_anchors * 6;
	cls_prob += nbatch * num_anchors * num_classes;
	loc_pred += nbatch * num_anchors * 4;

	if (index == 0) {
	valid_count = 0;
	}
	__syncthreads();

	// apply prediction to anchors
	for (int i = index; i < num_anchors; i += blockDim.x) {
	DType score = -1;
	int id = 0;
	for (int j = 1; j < num_classes; ++j) {
	DType temp = cls_prob[j * num_anchors + i];
	if (temp > score) {
	score = temp;
	id = j;
	}
	}
	if (id > 0 && score < threshold) {
	id = 0;
	}

	if (id > 0) {
	// valid class
	int pos = atomicAdd(&valid_count, 1);
	out[pos * 6] = id - 1; // restore original class id
	out[pos * 6 + 1] = (id == 0 ? DType(-1) : score);
	int offset = i * 4;
	DType al = anchors[offset];
	DType at = anchors[offset + 1];
	DType ar = anchors[offset + 2];
	DType ab = anchors[offset + 3];
	DType aw = ar - al;
	DType ah = ab - at;
	DType ax = (al + ar) / 2.f;
	DType ay = (at + ab) / 2.f;
	DType ox = loc_pred[offset] * vx * aw + ax;
	DType oy = loc_pred[offset + 1] * vy * ah + ay;
	DType ow = exp(loc_pred[offset + 2] * vw) * aw / 2;
	DType oh = exp(loc_pred[offset + 3] * vh) * ah / 2;
	DType xmin = ox - ow;
	DType ymin = oy - oh;
	DType xmax = ox + ow;
	DType ymax = oy + oh;
	if (clip) {
	Clip(&xmin, DType(0), DType(1));
	Clip(&ymin, DType(0), DType(1));
	Clip(&xmax, DType(0), DType(1));
	Clip(&ymax, DType(0), DType(1));
	}
	out[pos * 6 + 2] = xmin;
	out[pos * 6 + 3] = ymin;
	out[pos * 6 + 4] = xmax;
	out[pos * 6 + 5] = ymax;
	}
	}
	__syncthreads();

	if (valid_count < 1 \|\| nms_threshold <= 0 \|\| nms_threshold > 1)
	return;
	// if (index == 0) printf("%d\n", valid_count);

	// descent sort according to scores
	const int size = valid_count;
	temp_space += nbatch * num_anchors * 6;
	DType* src = out;
	DType* dst = temp_space;
	for (int width = 2; width < (size << 1); width <<= 1) {
	int slices = (size - 1) / (blockDim.x * width) + 1;
	int start = width * index * slices;
	for (int slice = 0; slice < slices; ++slice) {
	if (start >= size)
	break;
	int middle = start + (width >> 1);
	if (middle > size)
	middle = size;
	int end = start + width;
	if (end > size)
	end = size;
	int i = start;
	int j = middle;
	for (int k = start; k < end; ++k) {
	DType score_i = i < size ? src[i * 6 + 1] : DType(-1);
	DType score_j = j < size ? src[j * 6 + 1] : DType(-1);
	if (i < middle && (j >= end \|\| score_i > score_j)) {
	for (int n = 0; n < 6; ++n) {
	dst[k * 6 + n] = src[i * 6 + n];
	}
	++i;
	} else {
	for (int n = 0; n < 6; ++n) {
	dst[k * 6 + n] = src[j * 6 + n];
	}
	++j;
	}
	}
	start += width;
	}
	__syncthreads();
	src = src == out ? temp_space : out;
	dst = dst == out ? temp_space : out;
	}
	__syncthreads();

	if (src == temp_space) {
	// copy from temp to out
	for (int i = index; i < size * 6; i += blockDim.x) {
	out[i] = temp_space[i];
	}
	__syncthreads();
	}

	// keep top k detections
	int ntop = size;
	if (nms_topk > 0 && nms_topk < ntop) {
	ntop = nms_topk;
	for (int i = ntop + index; i < size; i += blockDim.x) {
	out[i * 6] = -1;
	}
	__syncthreads();
	}

	// apply NMS
	for (int compare_pos = 0; compare_pos < ntop; ++compare_pos) {
	DType compare_id = out[compare_pos * 6];
	if (compare_id < 0)
	continue; // not a valid positive detection, skip
	DType* compare_loc_ptr = out + compare_pos * 6 + 2;
	for (int i = compare_pos + index + 1; i < ntop; i += blockDim.x) {
	DType class_id = out[i * 6];
	if (class_id < 0)
	continue;
	if (force_suppress \|\| (class_id == compare_id)) {
	DType iou;
	CalculateOverlap(compare_loc_ptr, out + i * 6 + 2, &iou);
	if (iou >= nms_threshold) {
	out[i * 6] = -1;
	}
	}
	}
	__syncthreads();
	}
	}
	} // namespace cuda

	template <typename DType>
	inline void MultiBoxDetectionForward(const Tensor<gpu, 3, DType>& out,
	const Tensor<gpu, 3, DType>& cls_prob,
	const Tensor<gpu, 2, DType>& loc_pred,
	const Tensor<gpu, 2, DType>& anchors,
	const Tensor<gpu, 3, DType>& temp_space,
	const float threshold,
	const bool clip,
	const mxnet::Tuple<float>& variances,
	const float nms_threshold,
	const bool force_suppress,
	const int nms_topk) {
	CHECK_EQ(variances.ndim(), 4) << "Variance size must be 4";
	const int num_classes = cls_prob.size(1);
	const int num_anchors = cls_prob.size(2);
	const int num_batches = cls_prob.size(0);
	const int num_threads = cuda::kMaxThreadsPerBlock;
	int num_blocks = num_batches;
	cuda::CheckLaunchParam(num_blocks, num_threads, "MultiBoxDetection Forward");
	cudaStream_t stream = Stream<gpu>::GetStream(out.stream_);
	cuda::DetectionForwardKernel<<<num_blocks, num_threads, 0, stream>>>(out.dptr_,
	cls_prob.dptr_,
	loc_pred.dptr_,
	anchors.dptr_,
	temp_space.dptr_,
	num_classes,
	num_anchors,
	threshold,
	clip,
	variances[0],
	variances[1],
	variances[2],
	variances[3],
	nms_threshold,
	force_suppress,
	nms_topk);
	MULTIBOX_DETECTION_CUDA_CHECK(cudaGetLastError());
	}
	} // namespace mshadow

	namespace mxnet {
	namespace op {
	template <>
	Operator* CreateOp<gpu>(MultiBoxDetectionParam param, int dtype) {
	Operator* op = nullptr;
	MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { op = new MultiBoxDetectionOp<gpu, DType>(param); });
	return op;
	}
	} // namespace op
	} // namespace mxnet