blob: 32447144935ea72a98f4bdc8b7df4ad23ba5ad00 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file multibox_detection.cu
* \brief MultiBoxDetection op
* \author Joshua Zhang
*/
#include "./multibox_detection-inl.h"
#include <mshadow/cuda/tensor_gpu-inl.cuh>
#define MULTIBOX_DETECTION_CUDA_CHECK(condition) \
/* Code block avoids redefinition of cudaError_t error */ \
do { \
cudaError_t error = condition; \
CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \
} while (0)
namespace mshadow {
namespace cuda {
template <typename DType>
__device__ void Clip(DType* value, const DType lower, const DType upper) {
if ((*value) < lower)
*value = lower;
if ((*value) > upper)
*value = upper;
}
template <typename DType>
__device__ void CalculateOverlap(const DType* a, const DType* b, DType* iou) {
DType w = max(DType(0), min(a[2], b[2]) - max(a[0], b[0]));
DType h = max(DType(0), min(a[3], b[3]) - max(a[1], b[1]));
DType i = w * h;
DType u = (a[2] - a[0]) * (a[3] - a[1]) + (b[2] - b[0]) * (b[3] - b[1]) - i;
(*iou) = u <= 0.f ? static_cast<DType>(0) : static_cast<DType>(i / u);
}
template <typename DType>
__global__ __launch_bounds__(cuda::kMaxThreadsPerBlock) void DetectionForwardKernel(
DType* out,
const DType* cls_prob,
const DType* loc_pred,
const DType* anchors,
DType* temp_space,
const int num_classes,
const int num_anchors,
const float threshold,
const bool clip,
const float vx,
const float vy,
const float vw,
const float vh,
const float nms_threshold,
const bool force_suppress,
const int nms_topk) {
const int nbatch = blockIdx.x; // each block for each batch
int index = threadIdx.x;
__shared__ int valid_count;
out += nbatch * num_anchors * 6;
cls_prob += nbatch * num_anchors * num_classes;
loc_pred += nbatch * num_anchors * 4;
if (index == 0) {
valid_count = 0;
}
__syncthreads();
// apply prediction to anchors
for (int i = index; i < num_anchors; i += blockDim.x) {
DType score = -1;
int id = 0;
for (int j = 1; j < num_classes; ++j) {
DType temp = cls_prob[j * num_anchors + i];
if (temp > score) {
score = temp;
id = j;
}
}
if (id > 0 && score < threshold) {
id = 0;
}
if (id > 0) {
// valid class
int pos = atomicAdd(&valid_count, 1);
out[pos * 6] = id - 1; // restore original class id
out[pos * 6 + 1] = (id == 0 ? DType(-1) : score);
int offset = i * 4;
DType al = anchors[offset];
DType at = anchors[offset + 1];
DType ar = anchors[offset + 2];
DType ab = anchors[offset + 3];
DType aw = ar - al;
DType ah = ab - at;
DType ax = (al + ar) / 2.f;
DType ay = (at + ab) / 2.f;
DType ox = loc_pred[offset] * vx * aw + ax;
DType oy = loc_pred[offset + 1] * vy * ah + ay;
DType ow = exp(loc_pred[offset + 2] * vw) * aw / 2;
DType oh = exp(loc_pred[offset + 3] * vh) * ah / 2;
DType xmin = ox - ow;
DType ymin = oy - oh;
DType xmax = ox + ow;
DType ymax = oy + oh;
if (clip) {
Clip(&xmin, DType(0), DType(1));
Clip(&ymin, DType(0), DType(1));
Clip(&xmax, DType(0), DType(1));
Clip(&ymax, DType(0), DType(1));
}
out[pos * 6 + 2] = xmin;
out[pos * 6 + 3] = ymin;
out[pos * 6 + 4] = xmax;
out[pos * 6 + 5] = ymax;
}
}
__syncthreads();
if (valid_count < 1 || nms_threshold <= 0 || nms_threshold > 1)
return;
// if (index == 0) printf("%d\n", valid_count);
// descent sort according to scores
const int size = valid_count;
temp_space += nbatch * num_anchors * 6;
DType* src = out;
DType* dst = temp_space;
for (int width = 2; width < (size << 1); width <<= 1) {
int slices = (size - 1) / (blockDim.x * width) + 1;
int start = width * index * slices;
for (int slice = 0; slice < slices; ++slice) {
if (start >= size)
break;
int middle = start + (width >> 1);
if (middle > size)
middle = size;
int end = start + width;
if (end > size)
end = size;
int i = start;
int j = middle;
for (int k = start; k < end; ++k) {
DType score_i = i < size ? src[i * 6 + 1] : DType(-1);
DType score_j = j < size ? src[j * 6 + 1] : DType(-1);
if (i < middle && (j >= end || score_i > score_j)) {
for (int n = 0; n < 6; ++n) {
dst[k * 6 + n] = src[i * 6 + n];
}
++i;
} else {
for (int n = 0; n < 6; ++n) {
dst[k * 6 + n] = src[j * 6 + n];
}
++j;
}
}
start += width;
}
__syncthreads();
src = src == out ? temp_space : out;
dst = dst == out ? temp_space : out;
}
__syncthreads();
if (src == temp_space) {
// copy from temp to out
for (int i = index; i < size * 6; i += blockDim.x) {
out[i] = temp_space[i];
}
__syncthreads();
}
// keep top k detections
int ntop = size;
if (nms_topk > 0 && nms_topk < ntop) {
ntop = nms_topk;
for (int i = ntop + index; i < size; i += blockDim.x) {
out[i * 6] = -1;
}
__syncthreads();
}
// apply NMS
for (int compare_pos = 0; compare_pos < ntop; ++compare_pos) {
DType compare_id = out[compare_pos * 6];
if (compare_id < 0)
continue; // not a valid positive detection, skip
DType* compare_loc_ptr = out + compare_pos * 6 + 2;
for (int i = compare_pos + index + 1; i < ntop; i += blockDim.x) {
DType class_id = out[i * 6];
if (class_id < 0)
continue;
if (force_suppress || (class_id == compare_id)) {
DType iou;
CalculateOverlap(compare_loc_ptr, out + i * 6 + 2, &iou);
if (iou >= nms_threshold) {
out[i * 6] = -1;
}
}
}
__syncthreads();
}
}
} // namespace cuda
template <typename DType>
inline void MultiBoxDetectionForward(const Tensor<gpu, 3, DType>& out,
const Tensor<gpu, 3, DType>& cls_prob,
const Tensor<gpu, 2, DType>& loc_pred,
const Tensor<gpu, 2, DType>& anchors,
const Tensor<gpu, 3, DType>& temp_space,
const float threshold,
const bool clip,
const mxnet::Tuple<float>& variances,
const float nms_threshold,
const bool force_suppress,
const int nms_topk) {
CHECK_EQ(variances.ndim(), 4) << "Variance size must be 4";
const int num_classes = cls_prob.size(1);
const int num_anchors = cls_prob.size(2);
const int num_batches = cls_prob.size(0);
const int num_threads = cuda::kMaxThreadsPerBlock;
int num_blocks = num_batches;
cuda::CheckLaunchParam(num_blocks, num_threads, "MultiBoxDetection Forward");
cudaStream_t stream = Stream<gpu>::GetStream(out.stream_);
cuda::DetectionForwardKernel<<<num_blocks, num_threads, 0, stream>>>(out.dptr_,
cls_prob.dptr_,
loc_pred.dptr_,
anchors.dptr_,
temp_space.dptr_,
num_classes,
num_anchors,
threshold,
clip,
variances[0],
variances[1],
variances[2],
variances[3],
nms_threshold,
force_suppress,
nms_topk);
MULTIBOX_DETECTION_CUDA_CHECK(cudaGetLastError());
}
} // namespace mshadow
namespace mxnet {
namespace op {
template <>
Operator* CreateOp<gpu>(MultiBoxDetectionParam param, int dtype) {
Operator* op = nullptr;
MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { op = new MultiBoxDetectionOp<gpu, DType>(param); });
return op;
}
} // namespace op
} // namespace mxnet