/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file bounding_box-inl.cuh
* \brief bounding box CUDA operators
* \author Joshua Zhang
*/
#ifndef MXNET_OPERATOR_CONTRIB_BOUNDING_BOX_INL_CUH_
#define MXNET_OPERATOR_CONTRIB_BOUNDING_BOX_INL_CUH_
#include <cmath>
#include <cstdio>
#include <mxnet/operator_util.h>
#include <thrust/copy.h>
#include <thrust/execution_policy.h>
#include "../mshadow_op.h"
#include "../mxnet_op.h"
#include "../operator_common.h"
#include "./bounding_box-common.h"
namespace mxnet {
namespace op {
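// Unary predicate used as the stencil test for thrust::copy_if below: an element is kept when
// its flag value is non-zero.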
template<typename DType>
struct valid_value {
__host__ __device__ bool operator()(const DType x) {
return static_cast<bool>(x);
}
};
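// Stream compaction on the GPU: copy the elements of `value` whose corresponding `flag` entry is
// non-zero into `out`, and return the number of elements kept.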
template<typename DType, typename FType>
int CopyIf(mshadow::Tensor<gpu, 1, DType> out,
mshadow::Tensor<gpu, 1, DType> value,
mshadow::Tensor<gpu, 1, FType> flag) {
valid_value<FType> pred;
DType *end_out = thrust::copy_if(thrust::device, value.dptr_, value.dptr_ + value.MSize(),
flag.dptr_, out.dptr_, pred);
return end_out - out.dptr_;
}
// Compute the 1-D intersection along either height or width between box `a` and a second
// interval given directly by the scalars b1 and b2 (same corner or center encoding as `a`).
template<typename DType>
MSHADOW_XINLINE DType Intersect2(const DType *a, const DType b1, const DType b2, int encode) {
const DType a1 = a[0];
const DType a2 = a[2];
DType left, right;
if (box_common_enum::kCorner == encode) {
left = a1 > b1 ? a1 : b1;
right = a2 < b2 ? a2 : b2;
} else {
const DType aw = a2 / 2;
const DType bw = b2 / 2;
const DType al = a1 - aw;
const DType ar = a1 + aw;
const DType bl = b1 - bw;
const DType br = b1 + bw;
left = bl > al ? bl : al;
right = br < ar ? br : ar;
}
const DType w = right - left;
return w > 0 ? w : DType(0);
}
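// NMS within one tile of up to N * 512 reference boxes. One block handles one batch element:
// the block caches its slice of `index` in shared memory, then walks the references in sorted
// order and marks every later box whose IoU with a surviving reference exceeds `thresh`
// (and, if check_class, whose class id matches) as suppressed by setting its index to -1.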
template<typename DType, int N, bool check_class>
__launch_bounds__(512)
__global__ void nms_apply_kernel(const int topk, int32_t *index,
const int32_t *batch_start,
const DType *input,
const DType *areas,
const int num, const int stride,
const int offset_box, const int offset_id,
const float thresh, const bool force,
const int encode, const int start_offset) {
constexpr int block_size = 512;
const int start = static_cast<int>(batch_start[blockIdx.x]) + start_offset;
const int size_of_batch = static_cast<int>(batch_start[blockIdx.x + 1]) - start;
const int end = min(min(size_of_batch, topk - start_offset), N * block_size);
__shared__ int s_index[N * block_size];
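  // Stage this block's slice of the sorted indices in shared memory.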
for (int i = threadIdx.x; i < end; i += block_size) {
s_index[i] = static_cast<int>(index[start + i]);
}
__syncthreads();
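  // Each still-valid box in sorted order acts in turn as the reference; boxes that come after it
  // and overlap it with IoU > thresh (same class only, when check_class is set) are suppressed.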
for (int ref = 0; ref < end; ++ref) {
const int ref_area_offset = static_cast<int>(s_index[ref]);
if (ref_area_offset >= 0) {
const int ref_offset = ref_area_offset * stride + offset_box;
int ref_id = 0;
if (check_class) {
ref_id = static_cast<int>(input[ref_offset - offset_box + offset_id]);
}
for (int i = 0; i < N; ++i) {
const int my_pos = threadIdx.x + i * block_size;
if (my_pos > ref && my_pos < end && s_index[my_pos] >= 0) {
const int pos_area_offset = static_cast<int>(s_index[my_pos]);
const int pos_offset = pos_area_offset * stride + offset_box;
if (check_class) {
const int pos_id = static_cast<int>(input[pos_offset - offset_box + offset_id]);
if (ref_id != pos_id) continue; // different class
}
DType intersect = Intersect(input + ref_offset, input + pos_offset, encode);
intersect *= Intersect(input + ref_offset + 1, input + pos_offset + 1, encode);
const DType iou = intersect /
(areas[ref_area_offset] + areas[pos_area_offset] - intersect);
if (iou > thresh) {
s_index[my_pos] = -1;
}
}
}
__syncthreads();
}
}
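  // Write the surviving indices back to global memory.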
for (int i = threadIdx.x; i < end; i += block_size) {
index[start + i] = s_index[i];
}
}
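// Companion kernel to nms_apply_kernel: it suppresses the boxes that lie beyond the current tile
// of N * 512 references. Each thread owns one later box and tests it against every surviving
// reference in the tile.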
template<typename DType, int N, bool check_class>
__launch_bounds__(512)
__global__ void nms_apply_kernel_rest(const int topk, int32_t *index,
const int32_t *batch_start,
const DType *input,
const DType *areas,
const int num, const int stride,
const int offset_box, const int offset_id,
const float thresh, const bool force,
const int encode, const int start_offset,
const int blocks_per_batch) {
constexpr int block_size = 512;
const int batch = blockIdx.x / blocks_per_batch;
const int start_ref = static_cast<int>(batch_start[batch]) + start_offset;
const int block_offset = (N + blockIdx.x % blocks_per_batch) * block_size;
const int start = start_ref + block_offset;
const int size_of_batch = static_cast<int>(batch_start[batch + 1]) - start;
const int end = min(size_of_batch, topk - start_offset - block_offset);
const int my_pos = start + threadIdx.x;
if (threadIdx.x < end && index[my_pos] >= 0) {
const int pos_area_offset = static_cast<int>(index[my_pos]);
const int pos_offset = pos_area_offset * stride + offset_box;
DType my_box[4];
#pragma unroll
for (int i = 0; i < 4; ++i) {
my_box[i] = input[pos_offset + i];
}
const DType my_area = areas[pos_area_offset];
int pos_id = 0;
if (check_class) {
pos_id = static_cast<int>(input[pos_offset - offset_box + offset_id]);
}
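    // Compare this thread's box against every reference of the current tile and stop at the
    // first surviving reference whose IoU exceeds the threshold.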
for (int ref = start_ref; ref < start_ref + N * block_size; ++ref) {
const int ref_area_offset = static_cast<int>(index[ref]);
if (ref_area_offset >= 0) {
const int ref_offset = ref_area_offset * stride + offset_box;
int ref_id = 0;
if (check_class) {
ref_id = static_cast<int>(input[ref_offset - offset_box + offset_id]);
if (ref_id != pos_id) continue; // different class
}
DType intersect = Intersect2(input + ref_offset, my_box[0], my_box[2], encode);
intersect *= Intersect2(input + ref_offset + 1, my_box[1], my_box[3], encode);
const DType iou = intersect /
(areas[ref_area_offset] + my_area - intersect);
if (iou > thresh) {
index[my_pos] = -1;
break;
}
}
}
}
}
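// Apply non-maximum suppression to the top-k boxes of every batch element. `sorted_index` holds,
// for each batch element, the box positions sorted by score; suppressed entries are overwritten
// with -1. When force_suppress is false and id_index >= 0, only boxes of the same class suppress
// each other.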
template<typename DType>
void NMSApply(mshadow::Stream<gpu> *s,
int num_batch, int topk,
mshadow::Tensor<gpu, 1, int32_t>* sorted_index,
mshadow::Tensor<gpu, 1, int32_t>* batch_start,
mshadow::Tensor<gpu, 3, DType>* buffer,
mshadow::Tensor<gpu, 1, DType>* areas,
int num_elem, int width_elem,
int coord_start, int id_index,
float threshold, bool force_suppress,
int in_format) {
using namespace mxnet_op;
constexpr int THRESHOLD = 1024;
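  // Process the top-k candidates in tiles of THRESHOLD references: nms_apply_kernel handles
  // suppression inside the tile, nms_apply_kernel_rest suppresses the remaining boxes against it.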
for (int ref = 0; ref < topk; ref += THRESHOLD) {
constexpr int block_size = 512;
constexpr int N = THRESHOLD / block_size;
auto stream = mshadow::Stream<gpu>::GetStream(s);
if (!force_suppress && id_index >= 0) {
nms_apply_kernel<DType, N, true><<<num_batch, block_size, 0, stream>>>(topk,
sorted_index->dptr_,
batch_start->dptr_,
buffer->dptr_,
areas->dptr_,
num_elem,
width_elem,
coord_start,
id_index,
threshold,
force_suppress,
in_format,
ref);
int blocks_per_batch = (topk - ref - THRESHOLD + block_size - 1)/block_size;
int blocks = blocks_per_batch * num_batch;
if (blocks > 0) {
nms_apply_kernel_rest<DType, N, true><<<blocks, block_size, 0, stream>>>(topk,
sorted_index->dptr_,
batch_start->dptr_,
buffer->dptr_,
areas->dptr_,
num_elem,
width_elem,
coord_start,
id_index,
threshold,
force_suppress,
in_format,
ref,
blocks_per_batch);
}
} else {
nms_apply_kernel<DType, N, false><<<num_batch, block_size, 0, stream>>>(topk,
sorted_index->dptr_,
batch_start->dptr_,
buffer->dptr_,
areas->dptr_,
num_elem,
width_elem,
coord_start,
id_index,
threshold,
force_suppress,
in_format,
ref);
int blocks_per_batch = (topk - ref - THRESHOLD + block_size - 1)/block_size;
int blocks = blocks_per_batch * num_batch;
if (blocks > 0) {
nms_apply_kernel_rest<DType, N, false><<<blocks, block_size, 0, stream>>>(topk,
sorted_index->dptr_,
batch_start->dptr_,
buffer->dptr_,
areas->dptr_,
num_elem,
width_elem,
coord_start,
id_index,
threshold,
force_suppress,
in_format,
ref,
blocks_per_batch);
}
}
}
}
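// Compute, from the sorted per-element batch ids, the index at which each batch's elements start.
// batch_start has num_batch + 1 entries, so batch b occupies [batch_start[b], batch_start[b + 1]).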
__launch_bounds__(512)
__global__ void nms_calculate_batch_start_kernel(int32_t * batch_start,
int32_t * valid_batch_id,
size_t N,
int num_batch) {
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < N) {
#if __CUDA_ARCH__ >= 350
const int32_t previous = tid > 0 ? __ldg(valid_batch_id + tid - 1) : -1;
const int32_t my = __ldg(valid_batch_id + tid);
#else
const int32_t previous = tid > 0 ? valid_batch_id[tid - 1] : -1;
const int32_t my = valid_batch_id[tid];
#endif
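    // Each increase in batch id between neighbouring elements marks the start of the batches in
    // between; the last thread closes every remaining batch with the total element count.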
if (my > previous) {
for (int32_t current = previous + 1; current <= my; ++current) {
batch_start[current] = tid;
}
}
if (tid == N - 1) {
for (int32_t current = my + 1; current <= num_batch; ++current) {
batch_start[current] = tid + 1;
}
}
}
}
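// Launch helper for nms_calculate_batch_start_kernel: one thread per valid element.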
inline void NMSCalculateBatchStart(mshadow::Stream<gpu> *s,
mshadow::Tensor<gpu, 1, int32_t>* batch_start,
mshadow::Tensor<gpu, 1, int32_t>* valid_batch_id,
int num_batch) {
using namespace mshadow;
using namespace mshadow::expr;
using namespace mxnet_op;
auto stream = mshadow::Stream<gpu>::GetStream(s);
constexpr int block_size = 512;
const int num_elements = valid_batch_id->size(0);
const int blocks = (num_elements + block_size - 1) / block_size;
nms_calculate_batch_start_kernel<<<blocks, block_size, 0, stream>>>(batch_start->dptr_,
valid_batch_id->dptr_,
num_elements,
num_batch);
}
} // namespace op
} // namespace mxnet
#endif // MXNET_OPERATOR_CONTRIB_BOUNDING_BOX_INL_CUH_